forked from TheAlgorithms/Python
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
NLP Word Frequency Algorithms (TheAlgorithms#2142)
* NLP Word Frequency Algorithms * Added type hints and Wikipedia link to tf-idf * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <[email protected]> * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <[email protected]> * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <[email protected]> * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <[email protected]> * Fix line length for flake8 * Fix line length for flake8 V2 * Add line escapes and change int to float * Corrected doctests * Fix for TravisCI * Fix for TravisCI V2 * Tests passing locally * Tests passing locally * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <[email protected]> * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <[email protected]> * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <[email protected]> * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <[email protected]> * Add doctest examples and clean up docstrings Co-authored-by: Christian Clauss <[email protected]>
- Loading branch information
1 parent
c7ca9cf
commit b368b1e
Showing
1 changed file
with
133 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
import string | ||
from math import log10 | ||
|
||
""" | ||
tf-idf Wikipedia: https://en.wikipedia.org/wiki/Tf%E2%80%93idf | ||
tf-idf and other word frequency algorithms are often used | ||
as a weighting factor in information retrieval and text | ||
mining. 83% of text-based recommender systems use | ||
tf-idf for term weighting. In layman's terms, tf-idf | ||
is a statistic intended to reflect how important a word | ||
is to a document in a corpus (a collection of documents). | ||
Here I've implemented several word frequency algorithms | ||
that are commonly used in information retrieval: Term Frequency, | ||
Document Frequency, and TF-IDF (Term-Frequency*Inverse-Document-Frequency) | ||
are included. | ||
Term Frequency is a statistical function that | ||
returns a number representing how frequently | ||
an expression occurs in a document. This | ||
indicates how significant a particular term is in | ||
a given document. | ||
Document Frequency is a statistical function that returns | ||
an integer representing the number of documents in a | ||
corpus that a term occurs in (where the max number returned | ||
would be the number of documents in the corpus). | ||
Inverse Document Frequency is mathematically written as | ||
log10(N/df), where N is the number of documents in your | ||
corpus and df is the Document Frequency. If df is 0, a | ||
ZeroDivisionError will be thrown. | ||
Term-Frequency*Inverse-Document-Frequency is a measure | ||
of the originality of a term. It is mathematically written | ||
as tf*log10(N/df). It compares the number of times | ||
a term appears in a document with the number of documents | ||
the term appears in. If df is 0, a ZeroDivisionError will be thrown. | ||
""" | ||
|
||
|
||
def term_frequency(term: str, document: str) -> int:
    """
    Return the number of times a term occurs within a given document.

    Matching is case-insensitive and ignores every character listed in
    string.punctuation. Words are separated by any run of whitespace
    (spaces, tabs or newlines).

    @params: term, the term to search a document for, and document,
            the document to search within
    @returns: an integer representing the number of times a term is
            found within the document
    @examples:
    >>> term_frequency("to", "To be, or not to be")
    2
    >>> term_frequency("be", "To be,\\nor not to be")
    2
    >>> term_frequency("to", "")
    0
    """
    # Delete all punctuation in a single pass.
    document_without_punctuation = document.translate(
        str.maketrans("", "", string.punctuation)
    )
    # split() with no argument splits on any whitespace run, so a newline
    # separates two words instead of gluing them together, and repeated
    # spaces do not produce empty tokens.
    words = document_without_punctuation.lower().split()
    wanted = term.lower()
    return sum(1 for word in words if word == wanted)
|
||
|
||
def document_frequency(term: str, corpus: str) -> tuple[int, int]:
    """
    Calculate the number of documents in a corpus that contain a
    given term.

    Matching is case-insensitive, ignores every character listed in
    string.punctuation in the corpus, and only counts whole words (or whole
    multi-word phrases): the term "the" does not match the word "another".

    @params : term, the term to search each document for, and corpus, a
            collection of documents. Each document should be separated by a
            newline.
    @returns : a tuple holding the number of documents in the corpus that
            contain the term you are searching for and the number of
            documents in the corpus
    @examples :
    >>> document_frequency("first", "The first document.\\nThe second.\\nTHE third.")
    (1, 3)
    >>> document_frequency("the", "The first.\\nAnother one.\\nOther things.")
    (1, 3)
    >>> document_frequency("", "one\\ntwo")
    (0, 2)
    """
    corpus_without_punctuation = corpus.translate(
        str.maketrans("", "", string.punctuation)
    )  # strip all punctuation and replace it with ''
    documents = corpus_without_punctuation.lower().split("\n")

    # Collapse whitespace runs in the term so a phrase is compared word by word.
    normalized_term = " ".join(term.lower().split())
    if not normalized_term:
        # An empty term is not a word, so no document contains it.
        return 0, len(documents)

    # Padding both the term and each document with a single space turns a
    # substring test into a whole-word (or whole-phrase) test.
    padded_term = f" {normalized_term} "
    matching_documents = 0
    for document in documents:
        normalized_document = " ".join(document.split())
        if padded_term in f" {normalized_document} ":
            matching_documents += 1
    return matching_documents, len(documents)
|
||
|
||
def inverse_document_frequency(df: int, N: int) -> float:
    """
    Return a float denoting the importance
    of a word. This measure of importance is
    calculated by log10(N/df), where N is the
    number of documents and df is
    the Document Frequency. The result is rounded
    to three decimal places.

    @params : df, the Document Frequency, and N,
            the number of documents in the corpus.
    @returns : log10(N/df), rounded to 3 decimal places
    @raises : ZeroDivisionError if df is 0; ValueError if N is 0 or if
            either argument is negative
    @examples :
    >>> inverse_document_frequency(3, 0)
    Traceback (most recent call last):
        ...
    ValueError: log10(0) is undefined.
    >>> inverse_document_frequency(1, 3)
    0.477
    >>> inverse_document_frequency(0, 3)
    Traceback (most recent call last):
        ...
    ZeroDivisionError: df must be > 0
    >>> inverse_document_frequency(-1, -3)
    Traceback (most recent call last):
        ...
    ValueError: df and N must not be negative.
    """
    # df is checked first so that (0, 0) keeps reporting the division error.
    if df == 0:
        raise ZeroDivisionError("df must be > 0")
    if N == 0:
        raise ValueError("log10(0) is undefined.")
    # Two negative counts would otherwise cancel out and yield a
    # plausible-looking but meaningless result.
    if df < 0 or N < 0:
        raise ValueError("df and N must not be negative.")
    return round(log10(N / df), 3)
|
||
|
||
def tf_idf(tf: int, idf: float) -> float:
    """
    Combine the term frequency
    and inverse document frequency functions to
    calculate the originality of a term. This
    'originality' is calculated by multiplying
    the term frequency and the inverse document
    frequency : tf-idf = TF * IDF

    @params : tf, the term frequency, and idf, the inverse document
            frequency
    @returns : tf * idf, rounded to 3 decimal places
    @examples :
    >>> tf_idf(2, 0.477)
    0.954
    >>> tf_idf(0, 0.477)
    0.0
    """
    return round(tf * idf, 3)