@@ -83,16 +83,17 @@ def document_frequency(term: str, corpus: str) -> int:
83
83
return (len ([doc for doc in docs if term in doc ]), len (docs ))
84
84
85
85
86
- def inverse_document_frequency (df : int , N : int ) -> float :
86
+ def inverse_document_frequency (df : int , N : int , smoothing = False ) -> float :
87
87
"""
88
88
Return a float denoting the importance
89
89
of a word. This measure of importance is
90
90
calculated by log10(N/df), where N is the
91
91
number of documents and df is
92
92
the Document Frequency.
93
- @params : df, the Document Frequency, and N,
94
- the number of documents in the corpus.
95
- @returns : log10(N/df)
93
+ @params : df, the Document Frequency, N,
94
+ the number of documents in the corpus and
95
+ smoothing; if True, return the smoothed idf
96
+ @returns : log10(N/df) or 1+log10(N/(1+df))
96
97
@examples :
97
98
>>> inverse_document_frequency(3, 0)
98
99
Traceback (most recent call last):
@@ -104,7 +105,14 @@ def inverse_document_frequency(df: int, N: int) -> float:
104
105
Traceback (most recent call last):
105
106
...
106
107
ZeroDivisionError: df must be > 0
108
+ >>> inverse_document_frequency(0, 3,True)
109
+ 1.477
107
110
"""
111
+ if smoothing :
112
+ if N == 0 :
113
+ raise ValueError ("log10(0) is undefined." )
114
+ return round (1 + log10 (N / (1 + df )), 3 )
115
+
108
116
if df == 0 :
109
117
raise ZeroDivisionError ("df must be > 0" )
110
118
elif N == 0 :
0 commit comments