keybert

wjbmattingly · Aug 23, 2021 · 78b7dc8 · 78b7dc8
1 parent 3c8f3b5
commit 78b7dc8
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 2 deletions.
diff --git a/app.py b/app.py
@@ -335,16 +335,30 @@ def write_data(file, data):
 
         elif nlp_options == "KeyBERT":
             from keybert import KeyBERT
+            top_n = container1.slider("Top-N Words", 1,50)
+            n_gram_low = container2.slider("N-Gram Range (Low)", 1, 3)
+            n_gram_high = container3.slider("N-Gram Range (High)", 1, 3)
+
+            stop_words = container1.selectbox("Stopwords", ("english", "german", "spanish"))
+            nr_candidates = container2.slider("Number of Candidates", 1, 50)
+            diversity = container3.slider("Diversity", 0.0, 1.0, 0.7)
+
+
             kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
             all_keywords = []
             for i in range(len(all_text)):
-                words = kw_extractor.extract_keywords(all_text[i], keyphrase_ngram_range=(1, 2), stop_words='english')
+                words = kw_extractor.extract_keywords(all_text[i],
+                                keyphrase_ngram_range=(n_gram_low, n_gram_high),
+                                stop_words=stop_words,
+                                top_n=top_n,
+                                nr_candidates=nr_candidates)
                 final = []
                 for word in words:
                     new = str(word)
                     final.append(new)
                 all_keywords.append(final)
 
+
             col1.header(f"Key Terms")
             words = "\n * ".join(all_keywords[0])
             words = "* "+words

diff --git a/markdown_pages/keyword.md b/markdown_pages/keyword.md
@@ -10,4 +10,4 @@ TF-IDF stands for Term-Frequency, Inverse-Document Frequency. It is the standard
 
 Graph-based  keyword extraction is still a heuristic algorithm, but the algorithm is a bit more complex than TF-IDF which only considers a few aspects of a word relative to the corpus. Graph-based methods apply graphing, or network, algorithms to text-based questions. Some graphing approaches work at the word-level, while others work at the sentence-level. The idea is to understand a word in context to rank its importance.
 
-KeyBERT is a newer approach. It leverages a BERT model, an advanced language model that is used for everything from text summarization to machine translation. BERT models can be used to extract keywords and phrases from texts, an essential step in the process of text summarization.
+KeyBERT is a newer approach. It leverages a BERT model, an advanced language model that is used for everything from text summarization to machine translation. BERT models can be used to extract keywords and phrases from texts, an essential step in the process of text summarization. KeyBERT is a large machine learning model that must be downloaded the first time the app is used. Because it is a large machine learning model, it can take a longer time to process, but the results are usually better. You can control the n-gram size of the model.
Original file line number	Diff line number	Diff line change
Expand Up		@@ -10,4 +10,4 @@ TF-IDF stands for Term-Frequency, Inverse-Document Frequency. It is the standard

		Graph-based keyword extraction is still a heuristic algorithm, but the algorithm is a bit more complex than TF-IDF which only considers a few aspects of a word relative to the corpus. Graph-based methods apply graphing, or network, algorithms to text-based questions. Some graphing approaches work at the word-level, while others work at the sentence-level. The idea is to understand a word in context to rank its importance.

		KeyBERT is a newer approach. It leverages a BERT model, an advanced language model that is used for everything from text summarization to machine translation. BERT models can be used to extract keywords and phrases from texts, an essential step in the process of text summarization.
		KeyBERT is a newer approach. It leverages a BERT model, an advanced language model that is used for everything from text summarization to machine translation. BERT models can be used to extract keywords and phrases from texts, an essential step in the process of text summarization. KeyBERT is a large machine learning model that must be downloaded the first time the app is used. Because it is a large machine learning model, it can take a longer time to process, but the results are usually better. You can control the n-gram size of the model.