📝 Writing docs.
BrikerMan committed Jul 15, 2019
1 parent d81611f commit 033ed13
Showing 4 changed files with 32 additions and 34 deletions.
2 changes: 2 additions & 0 deletions mkdocs/docs/about/release-notes.md
@@ -22,6 +22,8 @@ pip show kashgari-tf
- 📝 Add Chinese documents
- 🚸 Add `label2idx`, `token2idx` properties to Embeddings and Models
- ✨ Add `predict_top_k_class` for classification model to get predict probabilities ([#146](https://github.com/BrikerMan/Kashgari/issues/146))
- 🚸 Add `tokenizer` property for BERT Embedding. ([#136](https://github.com/BrikerMan/Kashgari/issues/136))
- 🚸 Add `predict_kwargs` for models' `predict()` function

### [0.5.0] - 2019.07.11

8 changes: 6 additions & 2 deletions mkdocs/docs/api/tasks.classification.md
@@ -266,7 +266,8 @@

```
def predict(self,
            x_data,
            batch_size=None,
            multi_label_threshold: float = 0.5,
-           debug_info=False):
+           debug_info=False,
+           predict_kwargs: Dict = None):
```

__Args__:
@@ -275,6 +276,7 @@
- **batch_size**: Integer. If unspecified, it will default to 32.
- **multi\_label\_threshold**: Float. Probability threshold for assigning labels in multi-label classification, defaults to 0.5.
- **debug_info**: Bool. Whether to print out the logging info.
- **predict_kwargs**: Dict, extra arguments passed to the `predict()` function of [tensorflow.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/models/Model#predict)

__Returns__:

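As an illustration, a minimal sketch of how the new `predict_kwargs` pass-through might be used. `CNNLSTMModel` is one of this library's classification models; the data variables (`train_x`, `train_y`, `test_x`) are hypothetical and assumed to be prepared elsewhere:

```python
from kashgari.tasks.classification import CNNLSTMModel

# train_x: List[List[str]] of tokenized sentences, train_y: List[str] of labels
model = CNNLSTMModel()
model.fit(train_x, train_y)

# predict_kwargs is forwarded to tf.keras.Model.predict(), so any of its
# keyword arguments (for example `verbose`) can be passed through.
predictions = model.predict(test_x,
                            batch_size=64,
                            predict_kwargs={'verbose': 1})
```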
@@ -291,7 +293,8 @@

```
def predict_top_k_class(self,
                        x_data,
                        top_k=5,
                        batch_size=32,
-                       debug_info=False) -> List[Dict]:
+                       debug_info=False,
+                       predict_kwargs: Dict = None) -> List[Dict]:
```

__Args__:
@@ -300,6 +303,7 @@
- **top_k**: Int. Number of top classes to return for each sample, defaults to 5.
- **batch_size**: Integer. If unspecified, it will default to 32.
- **debug_info**: Bool. Whether to print out the logging info.
- **predict_kwargs**: Dict, extra arguments passed to the `predict()` function of [tensorflow.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/models/Model#predict)

__Returns__:

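A sketch of `predict_top_k_class`, reusing the hypothetical `model` and `test_x` from the previous example; the exact keys of each returned dict follow the `List[Dict]` return type above:

```python
# Top 3 candidate labels with probabilities for each sample.
results = model.predict_top_k_class(test_x, top_k=3)

for item in results:
    # Each item is a dict describing the best label and the other
    # top-k candidates with their probabilities (illustrative schema).
    print(item)
```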
8 changes: 6 additions & 2 deletions mkdocs/docs/api/tasks.labeling.md
@@ -265,14 +265,16 @@

Generates output predictions for the input samples. Computation is done in batches.

```
def predict(self,
            x_data,
            batch_size=32,
-           debug_info=False):
+           debug_info=False,
+           predict_kwargs: Dict = None):
```

__Args__:

- **x_data**: The input data, as a Numpy array (or list of Numpy arrays if the model has multiple inputs).
- **batch_size**: Integer. If unspecified, it will default to 32.
- **debug_info**: Bool. Whether to print out the logging info.
- **predict_kwargs**: Dict, extra arguments passed to the `predict()` function of [tensorflow.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/models/Model#predict)

__Returns__:

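As above, a minimal sketch, assuming hypothetical `train_x`/`train_y` token and tag sequences; `BiLSTM_Model` is one of this library's labeling models:

```python
from kashgari.tasks.labeling import BiLSTM_Model

# train_x: token sequences, train_y: tag sequences of matching lengths
model = BiLSTM_Model()
model.fit(train_x, train_y)

# Silence the Keras progress bar via the predict_kwargs pass-through.
tags = model.predict(test_x, predict_kwargs={'verbose': 0})
```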
@@ -287,7 +289,8 @@

```
def predict_entities(self,
                     x_data,
                     batch_size=None,
                     join_chunk=' ',
-                    debug_info=False):
+                    debug_info=False,
+                    predict_kwargs: Dict = None):
```

@@ -297,6 +300,7 @@

__Args__:
- **batch_size**: Integer. If unspecified, it will default to 32.
- **join_chunk**: str or False. Delimiter used to join the tokens of each entity chunk, defaults to a space.
- **debug_info**: Bool. Whether to print out the logging info.
- **predict_kwargs**: Dict, extra arguments passed to the `predict()` function of [tensorflow.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/models/Model#predict)

__Returns__:

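Continuing the hypothetical labeling model, a sketch of entity extraction:

```python
# Group predicted tags into entity chunks. join_chunk controls how the
# tokens inside one chunk are joined back together, e.g. ' ' for
# space-delimited languages.
entities = model.predict_entities(test_x, join_chunk=' ')
print(entities)
```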
48 changes: 18 additions & 30 deletions mkdocs/docs/embeddings/bert-embedding.md
@@ -39,26 +39,19 @@ labels = [
    "class2",
    "class1"
]
-########## pre-process input sentences first ##########
-import os
-import codecs
-from keras_bert import Tokenizer
-bert_model_path = "wwm_uncased_L-24_H-1024_A-16/"
-vocab_path = os.path.join(bert_model_path, 'vocab.txt')
-token_dict = {}
-with codecs.open(vocab_path, 'r', 'utf8') as reader:
-    for line in reader:
-        token = line.strip()
-        token_dict[token] = len(token_dict)
-"""
-token_dict should contain something like the following:
-{'[PAD]': 0, ..., 'stratford': 17723, '##rted': 17724, 'noticeable': 17725, '##evic': 17726, 'imp': 17727, '##rita': 17728, ...}
-"""
-tokenizer = Tokenizer(token_dict)
+########## Load Bert Embedding ##########
+import kashgari
+from kashgari.embeddings import BERTEmbedding
+
+bert_embedding = BERTEmbedding(bert_model_path,
+                               task=kashgari.CLASSIFICATION,
+                               sequence_length=128)
+
+tokenizer = bert_embedding.tokenizer
sentences_tokenized = []
for sentence in sentences:
    sentence_tokenized = tokenizer.tokenize(sentence)
    sentences_tokenized.append(sentence_tokenized)
"""
The sentences will become tokenized into:
[
@@ -67,21 +60,16 @@
    ['[CLS]', 'why', 'did', 'the', 'chicken', 'cross', 'the', 'road', '?', '[SEP]']
]
"""
+train_x, train_y = sentences[:2], labels[:2]
+validate_x, validate_y = sentences[2:], labels[2:]
-########## /pre-process input sentences first ##########

+# Our tokenizer already added the BOS([CLS]) and EOS([SEP]) token
+# so we need to disable the default add_bos_eos setting.
+bert_embedding.processor.add_bos_eos = False

-train_x, train_y = sentences, labels
-validate_x, validate_y = sentences, labels

########## build model ##########
-from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.classification import CNNLSTMModel
-import kashgari

-bert_embedding = BERTEmbedding(bert_model_path,
-                               task=kashgari.CLASSIFICATION,
-                               sequence_length=128)
-# Our tokenizer already added the BOS([CLS]) and EOS([SEP]) token
-# so we need to disable the default add_bos_eos setting.
-bert_embedding.processor.add_bos_eos = True
model = CNNLSTMModel(bert_embedding)

########## /build model ##########
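Reading only the `+` lines, the updated example comes together roughly as follows. This is a sketch: it assumes `bert_model_path` still points at the downloaded checkpoint, and that the tokenized sentences (rather than the raw strings) are what gets sliced into the training and validation sets:

```python
import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.classification import CNNLSTMModel

bert_model_path = "wwm_uncased_L-24_H-1024_A-16/"  # downloaded BERT checkpoint

bert_embedding = BERTEmbedding(bert_model_path,
                               task=kashgari.CLASSIFICATION,
                               sequence_length=128)

# Tokenize with the embedding's bundled tokenizer instead of building
# one from vocab.txt by hand.
tokenizer = bert_embedding.tokenizer
sentences_tokenized = [tokenizer.tokenize(s) for s in sentences]

train_x, train_y = sentences_tokenized[:2], labels[:2]
validate_x, validate_y = sentences_tokenized[2:], labels[2:]

# The tokenizer already added the BOS([CLS]) and EOS([SEP]) tokens,
# so disable the processor's default add_bos_eos behaviour.
bert_embedding.processor.add_bos_eos = False

model = CNNLSTMModel(bert_embedding)
model.fit(train_x, train_y, validate_x, validate_y)
```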