NER function added

dethnass · Jul 31, 2020 · d081540 · d081540
1 parent 81f991d
commit d081540
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 88 deletions.
diff --git a/libra/queries.py b/libra/queries.py
@@ -1,7 +1,7 @@
 from libra.query.nlp_queries import (image_caption_query,
                                      generate_caption, classify_text,
                                      text_classification_query, get_summary,
-                                     summarization_query)
+                                     summarization_query,get_ner)
 from libra.query.classification_models import (k_means_clustering,
                                                train_svm, nearest_neighbors,
                                                decision_tree, train_xgboost)
@@ -16,8 +16,6 @@
 from libra.data_generation.dataset_labelmatcher import (get_similar_column,
                                                         get_similar_model)
 from libra.plotting.generate_plots import analyze
-from libra.query.recommender_systems import ContentBasedRecommender
-
 from libra.dashboard.auto_eda import edaDashboard
 from colorama import Fore, Style
 import pandas as pd
@@ -29,9 +27,6 @@
 import numpy as np
 from tkinter import filedialog
 from tkinter import *
-from tensorflow.keras.preprocessing.image import img_to_array
-import tensorflow as tf
-from matplotlib import pyplot as plt
 
 # suppressing warnings for cleaner dialogue box
 warnings.simplefilter(action='error', category=FutureWarning)
@@ -134,15 +129,6 @@ def get_models(self, model_requested):
         return get_similar_model(model_requested, self.models.keys())
         clearLog()
 
-    # recommend items based on search criteria(for recommender systems only)
-
-    def recommend(self,search_term):
-        if self.latest_model == 'content_recommender':
-            model = self.models[self.latest_model]
-            return model.recommend(search_term)
-        else:
-            pass
-
     # param modelKey: string representation of the model to make prediction
     # param data: dataframe version of desired prediction set
     def predict(self, data, model=None):
@@ -160,7 +146,6 @@ def predict(self, data, model=None):
             return predictions
         else:
             modeldict = self.models[model]
-
             if modeldict.get('preprocesser'):
                 data = modeldict['preprocesser'].transform(data)
             predictions = modeldict['model'].predict(data)
@@ -572,7 +557,6 @@ def decision_tree_query(
         :return: a model and information to along with it stored in the self.models dictionary.
         '''
 
-
         self.models['decision_tree'] = decision_tree(
             instruction=instruction,
             text=text,
@@ -593,15 +577,6 @@ def decision_tree_query(
         self.latest_model = 'decision_tree'
         clearLog()
 
-    def content_recommender_query(self,feature_names=[],n_recommendations=10,indexer='title'):
-        self.models['content_recommender'] = ContentBasedRecommender(
-            data=self.dataset,
-            feature_names=feature_names,
-            indexer=indexer)
-
-        self.latest_model = 'content_recommender'
-        clearLog()
-
     # query to create a xgboost model
 
     def xgboost_query(self,
@@ -755,8 +730,7 @@ def convolutional_query(self,
                             pretrained=None,
                             epochs=10,
                             height=None,
-                            width=None,
-                            show_feature_map=False):
+                            width=None):
         '''
         Calls the body of the convolutional neural network query which is located in the feedforward.py file
         :param instruction: The objective that you want to model (str).
@@ -771,7 +745,6 @@ def convolutional_query(self,
         :param epochs: Number of epochs (int).
         :param height: Height of the input image (int).
         :param width: Width of the input image (int).
-        :param show_feature_map: Displays feature map graphic (bool).
 
 
         :return: an updated model and history stored in the models dictionary
@@ -794,55 +767,6 @@ def convolutional_query(self,
             height=height,
             width=width)
 
-        if show_feature_map:
-            model = self.models["convolutional_NN"]["model"]
-            X_test = self.models["convolutional_NN"]["data"]["test"]
-
-            # Get first image in test images and format it
-            img = X_test[0][0]
-            img /= 255
-            successive_outputs = [layer.output for layer in model.layers[1:]]
-            visualization_model = tf.keras.models.Model(inputs=model.input, outputs=successive_outputs)
-            successive_feature_maps = visualization_model.predict(img)
-
-            # Add main title to figure
-            firstPlot = True
-
-            # Include names of layers in plot
-            layer_names = [layer.name for layer in model.layers]
-            for layer_name, feature_map in zip(layer_names, successive_feature_maps):
-                if len(feature_map.shape) == 4:
-
-                    # Plot Feature maps for the conv / maxpool layers, not the fully-connected layers
-                    n_features = feature_map.shape[-1]  # number of features in the feature map
-                    height = feature_map.shape[1]       # feature map shape (1, size, size, n_features)
-                    width = feature_map.shape[2]
-                    display_grid = np.zeros((height, width * n_features))
-
-                    # Format features appropriately
-                    for i in range(n_features):
-                        img = feature_map[0, :, :, i]
-                        img -= img.mean()
-                        img /= img.std()
-                        img *= 64
-                        img += 128
-                        img = np.clip(img, 0, 255).astype('uint8')
-
-                        # Tile each filter into a horizontal grid
-                        display_grid[:, i * width: (i + 1) * width] = img
-
-                    # Display the grid
-                    scale = 20. / n_features
-                    plt.figure(figsize=(scale * n_features, scale))
-                    if firstPlot:
-                        plt.title(f'Network Visualization\n\n{layer_name}')
-                        firstPlot = False
-                    else:
-                        plt.title(layer_name)
-                    plt.grid(False)
-                    plt.imshow(display_grid, aspect='auto', cmap='viridis')
-                    plt.show()
-
         self.latest_model = 'convolutional_NN'
         clearLog()
 
@@ -1050,6 +974,15 @@ def image_caption_query(self, instruction, label_column=None,
         self.latest_model = 'image_caption'
         clearLog()
 
+    # name entity recognition query
+    def get_ner(self, target=None):
+        """
+        function to identify name entities using huggingface framework
+        :param target: list with target column names (if None all columns are used) for detection
+        :return: dictionary object with detected name-entities
+        """
+        ner_dict = get_ner(self, target=target)
+        return ner_dict
 
     # shows the names of plots associated with a specific model
     def plot_names(self, model=None):
@@ -1169,4 +1102,6 @@ def analyze(self, model=None, save=True, save_model=False):
 
     def dashboard(self):
         dash = edaDashboard(self.dataset)
-        dash.dashboard()
+        dash.dashboard()    
+
+
diff --git a/libra/query/nlp_queries.py b/libra/query/nlp_queries.py
@@ -2,12 +2,13 @@
 
 import numpy as np
 import tensorflow as tf
+from nltk.corpus import stopwords
 from colorama import Fore, Style
 from keras_preprocessing import sequence
 from sklearn.model_selection import train_test_split
 from tensorflow.python.keras.callbacks import EarlyStopping
-from transformers import TFT5ForConditionalGeneration, T5Tokenizer
-
+from transformers import TFT5ForConditionalGeneration, T5Tokenizer, \
+    pipeline, AutoTokenizer, TFAutoModel
 import libra.plotting.nonkeras_generate_plots
 from libra.data_generation.dataset_labelmatcher import get_similar_column
 from libra.data_generation.grammartree import get_value_instruction
@@ -751,3 +752,49 @@ def val_step(img_tensor, target):
     }
     clearLog()
     return self.models["image_caption"]
+
+# name entity recognition query
+def get_ner(self,target=None):
+    """
+    function to identify name entities using huggingface framework
+    :param target: list with target column names (if None all columns are used) for detection
+    :return: dictionary object with detected name-entities
+    """
+    data = DataReader(self.dataset)
+    data = data.data_generator()
+    if target == None or len(target) == 0:
+        target = list(data.columns.values)
+        logger("data ready for processing")
+    elif not type(target) is list:
+        raise Exception("kindly pass target as a list")
+    elif any(item in target for item in list(data.columns.values)):
+        logger("target data ready for processing")
+    else:
+        raise Exception("kindly pass right column value in target or ignore the target attribute for auto selection")
+
+    # Isolate target column data into one column (seperated by '.') which will be used for detection.
+    data['combined_text_for_ner'] = data[target].apply(lambda row: '.'.join(row.values.astype(str)), axis=1)
+    # Remove stopwords if any from the detection column
+    data['combined_text_for_ner'] = data['combined_text_for_ner'].apply(
+        lambda x: ' '.join([word for word in x.split() if word not in stopwords.words()]))
+    logger("name entities detection in progress........")
+    logger("Detecting Name Entities from : {} data files".format(data.shape[0]))
+
+    # Named entity recognition pipeline, default model selection
+    hugging_face_ner_detector = pipeline('ner',grouped_entities=True, framework = 'tf')
+
+    #Name entity recognition with light weight models
+    #tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english")
+    #model = TFAutoModel.from_pretrained("sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english")
+    #hugging_face_ner_detector = pipeline('ner', model= model, tokenizer=tokenizer, framework ='tf', grouped_entities=True)
+
+    data['ner'] = data['combined_text_for_ner'].apply(lambda x: hugging_face_ner_detector(x))
+    logger("NER detection status complete :)")
+    logger("Storing information in client object under key 'ner'")
+    self.models["ner"] = {
+        "model": hugging_face_ner_detector.model,
+        "tokenizer": hugging_face_ner_detector.tokenizer,
+        'name_entities': data['ner'].to_dict()}
+    logger("returning back a dictionary")
+    clearLog()
+    return data['ner'].to_dict()
diff --git a/tests/tests.py b/tests/tests.py
@@ -19,7 +19,7 @@ def compare(a, b):
 
 class TestQueries(unittest.TestCase):
 
-    newClient = client('tools/data/structured_data/housing.csv')
+    newClient = client('tools/data/nlp_data/housing.csv')
 
     """
     TEST QUERIES
@@ -100,13 +100,12 @@ def test_text_classification(self):
         x = client("tools/data/nlp_data/smallSentimentAnalysis.csv")
         x.text_classification_query("get captions", epochs=1)
 
-    # Test whether content based recommender works without error, and creates a key in models dictionary
+    # Tests whether name entity recognition query works without errors, and creates a key in models dictionary
     @ordered
-    def test_content_recommender(self):
-        x = client('tools/data/recommender_systems_data/disney_plus_shows.csv')
-        x.content_recommender_query()
-        assert('recommendations' in x.recommend('Coco'))
-
+    def test_get_ner(self):
+        self.newClient.get_ner(['Summary','Text'])
+        self.assertTrue('ner' in self.newClient.models)
+        del self.newClient.models['ner']
 
 
     """