Skip to content

Commit

Permalink
NER function added
Browse files Browse the repository at this point in the history
  • Loading branch information
Piyush1416 committed Jul 31, 2020
1 parent 81f991d commit d081540
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 88 deletions.
93 changes: 14 additions & 79 deletions libra/queries.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from libra.query.nlp_queries import (image_caption_query,
generate_caption, classify_text,
text_classification_query, get_summary,
summarization_query)
summarization_query,get_ner)
from libra.query.classification_models import (k_means_clustering,
train_svm, nearest_neighbors,
decision_tree, train_xgboost)
Expand All @@ -16,8 +16,6 @@
from libra.data_generation.dataset_labelmatcher import (get_similar_column,
get_similar_model)
from libra.plotting.generate_plots import analyze
from libra.query.recommender_systems import ContentBasedRecommender

from libra.dashboard.auto_eda import edaDashboard
from colorama import Fore, Style
import pandas as pd
Expand All @@ -29,9 +27,6 @@
import numpy as np
from tkinter import filedialog
from tkinter import *
from tensorflow.keras.preprocessing.image import img_to_array
import tensorflow as tf
from matplotlib import pyplot as plt

# suppressing warnings for cleaner dialogue box
warnings.simplefilter(action='error', category=FutureWarning)
Expand Down Expand Up @@ -134,15 +129,6 @@ def get_models(self, model_requested):
return get_similar_model(model_requested, self.models.keys())
clearLog()

# recommend items based on search criteria(for recommender systems only)

def recommend(self, search_term):
    """
    Recommend items for a search term (recommender systems only).

    :param search_term: value handed to the stored recommender's recommend() method.
    :return: whatever the stored 'content_recommender' model returns, or None
             when the latest model is not the content recommender.
    """
    # Guard clause: any other latest model yields no recommendation.
    if self.latest_model != 'content_recommender':
        return None

    recommender = self.models[self.latest_model]
    return recommender.recommend(search_term)

# param modelKey: string representation of the model to make prediction
# param data: dataframe version of desired prediction set
def predict(self, data, model=None):
Expand All @@ -160,7 +146,6 @@ def predict(self, data, model=None):
return predictions
else:
modeldict = self.models[model]

if modeldict.get('preprocesser'):
data = modeldict['preprocesser'].transform(data)
predictions = modeldict['model'].predict(data)
Expand Down Expand Up @@ -572,7 +557,6 @@ def decision_tree_query(
:return: a model and information to along with it stored in the self.models dictionary.
'''


self.models['decision_tree'] = decision_tree(
instruction=instruction,
text=text,
Expand All @@ -593,15 +577,6 @@ def decision_tree_query(
self.latest_model = 'decision_tree'
clearLog()

def content_recommender_query(self, feature_names=None, n_recommendations=10, indexer='title'):
    """
    Build a content-based recommender over the client's dataset and register it
    in self.models under the key 'content_recommender'.

    :param feature_names: list of feature column names forwarded to
                          ContentBasedRecommender (defaults to an empty list).
    :param n_recommendations: accepted for interface compatibility; it is not
                              used by this method.
    :param indexer: forwarded to ContentBasedRecommender as ``indexer``.
    :return: None; the recommender is stored on the client.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if feature_names is None:
        feature_names = []

    self.models['content_recommender'] = ContentBasedRecommender(
        data=self.dataset,
        feature_names=feature_names,
        indexer=indexer)

    self.latest_model = 'content_recommender'
    clearLog()

# query to create a xgboost model

def xgboost_query(self,
Expand Down Expand Up @@ -755,8 +730,7 @@ def convolutional_query(self,
pretrained=None,
epochs=10,
height=None,
width=None,
show_feature_map=False):
width=None):
'''
Calls the body of the convolutional neural network query which is located in the feedforward.py file
:param instruction: The objective that you want to model (str).
Expand All @@ -771,7 +745,6 @@ def convolutional_query(self,
:param epochs: Number of epochs (int).
:param height: Height of the input image (int).
:param width: Width of the input image (int).
:param show_feature_map: Displays feature map graphic (bool).
:return: an updated model and history stored in the models dictionary
Expand All @@ -794,55 +767,6 @@ def convolutional_query(self,
height=height,
width=width)

if show_feature_map:
model = self.models["convolutional_NN"]["model"]
X_test = self.models["convolutional_NN"]["data"]["test"]

# Get first image in test images and format it
img = X_test[0][0]
img /= 255
successive_outputs = [layer.output for layer in model.layers[1:]]
visualization_model = tf.keras.models.Model(inputs=model.input, outputs=successive_outputs)
successive_feature_maps = visualization_model.predict(img)

# Add main title to figure
firstPlot = True

# Include names of layers in plot
layer_names = [layer.name for layer in model.layers]
for layer_name, feature_map in zip(layer_names, successive_feature_maps):
if len(feature_map.shape) == 4:

# Plot Feature maps for the conv / maxpool layers, not the fully-connected layers
n_features = feature_map.shape[-1] # number of features in the feature map
height = feature_map.shape[1] # feature map shape (1, size, size, n_features)
width = feature_map.shape[2]
display_grid = np.zeros((height, width * n_features))

# Format features appropriately
for i in range(n_features):
img = feature_map[0, :, :, i]
img -= img.mean()
img /= img.std()
img *= 64
img += 128
img = np.clip(img, 0, 255).astype('uint8')

# Tile each filter into a horizontal grid
display_grid[:, i * width: (i + 1) * width] = img

# Display the grid
scale = 20. / n_features
plt.figure(figsize=(scale * n_features, scale))
if firstPlot:
plt.title(f'Network Visualization\n\n{layer_name}')
firstPlot = False
else:
plt.title(layer_name)
plt.grid(False)
plt.imshow(display_grid, aspect='auto', cmap='viridis')
plt.show()

self.latest_model = 'convolutional_NN'
clearLog()

Expand Down Expand Up @@ -1050,6 +974,15 @@ def image_caption_query(self, instruction, label_column=None,
self.latest_model = 'image_caption'
clearLog()

# named entity recognition (NER) query
def get_ner(self, target=None):
    """
    Identify named entities in the client's dataset using the huggingface framework.

    Thin wrapper: the work is done by the module-level get_ner imported from
    libra.query.nlp_queries, which receives this client as its first argument.

    :param target: list with target column names (if None all columns are used) for detection
    :return: dictionary object with detected name-entities
    """
    return get_ner(self, target=target)

# shows the names of plots associated with a specific model
def plot_names(self, model=None):
Expand Down Expand Up @@ -1169,4 +1102,6 @@ def analyze(self, model=None, save=True, save_model=False):

def dashboard(self):
    """
    Build an edaDashboard from the client's dataset and launch it.

    :return: None
    """
    dash = edaDashboard(self.dataset)
    # Launch exactly once; a repeated call would start the dashboard a second time.
    dash.dashboard()


51 changes: 49 additions & 2 deletions libra/query/nlp_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@

import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
from colorama import Fore, Style
from keras_preprocessing import sequence
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.callbacks import EarlyStopping
from transformers import TFT5ForConditionalGeneration, T5Tokenizer

from transformers import TFT5ForConditionalGeneration, T5Tokenizer, \
pipeline, AutoTokenizer, TFAutoModel
import libra.plotting.nonkeras_generate_plots
from libra.data_generation.dataset_labelmatcher import get_similar_column
from libra.data_generation.grammartree import get_value_instruction
Expand Down Expand Up @@ -751,3 +752,49 @@ def val_step(img_tensor, target):
}
clearLog()
return self.models["image_caption"]

# named entity recognition (NER) query
def get_ner(self, target=None):
    """
    Identify named entities in the client's dataset with a Hugging Face 'ner' pipeline.

    The selected columns are joined row-wise into a single text (separated by
    '.'), stopwords are removed, and the pipeline is applied to each row. The
    pipeline's model and tokenizer, plus the detected entities, are stored in
    self.models["ner"].

    :param target: list with target column names (if None or empty, all columns are used) for detection
    :return: dictionary object with detected name-entities (the 'ner' column converted with to_dict())
    :raises Exception: if target is not a list, or if any requested column is
                       not present in the dataset
    """
    data = DataReader(self.dataset).data_generator()
    available_columns = list(data.columns.values)

    if target is None or len(target) == 0:
        target = available_columns
        logger("data ready for processing")
    elif not isinstance(target, list):
        raise Exception("kindly pass target as a list")
    elif all(item in available_columns for item in target):
        # Every requested column must exist; a partial match would otherwise
        # slip through and fail later at data[target] with a raw KeyError.
        logger("target data ready for processing")
    else:
        raise Exception("kindly pass right column value in target or ignore the target attribute for auto selection")

    # Isolate target column data into one column (separated by '.') which will be used for detection.
    data['combined_text_for_ner'] = data[target].apply(
        lambda row: '.'.join(row.values.astype(str)), axis=1)

    # Remove stopwords from the detection column. The lookup is built once
    # instead of calling stopwords.words() again for every single word.
    stop_words = set(stopwords.words())
    data['combined_text_for_ner'] = data['combined_text_for_ner'].apply(
        lambda text: ' '.join(word for word in text.split() if word not in stop_words))
    logger("name entities detection in progress........")
    logger("Detecting Name Entities from : {} data files".format(data.shape[0]))

    # Named entity recognition pipeline, default model selection.
    # NOTE: a lighter-weight alternative mentioned by the original author is
    # "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english", loaded
    # via AutoTokenizer / TFAutoModel and passed to pipeline() as model/tokenizer.
    hugging_face_ner_detector = pipeline('ner', grouped_entities=True, framework='tf')

    data['ner'] = data['combined_text_for_ner'].apply(hugging_face_ner_detector)
    logger("NER detection status complete :)")
    logger("Storing information in client object under key 'ner'")
    self.models["ner"] = {
        "model": hugging_face_ner_detector.model,
        "tokenizer": hugging_face_ner_detector.tokenizer,
        'name_entities': data['ner'].to_dict()}
    logger("returning back a dictionary")
    clearLog()
    return data['ner'].to_dict()
13 changes: 6 additions & 7 deletions tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def compare(a, b):

class TestQueries(unittest.TestCase):

newClient = client('tools/data/structured_data/housing.csv')
newClient = client('tools/data/nlp_data/housing.csv')

"""
TEST QUERIES
Expand Down Expand Up @@ -100,13 +100,12 @@ def test_text_classification(self):
x = client("tools/data/nlp_data/smallSentimentAnalysis.csv")
x.text_classification_query("get captions", epochs=1)

# Test whether content based recommender works without error, and creates a key in models dictionary
# Tests whether the named entity recognition query works without errors, and creates a key in the models dictionary
@ordered
def test_content_recommender(self):
    """Content-based recommender query runs and recommend() yields a 'recommendations' entry."""
    recommender_client = client('tools/data/recommender_systems_data/disney_plus_shows.csv')
    recommender_client.content_recommender_query()
    result = recommender_client.recommend('Coco')
    assert 'recommendations' in result

def test_get_ner(self):
    """Named entity recognition query runs and registers a 'ner' key in the models dictionary."""
    shared_client = self.newClient
    shared_client.get_ner(['Summary', 'Text'])
    registered_models = shared_client.models
    self.assertTrue('ner' in registered_models)
    # newClient is a class-level attribute, so remove the entry this test added.
    del registered_models['ner']


"""
Expand Down

0 comments on commit d081540

Please sign in to comment.