-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Harsha H L
authored and
Harsha H L
committed
Jul 25, 2022
0 parents
commit 070b1a1
Showing
56 changed files
with
1,400 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
.DS_Store | ||
*.pyc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import nltk\n", | ||
"sentence = \"\"\"At eight o'clock on Thursday morning\n", | ||
"... Arthur didn't feel very good.\"\"\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"['At',\n", | ||
" 'eight',\n", | ||
" \"o'clock\",\n", | ||
" 'on',\n", | ||
" 'Thursday',\n", | ||
" 'morning',\n", | ||
" '...',\n", | ||
" 'Arthur',\n", | ||
" 'did',\n", | ||
" \"n't\",\n", | ||
" 'feel',\n", | ||
" 'very',\n", | ||
" 'good',\n", | ||
" '.']" | ||
] | ||
}, | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"tokens = nltk.word_tokenize(sentence)\n", | ||
"tokens" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"[('At', 'IN'),\n", | ||
" ('eight', 'CD'),\n", | ||
" (\"o'clock\", 'NN'),\n", | ||
" ('on', 'IN'),\n", | ||
" ('Thursday', 'NNP'),\n", | ||
" ('morning', 'NN')]" | ||
] | ||
}, | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"tagged = nltk.pos_tag(tokens)\n", | ||
"tagged[0:6]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from nltk.corpus import stopwords" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.12" | ||
}, | ||
"vscode": { | ||
"interpreter": { | ||
"hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" | ||
} | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# NLTK |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
# FLASK_APP=app.py FLASK_ENV=development flask run
import json

# The original had a redundant bare `from flask import Flask` followed by
# this combined import; the single combined import is sufficient.
from flask import Flask, flash, redirect, render_template, request, session, jsonify

import text_preprocessing

app = Flask(__name__)

# @app.route("/")
# def home():
#     return render_template("h.html")
|
||
|
||
@app.route("/experiment")
def experiment():
    """Render the salt-analysis experiment page.

    Reads the experiment paragraph from a static text file, asks
    text_preprocessing for the per-sentence object descriptions, and passes
    everything to the experimentPage.html template.
    """
    per_sentence = []  # one list of JSON strings per sentence
    check_box = []     # same lists, consumed by the template's checkboxes
    # Context manager closes the handle; the original leaked the open() result.
    with open("static/text/salt_analysis.txt", "r") as f:
        para = f.read()
    sentence_objects = text_preprocessing.main()  # per-sentence object dicts
    sentence_count = text_preprocessing.sen()     # number of sentences
    apparatus = text_preprocessing.apparatus()
    for i in range(sentence_count):
        objects = [json.dumps(obj) for obj in sentence_objects[i]]
        # The same list object is deliberately shared between both
        # collections, matching the original behaviour.
        per_sentence.append(objects)
        check_box.append(objects)
    return render_template("experimentPage.html", objs=per_sentence, para=para,
                           abc=check_box, instruments=apparatus)
|
||
|
||
@app.route("/experiment1")
def experiment1():
    """Render the basic-radical experiment page.

    Same flow as experiment(): load the paragraph, collect per-sentence
    object descriptions from text_preprocessing, render the template.
    """
    per_sentence = []  # one list of JSON strings per sentence
    check_box = []     # same lists, consumed by the template's checkboxes
    # Context manager closes the handle; the original leaked the open() result.
    with open("static/text/basic_radical.txt", "r") as f:
        para = f.read()
    sentence_objects = text_preprocessing.main1()  # per-sentence object dicts
    sentence_count = text_preprocessing.sen1()     # number of sentences
    apparatus = text_preprocessing.apparatus()
    for i in range(sentence_count):
        objects = [json.dumps(obj) for obj in sentence_objects[i]]
        # The same list object is deliberately shared between both
        # collections, matching the original behaviour.
        per_sentence.append(objects)
        check_box.append(objects)
    return render_template("experimentPage.html", objs=per_sentence, para=para,
                           abc=check_box, instruments=apparatus)
|
||
|
||
|
||
@app.route("/experiment2")
def experiment2():
    """Render the titration experiment page.

    Same flow as experiment(): load the paragraph, collect per-sentence
    object descriptions from text_preprocessing, render the template.
    """
    per_sentence = []  # one list of JSON strings per sentence
    check_box = []     # same lists, consumed by the template's checkboxes
    # Context manager closes the handle; the original leaked the open() result.
    with open("static/text/titration.txt", "r") as f:
        para = f.read()
    sentence_objects = text_preprocessing.main2()  # per-sentence object dicts
    sentence_count = text_preprocessing.sen2()     # number of sentences
    apparatus = text_preprocessing.apparatus()
    for i in range(sentence_count):
        objects = [json.dumps(obj) for obj in sentence_objects[i]]
        # The same list object is deliberately shared between both
        # collections, matching the original behaviour.
        per_sentence.append(objects)
        check_box.append(objects)
    return render_template("experimentPage.html", objs=per_sentence, para=para,
                           abc=check_box, instruments=apparatus)
|
||
# Start the Flask development server (not for production use) on port 5000.
if __name__=='__main__':
    app.run(debug=True,port=5000)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
<?xml version="1.0" encoding="utf-8" ?> | ||
|
||
|
||
<data> | ||
<noun> | ||
<obj name="beaker"> beaker </obj> | ||
<obj name="burette"> burette </obj> | ||
<obj name="pipette"> pipette </obj> | ||
<obj name="burner"> bunsenburner </obj> | ||
<obj name="roundbottomflask"> roundbottomflask </obj> | ||
<obj name="flask"> conicalflask </obj> | ||
<obj name="tripodstand"> tripodstand </obj> | ||
<obj name="wiregauze"> wiregauze </obj> | ||
<obj name="tube"> testtube </obj> | ||
<obj name="paper"> paper </obj> | ||
<obj name="precipitate">ppt </obj> | ||
<obj name="dish"> petri </obj> | ||
<obj name="container"> container </obj> | ||
<obj name="gas"> gas </obj> | ||
<obj name="ring"> ring </obj> | ||
<obj name="rod"> rod </obj> | ||
</noun> | ||
<position> | ||
<pos name= "under" x="500" y="-710"> under </pos> | ||
<pos name= "over" x="500" y="-450"> over </pos> | ||
<pos name= "above" x="500" y="-450"> above </pos> | ||
<pos name= "below" x="500" y="-710"> below </pos> | ||
<pos name= "near" x="500" y="-600"> near </pos> | ||
<pos name= "in" x="500" y="-600"> in </pos> | ||
<pos name= "inside" x="500" y="-600"> in </pos> | ||
<pos name= "on" x="500" y="-450"> on </pos> | ||
|
||
</position> | ||
|
||
<image> | ||
<img name="beaker" src="static/beaker.png"> beaker </img> | ||
<img name="burette" src="static/burette.png"> burette </img> | ||
<img name="pipette" src=""> pipette </img> | ||
<img name="burner" src="static/bunsen_burner.png"> bunsenburner </img> | ||
<img name="roundbottomflask" src="static/round_bottom_flask.png"> roundbottomflask </img> | ||
<img name="flask" src="static/conical.png"> conicalflask </img> | ||
<img name="tripodstand" src="static/tripod.png"> tripodstand </img> | ||
<img name="wiregauze" src=""> wiregauze </img> | ||
<img name="tube" src="static/fullTestTube.png"> testtube </img> | ||
<img name="precipitate" src="static/pptTestTube.png"> ppt </img> | ||
<img name="dish" src="static/dish.png"> petri </img> | ||
<img name="container" src="static/beaker.png"> container </img> | ||
<img name="gas" src="static/gas.png"> gas </img> | ||
<img name="ring" src="static/ring.png"> ring </img> | ||
<img name="rod" src="static/rod.png"> rod </img> | ||
</image> | ||
|
||
<verbs> | ||
<verb name="pour" deg="45" > pour </verb> | ||
<verb name="place" deg="0"> place </verb> | ||
<verb name="add"> add </verb> | ||
|
||
</verbs> | ||
<colours> | ||
<colour name="green" hex="#368370"> green </colour> | ||
<colour name="blue" hex="#151871"> blue </colour> | ||
<colour name="black" hex="#565656"> black </colour> | ||
<colour name="brown" hex="#5C4033"> brown </colour> | ||
<colour name="white" hex="#ffffff"> white </colour> | ||
<colour name="yellow" hex="#ffff00"> yellow </colour> | ||
<colour name="purple" hex="#A020F0"> purple </colour> | ||
|
||
</colours> | ||
|
||
</data> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import nltk | ||
#nltk.download('treebank') | ||
import pprint | ||
from sklearn.tree import DecisionTreeClassifier | ||
from sklearn.feature_extraction import DictVectorizer | ||
from sklearn.pipeline import Pipeline | ||
from nltk.tokenize import sent_tokenize, word_tokenize | ||
|
||
# Hand-tagged training data: two chemistry-experiment sentences as lists of
# (word, POS-tag) pairs.
tagged_sentences = [
    [('chocolate', 'NN'), ('brown', 'JJ'), ('precipitate', 'NN'), ('of', 'IN'),
     ('copper', 'NN'), ('ferrocyanide', 'NN'), ('is', 'VBZ'), ('formed', 'VBN'),
     ('in', 'IN'), ('test', 'NN'), ('tube', 'NN'), ('confirming', 'VBG'),
     ('the', 'DT'), ('presence', 'NN'), ('of', 'IN'), ('cu2', 'NN'),
     ('ions', 'NNS')],
    [('solution', 'NN'), ('in', 'IN'), ('test', 'NN'), ('tube', 'NN'),
     ('turns', 'VBZ'), ('green', 'JJ')],
]

print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
# The treebank corpus is used only for this informational print; the original
# crashed with a LookupError when the corpus had not been downloaded, so skip
# it gracefully instead of requiring an unused download.
try:
    print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))
except LookupError:
    print("Tagged words: (treebank corpus not downloaded)")
def features(sentence, index):
    """Build the feature dict for the word at *index* in *sentence*.

    *sentence* is a list of word strings; the returned dict feeds the
    DictVectorizer used by the POS classifier.
    """
    word = sentence[index]
    last = len(sentence) - 1
    return {
        'word': word,
        'is_first': index == 0,
        'is_last': index == last,
        # Capitalisation cues help distinguish proper nouns and acronyms.
        'is_capitalized': word[0].upper() == word[0],
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        # Short prefixes/suffixes capture morphology (e.g. "-ing", "-ed").
        'prefix-1': word[0],
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        # Neighbouring words give local context; empty string at boundaries.
        'prev_word': sentence[index - 1] if index else '',
        'next_word': sentence[index + 1] if index != last else '',
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        'capitals_inside': word[1:].lower() != word[1:],
    }
|
||
# pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2)) | ||
|
||
def untag(tagged_sentence):
    """Strip the tags from a list of (word, tag) pairs, keeping the words."""
    words = []
    for word, _tag in tagged_sentence:
        words.append(word)
    return words
|
||
def transform_to_dataset(tagged_sentences):
    """Flatten tagged sentences into parallel lists: per-word feature
    dicts (X) and their POS tags (y)."""
    X = []
    y = []
    for tagged in tagged_sentences:
        words = untag(tagged)
        for position, (_word, tag) in enumerate(tagged):
            X.append(features(words, position))
            y.append(tag)
    return X, y
|
||
# Split the dataset for training and testing
# With only 2 tagged sentences, cutoff == 1: one sentence trains the model
# and the other evaluates it.
cutoff = int(.75 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]

print( len(training_sentences))
print( len(test_sentences))

X, y = transform_to_dataset(training_sentences)

# Pipeline: vectorize the per-word feature dicts into dense arrays, then fit
# an entropy-criterion decision tree on them.
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

# NOTE(review): this fits on a single sample (X[:1]) although the trailing
# comment talks about 10K samples — presumably a leftover slice from a larger
# corpus; confirm whether clf.fit(X, y) was intended.
clf.fit(X[:1], y[:1]) # Use only the first 10K samples if you're running it multiple times. It takes a fair bit :)

print('Training completed')

X_test, y_test = transform_to_dataset(test_sentences)

# Mean per-word tagging accuracy on the held-out sentence(s).
print("Accuracy:", clf.score(X_test, y_test))
|
||
def pos_tag(sentence):
    """Tag each token of *sentence* (a list of word strings) with the
    module-level `clf` pipeline trained above.

    Returns the (sentence, tags) pair; `tags` is whatever clf.predict
    yields (one predicted tag per token).
    """
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    return sentence, tags

# Demo: tokenize a sample observation sentence and print its predicted tags.
print(pos_tag(word_tokenize('Chocolate brown precipitate of Copper ferrocyanide is formed in test tube confirming the presence of Cu2+ ions.')))
Empty file.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.