modify tagging demo

jewelcai · Apr 23, 2018 · f87b396 · f87b396
1 parent 607cb3c
commit f87b396
Show file tree

Hide file tree

Showing 9 changed files with 34,416 additions and 61 deletions.
diff --git a/demo/Model/neo_models.py b/demo/Model/neo_models.py
@@ -22,7 +22,7 @@ def getEntityRelationbyEntity(self,value):
 		answer = self.graph.data("MATCH (entity1) - [rel] -> (entity2)  WHERE entity1.title = \"" +value +"\" RETURN rel,entity2")
 		return answer
 
-	#查找entity1及其对应的关系（与findRelationBetweentEntities的差别就是返回值不一样）
+	#查找entity1及其对应的关系（与getEntityRelationbyEntity的差别就是返回值不一样）
 	def findRelationByEntity(self,entity1):
 		answer = self.graph.data("MATCH (n1:HudongItem {title:\""+entity1+"\"})- [rel] -> (n2) RETURN n1,rel,n2" )
 		if(len(answer) == 0):

diff --git a/demo/demo/tagging.py b/demo/demo/tagging.py
@@ -10,6 +10,7 @@
 import json
 file_path = os.path.abspath(os.path.join(os.getcwd(),".."))
 sys.path.append(file_path)
+print(sys.path)
 
 from toolkit.pre_load import collection
 from toolkit.pre_load import testDataCollection
@@ -22,21 +23,13 @@ def tagging(request):
 		# statement = request.POST.get("statement")
 		post = json.loads(request.body)
 		post_id = testDataCollection.insert_one(post)
-		collection.delete_many( {'Entity1':post.get('entity1') , 'Entity2':post.get('entity2'),'Relation':post.get('relation'),'Statement':post.get('statement')} )
+		collection.delete_many( {'entity1Pos':post.get('entity1Pos') , 'entity1':post.get('entity1') ,'entity2Pos':post.get('entity2Pos'),'entity2':post.get('entity2'),'relation':post.get('relation'),'statement':post.get('statement')})  # match on all six posted fields; 'entity2' must be compared with the posted entity2, not entity2Pos
 		return JsonResponse({'code':200})
 	else:
 		while(True):
 			documents_count = collection.count()
 			rint = random.randint(0,documents_count-1)
 			result = collection.find_one(skip = rint )
-			if(len(result) == 5 ):
-
-				# #从测试集中选取一个句子和标签
-				# filePath = os.path.abspath(os.path.join(os.getcwd(),"../TrainDataBaseOnWiki/finalData/train_data.txt"))
-				# statement , entity1 ,entity2 ,relation = statementSelector()
-
-				# #如果标签是对的，则将这个样本写到训练集文件中(已标注)
-				# #如果标签是错的，则填写一个正确的标签
-				# #如果不知道该如何标注，换一个
-
+			print(result)
+			if(len(result) == 7 ):
 				return render(request,'taggingSentences.html',{"result": result})
diff --git a/demo/templates/taggingSentences.html b/demo/templates/taggingSentences.html
@@ -12,8 +12,8 @@
                         <h1>Statement</h1>
                         <hr/>
                     </p>
-                    <h3 id="statement"><span style="color: red"></span>{{result.Statement}}<span style="color: red"></span></h3>
-                    <h4><span style="color: green"><strong>Entity1: <span id="entity1">{{ result.Entity1 }}</span></strong></span>&nbsp&nbsp&nbsp&nbsp<span style="color: orange"><strong>Relation: <span id="relation">{{result.Relation}}</span></strong></span>&nbsp&nbsp&nbsp&nbsp<span style="color: green"><strong>Entity2: <span id="entity2">{{ result.Entity2 }}</span></strong></span></h4>
+                    <h3 id="statement">{{ result.statement }}</h3>
+                    <h4><span style="color: green"><strong>Entity1: <span id="entity1">{{ result.entity1 }}</span></strong></span>&nbsp&nbsp&nbsp&nbsp<span style="color: orange"><strong>relation: <span id="relation">{{ result.relation }}</span></strong></span>&nbsp&nbsp&nbsp&nbsp<span style="color: green"><strong>Entity2: <span id="entity2">{{ result.entity2 }}</span></strong></span></h4>
                 </div>
             </div>
 
@@ -187,10 +187,21 @@ <h5 class="modal-title" id="exampleModalLabel">Marked data count</h5>
 	let entity1 = document.querySelector('#entity1').innerText;
 	let entity2 = document.querySelector('#entity2').innerText;
 	let statementContext = statement.innerText;
-	let execEntity1 = new RegExp(entity1, 'g');
-	let execEntity2 = new RegExp(entity2, 'g');
-	statementContext = statementContext.replace(execEntity1, `<span style="color: red">${entity1}</span>`);
-	statementContext = statementContext.replace(execEntity2, `<span style="color: red">${entity2}</span>`);
+	let entity1Pos = parseInt( {{ result.entity1Pos|safe }} ) ;
+	let entity2Pos = parseInt( {{ result.entity2Pos|safe }} ) ;
+	if(entity1Pos > entity2Pos){
+		let tmp  = entity1Pos ;
+		entity1Pos = entity2Pos ;
+		entity2Pos = tmp ;
+		tmp = entity1 ;
+		entity1  =entity2 ;
+		entity2 = tmp ;
+	}
+
+	// 从１到len-1，去掉双引号
+	statementContext = statementContext.substring(1,entity1Pos+1)+'<span style="color: red">'+entity1+'</span>'+statementContext.substring(entity1Pos+entity1.length+1,entity2Pos+1)+'<span style="color: red">'+entity2+'</span>'+statementContext.substring(entity2Pos+entity2.length+1,statementContext.length-1)	;
+	// statementContext = statementContext.replace(execEntity1, `<span style="color: red">${entity1}</span>`);
+	// statementContext = statementContext.replace(execEntity2, `<span style="color: red">${entity2}</span>`);
 	statement.innerHTML = statementContext;
 
 	let other = document.querySelector('#Otherrelation');

diff --git a/wikidataSpider/TrainDataBaseOnWiki/dataScrubbing.py b/wikidataSpider/TrainDataBaseOnWiki/dataScrubbing.py
@@ -74,7 +74,7 @@ def handleError(self):
 				os.remove(readFilePath)
 				os.rename(writeFilePath,readFilePath)
     #选择和农业有关的训练集,选择关系"instance of" "taxon rank" "subclass of" "parent taxon"
-	def selectAgricultureData(self):
+	def selectAgricultureData(self,filename):
 		#预加载实体列表(挑选出如下类别的实体: 5:Animal,6:Plant,7:Chemicals,9:Food items,10:Diseases,12:Nutrients,13:Biochemistry.14:Agricultural implements,15:Technology )
 		entityFilePath = os.path.abspath(os.path.join(self.pythonFilePath,"../wikientities/predict_labels.txt"))
 		entitySet = set()
@@ -97,48 +97,74 @@ def selectAgricultureData(self):
 					or entityNumber == "14" or entityNumber == "15"):
 					entitySet.add(entity)
 
+		# 对当前目录下的所有文件进行操作
+		# for root,dirs,files in os.walk(self.pythonFilePath):
+		# 	for file in files:
+		# 		if(file[-3:]!="txt" or file == "fileReaded.txt" or file == "entityrelation.txt" or file =="entitySet.txt"):
+		# 			continue
+		# 		count = 0
+		# 		#print(file)
+		# 		readFilePath = os.path.abspath(os.path.join(self.pythonFilePath,file))
+		# 		writeFilePath = os.path.abspath(os.path.join(self.pythonFilePath,file+"(2)"))
+		# 		with open(readFilePath,'r') as fr:
+		# 			with open(writeFilePath,'w') as fw:
+		# 				for line in fr:
+		# 					count+=1
+		# 					# if(count%1000):
+		# 					# 	print(count)
+		# 					triplet = line.strip().split('\t')
+		# 					if(len(triplet) == 4):
+		# 						entity1 = triplet[0]
+		# 						entity2 = triplet[1]
+		# 						statement = triplet[2]
+		# 						relation = triplet[3]
+		# 						if((relation == "instance of" or relation == "taxon rank" or relation == "subclass of" or relation =="parent taxon" ) and ((entity1 in entitySet) or \
+		# 						 (entity2 in entitySet))):
+		# 							fw.write(entity1+'\t'+entity2+'\t'+statement+'\t'+relation+'\n')
+		# 		os.remove(readFilePath)
+		# 		os.rename(writeFilePath,readFilePath)
 
-		for root,dirs,files in os.walk(self.pythonFilePath):
-			for file in files:
-				if(file[-3:]!="txt" or file == "fileReaded.txt" or file == "entityrelation.txt" or file =="entitySet.txt"):
-					continue
-				count = 0
-				#print(file)
-				readFilePath = os.path.abspath(os.path.join(self.pythonFilePath,file))
-				writeFilePath = os.path.abspath(os.path.join(self.pythonFilePath,file+"(2)"))
-				with open(readFilePath,'r') as fr:
-					with open(writeFilePath,'w') as fw:
-						for line in fr:
-							count+=1
-							# if(count%1000):
-							# 	print(count)
-							triplet = line.strip().split('\t')
-							if(len(triplet) == 4):
-								entity1 = triplet[0]
-								entity2 = triplet[1]
-								statement = triplet[2]
-								relation = triplet[3]
-								if((relation == "instance of" or relation == "taxon rank" or relation == "subclass of" or relation =="parent taxon" ) and ((entity1 in entitySet) or \
-								 (entity2 in entitySet))):
-									fw.write(entity1+'\t'+entity2+'\t'+statement+'\t'+relation+'\n')
-				os.remove(readFilePath)
-				os.rename(writeFilePath,readFilePath)
+		#对指定的文件进行操作
+		readFilePath = os.path.abspath(os.path.join(self.pythonFilePath,filename))
+		writeFilePath = os.path.abspath(os.path.join(self.pythonFilePath,filename+"(2)"))
+		count = 0 
+		with open(readFilePath,'r') as fr:
+			with open(writeFilePath,'w') as fw:
+				for line in fr:
+					count += 1
+					triplet = line.strip().split('\t') 
+					if(len(triplet) == 6):
+						entity1Pos = triplet[0]
+						entity1 = triplet[1]
+						entity2Pos = triplet[2]
+						entity2 = triplet[3] 
+						statement = triplet[4]
+						relation = triplet[5]
+						if( (relation == "instance of" or relation == "taxon rank" or relation == "subclass of" or relation == "parent taxon") and ((entity1 in entitySet) or \
+							(entity2 in entitySet)) ):
+							fw.write(entity1Pos+"\t"+entity1+"\t"+entity2Pos+"\t"+entity2+"\t"+"\""+statement+"\""+"\t"+relation+"\n")
 
 if __name__ == "__main__":
 	if(len(sys.argv) == 1):
 		print("Missing parameters:  ")
-		print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData\" to selecgt agricultural data ")
-	elif(len(sys.argv) > 2):
+		print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData filename\" to select agricultural data ")
+	elif(len(sys.argv) > 3):  # at most: script name, command, filename
 		print("Too many parameters: ")
-		print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData\" to selecgt agricultural data ")
+		print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData filename\" to select agricultural data ")
 	else:
 		dataScrubbing = DataScrubbing()
-		if(sys.argv[1] == "handleError"):
+		if(sys.argv[1] == "handleError" and len(sys.argv)>2):  # handleError accepts no extra argument
+			print("parameter error!")
+			print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData filename\" to select agricultural data ")
+		elif(sys.argv[1] == "handleError"):
 			dataScrubbing.handleError()
+		elif(sys.argv[1] == "selectAgriculturalData" and len(sys.argv)<3):  # selectAgriculturalData requires a filename
+			print("parameter error!")
+			print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData filename\" to select agricultural data ")
 		elif(sys.argv[1] == "selectAgriculturalData"):
-			dataScrubbing.selectAgricultureData()
+			dataScrubbing.selectAgricultureData(sys.argv[2])
 		else:
 			print("Parameter error: no such parameter")
-			print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData\" to selecgt agricultural data ")
+			print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData filename\" to select agricultural data ")
 
 
diff --git a/wikidataSpider/TrainDataBaseOnWiki/extractTrainingData.py b/wikidataSpider/TrainDataBaseOnWiki/extractTrainingData.py
@@ -34,11 +34,10 @@ def CutStatements(line):
 		fileReadedList.append(line.strip())
 		print(line.strip())
 #递归遍历语料库文件夹
-with open("train_data6.txt",'w') as fw:
+with open("train_data4.txt",'w') as fw:
 	with open("fileReaded.txt","a") as filereaded:
-		fw.write('entity1\tentity2\tstatement\trelation\n')
-		#存储已经搜索过的实体，避免重复查询
-		entityRelationDict = {}
+		fw.write('entity1Pos\tentity1\tentity2Pos\tentity2\tstatement\trelation\n')
+
 		for root,dirs,files in os.walk(corpusPath):			
 			for file in files:
 				filePath = os.path.join(root,file)
@@ -60,18 +59,26 @@ def CutStatements(line):
 								#分词
 								cutResult = get_NE(statement.strip())
 								#得到每句话的实体列表后，两两匹配查询是否具有某种关系,如果有的话就写到文件中
+								#entityList 存储实体列表和实体出现的位置,entity1存储实体名称，entity1Index存储实体位置
 								entityList = []
+								nowIndex = -1
 								for word in cutResult:
 									if(word[1]!=0 and not temporaryok(word[1])):
-										entityList.append(word[0])
+										entity1Index = statement.index(word[0],nowIndex+1)
+										entityList.append({'entity1':word[0],'entity1Index':entity1Index})
+										nowIndex = entity1Index+len(word[0])-1
+
 								entityNumber = len(entityList)
 								for i in range(entityNumber):
 									answer = None
-									answer = entityRelationDict.get(entityList[i])
-									if(entityRelationDict.get(entityList[i]) is None):
-										answer = db.findRelationBetweenEntities(entityList[i])
-										entityRelationDict[entityList[i]] = answer
+									#answer = entityRelationDict.get(entityList[i].get('entity1'))
+									#if(entityRelationDict.get(entityList[i].get('entity1')) is None):
+									answer = db.findRelationBetweenEntities(entityList[i].get('entity1'))
+										#entityRelationDict[entityList[i].get('entity1')] = answer
 									for relation in answer:
+										#对neo4j的返回值进行处理，原来的返回值中包含一些没用的字符，最终得到的关系是rel,实体是entity2
+										if(len(str(relation['rel']).split("\"")) < 2):
+											continue
 										rel = str(relation['rel']).split("\"")[1]
 										n2 = str(relation['n2'])
 										index = n2.find('title')
@@ -86,8 +93,10 @@ def CutStatements(line):
 												entity2 = entity2+n2[index]
 											index += 1
 										#与entity1相关联的实体也出现在同一句话中，则可以制造一条训练样本
-										if(entity2 in entityList):
-											fw.write(entityList[i]+'\t'+entity2+'\t'+statement.strip()+'\t'+rel+'\n')
+										nowIndex = -1 ;
+										for item in entityList:
+											if(entity2 == item.get('entity1') and item.get('entity1Index') != entityList[i].get('entity1Index')):
+												fw.write(str(entityList[i].get('entity1Index'))+'\t'+entityList[i].get('entity1')+'\t'+str(item.get('entity1Index') )+'\t'+entity2+'\t'+statement.strip()+'\t'+rel+'\n')
 
 					filereaded.write(filePath+'\n')
 

diff --git a/wikidataSpider/TrainDataBaseOnWiki/finalData/readme.md b/wikidataSpider/TrainDataBaseOnWiki/finalData/readme.md
@@ -0,0 +1 @@
+使用python texToJson.py 源文件 结果文件名来将txt转成json
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		使用python texToJson.py 源文件 结果文件名来将txt转成json