Skip to content

Commit

Permalink
modify tagging demo
Browse files Browse the repository at this point in the history
  • Loading branch information
JunKuang-algo committed Apr 23, 2018
1 parent 607cb3c commit f87b396
Show file tree
Hide file tree
Showing 9 changed files with 34,416 additions and 61 deletions.
2 changes: 1 addition & 1 deletion demo/Model/neo_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def getEntityRelationbyEntity(self,value):
answer = self.graph.data("MATCH (entity1) - [rel] -> (entity2) WHERE entity1.title = \"" +value +"\" RETURN rel,entity2")
return answer

#查找entity1及其对应的关系(与findRelationBetweentEntities的差别就是返回值不一样
#查找entity1及其对应的关系(与getEntityRelationbyEntity的差别就是返回值不一样)
def findRelationByEntity(self,entity1):
answer = self.graph.data("MATCH (n1:HudongItem {title:\""+entity1+"\"})- [rel] -> (n2) RETURN n1,rel,n2" )
if(len(answer) == 0):
Expand Down
15 changes: 4 additions & 11 deletions demo/demo/tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import json
file_path = os.path.abspath(os.path.join(os.getcwd(),".."))
sys.path.append(file_path)
print(sys.path)

from toolkit.pre_load import collection
from toolkit.pre_load import testDataCollection
Expand All @@ -22,21 +23,13 @@ def tagging(request):
# statement = request.POST.get("statement")
post = json.loads(request.body)
post_id = testDataCollection.insert_one(post)
collection.delete_many( {'Entity1':post.get('entity1') , 'Entity2':post.get('entity2'),'Relation':post.get('relation'),'Statement':post.get('statement')} )
collection.delete_many( {'entity1Pos':post.get('entity1Pos') , 'entity1':post.get('entity1') ,'entity2Pos':post.get('entity2Pos'),'entity2':post.get('entity2Pos'),'relation':post.get('relation'),'statement':post.get('statement')})
return JsonResponse({'code':200})
else:
while(True):
documents_count = collection.count()
rint = random.randint(0,documents_count-1)
result = collection.find_one(skip = rint )
if(len(result) == 5 ):

# #从测试集中选取一个句子和标签
# filePath = os.path.abspath(os.path.join(os.getcwd(),"../TrainDataBaseOnWiki/finalData/train_data.txt"))
# statement , entity1 ,entity2 ,relation = statementSelector()

# #如果标签是对的,则将这个样本写到训练集文件中(已标注)
# #如果标签是错的,则填写一个正确的标签
# #如果不知道该如何标注,换一个

print(result)
if(len(result) == 7 ):
return render(request,'taggingSentences.html',{"result": result})
23 changes: 17 additions & 6 deletions demo/templates/taggingSentences.html
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
<h1>Statement</h1>
<hr/>
</p>
<h3 id="statement"><span style="color: red"></span>{{result.Statement}}<span style="color: red"></span></h3>
<h4><span style="color: green"><strong>Entity1: <span id="entity1">{{ result.Entity1 }}</span></strong></span>&nbsp&nbsp&nbsp&nbsp<span style="color: orange"><strong>Relation: <span id="relation">{{result.Relation}}</span></strong></span>&nbsp&nbsp&nbsp&nbsp<span style="color: green"><strong>Entity2: <span id="entity2">{{ result.Entity2 }}</span></strong></span></h4>
<h3 id="statement">{{ result.statement }}</h3>
<h4><span style="color: green"><strong>Entity1: <span id="entity1">{{ result.entity1 }}</span></strong></span>&nbsp&nbsp&nbsp&nbsp<span style="color: orange"><strong>relation: <span id="relation">{{ result.relation }}</span></strong></span>&nbsp&nbsp&nbsp&nbsp<span style="color: green"><strong>Entity2: <span id="entity2">{{ result.entity2 }}</span></strong></span></h4>
</div>
</div>

Expand Down Expand Up @@ -187,10 +187,21 @@ <h5 class="modal-title" id="exampleModalLabel">Marked data count</h5>
let entity1 = document.querySelector('#entity1').innerText;
let entity2 = document.querySelector('#entity2').innerText;
let statementContext = statement.innerText;
let execEntity1 = new RegExp(entity1, 'g');
let execEntity2 = new RegExp(entity2, 'g');
statementContext = statementContext.replace(execEntity1, `<span style="color: red">${entity1}</span>`);
statementContext = statementContext.replace(execEntity2, `<span style="color: red">${entity2}</span>`);
let entity1Pos = parseInt( {{ result.entity1Pos|safe }} ) ;
let entity2Pos = parseInt( {{ result.entity2Pos|safe }} ) ;
if(entity1Pos > entity2Pos){
let tmp = entity1Pos ;
entity1Pos = entity2Pos ;
entity2Pos = tmp ;
tmp = entity1 ;
entity1 =entity2 ;
entity2 = tmp ;
}

// 从1到len-1,去掉双引号
statementContext = statementContext.substring(1,entity1Pos+1)+'<span style="color: red">'+entity1+'</span>'+statementContext.substring(entity1Pos+entity1.length+1,entity2Pos+1)+'<span style="color: red">'+entity2+'</span>'+statementContext.substring(entity2Pos+entity2.length+1,statementContext.length-1) ;
// statementContext = statementContext.replace(execEntity1, `<span style="color: red">${entity1}</span>`);
// statementContext = statementContext.replace(execEntity2, `<span style="color: red">${entity2}</span>`);
statement.innerHTML = statementContext;

let other = document.querySelector('#Otherrelation');
Expand Down
90 changes: 58 additions & 32 deletions wikidataSpider/TrainDataBaseOnWiki/dataScrubbing.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def handleError(self):
os.remove(readFilePath)
os.rename(writeFilePath,readFilePath)
#选择和农业有关的训练集,选择关系"instance of" "taxon rank" "subclass of" "parent taxon"
def selectAgricultureData(self):
def selectAgricultureData(self,filename):
#预加载实体列表(挑选出如下类别的实体: 5:Animal,6:Plant,7:Chemicals,9:Food items,10:Diseases,12:Nutrients,13:Biochemistry.14:Agricultural implements,15:Technology )
entityFilePath = os.path.abspath(os.path.join(self.pythonFilePath,"../wikientities/predict_labels.txt"))
entitySet = set()
Expand All @@ -97,48 +97,74 @@ def selectAgricultureData(self):
or entityNumber == "14" or entityNumber == "15"):
entitySet.add(entity)

# 对当前目录下的所有文件进行操作
# for root,dirs,files in os.walk(self.pythonFilePath):
# for file in files:
# if(file[-3:]!="txt" or file == "fileReaded.txt" or file == "entityrelation.txt" or file =="entitySet.txt"):
# continue
# count = 0
# #print(file)
# readFilePath = os.path.abspath(os.path.join(self.pythonFilePath,file))
# writeFilePath = os.path.abspath(os.path.join(self.pythonFilePath,file+"(2)"))
# with open(readFilePath,'r') as fr:
# with open(writeFilePath,'w') as fw:
# for line in fr:
# count+=1
# # if(count%1000):
# # print(count)
# triplet = line.strip().split('\t')
# if(len(triplet) == 4):
# entity1 = triplet[0]
# entity2 = triplet[1]
# statement = triplet[2]
# relation = triplet[3]
# if((relation == "instance of" or relation == "taxon rank" or relation == "subclass of" or relation =="parent taxon" ) and ((entity1 in entitySet) or \
# (entity2 in entitySet))):
# fw.write(entity1+'\t'+entity2+'\t'+statement+'\t'+relation+'\n')
# os.remove(readFilePath)
# os.rename(writeFilePath,readFilePath)

for root,dirs,files in os.walk(self.pythonFilePath):
for file in files:
if(file[-3:]!="txt" or file == "fileReaded.txt" or file == "entityrelation.txt" or file =="entitySet.txt"):
continue
count = 0
#print(file)
readFilePath = os.path.abspath(os.path.join(self.pythonFilePath,file))
writeFilePath = os.path.abspath(os.path.join(self.pythonFilePath,file+"(2)"))
with open(readFilePath,'r') as fr:
with open(writeFilePath,'w') as fw:
for line in fr:
count+=1
# if(count%1000):
# print(count)
triplet = line.strip().split('\t')
if(len(triplet) == 4):
entity1 = triplet[0]
entity2 = triplet[1]
statement = triplet[2]
relation = triplet[3]
if((relation == "instance of" or relation == "taxon rank" or relation == "subclass of" or relation =="parent taxon" ) and ((entity1 in entitySet) or \
(entity2 in entitySet))):
fw.write(entity1+'\t'+entity2+'\t'+statement+'\t'+relation+'\n')
os.remove(readFilePath)
os.rename(writeFilePath,readFilePath)
#对指定的文件进行操作
readFilePath = os.path.abspath(os.path.join(self.pythonFilePath,filename))
writeFilePath = os.path.abspath(os.path.join(self.pythonFilePath,filename+"(2)"))
count = 0
with open(readFilePath,'r') as fr:
with open(writeFilePath,'w') as fw:
for line in fr:
count += 1
triplet = line.strip().split('\t')
if(len(triplet) == 6):
entity1Pos = triplet[0]
entity1 = triplet[1]
entity2Pos = triplet[2]
entity2 = triplet[3]
statement = triplet[4]
relation = triplet[5]
if( (relation == "instance of" or relation == "taxon rank" or relation == "subclass of" or relation == "parent taxon") and ((entity1 in entitySet) or \
(entity2 in entitySet)) ):
fw.write(entity1Pos+"\t"+entity1+"\t"+entity2Pos+"\t"+entity2+"\t"+"\""+statement+"\""+"\t"+relation+"\n")

if __name__ == "__main__":
if(len(sys.argv) == 1):
print("Missing parameters: ")
print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData\" to selecgt agricultural data ")
elif(len(sys.argv) > 2):
print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData filename\" to select agricultural data ")
elif(len(sys.argv) > 3):
print("Too many parameters: ")
print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData\" to selecgt agricultural data ")
print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData filename\" to select agricultural data ")
else:
dataScrubbing = DataScrubbing()
if(sys.argv[1] == "handleError"):
if(sys.argv[1] == "handleError" and len(sys.argv)>2):
print("parameter error!")
print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData filename\" to select agricultural data ")
elif(sys.argv[1] == "handleError"):
dataScrubbing.handleError()
elif(sys.argv[1] == "selectAgriculturalData" and len(sys.argv)<3):
print("parameter error!")
print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData filename\" to select agricultural data ")
elif(sys.argv[1] == "selectAgriculturalData"):
dataScrubbing.selectAgricultureData()
dataScrubbing.selectAgricultureData(sys.argv[2])
else:
print("Parameter error: no such parameter")
print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData\" to selecgt agricultural data ")
print("Please use \"python dataScrubbing.py handleError\" to solve error or use \"python dataScrubbing.py selectAgriculturalData\" to select agricultural data ")


31 changes: 20 additions & 11 deletions wikidataSpider/TrainDataBaseOnWiki/extractTrainingData.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,10 @@ def CutStatements(line):
fileReadedList.append(line.strip())
print(line.strip())
#递归遍历语料库文件夹
with open("train_data6.txt",'w') as fw:
with open("train_data4.txt",'w') as fw:
with open("fileReaded.txt","a") as filereaded:
fw.write('entity1\tentity2\tstatement\trelation\n')
#存储已经搜索过的实体,避免重复查询
entityRelationDict = {}
fw.write('entity1Pos\tentity1\tentity2Pos\tentity2\tstatement\trelation\n')

for root,dirs,files in os.walk(corpusPath):
for file in files:
filePath = os.path.join(root,file)
Expand All @@ -60,18 +59,26 @@ def CutStatements(line):
#分词
cutResult = get_NE(statement.strip())
#得到每句话的实体列表后,两两匹配查询是否具有某种关系,如果有的话就写到文件中
#entityList 存储实体列表和实体出现的位置,entity1存储实体名称,entity1Index存储实体位置
entityList = []
nowIndex = -1
for word in cutResult:
if(word[1]!=0 and not temporaryok(word[1])):
entityList.append(word[0])
entity1Index = statement.index(word[0],nowIndex+1)
entityList.append({'entity1':word[0],'entity1Index':entity1Index})
nowIndex = entity1Index+len(word[0])-1

entityNumber = len(entityList)
for i in range(entityNumber):
answer = None
answer = entityRelationDict.get(entityList[i])
if(entityRelationDict.get(entityList[i]) is None):
answer = db.findRelationBetweenEntities(entityList[i])
entityRelationDict[entityList[i]] = answer
#answer = entityRelationDict.get(entityList[i].get('entity1'))
#if(entityRelationDict.get(entityList[i].get('entity1')) is None):
answer = db.findRelationBetweenEntities(entityList[i].get('entity1'))
#entityRelationDict[entityList[i].get('entity1')] = answer
for relation in answer:
#对neo4j的返回值进行处理,原来的返回值中包含一些没用的字符,最终得到的关系是rel,实体是entity2
if(len(str(relation['rel']).split("\"")) < 2):
continue
rel = str(relation['rel']).split("\"")[1]
n2 = str(relation['n2'])
index = n2.find('title')
Expand All @@ -86,8 +93,10 @@ def CutStatements(line):
entity2 = entity2+n2[index]
index += 1
#与entity1相关联的实体也出现在同一句话中,则可以制造一条训练样本
if(entity2 in entityList):
fw.write(entityList[i]+'\t'+entity2+'\t'+statement.strip()+'\t'+rel+'\n')
nowIndex = -1 ;
for item in entityList:
if(entity2 == item.get('entity1') and item.get('entity1Index') != entityList[i].get('entity1Index')):
fw.write(str(entityList[i].get('entity1Index'))+'\t'+entityList[i].get('entity1')+'\t'+str(item.get('entity1Index') )+'\t'+entity2+'\t'+statement.strip()+'\t'+rel+'\n')

filereaded.write(filePath+'\n')

Expand Down
1 change: 1 addition & 0 deletions wikidataSpider/TrainDataBaseOnWiki/finalData/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
使用 `python texToJson.py <源文件> <结果文件名>` 将 txt 文件转成 json
Loading

0 comments on commit f87b396

Please sign in to comment.