Implement helper function CoNLLSentence.from_str
hankcs committed Jan 29, 2020
1 parent c0e9585 commit 7cc641d
Showing 6 changed files with 69 additions and 37 deletions.
35 changes: 12 additions & 23 deletions README.md
@@ -139,16 +139,11 @@ Parsers take both tokens and part-of-speech tags as input. The output is a tree

```python
>>> syntactic_parser = hanlp.load(hanlp.pretrained.dep.CTB7_BIAFFINE_DEP_ZH)
>>> print(syntactic_parser([('中国', 'NR'),('批准', 'VV'),('设立', 'VV'),('了', 'AS'),('三十万', 'CD'),('家', 'M'),('外商', 'NN'),('投资', 'NN'), ('企业', 'NN')]))
1 中国 _ NR _ _ 2 nsubj _ _
2 批准 _ VV _ _ 0 root _ _
3 设立 _ VV _ _ 2 ccomp _ _
4 了 _ AS _ _ 3 asp _ _
5 三十万 _ CD _ _ 6 nummod _ _
6 家 _ M _ _ 9 clf _ _
7 外商 _ NN _ _ 9 nn _ _
8 投资 _ NN _ _ 9 nn _ _
9 企业 _ NN _ _ 3 dobj _ _
>>> print(syntactic_parser([('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]))
1 蜡烛 _ NN _ _ 4 nsubj _ _
2 两 _ CD _ _ 3 nummod _ _
3 头 _ NN _ _ 4 dep _ _
4 烧 _ VV _ _ 0 root _ _
```
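
If you want the fields programmatically rather than as printed CoNLL text, here is a minimal sketch; the attribute names (`.form`, `.head`, `.deprel`) are assumed from the `CoNLLWord` fields used elsewhere in this commit, not a documented API:

```python
# Sketch only: iterate the returned CoNLLSentence and read per-word fields.
# .form / .head / .deprel are assumed attribute names.
tree = syntactic_parser([('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')])
for word in tree:
    print(word.form, word.head, word.deprel)  # e.g. 蜡烛 4 nsubj
```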

### Semantic Dependency Parsing
@@ -175,21 +170,15 @@ HanLP implements the biaffine[^biaffine] model which delivers the SOTA performance

```python
>>> semantic_parser = hanlp.load(SEMEVAL16_NEWS_BIAFFINE_ZH)
>>> print(semantic_parser([('中国', 'NR'),('批准', 'VV'),('设立', 'VV'),('了', 'AS'),('三十万', 'CD'),('家', 'M'),('外商', 'NN'),('投资', 'NN'), ('企业', 'NN')]))
1 中国 _ NR _ _ 2 Agt _ _
1 中国 _ NR _ _ 3 Agt _ _
2 批准 _ VV _ _ 0 Root _ _
3 设立 _ VV _ _ 2 eProg _ _
4 了 _ AS _ _ 3 mTime _ _
5 三十万 _ CD _ _ 6 Quan _ _
6 家 _ M _ _ 9 Qp _ _
7 外商 _ NN _ _ 8 Agt _ _
8 投资 _ NN _ _ 9 rDatv _ _
9 企业 _ NN _ _ 2 Pat _ _
9 企业 _ NN _ _ 3 Prod _ _
>>> print(semantic_parser([('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]))
1 蜡烛 _ NN _ _ 3 Poss _ _
1 蜡烛 _ NN _ _ 4 Pat _ _
2 两 _ CD _ _ 3 Quan _ _
3 头 _ NN _ _ 4 Loc _ _
4 烧 _ VV _ _ 0 Root _ _
```

The output is a `CoNLLSentence` too. However, it's not a tree but a graph in which one node can have multiple heads, e.g. `中国` has two heads (ID 2 and 3).
The output is a `CoNLLSentence` too. However, it's not a tree but a graph in which one node can have multiple heads, e.g. `蜡烛` has two heads (ID 3 and 4).
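
Iterating the graph works the same way as for the tree above; a minimal sketch, assuming a multi-headed word exposes its heads and relations as lists (mirroring `CoNLLSentence.from_str` introduced in this commit):

```python
# Sketch only: in the SDP graph a word may carry several heads.
# Storing them as lists is an assumption, mirroring CoNLLSentence.from_str.
graph = semantic_parser([('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')])
for word in graph:
    print(word.form, word.head, word.deprel)  # 蜡烛 would show heads 3 and 4
```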

### Pipelines

37 changes: 36 additions & 1 deletion hanlp/components/parsers/conll.py
@@ -2,7 +2,7 @@
# Author: hankcs
# Date: 2019-12-26 15:37
from collections import Counter
from typing import Generator, Tuple, Union, Iterable, Any
from typing import Generator, Tuple, Union, Iterable, Any, List

import tensorflow as tf
import numpy as np
@@ -93,6 +93,41 @@ def __init__(self, words=None):
    def __str__(self):
        return '\n'.join([word.__str__() for word in self])

    @staticmethod
    def from_str(conll: str):
        """
        Build a CoNLLSentence from a CoNLL-X format string.

        Parameters
        ----------
        conll : str
            CoNLL-X format string

        Returns
        -------
        CoNLLSentence
        """
        words: List[CoNLLWord] = []
        prev_id = None
        for line in conll.strip().split('\n'):
            if line.startswith('#'):
                continue
            cells = line.split()
            cells[0] = int(cells[0])  # ID
            cells[6] = int(cells[6])  # HEAD
            if cells[0] != prev_id:
                words.append(CoNLLWord(*cells))
            else:
                # Same ID as the previous line: a word with multiple heads
                # (semantic dependency graph). Collect heads and relations
                # into lists on the already-created word.
                if isinstance(words[-1].head, list):
                    words[-1].head.append(cells[6])
                    words[-1].deprel.append(cells[7])
                else:
                    words[-1].head = [words[-1].head] + [cells[6]]
                    words[-1].deprel = [words[-1].deprel] + [cells[7]]
            prev_id = cells[0]
        return CoNLLSentence(words)


def read_conll(filepath):
    sent = []
2 changes: 1 addition & 1 deletion hanlp/version.py
@@ -2,4 +2,4 @@
# Author: hankcs
# Date: 2019-12-28 19:26

__version__ = '2.0.0-alpha.32'
__version__ = '2.0.0-alpha.33'
16 changes: 16 additions & 0 deletions tests/debug/debug_conll_sent.py
@@ -0,0 +1,16 @@
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-29 16:55
from hanlp.components.parsers.conll import CoNLLSentence

conll = '''\
1 蜡烛 蜡烛 NN NN _ 3 Poss _ _
1 蜡烛 蜡烛 NN NN _ 4 Pat _ _
2 两 两 CD CD _ 3 Quan _ _
3 头 头 NN NN _ 4 Loc _ _
4 烧 烧 VV VV _ 0 Root _ _
'''

sent = CoNLLSentence.from_str(conll)
print(sent)
print([(x.form, x.pos) for x in sent])
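
Given how `from_str` merges repeated IDs, the two ID `1` rows above should end up as a single word whose head and relation are lists; a quick sanity check (assuming the `.head` and `.deprel` attribute names used in `from_str`):

```python
first = list(sent)[0]                   # the word with ID 1 (蜡烛)
assert first.head == [3, 4]             # heads from both rows
assert first.deprel == ['Poss', 'Pat']  # relations from both rows
```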
4 changes: 2 additions & 2 deletions tests/demo/zh/demo_dep.py
@@ -4,6 +4,6 @@
import hanlp

syntactic_parser = hanlp.load(hanlp.pretrained.dep.CTB7_BIAFFINE_DEP_ZH)
sent = [('中国', 'NR'),('批准', 'VV'),('设立', 'VV'),('了', 'AS'),('三十万', 'CD'),('家', 'M'),('外商', 'NN'),('投资', 'NN'), ('企业', 'NN')]
tree = syntactic_parser.predict(sent)
sent = [('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]
tree = syntactic_parser(sent)
print(tree)
12 changes: 2 additions & 10 deletions tests/demo/zh/demo_sdp.py
@@ -4,13 +4,5 @@
import hanlp

semantic_parser = hanlp.load('SEMEVAL16_NEWS_BIAFFINE_ZH')
sent = [('中国', 'NR'),
        ('批准', 'VV'),
        ('设立', 'VV'),
        ('了', 'AS'),
        ('三十万', 'CD'),
        ('家', 'M'),
        ('外商', 'NN'),
        ('投资', 'NN'),
        ('企业', 'NN')]
print(semantic_parser.predict(sent))
sent = [('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]
print(semantic_parser(sent))
