Implement helper function CoNLLSentence.from_str
hankcs committed Jan 29, 2020
1 parent c0e9585 commit 7cc641d
Showing 6 changed files with 69 additions and 37 deletions.
35 changes: 12 additions & 23 deletions README.md
@@ -139,16 +139,11 @@ Parsers take both tokens and part-of-speech tags as input. The output is a tree

```python
>>> syntactic_parser = hanlp.load(hanlp.pretrained.dep.CTB7_BIAFFINE_DEP_ZH)
>>> print(syntactic_parser([('中国', 'NR'),('批准', 'VV'),('设立', 'VV'),('了', 'AS'),('三十万', 'CD'),('家', 'M'),('外商', 'NN'),('投资', 'NN'), ('企业', 'NN')]))
1 中国 _ NR _ _ 2 nsubj _ _
2 批准 _ VV _ _ 0 root _ _
3 设立 _ VV _ _ 2 ccomp _ _
4 了 _ AS _ _ 3 asp _ _
5 三十万 _ CD _ _ 6 nummod _ _
6 家 _ M _ _ 9 clf _ _
7 外商 _ NN _ _ 9 nn _ _
8 投资 _ NN _ _ 9 nn _ _
9 企业 _ NN _ _ 3 dobj _ _
>>> print(syntactic_parser([('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]))
1 蜡烛 _ NN _ _ 4 nsubj _ _
2 两 _ CD _ _ 3 nummod _ _
3 头 _ NN _ _ 4 dep _ _
4 烧 _ VV _ _ 0 root _ _
```
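
If you want the fields programmatically rather than as printed CoNLL text, here is a minimal sketch; the attribute names (`.form`, `.head`, `.deprel`) are assumed from the `CoNLLWord` fields used elsewhere in this commit, not a documented API:

```python
# Sketch only: iterate the returned CoNLLSentence and read per-word fields.
# .form / .head / .deprel are assumed attribute names.
tree = syntactic_parser([('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')])
for word in tree:
    print(word.form, word.head, word.deprel)  # e.g. 蜡烛 4 nsubj
```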

### Semantic Dependency Parsing
@@ -175,21 +170,15 @@ HanLP implements the biaffine[^biaffine] model which delivers the SOTA performance

```python
>>> semantic_parser = hanlp.load(SEMEVAL16_NEWS_BIAFFINE_ZH)
>>> print(semantic_parser([('中国', 'NR'),('批准', 'VV'),('设立', 'VV'),('了', 'AS'),('三十万', 'CD'),('家', 'M'),('外商', 'NN'),('投资', 'NN'), ('企业', 'NN')]))
1 中国 _ NR _ _ 2 Agt _ _
1 中国 _ NR _ _ 3 Agt _ _
2 批准 _ VV _ _ 0 Root _ _
3 设立 _ VV _ _ 2 eProg _ _
4 了 _ AS _ _ 3 mTime _ _
5 三十万 _ CD _ _ 6 Quan _ _
6 家 _ M _ _ 9 Qp _ _
7 外商 _ NN _ _ 8 Agt _ _
8 投资 _ NN _ _ 9 rDatv _ _
9 企业 _ NN _ _ 2 Pat _ _
9 企业 _ NN _ _ 3 Prod _ _
>>> print(semantic_parser([('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]))
1 蜡烛 _ NN _ _ 3 Poss _ _
1 蜡烛 _ NN _ _ 4 Pat _ _
2 两 _ CD _ _ 3 Quan _ _
3 头 _ NN _ _ 4 Loc _ _
4 烧 _ VV _ _ 0 Root _ _
```

The output is a `CoNLLSentence` too. However, it's not a tree but a graph in which one node can have multiple heads, e.g. `中国` has two heads (ID 2 and 3).
The output is a `CoNLLSentence` too. However, it's not a tree but a graph in which one node can have multiple heads, e.g. `蜡烛` has two heads (ID 3 and 4).
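
Iterating the graph works the same way as for the tree above; a minimal sketch, assuming a multi-headed word exposes its heads and relations as lists (mirroring `CoNLLSentence.from_str` introduced in this commit):

```python
# Sketch only: in the SDP graph a word may carry several heads.
# Storing them as lists is an assumption, mirroring CoNLLSentence.from_str.
graph = semantic_parser([('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')])
for word in graph:
    print(word.form, word.head, word.deprel)  # 蜡烛 would show heads 3 and 4
```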

### Pipelines

37 changes: 36 additions & 1 deletion hanlp/components/parsers/conll.py
@@ -2,7 +2,7 @@
# Author: hankcs
# Date: 2019-12-26 15:37
from collections import Counter
from typing import Generator, Tuple, Union, Iterable, Any
from typing import Generator, Tuple, Union, Iterable, Any, List

import tensorflow as tf
import numpy as np
@@ -93,6 +93,41 @@ def __init__(self, words=None):
    def __str__(self):
        return '\n'.join([word.__str__() for word in self])

    @staticmethod
    def from_str(conll: str):
        """
        Build a CoNLLSentence from a CoNLL-X format string.

        Parameters
        ----------
        conll : str
            CoNLL-X format string

        Returns
        -------
        CoNLLSentence
        """
        words: List[CoNLLWord] = []
        prev_id = None
        for line in conll.strip().split('\n'):
            if line.startswith('#'):
                continue
            cells = line.split()
            cells[0] = int(cells[0])  # ID
            cells[6] = int(cells[6])  # HEAD
            if cells[0] != prev_id:
                words.append(CoNLLWord(*cells))
            else:
                # Same ID as the previous line: a word with multiple heads
                # (semantic dependency graph). Collect heads and relations
                # into lists on the already-created word.
                if isinstance(words[-1].head, list):
                    words[-1].head.append(cells[6])
                    words[-1].deprel.append(cells[7])
                else:
                    words[-1].head = [words[-1].head] + [cells[6]]
                    words[-1].deprel = [words[-1].deprel] + [cells[7]]
            prev_id = cells[0]
        return CoNLLSentence(words)


def read_conll(filepath):
    sent = []
2 changes: 1 addition & 1 deletion hanlp/version.py
@@ -2,4 +2,4 @@
# Author: hankcs
# Date: 2019-12-28 19:26

__version__ = '2.0.0-alpha.32'
__version__ = '2.0.0-alpha.33'
16 changes: 16 additions & 0 deletions tests/debug/debug_conll_sent.py
@@ -0,0 +1,16 @@
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-29 16:55
from hanlp.components.parsers.conll import CoNLLSentence

conll = '''\
1 蜡烛 蜡烛 NN NN _ 3 Poss _ _
1 蜡烛 蜡烛 NN NN _ 4 Pat _ _
2 两 两 CD CD _ 3 Quan _ _
3 头 头 NN NN _ 4 Loc _ _
4 烧 烧 VV VV _ 0 Root _ _
'''

sent = CoNLLSentence.from_str(conll)
print(sent)
print([(x.form, x.pos) for x in sent])
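
Given how `from_str` merges repeated IDs, the two ID `1` rows above should end up as a single word whose head and relation are lists; a quick sanity check (assuming the `.head` and `.deprel` attribute names used in `from_str`):

```python
first = list(sent)[0]                   # the word with ID 1 (蜡烛)
assert first.head == [3, 4]             # heads from both rows
assert first.deprel == ['Poss', 'Pat']  # relations from both rows
```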
4 changes: 2 additions & 2 deletions tests/demo/zh/demo_dep.py
@@ -4,6 +4,6 @@
import hanlp

syntactic_parser = hanlp.load(hanlp.pretrained.dep.CTB7_BIAFFINE_DEP_ZH)
sent = [('中国', 'NR'),('批准', 'VV'),('设立', 'VV'),('了', 'AS'),('三十万', 'CD'),('家', 'M'),('外商', 'NN'),('投资', 'NN'), ('企业', 'NN')]
tree = syntactic_parser.predict(sent)
sent = [('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]
tree = syntactic_parser(sent)
print(tree)
12 changes: 2 additions & 10 deletions tests/demo/zh/demo_sdp.py
@@ -4,13 +4,5 @@
import hanlp

semantic_parser = hanlp.load('SEMEVAL16_NEWS_BIAFFINE_ZH')
sent = [('中国', 'NR'),
        ('批准', 'VV'),
        ('设立', 'VV'),
        ('了', 'AS'),
        ('三十万', 'CD'),
        ('家', 'M'),
        ('外商', 'NN'),
        ('投资', 'NN'),
        ('企业', 'NN')]
print(semantic_parser.predict(sent))
sent = [('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]
print(semantic_parser(sent))
