Skip to content

Commit

Permalink
Release language identification APIs which can recognize 176 languages
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Sep 28, 2022
1 parent 125b2b0 commit 5ba95ea
Show file tree
Hide file tree
Showing 13 changed files with 271 additions and 6 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ the [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) licens
<dependency>
<groupId>com.hankcs.hanlp.restful</groupId>
<artifactId>hanlp-restful</artifactId>
<version>0.0.11</version>
<version>0.0.12</version>
</dependency>
```

Expand Down
95 changes: 95 additions & 0 deletions hanlp/components/classifiers/fasttext_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 13:31
import os
import sys
from typing import List, Union

import fasttext
from fasttext.FastText import _FastText

import hanlp
from hanlp.common.component import Component
from hanlp.utils.io_util import get_resource, stdout_redirected
from hanlp_common.io import load_json
from hanlp_common.reflection import classpath_of
from hanlp_common.structure import SerializableDict


class FastTextClassifier(Component):
    """Text classifier that wraps a fastText model loaded via ``fasttext.load_model``.

    The wrapped model is ``None`` until :meth:`load` has been called, so
    :meth:`predict` and :attr:`labels` are only usable after loading.
    """

    # Prefix carried by labels as returned from the fastText model; removed before
    # labels are handed to callers.
    _LABEL_PREFIX = '__label__'

    def __init__(self) -> None:
        super().__init__()
        # Set by ``load``.
        self._model: _FastText = None
        self.config = SerializableDict({
            'classpath': classpath_of(self),
            'hanlp_version': hanlp.__version__,
        })

    def load(self, save_dir, model_path=None, **kwargs):
        """Load the fastText model described by ``save_dir``.

        Args:
            save_dir: A directory that may contain a ``config.json``. When it does,
                the config is loaded and its ``model_path`` entry (if any) takes
                precedence over the ``model_path`` argument. When it does not,
                ``save_dir`` itself is used as the model path unless ``model_path``
                is given.
            model_path: Optional fallback identifier of the model file.
            **kwargs: Not used.
        """
        config_path = os.path.join(save_dir, 'config.json')
        if os.path.isfile(config_path):
            self.config = load_json(config_path)
            model_path = self.config.get('model_path', model_path)
        else:
            model_path = model_path or save_dir
        # Record the resolved path; this is a no-op when the config already had it.
        self.config['model_path'] = model_path
        filepath = get_resource(model_path)
        # NOTE(review): presumably silences what fastText writes to stderr while
        # loading -- confirm against ``stdout_redirected``.
        with stdout_redirected(to=os.devnull, stdout=sys.stderr):
            self._model = fasttext.load_model(filepath)

    def predict(self, text: Union[str, List[str]], topk=False, prob=False, max_len=None, **kwargs):
        """Classify text.

        Args:
            text: A document or a list of documents.
            topk: ``False`` to return only the best label, ``True`` to return all
                labels, or an ``int`` to return the top-k labels. A list gives one
                setting per document.
            prob: Return also probabilities. A list gives one setting per document.
            max_len: Strip long document into ``max_len`` characters for faster prediction.
            **kwargs: Not used.

        Returns:
            One result per document, or a single result when ``text`` is a ``str``.
            With ``topk=False`` a result is the best label, or a ``(label, probability)``
            tuple when ``prob`` is set. Otherwise it is a list of labels, or a
            ``dict`` mapping labels to probabilities when ``prob`` is set.
        """
        num_labels = len(self._model.get_labels())
        flat = isinstance(text, str)
        if flat:
            text = [text]
        # Scalar settings apply to every document.
        if not isinstance(topk, list):
            topk = [topk] * len(text)
        if not isinstance(prob, list):
            prob = [prob] * len(text)
        if max_len:
            text = [x[:max_len] for x in text]
        # Every document is passed to the model as a single line.
        text = [x.replace('\n', ' ') for x in text]
        # Always ask for every label; each document is trimmed to its own k below.
        batch_labels, batch_probs = self._model.predict(text, k=num_labels)
        results = []
        for labels, probs, k, p in zip(batch_labels, batch_probs, topk, prob):
            labels = [self._strip_prefix(x) for x in labels]
            if k is False:
                labels = labels[0]
            elif k is not True and k:
                labels = labels[:k]
            if p:
                probs = probs.tolist()
                if k is False:
                    result = labels, probs[0]
                else:
                    # ``zip`` stops at the shorter sequence, so only the kept
                    # labels receive a probability.
                    result = dict(zip(labels, probs))
            else:
                result = labels
            results.append(result)
        if flat:
            results = results[0]
        return results

    @property
    def labels(self):
        """All labels known to the loaded model, without the ``__label__`` prefix."""
        return [self._strip_prefix(x) for x in self._model.get_labels()]

    @staticmethod
    def _strip_prefix(label: str):
        """Remove the leading ``__label__`` from ``label`` if present.

        Labels without the prefix are returned unchanged instead of losing their
        first characters.
        """
        prefix = FastTextClassifier._LABEL_PREFIX
        if label.startswith(prefix):
            return label[len(prefix):]
        return label
9 changes: 9 additions & 0 deletions hanlp/pretrained/classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,13 @@
# Classifier archives addressed relative to HANLP_URL (defined outside this view).
CHNSENTICORP_BERT_BASE_ZH = HANLP_URL + 'classification/chnsenticorp_bert_base_20211228_163210.zip'
SST2_ALBERT_BASE_EN = HANLP_URL + 'classification/sst2_albert_base_20211228_164917.zip'

# The language identification models below are referenced by their full download
# URL rather than by a path under HANLP_URL.
LID_176_FASTTEXT_BASE = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
'''
126MB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes.
'''
LID_176_FASTTEXT_SMALL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'
'''
917kB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes.
'''

# NOTE(review): starts out empty here; presumably filled in elsewhere -- confirm.
ALL = {}
7 changes: 7 additions & 0 deletions hanlp/utils/component_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@ def load_from_meta_file(save_dir: str, meta_filename='meta.json', transform_only
'embed': {'classpath': 'hanlp.layers.embeddings.fast_text.FastTextEmbedding',
'filepath': identifier, 'src': 'token'},
'hanlp_version': version.__version__}, metapath)
elif identifier in {pretrained.classifiers.LID_176_FASTTEXT_SMALL,
pretrained.classifiers.LID_176_FASTTEXT_BASE}:
save_dir = os.path.dirname(save_dir)
metapath = os.path.join(save_dir, 'config.json')
save_json({'classpath': 'hanlp.components.classifiers.fasttext_classifier.FastTextClassifier',
'model_path': identifier,
'hanlp_version': version.__version__}, metapath)
else:
raise FileNotFoundError(f'The identifier {save_dir} resolves to a nonexistent meta file {metapath}. {tips}')
meta: dict = load_json(metapath)
Expand Down
2 changes: 1 addition & 1 deletion hanlp/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Author: hankcs
# Date: 2019-12-28 19:26

__version__ = '2.1.0-beta.41'
__version__ = '2.1.0-beta.42'
"""HanLP version"""


Expand Down
19 changes: 19 additions & 0 deletions plugins/hanlp_demo/hanlp_demo/mul/demo_lid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 16:49
import hanlp

# Load the 126MB fastText language identification model.
lid = hanlp.load(hanlp.pretrained.classifiers.LID_176_FASTTEXT_BASE)

# Called with only a document, the classifier returns the best language.
english = 'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.'
print(lid(english))

# With ``prob=True`` the best language comes paired with its probability.
japanese = '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。'
lang, prob = lid(japanese, prob=True)
print(f'{lang} language identified with probability {prob:.3%}')

# An integer ``topk`` returns that many best languages.
chinese = '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。'
print(lid(chinese, topk=2))

# For a combination of languages, predict top-k languages with probabilities:
text = '''
2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。
In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.
'''

top3_with_probs = lid(text, topk=3, prob=True)
print(top3_with_probs)
12 changes: 12 additions & 0 deletions plugins/hanlp_demo/hanlp_demo/mul/demo_lid_restful.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2022-09-28 16:49
from hanlp_restful import HanLPClient

# Client for the multilingual ('mul') RESTful API; no auth key is supplied.
HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='mul')

# One document per language; all of them are sent in a single request.
documents = [
    'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
    '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
    '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',
]
print(HanLP.language_identification(documents))
50 changes: 50 additions & 0 deletions plugins/hanlp_restful/hanlp_restful/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,3 +468,53 @@ def grammatical_error_correction(self, text: Union[str, List[str]], language: st
{'text': text,
'language': language or self._language})
return response

def text_classification(self, text: Union[str, List[str]], model, topk=False, prob=False) -> Union[
        str, Dict[str, float], List[Union[str, Dict[str, float]]]]:
    """
    Text classification is the task of assigning a sentence or document an appropriate category.
    The available categories depend on the dataset behind the chosen model (e.g., topics).

    Args:
        text: A document or a list of documents.
        model: The model to use for prediction.
        topk: ``True`` or ``int`` to return the top-k labels.
        prob: Return also probabilities.

    Returns:
        Classification results.
    """
    endpoint = self._url + '/text_classification'
    # All arguments are forwarded to the server unchanged.
    form = {'text': text, 'model': model, 'topk': topk, 'prob': prob}
    return self._send_post_json(endpoint, form)

def language_identification(self, text: Union[str, List[str]], topk=False, prob=False) -> Union[
        str, Dict[str, float], List[Union[str, Dict[str, float]]]]:
    """
    Recognize the language of a given text.

    This is :meth:`text_classification` with the model fixed to ``'lid'``.

    Args:
        text: A document or a list of documents.
        topk: ``True`` or ``int`` to return the top-k languages.
        prob: Return also probabilities.

    Returns:
        Identified language in `ISO 639-1 codes`_.

    Examples (``lid`` stands for this method; outputs are illustrative)::

        lid('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.')
        'en'
        lang, prob = lid('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)
        ('ja', 0.9976244568824768)
        lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2)
        ['zh', 'ja']
        lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=3, prob=True)
        {'zh': 0.3952908217906952, 'en': 0.37189167737960815, 'ja': 0.056213412433862686}

    .. _ISO 639-1 codes:
        https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
    """
    return self.text_classification(text, model='lid', topk=topk, prob=prob)
2 changes: 1 addition & 1 deletion plugins/hanlp_restful/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name='hanlp_restful',
version='0.0.20',
version='0.0.21',
description='HanLP: Han Language Processing',
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
2 changes: 1 addition & 1 deletion plugins/hanlp_restful_java/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>com.hankcs.hanlp.restful</groupId>
<artifactId>hanlp-restful</artifactId>
<version>0.0.11</version>
<version>0.0.12</version>

<name>HanLP RESTful Client in Java</name>
<url>https://github.com/hankcs/HanLP</url>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,67 @@ public Map<String, Double> extractiveSummarization(String text, int topk) throws
return mapper.readValue(post("/extractive_summarization", input), LinkedHashMap.class);
}

/**
 * Text classification is the task of assigning a sentence or document an appropriate category.
 * The available categories depend on the dataset behind the chosen model (e.g., topics).
 * <p>
 * Convenience overload for a single document: top-k and probabilities are both switched off,
 * and the response is expected to be a single label.
 *
 * @param text  The text content of the document.
 * @param model The model to use for prediction.
 * @return Classification results.
 * @throws IOException HTTP errors.
 */
public String textClassification(String text, String model) throws IOException
{
    Object label = textClassification(text, model, false, false);
    return (String) label;
}


/**
 * Text classification is the task of assigning a sentence or document an appropriate category.
 * The available categories depend on the dataset behind the chosen model (e.g., topics).
 *
 * @param text  A document or a list of documents.
 * @param model The model to use for prediction.
 * @param topk  {@code true} or an integer to return the top-k labels.
 * @param prob  Return also probabilities.
 * @return Classification results, deserialized from the JSON response as a plain {@code Object}.
 * @throws IOException HTTP errors.
 */
public Object textClassification(Object text, String model, Object topk, boolean prob) throws IOException
{
    // Request body; every argument is forwarded to the server unchanged.
    Map<String, Object> form = new HashMap<>();
    form.put("text", text);
    form.put("model", model);
    form.put("topk", topk);
    form.put("prob", prob);
    return mapper.readValue(post("/text_classification", form), Object.class);
}

/**
 * Recognize the language of a given text.
 * <p>
 * Delegates to {@link #textClassification(String, String)} with the model fixed to {@code "lid"}.
 *
 * @param text The text content of the document.
 * @return Identified language in <a href="https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes">ISO 639-1 codes</a>.
 * @throws IOException HTTP errors.
 */
public String languageIdentification(String text) throws IOException
{
    final String model = "lid";
    return textClassification(text, model);
}

/**
 * Recognize the language of each given text.
 * <p>
 * Top-k and probabilities are both switched off, and the response is cast to a list of strings.
 *
 * @param text The text contents of the documents.
 * @return Identified languages in <a href="https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes">ISO 639-1 codes</a>.
 * @throws IOException HTTP errors.
 */
public List<String> languageIdentification(String[] text) throws IOException
{
    Object languages = textClassification(text, "lid", false, false);
    //noinspection unchecked
    return (List<String>) languages;
}

/**
* Keyphrase extraction aims to identify keywords or phrases reflecting the main topics of a document.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,16 @@ void grammaticalErrorCorrection() throws IOException
prettyPrint(client.grammaticalErrorCorrection(new String[]{"每个青年都应当有远大的报复。", "有的同学对语言很兴趣。"}));
}

// Sends three documents in different languages in one request and prints the response.
@Test
void languageIdentification() throws IOException
{
    String[] documents = {
        "In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.",
        "2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。",
        "2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。",
    };
    prettyPrint(client.languageIdentification(documents));
}

void prettyPrint(Object object) throws JsonProcessingException
{
ObjectMapper mapper = new ObjectMapper();
Expand Down
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,21 @@
with open(join(this_dir, "hanlp", "version.py")) as fp:
exec(fp.read(), version)

# Single definition of the fastText pin, shared by every extra that needs it.
FASTTEXT = 'fasttext-wheel==0.9.2'
extras_require = {
    'amr': [
        'penman==1.2.1',
        'networkx>=2.5.1',
        'perin-parser>=0.0.12',
    ],
    'fasttext': [FASTTEXT],
    'tf': [
        FASTTEXT,
        'tensorflow==2.6.0',
        'keras==2.6.0',
    ],
}
# 'full' is the union of all extras. The set removes the pin shared by several
# extras, and sorting keeps the result identical from run to run (the iteration
# order of a set of strings varies with hash randomization).
extras_require['full'] = sorted(set(sum(extras_require.values(), [])))

setup(
name='hanlp',
Expand Down

0 comments on commit 5ba95ea

Please sign in to comment.