Skip to content

Commit

Permalink
Fixed tokenization bug for WikiBERT models
Browse files Browse the repository at this point in the history
  • Loading branch information
avramandrei committed Aug 9, 2021
1 parent bbb97da commit 5d39070
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 4 deletions.
8 changes: 5 additions & 3 deletions pyeurovoc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import os
import torch
import pickle
import json
from transformers import AutoTokenizer
from transformers import AutoTokenizer, BertTokenizer
from .util import download_file


Expand Down Expand Up @@ -77,7 +76,10 @@ def __init__(self, lang="en"):
self.mlb_encoder = pickle.load(pck_file)

# load the tokenizer according to the model dictionary
self.tokenizer = AutoTokenizer.from_pretrained(DICT_MODELS[lang])
if "wikibert" in DICT_MODELS[lang]:
self.tokenizer = BertTokenizer.from_pretrained(DICT_MODELS[lang])
else:
self.tokenizer = AutoTokenizer.from_pretrained(DICT_MODELS[lang])

def __call__(self, document_text, num_labels=6):
input_ids = self.tokenizer.encode(
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# $ pip install sampleproject
name='pyeurovoc', # Required

version='1.0.1', # Required
version='1.0.2', # Required

description='Python API for multilingual legal document classification with EuroVoc descriptors using BERT models.', # Required

Expand Down

0 comments on commit 5d39070

Please sign in to comment.