Skip to content

Commit

Permalink
added huggingface vocabulary builder
Browse files Browse the repository at this point in the history
  • Loading branch information
arunppsg committed Mar 28, 2023
1 parent 3714655 commit 3c9dff1
Showing 1 changed file with 33 additions and 0 deletions.
33 changes: 33 additions & 0 deletions deepchem/feat/vocabulary_builders/hf_vocab.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from typing import List, Union

from tokenizers import Tokenizer

from deepchem.feat.vocabulary_builders.vocabulary_builder import VocabularyBuilder


class HuggingFaceVocabularyBuilder(VocabularyBuilder):
    """Vocabulary builder backed by a HuggingFace ``tokenizers.Tokenizer``.

    The builder creates a ``Tokenizer`` from ``model``, trains it on text
    files with ``trainer`` and can save the trained tokenizer to disk.

    Parameters
    ----------
    model
        Object handed to ``tokenizers.Tokenizer`` to create the underlying
        tokenizer (presumably a ``tokenizers.models`` instance -- confirm
        against callers).
    trainer
        Object handed to ``Tokenizer.train`` when :meth:`build` is called
        (presumably a ``tokenizers.trainers`` instance -- confirm against
        callers).
    """

    def __init__(self, model, trainer) -> None:
        self.model = model
        self.trainer = trainer
        # ``Tokenizer`` comes from the ``tokenizers`` package, not from
        # ``transformers``.
        self.tokenizer = Tokenizer(model)

    # The superclass ``build`` accepts a DeepChem dataset, whereas HuggingFace
    # vocabulary builders read their data from files; hence the different
    # signature and the ``type: ignore`` below.
    def build(self, paths: Union[str, List[str]]) -> None:  # type: ignore
        """Train the wrapped tokenizer on the given file(s).

        Parameters
        ----------
        paths: Union[str, List[str]]
            Path(s) of the file(s) to train on. A single path string is
            wrapped into a one-element list so that one file can be passed
            directly.
        """
        if isinstance(paths, str):
            paths = [paths]
        self.tokenizer.train(paths, self.trainer)

    @classmethod
    def load(cls, fname: str):
        """Load a tokenizer file as a ``PreTrainedTokenizerFast``.

        Note that the returned object is a
        ``transformers.PreTrainedTokenizerFast``, not an instance of this
        builder class.

        Parameters
        ----------
        fname: str
            Path of the tokenizer file, passed as ``tokenizer_file``.

        Returns
        -------
        PreTrainedTokenizerFast
            Tokenizer constructed from ``fname``.
        """
        # Imported here so that ``transformers`` is only required by callers
        # that actually load a tokenizer.
        from transformers import PreTrainedTokenizerFast
        return PreTrainedTokenizerFast(tokenizer_file=fname)

    def save(self, fname: str) -> None:
        """Save the wrapped tokenizer to a file.

        Parameters
        ----------
        fname: str
            A json file path
        """
        self.tokenizer.save(fname)

0 comments on commit 3c9dff1

Please sign in to comment.