Correct tokenizer to chunk long documents
jncraton committed Jul 22, 2023
1 parent 6588cfd commit aee2076
Showing 1 changed file with 17 additions and 1 deletion.
languagemodels/embeddings.py (17 additions, 1 deletion)
@@ -44,7 +44,18 @@ def get_token_ids(doc):

     generative_tokenizer, _ = get_model("instruct", tokenizer_only=True)

-    return generative_tokenizer.encode(doc, add_special_tokens=False).ids
+    # We need to disable and re-enable truncation here
+    # This allows us to tokenize very large documents
+    # We won't be feeding the tokens themselves to a model, so this
+    # shouldn't cause any problems.
+    trunk = generative_tokenizer.truncation
+    generative_tokenizer.no_truncation()
+    ids = generative_tokenizer.encode(doc, add_special_tokens=False).ids
+    generative_tokenizer.enable_truncation(
+        trunk["max_length"], stride=trunk["stride"], strategy=trunk["strategy"]
+    )
+
+    return ids
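An aside for readers reusing this pattern: the save/disable/restore sequence above can be wrapped in a context manager so the tokenizer's truncation settings are restored even if encoding raises. A minimal sketch, assuming a Hugging Face tokenizers.Tokenizer whose truncation property returns the active settings as a dict (the same assumption the commit itself makes); truncation_disabled is a hypothetical helper, not part of this repository:

    from contextlib import contextmanager

    from tokenizers import Tokenizer

    @contextmanager
    def truncation_disabled(tokenizer: Tokenizer):
        # Save the active truncation settings: a dict with "max_length",
        # "stride", and "strategy" keys, or None when truncation is off.
        saved = tokenizer.truncation
        tokenizer.no_truncation()
        try:
            yield tokenizer
        finally:
            # Re-enable truncation only if it was configured to begin with.
            if saved is not None:
                tokenizer.enable_truncation(
                    saved["max_length"],
                    stride=saved["stride"],
                    strategy=saved["strategy"],
                )

With a helper like this, the body of get_token_ids would reduce to a with-block around the encode call, and a tokenizer that never had truncation enabled is also handled gracefully (the commit's inline version assumes truncation was on).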


@@ -82,6 +93,11 @@ class RetrievalContext:
     >>> rc.clear()
     >>> rc.get_match("Where is Paris?")
     >>> rc.clear()
+    >>> rc.store(' '.join(['Python'] * 4096))
+    >>> len(rc.chunks)
+    73
+    >>> rc.clear()
     >>> rc.store(' '.join(['Python'] * 232))
     >>> len(rc.chunks)
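The new doctest pins the chunk count for a roughly 4096-token document at 73. One way to sanity-check a number like this: with overlapping windows of w tokens advanced by a stride of s = w - overlap tokens, a document of n tokens yields ceil((n - w) / s) + 1 chunks. A quick sketch with hypothetical parameters that reproduce the doctest's count (purely for illustration; the real window and overlap sizes live in the library's chunking code, and each 'Python' is assumed to encode to one token):

    import math

    def num_chunks(n_tokens, window, overlap):
        # Overlapping windows: the first chunk covers a full window, and
        # each subsequent chunk advances by (window - overlap) tokens.
        stride = window - overlap
        if n_tokens <= window:
            return 1
        return math.ceil((n_tokens - window) / stride) + 1

    print(num_chunks(4096, window=64, overlap=8))  # 73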
