Skip to content

Commit

Permalink
Final conversion for HF mGENRE
Browse files — browse the repository at this point in the history
Committed by nicola-decao on Jun 8, 2022
1 parent a4d75ef commit 9c720f5
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 6 deletions.
15 changes: 12 additions & 3 deletions examples_mgenre/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,10 @@ sentences = ["[START] Einstein [END] era un fisico tedesco."]
model.sample(
sentences,
prefix_allowed_tokens_fn=lambda batch_id, sent: [
e for e in trie.get(sent.tolist()) if e < len(model.task.target_dictionary)
e for e in trie.get(sent.tolist())
if e < len(model.task.target_dictionary)
# for huggingface/transformers
# if e < len(model2.tokenizer) - 1
],
)
```
Expand All @@ -94,7 +97,10 @@ Additionally, we can use the `lang_title2wikidataID` dictionary to map the gener
model.sample(
sentences,
prefix_allowed_tokens_fn=lambda batch_id, sent: [
e for e in trie.get(sent.tolist()) if e < len(model.task.target_dictionary)
e for e in trie.get(sent.tolist())
if e < len(model.task.target_dictionary)
# for huggingface/transformers
# if e < len(model2.tokenizer) - 1
],
text_to_id=lambda x: max(lang_title2wikidataID[tuple(reversed(x.split(" >> ")))], key=lambda y: int(y[1:])),
marginalize=True,
Expand Down Expand Up @@ -155,7 +161,10 @@ trie_of_mention = Trie([
model.sample(
sentences,
prefix_allowed_tokens_fn=lambda batch_id, sent: [
e for e in trie_of_mention.get(sent.tolist()) if e < len(model.task.target_dictionary)
e for e in trie_of_mention.get(sent.tolist())
if e < len(model.task.target_dictionary)
# for huggingface/transformers
# if e < len(model2.tokenizer) - 1
],
text_to_id=lambda x: max(lang_title2wikidataID[tuple(reversed(x.split(" >> ")))], key=lambda y: int(y[1:])),
marginalize=True,
Expand Down
15 changes: 12 additions & 3 deletions examples_mgenre/examples.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,10 @@
"model.sample(\n",
" sentences,\n",
" prefix_allowed_tokens_fn=lambda batch_id, sent: [\n",
" e for e in trie.get(sent.tolist()) if e < len(model.task.target_dictionary)\n",
" e for e in trie.get(sent.tolist())\n",
" if e < len(model.task.target_dictionary)\n",
" # for huggingface/transformers\n",
" # if e < len(model2.tokenizer) - 1\n",
" ],\n",
")"
]
Expand Down Expand Up @@ -164,7 +167,10 @@
"model.sample(\n",
" sentences,\n",
" prefix_allowed_tokens_fn=lambda batch_id, sent: [\n",
" e for e in trie.get(sent.tolist()) if e < len(model.task.target_dictionary)\n",
" e for e in trie.get(sent.tolist())\n",
" if e < len(model.task.target_dictionary)\n",
" # for huggingface/transformers\n",
" # if e < len(model2.tokenizer) - 1\n",
" ],\n",
" text_to_id=lambda x: max(lang_title2wikidataID[tuple(reversed(x.split(\" >> \")))], key=lambda y: int(y[1:])),\n",
" marginalize=True,\n",
Expand Down Expand Up @@ -249,7 +255,10 @@
"model.sample(\n",
" sentences,\n",
" prefix_allowed_tokens_fn=lambda batch_id, sent: [\n",
" e for e in trie_of_mention.get(sent.tolist()) if e < len(model.task.target_dictionary)\n",
" e for e in trie_of_mention.get(sent.tolist())\n",
" if e < len(model.task.target_dictionary)\n",
" # for huggingface/transformers\n",
" # if e < len(model2.tokenizer) - 1\n",
" ],\n",
" text_to_id=lambda x: max(lang_title2wikidataID[tuple(reversed(x.split(\" >> \")))], key=lambda y: int(y[1:])),\n",
" marginalize=True,\n",
Expand Down

0 comments on commit 9c720f5

Please sign in to comment.