Skip to content

Commit

Permalink
handle ngrams in fastText binaries
Browse files (browse the repository at this point in the history)
  • Loading branch information
Guillaume Lample committed Apr 11, 2018
1 parent 3170e94 commit 6e0b460
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 8 deletions.
1 change: 1 addition & 0 deletions evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)")
parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models")
parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name")
parser.add_argument("--exp_id", type=str, default="", help="Experiment ID")
parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU")
# data
parser.add_argument("--src_lang", type=str, default="", help="Source language")
Expand Down
21 changes: 13 additions & 8 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,12 +236,16 @@ def get_exp_path(params):
exp_folder = os.path.join(exp_folder, params.exp_name)
if not os.path.exists(exp_folder):
subprocess.Popen("mkdir %s" % exp_folder, shell=True).wait()
chars = 'abcdefghijklmnopqrstuvwxyz0123456789'
while True:
exp_name = ''.join(random.choice(chars) for _ in range(10))
exp_path = os.path.join(exp_folder, exp_name)
if not os.path.isdir(exp_path):
break
if params.exp_id == '':
chars = 'abcdefghijklmnopqrstuvwxyz0123456789'
while True:
exp_id = ''.join(random.choice(chars) for _ in range(10))
exp_path = os.path.join(exp_folder, exp_id)
if not os.path.isdir(exp_path):
break
else:
exp_path = os.path.join(exp_folder, params.exp_id)
assert not os.path.isdir(exp_path), exp_path
# create the dump folder
if not os.path.isdir(exp_path):
subprocess.Popen("mkdir %s" % exp_path, shell=True).wait()
Expand Down Expand Up @@ -361,10 +365,11 @@ def load_bin_embeddings(params, source, full_vocab):
lang = params.src_lang if source else params.tgt_lang
model = load_fasttext_model(params.src_emb if source else params.tgt_emb)
words = model.get_labels()
embeddings = torch.from_numpy(model.get_input_matrix())
assert model.get_dimension() == params.emb_dim
logger.info("Loaded binary model. Generating embeddings ...")
embeddings = torch.from_numpy(np.concatenate([model.get_word_vector(w)[None] for w in words], 0))
logger.info("Generated embeddings for %i words." % len(words))
assert embeddings.size() == (len(words), params.emb_dim)
logger.info("Loaded %i pre-trained word embeddings." % len(words))

# select a subset of word embeddings (to deal with casing)
if not full_vocab:
Expand Down
2 changes: 2 additions & 0 deletions supervised.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from src.evaluation import Evaluator


# VALIDATION_METRIC = 'precision_at_1-nn'
VALIDATION_METRIC = 'precision_at_1-csls_knn_10'
# unsupervised criterion: 'mean_cosine-csls_knn_10-S2T-10000'
# supervised criterion: 'precision_at_1-csls_knn_10'
Expand All @@ -27,6 +28,7 @@
parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)")
parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models")
parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name")
parser.add_argument("--exp_id", type=str, default="", help="Experiment ID")
parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU")
parser.add_argument("--export", type=str, default="txt", help="Export embeddings after training (txt / pth)")

Expand Down
1 change: 1 addition & 0 deletions unsupervised.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
parser.add_argument("--verbose", type=int, default=2, help="Verbose level (2:debug, 1:info, 0:warning)")
parser.add_argument("--exp_path", type=str, default="", help="Where to store experiment logs and models")
parser.add_argument("--exp_name", type=str, default="debug", help="Experiment name")
parser.add_argument("--exp_id", type=str, default="", help="Experiment ID")
parser.add_argument("--cuda", type=bool_flag, default=True, help="Run on GPU")
parser.add_argument("--export", type=str, default="txt", help="Export embeddings after training (txt / pth)")
# data
Expand Down

0 comments on commit 6e0b460

Please sign in to comment.