-
Notifications
You must be signed in to change notification settings - Fork 1
/
eval.mk
101 lines (76 loc) · 3.02 KB
/
eval.mk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# -*- Mode: Makefile -*-
#
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This makefile pulls down the evaluation datasets and formats them uniformly.
# Word similarity evaluations are formatted to contain exactly three columns:
# the two words being compared and the human judgement.
#
# Use wordsim.py and analogy to run the actual evaluations.
# Compiler and linker settings consumed when the `analogy` tool is built
# from analogy.cc.
CXXFLAGS := -std=c++11 -m64 -mavx -g -Ofast -Wall
LDLIBS   := -lpthread -lm
# Word-similarity outputs: one <dataset>.ws.tab file per dataset name below.
WORDSIM_EVALS := $(addsuffix .ws.tab,ws353sim ws353rel men mturk rarewords simlex999)
# Analogy outputs: one <dataset>.an.tab file per dataset name below.
ANALOGY_EVALS := $(addsuffix .an.tab,mikolov msr)
# Default goal: produce every formatted evaluation file and the analogy tool.
# Declared phony so a stray file named `all` cannot mask the goal.
.PHONY: all
# Delete a target whose recipe exited with an error, so a truncated download
# or extraction is never mistaken for an up-to-date file on the next run.
.DELETE_ON_ERROR:
all: $(WORDSIM_EVALS) $(ANALOGY_EVALS) analogy
# Stream wordsim_similarity_goldstandard.txt out of the archive (-O writes the
# member to stdout) and store it unchanged as the target.
ws353sim.ws.tab: ws353simrel.tar.gz
	tar -xzOf $< wordsim353_sim_rel/wordsim_similarity_goldstandard.txt > $@
# Stream wordsim_relatedness_goldstandard.txt out of the archive (-O writes
# the member to stdout) and store it unchanged as the target.
ws353rel.ws.tab: ws353simrel.tar.gz
	tar -xzOf $< wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt > $@
# Extract the MEN dataset member to stdout and turn its space separators
# into tabs.
men.ws.tab: MEN.tar.gz
	tar -xzOf $< MEN/MEN_dataset_natural_form_full | tr ' ' '\t' > $@
# Convert the CSV to tab-separated form: strip carriage returns, then replace
# every comma with a tab.
mturk.ws.tab: Mtruk.csv
	tr -d '\r' < $< | tr ',' '\t' > $@
# Print rw/rw.txt from the zip to stdout and keep only its first three
# tab-separated columns.
# cut already splits on TAB by default, so no -d option is given: the
# $'\t' quoting used previously is a bash extension that make's default
# /bin/sh does not have to understand (dash passes a multi-character
# delimiter to cut, which then fails).
rarewords.ws.tab: rw.zip
	unzip -p $< rw/rw.txt | cut -f1-3 > $@
# Print SimLex-999.txt from the zip to stdout, skip its first line
# (presumably a column-header row -- confirm against the dataset) and keep
# tab-separated columns 1, 2 and 4.
# cut already splits on TAB by default, so no -d option is given: the
# $'\t' quoting used previously is a bash extension that make's default
# /bin/sh does not have to understand.
simlex999.ws.tab: SimLex-999.zip
	unzip -p $< SimLex-999/SimLex-999.txt \
	  | tail -n +2 | cut -f1,2,4 > $@
# Drop every line that starts with ':' and then, in one tr pass, map A-Z to
# a-z and spaces to tabs.
# Plain grep is enough for the '^:' pattern; egrep is obsolescent (recent GNU
# grep prints a warning for it) and the extra -E was redundant.
mikolov.an.tab: questions-words.txt
	grep -v '^:' $< | tr '[A-Z] ' '[a-z]\t' > $@
# Join each question line with the second space-separated field of the
# matching answer line, everything tab-separated.
# Done as a single pipeline: the answer column is fed to paste on stdin ("-"),
# so no scratch files are needed. The earlier fixed names /tmp/q and /tmp/a
# could collide with a concurrent build and were left behind on failure.
# Converting spaces to tabs after the paste yields the same output because the
# field selected by cut cannot itself contain a space.
msr.an.tab: word_relationship.questions word_relationship.answers
	cut -f2 -d ' ' $(word 2,$^) | paste $< - | tr ' ' '\t' > $@
# wget commands to fetch the datasets. Please see the original datasets for
# appropriate references if you use these.
# Download rule: the last URL component is the target name, which is also the
# file name wget saves to by default.
ws353simrel.tar.gz:
	wget http://alfonseca.org/pubs/$@
# Download rule: the last URL component is the target name, which is also the
# file name wget saves to by default.
MEN.tar.gz:
	wget http://clic.cimec.unitn.it/~elia.bruni/resources/$@
# Download rule: the last URL component is the target name, which is also the
# file name wget saves to by default.
Mtruk.csv:
	wget http://www.kiraradinsky.com/files/$@
# Download rule: the last URL component is the target name, which is also the
# file name wget saves to by default.
rw.zip:
	wget http://www-nlp.stanford.edu/~lmthang/morphoNLM/$@
# Download rule: the last URL component is the target name, which is also the
# file name wget saves to by default.
SimLex-999.zip:
	wget http://www.cl.cam.ac.uk/~fh295/$@
# Download rule: the last URL component is the target name, which is also the
# file name wget saves to by default.
questions-words.txt:
	wget http://download.tensorflow.org/data/$@
# Download rule: the last URL component is the target name, which is also the
# file name wget saves to by default.
word_relationship.questions:
	wget https://github.com/darshanhegde/SNLPProject/raw/master/word2vec/eval/$@
# Download rule: the last URL component is the target name, which is also the
# file name wget saves to by default.
word_relationship.answers:
	wget https://github.com/darshanhegde/SNLPProject/raw/master/word2vec/eval/$@
# Compile and link the analogy tool straight from its single source file.
# The recipe spells out the command GNU make's built-in "program from .cc"
# rule would run, so CXXFLAGS and LDLIBS are applied exactly as before.
analogy: analogy.cc
	$(LINK.cc) $^ $(LOADLIBES) $(LDLIBS) -o $@
# Remove everything this makefile generates (formatted evaluation files and
# the analogy binary) plus any *.pyc files; downloads are kept.
# Declared phony so a file named `clean` cannot disable the rule.
.PHONY: clean
clean:
	rm -f *.ws.tab *.an.tab analogy *.pyc
# Run `clean`, then also delete the downloaded datasets so the next build
# fetches them again.
# The two word_relationship files are named explicitly: `{a,b}` brace
# expansion is a bash feature, and under make's default /bin/sh the pattern
# may be taken literally, leaving both files in place.
# Declared phony so a file named `distclean` cannot disable the rule.
.PHONY: distclean
distclean: clean
	rm -f *.tgz *.tar.gz *.zip Mtruk.csv questions-words.txt \
	    word_relationship.questions word_relationship.answers