Commit 19f24eb
sebschu authored and Stanford NLP committed
Merge branch 'master' into mt-preordering
1 parent 239dd56 · commit 19f24eb

52 files changed, +1792 −921 lines changed

CONTRIBUTING.md (+2 −2)

@@ -10,7 +10,7 @@ In order for us to continue to be able to dual-license Stanford CoreNLP, we need
 Therefore, we can accept contributions on any of the following terms:
 * If your contribution is a bug fix of 6 lines or less of new code, we will accept it on the basis that both you and us regard the contribution as de minimis, and not requiring further hassle.
 * You can declare that the contribution is in the public domain (in your commit message or pull request).
-* You can make your contribution available under a non-restrictive open source licensing, such as the Revised (or 3-clause) BSD license, with appropriate licensing information included with the submitted code.
-* You can sign and return to us a contributor license agreement, explicitly licensing us to be able to use the code. Contact us at: [email protected] .
+* You can make your contribution available under a non-restrictive open source license, such as the Revised (or 3-clause) BSD license, with appropriate licensing information included with the submitted code.
+* You can sign and return to us a contributor license agreement (CLA), explicitly licensing us to be able to use the code. You can find these agreements at http://nlp.stanford.edu/software/CLA/ . You can send them to us or contact us at: [email protected] .
 
 You should do development against our master branch. You should make sure that all unit tests still pass. (In general, you will not be able to run our integration tests, since they rely on resources in our filesystem.)
+9 −9

@@ -1,15 +1,15 @@
 CONLL EVAL SUMMARY (Before COREF)
-Identification of Mentions: Recall: (12407 / 14291) 86.81%  Precision: (12407 / 34999) 35.44%  F1: 50.34%
+Identification of Mentions: Recall: (12407 / 14291) 86.81%  Precision: (12407 / 34999) 35.44%  F1: 50.34%
 
 CONLL EVAL SUMMARY (After COREF)
-METRIC muc:Coreference: Recall: (6260 / 10539) 59.39%  Precision: (6260 / 10027) 62.43%  F1: 60.87%
-METRIC bcub:Coreference: Recall: (12379.37 / 18298) 67.65%  Precision: (13598.84 / 18298) 74.31%  F1: 70.83%
-METRIC ceafm:Coreference: Recall: (10894 / 18298) 59.53%  Precision: (10894 / 18298) 59.53%  F1: 59.53%
-METRIC ceafe:Coreference: Recall: (3811.5 / 7759) 49.12%  Precision: (3811.5 / 8271) 46.08%  F1: 47.55%
-METRIC blanc:Coreference links: Recall: (25257 / 54427) 46.4%  Precision: (25257 / 40544) 62.29%  F1: 53.18%
-Non-coreference links: Recall: (922975 / 938262) 98.37%  Precision: (922975 / 952145) 96.93%  F1: 97.64%
-BLANC: Recall: (0.72 / 1) 72.38%  Precision: (0.8 / 1) 79.61%  F1: 75.41%
+METRIC muc:Coreference: Recall: (6256 / 10539) 59.36%  Precision: (6256 / 10078) 62.07%  F1: 60.68%
+METRIC bcub:Coreference: Recall: (12462.33 / 18385) 67.78%  Precision: (13629.92 / 18385) 74.13%  F1: 70.81%
+METRIC ceafm:Coreference: Recall: (10928 / 18385) 59.43%  Precision: (10928 / 18385) 59.43%  F1: 59.43%
+METRIC ceafe:Coreference: Recall: (3832.95 / 7846) 48.85%  Precision: (3832.95 / 8307) 46.14%  F1: 47.45%
+METRIC blanc:Coreference links: Recall: (25245 / 54427) 46.38%  Precision: (25245 / 40608) 62.16%  F1: 53.12%
+Non-coreference links: Recall: (932068 / 947431) 98.37%  Precision: (932068 / 961250) 96.96%  F1: 97.66%
+BLANC: Recall: (0.72 / 1) 72.38%  Precision: (0.8 / 1) 79.56%  F1: 75.39%
 
-Final conll score ((muc+bcub+ceafe)/3) = 59.75
+Final conll score ((muc+bcub+ceafe)/3) = 59.65
 Final score (pairwise) Precision = 0.57
 done
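
Arithmetic check on the final changed line: the three updated F1 scores above give (60.68 + 70.81 + 47.45) / 3 = 178.94 / 3 ≈ 59.65, matching the new final CoNLL score, and the old values give (60.87 + 70.83 + 47.55) / 3 = 179.25 / 3 = 59.75 in the same way. This diff is simply the expected test output tracking slightly different coref numbers after the merge.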

itest/src/edu/stanford/nlp/pipeline/DeterministicCorefAnnotatorITest.java (+2 −1)

@@ -12,12 +12,12 @@
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
-import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
 import edu.stanford.nlp.util.CoreMap;
 
 public class DeterministicCorefAnnotatorITest extends TestCase {
   private static AnnotationPipeline pipeline;
 
+  @Override
   public void setUp() throws Exception {
     synchronized(DeterministicCorefAnnotatorITest.class) {
       pipeline = new AnnotationPipeline();

@@ -131,4 +131,5 @@ public static void main(String[] args) throws Exception {
     DeterministicCorefAnnotatorITest itest = new DeterministicCorefAnnotatorITest();
     itest.testDeterministicCorefAnnotator();
   }
+
 }
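
Two small cleanups here: the duplicated import of CorefCoreAnnotations is dropped, and setUp() gains an @Override annotation. With JUnit 3's TestCase the annotation is cheap insurance: a misspelled setup method would silently never run, whereas @Override turns the typo into a compile error. A minimal sketch of the failure mode it guards against (this class is illustrative, not part of the repository):

import junit.framework.TestCase;

public class OverrideDemoTest extends TestCase {
  @Override
  public void setUp() throws Exception {  // compiles: genuinely overrides TestCase.setUp()
    super.setUp();
  }

  // @Override
  // public void setup() throws Exception { }  // would not compile: "setup" overrides nothing
}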

src/edu/stanford/nlp/classify/GeneralDataset.java (+15 −9)

@@ -243,9 +243,9 @@ public void addAll(Iterable<? extends Datum<L,F>> data) {
    * @param fold The number of this fold (must be between 0 and (numFolds - 1)
    * @param numFolds The number of folds to divide the data into (must be greater than or equal to the
    *          size of the data set)
-   * @return A Pair of data sets, the first being the remainder of size this.size() - (end-start)
-   *         and the second usually being of size this.size() / numFolds but with the last fold of size
-   *         this.size() - (this.size() / numFolds * (numFolds - 1)
+   * @return A Pair of data sets, the first being roughly (numFolds-1)/numFolds of the data items
+   *         (for use as training data_, and the second being 1/numFolds of the data, taken from the
+   *         fold<sup>th</sup> part of the data (for use as devTest data)
    */
   public Pair<GeneralDataset<L, F>, GeneralDataset<L, F>> splitOutFold(int fold, int numFolds) {
     if (numFolds < 2 || numFolds > size() || fold < 0 || fold >= numFolds) {
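
The rewritten @return tag above describes the usual cross-validation contract: fold k of numFolds comes back as held-out data, everything else as training data. A minimal sketch of how a caller might drive it, assuming an already-populated dataset (the wrapper class and variable names are illustrative):

import edu.stanford.nlp.classify.GeneralDataset;
import edu.stanford.nlp.util.Pair;

class CrossValidationSketch {
  // Hypothetical k-fold loop built on splitOutFold().
  static <L, F> void crossValidate(GeneralDataset<L, F> dataset, int numFolds) {
    for (int fold = 0; fold < numFolds; fold++) {
      Pair<GeneralDataset<L, F>, GeneralDataset<L, F>> split =
          dataset.splitOutFold(fold, numFolds);
      GeneralDataset<L, F> train = split.first();     // roughly (numFolds-1)/numFolds of the items
      GeneralDataset<L, F> devTest = split.second();  // the fold-th 1/numFolds slice
      // ... train a classifier on `train`, evaluate it on `devTest` ...
    }
  }
}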
@@ -296,14 +296,20 @@ protected double[][] trimToSize(double[][] i) {
    * Randomizes the data array in place.
    * Note: this cannot change the values array or the datum weights,
    * so redefine this for RVFDataset and WeightedDataset!
-   * @param randomSeed
+   * This uses the Fisher-Yates (or Durstenfeld-Knuth) shuffle, which is unbiased.
+   * The same algorithm is used by shuffle() in j.u.Collections, and so you should get compatible
+   * results if using it on a Collection with the same seed (as of JDK1.7, at least).
+   *
+   * @param randomSeed A seed for the Random object (allows you to reproduce the same ordering)
    */
-  public void randomize(int randomSeed) {
+  // todo: Probably should be renamed 'shuffle' to be consistent with Java Collections API
+  public void randomize(long randomSeed) {
     Random rand = new Random(randomSeed);
-    for(int j = size - 1; j > 0; j --){
+    for (int j = size - 1; j > 0; j--) {
+      // swap each item with some lower numbered item
       int randIndex = rand.nextInt(j);
 
-      int [] tmp = data[randIndex];
+      int[] tmp = data[randIndex];
       data[randIndex] = data[j];
       data[j] = tmp;
 
@@ -317,9 +323,9 @@ public GeneralDataset<L,F> sampleDataset(int randomSeed, double sampleFrac, bool
     int sampleSize = (int)(this.size()*sampleFrac);
     Random rand = new Random(randomSeed);
     GeneralDataset<L,F> subset;
-    if(this instanceof RVFDataset)
+    if (this instanceof RVFDataset) {
       subset = new RVFDataset<L,F>();
-    else if (this instanceof Dataset) {
+    } else if (this instanceof Dataset) {
       subset = new Dataset<L,F>();
     }
     else {
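
A note on the randomize() hunk above: the textbook Fisher-Yates (Durstenfeld) shuffle that the new javadoc cites, and that java.util.Collections.shuffle implements, draws the swap index from the inclusive range [0, j], i.e. nextInt(j + 1), so that element j may also stay in place; drawing from nextInt(j) instead never leaves an element where it started, which biases the permutation. A minimal standalone sketch of the textbook version for comparison (this is not the committed code):

import java.util.Random;

class ShuffleSketch {
  // Textbook Fisher-Yates (Durstenfeld) shuffle of an int array.
  static void shuffle(int[] a, long seed) {
    Random rand = new Random(seed);
    for (int j = a.length - 1; j > 0; j--) {
      int k = rand.nextInt(j + 1);  // uniform over 0..j inclusive, so j may stay put
      int tmp = a[k];
      a[k] = a[j];
      a[j] = tmp;
    }
  }
}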

src/edu/stanford/nlp/classify/LinearClassifier.java (+52 −39)

@@ -47,7 +47,7 @@
 /**
  * Implements a multiclass linear classifier. At classification time this
  * can be any generalized linear model classifier (such as a perceptron,
- * naive logistic regression, SVM).
+ * a maxent classifier (softmax logistic regression), or an SVM).
  *
  * @author Dan Klein
  * @author Jenny Finkel

@@ -180,6 +180,7 @@ public double scoreOf(Datum<L, F> example, L label) {
    * values the score (unnormalized log probability) of each class
    * for an RVFDatum.
    */
+  @Override
   @Deprecated
   public Counter<L> scoresOf(RVFDatum<L, F> example) {
     Counter<L> scores = new ClassicCounter<L>();

@@ -236,8 +237,8 @@ private double scoreOfRVFDatum(RVFDatum<L, F> example, L label) {
    * Doesn't consider a value for each feature.
    */
   private double scoreOf(int[] feats, L label) {
-    assert labelIndex.indexOf(label, false) >= 0;
     int iLabel = labelIndex.indexOf(label);
+    assert iLabel >= 0;
     double score = 0.0;
     for (int feat : feats) {
       score += weight(feat, iLabel);

@@ -297,6 +298,7 @@ public Counter<L> probabilityOf(RVFDatum<L, F> example) {
    * that class for a certain example.
    * Looking at the the sum of e^v for each count v, should be 1.0.
    */
+  @Override
   public Counter<L> logProbabilityOf(Datum<L, F> example) {
     if(example instanceof RVFDatum<?, ?>)return logProbabilityOfRVFDatum((RVFDatum<L,F>)example);
     Counter<L> scores = scoresOf(example);

@@ -1182,9 +1184,9 @@ public <T> void justificationOf(Datum<L, F> example, PrintWriter pw,
   /**
    * This method returns a map from each label to a counter of feature weights for that label.
    * Useful for feature analysis.
+   *
    * @return a map of counters
    */
-
   public Map<L,Counter<F>> weightsAsMapOfCounters() {
     Map<L,Counter<F>> mapOfCounters = Generics.newHashMap();
     for(L label : labelIndex){

@@ -1239,39 +1241,41 @@ public Counter<L> scoresOf(Datum<L, F> example, Collection<L> possibleLabels) {
     return scores;
   }
 
+  /* -- looks like a failed attempt at micro-optimization --
 
   public L experimentalClassOf(Datum<L,F> example) {
-    if(example instanceof RVFDatum<?, ?>) {
-      throw new UnsupportedOperationException();
-    }
-
-    int labelCount = weights[0].length;
-    //System.out.printf("labelCount: %d\n", labelCount);
-    Collection<F> features = example.asFeatures();
-
-    int[] featureInts = new int[features.size()];
-    int fI = 0;
-    for (F feature : features) {
-      featureInts[fI++] = featureIndex.indexOf(feature);
-    }
-    //System.out.println("Features: "+features);
-    double bestScore = Double.NEGATIVE_INFINITY;
-    int bestI = 0;
-    for (int i = 0; i < labelCount; i++) {
-      double score = 0;
-      for (int j = 0; j < featureInts.length; j++) {
-        if (featureInts[j] < 0) continue;
-        score += weights[featureInts[j]][i];
-      }
-      if (score > bestScore) {
-        bestI = i;
-        bestScore = score;
-      }
-      //System.out.printf("Score: %s(%d): %e\n", labelIndex.get(i), i, score);
-    }
-    //System.out.printf("label(%d): %s\n", bestI, labelIndex.get(bestI));;
-    return labelIndex.get(bestI);
+    if(example instanceof RVFDatum<?, ?>) {
+      throw new UnsupportedOperationException();
+    }
+
+    int labelCount = weights[0].length;
+    //System.out.printf("labelCount: %d\n", labelCount);
+    Collection<F> features = example.asFeatures();
+
+    int[] featureInts = new int[features.size()];
+    int fI = 0;
+    for (F feature : features) {
+      featureInts[fI++] = featureIndex.indexOf(feature);
+    }
+    //System.out.println("Features: "+features);
+    double bestScore = Double.NEGATIVE_INFINITY;
+    int bestI = 0;
+    for (int i = 0; i < labelCount; i++) {
+      double score = 0;
+      for (int j = 0; j < featureInts.length; j++) {
+        if (featureInts[j] < 0) continue;
+        score += weights[featureInts[j]][i];
+      }
+      if (score > bestScore) {
+        bestI = i;
+        bestScore = score;
+      }
+      //System.out.printf("Score: %s(%d): %e\n", labelIndex.get(i), i, score);
+    }
+    //System.out.printf("label(%d): %s\n", bestI, labelIndex.get(bestI));;
+    return labelIndex.get(bestI);
   }
+  -- */
 
   @Override
   public L classOf(Datum<L, F> example) {

@@ -1286,12 +1290,20 @@ private L classOfRVFDatum(RVFDatum<L, F> example) {
     return Counters.argmax(scores);
   }
 
+  @Override
   @Deprecated
   public L classOf(RVFDatum<L, F> example) {
     Counter<L> scores = scoresOf(example);
     return Counters.argmax(scores);
   }
 
+  /** Make a linear classifier from the parameters. The parameters are used, not copied.
+   *
+   * @param weights The parameters of the classifier. The first index is the
+   *     featureIndex value and second index is the labelIndex value.
+   * @param featureIndex An index from F to integers used to index the features in the weights array
+   * @param labelIndex An index from L to integers used to index the labels in the weights array
+   */
   public LinearClassifier(double[][] weights, Index<F> featureIndex, Index<L> labelIndex) {
     this.featureIndex = featureIndex;
     this.labelIndex = labelIndex;
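
The new constructor javadoc pins down the weight-matrix orientation, weights[featureIndexValue][labelIndexValue], which is easy to get backwards. A minimal sketch of exercising the constructor; the class name, features, labels, and weight values here are invented for illustration:

import java.util.Arrays;
import edu.stanford.nlp.classify.LinearClassifier;
import edu.stanford.nlp.ling.BasicDatum;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;

public class TinyLinearClassifierDemo {
  public static void main(String[] args) {
    Index<String> featureIndex = new HashIndex<String>();
    featureIndex.add("hot");   // feature 0
    featureIndex.add("cold");  // feature 1
    Index<String> labelIndex = new HashIndex<String>();
    labelIndex.add("summer");  // label 0
    labelIndex.add("winter");  // label 1
    // First index is the feature, second the label, per the javadoc above.
    double[][] weights = {
        {  2.0, -2.0 },  // "hot" pushes toward summer
        { -2.0,  2.0 },  // "cold" pushes toward winter
    };
    LinearClassifier<String, String> classifier =
        new LinearClassifier<String, String>(weights, featureIndex, labelIndex);
    System.out.println(classifier.classOf(
        new BasicDatum<String, String>(Arrays.asList("hot"))));  // expected: summer
  }
}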
@@ -1300,6 +1312,7 @@ public LinearClassifier(double[][] weights, Index<F> featureIndex, Index<L> labe
     Arrays.fill(thresholds, 0.0);
   }
 
+  // todo: This is unused and seems broken (ignores passed in thresholds)
   public LinearClassifier(double[][] weights, Index<F> featureIndex, Index<L> labelIndex,
                           double[] thresholds) throws Exception {
     this.featureIndex = featureIndex;

@@ -1315,26 +1328,26 @@ public LinearClassifier(double[][] weights, Index<F> featureIndex, Index<L> labe
     Arrays.fill(thresholds, 0.0);
   }
 
-  public LinearClassifier(double[] weights, Index<Pair<F, L>> weightIndex) {
+  private static <F, L> Counter<Pair<F, L>> makeWeightCounter(double[] weights, Index<Pair<F, L>> weightIndex) {
     Counter<Pair<F,L>> weightCounter = new ClassicCounter<Pair<F,L>>();
     for (int i = 0; i < weightIndex.size(); i++) {
       if (weights[i] == 0) {
         continue; // no need to save 0 weights
       }
       weightCounter.setCount(weightIndex.get(i), weights[i]);
     }
-    init(weightCounter, new ClassicCounter<L>());
+    return weightCounter;
+  }
+
+  public LinearClassifier(double[] weights, Index<Pair<F, L>> weightIndex) {
+    this(makeWeightCounter(weights, weightIndex));
   }
 
   public LinearClassifier(Counter<? extends Pair<F, L>> weightCounter) {
     this(weightCounter, new ClassicCounter<L>());
   }
 
   public LinearClassifier(Counter<? extends Pair<F, L>> weightCounter, Counter<L> thresholdsC) {
-    init(weightCounter,thresholdsC);
-  }
-
-  private void init(Counter<? extends Pair<F, L>> weightCounter, Counter<L> thresholdsC) {
     Collection<? extends Pair<F, L>> keys = weightCounter.keySet();
     featureIndex = new HashIndex<F>();
     labelIndex = new HashIndex<L>();
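
The makeWeightCounter() refactor above is forced by a language rule: a delegating this(...) call must be the first statement of a Java constructor, so the body of the old init() helper could not simply run before the delegation. Hoisting the preprocessing into a static method evaluated inside the this(...) argument list is the standard workaround. The pattern in miniature (an illustrative class, not part of CoreNLP):

class ChainingSketch {
  private final int value;

  // Work that must happen before delegating lives in a static helper,
  // because this(...) has to be the first statement of a constructor.
  private static int preprocess(String s) {
    return s.length();
  }

  ChainingSketch(String s) {
    this(preprocess(s));  // legal: the helper runs inside the argument expression
  }

  ChainingSketch(int value) {
    this.value = value;
  }
}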
src/edu/stanford/nlp/classify/RVFDataset.java (+9 −5)

@@ -40,10 +40,8 @@
  * @author Anna Rafferty (various refactoring with GeneralDataset/Dataset)
  * @author Sarah Spikes ([email protected]) (Templatization)
  *
- * @param <L>
- *          The type of the labels in the Dataset
- * @param <F>
- *          The type of the features in the Dataset
+ * @param <L> The type of the labels in the Dataset
+ * @param <F> The type of the features in the Dataset
  */
 public class RVFDataset<L, F> extends GeneralDataset<L, F> { // implements Iterable<RVFDatum<L, F>>, Serializable
 

@@ -656,6 +654,7 @@ public void applyFeatureCountThreshold(int k) {
    * Applies a feature max count threshold to the RVFDataset. All features that
    * occur greater than <i>k</i> times are expunged.
    */
+  @Override
   public void applyFeatureMaxCountThreshold(int k) {
     float[] counts = getFeatureCounts();
     HashIndex<F> newFeatureIndex = new HashIndex<F>();

@@ -789,6 +788,7 @@ public void writeSVMLightFormat(PrintWriter writer) {
    * {@link #printSparseFeatureMatrix(PrintWriter)} to {@link System#out
    * System.out}.
    */
+  @Override
   public void printSparseFeatureMatrix() {
     printSparseFeatureMatrix(new PrintWriter(System.out, true));
   }

@@ -797,6 +797,7 @@ public void printSparseFeatureMatrix() {
    * Prints a sparse feature matrix representation of the Dataset. Prints the
    * actual {@link Object#toString()} representations of features.
    */
+  @Override
   public void printSparseFeatureMatrix(PrintWriter pw) {
     String sep = "\t";
     for (int i = 0; i < size; i++) {

@@ -922,6 +923,7 @@ public Iterator<RVFDatum<L, F>> iterator() {
     return new Iterator<RVFDatum<L, F>>() {
       private int index; // = 0;
 
+      @Override
       public boolean hasNext() {
         return this.index < size;
       }

@@ -935,6 +937,7 @@ public RVFDatum<L, F> next() {
         return next;
       }
 
+      @Override
       public void remove() {
         throw new UnsupportedOperationException();
       }

@@ -946,7 +949,7 @@ public void remove() {
    * need to randomize the values as well.
    */
   @Override
-  public void randomize(int randomSeed) {
+  public void randomize(long randomSeed) {
     Random rand = new Random(randomSeed);
     for (int j = size - 1; j > 0; j--) {
       int randIndex = rand.nextInt(j);

@@ -963,4 +966,5 @@ public void randomize(int randomSeed) {
       values[j] = tmpv;
     }
   }
+
 }
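
The randomize(long) override above exists because RVFDataset stores each datum's features and their real values in parallel arrays; both must be permuted with the same sequence of swaps, or feature rows would end up paired with the wrong value rows. A standalone sketch of shuffling two parallel arrays in lockstep (the array and class names are illustrative, and this sketch uses the textbook nextInt(j + 1) bound):

import java.util.Random;

class ParallelShuffleSketch {
  // Swap rows of `data` and `values` identically so row i of one
  // stays aligned with row i of the other after the permutation.
  static void shuffleInLockstep(int[][] data, double[][] values, long seed) {
    Random rand = new Random(seed);
    for (int j = data.length - 1; j > 0; j--) {
      int k = rand.nextInt(j + 1);
      int[] tmpData = data[k];
      data[k] = data[j];
      data[j] = tmpData;
      double[] tmpValues = values[k];
      values[k] = values[j];
      values[j] = tmpValues;
    }
  }
}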
