Commit 19f24eb
sebschu authored and Stanford NLP committed
Merge branch 'master' into mt-preordering
1 parent 239dd56 · commit 19f24eb

52 files changed, +1792 −921 lines changed

CONTRIBUTING.md (+2 −2)

@@ -10,7 +10,7 @@ In order for us to continue to be able to dual-license Stanford CoreNLP, we need
 Therefore, we can accept contributions on any of the following terms:
 * If your contribution is a bug fix of 6 lines or less of new code, we will accept it on the basis that both you and us regard the contribution as de minimis, and not requiring further hassle.
 * You can declare that the contribution is in the public domain (in your commit message or pull request).
-* You can make your contribution available under a non-restrictive open source licensing, such as the Revised (or 3-clause) BSD license, with appropriate licensing information included with the submitted code.
-* You can sign and return to us a contributor license agreement, explicitly licensing us to be able to use the code. Contact us at: [email protected] .
+* You can make your contribution available under a non-restrictive open source license, such as the Revised (or 3-clause) BSD license, with appropriate licensing information included with the submitted code.
+* You can sign and return to us a contributor license agreement (CLA), explicitly licensing us to be able to use the code. You can find these agreements at http://nlp.stanford.edu/software/CLA/ . You can send them to us or contact us at: [email protected] .
 
 You should do development against our master branch. You should make sure that all unit tests still pass. (In general, you will not be able to run our integration tests, since they rely on resources in our filesystem.)
+9 −9

@@ -1,15 +1,15 @@
 CONLL EVAL SUMMARY (Before COREF)
-Identification of Mentions: Recall: (12407 / 14291) 86.81%  Precision: (12407 / 34999) 35.44%  F1: 50.34%
+Identification of Mentions: Recall: (12407 / 14291) 86.81%  Precision: (12407 / 34999) 35.44%  F1: 50.34%
 
 CONLL EVAL SUMMARY (After COREF)
-METRIC muc:Coreference: Recall: (6260 / 10539) 59.39%  Precision: (6260 / 10027) 62.43%  F1: 60.87%
-METRIC bcub:Coreference: Recall: (12379.37 / 18298) 67.65%  Precision: (13598.84 / 18298) 74.31%  F1: 70.83%
-METRIC ceafm:Coreference: Recall: (10894 / 18298) 59.53%  Precision: (10894 / 18298) 59.53%  F1: 59.53%
-METRIC ceafe:Coreference: Recall: (3811.5 / 7759) 49.12%  Precision: (3811.5 / 8271) 46.08%  F1: 47.55%
-METRIC blanc:Coreference links: Recall: (25257 / 54427) 46.4%  Precision: (25257 / 40544) 62.29%  F1: 53.18%
-Non-coreference links: Recall: (922975 / 938262) 98.37%  Precision: (922975 / 952145) 96.93%  F1: 97.64%
-BLANC: Recall: (0.72 / 1) 72.38%  Precision: (0.8 / 1) 79.61%  F1: 75.41%
+METRIC muc:Coreference: Recall: (6256 / 10539) 59.36%  Precision: (6256 / 10078) 62.07%  F1: 60.68%
+METRIC bcub:Coreference: Recall: (12462.33 / 18385) 67.78%  Precision: (13629.92 / 18385) 74.13%  F1: 70.81%
+METRIC ceafm:Coreference: Recall: (10928 / 18385) 59.43%  Precision: (10928 / 18385) 59.43%  F1: 59.43%
+METRIC ceafe:Coreference: Recall: (3832.95 / 7846) 48.85%  Precision: (3832.95 / 8307) 46.14%  F1: 47.45%
+METRIC blanc:Coreference links: Recall: (25245 / 54427) 46.38%  Precision: (25245 / 40608) 62.16%  F1: 53.12%
+Non-coreference links: Recall: (932068 / 947431) 98.37%  Precision: (932068 / 961250) 96.96%  F1: 97.66%
+BLANC: Recall: (0.72 / 1) 72.38%  Precision: (0.8 / 1) 79.56%  F1: 75.39%
 
-Final conll score ((muc+bcub+ceafe)/3) = 59.75
+Final conll score ((muc+bcub+ceafe)/3) = 59.65
 Final score (pairwise) Precision = 0.57
 done
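
Arithmetic check on the final changed line: the three updated F1 scores above give (60.68 + 70.81 + 47.45) / 3 = 178.94 / 3 ≈ 59.65, matching the new final CoNLL score, and the old values give (60.87 + 70.83 + 47.55) / 3 = 179.25 / 3 = 59.75 in the same way. This diff is simply the expected test output tracking slightly different coref numbers after the merge.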

itest/src/edu/stanford/nlp/pipeline/DeterministicCorefAnnotatorITest.java (+2 −1)

@@ -12,12 +12,12 @@
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
-import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
 import edu.stanford.nlp.util.CoreMap;
 
 public class DeterministicCorefAnnotatorITest extends TestCase {
   private static AnnotationPipeline pipeline;
 
+  @Override
   public void setUp() throws Exception {
     synchronized(DeterministicCorefAnnotatorITest.class) {
       pipeline = new AnnotationPipeline();

@@ -131,4 +131,5 @@ public static void main(String[] args) throws Exception {
     DeterministicCorefAnnotatorITest itest = new DeterministicCorefAnnotatorITest();
     itest.testDeterministicCorefAnnotator();
   }
+
 }
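
Two small cleanups here: the duplicated import of CorefCoreAnnotations is dropped, and setUp() gains an @Override annotation. With JUnit 3's TestCase the annotation is cheap insurance: a misspelled setup method would silently never run, whereas @Override turns the typo into a compile error. A minimal sketch of the failure mode it guards against (this class is illustrative, not part of the repository):

import junit.framework.TestCase;

public class OverrideDemoTest extends TestCase {
  @Override
  public void setUp() throws Exception {  // compiles: genuinely overrides TestCase.setUp()
    super.setUp();
  }

  // @Override
  // public void setup() throws Exception { }  // would not compile: "setup" overrides nothing
}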

src/edu/stanford/nlp/classify/GeneralDataset.java (+15 −9)

@@ -243,9 +243,9 @@ public void addAll(Iterable<? extends Datum<L,F>> data) {
    * @param fold The number of this fold (must be between 0 and (numFolds - 1)
    * @param numFolds The number of folds to divide the data into (must be greater than or equal to the
    *          size of the data set)
-   * @return A Pair of data sets, the first being the remainder of size this.size() - (end-start)
-   *         and the second usually being of size this.size() / numFolds but with the last fold of size
-   *         this.size() - (this.size() / numFolds * (numFolds - 1)
+   * @return A Pair of data sets, the first being roughly (numFolds-1)/numFolds of the data items
+   *         (for use as training data_, and the second being 1/numFolds of the data, taken from the
+   *         fold<sup>th</sup> part of the data (for use as devTest data)
    */
   public Pair<GeneralDataset<L, F>, GeneralDataset<L, F>> splitOutFold(int fold, int numFolds) {
     if (numFolds < 2 || numFolds > size() || fold < 0 || fold >= numFolds) {
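
The rewritten @return tag above describes the usual cross-validation contract: fold k of numFolds comes back as held-out data, everything else as training data. A minimal sketch of how a caller might drive it, assuming an already-populated dataset (the wrapper class and variable names are illustrative):

import edu.stanford.nlp.classify.GeneralDataset;
import edu.stanford.nlp.util.Pair;

class CrossValidationSketch {
  // Hypothetical k-fold loop built on splitOutFold().
  static <L, F> void crossValidate(GeneralDataset<L, F> dataset, int numFolds) {
    for (int fold = 0; fold < numFolds; fold++) {
      Pair<GeneralDataset<L, F>, GeneralDataset<L, F>> split =
          dataset.splitOutFold(fold, numFolds);
      GeneralDataset<L, F> train = split.first();     // roughly (numFolds-1)/numFolds of the items
      GeneralDataset<L, F> devTest = split.second();  // the fold-th 1/numFolds slice
      // ... train a classifier on `train`, evaluate it on `devTest` ...
    }
  }
}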
@@ -296,14 +296,20 @@ protected double[][] trimToSize(double[][] i) {
    * Randomizes the data array in place.
    * Note: this cannot change the values array or the datum weights,
    * so redefine this for RVFDataset and WeightedDataset!
-   * @param randomSeed
+   * This uses the Fisher-Yates (or Durstenfeld-Knuth) shuffle, which is unbiased.
+   * The same algorithm is used by shuffle() in j.u.Collections, and so you should get compatible
+   * results if using it on a Collection with the same seed (as of JDK1.7, at least).
+   *
+   * @param randomSeed A seed for the Random object (allows you to reproduce the same ordering)
    */
-  public void randomize(int randomSeed) {
+  // todo: Probably should be renamed 'shuffle' to be consistent with Java Collections API
+  public void randomize(long randomSeed) {
     Random rand = new Random(randomSeed);
-    for(int j = size - 1; j > 0; j --){
+    for (int j = size - 1; j > 0; j--) {
+      // swap each item with some lower numbered item
       int randIndex = rand.nextInt(j);
 
-      int [] tmp = data[randIndex];
+      int[] tmp = data[randIndex];
       data[randIndex] = data[j];
       data[j] = tmp;
 
@@ -317,9 +323,9 @@ public GeneralDataset<L,F> sampleDataset(int randomSeed, double sampleFrac, bool
     int sampleSize = (int)(this.size()*sampleFrac);
     Random rand = new Random(randomSeed);
     GeneralDataset<L,F> subset;
-    if(this instanceof RVFDataset)
+    if (this instanceof RVFDataset) {
       subset = new RVFDataset<L,F>();
-    else if (this instanceof Dataset) {
+    } else if (this instanceof Dataset) {
       subset = new Dataset<L,F>();
     }
     else {
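
A note on the randomize() hunk above: the textbook Fisher-Yates (Durstenfeld) shuffle that the new javadoc cites, and that java.util.Collections.shuffle implements, draws the swap index from the inclusive range [0, j], i.e. nextInt(j + 1), so that element j may also stay in place; drawing from nextInt(j) instead never leaves an element where it started, which biases the permutation. A minimal standalone sketch of the textbook version for comparison (this is not the committed code):

import java.util.Random;

class ShuffleSketch {
  // Textbook Fisher-Yates (Durstenfeld) shuffle of an int array.
  static void shuffle(int[] a, long seed) {
    Random rand = new Random(seed);
    for (int j = a.length - 1; j > 0; j--) {
      int k = rand.nextInt(j + 1);  // uniform over 0..j inclusive, so j may stay put
      int tmp = a[k];
      a[k] = a[j];
      a[j] = tmp;
    }
  }
}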

src/edu/stanford/nlp/classify/LinearClassifier.java (+52 −39)

@@ -47,7 +47,7 @@
 /**
  * Implements a multiclass linear classifier. At classification time this
  * can be any generalized linear model classifier (such as a perceptron,
- * naive logistic regression, SVM).
+ * a maxent classifier (softmax logistic regression), or an SVM).
  *
  * @author Dan Klein
  * @author Jenny Finkel

@@ -180,6 +180,7 @@ public double scoreOf(Datum<L, F> example, L label) {
    * values the score (unnormalized log probability) of each class
    * for an RVFDatum.
    */
+  @Override
   @Deprecated
   public Counter<L> scoresOf(RVFDatum<L, F> example) {
     Counter<L> scores = new ClassicCounter<L>();

@@ -236,8 +237,8 @@ private double scoreOfRVFDatum(RVFDatum<L, F> example, L label) {
    * Doesn't consider a value for each feature.
    */
   private double scoreOf(int[] feats, L label) {
-    assert labelIndex.indexOf(label, false) >= 0;
     int iLabel = labelIndex.indexOf(label);
+    assert iLabel >= 0;
     double score = 0.0;
     for (int feat : feats) {
       score += weight(feat, iLabel);

@@ -297,6 +298,7 @@ public Counter<L> probabilityOf(RVFDatum<L, F> example) {
    * that class for a certain example.
    * Looking at the the sum of e^v for each count v, should be 1.0.
    */
+  @Override
   public Counter<L> logProbabilityOf(Datum<L, F> example) {
     if(example instanceof RVFDatum<?, ?>)return logProbabilityOfRVFDatum((RVFDatum<L,F>)example);
     Counter<L> scores = scoresOf(example);

@@ -1182,9 +1184,9 @@ public <T> void justificationOf(Datum<L, F> example, PrintWriter pw,
   /**
    * This method returns a map from each label to a counter of feature weights for that label.
    * Useful for feature analysis.
+   *
    * @return a map of counters
    */
-
   public Map<L,Counter<F>> weightsAsMapOfCounters() {
     Map<L,Counter<F>> mapOfCounters = Generics.newHashMap();
     for(L label : labelIndex){

@@ -1239,39 +1241,41 @@ public Counter<L> scoresOf(Datum<L, F> example, Collection<L> possibleLabels) {
     return scores;
   }
 
+  /* -- looks like a failed attempt at micro-optimization --
 
   public L experimentalClassOf(Datum<L,F> example) {
-    if(example instanceof RVFDatum<?, ?>) {
-      throw new UnsupportedOperationException();
-    }
-
-    int labelCount = weights[0].length;
-    //System.out.printf("labelCount: %d\n", labelCount);
-    Collection<F> features = example.asFeatures();
-
-    int[] featureInts = new int[features.size()];
-    int fI = 0;
-    for (F feature : features) {
-      featureInts[fI++] = featureIndex.indexOf(feature);
-    }
-    //System.out.println("Features: "+features);
-    double bestScore = Double.NEGATIVE_INFINITY;
-    int bestI = 0;
-    for (int i = 0; i < labelCount; i++) {
-      double score = 0;
-      for (int j = 0; j < featureInts.length; j++) {
-        if (featureInts[j] < 0) continue;
-        score += weights[featureInts[j]][i];
-      }
-      if (score > bestScore) {
-        bestI = i;
-        bestScore = score;
-      }
-      //System.out.printf("Score: %s(%d): %e\n", labelIndex.get(i), i, score);
-    }
-    //System.out.printf("label(%d): %s\n", bestI, labelIndex.get(bestI));;
-    return labelIndex.get(bestI);
+    if(example instanceof RVFDatum<?, ?>) {
+      throw new UnsupportedOperationException();
+    }
+
+    int labelCount = weights[0].length;
+    //System.out.printf("labelCount: %d\n", labelCount);
+    Collection<F> features = example.asFeatures();
+
+    int[] featureInts = new int[features.size()];
+    int fI = 0;
+    for (F feature : features) {
+      featureInts[fI++] = featureIndex.indexOf(feature);
+    }
+    //System.out.println("Features: "+features);
+    double bestScore = Double.NEGATIVE_INFINITY;
+    int bestI = 0;
+    for (int i = 0; i < labelCount; i++) {
+      double score = 0;
+      for (int j = 0; j < featureInts.length; j++) {
+        if (featureInts[j] < 0) continue;
+        score += weights[featureInts[j]][i];
+      }
+      if (score > bestScore) {
+        bestI = i;
+        bestScore = score;
+      }
+      //System.out.printf("Score: %s(%d): %e\n", labelIndex.get(i), i, score);
+    }
+    //System.out.printf("label(%d): %s\n", bestI, labelIndex.get(bestI));;
+    return labelIndex.get(bestI);
   }
+  -- */
 
   @Override
   public L classOf(Datum<L, F> example) {

@@ -1286,12 +1290,20 @@ private L classOfRVFDatum(RVFDatum<L, F> example) {
     return Counters.argmax(scores);
   }
 
+  @Override
   @Deprecated
   public L classOf(RVFDatum<L, F> example) {
     Counter<L> scores = scoresOf(example);
     return Counters.argmax(scores);
   }
 
+  /** Make a linear classifier from the parameters. The parameters are used, not copied.
+   *
+   * @param weights The parameters of the classifier. The first index is the
+   *     featureIndex value and second index is the labelIndex value.
+   * @param featureIndex An index from F to integers used to index the features in the weights array
+   * @param labelIndex An index from L to integers used to index the labels in the weights array
+   */
   public LinearClassifier(double[][] weights, Index<F> featureIndex, Index<L> labelIndex) {
     this.featureIndex = featureIndex;
     this.labelIndex = labelIndex;
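
The new constructor javadoc pins down the weight-matrix orientation, weights[featureIndexValue][labelIndexValue], which is easy to get backwards. A minimal sketch of exercising the constructor; the class name, features, labels, and weight values here are invented for illustration:

import java.util.Arrays;
import edu.stanford.nlp.classify.LinearClassifier;
import edu.stanford.nlp.ling.BasicDatum;
import edu.stanford.nlp.util.HashIndex;
import edu.stanford.nlp.util.Index;

public class TinyLinearClassifierDemo {
  public static void main(String[] args) {
    Index<String> featureIndex = new HashIndex<String>();
    featureIndex.add("hot");   // feature 0
    featureIndex.add("cold");  // feature 1
    Index<String> labelIndex = new HashIndex<String>();
    labelIndex.add("summer");  // label 0
    labelIndex.add("winter");  // label 1
    // First index is the feature, second the label, per the javadoc above.
    double[][] weights = {
        {  2.0, -2.0 },  // "hot" pushes toward summer
        { -2.0,  2.0 },  // "cold" pushes toward winter
    };
    LinearClassifier<String, String> classifier =
        new LinearClassifier<String, String>(weights, featureIndex, labelIndex);
    System.out.println(classifier.classOf(
        new BasicDatum<String, String>(Arrays.asList("hot"))));  // expected: summer
  }
}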
@@ -1300,6 +1312,7 @@ public LinearClassifier(double[][] weights, Index<F> featureIndex, Index<L> labe
     Arrays.fill(thresholds, 0.0);
   }
 
+  // todo: This is unused and seems broken (ignores passed in thresholds)
   public LinearClassifier(double[][] weights, Index<F> featureIndex, Index<L> labelIndex,
                           double[] thresholds) throws Exception {
     this.featureIndex = featureIndex;

@@ -1315,26 +1328,26 @@ public LinearClassifier(double[][] weights, Index<F> featureIndex, Index<L> labe
     Arrays.fill(thresholds, 0.0);
   }
 
-  public LinearClassifier(double[] weights, Index<Pair<F, L>> weightIndex) {
+  private static <F, L> Counter<Pair<F, L>> makeWeightCounter(double[] weights, Index<Pair<F, L>> weightIndex) {
     Counter<Pair<F,L>> weightCounter = new ClassicCounter<Pair<F,L>>();
     for (int i = 0; i < weightIndex.size(); i++) {
       if (weights[i] == 0) {
         continue; // no need to save 0 weights
       }
       weightCounter.setCount(weightIndex.get(i), weights[i]);
     }
-    init(weightCounter, new ClassicCounter<L>());
+    return weightCounter;
+  }
+
+  public LinearClassifier(double[] weights, Index<Pair<F, L>> weightIndex) {
+    this(makeWeightCounter(weights, weightIndex));
   }
 
   public LinearClassifier(Counter<? extends Pair<F, L>> weightCounter) {
     this(weightCounter, new ClassicCounter<L>());
   }
 
   public LinearClassifier(Counter<? extends Pair<F, L>> weightCounter, Counter<L> thresholdsC) {
-    init(weightCounter,thresholdsC);
-  }
-
-  private void init(Counter<? extends Pair<F, L>> weightCounter, Counter<L> thresholdsC) {
     Collection<? extends Pair<F, L>> keys = weightCounter.keySet();
     featureIndex = new HashIndex<F>();
     labelIndex = new HashIndex<L>();
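
The makeWeightCounter() refactor above is forced by a language rule: a delegating this(...) call must be the first statement of a Java constructor, so the body of the old init() helper could not simply run before the delegation. Hoisting the preprocessing into a static method evaluated inside the this(...) argument list is the standard workaround. The pattern in miniature (an illustrative class, not part of CoreNLP):

class ChainingSketch {
  private final int value;

  // Work that must happen before delegating lives in a static helper,
  // because this(...) has to be the first statement of a constructor.
  private static int preprocess(String s) {
    return s.length();
  }

  ChainingSketch(String s) {
    this(preprocess(s));  // legal: the helper runs inside the argument expression
  }

  ChainingSketch(int value) {
    this.value = value;
  }
}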
src/edu/stanford/nlp/classify/RVFDataset.java (+9 −5)

@@ -40,10 +40,8 @@
  * @author Anna Rafferty (various refactoring with GeneralDataset/Dataset)
  * @author Sarah Spikes ([email protected]) (Templatization)
  *
- * @param <L>
- *          The type of the labels in the Dataset
- * @param <F>
- *          The type of the features in the Dataset
+ * @param <L> The type of the labels in the Dataset
+ * @param <F> The type of the features in the Dataset
  */
 public class RVFDataset<L, F> extends GeneralDataset<L, F> { // implements Iterable<RVFDatum<L, F>>, Serializable
 

@@ -656,6 +654,7 @@ public void applyFeatureCountThreshold(int k) {
    * Applies a feature max count threshold to the RVFDataset. All features that
    * occur greater than <i>k</i> times are expunged.
    */
+  @Override
   public void applyFeatureMaxCountThreshold(int k) {
     float[] counts = getFeatureCounts();
     HashIndex<F> newFeatureIndex = new HashIndex<F>();

@@ -789,6 +788,7 @@ public void writeSVMLightFormat(PrintWriter writer) {
    * {@link #printSparseFeatureMatrix(PrintWriter)} to {@link System#out
    * System.out}.
    */
+  @Override
   public void printSparseFeatureMatrix() {
     printSparseFeatureMatrix(new PrintWriter(System.out, true));
   }

@@ -797,6 +797,7 @@ public void printSparseFeatureMatrix() {
    * Prints a sparse feature matrix representation of the Dataset. Prints the
    * actual {@link Object#toString()} representations of features.
    */
+  @Override
   public void printSparseFeatureMatrix(PrintWriter pw) {
     String sep = "\t";
     for (int i = 0; i < size; i++) {

@@ -922,6 +923,7 @@ public Iterator<RVFDatum<L, F>> iterator() {
     return new Iterator<RVFDatum<L, F>>() {
       private int index; // = 0;
 
+      @Override
       public boolean hasNext() {
         return this.index < size;
       }

@@ -935,6 +937,7 @@ public RVFDatum<L, F> next() {
         return next;
       }
 
+      @Override
       public void remove() {
         throw new UnsupportedOperationException();
       }

@@ -946,7 +949,7 @@ public void remove() {
    * need to randomize the values as well.
    */
   @Override
-  public void randomize(int randomSeed) {
+  public void randomize(long randomSeed) {
     Random rand = new Random(randomSeed);
     for (int j = size - 1; j > 0; j--) {
       int randIndex = rand.nextInt(j);

@@ -963,4 +966,5 @@ public void randomize(int randomSeed) {
       values[j] = tmpv;
     }
   }
+
 }
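
The randomize(long) override above exists because RVFDataset stores each datum's features and their real values in parallel arrays; both must be permuted with the same sequence of swaps, or feature rows would end up paired with the wrong value rows. A standalone sketch of shuffling two parallel arrays in lockstep (the array and class names are illustrative, and this sketch uses the textbook nextInt(j + 1) bound):

import java.util.Random;

class ParallelShuffleSketch {
  // Swap rows of `data` and `values` identically so row i of one
  // stays aligned with row i of the other after the permutation.
  static void shuffleInLockstep(int[][] data, double[][] values, long seed) {
    Random rand = new Random(seed);
    for (int j = data.length - 1; j > 0; j--) {
      int k = rand.nextInt(j + 1);
      int[] tmpData = data[k];
      data[k] = data[j];
      data[j] = tmpData;
      double[] tmpValues = values[k];
      values[k] = values[j];
      values[j] = tmpValues;
    }
  }
}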
