Support weight in starspace (facebookresearch#76)
* local changes

* add support for weights

* [going to revert this]

* [new change: add a flag]

* save new flag

* Update README.md

* print new flag
ledw authored Nov 10, 2017
1 parent e180b8c commit eec8249
Showing 17 changed files with 162 additions and 90 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -263,6 +263,7 @@ use <a href="https://github.com/facebookresearch/Starspace/blob/master/examples/
 
 The following arguments are optional:
   -normalizeText      whether to run basic text preprocess for input files [1]
+  -useWeight          whether input file contains weights [0]
   -verbose            verbosity level [0]
   -debug              whether it's in debug mode [0]
   -thread             number of threads [10]
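For context: the parser changes later in this commit (see src/doc_parser.cpp below) read each token as word:weight when -useWeight is set, falling back to a weight of 1.0 when no ':' is present. A hypothetical weighted input line, assuming the default __label__ convention, could look like:

    good:2.0 movie plot:0.5 __label__positive

The weights and label here are illustrative, not taken from the commit.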
8 changes: 4 additions & 4 deletions src/data.cpp
@@ -131,7 +131,7 @@ void InternDataHandler::convert(
 }
 
 void InternDataHandler::getWordExamples(
-    const vector<int32_t>& doc,
+    const vector<Base>& doc,
     vector<ParseResults>& rslts) const {
 
   rslts.clear();
@@ -206,7 +206,7 @@ void InternDataHandler::getNextKExamples(int K, vector<ParseResults>& c) {
 
 // Randomly sample one example and randomly sample a label from this example
 // The result is usually used as negative samples in training
-void InternDataHandler::getRandomRHS(vector<int32_t>& results) const {
+void InternDataHandler::getRandomRHS(vector<Base>& results) const {
   assert(size_ > 0);
   results.clear();
   auto& ex = examples_[rand() % size_];
@@ -231,10 +231,10 @@ void InternDataHandler::save(std::ostream& out) {
   out << "data size : " << size_ << endl;
   for (auto& example : examples_) {
     out << "lhs : ";
-    for (auto t : example.LHSTokens) {out << t << ' ';}
+    for (auto t : example.LHSTokens) {out << t.first << ':' << t.second << ' ';}
     out << endl;
     out << "rhs : ";
-    for (auto t : example.RHSTokens) {out << t << ' ';}
+    for (auto t : example.RHSTokens) {out << t.first << ':' << t.second << ' ';}
     out << endl;
   }
 }
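Throughout the commit, token containers change from vector<int32_t> to vector<Base>. The usage above (t.first, t.second, and make_pair(wid, weight) later in doc_parser.cpp) implies Base is an (index, weight) pair. A minimal sketch of the new serialization under that assumption, with simplified stand-ins for the StarSpace types:

    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    // Simplified stand-ins for illustration; the real definitions live in
    // the StarSpace headers.
    using Base = std::pair<int32_t, float>;  // (dictionary id, weight)
    struct ParseResults {
      std::vector<Base> LHSTokens;
      std::vector<Base> RHSTokens;
    };

    // Mirrors the updated InternDataHandler::save loop: each token is now
    // written as id:weight instead of a bare id.
    void saveExample(std::ostream& out, const ParseResults& example) {
      out << "lhs : ";
      for (auto t : example.LHSTokens) { out << t.first << ':' << t.second << ' '; }
      out << '\n' << "rhs : ";
      for (auto t : example.RHSTokens) { out << t.first << ':' << t.second << ' '; }
      out << '\n';
    }

    int main() {
      ParseResults ex{{{3, 1.0f}, {7, 0.5f}}, {{42, 2.0f}}};
      saveExample(std::cout, ex);  // prints "lhs : 3:1 7:0.5" and "rhs : 42:2"
    }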
4 changes: 2 additions & 2 deletions src/data.h
@@ -26,14 +26,14 @@ class InternDataHandler {
 
   virtual void convert(const ParseResults& example, ParseResults& rslt) const;
 
-  virtual void getRandomRHS(std::vector<int32_t>& results) const;
+  virtual void getRandomRHS(std::vector<Base>& results) const;
 
   virtual void save(std::ostream& out);
 
   virtual void getWordExamples(int idx, std::vector<ParseResults>& rslt) const;
 
   void getWordExamples(
-      const std::vector<int32_t>& doc,
+      const std::vector<Base>& doc,
       std::vector<ParseResults>& rslt) const;
 
   void addExample(const ParseResults& example);
10 changes: 5 additions & 5 deletions src/doc_data.cpp
@@ -65,8 +65,8 @@ void LayerDataHandler::loadFromFile(
 }
 
 void LayerDataHandler::insert(
-    vector<int32_t>& rslt,
-    const vector<int32_t>& ex,
+    vector<Base>& rslt,
+    const vector<Base>& ex,
     float dropout) const {
 
   if (dropout < 1e-8) {
@@ -156,7 +156,7 @@ void LayerDataHandler::convert(
   }
 }
 
-void LayerDataHandler::getRandomRHS(vector<int32_t>& result) const {
+void LayerDataHandler::getRandomRHS(vector<Base>& result) const {
   assert(size_ > 0);
   auto& ex = examples_[rand() % size_];
   int r = rand() % ex.RHSFeatures.size();
@@ -183,11 +183,11 @@ void LayerDataHandler::save(ostream& out) {
   for (auto example : examples_) {
     out << "lhs: ";
     for (auto t : example.LHSTokens) {
-      out << t << ' ';
+      out << t.first << ':' << t.second << ' ';
     }
     out << "\nrhs: ";
    for (auto feat : example.RHSFeatures) {
-      for (auto r : feat) { cout << r << ' '; }
+      for (auto r : feat) { cout << r.first << ':' << r.second << ' '; }
       out << "\t";
     }
     out << endl;
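One note on the hunk above: the added RHS line streams to cout while the rest of save writes to its out parameter; that inconsistency predates this commit and is carried over here. A hypothetical cleanup (not part of the commit), assuming the same Base pair type as above:

    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    using Base = std::pair<int32_t, float>;  // assumed (id, weight) pair

    // Routes RHS features to the same stream as the rest of save(),
    // printing id:weight pairs, with feature groups separated by tabs.
    void saveRHSFeatures(std::ostream& out,
                         const std::vector<std::vector<Base>>& rhsFeatures) {
      for (const auto& feat : rhsFeatures) {
        for (const auto& r : feat) { out << r.first << ':' << r.second << ' '; }
        out << '\t';
      }
      out << '\n';
    }

    int main() {
      saveRHSFeatures(std::cout, {{{1, 1.0f}, {2, 0.5f}}, {{3, 2.0f}}});
    }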
6 changes: 3 additions & 3 deletions src/doc_data.h
@@ -36,14 +36,14 @@ class LayerDataHandler : public InternDataHandler {
   void loadFromFile(const std::string& file,
                     std::shared_ptr<DataParser> parser) override;
 
-  void getRandomRHS(std::vector<int32_t>& results) const override;
+  void getRandomRHS(std::vector<Base>& results) const override;
 
   void save(std::ostream& out) override;
 
  private:
   void insert(
-      std::vector<int32_t>& rslt,
-      const std::vector<int32_t>& ex,
+      std::vector<Base>& rslt,
+      const std::vector<Base>& ex,
       float dropout = 0.0) const;
 
 };
20 changes: 15 additions & 5 deletions src/doc_parser.cpp
@@ -26,20 +26,30 @@ LayerDataParser::LayerDataParser(
 
 bool LayerDataParser::parse(
     string& s,
-    vector<int32_t>& feats,
+    vector<Base>& feats,
     const string& sep) {
 
   // split each part into tokens
   vector<string> tokens;
   boost::split(tokens, s, boost::is_any_of(string(sep)));
 
   for (auto token : tokens) {
+    string t = token;
+    float weight = 1.0;
+    if (args_->useWeight) {
+      std::size_t pos = token.find(":");
+      if (pos != std::string::npos) {
+        t = token.substr(0, pos);
+        weight = atof(token.substr(pos + 1).c_str());
+      }
+    }
+
     if (args_->normalizeText) {
-      normalize_text(token);
+      normalize_text(t);
     }
-    int32_t wid = dict_->getId(token);
+    int32_t wid = dict_->getId(t);
     if (wid != -1) {
-      feats.push_back(wid);
+      feats.push_back(make_pair(wid, weight));
     }
   }
 
@@ -64,7 +74,7 @@ bool LayerDataParser::parse(
     start_idx = 1;
   }
   for (int i = start_idx; i < parts.size(); i++) {
-    vector<int32_t> feats;
+    vector<Base> feats;
     if (parse(parts[i], feats)) {
       rslt.RHSFeatures.push_back(feats);
     }
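The hunk above is the core of the feature: with -useWeight set, a token of the form word:weight is split on the first ':' and the weight parsed with atof; otherwise the weight defaults to 1.0. A self-contained sketch of that logic with hypothetical names (no dictionary lookup or args object):

    #include <cstdlib>
    #include <iostream>
    #include <string>
    #include <utility>

    // Standalone version of the token handling added to LayerDataParser::parse:
    // split "word:weight" on the first ':' when weights are enabled, defaulting
    // the weight to 1.0 otherwise.
    std::pair<std::string, float> splitWeightedToken(const std::string& token,
                                                     bool useWeight) {
      std::string word = token;
      float weight = 1.0f;
      if (useWeight) {
        std::size_t pos = token.find(':');
        if (pos != std::string::npos) {
          word = token.substr(0, pos);
          weight = std::atof(token.substr(pos + 1).c_str());
        }
      }
      return {word, weight};
    }

    int main() {
      auto a = splitWeightedToken("apple:0.5", true);   // ("apple", 0.5)
      auto b = splitWeightedToken("apple", true);       // ("apple", 1.0)
      auto c = splitWeightedToken("apple:0.5", false);  // ("apple:0.5", 1.0)
      std::cout << a.first << ' ' << a.second << '\n'
                << b.first << ' ' << b.second << '\n'
                << c.first << ' ' << c.second << '\n';
    }

One design point visible in the diff: tokens without an explicit weight keep weight 1.0, so unweighted input files remain valid even with -useWeight 1.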
2 changes: 1 addition & 1 deletion src/doc_parser.h
@@ -31,7 +31,7 @@ class LayerDataParser : public DataParser {
 
   bool parse(
       std::string& line,
-      std::vector<int32_t>& rslt,
+      std::vector<Base>& rslt,
       const std::string& sep=" ");
 
   bool parse(
63 changes: 37 additions & 26 deletions src/model.cpp
@@ -84,19 +84,30 @@ Real norm2(Matrix<Real>::Row a) {
   return std::max(std::numeric_limits<Real>::epsilon(), retval);
 }
 
-Matrix<Real> EmbedModel::projectRHS(std::vector<int32_t> ws) {
+// consistent accessor methods for straight indices and index-weight pairs
+int32_t index(int32_t idx) { return idx; }
+int32_t index(std::pair<int32_t, Real> idxWeightPair) {
+  return idxWeightPair.first;
+}
+
+constexpr float weight(int32_t idx) { return 1.0; }
+float weight(std::pair<int32_t, Real> idxWeightPair) {
+  return idxWeightPair.second;
+}
+
+Matrix<Real> EmbedModel::projectRHS(const std::vector<Base>& ws) {
   Matrix<Real> retval;
   projectRHS(ws, retval);
   return retval;
 }
 
-Matrix<Real> EmbedModel::projectLHS(std::vector<int32_t> ws) {
+Matrix<Real> EmbedModel::projectLHS(const std::vector<Base>& ws) {
   Matrix<Real> retval;
   projectLHS(ws, retval);
   return retval;
 }
 
-void EmbedModel::projectLHS(std::vector<int32_t> ws, Matrix<Real>& retval) {
+void EmbedModel::projectLHS(const std::vector<Base>& ws, Matrix<Real>& retval) {
   LHSEmbeddings_->forward(ws, retval);
   if (ws.size()) {
     auto norm = (args_->similarity == "dot") ?
@@ -105,7 +116,7 @@ void EmbedModel::projectLHS(std::vector<int32_t> ws, Matrix<Real>& retval) {
   }
 }
 
-void EmbedModel::projectRHS(std::vector<int32_t> ws, Matrix<Real>& retval) {
+void EmbedModel::projectRHS(const std::vector<Base>& ws, Matrix<Real>& retval) {
   RHSEmbeddings_->forward(ws, retval);
   if (ws.size()) {
     auto norm = (args_->similarity == "dot") ?
@@ -172,10 +183,10 @@ Real EmbedModel::train(shared_ptr<InternDataHandler> data,
       continue;
     }
 
-    if (args_->debug) {
-      auto printVec = [&](const vector<int32_t>& vec) {
+    if (amMaster && args_->debug) {
+      auto printVec = [&](const vector<Base>& vec) {
        cout << "vec : ";
-        for (auto v : vec) {cout << v << ' ';}
+        for (auto v : vec) {cout << v.first << ':' << v.second << ' ';}
        cout << endl;
       };
 
@@ -304,8 +315,8 @@ void EmbedModel::normalize(Matrix<float>::Row row, double maxNorm) {
 }
 
 float EmbedModel::trainOne(shared_ptr<InternDataHandler> data,
-                           const vector<int32_t>& items,
-                           const vector<int32_t>& labels,
+                           const vector<Base>& items,
+                           const vector<Base>& labels,
                            size_t negSearchLimit,
                            Real rate0) {
   if (items.size() == 0) return 0.0; // nothing to learn.
@@ -344,14 +355,14 @@ float EmbedModel::trainOne(shared_ptr<InternDataHandler> data,
   // Select negative examples
   Real loss = 0.0;
   std::vector<Matrix<Real>> negs;
-  std::vector<std::vector<int32_t>> negLabelsBatch;
+  std::vector<std::vector<Base>> negLabelsBatch;
   Matrix<Real> negMean;
   negMean.matrix = zero_matrix<Real>(1, cols);
 
   for (int i = 0; i < negSearchLimit &&
                   negs.size() < args_->maxNegSamples; i++) {
 
-    std::vector<int32_t> negLabels;
+    std::vector<Base> negLabels;
     do {
       data->getRandomRHS(negLabels);
     } while (negLabels == labels);
@@ -407,8 +418,8 @@ float EmbedModel::trainOne(shared_ptr<InternDataHandler> data,
 }
 
 float EmbedModel::trainNLL(shared_ptr<InternDataHandler> data,
-                           const vector<int32_t>& items,
-                           const vector<int32_t>& labels,
+                           const vector<Base>& items,
+                           const vector<Base>& labels,
                            int32_t negSearchLimit,
                            Real rate0) {
   if (items.size() == 0) return 0.0; // nothing to learn.
@@ -426,13 +437,13 @@ float EmbedModel::trainNLL(shared_ptr<InternDataHandler> data,
   auto numClass = args_->negSearchLimit + 1;
   std::vector<Real> prob(numClass);
   std::vector<Matrix<Real>> negClassVec;
-  std::vector<std::vector<int32_t>> negLabelsBatch;
+  std::vector<std::vector<Base>> negLabelsBatch;
 
   prob[0] = dot(lhs, rhsP);
   Real max = prob[0];
 
   for (int i = 1; i < numClass; i++) {
-    std::vector<int32_t> negLabels;
+    std::vector<Base> negLabels;
     do {
       data->getRandomRHS(negLabels);
     } while (negLabels == labels);
@@ -491,9 +502,9 @@ float EmbedModel::trainNLL(shared_ptr<InternDataHandler> data,
 }
 
 void EmbedModel::backward(
-    const vector<int32_t>& items,
-    const vector<int32_t>& labels,
-    const vector<vector<int32_t>>& negLabels,
+    const vector<Base>& items,
+    const vector<Base>& labels,
+    const vector<vector<Base>>& negLabels,
     Matrix<Real>& gradW,
     Matrix<Real>& lhs,
     Real rate_lhs,
@@ -535,21 +546,21 @@ void EmbedModel::backward(
 
   // Update input items.
   for (auto w : items) {
-    auto row = LHSEmbeddings_->row(w);
-    update(row, gradW, rate_lhs, n1, LHSUpdates_, w);
+    auto row = LHSEmbeddings_->row(index(w));
+    update(row, gradW, rate_lhs * weight(w), n1, LHSUpdates_, index(w));
   }
 
   // Update positive example.
-  for (auto label : labels) {
-    auto row = RHSEmbeddings_->row(label);
-    update(row, lhs, rate_rhsP, n2, RHSUpdates_, label);
+  for (auto la : labels) {
+    auto row = RHSEmbeddings_->row(index(la));
+    update(row, lhs, rate_rhsP * weight(la), n2, RHSUpdates_, index(la));
   }
 
   // Update negative example.
   for (size_t i = 0; i < negLabels.size(); i++) {
-    for (auto label : negLabels[i]) {
-      auto row = RHSEmbeddings_->row(label);
-      update(row, lhs, rate_rhsN[i], n2, RHSUpdates_, label);
+    for (auto la : negLabels[i]) {
+      auto row = RHSEmbeddings_->row(index(la));
+      update(row, lhs, rate_rhsN[i] * weight(la), n2, RHSUpdates_, index(la));
     }
   }
 }
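The index/weight overloads added at the top of model.cpp are what let one update code path serve both plain int32_t indices (where weight() is a constexpr 1.0) and weighted Base pairs, scaling each gradient step by the token's weight. A small self-contained illustration of the pattern; the template and names below are hypothetical, not from the commit:

    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    using Real = float;  // assumed, matching the Real alias in the diff

    // Overloads mirroring the commit: a bare index carries an implicit
    // weight of 1.0; an (index, weight) pair carries its own weight.
    int32_t index(int32_t idx) { return idx; }
    int32_t index(std::pair<int32_t, Real> p) { return p.first; }
    constexpr Real weight(int32_t) { return 1.0; }
    Real weight(std::pair<int32_t, Real> p) { return p.second; }

    // Hypothetical update loop: the same body works for both token
    // representations, scaling the learning rate by each token's weight.
    template <typename Token>
    void applyUpdates(const std::vector<Token>& tokens, Real rate) {
      for (const auto& t : tokens) {
        std::cout << "update row " << index(t)
                  << " at rate " << rate * weight(t) << '\n';
      }
    }

    int main() {
      std::vector<int32_t> plain = {3, 7};
      std::vector<std::pair<int32_t, Real>> weighted = {{3, 0.5f}, {7, 2.0f}};
      applyUpdates(plain, 0.1f);     // weights default to 1.0
      applyUpdates(weighted, 0.1f);  // rates scaled by 0.5 and 2.0
    }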
22 changes: 11 additions & 11 deletions src/model.h
@@ -55,20 +55,20 @@ struct EmbedModel : public boost::noncopyable {
   }
 
   float trainOne(std::shared_ptr<InternDataHandler> data,
-                 const std::vector<int32_t>& items,
-                 const std::vector<int32_t>& labels,
+                 const std::vector<Base>& items,
+                 const std::vector<Base>& labels,
                  size_t maxNegSamples,
                  Real rate);
 
   float trainNLL(std::shared_ptr<InternDataHandler> data,
-                 const std::vector<int32_t>& items,
-                 const std::vector<int32_t>& labels,
+                 const std::vector<Base>& items,
+                 const std::vector<Base>& labels,
                  int32_t negSearchLimit,
                  Real rate);
 
-  void backward(const std::vector<int32_t>& items,
-                const std::vector<int32_t>& labels,
-                const std::vector<std::vector<int32_t>>& negLabels,
+  void backward(const std::vector<Base>& items,
+                const std::vector<Base>& labels,
+                const std::vector<std::vector<Base>>& negLabels,
                 Matrix<Real>& gradW,
                 Matrix<Real>& lhs,
                 Real rate_lhs,
@@ -91,11 +91,11 @@ struct EmbedModel : public boost::noncopyable {
     return kNN(RHSEmbeddings_, point, numSim);
   }
 
-  Matrix<Real> projectRHS(std::vector<int32_t> ws);
-  Matrix<Real> projectLHS(std::vector<int32_t> ws);
+  Matrix<Real> projectRHS(const std::vector<Base>& ws);
+  Matrix<Real> projectLHS(const std::vector<Base>& ws);
 
-  void projectLHS(std::vector<int32_t> ws, Matrix<Real>& retval);
-  void projectRHS(std::vector<int32_t> ws, Matrix<Real>& retval);
+  void projectLHS(const std::vector<Base>& ws, Matrix<Real>& retval);
+  void projectRHS(const std::vector<Base>& ws, Matrix<Real>& retval);
 
   void loadTsv(std::istream& in, const std::string sep = "\t ");
   void loadTsv(const char* fname, const std::string sep = "\t ");
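Since projectLHS/projectRHS now take const std::vector<Base>& instead of by-value std::vector<int32_t>, any caller still holding plain indices needs to wrap them. A hypothetical adapter (not from the commit), assuming the same Base pair type as above:

    #include <cstdint>
    #include <utility>
    #include <vector>

    using Real = float;
    using Base = std::pair<int32_t, Real>;  // assumed (id, weight) pair

    // Wraps bare dictionary ids with the default weight 1.0 so legacy
    // callers can satisfy the new vector<Base> signatures.
    std::vector<Base> withUnitWeights(const std::vector<int32_t>& ids) {
      std::vector<Base> out;
      out.reserve(ids.size());
      for (int32_t id : ids) out.emplace_back(id, Real(1.0));
      return out;
    }

    int main() {
      auto ws = withUnitWeights({1, 2, 3});
      // e.g. model.projectLHS(ws, retval) with the updated signature.
      return ws.size() == 3 ? 0 : 1;
    }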
