Merge branch 'master' into active_factor

JohnLangford committed Jul 5, 2014
2 parents 39fd1f2 + 706f883 commit d597b5e
Showing 15 changed files with 69 additions and 90 deletions.
40 changes: 20 additions & 20 deletions cluster/README_cluster
@@ -16,13 +16,13 @@ To run the code on non-Hadoop clusters, the script 'single_machine'
has the simplest possible invocation.

In general: start the span server on one of the cluster nodes:
./spanning_tree

Launch vw on each of the worker nodes:

./vw --span_server <location> --total <t> --node <n> --unique_id <u> -d <file>

where:
<location> is the host running spanning_tree
<t> is the total number of nodes
<n> is the node id number
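
As a concrete illustration (the hostname, unique id, and data file below are hypothetical, not part of the original instructions), node 0 of a 4-node run might be launched as:

    ./vw --span_server node0.example.com --total 4 --node 0 --unique_id 1234 -d data0.vw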
@@ -38,23 +38,23 @@ Decide if you are going to control the number of tasks by:
(b) controlling the number of reducers.
We'll assume (a) below.

-Connect to the span server node for the Hadoop cluster:
+Start the span server for the Hadoop cluster:

./spanning_tree

Start the map-reduce job using Hadoop streaming:

-hadoop jar $HADOOP_HOME/hadoop-streaming.jar
--Dmapred.job.map.memory.mb=2500 -input <input> -output <output> -file
-vw -file runvw.sh -mapper 'runvw.sh <output> <span_server>' -reducer
-NONE
+hadoop jar $HADOOP_HOME/hadoop-streaming.jar \
+    -files vw,runvw.sh \
+    -Dmapred.job.map.memory.mb=2500 -input <input> -output <output> \
+    -mapper runvw.sh -reducer NONE

where <output> is the directory on HDFS where you want the trained
-model to be saved and <span_server> is the hostname of the gateway where
-spanning_tree runs. The trained model is saved to the file
+model to be saved. The trained model is saved to the file
<output>/model on HDFS and can be retrieved by hadoop fs -get.
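
For example (the local destination name is illustrative):

    hadoop fs -get <output>/model ./model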

To modify the arguments to VW, edit the script runvw.sh. Arguments to
hadoop can be directly added in the hadoop streaming command.

See 'mapscript.sh', which uses 'runvw.sh', for an advanced example
of running VW in a Hadoop environment.
@@ -64,13 +64,13 @@ of running VW in a Hadoop environment.
The files you need to know about:

runvw.sh: This is the mapper code. It takes as arguments:

The output directory. The trained model from the first mapper is
stored as the file "model" in the output directory.

The hostname of the cluster gateway, so that the mappers can connect
to the gateway

All the other standard VW options are currently hardcoded in the
script; feel free to mess around with them.
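
For instance, one such edit (the flag values are purely illustrative, not a recommendation) would change the vwcmd line in runvw.sh to use more weight bits and a different loss:

    vwcmd="./vw -b 26 --total $mapred_map_tasks --node $mapper --cache_file temp.cache --span_server $mapreduce_job_submithost --loss_function=hinge"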

@@ -84,15 +84,15 @@ communicate amongst themselves.

#########################################################################

allreduce.h: This is the header file for the nodes.

#########################################################################

allreduce.cc: This is the code for doing allreduce. It implements the
routine described above. all_reduce is implemented as a combination of
reduce and broadcast routines. reduce reads data from children, adds it
with local data and passes it up to the parent with a call to pass_up.
broadcast receives data from parent, and passes it down to children with
a call to pass_down.
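
A minimal sketch of that reduce-then-broadcast pattern, simulated over an
in-memory binary tree in a single process (the real code moves the same data
over sockets; the names and tree layout here are illustrative, not VW's
actual API):

    // Each node holds one float; after all_reduce every node holds the sum.
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // reduce: sum the subtree rooted at 'node'; returning is the "pass_up".
    float reduce(const std::vector<float>& local, std::size_t node, std::size_t n) {
      float sum = local[node];                        // local data
      std::size_t left = 2 * node + 1, right = 2 * node + 2;
      if (left < n)  sum += reduce(local, left, n);   // read from left child
      if (right < n) sum += reduce(local, right, n);  // read from right child
      return sum;                                     // pass up to the parent
    }

    // broadcast: push the root's total back down; the "pass_down" phase.
    void broadcast(std::vector<float>& result, float total,
                   std::size_t node, std::size_t n) {
      result[node] = total;
      std::size_t left = 2 * node + 1, right = 2 * node + 2;
      if (left < n)  broadcast(result, total, left, n);
      if (right < n) broadcast(result, total, right, n);
    }

    int main() {
      std::vector<float> local = {1, 2, 3, 4, 5, 6, 7};  // one value per node
      std::vector<float> result(local.size());
      float total = reduce(local, 0, local.size());      // phase 1: reduce
      broadcast(result, total, 0, local.size());         // phase 2: broadcast
      std::printf("every node now holds %g\n", result[0]);  // prints 28
      return 0;
    }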

#########################################################################
39 changes: 18 additions & 21 deletions cluster/runvw.sh
@@ -1,41 +1,38 @@
#!/bin/bash
mapper=`printenv mapred_task_id | cut -d "_" -f 5`
-echo $mapper > /dev/stderr
rm -f temp.cache
-echo 'Starting training' > /dev/stderr
-echo $1 > /dev/stderr
+date +"%F %T Start training mapper=$mapper" > /dev/stderr
+vwcmd="./vw -b 24 --total $mapred_map_tasks --node $mapper --cache_file temp.cache --span_server $mapreduce_job_submithost --loss_function=logistic"
mapred_job_id=`echo $mapred_job_id | tr -d 'job_'`
-gdcmd="./vw -b 24 --total $mapred_map_tasks --node $mapper --unique_id $mapred_job_id --cache_file temp.cache --passes 1 --regularization=1 --adaptive --exact_adaptive_norm -d /dev/stdin -f tempmodel --span_server $mapreduce_job_submithost --loss_function=logistic"
+gdcmd="$vwcmd --unique_id $mapred_job_id --passes 1 --adaptive --exact_adaptive_norm -d /dev/stdin -f tempmodel"
mapred_job_id=`expr $mapred_job_id \* 2` #create new nonce
-bfgscmd="./vw -b 24 --total $mapred_map_tasks --node $mapper --unique_id $mapred_job_id --cache_file temp.cache --bfgs --mem 5 --passes 20 --regularization=1 --span_server $mapreduce_job_submithost -f model -i tempmodel --loss_function=logistic"
-if [ "$mapper" == '000000' ]
-then
+bfgscmd="$vwcmd --unique_id $mapred_job_id --bfgs --mem 5 --passes 20 -f model -i tempmodel"
+if [ "$mapper" == '000000' ]; then
    $gdcmd > mapperout 2>&1
-    if [ $? -ne 0 ]
-    then
-        exit 1
-    fi
+    if [ $? -ne 0 ]; then
+        date +"%F %T Failed mapper=$mapper cmd=$gdcmd" > /dev/stderr
+        exit 1
+    fi
    $bfgscmd >> mapperout 2>&1
    outfile=$mapred_output_dir/model
    mapperfile=$mapred_output_dir/mapperout
    found=`hadoop fs -lsr | grep $mapred_output_dir | grep mapperout`
-    if [ "$found" != "" ]
-    then
-        hadoop fs -rmr $mapperfile
+    if [ "$found" != "" ]; then
+        hadoop fs -rm -r $mapperfile
    fi
    found=`hadoop fs -lsr | grep $mapred_output_dir | grep model`
-    if [ "$found" != "" ]
-    then
-        hadoop fs -rmr $outfile
+    if [ "$found" != "" ]; then
+        hadoop fs -rm -r $outfile
    fi
-    echo $outfile > /dev/stderr
+    date +"%F %T outfile=$outfile" > /dev/stderr
    hadoop fs -put model $outfile
    hadoop fs -put mapperout $mapperfile
else
    $gdcmd
-    if [ $? -ne 0 ]
-    then
-        exit 1
+    if [ $? -ne 0 ]; then
+        date +"%F %T Failed mapper=$mapper cmd=$gdcmd" > /dev/stderr
+        exit 1
    fi
    $bfgscmd
fi
+date +"%F %T Done mapper=$mapper" > /dev/stderr
2 changes: 1 addition & 1 deletion cluster/spanning_tree.cc
@@ -152,7 +152,7 @@ int main(int argc, char* argv[]) {
short unsigned int port = 26543;

address.sin_port = htons(port);
-  if (bind(sock,(sockaddr*)&address, sizeof(address)) < 0)
+  if (::bind(sock,(sockaddr*)&address, sizeof(address)) < 0)
report_error("bind: ");

if (argc == 2 && strcmp("--nondaemon",argv[1])==0)
2 changes: 1 addition & 1 deletion demo/random-noise/vw-demo
@@ -11,7 +11,7 @@
# 3b) distrib.r - Density distribution plot utility, written in R
# 3c) x-vs-y.r - X vs Y correlation plot utility, written in R
#
-export PATH=$PATH:.
+export PATH=.:$PATH
Pager='less'
ImgViewCandidates="gwenview display irfanview xee preview"
ImgViewer=
2 changes: 1 addition & 1 deletion test/RunTests
@@ -960,7 +960,7 @@ __DATA__
train-sets/ref/sequencespan_data.nonldf.test-beam1.predict
# Test 54: searn sequence SPAN labeling BIO, non-ldf test, beam 20
-{VW} -d train-sets/sequencespan_data -t --search_span_bilou -i models/sequencespan_data.model -p sequencespan_data.predict --search_beam 20 --search_kbest 20
+{VW} -d train-sets/sequencespan_data -t --search_span_bilou -i models/sequencespan_data.model -p sequencespan_data.predict --search_beam 20 --search_kbest 20 --quiet
train-sets/ref/sequencespan_data.nonldf.test-beam20.stderr
train-sets/ref/sequencespan_data.nonldf.test-beam20.predict
Expand Down
23 changes: 0 additions & 23 deletions test/train-sets/ref/sequencespan_data.nonldf.test-beam20.stderr
@@ -1,24 +1 @@
only testing
-Num weight bits = 18
-learning rate = 10
-initial_t = 1
-power_t = 0.5
-predictions = sequencespan_data.predict
-switching to BILOU encoding for sequence span labeling
-using no cache
-Reading datafile = train-sets/sequencespan_data
-num sources = 1
-average since example example current current current
-loss last counter weight label predict features
-average since sequence example current label current predicted current cur cur predic. examples
-loss last counter weight sequence prefix sequence prefix features pass pol made gener.
-10.000000 10.000000 1 1.000000 [2 1 1 2 2 1 6 7 7 ..] [2 6 1 6 2 1 6 4 5 ..] 0 0 0 1192 0
-
-finished run
-number of examples per pass = 1
-passes used = 1
-weighted example sum = 1
-weighted label sum = 0
-average loss = 10
-best constant = -inf
-total feature number = 0
2 changes: 2 additions & 0 deletions utl/vw-hypersearch
@@ -463,6 +463,8 @@ sub is_integer_option($) {
my $expects_integer =
($opt =~ qr{^-*
bs?
+|batch_sz
+|bootstrap|B
|(?:csoaa|wap)(?:_ldf)?
|cb
|num_children
14 changes: 12 additions & 2 deletions utl/vw-top-errors
@@ -166,8 +166,8 @@ sub collect_errors($) {
# compare function to sort features by abs(weight) largest first
#
sub by_abs_weight_desc {
-    my $weight1 = (split(':', $a))[3];
-    my $weight2 = (split(':', $b))[3];
+    my $weight1 = (split(':', $a))[-1];
+    my $weight2 = (split(':', $b))[-1];
$weight1 =~ s/\@.*$//;
$weight2 =~ s/\@.*$//;

@@ -234,6 +234,16 @@ sub audit_top_weights($$@) {
next unless (defined $feature);

my ($name, $hash, $value, $weight) = split(':', $feature);
+    unless (defined $weight) {
+        # multiple passes (using cache) don't have the
+        # name but only the hashed value of it so
+        # fields get shifted by one...
+        printf STDERR "Undefined weight in audit data: feature=%s\n", $feature;
+        $weight = $value;
+        $value = $hash;
+        $hash = $name;
+        $name = "[$hash]";
+    }
$weight =~ s/\@.*$//;
printf "\t%s\t%7.6f\t%s\n", $name, $weight, $feature;

2 changes: 1 addition & 1 deletion vowpalwabbit/allreduce.cc
@@ -155,7 +155,7 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
bool listening = false;
while(!listening)
{
-      if (bind(sock,(sockaddr*)&address, sizeof(address)) < 0)
+      if (::bind(sock,(sockaddr*)&address, sizeof(address)) < 0)
{
#ifdef _WIN32
if (WSAGetLastError() == WSAEADDRINUSE)
2 changes: 1 addition & 1 deletion vowpalwabbit/binary.cc
@@ -22,7 +22,7 @@ namespace BINARY {
if (ld->label == ld->prediction)
ec.loss = 0.;
else
-      ec.loss = 1.;
+      ec.loss = ld->weight;
}

learner* setup(vw& all, po::variables_map& vm)
2 changes: 0 additions & 2 deletions vowpalwabbit/cbify.cc
@@ -107,8 +107,6 @@ namespace CBIFY {
data.cb_label.costs.push_back(l);
}

-      cout << data.cb_label.costs[0].probability << endl;
-
if (is_learn)
base.learn(ec);

2 changes: 1 addition & 1 deletion vowpalwabbit/example.cc
@@ -107,7 +107,7 @@ struct features_and_source
{
v_array<feature> feature_map; //map to store sparse feature vectors
uint32_t stride_shift;
-  size_t mask;
+  uint32_t mask;
weight* base;
};

2 changes: 1 addition & 1 deletion vowpalwabbit/nn.cc
@@ -20,7 +20,7 @@ using namespace LEARNER;
namespace NN {
const float hidden_min_activation = -3;
const float hidden_max_activation = 3;
-  const int nn_constant = 533357803;
+  const uint32_t nn_constant = 533357803;

struct nn {
uint32_t k;
19 changes: 7 additions & 12 deletions vowpalwabbit/oaa.cc
@@ -27,23 +27,18 @@ namespace OAA {

template <bool is_learn>
void predict_or_learn(oaa& o, learner& base, example& ec) {
-  vw* all = o.all;
-
  multiclass* mc_label_data = (multiclass*)ec.ld;
-  uint32_t prediction = 1;
-  float score = INT_MIN;
-
  if (mc_label_data->label == 0 || (mc_label_data->label > o.k && mc_label_data->label != (uint32_t)-1))
    cout << "label " << mc_label_data->label << " is not in {1,"<< o.k << "} This won't work right." << endl;

+  label_data simple_temp = {0.f, mc_label_data->weight, 0.f, 0.f};
+  ec.ld = &simple_temp;
+
  string outputString;
  stringstream outputStringStream(outputString);

-  label_data simple_temp;
-  simple_temp.initial = 0.;
-  simple_temp.weight = mc_label_data->weight;
-  ec.ld = &simple_temp;
-
+  uint32_t prediction = 1;
+  float score = INT_MIN;
for (uint32_t i = 1; i <= o.k; i++)
{
if (is_learn)
@@ -52,7 +47,7 @@
simple_temp.label = 1;
else
simple_temp.label = -1;

base.learn(ec, i-1);
}
else
@@ -73,7 +68,7 @@
ec.ld = mc_label_data;

if (o.shouldOutput)
-    all->print_text(all->raw_prediction, outputStringStream.str(), ec.tag);
+    o.all->print_text(o.all->raw_prediction, outputStringStream.str(), ec.tag);
}

void finish_example(vw& all, oaa&, example& ec)
6 changes: 3 additions & 3 deletions vowpalwabbit/searn_sequencetask.cc
@@ -87,9 +87,9 @@ namespace ArgmaxTask {

for (size_t i=0; i<ec.size(); i++) {
// labels should be 1 or 2, and our output is MAX of all predicted values
-      srn.snapshot(i, 1, &i, sizeof(i), true); //save state for optimization
-      srn.snapshot(i, 2, &max_prediction, sizeof(max_prediction), false);
-
+      // srn.snapshot(i, 1, &i, sizeof(i), true); //save state for optimization
+      //srn.snapshot(i, 2, &max_prediction, sizeof(max_prediction), false);
uint32_t prediction;
if (my_task_data->predict_max)
prediction = srn.predict(ec[i], max_label);
