Commit 16f0990

issue: implement OWLQN and LBFGS and pass the unittests
issue: rename lbfgs_minimizer_test.go to minimizer_test.go
seaglex authored and xlvector committed Dec 30, 2013
1 parent 2eef907 commit 16f0990
Showing 6 changed files with 260 additions and 30 deletions.
7 changes: 7 additions & 0 deletions diff_function.go
@@ -0,0 +1,7 @@
package hector

// Description: differentiable objective function for minimizers such as LBFGS and OWLQN
type DiffFunction interface {
Value(pos *Vector) float64
Gradient(pos *Vector) *Vector
}
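
For orientation, a minimal sketch of a custom objective implementing the interface above: a separable quadratic 0.5 * sum(x_i^2) written against the same package. The sumSquares type is illustrative only and is not part of this commit; it relies only on the Vector fields and methods visible in this diff.

// Sketch only: 0.5 * sum(x_i^2) as a DiffFunction (hypothetical type).
type sumSquares struct{}

func (f *sumSquares) Value(pos *Vector) float64 {
	cost := 0.0
	for _, val := range pos.data {
		cost += 0.5 * val * val
	}
	return cost
}

func (f *sumSquares) Gradient(pos *Vector) *Vector {
	// The gradient of 0.5 * x^2 is x itself; Copy keeps the caller's vector untouched.
	return pos.Copy()
}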
53 changes: 34 additions & 19 deletions lbfgs_minimizer.go
@@ -2,42 +2,57 @@ package hector

import ("fmt")

const numHist int = 10
const maxIteration int = 200

/**
* Based on the paper "Scalable Training of L1-Regularized Log-Linear Models"
* by Galen Andrew and Jianfeng Gao.
* Author: weixuan
*/
type LBFGSMinimizer struct {
costFun DiffFunction
numHist int
maxIteration int
tolerance float64
}

type DiffFunction interface {
Value(pos *Vector) float64
Gradient(pos *Vector) *Vector
func NewLBFGSMinimizer() (*LBFGSMinimizer) {
m := new(LBFGSMinimizer)
m.numHist = 10
m.maxIteration = 200
m.tolerance = 1e-4
return m
}

func (minimizer *LBFGSMinimizer) Minimize(costfun DiffFunction, init *Vector) *Vector {
minimizer.costFun = costfun;
func (m *LBFGSMinimizer) Minimize(costfun DiffFunction, init *Vector) *Vector {
m.costFun = costfun;
var cost float64 = costfun.Value(init)
var grad *Vector = costfun.Gradient(init).Copy()
var pos *Vector = init
var pos *Vector = init.Copy()
var terminalCriterion *relativeMeanImprCriterion = NewRelativeMeanImprCriterion(m.tolerance)
terminalCriterion.addCost(cost)

var helper *QuasiNewtonHelper = NewQuasiNewtonHelper(numHist, minimizer, pos, grad)
var helper *QuasiNewtonHelper = NewQuasiNewtonHelper(m.numHist, m, pos, grad)
fmt.Println("Iter\tcost\timprovement")
fmt.Printf("%d\t%eN/A\n", 0, cost)
for iter:=1; iter <= maxIteration; iter++ {
fmt.Printf("%d\t%e\tUndefined\n", 0, cost)
for iter:=1; iter <= m.maxIteration; iter++ {
dir := grad.Copy()
dir.ApplyScale(-1.0)
helper.ApplyQuasiInverseHession(dir)
newCost, newPos := helper.BackTrackingLineSearch(cost, pos, grad, dir, iter==1)
if cost <= newCost {
break
}
fmt.Printf("%d\t%e\t%e\n", iter, newCost, (cost-newCost)/cost)
if (cost-newCost)/cost <= 0.0001 {
if cost == newCost {
break
}
cost = newCost
pos = newPos
terminalCriterion.addCost(cost)
fmt.Printf("%d\t%e\t%e\n", iter, newCost, terminalCriterion.improvement)
if terminalCriterion.isTerminable() {
break
}
grad = costfun.Gradient(pos).Copy()
if helper.UpdateState(pos, grad) {
break
}
}
return pos
}
@@ -47,5 +62,5 @@ func (m *LBFGSMinimizer) Evaluate(pos *Vector) float64 {
}

func (m *LBFGSMinimizer) NextPoint(curPos *Vector, dir *Vector, alpha float64) *Vector {
return curPos.ElemWiseMultiplyAdd(dir, alpha)
}
return curPos.ElemWiseMultiplyAdd(dir, alpha)
}
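
A usage sketch of the constructor added above. The exampleLBFGS wrapper, costFun, and init are hypothetical names for illustration; the defaults come from NewLBFGSMinimizer (numHist=10, maxIteration=200, tolerance=1e-4).

// Sketch only: minimize a user-supplied DiffFunction from a starting point.
func exampleLBFGS(costFun DiffFunction, init *Vector) *Vector {
	minimizer := NewLBFGSMinimizer()
	return minimizer.Minimize(costFun, init)
}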
25 changes: 16 additions & 9 deletions lbfgs_minimizer_test.go → minimizer_test.go
@@ -15,20 +15,20 @@ type mseDiffFunction struct {

func getMSECostFunction() *mseDiffFunction{
f := new(mseDiffFunction)
f.center.data = map[int64]float64 {0:0, 1:1}
f.center.data = map[int64]float64 {0:0, 1:0}
f.weights.data = map[int64]float64 {0:1, 1:0.01}
f.init.data = map[int64]float64 {0:1, 1:1}
f.grad.data = map[int64]float64 {0:0, 1:0}
return f
}

func (f *mseDiffFunction) Value(x *Vector) float64 {
var val float64 = 0
var cost float64 = 0
for n, val := range x.data {
diff := val - f.center.GetValue(n)
val += f.weights.GetValue(n) * diff * diff
cost += f.weights.GetValue(n) * diff * diff
}
return 0.5 * val
return 0.5 * cost
}

// Note: Gradient reuses the same underlying vector across calls, so results for different points share memory
@@ -37,23 +37,30 @@ func (f *mseDiffFunction) Gradient(x *Vector) *Vector {
f.grad.SetValue(n, f.weights.GetValue(n) * (val - f.center.GetValue(n)))
}
return &f.grad
}
}

func (f *mseDiffFunction) testResult(result *Vector, tolerance float64, t *testing.T) {
fmt.Println("Index\tTrue\tResult")
for n, val := range result.data {
for n, val := range f.center.data {
fmt.Printf("%d\t%e\t%e\n", n, val, result.GetValue(n))
}
for n, val := range result.data {
if math.Abs(val - f.center.GetValue(n)) <= tolerance {
t.Errorf("Mismatch\nIndex\tTrue\tResult\n%d\t%e\t%e", n, f.center.GetValue(n), val)
if math.Abs(val - f.center.GetValue(n)) > tolerance {
t.Errorf("Mismatch\nIndex\tTrue\tResult\n%d\t%e\t%e\n", n, f.center.GetValue(n), val)
}
}
}

func TestLBFGS(t *testing.T) {
diffFunc := getMSECostFunction()
minimizer := new(LBFGSMinimizer)
minimizer := NewLBFGSMinimizer()
result := minimizer.Minimize(diffFunc, &(diffFunc.init))
diffFunc.testResult(result, 1e-6, t)
}

func TestOWLQN(t *testing.T) {
diffFunc := getMSECostFunction()
minimizer := NewOWLQNMinimizer(0.001)
result := minimizer.Minimize(diffFunc, &(diffFunc.init))
diffFunc.testResult(result, 0, t)
}
143 changes: 143 additions & 0 deletions owlqn_minimizer.go
@@ -0,0 +1,143 @@
package hector

import ("fmt"
"math")

/**
* Based on the paper "Scalable Training of L1-Regularized Log-Linear Models"
* by Galen Andrew and Jianfeng Gao.
* Author: weixuan
*/
type OWLQNMinimizer struct {
l1reg float64
costFun DiffFunction
numHist int
maxIteration int
tolerance float64
}

func NewOWLQNMinimizer(l1reg float64) *OWLQNMinimizer {
m := new(OWLQNMinimizer)
m.l1reg = l1reg
m.numHist = 10
m.maxIteration = 20
m.tolerance = 1e-4
return m
}

func (m *OWLQNMinimizer) Minimize(costfun DiffFunction, init *Vector) *Vector {
m.costFun = costfun;
var cost float64 = m.Evaluate(init)
var grad *Vector = costfun.Gradient(init).Copy()
var pos *Vector = init.Copy()
var terminalCriterion *relativeMeanImprCriterion = NewRelativeMeanImprCriterion(m.tolerance)
terminalCriterion.addCost(cost)

var helper *QuasiNewtonHelper = NewQuasiNewtonHelper(m.numHist, m, pos, grad)
fmt.Println("Iter\tcost\timprovement")
fmt.Printf("%d\t%e\tUndefined\n", 0, cost)
for iter:=1; iter <= m.maxIteration; iter++ {
// customized steepest-descent direction: the negative pseudo-gradient, which folds in the L1 term
steepestDescDir := grad.Copy()
m.updateGrad(pos, steepestDescDir)
steepestDescDir.ApplyScale(-1.0)
dir := steepestDescDir.Copy()
// quasi-Newton direction
helper.ApplyQuasiInverseHession(dir)
m.fixDirSign(dir, steepestDescDir)
// customized gradient for the prospective new position, passed to the line search
potentialGrad := grad.Copy()
m.updateGradForNewPos(pos, potentialGrad, dir)
newCost, newPos := helper.BackTrackingLineSearch(cost, pos, potentialGrad, dir, iter==1)
if cost == newCost {
break
}
cost = newCost
pos = newPos
terminalCriterion.addCost(cost)
fmt.Printf("%d\t%e\t%e\n", iter, newCost, terminalCriterion.improvement)
if terminalCriterion.isTerminable() {
break
}
grad = costfun.Gradient(pos).Copy()
if helper.UpdateState(pos, grad) {
break
}
}
return pos
}

func (m *OWLQNMinimizer) updateGradForNewPos(x *Vector, grad *Vector, dir *Vector) {
if m.l1reg == 0 {
return
}
for key, val := range grad.data {
xval := x.GetValue(key)
if xval < 0 {
grad.SetValue(key, val - m.l1reg)
} else if xval > 0 {
grad.SetValue(key, val + m.l1reg)
} else {
dirval := dir.GetValue(key)
if dirval < 0 {
grad.SetValue(key, val - m.l1reg)
} else if dirval > 0 {
grad.SetValue(key, val + m.l1reg)
}
}
}
return
}

func (m *OWLQNMinimizer) updateGrad(x *Vector, grad *Vector) {
if m.l1reg == 0 {
return
}
for key, val := range grad.data {
xval := x.GetValue(key)
if xval < 0 {
grad.SetValue(key, val - m.l1reg)
} else if xval > 0 {
grad.SetValue(key, val + m.l1reg)
} else {
if val < -m.l1reg {
grad.SetValue(key, val + m.l1reg)
} else if val > m.l1reg {
grad.SetValue(key, val - m.l1reg)
}
}
}
return
}

func (m *OWLQNMinimizer) fixDirSign(dir *Vector, steepestDescDir *Vector) {
if m.l1reg == 0 {
return
}
for key, val := range dir.data {
if val * steepestDescDir.GetValue(key) <= 0 {
dir.SetValue(key, 0)
}
}
}

func (m *OWLQNMinimizer) Evaluate(pos *Vector) float64 {
cost := m.costFun.Value(pos)
for _, val := range pos.data {
cost += math.Abs(val) * m.l1reg
}
return cost
}

func (m *OWLQNMinimizer) NextPoint(curPos *Vector, dir *Vector, alpha float64) *Vector {
newPos := curPos.ElemWiseMultiplyAdd(dir, alpha)
if m.l1reg > 0 {
for key, val := range curPos.data {
if val * newPos.GetValue(key) < 0 {
newPos.SetValue(key, 0)
}
}
}
return newPos
}
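
A usage sketch of the OWLQN entry point above. The exampleOWLQN wrapper, costFun, and init are hypothetical names; the L1 weight 0.001 matches the value used in TestOWLQN. With l1reg = 0 the pseudo-gradient, sign fix, and orthant projection above become no-ops and the minimizer reduces to plain LBFGS on the smooth cost.

// Sketch only: L1-regularized minimization of a user-supplied DiffFunction.
func exampleOWLQN(costFun DiffFunction, init *Vector) *Vector {
	return NewOWLQNMinimizer(0.001).Minimize(costFun, init)
}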
6 changes: 4 additions & 2 deletions quasinewton_helper.go
@@ -26,7 +26,8 @@ type Minimizer interface {
}

const MAX_BACKTRACKING_ITER = 50


// Description: the pos and gradient arguments must NOT be modified by the caller afterwards
func NewQuasiNewtonHelper(numHist int, minimizer Minimizer, curPos *Vector, curGrad *Vector) (*QuasiNewtonHelper) {
h := new(QuasiNewtonHelper)
h.numHist = int64(numHist)
@@ -92,7 +93,8 @@ func (h *QuasiNewtonHelper) BackTrackingLineSearch(cost float64, pos *Vector, gr
return nextCost, nextPos
}

func (h *QuasiNewtonHelper) updateState(nextPos *Vector, nextGrad *Vector) (isOptimal bool) {
// Description: the pos and gradient arguments must NOT be modified by the caller afterwards
func (h *QuasiNewtonHelper) UpdateState(nextPos *Vector, nextGrad *Vector) (isOptimal bool) {
if int64(len(h.sList)) >= h.numHist {
h.sList = h.sList[1:]
h.yList = h.yList[1:]
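
Both minimizers in this commit are passed as the second argument of NewQuasiNewtonHelper, so a compile-time interface check is a cheap way to document that contract. The blank-identifier assertions below are illustrative and not part of the commit.

// Sketch only: assert at compile time that both minimizers satisfy the
// Minimizer interface expected by NewQuasiNewtonHelper.
var _ Minimizer = (*LBFGSMinimizer)(nil)
var _ Minimizer = (*OWLQNMinimizer)(nil)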
56 changes: 56 additions & 0 deletions terminal_criterion.go
@@ -0,0 +1,56 @@
package hector

import ("math")

/**
* Based on the paper "Scalable Training of L1-Regularized Log-Linear Models"
* by Galen Andrew and Jianfeng Gao.
* Author: weixuan
*/
type relativeMeanImprCriterion struct {
minHist int
maxHist int
tolerance float64
improvement float64
costList []float64
}

func NewRelativeMeanImprCriterion(tolerance float64) *relativeMeanImprCriterion {
tc := new(relativeMeanImprCriterion)
tc.minHist = 5
tc.maxHist = 10
tc.costList = make([]float64, 0, tc.maxHist)
tc.tolerance = tolerance
return tc
}

func (tc *relativeMeanImprCriterion) calImprovement() float64{
sz := len(tc.costList)
if sz <= tc.minHist {
return math.MaxFloat32
}
first := tc.costList[0]
last := tc.costList[sz-1]
impr := (first - last) /float64(sz-1)
if last != 0 {
impr = math.Abs(impr / last)
} else if first != 0 {
impr = math.Abs(impr / first)
} else {
impr = 0
}
if sz > tc.maxHist {
tc.costList = tc.costList[1:]
}
return impr
}

func (tc *relativeMeanImprCriterion) addCost(latestCost float64) {
tc.costList = append(tc.costList, latestCost)
tc.improvement = tc.calImprovement()
}

func (tc *relativeMeanImprCriterion) isTerminable() bool {
return tc.improvement <= tc.tolerance
}
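
The criterion averages the cost drop over the stored window and normalizes it by the latest cost, and it only kicks in once more than minHist costs have been recorded. A worked example with hypothetical numbers: after the six costs 10, 9, 8.5, 8.2, 8.05, 8.0 the improvement is (10 - 8.0) / 5 / 8.0 = 0.05, far above the 1e-4 tolerance both minimizers pass in, so training would continue. The exampleCriterion helper is illustrative only.

// Sketch only: feed six hypothetical costs and inspect the criterion.
func exampleCriterion() (float64, bool) {
	tc := NewRelativeMeanImprCriterion(1e-4)
	for _, c := range []float64{10, 9, 8.5, 8.2, 8.05, 8.0} {
		tc.addCost(c)
	}
	// improvement = (10 - 8.0) / 5 / 8.0 = 0.05, so isTerminable() is still false.
	return tc.improvement, tc.isTerminable()
}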
