1 update perceptron README; 2 update code; 3 update unit_test
SmirkCao committed Sep 29, 2018
1 parent 3ce83b6 commit 9e31af9
Showing 9 changed files with 234 additions and 108 deletions.
9 changes: 9 additions & 0 deletions CH01/README.md
@@ -62,6 +62,15 @@

In supervised learning, the model is precisely the **conditional probability distribution** or **decision function** to be learned.

Note how the book describes this part; it is summarized in the following table:

| | Hypothesis space $\cal F$ | Input space $\cal X$ | Output space $\cal Y$ | Parameter space |
| ------------------------------------ | ------------------------------------------------------------ | ---------------- | ----------------- | ------------- |
| Decision function | $\cal F\it =\{f_{\theta} \mid Y=f_{\theta}(X), \theta \in \bf R \it ^n\}$ | variables | variables | $\bf R\it ^n$ |
| Conditional probability distribution | $\cal F\it =\{P \mid P_{\theta}(Y \mid X),\theta\in \bf R \it ^n\}$ | random variables | random variables | $\bf R\it ^n$ |

When describing this, the book mentions the **family of conditional probability distributions**; keep that in mind, as [CH06](../CH06/README.md) later confirms that the logistic distribution belongs to the exponential family.
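
As a toy illustration of the decision-function row (a minimal sketch; the linear form and the parameter values are invented for illustration), a hypothesis space indexed by $\theta \in \bf R\it ^n$ can be written as a function factory, where each concrete $\theta$ picks one member $f_\theta$ of $\cal F$:

```python
import numpy as np

def make_f(theta):
    """Pick the member f_theta out of F = {f_theta | Y = f_theta(X), theta in R^n}."""
    def f(x):
        # a linear decision function; sign turns the score into a class label
        return np.sign(np.dot(theta, x))
    return f

f1 = make_f(np.array([1.0, -2.0]))  # theta = (1, -2), one point of the parameter space R^2
f2 = make_f(np.array([0.5, 0.5]))   # theta = (0.5, 0.5), another point
x = np.array([3.0, 1.0])
print(f1(x), f2(x))  # 1.0 1.0
```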

### Strategy

#### Loss Function and Risk Function
3 changes: 3 additions & 0 deletions CH02/Input/data_2-1.txt
@@ -0,0 +1,3 @@
3 3 1
4 3 1
1 1 -1
4 changes: 4 additions & 0 deletions CH02/Input/logic_data_1.txt
@@ -0,0 +1,4 @@
1 1 1 1 -1
1 0 -1 1 -1
0 1 -1 1 1
0 0 -1 -1 1
8 changes: 8 additions & 0 deletions CH02/Input/logic_data_2.txt
@@ -0,0 +1,8 @@
1 1 1 1 1 -1
1 1 0 -1 1 -1
1 0 0 -1 1 -1
0 1 1 -1 1 1
0 1 0 -1 1 1
0 0 0 -1 -1 1
1 0 1 -1 1 -1
0 0 1 -1 1 1
55 changes: 52 additions & 3 deletions CH02/README.md
@@ -18,11 +18,16 @@

### Reading Guide

The perceptron is a linear classification model for binary classification.

- This chapter involves the inner product of vectors, the concept of a hyperplane, and a description of linearly separable data sets. The strategy part discusses the considerations behind the choice of the loss function; it can be read together with [CH07](../CH07/README.md).

- For the two examples in this chapter, think about why $\eta=1$, and then think about the parameter space. Test cases implementing these two examples are presented later in this chapter.

- The convergence proof mentions the trick of merging the bias into the weight vector, which is also applied in LR and SVM.

- The Gram matrix $G=[x_i\cdot x_j]_{N\times N}$ appears here for the first time.
- The activation function of the perceptron is the sign function.

## Three Elements

@@ -76,7 +81,10 @@
>
> 1. Go to (2) until there are no misclassified points left in the training set

Note the update formula in this original form: one can append a 1 to $x$ and merge $w$ and $b$ into a single weight vector, as the sketch below shows.
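
A minimal sketch of that trick (the numbers are arbitrary): append a constant 1 to $x$ so that $w\cdot x+b$ becomes a single inner product $\hat w\cdot \hat x$ with $\hat w=(w,b)$ and $\hat x=(x,1)$.

```python
import numpy as np

w, b = np.array([1.0, 2.0]), -3.0
x = np.array([3.0, 3.0])

w_hat = np.hstack([w, b])    # fold the bias into the weight vector: (1, 2, -3)
x_hat = np.hstack([x, 1.0])  # augment the sample with a constant 1: (3, 3, 1)

# both sides evaluate to 6.0
assert np.isclose(np.dot(w, x) + b, np.dot(w_hat, x_hat))
```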

#### Dual Form

> Input: $T=\{(x_1,y_1),(x_2,y_2),\dots,(x_N,y_N)\}$, where $x_i\in \cal X\it =\bf R\it ^n$, $y_i\in \cal Y\it =\{-1,+1\}$, $i=1,2,\dots,N$; $0<\eta\leqslant 1$
>
> Output:
@@ -106,5 +114,46 @@
G=[x_i\cdot x_j]_{N\times N} \nonumber
$$
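
Since the dual form touches the inputs only through inner products, $G$ can be computed once up front. A minimal sketch on the data of Example 2.1 (the same points as `Input/data_2-1.txt` above):

```python
import numpy as np

X = np.array([[3.0, 3.0], [4.0, 3.0], [1.0, 1.0]])  # x1, x2, x3 from e2.1
G = np.dot(X, X.T)  # G[i, j] = x_i . x_j
print(G)
# [[18. 21.  6.]
#  [21. 25.  7.]
#  [ 6.  7.  2.]]
```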

## Examples

### Example 2.1

In this example, $\eta = 1$.

> Since the perceptron learning algorithm can start from different initial values or pick different misclassified points, its solutions can differ.

Also, right after this example, the part proving the convergence of the algorithm contains a passage, introduced **for convenience of exposition and derivation**, on merging the bias into the weight vector. This trick can be useful whenever inner products are involved; see also the related material in [CH06](../CH06/README.md) and [CH07](../CH07/README.md). A sketch of the updates on this example's data follows.
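
Here is a minimal sketch (not the repo's `Perceptron` class) of the original-form updates on this example's data, always taking the first misclassified point in scan order, which is one of several valid choices:

```python
import numpy as np

X = np.array([[3.0, 3.0], [4.0, 3.0], [1.0, 1.0]])
y = np.array([1, 1, -1])
w, b, eta = np.zeros(2), 0.0, 1.0

while True:
    for xi, yi in zip(X, y):
        if yi * (np.dot(w, xi) + b) <= 0:        # misclassified (or on the hyperplane)
            w, b = w + eta * yi * xi, b + eta * yi
            break
    else:
        break                                    # no misclassified point is left

print(w, b)  # [1. 1.] -3.0, matching the book's result for this scan order
```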

### Example 2.2

This example is also simple; note two points:

1. $\eta=1$
1. $\alpha_i\leftarrow \alpha_i+1, b\leftarrow b+y_i$

Regarding the above:

1. Why was $\eta$ chosen to be 1? The resulting values then stay on the order of magnitude of 1.
1. The update expression already uses the result $\eta=1$ for simplification: the general dual update is $\alpha_i\leftarrow \alpha_i+\eta, b\leftarrow b+\eta y_i$.

So here you can get a feel for the effect of adjusting the learning rate: the learning rate determines the parameter space. A sketch of the dual-form updates follows.
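
A minimal sketch of the dual form on the data of Example 2.1 (again a sketch, not the repo's class); with $\eta=1$ the update is literally $\alpha_i\leftarrow\alpha_i+1$, $b\leftarrow b+y_i$:

```python
import numpy as np

X = np.array([[3.0, 3.0], [4.0, 3.0], [1.0, 1.0]])
y = np.array([1, 1, -1])
N = len(y)
G = np.dot(X, X.T)                 # Gram matrix, precomputed once
alpha, b, eta = np.zeros(N), 0.0, 1.0

while True:
    for i in range(N):
        # decision value expressed through the Gram matrix only
        if y[i] * (np.sum(alpha * y * G[:, i]) + b) <= 0:
            alpha[i] += eta        # with eta = 1: alpha_i <- alpha_i + 1
            b += eta * y[i]        # and b <- b + y_i
            break
    else:
        break

w = np.dot(alpha * y, X)           # recover w = sum_i alpha_i y_i x_i
print(alpha, b, w)                 # [2. 0. 5.] -3.0 [1. 1.], as in the book
```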

### Logic_01

The often-cited **XOR** problem[^1] cannot be realized with a perceptron, because the corresponding data is not linearly separable. Other logic operations, however, can be implemented with a perceptron: supply the data for the corresponding operation and learn a model, as the sketch below illustrates.

The data in this example has two input dimensions; the NOT operation applies only to the first dimension of the input vector.
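
A quick check of the separability claim (a sketch with hand-picked weights, not learned ones): AND is linearly separable, while for XOR the four constraints $y_i(w\cdot x_i+b)>0$ contradict each other.

```python
import numpy as np

X = np.array([[1, 1], [1, 0], [0, 1], [0, 0]])
y_and = np.array([1, -1, -1, -1])   # AND with labels in {-1, +1}
w, b = np.array([1.0, 1.0]), -1.5   # a separating hyperplane for AND
print(np.all(y_and * (np.dot(X, w) + b) > 0))  # True

# XOR: the constraints for (1,0) and (0,1) sum to w1 + w2 + 2b > 0, while the
# constraints for (1,1) and (0,0) sum to -(w1 + w2 + 2b) > 0, a contradiction,
# so no (w, b) separates XOR.
```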

### Logic_02

The data in this example has three input dimensions.

### MNIST_01

This picks two classes of digits to distinguish; different choices should yield somewhat different results. The data itself is not uploaded, since sklearn provides the corresponding dataset, which is used directly. Note that the test case uses the digits 0 and 1, which are comparatively easy to tell apart.



## References

[^1]: [XOR](https://alan.do/minskys-and-or-theorem-a-single-perceptron-s-limitations-490c63a02e9f)
64 changes: 29 additions & 35 deletions CH02/perceptron.py
@@ -1,65 +1,59 @@
# -*-coding:utf-8-*-
# Project: CH02
# Filename: perceptron
# Author: 😏 <smirk dot cao at gmail dot com>
import numpy as np
import random

import argparse
import logging


class Perceptron(object):

    def __init__(self,
                 max_iter=5000,
                 eta=0.00001,
                 verbose=True):
        self.eta_ = eta
        self.max_iter_ = max_iter
        self.w = 0
        self.verbose = verbose

    def fit(self, X, y):
        # one extra weight for the bias b, which is folded into w
        self.w = np.zeros(X.shape[1] + 1)
        correct_count = 0
        n_iter_ = 0

        while n_iter_ < self.max_iter_:
            # pick a random sample; augment it with a constant 1 for the bias
            index = random.randint(0, y.shape[0] - 1)
            xx_ = np.hstack([X[index], 1])
            # map {0, 1} labels to {-1, +1} (labels already in {-1, +1} keep their sign)
            yy_ = 2 * y[index] - 1
            wx = np.dot(self.w, xx_)

            if wx * yy_ > 0:
                correct_count += 1
                if correct_count > self.max_iter_:
                    break
                continue

            # misclassified: w <- w + eta * y * x
            self.w += self.eta_ * yy_ * xx_
            n_iter_ += 1

        if self.verbose:
            print(n_iter_)

    def predict(self, X):
        # augment X with a column of ones for the bias b
        X = np.hstack([X, np.ones(X.shape[0]).reshape((-1, 1))])
        # activation function for the perceptron: sign
        rst = np.array([1 if rst else -1 for rst in np.dot(X, self.w) > 0])
        # np.sign(0) == 0, hence the comprehension instead of
        # rst = np.sign(np.dot(X, self.w))
        return rst


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)

    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--path", required=False, help="path to input data file")
    args = vars(ap.parse_args())
149 changes: 104 additions & 45 deletions CH02/unit_test.py
@@ -1,56 +1,115 @@
# -*-coding:utf-8-*-
# Project: CH02
# Filename: unit_test
# Author: 😏 <smirk dot cao at gmail dot com>
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_digits
from perceptron import *
import numpy as np
import argparse
import logging
import unittest


class TestPerceptron(unittest.TestCase):

    def test_e21(self):
        logger.info("test case e21")
        # data of e2.1
        data_raw = np.loadtxt("Input/data_2-1.txt")
        X = data_raw[:, :2]
        y = data_raw[:, -1]
        clf = Perceptron(eta=1)
        clf.fit(X, y)
        y_pred = clf.predict(X)
        logger.info(clf.w)
        logger.info(str(y_pred))
        self.assertListEqual(y.tolist(), y_pred.tolist())

    def test_e22(self):
        logger.info("test case e22")
        # same data as e2.1
        data_raw = np.loadtxt("Input/data_2-1.txt")
        X = data_raw[:, :2]
        y = data_raw[:, -1]
        clf = Perceptron(verbose=False)
        clf.fit(X, y)
        y_pred = clf.predict(X)
        logger.info(clf.w)
        logger.info(str(y_pred))
        self.assertListEqual(y.tolist(), y_pred.tolist())

    def test_logic_1(self):
        # load data
        data_raw = np.loadtxt("Input/logic_data_1.txt")
        X = data_raw[:, :2]
        clf = Perceptron(max_iter=100, eta=0.0001, verbose=False)
        # test and
        y = data_raw[:, 2]
        clf.fit(X, y)
        y_pred = clf.predict(X)
        logger.info("test case logic_1 and")
        self.assertListEqual(y.tolist(), y_pred.tolist())
        # test or
        logger.info("test logic_1 or")
        y = data_raw[:, 3]
        clf.fit(X, y)
        y_pred = clf.predict(X)
        self.assertListEqual(y.tolist(), y_pred.tolist())
        # test not
        logger.info("test logic_1 not")
        y = data_raw[:, 4]
        clf.fit(X, y)
        y_pred = clf.predict(X)
        self.assertListEqual(y.tolist(), y_pred.tolist())

    def test_logic_2(self):
        # load data
        data_raw = np.loadtxt("Input/logic_data_2.txt")
        X = data_raw[:, :3]
        clf = Perceptron(max_iter=100, eta=0.0001, verbose=False)
        # test and
        y = data_raw[:, 3]
        clf.fit(X, y)
        y_pred = clf.predict(X)
        logger.info("test case logic_2 and")
        self.assertListEqual(y.tolist(), y_pred.tolist())
        # test or
        logger.info("test logic_2 or")
        y = data_raw[:, 4]
        clf.fit(X, y)
        y_pred = clf.predict(X)
        self.assertListEqual(y.tolist(), y_pred.tolist())
        # test not
        logger.info("test logic_2 not")
        y = data_raw[:, 5]
        clf.fit(X, y)
        y_pred = clf.predict(X)
        self.assertListEqual(y.tolist(), y_pred.tolist())

    def test_mnist(self):
        raw_data = load_digits(n_class=2)
        X = raw_data.data
        y = raw_data.target
        # 0 and 1 are relatively easy to tell apart; map label 0 to -1
        y[y == 0] = -1

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=2018)

        clf = Perceptron()
        clf.fit(X_train, y_train)
        test_predict = clf.predict(X_test)
        score = accuracy_score(y_test, test_predict)
        logger.info("The accuracy score is %2.2f" % score)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)

    ap = argparse.ArgumentParser()
    ap.add_argument("-p", "--path", required=False, help="path to input data file")
    args = vars(ap.parse_args())

    unittest.main()