Linear classifiers: Perceptron, Adaline, DecisionStump (#50)

* Linear classifiers * Code formatting to PSR-2 * Added basic test cases for linear classifiers
wuxiangwa · Feb 16, 2017 · cf222bc · cf222bc
1 parent f0a7984
commit cf222bc
Show file tree

Hide file tree

Showing 9 changed files with 676 additions and 9 deletions.
diff --git a/src/Phpml/Classification/DecisionTree.php b/src/Phpml/Classification/DecisionTree.php
@@ -56,6 +56,11 @@ class DecisionTree implements Classifier
      */
     private $numUsableFeatures = 0;
 
+    /**
+     * @var array
+     */
+    private $selectedFeatures;
+
     /**
      * @var array
      */
@@ -126,33 +131,45 @@ protected function getSplitLeaf($records, $depth = 0)
         if ($this->actualDepth < $depth) {
             $this->actualDepth = $depth;
         }
+
+        // Traverse all records to see if all records belong to the same class,
+        // otherwise group the records so that we can classify the leaf
+        // in case maximum depth is reached
         $leftRecords = [];
         $rightRecords= [];
         $remainingTargets = [];
         $prevRecord = null;
         $allSame = true;
+
         foreach ($records as $recordNo) {
+            // Check if the previous record is the same with the current one
             $record = $this->samples[$recordNo];
             if ($prevRecord && $prevRecord != $record) {
                 $allSame = false;
             }
             $prevRecord = $record;
+
+            // According to the split criterion, this record will
+            // belong to either left or the right side in the next split
             if ($split->evaluate($record)) {
                 $leftRecords[] = $recordNo;
             } else {
                 $rightRecords[]= $recordNo;
             }
+
+            // Group remaining targets
             $target = $this->targets[$recordNo];
-            if (! in_array($target, $remainingTargets)) {
-                $remainingTargets[] = $target;
+            if (! array_key_exists($target, $remainingTargets)) {
+                $remainingTargets[$target] = 1;
+            } else {
+                $remainingTargets[$target]++;
             }
         }
 
         if (count($remainingTargets) == 1 || $allSame || $depth >= $this->maxDepth) {
             $split->isTerminal = 1;
-            $classes = array_count_values($remainingTargets);
-            arsort($classes);
-            $split->classValue = key($classes);
+            arsort($remainingTargets);
+            $split->classValue = key($remainingTargets);
         } else {
             if ($leftRecords) {
                 $split->leftLeaf = $this->getSplitLeaf($leftRecords, $depth + 1);
@@ -200,15 +217,31 @@ protected function getBestSplit($records)
     }
 
     /**
+     * Returns available features/columns to the tree for the decision making
+     * process. <br>
+     *
+     * If a number is given with setNumFeatures() method, then a random selection
+     * of features up to this number is returned. <br>
+     *
+     * If some features are manually selected by use of setSelectedFeatures(),
+     * then only these features are returned <br>
+     *
+     * If any of above methods were not called beforehand, then all features
+     * are returned by default.
+     *
      * @return array
      */
     protected function getSelectedFeatures()
     {
         $allFeatures = range(0, $this->featureCount - 1);
-        if ($this->numUsableFeatures == 0) {
+        if ($this->numUsableFeatures == 0 && ! $this->selectedFeatures) {
             return $allFeatures;
         }
 
+        if ($this->selectedFeatures) {
+            return $this->selectedFeatures;
+        }
+
         $numFeatures = $this->numUsableFeatures;
         if ($numFeatures > $this->featureCount) {
             $numFeatures = $this->featureCount;
@@ -323,6 +356,16 @@ public function setNumFeatures(int $numFeatures)
         return $this;
     }
 
+    /**
+     * Used to set predefined features to consider while deciding which column to use for a split.
+     *
+     * Once a non-empty list has been stored here, getSelectedFeatures() returns
+     * that list as-is instead of all features or a random subset of them.
+     *
+     * @param array $selectedFeatures Features to restrict the split search to
+     *                                (presumably column indexes of the samples)
+     */
+    protected function setSelectedFeatures(array $selectedFeatures)
+    {
+        $this->selectedFeatures = $selectedFeatures;
+    }
+
     /**
      * A string array to represent columns. Useful when HTML output or
      * column importances are desired to be inspected.

diff --git a/src/Phpml/Classification/Linear/Adaline.php b/src/Phpml/Classification/Linear/Adaline.php
@@ -0,0 +1,148 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Phpml\Classification\Linear;
+
+use Phpml\Helper\Predictable;
+use Phpml\Helper\Trainable;
+use Phpml\Classification\Classifier;
+use Phpml\Classification\Linear\Perceptron;
+use Phpml\Preprocessing\Normalizer;
+
+class Adaline extends Perceptron
+{
+
+    /**
+     * Batch training is the default Adaline training algorithm
+     */
+    const BATCH_TRAINING    = 1;
+
+    /**
+     * Online training: Stochastic gradient descent learning
+     */
+    const ONLINE_TRAINING    = 2;
+
+    /**
+     * The function whose result will be used to calculate the network error
+     * for each instance
+     *
+     * @var string
+     */
+    protected static $errorFunction = 'output';
+
+    /**
+     * Training type: either self::BATCH_TRAINING or self::ONLINE_TRAINING
+     *
+     * @var int
+     */
+    protected $trainingType;
+
+    /**
+     * Standardizes the inputs; left unset (null) when input normalization is disabled
+     *
+     * @var Normalizer|null
+     */
+    private $normalizer;
+
+    /**
+     * Initialize an Adaline (ADAptive LInear NEuron) classifier with given learning rate and maximum
+     * number of iterations used while training the classifier <br>
+     *
+     * Learning rate should be a float value between 0.0(exclusive) and 1.0 (inclusive) <br>
+     * Maximum number of iterations can be an integer value greater than 0 <br>
+     * If normalizeInputs is set to true, then every input given to the algorithm will be standardized
+     * by use of standard deviation and mean calculation <br>
+     * Training type selects between batch and online (stochastic gradient descent) learning
+     *
+     * @param float $learningRate
+     * @param int   $maxIterations
+     * @param bool  $normalizeInputs
+     * @param int   $trainingType    self::BATCH_TRAINING or self::ONLINE_TRAINING
+     *
+     * @throws \Exception when $trainingType is not one of the two supported constants
+     */
+    public function __construct(float $learningRate = 0.001, int $maxIterations = 1000,
+        bool $normalizeInputs = true, int $trainingType = self::BATCH_TRAINING)
+    {
+        // Reject an unsupported training type before doing any other work
+        if (! in_array($trainingType, [self::BATCH_TRAINING, self::ONLINE_TRAINING], true)) {
+            throw new \Exception("Adaline can only be trained with batch and online/stochastic gradient descent algorithm");
+        }
+        $this->trainingType = $trainingType;
+
+        if ($normalizeInputs) {
+            $this->normalizer = new Normalizer(Normalizer::NORM_STD);
+        }
+
+        parent::__construct($learningRate, $maxIterations);
+    }
+
+    /**
+     * Trains the classifier, standardizing the samples first when normalization is enabled
+     *
+     * @param array $samples
+     * @param array $targets
+     */
+    public function train(array $samples, array $targets)
+    {
+        if ($this->normalizer) {
+            // The return value is not used: transform() is expected to
+            // modify $samples in place (verify against Normalizer)
+            $this->normalizer->transform($samples);
+        }
+
+        parent::train($samples, $targets);
+    }
+
+    /**
+     * Adapts the weights with respect to given samples and targets
+     * by use of gradient descent learning rule
+     */
+    protected function runTraining()
+    {
+        // If online training is chosen, then the parent runTraining method
+        // will be executed with the 'output' method as the error function
+        if ($this->trainingType == self::ONLINE_TRAINING) {
+            return parent::runTraining();
+        }
+
+        // Batch learning is executed.
+        // The samples are not modified inside the training loop, so every
+        // feature column is extracted only once; $columns[$i] belongs to
+        // weight $i (weight 0 is the bias and has no column)
+        $columns = [];
+        for ($i = 1; $i <= $this->featureCount; $i++) {
+            $columns[$i] = array_column($this->samples, $i - 1);
+        }
+
+        $currIter = 0;
+        while ($this->maxIterations > $currIter++) {
+            $outputs = array_map([$this, 'output'], $this->samples);
+            $updates = array_map([$this, 'gradient'], $this->targets, $outputs);
+
+            // Updates all weights at once: the bias moves by the summed
+            // update, every other weight by the update weighted with the
+            // values of its own feature column
+            $this->weights[0] += $this->learningRate * array_sum($updates);
+
+            foreach ($columns as $i => $col) {
+                $error = 0;
+                foreach ($col as $index => $val) {
+                    $error += $val * $updates[$index];
+                }
+
+                $this->weights[$i] += $this->learningRate * $error;
+            }
+        }
+    }
+
+    /**
+     * Returns the direction of gradient given the desired and actual outputs,
+     * i.e. the difference between the two
+     *
+     * @param int|float $desired
+     * @param int|float $output
+     * @return int|float
+     */
+    protected function gradient($desired, $output)
+    {
+        return $desired - $output;
+    }
+
+    /**
+     * Predicts the class of a single sample, standardizing it first
+     * when normalization is enabled
+     *
+     * @param array $sample
+     * @return mixed
+     */
+    public function predictSample(array $sample)
+    {
+        if ($this->normalizer) {
+            // The normalizer works on a list of samples, so wrap and unwrap
+            $samples = [$sample];
+            $this->normalizer->transform($samples);
+            $sample = $samples[0];
+        }
+
+        return parent::predictSample($sample);
+    }
+}
diff --git a/src/Phpml/Classification/Linear/DecisionStump.php b/src/Phpml/Classification/Linear/DecisionStump.php
@@ -0,0 +1,56 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Phpml\Classification\Linear;
+
+use Phpml\Helper\Predictable;
+use Phpml\Helper\Trainable;
+use Phpml\Classification\Classifier;
+use Phpml\Classification\DecisionTree;
+
+class DecisionStump extends DecisionTree
+{
+    use Trainable, Predictable;
+
+    /**
+     * Column requested for the decision node; a negative value means
+     * that no particular column is enforced
+     *
+     * @var int
+     */
+    protected $columnIndex;
+
+
+    /**
+     * Builds a DecisionStump, that is a one-level deep DecisionTree. Stumps are
+     * generally used by ensemble algorithms in the weak classifier role. <br>
+     *
+     * With a non-negative columnIndex the stump tries to produce its decision
+     * node on that column. With the default value of -1 the column is chosen
+     * by the stump itself (Default DecisionTree behaviour)
+     *
+     * @param int $columnIndex
+     */
+    public function __construct(int $columnIndex = -1)
+    {
+        $this->columnIndex = $columnIndex;
+
+        parent::__construct(1);
+    }
+
+    /**
+     * Trains the stump, restricting it to the requested column when that
+     * column exists in the given samples
+     *
+     * @param array $samples
+     * @param array $targets
+     */
+    public function train(array $samples, array $targets)
+    {
+        // Only look at the samples when a column index was actually given
+        if ($this->columnIndex >= 0) {
+            $lastColumn = count($samples[0]) - 1;
+
+            if ($this->columnIndex > $lastColumn) {
+                // The requested column does not exist: drop the request
+                $this->columnIndex = -1;
+            } else {
+                $this->setSelectedFeatures([$this->columnIndex]);
+            }
+        }
+
+        parent::train($samples, $targets);
+    }
+}