diff --git a/src/Phpml/Classification/DecisionTree.php b/src/Phpml/Classification/DecisionTree.php
index 6a860ebe..c73f8706 100644
--- a/src/Phpml/Classification/DecisionTree.php
+++ b/src/Phpml/Classification/DecisionTree.php
@@ -56,6 +56,11 @@ class DecisionTree implements Classifier
*/
private $numUsableFeatures = 0;
+ /**
+ * @var array
+ */
+ private $selectedFeatures;
+
/**
* @var array
*/
@@ -126,33 +131,45 @@ protected function getSplitLeaf($records, $depth = 0)
if ($this->actualDepth < $depth) {
$this->actualDepth = $depth;
}
+
+ // Traverse all records to see if all records belong to the same class,
+ // otherwise group the records so that we can classify the leaf
+ // in case maximum depth is reached
$leftRecords = [];
$rightRecords= [];
$remainingTargets = [];
$prevRecord = null;
$allSame = true;
+
foreach ($records as $recordNo) {
+ // Check if the previous record is the same as the current one
$record = $this->samples[$recordNo];
if ($prevRecord && $prevRecord != $record) {
$allSame = false;
}
$prevRecord = $record;
+
+ // According to the split criterion, this record will
+ // belong to either left or the right side in the next split
if ($split->evaluate($record)) {
$leftRecords[] = $recordNo;
} else {
$rightRecords[]= $recordNo;
}
+
+ // Group remaining targets
$target = $this->targets[$recordNo];
- if (! in_array($target, $remainingTargets)) {
- $remainingTargets[] = $target;
+ if (! array_key_exists($target, $remainingTargets)) {
+ $remainingTargets[$target] = 1;
+ } else {
+ $remainingTargets[$target]++;
}
}
if (count($remainingTargets) == 1 || $allSame || $depth >= $this->maxDepth) {
$split->isTerminal = 1;
- $classes = array_count_values($remainingTargets);
- arsort($classes);
- $split->classValue = key($classes);
+ arsort($remainingTargets);
+ $split->classValue = key($remainingTargets);
} else {
if ($leftRecords) {
$split->leftLeaf = $this->getSplitLeaf($leftRecords, $depth + 1);
@@ -200,15 +217,31 @@ protected function getBestSplit($records)
}
/**
+ * Returns available features/columns to the tree for the decision making
+ * process.
+ *
+ * If a number is given with setNumFeatures() method, then a random selection
+ * of features up to this number is returned.
+ *
+ * If some features are manually selected by use of setSelectedFeatures(),
+ * then only these features are returned
+ *
+ * If any of above methods were not called beforehand, then all features
+ * are returned by default.
+ *
* @return array
*/
protected function getSelectedFeatures()
{
$allFeatures = range(0, $this->featureCount - 1);
- if ($this->numUsableFeatures == 0) {
+ if ($this->numUsableFeatures == 0 && ! $this->selectedFeatures) {
return $allFeatures;
}
+ if ($this->selectedFeatures) {
+ return $this->selectedFeatures;
+ }
+
$numFeatures = $this->numUsableFeatures;
if ($numFeatures > $this->featureCount) {
$numFeatures = $this->featureCount;
@@ -323,6 +356,16 @@ public function setNumFeatures(int $numFeatures)
return $this;
}
+ /**
+ * Used to set predefined features to consider while deciding which column to use for a split.
+ *
+ * @param array $selectedFeatures
+ */
+ protected function setSelectedFeatures(array $selectedFeatures)
+ {
+ $this->selectedFeatures = $selectedFeatures;
+ }
+
/**
* A string array to represent columns. Useful when HTML output or
* column importances are desired to be inspected.
diff --git a/src/Phpml/Classification/Linear/Adaline.php b/src/Phpml/Classification/Linear/Adaline.php
new file mode 100644
index 00000000..94283d97
--- /dev/null
+++ b/src/Phpml/Classification/Linear/Adaline.php
@@ -0,0 +1,148 @@
+
+ *
+ * Learning rate should be a float value between 0.0(exclusive) and 1.0 (inclusive)
+ * Maximum number of iterations can be an integer value greater than 0
+ * If normalizeInputs is set to true, then every input given to the algorithm will be standardized
+ * by use of standard deviation and mean calculation
+ *
+ * @param float $learningRate
+ * @param int $maxIterations
+ */
+ public function __construct(float $learningRate = 0.001, int $maxIterations = 1000,
+ bool $normalizeInputs = true, int $trainingType = self::BATCH_TRAINING)
+ {
+ if ($normalizeInputs) {
+ $this->normalizer = new Normalizer(Normalizer::NORM_STD);
+ }
+
+ if (! in_array($trainingType, [self::BATCH_TRAINING, self::ONLINE_TRAINING])) {
+ throw new \Exception("Adaline can only be trained with batch and online/stochastic gradient descent algorithm");
+ }
+ $this->trainingType = $trainingType;
+
+ parent::__construct($learningRate, $maxIterations);
+ }
+
+ /**
+ * @param array $samples
+ * @param array $targets
+ */
+ public function train(array $samples, array $targets)
+ {
+ if ($this->normalizer) {
+ $this->normalizer->transform($samples);
+ }
+
+ parent::train($samples, $targets);
+ }
+
+ /**
+ * Adapts the weights with respect to given samples and targets
+ * by use of gradient descent learning rule
+ */
+ protected function runTraining()
+ {
+ // If online training is chosen, then the parent runTraining method
+ // will be executed with the 'output' method as the error function
+ if ($this->trainingType == self::ONLINE_TRAINING) {
+ return parent::runTraining();
+ }
+
+ // Batch learning is executed:
+ $currIter = 0;
+ while ($this->maxIterations > $currIter++) {
+ $outputs = array_map([$this, 'output'], $this->samples);
+ $updates = array_map([$this, 'gradient'], $this->targets, $outputs);
+ $sum = array_sum($updates);
+
+ // Updates all weights at once
+ for ($i=0; $i <= $this->featureCount; $i++) {
+ if ($i == 0) {
+ $this->weights[0] += $this->learningRate * $sum;
+ } else {
+ $col = array_column($this->samples, $i - 1);
+ $error = 0;
+ foreach ($col as $index => $val) {
+ $error += $val * $updates[$index];
+ }
+
+ $this->weights[$i] += $this->learningRate * $error;
+ }
+ }
+ }
+ }
+
+ /**
+ * Returns the direction of gradient given the desired and actual outputs
+ *
+ * @param int $desired
+ * @param float $output
+ * @return float
+ */
+ protected function gradient($desired, $output)
+ {
+ return $desired - $output;
+ }
+
+ /**
+ * @param array $sample
+ * @return mixed
+ */
+ public function predictSample(array $sample)
+ {
+ if ($this->normalizer) {
+ $samples = [$sample];
+ $this->normalizer->transform($samples);
+ $sample = $samples[0];
+ }
+
+ return parent::predictSample($sample);
+ }
+}
diff --git a/src/Phpml/Classification/Linear/DecisionStump.php b/src/Phpml/Classification/Linear/DecisionStump.php
new file mode 100644
index 00000000..18d44497
--- /dev/null
+++ b/src/Phpml/Classification/Linear/DecisionStump.php
@@ -0,0 +1,56 @@
+
+ *
+ * If columnIndex is given, then the stump tries to produce a decision node
+ * on this column; otherwise, when given the value of -1, the stump itself
+ * decides which column to take for the decision (Default DecisionTree behaviour)
+ *
+ * @param int $columnIndex
+ */
+ public function __construct(int $columnIndex = -1)
+ {
+ $this->columnIndex = $columnIndex;
+
+ parent::__construct(1);
+ }
+
+ /**
+ * @param array $samples
+ * @param array $targets
+ */
+ public function train(array $samples, array $targets)
+ {
+ // Check if a column index was given
+ if ($this->columnIndex >= 0 && $this->columnIndex > count($samples[0]) - 1) {
+ $this->columnIndex = -1;
+ }
+
+ if ($this->columnIndex >= 0) {
+ $this->setSelectedFeatures([$this->columnIndex]);
+ }
+
+ parent::train($samples, $targets);
+ }
+}
diff --git a/src/Phpml/Classification/Linear/Perceptron.php b/src/Phpml/Classification/Linear/Perceptron.php
new file mode 100644
index 00000000..963638e6
--- /dev/null
+++ b/src/Phpml/Classification/Linear/Perceptron.php
@@ -0,0 +1,174 @@
+
+ *
+ * Learning rate should be a float value between 0.0(exclusive) and 1.0(inclusive)
+ * Maximum number of iterations can be an integer value greater than 0
+ * @param float $learningRate
+ * @param int $maxIterations
+ */
+ public function __construct(float $learningRate = 0.001, int $maxIterations = 1000)
+ {
+ if ($learningRate <= 0.0 || $learningRate > 1.0) {
+ throw new \Exception("Learning rate should be a float value between 0.0(exclusive) and 1.0(inclusive)");
+ }
+
+ if ($maxIterations <= 0) {
+ throw new \Exception("Maximum number of iterations should be an integer greater than 0");
+ }
+
+ $this->learningRate = $learningRate;
+ $this->maxIterations = $maxIterations;
+ }
+
+ /**
+ * @param array $samples
+ * @param array $targets
+ */
+ public function train(array $samples, array $targets)
+ {
+ $this->labels = array_keys(array_count_values($targets));
+ if (count($this->labels) > 2) {
+ throw new \Exception("Perceptron is for only binary (two-class) classification");
+ }
+
+ // Set all target values to either -1 or 1
+ $this->labels = [1 => $this->labels[0], -1 => $this->labels[1]];
+ foreach ($targets as $target) {
+ $this->targets[] = $target == $this->labels[1] ? 1 : -1;
+ }
+
+ // Set samples and feature count vars
+ $this->samples = array_merge($this->samples, $samples);
+ $this->featureCount = count($this->samples[0]);
+
+ // Init weights with random values
+ $this->weights = array_fill(0, $this->featureCount + 1, 0);
+ foreach ($this->weights as &$weight) {
+ $weight = rand() / (float) getrandmax();
+ }
+ // Do training
+ $this->runTraining();
+ }
+
+ /**
+ * Adapts the weights with respect to given samples and targets
+ * by use of perceptron learning rule
+ */
+ protected function runTraining()
+ {
+ $currIter = 0;
+ while ($this->maxIterations > $currIter++) {
+ foreach ($this->samples as $index => $sample) {
+ $target = $this->targets[$index];
+ $prediction = $this->{static::$errorFunction}($sample);
+ $update = $target - $prediction;
+ // Update bias
+ $this->weights[0] += $update * $this->learningRate; // Bias
+ // Update other weights
+ for ($i=1; $i <= $this->featureCount; $i++) {
+ $this->weights[$i] += $update * $sample[$i - 1] * $this->learningRate;
+ }
+ }
+ }
+ }
+
+ /**
+ * Calculates net output of the network as a float value for the given input
+ *
+ * @param array $sample
+ * @return float
+ */
+ protected function output(array $sample)
+ {
+ $sum = 0;
+ foreach ($this->weights as $index => $w) {
+ if ($index == 0) {
+ $sum += $w;
+ } else {
+ $sum += $w * $sample[$index - 1];
+ }
+ }
+
+ return $sum;
+ }
+
+ /**
+ * Returns the class value (either -1 or 1) for the given input
+ *
+ * @param array $sample
+ * @return int
+ */
+ protected function outputClass(array $sample)
+ {
+ return $this->output($sample) > 0 ? 1 : -1;
+ }
+
+ /**
+ * @param array $sample
+ * @return mixed
+ */
+ protected function predictSample(array $sample)
+ {
+ $predictedClass = $this->outputClass($sample);
+
+ return $this->labels[ $predictedClass ];
+ }
+}
diff --git a/src/Phpml/Preprocessing/Normalizer.php b/src/Phpml/Preprocessing/Normalizer.php
index 5cff6e84..42a8f1c2 100644
--- a/src/Phpml/Preprocessing/Normalizer.php
+++ b/src/Phpml/Preprocessing/Normalizer.php
@@ -5,17 +5,35 @@
namespace Phpml\Preprocessing;
use Phpml\Exception\NormalizerException;
+use Phpml\Math\Statistic\StandardDeviation;
+use Phpml\Math\Statistic\Mean;
class Normalizer implements Preprocessor
{
const NORM_L1 = 1;
const NORM_L2 = 2;
+ const NORM_STD= 3;
/**
* @var int
*/
private $norm;
+ /**
+ * @var bool
+ */
+ private $fitted = false;
+
+ /**
+ * @var array
+ */
+ private $std;
+
+ /**
+ * @var array
+ */
+ private $mean;
+
/**
* @param int $norm
*
@@ -23,7 +41,7 @@ class Normalizer implements Preprocessor
*/
public function __construct(int $norm = self::NORM_L2)
{
- if (!in_array($norm, [self::NORM_L1, self::NORM_L2])) {
+ if (!in_array($norm, [self::NORM_L1, self::NORM_L2, self::NORM_STD])) {
throw NormalizerException::unknownNorm();
}
@@ -35,7 +53,20 @@ public function __construct(int $norm = self::NORM_L2)
*/
public function fit(array $samples)
{
- // intentionally not implemented
+ if ($this->fitted) {
+ return;
+ }
+
+ if ($this->norm == self::NORM_STD) {
+ $features = range(0, count($samples[0]) - 1);
+ foreach ($features as $i) {
+ $values = array_column($samples, $i);
+ $this->std[$i] = StandardDeviation::population($values);
+ $this->mean[$i] = Mean::arithmetic($values);
+ }
+ }
+
+ $this->fitted = true;
}
/**
@@ -43,7 +74,15 @@ public function fit(array $samples)
*/
public function transform(array &$samples)
{
- $method = sprintf('normalizeL%s', $this->norm);
+ $methods = [
+ self::NORM_L1 => 'normalizeL1',
+ self::NORM_L2 => 'normalizeL2',
+ self::NORM_STD=> 'normalizeSTD'
+ ];
+ $method = $methods[$this->norm];
+
+ $this->fit($samples);
+
foreach ($samples as &$sample) {
$this->$method($sample);
}
@@ -88,4 +127,14 @@ private function normalizeL2(array &$sample)
}
}
}
+
+ /**
+ * @param array $sample
+ */
+ private function normalizeSTD(array &$sample)
+ {
+ foreach ($sample as $i => $val) {
+ $sample[$i] = ($sample[$i] - $this->mean[$i]) / $this->std[$i];
+ }
+ }
}
diff --git a/tests/Phpml/Classification/Linear/AdalineTest.php b/tests/Phpml/Classification/Linear/AdalineTest.php
new file mode 100644
index 00000000..7ea63ab2
--- /dev/null
+++ b/tests/Phpml/Classification/Linear/AdalineTest.php
@@ -0,0 +1,55 @@
+train($samples, $targets);
+ $this->assertEquals(0, $classifier->predict([0.1, 0.2]));
+ $this->assertEquals(0, $classifier->predict([0.1, 0.99]));
+ $this->assertEquals(1, $classifier->predict([1.1, 0.8]));
+
+ // OR problem
+ $samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
+ $targets = [0, 1, 1, 1];
+ $classifier = new Adaline();
+ $classifier->train($samples, $targets);
+ $this->assertEquals(0, $classifier->predict([0.1, 0.2]));
+ $this->assertEquals(1, $classifier->predict([0.1, 0.99]));
+ $this->assertEquals(1, $classifier->predict([1.1, 0.8]));
+
+ return $classifier;
+ }
+
+ public function testSaveAndRestore()
+ {
+ // Instantiate a new Adaline trained for the OR problem
+ $samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
+ $targets = [0, 1, 1, 1];
+ $classifier = new Adaline();
+ $classifier->train($samples, $targets);
+ $testSamples = [[0, 1], [1, 1], [0.2, 0.1]];
+ $predicted = $classifier->predict($testSamples);
+
+ $filename = 'adaline-test-'.rand(100, 999).'-'.uniqid();
+ $filepath = tempnam(sys_get_temp_dir(), $filename);
+ $modelManager = new ModelManager();
+ $modelManager->saveToFile($classifier, $filepath);
+
+ $restoredClassifier = $modelManager->restoreFromFile($filepath);
+ $this->assertEquals($classifier, $restoredClassifier);
+ $this->assertEquals($predicted, $restoredClassifier->predict($testSamples));
+ }
+}
diff --git a/tests/Phpml/Classification/Linear/DecisionStumpTest.php b/tests/Phpml/Classification/Linear/DecisionStumpTest.php
new file mode 100644
index 00000000..f83e0953
--- /dev/null
+++ b/tests/Phpml/Classification/Linear/DecisionStumpTest.php
@@ -0,0 +1,59 @@
+train($samples, $targets);
+ $this->assertEquals(0, $classifier->predict([0.1, 0.2]));
+ $this->assertEquals(0, $classifier->predict([1.1, 0.2]));
+ $this->assertEquals(1, $classifier->predict([0.1, 0.99]));
+ $this->assertEquals(1, $classifier->predict([1.1, 0.8]));
+
+ // Then: vertical test
+ $samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
+ $targets = [0, 1, 0, 1];
+ $classifier = new DecisionStump();
+ $classifier->train($samples, $targets);
+ $this->assertEquals(0, $classifier->predict([0.1, 0.2]));
+ $this->assertEquals(0, $classifier->predict([0.1, 1.1]));
+ $this->assertEquals(1, $classifier->predict([1.0, 0.99]));
+ $this->assertEquals(1, $classifier->predict([1.1, 0.1]));
+
+ return $classifier;
+ }
+
+ public function testSaveAndRestore()
+ {
+ // Instantiate a new DecisionStump trained for the OR problem
+ $samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
+ $targets = [0, 1, 1, 1];
+ $classifier = new DecisionStump();
+ $classifier->train($samples, $targets);
+ $testSamples = [[0, 1], [1, 1], [0.2, 0.1]];
+ $predicted = $classifier->predict($testSamples);
+
+ $filename = 'dstump-test-'.rand(100, 999).'-'.uniqid();
+ $filepath = tempnam(sys_get_temp_dir(), $filename);
+ $modelManager = new ModelManager();
+ $modelManager->saveToFile($classifier, $filepath);
+
+ $restoredClassifier = $modelManager->restoreFromFile($filepath);
+ $this->assertEquals($classifier, $restoredClassifier);
+ $this->assertEquals($predicted, $restoredClassifier->predict($testSamples));
+ }
+}
diff --git a/tests/Phpml/Classification/Linear/PerceptronTest.php b/tests/Phpml/Classification/Linear/PerceptronTest.php
new file mode 100644
index 00000000..bf1b3847
--- /dev/null
+++ b/tests/Phpml/Classification/Linear/PerceptronTest.php
@@ -0,0 +1,55 @@
+train($samples, $targets);
+ $this->assertEquals(0, $classifier->predict([0.1, 0.2]));
+ $this->assertEquals(0, $classifier->predict([0.1, 0.99]));
+ $this->assertEquals(1, $classifier->predict([1.1, 0.8]));
+
+ // OR problem
+ $samples = [[0, 0], [0.1, 0.2], [1, 0], [0, 1], [1, 1]];
+ $targets = [0, 0, 1, 1, 1];
+ $classifier = new Perceptron(0.001, 5000);
+ $classifier->train($samples, $targets);
+ $this->assertEquals(0, $classifier->predict([0, 0]));
+ $this->assertEquals(1, $classifier->predict([0.1, 0.99]));
+ $this->assertEquals(1, $classifier->predict([1.1, 0.8]));
+
+ return $classifier;
+ }
+
+ public function testSaveAndRestore()
+ {
+ // Instantiate a new Perceptron trained for the OR problem
+ $samples = [[0, 0], [1, 0], [0, 1], [1, 1]];
+ $targets = [0, 1, 1, 1];
+ $classifier = new Perceptron();
+ $classifier->train($samples, $targets);
+ $testSamples = [[0, 1], [1, 1], [0.2, 0.1]];
+ $predicted = $classifier->predict($testSamples);
+
+ $filename = 'perceptron-test-'.rand(100, 999).'-'.uniqid();
+ $filepath = tempnam(sys_get_temp_dir(), $filename);
+ $modelManager = new ModelManager();
+ $modelManager->saveToFile($classifier, $filepath);
+
+ $restoredClassifier = $modelManager->restoreFromFile($filepath);
+ $this->assertEquals($classifier, $restoredClassifier);
+ $this->assertEquals($predicted, $restoredClassifier->predict($testSamples));
+ }
+}
diff --git a/tests/Phpml/Preprocessing/NormalizerTest.php b/tests/Phpml/Preprocessing/NormalizerTest.php
index 99ebf4e7..07d121cc 100644
--- a/tests/Phpml/Preprocessing/NormalizerTest.php
+++ b/tests/Phpml/Preprocessing/NormalizerTest.php
@@ -100,4 +100,32 @@ public function testL1NormWithZeroSumCondition()
$this->assertEquals($normalized, $samples, '', $delta = 0.01);
}
+
+ public function testStandardNorm()
+ {
+ // Generate 10 random vectors of length 3
+ $samples = [];
+ srand(time());
+ for ($i=0; $i<10; $i++) {
+ $sample = array_fill(0, 3, 0);
+ for ($k=0; $k<3; $k++) {
+ $sample[$k] = rand(1, 100);
+ }
+ $samples[] = $sample;
+ }
+
+ // Use standard normalization
+ $normalizer = new Normalizer(Normalizer::NORM_STD);
+ $normalizer->transform($samples);
+
+ // Values in the vector should be some value between -3 and +3
+ $this->assertCount(10, $samples);
+ foreach ($samples as $sample) {
+ $errors = array_filter($sample,
+ function ($element) {
+ return $element < -3 || $element > 3;
+ });
+ $this->assertCount(0, $errors);
+ }
+ }
}