diff --git a/src/Phpml/Classification/DecisionTree.php b/src/Phpml/Classification/DecisionTree.php index c73f8706..231d766a 100644 --- a/src/Phpml/Classification/DecisionTree.php +++ b/src/Phpml/Classification/DecisionTree.php @@ -24,7 +24,7 @@ class DecisionTree implements Classifier /** * @var array */ - private $columnTypes; + protected $columnTypes; /** * @var array @@ -39,12 +39,12 @@ class DecisionTree implements Classifier /** * @var DecisionTreeLeaf */ - private $tree = null; + protected $tree = null; /** * @var int */ - private $maxDepth; + protected $maxDepth; /** * @var int @@ -79,6 +79,7 @@ public function __construct($maxDepth = 10) { $this->maxDepth = $maxDepth; } + /** * @param array $samples * @param array $targets @@ -209,6 +210,17 @@ protected function getBestSplit($records) $split->columnIndex = $i; $split->isContinuous = $this->columnTypes[$i] == self::CONTINUOS; $split->records = $records; + + // If a numeric column is to be selected, then + // the original numeric value and the selected operator + // will also be saved into the leaf for future access + if ($this->columnTypes[$i] == self::CONTINUOS) { + $matches = []; + preg_match("/^([<>=]{1,2})\s*(.*)/", strval($split->value), $matches); + $split->operator = $matches[1]; + $split->numericValue = floatval($matches[2]); + } + $bestSplit = $split; $bestGiniVal = $gini; } @@ -318,15 +330,21 @@ protected function preprocess(array $samples) protected function isCategoricalColumn(array $columnValues) { $count = count($columnValues); + // There are two main indicators that *may* show whether a // column is composed of discrete set of values: - // 1- Column may contain string values + // 1- Column may contain string values and not float values // 2- Number of unique values in the column is only a small fraction of // all values in that column (Lower than or equal to %20 of all values) $numericValues = array_filter($columnValues, 'is_numeric'); + $floatValues = array_filter($columnValues, 'is_float'); + if ($floatValues) { + return false; + } if (count($numericValues) != $count) { return true; } + $distinctValues = array_count_values($columnValues); if (count($distinctValues) <= $count / 5) { return true; @@ -357,9 +375,9 @@ public function setNumFeatures(int $numFeatures) } /** - * Used to set predefined features to consider while deciding which column to use for a split, + * Used to set predefined features to consider while deciding which column to use for a split * - * @param array $features + * @param array $selectedFeatures */ protected function setSelectedFeatures(array $selectedFeatures) { diff --git a/src/Phpml/Classification/DecisionTree/DecisionTreeLeaf.php b/src/Phpml/Classification/DecisionTree/DecisionTreeLeaf.php index e30fc109..bbb31751 100644 --- a/src/Phpml/Classification/DecisionTree/DecisionTreeLeaf.php +++ b/src/Phpml/Classification/DecisionTree/DecisionTreeLeaf.php @@ -11,6 +11,16 @@ class DecisionTreeLeaf */ public $value; + /** + * @var float + */ + public $numericValue; + + /** + * @var string + */ + public $operator; + /** * @var int */ @@ -66,13 +76,15 @@ class DecisionTreeLeaf public function evaluate($record) { $recordField = $record[$this->columnIndex]; - if ($this->isContinuous && preg_match("/^([<>=]{1,2})\s*(.*)/", strval($this->value), $matches)) { - $op = $matches[1]; - $value= floatval($matches[2]); + + if ($this->isContinuous) { + $op = $this->operator; + $value= $this->numericValue; $recordField = strval($recordField); eval("\$result = $recordField $op $value;"); return $result; } + return $recordField == $this->value; } diff --git a/src/Phpml/Classification/Ensemble/AdaBoost.php b/src/Phpml/Classification/Ensemble/AdaBoost.php new file mode 100644 index 00000000..70440a69 --- /dev/null +++ b/src/Phpml/Classification/Ensemble/AdaBoost.php @@ -0,0 +1,190 @@ +maxIterations = $maxIterations; + } + + /** + * @param array $samples + * @param array $targets + */ + public function train(array $samples, array $targets) + { + // Initialize usual variables + $this->labels = array_keys(array_count_values($targets)); + if (count($this->labels) != 2) { + throw new \Exception("AdaBoost is a binary classifier and can only classify between two classes"); + } + + // Set all target values to either -1 or 1 + $this->labels = [1 => $this->labels[0], -1 => $this->labels[1]]; + foreach ($targets as $target) { + $this->targets[] = $target == $this->labels[1] ? 1 : -1; + } + + $this->samples = array_merge($this->samples, $samples); + $this->featureCount = count($samples[0]); + $this->sampleCount = count($this->samples); + + // Initialize AdaBoost parameters + $this->weights = array_fill(0, $this->sampleCount, 1.0 / $this->sampleCount); + $this->classifiers = []; + $this->alpha = []; + + // Execute the algorithm for a maximum number of iterations + $currIter = 0; + while ($this->maxIterations > $currIter++) { + // Determine the best 'weak' classifier based on current weights + // and update alpha & weight values at each iteration + list($classifier, $errorRate) = $this->getBestClassifier(); + $alpha = $this->calculateAlpha($errorRate); + $this->updateWeights($classifier, $alpha); + + $this->classifiers[] = $classifier; + $this->alpha[] = $alpha; + } + } + + /** + * Returns the classifier with the lowest error rate with the + * consideration of current sample weights + * + * @return Classifier + */ + protected function getBestClassifier() + { + // This method works only for "DecisionStump" classifier, for now. + // As a future task, it will be generalized enough to work with other + // classifiers as well + $minErrorRate = 1.0; + $bestClassifier = null; + for ($i=0; $i < $this->featureCount; $i++) { + $stump = new DecisionStump($i); + $stump->setSampleWeights($this->weights); + $stump->train($this->samples, $this->targets); + + $errorRate = $stump->getTrainingErrorRate(); + if ($errorRate < $minErrorRate) { + $bestClassifier = $stump; + $minErrorRate = $errorRate; + } + } + + return [$bestClassifier, $minErrorRate]; + } + + /** + * Calculates alpha of a classifier + * + * @param float $errorRate + * @return float + */ + protected function calculateAlpha(float $errorRate) + { + if ($errorRate == 0) { + $errorRate = 1e-10; + } + return 0.5 * log((1 - $errorRate) / $errorRate); + } + + /** + * Updates the sample weights + * + * @param DecisionStump $classifier + * @param float $alpha + */ + protected function updateWeights(DecisionStump $classifier, float $alpha) + { + $sumOfWeights = array_sum($this->weights); + $weightsT1 = []; + foreach ($this->weights as $index => $weight) { + $desired = $this->targets[$index]; + $output = $classifier->predict($this->samples[$index]); + + $weight *= exp(-$alpha * $desired * $output) / $sumOfWeights; + + $weightsT1[] = $weight; + } + + $this->weights = $weightsT1; + } + + /** + * @param array $sample + * @return mixed + */ + public function predictSample(array $sample) + { + $sum = 0; + foreach ($this->alpha as $index => $alpha) { + $h = $this->classifiers[$index]->predict($sample); + $sum += $h * $alpha; + } + + return $this->labels[ $sum > 0 ? 1 : -1]; + } +} diff --git a/src/Phpml/Classification/Linear/Adaline.php b/src/Phpml/Classification/Linear/Adaline.php index 94283d97..aeff95e2 100644 --- a/src/Phpml/Classification/Linear/Adaline.php +++ b/src/Phpml/Classification/Linear/Adaline.php @@ -8,7 +8,6 @@ use Phpml\Helper\Trainable; use Phpml\Classification\Classifier; use Phpml\Classification\Linear\Perceptron; -use Phpml\Preprocessing\Normalizer; class Adaline extends Perceptron { @@ -38,11 +37,6 @@ class Adaline extends Perceptron */ protected $trainingType; - /** - * @var Normalizer - */ - private $normalizer; - /** * Initalize an Adaline (ADAptive LInear NEuron) classifier with given learning rate and maximum * number of iterations used while training the classifier
@@ -58,29 +52,13 @@ class Adaline extends Perceptron public function __construct(float $learningRate = 0.001, int $maxIterations = 1000, bool $normalizeInputs = true, int $trainingType = self::BATCH_TRAINING) { - if ($normalizeInputs) { - $this->normalizer = new Normalizer(Normalizer::NORM_STD); - } - if (! in_array($trainingType, [self::BATCH_TRAINING, self::ONLINE_TRAINING])) { throw new \Exception("Adaline can only be trained with batch and online/stochastic gradient descent algorithm"); } - $this->trainingType = $trainingType; - parent::__construct($learningRate, $maxIterations); - } - - /** - * @param array $samples - * @param array $targets - */ - public function train(array $samples, array $targets) - { - if ($this->normalizer) { - $this->normalizer->transform($samples); - } + $this->trainingType = $trainingType; - parent::train($samples, $targets); + parent::__construct($learningRate, $maxIterations, $normalizeInputs); } /** @@ -100,22 +78,8 @@ protected function runTraining() while ($this->maxIterations > $currIter++) { $outputs = array_map([$this, 'output'], $this->samples); $updates = array_map([$this, 'gradient'], $this->targets, $outputs); - $sum = array_sum($updates); - - // Updates all weights at once - for ($i=0; $i <= $this->featureCount; $i++) { - if ($i == 0) { - $this->weights[0] += $this->learningRate * $sum; - } else { - $col = array_column($this->samples, $i - 1); - $error = 0; - foreach ($col as $index => $val) { - $error += $val * $updates[$index]; - } - - $this->weights[$i] += $this->learningRate * $error; - } - } + + $this->updateWeights($updates); } } @@ -132,17 +96,27 @@ protected function gradient($desired, $output) } /** - * @param array $sample - * @return mixed + * Updates the weights of the network given the direction of the + * gradient for each sample + * + * @param array $updates */ - public function predictSample(array $sample) + protected function updateWeights(array $updates) { - if ($this->normalizer) { - $samples = [$sample]; - $this->normalizer->transform($samples); - $sample = $samples[0]; - } + // Updates all weights at once + for ($i=0; $i <= $this->featureCount; $i++) { + if ($i == 0) { + $this->weights[0] += $this->learningRate * array_sum($updates); + } else { + $col = array_column($this->samples, $i - 1); + + $error = 0; + foreach ($col as $index => $val) { + $error += $val * $updates[$index]; + } - return parent::predictSample($sample); + $this->weights[$i] += $this->learningRate * $error; + } + } } } diff --git a/src/Phpml/Classification/Linear/DecisionStump.php b/src/Phpml/Classification/Linear/DecisionStump.php index 18d44497..1220d48d 100644 --- a/src/Phpml/Classification/Linear/DecisionStump.php +++ b/src/Phpml/Classification/Linear/DecisionStump.php @@ -8,6 +8,7 @@ use Phpml\Helper\Trainable; use Phpml\Classification\Classifier; use Phpml\Classification\DecisionTree; +use Phpml\Classification\DecisionTree\DecisionTreeLeaf; class DecisionStump extends DecisionTree { @@ -19,6 +20,22 @@ class DecisionStump extends DecisionTree protected $columnIndex; + /** + * Sample weights : If used the optimization on the decision value + * will take these weights into account. If not given, all samples + * will be weighed with the same value of 1 + * + * @var array + */ + protected $weights = null; + + /** + * Lowest error rate obtained while training/optimizing the model + * + * @var float + */ + protected $trainingErrorRate; + /** * A DecisionStump classifier is a one-level deep DecisionTree. It is generally * used with ensemble algorithms as in the weak classifier role.
@@ -42,8 +59,7 @@ public function __construct(int $columnIndex = -1) */ public function train(array $samples, array $targets) { - // Check if a column index was given - if ($this->columnIndex >= 0 && $this->columnIndex > count($samples[0]) - 1) { + if ($this->columnIndex > count($samples[0]) - 1) { $this->columnIndex = -1; } @@ -51,6 +67,113 @@ public function train(array $samples, array $targets) $this->setSelectedFeatures([$this->columnIndex]); } + if ($this->weights) { + $numWeights = count($this->weights); + if ($numWeights != count($samples)) { + throw new \Exception("Number of sample weights does not match with number of samples"); + } + } else { + $this->weights = array_fill(0, count($samples), 1); + } + parent::train($samples, $targets); + + $this->columnIndex = $this->tree->columnIndex; + + // For numerical values, try to optimize the value by finding a different threshold value + if ($this->columnTypes[$this->columnIndex] == self::CONTINUOS) { + $this->optimizeDecision($samples, $targets); + } + } + + /** + * Used to set sample weights. + * + * @param array $weights + */ + public function setSampleWeights(array $weights) + { + $this->weights = $weights; + } + + /** + * Returns the training error rate, the proportion of wrong predictions + * over the total number of samples + * + * @return float + */ + public function getTrainingErrorRate() + { + return $this->trainingErrorRate; + } + + /** + * Tries to optimize the threshold by probing a range of different values + * between the minimum and maximum values in the selected column + * + * @param array $samples + * @param array $targets + */ + protected function optimizeDecision(array $samples, array $targets) + { + $values = array_column($samples, $this->columnIndex); + $minValue = min($values); + $maxValue = max($values); + $stepSize = ($maxValue - $minValue) / 100.0; + + $leftLabel = $this->tree->leftLeaf->classValue; + $rightLabel= $this->tree->rightLeaf->classValue; + + $bestOperator = $this->tree->operator; + $bestThreshold = $this->tree->numericValue; + $bestErrorRate = $this->calculateErrorRate( + $bestThreshold, $bestOperator, $values, $targets, $leftLabel, $rightLabel); + + foreach (['<=', '>'] as $operator) { + for ($step = $minValue; $step <= $maxValue; $step+= $stepSize) { + $threshold = (float)$step; + $errorRate = $this->calculateErrorRate( + $threshold, $operator, $values, $targets, $leftLabel, $rightLabel); + + if ($errorRate < $bestErrorRate) { + $bestErrorRate = $errorRate; + $bestThreshold = $threshold; + $bestOperator = $operator; + } + }// for + } + + // Update the tree node value + $this->tree->numericValue = $bestThreshold; + $this->tree->operator = $bestOperator; + $this->tree->value = "$bestOperator $bestThreshold"; + $this->trainingErrorRate = $bestErrorRate; + } + + /** + * Calculates the ratio of wrong predictions based on the new threshold + * value given as the parameter + * + * @param float $threshold + * @param string $operator + * @param array $values + * @param array $targets + * @param mixed $leftLabel + * @param mixed $rightLabel + */ + protected function calculateErrorRate(float $threshold, string $operator, array $values, array $targets, $leftLabel, $rightLabel) + { + $total = (float) array_sum($this->weights); + $wrong = 0; + + foreach ($values as $index => $value) { + eval("\$predicted = \$value $operator \$threshold ? \$leftLabel : \$rightLabel;"); + + if ($predicted != $targets[$index]) { + $wrong += $this->weights[$index]; + } + } + + return $wrong / $total; } } diff --git a/src/Phpml/Classification/Linear/Perceptron.php b/src/Phpml/Classification/Linear/Perceptron.php index 963638e6..78a204a1 100644 --- a/src/Phpml/Classification/Linear/Perceptron.php +++ b/src/Phpml/Classification/Linear/Perceptron.php @@ -7,6 +7,7 @@ use Phpml\Helper\Predictable; use Phpml\Helper\Trainable; use Phpml\Classification\Classifier; +use Phpml\Preprocessing\Normalizer; class Perceptron implements Classifier { @@ -55,6 +56,11 @@ class Perceptron implements Classifier */ protected $maxIterations; + /** + * @var Normalizer + */ + protected $normalizer; + /** * Initalize a perceptron classifier with given learning rate and maximum * number of iterations used while training the perceptron
@@ -64,7 +70,8 @@ class Perceptron implements Classifier * @param int $learningRate * @param int $maxIterations */ - public function __construct(float $learningRate = 0.001, int $maxIterations = 1000) + public function __construct(float $learningRate = 0.001, int $maxIterations = 1000, + bool $normalizeInputs = true) { if ($learningRate <= 0.0 || $learningRate > 1.0) { throw new \Exception("Learning rate should be a float value between 0.0(exclusive) and 1.0(inclusive)"); @@ -74,6 +81,10 @@ public function __construct(float $learningRate = 0.001, int $maxIterations = 10 throw new \Exception("Maximum number of iterations should be an integer greater than 0"); } + if ($normalizeInputs) { + $this->normalizer = new Normalizer(Normalizer::NORM_STD); + } + $this->learningRate = $learningRate; $this->maxIterations = $maxIterations; } @@ -89,6 +100,10 @@ public function train(array $samples, array $targets) throw new \Exception("Perceptron is for only binary (two-class) classification"); } + if ($this->normalizer) { + $this->normalizer->transform($samples); + } + // Set all target values to either -1 or 1 $this->labels = [1 => $this->labels[0], -1 => $this->labels[1]]; foreach ($targets as $target) { @@ -167,6 +182,12 @@ protected function outputClass(array $sample) */ protected function predictSample(array $sample) { + if ($this->normalizer) { + $samples = [$sample]; + $this->normalizer->transform($samples); + $sample = $samples[0]; + } + $predictedClass = $this->outputClass($sample); return $this->labels[ $predictedClass ]; diff --git a/tests/Phpml/Classification/Ensemble/AdaBoostTest.php b/tests/Phpml/Classification/Ensemble/AdaBoostTest.php new file mode 100644 index 00000000..c9e4d86a --- /dev/null +++ b/tests/Phpml/Classification/Ensemble/AdaBoostTest.php @@ -0,0 +1,64 @@ +train($samples, $targets); + $this->assertEquals(0, $classifier->predict([0.1, 0.2])); + $this->assertEquals(0, $classifier->predict([0.1, 0.99])); + $this->assertEquals(1, $classifier->predict([1.1, 0.8])); + + // OR problem + $samples = [[0, 0], [0.1, 0.2], [0.2, 0.1], [1, 0], [0, 1], [1, 1]]; + $targets = [0, 0, 0, 1, 1, 1]; + $classifier = new AdaBoost(); + $classifier->train($samples, $targets); + $this->assertEquals(0, $classifier->predict([0.1, 0.2])); + $this->assertEquals(1, $classifier->predict([0.1, 0.99])); + $this->assertEquals(1, $classifier->predict([1.1, 0.8])); + + // XOR problem + $samples = [[0.1, 0.2], [1., 1.], [0.9, 0.8], [0., 1.], [1., 0.], [0.2, 0.8]]; + $targets = [0, 0, 0, 1, 1, 1]; + $classifier = new AdaBoost(5); + $classifier->train($samples, $targets); + $this->assertEquals(0, $classifier->predict([0.1, 0.1])); + $this->assertEquals(1, $classifier->predict([0, 0.999])); + $this->assertEquals(0, $classifier->predict([1.1, 0.8])); + + return $classifier; + } + + public function testSaveAndRestore() + { + // Instantinate new Percetron trained for OR problem + $samples = [[0, 0], [1, 0], [0, 1], [1, 1]]; + $targets = [0, 1, 1, 1]; + $classifier = new AdaBoost(); + $classifier->train($samples, $targets); + $testSamples = [[0, 1], [1, 1], [0.2, 0.1]]; + $predicted = $classifier->predict($testSamples); + + $filename = 'adaboost-test-'.rand(100, 999).'-'.uniqid(); + $filepath = tempnam(sys_get_temp_dir(), $filename); + $modelManager = new ModelManager(); + $modelManager->saveToFile($classifier, $filepath); + + $restoredClassifier = $modelManager->restoreFromFile($filepath); + $this->assertEquals($classifier, $restoredClassifier); + $this->assertEquals($predicted, $restoredClassifier->predict($testSamples)); + } +}