Skip to content

Commit

Permalink
Partial training base (#78)
Browse files Browse the repository at this point in the history
* Cost values for multiclass OneVsRest uses

* Partial training interface

* Reduce linear classifiers memory usage

* Testing partial training and isolated training

* Partial trainer naming switched to incremental estimator

Other changes made in response to review feedback.

* Clean optimization data once optimize is finished

* Abstract resetBinary
dmonllao authored and akondas committed Apr 19, 2017
1 parent c0463ae commit e1854d4
Showing 11 changed files with 283 additions and 107 deletions.
7 changes: 5 additions & 2 deletions src/Phpml/Classification/Linear/Adaline.php
Original file line number Diff line number Diff line change
@@ -53,8 +53,11 @@ public function __construct(float $learningRate = 0.001, int $maxIterations = 10
/**
* Adapts the weights with respect to given samples and targets
* by use of gradient descent learning rule
*
* @param array $samples
* @param array $targets
*/
protected function runTraining()
protected function runTraining(array $samples, array $targets)
{
// The cost function is the sum of squares
$callback = function ($weights, $sample, $target) {
@@ -69,6 +72,6 @@ protected function runTraining()

$isBatch = $this->trainingType == self::BATCH_TRAINING;

return parent::runGradientDescent($callback, $isBatch);
return parent::runGradientDescent($samples, $targets, $callback, $isBatch);
}
}
54 changes: 32 additions & 22 deletions src/Phpml/Classification/Linear/DecisionStump.php
Original file line number Diff line number Diff line change
@@ -89,35 +89,33 @@ public function __construct(int $columnIndex = self::AUTO_SELECT)
* @param array $targets
* @throws \Exception
*/
protected function trainBinary(array $samples, array $targets)
protected function trainBinary(array $samples, array $targets, array $labels)
{
$this->samples = array_merge($this->samples, $samples);
$this->targets = array_merge($this->targets, $targets);
$this->binaryLabels = array_keys(array_count_values($this->targets));
$this->featureCount = count($this->samples[0]);
$this->binaryLabels = $labels;
$this->featureCount = count($samples[0]);

// If a column index is given, it should be among the existing columns
if ($this->givenColumnIndex > count($this->samples[0]) - 1) {
if ($this->givenColumnIndex > count($samples[0]) - 1) {
$this->givenColumnIndex = self::AUTO_SELECT;
}

// Check the size of the weights given.
// If none given, then assign 1 as a weight to each sample
if ($this->weights) {
$numWeights = count($this->weights);
if ($numWeights != count($this->samples)) {
if ($numWeights != count($samples)) {
throw new \Exception("Number of sample weights does not match with number of samples");
}
} else {
$this->weights = array_fill(0, count($this->samples), 1);
$this->weights = array_fill(0, count($samples), 1);
}

// Determine type of each column as either "continuous" or "nominal"
$this->columnTypes = DecisionTree::getColumnTypes($this->samples);
$this->columnTypes = DecisionTree::getColumnTypes($samples);

// Try to find the best split in the columns of the dataset
// by calculating error rate for each split point in each column
$columns = range(0, count($this->samples[0]) - 1);
$columns = range(0, count($samples[0]) - 1);
if ($this->givenColumnIndex != self::AUTO_SELECT) {
$columns = [$this->givenColumnIndex];
}
@@ -128,9 +126,9 @@ protected function trainBinary(array $samples, array $targets)
'trainingErrorRate' => 1.0];
foreach ($columns as $col) {
if ($this->columnTypes[$col] == DecisionTree::CONTINUOUS) {
$split = $this->getBestNumericalSplit($col);
$split = $this->getBestNumericalSplit($samples, $targets, $col);
} else {
$split = $this->getBestNominalSplit($col);
$split = $this->getBestNominalSplit($samples, $targets, $col);
}

if ($split['trainingErrorRate'] < $bestSplit['trainingErrorRate']) {
@@ -161,13 +159,15 @@ public function setNumericalSplitCount(float $count)
/**
* Determines best split point for the given column
*
* @param array $samples
* @param array $targets
* @param int $col
*
* @return array
*/
protected function getBestNumericalSplit(int $col)
protected function getBestNumericalSplit(array $samples, array $targets, int $col)
{
$values = array_column($this->samples, $col);
$values = array_column($samples, $col);
// Trying all possible points may be accomplished in two general ways:
// 1- Try all values in the $samples array ($values)
// 2- Artificially split the range of values into several parts and try them
@@ -182,7 +182,7 @@ protected function getBestNumericalSplit(int $col)
// Before trying all possible split points, let's first try
// the average value for the cut point
$threshold = array_sum($values) / (float) count($values);
list($errorRate, $prob) = $this->calculateErrorRate($threshold, $operator, $values);
list($errorRate, $prob) = $this->calculateErrorRate($targets, $threshold, $operator, $values);
if ($split == null || $errorRate < $split['trainingErrorRate']) {
$split = ['value' => $threshold, 'operator' => $operator,
'prob' => $prob, 'column' => $col,
@@ -192,7 +192,7 @@ protected function getBestNumericalSplit(int $col)
// Try other possible points one by one
for ($step = $minValue; $step <= $maxValue; $step+= $stepSize) {
$threshold = (float)$step;
list($errorRate, $prob) = $this->calculateErrorRate($threshold, $operator, $values);
list($errorRate, $prob) = $this->calculateErrorRate($targets, $threshold, $operator, $values);
if ($errorRate < $split['trainingErrorRate']) {
$split = ['value' => $threshold, 'operator' => $operator,
'prob' => $prob, 'column' => $col,
@@ -205,21 +205,23 @@ protected function getBestNumericalSplit(int $col)
}

/**
* @param array $samples
* @param array $targets
* @param int $col
*
* @return array
*/
protected function getBestNominalSplit(int $col) : array
protected function getBestNominalSplit(array $samples, array $targets, int $col) : array
{
$values = array_column($this->samples, $col);
$values = array_column($samples, $col);
$valueCounts = array_count_values($values);
$distinctVals= array_keys($valueCounts);

$split = null;

foreach (['=', '!='] as $operator) {
foreach ($distinctVals as $val) {
list($errorRate, $prob) = $this->calculateErrorRate($val, $operator, $values);
list($errorRate, $prob) = $this->calculateErrorRate($targets, $val, $operator, $values);

if ($split == null || $split['trainingErrorRate'] < $errorRate) {
$split = ['value' => $val, 'operator' => $operator,
@@ -260,13 +262,14 @@ protected function evaluate($leftValue, $operator, $rightValue)
* Calculates the ratio of wrong predictions based on the new threshold
* value given as the parameter
*
* @param array $targets
* @param float $threshold
* @param string $operator
* @param array $values
*
* @return array
*/
protected function calculateErrorRate(float $threshold, string $operator, array $values) : array
protected function calculateErrorRate(array $targets, float $threshold, string $operator, array $values) : array
{
$wrong = 0.0;
$prob = [];
@@ -280,8 +283,8 @@ protected function calculateErrorRate(float $threshold, string $operator, array
$predicted = $rightLabel;
}

$target = $this->targets[$index];
if (strval($predicted) != strval($this->targets[$index])) {
$target = $targets[$index];
if (strval($predicted) != strval($targets[$index])) {
$wrong += $this->weights[$index];
}

@@ -340,6 +343,13 @@ protected function predictSampleBinary(array $sample)
return $this->binaryLabels[1];
}

/**
* Reset hook called from OneVsRest::reset(); deliberately a no-op for this classifier.
*
* NOTE(review): trainBinary() fills $this->weights when none were supplied and
* nothing here clears them again — confirm that leaving them in place between
* training runs is intended.
*
* @return void
*/
protected function resetBinary()
{
}

/**
* @return string
*/
23 changes: 14 additions & 9 deletions src/Phpml/Classification/Linear/LogisticRegression.php
Original file line number Diff line number Diff line change
@@ -123,34 +123,39 @@ public function setLambda(float $lambda)
/**
* Adapts the weights with respect to given samples and targets
* by use of selected solver
*
* @param array $samples
* @param array $targets
*/
protected function runTraining()
protected function runTraining(array $samples, array $targets)
{
$callback = $this->getCostFunction();

switch ($this->trainingType) {
case self::BATCH_TRAINING:
return $this->runGradientDescent($callback, true);
return $this->runGradientDescent($samples, $targets, $callback, true);

case self::ONLINE_TRAINING:
return $this->runGradientDescent($callback, false);
return $this->runGradientDescent($samples, $targets, $callback, false);

case self::CONJUGATE_GRAD_TRAINING:
return $this->runConjugateGradient($callback);
return $this->runConjugateGradient($samples, $targets, $callback);
}
}

/**
* Executes Conjugate Gradient method to optimize the
* weights of the LogReg model
*/
protected function runConjugateGradient(\Closure $gradientFunc)
protected function runConjugateGradient(array $samples, array $targets, \Closure $gradientFunc)
{
$optimizer = (new ConjugateGradient($this->featureCount))
->setMaxIterations($this->maxIterations);
if (empty($this->optimizer)) {
$this->optimizer = (new ConjugateGradient($this->featureCount))
->setMaxIterations($this->maxIterations);
}

$this->weights = $optimizer->runOptimization($this->samples, $this->targets, $gradientFunc);
$this->costValues = $optimizer->getCostValues();
$this->weights = $this->optimizer->runOptimization($samples, $targets, $gradientFunc);
$this->costValues = $this->optimizer->getCostValues();
}

/**
80 changes: 50 additions & 30 deletions src/Phpml/Classification/Linear/Perceptron.php
Original file line number Diff line number Diff line change
@@ -10,20 +10,17 @@
use Phpml\Helper\Optimizer\GD;
use Phpml\Classification\Classifier;
use Phpml\Preprocessing\Normalizer;
use Phpml\IncrementalEstimator;
use Phpml\Helper\PartiallyTrainable;

class Perceptron implements Classifier
class Perceptron implements Classifier, IncrementalEstimator
{
use Predictable, OneVsRest;

/**
* @var array
*/
protected $samples = [];

/**
* @var array
* @var \Phpml\Helper\Optimizer\Optimizer
*/
protected $targets = [];
protected $optimizer;

/**
* @var array
@@ -93,32 +90,47 @@ public function __construct(float $learningRate = 0.001, int $maxIterations = 10
$this->maxIterations = $maxIterations;
}

/**
 * Trains the model on one more batch of data without discarding what it has
 * already learnt: unlike train(), no reset is performed before delegating to
 * trainByLabel(), so classifiers created by earlier calls are reused.
 *
 * @param array $samples Batch of samples to learn from
 * @param array $targets Target of each sample, in the same order as $samples
 * @param array $labels  Every label of the whole training set; when left empty
 *                       the labels are taken from $targets of this batch only
 */
public function partialTrain(array $samples, array $targets, array $labels = [])
{
    return $this->trainByLabel($samples, $targets, $labels);
}

/**
* @param array $samples
* @param array $targets
* @param array $labels
*/
public function trainBinary(array $samples, array $targets)
public function trainBinary(array $samples, array $targets, array $labels)
{
$this->labels = array_keys(array_count_values($targets));
if (count($this->labels) > 2) {
throw new \Exception("Perceptron is for binary (two-class) classification only");
}

if ($this->normalizer) {
$this->normalizer->transform($samples);
}

// Set all target values to either -1 or 1
$this->labels = [1 => $this->labels[0], -1 => $this->labels[1]];
foreach ($targets as $target) {
$this->targets[] = strval($target) == strval($this->labels[1]) ? 1 : -1;
$this->labels = [1 => $labels[0], -1 => $labels[1]];
foreach ($targets as $key => $target) {
$targets[$key] = strval($target) == strval($this->labels[1]) ? 1 : -1;
}

// Set samples and feature count vars
$this->samples = array_merge($this->samples, $samples);
$this->featureCount = count($this->samples[0]);
$this->featureCount = count($samples[0]);

$this->runTraining($samples, $targets);
}

$this->runTraining();
/**
 * Puts the binary classifier back into its untrained state: forgets the
 * learnt weights and recorded cost values, the label mapping and feature
 * count, and drops the cached optimizer so the next training run builds a
 * fresh one.
 *
 * @return void
 */
protected function resetBinary()
{
    // Results of the previous training run
    $this->weights = null;
    $this->costValues = [];

    // Description of the previous training data
    $this->featureCount = 0;
    $this->labels = [];

    // Cached optimizer instance; recreated on demand when it is empty
    $this->optimizer = null;
}

/**
@@ -151,8 +163,11 @@ public function getCostValues()
/**
* Trains the perceptron model with Stochastic Gradient Descent optimization
* to get the correct set of weights
*
* @param array $samples
* @param array $targets
*/
protected function runTraining()
protected function runTraining(array $samples, array $targets)
{
// The cost function is the sum of squares
$callback = function ($weights, $sample, $target) {
@@ -165,25 +180,30 @@ protected function runTraining()
return [$error, $gradient];
};

$this->runGradientDescent($callback);
$this->runGradientDescent($samples, $targets, $callback);
}

/**
* Executes Stochastic Gradient Descent algorithm for
* Executes a Gradient Descent algorithm for
* the given cost function
*
* @param array $samples
* @param array $targets
*/
protected function runGradientDescent(\Closure $gradientFunc, bool $isBatch = false)
protected function runGradientDescent(array $samples, array $targets, \Closure $gradientFunc, bool $isBatch = false)
{
$class = $isBatch ? GD::class : StochasticGD::class;

$optimizer = (new $class($this->featureCount))
->setLearningRate($this->learningRate)
->setMaxIterations($this->maxIterations)
->setChangeThreshold(1e-6)
->setEarlyStop($this->enableEarlyStop);
if (empty($this->optimizer)) {
$this->optimizer = (new $class($this->featureCount))
->setLearningRate($this->learningRate)
->setMaxIterations($this->maxIterations)
->setChangeThreshold(1e-6)
->setEarlyStop($this->enableEarlyStop);
}

$this->weights = $optimizer->runOptimization($this->samples, $this->targets, $gradientFunc);
$this->costValues = $optimizer->getCostValues();
$this->weights = $this->optimizer->runOptimization($samples, $targets, $gradientFunc);
$this->costValues = $this->optimizer->getCostValues();
}

/**
137 changes: 99 additions & 38 deletions src/Phpml/Helper/OneVsRest.php
Original file line number Diff line number Diff line change
@@ -6,30 +6,23 @@

trait OneVsRest
{
/**
* @var array
*/
protected $samples = [];

/**
* @var array
*/
protected $targets = [];

/**
* @var array
*/
protected $classifiers;
protected $classifiers = [];

/**
* All provided training targets' labels.
*
* @var array
*/
protected $labels;
protected $allLabels = [];

/**
* @var array
*/
protected $costValues;
protected $costValues = [];

/**
* Train a binary classifier in the OvR style
@@ -39,51 +32,111 @@ trait OneVsRest
*/
public function train(array $samples, array $targets)
{
    // Start from scratch: reset() empties the classifier list, the known
    // labels and the cost values, so no separate clean-up (nor an eager clone
    // of $this) is needed here. Partial training skips this step and calls
    // trainByLabel() directly.
    $this->reset();

    return $this->trainByLabel($samples, $targets);
}

/**
* @param array $samples
* @param array $targets
* @param array $allLabels All training set labels
* @return void
*/
protected function trainByLabel(array $samples, array $targets, array $allLabels = array())
{

// Overwrites the current value if it exists. $allLabels must be provided on every partialTrain run.
if (!empty($allLabels)) {
$this->allLabels = $allLabels;
} else {
$this->allLabels = array_keys(array_count_values($targets));
}
sort($this->allLabels, SORT_STRING);

// If there are only two targets, then there is no need to perform OvR
$this->labels = array_keys(array_count_values($targets));
if (count($this->labels) == 2) {
$classifier->trainBinary($samples, $targets);
$this->classifiers[] = $classifier;
if (count($this->allLabels) == 2) {

// Init classifier if required.
if (empty($this->classifiers)) {
$this->classifiers[0] = $this->getClassifierCopy();
}

$this->classifiers[0]->trainBinary($samples, $targets, $this->allLabels);
} else {
// Train a separate classifier for each label and memorize them
$this->samples = $samples;
$this->targets = $targets;
foreach ($this->labels as $label) {
$predictor = clone $classifier;
$targets = $this->binarizeTargets($label);
$predictor->trainBinary($samples, $targets);
$this->classifiers[$label] = $predictor;

foreach ($this->allLabels as $label) {

// Init classifier if required.
if (empty($this->classifiers[$label])) {
$this->classifiers[$label] = $this->getClassifierCopy();
}

list($binarizedTargets, $classifierLabels) = $this->binarizeTargets($targets, $label);
$this->classifiers[$label]->trainBinary($samples, $binarizedTargets, $classifierLabels);
}
}

// If the underlying classifier is capable of giving the cost values
// during the training, then assign it to the relevant variable
if (method_exists($this->classifiers[0], 'getCostValues')) {
$this->costValues = $this->classifiers[0]->getCostValues();
// Adding just the first classifier cost values to avoid complex average calculations.
$classifierref = reset($this->classifiers);
if (method_exists($classifierref, 'getCostValues')) {
$this->costValues = $classifierref->getCostValues();
}
}

/**
 * Brings the estimator back to its untrained state.
 *
 * Clears the OneVsRest bookkeeping (known labels, per-label classifiers and
 * cost values) and then lets the concrete classifier clear its own state
 * through resetBinary().
 */
public function reset()
{
    // OneVsRest bookkeeping
    $this->allLabels = [];
    $this->costValues = [];
    $this->classifiers = [];

    // Classifier-specific state
    $this->resetBinary();
}

/**
 * Builds a blank classifier of the same class as the current object.
 *
 * The current object is cloned and the clone is reset, so training the copy
 * does not touch the variables of the object that coordinates the training.
 *
 * @return static
 */
protected function getClassifierCopy()
{
    $copy = clone $this;

    // The clone carries over whatever state $this holds; wipe it before use.
    $copy->reset();

    return $copy;
}

/**
* Groups all targets into two groups: Targets equal to
* the given label and the others
*
* $targets is not passed by reference nor contains objects so this method
* changes will not affect the caller $targets array.
*
* @param array $targets
* @param mixed $label
* @return array Binarized targets and target's labels
*/
private function binarizeTargets($label)
private function binarizeTargets($targets, $label)
{
$targets = [];

foreach ($this->targets as $target) {
$targets[] = $target == $label ? $label : "not_$label";
$notLabel = "not_$label";
foreach ($targets as $key => $target) {
$targets[$key] = $target == $label ? $label : $notLabel;
}

return $targets;
$labels = array($label, $notLabel);
return array($targets, $labels);
}


@@ -94,7 +147,7 @@ private function binarizeTargets($label)
*/
protected function predictSample(array $sample)
{
if (count($this->labels) == 2) {
if (count($this->allLabels) == 2) {
return $this->classifiers[0]->predictSampleBinary($sample);
}

@@ -113,8 +166,16 @@ protected function predictSample(array $sample)
*
* @param array $samples
* @param array $targets
* @param array $labels
*/
abstract protected function trainBinary(array $samples, array $targets, array $labels);

/**
* To be overwritten by OneVsRest classifiers.
*
* @return void
*/
abstract protected function trainBinary(array $samples, array $targets);
abstract protected function resetBinary();

/**
* Each classifier that makes use of the OvR approach should be able to
2 changes: 2 additions & 0 deletions src/Phpml/Helper/Optimizer/ConjugateGradient.php
Original file line number Diff line number Diff line change
@@ -57,6 +57,8 @@ public function runOptimization(array $samples, array $targets, \Closure $gradie
}
}

$this->clear();

return $this->theta;
}

15 changes: 14 additions & 1 deletion src/Phpml/Helper/Optimizer/GD.php
Original file line number Diff line number Diff line change
@@ -15,7 +15,7 @@ class GD extends StochasticGD
*
* @var int
*/
protected $sampleCount;
protected $sampleCount = null;

/**
* @param array $samples
@@ -49,6 +49,8 @@ public function runOptimization(array $samples, array $targets, \Closure $gradie
}
}

$this->clear();

return $this->theta;
}

@@ -105,4 +107,15 @@ protected function updateWeightsWithUpdates(array $updates, float $penalty)
}
}
}

/**
 * Releases the data kept for the optimization run once it has finished:
 * the state handled by the parent optimizer plus the sample count that is
 * specific to batch gradient descent.
 *
 * @return void
 */
protected function clear()
{
    parent::clear();

    $this->sampleCount = null;
}
}
20 changes: 17 additions & 3 deletions src/Phpml/Helper/Optimizer/StochasticGD.php
Original file line number Diff line number Diff line change
@@ -16,22 +16,22 @@ class StochasticGD extends Optimizer
*
* @var array
*/
protected $samples;
protected $samples = [];

/**
* y (targets)
*
* @var array
*/
protected $targets;
protected $targets = [];

/**
* Callback function to get the gradient and cost value
* for a specific set of theta (ϴ) and a pair of sample & target
*
* @var \Closure
*/
protected $gradientCb;
protected $gradientCb = null;

/**
* Maximum number of iterations used to train the model
@@ -192,6 +192,8 @@ public function runOptimization(array $samples, array $targets, \Closure $gradie
}
}

$this->clear();

// Solution in the pocket is better than or equal to the last state
// so, we use this solution
return $this->theta = $bestTheta;
@@ -268,4 +270,16 @@ public function getCostValues()
{
return $this->costValues;
}

/**
 * Drops the references held for the optimization run (gradient callback,
 * samples and targets) so they are not kept on the optimizer afterwards.
 *
 * @return void
 */
protected function clear()
{
    $this->gradientCb = null;
    $this->targets = [];
    $this->samples = [];
}
}
16 changes: 16 additions & 0 deletions src/Phpml/IncrementalEstimator.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?php

declare(strict_types=1);

namespace Phpml;

/**
 * Contract for estimators that can be trained in several steps, one batch
 * of data at a time.
 */
interface IncrementalEstimator
{
    /**
     * Trains the estimator on one more batch of data.
     *
     * @param array $samples Batch of samples
     * @param array $targets Target of each sample in $samples
     * @param array $labels  Optional list of labels; presumably every label of
     *                       the complete training set, since a single batch may
     *                       not contain all of them
     */
    public function partialTrain(array $samples, array $targets, array $labels = []);
}
18 changes: 17 additions & 1 deletion tests/Phpml/Classification/Linear/AdalineTest.php
Original file line number Diff line number Diff line change
@@ -45,7 +45,23 @@ public function testPredictSingleSample()
$this->assertEquals(1, $classifier->predict([6.0, 5.0]));
$this->assertEquals(2, $classifier->predict([3.0, 9.5]));

return $classifier;
// Extra partial training should lead to the same results.
$classifier->partialTrain([[0, 1], [1, 0]], [0, 0], [0, 1, 2]);
$this->assertEquals(0, $classifier->predict([0.5, 0.5]));
$this->assertEquals(1, $classifier->predict([6.0, 5.0]));
$this->assertEquals(2, $classifier->predict([3.0, 9.5]));

// Train should clear previous data.
$samples = [
[0, 0], [0, 1], [1, 0], [1, 1], // First group : a cluster at bottom-left corner in 2D
[5, 5], [6, 5], [5, 6], [7, 5], // Second group: another cluster at the middle-right
[3, 10],[3, 10],[3, 8], [3, 9] // Third group : cluster at the top-middle
];
$targets = [2, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1];
$classifier->train($samples, $targets);
$this->assertEquals(2, $classifier->predict([0.5, 0.5]));
$this->assertEquals(0, $classifier->predict([6.0, 5.0]));
$this->assertEquals(1, $classifier->predict([3.0, 9.5]));
}

public function testSaveAndRestore()
18 changes: 17 additions & 1 deletion tests/Phpml/Classification/Linear/PerceptronTest.php
Original file line number Diff line number Diff line change
@@ -48,7 +48,23 @@ public function testPredictSingleSample()
$this->assertEquals(1, $classifier->predict([6.0, 5.0]));
$this->assertEquals(2, $classifier->predict([3.0, 9.5]));

return $classifier;
// Extra partial training should lead to the same results.
$classifier->partialTrain([[0, 1], [1, 0]], [0, 0], [0, 1, 2]);
$this->assertEquals(0, $classifier->predict([0.5, 0.5]));
$this->assertEquals(1, $classifier->predict([6.0, 5.0]));
$this->assertEquals(2, $classifier->predict([3.0, 9.5]));

// Train should clear previous data.
$samples = [
[0, 0], [0, 1], [1, 0], [1, 1], // First group : a cluster at bottom-left corner in 2D
[5, 5], [6, 5], [5, 6], [7, 5], // Second group: another cluster at the middle-right
[3, 10],[3, 10],[3, 8], [3, 9] // Third group : cluster at the top-middle
];
$targets = [2, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1];
$classifier->train($samples, $targets);
$this->assertEquals(2, $classifier->predict([0.5, 0.5]));
$this->assertEquals(0, $classifier->predict([6.0, 5.0]));
$this->assertEquals(1, $classifier->predict([3.0, 9.5]));
}

public function testSaveAndRestore()

0 comments on commit e1854d4

Please sign in to comment.