Skip to content

Commit

Permalink
Ensemble Classifiers : Bagging and RandomForest (#36)
Browse files Browse the repository at this point in the history
* Fuzzy C-Means implementation

* Update FuzzyCMeans

* Rename FuzzyCMeans to FuzzyCMeans.php

* Update NaiveBayes.php

* Small fix applied to improve training performance

array_unique is replaced with array_count_values+array_keys which is way
faster

* Revert "Small fix applied to improve training performance"

This reverts commit c20253f.

* Revert "Revert "Small fix applied to improve training performance""

This reverts commit ea10e13.

* Revert "Small fix applied to improve training performance"

This reverts commit c20253f.

* First DecisionTree implementation

* Revert "First DecisionTree implementation"

This reverts commit 4057a08.

* DecisionTree

* FCM Test

* FCM Test

* DecisionTree Test

* Ensemble classifiers: Bagging and RandomForests

* test

* Fixes for conflicted files

* Bagging and RandomForest ensemble algorithms

* Changed unit test

* Changed unit test

* Changed unit test

* Bagging and RandomForest ensemble algorithms

* Bagging and RandomForest ensemble algorithms

* Bagging and RandomForest ensemble algorithms

RandomForest algorithm is improved with changes to original DecisionTree

* Bagging and RandomForest ensemble algorithms

* Slight fix about use of global Exception class

* Fixed the error about wrong use of global Exception class

* RandomForest code formatting
  • Loading branch information
MustafaKarabulut authored and akondas committed Feb 7, 2017
1 parent 72b25ff commit 1d73503
Show file tree
Hide file tree
Showing 6 changed files with 507 additions and 7 deletions.
60 changes: 54 additions & 6 deletions src/Phpml/Classification/DecisionTree.php
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ class DecisionTree implements Classifier
*/
public $actualDepth = 0;

/**
* @var int
*/
private $numUsableFeatures = 0;

/**
* @param int $maxDepth
*/
Expand Down Expand Up @@ -144,15 +149,15 @@ protected function getBestSplit($records)
$samples = array_combine($records, $this->preprocess($samples));
$bestGiniVal = 1;
$bestSplit = null;
for ($i=0; $i<$this->featureCount; $i++) {
$features = $this->getSelectedFeatures();
foreach ($features as $i) {
$colValues = [];
$baseValue = null;
foreach ($samples as $index => $row) {
$colValues[$index] = $row[$i];
if ($baseValue === null) {
$baseValue = $row[$i];
}
}
$counts = array_count_values($colValues);
arsort($counts);
$baseValue = key($counts);
$gini = $this->getGiniIndex($baseValue, $colValues, $targets);
if ($bestSplit == null || $bestGiniVal > $gini) {
$split = new DecisionTreeLeaf();
Expand All @@ -167,6 +172,27 @@ protected function getBestSplit($records)
return $bestSplit;
}

/**
 * Returns the column indices to be considered for the next split.
 *
 * When no feature limit is configured (numUsableFeatures is 0), every
 * column index is returned. Otherwise a random subset of at most
 * numUsableFeatures columns is drawn and reported in ascending order.
 *
 * @return array
 */
protected function getSelectedFeatures()
{
    $columns = range(0, $this->featureCount - 1);

    // No limit configured: consider every feature (default behaviour).
    if (!$this->numUsableFeatures) {
        return $columns;
    }

    // Cap the requested count at the number of available columns.
    $limit = min($this->numUsableFeatures, $this->featureCount);

    // Draw a random subset, then report it in ascending column order.
    shuffle($columns);
    $selected = array_slice($columns, 0, $limit);
    sort($selected);

    return $selected;
}

/**
* @param string $baseValue
* @param array $colValues
Expand Down Expand Up @@ -248,6 +274,27 @@ protected function isCategoricalColumn(array $columnValues)
return false;
}

/**
 * This method is used to set number of columns to be used
 * when deciding a split at an internal node of the tree. <br>
 * If the value is given 0, then all features are used (default behaviour),
 * otherwise the given value will be used as a maximum for number of columns
 * randomly selected for each split operation.
 *
 * @param int $numFeatures
 *
 * @return $this
 *
 * @throws \InvalidArgumentException when a negative column count is given
 */
public function setNumFeatures(int $numFeatures)
{
    if ($numFeatures < 0) {
        // \InvalidArgumentException extends \Exception, so existing
        // catch (\Exception) call sites keep working.
        throw new \InvalidArgumentException("Selected column count should be greater or equal to zero");
    }

    $this->numUsableFeatures = $numFeatures;

    return $this;
}

/**
* @return string
*/
Expand All @@ -273,6 +320,7 @@ protected function predictSample(array $sample)
$node = $node->rightLeaf;
}
} while ($node);
return $node->classValue;

return $node ? $node->classValue : $this->labels[0];
}
}
2 changes: 1 addition & 1 deletion src/Phpml/Classification/DecisionTree/DecisionTreeLeaf.php
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class DecisionTreeLeaf
public function evaluate($record)
{
$recordField = $record[$this->columnIndex];
if (preg_match("/^([<>=]{1,2})\s*(.*)/", $this->value, $matches)) {
if (is_string($this->value) && preg_match("/^([<>=]{1,2})\s*(.*)/", $this->value, $matches)) {
$op = $matches[1];
$value= floatval($matches[2]);
$recordField = strval($recordField);
Expand Down
198 changes: 198 additions & 0 deletions src/Phpml/Classification/Ensemble/Bagging.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
<?php

declare(strict_types=1);

namespace Phpml\Classification\Ensemble;

use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Statistic\Mean;
use Phpml\Classification\Classifier;
use Phpml\Classification\DecisionTree;
use Phpml\Classification\NaiveBayes;

class Bagging implements Classifier
{
    use Trainable, Predictable;

    /**
     * Total number of training samples accumulated so far.
     *
     * @var int
     */
    protected $numSamples;

    /**
     * Target values matching $samples, accumulated across train() calls.
     *
     * @var array
     */
    private $targets = [];

    /**
     * Number of columns in a training sample.
     *
     * @var int
     */
    protected $featureCount = 0;

    /**
     * Number of base classifiers in the ensemble.
     *
     * @var int
     */
    protected $numClassifier;

    /**
     * Fully-qualified class name of the base classifier.
     * (Holds a class-name string, not an instance.)
     *
     * @var string
     */
    protected $classifier = DecisionTree::class;

    /**
     * Constructor arguments passed to each base classifier,
     * in the order they appear in its constructor.
     *
     * @var array
     */
    protected $classifierOptions = ['depth' => 20];

    /**
     * Trained base classifiers.
     *
     * @var array
     */
    protected $classifiers;

    /**
     * Ratio of the training set used to build each bootstrap subset.
     *
     * @var float
     */
    protected $subsetRatio = 0.5;

    /**
     * Training samples accumulated across train() calls.
     *
     * @var array
     */
    private $samples = [];

    /**
     * Creates an ensemble classifier with given number of base classifiers.<br>
     * Default number of base classifiers is 50.
     * The more number of base classifiers, the better performance but at the cost of processing time
     *
     * @param int $numClassifier
     */
    public function __construct($numClassifier = 50)
    {
        $this->numClassifier = $numClassifier;
    }

    /**
     * This method determines the ratio of samples used to create the 'bootstrap' subset,
     * e.g., random samples drawn from the original dataset with replacement (allow repeats),
     * to train each base classifier.
     *
     * @param float $ratio
     *
     * @return $this
     *
     * @throws \Exception when the ratio is outside [0.1, 1.0]
     */
    public function setSubsetRatio(float $ratio)
    {
        if ($ratio < 0.1 || $ratio > 1.0) {
            throw new \Exception("Subset ratio should be between 0.1 and 1.0");
        }

        $this->subsetRatio = $ratio;

        return $this;
    }

    /**
     * This method is used to set the base classifier. Default value is
     * DecisionTree::class, but any class that implements the <i>Classifier</i>
     * can be used. <br>
     * While giving the parameters of the classifier, the values should be
     * given in the order they are in the constructor of the classifier and parameter
     * names are neglected.
     *
     * @param string $classifier
     * @param array $classifierOptions
     *
     * @return $this
     */
    public function setClassifer(string $classifier, array $classifierOptions = [])
    {
        $this->classifier = $classifier;
        $this->classifierOptions = $classifierOptions;

        return $this;
    }

    /**
     * Trains the ensemble: accumulates the given data, then trains each
     * base classifier on its own random bootstrap subset.
     *
     * @param array $samples
     * @param array $targets
     */
    public function train(array $samples, array $targets)
    {
        $this->samples = array_merge($this->samples, $samples);
        $this->targets = array_merge($this->targets, $targets);
        $this->featureCount = count($samples[0]);
        $this->numSamples = count($this->samples);

        // Init classifiers and train them with random sub-samples
        $this->classifiers = $this->initClassifiers();
        $index = 0;
        foreach ($this->classifiers as $classifier) {
            list($samples, $targets) = $this->getRandomSubset($index);
            $classifier->train($samples, $targets);
            ++$index;
        }
    }

    /**
     * Builds a random 'bootstrap' subset (sampling with replacement) used
     * to train the base classifier at the given position.
     *
     * @param int $index Position of the classifier in the ensemble; offsets
     *                   the sampling window so that subsets differ.
     *
     * @return array [samples, targets]
     */
    protected function getRandomSubset($index)
    {
        // Subset length derived from the configured ratio. Cast to int:
        // the previous float value was silently truncated by the '%'
        // operator below and produced a fractional loop bound.
        $subsetLength = (int) ceil($this->numSamples * $this->subsetRatio / 2);
        $index = $index * $subsetLength % $this->numSamples;
        $samples = [];
        $targets = [];
        for ($i = 0; $i < $subsetLength * 2; $i++) {
            // Sampling with replacement: repeated indices are allowed.
            $rand = rand($index, $this->numSamples - 1);
            $samples[] = $this->samples[$rand];
            $targets[] = $this->targets[$rand];
        }

        return [$samples, $targets];
    }

    /**
     * Instantiates the configured number of base classifiers via reflection,
     * forwarding the configured constructor options.
     *
     * @return array
     */
    protected function initClassifiers()
    {
        $classifiers = [];
        $ref = new \ReflectionClass($this->classifier);
        for ($i = 0; $i < $this->numClassifier; $i++) {
            if ($this->classifierOptions) {
                $obj = $ref->newInstanceArgs($this->classifierOptions);
            } else {
                $obj = $ref->newInstance();
            }

            $classifiers[] = $this->initSingleClassifier($obj, $i);
        }

        return $classifiers;
    }

    /**
     * Hook for subclasses (e.g. RandomForest) to customise a freshly
     * created base classifier. The base implementation is a no-op.
     *
     * @param Classifier $classifier
     * @param int $index
     *
     * @return Classifier
     */
    protected function initSingleClassifier($classifier, $index)
    {
        return $classifier;
    }

    /**
     * Predicts by majority vote over the trained base classifiers.
     *
     * @param array $sample
     *
     * @return mixed
     */
    protected function predictSample(array $sample)
    {
        $predictions = [];
        foreach ($this->classifiers as $classifier) {
            /* @var $classifier Classifier */
            $predictions[] = $classifier->predict($sample);
        }

        // Most frequent prediction wins.
        $counts = array_count_values($predictions);
        arsort($counts);
        reset($counts);

        return key($counts);
    }
}
89 changes: 89 additions & 0 deletions src/Phpml/Classification/Ensemble/RandomForest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
<?php

declare(strict_types=1);

namespace Phpml\Classification\Ensemble;

use Phpml\Classification\Ensemble\Bagging;
use Phpml\Classification\DecisionTree;
use Phpml\Classification\NaiveBayes;
use Phpml\Classification\Classifier;

class RandomForest extends Bagging
{
    /**
     * Ratio of features considered at each split: a float in [0.1, 1.0],
     * or one of the strings 'sqrt' / 'log' (derived from the feature count).
     *
     * @var float|string
     */
    protected $featureSubsetRatio = 'log';

    /**
     * @param int $numClassifier Number of trees in the forest.
     */
    public function __construct($numClassifier = 50)
    {
        parent::__construct($numClassifier);

        // Each tree samples from the full training set; diversity comes
        // from bootstrap sampling and per-split feature selection.
        $this->setSubsetRatio(1.0);
    }

    /**
     * This method is used to determine how much of the original columns (features)
     * will be used to construct subsets to train base classifiers.<br>
     *
     * Allowed values: 'sqrt', 'log' or any float number between 0.1 and 1.0 <br>
     *
     * If there are many features that diminishes classification performance, then
     * small values should be preferred, otherwise, with low number of features,
     * the default value ('log': log2(featureCount) + 1 columns per split)
     * will result in satisfactory performance.
     *
     * @param mixed $ratio string or float should be given
     *
     * @return $this
     *
     * @throws \Exception when an out-of-range float or an unknown string is given
     */
    public function setFeatureSubsetRatio($ratio)
    {
        if (is_float($ratio) && ($ratio < 0.1 || $ratio > 1.0)) {
            throw new \Exception("When a float given, feature subset ratio should be between 0.1 and 1.0");
        }

        if (is_string($ratio) && $ratio !== 'sqrt' && $ratio !== 'log') {
            throw new \Exception("When a string given, feature subset ratio can only be 'sqrt' or 'log' ");
        }

        $this->featureSubsetRatio = $ratio;

        return $this;
    }

    /**
     * RandomForest algorithm is usable *only* with DecisionTree
     *
     * @param string $classifier
     * @param array $classifierOptions
     *
     * @return $this
     *
     * @throws \Exception when a classifier other than DecisionTree is given
     */
    public function setClassifer(string $classifier, array $classifierOptions = [])
    {
        if ($classifier !== DecisionTree::class) {
            throw new \Exception("RandomForest can only use DecisionTree as base classifier");
        }

        return parent::setClassifer($classifier, $classifierOptions);
    }

    /**
     * Limits the number of features each tree may inspect at a split,
     * according to the configured feature subset ratio.
     *
     * @param DecisionTree $classifier
     * @param int $index
     *
     * @return DecisionTree
     */
    protected function initSingleClassifier($classifier, $index)
    {
        if (is_float($this->featureSubsetRatio)) {
            $featureCount = (int) ($this->featureSubsetRatio * $this->featureCount);
        } elseif ($this->featureSubsetRatio === 'sqrt') {
            // Bug fix: the original compared $this->featureCount (an int)
            // against the string 'sqrt', so this branch was unreachable.
            $featureCount = (int) sqrt($this->featureCount) + 1;
        } else {
            $featureCount = (int) log($this->featureCount, 2) + 1;
        }

        // Never request more features than exist.
        if ($featureCount >= $this->featureCount) {
            $featureCount = $this->featureCount;
        }

        return $classifier->setNumFeatures($featureCount);
    }
}
Loading

0 comments on commit 1d73503

Please sign in to comment.