forked from jorgecasas/php-ml
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Ensemble Classifiers : Bagging and RandomForest (#36)
* Fuzzy C-Means implementation * Update FuzzyCMeans * Rename FuzzyCMeans to FuzzyCMeans.php * Update NaiveBayes.php * Small fix applied to improve training performance array_unique is replaced with array_count_values+array_keys which is way faster * Revert "Small fix applied to improve training performance" This reverts commit c20253f. * Revert "Revert "Small fix applied to improve training performance"" This reverts commit ea10e13. * Revert "Small fix applied to improve training performance" This reverts commit c20253f. * First DecisionTree implementation * Revert "First DecisionTree implementation" This reverts commit 4057a08. * DecisionTree * FCM Test * FCM Test * DecisionTree Test * Ensemble classifiers: Bagging and RandomForests * test * Fixes for conflicted files * Bagging and RandomForest ensemble algorithms * Changed unit test * Changed unit test * Changed unit test * Bagging and RandomForest ensemble algorithms * Baggging and RandomForest ensemble algorithms * Bagging and RandomForest ensemble algorithms RandomForest algorithm is improved with changes to original DecisionTree * Bagging and RandomForest ensemble algorithms * Slight fix about use of global Exception class * Fixed the error about wrong use of global Exception class * RandomForest code formatting
- Loading branch information
1 parent
72b25ff
commit 1d73503
Showing
6 changed files
with
507 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Phpml\Classification\Ensemble; | ||
|
||
use Phpml\Helper\Predictable; | ||
use Phpml\Helper\Trainable; | ||
use Phpml\Math\Statistic\Mean; | ||
use Phpml\Classification\Classifier; | ||
use Phpml\Classification\DecisionTree; | ||
use Phpml\Classification\NaiveBayes; | ||
|
||
class Bagging implements Classifier
{
    use Trainable, Predictable;

    /**
     * Total number of training samples accumulated across train() calls.
     *
     * @var int
     */
    protected $numSamples;

    /**
     * Accumulated training labels, index-aligned with $samples.
     *
     * @var array
     */
    private $targets = [];

    /**
     * Number of features (columns) in the first sample of the latest batch.
     *
     * @var int
     */
    protected $featureCount = 0;

    /**
     * Number of base classifiers in the ensemble.
     *
     * @var int
     */
    protected $numClassifier;

    /**
     * Fully-qualified class name of the base classifier.
     *
     * @var string
     */
    protected $classifier = DecisionTree::class;

    /**
     * Constructor arguments for the base classifier. Only the values are used,
     * in order; keys are ignored (see initClassifiers()).
     *
     * @var array
     */
    protected $classifierOptions = ['depth' => 20];

    /**
     * Trained base classifier instances.
     *
     * @var array
     */
    protected $classifiers;

    /**
     * Size of each bootstrap subset as a fraction of the training set.
     *
     * @var float
     */
    protected $subsetRatio = 0.5;

    /**
     * Accumulated training samples.
     *
     * @var array
     */
    private $samples = [];

    /**
     * Creates an ensemble classifier with the given number of base classifiers.
     * The default number of base classifiers is 50.
     * More base classifiers generally give better performance, at the cost of
     * processing time.
     *
     * @param int $numClassifier
     */
    public function __construct($numClassifier = 50)
    {
        $this->numClassifier = $numClassifier;
    }

    /**
     * Sets the ratio of samples used to create each 'bootstrap' subset,
     * i.e. random samples drawn from the original dataset with replacement
     * (repeats allowed), on which each base classifier is trained.
     *
     * @param float $ratio value between 0.1 and 1.0 (inclusive)
     * @return $this
     * @throws \Exception when the ratio is outside [0.1, 1.0]
     */
    public function setSubsetRatio(float $ratio)
    {
        if ($ratio < 0.1 || $ratio > 1.0) {
            throw new \Exception("Subset ratio should be between 0.1 and 1.0");
        }

        $this->subsetRatio = $ratio;

        return $this;
    }

    /**
     * Sets the base classifier. The default is DecisionTree::class, but any
     * class implementing Classifier can be used.
     *
     * Options are passed to the classifier's constructor positionally: give the
     * values in the order of the constructor parameters. Array keys are ignored.
     *
     * @param string $classifier        fully-qualified class name
     * @param array  $classifierOptions constructor arguments, in order
     * @return $this
     */
    public function setClassifer(string $classifier, array $classifierOptions = [])
    {
        $this->classifier = $classifier;
        $this->classifierOptions = $classifierOptions;

        return $this;
    }

    /**
     * Adds the given batch to the accumulated training data, then rebuilds
     * every base classifier and trains each one on its own bootstrap subset.
     *
     * @param array $samples
     * @param array $targets
     * @throws \Exception when $samples is empty
     */
    public function train(array $samples, array $targets)
    {
        $firstSample = reset($samples);
        if ($firstSample === false) {
            throw new \Exception("Cannot train the ensemble with an empty sample set");
        }

        $this->samples = array_merge($this->samples, $samples);
        $this->targets = array_merge($this->targets, $targets);
        $this->featureCount = count($firstSample);
        $this->numSamples = count($this->samples);

        // Init classifiers and train them with random sub-samples
        $this->classifiers = $this->initClassifiers();
        foreach ($this->classifiers as $index => $classifier) {
            list($subsetSamples, $subsetTargets) = $this->getRandomSubset($index);
            $classifier->train($subsetSamples, $subsetTargets);
        }
    }

    /**
     * Draws a bootstrap subset: ceil(subsetRatio * numSamples) rows picked
     * uniformly at random, with replacement, from the whole training set.
     *
     * @param int $index position of the classifier the subset is drawn for;
     *                   not used by this implementation, kept so that the
     *                   signature stays stable for overriding subclasses
     * @return array two-element array: [samples, targets]
     */
    protected function getRandomSubset($index)
    {
        $subsetSize = (int) ceil($this->subsetRatio * $this->numSamples);
        $lastIndex = $this->numSamples - 1;

        $samples = [];
        $targets = [];
        for ($i = 0; $i < $subsetSize; ++$i) {
            // Every draw covers the full index range, so each classifier can
            // see any part of the dataset.
            $pick = mt_rand(0, $lastIndex);
            $samples[] = $this->samples[$pick];
            $targets[] = $this->targets[$pick];
        }

        return [$samples, $targets];
    }

    /**
     * Instantiates $numClassifier base classifiers through reflection and hands
     * each one to initSingleClassifier() for per-instance configuration.
     *
     * @return array
     */
    protected function initClassifiers()
    {
        $reflection = new \ReflectionClass($this->classifier);
        // Drop the keys: PHP 8 interprets string keys passed to
        // newInstanceArgs() as named arguments, while the options are
        // documented as purely positional.
        $arguments = array_values($this->classifierOptions);

        $classifiers = [];
        for ($i = 0; $i < $this->numClassifier; ++$i) {
            if ($arguments) {
                $instance = $reflection->newInstanceArgs($arguments);
            } else {
                $instance = $reflection->newInstance();
            }
            $classifiers[] = $this->initSingleClassifier($instance, $i);
        }

        return $classifiers;
    }

    /**
     * Hook for subclasses to configure a freshly created base classifier.
     * This implementation returns the classifier unchanged.
     *
     * @param Classifier $classifier
     * @param int $index
     * @return Classifier
     */
    protected function initSingleClassifier($classifier, $index)
    {
        return $classifier;
    }

    /**
     * Predicts by majority vote: every base classifier predicts the sample and
     * the most frequent label is returned.
     *
     * @param array $sample
     * @return mixed
     */
    protected function predictSample(array $sample)
    {
        $predictions = [];
        foreach ($this->classifiers as $classifier) {
            /* @var $classifier Classifier */
            $predictions[] = $classifier->predict($sample);
        }

        // Tally the votes and sort descending so the winner is the first key.
        $counts = array_count_values($predictions);
        arsort($counts);
        reset($counts);

        return key($counts);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Phpml\Classification\Ensemble; | ||
|
||
use Phpml\Classification\Ensemble\Bagging; | ||
use Phpml\Classification\DecisionTree; | ||
use Phpml\Classification\NaiveBayes; | ||
use Phpml\Classification\Classifier; | ||
|
||
class RandomForest extends Bagging
{
    /**
     * Strategy for choosing how many features each tree considers:
     * 'sqrt', 'log', or a float ratio between 0.1 and 1.0.
     *
     * @var float|string
     */
    protected $featureSubsetRatio = 'log';

    /**
     * Creates a forest with the given number of trees. The bootstrap subset
     * ratio is set to 1.0.
     *
     * @param int $numClassifier
     */
    public function __construct($numClassifier = 50)
    {
        parent::__construct($numClassifier);

        $this->setSubsetRatio(1.0);
    }

    /**
     * Determines how many of the original columns (features) are used to
     * construct the subsets on which the base classifiers are trained.
     *
     * Allowed values: 'sqrt', 'log' or any number between 0.1 and 1.0.
     * Integers are accepted and treated as floats.
     *
     * If there are many features that diminish classification performance,
     * small values should be preferred. The default strategy is 'log'.
     *
     * @param mixed $ratio string or float should be given
     * @return $this
     * @throws \Exception when the value is not one of the allowed values
     */
    public function setFeatureSubsetRatio($ratio)
    {
        if (is_int($ratio)) {
            // Without this cast an integer would bypass the range check and
            // later be treated as the 'log' strategy.
            $ratio = (float) $ratio;
        }

        if (!is_float($ratio) && !is_string($ratio)) {
            throw new \Exception("Feature subset ratio must be a string or a float");
        }
        if (is_float($ratio) && ($ratio < 0.1 || $ratio > 1.0)) {
            throw new \Exception("When a float given, feature subset ratio should be between 0.1 and 1.0");
        }
        if (is_string($ratio) && $ratio !== 'sqrt' && $ratio !== 'log') {
            throw new \Exception("When a string given, feature subset ratio can only be 'sqrt' or 'log' ");
        }

        $this->featureSubsetRatio = $ratio;

        return $this;
    }

    /**
     * RandomForest algorithm is usable *only* with DecisionTree
     *
     * @param string $classifier
     * @param array $classifierOptions
     * @return $this
     * @throws \Exception when $classifier is not DecisionTree::class
     */
    public function setClassifer(string $classifier, array $classifierOptions = [])
    {
        if ($classifier !== DecisionTree::class) {
            throw new \Exception("RandomForest can only use DecisionTree as base classifier");
        }

        return parent::setClassifer($classifier, $classifierOptions);
    }

    /**
     * Configures a tree with the number of features it may use, derived from
     * the feature subset strategy and capped at the total feature count.
     *
     * @param DecisionTree $classifier
     * @param int $index
     * @return DecisionTree
     */
    protected function initSingleClassifier($classifier, $index)
    {
        if (is_float($this->featureSubsetRatio)) {
            // NOTE(review): truncation can yield 0 for small feature counts;
            // confirm how DecisionTree::setNumFeatures() treats 0.
            $featureCount = (int) ($this->featureSubsetRatio * $this->featureCount);
        } elseif ($this->featureSubsetRatio === 'sqrt') {
            $featureCount = (int) sqrt($this->featureCount) + 1;
        } else {
            $featureCount = (int) log($this->featureCount, 2) + 1;
        }

        if ($featureCount >= $this->featureCount) {
            $featureCount = $this->featureCount;
        }

        return $classifier->setNumFeatures($featureCount);
    }
}
Oops, something went wrong.