forked from jorgecasas/php-ml
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement VarianceThreshold - simple baseline approach to feature sel…
…ection. (#228) * Add sum of squares deviations * Calculate population variance * Add VarianceThreshold - feature selection transformer * Add docs about VarianceThreshold * Add missing code for pipeline usage
- Loading branch information
Showing
10 changed files
with
279 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
60 changes: 60 additions & 0 deletions
60
docs/machine-learning/feature-selection/variance-threshold.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# Variance Threshold | ||
|
||
`VarianceThreshold` is a simple baseline approach to feature selection. | ||
It removes all features whose variance doesn’t meet some threshold. | ||
By default, it removes all zero-variance features, i.e. features that have the same value in all samples. | ||
|
||
## Constructor Parameters | ||
|
||
* $threshold (float) - features with a variance lower than or equal to this threshold will be removed (default 0.0) | ||
|
||
```php | ||
use Phpml\FeatureSelection\VarianceThreshold; | ||
|
||
$transformer = new VarianceThreshold(0.15); | ||
``` | ||
|
||
## Example of use | ||
|
||
As an example, suppose that we have a dataset with boolean features and | ||
we want to remove all features that are either one or zero (on or off) | ||
in more than 80% of the samples. | ||
Boolean features are Bernoulli random variables, and the variance of such | ||
variables is given by | ||
``` | ||
Var[X] = p(1 - p) | ||
``` | ||
so we can select using the threshold .8 * (1 - .8): | ||
|
||
```php | ||
use Phpml\FeatureSelection\VarianceThreshold; | ||
|
||
$samples = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]; | ||
$transformer = new VarianceThreshold(0.8 * (1 - 0.8)); | ||
|
||
$transformer->fit($samples); | ||
$transformer->transform($samples); | ||
|
||
/* | ||
$samples = [[0, 1], [1, 0], [0, 0], [1, 1], [1, 0], [1, 1]]; | ||
*/ | ||
``` | ||
|
||
## Pipeline | ||
|
||
`VarianceThreshold` implements the `Transformer` interface, so it can be used as part of a pipeline: | ||
|
||
```php | ||
use Phpml\FeatureSelection\VarianceThreshold; | ||
use Phpml\Classification\SVC; | ||
use Phpml\FeatureExtraction\TfIdfTransformer; | ||
use Phpml\Pipeline; | ||
|
||
$transformers = [ | ||
new TfIdfTransformer(), | ||
new VarianceThreshold(0.1) | ||
]; | ||
$estimator = new SVC(); | ||
|
||
$pipeline = new Pipeline($transformers, $estimator); | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Phpml\FeatureSelection; | ||
|
||
use Phpml\Exception\InvalidArgumentException; | ||
use Phpml\Math\Matrix; | ||
use Phpml\Math\Statistic\Variance; | ||
use Phpml\Transformer; | ||
|
||
/**
 * Feature selector that drops every column whose population variance
 * is not strictly greater than a configured threshold.
 */
final class VarianceThreshold implements Transformer
{
    /**
     * Exclusive lower bound: a column is kept only when its variance exceeds it.
     *
     * @var float
     */
    private $threshold;

    /**
     * Population variance per column, as computed by the most recent fit().
     *
     * @var array
     */
    private $variances = [];

    /**
     * Set of columns to keep (column index => true), rebuilt on every fit().
     *
     * @var array
     */
    private $keepColumns = [];

    /**
     * @param float $threshold columns whose variance is lower than or equal to this value are removed
     *
     * @throws InvalidArgumentException when the threshold is negative
     */
    public function __construct(float $threshold = 0.0)
    {
        if ($threshold < 0) {
            throw new InvalidArgumentException('Threshold can\'t be lower than zero');
        }

        $this->threshold = $threshold;
    }

    /**
     * Computes the population variance of every column and records which
     * columns exceed the threshold.
     *
     * @param array $samples list of rows; presumably Matrix::transposeArray() turns them into columns
     */
    public function fit(array $samples): void
    {
        $this->variances = array_map(static function (array $column) {
            return Variance::population($column);
        }, Matrix::transposeArray($samples));

        // Start from an empty selection so that refitting the same instance
        // does not retain columns selected by an earlier fit().
        $this->keepColumns = [];

        foreach ($this->variances as $column => $variance) {
            if ($variance > $this->threshold) {
                $this->keepColumns[$column] = true;
            }
        }
    }

    /**
     * Removes, in place, every column that was not selected by fit() and
     * reindexes the remaining values of each row from zero.
     *
     * If fit() has not been called, no column is selected and every row becomes empty.
     *
     * @param array $samples list of rows, modified by reference
     */
    public function transform(array &$samples): void
    {
        foreach ($samples as &$sample) {
            $sample = array_values(array_intersect_key($sample, $this->keepColumns));
        }

        // Break the reference so later writes to $sample cannot alias the last row.
        unset($sample);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Phpml\Math\Statistic; | ||
|
||
/** | ||
* In probability theory and statistics, variance is the expectation of the squared deviation of a random variable from its mean. | ||
* Informally, it measures how far a set of (random) numbers are spread out from their average value | ||
* https://en.wikipedia.org/wiki/Variance | ||
*/ | ||
final class Variance
{
    /**
     * Population variance: the sum of squared deviations divided by the
     * number of observations.
     *
     * Appropriate when the input holds every observation of the system;
     * applied to a mere sample of the data it yields a biased estimate.
     *
     *      ∑⟮xᵢ - μ⟯²
     * σ² = ----------
     *          N
     *
     * @param array $population the observed values
     */
    public static function population(array $population): float
    {
        $squaredDeviations = StandardDeviation::sumOfSquares($population);
        $size = count($population);

        return $squaredDeviations / $size;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Phpml\Tests\FeatureSelection; | ||
|
||
use Phpml\Exception\InvalidArgumentException; | ||
use Phpml\FeatureSelection\VarianceThreshold; | ||
use PHPUnit\Framework\TestCase; | ||
|
||
final class VarianceThresholdTest extends TestCase
{
    public function testVarianceThreshold(): void
    {
        // Boolean features are Bernoulli random variables with Var[X] = p(1 - p);
        // this threshold targets features that are constant in more than 80% of samples.
        $selector = new VarianceThreshold(0.8 * (1 - 0.8));

        $samples = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]];
        $selector->fit($samples);
        $selector->transform($samples);

        // The first column is the only one expected to be dropped.
        $withoutFirstColumn = [[0, 1], [1, 0], [0, 0], [1, 1], [1, 0], [1, 1]];
        self::assertEquals($withoutFirstColumn, $samples);
    }

    public function testVarianceThresholdWithZeroThreshold(): void
    {
        $selector = new VarianceThreshold();

        $samples = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]];
        $selector->fit($samples);
        $selector->transform($samples);

        // The constant first and last columns are removed by the default threshold.
        $onlyVaryingColumns = [[2, 0], [1, 4], [1, 1]];
        self::assertEquals($onlyVaryingColumns, $samples);
    }

    public function testThrowExceptionWhenThresholdBelowZero(): void
    {
        $this->expectException(InvalidArgumentException::class);

        new VarianceThreshold(-0.1);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Phpml\Tests\Math\Statistic; | ||
|
||
use Phpml\Math\Statistic\Variance; | ||
use PHPUnit\Framework\TestCase; | ||
|
||
final class VarianceTest extends TestCase
{
    /**
     * Checks the population variance of each data set to within 0.001.
     *
     * @dataProvider dataProviderForPopulationVariance
     */
    public function testVarianceFromInt(array $numbers, float $variance): void
    {
        $actual = Variance::population($numbers);

        self::assertEquals($variance, $actual, '', 0.001);
    }

    /**
     * Data sets as [numbers, expected population variance] pairs.
     */
    public function dataProviderForPopulationVariance()
    {
        $cases = [];

        $cases[] = [[0, 0, 0, 0, 0, 1], 0.138];
        $cases[] = [[-11, 0, 10, 20, 30], 208.16];
        $cases[] = [[7, 8, 9, 10, 11, 12, 13], 4.0];
        $cases[] = [[300, 570, 170, 730, 300], 41944];
        $cases[] = [[-4, 2, 7, 8, 3], 18.16];
        $cases[] = [[3, 7, 34, 25, 46, 7754, 3, 6], 6546331.937];
        $cases[] = [[4, 6, 1, 1, 1, 1, 2, 2, 1, 3], 2.56];
        $cases[] = [[-3732, 5, 27, 9248, -174], 18741676.56];
        $cases[] = [[-554, -555, -554, -554, -555, -555, -556], 0.4897];

        return $cases;
    }
}