Skip to content

Commit

Permalink
Implement FeatureUnion 🚀 (#382)
Browse files Browse the repository at this point in the history
  • Loading branch information
akondas authored May 14, 2019
1 parent ff118eb commit b500f0b
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 34 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets](
* Regression
* Workflow
* [Pipeline](http://php-ml.readthedocs.io/en/latest/machine-learning/workflow/pipeline)
* FeatureUnion
* Neural Network
* [Multilayer Perceptron Classifier](http://php-ml.readthedocs.io/en/latest/machine-learning/neural-network/multilayer-perceptron-classifier/)
* Cross Validation
Expand All @@ -103,6 +104,9 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets](
* [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization/)
* [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/)
* LabelEncoder
* LambdaTransformer
* NumberConverter
* ColumnFilter
* Feature Extraction
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
* NGramTokenizer
Expand Down
72 changes: 72 additions & 0 deletions src/FeatureUnion.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
<?php

declare(strict_types=1);

namespace Phpml;

use Phpml\Exception\InvalidArgumentException;

/**
 * Runs several pipelines over the same input samples and concatenates, per sample,
 * the features each pipeline produces.
 *
 * Only the transformers of each pipeline are used (via Pipeline::getTransformers()).
 */
final class FeatureUnion implements Transformer
{
    /**
     * @var Pipeline[]
     */
    private $pipelines = [];

    /**
     * @param Pipeline[] $pipelines pipelines whose outputs are merged, in the given order
     *
     * @throws InvalidArgumentException when $pipelines is an empty array
     */
    public function __construct(array $pipelines)
    {
        if ($pipelines === []) {
            throw new InvalidArgumentException('At least one pipeline is required');
        }

        // The typed closure makes PHP reject any element that is not a Pipeline.
        $this->pipelines = array_map(static function (Pipeline $pipeline): Pipeline {
            return $pipeline;
        }, $pipelines);
    }

    /**
     * Fits every transformer of every pipeline; each pipeline starts from the original samples.
     *
     * Inside a pipeline a transformer is fitted on the output of the transformers before it.
     * Both arguments are received by value, so the caller's arrays are left untouched.
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        foreach ($this->pipelines as $pipeline) {
            $pipelineSamples = $samples;
            $this->runPipeline($pipeline, $pipelineSamples, $targets, true);
        }
    }

    /**
     * Replaces $samples with the union of the features produced by the already fitted pipelines.
     */
    public function transform(array &$samples, ?array &$targets = null): void
    {
        $this->transformSamples($samples, $targets);
    }

    /**
     * Fits the pipelines on $samples and replaces $samples with the merged features in one pass.
     */
    public function fitAndTransform(array &$samples, ?array &$targets = null): void
    {
        $this->transformSamples($samples, $targets, true);
    }

    /**
     * Builds the merged feature rows and writes them back into $samples.
     *
     * Every pipeline works on its own copy of the incoming samples; $samples is only
     * overwritten once all pipelines have finished, so an exception thrown by a
     * transformer does not leave the caller's samples half-transformed.
     * $targets is handed by reference to every transformer of every pipeline in turn,
     * so any changes made to it accumulate across pipelines.
     */
    private function transformSamples(array &$samples, ?array &$targets = null, bool $fit = false): void
    {
        $union = [];
        foreach ($this->pipelines as $pipeline) {
            $pipelineSamples = $samples;
            $this->runPipeline($pipeline, $pipelineSamples, $targets, $fit);

            foreach ($pipelineSamples as $index => $sample) {
                // A pipeline may reduce a sample to a single scalar; treat it as a one-element row.
                $union[$index] = array_merge($union[$index] ?? [], is_array($sample) ? $sample : [$sample]);
            }
        }

        $samples = $union;
    }

    /**
     * Passes $samples (and $targets) through the transformers of one pipeline, fitting
     * each transformer right before it is applied when $fit is true.
     */
    private function runPipeline(Pipeline $pipeline, array &$samples, ?array &$targets, bool $fit): void
    {
        foreach ($pipeline->getTransformers() as $transformer) {
            if ($fit) {
                $transformer->fit($samples, $targets);
            }

            $transformer->transform($samples, $targets);
        }
    }
}
2 changes: 1 addition & 1 deletion src/Metric/Regression.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public static function meanSquaredLogarithmicError(array $targets, array $predic

$errors = [];
foreach ($targets as $index => $target) {
$errors[] = (log(1 + $target) - log(1 + $predictions[$index])) ** 2;
$errors[] = log((1 + $target) / (1 + $predictions[$index])) ** 2;
}

return Mean::arithmetic($errors);
Expand Down
51 changes: 29 additions & 22 deletions src/Pipeline.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,37 +4,28 @@

namespace Phpml;

class Pipeline implements Estimator
use Phpml\Exception\InvalidOperationException;

class Pipeline implements Estimator, Transformer
{
/**
* @var Transformer[]
*/
private $transformers = [];

/**
* @var Estimator
* @var Estimator|null
*/
private $estimator;

/**
* @param Transformer[] $transformers
*/
public function __construct(array $transformers, Estimator $estimator)
{
foreach ($transformers as $transformer) {
$this->addTransformer($transformer);
}

$this->estimator = $estimator;
}

public function addTransformer(Transformer $transformer): void
{
$this->transformers[] = $transformer;
}

public function setEstimator(Estimator $estimator): void
public function __construct(array $transformers, ?Estimator $estimator = null)
{
    // Routing each element through a typed closure makes PHP raise a TypeError
    // for anything that is not a Transformer, while keeping the array keys.
    $requireTransformer = static function (Transformer $candidate): Transformer {
        return $candidate;
    };

    $this->transformers = array_map($requireTransformer, $transformers);
    $this->estimator = $estimator;
}

Expand All @@ -46,16 +37,20 @@ public function getTransformers(): array
return $this->transformers;
}

public function getEstimator(): Estimator
public function getEstimator(): ?Estimator
{
return $this->estimator;
}

public function train(array $samples, array $targets): void
{
if ($this->estimator === null) {
throw new InvalidOperationException('Pipeline without estimator can\'t use train method');
}

foreach ($this->transformers as $transformer) {
$transformer->fit($samples, $targets);
$transformer->transform($samples);
$transformer->transform($samples, $targets);
}

$this->estimator->train($samples, $targets);
Expand All @@ -66,15 +61,27 @@ public function train(array $samples, array $targets): void
*/
public function predict(array $samples)
{
$this->transformSamples($samples);
if ($this->estimator === null) {
throw new InvalidOperationException('Pipeline without estimator can\'t use predict method');
}

$this->transform($samples);

return $this->estimator->predict($samples);
}

private function transformSamples(array &$samples): void
/**
 * Fits the transformers one after another.
 *
 * Each transformer is applied to the samples right after it has been fitted, so the
 * next one is fitted on the output of all previous ones. Both arguments are received
 * by value; the caller's arrays are not modified.
 */
public function fit(array $samples, ?array $targets = null): void
{
    foreach ($this->transformers as $stage) {
        $stage->fit($samples, $targets);
        $stage->transform($samples, $targets);
    }
}

public function transform(array &$samples, ?array &$targets = null): void
{
foreach ($this->transformers as $transformer) {
$transformer->transform($samples);
$transformer->transform($samples, $targets);
}
}
}
105 changes: 105 additions & 0 deletions tests/FeatureUnionTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
<?php

declare(strict_types=1);

namespace Phpml\Tests;

use Phpml\Exception\InvalidArgumentException;
use Phpml\FeatureUnion;
use Phpml\Pipeline;
use Phpml\Preprocessing\ColumnFilter;
use Phpml\Preprocessing\Imputer;
use Phpml\Preprocessing\Imputer\Strategy\MeanStrategy;
use Phpml\Preprocessing\LabelEncoder;
use Phpml\Preprocessing\LambdaTransformer;
use Phpml\Preprocessing\NumberConverter;
use PHPUnit\Framework\TestCase;

final class FeatureUnionTest extends TestCase
{
    /**
     * Column names of the raw samples used by the fixtures below.
     */
    private const COLUMNS = ['age', 'income', 'sex'];

    /**
     * Raw training rows; the last one has a non-numeric income.
     */
    private const TRAIN_SAMPLES = [
        ['23', '100000', 'male'],
        ['23', '200000', 'female'],
        ['43', '150000', 'female'],
        ['33', 'n/a', 'male'],
    ];

    public function testFitAndTransform(): void
    {
        $samples = self::TRAIN_SAMPLES;
        $targets = ['1', '2', '1', '3'];

        $this->buildUnion(new NumberConverter(true))->fitAndTransform($samples, $targets);

        $expectedSamples = [
            [0, 23.0, 100000.0],
            [1, 23.0, 200000.0],
            [1, 43.0, 150000.0],
            [0, 33.0, 150000.0],
        ];

        self::assertEquals($expectedSamples, $samples);
        self::assertEquals([1, 2, 1, 3], $targets);
    }

    public function testFitAndTransformSeparate(): void
    {
        $trainSamples = self::TRAIN_SAMPLES;
        $testSamples = [
            ['43', '500000', 'female'],
            ['13', 'n/a', 'male'],
            ['53', 'n/a', 'male'],
            ['43', 'n/a', 'female'],
        ];

        $union = $this->buildUnion(new NumberConverter());
        $union->fit($trainSamples);
        $union->transform($testSamples);

        $expectedSamples = [
            [1, 43.0, 500000.0],
            [0, 13.0, 150000.0],
            [0, 53.0, 150000.0],
            [1, 43.0, 150000.0],
        ];

        self::assertEquals($expectedSamples, $testSamples);
    }

    public function testNotAllowForEmptyPipelines(): void
    {
        $this->expectException(InvalidArgumentException::class);

        new FeatureUnion([]);
    }

    /**
     * Creates the two-pipeline union shared by the tests: the first pipeline label-encodes
     * the "sex" column, the second converts and imputes the "age" and "income" columns.
     *
     * @param NumberConverter $numberConverter converter placed in the numeric pipeline
     */
    private function buildUnion(NumberConverter $numberConverter): FeatureUnion
    {
        $sexPipeline = new Pipeline([
            new ColumnFilter(self::COLUMNS, ['sex']),
            // Unwrap the single remaining column so the encoder receives scalars.
            new LambdaTransformer(static function (array $sample) {
                return $sample[0];
            }),
            new LabelEncoder(),
        ]);

        $numericPipeline = new Pipeline([
            new ColumnFilter(self::COLUMNS, ['age', 'income']),
            $numberConverter,
            new Imputer(null, new MeanStrategy(), Imputer::AXIS_COLUMN),
        ]);

        return new FeatureUnion([$sexPipeline, $numericPipeline]);
    }
}
35 changes: 24 additions & 11 deletions tests/PipelineTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
use Phpml\ModelManager;
use Phpml\Pipeline;
use Phpml\Preprocessing\Imputer;
use Phpml\Preprocessing\Imputer\Strategy\MeanStrategy;
use Phpml\Preprocessing\Imputer\Strategy\MostFrequentStrategy;
use Phpml\Preprocessing\Normalizer;
use Phpml\Regression\SVR;
use Phpml\Tokenization\WordTokenizer;
use PHPUnit\Framework\TestCase;

Expand All @@ -32,16 +32,6 @@ public function testPipelineConstruction(): void
self::assertEquals($estimator, $pipeline->getEstimator());
}

public function testPipelineEstimatorSetter(): void
{
$pipeline = new Pipeline([new TfIdfTransformer()], new SVC());

$estimator = new SVR();
$pipeline->setEstimator($estimator);

self::assertEquals($estimator, $pipeline->getEstimator());
}

public function testPipelineWorkflow(): void
{
$transformers = [
Expand Down Expand Up @@ -119,6 +109,29 @@ public function testPipelineTransformersWithTargets(): void
self::assertEquals(['b'], $pipeline->predict([[1, 3, 5]]));
}

public function testPipelineAsTransformer(): void
{
    $trainSamples = [
        [10, 20, 30],
        [20, 30, 40],
        [30, 40, 50],
    ];
    $testSamples = [
        [null, null, null],
    ];

    // No estimator is passed: the pipeline is used purely as a transformer.
    $pipeline = new Pipeline([new Imputer(null, new MeanStrategy())]);
    $pipeline->fit($trainSamples);
    $pipeline->transform($testSamples);

    // The expected values equal the per-column means of the training samples.
    self::assertEquals([[20.0, 30.0, 40.0]], $testSamples);
}

public function testSaveAndRestore(): void
{
$pipeline = new Pipeline([
Expand Down

0 comments on commit b500f0b

Please sign in to comment.