Skip to content

Commit

Permalink
Implement LabelEncoder (#369)
Browse files Browse the repository at this point in the history
  • Loading branch information
akondas authored Apr 2, 2019
1 parent d3888ef commit dbbce0e
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
- [Preprocessing] Implement LabelEncoder (#369)

## [0.8.0] - 2019-03-20
### Added
- [Tokenization] Added NGramTokenizer (#350)
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets](
* Preprocessing
* [Normalization](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/normalization/)
* [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/)
* LabelEncoder
* Feature Extraction
* [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/)
* NGramTokenizer
Expand Down
1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ Example scripts are available in a separate repository [php-ai/php-ml-examples](
* Preprocessing
* [Normalization](machine-learning/preprocessing/normalization.md)
* [Imputation missing values](machine-learning/preprocessing/imputation-missing-values.md)
* LabelEncoder
* Feature Extraction
* [Token Count Vectorizer](machine-learning/feature-extraction/token-count-vectorizer.md)
* [Tf-idf Transformer](machine-learning/feature-extraction/tf-idf-transformer.md)
Expand Down
47 changes: 47 additions & 0 deletions src/Preprocessing/LabelEncoder.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
<?php

declare(strict_types=1);

namespace Phpml\Preprocessing;

/**
 * Encodes labels as consecutive integer codes starting at 0.
 *
 * Labels are identified by their string cast, so values that cast to the same
 * string (for example 1 and '1', or null and '') share a single code. Codes
 * are assigned in order of first appearance during fit().
 */
final class LabelEncoder implements Preprocessor
{
    /**
     * Map of label (string cast of the fitted value) => integer code.
     *
     * PHP stores numeric-string keys such as '1' as integer keys, which is why
     * the key type is int|string rather than string.
     *
     * @var array<int|string, int>
     */
    private $classes = [];

    /**
     * Learns the label => code mapping, discarding any previously fitted one.
     *
     * @param array      $samples Labels to learn; each one is cast to string
     * @param array|null $targets Not used by this encoder
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        $this->classes = [];

        foreach ($samples as $sample) {
            $label = (string) $sample;
            if (!isset($this->classes[$label])) {
                // The next free code equals the number of labels seen so far.
                $this->classes[$label] = count($this->classes);
            }
        }
    }

    /**
     * Replaces every label in $samples with its integer code, keeping the keys.
     *
     * The input array is only overwritten once every label has been encoded,
     * so it is left untouched when an exception is thrown.
     *
     * @param array $samples Labels to encode, modified in place
     *
     * @throws \InvalidArgumentException when a label was not seen during fit()
     */
    public function transform(array &$samples): void
    {
        $encoded = [];

        foreach ($samples as $key => $sample) {
            $label = (string) $sample;
            if (!isset($this->classes[$label])) {
                throw new \InvalidArgumentException(
                    sprintf('Label "%s" was not seen during fit', $label)
                );
            }

            $encoded[$key] = $this->classes[$label];
        }

        $samples = $encoded;
    }

    /**
     * Replaces every integer code in $samples with its label, keeping the keys.
     *
     * Labels come back as they are stored in the mapping: non-numeric labels
     * as strings, numeric labels (e.g. '1') as integers because of PHP's array
     * key casting. The input array is left untouched when an exception is
     * thrown.
     *
     * @param array $samples Codes to decode, modified in place
     *
     * @throws \InvalidArgumentException when a code does not map to a fitted label
     */
    public function inverseTransform(array &$samples): void
    {
        $labels = array_flip($this->classes);
        $decoded = [];

        foreach ($samples as $key => $code) {
            // Non-scalar values cannot be used as array offsets at all.
            if (!is_scalar($code) || !isset($labels[$code])) {
                throw new \InvalidArgumentException(sprintf(
                    'Code %s does not correspond to any fitted label',
                    is_scalar($code) ? var_export($code, true) : gettype($code)
                ));
            }

            $decoded[$key] = $labels[$code];
        }

        $samples = $decoded;
    }

    /**
     * Returns the fitted labels ordered by their code.
     *
     * @return string[]
     */
    public function classes(): array
    {
        // array_keys() yields ints for numeric labels; cast back so the result
        // really is a list of strings as documented.
        return array_map('strval', array_keys($this->classes));
    }
}
68 changes: 68 additions & 0 deletions tests/Preprocessing/LabelEncoderTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
<?php

declare(strict_types=1);

namespace Phpml\Tests\Preprocessing;

use Phpml\Preprocessing\LabelEncoder;
use PHPUnit\Framework\TestCase;

/**
 * Covers LabelEncoder fitting, encoding, re-fitting and the decode round trip.
 *
 * Assertions use assertSame so that a wrong type (e.g. null or '0' instead of
 * the integer code 0) fails the test instead of passing a loose comparison.
 */
final class LabelEncoderTest extends TestCase
{
    /**
     * @dataProvider labelEncoderDataProvider
     */
    public function testFitAndTransform(array $samples, array $transformed): void
    {
        $le = new LabelEncoder();
        $le->fit($samples);
        $le->transform($samples);

        self::assertSame($transformed, $samples);
    }

    /**
     * Each dataset is [labels to fit and encode, expected integer codes].
     */
    public static function labelEncoderDataProvider(): array
    {
        return [
            'repeated label' => [['one', 'one', 'two', 'three'], [0, 0, 1, 2]],
            'integer label' => [['one', 1, 'two', 'three'], [0, 1, 2, 3]],
            'null label' => [['one', null, 'two', 'three'], [0, 1, 2, 3]],
            'single class' => [['one', 'one', 'one', 'one'], [0, 0, 0, 0]],
            'mixed types' => [
                ['one', 'one', 'one', 'one', null, null, 1, 1, 2, 'two'],
                [0, 0, 0, 0, 1, 1, 2, 2, 3, 4],
            ],
        ];
    }

    public function testResetClassesAfterNextFit(): void
    {
        $samples = ['Shanghai', 'Beijing', 'Karachi'];

        $le = new LabelEncoder();
        $le->fit($samples);

        self::assertSame(['Shanghai', 'Beijing', 'Karachi'], $le->classes());

        $samples = ['Istanbul', 'Dhaka', 'Tokyo'];

        // A second fit must replace the earlier classes rather than extend them.
        $le->fit($samples);

        self::assertSame(['Istanbul', 'Dhaka', 'Tokyo'], $le->classes());
    }

    public function testFitAndTransformFullCycle(): void
    {
        $samples = ['Shanghai', 'Beijing', 'Karachi', 'Beijing', 'Beijing', 'Karachi'];
        $encoded = [0, 1, 2, 1, 1, 2];

        $le = new LabelEncoder();
        $le->fit($samples);

        self::assertSame(['Shanghai', 'Beijing', 'Karachi'], $le->classes());

        $transformed = $samples;
        $le->transform($transformed);
        self::assertSame($encoded, $transformed);

        // Decoding the codes must give back the original labels.
        $le->inverseTransform($transformed);
        self::assertSame($samples, $transformed);
    }
}

0 comments on commit dbbce0e

Please sign in to comment.