
Commit

Fix FilesDataset arrays and TokenCountVectorizer numeric token (#363)
akondas authored Mar 20, 2019
1 parent 02dab41 commit 5e02b89
Showing 5 changed files with 23 additions and 5 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.8.0] - 2019-03-20
+### Added
+- [Tokenization] Added NGramTokenizer (#350)
+- editorconfig file (#355)
+### Fixed
+- [Dataset] FilesDataset read samples without additional array (#363)
+- [Tokenization] fixed error with numeric token values (#363)
+### Changed
+- [Math] improved performance with pow and sqrt replacement (#350)
+- [Math] reduce duplicated code in distance metrics (#348)
+- update phpunit to 7.5.1 (#335)
+- code style fixes (#334)
+
## [0.7.0] - 2018-11-07
### Added
- [Clustering] added KMeans associative clustering (#262)
2 changes: 1 addition & 1 deletion src/Dataset/FilesDataset.php
@@ -29,7 +29,7 @@ private function scanDir(string $dir): void
$target = basename($dir);

foreach (array_filter(glob($dir.DIRECTORY_SEPARATOR.'*'), 'is_file') as $file) {
-$this->samples[] = [file_get_contents($file)];
+$this->samples[] = file_get_contents($file);
$this->targets[] = $target;
}
}
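Note: the effect of this one-line change is easiest to see in plain PHP. The following is a minimal sketch, not library code; the file names are hypothetical.

```php
<?php
// Sketch of the sample shape before and after the fix (illustrative only).
$files = ['business/001.txt', 'tech/010.txt']; // hypothetical paths

$before = [];
$after = [];
foreach ($files as $file) {
    $text = 'contents of '.$file; // stand-in for file_get_contents($file)
    $before[] = [$text];          // old behaviour: each sample wrapped in an extra array
    $after[] = $text;             // new behaviour: each sample is the raw string
}

var_dump($before[0]); // a one-element array containing the text
var_dump($after[0]);  // the text itself
```
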
2 changes: 1 addition & 1 deletion src/FeatureExtraction/TokenCountVectorizer.php
@@ -157,7 +157,7 @@ private function getBeyondMinimumIndexes(int $samplesCount): array
$indexes = [];
foreach ($this->frequencies as $token => $frequency) {
if (($frequency / $samplesCount) < $this->minDF) {
-$indexes[] = $this->getTokenIndex($token);
+$indexes[] = $this->getTokenIndex((string) $token);
}
}

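The cast is needed because of how PHP handles array keys rather than anything specific to the vectorizer: a key that looks like a decimal integer (such as '1550') is silently converted to int, so the token variable can come back as an int and fail a string type hint under strict types. A minimal sketch of that behaviour follows; the closure stands in for the string-typed getTokenIndex() shown above.

```php
<?php
declare(strict_types=1);

// Integer-like string keys are converted to int when used as array keys.
$frequencies = ['lorem' => 2, '1550' => 1];

foreach ($frequencies as $token => $frequency) {
    var_dump($token); // string(5) "lorem", then int(1550): the key type changed
}

// A string-typed parameter (stand-in for getTokenIndex(string $token)) rejects
// int(1550) under strict_types, hence the explicit cast in the fix above.
$getTokenIndex = function (string $token): string {
    return $token;
};

foreach (array_keys($frequencies) as $token) {
    echo $getTokenIndex((string) $token), PHP_EOL; // safe for both keys
}
```
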
4 changes: 2 additions & 2 deletions tests/Dataset/FilesDatasetTest.php
@@ -29,13 +29,13 @@ public function testLoadFilesDatasetWithBBCData(): void
self::assertEquals($targets, array_values(array_unique($dataset->getTargets())));

$firstSample = file_get_contents($rootPath.'/business/001.txt');
-self::assertEquals($firstSample, $dataset->getSamples()[0][0]);
+self::assertEquals($firstSample, $dataset->getSamples()[0]);

$firstTarget = 'business';
self::assertEquals($firstTarget, $dataset->getTargets()[0]);

$lastSample = file_get_contents($rootPath.'/tech/010.txt');
-self::assertEquals($lastSample, $dataset->getSamples()[49][0]);
+self::assertEquals($lastSample, $dataset->getSamples()[49]);

$lastTarget = 'tech';
self::assertEquals($lastTarget, $dataset->getTargets()[49]);
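With the extra nesting gone, a dataset read from disk behaves like a plain list of documents. A usage sketch under assumptions: the directory layout (one sub-directory per label) and the constructor argument follow the project's documentation, while getSamples() and getTargets() are the accessors exercised in the test above.

```php
<?php
use Phpml\Dataset\FilesDataset;

// Assumed layout: one sub-directory per category, one text file per sample,
// e.g. data/bbc/business/001.txt, data/bbc/tech/010.txt, ...
$dataset = new FilesDataset('data/bbc');

$samples = $dataset->getSamples(); // plain strings now, no extra [0] index
$targets = $dataset->getTargets(); // directory names used as labels

echo $targets[0], PHP_EOL; // e.g. "business"
echo $samples[0], PHP_EOL; // contents of the first file
```
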
7 changes: 6 additions & 1 deletion tests/FeatureExtraction/TokenCountVectorizerTest.php
@@ -84,7 +84,7 @@ public function testTransformationWithMinimumDocumentTokenCountFrequency(): void
{
// word at least in half samples
$samples = [
-'Lorem ipsum dolor sit amet',
+'Lorem ipsum dolor sit amet 1550',
'Lorem ipsum sit amet',
'ipsum sit amet',
'ipsum sit amet',
@@ -96,6 +96,7 @@ public function testTransformationWithMinimumDocumentTokenCountFrequency(): void
2 => 'dolor',
3 => 'sit',
4 => 'amet',
+5 => 1550,
];

$tokensCounts = [
@@ -105,27 +105,31 @@ public function testTransformationWithMinimumDocumentTokenCountFrequency(): void
2 => 0,
3 => 1,
4 => 1,
+5 => 0,
],
[
0 => 1,
1 => 1,
2 => 0,
3 => 1,
4 => 1,
+5 => 0,
],
[
0 => 0,
1 => 1,
2 => 0,
3 => 1,
4 => 1,
+5 => 0,
],
[
0 => 0,
1 => 1,
2 => 0,
3 => 1,
4 => 1,
+5 => 0,
],
];

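The extra fixture token exercises the numeric-token path end to end: before the cast, a purely numeric token such as "1550" could surface as an integer key and break the minimum-document-frequency filtering. A hedged usage sketch follows; the constructor signature (tokenizer, optional stop words, minimum document frequency) is taken from the project's documentation and should be treated as an assumption.

```php
<?php
use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Tokenization\WhitespaceTokenizer;

$samples = [
    'Lorem ipsum dolor sit amet 1550',
    'Lorem ipsum sit amet',
];

// minDF = 0.5: drop tokens that appear in fewer than half of the samples,
// which is the code path where the numeric-token error used to occur.
$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 0.5);
$vectorizer->fit($samples);
$vectorizer->transform($samples); // samples are turned into count vectors

var_dump($vectorizer->getVocabulary());
```
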
