
Commit

Fix FilesDataset arrays and TokenCountVectorizer numeric token (#363)
akondas authored Mar 20, 2019
1 parent 02dab41 commit 5e02b89
Showing 5 changed files with 23 additions and 5 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.8.0] - 2019-03-20
+### Added
+- [Tokenization] Added NGramTokenizer (#350)
+- editorconfig file (#355)
+### Fixed
+- [Dataset] FilesDataset read samples without additional array (#363)
+- [Tokenization] fixed error with numeric token values (#363)
+### Changed
+- [Math] improved performance with pow and sqrt replacement (#350)
+- [Math] reduce duplicated code in distance metrics (#348)
+- update phpunit to 7.5.1 (#335)
+- code style fixes (#334)
+
## [0.7.0] - 2018-11-07
### Added
- [Clustering] added KMeans associative clustering (#262)
2 changes: 1 addition & 1 deletion src/Dataset/FilesDataset.php
@@ -29,7 +29,7 @@ private function scanDir(string $dir): void
$target = basename($dir);

foreach (array_filter(glob($dir.DIRECTORY_SEPARATOR.'*'), 'is_file') as $file) {
-$this->samples[] = [file_get_contents($file)];
+$this->samples[] = file_get_contents($file);
$this->targets[] = $target;
}
}
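Note: the effect of this one-line change is easiest to see in plain PHP. The following is a minimal sketch, not library code; the file names are hypothetical.

```php
<?php
// Sketch of the sample shape before and after the fix (illustrative only).
$files = ['business/001.txt', 'tech/010.txt']; // hypothetical paths

$before = [];
$after = [];
foreach ($files as $file) {
    $text = 'contents of '.$file; // stand-in for file_get_contents($file)
    $before[] = [$text];          // old behaviour: each sample wrapped in an extra array
    $after[] = $text;             // new behaviour: each sample is the raw string
}

var_dump($before[0]); // a one-element array containing the text
var_dump($after[0]);  // the text itself
```
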
2 changes: 1 addition & 1 deletion src/FeatureExtraction/TokenCountVectorizer.php
@@ -157,7 +157,7 @@ private function getBeyondMinimumIndexes(int $samplesCount): array
$indexes = [];
foreach ($this->frequencies as $token => $frequency) {
if (($frequency / $samplesCount) < $this->minDF) {
-$indexes[] = $this->getTokenIndex($token);
+$indexes[] = $this->getTokenIndex((string) $token);
}
}

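The cast is needed because of how PHP handles array keys rather than anything specific to the vectorizer: a key that looks like a decimal integer (such as '1550') is silently converted to int, so the token variable can come back as an int and fail a string type hint under strict types. A minimal sketch of that behaviour follows; the closure stands in for the string-typed getTokenIndex() shown above.

```php
<?php
declare(strict_types=1);

// Integer-like string keys are converted to int when used as array keys.
$frequencies = ['lorem' => 2, '1550' => 1];

foreach ($frequencies as $token => $frequency) {
    var_dump($token); // string(5) "lorem", then int(1550): the key type changed
}

// A string-typed parameter (stand-in for getTokenIndex(string $token)) rejects
// int(1550) under strict_types, hence the explicit cast in the fix above.
$getTokenIndex = function (string $token): string {
    return $token;
};

foreach (array_keys($frequencies) as $token) {
    echo $getTokenIndex((string) $token), PHP_EOL; // safe for both keys
}
```
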
4 changes: 2 additions & 2 deletions tests/Dataset/FilesDatasetTest.php
@@ -29,13 +29,13 @@ public function testLoadFilesDatasetWithBBCData(): void
self::assertEquals($targets, array_values(array_unique($dataset->getTargets())));

$firstSample = file_get_contents($rootPath.'/business/001.txt');
-self::assertEquals($firstSample, $dataset->getSamples()[0][0]);
+self::assertEquals($firstSample, $dataset->getSamples()[0]);

$firstTarget = 'business';
self::assertEquals($firstTarget, $dataset->getTargets()[0]);

$lastSample = file_get_contents($rootPath.'/tech/010.txt');
-self::assertEquals($lastSample, $dataset->getSamples()[49][0]);
+self::assertEquals($lastSample, $dataset->getSamples()[49]);

$lastTarget = 'tech';
self::assertEquals($lastTarget, $dataset->getTargets()[49]);
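With the extra nesting gone, a dataset read from disk behaves like a plain list of documents. A usage sketch under assumptions: the directory layout (one sub-directory per label) and the constructor argument follow the project's documentation, while getSamples() and getTargets() are the accessors exercised in the test above.

```php
<?php
use Phpml\Dataset\FilesDataset;

// Assumed layout: one sub-directory per category, one text file per sample,
// e.g. data/bbc/business/001.txt, data/bbc/tech/010.txt, ...
$dataset = new FilesDataset('data/bbc');

$samples = $dataset->getSamples(); // plain strings now, no extra [0] index
$targets = $dataset->getTargets(); // directory names used as labels

echo $targets[0], PHP_EOL; // e.g. "business"
echo $samples[0], PHP_EOL; // contents of the first file
```
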
7 changes: 6 additions & 1 deletion tests/FeatureExtraction/TokenCountVectorizerTest.php
@@ -84,7 +84,7 @@ public function testTransformationWithMinimumDocumentTokenCountFrequency(): void
{
// word at least in half samples
$samples = [
-'Lorem ipsum dolor sit amet',
+'Lorem ipsum dolor sit amet 1550',
'Lorem ipsum sit amet',
'ipsum sit amet',
'ipsum sit amet',
@@ -96,6 +96,7 @@ public function testTransformationWithMinimumDocumentTokenCountFrequency(): void
2 => 'dolor',
3 => 'sit',
4 => 'amet',
+5 => 1550,
];

$tokensCounts = [
@@ -105,27 +105,31 @@ public function testTransformationWithMinimumDocumentTokenCountFrequency(): void
2 => 0,
3 => 1,
4 => 1,
+5 => 0,
],
[
0 => 1,
1 => 1,
2 => 0,
3 => 1,
4 => 1,
+5 => 0,
],
[
0 => 0,
1 => 1,
2 => 0,
3 => 1,
4 => 1,
+5 => 0,
],
[
0 => 0,
1 => 1,
2 => 0,
3 => 1,
4 => 1,
+5 => 0,
],
];

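The extra fixture token exercises the numeric-token path end to end: before the cast, a purely numeric token such as "1550" could surface as an integer key and break the minimum-document-frequency filtering. A hedged usage sketch follows; the constructor signature (tokenizer, optional stop words, minimum document frequency) is taken from the project's documentation and should be treated as an assumption.

```php
<?php
use Phpml\FeatureExtraction\TokenCountVectorizer;
use Phpml\Tokenization\WhitespaceTokenizer;

$samples = [
    'Lorem ipsum dolor sit amet 1550',
    'Lorem ipsum sit amet',
];

// minDF = 0.5: drop tokens that appear in fewer than half of the samples,
// which is the code path where the numeric-token error used to occur.
$vectorizer = new TokenCountVectorizer(new WhitespaceTokenizer(), null, 0.5);
$vectorizer->fit($samples);
$vectorizer->transform($samples); // samples are turned into count vectors

var_dump($vectorizer->getVocabulary());
```
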
