diff --git a/README.md b/README.md index f518fd0d..4df5730e 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,9 @@ Public datasets are available in a separate repository [php-ai/php-ml-datasets]( * [Imputation missing values](http://php-ml.readthedocs.io/en/latest/machine-learning/preprocessing/imputation-missing-values/) * Feature Extraction * [Token Count Vectorizer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/token-count-vectorizer/) + * NGramTokenizer + * WhitespaceTokenizer + * WordTokenizer * [Tf-idf Transformer](http://php-ml.readthedocs.io/en/latest/machine-learning/feature-extraction/tf-idf-transformer/) * Dimensionality Reduction * PCA (Principal Component Analysis) diff --git a/docs/machine-learning/feature-extraction/token-count-vectorizer.md b/docs/machine-learning/feature-extraction/token-count-vectorizer.md index c4ede683..8e2e9fd0 100644 --- a/docs/machine-learning/feature-extraction/token-count-vectorizer.md +++ b/docs/machine-learning/feature-extraction/token-count-vectorizer.md @@ -53,3 +53,21 @@ $vectorizer->getVocabulary(); * WhitespaceTokenizer - select tokens by whitespace. * WordTokenizer - select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). +* NGramTokenizer - select contiguous sequences of characters (n-grams) of the specified lengths from each word. N-grams are useful for querying languages that don’t use spaces or that have long compound words, like German. + +**NGramTokenizer** + +The NGramTokenizer accepts the following parameters: + +`$minGram` - minimum number of characters in a gram. Defaults to 1. +`$maxGram` - maximum number of characters in a gram. Defaults to 2.
+ +```php +use Phpml\Tokenization\NGramTokenizer; + +$tokenizer = new NGramTokenizer(1, 2); + +$tokenizer->tokenize('Quick Fox'); + +// returns ['Q', 'u', 'i', 'c', 'k', 'Qu', 'ui', 'ic', 'ck', 'F', 'o', 'x', 'Fo', 'ox'] +``` diff --git a/src/Tokenization/NGramTokenizer.php b/src/Tokenization/NGramTokenizer.php new file mode 100644 index 00000000..59e6f258 --- /dev/null +++ b/src/Tokenization/NGramTokenizer.php @@ -0,0 +1,59 @@ + $maxGram) { + throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram)); + } + + $this->minGram = $minGram; + $this->maxGram = $maxGram; + } + + /** + * {@inheritdoc} + */ + public function tokenize(string $text): array + { + $words = []; + preg_match_all('/\w\w+/u', $text, $words); + + $nGrams = []; + foreach ($words[0] as $word) { + $this->generateNGrams($word, $nGrams); + } + + return $nGrams; + } + + private function generateNGrams(string $word, array &$nGrams): void + { + $length = mb_strlen($word); + + for ($j = 1; $j <= $this->maxGram; $j++) { + for ($k = 0; $k < $length - $j + 1; $k++) { + if ($j >= $this->minGram) { + $nGrams[] = mb_substr($word, $k, $j); + } + } + } + } +} diff --git a/tests/Performance/Tokenization/NGramTokenizerBench.php b/tests/Performance/Tokenization/NGramTokenizerBench.php new file mode 100644 index 00000000..f99128d2 --- /dev/null +++ b/tests/Performance/Tokenization/NGramTokenizerBench.php @@ -0,0 +1,33 @@ +tokenize( + 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent placerat blandit cursus. Suspendisse sed + turpis sit amet enim viverra sodales a euismod est. Ut vitae tincidunt est. Proin venenatis placerat nunc + sed ornare. Etiam feugiat, nisl nec sollicitudin sodales, nulla massa sollicitudin ipsum, vitae cursus ante + velit vitae arcu. Vestibulum feugiat ultricies hendrerit. Morbi sed varius metus. Nam feugiat maximus + turpis, a sollicitudin ligula porttitor eu.Fusce hendrerit tellus et dignissim sagittis. 
Nulla consectetur + condimentum tortor, non bibendum erat lacinia eget. Integer vitae maximus tortor. Vestibulum ante ipsum + primis in faucibus orci luctus et ultrices posuere cubilia Curae; Pellentesque suscipit sem ipsum, in + tincidunt risus pellentesque vel. Nullam hendrerit consequat leo, in suscipit lectus euismod non. Cras arcu + lacus, lacinia semper mauris vel, pharetra dignissim velit. Nam lacinia turpis a nibh bibendum, et + placerat tellus accumsan. Sed tincidunt cursus nisi in laoreet. Suspendisse amet.' + ); + } +} diff --git a/tests/Tokenization/NGramTokenizerTest.php b/tests/Tokenization/NGramTokenizerTest.php new file mode 100644 index 00000000..2df95314 --- /dev/null +++ b/tests/Tokenization/NGramTokenizerTest.php @@ -0,0 +1,100 @@ +tokenize($text)); + } + + public function testMinGramGreaterThanMaxGramNotAllowed(): void + { + self::expectException(InvalidArgumentException::class); + + new NGramTokenizer(5, 2); + } + + public function testMinGramValueTooSmall(): void + { + self::expectException(InvalidArgumentException::class); + + new NGramTokenizer(0, 2); + } + + public function testMaxGramValueTooSmall(): void + { + self::expectException(InvalidArgumentException::class); + + new NGramTokenizer(1, 0); + } + + public function textDataProvider(): array + { + return [ + [ + 1, 2, + 'Quick Fox', + ['Q', 'u', 'i', 'c', 'k', 'Qu', 'ui', 'ic', 'ck', 'F', 'o', 'x', 'Fo', 'ox'], + ], + [ + 3, 3, + 'Quick Foxes', + ['Qui', 'uic', 'ick', 'Fox', 'oxe', 'xes'], + ], + [ + 1, 2, + '快狐跑过 边缘跑', + ['快', '狐', '跑', '过', '快狐', '狐跑', '跑过', '边', '缘', '跑', '边缘', '缘跑'], + ], + [ + 3, 3, + '快狐跑过狐 边缘跑狐狐', + ['快狐跑', '狐跑过', '跑过狐', '边缘跑', '缘跑狐', '跑狐狐'], + ], + [ + 2, 4, + $this->getSimpleText(), + [ + 'Lo', 'or', 're', 'em', 'Lor', 'ore', 'rem', 'Lore', 'orem', 'ip', 'ps', 'su', 'um', 'ips', 'psu', 'sum', 'ipsu', + 'psum', 'do', 'ol', 'lo', 'or', 'dol', 'olo', 'lor', 'dolo', 'olor', 'si', 'it', 'sit', 'am', 'me', 'et', 'ame', + 'met', 'amet', 'co', 'on', 'ns', 'se', 'ec', 
'ct', 'te', 'et', 'tu', 'ur', 'con', 'ons', 'nse', 'sec', 'ect', 'cte', + 'tet', 'etu', 'tur', 'cons', 'onse', 'nsec', 'sect', 'ecte', 'ctet', 'tetu', 'etur', 'ad', 'di', 'ip', 'pi', 'is', + 'sc', 'ci', 'in', 'ng', 'adi', 'dip', 'ipi', 'pis', 'isc', 'sci', 'cin', 'ing', 'adip', 'dipi', 'ipis', 'pisc', + 'isci', 'scin', 'cing', 'el', 'li', 'it', 'eli', 'lit', 'elit', 'Cr', 'ra', 'as', 'Cra', 'ras', 'Cras', 'co', 'on', + 'ns', 'se', 'ec', 'ct', 'te', 'et', 'tu', 'ur', 'con', 'ons', 'nse', 'sec', 'ect', 'cte', 'tet', 'etu', 'tur', + 'cons', 'onse', 'nsec', 'sect', 'ecte', 'ctet', 'tetu', 'etur', 'du', 'ui', 'dui', 'et', 'lo', 'ob', 'bo', 'or', + 'rt', 'ti', 'is', 'lob', 'obo', 'bor', 'ort', 'rti', 'tis', 'lobo', 'obor', 'bort', 'orti', 'rtis', 'au', 'uc', + 'ct', 'to', 'or', 'auc', 'uct', 'cto', 'tor', 'auct', 'ucto', 'ctor', 'Nu', 'ul', 'll', 'la', 'Nul', 'ull', 'lla', + 'Null', 'ulla', 'vi', 'it', 'ta', 'ae', 'vit', 'ita', 'tae', 'vita', 'itae', 'co', 'on', 'ng', 'gu', 'ue', 'con', + 'ong', 'ngu', 'gue', 'cong', 'ongu', 'ngue', 'lo', 'or', 're', 'em', 'lor', 'ore', 'rem', 'lore', 'orem', + ], + ], + [ + 2, 4, + $this->getUtf8Text(), + [ + '鋍鞎', '鞮鞢', '鞢騉', '鞮鞢騉', '袟袘', '袘觕', '袟袘觕', '炟砏', '謺貙', '貙蹖', '謺貙蹖', '偢偣', '偣唲', + '偢偣唲', '箷箯', '箯緷', '箷箯緷', '鑴鱱', '鱱爧', '鑴鱱爧', '覮轀', '剆坲', '煘煓', '煓瑐', '煘煓瑐', '鬐鶤', + '鶤鶐', '鬐鶤鶐', '飹勫', '勫嫢', '飹勫嫢', '枲柊', '柊氠', '枲柊氠', '鍎鞚', '鞚韕', '鍎鞚韕', '焲犈', '殍涾', + '涾烰', '殍涾烰', '齞齝', '齝囃', '齞齝囃', '蹅輶', '孻憵', '擙樲', '樲橚', '擙樲橚', '藒襓', '襓謥', '藒襓謥', + '岯岪', '岪弨', '岯岪弨', '廞徲', '孻憵', '憵懥', '孻憵懥', '趡趛', '趛踠', '趡趛踠', + ], + ], + ]; + } +} diff --git a/tests/Tokenization/TokenizerTest.php b/tests/Tokenization/TokenizerTest.php new file mode 100644 index 00000000..5d0833cd --- /dev/null +++ b/tests/Tokenization/TokenizerTest.php @@ -0,0 +1,24 @@ +tokenize($text)); + self::assertEquals($tokens, $tokenizer->tokenize($this->getSimpleText())); } public function testTokenizationOnUtf8(): void { $tokenizer = new WhitespaceTokenizer(); - $text = '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 
蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀, - 剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈, - 殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏'; - $tokens = ['鋍鞎', '鳼', '鞮鞢騉', '袟袘觕,', '炟砏', '蒮', '謺貙蹖', '偢偣唲', '蒛', '箷箯緷', '鑴鱱爧', '覮轀,', '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '銪', '餀', '枲柊氠', '鍎鞚韕', '焲犈,', '殍涾烰', '齞齝囃', '蹅輶', '鄜,', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '蒮', '廞徲', '孻憵懥', '趡趛踠', '槏', ]; - self::assertEquals($tokens, $tokenizer->tokenize($text)); + self::assertEquals($tokens, $tokenizer->tokenize($this->getUtf8Text())); } } diff --git a/tests/Tokenization/WordTokenizerTest.php b/tests/Tokenization/WordTokenizerTest.php index 39448b78..9c55dd60 100644 --- a/tests/Tokenization/WordTokenizerTest.php +++ b/tests/Tokenization/WordTokenizerTest.php @@ -5,37 +5,28 @@ namespace Phpml\Tests\Tokenization; use Phpml\Tokenization\WordTokenizer; -use PHPUnit\Framework\TestCase; -class WordTokenizerTest extends TestCase +class WordTokenizerTest extends TokenizerTest { public function testTokenizationOnAscii(): void { $tokenizer = new WordTokenizer(); - $text = 'Lorem ipsum-dolor sit amet, consectetur/adipiscing elit. - Cras consectetur, dui et lobortis;auctor. 
- Nulla vitae ,.,/ congue lorem.'; - $tokens = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit', 'Cras', 'consectetur', 'dui', 'et', 'lobortis', 'auctor', 'Nulla', 'vitae', 'congue', 'lorem', ]; - self::assertEquals($tokens, $tokenizer->tokenize($text)); + self::assertEquals($tokens, $tokenizer->tokenize($this->getSimpleText())); } public function testTokenizationOnUtf8(): void { $tokenizer = new WordTokenizer(); - $text = '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀, - 剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈, - 殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏'; - $tokens = ['鋍鞎', '鞮鞢騉', '袟袘觕', '炟砏', '謺貙蹖', '偢偣唲', '箷箯緷', '鑴鱱爧', '覮轀', '剆坲', '煘煓瑐', '鬐鶤鶐', '飹勫嫢', '枲柊氠', '鍎鞚韕', '焲犈', '殍涾烰', '齞齝囃', '蹅輶', '孻憵', '擙樲橚', '藒襓謥', '岯岪弨', '廞徲', '孻憵懥', '趡趛踠', ]; - self::assertEquals($tokens, $tokenizer->tokenize($text)); + self::assertEquals($tokens, $tokenizer->tokenize($this->getUtf8Text())); } }