forked from jorgecasas/php-ml
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Provide a new NGramTokenizer with minGram and maxGram support (#350)
* Issue #349: Provide a new NGramTokenizer.
* Issue #349: Add tests.
* Fixes from code review.
* Implement NGramTokenizer with min and max gram support
* Add missing tests for ngram
* Add info about NGramTokenizer to docs and readme
* Add performance test for tokenization
- Loading branch information
Showing 8 changed files with 246 additions and 27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Phpml\Tokenization; | ||
|
||
use Phpml\Exception\InvalidArgumentException; | ||
|
||
/**
 * Tokenizer that turns text into character n-grams.
 *
 * Words are first extracted with the pattern /\w\w+/u, i.e. runs of two or
 * more word characters, so single-character words produce no tokens. Each
 * word is then expanded into all of its substrings whose length (counted in
 * characters, not bytes) lies between minGram and maxGram, both inclusive.
 */
class NGramTokenizer extends WordTokenizer
{
    /**
     * Shortest n-gram length that is emitted.
     *
     * @var int
     */
    private $minGram;

    /**
     * Longest n-gram length that is emitted.
     *
     * @var int
     */
    private $maxGram;

    /**
     * @param int $minGram minimum n-gram length, at least 1
     * @param int $maxGram maximum n-gram length, at least $minGram
     *
     * @throws InvalidArgumentException when either bound is below 1 or $minGram exceeds $maxGram
     */
    public function __construct(int $minGram = 1, int $maxGram = 2)
    {
        if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
            throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
        }

        $this->minGram = $minGram;
        $this->maxGram = $maxGram;
    }

    /**
     * {@inheritdoc}
     *
     * Tokens are grouped per word in order of appearance; within one word they
     * are ordered by n-gram length first and by starting offset second.
     */
    public function tokenize(string $text): array
    {
        $words = [];
        preg_match_all('/\w\w+/u', $text, $words);

        $nGrams = [];
        foreach ($words[0] as $word) {
            $this->generateNGrams($word, $nGrams);
        }

        return $nGrams;
    }

    /**
     * Appends every n-gram of $word with a length in [minGram, maxGram] to $nGrams.
     *
     * @param string   $word   a single word taken from the UTF-8 matched text
     * @param string[] $nGrams accumulator that receives the generated n-grams
     */
    private function generateNGrams(string $word, array &$nGrams): void
    {
        // The word was matched with the /u (UTF-8) modifier, so measure and slice it
        // as UTF-8 explicitly instead of depending on the configured internal encoding.
        $length = mb_strlen($word, 'UTF-8');

        // Sizes below minGram are never emitted and sizes above the word length
        // cannot fit, so only the useful range is iterated.
        $longest = min($this->maxGram, $length);

        for ($size = $this->minGram; $size <= $longest; ++$size) {
            $lastOffset = $length - $size;
            for ($offset = 0; $offset <= $lastOffset; ++$offset) {
                $nGrams[] = mb_substr($word, $offset, $size, 'UTF-8');
            }
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Phpml\Tests\Performance\Tokenization; | ||
|
||
use PhpBench\Benchmark\Metadata\Annotations\Iterations; | ||
use PhpBench\Benchmark\Metadata\Annotations\Revs; | ||
use Phpml\Tokenization\NGramTokenizer; | ||
|
||
/**
 * PhpBench benchmark that runs NGramTokenizer over a fixed multi-sentence text.
 */
final class NGramTokenizerBench
{
    /**
     * Tokenizes the sample paragraph with minGram 2 and maxGram 3; the result is discarded.
     *
     * @Revs(1000)
     * @Iterations(5)
     */
    public function benchSimpleTokenizer(): void
    {
        $sample = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent placerat blandit cursus. Suspendisse sed
turpis sit amet enim viverra sodales a euismod est. Ut vitae tincidunt est. Proin venenatis placerat nunc
sed ornare. Etiam feugiat, nisl nec sollicitudin sodales, nulla massa sollicitudin ipsum, vitae cursus ante
velit vitae arcu. Vestibulum feugiat ultricies hendrerit. Morbi sed varius metus. Nam feugiat maximus
turpis, a sollicitudin ligula porttitor eu.Fusce hendrerit tellus et dignissim sagittis. Nulla consectetur
condimentum tortor, non bibendum erat lacinia eget. Integer vitae maximus tortor. Vestibulum ante ipsum
primis in faucibus orci luctus et ultrices posuere cubilia Curae; Pellentesque suscipit sem ipsum, in
tincidunt risus pellentesque vel. Nullam hendrerit consequat leo, in suscipit lectus euismod non. Cras arcu
lacus, lacinia semper mauris vel, pharetra dignissim velit. Nam lacinia turpis a nibh bibendum, et
placerat tellus accumsan. Sed tincidunt cursus nisi in laoreet. Suspendisse amet.';

        (new NGramTokenizer(2, 3))->tokenize($sample);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Phpml\Tests\Tokenization; | ||
|
||
use Phpml\Exception\InvalidArgumentException; | ||
use Phpml\Tokenization\NGramTokenizer; | ||
|
||
/**
 * Tests for NGramTokenizer: generated n-grams for Latin and CJK input, and
 * rejection of invalid minGram/maxGram combinations.
 *
 * Inspiration: https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html
 */
class NGramTokenizerTest extends TokenizerTest
{
    /**
     * @dataProvider textDataProvider
     *
     * @param string[] $tokens expected n-grams, in the exact order the tokenizer emits them
     */
    public function testNGramTokenization(int $minGram, int $maxGram, string $text, array $tokens): void
    {
        $tokenizer = new NGramTokenizer($minGram, $maxGram);

        // Strict comparison: both the order and the string type of every token must match.
        self::assertSame($tokens, $tokenizer->tokenize($text));
    }

    public function testMinGramGreaterThanMaxGramNotAllowed(): void
    {
        $this->expectException(InvalidArgumentException::class);

        new NGramTokenizer(5, 2);
    }

    public function testMinGramValueTooSmall(): void
    {
        $this->expectException(InvalidArgumentException::class);

        new NGramTokenizer(0, 2);
    }

    public function testMaxGramValueTooSmall(): void
    {
        $this->expectException(InvalidArgumentException::class);

        new NGramTokenizer(1, 0);
    }

    /**
     * Data sets for testNGramTokenization.
     *
     * @return array[] rows of [minGram, maxGram, input text, expected tokens]
     */
    public function textDataProvider(): array
    {
        return [
            [
                1, 2,
                'Quick Fox',
                ['Q', 'u', 'i', 'c', 'k', 'Qu', 'ui', 'ic', 'ck', 'F', 'o', 'x', 'Fo', 'ox'],
            ],
            [
                3, 3,
                'Quick Foxes',
                ['Qui', 'uic', 'ick', 'Fox', 'oxe', 'xes'],
            ],
            [
                1, 2,
                '快狐跑过 边缘跑',
                ['快', '狐', '跑', '过', '快狐', '狐跑', '跑过', '边', '缘', '跑', '边缘', '缘跑'],
            ],
            [
                3, 3,
                '快狐跑过狐 边缘跑狐狐',
                ['快狐跑', '狐跑过', '跑过狐', '边缘跑', '缘跑狐', '跑狐狐'],
            ],
            [
                2, 4,
                $this->getSimpleText(),
                [
                    'Lo', 'or', 're', 'em', 'Lor', 'ore', 'rem', 'Lore', 'orem', 'ip', 'ps', 'su', 'um', 'ips', 'psu', 'sum', 'ipsu',
                    'psum', 'do', 'ol', 'lo', 'or', 'dol', 'olo', 'lor', 'dolo', 'olor', 'si', 'it', 'sit', 'am', 'me', 'et', 'ame',
                    'met', 'amet', 'co', 'on', 'ns', 'se', 'ec', 'ct', 'te', 'et', 'tu', 'ur', 'con', 'ons', 'nse', 'sec', 'ect', 'cte',
                    'tet', 'etu', 'tur', 'cons', 'onse', 'nsec', 'sect', 'ecte', 'ctet', 'tetu', 'etur', 'ad', 'di', 'ip', 'pi', 'is',
                    'sc', 'ci', 'in', 'ng', 'adi', 'dip', 'ipi', 'pis', 'isc', 'sci', 'cin', 'ing', 'adip', 'dipi', 'ipis', 'pisc',
                    'isci', 'scin', 'cing', 'el', 'li', 'it', 'eli', 'lit', 'elit', 'Cr', 'ra', 'as', 'Cra', 'ras', 'Cras', 'co', 'on',
                    'ns', 'se', 'ec', 'ct', 'te', 'et', 'tu', 'ur', 'con', 'ons', 'nse', 'sec', 'ect', 'cte', 'tet', 'etu', 'tur',
                    'cons', 'onse', 'nsec', 'sect', 'ecte', 'ctet', 'tetu', 'etur', 'du', 'ui', 'dui', 'et', 'lo', 'ob', 'bo', 'or',
                    'rt', 'ti', 'is', 'lob', 'obo', 'bor', 'ort', 'rti', 'tis', 'lobo', 'obor', 'bort', 'orti', 'rtis', 'au', 'uc',
                    'ct', 'to', 'or', 'auc', 'uct', 'cto', 'tor', 'auct', 'ucto', 'ctor', 'Nu', 'ul', 'll', 'la', 'Nul', 'ull', 'lla',
                    'Null', 'ulla', 'vi', 'it', 'ta', 'ae', 'vit', 'ita', 'tae', 'vita', 'itae', 'co', 'on', 'ng', 'gu', 'ue', 'con',
                    'ong', 'ngu', 'gue', 'cong', 'ongu', 'ngue', 'lo', 'or', 're', 'em', 'lor', 'ore', 'rem', 'lore', 'orem',
                ],
            ],
            [
                2, 4,
                $this->getUtf8Text(),
                [
                    '鋍鞎', '鞮鞢', '鞢騉', '鞮鞢騉', '袟袘', '袘觕', '袟袘觕', '炟砏', '謺貙', '貙蹖', '謺貙蹖', '偢偣', '偣唲',
                    '偢偣唲', '箷箯', '箯緷', '箷箯緷', '鑴鱱', '鱱爧', '鑴鱱爧', '覮轀', '剆坲', '煘煓', '煓瑐', '煘煓瑐', '鬐鶤',
                    '鶤鶐', '鬐鶤鶐', '飹勫', '勫嫢', '飹勫嫢', '枲柊', '柊氠', '枲柊氠', '鍎鞚', '鞚韕', '鍎鞚韕', '焲犈', '殍涾',
                    '涾烰', '殍涾烰', '齞齝', '齝囃', '齞齝囃', '蹅輶', '孻憵', '擙樲', '樲橚', '擙樲橚', '藒襓', '襓謥', '藒襓謥',
                    '岯岪', '岪弨', '岯岪弨', '廞徲', '孻憵', '憵懥', '孻憵懥', '趡趛', '趛踠', '趡趛踠',
                ],
            ],
        ];
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
namespace Phpml\Tests\Tokenization; | ||
|
||
use PHPUnit\Framework\TestCase; | ||
|
||
/**
 * Base class for tokenizer test cases; provides the sample texts they share.
 */
abstract class TokenizerTest extends TestCase
{
    /**
     * Returns a three-line Latin sample whose words are separated by spaces,
     * line breaks and punctuation such as "-", "/", ";", "," and ".".
     */
    public function getSimpleText(): string
    {
        $latinSample = 'Lorem ipsum-dolor sit amet, consectetur/adipiscing elit.
Cras consectetur, dui et lobortis;auctor.
Nulla vitae ,.,/ congue lorem.';

        return $latinSample;
    }

    /**
     * Returns a three-line sample of multi-byte (CJK) words of one to three
     * characters each, separated by spaces, commas and line breaks.
     */
    public function getUtf8Text(): string
    {
        $multiByteSample = '鋍鞎 鳼 鞮鞢騉 袟袘觕, 炟砏 蒮 謺貙蹖 偢偣唲 蒛 箷箯緷 鑴鱱爧 覮轀,
剆坲 煘煓瑐 鬐鶤鶐 飹勫嫢 銪 餀 枲柊氠 鍎鞚韕 焲犈,
殍涾烰 齞齝囃 蹅輶 鄜, 孻憵 擙樲橚 藒襓謥 岯岪弨 蒮 廞徲 孻憵懥 趡趛踠 槏';

        return $multiByteSample;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters