Skip to content

Commit

Permalink
Code updated for use on PHP 7.4
Browse files Browse the repository at this point in the history
  • Loading branch information
Pavel Alekseev committed Oct 13, 2023
1 parent d718336 commit 2a33a1e
Show file tree
Hide file tree
Showing 19 changed files with 76 additions and 100,589 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
![Build status](https://img.shields.io/github/actions/workflow/status/yethee/tiktoken-php/ci.yml?branch=master)
![License](https://img.shields.io/github/license/yethee/tiktoken-php)

This is a port of the [tiktoken](https://github.com/openai/tiktoken).
This is a port of [tiktoken](https://github.com/openai/tiktoken) for PHP 7.4.

## Installation

Expand Down
31 changes: 4 additions & 27 deletions composer.json
Original file line number Diff line number Diff line change
@@ -1,44 +1,21 @@
{
"name": "yethee/tiktoken",
"name": "guttedgarden/tiktoken",
"type": "library",
"license": "MIT",
"description": "PHP version of tiktoken",
"description": "PHP 7.4 version of tiktoken",
"keywords": ["openai", "tiktoken", "tokenizer", "bpe", "encode", "decode"],
"require": {
"php": "^8.1",
"symfony/service-contracts": "^2.5 || ^3.0"
},
"require-dev": {
"doctrine/coding-standard": "^11.1",
"phpunit/phpunit": "^10.3",
"psalm/plugin-phpunit": "^0.18.3",
"vimeo/psalm": "5.9.0"
"php": ">=7.4"
},
"autoload": {
"psr-4": {
"Yethee\\Tiktoken\\": "src"
}
},
"autoload-dev": {
"psr-4": {
"Yethee\\Tiktoken\\Tests\\": "tests"
"guttedgarden\\Tiktoken\\": "src"
}
},
"config": {
"sort-packages": true,
"allow-plugins": {
"dealerdirect/phpcodesniffer-composer-installer": true
}
},
"scripts": {
"check": [
"@cs-check",
"@analyse",
"@test"
],
"analyse": "psalm --stats",
"cs-check": "phpcs",
"cs-fix": "phpcbf",
"test": "phpunit --colors=always --no-coverage"
}
}
20 changes: 0 additions & 20 deletions phpcs.xml

This file was deleted.

29 changes: 0 additions & 29 deletions phpunit.xml.dist

This file was deleted.

13 changes: 0 additions & 13 deletions psalm-baseline.xml

This file was deleted.

21 changes: 0 additions & 21 deletions psalm.xml

This file was deleted.

39 changes: 29 additions & 10 deletions src/Encoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,40 @@

declare(strict_types=1);

namespace Yethee\Tiktoken;
namespace guttedgarden\Tiktoken;

use Stringable;
use Yethee\Tiktoken\Exception\RegexError;
use Yethee\Tiktoken\Util\EncodeUtil;
use Yethee\Tiktoken\Vocab\Vocab;
use Closure;
use guttedgarden\Tiktoken\Exception\RegexError;
use guttedgarden\Tiktoken\Util\EncodeUtil;
use guttedgarden\Tiktoken\Vocab\Vocab;

use function array_map;
use function array_slice;
use function array_values;
use function assert;
use function count;
use function implode;
use function preg_last_error_msg;
use function preg_match_all;
use function range;
use function sprintf;

use const PHP_INT_MAX;

/** @psalm-import-type NonEmptyByteVector from EncodeUtil */
final class Encoder implements Stringable
final class Encoder
{
private string $name;
private Vocab $vocab;
private string $pattern;
/**
* @param non-empty-string $name
* @param non-empty-string $pattern
*/
public function __construct(public readonly string $name, private Vocab $vocab, private string $pattern)
public function __construct(string $name, Vocab $vocab, string $pattern)
{
$this->name = $name;
$this->vocab = $vocab;
$this->pattern = $pattern;
}

public function __toString(): string
Expand All @@ -46,7 +51,7 @@ public function encode(string $text): array
}

if (preg_match_all($this->pattern, $text, $matches) === false) {
throw new RegexError(sprintf('Matching failed with error: %s', preg_last_error_msg()));
throw new RegexError(sprintf('Matching failed with error: %s', $this->pregErrorString(preg_last_error())));
}

$tokens = [];
Expand Down Expand Up @@ -80,7 +85,7 @@ public function decode(array $tokens): string
return '';
}

return implode(array_map($this->vocab->getToken(...), $tokens));
return implode(array_map(Closure::fromCallable([$this->vocab, 'getToken']), $tokens));
}

/**
Expand Down Expand Up @@ -158,4 +163,18 @@ function (int $i) use ($bytes): array {

return $res;
}

/**
 * Translates a PCRE error code into a human-readable message.
 *
 * Stand-in for preg_last_error_msg(), which is only available from PHP 8.0.
 *
 * @param int $errorConstant One of the PREG_*_ERROR codes, as returned by preg_last_error().
 *
 * @return string The matching message, or 'Unknown error' for an unrecognised code.
 */
private function pregErrorString(int $errorConstant): string
{
    // Built once per request; the codes are plain integer constants.
    static $errorMessages = [
        PREG_NO_ERROR => 'No error',
        PREG_INTERNAL_ERROR => 'Internal error',
        PREG_BACKTRACK_LIMIT_ERROR => 'Backtrack limit error',
        PREG_RECURSION_LIMIT_ERROR => 'Recursion limit error',
        PREG_BAD_UTF8_ERROR => 'Bad UTF8 error',
        PREG_BAD_UTF8_OFFSET_ERROR => 'Bad UTF8 offset error',
        // Present since PHP 7.0; without it a JIT stack exhaustion was reported as "Unknown error".
        PREG_JIT_STACKLIMIT_ERROR => 'JIT stack limit error',
    ];

    return $errorMessages[$errorConstant] ?? 'Unknown error';
}
}
20 changes: 9 additions & 11 deletions src/EncoderProvider.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,20 @@

declare(strict_types=1);

namespace Yethee\Tiktoken;
namespace guttedgarden\Tiktoken;

use InvalidArgumentException;
use Symfony\Contracts\Service\ResetInterface;
use Yethee\Tiktoken\Vocab\Loader\DefaultVocabLoader;
use Yethee\Tiktoken\Vocab\Vocab;
use Yethee\Tiktoken\Vocab\VocabLoader;
use guttedgarden\Tiktoken\Vocab\Loader\DefaultVocabLoader;
use guttedgarden\Tiktoken\Vocab\Vocab;
use guttedgarden\Tiktoken\Vocab\VocabLoader;

use function getenv;
use function sprintf;
use function str_starts_with;
use function sys_get_temp_dir;

use const DIRECTORY_SEPARATOR;

final class EncoderProvider implements ResetInterface
final class EncoderProvider
{
private const ENCODINGS = [
'r50k_base' => [
Expand Down Expand Up @@ -75,8 +73,8 @@ final class EncoderProvider implements ResetInterface
'code-search-ada-code-001' => 'r50k_base',
];

private VocabLoader|null $vocabLoader = null;
private string|null $vocabCacheDir;
private ?VocabLoader $vocabLoader = null;
private ?string $vocabCacheDir;

/** @var array<non-empty-string, Encoder> */
private array $encoders = [];
Expand All @@ -103,7 +101,7 @@ public function getForModel(string $model): Encoder
}

foreach (self::MODEL_PREFIX_TO_ENCODING as $prefix => $modelEncoding) {
if (str_starts_with($model, $prefix)) {
if (strpos($model, $prefix) === 0) {
return $this->get($modelEncoding);
}
}
Expand Down Expand Up @@ -132,7 +130,7 @@ public function get(string $encodingName): Encoder
}

/** @param non-empty-string|null $cacheDir */
public function setVocabCache(string|null $cacheDir): void
public function setVocabCache(?string $cacheDir): void
{
$this->vocabCacheDir = $cacheDir;
$this->vocabLoader = null;
Expand Down
2 changes: 1 addition & 1 deletion src/Exception/ParseError.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

declare(strict_types=1);

namespace Yethee\Tiktoken\Exception;
namespace guttedgarden\Tiktoken\Exception;

use RuntimeException;

Expand Down
2 changes: 1 addition & 1 deletion src/Exception/RegexError.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

declare(strict_types=1);

namespace Yethee\Tiktoken\Exception;
namespace guttedgarden\Tiktoken\Exception;

use RuntimeException;

Expand Down
6 changes: 3 additions & 3 deletions src/Util/EncodeUtil.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

declare(strict_types=1);

namespace Yethee\Tiktoken\Util;
namespace guttedgarden\Tiktoken\Util;

use Closure;
use function array_map;
use function bin2hex;
use function hexdec;
use function pack;
use function str_split;

Expand All @@ -20,7 +20,7 @@ final class EncodeUtil
*/
public static function toBytes(string $text): array
{
return array_map(hexdec(...), str_split(bin2hex($text), 2));
return array_map(Closure::fromCallable('hexdec'), str_split(bin2hex($text), 2));
}

/**
Expand Down
10 changes: 6 additions & 4 deletions src/Vocab/Loader/DefaultVocabLoader.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

declare(strict_types=1);

namespace Yethee\Tiktoken\Vocab\Loader;
namespace guttedgarden\Tiktoken\Vocab\Loader;

use RuntimeException;
use Yethee\Tiktoken\Vocab\Vocab;
use Yethee\Tiktoken\Vocab\VocabLoader;
use guttedgarden\Tiktoken\Vocab\Vocab;
use guttedgarden\Tiktoken\Vocab\VocabLoader;

use function assert;
use function fclose;
Expand All @@ -24,8 +24,10 @@

final class DefaultVocabLoader implements VocabLoader
{
public function __construct(private string|null $cacheDir = null)
private ?string $cacheDir;
public function __construct(?string $cacheDir = null)
{
$this->cacheDir = $cacheDir;
}

public function load(string $uri): Vocab
Expand Down
Loading

0 comments on commit 2a33a1e

Please sign in to comment.