Skip to content

Commit

Permalink
Merge pull request latitudegames#2 from schnerd/fix-exports
Browse files Browse the repository at this point in the history
Fix exports & decoding
  • Loading branch information
nickwalton authored Sep 8, 2020
2 parents 2bc87a2 + c325df9 commit 1e87339
Show file tree
Hide file tree
Showing 8 changed files with 4,766 additions and 8 deletions.
30 changes: 30 additions & 0 deletions .github/workflows/node.js.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# This workflow will do a clean install of node dependencies, build the source code and run tests across different versions of node
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions

name: Node.js CI

# Run on pushes to master and on pull requests that target master.
on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-latest

    # One job per listed Node.js version.
    strategy:
      matrix:
        node-version: [12.x, 14.x]

    steps:
    - uses: actions/checkout@v2
    - name: Use Node.js ${{ matrix.node-version }}
      uses: actions/setup-node@v1
      with:
        node-version: ${{ matrix.node-version }}
    # Clean install from the lockfile, optional build step, then the test suite.
    - run: npm ci
    - run: npm run build --if-present
    - run: npm test

1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
node_modules
10 changes: 6 additions & 4 deletions Encoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ const chr = x => {

// Single shared encoder instance, reused by every encodeStr call.
const textEncoder = new TextEncoder("utf-8")

/**
 * Converts a string into its UTF-8 bytes, each rendered as a decimal string.
 *
 * @param {string} str - text to encode
 * @returns {string[]} one decimal string per UTF-8 byte, in order
 */
const encodeStr = str => {
  const byteStrings = []
  for (const byte of textEncoder.encode(str)) {
    byteStrings.push(byte.toString())
  }
  return byteStrings
}

// Single shared decoder instance, reused by every decodeStr call.
const textDecoder = new TextDecoder("utf-8")

/**
 * Decodes a sequence of UTF-8 byte values back into a string.
 *
 * The bytes are decoded together as one buffer. Mapping each byte through
 * String.fromCharCode individually would turn every byte of a multi-byte
 * character (accents, emoji) into its own unrelated character.
 *
 * @param {ArrayLike<number>} arr - byte values; copied into a Uint8Array before decoding
 * @returns {string} the decoded text
 */
const decodeStr = arr => {
  return textDecoder.decode(new Uint8Array(arr))
}

const dictZip = (x, y) => {
Expand Down Expand Up @@ -172,5 +172,7 @@ function decode(tokens) {
return text
}

// const encoded = encode('hello 👋 world 🌍 This is a long string to test whether or not the emoji issue was fixed!')
// console.log({encoded})
module.exports = {
encode,
decode
};
37 changes: 37 additions & 0 deletions Encoder.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
const {encode, decode} = require('./Encoder.js');

// Each case: [test name, input text, expected token ids].
const roundTripCases = [
  ['empty string', "", []],
  ['space', " ", [220]],
  ['tab', "\t", [197]],
  ['simple text', "This is some text", [1212, 318, 617, 2420]],
  ['multi-token word', "indivisible", [521, 452, 12843]],
  ['emojis', "hello 👋 world 🌍", [31373, 50169, 233, 995, 12520, 234, 235]],
];

// Every case checks the exact token ids and that decoding restores the input.
for (const [title, input, expectedTokens] of roundTripCases) {
  test(title, () => {
    expect(encode(input)).toEqual(expectedTokens)
    expect(decode(encode(input))).toEqual(input)
  });
}
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,13 @@ Javascript BPE Encoder Decoder for GPT-2 / GPT-3
GPT-2 and GPT-3 use byte pair encoding to turn text into a series of integers to feed into the model. This is a JavaScript implementation of OpenAI's original Python encoder/decoder, which can be found [here](https://github.com/openai/gpt-2).

## Install with npm

`npm install gpt-3-encoder`

## Usage

Compatible with Node >= 12

```
const {encode, decode} = require('gpt-3-encoder')
Expand Down
11 changes: 11 additions & 0 deletions jest.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// For a detailed explanation regarding each configuration property, visit:
// https://jestjs.io/docs/en/configuration.html

const jestConfig = {
  // Run the tests in a Node.js environment
  testEnvironment: "node",
  // Use the V8 provider to instrument code for coverage
  coverageProvider: "v8",
  // Clear mock calls and instances between every test
  clearMocks: true,
};

module.exports = jestConfig;
Loading

0 comments on commit 1e87339

Please sign in to comment.