Skip to content

Commit

Permalink
add CLI + standardize options object
Browse files Browse the repository at this point in the history
  • Loading branch information
kefniark committed Aug 11, 2021
1 parent 311424a commit 9d906ab
Show file tree
Hide file tree
Showing 12 changed files with 7,875 additions and 7,528 deletions.
1 change: 1 addition & 0 deletions .eslintignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
node_modules
bin
dist
tests
36 changes: 32 additions & 4 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,42 @@ detectAll('ceci est un text en francais.') // [ { lang: 'fr', accuracy: 0.5238 }

---

## TinyLD (Light Flavor, for web usage)
### **TinyLD CLI**

From time to time, it can be easier to use the library from a terminal _(Example: testing or debugging)_

```sh
tinyld This is the text that I want to check
# [ { lang: 'en', accuracy: 1 } ]

tinyld これはテストです
# [ { lang: 'ja', accuracy: 1 } ]

tinyld Єсть на світі доля
# [ { lang: 'uk', accuracy: 1 } ]
```

_Options_

- `--verbose` : Get an explanation of why **TinyLD** picks a language
- `--only=en,ja,fr` : Restrict the detection to a subset of languages

Can also be run with:

- Npx: `npx tinyld [message]`
- Yarn: `yarn tinyld [message]`
- Bash: `./node_modules/.bin/tinyld [message]`

---

### **TinyLD** (Light Flavor, for web usage)

The normal library can be a bit massive (mostly caused by the language profile database), which can be problematic for web usage.

For this usage we also provide a lighter version (a tradeoff between disk size and accuracy)

- import with: `import { detect } from 'tinyld/dist/tinyld.light.cjs'`
- normal version ~800KB, light version is only ~90KB
- normal version ~800KB, light version is only ~90KB (~25KB with gzip)
- only 30 languages supported
- slightly less accurate, only ~90%

Expand Down Expand Up @@ -101,8 +129,8 @@ At the end, sort by score and return the most probable one.

| Library | Script | Properly Identified | Improperly identified | Not identified | Avg Execution Time | Disk Size |
| -------------- | --------------------------- | ------------------- | --------------------- | -------------- | ------------------ | --------- |
| TinyLD | `yarn bench:tinyld` | 95.8876%% | 4.1124% | 0% | 45.4203ms. | 878KB |
| TinyLD Light | `yarn bench:tinyld-light` | 91.822% | 8.178% | 0% | 36.4051ms. | 92KB |
| TinyLD | `yarn bench:tinyld` | 95.6304% | 4.3696% | 0% | 50.4203ms. | 878KB |
| TinyLD Light | `yarn bench:tinyld-light` | 91.7805% | 8.2195% | 0% | 38.4051ms. | 92KB |
| node-cld | `yarn bench:cld` | 87.1121% | 1.8074% | 11.08% | 56.38ms. | > 10MB |
| franc | `yarn bench:franc` | 65.3913% | 34.6087% | 0% | 132.59ms. | 353.5kb |
| languagedetect | `yarn bench:languagedetect` | 58.0877% | 13.4809% | 28.4414% | 159.56ms. | 243.6kb |
Expand Down
29 changes: 29 additions & 0 deletions bin/tinyld.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#! /usr/bin/env node
const { detectAll } = require('../dist/tinyld.cjs')

function main() {
const [, , ...args] = process.argv

let onlyLangs = []
let verbose = false

const texts = []
for (const arg of [...args]) {
if (arg.startsWith('--only=')) {
onlyLangs = arg.replace('--only=', '').split(',')
continue
}

if (arg.startsWith('--verbose') || arg.startsWith('-v')) {
verbose = true
continue
}

texts.push(arg)
}
const message = texts.join(' ')
const options = { only: onlyLangs, verbose }
console.log(detectAll(message, options))
}

main()
5 changes: 4 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
{
"name": "tinyld",
"version": "1.0.5",
"version": "1.0.6",
"main": "./dist/tinyld.cjs.js",
"module": "./dist/tinyld.esm.js",
"license": "MIT",
"types": "./src/index.ts",
"bin": {
"tinyld": "./bin/tinyld.js"
},
"keywords": [
"language",
"detection",
Expand Down
22 changes: 16 additions & 6 deletions src/clean/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,25 @@ export function isString(value: unknown): boolean {
return typeof value === 'string' || value instanceof String
}

/**
 * Replaces each punctuation character matched by the pattern below with a
 * single space, so the text on either side of it stays separated.
 */
function stripPunctuation(val: string): string {
  const punctuation = /[,.!¿?;:/«»"_~\\/]/gi
  const blank = ' '
  return val.replace(punctuation, blank)
}

/**
 * Deletes every ASCII digit (0-9) from the string. Nothing is inserted in
 * place of a removed digit.
 */
function stripNumbers(val: string): string {
  const asciiDigits = /[0-9]/g
  return val.replace(asciiDigits, '')
}

/**
 * Converts fullwidth digits (U+FF10..U+FF19) into their ASCII counterparts
 * '0'..'9'. All other characters are returned untouched.
 */
function replaceFullwidthNumbers(val: string): string {
  // U+FF10 - 0xFEE0 = U+0030 ('0'); the same shift applies to every digit in the range.
  const fullwidthShift = 0xfee0
  const toAscii = (digit: string): string => String.fromCharCode(digit.charCodeAt(0) - fullwidthShift)
  return val.replace(/[\uFF10-\uFF19]/g, toAscii)
}

export function cleanString(value: string): string {
const data = value
.replace(//gi, "'")
.replace(/[,.!¿?;:/«»"_~\\/]/gi, ' ')
.replace(/[0-9]/g, '')
const data = value.replace(//gi, "'")
return stripPunctuation(stripNumbers(replaceFullwidthNumbers(data.toLowerCase())))
.replace(/\s\s+/g, ' ')
.trim()
.toLowerCase()
return data
}

export function normalize(value: string): string {
Expand Down
11 changes: 11 additions & 0 deletions src/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,17 @@ export function isExtraSample(country: string): boolean {

type LangOption = { code: string; alias?: string[]; skipLight?: boolean; skipProb?: boolean; extraSample?: boolean }

/**
 * Builds a complete `DetectOption` from a possibly partial (or missing) one.
 *
 * Each field is defaulted on its own, so a property that is present but set
 * to `undefined` or `null` still falls back to its default instead of
 * overwriting it.
 *
 * @param options - caller-supplied options; may be omitted entirely
 * @returns a fresh object with `only` (default `[]`) and `verbose` (default `false`)
 */
export const parseDetectOption = (options?: Partial<DetectOption>): DetectOption => {
  return {
    only: options?.only ?? [],
    verbose: options?.verbose ?? false
  }
}

/**
 * Fully-resolved options for the detection functions.
 * Callers pass a `Partial<DetectOption>`; `parseDetectOption` fills in the defaults.
 */
export interface DetectOption {
// Languages to restrict the detection to (the CLI fills this from `--only=en,ja,fr`).
// Defaults to an empty list — presumably meaning "no restriction"; confirm in the tokenizer.
only: string[]
// When true, asks for an explanation of why a language was picked (CLI `--verbose`). Defaults to false.
verbose: boolean
}

// Map ISO 639-3 <-> ISO 639-1
const langMap: { [id: string]: LangOption } = {
// asia
Expand Down
16 changes: 9 additions & 7 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,32 +1,34 @@
import { cleanString, isString, normalize } from './clean'
import { ILangProfiles } from './core'
import { DetectOption, ILangProfiles, parseDetectOption } from './core'
import data from './profiles/normal.json'
import { detectAllPotentialGrams, detectPotentialGrams, detectUniqueGrams } from './tokenizer'

const profiles = data as ILangProfiles

export function detect(text: string): string {
export function detect(text: string, opts?: Partial<DetectOption>): string {
const options = parseDetectOption(opts)
if (!isString(text)) return ''

const txt = cleanString(text) // clean input
if (!txt) return ''

const res = detectUniqueGrams(txt, profiles) // pass 1 : unique grams
const res = detectUniqueGrams(txt, profiles, options) // pass 1 : unique grams
if (res !== '') return res

return detectPotentialGrams(normalize(txt), profiles) // pass 2 : use probabilities
return detectPotentialGrams(normalize(txt), profiles, options) // pass 2 : use probabilities
}

export function detectAll(text: string, verbose = false): { lang: string; accuracy: number }[] {
export function detectAll(text: string, opts?: Partial<DetectOption>): { lang: string; accuracy: number }[] {
const options = parseDetectOption(opts)
if (!isString(text)) return []

const txt = cleanString(text) // clean input
if (!txt) return []

const res = detectUniqueGrams(txt, profiles) // pass 1 : unique grams
const res = detectUniqueGrams(txt, profiles, options) // pass 1 : unique grams
if (res !== '') return [{ lang: res, accuracy: 1 }]

return detectAllPotentialGrams(normalize(txt), profiles, verbose) // pass 2 : use probabilities
return detectAllPotentialGrams(normalize(txt), profiles, options) // pass 2 : use probabilities
}

export { cleanString } from './clean'
Expand Down
16 changes: 9 additions & 7 deletions src/index_light.ts
Original file line number Diff line number Diff line change
@@ -1,32 +1,34 @@
import { cleanString, isString, normalize } from './clean'
import { ILangProfiles } from './core'
import { DetectOption, ILangProfiles, parseDetectOption } from './core'
import data from './profiles/light.json'
import { detectAllPotentialGrams, detectPotentialGrams, detectUniqueGrams } from './tokenizer'

const profiles = data as ILangProfiles

export function detect(text: string): string {
export function detect(text: string, opts?: Partial<DetectOption>): string {
const options = parseDetectOption(opts)
if (!isString(text)) return ''

const txt = cleanString(text) // clean input
if (!txt) return ''

const res = detectUniqueGrams(txt, profiles) // pass 1 : unique grams
const res = detectUniqueGrams(txt, profiles, options) // pass 1 : unique grams
if (res !== '') return res

return detectPotentialGrams(normalize(txt), profiles) // pass 2 : use probabilities
return detectPotentialGrams(normalize(txt), profiles, options) // pass 2 : use probabilities
}

export function detectAll(text: string, verbose = false): { lang: string; accuracy: number }[] {
export function detectAll(text: string, opts?: Partial<DetectOption>): { lang: string; accuracy: number }[] {
const options = parseDetectOption(opts)
if (!isString(text)) return []

const txt = cleanString(text) // clean input
if (!txt) return []

const res = detectUniqueGrams(txt, profiles) // pass 1 : unique grams
const res = detectUniqueGrams(txt, profiles, options) // pass 1 : unique grams
if (res !== '') return [{ lang: res, accuracy: 1 }]

return detectAllPotentialGrams(normalize(txt), profiles, verbose) // pass 2 : use probabilities
return detectAllPotentialGrams(normalize(txt), profiles, options) // pass 2 : use probabilities
}

export { cleanString } from './clean'
Expand Down
Loading

0 comments on commit 9d906ab

Please sign in to comment.