add CLI + standardize options object

komodojp · Aug 11, 2021 · 9d906ab · 9d906ab
1 parent 311424a
commit 9d906ab
Show file tree

Hide file tree

Showing 12 changed files with 7,875 additions and 7,528 deletions.
diff --git a/.eslintignore b/.eslintignore
@@ -1,3 +1,4 @@
 node_modules
+bin
 dist
 tests
diff --git a/Readme.md b/Readme.md
@@ -46,14 +46,42 @@ detectAll('ceci est un text en francais.') // [ { lang: 'fr', accuracy: 0.5238 }
 
 ---
 
-## TinyLD (Light Flavor, for web usage)
+### **TinyLD CLI**
+
+From time to time, it can be easier to use the library from a terminal _(for example, when testing or debugging)_
+
+```sh
+tinyld This is the text that I want to check
+# [ { lang: 'en', accuracy: 1 } ]
+
+tinyld これはテストです
+# [ { lang: 'ja', accuracy: 1 } ]
+
+tinyld Єсть на світі доля
+# [ { lang: 'uk', accuracy: 1 } ]
+```
+
+_Options_
+
+- `--verbose` : Get an explanation of why **TinyLD** picks a language
+- `--only=en,ja,fr` : Restrict the detection to a subset of languages
+
+Can also be run with:
+
+- Npx: `npx tinyld [message]`
+- Yarn: `yarn tinyld [message]`
+- Bash: `./node_modules/.bin/tinyld [message]`
+
+---
+
+### **TinyLD** (Light Flavor, for web usage)
 
 The normal library can be a bit massive (mostly caused by the language profile database), which can be problematic for web usage.
 
 For this usage we also provide a lighter version (a tradeoff between disk size and accuracy)
 
 - import with: `import { detect } from 'tinyld/dist/tinyld.light.cjs'`
-- normal version ~800KB, light version is only ~90KB
+- normal version ~800KB, light version is only ~90KB (~25KB with gzip)
 - only 30 languages supported
 - slightly less accurate, only ~90%
 
@@ -101,8 +129,8 @@ At the end, sort by score and return the most probable one.
 
 | Library        | Script                      | Properly Identified | Improperly identified | Not identified | Avg Execution Time | Disk Size |
 | -------------- | --------------------------- | ------------------- | --------------------- | -------------- | ------------------ | --------- |
-| TinyLD         | `yarn bench:tinyld`         | 95.8876%%           | 4.1124%               | 0%             | 45.4203ms.         | 878KB     |
-| TinyLD Light   | `yarn bench:tinyld-light`   | 91.822%             | 8.178%                | 0%             | 36.4051ms.         | 92KB      |
+| TinyLD         | `yarn bench:tinyld`         | 95.6304%            | 4.3696%               | 0%             | 50.4203ms.         | 878KB     |
+| TinyLD Light   | `yarn bench:tinyld-light`   | 91.7805%            | 8.2195%               | 0%             | 38.4051ms.         | 92KB      |
 | node-cld       | `yarn bench:cld`            | 87.1121%            | 1.8074%               | 11.08%         | 56.38ms.           | > 10MB    |
 | franc          | `yarn bench:franc`          | 65.3913%            | 34.6087%              | 0%             | 132.59ms.          | 353.5kb   |
 | languagedetect | `yarn bench:languagedetect` | 58.0877%            | 13.4809%              | 28.4414%       | 159.56ms.          | 243.6kb   |

diff --git a/bin/tinyld.js b/bin/tinyld.js
@@ -0,0 +1,29 @@
+#! /usr/bin/env node
+const { detectAll } = require('../dist/tinyld.cjs')
+
+function main() {
+  const [, , ...args] = process.argv
+
+  let onlyLangs = []
+  let verbose = false
+
+  const texts = []
+  for (const arg of [...args]) {
+    if (arg.startsWith('--only=')) {
+      onlyLangs = arg.replace('--only=', '').split(',')
+      continue
+    }
+
+    if (arg.startsWith('--verbose') || arg.startsWith('-v')) {
+      verbose = true
+      continue
+    }
+
+    texts.push(arg)
+  }
+  const message = texts.join(' ')
+  const options = { only: onlyLangs, verbose }
+  console.log(detectAll(message, options))
+}
+
+main()
diff --git a/package.json b/package.json
@@ -1,10 +1,13 @@
 {
   "name": "tinyld",
-  "version": "1.0.5",
+  "version": "1.0.6",
   "main": "./dist/tinyld.cjs.js",
   "module": "./dist/tinyld.esm.js",
   "license": "MIT",
   "types": "./src/index.ts",
+  "bin": {
+    "tinyld": "./bin/tinyld.js"
+  },
   "keywords": [
     "language",
     "detection",

diff --git a/src/clean/index.ts b/src/clean/index.ts
@@ -2,15 +2,25 @@ export function isString(value: unknown): boolean {
   return typeof value === 'string' || value instanceof String
 }
 
+function stripPunctuation(val: string): string { // Replaces every punctuation character in the class below with a single space
+  return val.replace(/[,.。，、!¿?！？;:…/„“«»”"“_–—~\\/]/gi, ' ')
+}
+
+function stripNumbers(val: string): string { // Deletes every ASCII digit 0-9 (removed outright, not replaced by a space)
+  return val.replace(/[0-9]/g, '')
+}
+
+function replaceFullwidthNumbers(val: string): string { // Converts fullwidth digits U+FF10..U+FF19 to their ASCII counterparts
+  return val.replace(/[\uFF10-\uFF19]/g, function (m) {
+    return String.fromCharCode(m.charCodeAt(0) - 0xfee0) // 0xFF10 - 0xFEE0 = 0x30, the code unit of ASCII zero
+  })
+}
+
 export function cleanString(value: string): string {
-  const data = value
-    .replace(/’/gi, "'")
-    .replace(/[,.。，、!¿?！？;:…/„“«»”"“_–—~\\/]/gi, ' ')
-    .replace(/[0-9]/g, '')
+  const data = value.replace(/’/gi, "'") // normalize the typographic apostrophe to the ASCII one
+  return stripPunctuation(stripNumbers(replaceFullwidthNumbers(data.toLowerCase()))) // lowercase, then fullwidth digits to ASCII, then drop digits, then punctuation to spaces
     .replace(/\s\s+/g, ' ')
     .trim()
-    .toLowerCase()
-  return data
 }
 
 export function normalize(value: string): string {

diff --git a/src/core.ts b/src/core.ts
@@ -48,6 +48,17 @@ export function isExtraSample(country: string): boolean {
 
 type LangOption = { code: string; alias?: string[]; skipLight?: boolean; skipProb?: boolean; extraSample?: boolean }
 
+export const parseDetectOption = (options?: Partial<DetectOption>): DetectOption => { // Builds a complete option object from a possibly partial or missing one
+  const data = { only: [], verbose: false } // defaults: empty language list, verbose off
+  if (!options) return data
+  return Object.assign(data, options) // caller-supplied keys overwrite the defaults, including keys explicitly set to undefined
+}
+
+export interface DetectOption { // Fully-populated option set returned by parseDetectOption
+  only: string[] // list of language codes; the CLI fills it from --only=en,ja,fr
+  verbose: boolean // the CLI sets it from the --verbose flag
+}
+
 // Map ISO 639-3 <-> ISO 639-1
 const langMap: { [id: string]: LangOption } = {
   // asia

diff --git a/src/index.ts b/src/index.ts
@@ -1,32 +1,34 @@
 import { cleanString, isString, normalize } from './clean'
-import { ILangProfiles } from './core'
+import { DetectOption, ILangProfiles, parseDetectOption } from './core'
 import data from './profiles/normal.json'
 import { detectAllPotentialGrams, detectPotentialGrams, detectUniqueGrams } from './tokenizer'
 
 const profiles = data as ILangProfiles
 
-export function detect(text: string): string {
+export function detect(text: string, opts?: Partial<DetectOption>): string { // Returns the detected language, or an empty string when text is not a string or is empty after cleaning
+  const options = parseDetectOption(opts) // fill in defaults for any option the caller omitted
   if (!isString(text)) return ''
 
   const txt = cleanString(text) // clean input
   if (!txt) return ''
 
-  const res = detectUniqueGrams(txt, profiles) // pass 1 : unique grams
+  const res = detectUniqueGrams(txt, profiles, options) // pass 1 : unique grams
   if (res !== '') return res
 
-  return detectPotentialGrams(normalize(txt), profiles) // pass 2 : use probabilities
+  return detectPotentialGrams(normalize(txt), profiles, options) // pass 2 : use probabilities
 }
 
-export function detectAll(text: string, verbose = false): { lang: string; accuracy: number }[] {
+export function detectAll(text: string, opts?: Partial<DetectOption>): { lang: string; accuracy: number }[] { // Returns candidate languages with accuracies, or an empty array when text is not a string or is empty after cleaning
+  const options = parseDetectOption(opts) // fill in defaults for any option the caller omitted
   if (!isString(text)) return []
 
   const txt = cleanString(text) // clean input
   if (!txt) return []
 
-  const res = detectUniqueGrams(txt, profiles) // pass 1 : unique grams
+  const res = detectUniqueGrams(txt, profiles, options) // pass 1 : unique grams
   if (res !== '') return [{ lang: res, accuracy: 1 }]
 
-  return detectAllPotentialGrams(normalize(txt), profiles, verbose) // pass 2 : use probabilities
+  return detectAllPotentialGrams(normalize(txt), profiles, options) // pass 2 : use probabilities
 }
 
 export { cleanString } from './clean'

diff --git a/src/index_light.ts b/src/index_light.ts
@@ -1,32 +1,34 @@
 import { cleanString, isString, normalize } from './clean'
-import { ILangProfiles } from './core'
+import { DetectOption, ILangProfiles, parseDetectOption } from './core'
 import data from './profiles/light.json'
 import { detectAllPotentialGrams, detectPotentialGrams, detectUniqueGrams } from './tokenizer'
 
 const profiles = data as ILangProfiles
 
-export function detect(text: string): string {
+export function detect(text: string, opts?: Partial<DetectOption>): string { // Light-profile variant: returns the detected language, or an empty string when text is not a string or is empty after cleaning
+  const options = parseDetectOption(opts) // fill in defaults for any option the caller omitted
   if (!isString(text)) return ''
 
   const txt = cleanString(text) // clean input
   if (!txt) return ''
 
-  const res = detectUniqueGrams(txt, profiles) // pass 1 : unique grams
+  const res = detectUniqueGrams(txt, profiles, options) // pass 1 : unique grams
   if (res !== '') return res
 
-  return detectPotentialGrams(normalize(txt), profiles) // pass 2 : use probabilities
+  return detectPotentialGrams(normalize(txt), profiles, options) // pass 2 : use probabilities
 }
 
-export function detectAll(text: string, verbose = false): { lang: string; accuracy: number }[] {
+export function detectAll(text: string, opts?: Partial<DetectOption>): { lang: string; accuracy: number }[] { // Light-profile variant: returns candidate languages with accuracies, or an empty array when text is not a string or is empty after cleaning
+  const options = parseDetectOption(opts) // fill in defaults for any option the caller omitted
   if (!isString(text)) return []
 
   const txt = cleanString(text) // clean input
   if (!txt) return []
 
-  const res = detectUniqueGrams(txt, profiles) // pass 1 : unique grams
+  const res = detectUniqueGrams(txt, profiles, options) // pass 1 : unique grams
   if (res !== '') return [{ lang: res, accuracy: 1 }]
 
-  return detectAllPotentialGrams(normalize(txt), profiles, verbose) // pass 2 : use probabilities
+  return detectAllPotentialGrams(normalize(txt), profiles, options) // pass 2 : use probabilities
 }
 
 export { cleanString } from './clean'