Skip to content

Commit

Permalink
Added python script to determine the language.
Browse files Browse the repository at this point in the history
  • Loading branch information
dietercastel committed Oct 2, 2019
1 parent ab16ad0 commit 8d90ec6
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 11 deletions.
48 changes: 48 additions & 0 deletions classlang.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env python3
#
# This script classifies the language of a file based on an advanced NLP AI model.
# (Well, or just some clever programming, actually.)
#
# Usage:
# python3 classlang.py filetocheck
# Returns:
# A string corresponding to the most likely language in the file.
#
# To be used in conjunction with vim spelllang setting to automatically select the language.

import sys

# Only look at the first 100 lines of the file named on the command line.
# The slice end is 100 (not 99) so that exactly 100 lines are kept, and the
# context manager closes the file as soon as the lines have been read.
# Nothing is printed here: stdout is reserved for the detected language.
with open(sys.argv[1], encoding='utf-8') as sourceFile:
    startoffile = sourceFile.readlines()[:100]

# Marker strings per language code: very common short words, either padded
# with spaces (mid-sentence) or capitalised (start of sentence).
commonWordsDict = {
    "en_us": [" the ", " and ", " a ", " to ", "The ", " an "],
    "nl": [" de ", " en ", " in ", " van ", " op ", "De ", "Het "],
}

# Language codes in dictionary order, and a position -> language code lookup
# over that same order. No debug output: stdout must only carry the result.
langs = list(commonWordsDict)
indexDict = dict(enumerate(langs))

def countOccurences(lang):
    """Count how often the marker words of *lang* occur in the sampled lines.

    Args:
        lang: A key of ``commonWordsDict`` (for example ``"en_us"``).

    Returns:
        The number of non-overlapping occurrences of every marker string
        listed for *lang*, summed over all lines in ``startoffile``.

    Raises:
        KeyError: If *lang* is not a key of ``commonWordsDict``.
    """
    words = commonWordsDict[lang]
    # Deliberately silent: printing every line/word pair here would bury the
    # single language name that the script is supposed to write to stdout.
    return sum(line.count(word) for line in startoffile for word in words)

def classifyLanguage():
    """Print the language whose marker words occur most often.

    Every language code in ``langs`` is scored with ``countOccurences`` and
    the best-scoring code is written to stdout. On a tie the language that
    comes first in ``langs`` wins, because ``max`` returns the first maximal
    element it encounters.

    Raises:
        ValueError: If ``langs`` is empty.
    """
    # Only the winning language code is printed, so the output can be used
    # directly (e.g. as a vim ``spelllang`` value) without any filtering.
    print(max(langs, key=countOccurences))

# Script entry point: classify the file named on the command line and print the result.
classifyLanguage()
11 changes: 0 additions & 11 deletions getlang.py

This file was deleted.

0 comments on commit 8d90ec6

Please sign in to comment.