tonifuc3m committed Jan 29, 2021
1 parent 4eb33c0 commit 6345435
Showing 3 changed files with 91 additions and 0 deletions.
Binary file added __pycache__/tokenize.cpython-38.pyc
Binary file not shown.
47 changes: 47 additions & 0 deletions sentence_splitter.py
@@ -0,0 +1,47 @@
from sentence_splitter import SentenceSplitter # recommended by jordi
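# NOTE: this script shares its name with the sentence_splitter package;
# when run directly, Python resolves the import to this file first, so
# run it from another directory (or rename the script) for the import to work.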
import argparse
import os

def argparser():
    '''
    DESCRIPTION: Parse command line arguments
    '''
    parser = argparse.ArgumentParser(description='process user given parameters')
    parser.add_argument("-d", "--datapath", required=True, dest="path",
                        help="absolute path to directory with files")

    return parser.parse_args().path


def split_to_sentences(text, target_lang='es'):
    '''
    DESCRIPTION: Split text into sentences.

    Parameters
    ----------
    text : string
        String with entire document.

    Returns
    -------
    sentences : list of str
        List with sentences of document.
    '''
    splitter = SentenceSplitter(language=target_lang)
    return splitter.split(text)


if __name__ == '__main__':
    path = argparser()

    nsent = 0
    for f in os.listdir(path):
        if not f.endswith('.txt'):
            continue
        with open(os.path.join(path, f), encoding='utf-8') as fh:
            txt = fh.read()
        # Accumulate into nsent (the original assigned to ntokens, so nsent stayed 0).
        nsent = nsent + len(split_to_sentences(txt))
    print('Files in {} have {} sentences'.format(path, nsent))
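
A minimal usage sketch for split_to_sentences (assuming the sentence-splitter package is installed; the exact segmentation depends on the library's rules for the chosen language):

    sentences = split_to_sentences('Primera frase. Segunda frase.')
    print(sentences)  # e.g. ['Primera frase.', 'Segunda frase.']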

44 changes: 44 additions & 0 deletions tokenize.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 18 16:12:53 2021
@author: antonio
"""
from spacy.lang.es import Spanish
import os
import argparse
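# NOTE: this script shadows the standard-library tokenize module for any
# code that imports "tokenize" from this directory (see the committed
# __pycache__/tokenize.cpython-38.pyc above).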

def argparser():
    '''
    DESCRIPTION: Parse command line arguments
    '''
    parser = argparse.ArgumentParser(description='process user given parameters')
    parser.add_argument("-d", "--datapath", required=True, dest="path",
                        help="absolute path to directory with files")

    return parser.parse_args().path


def tokenize(text):
    '''
    DESCRIPTION: Tokenize a Spanish text and return the token strings.
    '''
    # Blank Spanish pipeline: only the rule-based tokenizer runs.
    nlp = Spanish()
    doc = nlp(text)
    token_list = []
    for token in doc:
        token_list.append(token.text)
    # (The original also built an unused wrapper list, tokenized; removed as dead code.)
    return token_list

if __name__ == '__main__':
    path = argparser()

    ntokens = 0
    for f in os.listdir(path):
        if not f.endswith('.txt'):
            continue
        with open(os.path.join(path, f), encoding='utf-8') as fh:
            txt = fh.read()
        ntokens = ntokens + len(tokenize(txt))
    print('Files in {} have {} tokens'.format(path, ntokens))
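
A quick sanity check for tokenize (assuming spaCy is installed; since Spanish() builds a blank pipeline, only the rule-based tokenizer is applied):

    tokens = tokenize('El paciente presenta fiebre.')
    print(tokens)  # e.g. ['El', 'paciente', 'presenta', 'fiebre', '.']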
