tonifuc3m committed Jan 29, 2021
1 parent 4eb33c0 commit 6345435
Showing 3 changed files with 91 additions and 0 deletions.
Binary file added __pycache__/tokenize.cpython-38.pyc
Binary file not shown.
47 changes: 47 additions & 0 deletions sentence_splitter.py
@@ -0,0 +1,47 @@
from sentence_splitter import SentenceSplitter # recommended by jordi
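# NOTE: this script shares its name with the sentence_splitter package;
# when run directly, Python resolves the import to this file first, so
# run it from another directory (or rename the script) for the import to work.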
import argparse
import os

def argparser():
    '''
    DESCRIPTION: Parse command line arguments
    '''
    parser = argparse.ArgumentParser(description='process user given parameters')
    parser.add_argument("-d", "--datapath", required=True, dest="path",
                        help="absolute path to directory with files")

    return parser.parse_args().path


def split_to_sentences(text, target_lang='es'):
    '''
    DESCRIPTION: Split text into sentences.

    Parameters
    ----------
    text : string
        String with entire document.

    Returns
    -------
    sentences : list of str
        List with sentences of document.
    '''
    splitter = SentenceSplitter(language=target_lang)
    return splitter.split(text)


if __name__ == '__main__':
    path = argparser()

    nsent = 0
    for f in os.listdir(path):
        if not f.endswith('.txt'):
            continue
        with open(os.path.join(path, f), encoding='utf-8') as fh:
            txt = fh.read()
        # Accumulate into nsent (the original assigned to ntokens, so nsent stayed 0).
        nsent = nsent + len(split_to_sentences(txt))
    print('Files in {} have {} sentences'.format(path, nsent))
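
A minimal usage sketch for split_to_sentences (assuming the sentence-splitter package is installed; the exact segmentation depends on the library's rules for the chosen language):

    sentences = split_to_sentences('Primera frase. Segunda frase.')
    print(sentences)  # e.g. ['Primera frase.', 'Segunda frase.']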

44 changes: 44 additions & 0 deletions tokenize.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 18 16:12:53 2021
@author: antonio
"""
from spacy.lang.es import Spanish
import os
import argparse
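# NOTE: this script shadows the standard-library tokenize module for any
# code that imports "tokenize" from this directory (see the committed
# __pycache__/tokenize.cpython-38.pyc above).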

def argparser():
    '''
    DESCRIPTION: Parse command line arguments
    '''
    parser = argparse.ArgumentParser(description='process user given parameters')
    parser.add_argument("-d", "--datapath", required=True, dest="path",
                        help="absolute path to directory with files")

    return parser.parse_args().path


def tokenize(text):
    '''
    DESCRIPTION: Tokenize a Spanish text and return the token strings.
    '''
    # Blank Spanish pipeline: only the rule-based tokenizer runs.
    nlp = Spanish()
    doc = nlp(text)
    token_list = []
    for token in doc:
        token_list.append(token.text)
    # (The original also built an unused wrapper list, tokenized; removed as dead code.)
    return token_list

if __name__ == '__main__':
    path = argparser()

    ntokens = 0
    for f in os.listdir(path):
        if not f.endswith('.txt'):
            continue
        with open(os.path.join(path, f), encoding='utf-8') as fh:
            txt = fh.read()
        ntokens = ntokens + len(tokenize(txt))
    print('Files in {} have {} tokens'.format(path, ntokens))
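
A quick sanity check for tokenize (assuming spaCy is installed; since Spanish() builds a blank pipeline, only the rule-based tokenizer is applied):

    tokens = tokenize('El paciente presenta fiebre.')
    print(tokens)  # e.g. ['El', 'paciente', 'presenta', 'fiebre', '.']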
