Merge branch 'develop-fixed' into develop
Hobson Lane committed Jun 30, 2020
2 parents 6315770 + 2f69b89 commit 803c459
Showing 5 changed files with 49 additions and 68 deletions.
1 change: 1 addition & 0 deletions .coveragerc
@@ -5,6 +5,7 @@ source = */nlpia/*
omit =
src/nlpia/data/*
src/nlpia/book/*
src/nlpia/second_edition/*
src/nlpia/talk.py
src/nlpia/highd.py
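
A minimal sketch of the same exclusions expressed through the coverage.py API, to make explicit what the new `src/nlpia/second_edition/*` entry does; everything outside the `omit` list is illustrative, not part of this commit:

```python
import coverage

# Mirror the .coveragerc omit patterns shown above, including the new
# second_edition entry, so those files never count against coverage.
cov = coverage.Coverage(omit=[
    "src/nlpia/data/*",
    "src/nlpia/book/*",
    "src/nlpia/second_edition/*",  # added in this commit
    "src/nlpia/talk.py",
    "src/nlpia/highd.py",
])
cov.start()
import nlpia.loaders  # noqa: F401  # exercise some package code (illustrative)
cov.stop()
cov.report()
```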

23 changes: 1 addition & 22 deletions pytest.ini
@@ -15,6 +15,7 @@ norecursedirs =
src/nlpia/book
src/book
src/nlpia/scripts
src/nlpia/second_edition
book
dist
build
@@ -29,25 +30,3 @@ addopts =
--doctest-glob='test*.md'
--ignore='src/nlpia/talk.py'
--ignore='src/nlpia/highd.py'

# python_files (args) glob-style file patterns for Python test module discovery
# python_classes (args) prefixes or glob names for Python test class discovery
# python_functions (args) prefixes or glob names for Python test function and method discovery
# junit_suite_name (string) Test suite name for JUnit report
# doctest_encoding (string) encoding used for doctest files
# cache_dir (string) cache directory path.
# filterwarnings (linelist) Each line specifies a pattern for warnings.filterwarnings. Processed after -W and --pythonwarnings.
# log_print (bool) default value for --no-print-logs
# log_level (string) default value for --log-level
# log_format (string) default value for --log-format
# log_date_format (string) default value for --log-date-format
# log_cli (bool) enable log display during test run (also known as "live logging").
# log_cli_level (string) default value for --log-cli-level
# log_cli_format (string) default value for --log-cli-format
# log_cli_date_format (string) default value for --log-cli-date-format
# log_file (string) default value for --log-file
# log_file_level (string) default value for --log-file-level
# log_file_format (string) default value for --log-file-format
# log_file_date_format (string) default value for --log-file-date-format
# addopts (args) extra command line options
# minversion (string) minimally required pytest version
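
For reference, a hedged sketch of invoking pytest programmatically with the same options that `addopts` and `norecursedirs` now supply, making the effect of adding `src/nlpia/second_edition` to the skip list explicit; `pytest.main` and `-o`/`--override-ini` are standard pytest entry points, the rest is illustrative:

```python
import pytest

# Roughly what pytest.ini now configures: never recurse into second_edition
# during collection, and keep the remaining addopts shown above.
exit_code = pytest.main([
    "--doctest-glob=test*.md",
    "--ignore=src/nlpia/talk.py",
    "--ignore=src/nlpia/highd.py",
    "-o", "norecursedirs=src/nlpia/book src/book src/nlpia/scripts"
          " src/nlpia/second_edition book dist build",
])
```
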
1 change: 1 addition & 0 deletions setup.cfg
@@ -99,6 +99,7 @@ doctest_optionflags =
norecursedirs =
src/nlpia/book
src/book
src/second_edition
book
dist
build
4 changes: 2 additions & 2 deletions src/nlpia/anki.py
@@ -10,7 +10,7 @@
from nlpia.constants import BIGDATA_PATH
from nlpia.loaders import get_data, ANKI_LANGUAGES, LANG2ANKI, nlp

logger = logging.getLogger(__name__)
log = logging.getLogger(__name__)


def get_anki_phrases(lang='english', limit=None):
@@ -88,7 +88,7 @@ def get_word_vectors(vocab):
if w in wv:
vectors[i, :] = wv[w]
if not np.sum(np.abs(vectors[i])):
logger.warning('Unable to find {}, {}, or {} in word2vec.'.format(*variations))
log.warning('Unable to find {}, {}, or {} in word2vec.'.format(*variations))
return vectors
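
The only change in this file is renaming the module-level logging handle from `logger` to `log`. A minimal sketch of the pattern, with a hypothetical helper standing in for the real function:

```python
import logging

# One logger per module, named after the module; this commit just shortens
# the variable name from `logger` to `log` across the package.
log = logging.getLogger(__name__)


def warn_if_missing(variations, wv):
    # Hedged illustration of the call in the hunk above: warn rather than
    # raise when none of the spelling variations has a word vector.
    if not any(v in wv for v in variations):
        log.warning('Unable to find {}, {}, or {} in word2vec.'.format(*variations))
```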


88 changes: 44 additions & 44 deletions src/nlpia/loaders.py
@@ -77,8 +77,8 @@

np = pd.np

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
# logging.config.dictConfig(LOGGING_CONFIG)
# # doesn't display line number, etc
# if os.environ.get('DEBUG'):
@@ -130,15 +130,15 @@ def load_imdb_df(dirpath=os.path.join(BIGDATA_PATH, 'aclImdb'), subdirectories=(
urlspath = os.path.join(dirpath, subdirs[0], 'urls_{}.txt'.format(subdirs[1]))
if not os.path.isfile(urlspath):
if subdirs != ('test', 'unsup'): # test/ dir doesn't usually have an unsup subdirectory
logger.warning('Unable to find expected IMDB review list of URLs: {}'.format(urlspath))
log.warning('Unable to find expected IMDB review list of URLs: {}'.format(urlspath))
continue
df = pd.read_csv(urlspath, header=None, names=['url'])
# df.index.name = 'id'
df['url'] = series_strip(df.url, endswith='/usercomments')

textsdir = os.path.join(dirpath, subdirs[0], subdirs[1])
if not os.path.isdir(textsdir):
logger.warning('Unable to find expected IMDB review text subdirectory: {}'.format(textsdir))
log.warning('Unable to find expected IMDB review text subdirectory: {}'.format(textsdir))
continue
filenames = [fn for fn in os.listdir(textsdir) if fn.lower().endswith('.txt')]
df['index0'] = subdirs[0] # TODO: column names more generic so will work on other datasets
@@ -459,11 +459,11 @@ def generate_big_urls_glove(bigurls=None):

try:
BIGDATA_INFO = pd.read_csv(BIGDATA_INFO_FILE, header=0)
logger.warning('Found BIGDATA index in {default} so it will overwrite nlpia.loaders.BIG_URLS !!!'.format(
log.warning('Found BIGDATA index in {default} so it will overwrite nlpia.loaders.BIG_URLS !!!'.format(
default=BIGDATA_INFO_FILE))
except (IOError, pd.errors.EmptyDataError):
BIGDATA_INFO = pd.DataFrame(columns='name url file_size'.split())
logger.info('No BIGDATA index found in {default} so copy {latest} to {default} if you want to "freeze" it.'.format(
log.info('No BIGDATA index found in {default} so copy {latest} to {default} if you want to "freeze" it.'.format(
default=BIGDATA_INFO_FILE, latest=BIGDATA_INFO_LATEST))
BIG_URLS.update(dict(zip(BIGDATA_INFO.name, zip(BIGDATA_INFO.url, BIGDATA_INFO.file_size))))
BIGDATA_INFO = pd.DataFrame(list(
@@ -507,11 +507,11 @@ def rename_file(source, dest):
>>> os.path.isfile(os.path.join(tmpdir, 'Fake_Data.bin.gz'))
True
"""
logger.debug('nlpia.loaders.rename_file(source={}, dest={})'.format(source, dest))
log.debug('nlpia.loaders.rename_file(source={}, dest={})'.format(source, dest))
if not isinstance(source, str):
dest = [dest] if isinstance(dest, str) else dest
return [rename_file(s, d) for (s, d) in zip_longest(source, dest, fillvalue=[source, dest][int(len(source) > len(dest))])]
logger.debug('nlpia.loaders.os.rename(source={}, dest={})'.format(source, dest))
log.debug('nlpia.loaders.os.rename(source={}, dest={})'.format(source, dest))
if source == dest:
return dest
os.rename(source, dest)
@@ -548,7 +548,7 @@ def untar(fname, verbose=True):
if os.path.isdir(dirpath):
return dirpath
else:
logger.warning("Not a tar.gz file: {}".format(fname))
log.warning("Not a tar.gz file: {}".format(fname))


def series_rstrip(series, endswith='/usercomments', ignorecase=True):
@@ -569,7 +569,7 @@ def series_strip(series, startswith=None, endswith=None, startsorendswith=None,
else:
mask = series
if not (startsorendswith or endswith or startswith):
logger.warning('In series_strip(): You must specify endswith, startswith, or startsorendswith string arguments.')
log.warning('In series_strip(): You must specify endswith, startswith, or startsorendswith string arguments.')
return series
if startsorendswith:
startswith = endswith = startsorendswith
@@ -647,7 +647,7 @@ def get_leet_map():
table.append((row['eng'].strip(), s.strip()))
table = pd.DataFrame(table, columns=df.columns)
leet_path = os.path.join(DATA_PATH, 'l33t.csv')
logger.info('Saving l33t dictionary (character mapping) to {}'.format(leet_path))
log.info('Saving l33t dictionary (character mapping) to {}'.format(leet_path))
table.to_csv(leet_path)
return table

@@ -658,7 +658,7 @@ def get_netspeak_map():
df = dfs[0].drop(index=0)
df.columns = ['abbrev', 'definition']
csv_path = os.path.join(DATA_PATH, 'netspeak.csv')
logger.info('Saving netspeak dictionary (word mapping) to {}'.format(csv_path))
log.info('Saving netspeak dictionary (word mapping) to {}'.format(csv_path))
df.to_csv(csv_path)
return df

@@ -763,38 +763,38 @@ def unzip(filepath, verbose=True):
if not os.path.isdir(unzip_dir) or not len(os.listdir(unzip_dir)) == len(z.filelist):
z.extractall(path=unzip_dir)

logger.info('unzip_dir contains: {}'.format(os.listdir(unzip_dir)))
log.info('unzip_dir contains: {}'.format(os.listdir(unzip_dir)))
# for f in os.listdir(unzip_dir):
# if f.lower().endswith('about.txt'):
# os.remove(os.path.join(unzip_dir, f))
for f in tqdm_prog(os.listdir(unzip_dir)):
if f[-1] in ' \t\r\n\f':
bad_path = os.path.join(unzip_dir, f)
logger.warning('Stripping whitespace from end of filename: {} -> {}'.format(
log.warning('Stripping whitespace from end of filename: {} -> {}'.format(
repr(bad_path), repr(bad_path.rstrip())))
shutil.move(bad_path, bad_path.rstrip())
# rename_file(source=bad_path, dest=bad_path.rstrip())
anki_paths = [os.path.join(unzip_dir, f) for f in os.listdir(unzip_dir)
if f.lower()[:3] in ANKI_LANGUAGES and f.lower()[3:] == '.txt']
logger.info('anki_paths: {}'.format(anki_paths))
log.info('anki_paths: {}'.format(anki_paths))

w2v_paths = [os.path.join(BIGDATA_PATH, f[:-4] + '.w2v.txt') for f in os.listdir(unzip_dir)
if f.lower().endswith('.txt') and 'glove' in f.lower()]
for f, word2vec_output_file in zip(os.listdir(unzip_dir), w2v_paths):
glove_input_file = os.path.join(unzip_dir, f)
logger.info('Attempting to converting GloVE format to Word2vec: {} -> {}'.format(
log.info('Attempting to converting GloVE format to Word2vec: {} -> {}'.format(
repr(glove_input_file), repr(word2vec_output_file)))
try:
glove2word2vec(glove_input_file=glove_input_file, word2vec_output_file=word2vec_output_file)
except: # noqa
logger.info('Failed to convert GloVE format to Word2vec: {} -> {}'.format(
log.info('Failed to convert GloVE format to Word2vec: {} -> {}'.format(
repr(glove_input_file), repr(word2vec_output_file)))

txt_paths = [os.path.join(BIGDATA_PATH, f.lower()[:-4] + '.txt') for f in os.listdir(unzip_dir) if f.lower().endswith('.asc')]
for f, txt_file in zip(os.listdir(unzip_dir), txt_paths):
if f.lower().endswith('.asc'):
input_file = os.path.join(unzip_dir, f)
logger.info('Renaming .asc file to .txt: {} -> {}'.format(
log.info('Renaming .asc file to .txt: {} -> {}'.format(
repr(input_file), repr(txt_file)))
shutil.move(input_file, txt_file)

@@ -855,15 +855,15 @@ def download_unzip(names=None, normalize_filenames=False, verbose=True):
if not filepath:
continue
file_paths[name] = normalize_ext_rename(filepath)
logger.debug('downloaded name={} to filepath={}'.format(name, file_paths[name]))
log.debug('downloaded name={} to filepath={}'.format(name, file_paths[name]))
fplower = file_paths[name].lower()
if fplower.endswith('.tar.gz'):
logger.info('Extracting {}'.format(file_paths[name]))
log.info('Extracting {}'.format(file_paths[name]))
file_paths[name] = untar(file_paths[name], verbose=verbose)
logger.debug('download_untar.filepaths=' + str(file_paths))
log.debug('download_untar.filepaths=' + str(file_paths))
elif file_paths[name].lower().endswith('.zip'):
file_paths[name] = unzip(file_paths[name], verbose=verbose)
logger.debug('download_unzip.filepaths=' + str(file_paths))
log.debug('download_unzip.filepaths=' + str(file_paths))
else:
df = pd.read_html(DATA_INFO['url'][name], **DATA_INFO['downloader_kwargs'][name])[-1]
df.columns = clean_columns(df.columns)
@@ -911,22 +911,22 @@ def download_file(url, data_path=BIGDATA_PATH, filename=None, size=None, chunk_s
filepath = os.path.join(data_path, filename)
if normalize_filename:
filepath = normalize_filepath(filepath)
logger.info('expanded+normalized file path: {}'.format(filepath))
log.info('expanded+normalized file path: {}'.format(filepath))
tqdm_prog = tqdm if verbose else no_tqdm
logger.info('requesting URL: {}'.format(url))
log.info('requesting URL: {}'.format(url))

logger.info('remote_size: {}'.format(remote_size))
log.info('remote_size: {}'.format(remote_size))
stat = path_status(filepath)
local_size = stat.get('size', None)
logger.info('local_size: {}'.format(local_size))
log.info('local_size: {}'.format(local_size))

r = None
if not remote_size or not stat['type'] == 'file' or not local_size >= remote_size or not stat['size'] > MIN_DATA_FILE_SIZE:
try:
r = requests_get(url, stream=True, allow_redirects=True, timeout=5)
remote_size = r.headers.get('Content-Length', -1)
except ConnectionError:
logger.error('ConnectionError for url: {} => request {}'.format(url, r))
log.error('ConnectionError for url: {} => request {}'.format(url, r))
remote_size = -1 if remote_size is None else remote_size
except (InvalidURL, InvalidSchema, InvalidHeader, MissingSchema) as e:
log.warning(e)
@@ -941,31 +941,31 @@ def download_file(url, data_path=BIGDATA_PATH, filename=None, size=None, chunk_s
# TODO: check md5 or get the right size of remote file
if stat['type'] == 'file' and local_size >= remote_size and stat['size'] > MIN_DATA_FILE_SIZE:
r = r.close() if r else r
logger.info('retained: {}'.format(filepath))
log.info('retained: {}'.format(filepath))
return filepath

filedir = os.path.dirname(filepath)
created_dir = mkdir_p(filedir)
logger.info('data path created: {}'.format(created_dir))
log.info('data path created: {}'.format(created_dir))
assert os.path.isdir(filedir)
assert created_dir.endswith(filedir)
bytes_downloaded = 0
if r:
logger.info('downloading to: {}'.format(filepath))
log.info('downloading to: {}'.format(filepath))
with open(filepath, 'wb') as f:
for chunk in tqdm_prog(r.iter_content(chunk_size=chunk_size), total=ceil(remote_size / float(chunk_size))):
bytes_downloaded += len(chunk)
if chunk: # filter out keep-alive chunks
f.write(chunk)
r.close()
else:
logger.error(f'Unable to requests.get(url={url}) using request object {r}')
log.error(f'Unable to requests.get(url={url}) using request object {r}')
return None

logger.debug('nlpia.loaders.download_file: bytes={}'.format(bytes_downloaded))
log.debug('nlpia.loaders.download_file: bytes={}'.format(bytes_downloaded))
stat = path_status(filepath)
logger.info("local file stat {}".format(stat))
logger.debug("filepath={}: local_size={}, remote_size={}, downloaded_bytes={}".format(
log.info("local file stat {}".format(stat))
log.debug("filepath={}: local_size={}, remote_size={}, downloaded_bytes={}".format(
filepath, size, remote_size, bytes_downloaded))
return filepath

@@ -1069,11 +1069,11 @@ def get_data(name='sms-spam', nrows=None, limit=None):
"""
nrows = nrows or limit
if name in BIG_URLS:
logger.info('Downloading {}'.format(name))
log.info('Downloading {}'.format(name))
filepaths = download_unzip(name, normalize_filenames=True)
logger.debug('nlpia.loaders.get_data.filepaths=' + str(filepaths))
log.debug('nlpia.loaders.get_data.filepaths=' + str(filepaths))
filepath = filepaths[name][0] if isinstance(filepaths[name], (list, tuple)) else filepaths[name]
logger.debug('nlpia.loaders.get_data.filepath=' + str(filepath))
log.debug('nlpia.loaders.get_data.filepath=' + str(filepath))
filepathlow = filepath.lower()

if len(BIG_URLS[name]) >= 4:
@@ -1124,7 +1124,7 @@ def get_data(name='sms-spam', nrows=None, limit=None):
msg = 'Unable to find dataset "{}"" in {} or {} (*.csv.gz, *.csv, *.json, *.zip, or *.txt)\n'.format(
name, DATA_PATH, BIGDATA_PATH)
msg += 'Available dataset names include:\n{}'.format('\n'.join(DATASET_NAMES))
logger.error(msg)
log.error(msg)
raise IOError(msg)


@@ -1228,7 +1228,7 @@ def clean_column_values(df, inplace=True):
except ValueError:
values = None
except: # noqa
logger.error('Error on column {} with dtype {}'.format(c, df[c].dtype))
log.error('Error on column {} with dtype {}'.format(c, df[c].dtype))
raise

if values is not None:
@@ -1294,7 +1294,7 @@ def cleaner(row):
else:
cleaned = list(cleaned) + [None] * (5 - len(cleaned))
if not np.all(np.array(row.values)[:3] == np.array(cleaned)[:3]):
logger.info('{} => {}'.format(row.values, cleaned))
log.info('{} => {}'.format(row.values, cleaned))
return list(cleaned)

cleancanon = canonical.apply(cleaner, axis=1)
@@ -1321,7 +1321,7 @@ def clean_cornell_movies(filename='cornell_movie_dialogs_corpus.zip', subdir='co
subdir = 'cornell movie-dialogs corpus'
if fullpath_zipfile.lower().endswith('.zip'):
retval = unzip(fullpath_zipfile)
logger.debug(f'unzip({fullpath_zipfile}) return value: {retval}')
log.debug(f'unzip({fullpath_zipfile}) return value: {retval}')
dirname = dirname[:-4]
fullpath_movie_lines = os.path.join(BIGDATA_PATH, dirname, subdir, 'movie_lines.txt')
dialog = pd.read_csv(
@@ -1403,7 +1403,7 @@ def nlp(texts, lang='en', linesep=None, verbose=True):
try:
spacy.cli.download(lang)
except URLError:
logger.warning("Unable to download Spacy language model '{}' so nlp(text) just returns text.split()".format(lang))
log.warning("Unable to download Spacy language model '{}' so nlp(text) just returns text.split()".format(lang))
parse = _parse or str.split
# TODO: reverse this recursion (str first then sequence) to allow for sequences of sequences of texts
if isinstance(texts, str):
@@ -1439,6 +1439,6 @@ def clean_win_tsv(filepath=os.path.join(DATA_PATH, 'Products.txt'),
df = df[~(df[index_col] == INT_NAN)]
df.set_index(index_col, inplace=True)
if len(df) != original_len:
logger.warning(('Loaded {} rows from tsv. Original file, "{}", contained {} seemingly valid lines.' +
'Index column: {}').format(len(df), original_len, filepath, index_col))
log.warning(('Loaded {} rows from tsv. Original file, "{}", contained {} seemingly valid lines.' +
'Index column: {}').format(len(df), original_len, filepath, index_col))
return df
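
One hunk above wraps gensim's GloVe-to-word2vec conversion in a try/except inside `unzip()`. A minimal hedged sketch of that conversion on its own (the file names are assumptions, not paths produced by the loader):

```python
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

# Assumed file names for illustration only.
glove_input_file = "glove.6B.50d.txt"
word2vec_output_file = "glove.6B.50d.w2v.txt"

# glove2word2vec prepends the "<vocab_size> <dimensions>" header that the
# word2vec text format expects; the result then loads like any word2vec file.
glove2word2vec(glove_input_file=glove_input_file,
               word2vec_output_file=word2vec_output_file)
wv = KeyedVectors.load_word2vec_format(word2vec_output_file)
```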
