# text_utils.py
import os
import json
import collections
import warnings

import numpy as np
import tensorflow as tf
import tqdm
from gensim.models.keyedvectors import KeyedVectors
from nltk.tokenize import TweetTokenizer

# Module-level tokenizer shared by the helpers below.
_TOKENIZER = TweetTokenizer()


def preprocess_caption(cap_in):
    # Tokenize with TweetTokenizer, then lowercase and re-join with spaces.
    return ' '.join(_TOKENIZER.tokenize(cap_in)).lower()
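# For instance (assuming NLTK's TweetTokenizer behavior, which splits off
# punctuation), a caption like "A dog runs!" tokenizes to
# ['A', 'dog', 'runs', '!'] and comes back as the string 'a dog runs !'.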


def get_vocab(data, min_count=5, cached=None):
    # Build (or load from `cached`) a word -> index mapping over the
    # tokenized captions. Words occurring fewer than min_count times are
    # dropped; index 0 is reserved for padding and index 1 for unknowns.
    if cached is None or not os.path.exists(cached):
        voc_counter = collections.Counter()
        for c in tqdm.tqdm(data):
            voc_counter.update(preprocess_caption(c).split())
        word2idx = {'<PAD>': 0, '<UNK>': 1}
        idx = len(word2idx)
        # Counts are sorted in descending order, so we can stop at the
        # first word that falls below min_count.
        for v, c in sorted(voc_counter.items(), key=lambda x: x[1], reverse=True):
            if c < min_count:
                break
            word2idx[v] = idx
            idx += 1
        if cached is not None:
            with open(cached, 'w') as f:
                f.write(json.dumps(word2idx))
    else:
        with open(cached) as f:
            word2idx = json.loads(f.read())
    return word2idx
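# Typical use, assuming `captions` is a list of raw caption strings and
# 'vocab.json' is a placeholder cache path:
#   vocab = get_vocab(captions, min_count=5, cached='vocab.json')
# A second call with the same `cached` path skips the counting pass and
# just reloads the JSON mapping.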


def get_word2vec_matrix(vocab, cache_file, word2vec_binary):
    # Build a len(vocab) x 300 embedding matrix: rows for words covered by
    # word2vec are copied from the pretrained vectors; the remaining rows
    # keep their random uniform initialization.
    if cache_file is None and word2vec_binary is None:
        return None
    if cache_file is None or not os.path.exists(cache_file):
        print('Loading word2vec binary...')
        word2vec = KeyedVectors.load_word2vec_format(word2vec_binary, binary=True)
        # Keep only vectors for in-vocabulary words so the cache stays small.
        word2vec_cachable = {}
        for w in vocab:
            if w in word2vec:
                word2vec_cachable[w] = [float(x) for x in word2vec[w]]
        if cache_file is not None:
            with open(cache_file, 'w') as f:
                f.write(json.dumps(word2vec_cachable))
    else:
        with open(cache_file) as f:
            word2vec_cachable = json.loads(f.read())
    word2vec = {w: np.array(v) for w, v in word2vec_cachable.items()}
    m_matrix = np.random.uniform(-.2, .2, size=(len(vocab), 300))
    for w, idx in vocab.items():
        if w in word2vec:
            m_matrix[idx, :] = word2vec[w]
    return m_matrix
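# Sketch of typical use, assuming the pretrained 300-d GoogleNews vectors
# (both paths below are placeholders):
#   emb = get_word2vec_matrix(vocab, 'w2v_cache.json',
#                             'GoogleNews-vectors-negative300.bin')
# The resulting matrix can initialize a tf.keras.layers.Embedding via
# embeddings_initializer=tf.keras.initializers.Constant(emb).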


def text_to_matrix(captions, vocab, max_len=15, padding='post'):
    # Map each caption to a fixed-length row of vocabulary indices.
    seqs = []
    for c in captions:
        tokens = preprocess_caption(c).split()
        # For reasons I don't understand, the new version of cuDNN
        # doesn't play nice with padding, etc. After painstakingly
        # narrowing down why this happens, cuDNN errors when:
        # 1) you're using an RNN, and
        # 2) your batch consists only of fully-padded sequences and
        #    non-padded sequences.
        # I filed a tensorflow issue:
        # see https://github.com/tensorflow/tensorflow/issues/36139
        # According to a TensorFlower's reply, cuDNN currently doesn't like
        # empty sequences, and the cuDNN kernel only gets called when things
        # are right-padded. So, for now, until this bug is fixed, we do the
        # following so we can still use cuDNN:
        # 1) we right/post-pad, and
        # 2) in the data iterator, we prepend an <UNK> to fully-padded
        #    sequences. These sentences don't affect the gradient, and we
        #    expect cuDNN to return junk in those cases anyway, so this
        #    should be fine, but I will verify experimentally.
        idxs = [vocab[v] if v in vocab else vocab['<UNK>'] for v in tokens]
        if len(idxs) == 0:
            warnings.warn(
                'Detected at least one zero-length sentence; execution will '
                'continue, but check your inputs.')
            idxs = [vocab['<UNK>']]
        seqs.append(idxs)
    m_mat = tf.keras.preprocessing.sequence.pad_sequences(
        seqs, maxlen=max_len, padding=padding, truncating='post', value=0)
    return m_mat
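

# A minimal smoke test sketching how these helpers fit together. The captions
# below are made-up placeholders, and no word2vec binary is assumed (the
# embedding-matrix step is skipped), so this only exercises the vocabulary
# and padding paths.
if __name__ == '__main__':
    captions = ['A dog runs on the grass.', 'A cat sleeps.'] * 5
    vocab = get_vocab(captions, min_count=1)
    mat = text_to_matrix(captions, vocab, max_len=8)
    print('vocab size:', len(vocab))
    print('matrix shape:', mat.shape)  # expected: (10, 8)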