from collections import Counter

STUDENT = {'name': 'Dorin Keshales'}

# Module-level globals, populated by load_train_set: 'keys' maps a label ID back
# to its language string, and 'features' holds the extracted feature vocabulary.
keys = []
features = []

# Reading the data from the requested file.
def read_data(fname):
    with open(fname, "r", encoding="utf-8") as file:
        data = []
        for line in file:
            # Each line holds a tab-separated label and text.
            label, text = line.strip().lower().split("\t", 1)
            data.append((label, text))
    return data
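
# For illustration (hypothetical data): a line such as "en\tThe quick fox"
# would be read back as the pair ('en', 'the quick fox').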

# Splitting the text into character bigrams.
def text_to_bigrams(text):
    return ["%s%s" % (c1, c2) for c1, c2 in zip(text, text[1:])]

# Splitting the text into character unigrams.
def text_to_unigrams(text):
    return ["%s" % c1 for c1 in text]

# Replacing the label of each example in the dataset with its respective ID.
def language_to_index(dataset, labels):
    for index in range(len(dataset)):
        label = labels.index(dataset[index][0])
        feats = dataset[index][1]
        dataset[index] = (label, feats)
    return dataset
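
# e.g. with labels ['en', 'fr'], the example ('fr', feats) becomes (1, feats).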

# Mapping the ID of a predicted label back to the respective label (=language).
def index_to_language(pred):
    # Relies on the global 'keys' list populated by load_train_set.
    language = keys[pred]
    return language

# Returns the list of the most common features extracted from the training set.
def get_common_features():
    # Relies on the global 'features' list populated by load_train_set.
    return features

# Loading the test set data (always using the bigram representation).
def load_test_set():
    TEST = [(l, text_to_bigrams(t)) for l, t in read_data("test")]
    return TEST

# Loading the validation set data according to the requested representation of the features.
def load_validation_set(representation):
    DEV = [(l, text_to_bigrams(t)) for l, t in read_data("dev")] if representation == 'bigrams' else [
        (l, text_to_unigrams(t)) for l, t in read_data("dev")]
    # Uses the global 'keys' list, so load_train_set must be called first.
    return language_to_index(DEV, keys)

# Loading the training set data according to the requested representation of the features,
# and extracting the most common features of the training set.
def load_train_set(representation):
    TRAIN = [(l, text_to_bigrams(t)) for l, t in read_data("train")] if representation == 'bigrams' else [
        (l, text_to_unigrams(t)) for l, t in read_data("train")]
    num_desired_features = 700 if representation == 'bigrams' else 90
    # Count feature occurrences over the whole training set.
    fc = Counter()
    for l, feats in TRAIN:
        fc.update(feats)
    # The 700 most common bigrams (or 90 most common unigrams) in the training set.
    vocab = set(x for x, c in fc.most_common(num_desired_features))
    # Label strings to IDs.
    L2I = {l: i for i, l in enumerate(sorted(set(l for l, t in TRAIN)))}
    # Feature strings to IDs.
    F2I = {f: i for i, f in enumerate(sorted(vocab))}
    global keys, features
    features = list(F2I.keys())
    keys = list(L2I.keys())
    # Replacing the label of each training example with its respective ID.
    new_train = language_to_index(TRAIN, keys)
    return new_train, len(features), len(keys)
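
# A minimal usage sketch (assuming files named "train", "dev", and "test" exist,
# matching the loaders above; the call order matters because load_train_set
# populates the 'keys'/'features' globals that the other helpers rely on).
if __name__ == '__main__':
    train_data, num_features, num_labels = load_train_set('bigrams')
    dev_data = load_validation_set('bigrams')
    print("train examples:", len(train_data))
    print("feature vocabulary size:", num_features)
    print("number of languages:", num_labels)
    # Decode a (hypothetical) predicted label ID back to its language string.
    print("label 0 decodes to:", index_to_language(0))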