augment.py
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 16 14:02:46 2017
@author: tunoat
"""
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
def prepare_sentence_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

def prepare_word_sequence(seq, to_ix):
    idxs = []
    for word in seq:
        idxs.append([to_ix[ch] for ch in word])
    tensor = []
    for i in idxs:
        tensor.append(autograd.Variable(torch.LongTensor(i)))
    return tensor
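# Illustrative usage (toy mappings assumed here, not the vocabularies built below):
# prepare_sentence_sequence maps a token list to a single 1-D LongTensor of word
# indices, while prepare_word_sequence returns one LongTensor of character indices
# per word.
#
#   demo_word_to_ix = {"the": 0, "dog": 1}
#   demo_char_to_ix = {"t": 1, "h": 2, "e": 3, "d": 4, "o": 5, "g": 6}
#   prepare_sentence_sequence(["the", "dog"], demo_word_to_ix)  # -> LongTensor([0, 1])
#   prepare_word_sequence(["the", "dog"], demo_char_to_ix)      # -> [LongTensor([1, 2, 3]), LongTensor([4, 5, 6])]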
training_data = [
    ("The dog happily ate the apple".split(), ["DET", "NN", "ADV", "V", "DET", "NN"]),
    ("Everybody read that book silently".split(), ["NN", "V", "DET", "NN", "ADV"])
]

word_to_ix = {}
char_to_ix = {'PAD': 0}
training_data2 = []
for sent, tags in training_data:
    for word in sent:
        training_data2.append((list(word), word))
        for char in word:
            if char not in char_to_ix:
                char_to_ix[char] = len(char_to_ix)
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
print(char_to_ix)
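# Expected mappings from the prints above (indices follow first-seen order;
# the printed dict ordering may vary by Python version):
#   word_to_ix: {'The': 0, 'dog': 1, 'happily': 2, 'ate': 3, 'the': 4, 'apple': 5,
#                'Everybody': 6, 'read': 7, 'that': 8, 'book': 9, 'silently': 10}
#   char_to_ix starts with {'PAD': 0} and adds each new character as it is seen.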
tag_to_ix = {"DET": 0, "NN": 1, "V": 2, "ADV": 3}
# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
WORD_EMBEDDING_DIM = 6
CHAR_EMBEDDING_DIM = 9
WORD_HIDDEN_DIM = 6
CHAR_REP_DIM = 3
######################################################################
# Create the model:
class LSTMTagger(nn.Module):

    def __init__(self, word_embedding_dim, char_embedding_dim, word_hidden_dim,
                 word_vocab_size, char_vocab_size, char_rep_dim, tagset_size):
        super(LSTMTagger, self).__init__()
        self.char_rep_dim = char_rep_dim
        self.word_hidden_dim = word_hidden_dim
        self.char_embeddings = nn.Embedding(char_vocab_size, char_embedding_dim)
        self.word_embeddings = nn.Embedding(word_vocab_size, word_embedding_dim)
        # The character-level LSTM reads character embeddings; its last output is
        # used as a fixed-size (char_rep_dim) representation of each word.
        self.lstm_word = nn.LSTM(char_embedding_dim, char_rep_dim)
        # The sentence-level LSTM takes each word embedding concatenated with its
        # character-level representation, and outputs hidden states of size
        # word_hidden_dim.
        self.lstm_sentence = nn.LSTM(word_embedding_dim + char_rep_dim, word_hidden_dim)
        self.word_hidden2tag = nn.Linear(word_hidden_dim, tagset_size)
        self.word_hidden = self.word_init_hidden()
        self.sentence_hidden = self.sentence_init_hidden()

    def word_init_hidden(self):
        return (autograd.Variable(torch.zeros(1, 1, self.char_rep_dim)),
                autograd.Variable(torch.zeros(1, 1, self.char_rep_dim)))

    def sentence_init_hidden(self):
        return (autograd.Variable(torch.zeros(1, 1, self.word_hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.word_hidden_dim)))

    def forward(self, word_list, sentence):
        representative_word_output = []
        for word in word_list:
            char_embeds = self.char_embeddings(word)
            word_lstm_out, self.word_hidden = self.lstm_word(
                char_embeds.view(len(word), 1, -1), self.word_hidden)
            # Keep only the last time step as this word's character-level representation.
            representative_word_output.append(word_lstm_out[-1])
        rep_word_for_aug = representative_word_output[0]
        for i in representative_word_output[1:]:
            rep_word_for_aug = torch.cat((rep_word_for_aug, i), 0)
        word_embeds = self.word_embeddings(sentence)
        # Augment each word embedding with its character-level representation.
        aug_word_embeds = torch.cat((word_embeds, rep_word_for_aug), 1)
        sentence_lstm_out, self.sentence_hidden = self.lstm_sentence(
            aug_word_embeds.view(len(sentence), 1, -1), self.sentence_hidden)
        tag_space = self.word_hidden2tag(sentence_lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
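# Shape summary for forward() (for reference; the batch size is fixed at 1 here):
#   char_embeds      : (len(word), char_embedding_dim)
#   word_lstm_out    : (len(word), 1, char_rep_dim); word_lstm_out[-1] is (1, char_rep_dim)
#   rep_word_for_aug : (len(sentence), char_rep_dim)
#   word_embeds      : (len(sentence), word_embedding_dim)
#   aug_word_embeds  : (len(sentence), word_embedding_dim + char_rep_dim)
#   tag_scores       : (len(sentence), tagset_size), log-probabilities per word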
######################################################################
# Train the model:
model = LSTMTagger(WORD_EMBEDDING_DIM, CHAR_EMBEDDING_DIM, WORD_HIDDEN_DIM, len(word_to_ix), len(char_to_ix), CHAR_REP_DIM, len(tag_to_ix))
optimizer = optim.SGD(model.parameters(), lr=0.1)
loss_function = nn.NLLLoss()
for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance.
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.word_hidden = model.word_init_hidden()
        model.sentence_hidden = model.sentence_init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        sentence_in = prepare_sentence_sequence(sentence, word_to_ix)
        word_list_in = prepare_word_sequence(sentence, char_to_ix)
        targets = prepare_sentence_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(word_list_in, sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward(retain_graph=True)
        optimizer.step()
    print(epoch)
# See what the scores are after training
inputs = prepare_sentence_sequence(training_data[0][0], word_to_ix)
word_list_in = prepare_word_sequence(training_data[0][0], char_to_ix)
tag_scores = model(word_list_in, inputs)
# The sentence is "The dog happily ate the apple". Entry (i, j) is the score for
# tag j of word i, and the predicted tag for each word is the highest-scoring one.
# After training, the argmax of each row should give the sequence 0 1 3 2 0 1,
# which is DET NN ADV V DET NN - the correct sequence!
print(tag_scores)
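# Optional sanity check: recover the predicted tag names by taking the argmax of
# each row of log-probabilities and mapping indices back to tags.
ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}
_, predicted_ix = torch.max(tag_scores, 1)
predicted_tags = [ix_to_tag[int(i)] for i in predicted_ix.data]
print(predicted_tags)  # should read ['DET', 'NN', 'ADV', 'V', 'DET', 'NN'] if training converged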