Skip to content

Commit

Permalink
Update by commit
Browse files Browse the repository at this point in the history
  • Loading branch information
threelittlemonkeys committed Nov 2, 2018
1 parent 33897a0 commit 7ded2ff
Show file tree
Hide file tree
Showing 11 changed files with 5,062 additions and 5,064 deletions.
8 changes: 3 additions & 5 deletions model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import torch
import torch.nn as nn

# Model-wide hyperparameters (the remainder of this list is elided in this view).
UNIT = "char" # unit for tokenization (char, word)
BATCH_SIZE = 64 # number of sequences per batch
EMBED_SIZE = 300 # presumably the embedding dimension; usage not visible here, confirm
HIDDEN_SIZE = 1000 # presumably the recurrent hidden-state size; usage not visible here, confirm
Expand All @@ -13,8 +14,8 @@
SAVE_EVERY = 10

PAD = "<PAD>" # padding
EOS = "<EOS>" # end of sequence
SOS = "<SOS>" # start of sequence
EOS = "<EOS>" # end of sequence
UNK = "<UNK>" # unknown token

PAD_IDX = 0
Expand Down Expand Up @@ -138,7 +139,7 @@ def decode(self, h, mask): # Viterbi decoding
best_path = [[i] for i in best_tag.tolist()]
for b in range(BATCH_SIZE):
x = best_tag[b] # best tag
l = int(scalar(mask[b].sum()))
l = mask[b].sum().int().tolist()
for bptr_t in reversed(bptr[b][:l]):
x = bptr_t[x]
best_path[b].append(x)
Expand All @@ -163,9 +164,6 @@ def zeros(*args):
x = torch.zeros(*args)
return x.cuda() if CUDA else x

def scalar(x):
    """Return the first element of tensor ``x`` as a plain Python number.

    Args:
        x: a tensor with at least one element (any shape, including 0-dim).

    Returns:
        The first element in flattened (row-major) order, converted to the
        matching Python scalar type.

    Raises:
        IndexError: if ``x`` has no elements.
    """
    # reshape (unlike view) also accepts non-contiguous tensors, and indexing
    # before item() avoids converting the whole tensor into a Python list
    # just to read a single value.
    return x.reshape(-1)[0].item()

def log_sum_exp(x):
    """Compute ``log(sum(exp(x)))`` over the last dimension of ``x``.

    Args:
        x: a floating-point tensor with at least one dimension.

    Returns:
        A tensor with the last dimension of ``x`` reduced away.
    """
    # The built-in reduction is numerically stable and, unlike a manual
    # "subtract the max" implementation, does not produce NaN when a row is
    # entirely -inf or contains +inf (where x - max would be inf - inf).
    return torch.logsumexp(x, -1)
4 changes: 2 additions & 2 deletions predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def run_model(model, idx_to_tag, data):
batch = [x + [PAD_IDX] * (batch_len - len(x)) for _, _, x in data]
result = model.decode(LongTensor(batch))
for i in range(z):
data[i] = data[i][:-1] + [idx_to_tag[j] for j in result[i]]
data[i] = data[i][:-1] + [tuple([idx_to_tag[j] for j in result[i] if j > EOS_IDX])]
return [(x[1], x[2]) for x in sorted(data[:z])]

def predict():
Expand All @@ -33,7 +33,7 @@ def predict():
fo = open(sys.argv[4])
for line in fo:
line = line.strip()
x = tokenize(line, "char")
x = tokenize(line, UNIT)
x = [word_to_idx[i] if i in word_to_idx else UNK_IDX for i in x] + [EOS_IDX]
data.append([idx, line, x])
if len(data) == BATCH_SIZE:
Expand Down
4 changes: 2 additions & 2 deletions sequence-labelling/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

def load_data():
data = []
word_to_idx = {PAD: PAD_IDX, EOS: EOS_IDX, UNK: UNK_IDX}
tag_to_idx = {PAD: PAD_IDX, EOS: EOS_IDX, SOS: SOS_IDX}
word_to_idx = {PAD: PAD_IDX, SOS: SOS_IDX, EOS: EOS_IDX, UNK: UNK_IDX}
tag_to_idx = {PAD: PAD_IDX, SOS: SOS_IDX, EOS: EOS_IDX}
fo = open(sys.argv[1])
for line in fo:
line = line.strip()
Expand Down
2 changes: 1 addition & 1 deletion train.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def train():
loss = torch.mean(model(x, y)) # forward pass and compute loss
loss.backward() # compute gradients
optim.step() # update parameters
loss = scalar(loss)
loss = loss.tolist()
loss_sum += loss
timer = time.time() - timer
loss_sum /= len(data)
Expand Down
6 changes: 3 additions & 3 deletions word-segmentation/prepare.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import sys
import re
from model import SOS, EOS, PAD, UNK, SOS_IDX, EOS_IDX, PAD_IDX, UNK_IDX
from model import PAD, EOS, SOS, UNK, PAD_IDX, EOS_IDX, SOS_IDX, UNK_IDX

MIN_LEN = 2
MAX_LEN = 50

def load_data():
data = []
word_to_idx = {PAD: PAD_IDX, EOS: EOS_IDX, UNK: UNK_IDX}
tag_to_idx = {PAD: PAD_IDX, EOS: EOS_IDX, SOS: SOS_IDX}
word_to_idx = {PAD: PAD_IDX, EOS: EOS_IDX, SOS: SOS_IDX, UNK: UNK_IDX}
tag_to_idx = {PAD: PAD_IDX, EOS: EOS_IDX}
# IOB tags
tag_to_idx["B"] = len(tag_to_idx)
tag_to_idx["I"] = len(tag_to_idx)
Expand Down
9,970 changes: 4,985 additions & 4,985 deletions word-segmentation/sample.ko.csv

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion word-segmentation/sample.ko.tag_to_idx
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
<PAD>
<EOS>
<SOS>
B
I
1 change: 1 addition & 0 deletions word-segmentation/sample.ko.word_to_idx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<PAD>
<SOS>
<EOS>
<UNK>
1
Expand Down
128 changes: 64 additions & 64 deletions word-segmentation/sample.zh.csv

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion word-segmentation/sample.zh.tag_to_idx
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
<PAD>
<EOS>
<SOS>
B
I
1 change: 1 addition & 0 deletions word-segmentation/sample.zh.word_to_idx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<PAD>
<SOS>
<EOS>
<UNK>
Expand Down

0 comments on commit 7ded2ff

Please sign in to comment.