Update by commit

DeepQuantitative · Nov 2, 2018 · 7ded2ff
1 parent 33897a0
commit 7ded2ff
Show file tree

Hide file tree

Showing 11 changed files with 5,062 additions and 5,064 deletions.
diff --git a/model.py b/model.py
@@ -1,6 +1,7 @@
 import torch
 import torch.nn as nn
 
+UNIT = "char" # unit for tokenization (char, word)
 BATCH_SIZE = 64
 EMBED_SIZE = 300
 HIDDEN_SIZE = 1000
@@ -13,8 +14,8 @@
 SAVE_EVERY = 10
 
 PAD = "<PAD>" # padding
-EOS = "<EOS>" # end of sequence
 SOS = "<SOS>" # start of sequence
+EOS = "<EOS>" # end of sequence
 UNK = "<UNK>" # unknown token
 
 PAD_IDX = 0
@@ -138,7 +139,7 @@ def decode(self, h, mask): # Viterbi decoding
         best_path = [[i] for i in best_tag.tolist()]
         for b in range(BATCH_SIZE):
             x = best_tag[b] # best tag
-            l = int(scalar(mask[b].sum()))
+            l = mask[b].sum().int().tolist()
             for bptr_t in reversed(bptr[b][:l]):
                 x = bptr_t[x]
                 best_path[b].append(x)
@@ -163,9 +164,6 @@ def zeros(*args):
     x = torch.zeros(*args)
     return x.cuda() if CUDA else x
 
-def scalar(x):
-    return x.view(-1).data.tolist()[0]
-
 def log_sum_exp(x):
     m = torch.max(x, -1)[0]
     return m + torch.log(torch.sum(torch.exp(x - m.unsqueeze(-1)), -1))
diff --git a/predict.py b/predict.py
@@ -23,7 +23,7 @@ def run_model(model, idx_to_tag, data):
     batch = [x + [PAD_IDX] * (batch_len - len(x)) for _, _, x in data]
     result = model.decode(LongTensor(batch))
     for i in range(z):
-        data[i] = data[i][:-1] + [idx_to_tag[j] for j in result[i]]
+        data[i] = data[i][:-1] + [tuple([idx_to_tag[j] for j in result[i] if j > EOS_IDX])]
     return [(x[1], x[2]) for x in sorted(data[:z])]
 
 def predict():
@@ -33,7 +33,7 @@ def predict():
     fo = open(sys.argv[4])
     for line in fo:
         line = line.strip()
-        x = tokenize(line, "char")
+        x = tokenize(line, UNIT)
         x = [word_to_idx[i] if i in word_to_idx else UNK_IDX for i in x] + [EOS_IDX]
         data.append([idx, line, x])
         if len(data) == BATCH_SIZE:

diff --git a/sequence-labelling/prepare.py b/sequence-labelling/prepare.py
@@ -8,8 +8,8 @@
 
 def load_data():
     data = []
-    word_to_idx = {PAD: PAD_IDX, EOS: EOS_IDX, UNK: UNK_IDX}
-    tag_to_idx = {PAD: PAD_IDX, EOS: EOS_IDX, SOS: SOS_IDX}
+    word_to_idx = {PAD: PAD_IDX, SOS: SOS_IDX, EOS: EOS_IDX, UNK: UNK_IDX}
+    tag_to_idx = {PAD: PAD_IDX, SOS: SOS_IDX, EOS: EOS_IDX}
     fo = open(sys.argv[1])
     for line in fo:
         line = line.strip()

diff --git a/train.py b/train.py
@@ -49,7 +49,7 @@ def train():
             loss = torch.mean(model(x, y)) # forward pass and compute loss
             loss.backward() # compute gradients
             optim.step() # update parameters
-            loss = scalar(loss)
+            loss = loss.tolist()
             loss_sum += loss
         timer = time.time() - timer
         loss_sum /= len(data)

diff --git a/word-segmentation/prepare.py b/word-segmentation/prepare.py
@@ -1,14 +1,14 @@
 import sys
 import re
-from model import SOS, EOS, PAD, UNK, SOS_IDX, EOS_IDX, PAD_IDX, UNK_IDX
+from model import PAD, EOS, SOS, UNK, PAD_IDX, EOS_IDX, SOS_IDX, UNK_IDX
 
 MIN_LEN = 2
 MAX_LEN = 50
 
 def load_data():
     data = []
-    word_to_idx = {PAD: PAD_IDX, EOS: EOS_IDX, UNK: UNK_IDX}
-    tag_to_idx = {PAD: PAD_IDX, EOS: EOS_IDX, SOS: SOS_IDX}
+    word_to_idx = {PAD: PAD_IDX, EOS: EOS_IDX, SOS: SOS_IDX, UNK: UNK_IDX}
+    tag_to_idx = {PAD: PAD_IDX, EOS: EOS_IDX}
     # IOB tags
     tag_to_idx["B"] = len(tag_to_idx)
     tag_to_idx["I"] = len(tag_to_idx)

diff --git a/word-segmentation/sample.ko.csv b/word-segmentation/sample.ko.csv
diff --git a/word-segmentation/sample.ko.tag_to_idx b/word-segmentation/sample.ko.tag_to_idx
@@ -1,5 +1,4 @@
 <PAD>
 <EOS>
-<SOS>
 B
 I
diff --git a/word-segmentation/sample.ko.word_to_idx b/word-segmentation/sample.ko.word_to_idx
@@ -1,4 +1,5 @@
 <PAD>
+<SOS>
 <EOS>
 <UNK>
 1

diff --git a/word-segmentation/sample.zh.csv b/word-segmentation/sample.zh.csv
diff --git a/word-segmentation/sample.zh.tag_to_idx b/word-segmentation/sample.zh.tag_to_idx
@@ -1,5 +1,4 @@
 <PAD>
 <EOS>
-<SOS>
 B
 I
diff --git a/word-segmentation/sample.zh.word_to_idx b/word-segmentation/sample.zh.word_to_idx
@@ -1,4 +1,5 @@
 <PAD>
+<SOS>
 <EOS>
 <UNK>
 虎
-Original file line number
+Diff line change
@@ -1,5 +1,4 @@
     <PAD>
     <EOS>
-    <SOS>
     B
     I