add predict module
coderbyr committed Aug 14, 2019
1 parent e2786c5 commit 3f2b78e
Showing 6 changed files with 143 additions and 13 deletions.
10 changes: 10 additions & 0 deletions README.md
@@ -58,6 +58,16 @@ The training info will be outputted in standard output and log.logger\_file.

The evaluation info will be outputted in eval.dir.

### Prediction
python predict.py conf/train.json data/predict.json

* predict.json should be in JSON format; each instance needs a dummy label, e.g. "其他" ("Other") or any other label in the label map (see the example below).
* eval.model\_dir is the path of the model checkpoint to load for prediction.
* eval.top\_k is the maximum number of labels to output (multi-label case).
* eval.threshold is the minimum probability for a label to be included in the output (multi-label case).

The prediction results will be outputted in predict.txt.
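
For illustration, one line of predict.json might look like this (field names follow the Input Data Format section below; the tokens and the dummy label here are placeholders):

{"doc_label": ["其他"], "doc_token": ["some", "tokens", "to", "classify"], "doc_keyword": [], "doc_topic": []}

The dummy doc\_label is ignored at prediction time; as the classification_dataset.py change below shows, it is mapped to a fixed id in PREDICT mode.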

## Input Data Format

JSON example:
2 changes: 1 addition & 1 deletion dataset/classification_dataset.py
@@ -160,7 +160,7 @@ def _get_vocab_id_list(self, json_obj):
self.token_ngram_map,
self.config.feature.max_char_len,
self.config.feature.max_char_len_per_token)
return {self.DOC_LABEL: self._label_to_id(doc_labels, self.label_map),
return {self.DOC_LABEL: self._label_to_id(doc_labels, self.label_map) if self.model_mode != ModeType.PREDICT else [0],
self.DOC_TOKEN: token_ids, self.DOC_CHAR: char_ids,
self.DOC_CHAR_IN_TOKEN: char_in_token_ids,
self.DOC_TOKEN_NGRAM: token_ngram_ids,
2 changes: 1 addition & 1 deletion dataset/dataset.py
@@ -66,7 +66,7 @@ def __init__(self, config, json_files, generate_dict=False,
self._init_dict()
self.sample_index = []
self.sample_size = 0
self.mode = mode
self.model_mode = mode

self.files = json_files
for i, json_file in enumerate(json_files):
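
The renamed model_mode attribute is what the classification_dataset.py change above checks against ModeType.PREDICT. For context, a sketch of the ModeType enum in util.py that this relies on (values as recalled from the repo, so treat the literals as an assumption):

class ModeType(Type):
    TRAIN = 'train'
    EVAL = 'eval'
    PREDICT = 'infer'  # why predict.py constructs the dataset with mode="infer"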
8 changes: 6 additions & 2 deletions model/classification/classifier.py
@@ -42,7 +42,8 @@ def __init__(self, dataset, config):
high=self.config.embedding.uniform_bound,
std=self.config.embedding.random_stddev,
fan_mode=self.config.embedding.fan_mode,
activation_type=ActivationType.NONE)
activation_type=ActivationType.NONE,
model_mode=dataset.model_mode)
self.char_embedding = \
Embedding(dataset.char_map, config.embedding.dimension,
cDataset.DOC_CHAR, config, dataset.VOCAB_PADDING,
@@ -53,7 +54,8 @@ def __init__(self, dataset, config):
high=self.config.embedding.uniform_bound,
std=self.config.embedding.random_stddev,
fan_mode=self.config.embedding.fan_mode,
activation_type=ActivationType.NONE)
activation_type=ActivationType.NONE,
model_mode=dataset.model_mode)
elif config.embedding.type == EmbeddingType.REGION_EMBEDDING:
self.token_embedding = RegionEmbeddingLayer(
dataset.token_map, config.embedding.dimension,
@@ -67,6 +69,7 @@ def __init__(self, dataset, config):
high=self.config.embedding.uniform_bound,
std=self.config.embedding.random_stddev,
fan_mode=self.config.embedding.fan_mode,
model_mode=dataset.model_mode,
region_embedding_type=config.embedding.region_embedding_type)

self.char_embedding = RegionEmbeddingLayer(
@@ -79,6 +82,7 @@ def __init__(self, dataset, config):
high=self.config.embedding.uniform_bound,
std=self.config.embedding.random_stddev,
fan_mode=self.config.embedding.fan_mode,
model_mode=dataset.model_mode,
region_embedding_type=config.embedding.region_embedding_type)
else:
raise TypeError(
20 changes: 11 additions & 9 deletions model/embedding.py
@@ -21,7 +21,7 @@
from model.model_util import InitType
from model.model_util import init_tensor
from util import Logger
from util import Type
from util import Type, ModeType


class EmbeddingType(Type):
@@ -33,7 +33,7 @@ class EmbeddingType(Type):
"""
EMBEDDING = 'embedding'
REGION_EMBEDDING = 'region_embedding'

@classmethod
def str(cls):
return ",".join([cls.EMBEDDING, cls.REGION_EMBEDDING])
@@ -53,7 +53,7 @@ class EmbeddingProcessType(Type):
FLAT = 'flat'
MEAN = 'mean'
SUM = 'sum'

@classmethod
def str(cls):
return ",".join([cls.FLAT, cls.MEAN, cls.SUM])
@@ -64,7 +64,8 @@ def __init__(self, dict_map, embedding_dim, name, config, padding_idx=None,
pretrained_embedding_file=None, mode=EmbeddingProcessType.FLAT,
dropout=0, init_type=InitType.XAVIER_UNIFORM, low=0, high=1,
mean=0, std=1, activation_type=ActivationType.NONE,
fan_mode=FAN_MODE.FAN_IN, negative_slope=0):
fan_mode=FAN_MODE.FAN_IN, negative_slope=0,
model_mode=ModeType.TRAIN):
super(Embedding, self).__init__()
self.logger = Logger(config)
self.dropout = torch.nn.Dropout(p=dropout)
@@ -80,7 +81,8 @@ def __init__(self, dict_map, embedding_dim, name, config, padding_idx=None,
init_type=init_type, low=low, high=high, mean=mean, std=std,
activation_type=activation_type, fan_mode=fan_mode,
negative_slope=negative_slope)
if pretrained_embedding_file is not None and \
if model_mode == ModeType.TRAIN and \
pretrained_embedding_file is not None and \
pretrained_embedding_file != "":
self.load_pretrained_embedding(
embedding_lookup_table, dict_map, embedding_dim, name,
@@ -130,7 +132,7 @@ class RegionEmbeddingType(Type):
"""
WC = 'word_context'
CW = 'context_word'

@classmethod
def str(cls):
return ",".join([cls.WC, cls.CW])
@@ -144,7 +146,7 @@ class RegionEmbeddingLayer(torch.nn.Module):
def __init__(self, dict_map, embedding_dim, region_size, name, config,
padding=None, pretrained_embedding_file=None, dropout=0,
init_type=InitType.XAVIER_UNIFORM, low=0, high=1, mean=0,
std=1, fan_mode=FAN_MODE.FAN_IN,
std=1, fan_mode=FAN_MODE.FAN_IN, model_mode=ModeType.TRAIN,
region_embedding_type=RegionEmbeddingType.WC):
super(RegionEmbeddingLayer, self).__init__()
self.region_embedding_type = region_embedding_type
@@ -157,7 +159,7 @@ def __init__(self, dict_map, embedding_dim, region_size, name, config,
padding_idx=padding,
pretrained_embedding_file=pretrained_embedding_file,
dropout=dropout, init_type=init_type, low=low, high=high, mean=mean,
std=std, fan_mode=fan_mode)
std=std, fan_mode=fan_mode, model_mode=model_mode)
self.context_embedding = Embedding(
dict_map, embedding_dim * region_size, "RegionContext" + name,
config=config, padding_idx=padding, dropout=dropout,
@@ -208,7 +210,6 @@ def forward(self, vocab_ids):

return region_embedding


class PositionEmbedding(torch.nn.Module):
''' Reference: attention is all you need '''

@@ -226,6 +227,7 @@ def forward(self, src_pos):

@staticmethod
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):

def cal_angle(position, hid_idx):
return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

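
For context on the unchanged PositionEmbedding code above: cal_angle computes the angle term of the sinusoid encoding from "Attention Is All You Need", i.e.

PE(pos, 2i)   = sin(pos / 10000^(2i / d_hid))
PE(pos, 2i+1) = cos(pos / 10000^(2i / d_hid))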
114 changes: 114 additions & 0 deletions predict.py
@@ -0,0 +1,114 @@
#!/usr/bin/env python
# coding: utf-8
"""
Tencent is pleased to support the open source community by making NeuralClassifier available.
Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License
is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
or implied. See the License for the specific language governing permissions and limitations under
the License.
"""

import codecs
import math
import numpy as np
import os
import sys
import json

import torch
from torch.utils.data import DataLoader

from config import Config
from dataset.classification_dataset import ClassificationDataset
from dataset.collator import ClassificationCollator
from dataset.collator import ClassificationType
from dataset.collator import FastTextCollator
from model.classification.drnn import DRNN
from model.classification.fasttext import FastText
from model.classification.textcnn import TextCNN
from model.classification.textvdcnn import TextVDCNN
from model.classification.textrnn import TextRNN
from model.classification.textrcnn import TextRCNN
from model.classification.transformer import Transformer
from model.classification.dpcnn import DPCNN
from model.classification.attentive_convolution import AttentiveConvNet
from model.classification.region_embedding import RegionEmbedding
from model.model_util import get_optimizer, get_hierar_relations

# Touch the imported classes once so linters keep the imports; the classes are
# instantiated dynamically through globals() below.
ClassificationDataset, ClassificationCollator, FastTextCollator, FastText, \
    TextCNN, TextRNN, TextRCNN, DRNN, TextVDCNN, Transformer, DPCNN, \
    AttentiveConvNet, RegionEmbedding


class Predictor(object):
    def __init__(self, config):
        self.config = config
        self.model_name = config.model_name
        self.use_cuda = config.device.startswith("cuda")
        self.dataset_name = "ClassificationDataset"
        self.collate_name = "FastTextCollator" if self.model_name == "FastText" \
            else "ClassificationCollator"
        self.dataset = globals()[self.dataset_name](config, [], mode="infer")
        self.collate_fn = globals()[self.collate_name](
            config, len(self.dataset.label_map))
        self.model = Predictor._get_classification_model(
            self.model_name, self.dataset, config)
        Predictor._load_checkpoint(config.eval.model_dir, self.model,
                                   self.use_cuda)
        self.model.eval()

    @staticmethod
    def _get_classification_model(model_name, dataset, conf):
        model = globals()[model_name](dataset, conf)
        model = model.cuda(conf.device) if conf.device.startswith("cuda") \
            else model
        return model

    @staticmethod
    def _load_checkpoint(file_name, model, use_cuda):
        if use_cuda:
            checkpoint = torch.load(file_name)
        else:
            checkpoint = torch.load(
                file_name, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint["state_dict"])

    def predict(self, texts):
        """Predict probabilities for a batch of texts.

        Each element of texts should be a JSON string in the input data format.
        """
        with torch.no_grad():
            batch_texts = [self.dataset._get_vocab_id_list(json.loads(text))
                           for text in texts]
            batch_texts = self.collate_fn(batch_texts)
            logits = self.model(batch_texts)
            if self.config.task_info.label_type != ClassificationType.MULTI_LABEL:
                probs = torch.softmax(logits, dim=1)
            else:
                probs = torch.sigmoid(logits)
            probs = probs.cpu().tolist()
        return np.array(probs)


if __name__ == "__main__":
    config = Config(config_file=sys.argv[1])
    predictor = Predictor(config)
    batch_size = config.eval.batch_size
    input_texts = []
    predict_probs = []
    is_multi = config.task_info.label_type == ClassificationType.MULTI_LABEL
    for line in codecs.open(sys.argv[2], "r", predictor.dataset.CHARSET):
        input_texts.append(line.strip("\n"))
    # Ceil on the float quotient so the last partial batch is kept; casting the
    # quotient to int before math.ceil (as the original did) would truncate it away.
    num_batches = math.ceil(len(input_texts) / batch_size)
    for i in range(num_batches):
        batch_texts = input_texts[i * batch_size:(i + 1) * batch_size]
        predict_prob = predictor.predict(batch_texts)
        predict_probs.extend(predict_prob)
    with codecs.open("predict.txt", "w", predictor.dataset.CHARSET) as of:
        for predict_prob in predict_probs:
            if not is_multi:
                # Single-label: take the argmax.
                predict_label_ids = [predict_prob.argmax()]
            else:
                # Multi-label: keep up to top_k labels above the threshold.
                predict_label_ids = []
                predict_label_idx = np.argsort(-predict_prob)
                for j in range(0, config.eval.top_k):
                    if predict_prob[predict_label_idx[j]] > config.eval.threshold:
                        predict_label_ids.append(predict_label_idx[j])
            predict_label_name = [predictor.dataset.id_to_label_map[label_id]
                                  for label_id in predict_label_ids]
            of.write(";".join(predict_label_name) + "\n")
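
As a usage sketch, the Predictor class added here can also be driven directly from Python (assuming a trained checkpoint at eval.model_dir in conf/train.json; the JSON string is a placeholder instance in the README's input format):

from config import Config
from predict import Predictor

config = Config(config_file="conf/train.json")
predictor = Predictor(config)
probs = predictor.predict(
    ['{"doc_label": ["其他"], "doc_token": ["some", "tokens"], '
     '"doc_keyword": [], "doc_topic": []}'])
print(probs.shape)  # (1, number of labels)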
