add predict module
coderbyr committed Aug 14, 2019
1 parent e2786c5 commit 3f2b78e
Showing 6 changed files with 143 additions and 13 deletions.
10 changes: 10 additions & 0 deletions README.md
@@ -58,6 +58,16 @@ The training info will be outputted in standard output and log.logger\_file.

The evaluation info will be outputted in eval.dir.

### Prediction
python predict.py conf/train.json data/predict.json

* predict.json should be in JSON format; each instance needs a dummy label, e.g. "其他" ("Other") or any other label in the label map (see the example below).
* eval.model\_dir is the path of the model checkpoint to load for prediction.
* eval.top\_k is the maximum number of labels to output (multi-label case).
* eval.threshold is the minimum probability for a label to be included in the output (multi-label case).

The prediction results will be outputted in predict.txt.
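
For illustration, one line of predict.json might look like this (field names follow the Input Data Format section below; the tokens and the dummy label here are placeholders):

{"doc_label": ["其他"], "doc_token": ["some", "tokens", "to", "classify"], "doc_keyword": [], "doc_topic": []}

The dummy doc\_label is ignored at prediction time; as the classification_dataset.py change below shows, it is mapped to a fixed id in PREDICT mode.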

## Input Data Format

JSON example:
2 changes: 1 addition & 1 deletion dataset/classification_dataset.py
@@ -160,7 +160,7 @@ def _get_vocab_id_list(self, json_obj):
self.token_ngram_map,
self.config.feature.max_char_len,
self.config.feature.max_char_len_per_token)
return {self.DOC_LABEL: self._label_to_id(doc_labels, self.label_map),
return {self.DOC_LABEL: self._label_to_id(doc_labels, self.label_map) if self.model_mode != ModeType.PREDICT else [0],
self.DOC_TOKEN: token_ids, self.DOC_CHAR: char_ids,
self.DOC_CHAR_IN_TOKEN: char_in_token_ids,
self.DOC_TOKEN_NGRAM: token_ngram_ids,
2 changes: 1 addition & 1 deletion dataset/dataset.py
@@ -66,7 +66,7 @@ def __init__(self, config, json_files, generate_dict=False,
self._init_dict()
self.sample_index = []
self.sample_size = 0
self.mode = mode
self.model_mode = mode

self.files = json_files
for i, json_file in enumerate(json_files):
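
The renamed model_mode attribute is what the classification_dataset.py change above checks against ModeType.PREDICT. For context, a sketch of the ModeType enum in util.py that this relies on (values as recalled from the repo, so treat the literals as an assumption):

class ModeType(Type):
    TRAIN = 'train'
    EVAL = 'eval'
    PREDICT = 'infer'  # why predict.py constructs the dataset with mode="infer"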
8 changes: 6 additions & 2 deletions model/classification/classifier.py
@@ -42,7 +42,8 @@ def __init__(self, dataset, config):
high=self.config.embedding.uniform_bound,
std=self.config.embedding.random_stddev,
fan_mode=self.config.embedding.fan_mode,
activation_type=ActivationType.NONE)
activation_type=ActivationType.NONE,
model_mode=dataset.model_mode)
self.char_embedding = \
Embedding(dataset.char_map, config.embedding.dimension,
cDataset.DOC_CHAR, config, dataset.VOCAB_PADDING,
@@ -53,7 +54,8 @@ def __init__(self, dataset, config):
high=self.config.embedding.uniform_bound,
std=self.config.embedding.random_stddev,
fan_mode=self.config.embedding.fan_mode,
activation_type=ActivationType.NONE)
activation_type=ActivationType.NONE,
model_mode=dataset.model_mode)
elif config.embedding.type == EmbeddingType.REGION_EMBEDDING:
self.token_embedding = RegionEmbeddingLayer(
dataset.token_map, config.embedding.dimension,
@@ -67,6 +69,7 @@ def __init__(self, dataset, config):
high=self.config.embedding.uniform_bound,
std=self.config.embedding.random_stddev,
fan_mode=self.config.embedding.fan_mode,
model_mode=dataset.model_mode,
region_embedding_type=config.embedding.region_embedding_type)

self.char_embedding = RegionEmbeddingLayer(
@@ -79,6 +82,7 @@ def __init__(self, dataset, config):
high=self.config.embedding.uniform_bound,
std=self.config.embedding.random_stddev,
fan_mode=self.config.embedding.fan_mode,
model_mode=dataset.model_mode,
region_embedding_type=config.embedding.region_embedding_type)
else:
raise TypeError(
20 changes: 11 additions & 9 deletions model/embedding.py
@@ -21,7 +21,7 @@
from model.model_util import InitType
from model.model_util import init_tensor
from util import Logger
from util import Type
from util import Type, ModeType


class EmbeddingType(Type):
@@ -33,7 +33,7 @@ class EmbeddingType(Type):
"""
EMBEDDING = 'embedding'
REGION_EMBEDDING = 'region_embedding'

@classmethod
def str(cls):
return ",".join([cls.EMBEDDING, cls.REGION_EMBEDDING])
@@ -53,7 +53,7 @@ class EmbeddingProcessType(Type):
FLAT = 'flat'
MEAN = 'mean'
SUM = 'sum'

@classmethod
def str(cls):
return ",".join([cls.FLAT, cls.MEAN, cls.SUM])
@@ -64,7 +64,8 @@ def __init__(self, dict_map, embedding_dim, name, config, padding_idx=None,
pretrained_embedding_file=None, mode=EmbeddingProcessType.FLAT,
dropout=0, init_type=InitType.XAVIER_UNIFORM, low=0, high=1,
mean=0, std=1, activation_type=ActivationType.NONE,
fan_mode=FAN_MODE.FAN_IN, negative_slope=0):
fan_mode=FAN_MODE.FAN_IN, negative_slope=0,
model_mode=ModeType.TRAIN):
super(Embedding, self).__init__()
self.logger = Logger(config)
self.dropout = torch.nn.Dropout(p=dropout)
@@ -80,7 +81,8 @@ def __init__(self, dict_map, embedding_dim, name, config, padding_idx=None,
init_type=init_type, low=low, high=high, mean=mean, std=std,
activation_type=activation_type, fan_mode=fan_mode,
negative_slope=negative_slope)
if pretrained_embedding_file is not None and \
if model_mode == ModeType.TRAIN and \
pretrained_embedding_file is not None and \
pretrained_embedding_file != "":
self.load_pretrained_embedding(
embedding_lookup_table, dict_map, embedding_dim, name,
@@ -130,7 +132,7 @@ class RegionEmbeddingType(Type):
"""
WC = 'word_context'
CW = 'context_word'

@classmethod
def str(cls):
return ",".join([cls.WC, cls.CW])
@@ -144,7 +146,7 @@ class RegionEmbeddingLayer(torch.nn.Module):
def __init__(self, dict_map, embedding_dim, region_size, name, config,
padding=None, pretrained_embedding_file=None, dropout=0,
init_type=InitType.XAVIER_UNIFORM, low=0, high=1, mean=0,
std=1, fan_mode=FAN_MODE.FAN_IN,
std=1, fan_mode=FAN_MODE.FAN_IN, model_mode=ModeType.TRAIN,
region_embedding_type=RegionEmbeddingType.WC):
super(RegionEmbeddingLayer, self).__init__()
self.region_embedding_type = region_embedding_type
@@ -157,7 +159,7 @@ def __init__(self, dict_map, embedding_dim, region_size, name, config,
padding_idx=padding,
pretrained_embedding_file=pretrained_embedding_file,
dropout=dropout, init_type=init_type, low=low, high=high, mean=mean,
std=std, fan_mode=fan_mode)
std=std, fan_mode=fan_mode, model_mode=model_mode)
self.context_embedding = Embedding(
dict_map, embedding_dim * region_size, "RegionContext" + name,
config=config, padding_idx=padding, dropout=dropout,
@@ -208,7 +210,6 @@ def forward(self, vocab_ids):

return region_embedding


class PositionEmbedding(torch.nn.Module):
''' Reference: attention is all you need '''

@@ -226,6 +227,7 @@ def forward(self, src_pos):

@staticmethod
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):

def cal_angle(position, hid_idx):
return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

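
For context on the unchanged PositionEmbedding code above: cal_angle computes the angle term of the sinusoid encoding from "Attention Is All You Need", i.e.

PE(pos, 2i)   = sin(pos / 10000^(2i / d_hid))
PE(pos, 2i+1) = cos(pos / 10000^(2i / d_hid))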
114 changes: 114 additions & 0 deletions predict.py
@@ -0,0 +1,114 @@
#!/usr/bin/env python
# coding: utf-8
"""
Tencent is pleased to support the open source community by making NeuralClassifier available.
Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License
is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
or implied. See the License for the specific language governing permissions and limitations under
the License.
"""

import codecs
import math
import numpy as np
import os
import sys
import json

import torch
from torch.utils.data import DataLoader

from config import Config
from dataset.classification_dataset import ClassificationDataset
from dataset.collator import ClassificationCollator
from dataset.collator import ClassificationType
from dataset.collator import FastTextCollator
from model.classification.drnn import DRNN
from model.classification.fasttext import FastText
from model.classification.textcnn import TextCNN
from model.classification.textvdcnn import TextVDCNN
from model.classification.textrnn import TextRNN
from model.classification.textrcnn import TextRCNN
from model.classification.transformer import Transformer
from model.classification.dpcnn import DPCNN
from model.classification.attentive_convolution import AttentiveConvNet
from model.classification.region_embedding import RegionEmbedding
from model.model_util import get_optimizer, get_hierar_relations

# Touch the imported classes once so linters keep the imports; the classes are
# instantiated dynamically through globals() below.
ClassificationDataset, ClassificationCollator, FastTextCollator, FastText, \
    TextCNN, TextRNN, TextRCNN, DRNN, TextVDCNN, Transformer, DPCNN, \
    AttentiveConvNet, RegionEmbedding


class Predictor(object):
    def __init__(self, config):
        self.config = config
        self.model_name = config.model_name
        self.use_cuda = config.device.startswith("cuda")
        self.dataset_name = "ClassificationDataset"
        self.collate_name = "FastTextCollator" if self.model_name == "FastText" \
            else "ClassificationCollator"
        self.dataset = globals()[self.dataset_name](config, [], mode="infer")
        self.collate_fn = globals()[self.collate_name](
            config, len(self.dataset.label_map))
        self.model = Predictor._get_classification_model(
            self.model_name, self.dataset, config)
        Predictor._load_checkpoint(config.eval.model_dir, self.model,
                                   self.use_cuda)
        self.model.eval()

    @staticmethod
    def _get_classification_model(model_name, dataset, conf):
        model = globals()[model_name](dataset, conf)
        model = model.cuda(conf.device) if conf.device.startswith("cuda") \
            else model
        return model

    @staticmethod
    def _load_checkpoint(file_name, model, use_cuda):
        if use_cuda:
            checkpoint = torch.load(file_name)
        else:
            checkpoint = torch.load(
                file_name, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint["state_dict"])

    def predict(self, texts):
        """Predict probabilities for a batch of texts.

        Each element of texts should be a JSON string in the input data format.
        """
        with torch.no_grad():
            batch_texts = [self.dataset._get_vocab_id_list(json.loads(text))
                           for text in texts]
            batch_texts = self.collate_fn(batch_texts)
            logits = self.model(batch_texts)
            if self.config.task_info.label_type != ClassificationType.MULTI_LABEL:
                probs = torch.softmax(logits, dim=1)
            else:
                probs = torch.sigmoid(logits)
            probs = probs.cpu().tolist()
        return np.array(probs)


if __name__ == "__main__":
    config = Config(config_file=sys.argv[1])
    predictor = Predictor(config)
    batch_size = config.eval.batch_size
    input_texts = []
    predict_probs = []
    is_multi = config.task_info.label_type == ClassificationType.MULTI_LABEL
    for line in codecs.open(sys.argv[2], "r", predictor.dataset.CHARSET):
        input_texts.append(line.strip("\n"))
    # Ceil on the float quotient so the last partial batch is kept; casting the
    # quotient to int before math.ceil (as the original did) would truncate it away.
    num_batches = math.ceil(len(input_texts) / batch_size)
    for i in range(num_batches):
        batch_texts = input_texts[i * batch_size:(i + 1) * batch_size]
        predict_prob = predictor.predict(batch_texts)
        predict_probs.extend(predict_prob)
    with codecs.open("predict.txt", "w", predictor.dataset.CHARSET) as of:
        for predict_prob in predict_probs:
            if not is_multi:
                # Single-label: take the argmax.
                predict_label_ids = [predict_prob.argmax()]
            else:
                # Multi-label: keep up to top_k labels above the threshold.
                predict_label_ids = []
                predict_label_idx = np.argsort(-predict_prob)
                for j in range(0, config.eval.top_k):
                    if predict_prob[predict_label_idx[j]] > config.eval.threshold:
                        predict_label_ids.append(predict_label_idx[j])
            predict_label_name = [predictor.dataset.id_to_label_map[label_id]
                                  for label_id in predict_label_ids]
            of.write(";".join(predict_label_name) + "\n")
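
As a usage sketch, the Predictor class added here can also be driven directly from Python (assuming a trained checkpoint at eval.model_dir in conf/train.json; the JSON string is a placeholder instance in the README's input format):

from config import Config
from predict import Predictor

config = Config(config_file="conf/train.json")
predictor = Predictor(config)
probs = predictor.predict(
    ['{"doc_label": ["其他"], "doc_token": ["some", "tokens"], '
     '"doc_keyword": [], "doc_topic": []}'])
print(probs.shape)  # (1, number of labels)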
