1. Added an automatic hyperparameter tuning mechanism [experimental]
2. Slightly modified the structure of the SRL model
3. More adjustable parameters
AlongWY committed Dec 23, 2020
1 parent 1b95af4 commit 0f43160
Showing 27 changed files with 915 additions and 901 deletions.
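As context for the diffs below, the experimental auto-tuning is exposed through ltp/multitask.py: a new --tune flag routes main() into tune_train instead of common_train, and add_task_specific_args gains trial-level knobs (--num_samples, --gpus_per_trial, --cpus_per_trial, --patience, --tau). A minimal, hypothetical launch of a tuning run is sketched below; the data directories are placeholders, and the meaning of the per-trial flags is inferred from their names rather than documented in this commit.

# Hypothetical launch of the experimental tuning mode (data paths are placeholders).
import subprocess

subprocess.run([
    "python", "ltp/multitask.py",
    "--tune",                      # switches main() from common_train to tune_train
    "--num_samples", "10",         # presumably the number of hyperparameter trials
    "--gpus_per_trial", "1.0",     # presumably the resources reserved per trial
    "--cpus_per_trial", "5.0",
    "--batch_size", "16",
    "--gpus", "1", "--precision", "16",
    "--seg_data_dir", "data/seg",
    "--pos_data_dir", "data/pos",
    "--ner_data_dir", "data/ner",
], check=True)
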
2 changes: 1 addition & 1 deletion ltp/__init__.py
@@ -1,7 +1,7 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*_
# Author: Yunlong Feng <[email protected]>
__version__ = '4.1.2'
__version__ = '4.1.3'

from . import const
from . import nn, utils
54 changes: 51 additions & 3 deletions ltp/data/dataset/conllu.py
@@ -1,6 +1,9 @@
import logging

import os
import itertools
from collections import Counter

import datasets
from os.path import join
from dataclasses import dataclass
@@ -28,6 +31,38 @@
_TEST_FILE = "test.conllu"


def build_vocabs(data_dir, train_file, dev_file=None, test_file=None, min_freq=5):
counters = {
'word': (1, Counter()), 'lemma': (2, Counter()), 'upos': (3, Counter()),
'xpos': (4, Counter()), 'feats': (5, Counter()), 'deprel': (7, Counter()),
# FOR CHAR FEATS
'word_char': (1, Counter())
}

if any([os.path.exists(os.path.join(data_dir, 'vocabs', f'{key}.txt')) for key in counters]):
return

if not os.path.exists(os.path.join(data_dir, 'vocabs')):
os.makedirs(os.path.join(data_dir, 'vocabs'))

for file_name in [train_file, dev_file, test_file]:
for line_num, block in iter_blocks(filename=os.path.join(data_dir, file_name)):
values = [list(value) for value in zip(*block)]

for name, (row, counter) in counters.items():
if 'char' in name:
counter.update(itertools.chain(*values[row]))
else:
counter.update(values[row])

for feat, (row, counter) in counters.items():
if 'word' in feat:
counter = Counter({word: count for word, count in counter.items() if count > min_freq})

with open(os.path.join(data_dir, 'vocabs', f'{feat}.txt'), mode='w') as f:
f.write('\n'.join(sorted(counter.keys())))


def create_feature(file=None):
if file:
return datasets.ClassLabel(names_file=file)
@@ -40,6 +75,7 @@ class ConlluConfig(datasets.BuilderConfig):

upos: str = None
xpos: str = None
feats: str = None
deprel: str = None
deps: str = None

@@ -49,18 +85,30 @@ class Conllu(datasets.GeneratorBasedBuilder):
BUILDER_CONFIG_CLASS = ConlluConfig

def _info(self):
build_vocabs(self.config.data_dir, _TRAINING_FILE, _DEV_FILE, _TEST_FILE)
feats = {
'upos': self.config.upos,
'xpos': self.config.xpos,
'feats': self.config.feats,
'deprel': self.config.deprel,
}

for key in feats:
if feats[key] is None:
feats[key] = os.path.join(self.config.data_dir, 'vocabs', f'{key}.txt')

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features(
{
"id": datasets.Sequence(datasets.Value("int64")),
"form": datasets.Sequence(datasets.Value("string")),
"lemma": datasets.Sequence(datasets.Value("string")),
"upos": datasets.Sequence(create_feature(self.config.upos)),
"xpos": datasets.Sequence(create_feature(self.config.xpos)),
"upos": datasets.Sequence(create_feature(feats['upos'])),
"xpos": datasets.Sequence(create_feature(feats['xpos'])),
"feats": datasets.Sequence(datasets.Value("string")),
"head": datasets.Sequence(datasets.Value("int64")),
"deprel": datasets.Sequence(create_feature(self.config.deprel)),
"deprel": datasets.Sequence(create_feature(feats['deprel'])),
"deps": datasets.Sequence(
{
'id': datasets.Value('int64'),
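A quick sketch of what the new vocabulary-building step does in isolation, assuming the module path mirrors the file location (ltp.data.dataset.conllu) and using a placeholder data directory; the file names follow the _TRAINING_FILE/_DEV_FILE/_TEST_FILE constants above.

# Sketch: build per-column vocabularies for a CoNLL-U data directory.
# build_vocabs counts values for word, lemma, upos, xpos, feats and deprel
# (plus a character-level vocabulary for word forms), drops word forms whose
# count does not exceed min_freq, and writes one sorted file per feature under
# <data_dir>/vocabs; it is a no-op if any of those files already exists.
from ltp.data.dataset.conllu import build_vocabs

build_vocabs("data/dep", "train.conllu", "dev.conllu", "test.conllu", min_freq=5)
# e.g. data/dep/vocabs/upos.txt then holds one tag per line, and Conllu._info()
# points the ClassLabel features at these files whenever the config does not
# name vocab files explicitly.
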
3 changes: 3 additions & 0 deletions ltp/frontend.py
@@ -18,6 +18,7 @@
from ltp.algorithms import Trie, eisner, split_sentence
from ltp.transformer_multitask import TransformerMultiTask as Model
from ltp.utils import length_to_mask, get_entities, fake_import_pytorch_lightning
from ltp.patchs import patch_4_1_3

try:
from torch.hub import _get_torch_home
@@ -124,6 +125,8 @@ def __init__(self, path: str = 'small', device=None, **kwargs):
fake_import_pytorch_lightning()
ckpt = torch.load(os.path.join(path, "ltp.model"), map_location=self.device)

patch_4_1_3(ckpt)

self.cache_dir = path
config = AutoConfig.for_model(**ckpt['transformer_config'])
self.model = Model(ckpt['model_config'], config=config).to(self.device)
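The patch_4_1_3 helper imported above lives in ltp/patchs and is not part of this diff. Purely as a hypothetical illustration of the pattern (not the actual implementation, and assuming the checkpoint stores plain dicts), a patch of this shape mutates the loaded checkpoint so that models saved by older releases carry whatever fields the 4.1.3 code expects:

# Hypothetical sketch only; the real patch_4_1_3 in ltp/patchs is not shown in this commit.
def patch_checkpoint_sketch(ckpt: dict) -> None:
    # fill in config entries that pre-4.1.3 checkpoints may lack (key name is illustrative)
    model_config = ckpt.setdefault('model_config', {})
    model_config.setdefault('some_option_added_in_4_1_3', None)
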
85 changes: 38 additions & 47 deletions ltp/multitask.py
@@ -16,13 +16,11 @@
from ltp.data import dataset as datasets
from ltp.data.utils import collate, MultiTaskDataloader
from ltp.transformer_multitask import TransformerMultiTask as Model
from ltp.utils import TaskInfo, common_train
from ltp.utils import TaskInfo, common_train, tune_train
from ltp.utils import deploy_model

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

task_info = TaskInfo(task_name='multitask', metric_name='metric_mean')

# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/multitask.py --max_epochs=10 --batch_size=16 --gpus=1 --precision=16 --seg_data_dir=data/seg --pos_data_dir=data/pos --ner_data_dir=data/ner

task_builder = {
@@ -42,33 +40,31 @@ def build_dataset(model, **kwargs):
metrics = OrderedDict()

for task, task_data_dir in kwargs.items():
dataset, metric = task_builder[task].build_dataset(model, task_data_dir)
dataset, metric = task_builder[task].build_dataset(model, task_data_dir, task)
datasets[task] = dataset
metrics[task] = metric

return datasets, metrics


def validation_method(metric: dict = None, loss_tag='val_loss', metric_tag: str = None, metric_tags: dict = None,
log=True):
ret=True):
if metric is None or metric_tags is None:
raise NotImplemented

task_mapper = []
step_mapper = []
epoch_mapper = []
metric_tag_mapper = []

for task, task_metric in metric.items():
task_metric_tag = metric_tags[task]
task_step, task_epoch_end = task_builder[task].validation_method(
task_metric, loss_tag=f'{loss_tag}/{task}', metric_tag=task_metric_tag, log=False
task_metric, loss_tag=f'{loss_tag}/{task}', metric_tag=f'{task_metric_tag}/{task}', ret=True
)

task_mapper.append(task)
step_mapper.append(task_step)
epoch_mapper.append(task_epoch_end)
metric_tag_mapper.append(task_metric_tag)

def step(self, batch, batch_idx, dataloader_idx=0):
batch['task'] = task_mapper[dataloader_idx]
@@ -79,21 +75,16 @@ def epoch_end(self, outputs):
for idx, task_output in enumerate(outputs):
metric = epoch_mapper[idx](self, task_output)
metrics.append(metric)
self.log(
f'{metric_tag_mapper[idx]}/{task_mapper[idx]}', metric,
on_step=False, on_epoch=True, prog_bar=True, logger=True
)
metric_mean = sum(metrics) / len(metrics)
if log:
self.log(metric_tag, metric_mean, on_step=False, on_epoch=True, prog_bar=True, logger=True)
else:
self.log(metric_tag, metric_mean, on_step=False, on_epoch=True, prog_bar=True, logger=True)
if ret:
return metric_mean

return step, epoch_end


def build_method(model):
multi_dataset, multi_metric = build_dataset(
def build_method(model: Model, task_info: TaskInfo):
multi_dataset, multi_metric = task_info.build_dataset(
model,
seg=model.hparams.seg_data_dir,
pos=model.hparams.pos_data_dir,
@@ -118,9 +109,9 @@ def train_dataloader(self):
return res

def training_step(self, batch, batch_idx):
loss, logits = self(**batch)
self.log("loss", loss.item())
return {"loss": loss}
result = self(**batch)
self.log("loss", result.loss.item())
return {"loss": result.loss}

def val_dataloader(self):
return [
@@ -151,22 +142,11 @@ def configure_optimizers(self: Model):
for dataset in multi_dataset.values()
)
num_train_steps = num_epoch_steps * self.hparams.max_epochs
optimizer, scheduler = optimization.create_optimizer(
self,
lr=self.hparams.lr,
optimizer, scheduler = optimization.from_argparse_args(
self.hparams,
model=self,
num_train_steps=num_train_steps,
weight_decay=self.hparams.weight_decay,
warmup_steps=self.hparams.warmup_steps,
warmup_proportion=self.hparams.warmup_proportion,
layerwise_lr_decay_power=self.hparams.layerwise_lr_decay_power,
n_transformer_layers=self.transformer.config.num_hidden_layers,
get_layer_lrs=optimization.get_layer_lrs_with_crf,
get_layer_lrs_kwargs={'crf_preffix': 'rel_crf'},
lr_scheduler=optimization.get_polynomial_decay_schedule_with_warmup,
lr_scheduler_kwargs={
'lr_end': self.hparams.lr_end,
'power': self.hparams.lr_decay_power
}
n_transformer_layers=self.transformer.config.num_hidden_layers
)
return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

@@ -175,21 +155,21 @@ def configure_optimizers(self: Model):
model.train_dataloader = types.MethodType(train_dataloader, model)
model.training_step = types.MethodType(training_step, model)

validation_step, validation_epoch_end = validation_method(
validation_step, validation_epoch_end = task_info.validation_method(
multi_metric, loss_tag='val_loss', metric_tags={
task_name: f"val_{task_module.task_info.metric_name}"
for task_name, task_module in task_builder
for task_name, task_module in task_builder.items()
}, metric_tag=f"val_{task_info.metric_name}"
)

model.val_dataloader = types.MethodType(val_dataloader, model)
model.validation_step = types.MethodType(validation_step, model)
model.validation_epoch_end = types.MethodType(validation_epoch_end, model)

test_step, test_epoch_end = validation_method(
test_step, test_epoch_end = task_info.validation_method(
multi_metric, loss_tag='test_loss', metric_tags={
task_name: f"test_{task_module.task_info.metric_name}"
for task_name, task_module in task_builder
for task_name, task_module in task_builder.items()
}, metric_tag=f"test_{task_info.metric_name}"
)

@@ -198,11 +178,25 @@ def configure_optimizers(self: Model):
model.test_epoch_end = types.MethodType(test_epoch_end, model)


task_info = TaskInfo(
task_name='multitask',
metric_name='metric_mean',
build_dataset=build_dataset,
validation_method=validation_method
)


def add_task_specific_args(parent_parser):
parser = ArgumentParser(parents=[parent_parser], add_help=False)
parser.add_argument('--seed', type=int, default=19980524)
parser.add_argument('--tune', action='store_true')
parser.add_argument('--offline', action='store_true')
parser.add_argument('--patience', type=int, default=5)
parser.add_argument('--batch_size', type=int, default=8)
parser.add_argument('--gpus_per_trial', type=float, default=1.0)
parser.add_argument('--cpus_per_trial', type=float, default=5.0)
parser.add_argument('--num_workers', type=int, default=4)
parser.add_argument('--num_samples', type=int, default=10)
parser.add_argument('--tau', type=float, default=0.8)
parser.add_argument('--ltp_model', type=str, default=None)
parser.add_argument('--ltp_version', type=str, default=ltp.__version__)
@@ -222,19 +216,16 @@ def main():
parser = Model.add_model_specific_args(parser)
parser = optimization.add_optimizer_specific_args(parser)
parser = Trainer.add_argparse_args(parser)
parser.set_defaults(gradient_clip_val=1.0)
parser.set_defaults(min_epochs=1, max_epochs=10)
parser.set_defaults(gradient_clip_val=1.0, lr_layers_getter='get_layer_lrs_with_crf')
args = parser.parse_args()

if args.ltp_model is not None and args.resume_from_checkpoint is not None:
deploy_model(args, args.ltp_version)
elif args.tune:
tune_train(args, model_class=Model, task_info=task_info, build_method=build_method)
else:
common_train(
args,
metric=f'val_{task_info.metric_name}',
model_class=Model,
build_method=build_method,
task=task_info.task_name
)
common_train(args, model_class=Model, task_info=task_info, build_method=build_method)


if __name__ == '__main__':
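One detail worth spelling out from the multitask changes: the per-task validation closures are now built with metric_tag=f'{task_metric_tag}/{task}' and ret=True, so each task appears to handle its own logging and return its epoch metric, while the multitask epoch_end only averages those returns into a single scalar (logged under e.g. val_metric_mean). That mean is presumably what the --patience early stopping and the new tuning mode monitor. A toy illustration of the aggregation, with made-up numbers:

# Toy illustration of the multitask epoch_end aggregation described above (values are made up).
task_metrics = {'seg': 0.981, 'pos': 0.962, 'ner': 0.934}
metric_mean = sum(task_metrics.values()) / len(task_metrics)
print(f"val_metric_mean = {metric_mean:.4f}")  # -> val_metric_mean = 0.9590
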