Commit: NER for electronic medical records
lichangzhen committed Dec 11, 2023
1 parent 8b912b8 commit 4813849
Showing 5 changed files with 339 additions and 0 deletions.
99 changes: 99 additions & 0 deletions 231017000010/README.md
@@ -0,0 +1,99 @@
# NER for Electronic Medical Records
- Team:
  - 李昌振 231017000010
  - 陈娟 231017000036
  - 武玮
  - 丁雨聪


- Business goal: identify from electronic medical records whether a patient has DVT or PE, together with the corresponding time; this is essentially an NER task
  - DVT examples: 双下肢深静脉血栓, 左侧小腿多条肌间静脉血栓, 右侧小腿肌间静脉血栓, 右侧髂静脉血栓形成
  - PE examples: 肺栓塞, 肺动脉栓塞, 慢性血栓栓塞性肺动脉高压
  - Time examples: 2022-09-02, 1年多前, 2020年, 2022年1月3日
- Processing workflow:
  - Prepare the dataset
    - Extract the raw electronic medical record documents and split them into sentences
    - Run a first pass of recognition with regular expressions to reduce the manual annotation workload (a sketch of this step follows the list)
    - Upload the machine-annotated results to the [Doccano](https://github.com/doccano/doccano) text annotation platform
    - Manual review and annotation
    - Export the annotation results and convert them to the model input format
  - Model definition: use the open-source DeepKE framework
  - Model training and hyperparameter tuning
  - Testing
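
A minimal sketch of the regex first pass, assuming hypothetical patterns and Doccano-style `(start, end, label)` spans; the patterns the team actually used are not part of this commit:
```
import re

# Hypothetical first-pass patterns; the real project maintained its own lists.
PATTERNS = {
    "DVT": re.compile(r"[^,。;]*?静脉血栓(形成)?"),
    "PE": re.compile(r"肺(动脉)?栓塞|慢性血栓栓塞性肺动脉高压"),
    "TIME": re.compile(r"\d{4}[-年]\d{1,2}[-月]\d{1,2}日?|\d{4}年|\d+年多?前"),
}

def pre_annotate(sentence):
    """Return (start, end, label) spans found by the regexes in one sentence."""
    spans = []
    for label, pattern in PATTERNS.items():
        for m in pattern.finditer(sentence):
            spans.append((m.start(), m.end(), label))
    return sorted(spans)

print(pre_annotate("双下肢深静脉血栓,2022年1月3日肺动脉CTA示肺栓塞"))
```
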
- Dataset and NLP annotation:
An annotation example is shown below:
![](./img/daccono_sample.png)

The exported annotated dataset is shown below:
![](./img/dataset.png)
The exported JSONL is converted to the `BIO` format the model expects (a conversion sketch follows the example):
```
动 O
脉 O
多 O
发 O
斑 O
块 O
伴 O
股 O
浅 O
动 O
脉 O
多 O
节 O
段 O
狭 O
窄 O
左 B-DVT
侧 I-DVT
小 I-DVT
腿 I-DVT
肌 I-DVT
间 I-DVT
静 I-DVT
脉 I-DVT
血 I-DVT
栓 I-DVT
形 I-DVT
成 I-DVT
```
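
A conversion sketch, assuming Doccano's sequence-labeling JSONL export with `text` and `label` fields (each label being a `[start, end, tag]` span); field names can differ depending on the export options:
```
import json

def jsonl_to_bio(jsonl_path, bio_path):
    """Convert a Doccano-style JSONL export to character-level BIO lines."""
    with open(jsonl_path, encoding="utf-8") as fin, open(bio_path, "w", encoding="utf-8") as fout:
        for line in fin:
            record = json.loads(line)
            text = record["text"]
            tags = ["O"] * len(text)
            for start, end, tag in record.get("label", []):
                tags[start] = "B-" + tag
                for i in range(start + 1, end):
                    tags[i] = "I-" + tag
            for ch, tag in zip(text, tags):
                if not ch.isspace():        # drop whitespace characters
                    fout.write(ch + " " + tag + "\n")
            fout.write("\n")                # blank line between sentences
```
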
- Dataset statistics (a counting sketch follows this list):
  - Total sentences: 18,942, annotated with
    - 7,701 "DVT" entities
    - 3,747 "PE" entities
    - 7,444 "TIME" entities
  - Training set: 1,399,985 characters
  - Test set: 53,838 characters
  - Validation set: 154,010 characters
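
A sketch for reproducing the counts above from the BIO files, under the assumption that the exported splits are named train.txt, valid.txt, and test.txt:
```
from collections import Counter

def bio_stats(path):
    """Count sentences, characters, and B-* entities in one BIO file."""
    sentences, chars, entities = 0, 0, Counter()
    in_sentence = False
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:                    # blank line ends a sentence
                sentences += in_sentence
                in_sentence = False
                continue
            in_sentence = True
            chars += 1
            tag = line.split()[-1]
            if tag.startswith("B-"):
                entities[tag[2:]] += 1
    sentences += in_sentence                # file may not end with a blank line
    return sentences, chars, entities

for split in ("train.txt", "valid.txt", "test.txt"):   # assumed file names
    print(split, bio_stats(split))
```
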
- Model description:
  - NER is performed with the open-source [DeepKE](https://github.com/zjunlp/DeepKE) framework
  - Key hyperparameters of the BERT-based model (conf/hydra/model/bert.yaml):
    - bert_model: "bert-base-chinese"
    - do_lower_case: True
    - fp16: False
    - fp16_opt_level: "01"
    - local_rank: -1
    - loss_scale: 0.0
    - max_grad_norm: 1.0
    - max_seq_length: 512 # `originally defaulted to 128`
- Model training:
  - Training environment: NVIDIA GeForce RTX 3090
  - (deepke) root@ubuntu:/home/deploy/vte-nlp-ner/DeepKE-2.1.0/example/ner/standard# python run_bert.py
  - Note: if the server's network access is poor, first download the bert-base-chinese model from Hugging Face to a local directory, then change `bert_model: "bert-base-chinese"` in the config file to that local directory (see the sketch after the wandb screenshot)
  - The training process can be monitored with wandb:
![](./img/wandb.png)
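
One way to pre-download the weights, sketched here with `huggingface_hub` (the package choice and the target path are assumptions; a manual download or a mirror works just as well):
```
from huggingface_hub import snapshot_download

# Fetch bert-base-chinese once, then point bert_model in the config file
# at this local directory instead of the model name.
local_dir = snapshot_download(
    repo_id="bert-base-chinese",
    local_dir="/home/deploy/models/bert-base-chinese",  # assumed target path
)
print(local_dir)
```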

- Model results (BERT)
```
              precision    recall  f1-score   support
         DVT     0.9091    0.9390    0.9238       426
          PE     0.9074    0.9378    0.9224       209
        TIME     0.9962    0.9962    0.9962       263
   micro avg     0.9336    0.9555    0.9444       898
   macro avg     0.9376    0.9577    0.9474       898
weighted avg     0.9342    0.9555    0.9447       898
```
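
The table above is the entity-level report produced by seqeval's `classification_report` in run_bert.py; a tiny worked example of how entity-level scoring behaves:
```
from seqeval.metrics import classification_report

# Toy example: the DVT span is predicted exactly, the TIME span is missed,
# so TIME recall (and the averaged scores) drop accordingly.
y_true = [["B-DVT", "I-DVT", "O", "B-TIME", "I-TIME"]]
y_pred = [["B-DVT", "I-DVT", "O", "O", "O"]]
print(classification_report(y_true, y_pred, digits=4))
```
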


Binary file added 231017000010/img/daccono_sample.png
Binary file added 231017000010/img/dataset.png
Binary file added 231017000010/img/wandb.png
240 changes: 240 additions & 0 deletions 231017000010/src/run_bert.py
@@ -0,0 +1,240 @@
from __future__ import absolute_import, division, print_function

import csv
import json
import logging
import os
import random
import sys
import numpy as np
import torch
import torch.nn.functional as F
from pytorch_transformers import (WEIGHTS_NAME, AdamW, BertConfig, BertForTokenClassification, BertTokenizer, WarmupLinearSchedule)
from torch import nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from seqeval.metrics import classification_report
import hydra
from hydra import utils
from deepke.name_entity_re.standard import *

import wandb


logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)


class TrainNer(BertForTokenClassification):

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,valid_ids=None,attention_mask_label=None,device=None):
        sequence_output = self.bert(input_ids, token_type_ids, attention_mask,head_mask=None)[0]
        batch_size,max_len,feat_dim = sequence_output.shape
        # Keep only the hidden state of the first sub-token of each original character (valid_ids == 1).
        valid_output = torch.zeros(batch_size,max_len,feat_dim,dtype=torch.float32,device=device)
        for i in range(batch_size):
            jj = -1
            for j in range(max_len):
                if valid_ids[i][j].item() == 1:
                    jj += 1
                    valid_output[i][jj] = sequence_output[i][j]
        sequence_output = self.dropout(valid_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            # Token-level cross entropy, restricted to labeled positions when a label mask is given.
            loss_fct = nn.CrossEntropyLoss(ignore_index=0)
            if attention_mask_label is not None:
                active_loss = attention_mask_label.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits

wandb.init(project="DeepKE_NER_Standard")
@hydra.main(config_path="conf", config_name='config_bert')
def main(cfg):

    # Use gpu or not
    if cfg.use_gpu and torch.cuda.is_available():
        device = torch.device('cuda', cfg.gpu_id)
    else:
        device = torch.device('cpu')

    if cfg.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(cfg.gradient_accumulation_steps))

    cfg.train_batch_size = cfg.train_batch_size // cfg.gradient_accumulation_steps

    random.seed(cfg.seed)
    np.random.seed(cfg.seed)
    torch.manual_seed(cfg.seed)

    if not cfg.do_train and not cfg.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # Checkpoints
    if os.path.exists(utils.get_original_cwd()+'/'+cfg.output_dir) and os.listdir(utils.get_original_cwd()+'/'+cfg.output_dir) and cfg.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(utils.get_original_cwd()+'/'+cfg.output_dir))
    if not os.path.exists(utils.get_original_cwd()+'/'+cfg.output_dir):
        os.makedirs(utils.get_original_cwd()+'/'+cfg.output_dir)

    # Preprocess the input dataset
    processor = NerProcessor()
    label_list = processor.get_labels(cfg)
    num_labels = len(label_list) + 1

    # Prepare the model
    tokenizer = BertTokenizer.from_pretrained(cfg.bert_model, do_lower_case=cfg.do_lower_case)

    train_examples = None
    num_train_optimization_steps = 0
    if cfg.do_train:
        train_examples = processor.get_train_examples(utils.get_original_cwd()+'/'+cfg.data_dir)
        num_train_optimization_steps = int(len(train_examples) / cfg.train_batch_size / cfg.gradient_accumulation_steps) * cfg.num_train_epochs

    config = BertConfig.from_pretrained(cfg.bert_model, num_labels=num_labels, finetuning_task=cfg.task_name)
    model = TrainNer.from_pretrained(cfg.bert_model,from_tf = False,config = config)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias','LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': cfg.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    warmup_steps = int(cfg.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.learning_rate, eps=cfg.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i : label for i, label in enumerate(label_list,1)}
    if cfg.do_train:
        train_features = convert_examples_to_features(train_examples, label_list, cfg.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_valid_ids,all_lmask_ids)
        train_sampler = RandomSampler(train_data)

        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=cfg.train_batch_size)

        model.train()

        for _ in trange(int(cfg.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids,l_mask = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids,valid_ids,l_mask,device)
                if cfg.gradient_accumulation_steps > 1:
                    loss = loss / cfg.gradient_accumulation_steps

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % cfg.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1
                wandb.log({
                    "train_loss":tr_loss/nb_tr_steps
                })
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        model_to_save.save_pretrained(utils.get_original_cwd()+'/'+cfg.output_dir)
        tokenizer.save_pretrained(utils.get_original_cwd()+'/'+cfg.output_dir)
        label_map = {i : label for i, label in enumerate(label_list,1)}
        model_config = {"bert_model":cfg.bert_model,"do_lower":cfg.do_lower_case,"max_seq_length":cfg.max_seq_length,"num_labels":len(label_list)+1,"label_map":label_map}
        json.dump(model_config,open(os.path.join(utils.get_original_cwd()+'/'+cfg.output_dir,"model_config.json"),"w"))
        # Load a trained model and config that you have fine-tuned
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        model = TrainNer.from_pretrained(utils.get_original_cwd()+'/'+cfg.output_dir)
        tokenizer = BertTokenizer.from_pretrained(utils.get_original_cwd()+'/'+cfg.output_dir, do_lower_case=cfg.do_lower_case)

    model.to(device)

    if cfg.do_eval:
        if cfg.eval_on == "dev":
            eval_examples = processor.get_dev_examples(utils.get_original_cwd()+'/'+cfg.data_dir)
        elif cfg.eval_on == "test":
            eval_examples = processor.get_test_examples(utils.get_original_cwd()+'/'+cfg.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, label_list, cfg.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,all_valid_ids,all_lmask_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=cfg.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        label_map = {i : label for i, label in enumerate(label_list,1)}
        for input_ids, input_mask, segment_ids, label_ids,valid_ids,l_mask in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            l_mask = l_mask.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask,valid_ids=valid_ids,attention_mask_label=l_mask,device=device)

            logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()

            for i, label in enumerate(label_ids):
                temp_1 = []
                temp_2 = []
                for j,m in enumerate(label):
                    if j == 0:
                        continue
                    elif label_ids[i][j] == len(label_map):
                        y_true.append(temp_1)
                        y_pred.append(temp_2)
                        break
                    else:
                        temp_1.append(label_map[label_ids[i][j]])

                        if logits[i][j] != 0:
                            temp_2.append(label_map[logits[i][j]])
                        else:
                            temp_2.append(0)


        report = classification_report(y_true, y_pred,digits=4)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(utils.get_original_cwd()+'/'+cfg.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)

if __name__ == '__main__':
    main()
