
Commit

2.0.23
yangheng95 committed Jan 3, 2023
1 parent e6100a7 commit dd5a74c
Showing 29 changed files with 141 additions and 944 deletions.
123 changes: 1 addition & 122 deletions examples-v2/aspect_polarity_classification/checkpoints-v2.0.json
@@ -1,122 +1 @@
{
"2.0.0": {
"APC": {
"multilingual": {
"id": "",
"Training Model": "FAST-LSA-T-V2-Deberta",
"Training Dataset": "APCDatasetList.Multilingual",
"Language": "Multilingual",
"Description": "Trained on RTX3090",
"Available Version": "1.10.5+",
"Checkpoint File": "fast_lcf_bert_Multilingual_acc_82.66_f1_82.06.zip",
"Author": "H, Yang ([email protected])"
},
"english": {
"id": "",
"Training Model": "FAST-LSA-T-V2-Deberta",
"Training Dataset": "APCDatasetList.English",
"Language": "English",
"Description": "Trained on RTX3090",
"Available Version": "1.10.5+",
"Checkpoint File": "fast_lsa_t_v2_English_acc_82.21_f1_81.81.zip",
"Author": "H, Yang ([email protected])"
},
"chinese": {
"id": "",
"Training Model": "FAST-LSA-T-V2-Deberta",
"Training Dataset": "APCDatasetList.Chinese",
"Language": "Chinese",
"Description": "Trained on RTX3090",
"Available Version": "1.10.5+",
"Checkpoint File": "fast_lsa_t_v2_Chinese_acc_96.0_f1_95.1.zip",
"Author": "H, Yang ([email protected])"
}
},
"ATEPC": {
"multilingual": {
"id": "",
"Training Model": "FAST-LCF-ATEPC",
"Training Dataset": "ABSADatasets.Multilingual",
"Language": "Multilingual",
"Description": "Trained on RTX3090",
"Available Version": "1.16.0+",
"Checkpoint File": "fast_lcf_atepc_Multilingual_cdw_apcacc_78.08_apcf1_77.81_atef1_75.41.zip",
"Author": "H, Yang ([email protected])"
},
"english": {
"id": "",
"Training Model": "FAST-LCF-ATEPC",
"Training Dataset": "ATEPCDatasetList.English",
"Language": "English",
"Description": "Trained on RTX3090",
"Available Version": "1.10.5+",
"Checkpoint File": "fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43.zip",
"Author": "H, Yang ([email protected])"
},
"chinese": {
"id": "",
"Training Model": "FAST-LCF-ATEPC",
"Training Dataset": "ATEPCDatasetList.Chinese",
"Language": "Chinese",
"Description": "Trained on RTX3090",
"Available Version": "1.10.5+",
"Checkpoint File": "fast_lcf_atepc_Chinese_cdw_apcacc_96.22_apcf1_95.32_atef1_78.73.zip",
"Author": "H, Yang ([email protected])"
}
},
"RNAC": {
"degrad_lstm": {
"id": "",
"Training Model": "LSTM",
"Training Dataset": "ABSADatasets.Multilingual",
"Language": "RNA",
"Description": "Trained on RTX3090",
"Available Version": "1.16.0+",
"Checkpoint File": "lstm_degrad_acc_85.26_f1_84.62.zip",
"Author": "H, Yang ([email protected])"
},
"degrad_bert": {
"id": "",
"Training Model": "MLP",
"Training Dataset": "Degrad",
"Language": "RNA",
"Description": "Trained on RTX3090",
"Available Version": "1.16.0+",
"Checkpoint File": "bert_mlp_degrad_acc_87.44_f1_86.99.zip",
"Author": "H, Yang ([email protected])"
}
},
"TAD": {
"tad-sst2": {
"id": "",
"Training Model": "TAD",
"Training Dataset": "SST2",
"Language": "English",
"Description": "Trained on RTX3090",
"Available Version": "1.15+",
"Checkpoint File": "TAD-SST2.zip",
"Author": "H, Yang ([email protected])"
},
"tad-agnews10k": {
"id": "",
"Training Model": "TAD",
"Training Dataset": "AGNews",
"Language": "English",
"Description": "Trained on RTX3090",
"Available Version": "1.15+",
"Checkpoint File": "TAD-AGNews10K.zip",
"Author": "H, Yang ([email protected])"
},
"tad-amazon": {
"id": "",
"Training Model": "TAD",
"Training Dataset": "AGNews",
"Language": "English",
"Description": "Trained on RTX3090",
"Available Version": "1.15+",
"Checkpoint File": "TAD-Amazon.zip",
"Author": "H, Yang ([email protected])"
}
}
}
}
{"2.0.0": {"APC": {"multilingual": {"id": "", "Training Model": "FAST-LSA-T-V2-Deberta", "Training Dataset": "APCDatasetList.Multilingual", "Language": "Multilingual", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lcf_bert_Multilingual_acc_82.66_f1_82.06.zip", "Author": "H, Yang ([email protected])"}, "english": {"id": "", "Training Model": "FAST-LSA-T-V2-Deberta", "Training Dataset": "APCDatasetList.English", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lsa_t_v2_English_acc_82.21_f1_81.81.zip", "Author": "H, Yang ([email protected])"}, "chinese": {"id": "", "Training Model": "FAST-LSA-T-V2-Deberta", "Training Dataset": "APCDatasetList.Chinese", "Language": "Chinese", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lsa_t_v2_Chinese_acc_96.0_f1_95.1.zip", "Author": "H, Yang ([email protected])"}}, "ATEPC": {"multilingual": {"id": "", "Training Model": "FAST-LCF-ATEPC", "Training Dataset": "ABSADatasets.Multilingual", "Language": "Multilingual", "Description": "Trained on RTX3090", "Available Version": "1.16.0+", "Checkpoint File": "fast_lcf_atepc_Multilingual_cdw_apcacc_78.08_apcf1_77.81_atef1_75.41.zip", "Author": "H, Yang ([email protected])"}, "english": {"id": "", "Training Model": "FAST-LCF-ATEPC", "Training Dataset": "ATEPCDatasetList.English", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43.zip", "Author": "H, Yang ([email protected])"}, "chinese": {"id": "", "Training Model": "FAST-LCF-ATEPC", "Training Dataset": "ATEPCDatasetList.Chinese", "Language": "Chinese", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lcf_atepc_Chinese_cdw_apcacc_96.22_apcf1_95.32_atef1_78.73.zip", "Author": "H, Yang ([email protected])"}}, "RNAC": {"degrad_lstm": {"id": "", "Training Model": "LSTM", "Training Dataset": "ABSADatasets.Multilingual", "Language": "RNA", "Description": "Trained on RTX3090", "Available Version": "1.16.0+", "Checkpoint File": "lstm_degrad_acc_85.26_f1_84.62.zip", "Author": "H, Yang ([email protected])"}, "degrad_bert": {"id": "", "Training Model": "MLP", "Training Dataset": "Degrad", "Language": "RNA", "Description": "Trained on RTX3090", "Available Version": "1.16.0+", "Checkpoint File": "bert_mlp_degrad_acc_87.44_f1_86.99.zip", "Author": "H, Yang ([email protected])"}}, "TAD": {"tad-sst2": {"id": "", "Training Model": "TAD", "Training Dataset": "SST2", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.15+", "Checkpoint File": "TAD-SST2.zip", "Author": "H, Yang ([email protected])"}, "tad-agnews10k": {"id": "", "Training Model": "TAD", "Training Dataset": "AGNews", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.15+", "Checkpoint File": "TAD-AGNews10K.zip", "Author": "H, Yang ([email protected])"}, "tad-amazon": {"id": "", "Training Model": "TAD", "Training Dataset": "AGNews", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.15+", "Checkpoint File": "TAD-Amazon.zip", "Author": "H, Yang ([email protected])"}}}}
2 changes: 1 addition & 1 deletion examples-v2/aspect_polarity_classification/train_apc.py
@@ -69,4 +69,4 @@
# checkpoint_save_mode=ModelSaveOption.DO_NOT_SAVE_MODEL,
auto_device=DeviceTypeOption.AUTO,
)
-trainer.load_trained_model()
\ No newline at end of file
+trainer.load_trained_model()
1 change: 1 addition & 0 deletions pyabsa/framework/dataset_class/dataset_template.py
@@ -39,6 +39,7 @@ def __init__(self, config, tokenizer, dataset_type, **kwargs):
self.config.dataset_file, dataset_type=dataset_type, **kwargs
)
self.data = self.covert_to_tensor(self.data)
+self.data = self.data[: self.config.get("data_num", -1)]
if self.config.get("verbose", True):
self.config.logger.info(
"{} data examples:\n {}".format(dataset_type, self.data[:2])
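The added line caps each split at data_num examples, which is useful for smoke tests. One slicing subtlety is worth noting: with the fallback value of -1, Python's data[:-1] drops the final example rather than keeping everything, whereas a fallback of None would be a true no-op. A small illustration of the behaviour:

# Illustration of the slicing behaviour behind the new data_num cap.
data = list(range(10))

assert data[:5] == [0, 1, 2, 3, 4]   # data_num=5 keeps the first five examples
assert data[:-1] == data[:9]         # the -1 fallback silently drops the last one
assert data[:None] == data           # a None fallback would keep the full split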
@@ -85,8 +85,10 @@ def pad_syntax_based_srd(text, dep_dist, tokenizer, opt):


def prepare_input_for_apc(opt, tokenizer, text_left, text_right, aspect):
+tokenizer = tokenizer.tokenizer
if hasattr(opt, "dynamic_truncate") and opt.dynamic_truncate:
-_max_seq_len = opt.max_seq_len - len(aspect.split(" "))
+reserved_num = 3
+_max_seq_len = opt.max_seq_len - len(aspect.split(" ")) - reserved_num
text_left = text_left.split(" ")
text_right = text_right.split(" ")
if _max_seq_len < (len(text_left) + len(text_right)):
@@ -98,12 +100,10 @@ def prepare_input_for_apc(opt, tokenizer, text_left, text_right, aspect):
text_left = " ".join(text_left)
text_right = " ".join(text_right)

-# tokenizer.bos_token = tokenizer.bos_token if tokenizer.bos_token else '[CLS]'
-# tokenizer.eos_token = tokenizer.eos_token if tokenizer.eos_token else '[SEP]'
-# bos_token = tokenizer.bos_token
-# eos_token = tokenizer.eos_token
-bos_token = ""
-eos_token = ""
+tokenizer.bos_token = tokenizer.bos_token if tokenizer.bos_token else "[CLS]"
+tokenizer.eos_token = tokenizer.eos_token if tokenizer.eos_token else "[SEP]"
+bos_token = tokenizer.bos_token
+eos_token = tokenizer.eos_token

text_raw = text_left + " " + aspect + " " + text_right
text_spc = (
Expand All @@ -115,7 +115,7 @@ def prepare_input_for_apc(opt, tokenizer, text_left, text_right, aspect):
)
aspect_bert_indices = text_to_sequence(tokenizer, aspect, opt.max_seq_len)

-aspect_begin = np.count_nonzero(tokenizer.tokenize(bos_token + " " + text_left))
+aspect_begin = len(tokenizer.tokenize(bos_token + " " + text_left))
aspect_position = set(
range(aspect_begin, aspect_begin + np.count_nonzero(aspect_bert_indices))
)
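Three related fixes land in this helper: the PyABSA tokenizer wrapper is unwrapped once at the top, the dynamic-truncation budget now reserves three positions so the restored BOS/EOS special tokens cannot push the encoding past max_seq_len, and aspect_begin counts the tokenized prefix with a plain len(). A sketch of the budget arithmetic, using hypothetical numbers:

# Sketch of the reserved-slot budget (hypothetical values): special tokens
# must be paid for before the remaining budget is split between the left
# and right context around the aspect.
max_seq_len = 80
aspect = "battery life"
reserved_num = 3                     # e.g. one BOS plus two SEP positions

budget = max_seq_len - len(aspect.split(" ")) - reserved_num
# Without the reservation, left + aspect + right plus the special tokens
# could total max_seq_len + 3 and be silently truncated downstream.
print(budget)                        # 75 positions left for the two contexts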
@@ -5,15 +5,13 @@

import numpy as np
import tqdm
-from torch.utils.data import Dataset

from pyabsa import LabelPaddingOption
from pyabsa.framework.dataset_class.dataset_template import PyABSADataset
from pyabsa.utils.file_utils.file_utils import load_dataset_from_file
from pyabsa.utils.pyabsa_utils import validate_example, fprint
from .classic_bert_apc_utils import prepare_input_for_apc, build_sentiment_window
from .dependency_graph import dependency_adj_matrix, configure_spacy_model
-from ..__lcf__.data_utils_for_inference import parse_sample, ABSAInferenceDataset
+from ..__lcf__.data_utils_for_inference import ABSAInferenceDataset


class BERTABSAInferenceDataset(ABSAInferenceDataset):
@@ -63,7 +61,7 @@ def process_data(self, samples, ignore_error=True):
continue

prepared_inputs = prepare_input_for_apc(
-self.config, self.tokenizer.tokenizer, text_left, text_right, aspect
+self.config, self.tokenizer, text_left, text_right, aspect
)

aspect_position = prepared_inputs["aspect_position"]
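The call-site change pairs with the unwrapping added to prepare_input_for_apc above: callers now hand over the PyABSA tokenizer object and the helper reaches into it itself. A hypothetical sketch of the wrapper shape this assumes:

# Hypothetical sketch of the assumed wrapper relationship: the PyABSA-side
# tokenizer exposes the raw Hugging Face tokenizer under an attribute that
# is itself named `tokenizer`, so the helper can unwrap once at its boundary.
class DummyHFTokenizer:              # stand-in for a transformers tokenizer
    def tokenize(self, text):
        return text.split()

class TokenizerWrapperSketch:        # stand-in for PyABSA's wrapper class
    def __init__(self, hf_tokenizer):
        self.tokenizer = hf_tokenizer

def prepare_input(tokenizer):
    tokenizer = tokenizer.tokenizer  # unwrap, mirroring the new first line
    return tokenizer.tokenize("callers now pass the wrapper itself")

print(prepare_input(TokenizerWrapperSketch(DummyHFTokenizer())))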
@@ -38,7 +38,7 @@ class ClassicAPCModelList(list):
def __init__(self):
super(ClassicAPCModelList, self).__init__(
[
-# self.ASGCN,
+self.ASGCN,
self.AOA,
self.ATAE_LSTM,
self.Cabasc,
@@ -91,30 +91,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
self.model = torch.load(
model_path, map_location=DeviceTypeOption.CPU
)
-with open(tokenizer_path, mode="rb") as f:
-    if hasattr(APCModelList, self.config.model.__name__) or hasattr(
-        BERTBaselineAPCModelList, self.config.model.__name__
-    ):
-        try:
-            if kwargs.get("offline", False):
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    find_cwd_dir(
-                        self.config.pretrained_bert.split("/")[-1]
-                    ),
-                    do_lower_case="uncased"
-                    in self.config.pretrained_bert,
-                )
-            else:
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    self.config.pretrained_bert,
-                    do_lower_case="uncased"
-                    in self.config.pretrained_bert,
-                )
-        except ValueError:
-            self.tokenizer = pickle.load(f)
-    elif hasattr(GloVeAPCModelList, self.config.model.__name__):
-        self.embedding_matrix = self.config.embedding_matrix
-        self.tokenizer = self.config.tokenizer

+self.tokenizer = self.config.tokenizer

if kwargs.get("verbose", False):
fprint("Config used in Training:")
@@ -380,7 +358,7 @@ def _run_prediction(self, save_path=None, print_result=True, **kwargs):
}
)
n_total += 1
if kwargs.get("merge_results", None):
if kwargs.get("merge_results", True):
results = self.merge_results(results)
try:
if print_result:
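The deleted block rebuilt an AutoTokenizer at load time (with an offline directory lookup and a pickled-tokenizer fallback); the replacement simply restores the tokenizer that was serialized into the checkpoint's config, a simplification this commit repeats in the text, RNA, and TAD predictors below. The second hunk flips the merge_results default so merging now happens unless the caller disables it. Both idioms in a sketch (the helper names are hypothetical stand-ins, not PyABSA APIs):

# Sketch of the two idioms introduced in this file.
def merge(results):                  # hypothetical stand-in for merge_results
    return results

def load_tokenizer(config):
    # New pattern: trust the tokenizer serialized with the checkpoint
    # instead of re-downloading or rebuilding it at load time.
    return config.tokenizer

def run_prediction(results, **kwargs):
    # Old default: kwargs.get("merge_results", None) is falsy, so merging
    # only ran when the caller opted in; with a True default it runs
    # unless explicitly disabled via merge_results=False.
    if kwargs.get("merge_results", True):
        results = merge(results)
    return results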
14 changes: 12 additions & 2 deletions pyabsa/tasks/AspectTermExtraction/prediction/aspect_extractor.py
@@ -85,8 +85,18 @@ def __init__(self, checkpoint=None, **kwargs):

if state_dict_path or model_path:
if state_dict_path:
-bert = AutoModel.from_pretrained(self.config.pretrained_bert)
-self.model = self.config.model(bert, self.config)
+if kwargs.get("offline", False):
+    self.bert = AutoModel.from_pretrained(
+        find_cwd_dir(
+            self.config.pretrained_bert.split("/")[-1]
+        ),
+    )
+else:
+    self.bert = AutoModel.from_pretrained(
+        self.config.pretrained_bert,
+    )
+
+self.model = self.config.model(self.bert, self.config)
self.model.load_state_dict(
torch.load(
state_dict_path, map_location=DeviceTypeOption.CPU
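State-dict loading now honors the same offline switch used elsewhere in the loaders: with offline=True the backbone weights are resolved from a local directory named after the last path segment of pretrained_bert (located by find_cwd_dir) instead of the Hugging Face hub. A minimal sketch of that branch, assuming the model directory has already been unpacked somewhere under the working directory:

# Minimal sketch of the offline/online branch; find_cwd_dir is passed in
# here to keep the sketch self-contained.
from transformers import AutoModel

def load_backbone(pretrained_bert, offline, find_cwd_dir):
    if offline:
        # e.g. "microsoft/deberta-v3-base" -> look up a local
        # "deberta-v3-base" directory instead of contacting the hub
        local_dir = find_cwd_dir(pretrained_bert.split("/")[-1])
        return AutoModel.from_pretrained(local_dir)
    return AutoModel.from_pretrained(pretrained_bert)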
@@ -96,12 +96,6 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
model_path, map_location=DeviceTypeOption.CPU
)

-try:
-    self.tokenizer = PretrainedTokenizer(self.config, **kwargs)
-except ValueError:
-    if tokenizer_path:
-        with open(tokenizer_path, mode="rb") as f:
-            self.tokenizer = pickle.load(f)
else:
self.embedding_matrix = self.config.embedding_matrix
self.tokenizer = self.config.tokenizer
@@ -119,6 +113,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
)
)

+self.tokenizer = self.config.tokenizer

if kwargs.get("verbose", False):
fprint("Config used in Training:")
print_args(self.config)
13 changes: 2 additions & 11 deletions pyabsa/tasks/RNAClassification/prediction/rna_classifier.py
@@ -96,19 +96,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
model_path, map_location=DeviceTypeOption.CPU
)

-try:
-    self.tokenizer = PretrainedTokenizer(
-        max_seq_len=self.config.max_seq_len,
-        config=self.config,
-        **kwargs
-    )
-except ValueError:
-    if tokenizer_path:
-        with open(tokenizer_path, mode="rb") as f:
-            self.tokenizer = pickle.load(f)
else:
self.embedding_matrix = self.config.embedding_matrix
self.tokenizer = self.config.tokenizer
if model_path:
self.model = torch.load(
model_path, map_location=DeviceTypeOption.CPU
@@ -123,6 +112,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
)
)

+self.tokenizer = self.config.tokenizer

if kwargs.get("verbose", False):
fprint("Config used in Training:")
print_args(self.config)
2 changes: 1 addition & 1 deletion pyabsa/tasks/RNARegression/instructor/rnar_instructor.py
@@ -282,7 +282,7 @@ def _train_and_evaluate(self, criterion):

for epoch in range(self.config.num_epoch):
patience -= 1
-description = "Epoch:{} | Loss: {}".format(epoch, "nan")
+description = "Epoch:{} | Loss: {}".format(epoch, 0)
iterator = tqdm(self.train_dataloaders[0])
for i_batch, sample_batched in enumerate(iterator):
global_step += 1
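A small cosmetic fix: the per-epoch progress-bar description now starts from a numeric 0 placeholder instead of the string "nan", so the first refresh already reads like a loss value. The surrounding pattern, roughly:

# Sketch of the progress-bar pattern: seed the description with a numeric
# placeholder, then overwrite it with the running loss inside the loop.
from tqdm import tqdm

iterator = tqdm(range(100))
iterator.set_description("Epoch:{} | Loss: {}".format(0, 0))
for step in iterator:
    loss = 1.0 / (step + 1)          # hypothetical running loss
    iterator.set_description("Epoch:{} | Loss: {:.4f}".format(0, loss))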
2 changes: 2 additions & 0 deletions pyabsa/tasks/RNARegression/prediction/rna_regressor.py
@@ -128,6 +128,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
)
)

+self.tokenizer = self.config.tokenizer

if kwargs.get("verbose", False):
fprint("Config used in Training:")
print_args(self.config)
@@ -591,8 +591,8 @@ def _k_fold_train_and_evaluate(self, criterion):
def _evaluate_acc_f1(self, test_dataloader):
# switch model to evaluation mode
self.model.eval()
-n_label_test_correct, n_label_test_total = 0, 0
-n_adv_det_test_correct, n_adv_det_test_total = 0, 0
+n_label_test_correct, n_label_test_total = 1e-10, 1e-10
+n_adv_det_test_correct, n_adv_det_test_total = 1e-10, 1e-10
n_adv_tr_test_correct, n_adv_tr_test_total = 1e-10, 1e-10
t_label_targets_all, t_label_outputs_all = None, None
t_adv_det_targets_all, t_adv_det_outputs_all = None, None
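Seeding the correct/total counters with 1e-10 instead of 0 matches the guard already used for the adversarial-training head on the line below: if a test split contributes no samples to one of the three heads, the later accuracy division degrades to roughly 1.0 instead of raising ZeroDivisionError. In isolation:

# The division-by-zero guard in isolation: with an empty split the
# accuracy evaluates to 1e-10 / 1e-10 == 1.0 instead of raising.
n_correct, n_total = 1e-10, 1e-10

for is_correct in []:                # an empty test split for one head
    n_total += 1
    n_correct += int(is_correct)

accuracy = n_correct / n_total       # no ZeroDivisionError; equals 1.0
print(accuracy)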
12 changes: 2 additions & 10 deletions pyabsa/tasks/TextAdversarialDefense/prediction/tad_classifier.py
@@ -179,16 +179,6 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
model_path, map_location=DeviceTypeOption.CPU
)

-try:
-    self.tokenizer = PretrainedTokenizer(
-        max_seq_len=self.config.max_seq_len,
-        config=self.config,
-        **kwargs
-    )
-except ValueError:
-    if tokenizer_path:
-        with open(tokenizer_path, mode="rb") as f:
-            self.tokenizer = pickle.load(f)
else:
self.embedding_matrix = self.config.embedding_matrix
self.tokenizer = self.config.tokenizer
@@ -206,6 +196,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
)
)

+self.tokenizer = self.config.tokenizer

if kwargs.get("verbose", False):
fprint("Config used in Training:")
print_args(self.config)