From dd5a74c4b2898d4484a8a762106bed76308efe7d Mon Sep 17 00:00:00 2001 From: yangheng95 Date: Tue, 3 Jan 2023 15:50:27 +0000 Subject: [PATCH] 2.0.23 --- .../checkpoints-v2.0.json | 123 +-- .../train_apc.py | 2 +- .../dataset_class/dataset_template.py | 1 + .../__plm__/classic_bert_apc_utils.py | 16 +- .../__plm__/data_utils_for_inference.py | 6 +- .../models/__classic__/__init__.py | 2 +- .../prediction/sentiment_classifier.py | 28 +- .../prediction/aspect_extractor.py | 14 +- .../prediction/code_defect_detector.py | 8 +- .../prediction/rna_classifier.py | 13 +- .../instructor/rnar_instructor.py | 2 +- .../RNARegression/prediction/rna_regressor.py | 2 + .../instructor/tad_instructor.py | 4 +- .../prediction/tad_classifier.py | 12 +- .../prediction/text_classifier.py | 10 +- .../prediction/protein_regressor.py | 10 +- .../prediction/rna_classifier.py | 14 +- release-note.json | 701 ------------------ requirements.txt | 2 +- setup.py | 6 +- unit_test/test_0_clean.py | 31 + unit_test/test_1_dataset_downloading.py | 14 +- unit_test/test_2_tc_pretrain.py | 5 + unit_test/test_3_atepc_pretrain.py | 9 +- unit_test/test_4_apc_pretrain.py | 16 +- unit_test/test_5_adversarial_defense.py | 6 +- .../test_6_apc_inference_set_generation.py | 13 +- unit_test/test_7_augmentation.py | 2 +- unit_test/test_9_clean.py | 13 +- 29 files changed, 141 insertions(+), 944 deletions(-) delete mode 100644 release-note.json create mode 100644 unit_test/test_0_clean.py diff --git a/examples-v2/aspect_polarity_classification/checkpoints-v2.0.json b/examples-v2/aspect_polarity_classification/checkpoints-v2.0.json index adb20ff7e..36aaaa9cd 100644 --- a/examples-v2/aspect_polarity_classification/checkpoints-v2.0.json +++ b/examples-v2/aspect_polarity_classification/checkpoints-v2.0.json @@ -1,122 +1 @@ -{ - "2.0.0": { - "APC": { - "multilingual": { - "id": "", - "Training Model": "FAST-LSA-T-V2-Deberta", - "Training Dataset": "APCDatasetList.Multilingual", - "Language": "Multilingual", - "Description": "Trained on RTX3090", - "Available Version": "1.10.5+", - "Checkpoint File": "fast_lcf_bert_Multilingual_acc_82.66_f1_82.06.zip", - "Author": "H, Yang (hy345@exeter.ac.uk)" - }, - "english": { - "id": "", - "Training Model": "FAST-LSA-T-V2-Deberta", - "Training Dataset": "APCDatasetList.English", - "Language": "English", - "Description": "Trained on RTX3090", - "Available Version": "1.10.5+", - "Checkpoint File": "fast_lsa_t_v2_English_acc_82.21_f1_81.81.zip", - "Author": "H, Yang (hy345@exeter.ac.uk)" - }, - "chinese": { - "id": "", - "Training Model": "FAST-LSA-T-V2-Deberta", - "Training Dataset": "APCDatasetList.Chinese", - "Language": "Chinese", - "Description": "Trained on RTX3090", - "Available Version": "1.10.5+", - "Checkpoint File": "fast_lsa_t_v2_Chinese_acc_96.0_f1_95.1.zip", - "Author": "H, Yang (hy345@exeter.ac.uk)" - } - }, - "ATEPC": { - "multilingual": { - "id": "", - "Training Model": "FAST-LCF-ATEPC", - "Training Dataset": "ABSADatasets.Multilingual", - "Language": "Multilingual", - "Description": "Trained on RTX3090", - "Available Version": "1.16.0+", - "Checkpoint File": "fast_lcf_atepc_Multilingual_cdw_apcacc_78.08_apcf1_77.81_atef1_75.41.zip", - "Author": "H, Yang (hy345@exeter.ac.uk)" - }, - "english": { - "id": "", - "Training Model": "FAST-LCF-ATEPC", - "Training Dataset": "ATEPCDatasetList.English", - "Language": "English", - "Description": "Trained on RTX3090", - "Available Version": "1.10.5+", - "Checkpoint File": "fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43.zip", - "Author": "H, Yang (hy345@exeter.ac.uk)" - }, - "chinese": { - "id": "", - "Training Model": "FAST-LCF-ATEPC", - "Training Dataset": "ATEPCDatasetList.Chinese", - "Language": "Chinese", - "Description": "Trained on RTX3090", - "Available Version": "1.10.5+", - "Checkpoint File": "fast_lcf_atepc_Chinese_cdw_apcacc_96.22_apcf1_95.32_atef1_78.73.zip", - "Author": "H, Yang (hy345@exeter.ac.uk)" - } - }, - "RNAC": { - "degrad_lstm": { - "id": "", - "Training Model": "LSTM", - "Training Dataset": "ABSADatasets.Multilingual", - "Language": "RNA", - "Description": "Trained on RTX3090", - "Available Version": "1.16.0+", - "Checkpoint File": "lstm_degrad_acc_85.26_f1_84.62.zip", - "Author": "H, Yang (hy345@exeter.ac.uk)" - }, - "degrad_bert": { - "id": "", - "Training Model": "MLP", - "Training Dataset": "Degrad", - "Language": "RNA", - "Description": "Trained on RTX3090", - "Available Version": "1.16.0+", - "Checkpoint File": "bert_mlp_degrad_acc_87.44_f1_86.99.zip", - "Author": "H, Yang (hy345@exeter.ac.uk)" - } - }, - "TAD": { - "tad-sst2": { - "id": "", - "Training Model": "TAD", - "Training Dataset": "SST2", - "Language": "English", - "Description": "Trained on RTX3090", - "Available Version": "1.15+", - "Checkpoint File": "TAD-SST2.zip", - "Author": "H, Yang (yangheng@m.scnu.edu.cn)" - }, - "tad-agnews10k": { - "id": "", - "Training Model": "TAD", - "Training Dataset": "AGNews", - "Language": "English", - "Description": "Trained on RTX3090", - "Available Version": "1.15+", - "Checkpoint File": "TAD-AGNews10K.zip", - "Author": "H, Yang (yangheng@m.scnu.edu.cn)" - }, - "tad-amazon": { - "id": "", - "Training Model": "TAD", - "Training Dataset": "AGNews", - "Language": "English", - "Description": "Trained on RTX3090", - "Available Version": "1.15+", - "Checkpoint File": "TAD-Amazon.zip", - "Author": "H, Yang (yangheng@m.scnu.edu.cn)" - } - } - } -} \ No newline at end of file +{"2.0.0": {"APC": {"multilingual": {"id": "", "Training Model": "FAST-LSA-T-V2-Deberta", "Training Dataset": "APCDatasetList.Multilingual", "Language": "Multilingual", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lcf_bert_Multilingual_acc_82.66_f1_82.06.zip", "Author": "H, Yang (hy345@exeter.ac.uk)"}, "english": {"id": "", "Training Model": "FAST-LSA-T-V2-Deberta", "Training Dataset": "APCDatasetList.English", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lsa_t_v2_English_acc_82.21_f1_81.81.zip", "Author": "H, Yang (hy345@exeter.ac.uk)"}, "chinese": {"id": "", "Training Model": "FAST-LSA-T-V2-Deberta", "Training Dataset": "APCDatasetList.Chinese", "Language": "Chinese", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lsa_t_v2_Chinese_acc_96.0_f1_95.1.zip", "Author": "H, Yang (hy345@exeter.ac.uk)"}}, "ATEPC": {"multilingual": {"id": "", "Training Model": "FAST-LCF-ATEPC", "Training Dataset": "ABSADatasets.Multilingual", "Language": "Multilingual", "Description": "Trained on RTX3090", "Available Version": "1.16.0+", "Checkpoint File": "fast_lcf_atepc_Multilingual_cdw_apcacc_78.08_apcf1_77.81_atef1_75.41.zip", "Author": "H, Yang (hy345@exeter.ac.uk)"}, "english": {"id": "", "Training Model": "FAST-LCF-ATEPC", "Training Dataset": "ATEPCDatasetList.English", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43.zip", "Author": "H, Yang (hy345@exeter.ac.uk)"}, "chinese": {"id": "", "Training Model": "FAST-LCF-ATEPC", "Training Dataset": "ATEPCDatasetList.Chinese", "Language": "Chinese", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lcf_atepc_Chinese_cdw_apcacc_96.22_apcf1_95.32_atef1_78.73.zip", "Author": "H, Yang (hy345@exeter.ac.uk)"}}, "RNAC": {"degrad_lstm": {"id": "", "Training Model": "LSTM", "Training Dataset": "ABSADatasets.Multilingual", "Language": "RNA", "Description": "Trained on RTX3090", "Available Version": "1.16.0+", "Checkpoint File": "lstm_degrad_acc_85.26_f1_84.62.zip", "Author": "H, Yang (hy345@exeter.ac.uk)"}, "degrad_bert": {"id": "", "Training Model": "MLP", "Training Dataset": "Degrad", "Language": "RNA", "Description": "Trained on RTX3090", "Available Version": "1.16.0+", "Checkpoint File": "bert_mlp_degrad_acc_87.44_f1_86.99.zip", "Author": "H, Yang (hy345@exeter.ac.uk)"}}, "TAD": {"tad-sst2": {"id": "", "Training Model": "TAD", "Training Dataset": "SST2", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.15+", "Checkpoint File": "TAD-SST2.zip", "Author": "H, Yang (yangheng@m.scnu.edu.cn)"}, "tad-agnews10k": {"id": "", "Training Model": "TAD", "Training Dataset": "AGNews", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.15+", "Checkpoint File": "TAD-AGNews10K.zip", "Author": "H, Yang (yangheng@m.scnu.edu.cn)"}, "tad-amazon": {"id": "", "Training Model": "TAD", "Training Dataset": "AGNews", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.15+", "Checkpoint File": "TAD-Amazon.zip", "Author": "H, Yang (yangheng@m.scnu.edu.cn)"}}}} \ No newline at end of file diff --git a/examples-v2/aspect_polarity_classification/train_apc.py b/examples-v2/aspect_polarity_classification/train_apc.py index afb548eee..dc658d9ec 100644 --- a/examples-v2/aspect_polarity_classification/train_apc.py +++ b/examples-v2/aspect_polarity_classification/train_apc.py @@ -69,4 +69,4 @@ # checkpoint_save_mode=ModelSaveOption.DO_NOT_SAVE_MODEL, auto_device=DeviceTypeOption.AUTO, ) - trainer.load_trained_model() \ No newline at end of file + trainer.load_trained_model() diff --git a/pyabsa/framework/dataset_class/dataset_template.py b/pyabsa/framework/dataset_class/dataset_template.py index 29c02b3dc..ba1845b25 100644 --- a/pyabsa/framework/dataset_class/dataset_template.py +++ b/pyabsa/framework/dataset_class/dataset_template.py @@ -39,6 +39,7 @@ def __init__(self, config, tokenizer, dataset_type, **kwargs): self.config.dataset_file, dataset_type=dataset_type, **kwargs ) self.data = self.covert_to_tensor(self.data) + self.data = self.data[: self.config.get("data_num", -1)] if self.config.get("verbose", True): self.config.logger.info( "{} data examples:\n {}".format(dataset_type, self.data[:2]) diff --git a/pyabsa/tasks/AspectPolarityClassification/dataset_utils/__plm__/classic_bert_apc_utils.py b/pyabsa/tasks/AspectPolarityClassification/dataset_utils/__plm__/classic_bert_apc_utils.py index 0a982ba2b..65fc2b247 100644 --- a/pyabsa/tasks/AspectPolarityClassification/dataset_utils/__plm__/classic_bert_apc_utils.py +++ b/pyabsa/tasks/AspectPolarityClassification/dataset_utils/__plm__/classic_bert_apc_utils.py @@ -85,8 +85,10 @@ def pad_syntax_based_srd(text, dep_dist, tokenizer, opt): def prepare_input_for_apc(opt, tokenizer, text_left, text_right, aspect): + tokenizer = tokenizer.tokenizer if hasattr(opt, "dynamic_truncate") and opt.dynamic_truncate: - _max_seq_len = opt.max_seq_len - len(aspect.split(" ")) + reserved_num = 3 + _max_seq_len = opt.max_seq_len - len(aspect.split(" ")) - reserved_num text_left = text_left.split(" ") text_right = text_right.split(" ") if _max_seq_len < (len(text_left) + len(text_right)): @@ -98,12 +100,10 @@ def prepare_input_for_apc(opt, tokenizer, text_left, text_right, aspect): text_left = " ".join(text_left) text_right = " ".join(text_right) - # tokenizer.bos_token = tokenizer.bos_token if tokenizer.bos_token else '[CLS]' - # tokenizer.eos_token = tokenizer.eos_token if tokenizer.eos_token else '[SEP]' - # bos_token = tokenizer.bos_token - # eos_token = tokenizer.eos_token - bos_token = "" - eos_token = "" + tokenizer.bos_token = tokenizer.bos_token if tokenizer.bos_token else "[CLS]" + tokenizer.eos_token = tokenizer.eos_token if tokenizer.eos_token else "[SEP]" + bos_token = tokenizer.bos_token + eos_token = tokenizer.eos_token text_raw = text_left + " " + aspect + " " + text_right text_spc = ( @@ -115,7 +115,7 @@ def prepare_input_for_apc(opt, tokenizer, text_left, text_right, aspect): ) aspect_bert_indices = text_to_sequence(tokenizer, aspect, opt.max_seq_len) - aspect_begin = np.count_nonzero(tokenizer.tokenize(bos_token + " " + text_left)) + aspect_begin = len(tokenizer.tokenize(bos_token + " " + text_left)) aspect_position = set( range(aspect_begin, aspect_begin + np.count_nonzero(aspect_bert_indices)) ) diff --git a/pyabsa/tasks/AspectPolarityClassification/dataset_utils/__plm__/data_utils_for_inference.py b/pyabsa/tasks/AspectPolarityClassification/dataset_utils/__plm__/data_utils_for_inference.py index 5eabbcdda..a263c2317 100644 --- a/pyabsa/tasks/AspectPolarityClassification/dataset_utils/__plm__/data_utils_for_inference.py +++ b/pyabsa/tasks/AspectPolarityClassification/dataset_utils/__plm__/data_utils_for_inference.py @@ -5,15 +5,13 @@ import numpy as np import tqdm -from torch.utils.data import Dataset from pyabsa import LabelPaddingOption from pyabsa.framework.dataset_class.dataset_template import PyABSADataset -from pyabsa.utils.file_utils.file_utils import load_dataset_from_file from pyabsa.utils.pyabsa_utils import validate_example, fprint from .classic_bert_apc_utils import prepare_input_for_apc, build_sentiment_window from .dependency_graph import dependency_adj_matrix, configure_spacy_model -from ..__lcf__.data_utils_for_inference import parse_sample, ABSAInferenceDataset +from ..__lcf__.data_utils_for_inference import ABSAInferenceDataset class BERTABSAInferenceDataset(ABSAInferenceDataset): @@ -63,7 +61,7 @@ def process_data(self, samples, ignore_error=True): continue prepared_inputs = prepare_input_for_apc( - self.config, self.tokenizer.tokenizer, text_left, text_right, aspect + self.config, self.tokenizer, text_left, text_right, aspect ) aspect_position = prepared_inputs["aspect_position"] diff --git a/pyabsa/tasks/AspectPolarityClassification/models/__classic__/__init__.py b/pyabsa/tasks/AspectPolarityClassification/models/__classic__/__init__.py index 69104656f..e5f7d3c6a 100644 --- a/pyabsa/tasks/AspectPolarityClassification/models/__classic__/__init__.py +++ b/pyabsa/tasks/AspectPolarityClassification/models/__classic__/__init__.py @@ -38,7 +38,7 @@ class ClassicAPCModelList(list): def __init__(self): super(ClassicAPCModelList, self).__init__( [ - # self.ASGCN, + self.ASGCN, self.AOA, self.ATAE_LSTM, self.Cabasc, diff --git a/pyabsa/tasks/AspectPolarityClassification/prediction/sentiment_classifier.py b/pyabsa/tasks/AspectPolarityClassification/prediction/sentiment_classifier.py index 15cd512f4..e9ddd64a8 100644 --- a/pyabsa/tasks/AspectPolarityClassification/prediction/sentiment_classifier.py +++ b/pyabsa/tasks/AspectPolarityClassification/prediction/sentiment_classifier.py @@ -91,30 +91,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): self.model = torch.load( model_path, map_location=DeviceTypeOption.CPU ) - with open(tokenizer_path, mode="rb") as f: - if hasattr(APCModelList, self.config.model.__name__) or hasattr( - BERTBaselineAPCModelList, self.config.model.__name__ - ): - try: - if kwargs.get("offline", False): - self.tokenizer = AutoTokenizer.from_pretrained( - find_cwd_dir( - self.config.pretrained_bert.split("/")[-1] - ), - do_lower_case="uncased" - in self.config.pretrained_bert, - ) - else: - self.tokenizer = AutoTokenizer.from_pretrained( - self.config.pretrained_bert, - do_lower_case="uncased" - in self.config.pretrained_bert, - ) - except ValueError: - self.tokenizer = pickle.load(f) - elif hasattr(GloVeAPCModelList, self.config.model.__name__): - self.embedding_matrix = self.config.embedding_matrix - self.tokenizer = self.config.tokenizer + + self.tokenizer = self.config.tokenizer if kwargs.get("verbose", False): fprint("Config used in Training:") @@ -380,7 +358,7 @@ def _run_prediction(self, save_path=None, print_result=True, **kwargs): } ) n_total += 1 - if kwargs.get("merge_results", None): + if kwargs.get("merge_results", True): results = self.merge_results(results) try: if print_result: diff --git a/pyabsa/tasks/AspectTermExtraction/prediction/aspect_extractor.py b/pyabsa/tasks/AspectTermExtraction/prediction/aspect_extractor.py index 63add42f2..24879880f 100644 --- a/pyabsa/tasks/AspectTermExtraction/prediction/aspect_extractor.py +++ b/pyabsa/tasks/AspectTermExtraction/prediction/aspect_extractor.py @@ -85,8 +85,18 @@ def __init__(self, checkpoint=None, **kwargs): if state_dict_path or model_path: if state_dict_path: - bert = AutoModel.from_pretrained(self.config.pretrained_bert) - self.model = self.config.model(bert, self.config) + if kwargs.get("offline", False): + self.bert = AutoModel.from_pretrained( + find_cwd_dir( + self.config.pretrained_bert.split("/")[-1] + ), + ) + else: + self.bert = AutoModel.from_pretrained( + self.config.pretrained_bert, + ) + + self.model = self.config.model(self.bert, self.config) self.model.load_state_dict( torch.load( state_dict_path, map_location=DeviceTypeOption.CPU diff --git a/pyabsa/tasks/CodeDefectDetection/prediction/code_defect_detector.py b/pyabsa/tasks/CodeDefectDetection/prediction/code_defect_detector.py index 33c72bfa3..88c9b543a 100644 --- a/pyabsa/tasks/CodeDefectDetection/prediction/code_defect_detector.py +++ b/pyabsa/tasks/CodeDefectDetection/prediction/code_defect_detector.py @@ -96,12 +96,6 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): model_path, map_location=DeviceTypeOption.CPU ) - try: - self.tokenizer = PretrainedTokenizer(self.config, **kwargs) - except ValueError: - if tokenizer_path: - with open(tokenizer_path, mode="rb") as f: - self.tokenizer = pickle.load(f) else: self.embedding_matrix = self.config.embedding_matrix self.tokenizer = self.config.tokenizer @@ -119,6 +113,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): ) ) + self.tokenizer = self.config.tokenizer + if kwargs.get("verbose", False): fprint("Config used in Training:") print_args(self.config) diff --git a/pyabsa/tasks/RNAClassification/prediction/rna_classifier.py b/pyabsa/tasks/RNAClassification/prediction/rna_classifier.py index 7a27d44ab..e3202232b 100644 --- a/pyabsa/tasks/RNAClassification/prediction/rna_classifier.py +++ b/pyabsa/tasks/RNAClassification/prediction/rna_classifier.py @@ -96,19 +96,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): model_path, map_location=DeviceTypeOption.CPU ) - try: - self.tokenizer = PretrainedTokenizer( - max_seq_len=self.config.max_seq_len, - config=self.config, - **kwargs - ) - except ValueError: - if tokenizer_path: - with open(tokenizer_path, mode="rb") as f: - self.tokenizer = pickle.load(f) else: self.embedding_matrix = self.config.embedding_matrix - self.tokenizer = self.config.tokenizer if model_path: self.model = torch.load( model_path, map_location=DeviceTypeOption.CPU @@ -123,6 +112,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): ) ) + self.tokenizer = self.config.tokenizer + if kwargs.get("verbose", False): fprint("Config used in Training:") print_args(self.config) diff --git a/pyabsa/tasks/RNARegression/instructor/rnar_instructor.py b/pyabsa/tasks/RNARegression/instructor/rnar_instructor.py index 2d5d43d22..c3d649e19 100644 --- a/pyabsa/tasks/RNARegression/instructor/rnar_instructor.py +++ b/pyabsa/tasks/RNARegression/instructor/rnar_instructor.py @@ -282,7 +282,7 @@ def _train_and_evaluate(self, criterion): for epoch in range(self.config.num_epoch): patience -= 1 - description = "Epoch:{} | Loss: {}".format(epoch, "nan") + description = "Epoch:{} | Loss: {}".format(epoch, 0) iterator = tqdm(self.train_dataloaders[0]) for i_batch, sample_batched in enumerate(iterator): global_step += 1 diff --git a/pyabsa/tasks/RNARegression/prediction/rna_regressor.py b/pyabsa/tasks/RNARegression/prediction/rna_regressor.py index 6f739b1b9..47aa6fee1 100644 --- a/pyabsa/tasks/RNARegression/prediction/rna_regressor.py +++ b/pyabsa/tasks/RNARegression/prediction/rna_regressor.py @@ -128,6 +128,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): ) ) + self.tokenizer = self.config.tokenizer + if kwargs.get("verbose", False): fprint("Config used in Training:") print_args(self.config) diff --git a/pyabsa/tasks/TextAdversarialDefense/instructor/tad_instructor.py b/pyabsa/tasks/TextAdversarialDefense/instructor/tad_instructor.py index 4c2576506..7932a4aa2 100644 --- a/pyabsa/tasks/TextAdversarialDefense/instructor/tad_instructor.py +++ b/pyabsa/tasks/TextAdversarialDefense/instructor/tad_instructor.py @@ -591,8 +591,8 @@ def _k_fold_train_and_evaluate(self, criterion): def _evaluate_acc_f1(self, test_dataloader): # switch model to evaluation mode self.model.eval() - n_label_test_correct, n_label_test_total = 0, 0 - n_adv_det_test_correct, n_adv_det_test_total = 0, 0 + n_label_test_correct, n_label_test_total = 1e-10, 1e-10 + n_adv_det_test_correct, n_adv_det_test_total = 1e-10, 1e-10 n_adv_tr_test_correct, n_adv_tr_test_total = 1e-10, 1e-10 t_label_targets_all, t_label_outputs_all = None, None t_adv_det_targets_all, t_adv_det_outputs_all = None, None diff --git a/pyabsa/tasks/TextAdversarialDefense/prediction/tad_classifier.py b/pyabsa/tasks/TextAdversarialDefense/prediction/tad_classifier.py index 67f3886ea..0ff772ee9 100644 --- a/pyabsa/tasks/TextAdversarialDefense/prediction/tad_classifier.py +++ b/pyabsa/tasks/TextAdversarialDefense/prediction/tad_classifier.py @@ -179,16 +179,6 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): model_path, map_location=DeviceTypeOption.CPU ) - try: - self.tokenizer = PretrainedTokenizer( - max_seq_len=self.config.max_seq_len, - config=self.config, - **kwargs - ) - except ValueError: - if tokenizer_path: - with open(tokenizer_path, mode="rb") as f: - self.tokenizer = pickle.load(f) else: self.embedding_matrix = self.config.embedding_matrix self.tokenizer = self.config.tokenizer @@ -206,6 +196,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): ) ) + self.tokenizer = self.config.tokenizer + if kwargs.get("verbose", False): fprint("Config used in Training:") print_args(self.config) diff --git a/pyabsa/tasks/TextClassification/prediction/text_classifier.py b/pyabsa/tasks/TextClassification/prediction/text_classifier.py index e18ff0da3..78787641a 100644 --- a/pyabsa/tasks/TextClassification/prediction/text_classifier.py +++ b/pyabsa/tasks/TextClassification/prediction/text_classifier.py @@ -14,7 +14,7 @@ from findfile import find_file, find_cwd_dir from termcolor import colored from torch.utils.data import DataLoader -from transformers import AutoModel +from transformers import AutoModel, AutoTokenizer from sklearn import metrics @@ -104,12 +104,6 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): model_path, map_location=DeviceTypeOption.CPU ) - try: - self.tokenizer = PretrainedTokenizer(self.config, **kwargs) - except ValueError: - if tokenizer_path: - with open(tokenizer_path, mode="rb") as f: - self.tokenizer = pickle.load(f) else: self.embedding_matrix = self.config.embedding_matrix self.tokenizer = self.config.tokenizer @@ -127,6 +121,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): ) ) + self.tokenizer = self.config.tokenizer + if kwargs.get("verbose", False): fprint("Config used in Training:") print_args(self.config) diff --git a/pyabsa/tasks/_Archive/ProteinRegression/prediction/protein_regressor.py b/pyabsa/tasks/_Archive/ProteinRegression/prediction/protein_regressor.py index 797e0a14b..0b416de97 100644 --- a/pyabsa/tasks/_Archive/ProteinRegression/prediction/protein_regressor.py +++ b/pyabsa/tasks/_Archive/ProteinRegression/prediction/protein_regressor.py @@ -103,14 +103,6 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): model_path, map_location=DeviceTypeOption.CPU ) - try: - self.tokenizer = PretrainedTokenizer( - config=self.config, **kwargs - ) - except ValueError: - if tokenizer_path: - with open(tokenizer_path, mode="rb") as f: - self.tokenizer = pickle.load(f) else: self.embedding_matrix = self.config.embedding_matrix self.tokenizer = self.config.tokenizer @@ -128,6 +120,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): ) ) + self.tokenizer = self.config.tokenizer + if kwargs.get("verbose", False): fprint("Config used in Training:") print_args(self.config) diff --git a/pyabsa/tasks/_Archive/RNAClassification/prediction/rna_classifier.py b/pyabsa/tasks/_Archive/RNAClassification/prediction/rna_classifier.py index 32c137564..acc2da0fe 100644 --- a/pyabsa/tasks/_Archive/RNAClassification/prediction/rna_classifier.py +++ b/pyabsa/tasks/_Archive/RNAClassification/prediction/rna_classifier.py @@ -95,19 +95,7 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): self.model = torch.load( model_path, map_location=DeviceTypeOption.CPU ) - - try: - self.tokenizer = PretrainedTokenizer( - max_seq_len=self.config.max_seq_len, - config=self.config, - **kwargs - ) - except ValueError: - if tokenizer_path: - with open(tokenizer_path, mode="rb") as f: - self.tokenizer = pickle.load(f) else: - self.tokenizer = self.config.tokenizer self.embedding_matrix = self.config.embedding_matrix if model_path: @@ -124,6 +112,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs): ) ) + self.tokenizer = self.config.tokenizer + if kwargs.get("verbose", False): fprint("Config used in Training:") print_args(self.config) diff --git a/release-note.json b/release-note.json deleted file mode 100644 index 79ac54e7e..000000000 --- a/release-note.json +++ /dev/null @@ -1,701 +0,0 @@ -{ - "2.0.23": { - "1": "Source code reformat based on black" - }, - "2.0.22": { - "1": "Tokenizer fix" - }, - "2.0.21": { - "1": "Fix the description of the metrics, now the metrics are displayed in Dev/Valid metric" - }, - "2.0.20": { - "1": "Fix the VoteEnsemblePredictor" - }, - "2.0.19": { - "1": "Add VoteEnsemblePredictor to support voting ensemble prediction, see https://github.com/yangheng95/PyABSA/blob/v2/examples-v2/ensemble_inference.py" - }, - "2.0.18": { - "1": "Default to disable torch.compile(), set config.use_torch_compile=True to enabled to enable", - "2": "Minor improvements" - }, - "2.0.17": { - "1": "Fix a potential bug while using auto-device", - "2": "General improvements" - }, - "2.0.16": { - "1": "Add PyTorch 2.0 torch.compile() feature, which get considerable speedup on model training. You can install preview version of PyTorch 2.0 following to https://pytorch.org/get-started/pytorch-2.0/#getting-started" - }, - "2.0.15": { - "1": "Fix available checkpoints printing in all_available_checkpoints()" - }, - "2.0.14(12,13)": { - "1": "Fix device setting in inference, to set auto_device to set device in a inference class", - "2": "Fix a bug in APC inference", - "3": "Other improvements" - }, - "2.0.11": { - "1": "Fix auto-augmentation API", - "2": "You can see the augmentation examples in the examples folder", - "3": "Fix a typo in training progress bar", - "4": "Fix a bug for compatible inference with the v1.x versions" - }, - "2.0.10": { - "1": "Now predict() API can be used to predict a list of a texts." - }, - "2.0.9": { - "1": "Now predict() API can be used to predict a list of a texts.", - "2": "Deprecated version because it use a feature from python3.9 and above" - }, - "2.0.8(7)": { - "1": "Add Word2Vec training and BPE tokenizer training APIs and examples" - }, - "2.0.6(5)": { - "1": "Minor fixes and improvements" - }, - "2.0.4(3)": { - "1": "Add compatibility with the inference API of v1.x, but the checkpoints are not compatible with v1.x, so you may find there is no match checkpoint in your old code.", - "2": "Fix resuming from checkpoint bug in ATEPC" - }, - "2.0.2(0,1)": { - "1": "Refactor major package implementation; the package is now organized by NLP tasks, and the package is now more modular. The package structure is expected to be stable in the future.", - "2": "Refactor inference model initialization methods, you need to load a inference using SentimentClassifier(), etc. All the inference APIs are named with predict and batch_predict now.", - "3": "Add tokenization, training, inference and other support for RNA sequence processing", - "4": "The online demos are stilled based on the old version, please wait for the update", - "5": "The auto-text augmentation is not available in this version, please wait for the update", - "6": "The checkpoints also save the tokenizer and embedding(glove, etc), so you can load the tokenizer, embedding from the checkpoint", - "7": "You need to remove old datasets for v1.x. The dataset annotation has been changed; the input and and label are separated by $LABEL$. The padding label is -100 now", - "8": "You can set eval_batch_size in inference (i.e., predict or batch_predict) to save memory", - "9": "You will be able to pass a list of data to fine-tune existing checkpoints in the next version, which is very useful for online learning" - }, - "1.16.26": { - "1": "Improves Config Manager" - }, - "1.16.25": { - "1": "Improves stability of dataset search" - }, - "1.16.24": { - "1": "Improves stability of dataset search" - }, - "1.16.23": { - "1": "Fix the positions output of ATEPC", - "2": "Fix a demo bug", - "3": "Fix the make_ABSA_dataset function" - }, - "1.16.22": { - "1": "Add tokenizer support for apc-to-atepc dataset conversion", - "2": "Fix kaggle dataset number" - }, - "1.16.21": { - "1": "Minor revisions(fix ckpt index)", - "2": "Fix aspect position tagging in ATEPC results" - }, - "1.16.19(20)": { - "1": "Minor fixes and improvements", - "2": "Fix some examples" - }, - "1.16.18": { - "1": "Fix text classification training", - "2": "remove support 1.14.3 and earlier versions", - "3": "Remove default prints about available checkpoints", - "4": "Remove default prints about training args" - }, - "1.16.17": { - "1": "Add Kaggle dataset" - }, - "1.16.16": { - "1": "Modify the inference file location strategy", - "2": "Add use_amp option for using pytorch amp to accelerate training" - }, - "1.16.15": { - "1": "Fix conversion of apc datasets to atepc datasets" - }, - "1.16.14": { - "1": "Remove dataset version validation in PyABSA module init", - "2": "Remove example validation for ATEPC inference" - }, - "1.16.13": { - "1": "Add empty line check for APC training and dataset conversion" - }, - "1.16.11": { - "1": "Fix a issue which prevents the augmentation dataset loading" - }, - "1.16.10": { - "1": "Try to fix a bug in ATEPC inference related to https://github.com/yangheng95/PyABSA/issues/217", - "2": "update metric-visualizer version to 0.5.3" - }, - "1.16.9": { - "1": "Deprecated" - }, - "1.16.8": { - "1": "Add auto-augmentation for ATEPC", - "2": "Fix a bug in ATEPC dataset conversion" - }, - "1.16.7": { - "1": "Add auto-augmentation (APC & TC) for custom datasets (experimental feature), 1. merge dataset into integrated_datasets (locally or PR), 2. set load_aug=True in trainers, will auto-augment your custom dataset when training" - }, - "1.16.6.1": { - "1": "Improve conversion of apc dataset to atepc dataset" - }, - "1.16.6": { - "1": "Remove the option to read the local checkpoints.json in checkpoint downloading", - "2": "Some prints modifications to the console", - "3": "Update findfile dependency to 1.7.9.8", - "4": "Update metric-visualizer dependency to >= 0.5.0", - "5": "Will remove Google Drive checkpoints hosting soon" - }, - "1.16.5": { - "1": "Modify some output printing", - "2": "Fix offline pretrained model loading for text classification" - }, - "1.16.4": { - "1": "Add validation set support for ATEPC", - "2": "Other improvements" - }, - "1.16.3": { - "1": "fix patch for 1.16.2" - }, - "1.16.2": { - "1": "fix a dataset detection issue", - "2": "Add probability distribution and confidence to the sentiment classification in ATEPC model", - "3": "Add a experimental feature to automatic annotate the APC amd ATEPC dataset using aspect extractor provided by PyABSA, see demos/aspect_text_extraction/extract_aspect_and_make_dataset.py" - }, - "1.16.1": { - "1": "fix some DatasetItem" - }, - "1.16.0": { - "1": "Fix a checkpoint downloading and inflation bug which prevents loading a checkpoint from huggingface spaces", - "2": "Fix a important bug which cause unexpected low performance when performing ATEPC inference for Chinese language (and possibly other non-latin languages)", - "3": "Minor modifications" - }, - "1.15.7": { - "1": "Update findfile dependency to 1.7.9.5", - "2": "Default to activate use_bert_spc for ATEPC models to improve ATE and APC performance", - "3": "Minor fixes" - }, - "1.15.6": { - "1": "Add classification report (including precision, recall, F1) display after training, set config.show_metric to activate", - "2": "Add offline option to load huggingface model in inference: set get_xxx(offline=True) to auto detect and load local pretrained model", - "3": "Fix lcf-fusion in fast-lsa-t-v2", - "4": "Fix some typos", - "5": "Rename MOOC-En dataset to MOOC_En" - }, - "1.15.5": { - "1": "Minor fixes of tad inference" - }, - "1.15.4": { - "1": "Fix some bugs", - "2": "Add some new features", - "3": "Add a english MOOC dataset" - }, - "1.15.0": { - "1": "Fix some bugs" - }, - "1.14.8": { - "1": "Minor fixes" - }, - "1.14.7": { - "1": "Refactor optimizer support, you can use torch optimizer either by a string or an optimizer object from torch.optim", - "2": "Fix checkpoint saving in some scenarios", - "3": "Minor fixes" - }, - "1.14.6": { - "1": "Minor fixes" - }, - "1.14.5": { - "1": "Revert Python version dependency", - "2": "Revert Torch dependency" - }, - "1.14.4": { - "1": "This version contain breaking experimental changes, if you find any bug please roll back and report on Github" - }, - "1.14.3(2)": { - "1": "Refactor pre-tokenization before inference for multilingual ATEPC", - "2": "Reset default checkpoint host to Huggingface Hub (useful for Chinese users), please test and report if it works", - "3": "Other bug fixes and improvement, see source code" - }, - "1.14.1": { - "1": "Minor update" - }, - "1.14.0": { - "1": "Bug fixes" - }, - "1.13.4": { - "1": "Refactor output save format", - "2": "Register SemEval2016Task5 datasets in PyABSA", - "3": "Add More language support for ATEPC" - }, - "1.13.3": { - "1": "Improve quality of aspect term extraction results" - }, - "1.13.0(1,2)": { - "1": "Bug Fixes" - }, - "1.10.6": { - "1": "Fix latent resource warning" - }, - "1.10.5": { - "1": "Fix config check function", - "2": "Fix inference of baseline APC models", - "3": "Some refactor, run tests and improve stability", - "4": "Remove SSW APC models" - }, - "1.10.4": { - "1": "Fix a bug in dataset detection, which may cause unexpected dataset mis-detection" - }, - "1.10.3": { - "1": "Add V2 for LSA models, note V2 is not the better model for all scenarios" - }, - "1.10.2": { - "1": "General Update" - }, - "1.10.0": { - "1": "Add more IOB tag support, ref: https://github.com/yangheng95/PyABSA/issues/161", - "2": "WARNING: Modify some models, and some checkpoints on Google Drive may be unavailable due to this update" - }, - "1.9.6": { - "1": "Revert a change causing APC inference fault", - "2": "Modify some default hyper-params", - "3": "Add warmup support, e.g., config.warmup_step=1000" - }, - "1.9.5": { - "1": "Set default optimizers to AdamW" - }, - "1.9.4": { - "1": "Add LSA support for BERT-SPC models" - }, - "1.9.3": { - "1": "Test Version, No important modification" - }, - "1.9.2": { - "1": "General update, ref https://github.com/yangheng95/PyABSA/issues/159" - }, - "1.9.1": { - "1": "Fix a bug in auto hidden_dim and embed_dim setting" - }, - "1.9.0": { - "1": "Deprecate hidden_dim and embed_dim setting of pretrained models", - "2": "Fix ATEPC metric printing", - "3": "Add huggingface space support for ATEPC inference" - }, - "1.8.41": { - "1": "Fix output order of ATEPC inference", - "2": "Fix process multi-aspect sentence in ATEPC" - }, - "1.8.40": { - "1": "Fix valid set loading in BertBaseline APC training", - "2": "Fix a bug in multi-cuda training of text classification " - }, - "1.8.39(38)": { - "1": "Fix the no decay bug in ATEPC training", - "2": "Add apex support (no test yet)" - }, - "1.8.37": { - "1": "Fix the fine-tuned bert save function in text classification", - "2": "Add SST entry in ClassificationDatasetList" - }, - "1.8.36": { - "1": "Fix the fine-tuned bert save function", - "2": "Add notification for augment dataset usage" - }, - "1.8.34(35)": { - "1": "Refactor the cache strategy to avoid cache loading error" - }, - "1.8.33": { - "1": "Modify the version requirement" - }, - "1.8.32": { - "1": "This patch fixes the sentiment prediction in ATEPC", - "2": "Fix a training problem in ATEPC" - }, - "1.8.30": { - "1": "This patch fixes the checkpoint downloading problem" - }, - "1.8.29": { - "1": "Migrate googledrivedownloader to gdown, add a hint for Google Drive's large file download restriction", - "2": "Fix ASGCN, ASGCN-BERT" - }, - "1.8.28": { - "1": "This patch fix a problem in GloVe-based text classification" - }, - "1.8.26": { - "1": "Code review & minor fixes" - }, - "1.8.25": { - "1": "Add raw LSA support option for TNet-LF and ASGCN-BERT", - "2": "Fix some problems", - "3": "General maintenance without feature update" - }, - "1.8.24": { - "1": "Revise some printing" - }, - "1.8.23": { - "1": "Fix a problem of dataset loading" - }, - "1.8.22": { - "1": "Fix path of 1.8.21" - }, - "1.8.21": { - "1": "Activate retry in case of handle network error", - "2": "Remake metric summary board" - }, - "1.8.20": { - "1": "Update version dependency of MetricVisualizer" - }, - "1.8.19": { - "1": "Fix a problem about show_metric option" - }, - "1.8.16(17,18)": { - "1": "Add simple MetricVisualizer (https://github.com/yangheng95/MetricVisualizer) integration. if you dont want to use MetricVisualizer, please set config.show_metric=False" - }, - "1.8.15": { - "1": "Minor revisions" - }, - "1.8.14": { - "1": "Fix text classification inference using pretrained model (GloVe based inference is not affected)", - "2": "Improve stability in using GloVe based model (include APC, TC)" - }, - "1.8.13": { - "1": "Add confidence in text classification output" - }, - "1.8.12": { - "1": "Add GitEE support in integrated dataset downloading" - }, - "1.8.11": { - "1": "Fix version comparison in parsing checkpoints" - }, - "1.8.9(10)": { - "1": "Minor doc fix" - }, - "1.8.8": { - "1": "Minor improvements" - }, - "1.8.4(5)": { - "1": "Minor revisions" - }, - "1.8.2": { - "1": "Minor fixes and optimization" - }, - "1.8.1": { - "1": "Fix an inference bug for ATEPC" - }, - "1.8.0": { - "1": "Add more pretrained model (i.e., encoder model) for ATEPC task. e.g., yangheng/deberta-v3-base-absa-v1.1(large), roberta-base(large)", - "2": "Refactor Docs" - }, - "1.6.17": { - "1": "Remove some optimizers to support pytorch < 1.10.1" - }, - "1.6.16": { - "1": "Make some minor fixes" - }, - "1.6.15": { - "1": "Add validation set support for aspect-based sentiment polarity classification", - "2": "Add confidence output for aspect-based sentiment polarity classification", - "3": "Make some minor fixes" - }, - "1.6.13": { - "1": "Some minor modifications" - }, - "1.6.12": { - "1": "Fix cross_validate for APC", - "2": "Fix cache function in ATEPC" - }, - "1.6.10": { - "1": "Fix a potential problem while training based on checkpoint in multi-cuda environ" - }, - "1.6.7(8)": { - "1": "Fix a potential problem while do batch inference after training based on cached dataset" - }, - "1.6.4": { - "1": "Fix a potential problem in generate APC inference set", - "2": "Register a Yelp dataset in PyABSA provided by WeLi9811: https://github.com/WeiLi9811" - }, - "1.6.3": { - "1": "Fix a potential problem in the sentiment classifier while loading tokenizer", - "2": "Fix a problem in preprocessing in APC inference" - }, - "1.6": { - "0": "This is a stable version which eliminates almost all unknown problems before", - "1": "Fix a problem in saving fine-tuned pretrained model", - "2": "Fix an inference problem in LCA-BERT, SSW-T, SSW-S models", - "3": "Fix a problem in updating ABSADatasets version", - "4": "Modify the output format for BERTBaseline models and GloVe based models to adapt apc_trainer architecture", - "5": "Fix the data preprocessing code for BERTBaseline models and GloVe based models", - "6": "Rename the dependency matrix cache folder, that is for remove conflict between dependency matrix folder and integrated datasets", - "7": "Add alert while loading fine-tuned models", - "8": "Fix the inference for DLCF-DCA and DLCFS-DCA models", - "9": "Fix the embedding function in IAN-Bert model", - "10": "Fix the hop arg missing problem in Memnet-Bert, Ram-Bert, Memnet-GloVe, Ram-GloVe models", - "11": "Fix a printing problem in baseline APC model inference result", - "12": "Fix a parallel problem in BERT-BASE-ATEPC model", - "13": "Stabilize the text_classifier", - "14": "Fix test_loader init problem and dataset cache problem in ClassificationTrainer" - }, - "1.5.4": { - "1": "Fix a a bug while using custom dataset" - }, - "1.5.3": { - "1": "Minor fixes and modifications" - }, - "1.5": { - "1": "Release after full test, no known error yet", - "2": "Fix BERT-SPC Modeling", - "3": "Remove release-note check for efficient", - "4": "Fix dataset cache function", - "5": "Remove DistributedDataParallel for stability", - "6": "Remove older checkpoints", - "7": "Optimize early stop strategy, now patience means patience for epochs", - "8": "Fix a problem may fail APC checkpoint loading", - "9": "Fix a data loading problem for ATEPC", - "10": "Add cache dataset option for all models", - "99": "Other modifications" - }, - "1.3.15": { - "1": "Improve stability while using other pretrained models for ATEPC", - "2": "This is a general update of default config for ATEPC" - }, - "1.3.13": { - "1": "Fix a checkpoint loading problem of APC (Some checkpoint at Google Drive may be unavailable now, we will update soon)" - }, - "1.3.12": { - "1": "Refactor to support customize IOB label for ATEPC (The integrated function to covert APC dataset to ATEPC dataset remains only support ASP IOB now, please customize your dataset's IOB label using your own script)", - "2": "Update default pretrained model for ATEPC", - "3": "Minor changes" - }, - "1.3.11": { - "1": "Divide LSA into FAST-LSA and LSA models" - }, - "1.3.9": { - "1": "Fix low performance of APC using roberta-base" - }, - "1.3.8": { - "1": "Fix a fatal problem in ATEPC example preprocessing (influenced versions: V1.X - V1.3.5), which triggers tremendous ASPECT TOO LONG WARNING. This error severely damaged the ATEPC performance. The ATEPC checkpoints on GoogleDrive were also influenced and may be updated in teh future. ", - "2": "Add deep_ensemble option, use config.deep_ensemble=True to activate", - "3": "Add early stop option, default patience=5", - "4": "Refactor utils to print sorted args", - "5": "Fix a problem while using checkpoint_save_mode=3 to save finetuned BERT", - "6": "Refactor to retry training only while catching ConnectionError", - "7": "Fix an ensemble problem in APC", - "8": "Add full support distributed training", - "9": "Add distributed training option i.e., DataParallel or DistributedDataParallel", - "10": "Fix some potential problem in using other pretrained models in ATEPC to infer (caused by hard code [CLS], [SEP]), support roberta now", - "11": "This is an public test version, could be removed later. Please roll back if you find any problem. I am sorry for my mistake, but I dont have enough time to maintain this project." - }, - "1.3.5": { - "1": "Update default pretrained_bert (bert-base-uncased -> roberta-base)", - "2": "Add cache dataset option for APC task, use config.cache_dataset=True to activate" - }, - "1.3.4": { - "1": "Replace remaining BertModel.from_pretrained() and BertTokenizer.from_pretrained() with AutoModel.from_pretrained() and AutoTokenizer.from_pretrained()", - "2": "Fix some ensemble problems" - }, - "1.3.1": { - "1": "Add multi-cuda support for APC model and part of ATEPC models", - "2": "Add ensemble support for APC models", - "3": "Fix support of legacy APC models in loading & inference using shared checkpoint " - }, - "1.2.13": { - "1": "Minor update in dataset searching" - }, - "1.2.12": { - "1": "Add set/get functions for configs" - }, - "1.2.10": { - "1": "Add an rule on APC dataset lines checking", - "2": "Add SpaCy model config in classic APC models", - "3": "Not fully tested for all situations" - }, - "1.2.9": { - "1": "Add an rule on APC dataset lines checking", - "2": "You can use multiple types of label in your dataset, e.g., string, number" - }, - "1.2.8": { - "1": "Fix the convert_apc_set_to_atepc_set function", - "2": "Fix the error to load a inference model from training, i.e., use trainer.load_trained_model() to load the inference model", - "3": "Fix a bug of batch size setting in atepc inference", - "4": "Fix a bug of auto label-mapping" - }, - "1.2.7": { - "1": "Deprecated" - }, - "1.2.4": { - "1": "Refactor checkpoint map processing format", - "2": "Refactor APC inference to merge results with same text", - "3": "Improve stability" - }, - "1.2.3": { - "1": "Enhance ATEPC dataset converting", - "2": "Fix some problems in some particular situations", - "3": "Improve stability" - }, - "1.2.2": { - "1": "Full support of flexible datasets, update ABSADatasets to version 2021.10.02", - "2": "Support batch_size setting in ATEPC, APC, TC inference", - "3": "Fix the inference of DLCF_DCA model", - "4": "This version is for replacing 1.2.0(1)" - }, - "1.2.0": { - "1": "Enhance to support more flexible labels, now you can define string-based label instead of integer labels", - "2": "Remove set_sentiment_map() support due to above modification", - "3": "Fix a problem may cause problem while building graph for combined datasets", - "4": "Fix a printing problem in ATEPC", - "5": "Fix a bug in inference set loading", - "6": "Redefine the Chinese datasets", - "7": "This version involves considerable modification and may contain potential bug" - }, - "1.1.24": { - "1": "Add the parameters statistics function", - "2": "Optimize the DLCF_DCA model" - }, - "1.1.23": { - "1": "Improve atepc aspect_extractor result, ensure final output is same length and order as original input examples", - "2": "Fix a problem may merge all aspects of different example into 1 line if duplicate example is fed", - "3": "Fix a problem may cause error in text classification", - "4": "Fix a dataset loading problem" - }, - "1.1.22": { - "1": "Improve dataset search to be more flexible", - "2": "Refactor label-mapping trigger. This feature is developed based on the mooc dataset: https://github.com/jmc-123/ABSADatasets/tree/master/datasets/apc_datasets/Chinese/mooc", - "3": "Fix the batch inference of text classification", - "4": "Fix the text classification dataset downloading problem", - "5": "Fix a problem may cause failure of ATEPC inference", - "6": "Add the dependency declaration of typing_extensions" - }, - "1.1.20": { - "1": "Add automatic ABSADatasets update check" - }, - "1.1.19": { - "1": "Fix training without testset in APC", - "2": "Add SpaCy model setting option, e.g., config.spacy_model = 'zh_core_web_sm'" - }, - "1.1.18": { - "1": "Reformat and fix a bug of ATEPC output" - }, - "1.1.17": { - "1": "Add a new Chinese shampoo dataset, thanks to brightgems@github", - "2": "Upgrade ABSADatasets to version: 2021.09.21", - "3": "Fix the inference of DLCF_DCA", - "4": "Fix the training and inferring LCA-Net model", - "5": "Improve the config check function" - }, - "1.1.16": { - "1": "Enable flexible dataset format for ATEPC dataset, see https://github.com/yangheng95/PyABSA/issues/78", - "2": "Fix a bug may cause checkpoint loading problem on no-cuda device", - "3": "Add package version validation" - }, - "1.1.14": { - "1": "Fix the dataset processing functions" - }, - "1.1.13": { - "1": "Refactor ATEPC inference code", - "2": "Add batch inference for APC and ATEPC, temporarily using freeze batch size", - "3": "Define the English dataset" - }, - "1.1.12": { - "1": "Enable downloading shared checkpoint from a google drive url, this is for downloading checkpoint not registered in PyABSA", - "2": "Refine LCF vec memory occupation", - "3": "Add LCF-BERT2 and LCFS-BERT2 demo models", - "4": "Fix a bug report (https://github.com/yangheng95/PyABSA/issues/73)" - }, - "1.1.9": { - "1": "Fix a problem in BERT-ATEPC model" - }, - "1.1.8": { - "1": "Fix a problem may cause checkpoint saving failure" - }, - "1.1.7": { - "1": "Fix the inference of ATEPC using internal datasets, if you are using 1.1.5 or 1.1.6, please update to this version", - "2": "Improve stability and test all examples" - }, - "1.1.6": { - "1": "Deprecated" - }, - "1.1.3": { - "1": "Fix the feature to resume/retrain from a checkpoint" - }, - "1.1.2": { - "1": "Fix https://github.com/yangheng95/PyABSA/issues/59#issuecomment-902531502" - }, - "1.1": { - "1": "Fix some problems" - }, - "1.0.7(.1.2.3)": { - "1": "Fix all examples.", - "2": "Fix patch of #58 (https://github.com/yangheng95/PyABSA/issues/58)" - }, - "1.0.6": { - "1": "Fix potential error loading GloVe-based model's checkpoint." - }, - "1.0.5": { - "1": "Fix potential error loading ATEPC checkpoint." - }, - "1.0.4": { - "1": "Add checkpoint save options, default to save the state_dict instead the whole model", - "2": "Update documentation of some examples", - "3": "Fix a dataset selection problem" - }, - "1.0.1": { - "1": "Fix #53" - }, - "0.9.2.1": { - "1": "fix path of #49" - }, - "0.9.2.0": { - "1": "Add text classification (training & inference) support and SST datasets" - }, - "0.9.1.0": { - "1": "Add model type check before retraining", - "2": "Fix syntax distance measure for ATEPC models" - }, - "0.9.0.6": { - "1": "Optimize inference printing", - "2": "Set default encoding=utf-8", - "3": "Fix graph assigning for ASGCN", - "4": "Fix a problem may causing failure while search inference datasets" - }, - "0.9.0.0": { - "1": "Add BERT baseline models, not available until full test", - "2": "Refactor some code to allow add model easier", - "3": "Add post-training feature: to train based on a pretrained PyABSA model, refer to https://github.com/yangheng95/PyABSA/issues/48", - "4": "Add batch inference (from file) for ATEPC", - "5": "Fix a bug while predicting sentiment polarity using ATEPC model, refer to https://github.com/yangheng95/PyABSA/issues/47" - }, - "0.8.9.4": { - "1": "Fix the inference of DLCF_DCA model" - }, - "0.8.9.3": { - "1": "Refactor some code" - }, - "0.8.9.3rc1": { - "1": "Add evaluation for glove-based APC models", - "2": "fix some problems" - }, - "0.8.9.3rc0": { - "1": "Add DLCF_DCA_BERT models" - }, - "0.8.9.2": { - "1": "Refactor parameter loading method to manage parameters depend on specific model (Note you need to clone the latest examples after updating)", - "2": "Fix cross validation", - "3": "Plan to enable BERT for baseline models" - }, - "0.8.9.1": { - "1": "Add GloVe models support for APC, available model list: AOA, ASGCN, ATAE-LSTM, Cabasc, IAN, LSTM, MemNet, MGAN, RAM, TC/TD-LSTM, TNet_LF", - "2": "Add GloVe embedding download support", - "3": "Next Step: Add inference support for GloVe-based APC models", - "4": "Please feel free to contribute" - }, - "0.8.8.8": { - "1": "Add checkpoint verification", - "2": "Add release note with open source code", - "3": "fix param search function" - }, - "0.8.8.7": { - "1": "Add release note", - "2": "Remove some duplicated code" - }, - "0.8.8.5": { - "1": "Add new datasets (T-shirt, Television, Copyright belongs to https://github.com/rajdeep345/ABSA-Reproducibility)", - "2": "Add polarity label-fix features for some datasets containing negative labels", - "3": "Some typo-fix" - } -} diff --git a/requirements.txt b/requirements.txt index 7b03e0a9e..d6024b3b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -findfile>=1.7.10 +findfile>=1.7.10 autocuda>=0.12 metric-visualizer>=0.6.6 boostaug>=2.3.0 diff --git a/setup.py b/setup.py index 0d5b25792..6cae2c485 100644 --- a/setup.py +++ b/setup.py @@ -40,9 +40,9 @@ license="MIT", install_requires=[ "findfile>=1.7.10", - "autocuda>=0.14", - "metric-visualizer>=0.6.6", - "boostaug>=2.3.0", + "autocuda>=0.15", + "metric-visualizer>=0.7.0", + "boostaug>=2.3.1", "spacy", "networkx", "seqeval", diff --git a/unit_test/test_0_clean.py b/unit_test/test_0_clean.py new file mode 100644 index 000000000..949a6e27c --- /dev/null +++ b/unit_test/test_0_clean.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +# file: clean.py +# time: 06/11/2022 11:07 +# author: yangheng +# github: https://github.com/yangheng95 +# GScholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en +# ResearchGate: https://www.researchgate.net/profile/Heng-Yang-17/research +# Copyright (C) 2022. All Rights Reserved. +import os + +import findfile + +from pyabsa.utils.pyabsa_utils import fprint + + +def test_clean(): + if os.path.exists("integrated_datasets"): + os.remove("integrated_datasets") + + fprint("Start cleaning...") + for f in findfile.find_cwd_files( + or_key=[".zip", ".cache", ".mv", ".json", ".txt"], + exclude_key="glove", + recursive=1, + ): + os.remove(f) + fprint("Cleaned all files in the current directory.") + + +if __name__ == "__main__": + test_clean() diff --git a/unit_test/test_1_dataset_downloading.py b/unit_test/test_1_dataset_downloading.py index e31cfeb4e..b63ab195c 100644 --- a/unit_test/test_1_dataset_downloading.py +++ b/unit_test/test_1_dataset_downloading.py @@ -17,10 +17,9 @@ from pyabsa.tasks.AspectPolarityClassification import APCDatasetList - def test_download_dataset_by_name(): - if os.path.exists('integrated_datasets'): - shutil.rmtree('integrated_datasets') + if os.path.exists("integrated_datasets"): + shutil.rmtree("integrated_datasets") download_dataset_by_name( TaskCodeOption.Aspect_Polarity_Classification, dataset_name=APCDatasetList.English, @@ -28,6 +27,11 @@ def test_download_dataset_by_name(): def test_download_all_available_dataset(): - if os.path.exists('integrated_datasets'): - shutil.rmtree('integrated_datasets') + if os.path.exists("integrated_datasets"): + shutil.rmtree("integrated_datasets") download_all_available_datasets() + + +if __name__ == "__main__": + test_download_dataset_by_name() + test_download_all_available_dataset() diff --git a/unit_test/test_2_tc_pretrain.py b/unit_test/test_2_tc_pretrain.py index d6bf33340..acb711424 100644 --- a/unit_test/test_2_tc_pretrain.py +++ b/unit_test/test_2_tc_pretrain.py @@ -85,3 +85,8 @@ def test_all_glove_models(): auto_device=DeviceTypeOption.AUTO, ).load_trained_model() text_classifier.predict("I love it very much!") + + +if __name__ == "__main__": + test_all_bert_models() + test_all_glove_models() diff --git a/unit_test/test_3_atepc_pretrain.py b/unit_test/test_3_atepc_pretrain.py index fd3b45829..b500e90bb 100644 --- a/unit_test/test_3_atepc_pretrain.py +++ b/unit_test/test_3_atepc_pretrain.py @@ -45,7 +45,7 @@ def test_chinese_atepc_models(): # # for dataset in ABSADatasetList(): for dataset in ATEPC.ATEPCDatasetList.Phone: - for model in ATEPC.ATEPCModelList(): + for model in ATEPC.ATEPCModelList()[1:2]: config = ATEPC.ATEPCConfigManager.get_atepc_config_chinese() cuda.empty_cache() config.model = model @@ -84,7 +84,7 @@ def test_all_ate_models(): # # for dataset in ABSADatasetList(): for dataset in ATEPC.ATEPCDatasetList()[:1]: - for model in ATEPC.ATEPCModelList(): + for model in ATEPC.ATEPCModelList()[1:2]: config = ATEPC.ATEPCConfigManager.get_atepc_config_english() cuda.empty_cache() config.model = model @@ -116,3 +116,8 @@ def test_all_ate_models(): pred_sentiment=True, # Predict the sentiment of extracted aspect terms ) aspect_extractor.destroy() + + +if __name__ == "__main__": + test_all_ate_models() + test_chinese_atepc_models() diff --git a/unit_test/test_4_apc_pretrain.py b/unit_test/test_4_apc_pretrain.py index e62859619..6b159b5dd 100644 --- a/unit_test/test_4_apc_pretrain.py +++ b/unit_test/test_4_apc_pretrain.py @@ -59,7 +59,7 @@ def test_cross_validate(): dataset=dataset, checkpoint_save_mode=ModelSaveOption.SAVE_MODEL_STATE_DICT, auto_device=DeviceTypeOption.AUTO, - ).load_trained_model() + ) sent_classifier = apc_trainer.load_trained_model() for ex in apc_examples: result = sent_classifier.predict( @@ -103,7 +103,7 @@ def test_lcf_apc_models(): from pyabsa import AspectPolarityClassification as APC for dataset in [APC.APCDatasetList.Laptop14]: - for model in APC.APCModelList(): + for model in APC.APCModelList()[:1]: config = APC.APCConfigManager.get_apc_config_english() config.lcf = "cdm" config.model = model @@ -163,7 +163,7 @@ def test_bert_apc_models(): for dataset in [APC.APCDatasetList.Laptop14, APC.APCDatasetList.Phone]: - for model in APC.BERTBaselineAPCModelList(): + for model in APC.BERTBaselineAPCModelList()[1:2]: config = APC.APCConfigManager.get_apc_config_english() cuda.empty_cache() config.model = model @@ -193,7 +193,7 @@ def test_glove_apc_models(): from pyabsa import AspectPolarityClassification as APC for dataset in [APC.APCDatasetList.Laptop14]: - for model in APC.GloVeAPCModelList(): + for model in APC.GloVeAPCModelList()[1:2]: cuda.empty_cache() config = APC.APCConfigManager.get_apc_config_glove() config.model = model @@ -218,3 +218,11 @@ def test_glove_apc_models(): ) sent_classifier.destroy() + + +if __name__ == "__main__": + # test_lcf_apc_models() + test_bert_apc_models() + test_glove_apc_models() + # test_save_models() + # test_lcf_apc_models() diff --git a/unit_test/test_5_adversarial_defense.py b/unit_test/test_5_adversarial_defense.py index dab68e012..e687cccc0 100644 --- a/unit_test/test_5_adversarial_defense.py +++ b/unit_test/test_5_adversarial_defense.py @@ -25,7 +25,7 @@ def test_tad_training(): config.seed = [2] config.l2reg = 1e-5 config.cross_validate_fold = -1 - config.data_num = 60 + config.data_num = 600 dataset = DatasetItem("SST2TextFooler") @@ -34,3 +34,7 @@ def test_tad_training(): ).load_trained_model() text_classifier.batch_predict(dataset) + + +if __name__ == "__main__": + test_tad_training() diff --git a/unit_test/test_6_apc_inference_set_generation.py b/unit_test/test_6_apc_inference_set_generation.py index 26dbea128..b3d1e58f2 100644 --- a/unit_test/test_6_apc_inference_set_generation.py +++ b/unit_test/test_6_apc_inference_set_generation.py @@ -13,9 +13,14 @@ ) -# def test_apc_inference_set_generation(): -# generate_inference_set_for_apc('integrated_datasets') +def test_apc_inference_set_generation(): + generate_inference_set_for_apc("integrated_datasets") -# def test_apc_inference_set_conversion(): -# convert_apc_set_to_atepc_set('integrated_datasets') +def test_apc_inference_set_conversion(): + convert_apc_set_to_atepc_set("integrated_datasets") + + +if __name__ == "__main__": + test_apc_inference_set_generation() + test_apc_inference_set_conversion() diff --git a/unit_test/test_7_augmentation.py b/unit_test/test_7_augmentation.py index d170a50e4..8ba984c7a 100644 --- a/unit_test/test_7_augmentation.py +++ b/unit_test/test_7_augmentation.py @@ -36,7 +36,7 @@ def test_classification_augmentation(): SST2 = TC.TCDatasetList.SST2 auto_classification_augmentation( - config=config, dataset=SST2, device=DeviceTypeOption.CPU + config=config, dataset=SST2, device=DeviceTypeOption.AUTO ) diff --git a/unit_test/test_9_clean.py b/unit_test/test_9_clean.py index 6354dea1b..86a8abc2d 100644 --- a/unit_test/test_9_clean.py +++ b/unit_test/test_9_clean.py @@ -12,10 +12,19 @@ from pyabsa.utils.pyabsa_utils import fprint + def test_clean(): - fprint('Start cleaning...') + if os.path.exists("integrated_datasets"): + os.remove("integrated_datasets") + fprint("Start cleaning...") for f in findfile.find_cwd_files( - or_key=[".zip", ".cache", ".mv", ".json", ".txt"], exclude_key="glove", recursive=1 + or_key=[".zip", ".cache", ".mv", ".json", ".txt"], + exclude_key="glove", + recursive=1, ): os.remove(f) fprint("Cleaned all files in the current directory.") + + +if __name__ == "__main__": + test_clean()