
Commit

2.0.23
yangheng95 committed Jan 3, 2023
1 parent e6100a7 commit dd5a74c
Showing 29 changed files with 141 additions and 944 deletions.
123 changes: 1 addition & 122 deletions examples-v2/aspect_polarity_classification/checkpoints-v2.0.json
@@ -1,122 +1 @@
{
"2.0.0": {
"APC": {
"multilingual": {
"id": "",
"Training Model": "FAST-LSA-T-V2-Deberta",
"Training Dataset": "APCDatasetList.Multilingual",
"Language": "Multilingual",
"Description": "Trained on RTX3090",
"Available Version": "1.10.5+",
"Checkpoint File": "fast_lcf_bert_Multilingual_acc_82.66_f1_82.06.zip",
"Author": "H, Yang ([email protected])"
},
"english": {
"id": "",
"Training Model": "FAST-LSA-T-V2-Deberta",
"Training Dataset": "APCDatasetList.English",
"Language": "English",
"Description": "Trained on RTX3090",
"Available Version": "1.10.5+",
"Checkpoint File": "fast_lsa_t_v2_English_acc_82.21_f1_81.81.zip",
"Author": "H, Yang ([email protected])"
},
"chinese": {
"id": "",
"Training Model": "FAST-LSA-T-V2-Deberta",
"Training Dataset": "APCDatasetList.Chinese",
"Language": "Chinese",
"Description": "Trained on RTX3090",
"Available Version": "1.10.5+",
"Checkpoint File": "fast_lsa_t_v2_Chinese_acc_96.0_f1_95.1.zip",
"Author": "H, Yang ([email protected])"
}
},
"ATEPC": {
"multilingual": {
"id": "",
"Training Model": "FAST-LCF-ATEPC",
"Training Dataset": "ABSADatasets.Multilingual",
"Language": "Multilingual",
"Description": "Trained on RTX3090",
"Available Version": "1.16.0+",
"Checkpoint File": "fast_lcf_atepc_Multilingual_cdw_apcacc_78.08_apcf1_77.81_atef1_75.41.zip",
"Author": "H, Yang ([email protected])"
},
"english": {
"id": "",
"Training Model": "FAST-LCF-ATEPC",
"Training Dataset": "ATEPCDatasetList.English",
"Language": "English",
"Description": "Trained on RTX3090",
"Available Version": "1.10.5+",
"Checkpoint File": "fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43.zip",
"Author": "H, Yang ([email protected])"
},
"chinese": {
"id": "",
"Training Model": "FAST-LCF-ATEPC",
"Training Dataset": "ATEPCDatasetList.Chinese",
"Language": "Chinese",
"Description": "Trained on RTX3090",
"Available Version": "1.10.5+",
"Checkpoint File": "fast_lcf_atepc_Chinese_cdw_apcacc_96.22_apcf1_95.32_atef1_78.73.zip",
"Author": "H, Yang ([email protected])"
}
},
"RNAC": {
"degrad_lstm": {
"id": "",
"Training Model": "LSTM",
"Training Dataset": "ABSADatasets.Multilingual",
"Language": "RNA",
"Description": "Trained on RTX3090",
"Available Version": "1.16.0+",
"Checkpoint File": "lstm_degrad_acc_85.26_f1_84.62.zip",
"Author": "H, Yang ([email protected])"
},
"degrad_bert": {
"id": "",
"Training Model": "MLP",
"Training Dataset": "Degrad",
"Language": "RNA",
"Description": "Trained on RTX3090",
"Available Version": "1.16.0+",
"Checkpoint File": "bert_mlp_degrad_acc_87.44_f1_86.99.zip",
"Author": "H, Yang ([email protected])"
}
},
"TAD": {
"tad-sst2": {
"id": "",
"Training Model": "TAD",
"Training Dataset": "SST2",
"Language": "English",
"Description": "Trained on RTX3090",
"Available Version": "1.15+",
"Checkpoint File": "TAD-SST2.zip",
"Author": "H, Yang ([email protected])"
},
"tad-agnews10k": {
"id": "",
"Training Model": "TAD",
"Training Dataset": "AGNews",
"Language": "English",
"Description": "Trained on RTX3090",
"Available Version": "1.15+",
"Checkpoint File": "TAD-AGNews10K.zip",
"Author": "H, Yang ([email protected])"
},
"tad-amazon": {
"id": "",
"Training Model": "TAD",
"Training Dataset": "AGNews",
"Language": "English",
"Description": "Trained on RTX3090",
"Available Version": "1.15+",
"Checkpoint File": "TAD-Amazon.zip",
"Author": "H, Yang ([email protected])"
}
}
}
}
{"2.0.0": {"APC": {"multilingual": {"id": "", "Training Model": "FAST-LSA-T-V2-Deberta", "Training Dataset": "APCDatasetList.Multilingual", "Language": "Multilingual", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lcf_bert_Multilingual_acc_82.66_f1_82.06.zip", "Author": "H, Yang ([email protected])"}, "english": {"id": "", "Training Model": "FAST-LSA-T-V2-Deberta", "Training Dataset": "APCDatasetList.English", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lsa_t_v2_English_acc_82.21_f1_81.81.zip", "Author": "H, Yang ([email protected])"}, "chinese": {"id": "", "Training Model": "FAST-LSA-T-V2-Deberta", "Training Dataset": "APCDatasetList.Chinese", "Language": "Chinese", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lsa_t_v2_Chinese_acc_96.0_f1_95.1.zip", "Author": "H, Yang ([email protected])"}}, "ATEPC": {"multilingual": {"id": "", "Training Model": "FAST-LCF-ATEPC", "Training Dataset": "ABSADatasets.Multilingual", "Language": "Multilingual", "Description": "Trained on RTX3090", "Available Version": "1.16.0+", "Checkpoint File": "fast_lcf_atepc_Multilingual_cdw_apcacc_78.08_apcf1_77.81_atef1_75.41.zip", "Author": "H, Yang ([email protected])"}, "english": {"id": "", "Training Model": "FAST-LCF-ATEPC", "Training Dataset": "ATEPCDatasetList.English", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43.zip", "Author": "H, Yang ([email protected])"}, "chinese": {"id": "", "Training Model": "FAST-LCF-ATEPC", "Training Dataset": "ATEPCDatasetList.Chinese", "Language": "Chinese", "Description": "Trained on RTX3090", "Available Version": "1.10.5+", "Checkpoint File": "fast_lcf_atepc_Chinese_cdw_apcacc_96.22_apcf1_95.32_atef1_78.73.zip", "Author": "H, Yang ([email protected])"}}, "RNAC": {"degrad_lstm": {"id": "", "Training Model": "LSTM", "Training Dataset": "ABSADatasets.Multilingual", "Language": "RNA", "Description": "Trained on RTX3090", "Available Version": "1.16.0+", "Checkpoint File": "lstm_degrad_acc_85.26_f1_84.62.zip", "Author": "H, Yang ([email protected])"}, "degrad_bert": {"id": "", "Training Model": "MLP", "Training Dataset": "Degrad", "Language": "RNA", "Description": "Trained on RTX3090", "Available Version": "1.16.0+", "Checkpoint File": "bert_mlp_degrad_acc_87.44_f1_86.99.zip", "Author": "H, Yang ([email protected])"}}, "TAD": {"tad-sst2": {"id": "", "Training Model": "TAD", "Training Dataset": "SST2", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.15+", "Checkpoint File": "TAD-SST2.zip", "Author": "H, Yang ([email protected])"}, "tad-agnews10k": {"id": "", "Training Model": "TAD", "Training Dataset": "AGNews", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.15+", "Checkpoint File": "TAD-AGNews10K.zip", "Author": "H, Yang ([email protected])"}, "tad-amazon": {"id": "", "Training Model": "TAD", "Training Dataset": "AGNews", "Language": "English", "Description": "Trained on RTX3090", "Available Version": "1.15+", "Checkpoint File": "TAD-Amazon.zip", "Author": "H, Yang ([email protected])"}}}}
2 changes: 1 addition & 1 deletion examples-v2/aspect_polarity_classification/train_apc.py
@@ -69,4 +69,4 @@
# checkpoint_save_mode=ModelSaveOption.DO_NOT_SAVE_MODEL,
auto_device=DeviceTypeOption.AUTO,
)
-trainer.load_trained_model()
\ No newline at end of file
+trainer.load_trained_model()
1 change: 1 addition & 0 deletions pyabsa/framework/dataset_class/dataset_template.py
@@ -39,6 +39,7 @@ def __init__(self, config, tokenizer, dataset_type, **kwargs):
self.config.dataset_file, dataset_type=dataset_type, **kwargs
)
self.data = self.covert_to_tensor(self.data)
+self.data = self.data[: self.config.get("data_num", -1)]
if self.config.get("verbose", True):
self.config.logger.info(
"{} data examples:\n {}".format(dataset_type, self.data[:2])
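The added line caps each split at data_num examples, which is useful for smoke tests. One slicing subtlety is worth noting: with the fallback value of -1, Python's data[:-1] drops the final example rather than keeping everything, whereas a fallback of None would be a true no-op. A small illustration of the behaviour:

# Illustration of the slicing behaviour behind the new data_num cap.
data = list(range(10))

assert data[:5] == [0, 1, 2, 3, 4]   # data_num=5 keeps the first five examples
assert data[:-1] == data[:9]         # the -1 fallback silently drops the last one
assert data[:None] == data           # a None fallback would keep the full split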
@@ -85,8 +85,10 @@ def pad_syntax_based_srd(text, dep_dist, tokenizer, opt):


def prepare_input_for_apc(opt, tokenizer, text_left, text_right, aspect):
+tokenizer = tokenizer.tokenizer
if hasattr(opt, "dynamic_truncate") and opt.dynamic_truncate:
-_max_seq_len = opt.max_seq_len - len(aspect.split(" "))
+reserved_num = 3
+_max_seq_len = opt.max_seq_len - len(aspect.split(" ")) - reserved_num
text_left = text_left.split(" ")
text_right = text_right.split(" ")
if _max_seq_len < (len(text_left) + len(text_right)):
@@ -98,12 +100,10 @@ def prepare_input_for_apc(opt, tokenizer, text_left, text_right, aspect):
text_left = " ".join(text_left)
text_right = " ".join(text_right)

-# tokenizer.bos_token = tokenizer.bos_token if tokenizer.bos_token else '[CLS]'
-# tokenizer.eos_token = tokenizer.eos_token if tokenizer.eos_token else '[SEP]'
-# bos_token = tokenizer.bos_token
-# eos_token = tokenizer.eos_token
-bos_token = ""
-eos_token = ""
+tokenizer.bos_token = tokenizer.bos_token if tokenizer.bos_token else "[CLS]"
+tokenizer.eos_token = tokenizer.eos_token if tokenizer.eos_token else "[SEP]"
+bos_token = tokenizer.bos_token
+eos_token = tokenizer.eos_token

text_raw = text_left + " " + aspect + " " + text_right
text_spc = (
Expand All @@ -115,7 +115,7 @@ def prepare_input_for_apc(opt, tokenizer, text_left, text_right, aspect):
)
aspect_bert_indices = text_to_sequence(tokenizer, aspect, opt.max_seq_len)

-aspect_begin = np.count_nonzero(tokenizer.tokenize(bos_token + " " + text_left))
+aspect_begin = len(tokenizer.tokenize(bos_token + " " + text_left))
aspect_position = set(
range(aspect_begin, aspect_begin + np.count_nonzero(aspect_bert_indices))
)
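Three related fixes land in this helper: the PyABSA tokenizer wrapper is unwrapped once at the top, the dynamic-truncation budget now reserves three positions so the restored BOS/EOS special tokens cannot push the encoding past max_seq_len, and aspect_begin counts the tokenized prefix with a plain len(). A sketch of the budget arithmetic, using hypothetical numbers:

# Sketch of the reserved-slot budget (hypothetical values): special tokens
# must be paid for before the remaining budget is split between the left
# and right context around the aspect.
max_seq_len = 80
aspect = "battery life"
reserved_num = 3                     # e.g. one BOS plus two SEP positions

budget = max_seq_len - len(aspect.split(" ")) - reserved_num
# Without the reservation, left + aspect + right plus the special tokens
# could total max_seq_len + 3 and be silently truncated downstream.
print(budget)                        # 75 positions left for the two contexts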
@@ -5,15 +5,13 @@

import numpy as np
import tqdm
-from torch.utils.data import Dataset

from pyabsa import LabelPaddingOption
from pyabsa.framework.dataset_class.dataset_template import PyABSADataset
from pyabsa.utils.file_utils.file_utils import load_dataset_from_file
from pyabsa.utils.pyabsa_utils import validate_example, fprint
from .classic_bert_apc_utils import prepare_input_for_apc, build_sentiment_window
from .dependency_graph import dependency_adj_matrix, configure_spacy_model
-from ..__lcf__.data_utils_for_inference import parse_sample, ABSAInferenceDataset
+from ..__lcf__.data_utils_for_inference import ABSAInferenceDataset


class BERTABSAInferenceDataset(ABSAInferenceDataset):
@@ -63,7 +61,7 @@ def process_data(self, samples, ignore_error=True):
continue

prepared_inputs = prepare_input_for_apc(
-self.config, self.tokenizer.tokenizer, text_left, text_right, aspect
+self.config, self.tokenizer, text_left, text_right, aspect
)

aspect_position = prepared_inputs["aspect_position"]
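The call-site change pairs with the unwrapping added to prepare_input_for_apc above: callers now hand over the PyABSA tokenizer object and the helper reaches into it itself. A hypothetical sketch of the wrapper shape this assumes:

# Hypothetical sketch of the assumed wrapper relationship: the PyABSA-side
# tokenizer exposes the raw Hugging Face tokenizer under an attribute that
# is itself named `tokenizer`, so the helper can unwrap once at its boundary.
class DummyHFTokenizer:              # stand-in for a transformers tokenizer
    def tokenize(self, text):
        return text.split()

class TokenizerWrapperSketch:        # stand-in for PyABSA's wrapper class
    def __init__(self, hf_tokenizer):
        self.tokenizer = hf_tokenizer

def prepare_input(tokenizer):
    tokenizer = tokenizer.tokenizer  # unwrap, mirroring the new first line
    return tokenizer.tokenize("callers now pass the wrapper itself")

print(prepare_input(TokenizerWrapperSketch(DummyHFTokenizer())))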
@@ -38,7 +38,7 @@ class ClassicAPCModelList(list):
def __init__(self):
super(ClassicAPCModelList, self).__init__(
[
-# self.ASGCN,
+self.ASGCN,
self.AOA,
self.ATAE_LSTM,
self.Cabasc,
@@ -91,30 +91,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
self.model = torch.load(
model_path, map_location=DeviceTypeOption.CPU
)
-with open(tokenizer_path, mode="rb") as f:
-    if hasattr(APCModelList, self.config.model.__name__) or hasattr(
-        BERTBaselineAPCModelList, self.config.model.__name__
-    ):
-        try:
-            if kwargs.get("offline", False):
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    find_cwd_dir(
-                        self.config.pretrained_bert.split("/")[-1]
-                    ),
-                    do_lower_case="uncased"
-                    in self.config.pretrained_bert,
-                )
-            else:
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    self.config.pretrained_bert,
-                    do_lower_case="uncased"
-                    in self.config.pretrained_bert,
-                )
-        except ValueError:
-            self.tokenizer = pickle.load(f)
-    elif hasattr(GloVeAPCModelList, self.config.model.__name__):
-        self.embedding_matrix = self.config.embedding_matrix
-        self.tokenizer = self.config.tokenizer

+self.tokenizer = self.config.tokenizer

if kwargs.get("verbose", False):
fprint("Config used in Training:")
@@ -380,7 +358,7 @@ def _run_prediction(self, save_path=None, print_result=True, **kwargs):
}
)
n_total += 1
if kwargs.get("merge_results", None):
if kwargs.get("merge_results", True):
results = self.merge_results(results)
try:
if print_result:
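The deleted block rebuilt an AutoTokenizer at load time (with an offline directory lookup and a pickled-tokenizer fallback); the replacement simply restores the tokenizer that was serialized into the checkpoint's config, a simplification this commit repeats in the text, RNA, and TAD predictors below. The second hunk flips the merge_results default so merging now happens unless the caller disables it. Both idioms in a sketch (the helper names are hypothetical stand-ins, not PyABSA APIs):

# Sketch of the two idioms introduced in this file.
def merge(results):                  # hypothetical stand-in for merge_results
    return results

def load_tokenizer(config):
    # New pattern: trust the tokenizer serialized with the checkpoint
    # instead of re-downloading or rebuilding it at load time.
    return config.tokenizer

def run_prediction(results, **kwargs):
    # Old default: kwargs.get("merge_results", None) is falsy, so merging
    # only ran when the caller opted in; with a True default it runs
    # unless explicitly disabled via merge_results=False.
    if kwargs.get("merge_results", True):
        results = merge(results)
    return results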
14 changes: 12 additions & 2 deletions pyabsa/tasks/AspectTermExtraction/prediction/aspect_extractor.py
@@ -85,8 +85,18 @@ def __init__(self, checkpoint=None, **kwargs):

if state_dict_path or model_path:
if state_dict_path:
-bert = AutoModel.from_pretrained(self.config.pretrained_bert)
-self.model = self.config.model(bert, self.config)
+if kwargs.get("offline", False):
+    self.bert = AutoModel.from_pretrained(
+        find_cwd_dir(
+            self.config.pretrained_bert.split("/")[-1]
+        ),
+    )
+else:
+    self.bert = AutoModel.from_pretrained(
+        self.config.pretrained_bert,
+    )
+
+self.model = self.config.model(self.bert, self.config)
self.model.load_state_dict(
torch.load(
state_dict_path, map_location=DeviceTypeOption.CPU
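State-dict loading now honors the same offline switch used elsewhere in the loaders: with offline=True the backbone weights are resolved from a local directory named after the last path segment of pretrained_bert (located by find_cwd_dir) instead of the Hugging Face hub. A minimal sketch of that branch, assuming the model directory has already been unpacked somewhere under the working directory:

# Minimal sketch of the offline/online branch; find_cwd_dir is passed in
# here to keep the sketch self-contained.
from transformers import AutoModel

def load_backbone(pretrained_bert, offline, find_cwd_dir):
    if offline:
        # e.g. "microsoft/deberta-v3-base" -> look up a local
        # "deberta-v3-base" directory instead of contacting the hub
        local_dir = find_cwd_dir(pretrained_bert.split("/")[-1])
        return AutoModel.from_pretrained(local_dir)
    return AutoModel.from_pretrained(pretrained_bert)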
@@ -96,12 +96,6 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
model_path, map_location=DeviceTypeOption.CPU
)

-try:
-    self.tokenizer = PretrainedTokenizer(self.config, **kwargs)
-except ValueError:
-    if tokenizer_path:
-        with open(tokenizer_path, mode="rb") as f:
-            self.tokenizer = pickle.load(f)
else:
self.embedding_matrix = self.config.embedding_matrix
self.tokenizer = self.config.tokenizer
@@ -119,6 +113,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
)
)

+self.tokenizer = self.config.tokenizer

if kwargs.get("verbose", False):
fprint("Config used in Training:")
print_args(self.config)
13 changes: 2 additions & 11 deletions pyabsa/tasks/RNAClassification/prediction/rna_classifier.py
@@ -96,19 +96,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
model_path, map_location=DeviceTypeOption.CPU
)

-try:
-    self.tokenizer = PretrainedTokenizer(
-        max_seq_len=self.config.max_seq_len,
-        config=self.config,
-        **kwargs
-    )
-except ValueError:
-    if tokenizer_path:
-        with open(tokenizer_path, mode="rb") as f:
-            self.tokenizer = pickle.load(f)
else:
self.embedding_matrix = self.config.embedding_matrix
self.tokenizer = self.config.tokenizer
if model_path:
self.model = torch.load(
model_path, map_location=DeviceTypeOption.CPU
@@ -123,6 +112,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
)
)

+self.tokenizer = self.config.tokenizer

if kwargs.get("verbose", False):
fprint("Config used in Training:")
print_args(self.config)
2 changes: 1 addition & 1 deletion pyabsa/tasks/RNARegression/instructor/rnar_instructor.py
@@ -282,7 +282,7 @@ def _train_and_evaluate(self, criterion):

for epoch in range(self.config.num_epoch):
patience -= 1
-description = "Epoch:{} | Loss: {}".format(epoch, "nan")
+description = "Epoch:{} | Loss: {}".format(epoch, 0)
iterator = tqdm(self.train_dataloaders[0])
for i_batch, sample_batched in enumerate(iterator):
global_step += 1
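A small cosmetic fix: the per-epoch progress-bar description now starts from a numeric 0 placeholder instead of the string "nan", so the first refresh already reads like a loss value. The surrounding pattern, roughly:

# Sketch of the progress-bar pattern: seed the description with a numeric
# placeholder, then overwrite it with the running loss inside the loop.
from tqdm import tqdm

iterator = tqdm(range(100))
iterator.set_description("Epoch:{} | Loss: {}".format(0, 0))
for step in iterator:
    loss = 1.0 / (step + 1)          # hypothetical running loss
    iterator.set_description("Epoch:{} | Loss: {:.4f}".format(0, loss))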
2 changes: 2 additions & 0 deletions pyabsa/tasks/RNARegression/prediction/rna_regressor.py
@@ -128,6 +128,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
)
)

+self.tokenizer = self.config.tokenizer

if kwargs.get("verbose", False):
fprint("Config used in Training:")
print_args(self.config)
@@ -591,8 +591,8 @@ def _k_fold_train_and_evaluate(self, criterion):
def _evaluate_acc_f1(self, test_dataloader):
# switch model to evaluation mode
self.model.eval()
-n_label_test_correct, n_label_test_total = 0, 0
-n_adv_det_test_correct, n_adv_det_test_total = 0, 0
+n_label_test_correct, n_label_test_total = 1e-10, 1e-10
+n_adv_det_test_correct, n_adv_det_test_total = 1e-10, 1e-10
n_adv_tr_test_correct, n_adv_tr_test_total = 1e-10, 1e-10
t_label_targets_all, t_label_outputs_all = None, None
t_adv_det_targets_all, t_adv_det_outputs_all = None, None
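Seeding the correct/total counters with 1e-10 instead of 0 matches the guard already used for the adversarial-training head on the line below: if a test split contributes no samples to one of the three heads, the later accuracy division degrades to roughly 1.0 instead of raising ZeroDivisionError. In isolation:

# The division-by-zero guard in isolation: with an empty split the
# accuracy evaluates to 1e-10 / 1e-10 == 1.0 instead of raising.
n_correct, n_total = 1e-10, 1e-10

for is_correct in []:                # an empty test split for one head
    n_total += 1
    n_correct += int(is_correct)

accuracy = n_correct / n_total       # no ZeroDivisionError; equals 1.0
print(accuracy)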
12 changes: 2 additions & 10 deletions pyabsa/tasks/TextAdversarialDefense/prediction/tad_classifier.py
@@ -179,16 +179,6 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
model_path, map_location=DeviceTypeOption.CPU
)

-try:
-    self.tokenizer = PretrainedTokenizer(
-        max_seq_len=self.config.max_seq_len,
-        config=self.config,
-        **kwargs
-    )
-except ValueError:
-    if tokenizer_path:
-        with open(tokenizer_path, mode="rb") as f:
-            self.tokenizer = pickle.load(f)
else:
self.embedding_matrix = self.config.embedding_matrix
self.tokenizer = self.config.tokenizer
@@ -206,6 +196,8 @@ def __init__(self, checkpoint=None, cal_perplexity=False, **kwargs):
)
)

+self.tokenizer = self.config.tokenizer

if kwargs.get("verbose", False):
fprint("Config used in Training:")
print_args(self.config)