Commit

docs: update README.md
l294265421 committed Jan 29, 2020
1 parent 9e47ad8 commit f33a6be
Showing 3 changed files with 27 additions and 278 deletions.
29 changes: 27 additions & 2 deletions README.md
@@ -10,6 +10,31 @@
* allennlp 0.9.0

## Usage

### Supported datasets
- [SemEval-2014 Task 4](http://alt.qcri.org/semeval2014/task4/)
- SemEval-2014-Task-4-LAPT
- SemEval-2014-Task-4-REST
- [2018-aaai-Learning to Attend via Word-Aspect Associative Fusion for Aspect-based Sentiment Analysis](https://arxiv.org/abs/1712.05403v1)
- SemEval-2014-Task-4-REST-DevSplits
- [2018-acl-Aspect Based Sentiment Analysis with Gated Convolutional Networks](https://arxiv.org/abs/1805.07043)
- SemEval-2014-Task-4-REST-Hard
- SemEval-141516-LARGE-REST-HARD
- [SemEval-2015 Task 12](http://alt.qcri.org/semeval2015/task12/)
- SemEval-2015-Task-12-LAPT
- SemEval-2015-Task-12-REST
- SemEval-2015-Task-12-HOTEL
- [SemEval-2016 Task 5](http://alt.qcri.org/semeval2016/task5/)
- SemEval-2016-Task-5-CH-CAME-SB1
- SemEval-2016-Task-5-CH-PHNS-SB1
- SemEval-2016-Task-5-LAPT-SB1
- SemEval-2016-Task-5-LAPT-SB2
- SemEval-2016-Task-5-REST-SB1
- SemEval-2016-Task-5-REST-SB2
- [2019-emnlp-A Challenge Dataset and Effective Models for Aspect Based Sentiment Analysis](https://www.aclweb.org/anthology/D19-1654.pdf)
- MAMSACSA
- MAMSATSA
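
These dataset names are the values accepted by the `--current_dataset` flag used below. A minimal sketch (assuming the repo root is on `PYTHONPATH`) of loading one of them programmatically through the loader registry in `acsa/data_adapter/data_object.py`:

```python
# Minimal sketch, assuming the repo root is on PYTHONPATH; the helper is
# defined in acsa/data_adapter/data_object.py.
from acsa.data_adapter.data_object import get_dataset_class_by_name

dataset_class = get_dataset_class_by_name('SemEval-2014-Task-4-REST')
dataset = dataset_class()  # instantiating the class loads the train/dev/test splits
```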

### Aspect-Category Sentiment Classification (ACSC) Models
#### Supported models
- ae-lstm [Attention-based LSTM for Aspect-level Sentiment Classification](https://www.aclweb.org/anthology/D16-1058.pdf)
@@ -21,10 +21,10 @@
- Heat (papers/2017-CIKM-Aspect-level Sentiment Classification with HEAT (HiErarchical ATtention) Network.pdf)
![heat](images/heat.png)
#### Training
sh scripts/run.sh acsa/acac_pytorch/acsc_bootstrap.py --model_name Heat --embedding_filepath glove.840B.300d.txt
sh scripts/run.sh acsa/acac_pytorch/acsc_bootstrap.py --model_name Heat --current_dataset SemEval-2014-Task-4-REST-DevSplits --embedding_filepath glove.840B.300d.txt

#### Visualization
sh scripts/run.sh acsa/acac_pytorch/acsc_bootstrap.py --model_name Heat --embedding_filepath glove.840B.300d.txt --train False --visualize_attention True
sh scripts/run.sh acsa/acac_pytorch/acsc_bootstrap.py --model_name Heat --current_dataset SemEval-2014-Task-4-REST-DevSplits --embedding_filepath glove.840B.300d.txt --train False --visualize_attention True

### Aspect Category Detection (ACD) Models

276 changes: 0 additions & 276 deletions acsa/data_adapter/data_object.py
@@ -224,167 +224,6 @@ def generate_dev_data(self, result, dev_size):
result['dev'] = result['test']


class AsgcnData(BaseDataset):
"""
Aspect-based Sentiment Classification with Aspect-specific Graph Convolutional Networks
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data_by_filepath(self, train_filepath, test_filepath):
data_type_and_filepath = {'train': train_filepath,
'test': test_filepath}
data_type_and_data = {}
for data_type, filepath in data_type_and_filepath.items():
lines = file_utils.read_all_lines(filepath)
sentences = []
polarity_mapping = {'-1': 'negative',
'0': 'neutral',
'1': 'positive'}
for i in range(0, len(lines), 3):
text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
aspect = lines[i + 1].lower().strip()
polarity = lines[i + 2].strip()
if text_left != '':
text = text_left + " " + aspect
from_index = len(text_left) + 1
else:
text = aspect
from_index = 0
if text_right != '':
text = text + ' ' + text_right
to_index = from_index + len(aspect)
if text[from_index: to_index] != aspect:
logger.error('error aspect index: %s != %s' % (text[from_index: to_index], aspect))
aspect_term = AspectTerm(aspect, polarity_mapping[polarity], from_index, to_index)
sentence = AbsaSentence(text, None, None, [aspect_term])
sentences.append(sentence)
documents = [AbsaDocument(sentence.text, None, None, None, [sentence]) for sentence in sentences]
data_type_and_data[data_type] = documents
return data_type_and_data['train'], None, data_type_and_data['test']
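
# A hypothetical record (not from the repo's data) illustrating the three-line
# ASGCN .raw format parsed by _load_data_by_filepath above; polarity labels
# -1/0/1 map to negative/neutral/positive.
example_raw_record = [
    'the $T$ was delicious but the service was slow',  # sentence, aspect masked by $T$
    'food',                                            # aspect term
    '1',                                               # polarity ('1' -> positive)
]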


class AsgcnData2014Rest(AsgcnData):
"""
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data(self):
train_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval14', 'restaurant_train.raw')
test_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval14', 'restaurant_test.raw')
return super()._load_data_by_filepath(train_filepath, test_filepath)


class AsgcnData2014Lapt(AsgcnData):
"""
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data(self):
train_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval14', 'laptop_train.raw')
test_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval14', 'laptop_test.raw')
return super()._load_data_by_filepath(train_filepath, test_filepath)


class AsgcnData2015Rest(AsgcnData):
"""
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data(self):
train_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval15', 'restaurant_train.raw')
test_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval15', 'restaurant_test.raw')
return super()._load_data_by_filepath(train_filepath, test_filepath)


class AsgcnData2016Rest(AsgcnData):
"""
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data(self):
train_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval16', 'restaurant_train.raw')
test_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval16', 'restaurant_test.raw')
return super()._load_data_by_filepath(train_filepath, test_filepath)


class YelpDataset(BaseDataset):
"""
"""

def __init__(self, configuration: dict=None):
if 'max_word_num' not in configuration:
configuration['max_word_num'] = sys.maxsize
if 'max_sample_num_per_class' not in configuration:
configuration['max_sample_num_per_class'] = sys.maxsize
super().__init__(configuration)

def _load_data_by_filepath(self, train_filepath, test_filepath=None, val_filepath=None):
"""
:return: train, dev, and test document lists (dev is split from train; test reuses dev here)
"""
data_type_and_datas = {}
data_type_and_filepath = {
'train': train_filepath,
'test': test_filepath,
'dev': val_filepath
}
for data_type, filepath in data_type_and_filepath.items():
if filepath is None:
data_type_and_datas[data_type] = None
continue
lines = file_utils.read_all_lines_generator(filepath)
documents = []
polarity_count = defaultdict(int)
for i, line in enumerate(lines):
# if i < 100000:
#     continue
# too many samples; the machine cannot handle them all
line_dict = json.loads(line)
text: str = line_dict['text']
if len(text.split()) > self.configuration['max_word_num']:
continue
stars = line_dict['stars']
if stars > 3:
label = 'positive'
elif stars == 3:
label = 'neutral'
else:
label = 'negative'
# per-class cap (30k, following 2018-Exploiting Document Knowledge for Aspect-level Sentiment Classification)
if polarity_count[label] > self.configuration['max_sample_num_per_class']:
continue
else:
polarity_count[label] += 1
document = AbsaDocument(text, label, None, None, None)
documents.append(document)
data_type_and_datas[data_type] = documents
train_data = data_type_and_datas['train']
train_data, dev_data = train_test_split(train_data, test_size=0.2)
test_data = dev_data
return train_data, dev_data, test_data

def _load_data(self):
train_filepath = self.configuration['train_filepath']
return self._load_data_by_filepath(train_filepath)
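
# A hypothetical input line (not from the repo's data) of the kind YelpDataset
# consumes: one JSON object per line, of which only 'text' and 'stars' are read;
# stars > 3 -> positive, == 3 -> neutral, < 3 -> negative.
example_yelp_line = '{"text": "Great food, friendly staff.", "stars": 5}'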


class Semeval2014Task4(BaseDataset):
"""
@@ -1342,106 +1181,6 @@ def load_csv_data(filepath, skip_first_line=True):
return result


class Bdci2019InternetNews(BaseDataset):
"""
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data(self):
train_filepath = os.path.join(base_data_dir, 'bdci2019', '互联网新闻情感分析',
'Train_DataSet.csv')
train_label_filepath = os.path.join(base_data_dir, 'bdci2019', '互联网新闻情感分析',
'Train_DataSet_Label.csv')
train_rows = load_csv_data(train_filepath)
train_label_rows = load_csv_data(train_label_filepath)

test_filepath = os.path.join(base_data_dir, 'bdci2019', '互联网新闻情感分析',
'Test_DataSet.csv')
test_label_filepath = os.path.join(base_data_dir, 'bdci2019', '互联网新闻情感分析',
'submit_example.csv')
test_rows = load_csv_data(test_filepath)
test_label_rows = load_csv_data(test_label_filepath)

data = {
'train': [train_rows, train_label_rows],
'test': [test_rows, test_label_rows]
}
result = {}
for data_type, [data_rows, label_rows] in data.items():
samples = []
for i in range(len(data_rows)):
sample_id = data_rows[i][0]
data = '。'.join(data_rows[i][1:])
label = label_rows[i][1]
sample = Text(data, label, sample_id=sample_id)
samples.append(sample)
result[data_type] = samples
train_data = result['train']
dev_data = None
test_data = result['test']
return train_data, dev_data, test_data


class Bdci2019FinancialNegative(BaseDataset):
"""
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data(self):
"""
:return: train samples, None for dev, and test samples
"""
train_filepath = os.path.join(base_data_dir, 'bdci2019', '金融信息负面及主体判定',
'Train_Data.csv')
train_rows = load_csv_data(train_filepath)
test_filepath = os.path.join(base_data_dir, 'bdci2019', '金融信息负面及主体判定',
'Test_Data.csv')
test_label_filepath = os.path.join(base_data_dir, 'bdci2019', '金融信息负面及主体判定',
'Submit_Example.csv')
test_rows = load_csv_data(test_filepath)
test_label_rows = load_csv_data(test_label_filepath)
data_rows = {
'train': train_rows,
'test': test_rows
}
data = {
'train': None,
'test': None
}
for data_type, rows in data_rows.items():
samples = []
for row in rows:
sample_id = row[0]
title = row[1]
content = row[2]
entities = row[3].split(';')
text_polarity = row[4] if data_type == 'train' else 0
key_entities = row[5].split(';') if data_type == 'train' else []
aspect_categories = []
for entity in entities:
if entity in key_entities:
polarity = '1'
else:
polarity = '0'
aspect_category = AspectCategory(entity, polarity)
aspect_categories.append(aspect_category)
text = '%scontent-begin。%s' % (title, content)
document = AbsaDocument(text, text_polarity, aspect_categories, None, None, sample_id=sample_id)
samples.append(document)
data[data_type] = samples
train_data = data['train']
dev_data = None
test_data = data['test']
return train_data, dev_data, test_data
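
# A hypothetical Train_Data.csv row (not from the repo's data) of the shape
# Bdci2019FinancialNegative._load_data parses; entities appearing in the
# key-entities column get polarity '1', all others '0'.
example_row = [
    '42',                                             # sample_id
    'Company A under investigation',                  # title
    'Regulators opened an inquiry into Company A.',   # content
    'Company A;Company B',                            # candidate entities, ';'-separated
    '1',                                              # text-level polarity
    'Company A',                                      # key (negative) entities
]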


suported_dataset_names_and_data_loader = {
'SemEval-2014-Task-4-LAPT': Semeval2014Task4Lapt,
'SemEval-2014-Task-4-REST': Semeval2014Task4Rest,
@@ -1457,15 +1196,8 @@ def _load_data(self):
'SemEval-2016-Task-5-LAPT-SB2': Semeval2016Task5LaptSub2,
'SemEval-2016-Task-5-REST-SB1': Semeval2016Task5RestSub1,
'SemEval-2016-Task-5-REST-SB2': Semeval2016Task5RestSub2,
'bdci2019-internet-news-sa': Bdci2019InternetNews,
'bdci2019-financial-negative': Bdci2019FinancialNegative,
'AsgcnData2014Lapt': AsgcnData2014Lapt,
'AsgcnData2014Rest': AsgcnData2014Rest,
'AsgcnData2015Rest': AsgcnData2015Rest,
'AsgcnData2016Rest': AsgcnData2016Rest,
'MAMSACSA': MAMSACSA,
'MAMSATSA': MAMSATSA,
'yelp-dataset': YelpDataset
}


@@ -1482,11 +1214,3 @@ def get_dataset_class_by_name(dataset_name):
dataset_name = 'SemEval-2014-Task-4-REST'
dataset = get_dataset_class_by_name(dataset_name)()
print('')
Binary file not shown.
