Commit

docs: update README.md
l294265421 committed Jan 29, 2020
1 parent 9e47ad8 commit f33a6be
Showing 3 changed files with 27 additions and 278 deletions.
29 changes: 27 additions & 2 deletions README.md
@@ -10,6 +10,31 @@
* allennlp 0.9.0

## Usage

### Supported datasets
- [SemEval-2014 Task 4](http://alt.qcri.org/semeval2014/task4/)
- SemEval-2014-Task-4-LAPT
- SemEval-2014-Task-4-REST
- [2018-aaai-Learning to Attend via Word-Aspect Associative Fusion for Aspect-based Sentiment Analysis](https://arxiv.org/abs/1712.05403v1)
- SemEval-2014-Task-4-REST-DevSplits
- [2018-acl-Aspect Based Sentiment Analysis with Gated Convolutional Networks](https://arxiv.org/abs/1805.07043)
- SemEval-2014-Task-4-REST-Hard
- SemEval-141516-LARGE-REST-HARD
- [SemEval-2015 Task 12](http://alt.qcri.org/semeval2015/task12/)
- SemEval-2015-Task-12-LAPT
- SemEval-2015-Task-12-REST
- SemEval-2015-Task-12-HOTEL
- [SemEval-2016 Task 5](http://alt.qcri.org/semeval2016/task5/)
- SemEval-2016-Task-5-CH-CAME-SB1
- SemEval-2016-Task-5-CH-PHNS-SB1
- SemEval-2016-Task-5-LAPT-SB1
- SemEval-2016-Task-5-LAPT-SB2
- SemEval-2016-Task-5-REST-SB1
- SemEval-2016-Task-5-REST-SB2
- [2019-emnlp-A Challenge Dataset and Effective Models for Aspect Based Sentiment Analysis](https://www.aclweb.org/anthology/D19-1654.pdf)
- MAMSACSA
- MAMSATSA
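
These dataset names are the values accepted by the `--current_dataset` flag used below. A minimal sketch (assuming the repo root is on `PYTHONPATH`) of loading one of them programmatically through the loader registry in `acsa/data_adapter/data_object.py`:

```python
# Minimal sketch, assuming the repo root is on PYTHONPATH; the helper is
# defined in acsa/data_adapter/data_object.py.
from acsa.data_adapter.data_object import get_dataset_class_by_name

dataset_class = get_dataset_class_by_name('SemEval-2014-Task-4-REST')
dataset = dataset_class()  # instantiating the class loads the train/dev/test splits
```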

### Aspect-Category Sentiment Classification (ACSC) Models
#### Supported models
- ae-lstm [Attention-based LSTM for Aspect-level Sentiment Classification](https://www.aclweb.org/anthology/D16-1058.pdf)
@@ -21,10 +21,10 @@
- Heat (papers/2017-CIKM-Aspect-level Sentiment Classification with HEAT (HiErarchical ATtention) Network.pdf)
![heat](images/heat.png)
#### Training
sh scripts/run.sh acsa/acac_pytorch/acsc_bootstrap.py --model_name Heat --embedding_filepath glove.840B.300d.txt
sh scripts/run.sh acsa/acac_pytorch/acsc_bootstrap.py --model_name Heat --current_dataset SemEval-2014-Task-4-REST-DevSplits --embedding_filepath glove.840B.300d.txt

#### Visualization
sh scripts/run.sh acsa/acac_pytorch/acsc_bootstrap.py --model_name Heat --embedding_filepath glove.840B.300d.txt --train False --visualize_attention True
sh scripts/run.sh acsa/acac_pytorch/acsc_bootstrap.py --model_name Heat --current_dataset SemEval-2014-Task-4-REST-DevSplits --embedding_filepath glove.840B.300d.txt --train False --visualize_attention True

### Aspect Category Detection (ACD) Models

276 changes: 0 additions & 276 deletions acsa/data_adapter/data_object.py
@@ -224,167 +224,6 @@ def generate_dev_data(self, result, dev_size):
result['dev'] = result['test']


class AsgcnData(BaseDataset):
"""
Aspect-based Sentiment Classification with Aspect-specific Graph Convolutional Networks
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data_by_filepath(self, train_filepath, test_filepath):
data_type_and_filepath = {'train': train_filepath,
'test': test_filepath}
data_type_and_data = {}
for data_type, filepath in data_type_and_filepath.items():
lines = file_utils.read_all_lines(filepath)
sentences = []
polarity_mapping = {'-1': 'negative',
'0': 'neutral',
'1': 'positive'}
for i in range(0, len(lines), 3):
text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
aspect = lines[i + 1].lower().strip()
polarity = lines[i + 2].strip()
if text_left != '':
text = text_left + " " + aspect
from_index = len(text_left) + 1
else:
text = aspect
from_index = 0
if text_right != '':
text = text + ' ' + text_right
to_index = from_index + len(aspect)
if text[from_index: to_index] != aspect:
logger.error('error aspect index: %s != %s' % (text[from_index: to_index], aspect))
aspect_term = AspectTerm(aspect, polarity_mapping[polarity], from_index, to_index)
sentence = AbsaSentence(text, None, None, [aspect_term])
sentences.append(sentence)
documents = [AbsaDocument(sentence.text, None, None, None, [sentence]) for sentence in sentences]
data_type_and_data[data_type] = documents
return data_type_and_data['train'], None, data_type_and_data['test']
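
# A hypothetical record (not from the repo's data) illustrating the three-line
# ASGCN .raw format parsed by _load_data_by_filepath above; polarity labels
# -1/0/1 map to negative/neutral/positive.
example_raw_record = [
    'the $T$ was delicious but the service was slow',  # sentence, aspect masked by $T$
    'food',                                            # aspect term
    '1',                                               # polarity ('1' -> positive)
]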


class AsgcnData2014Rest(AsgcnData):
"""
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data(self):
train_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval14', 'restaurant_train.raw')
test_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval14', 'restaurant_test.raw')
return super()._load_data_by_filepath(train_filepath, test_filepath)


class AsgcnData2014Lapt(AsgcnData):
"""
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data(self):
train_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval14', 'laptop_train.raw')
test_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval14', 'laptop_test.raw')
return super()._load_data_by_filepath(train_filepath, test_filepath)


class AsgcnData2015Rest(AsgcnData):
"""
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data(self):
train_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval15', 'restaurant_train.raw')
test_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval15', 'restaurant_test.raw')
return super()._load_data_by_filepath(train_filepath, test_filepath)


class AsgcnData2016Rest(AsgcnData):
"""
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data(self):
train_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval16', 'restaurant_train.raw')
test_filepath = os.path.join(base_data_dir, 'ASGCN', 'semeval16', 'restaurant_test.raw')
return super()._load_data_by_filepath(train_filepath, test_filepath)


class YelpDataset(BaseDataset):
"""
"""

def __init__(self, configuration: dict=None):
if 'max_word_num' not in configuration:
configuration['max_word_num'] = sys.maxsize
if 'max_sample_num_per_class' not in configuration:
configuration['max_sample_num_per_class'] = sys.maxsize
super().__init__(configuration)

def _load_data_by_filepath(self, train_filepath, test_filepath=None, val_filepath=None):
"""
:return: train, dev, and test document lists (dev is split from train; test reuses dev here)
"""
data_type_and_datas = {}
data_type_and_filepath = {
'train': train_filepath,
'test': test_filepath,
'dev': val_filepath
}
for data_type, filepath in data_type_and_filepath.items():
if filepath is None:
data_type_and_datas[data_type] = None
continue
lines = file_utils.read_all_lines_generator(filepath)
documents = []
polarity_count = defaultdict(int)
for i, line in enumerate(lines):
# if i < 100000:
#     continue
# too many samples; the machine cannot handle them all
line_dict = json.loads(line)
text: str = line_dict['text']
if len(text.split()) > self.configuration['max_word_num']:
continue
stars = line_dict['stars']
if stars > 3:
label = 'positive'
elif stars == 3:
label = 'neutral'
else:
label = 'negative'
# per-class cap (30k, following 2018-Exploiting Document Knowledge for Aspect-level Sentiment Classification)
if polarity_count[label] > self.configuration['max_sample_num_per_class']:
continue
else:
polarity_count[label] += 1
document = AbsaDocument(text, label, None, None, None)
documents.append(document)
data_type_and_datas[data_type] = documents
train_data = data_type_and_datas['train']
train_data, dev_data = train_test_split(train_data, test_size=0.2)
test_data = dev_data
return train_data, dev_data, test_data

def _load_data(self):
train_filepath = self.configuration['train_filepath']
return self._load_data_by_filepath(train_filepath)
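
# A hypothetical input line (not from the repo's data) of the kind YelpDataset
# consumes: one JSON object per line, of which only 'text' and 'stars' are read;
# stars > 3 -> positive, == 3 -> neutral, < 3 -> negative.
example_yelp_line = '{"text": "Great food, friendly staff.", "stars": 5}'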


class Semeval2014Task4(BaseDataset):
"""
@@ -1342,106 +1181,6 @@ def load_csv_data(filepath, skip_first_line=True):
return result


class Bdci2019InternetNews(BaseDataset):
"""
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data(self):
train_filepath = os.path.join(base_data_dir, 'bdci2019', '互联网新闻情感分析',
'Train_DataSet.csv')
train_label_filepath = os.path.join(base_data_dir, 'bdci2019', '互联网新闻情感分析',
'Train_DataSet_Label.csv')
train_rows = load_csv_data(train_filepath)
train_label_rows = load_csv_data(train_label_filepath)

test_filepath = os.path.join(base_data_dir, 'bdci2019', '互联网新闻情感分析',
'Test_DataSet.csv')
test_label_filepath = os.path.join(base_data_dir, 'bdci2019', '互联网新闻情感分析',
'submit_example.csv')
test_rows = load_csv_data(test_filepath)
test_label_rows = load_csv_data(test_label_filepath)

data = {
'train': [train_rows, train_label_rows],
'test': [test_rows, test_label_rows]
}
result = {}
for data_type, [data_rows, label_rows] in data.items():
samples = []
for i in range(len(data_rows)):
sample_id = data_rows[i][0]
data = '。'.join(data_rows[i][1:])
label = label_rows[i][1]
sample = Text(data, label, sample_id=sample_id)
samples.append(sample)
result[data_type] = samples
train_data = result['train']
dev_data = None
test_data = result['test']
return train_data, dev_data, test_data


class Bdci2019FinancialNegative(BaseDataset):
"""
"""

def __init__(self, configuration: dict = None):
super().__init__(configuration)

def _load_data(self):
"""
:return: train samples, None for dev, and test samples
"""
train_filepath = os.path.join(base_data_dir, 'bdci2019', '金融信息负面及主体判定',
'Train_Data.csv')
train_rows = load_csv_data(train_filepath)
test_filepath = os.path.join(base_data_dir, 'bdci2019', '金融信息负面及主体判定',
'Test_Data.csv')
test_label_filepath = os.path.join(base_data_dir, 'bdci2019', '金融信息负面及主体判定',
'Submit_Example.csv')
test_rows = load_csv_data(test_filepath)
test_label_rows = load_csv_data(test_label_filepath)
data_rows = {
'train': train_rows,
'test': test_rows
}
data = {
'train': None,
'test': None
}
for data_type, rows in data_rows.items():
samples = []
for row in rows:
sample_id = row[0]
title = row[1]
content = row[2]
entities = row[3].split(';')
text_polarity = row[4] if data_type == 'train' else 0
key_entities = row[5].split(';') if data_type == 'train' else []
aspect_categories = []
for entity in entities:
if entity in key_entities:
polarity = '1'
else:
polarity = '0'
aspect_category = AspectCategory(entity, polarity)
aspect_categories.append(aspect_category)
text = '%scontent-begin。%s' % (title, content)
document = AbsaDocument(text, text_polarity, aspect_categories, None, None, sample_id=sample_id)
samples.append(document)
data[data_type] = samples
train_data = data['train']
dev_data = None
test_data = data['test']
return train_data, dev_data, test_data
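
# A hypothetical Train_Data.csv row (not from the repo's data) of the shape
# Bdci2019FinancialNegative._load_data parses; entities appearing in the
# key-entities column get polarity '1', all others '0'.
example_row = [
    '42',                                             # sample_id
    'Company A under investigation',                  # title
    'Regulators opened an inquiry into Company A.',   # content
    'Company A;Company B',                            # candidate entities, ';'-separated
    '1',                                              # text-level polarity
    'Company A',                                      # key (negative) entities
]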


suported_dataset_names_and_data_loader = {
'SemEval-2014-Task-4-LAPT': Semeval2014Task4Lapt,
'SemEval-2014-Task-4-REST': Semeval2014Task4Rest,
@@ -1457,15 +1196,8 @@ def _load_data(self):
'SemEval-2016-Task-5-LAPT-SB2': Semeval2016Task5LaptSub2,
'SemEval-2016-Task-5-REST-SB1': Semeval2016Task5RestSub1,
'SemEval-2016-Task-5-REST-SB2': Semeval2016Task5RestSub2,
'bdci2019-internet-news-sa': Bdci2019InternetNews,
'bdci2019-financial-negative': Bdci2019FinancialNegative,
'AsgcnData2014Lapt': AsgcnData2014Lapt,
'AsgcnData2014Rest': AsgcnData2014Rest,
'AsgcnData2015Rest': AsgcnData2015Rest,
'AsgcnData2016Rest': AsgcnData2016Rest,
'MAMSACSA': MAMSACSA,
'MAMSATSA': MAMSATSA,
'yelp-dataset': YelpDataset
}


@@ -1482,11 +1214,3 @@ def get_dataset_class_by_name(dataset_name):
dataset_name = 'SemEval-2014-Task-4-REST'
dataset = get_dataset_class_by_name(dataset_name)()
print('')
Binary file not shown.
