forked from mbzuai-nlp/SemEval2024-task8
Showing 7 changed files with 22,470 additions and 1,002 deletions.
@@ -0,0 +1,26 @@
#!/bin/bash

# Assumes train.py uses HfArgumentParser to parse these command-line arguments.

python train.py \
    --model_path "allenai/longformer-base-4096" \
    --train_csv "../data/train/train_chatgpt.csv" \
    --dev_csv "../data/dev/dev.csv" \
    --test_csvs ../data/dev/dev.csv \
    --metric_for_best_model "eval_mean_absolute_diff" \
    --greater_is_better False \
    --do_train True \
    --do_predict True \
    --seed 55 \
    --output_dir "./runs/exp_5" \
    --logging_dir "./runs/exp_5/logs" \
    --num_train_epochs 10 \
    --per_device_train_batch_size 32 \
    --per_device_eval_batch_size 32 \
    --auto_find_batch_size True \
    --logging_steps 10 \
    --load_best_model_at_end True \
    --evaluation_strategy "epoch" \
    --save_strategy "epoch" \
    --save_total_limit 2
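Note: --test_csvs accepts one or more space-separated paths (HfArgumentParser exposes List[str] fields as nargs="+" arguments), and train.py below writes one predictions CSV per test file under <output_dir>/predictions/.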
@@ -0,0 +1,362 @@
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import pandas as pd
import numpy as np

from dataclasses import dataclass, field
from typing import List
import transformers
import logging
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()


@dataclass
class ModelConfig:
    model_path: str = "allenai/longformer-base-4096"


@dataclass
class DatasetConfig:
    train_csv: str = field(default=None, metadata={"help": "Path to train csv file"})
    dev_csv: str = field(default=None, metadata={"help": "Path to dev csv file"})
    test_csvs: List[str] = field(
        default=None, metadata={"help": "Path to test csv files"}
    )


@dataclass
class TrainingArgsConfig(transformers.TrainingArguments):
    seed: int = 42
    output_dir: str = "./runs/exp_3"
    num_train_epochs: int = 10
    per_device_train_batch_size: int = 32
    per_device_eval_batch_size: int = 32
    auto_find_batch_size: bool = True
    logging_dir: str = "./runs/exp_3/logs"
    logging_steps: int = 10
    load_best_model_at_end: bool = True
    evaluation_strategy: str = "epoch"
    save_strategy: str = "epoch"
    save_total_limit: int = 2

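# The dataclass defaults above are overridden by the command-line flags in the
# accompanying run script (e.g. --output_dir "./runs/exp_5" replaces "./runs/exp_3").

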
class Semeval_Data(torch.utils.data.Dataset):
    def __init__(self, data_path, max_length=1024, inference=False, debug=False):
        self.data = pd.read_csv(data_path).to_dict("records")
        self.inference = inference
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
        self.max_length = max_length
        self.debug = debug

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["mixed_review"]
        uuid = self.data[idx]["uuid"]
        if not self.inference:
            label = self.data[idx]["human_end_boundary"]

        if self.debug and not self.inference:
            print("Original Human Position: ", label)

        labels = []
        corresponding_word = []
        tokens = []
        input_ids = []
        attention_mask = []

        # Tokenize word by word so each sub-word token can be mapped back to its
        # word index; words at or past the boundary are labeled 1 (machine text).
        for jdx, word in enumerate(text.split(" ")):
            word_encoded = self.tokenizer.tokenize(word)
            sub_words = len(word_encoded)

            if not self.inference:
                is_machine_text = 1 if jdx >= label else 0
                labels.extend([is_machine_text] * sub_words)

            corresponding_word.extend([jdx] * sub_words)
            tokens.extend(word_encoded)
            input_ids.extend(self.tokenizer.convert_tokens_to_ids(word_encoded))
            attention_mask.extend([1] * sub_words)

        # Pad short sequences: wrap with <s> (id 0) and </s> (id 2), pad input_ids
        # with <pad> (id 1), and pad labels with -100 so the loss ignores padding.
        if len(input_ids) < self.max_length - 2:
            input_ids = (
                [0] + input_ids + [2] + [1] * (self.max_length - len(input_ids) - 2)
            )
            if not self.inference:
                labels = [-100] + labels + [-100] * (self.max_length - len(labels) - 1)

            attention_mask = (
                [1]
                + attention_mask
                + [1]
                + [0] * (self.max_length - len(attention_mask) - 2)
            )
            corresponding_word = (
                [-100]
                + corresponding_word
                + [-100] * (self.max_length - len(corresponding_word) - 1)
            )
            tokens = (
                ["<s>"]
                + tokens
                + ["</s>"]
                + ["<pad>"] * (self.max_length - len(tokens) - 2)
            )
        else:
            # Truncate long sequences, reserving room for <s>/</s>; the special
            # positions get label -100 so they are ignored by the loss.
            input_ids = [0] + input_ids[: self.max_length - 2] + [2]

            if not self.inference:
                labels = [-100] + labels[: self.max_length - 2] + [-100]

            corresponding_word = (
                [-100] + corresponding_word[: self.max_length - 2] + [-100]
            )
            attention_mask = [1] + attention_mask[: self.max_length - 2] + [1]
            tokens = ["<s>"] + tokens[: self.max_length - 2] + ["</s>"]

        encoded = {}
        if not self.inference:
            encoded["labels"] = torch.tensor(labels)
        encoded["input_ids"] = torch.tensor(input_ids)
        encoded["attention_mask"] = torch.tensor(attention_mask)

        if not self.inference:
            assert encoded["input_ids"].shape == encoded["labels"].shape

        if self.debug and not self.inference:
            print("Tokenized Human Position: ", labels.index(1))
            print("Original Human Position: ", label)
            print("Full Human Text:", text)
            print("\n")
            print("Human Text Truncated:", text.split(" ")[:label])
            print("\n")
            encoded["partial_human_review"] = " ".join(text.split(" ")[:label])

        if self.inference:
            encoded["text"] = text
            encoded["uuid"] = uuid
            encoded["corresponding_word"] = corresponding_word

        return encoded

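# Illustrative usage (assumes a CSV with "mixed_review", "uuid", and
# "human_end_boundary" columns, as in the data paths used by the run script):
#   ds = Semeval_Data("../data/dev/dev.csv", debug=True)
#   item = ds[0]
#   item["input_ids"].shape  # torch.Size([1024]) after padding/truncation
#   item["labels"]           # -100 on special/pad positions, 0 = human, 1 = machine

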
def evaluate_position_difference(actual_position, predicted_position):
    """
    Compute the absolute difference between the actual and predicted start positions.

    Args:
    - actual_position (int): Actual start position of machine-generated text.
    - predicted_position (int): Predicted start position of machine-generated text.

    Returns:
    - int: Absolute difference between the start positions.
    """
    return abs(actual_position - predicted_position)


def get_start_position(sequence, mapping=None, token_level=True):
    """
    Get the start position from a sequence of labels or predictions.

    Args:
    - sequence (np.array): A sequence of labels or predictions.
    - mapping (np.array): Mapping from index to word for the sequence.
    - token_level (bool): If True, return positional indices; else, return word mappings.

    Returns:
    - int: Start position in the sequence (token index, or word index if mapping is used).
    """
    if mapping is not None:
        mask = mapping != -100
        sequence = sequence[mask]
        mapping = mapping[mask]

    # Locate the first position labeled '1'; fall back to the last index if absent.
    index = np.where(sequence == 1)[0]
    value = index[0] if index.size else (len(sequence) - 1)

    if not token_level:
        value = mapping[value]

    return value

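# Worked examples (illustrative):
#   get_start_position(np.array([0, 0, 1, 1]))             # -> 2 (first machine token)
#   get_start_position(np.array([0, 0, 1, 1]),
#                      mapping=np.array([0, 0, 1, 1]),
#                      token_level=False)                   # -> 1 (first machine word)
#   get_start_position(np.array([0, 0, 0, 0]))              # -> 3 (no '1': last index)

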
def evaluate_machine_start_position(
    labels, predictions, idx2word=None, token_level=False
):
    """
    Evaluate the starting position of machine-generated text in both predicted and actual sequences.

    Args:
    - labels (np.array): Actual labels.
    - predictions (np.array): Predicted per-class scores (argmax is taken over the last axis).
    - idx2word (np.array): Mapping from index to word for each sequence in the batch.
    - token_level (bool): Flag to determine if evaluation is at token level. If True, return positional indices; else, return word mappings.

    Returns:
    - float: Mean absolute difference between the start positions in predictions and actual labels.
    """
    predicted_positions = predictions.argmax(axis=-1)

    actual_starts = []
    predicted_starts = []

    if not token_level and idx2word is None:
        raise ValueError(
            "idx2word must be provided if evaluation is at word level (token_level=False)"
        )

    for idx in range(labels.shape[0]):
        # Remove padding
        mask = labels[idx] != -100
        predict, label, mapping = (
            predicted_positions[idx][mask],
            labels[idx][mask],
            idx2word[idx][mask] if not token_level else None,
        )

        # If token_level is True, just use the index; otherwise, map to word
        predicted_value = get_start_position(predict, mapping, token_level)
        actual_value = get_start_position(label, mapping, token_level)

        predicted_starts.append(predicted_value)
        actual_starts.append(actual_value)

        # Log cases where label '1' is not found
        # if not np.where(predict == 1)[0].size:
        #     print(f"No start label (1) in prediction at sequence {idx}")
        # if not np.where(label == 1)[0].size:
        #     print(f"No start label (1) in actual label at sequence {idx}")

    position_differences = [
        evaluate_position_difference(actual, predict)
        for actual, predict in zip(actual_starts, predicted_starts)
    ]
    mean_position_difference = np.mean(position_differences)

    return mean_position_difference

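# Illustrative check with a single 4-token sequence and 2-class scores:
#   labels = np.array([[0, 0, 1, 1]])
#   scores = np.array([[[0.9, 0.1], [0.2, 0.8], [0.3, 0.7], [0.1, 0.9]]])
#   argmax gives [[0, 1, 1, 1]]: predicted start 1 vs. actual start 2, so
#   evaluate_machine_start_position(labels, scores, token_level=True) == 1.0

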
def compute_metrics(p):
    pred, labels = p
    mean_absolute_diff = evaluate_machine_start_position(labels, pred, token_level=True)

    return {
        "mean_absolute_diff": mean_absolute_diff,
    }

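# Note: Trainer prefixes evaluation metrics with "eval_", so this metric surfaces
# as "eval_mean_absolute_diff" -- the name the run script passes to
# --metric_for_best_model, with --greater_is_better False (smaller error is better).

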
if __name__ == "__main__":
    parser = transformers.HfArgumentParser(
        (ModelConfig, DatasetConfig, TrainingArgsConfig)
    )
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print("Model Arguments: ", model_args)
    print("Data Arguments: ", data_args)
    print("Training Arguments: ", training_args)

    # Set seed
    transformers.set_seed(training_args.seed)

    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_path, num_labels=2, trust_remote_code=True
    )
    if (training_args.do_eval or training_args.do_predict) and not training_args.do_train:
        # If not in training mode, load the best model from the output directory.
        if not os.path.exists(training_args.output_dir):
            raise ValueError(
                "Output directory ({}) does not exist. Please train the model first.".format(
                    training_args.output_dir
                )
            )
        logger.info("Trying to load the best model from the output directory...")

        model = AutoModelForTokenClassification.from_pretrained(
            training_args.output_dir, num_labels=2, trust_remote_code=True
        )

    # Leftover hardcoded checkpoint override from a local experiment; uncomment to
    # resume from that specific checkpoint instead of the loads above.
    # model = AutoModelForTokenClassification.from_pretrained(
    #     "/home/osama/projects/semeval_private/semeval2024-private/semeval-taskC/baseline/runs/exp_4/checkpoint-687",
    #     num_labels=2,
    #     trust_remote_code=True,
    # )

    train_set = Semeval_Data(data_args.train_csv)
    dev_set = Semeval_Data(data_args.dev_csv)

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=dev_set,
        tokenizer=train_set.tokenizer,
        compute_metrics=compute_metrics,
    )

    if training_args.do_train:
        logger.info("Training...")
        logger.info("*** Train Dataset ***")
        logger.info(f"Number of samples: {len(train_set)}")
        logger.info("*** Dev Dataset ***")
        logger.info(f"Number of samples: {len(dev_set)}")

        trainer.train()

        logger.info("Training completed!")

    if training_args.do_eval:
        logger.info("Evaluating...")
        logger.info("*** Dev Dataset ***")
        logger.info(f"Number of samples: {len(dev_set)}")

        metrics = trainer.evaluate()
        logger.info(f"Metrics: {metrics}")
        trainer.save_metrics("eval", metrics)

        logger.info("Evaluation completed!")

    if training_args.do_predict:
        test_sets = []
        for test_csv in data_args.test_csvs:
            test_set = Semeval_Data(test_csv, inference=True)
            test_sets.append(test_set)
        logger.info("Predicting...")
        logger.info("*** Test Datasets ***")
        logger.info(f"Number of test datasets: {len(test_sets)}")

        for idx, test_set in enumerate(test_sets):
            logger.info(f"Test Dataset {idx + 1}")
            logger.info(f"Number of samples: {len(test_set)}")

            predictions, _, _ = trainer.predict(test_set)
            logger.info("Predictions completed!")

            # Map each sequence's predicted token-level start back to a word index
            # and write one predictions CSV per test file.
            df = pd.DataFrame(
                {
                    "uuid": [i["uuid"] for i in test_set],
                    "start_position": [
                        get_start_position(
                            i[0],
                            np.array(i[1]["corresponding_word"]),
                            token_level=False,
                        )
                        for i in list(zip(predictions.argmax(axis=-1), test_set))
                    ],
                }
            )

            file_name = os.path.basename(data_args.test_csvs[idx])
            file_dirs = os.path.join(training_args.output_dir, "predictions")
            os.makedirs(file_dirs, exist_ok=True)
            file_path = os.path.join(file_dirs, file_name)
            df.to_csv(file_path, index=False)