Add: SubtaskC data
OAfzal committed Sep 19, 2023
1 parent a9719d5 commit 0334c06
Showing 7 changed files with 22,470 additions and 1,002 deletions.
26 changes: 26 additions & 0 deletions subtaskC/baseline/run.sh
@@ -0,0 +1,26 @@
#!/bin/bash

# Assuming you've set up HfArgumentParser to parse command line arguments in train.py

python train.py \
--model_path "allenai/longformer-base-4096" \
--train_csv "../data/train/train_chatgpt.csv" \
--dev_csv "../data/dev/dev.csv" \
--test_csvs ../data/dev/dev.csv \
--metric_for_best_model "eval_mean_absolute_diff" \
--greater_is_better False \
--do_train True \
--do_predict True \
--seed 55 \
--output_dir "./runs/exp_5" \
--logging_dir "./runs/exp_5/logs" \
--num_train_epochs 10 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--auto_find_batch_size True \
--logging_steps 10 \
--load_best_model_at_end True \
--evaluation_strategy "epoch" \
--save_strategy "epoch" \
--save_total_limit 2
362 changes: 362 additions & 0 deletions subtaskC/baseline/train.py
@@ -0,0 +1,362 @@
import torch
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification
import pandas as pd
import numpy as np

from dataclasses import dataclass, field
from typing import Any, List, Optional
import transformers
import logging
import glob
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()


@dataclass
class ModelConfig:
    model_path: str = "allenai/longformer-base-4096"


@dataclass
class DatasetConfig:
    train_csv: str = field(default=None, metadata={"help": "Path to train csv file"})
    dev_csv: str = field(default=None, metadata={"help": "Path to dev csv file"})
    test_csvs: List[str] = field(
        default=None, metadata={"help": "Path to test csv files"}
    )


@dataclass
class TrainingArgsConfig(transformers.TrainingArguments):
    seed: int = 42
    output_dir: str = "./runs/exp_3"
    num_train_epochs: int = 10
    per_device_train_batch_size: int = 32
    per_device_eval_batch_size: int = 32
    auto_find_batch_size: bool = True
    logging_dir: str = "./runs/exp_3/logs"
    logging_steps: int = 10
    load_best_model_at_end: bool = True
    evaluation_strategy: str = "epoch"
    save_strategy: str = "epoch"
    save_total_limit: int = 2


class Semeval_Data(torch.utils.data.Dataset):
    def __init__(self, data_path, max_length=1024, inference=False, debug=False):
        self.data = pd.read_csv(data_path).to_dict("records")
        self.inference = inference
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
        self.max_length = max_length
        self.debug = debug

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["mixed_review"]
        uuid = self.data[idx]["uuid"]
        if not self.inference:
            label = self.data[idx]["human_end_boundary"]

        if self.debug and not self.inference:
            print("Original Human Position: ", label)

        labels = []
        corresponding_word = []
        tokens = []
        input_ids = []
        attention_mask = []

        for jdx, word in enumerate(text.split(" ")):
            word_encoded = self.tokenizer.tokenize(word)
            sub_words = len(word_encoded)

            if not self.inference:
                is_machine_text = 1 if jdx >= label else 0
                labels.extend([is_machine_text] * sub_words)

            corresponding_word.extend([jdx] * sub_words)
            tokens.extend(word_encoded)
            input_ids.extend(self.tokenizer.convert_tokens_to_ids(word_encoded))
            attention_mask.extend([1] * sub_words)

        # Pad everything to max_length; labels and the token-to-word mapping are padded with -100
        if len(input_ids) < self.max_length - 2:
            input_ids = (
                [0] + input_ids + [2] + [1] * (self.max_length - len(input_ids) - 2)
            )
            if not self.inference:
                labels = [-100] + labels + [-100] * (self.max_length - len(labels) - 1)

            attention_mask = (
                [1]
                + attention_mask
                + [1]
                + [0] * (self.max_length - len(attention_mask) - 2)
            )
            corresponding_word = (
                [-100]
                + corresponding_word
                + [-100] * (self.max_length - len(corresponding_word) - 1)
            )
            tokens = (
                ["<s>"]
                + tokens
                + ["</s>"]
                + ["<pad>"] * (self.max_length - len(tokens) - 2)
            )
        else:
            # Truncate to max_length and add <s>/</s>; labels get -100 at the special tokens
            input_ids = [0] + input_ids[: self.max_length - 2] + [2]

            if not self.inference:
                labels = [-100] + labels[: self.max_length - 2] + [-100]

            corresponding_word = (
                [-100] + corresponding_word[: self.max_length - 2] + [-100]
            )
            attention_mask = [1] + attention_mask[: self.max_length - 2] + [1]
            tokens = ["<s>"] + tokens[: self.max_length - 2] + ["</s>"]

        encoded = {}
        if not self.inference:
            encoded["labels"] = torch.tensor(labels)
        encoded["input_ids"] = torch.tensor(input_ids)
        encoded["attention_mask"] = torch.tensor(attention_mask)

        if not self.inference:
            assert encoded["input_ids"].shape == encoded["labels"].shape

        if self.debug and not self.inference:
            print("Tokenized Human Position: ", labels.index(1))
            print("Original Human Position: ", label)
            print("Full Human Text:", text)
            print("\n")
            print("Human Text Truncated:", text.split(" ")[:label])
            print("\n")
            encoded["partial_human_review"] = " ".join(text.split(" ")[:label])

        if self.inference:
            encoded["text"] = text
            encoded["uuid"] = uuid
            encoded["corresponding_word"] = corresponding_word

        return encoded
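
The core of __getitem__ is the word-to-subword label expansion: the word-level boundary from human_end_boundary is turned into token-level 0/1 labels, with every sub-word inheriting the label of its word. A minimal sketch of that step in isolation (separate from the committed script; the sentence and the boundary index are invented, and the tokenizer is downloaded from the Hub):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

text = "The figures are clear and the ablation study is thorough"
boundary = 5  # hypothetical word index where machine-generated text starts

tokens, labels = [], []
for jdx, word in enumerate(text.split(" ")):
    pieces = tokenizer.tokenize(word)
    tokens.extend(pieces)
    # every sub-word token inherits the human (0) / machine (1) label of its word
    labels.extend([1 if jdx >= boundary else 0] * len(pieces))

print(list(zip(tokens, labels)))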


def evaluate_position_difference(actual_position, predicted_position):
    """
    Compute the absolute difference between the actual and predicted start positions.
    Args:
    - actual_position (int): Actual start position of machine-generated text.
    - predicted_position (int): Predicted start position of machine-generated text.
    Returns:
    - int: Absolute difference between the start positions.
    """
    return abs(actual_position - predicted_position)


def get_start_position(sequence, mapping=None, token_level=True):
    """
    Get the start position from a sequence of labels or predictions.
    Args:
    - sequence (np.array): A sequence of labels or predictions.
    - mapping (np.array): Mapping from index to word for the sequence.
    - token_level (bool): If True, return positional indices; else, return word mappings.
    Returns:
    - int: Start position in the sequence.
    """
    # Locate the position of label '1'

    if mapping is not None:
        mask = mapping != -100
        sequence = sequence[mask]
        mapping = mapping[mask]

    index = np.where(sequence == 1)[0]
    value = index[0] if index.size else (len(sequence) - 1)

    if not token_level:
        value = mapping[value]

    return value
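
A toy check of the two granularities get_start_position supports (values invented for the example; assumes the function above is in scope, e.g. when run inside this module):

import numpy as np

# three words, the second split into two sub-word tokens; the last entry is padding (-100)
preds = np.array([0, 0, 1, 1, 1])
mapping = np.array([0, 1, 1, 2, -100])

print(get_start_position(preds, mapping, token_level=True))   # 2 (token index)
print(get_start_position(preds, mapping, token_level=False))  # 1 (word index)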


def evaluate_machine_start_position(
    labels, predictions, idx2word=None, token_level=False
):
    """
    Evaluate the starting position of machine-generated text in both predicted and actual sequences.
    Args:
    - labels (np.array): Actual labels.
    - predictions (np.array): Predicted logits; the argmax over the last axis gives the predicted labels.
    - idx2word (np.array): Mapping from index to word for each sequence in the batch.
    - token_level (bool): Flag to determine if evaluation is at token level. If True, return positional indices; else, return word mappings.
    Returns:
    - float: Mean absolute difference between the start positions in predictions and actual labels.
    """
    predicted_positions = predictions.argmax(axis=-1)

    actual_starts = []
    predicted_starts = []

    if not token_level and idx2word is None:
        raise ValueError(
            "idx2word must be provided if evaluation is at word level (token_level=False)"
        )

    for idx in range(labels.shape[0]):
        # Remove padding
        mask = labels[idx] != -100
        predict, label, mapping = (
            predicted_positions[idx][mask],
            labels[idx][mask],
            idx2word[idx][mask] if not token_level else None,
        )

        # If token_level is True, just use the index; otherwise, map to word
        predicted_value = get_start_position(predict, mapping, token_level)
        actual_value = get_start_position(label, mapping, token_level)

        predicted_starts.append(predicted_value)
        actual_starts.append(actual_value)

        # Log cases where label '1' is not found
        # if not np.where(predict == 1)[0].size:
        #     print(f"No start label (1) in prediction at sequence {idx}")
        # if not np.where(label == 1)[0].size:
        #     print(f"No start label (1) in actual label at sequence {idx}")

    position_differences = [
        evaluate_position_difference(actual, predict)
        for actual, predict in zip(actual_starts, predicted_starts)
    ]
    mean_position_difference = np.mean(position_differences)

    return mean_position_difference


def compute_metrics(p):
    pred, labels = p
    mean_absolute_diff = evaluate_machine_start_position(labels, pred, token_level=True)

    return {
        "mean_absolute_diff": mean_absolute_diff,
    }
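
compute_metrics receives the usual (predictions, labels) pair from the Trainer, where predictions are token-classification logits of shape (batch, seq_len, 2) and labels use -100 at padded or special positions. A self-contained toy call with invented values, assuming the functions above are in scope, looks roughly like this; the reported metric is the "mean_absolute_diff" key, which the Trainer exposes as "eval_mean_absolute_diff" (the name used for metric_for_best_model in run.sh):

import numpy as np

# one sequence of 6 tokens: logits favour "machine" from token 3 onwards,
# while the gold boundary is at token 4 (labels: 0 = human, 1 = machine)
logits = np.array([[[2.0, -1.0], [2.0, -1.0], [2.0, -1.0],
                    [-1.0, 2.0], [-1.0, 2.0], [-1.0, 2.0]]])
labels = np.array([[0, 0, 0, 0, 1, 1]])

print(compute_metrics((logits, labels)))  # {'mean_absolute_diff': 1.0}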


if __name__ == "__main__":
    parser = transformers.HfArgumentParser(
        (ModelConfig, DatasetConfig, TrainingArgsConfig)
    )
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print("Model Arguments: ", model_args)
    print("Data Arguments: ", data_args)
    print("Training Arguments: ", training_args)

    # Set seed
    transformers.set_seed(training_args.seed)

    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_path, num_labels=2, trust_remote_code=True
    )
    if not training_args.do_train and (training_args.do_eval or training_args.do_predict):
        # If not in training mode, load the best model from a previous run
        if not os.path.exists(training_args.output_dir):
            raise ValueError(
                "Output directory ({}) does not exist. Please train the model first.".format(
                    training_args.output_dir
                )
            )
        logger.info("Trying to load the best model from the output directory...")

        model = AutoModelForTokenClassification.from_pretrained(
            training_args.output_dir, num_labels=2, trust_remote_code=True
        )

    train_set = Semeval_Data(data_args.train_csv)
    dev_set = Semeval_Data(data_args.dev_csv)

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=dev_set,
        tokenizer=train_set.tokenizer,
        compute_metrics=compute_metrics,
    )

    if training_args.do_train:
        logger.info("Training...")
        logger.info("*** Train Dataset ***")
        logger.info(f"Number of samples: {len(train_set)}")
        logger.info("*** Dev Dataset ***")
        logger.info(f"Number of samples: {len(dev_set)}")

        trainer.train()

        logger.info("Training completed!")

    if training_args.do_eval:
        logger.info("Evaluating...")
        logger.info("*** Dev Dataset ***")
        logger.info(f"Number of samples: {len(dev_set)}")

        metrics = trainer.evaluate()
        logger.info(f"Metrics: {metrics}")
        trainer.save_metrics("eval", metrics)

        logger.info("Evaluation completed!")

    if training_args.do_predict:
        test_sets = []
        for test_csv in data_args.test_csvs:
            test_set = Semeval_Data(test_csv, inference=True)
            test_sets.append(test_set)
        logger.info("Predicting...")
        logger.info("*** Test Datasets ***")
        logger.info(f"Number of test datasets: {len(test_sets)}")

        for idx, test_set in enumerate(test_sets):
            logger.info(f"Test Dataset {idx + 1}")
            logger.info(f"Number of samples: {len(test_set)}")

            predictions, _, _ = trainer.predict(test_set)
            logger.info("Predictions completed!")

            # Map token-level argmax predictions back to word-level start positions
            df = pd.DataFrame(
                {
                    "uuid": [example["uuid"] for example in test_set],
                    "start_position": [
                        get_start_position(
                            pred,
                            np.array(example["corresponding_word"]),
                            token_level=False,
                        )
                        for pred, example in zip(predictions.argmax(axis=-1), test_set)
                    ],
                }
            )

            file_name = os.path.basename(data_args.test_csvs[idx])
            file_dirs = os.path.join(training_args.output_dir, "predictions")
            os.makedirs(file_dirs, exist_ok=True)
            file_path = os.path.join(file_dirs, file_name)
            df.to_csv(file_path, index=False)
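
Each predictions CSV written above has a uuid column and a word-level start_position column. A minimal sketch of reading one back and recovering the predicted human/machine split (the paths are hypothetical and depend on how the script was invoked; the mixed_review and uuid columns come from the test CSV, as in Semeval_Data):

import pandas as pd

preds = pd.read_csv("./runs/exp_5/predictions/dev.csv")       # hypothetical output path
source = pd.read_csv("../data/dev/dev.csv").set_index("uuid")  # hypothetical test csv

row = preds.iloc[0]
words = source.loc[row["uuid"], "mixed_review"].split(" ")
human_part = " ".join(words[: int(row["start_position"])])
machine_part = " ".join(words[int(row["start_position"]) :])
print(human_part, "||", machine_part)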