Add: SubtaskC data
OAfzal committed Sep 19, 2023
1 parent a9719d5 commit 0334c06
Showing 7 changed files with 22,470 additions and 1,002 deletions.
26 changes: 26 additions & 0 deletions subtaskC/baseline/run.sh
@@ -0,0 +1,26 @@
#!/bin/bash

# Assuming you've set up HfArgumentParser to parse command line arguments in train.py

python train.py \
--model_path "allenai/longformer-base-4096" \
--train_csv "../data/train/train_chatgpt.csv" \
--dev_csv "../data/dev/dev.csv" \
--test_csvs ../data/dev/dev.csv \
--metric_for_best_model "eval_mean_absolute_diff" \
--greater_is_better False \
--do_train True \
--do_predict True \
--seed 55 \
--output_dir "./runs/exp_5" \
--logging_dir "./runs/exp_5/logs" \
--num_train_epochs 10 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--auto_find_batch_size True \
--logging_steps 10 \
--load_best_model_at_end True \
--evaluation_strategy "epoch" \
--save_strategy "epoch" \
--save_total_limit 2
362 changes: 362 additions & 0 deletions subtaskC/baseline/train.py
@@ -0,0 +1,362 @@
import torch
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification
import pandas as pd
import numpy as np

from dataclasses import dataclass, field
from typing import Any, List, Optional
import transformers
import logging
import glob
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()


@dataclass
class ModelConfig:
    model_path: str = "allenai/longformer-base-4096"


@dataclass
class DatasetConfig:
    train_csv: str = field(default=None, metadata={"help": "Path to train csv file"})
    dev_csv: str = field(default=None, metadata={"help": "Path to dev csv file"})
    test_csvs: List[str] = field(
        default=None, metadata={"help": "Path to test csv files"}
    )


@dataclass
class TrainingArgsConfig(transformers.TrainingArguments):
    seed: int = 42
    output_dir: str = "./runs/exp_3"
    num_train_epochs: int = 10
    per_device_train_batch_size: int = 32
    per_device_eval_batch_size: int = 32
    auto_find_batch_size: bool = True
    logging_dir: str = "./runs/exp_3/logs"
    logging_steps: int = 10
    load_best_model_at_end: bool = True
    evaluation_strategy: str = "epoch"
    save_strategy: str = "epoch"
    save_total_limit: int = 2


class Semeval_Data(torch.utils.data.Dataset):
    def __init__(self, data_path, max_length=1024, inference=False, debug=False):
        self.data = pd.read_csv(data_path).to_dict("records")
        self.inference = inference
        self.tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
        self.max_length = max_length
        self.debug = debug

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["mixed_review"]
        uuid = self.data[idx]["uuid"]
        if not self.inference:
            label = self.data[idx]["human_end_boundary"]

        if self.debug and not self.inference:
            print("Original Human Position: ", label)

        labels = []
        corresponding_word = []
        tokens = []
        input_ids = []
        attention_mask = []

        for jdx, word in enumerate(text.split(" ")):
            word_encoded = self.tokenizer.tokenize(word)
            sub_words = len(word_encoded)

            if not self.inference:
                is_machine_text = 1 if jdx >= label else 0
                labels.extend([is_machine_text] * sub_words)

            corresponding_word.extend([jdx] * sub_words)
            tokens.extend(word_encoded)
            input_ids.extend(self.tokenizer.convert_tokens_to_ids(word_encoded))
            attention_mask.extend([1] * sub_words)

        # Pad everything to max_length; labels and the token-to-word mapping are padded with -100
        if len(input_ids) < self.max_length - 2:
            input_ids = (
                [0] + input_ids + [2] + [1] * (self.max_length - len(input_ids) - 2)
            )
            if not self.inference:
                labels = [-100] + labels + [-100] * (self.max_length - len(labels) - 1)

            attention_mask = (
                [1]
                + attention_mask
                + [1]
                + [0] * (self.max_length - len(attention_mask) - 2)
            )
            corresponding_word = (
                [-100]
                + corresponding_word
                + [-100] * (self.max_length - len(corresponding_word) - 1)
            )
            tokens = (
                ["<s>"]
                + tokens
                + ["</s>"]
                + ["<pad>"] * (self.max_length - len(tokens) - 2)
            )
        else:
            # Truncate to max_length and add <s>/</s>; labels get -100 at the special tokens
            input_ids = [0] + input_ids[: self.max_length - 2] + [2]

            if not self.inference:
                labels = [-100] + labels[: self.max_length - 2] + [-100]

            corresponding_word = (
                [-100] + corresponding_word[: self.max_length - 2] + [-100]
            )
            attention_mask = [1] + attention_mask[: self.max_length - 2] + [1]
            tokens = ["<s>"] + tokens[: self.max_length - 2] + ["</s>"]

        encoded = {}
        if not self.inference:
            encoded["labels"] = torch.tensor(labels)
        encoded["input_ids"] = torch.tensor(input_ids)
        encoded["attention_mask"] = torch.tensor(attention_mask)

        if not self.inference:
            assert encoded["input_ids"].shape == encoded["labels"].shape

        if self.debug and not self.inference:
            print("Tokenized Human Position: ", labels.index(1))
            print("Original Human Position: ", label)
            print("Full Human Text:", text)
            print("\n")
            print("Human Text Truncated:", text.split(" ")[:label])
            print("\n")
            encoded["partial_human_review"] = " ".join(text.split(" ")[:label])

        if self.inference:
            encoded["text"] = text
            encoded["uuid"] = uuid
            encoded["corresponding_word"] = corresponding_word

        return encoded
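
The core of __getitem__ is the word-to-subword label expansion: the word-level boundary from human_end_boundary is turned into token-level 0/1 labels, with every sub-word inheriting the label of its word. A minimal sketch of that step in isolation (separate from the committed script; the sentence and the boundary index are invented, and the tokenizer is downloaded from the Hub):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

text = "The figures are clear and the ablation study is thorough"
boundary = 5  # hypothetical word index where machine-generated text starts

tokens, labels = [], []
for jdx, word in enumerate(text.split(" ")):
    pieces = tokenizer.tokenize(word)
    tokens.extend(pieces)
    # every sub-word token inherits the human (0) / machine (1) label of its word
    labels.extend([1 if jdx >= boundary else 0] * len(pieces))

print(list(zip(tokens, labels)))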


def evaluate_position_difference(actual_position, predicted_position):
    """
    Compute the absolute difference between the actual and predicted start positions.
    Args:
    - actual_position (int): Actual start position of machine-generated text.
    - predicted_position (int): Predicted start position of machine-generated text.
    Returns:
    - int: Absolute difference between the start positions.
    """
    return abs(actual_position - predicted_position)


def get_start_position(sequence, mapping=None, token_level=True):
    """
    Get the start position from a sequence of labels or predictions.
    Args:
    - sequence (np.array): A sequence of labels or predictions.
    - mapping (np.array): Mapping from index to word for the sequence.
    - token_level (bool): If True, return positional indices; else, return word mappings.
    Returns:
    - int: Start position in the sequence.
    """
    # Locate the position of label '1'

    if mapping is not None:
        mask = mapping != -100
        sequence = sequence[mask]
        mapping = mapping[mask]

    index = np.where(sequence == 1)[0]
    value = index[0] if index.size else (len(sequence) - 1)

    if not token_level:
        value = mapping[value]

    return value
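
A toy check of the two granularities get_start_position supports (values invented for the example; assumes the function above is in scope, e.g. when run inside this module):

import numpy as np

# three words, the second split into two sub-word tokens; the last entry is padding (-100)
preds = np.array([0, 0, 1, 1, 1])
mapping = np.array([0, 1, 1, 2, -100])

print(get_start_position(preds, mapping, token_level=True))   # 2 (token index)
print(get_start_position(preds, mapping, token_level=False))  # 1 (word index)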


def evaluate_machine_start_position(
    labels, predictions, idx2word=None, token_level=False
):
    """
    Evaluate the starting position of machine-generated text in both predicted and actual sequences.
    Args:
    - labels (np.array): Actual labels.
    - predictions (np.array): Predicted logits; the argmax over the last axis gives the predicted labels.
    - idx2word (np.array): Mapping from index to word for each sequence in the batch.
    - token_level (bool): Flag to determine if evaluation is at token level. If True, return positional indices; else, return word mappings.
    Returns:
    - float: Mean absolute difference between the start positions in predictions and actual labels.
    """
    predicted_positions = predictions.argmax(axis=-1)

    actual_starts = []
    predicted_starts = []

    if not token_level and idx2word is None:
        raise ValueError(
            "idx2word must be provided if evaluation is at word level (token_level=False)"
        )

    for idx in range(labels.shape[0]):
        # Remove padding
        mask = labels[idx] != -100
        predict, label, mapping = (
            predicted_positions[idx][mask],
            labels[idx][mask],
            idx2word[idx][mask] if not token_level else None,
        )

        # If token_level is True, just use the index; otherwise, map to word
        predicted_value = get_start_position(predict, mapping, token_level)
        actual_value = get_start_position(label, mapping, token_level)

        predicted_starts.append(predicted_value)
        actual_starts.append(actual_value)

        # Log cases where label '1' is not found
        # if not np.where(predict == 1)[0].size:
        #     print(f"No start label (1) in prediction at sequence {idx}")
        # if not np.where(label == 1)[0].size:
        #     print(f"No start label (1) in actual label at sequence {idx}")

    position_differences = [
        evaluate_position_difference(actual, predict)
        for actual, predict in zip(actual_starts, predicted_starts)
    ]
    mean_position_difference = np.mean(position_differences)

    return mean_position_difference


def compute_metrics(p):
    pred, labels = p
    mean_absolute_diff = evaluate_machine_start_position(labels, pred, token_level=True)

    return {
        "mean_absolute_diff": mean_absolute_diff,
    }
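
compute_metrics receives the usual (predictions, labels) pair from the Trainer, where predictions are token-classification logits of shape (batch, seq_len, 2) and labels use -100 at padded or special positions. A self-contained toy call with invented values, assuming the functions above are in scope, looks roughly like this; the reported metric is the "mean_absolute_diff" key, which the Trainer exposes as "eval_mean_absolute_diff" (the name used for metric_for_best_model in run.sh):

import numpy as np

# one sequence of 6 tokens: logits favour "machine" from token 3 onwards,
# while the gold boundary is at token 4 (labels: 0 = human, 1 = machine)
logits = np.array([[[2.0, -1.0], [2.0, -1.0], [2.0, -1.0],
                    [-1.0, 2.0], [-1.0, 2.0], [-1.0, 2.0]]])
labels = np.array([[0, 0, 0, 0, 1, 1]])

print(compute_metrics((logits, labels)))  # {'mean_absolute_diff': 1.0}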


if __name__ == "__main__":
    parser = transformers.HfArgumentParser(
        (ModelConfig, DatasetConfig, TrainingArgsConfig)
    )
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print("Model Arguments: ", model_args)
    print("Data Arguments: ", data_args)
    print("Training Arguments: ", training_args)

    # Set seed
    transformers.set_seed(training_args.seed)

    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_path, num_labels=2, trust_remote_code=True
    )
    if not training_args.do_train and (training_args.do_eval or training_args.do_predict):
        # If not in training mode, load the best model from a previous run
        if not os.path.exists(training_args.output_dir):
            raise ValueError(
                "Output directory ({}) does not exist. Please train the model first.".format(
                    training_args.output_dir
                )
            )
        logger.info("Trying to load the best model from the output directory...")

        model = AutoModelForTokenClassification.from_pretrained(
            training_args.output_dir, num_labels=2, trust_remote_code=True
        )

    train_set = Semeval_Data(data_args.train_csv)
    dev_set = Semeval_Data(data_args.dev_csv)

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=dev_set,
        tokenizer=train_set.tokenizer,
        compute_metrics=compute_metrics,
    )

    if training_args.do_train:
        logger.info("Training...")
        logger.info("*** Train Dataset ***")
        logger.info(f"Number of samples: {len(train_set)}")
        logger.info("*** Dev Dataset ***")
        logger.info(f"Number of samples: {len(dev_set)}")

        trainer.train()

        logger.info("Training completed!")

    if training_args.do_eval:
        logger.info("Evaluating...")
        logger.info("*** Dev Dataset ***")
        logger.info(f"Number of samples: {len(dev_set)}")

        metrics = trainer.evaluate()
        logger.info(f"Metrics: {metrics}")
        trainer.save_metrics("eval", metrics)

        logger.info("Evaluation completed!")

    if training_args.do_predict:
        test_sets = []
        for test_csv in data_args.test_csvs:
            test_set = Semeval_Data(test_csv, inference=True)
            test_sets.append(test_set)
        logger.info("Predicting...")
        logger.info("*** Test Datasets ***")
        logger.info(f"Number of test datasets: {len(test_sets)}")

        for idx, test_set in enumerate(test_sets):
            logger.info(f"Test Dataset {idx + 1}")
            logger.info(f"Number of samples: {len(test_set)}")

            predictions, _, _ = trainer.predict(test_set)
            logger.info("Predictions completed!")

            # Map token-level argmax predictions back to word-level start positions
            df = pd.DataFrame(
                {
                    "uuid": [example["uuid"] for example in test_set],
                    "start_position": [
                        get_start_position(
                            pred,
                            np.array(example["corresponding_word"]),
                            token_level=False,
                        )
                        for pred, example in zip(predictions.argmax(axis=-1), test_set)
                    ],
                }
            )

            file_name = os.path.basename(data_args.test_csvs[idx])
            file_dirs = os.path.join(training_args.output_dir, "predictions")
            os.makedirs(file_dirs, exist_ok=True)
            file_path = os.path.join(file_dirs, file_name)
            df.to_csv(file_path, index=False)
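
Each predictions CSV written above has a uuid column and a word-level start_position column. A minimal sketch of reading one back and recovering the predicted human/machine split (the paths are hypothetical and depend on how the script was invoked; the mixed_review and uuid columns come from the test CSV, as in Semeval_Data):

import pandas as pd

preds = pd.read_csv("./runs/exp_5/predictions/dev.csv")       # hypothetical output path
source = pd.read_csv("../data/dev/dev.csv").set_index("uuid")  # hypothetical test csv

row = preds.iloc[0]
words = source.loc[row["uuid"], "mixed_review"].split(" ")
human_part = " ".join(words[: int(row["start_position"])])
machine_part = " ".join(words[int(row["start_position"]) :])
print(human_part, "||", machine_part)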