Merge branch 'main' of github.com:evidencebp/comsum into main
idanacumen committed Aug 12, 2021
2 parents 4925a8b + b409e94 commit fcfd511
Showing 9 changed files with 236,270 additions and 15 deletions.
1,261 changes: 1,261 additions & 0 deletions data/labels/2ann_cum_sumrandom_batch_9_july_2021_labels.csv

Large diffs are not rendered by default.

40,494 changes: 40,494 additions & 0 deletions data/samples/test_sample_b1_filtered.csv

Large diffs are not rendered by default.

194,346 changes: 194,346 additions & 0 deletions data/samples/train_sample_b1_filtered.csv

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions src/scripts/README.md
@@ -1,6 +1,8 @@
administrative_message_identifier.py - some commit messages (other than the title) are purely administrative (e.g., not describing the content of the change to the reviewer).
Such commits are not suitable for text summarization, and this file provides a labeling function for identifying them.

filter_messages.py - After the zip files are opened, this splits the data so that the test set is separated from training by both time period and repository.

build_benchmarks.py - Builds the non-machine-learning benchmarks - entire message, related fix, etc.

configuration.py - common constants, directory locations, etc.
@@ -11,6 +13,6 @@ evaluate_concept_stability.py - evaluates meaning preservation per concept.

evaluate_meaning_preserving.py - evaluates meaning preservation.

finetune_distillbart_comsum.sh - an example of finetuning a model
finetune_distillbart_comsum.sh - an example of finetuning a model with the tokenizers library

requirments.txt - Python libraries used in this project
requirements.txt - Python libraries used in this project
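
For context on the labeling idea above, administrative messages can often be flagged with simple pattern rules. A minimal sketch, with invented pattern and function names (the project's actual rules live in administrative_message_identifier.py, which exposes label_df_as_administrative and ADMINISTRATIVE_COL, as the filter_messages.py import further down shows):

import re

import pandas as pd

# Illustrative templates only, not the project's actual rule set.
ADMIN_PATTERN = re.compile(
    r"^(merge (branch|pull request)|bump version|update readme)",
    re.IGNORECASE,
)


def looks_administrative(message: str) -> bool:
    # Flag a message when it matches a known administrative template.
    return bool(ADMIN_PATTERN.search(message.strip()))


df = pd.DataFrame({"message": ["Merge branch 'main'", "Fix off-by-one in tokenizer"]})
df["is_administrative"] = df["message"].map(looks_administrative)
print(df)
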
1 change: 1 addition & 0 deletions src/scripts/configuration.py
@@ -11,6 +11,7 @@
sys.path.append(BASE_PATH)

DATA_PATH = os.path.join(BASE_PATH, r'data')
SAMPLED_DATA_PATH = os.path.join(DATA_PATH, r'samples')
FULL_DATA_PATH = os.path.join(DATA_PATH, r'dataset')
SAMPLES_PATH = os.path.join(DATA_PATH, r'samples')
SPLIT_DATA_PATH = os.path.join(DATA_PATH, r'split')
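
Note that the new SAMPLED_DATA_PATH is identical to the pre-existing SAMPLES_PATH: both join DATA_PATH with 'samples'. A quick sanity check, assuming the constants defined above:

assert SAMPLED_DATA_PATH == SAMPLES_PATH  # both resolve to data/samples
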
20 changes: 14 additions & 6 deletions src/scripts/convert_csv.py
@@ -165,11 +165,19 @@ def csv_to_data(work_dir, out_dir, csv_paths, val_size=1000, test_size=1000, for

working_dir = SPLIT_DATA_PATH

out_dir = os.path.join(SPLIT_DATA_PATH, "train")
csv_paths = [os.path.join(SPLIT_DATA_PATH, "plain_commits_dataset_train00000000000" + str(i)) for i in range(10)] + [
os.path.join(SPLIT_DATA_PATH, "plain_commits_dataset_train0000000000" + str(i)) for i in range(10, 14)]
csv_to_data(working_dir, out_dir, csv_paths, force=force)
# out_dir = os.path.join(SPLIT_DATA_PATH, "train")
# csv_paths = [os.path.join(SPLIT_DATA_PATH, "plain_commits_dataset_train00000000000" + str(i)) for i in range(10)] + [
# os.path.join(SPLIT_DATA_PATH, "plain_commits_dataset_train0000000000" + str(i)) for i in range(10, 14)]
# csv_to_data(working_dir, out_dir, csv_paths, force=force)
#
# out_dir = os.path.join(SPLIT_DATA_PATH, "test")
# csv_paths = [os.path.join(SPLIT_DATA_PATH, "plain_commits_dataset_test.csv")]
# csv_to_data(working_dir, out_dir, csv_paths, val_size=0, test_size=0, force=force)

out_dir = os.path.join(SPLIT_DATA_PATH, "test_sample")
csv_paths = [os.path.join(SPLIT_DATA_PATH, "test_sample_b1.csv")]
csv_to_data(working_dir, out_dir, csv_paths, val_size=0, test_size=0, force=force)

out_dir = os.path.join(SPLIT_DATA_PATH, "test")
csv_paths = [os.path.join(SPLIT_DATA_PATH, "plain_commits_dataset_test.csv")]
out_dir = os.path.join(SPLIT_DATA_PATH, "train_sample")
csv_paths = [os.path.join(SPLIT_DATA_PATH, "train_sample_b1.csv")]
csv_to_data(working_dir, out_dir, csv_paths, val_size=0, test_size=0, force=force)
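
For reference, the two calls left active above convert the new sample batches without carving out validation or test portions (val_size=0, test_size=0). A small sketch of how one might verify the expected inputs are in place, assuming the script is run from src/scripts so the configuration import resolves:

import os

from configuration import SPLIT_DATA_PATH

# The sample batch files the active calls above expect to find.
for name in ("test_sample_b1.csv", "train_sample_b1.csv"):
    path = os.path.join(SPLIT_DATA_PATH, name)
    print(path, "exists:", os.path.isfile(path))
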
125 changes: 125 additions & 0 deletions src/scripts/create_baseline.py
@@ -0,0 +1,125 @@
import os
import random

import pandas as pd
import re

from configuration import DATA_PATH, BASE_PATH, SPLIT_DATA_PATH

sentence_splitter = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s")


def random_sentence_choice(inpath, outpath, force):
    """Write one randomly chosen sentence per input document (one document per line)."""
    if os.path.isfile(outpath) and not force:
        print(f"{outpath} already exists; pass force=True to overwrite")
        return
    print(f"reading {os.path.abspath(inpath)}")
    print(f"writing to {os.path.abspath(outpath)}")
    with open(inpath) as lines:
        with open(outpath, "w") as outfl:
            for document in lines:
                sentences = sentence_splitter.split(document)
                sentences = [sentence for sentence in sentences if sentence.strip()]
                if not sentences:
                    # keep the output aligned line-for-line with the input
                    outfl.write("\n")
                    continue
                outfl.write(random.choice(sentences).strip() + "\n")


if __name__ == '__main__':
    force = True  # set to False to keep existing output files
    working_dir = os.path.join(DATA_PATH, "split")
    outdir = os.path.join(BASE_PATH, "outputs", "random")
    os.makedirs(outdir, exist_ok=True)
# inpath = os.path.join(SPLIT_DATA_PATH,"train1batch","/train.source")
# outpath = os.path.join(outdir, "train1batch.txt")
# random_sentence_choice(inpath, outpath, force=force)

# inpath = os.path.join(SPLIT_DATA_PATH,"semantic","train.source")
# outpath = os.path.join(outdir, "semantic.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"train_short","train.source")
# outpath = os.path.join(outdir, "train_short.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"adaptive","train.source")
# outpath = os.path.join(outdir, "adaptive.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"corrective","train.source")
# outpath = os.path.join(outdir, "corrective.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"refactor","train.source")
# outpath = os.path.join(outdir, "refactor.txt")
# random_sentence_choice(inpath, outpath, force=force)

# inpath = os.path.join(SPLIT_DATA_PATH,"train5m","train.source")
# outpath = os.path.join(outdir, "train5m.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"train1m","train.source")
# outpath = os.path.join(outdir, "train1m.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"train2m","train.source")
# outpath = os.path.join(outdir, "train2m.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"train","train.source")
# outpath = os.path.join(outdir, "train.txt")
# random_sentence_choice(inpath, outpath, force=force)

# inpath = os.path.join(SPLIT_DATA_PATH,"train_short","train.source")
# outpath = os.path.join(outdir, "train_short.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"train","test.source")
# outpath = os.path.join(outdir, "test.txt")
# random_sentence_choice(inpath, outpath, force=force)

# inpath = os.path.join(SPLIT_DATA_PATH,"adaptive10","train.source")
# outpath = os.path.join(outdir, "adaptive10.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"corrective10","train.source")
# outpath = os.path.join(outdir, "corrective10.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"refactor10","train.source")
# outpath = os.path.join(outdir, "refactor10.txt")
# random_sentence_choice(inpath, outpath, force=force)

# inpath = os.path.join(SPLIT_DATA_PATH,"test_sample","test.source")
# outpath = os.path.join(outdir, "test_sample.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"train_sample","train.source")
# outpath = os.path.join(outdir, "train_sample.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"adaptive_sample","test.source")
# outpath = os.path.join(outdir, "adaptive_sample.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"corrective_sample","test.source")
# outpath = os.path.join(outdir, "corrective_sample.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH,"refactor_sample","test.source")
# outpath = os.path.join(outdir, "refactor_sample.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH, "test", "test.source")
# outpath = os.path.join(outdir, "test.txt")
# random_sentence_choice(inpath, outpath, force=force)
#
# inpath = os.path.join(SPLIT_DATA_PATH, "train", "train.source")
# outpath = os.path.join(outdir, "train.txt")
# random_sentence_choice(inpath, outpath, force=force)
    inpath = os.path.join(SPLIT_DATA_PATH, "test_sample", "test.source")
    outpath = os.path.join(outdir, "test_sample.txt")
    random_sentence_choice(inpath, outpath, force=force)

    inpath = os.path.join(SPLIT_DATA_PATH, "train_sample", "train.source")
    outpath = os.path.join(outdir, "train_sample.txt")
    random_sentence_choice(inpath, outpath, force=force)
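
The sentence_splitter regex in this file splits on whitespace that follows a '.' or '?' while skipping abbreviations such as 'e.g.'. A small standalone check (the sample message is invented):

import re

sentence_splitter = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s")

message = "Fix crash in parser. Add tests, e.g. for empty input. Done?"
print(sentence_splitter.split(message))
# ['Fix crash in parser.', 'Add tests, e.g. for empty input.', 'Done?']
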
32 changes: 25 additions & 7 deletions src/scripts/filter_messages.py
@@ -2,12 +2,12 @@
from os.path import join
import pandas as pd

from configuration import AUX_DATA_PATH, MERGE_COMMIT_FILES, LABELS_PATH, SPLIT_FILE, DATA_PATH
from configuration import AUX_DATA_PATH, MERGE_COMMIT_FILES, LABELS_PATH, SPLIT_FILE, DATA_PATH, SPLIT_DATA_PATH, \
SAMPLED_DATA_PATH
from administrative_message_identifier import label_df_as_administrative, ADMINISTRATIVE_COL


def filter_commits(commit_files, filtered_commits_file, split=None):
# TODO - add repo filter
print(f"Processing {commit_files}")
commits_df = pd.read_csv(commit_files)

@@ -25,7 +25,7 @@ def filter_commits(commit_files, filtered_commits_file, split=None):
split_repos = repos_split_df[repos_split_df.type == split].repo_name.tolist()
filtered_df = filtered_df[filtered_df.repo_name.isin(split_repos)]

print(f"Writing to {filtered_commits_file}")
print(f"Writing to {filtered_commits_file}, examples:{len(filtered_df)}")
filtered_df.to_csv(filtered_commits_file, index=False)


@@ -36,12 +36,30 @@ def filter_commits(commit_files, filtered_commits_file, split=None):
# , filtered_commits_file='./tmp/filtered.csv'
# , split='Test')

# Split all files in DATA_PATH
out_dir = join(DATA_PATH, 'split')
# # Split all files in SPLIT_DATA_PATH
# out_dir = SPLIT_DATA_PATH
# os.makedirs(out_dir, exist_ok=True)
# for root, dirs, filenames in os.walk(join(DATA_PATH, 'dataset')):
# for filename in filenames:
# if filename.endswith('zip') or filename.endswith('md'):
# continue
# if 'train' in filename.lower():
# split = 'Train'
# else:
# split = 'Test'
# try:
# filter_commits(commit_files=join(root, filename)
# , filtered_commits_file=join(out_dir, filename)
# , split=split)
# except Exception as e:
# print(f"Failed processing with error type {type(e)} make sure this is a csv like file")

# Split all files in SAMPLED_DATA_PATH
out_dir = SPLIT_DATA_PATH
os.makedirs(out_dir, exist_ok=True)
for root, dirs, filenames in os.walk(join(DATA_PATH, 'dataset')):
for root, dirs, filenames in os.walk(join(SAMPLED_DATA_PATH)):
for filename in filenames:
if filename.endswith('zip'):
if filename.endswith('zip') or filename.endswith('md'):
continue
if 'train' in filename.lower():
split = 'Train'
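
The walk above assigns each sampled file to a split by filename alone. A minimal illustration of that rule, using the sample batch names from convert_csv.py plus two invented file names that exercise the skip condition:

for filename in ["train_sample_b1.csv", "test_sample_b1.csv", "notes.md", "archive.zip"]:
    if filename.endswith('zip') or filename.endswith('md'):
        continue  # skipped, as in filter_messages.py
    split = 'Train' if 'train' in filename.lower() else 'Test'
    print(filename, '->', split)
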
File renamed without changes.
