forked from eth-sri/sven
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added scripts to fetch all code snippets from the converted excel files:
- Loading branch information
Kohei Dozono
committed
Aug 20, 2024
1 parent
e8449a0
commit ef76598
Showing
2 changed files
with
153 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
'''
Convert *.jsonl files to *.xlsx (Excel) files.
The input files live under the './train' and './val' folders.
The converted files are stored under the './train_xlsx' and './val_xlsx' folders, split into per-language subfolders (python, cpp, c, other).
This script must be run before extracting the code snippets from the Excel files!
'''
|
||
import json | ||
import pandas as pd | ||
import os | ||
|
||
def jsonl_to_excel(jsonl_file, fine_name, excel_folder):
    '''
    Split one JSONL file by source language and write one workbook per language.

    Every non-blank line of ``jsonl_file`` is parsed as a JSON object and
    bucketed by the extension of its ``file_name`` field:
    ``.py`` -> python, ``.cpp`` / ``.cc`` -> cpp, ``.c`` -> c, anything else
    (including a missing or extension-less name) -> other.
    Each bucket, even an empty one, is written to
    ``<excel_folder>/<language>/<stem of fine_name>.xlsx``.

    Args:
        jsonl_file: Path of the JSONL file to read.
        fine_name: Base name of that file (e.g. ``cwe-022.jsonl``); its stem
            names the output workbooks.
        excel_folder: Root output folder; the language subfolders are created
            if they do not exist yet.
    '''
    extension_to_language = {'py': 'python', 'cpp': 'cpp', 'cc': 'cpp', 'c': 'c'}
    records = {'python': [], 'cpp': [], 'c': [], 'other': []}

    # Read explicitly as UTF-8 so decoding does not depend on the platform locale.
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            file_name = record.get('file_name') or ''
            # splitext only reports a real extension, so a file literally named
            # "c" or "py" is not mistaken for C / Python source.
            extension = os.path.splitext(file_name)[1].lstrip('.').lower()
            language = extension_to_language.get(extension, 'other')
            records[language].append(record)

    # Drop the '.jsonl' suffix without relying on its exact length.
    stem = os.path.splitext(fine_name)[0]

    for language, rows in records.items():
        language_folder = os.path.join(excel_folder, language)
        os.makedirs(language_folder, exist_ok=True)
        save_path = os.path.join(language_folder, stem + '.xlsx')
        pd.DataFrame(rows).to_excel(save_path, index=False)
|
||
|
||
def main():
    '''
    Convert every *.jsonl file under './train' and './val'.

    The workbooks for each split go to a sibling folder named '<split>_xlsx',
    which is created when it does not exist yet.
    '''
    for split in ('train', 'val'):
        source_dir = os.path.join('.', split)
        target_dir = os.path.join('.', split + '_xlsx')
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        # Only *.jsonl entries are converted; everything else is ignored.
        jsonl_names = [name for name in os.listdir(source_dir) if name.endswith('.jsonl')]
        for name in jsonl_names:
            jsonl_to_excel(os.path.join(source_dir, name), name, target_dir)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
'''
Get code snippets from xlsx files.
There are two folders: train_xlsx and val_xlsx.
Each folder contains four subfolders: python, cpp, c, and other. Each xlsx file is named after the corresponding CWE ID, such as cwe-022.xlsx.
Each code snippet is saved as a code file, such as ./cwe-022/cwe-022_1.py, ./cwe-022/cwe-022_1.cpp, ./cwe-022/cwe-022_1.c, etc. Snippets classified as other are simply saved as .txt files.
'''
|
||
import os | ||
import pandas as pd | ||
import re | ||
import sys | ||
|
||
def get_code_snippets_from_xlsx(excel_folder, code_folder):
    '''
    Extract the 'func_src_before' snippets of every workbook into source files.

    For each language subfolder (python, cpp, c, other) of ``excel_folder``,
    every ``<cwe_id>.xlsx`` workbook is read and each non-empty
    'func_src_before' cell is written to
    ``<code_folder>/<language>/<cwe_id>/<cwe_id>_<row number><ext>``,
    where the row number is the dataframe index plus one and ``<ext>`` is
    .py, .cpp, .c, or .txt (for "other").

    Args:
        excel_folder: Folder holding the four language subfolders with the
            workbooks. Each subfolder must exist.
        code_folder: Output root; created (with its subfolders) if missing.

    Raises:
        KeyError: If a non-empty workbook has no 'func_src_before' column.
    '''
    # Language subfolder -> extension of the snippet files written for it.
    extensions = {'python': '.py', 'cpp': '.cpp', 'c': '.c', 'other': '.txt'}

    os.makedirs(code_folder, exist_ok=True)

    for subfolder, ext in extensions.items():
        excel_subfolder = os.path.join(excel_folder, subfolder)
        code_subfolder = os.path.join(code_folder, subfolder)
        os.makedirs(code_subfolder, exist_ok=True)

        # Skip the '~$name.xlsx' lock files Excel leaves next to open
        # workbooks: they end in '.xlsx' but are not readable workbooks.
        xlsx_files = [
            name for name in os.listdir(excel_subfolder)
            if name.endswith('.xlsx') and not name.startswith('~$')
        ]

        for xlsx_file in xlsx_files:
            # Strip only the final extension so dotted names stay distinct.
            cwe_id = os.path.splitext(xlsx_file)[0]
            df = pd.read_excel(os.path.join(excel_subfolder, xlsx_file))

            cwe_folder = os.path.join(code_subfolder, cwe_id)
            os.makedirs(cwe_folder, exist_ok=True)

            # A workbook without rows has nothing to extract (and may not
            # even carry the column header).
            if df.empty:
                continue

            for idx, code_snippet in df['func_src_before'].items():
                if pd.isna(code_snippet):
                    continue
                file_path = os.path.join(cwe_folder, f"{cwe_id}_{idx + 1}{ext}")
                with open(file_path, 'w', encoding='utf-8') as f:
                    # str() guards against cells Excel parsed as numbers.
                    f.write(str(code_snippet))

    print(f"Code snippets extracted and saved in {code_folder}")
|
||
# Example usage | ||
if __name__ == "__main__":
    # Exactly two positional arguments are expected after the script name.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python script.py <excel_folder> <code_folder>")
        sys.exit(1)

    excel_folder, code_folder = cli_args
    get_code_snippets_from_xlsx(excel_folder, code_folder)
|
||
# Example command: python get_code_snippets_from_xlsx.py train_xlsx train_code (for train), | ||
# python get_code_snippets_from_xlsx.py val_xlsx val_code (for val) |