Skip to content

Commit

Permalink
Added scripts to fetch all code snippets from the converted excel files:
Browse files Browse the repository at this point in the history
  • Loading branch information
Kohei Dozono committed Aug 20, 2024
1 parent e8449a0 commit ef76598
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 0 deletions.
75 changes: 75 additions & 0 deletions data_train_val/convert_to_xlsx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
'''
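Convert *.jsonl files to *.xlsx (Excel) files.
The input files are located under the './train' and './val' folders.
The converted files are stored under the './train_xlsx' and './val_xlsx' folders,
split into 'python', 'cpp', 'c' and 'other' subfolders by the extension of each record's file name.
This script must be run before extracting the code snippets from the Excel files!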
'''

import json
import pandas as pd
import os

def jsonl_to_excel(jsonl_file, fine_name, excel_folder):
    """Split one JSONL file by programming language and write one .xlsx per language.

    Every non-blank line of ``jsonl_file`` is parsed as a JSON object. The
    extension of the record's ``file_name`` value selects the bucket:
    ``.py`` -> 'python', ``.cpp``/``.cc`` -> 'cpp', ``.c`` -> 'c', and
    everything else (including a missing or extension-less name) -> 'other'.
    Each bucket is written to ``<excel_folder>/<language>/<base>.xlsx``, where
    ``<base>`` is ``fine_name`` without its extension. All four workbooks are
    always written, even when a bucket is empty.

    Args:
        jsonl_file: Path of the JSONL file to read.
        fine_name: Base name of that file (e.g. 'cwe-022.jsonl'); used only to
            derive the name of the output workbooks.
        excel_folder: Root folder receiving the per-language subfolders.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Lower-cased extension -> language subfolder; anything else is 'other'.
    extension_to_language = {
        '.py': 'python',
        '.cpp': 'cpp',
        '.cc': 'cpp',
        '.c': 'c',
    }
    records = {'python': [], 'cpp': [], 'c': [], 'other': []}

    # Create the per-language output folders up front.
    for language in records:
        os.makedirs(os.path.join(excel_folder, language), exist_ok=True)

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            # Blank lines (typically a trailing newline) carry no record.
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            source_name = record.get('file_name') or ''
            # splitext yields '' for names without a dot, so a file literally
            # called 'c' or 'py' is not mistaken for a source file.
            extension = os.path.splitext(source_name)[1].lower()
            language = extension_to_language.get(extension, 'other')
            records[language].append(record)

    # Strip the '.jsonl' suffix without hard-coding its length.
    base_name = os.path.splitext(fine_name)[0]
    for language, rows in records.items():
        save_path = os.path.join(excel_folder, language, base_name + '.xlsx')
        pd.DataFrame(rows).to_excel(save_path, index=False)


def main():
    """Convert every .jsonl file under ./train and ./val into .xlsx workbooks.

    The workbooks for a split are written below './<split>_xlsx', which is
    created when it does not exist yet.
    """
    for split in ('train', 'val'):
        source_dir = os.path.join('.', split)
        target_dir = os.path.join('.', split + '_xlsx')

        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        # Only *.jsonl entries are converted; anything else is ignored.
        jsonl_names = (name for name in os.listdir(source_dir) if name.endswith('.jsonl'))
        for name in jsonl_names:
            jsonl_to_excel(os.path.join(source_dir, name), name, target_dir)


# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
78 changes: 78 additions & 0 deletions data_train_val/get_code_snippets_from_xlsx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
'''
Get code snippets from xlsx file.
There are two folders: train_xlsx and val_xlsx.
Each folder contains four subfolders: python, cpp, c, and other. Each xlsx file is named after the corresponding CWE ID, such as cwe-022.xlsx.
Each code snippet is saved as a separate code file under <code_folder>/<language>/<CWE ID>/, such as python/cwe-022/cwe-022_1.py, cpp/cwe-022/cwe-022_1.cpp, c/cwe-022/cwe-022_1.c, etc. Code snippets classified as other are simply saved as .txt files.
'''

import os
import pandas as pd
import re
import sys

def get_code_snippets_from_xlsx(excel_folder, code_folder):
    """Write every code snippet stored in the per-language workbooks to its own file.

    For each language subfolder ('python', 'cpp', 'c', 'other') of
    ``excel_folder``, every ``*.xlsx`` workbook is read and each non-empty
    cell of its 'func_src_before' column is saved as
    ``<code_folder>/<language>/<cwe_id>/<cwe_id>_<row number><ext>``.
    ``<cwe_id>`` is the workbook name up to its first dot, the row number is
    the 1-based row position, and ``<ext>`` is '.py', '.cpp', '.c' or '.txt'
    (for 'other'). Missing output folders are created.

    Args:
        excel_folder: Folder containing the four language subfolders with the
            workbooks (e.g. 'train_xlsx').
        code_folder: Root folder receiving the extracted snippet files.

    Raises:
        FileNotFoundError: If one of the language subfolders does not exist.
        KeyError: If a workbook has rows but no 'func_src_before' column.
    """
    # Language subfolder -> extension of the snippet files written for it.
    extensions = {'python': '.py', 'cpp': '.cpp', 'c': '.c', 'other': '.txt'}

    os.makedirs(code_folder, exist_ok=True)

    for subfolder, ext in extensions.items():
        excel_subfolder = os.path.join(excel_folder, subfolder)
        code_subfolder = os.path.join(code_folder, subfolder)
        os.makedirs(code_subfolder, exist_ok=True)

        # '~$<name>.xlsx' entries are lock files Excel leaves next to an open
        # workbook; they are not readable workbooks, so they are skipped.
        xlsx_files = [
            name for name in os.listdir(excel_subfolder)
            if name.endswith('.xlsx') and not name.startswith('~$')
        ]

        for xlsx_file in xlsx_files:
            cwe_id = xlsx_file.split('.')[0]  # Extract CWE ID from filename
            df = pd.read_excel(os.path.join(excel_subfolder, xlsx_file))

            # One output folder per CWE, created even if it stays empty.
            cwe_folder = os.path.join(code_subfolder, cwe_id)
            os.makedirs(cwe_folder, exist_ok=True)

            for idx, row in df.iterrows():
                code_snippet = row['func_src_before']
                if pd.isna(code_snippet):
                    continue  # empty cell: nothing to save

                file_path = os.path.join(cwe_folder, f"{cwe_id}_{idx + 1}{ext}")
                with open(file_path, 'w', encoding='utf-8') as out_file:
                    # pandas returns numbers for purely numeric cells; write()
                    # only accepts text, so coerce explicitly.
                    out_file.write(str(code_snippet))

    print(f"Code snippets extracted and saved in {code_folder}")

# Example usage
if __name__ == "__main__":
    # Exactly two positional arguments are expected after the script name:
    # the folder with the workbooks and the folder receiving the snippets.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python script.py <excel_folder> <code_folder>")
        sys.exit(1)

    excel_folder, code_folder = cli_args
    get_code_snippets_from_xlsx(excel_folder, code_folder)

# Example command: python get_code_snippets_from_xlsx.py train_xlsx train_code (for train),
# python get_code_snippets_from_xlsx.py val_xlsx val_code (for val)

0 comments on commit ef76598

Please sign in to comment.