Skip to content

Commit

Permalink
Update utils.py
Browse files Browse the repository at this point in the history
  • Loading branch information
mounKim authored Nov 10, 2022
1 parent c5a4ef0 commit 7824f5f
Showing 1 changed file with 23 additions and 11 deletions.
34 changes: 23 additions & 11 deletions datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,31 @@
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

def construct_fc_data(dir_path:str, affiliation_path:str, citation_threshold:int):

def change_edge_data(file):
    """Build an adjacency mapping from a citation edge list stored as CSV.

    args:
        file: path or file-like object accepted by ``pd.read_csv``; the table
            must contain the columns 'PaperId' and 'PaperReferenceId'.
    return: dict mapping each 'PaperId' value to the list of
        'PaperReferenceId' values found on its rows, in row order.
    """
    edge_table = pd.read_csv(file)
    references_by_paper = {}
    source_ids = edge_table['PaperId']
    target_ids = edge_table['PaperReferenceId']
    for source_id, target_id in zip(source_ids, target_ids):
        # setdefault creates the empty list the first time a paper is seen.
        references_by_paper.setdefault(source_id, []).append(target_id)
    return references_by_paper


def construct_data(dir_path: str, affiliation_path: str, citation_threshold: int):
"""
args:
Construct dataset for GNNs
dir_path: directory of each journal/conference
affiliation_path: CSV File path that represents authors' affiliation.
citation_threshold: Criterion that decides whether a paper falls above or below top 10%
return: tuple of numpy arrays
"""

sentences = []
years = []
citation = []
ids = []
labels = []
affiliations = []

Expand All @@ -35,7 +46,7 @@ def construct_fc_data(dir_path:str, affiliation_path:str, citation_threshold:int
paper_id = file.split('.')[0]

try:
affiliation_vector = affiliation_table[affiliation_table['PaperId']==int(paper_id)].values[0]
affiliation_vector = affiliation_table[affiliation_table['PaperId'] == int(paper_id)].values[0]
except IndexError:
paper_id_err += 1
continue
Expand All @@ -46,13 +57,13 @@ def construct_fc_data(dir_path:str, affiliation_path:str, citation_threshold:int
load_err += 1
continue

abstract = data['abstract_inverted_index']
if abstract==None:
abstract = data['abstract_inverted_index']
if abstract is None:
abstract_err += 1
continue
pub_year = data['publication_year']
citation_cnt = data['cited_by_count']
label = 1 if citation_cnt>citation_threshold else 0
label = 1 if citation_cnt > citation_threshold else 0

year = np.zeros(13)
year[pub_year - 2010] = 1.0
Expand All @@ -67,20 +78,21 @@ def construct_fc_data(dir_path:str, affiliation_path:str, citation_threshold:int

sentence = data['title'] + ' ' + abstract_text
sentences.append(sentence)
ids.append(paper_id)
years.append(year)
affiliations.append(affiliation_vector)
labels.append(label)


f.close()

if load_err:
print("Warning: {} json files are failed to upload.".format(load_err))
if abstract_err:
print("Warning: {} json files don't have abstract data.".format(abstract_err))
if paper_id_err:
if paper_id_err:
print("Warning: {} paper-IDs do not exist.".format(paper_id_err))

print("{} json files are uploaded.".format(len(file_list)-load_err-abstract_err-paper_id_err))
print("{} json files are uploaded.".format(len(file_list) - load_err - abstract_err - paper_id_err))

return TfidfVectorizer(max_features=1000).fit_transform(sentences).toarray(), np.array(years), np.array(affiliations), np.array(labels)
return np.array(ids), TfidfVectorizer(max_features=1000).fit_transform(sentences).toarray(), np.array(years), np.\
array(affiliations), np.array(labels)

0 comments on commit 7824f5f

Please sign in to comment.