Add files via upload
Mujian63 authored Jun 12, 2024
1 parent fb4c048 commit fb37107
Showing 14 changed files with 128,331 additions and 0 deletions.
207 changes: 207 additions & 0 deletions 01_data_prepare.ipynb
@@ -0,0 +1,207 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "496365cc-d2d8-407c-849c-5368dfbd8809",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"import warnings\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.decomposition import TruncatedSVD\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
"from tqdm import tqdm\n",
"import gc\n",
"import re\n",
"from sklearn.metrics import roc_auc_score\n",
"import os\n",
"from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
"from gensim.models import Word2Vec\n",
"import polars as pl\n",
"from pathlib import Path\n",
"from glob import glob\n",
"import json\n",
"import joblib"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c94400df-b09e-43ef-b387-fb4845127562",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"try:\n",
" os.makedirs( 'data')\n",
"except:\n",
" continue\n",
"try:\n",
" os.makedirs('temp_data')\n",
"except:\n",
" continue\n",
"try:\n",
" os.makedirs('sub')\n",
"except:\n",
" continue\n",
"try:\n",
" os.makedirs('feats/w2v_feats')\n",
"except:\n",
" continue\n",
"try:\n",
" os.makedirs('bge_m3_emb/w2v_feats')\n",
"except:\n",
" continue\n",
"try:\n",
" os.makedirs('oof2')\n",
"except:\n",
" continue\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a32a46c1-d830-4549-81f0-7892a4b997b8",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 317302/317302 [02:15<00:00, 2348.13it/s]\n"
]
},
{
"data": {
"text/plain": [
"['data/pid_df.pkl']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with open('../IND-WhoIsWho/pid_to_info_all.json', 'r') as file:\n",
" data = json.load(file)\n",
"\n",
"\n",
"pid_df = []\n",
"for f in tqdm(data.keys()):\n",
" \n",
" temp = list(data[f].values())\n",
" temp = pd.DataFrame([temp],columns = data[f].keys())\n",
" pid_df.append(temp.copy())\n",
"pid_df = pd.concat(pid_df)\n",
"joblib.dump(pid_df,'data/pid_df.pkl')\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "93895c70-cef6-40fb-88dc-9ec7208d982d",
"metadata": {},
"outputs": [],
"source": [
"with open('../IND-test-public/ind_test_author_filter_public.json', 'r') as file:\n",
" data = json.load(file)\n",
" \n",
"test = []\n",
"for key in data.keys():\n",
" temp = pd.DataFrame(data[key]['papers'],columns = ['PID'])\n",
" temp['autherID'] = key\n",
" temp['autherName'] = data[key]['name']\n",
" \n",
" test.append(temp.copy())\n",
"test = pd.concat(test).reset_index(drop = True)\n",
"test.to_feather('data/test.feather')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0b047d8d-1303-4c12-ac56-f28cec02fec6",
"metadata": {},
"outputs": [],
"source": [
"\n",
"import json\n",
" \n",
"with open('../IND-WhoIsWho/train_author.json', 'r') as file:\n",
" data = json.load(file)\n",
" \n",
"train = []\n",
"for key in data.keys():\n",
" temp1 = pd.DataFrame(data[key]['normal_data'],columns = ['PID'])\n",
" temp1['label'] = 0 \n",
"\n",
"\n",
" temp2 = pd.DataFrame(data[key]['outliers'],columns = ['PID'])\n",
" temp2['label'] = 1\n",
" temp = pd.concat([temp1,temp2]).reset_index(drop = True)\n",
" temp['autherID'] = key\n",
" temp['autherName'] = data[key]['name']\n",
" train.append(temp.copy())\n",
"train = pd.concat(train).reset_index(drop = True)\n",
"train.to_feather('data/train.feather')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "04c4879b-6c74-4d4b-a139-588eaaaa6c41",
"metadata": {},
"outputs": [],
"source": [
"with open('../IND-WhoIsWho/ind_valid_author.json', 'r') as file:\n",
" data = json.load(file)\n",
" \n",
"test = []\n",
"for key in data.keys():\n",
" temp = pd.DataFrame(data[key]['papers'],columns = ['PID'])\n",
" temp['autherID'] = key\n",
" temp['autherName'] = data[key]['name']\n",
" \n",
" test.append(temp.copy())\n",
"test = pd.concat(test).reset_index(drop = True)\n",
"test.to_feather('data/valid.feather')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1ee4901-a4a3-4f0b-a67e-259d9452c0dc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}