Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
uncleban authored May 12, 2019
1 parent 543a2f9 commit 790109e
Showing 1 changed file with 390 additions and 0 deletions.
390 changes: 390 additions & 0 deletions ques2_210fea_74151.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,390 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import xgboost as xgb\n",
"from sklearn.model_selection import StratifiedKFold, KFold\n",
"import matplotlib.pyplot as plt\n",
"from sklearn import preprocessing\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Load the raw competition tables from a single base directory\n",
"data_dir = 'data/FT_Camp_2/'\n",
"transaction_history = pd.read_csv(data_dir + 'sz_detail.csv')\n",
"sz_id_inf = pd.read_csv(data_dir + 'trx_cod.csv')\n",
"g2_cod_inf = pd.read_csv(data_dir + 'g2.csv')\n",
"cuts_inf = pd.read_csv(data_dir + 'cust_bas_inf.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## User profile: encode gender as integer codes, parse age into buckets, parse balance columns\n",
"cuts_inf['gender'] = cuts_inf['gender'].astype('category').cat.codes\n",
"## '3' and '\\\\N' are mapped to NaN -- presumably invalid/sentinel ages; TODO confirm against the data dictionary\n",
"cuts_inf['age'] = cuts_inf['age'].apply(lambda v: np.nan if v in ('3', '\\\\N') else int(v))\n",
"age_bins = [0, 19, 29, 39, 49, 59, 100]\n",
"cuts_inf['age_range'] = pd.cut(cuts_inf['age'], age_bins, labels=False).astype('category').cat.codes\n",
"## balance snapshots: the string '\\\\N' marks a missing value\n",
"cuts_inf['aum227'] = cuts_inf['aum227'].apply(lambda v: np.nan if v == '\\\\N' else float(v))\n",
"cuts_inf['aum306'] = cuts_inf['aum306'].apply(lambda v: np.nan if v == '\\\\N' else float(v))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Encode the income/expense top-level category (cat1) as integer codes\n",
"sz_id_inf['cat1'] = sz_id_inf['cat1'].astype('category').cat.codes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Fit label encoders for sz_id / g2_cod on the FULL history so train and test share one encoding\n",
"transaction_history = transaction_history.sort_values(['id', 'prt_dt']).reset_index(drop=True)\n",
"## missing g2_cod becomes the sentinel string '-1' before encoding\n",
"transaction_history['g2_cod'] = transaction_history['g2_cod'].fillna('-1')\n",
"sz_le = preprocessing.LabelEncoder().fit(transaction_history['sz_id'].values)\n",
"g2_le = preprocessing.LabelEncoder().fit(transaction_history['g2_cod'].values)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"### Feature extraction: one row of features per user, using transactions up to end_time\n",
"def get_feature(end_time, data_name):\n",
"    ## end_time: inclusive upper bound on 'prt_dt' ('YYYY-MM-DD' strings compare lexicographically)\n",
"    ## data_name: 'train' loads the labeled users, anything else loads the prediction users\n",
"    if data_name == 'train':\n",
"        data = pd.read_csv('data/FT_Camp_2/train.csv')\n",
"    else:\n",
"        data = pd.read_csv('data/FT_Camp_2/pred_users.csv')\n",
"    \n",
"    ## restrict the history to the feature window\n",
"    transaction_history_data = transaction_history[(transaction_history['prt_dt']<=end_time)].reset_index(drop=True)\n",
"\n",
"    ## Attach auxiliary user / category attributes to the transaction table\n",
"    transaction_history_data = pd.merge(transaction_history_data, cuts_inf[['id', 'gender', 'age', 'age_range']], on=['id'], how='left')\n",
"    transaction_history_data = pd.merge(transaction_history_data, sz_id_inf[['sz_id', 'cat1']], on=['sz_id'], how='left')\n",
"\n",
"    transaction_history_data['sz_id'] = sz_le.transform(transaction_history_data['sz_id'].values)\n",
"    transaction_history_data['g2_cod'] = g2_le.transform(transaction_history_data['g2_cod'].values)\n",
"\n",
"    transaction_history_data = transaction_history_data[['id', 'sz_id', 'cat1', 'g2_cod', 'gender', 'age', 'age_range', 'rmb_amt','prt_dt']]\n",
"\n",
"    ## User profile attributes\n",
"    data = pd.merge(data, cuts_inf[['id', 'gender', 'age', 'age_range']], on=['id'], how='left')\n",
"\n",
"    ## Most frequent sz_id and g2_cod in each user's records\n",
"    temp = transaction_history_data.groupby(['id', 'sz_id'], as_index=False)['prt_dt'].count()\\\n",
"    .sort_values(['id', 'prt_dt'], ascending=[True, False]).groupby('id').apply(lambda x: x['sz_id'].values[0])\\\n",
"    .reset_index().rename(columns={0:'sz_id'})\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"    ## NOTE(review): cat.codes is fit per call, so train and test can map the same sz_id to\n",
"    ## different codes despite the shared LabelEncoder above -- confirm this is intended\n",
"    data['sz_id'] = data['sz_id'].astype('category').cat.codes\n",
"\n",
"    temp = transaction_history_data.groupby(['id', 'g2_cod'], as_index=False)['prt_dt'].count()\\\n",
"    .sort_values(['id', 'prt_dt'], ascending=[True, False]).groupby('id').apply(lambda x: x['g2_cod'].values[0])\\\n",
"    .reset_index().rename(columns={0:'g2_cod'})\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"    ## NOTE(review): same per-call cat.codes caveat as sz_id above\n",
"    data['g2_cod'] = data['g2_cod'].astype('category').cat.codes\n",
"\n",
"    ### Per-user transaction count, distinct active days, and mean transactions per day\n",
"    temp = transaction_history_data.groupby('id', as_index=False)['prt_dt'].agg({'user_trans_count':'count'})\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"\n",
"    temp = transaction_history_data.groupby('id', as_index=False)['prt_dt'].agg({'user_trans_day_count':'nunique'})\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"\n",
"    data['user_trans_day_mean'] = data['user_trans_count']/data['user_trans_day_count']\n",
"\n",
"    ## Per-user income (rmb_amt >= 0) and expense (rmb_amt < 0) totals, plus per-item / per-day averages\n",
"    temp = transaction_history_data[transaction_history_data['rmb_amt']>=0].groupby('id', as_index=False)['rmb_amt'].agg({'user_sr_sum':'sum'})\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"\n",
"    temp = transaction_history_data[transaction_history_data['rmb_amt']<0].groupby('id', as_index=False)['rmb_amt'].agg({'user_zc_sum':'sum'})\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"\n",
"    ## user_zc_sum is negative, so '+' gives net flow and '-' gives gross volume\n",
"    data['user_sr_+_zc'] = data['user_sr_sum'] + data['user_zc_sum']\n",
"    data['user_sr_-_zc'] = data['user_sr_sum'] - data['user_zc_sum']\n",
"\n",
"    for i in ['user_sr_sum', 'user_zc_sum', 'user_sr_+_zc', 'user_sr_-_zc']:\n",
"        col = i + '/trans'\n",
"        data[col] = data[i]/data['user_trans_count']\n",
"        col = i + '/days'\n",
"        data[col] = data[i]/data['user_trans_day_count']\n",
"\n",
"    ## Mean / std of amounts: overall, income-only, expense-only\n",
"    temp = transaction_history_data.groupby('id', as_index=False)['rmb_amt']\\\n",
"    .agg({'user_rmb_mean':'mean', 'user_rmb_std':'std'})\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"\n",
"    temp = transaction_history_data[transaction_history_data['rmb_amt']>=0].groupby('id', as_index=False)['rmb_amt']\\\n",
"    .agg({'user_sr_mean':'mean', 'user_sr_std':'std'})\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"\n",
"    temp = transaction_history_data[transaction_history_data['rmb_amt']<0].groupby('id', as_index=False)['rmb_amt']\\\n",
"    .agg({'user_zc_mean':'mean', 'user_zc_std':'std'})\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"\n",
"    ## Counts and ratios of the three cat1 transaction types per user\n",
"    temp = transaction_history_data.groupby(['id', 'cat1'])['prt_dt'].count().unstack().reset_index()\\\n",
"    .rename(columns={0:'user_cat1_0_count', 1:'user_cat1_1_count', 2:'user_cat1_2_count'})\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"\n",
"    for i in ['user_cat1_0_count', 'user_cat1_1_count', 'user_cat1_2_count']:\n",
"        col = i.replace('count', 'ratio')\n",
"        data[col] = data[i]/data['user_trans_count']\n",
"\n",
"    ## First and last transaction date per user, encoded as ordinal codes\n",
"    temp = transaction_history_data.groupby('id', as_index=False)['prt_dt'].agg({'user_first_day':'min', 'user_last_day':'max'})\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"\n",
"    ## NOTE(review): per-call cat.codes again -- train/test day codes share no common scale\n",
"    data['user_first_day'] = data['user_first_day'].astype('category').cat.codes\n",
"    data['user_last_day'] = data['user_last_day'].astype('category').cat.codes\n",
"\n",
"    ## Share of each sz_id within the user's transactions, pivoted to wide columns\n",
"    temp = transaction_history_data.groupby(['id', 'sz_id'], as_index=False)['prt_dt'].count()\n",
"    temp = pd.merge(temp, transaction_history_data[['id', 'prt_dt']].groupby('id', as_index=False)['prt_dt'].agg({'sz_id_ratio': 'count'}),\\\n",
"    on=['id'], how='left')\n",
"    temp['sz_id_ratio'] = temp['prt_dt']/temp['sz_id_ratio']\n",
"    temp = pd.pivot_table(temp[['id', 'sz_id', 'sz_id_ratio']], index=['id', 'sz_id']).unstack().reset_index().fillna(0)\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"\n",
"    ## Share of each g2_cod within the user's transactions, pivoted to wide columns\n",
"    temp = transaction_history_data.groupby(['id', 'g2_cod'], as_index=False)['prt_dt'].count()\n",
"    temp = pd.merge(temp, transaction_history_data[['id', 'prt_dt']].groupby('id', as_index=False)['prt_dt'].agg({'g2_cod_ratio': 'count'}),\\\n",
"    on=['id'], how='left')\n",
"    temp['g2_cod_ratio'] = temp['prt_dt']/temp['g2_cod_ratio']\n",
"    temp = pd.pivot_table(temp[['id', 'g2_cod', 'g2_cod_ratio']], index=['id', 'g2_cod']).unstack().reset_index().fillna(0)\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"    \n",
"    ## Latest balance snapshot: aum227 for the train window, aum306 for the test window\n",
"    if data_name == 'train':\n",
"        data = pd.merge(data, cuts_inf[['id', 'aum227']], on=['id'], how='left').rename(columns={'aum227':'aum_last'})\n",
"    else:\n",
"        data = pd.merge(data, cuts_inf[['id', 'aum306']], on=['id'], how='left').rename(columns={'aum306':'aum_last'})\n",
"    \n",
"    ## Amount summed per (user, sz_id), pivoted wide\n",
"    temp = transaction_history_data.groupby(['id', 'sz_id'], as_index=False)['rmb_amt'].agg({'sz_id_amt':'sum'})\n",
"    temp = pd.pivot_table(temp[['id', 'sz_id', 'sz_id_amt']], index=['id', 'sz_id']).unstack().reset_index().fillna(0)\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"    \n",
"    ## Amount summed per (user, g2_cod), pivoted wide\n",
"    temp = transaction_history_data.groupby(['id', 'g2_cod'], as_index=False)['rmb_amt'].agg({'g2_cod_amt':'sum'})\n",
"    temp = pd.pivot_table(temp[['id', 'g2_cod', 'g2_cod_amt']], index=['id', 'g2_cod']).unstack().reset_index().fillna(0)\n",
"    data = pd.merge(data, temp, on=['id'], how='left')\n",
"    \n",
"    return data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Build features: train window covers 1-1 through 2-27, test window covers 1-1 through 3-6\n",
"train = get_feature('2019-02-27', 'train')\n",
"test = get_feature('2019-03-06', 'test')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Flatten tuple column names produced by pivot_table().unstack()\n",
"## e.g. ('sz_id_ratio', 3) becomes 'sz_id_ratio_3'; plain string columns pass through unchanged.\n",
"## The original cell duplicated this loop for train and test; one helper removes the copy-paste.\n",
"def flatten_columns(df):\n",
"    return [c[0] + '_' + str(c[1]) if isinstance(c, tuple) else c for c in df.columns]\n",
"\n",
"train.columns = flatten_columns(train)\n",
"test.columns = flatten_columns(test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Binary target vector -- presumably a click/conversion flag for the 2-28 week; TODO confirm label definition\n",
"y = train['click_w228'].values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## The 210 features used by the model.\n",
"## Selected by running one xgb fold and keeping features with importance > 10; the list was then frozen,\n",
"## so the selection run itself is omitted. The whitelist is a set literal for O(1) membership tests\n",
"## (the original used a list, making the comprehension O(columns x features)); iteration order of\n",
"## col_lst still follows train.columns, so downstream behavior is unchanged.\n",
"col_lst = [i for i in train.columns if i in {\n",
"    'aum_last', 'age', 'user_cat1_1_ratio', 'user_trans_day_mean','sz_id_ratio_28', 'sz_id_amt_31', 'user_cat1_0_ratio',\n",
"    'user_sr_std', 'user_cat1_2_ratio', 'user_zc_std', 'sz_id_amt_53','user_sr_+_zc', 'user_cat1_2_count', 'user_sr_+_zc/trans',\n",
"    'user_first_day', 'g2_cod_amt_221', 'user_cat1_1_count', 'sz_id_ratio_31', 'sz_id_ratio_53', 'user_sr_+_zc/days',\n",
"    'sz_id_ratio_15', 'user_sr_mean', 'user_cat1_0_count','g2_cod_ratio_239', 'user_last_day', 'sz_id_amt_28',\n",
"    'g2_cod_ratio_221', 'sz_id_ratio_32', 'g2_cod_ratio_223','g2_cod_ratio_203', 'sz_id_ratio_39', 'sz_id_amt_15',\n",
"    'sz_id_ratio_42', 'sz_id_ratio_52', 'g2_cod_amt_265','user_zc_sum', 'user_rmb_mean', 'sz_id_amt_52', 'sz_id_amt_54',\n",
"    'sz_id_ratio_40', 'g2_cod_amt_223', 'g2_cod_ratio_201','user_sr_sum', 'user_rmb_std', 'sz_id_ratio_54', 'g2_cod_amt_201',\n",
"    'g2_cod_amt_243', 'g2_cod_ratio_243', 'sz_id_amt_32','g2_cod_amt_203', 'sz_id_ratio_45', 'sz_id_ratio_1',\n",
"    'user_trans_count', 'sz_id_amt_10', 'g2_cod_ratio_158','user_trans_day_count', 'sz_id_amt_42', 'sz_id_ratio_30',\n",
"    'g2_cod_ratio_117', 'sz_id_ratio_27', 'g2_cod_amt_306','g2_cod_ratio_265', 'user_sr_sum/days', 'sz_id_amt_45',\n",
"    'user_zc_sum/days', 'user_sr_-_zc/days', 'user_sr_sum/trans','sz_id_amt_40', 'user_zc_mean', 'sz_id_amt_33', 'sz_id_ratio_7',\n",
"    'sz_id_ratio_33', 'sz_id_amt_39', 'sz_id_ratio_16', 'g2_cod_ratio_304', 'sz_id_ratio_2', 'g2_cod_ratio_306',\n",
"    'g2_cod_ratio_266', 'user_sr_-_zc', 'g2_cod_ratio_300','g2_cod_ratio_303', 'sz_id_ratio_3', 'user_zc_sum/trans',\n",
"    'sz_id_amt_27', 'g2_cod_amt_253', 'g2_cod_amt_304','g2_cod_amt_117', 'sz_id_ratio_10', 'g2_cod_amt_300', 'gender',\n",
"    'g2_cod_ratio_119', 'g2_cod_amt_303', 'g2_cod_amt_112','sz_id_ratio_19', 'g2_cod_ratio_278', 'g2_cod_ratio_253',\n",
"    'user_sr_-_zc/trans', 'sz_id', 'g2_cod_ratio_345', 'g2_cod','g2_cod_ratio_129', 'sz_id_amt_7', 'sz_id_amt_24', 'sz_id_amt_30',\n",
"    'g2_cod_ratio_222', 'sz_id_amt_1', 'g2_cod_amt_340','g2_cod_ratio_31', 'sz_id_amt_19', 'sz_id_amt_3', 'sz_id_amt_41',\n",
"    'sz_id_amt_16', 'g2_cod_ratio_340', 'sz_id_ratio_29','g2_cod_ratio_185', 'sz_id_amt_46', 'g2_cod_amt_31',\n",
"    'g2_cod_amt_158', 'sz_id_amt_11', 'g2_cod_ratio_112','g2_cod_ratio_121', 'sz_id_amt_2', 'sz_id_ratio_41',\n",
"    'sz_id_ratio_24', 'sz_id_ratio_11', 'g2_cod_amt_346','sz_id_ratio_6', 'sz_id_ratio_46', 'g2_cod_amt_278',\n",
"    'g2_cod_amt_119', 'g2_cod_amt_345', 'g2_cod_amt_239','sz_id_amt_29', 'g2_cod_ratio_346', 'g2_cod_amt_121',\n",
"    'g2_cod_amt_157', 'sz_id_amt_37', 'sz_id_ratio_34','g2_cod_ratio_34', 'sz_id_amt_0', 'sz_id_ratio_26',\n",
"    'g2_cod_amt_222', 'g2_cod_amt_185', 'sz_id_ratio_17','g2_cod_amt_34', 'g2_cod_ratio_314', 'sz_id_amt_6',\n",
"    'g2_cod_ratio_187', 'sz_id_amt_12', 'sz_id_ratio_12','g2_cod_amt_268', 'g2_cod_amt_129', 'sz_id_ratio_36',\n",
"    'sz_id_amt_17', 'sz_id_ratio_37', 'g2_cod_ratio_268', 'age_range','g2_cod_ratio_283', 'g2_cod_amt_41', 'g2_cod_amt_269',\n",
"    'sz_id_ratio_23', 'sz_id_ratio_4', 'g2_cod_amt_187','g2_cod_ratio_262', 'g2_cod_ratio_293', 'g2_cod_ratio_128',\n",
"    'sz_id_ratio_0', 'g2_cod_ratio_148', 'g2_cod_amt_283','sz_id_amt_55', 'g2_cod_ratio_130', 'sz_id_amt_49',\n",
"    'g2_cod_amt_120', 'g2_cod_amt_293', 'g2_cod_amt_266', 'g2_cod_ratio_120', 'g2_cod_ratio_267', 'g2_cod_ratio_172',\n",
"    'g2_cod_amt_314', 'g2_cod_ratio_263', 'sz_id_amt_34','sz_id_amt_26', 'g2_cod_amt_267', 'g2_cod_ratio_174',\n",
"    'g2_cod_ratio_269', 'g2_cod_amt_134', 'sz_id_amt_4','sz_id_amt_36', 'sz_id_ratio_44', 'g2_cod_ratio_157',\n",
"    'g2_cod_amt_174', 'sz_id_ratio_25', 'sz_id_amt_25', 'sz_id_amt_8','sz_id_amt_9', 'g2_cod_ratio_292', 'g2_cod_ratio_349',\n",
"    'g2_cod_amt_263', 'g2_cod_amt_130', 'sz_id_amt_22','sz_id_ratio_9', 'g2_cod_amt_206', 'sz_id_ratio_22',\n",
"    'g2_cod_ratio_350', 'sz_id_ratio_51', 'sz_id_amt_51','g2_cod_ratio_41', 'sz_id_ratio_47', 'sz_id_ratio_5',\n",
"    'g2_cod_amt_172'}]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Model training: stratified 10-fold split, but only the final fold (k == 9) is trained --\n",
"## one fold was judged sufficient for the submission.\n",
"skf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)\n",
"auc_list = []\n",
"sub_list = []\n",
"\n",
"## xgboost hyper-parameters (values unchanged from the original tuning)\n",
"params = {\n",
"    'booster': 'gbtree',\n",
"    'objective': 'binary:logistic',\n",
"    'eval_metric': 'auc',\n",
"    'gamma': 0.1,\n",
"    'min_child_weight': 1.1,\n",
"    'learning_rate': 0.01,\n",
"    'max_depth': 5,\n",
"    'subsample': 0.8,\n",
"    'colsample_bytree': 0.8,\n",
"    'colsample_bylevel': 0.8,\n",
"    'lambda': 10,\n",
"    'verbose_eval': 1,\n",
"    'nthread': 6,\n",
"    'silent': 1,\n",
"}\n",
"\n",
"for k, (train_idx, test_idx) in enumerate(skf.split(y, y)):\n",
"    ## skip every fold except the 10th\n",
"    if k != 9:\n",
"        continue\n",
"    print(k + 1)\n",
"\n",
"    X_train = train[col_lst].iloc[train_idx]\n",
"    X_test = train[col_lst].iloc[test_idx]\n",
"    y_train, y_test = y[train_idx], y[test_idx]\n",
"\n",
"    xgb_train = xgb.DMatrix(X_train, label=y_train)\n",
"    xgb_val = xgb.DMatrix(X_test, label=y_test)\n",
"\n",
"    evallist = [(xgb_train, 'train'), (xgb_val, 'eval')]\n",
"    gbm = xgb.train(params, xgb_train, 3000, evallist, early_stopping_rounds=60, verbose_eval=50)\n",
"\n",
"    auc_list.append(gbm.best_score)\n",
"    sub_list.append(gbm.predict(xgb.DMatrix(test[col_lst]), ntree_limit=gbm.best_ntree_limit))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Write the submission file: user id + predicted score from the single trained fold\n",
"test['score'] = sub_list[0]\n",
"submission = test[['id', 'score']]\n",
"submission.to_csv('ques2_210fea_cv1.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:py3]",
"language": "python",
"name": "conda-env-py3-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 790109e

Please sign in to comment.