forked from finafily526/A_Model_of_Risk_Control
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Part4.1模型_logistic.py
90 lines (81 loc) · 2.78 KB
/
Part4.1模型_logistic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
import re
import time
import datetime
import pickle
from dateutil.relativedelta import relativedelta
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LogisticRegressionCV
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
### 计算KS值
def KSs(df, score, target):
'''
:param df: 包含目标变量与预测值的数据集
:param score: 得分或者概率
:param target: 目标变量
:return: KS值
'''
total = df.groupby([score])[target].count()
bad = df.groupby([score])[target].sum()
all = pd.DataFrame({'total':total, 'bad':bad})
all['good'] = all['total'] - all['bad']
all[score] = all.index
all = all.sort_values(by=score,ascending=False)
all.index = range(len(all))
all['badCumRate'] = all['bad'].cumsum() / all['bad'].sum()
all['goodCumRate'] = all['good'].cumsum() / all['good'].sum()
KS = all.apply(lambda x: x.badCumRate - x.goodCumRate, axis=1)
return max(KS)
#读取数据
trainData = pd.read_csv('test1.csv')
##只计算统计变量
allFeatures = list(trainData.columns)
allFeatures.remove('Unnamed: 0')
## 上一步的结果
multi_analysis = []
### (1)将多变量分析的后变量带入LR模型中
y = trainData['y']
X = trainData[multi_analysis]
X['intercept'] = [1] * X.shape[0]
LR = sm.Logit(y, X).fit()
summary = LR.summary()
print(summary)
pvals = LR.pvalues
pvals = pvals.to_dict()
print(pvals)
# 有些变量不显著,需要逐步剔除
varLargeP = {k: v for k, v in pvals.items() if v >= 0.1}
varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True)
while (len(varLargeP) > 0 and len(multi_analysis) > 0):
# 每次迭代中,剔除最不显著的变量,直到
# (1) 剩余所有变量均显著
# (2) 没有特征可选
varMaxP = varLargeP[0][0]
print(varMaxP)
if varMaxP == 'intercept':
print('the intercept is not significant!')
break
multi_analysis.remove(varMaxP)
y = trainData['result']
X = trainData[multi_analysis]
X['intercept'] = [1] * X.shape[0]
LR = sm.Logit(y, X).fit()
pvals = LR.pvalues
pvals = pvals.to_dict()
varLargeP = {k: v for k, v in pvals.items() if v >= 0.1}
varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True)
summary = LR.summary()
trainData['prob'] = LR.predict(X)
ks = KSs(trainData, 'prob', 'result')
auc = roc_auc_score(trainData['result'], trainData['prob'])
print('normalLR:ks {}, auc {}'.format(ks, auc))
# 将模型保存
saveModel = open('LR_Model_Normal.pkl', 'wb+')
pickle.dump(LR, saveModel)
saveModel.close()