集成学习一般可分为三大类:boosting、bagging 和 stacking。随机森林算法属于 bagging 类,它的特点是使用多个相互之间没有依赖关系的弱学习器进行并行拟合,最后的决策方式也很简单:对于分类问题使用简单的投票法,对于回归问题则使用平均法。
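在随机森林算法中建立每棵树的过程是:(1)从含 N 个样本的训练集中有放回地随机抽取 N 个样本,作为这棵树的训练集;(2)在每个节点分裂时,从全部 M 个特征中随机选取 m 个特征(m ≤ M,通常 m 远小于 M),并在这 m 个特征中选择最优特征进行划分;(3)每棵树都尽可能地生长,不进行剪枝。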
注意,随机森林算法的采样方式与GBDT算法不同:GBDT算法采用的是无放回采样,而随机森林算法采用的是有放回采样。
采用有放回采样方式可以保证随机森林的每棵树之间都有交集,否则每棵决策树都有可能是“有偏的”。
承接上篇博客,对2万个数据进行算法验证,并比较CART、GBDT及随机森林算法。
import pandas as pd

# Load the preprocessed training set from the working directory.
df = pd.read_csv("./train_modified.csv")

# Preview the first rows to check that the columns were parsed as expected.
df.head()
Disbursed | Existing_EMI | ID | Loan_Amount_Applied | Loan_Tenure_Applied | Monthly_Income | Var4 | Var5 | Age | EMI_Loan_Submitted_Missing | ... | Var2_2 | Var2_3 | Var2_4 | Var2_5 | Var2_6 | Mobile_Verified_0 | Mobile_Verified_1 | Source_0 | Source_1 | Source_2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.0 | ID000002C20 | 300000 | 5 | 20000 | 1 | 0 | 37 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
1 | 0 | 0.0 | ID000004E40 | 200000 | 2 | 35000 | 3 | 13 | 30 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 |
2 | 0 | 0.0 | ID000007H20 | 600000 | 4 | 22500 | 1 | 0 | 34 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
3 | 0 | 0.0 | ID000008I30 | 1000000 | 5 | 35000 | 3 | 10 | 28 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
4 | 0 | 25000.0 | ID000009J40 | 500000 | 2 | 100000 | 3 | 17 | 31 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
5 rows × 51 columns
# Count how many rows fall into each value of the "Disbursed" column
# (used as the classification target further down).
df["Disbursed"].value_counts()
0 19680 1 320 Name: Disbursed, dtype: int64
from sklearn.model_selection import train_test_split from matplotlib import pyplot as plt %matplotlib inline x_columns = [x for x in df.columns if x not in ["Disbursed", "ID"]] # 挑选除了Disbursed、ID这两列的数据 X = df[x_columns] y = df["Disbursed"] # 划分数据集 x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25) # 数据可视化 fig = plt.figure() plt.scatter(x_train[y_train==0]["Loan_Tenure_Applied"], x_train[y_train==0]["Var4"]) plt.scatter(x_train[y_train==1]["Loan_Tenure_Applied"], x_train[y_train==1]["Var4"]) plt.legend([0, 1]) plt.show()
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, classification_report

# Baseline 1: a single CART decision tree with default hyper-parameters.
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)

# Gather everything that is needed for the report first ...
accuracy = dtc.score(x_test, y_test)
y_pred = dtc.predict(x_test)
# Column 1 of predict_proba holds the predicted probability of class 1.
y_predprob = dtc.predict_proba(x_test)[:, 1]

# ... then print accuracy, the per-class report and the AUC, in that order.
print("Accuracy (test): \n", accuracy)
print("混淆矩阵:\n", classification_report(y_test, y_pred))
print("AUC Score (test): %f" % roc_auc_score(y_test, y_predprob))
Accuracy (test): 0.9658 混淆矩阵: precision recall f1-score support 0 0.99 0.98 0.98 4926 1 0.05 0.07 0.06 74 avg / total 0.97 0.97 0.97 5000 AUC Score (test): 0.523336
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, classification_report

# Baseline 2: gradient boosted trees (GBDT) with default hyper-parameters.
gbc = GradientBoostingClassifier()
gbc.fit(x_train, y_train)

# Gather everything that is needed for the report first ...
accuracy = gbc.score(x_test, y_test)
y_pred = gbc.predict(x_test)
# Column 1 of predict_proba holds the predicted probability of class 1.
y_predprob = gbc.predict_proba(x_test)[:, 1]

# ... then print accuracy, the per-class report and the AUC, in that order.
print("Accuracy (test): \n", accuracy)
print("混淆矩阵:\n", classification_report(y_test, y_pred))
print("AUC Score (test): %f" % roc_auc_score(y_test, y_predprob))
Accuracy (test): 0.9848 混淆矩阵: precision recall f1-score support 0 0.99 1.00 0.99 4926 1 0.00 0.00 0.00 74 avg / total 0.97 0.98 0.98 5000 AUC Score (test): 0.824064
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report

# Random forest with out-of-bag (OOB) scoring enabled.
# The forest size is set explicitly: with the library default used in the
# original run, scikit-learn warned "Some inputs do not have OOB scores"
# because too few trees were grown for every training sample to be left out
# of at least one bootstrap sample, which makes oob_score_ unreliable.
rfc = RandomForestClassifier(n_estimators=100, oob_score=True)
rfc.fit(x_train, y_train)

# Accuracy on the held-out test set.
accuracy = rfc.score(x_test, y_test)
print("Accuracy (test): \n", accuracy)

# Per-class precision / recall / F1 on the test set.
y_pred = rfc.predict(x_test)
print("混淆矩阵(test):\n", classification_report(y_test, y_pred))

# AUC computed from the predicted probability of class 1.
y_predprob = rfc.predict_proba(x_test)[:, 1]
print("AUC Score (test): %f" % roc_auc_score(y_test, y_predprob))

# Accuracy estimated on the out-of-bag samples of the training set.
print("袋外分数:\n", rfc.oob_score_)
Accuracy (test): 0.9852 混淆矩阵(test): precision recall f1-score support 0 0.99 1.00 0.99 4926 1 0.50 0.01 0.03 74 avg / total 0.98 0.99 0.98 5000 AUC Score (test): 0.603095 袋外分数: 0.9803333333333333 D:\anaconda\setup\lib\site-packages\sklearn\ensemble\forest.py:453: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates. warn("Some inputs do not have OOB scores. " D:\anaconda\setup\lib\site-packages\sklearn\ensemble\forest.py:458: RuntimeWarning: invalid value encountered in true_divide predictions[k].sum(axis=1)[:, np.newaxis])
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyper-parameters are tuned below.
rfc = RandomForestClassifier(oob_score=True, max_features="sqrt")

# Hyper-parameter grid explored by the search.
params = {
    "max_depth": list(range(3, 15, 2)),
    "n_estimators": list(range(50, 201, 20)),
    "min_samples_split": list(range(80, 150, 20)),
    "min_samples_leaf": list(range(10, 60, 10)),
}

# Select the model by cross-validated ROC AUC instead of the default accuracy.
# The target is heavily imbalanced, so accuracy is maximised by a model that
# never predicts the positive class; AUC is also the metric reported below.
gs = GridSearchCV(estimator=rfc, param_grid=params, scoring="roc_auc", cv=5)
gs.fit(x_train, y_train)

# gs.score() now returns ROC AUC, so take the accuracy from the refit best
# estimator directly to keep the printed label truthful.
accuracy = gs.best_estimator_.score(x_test, y_test)
print("Accuracy (test): \n", accuracy)

# Per-class precision / recall / F1 on the test set.
y_pred = gs.predict(x_test)
print("混淆矩阵(test):\n", classification_report(y_test, y_pred))

# AUC computed from the predicted probability of class 1.
y_predprob = gs.predict_proba(x_test)[:, 1]
print("AUC Score (test): %f" % roc_auc_score(y_test, y_predprob))
Accuracy (test): 0.9852 混淆矩阵(test): precision recall f1-score support 0 0.99 1.00 0.99 4926 1 0.00 0.00 0.00 74 avg / total 0.97 0.99 0.98 5000 AUC Score (test): 0.803139 D:\anaconda\setup\lib\site-packages\sklearn\metrics\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 'precision', 'predicted', average, warn_for)
# Show the outcome of the grid search: the best estimator, its
# cross-validation score and the hyper-parameter combination that produced it.
gs.best_estimator_, gs.best_score_, gs.best_params_
(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=3, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=10, min_samples_split=80, min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1, oob_score=True, random_state=None, verbose=0, warm_start=False), 0.9836, {'max_depth': 3, 'min_samples_leaf': 10, 'min_samples_split': 80, 'n_estimators': 50})
# Re-train a random forest with the hyper-parameters chosen by the grid search.
# Only the non-default settings are passed. The original call pasted the full
# estimator repr, including `min_impurity_split`, which was deprecated and
# later removed from scikit-learn, so that call fails on current releases.
# Every argument dropped here was set to its default value.
rfc2 = RandomForestClassifier(
    n_estimators=50,
    max_depth=3,
    max_features='sqrt',
    min_samples_split=80,
    min_samples_leaf=10,
    oob_score=True,
)
rfc2.fit(x_train, y_train)

# Accuracy on the held-out test set.
accuracy = rfc2.score(x_test, y_test)
print("Accuracy (test): \n", accuracy)

# Per-class precision / recall / F1 on the test set.
y_pred = rfc2.predict(x_test)
print("混淆矩阵(test):\n", classification_report(y_test, y_pred))

# AUC computed from the predicted probability of class 1.
y_predprob = rfc2.predict_proba(x_test)[:, 1]
print("AUC Score (test): %f" % roc_auc_score(y_test, y_predprob))

# Accuracy estimated on the out-of-bag samples of the training set.
print("袋外分数:\n", rfc2.oob_score_)
Accuracy (test): 0.9836 混淆矩阵(test): precision recall f1-score support 0 0.98 1.00 0.99 4918 1 0.00 0.00 0.00 82 avg / total 0.97 0.98 0.98 5000 AUC Score (test): 0.808931 袋外分数: 0.9841333333333333 D:\anaconda\setup\lib\site-packages\sklearn\metrics\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 'precision', 'predicted', average, warn_for)