Machine Learning with sklearn (10): Ensemble Classification

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
x, y = datasets.make_moons(n_samples=500, noise=.3, random_state=42)   # generate the dataset
print(x)

[[ 8.31039149e-01 -2.58748754e-01]
 [ 1.18506381e+00  9.20387143e-01]
 [ 1.16402213e+00 -4.55525583e-01]
 [-2.36556013e-02  1.08628844e+00]
 [ 4.80502733e-01  1.50942444e+00]
 [ 1.31164912e+00 -5.51176060e-01]
 [ 1.30173265e-01  1.09442697e+00]
 [ 2.82035071e-01  1.04835431e+00]
 [ 2.74530128e-01 -1.42660544e-01]
 [ 1.82398104e-01  2.96612334e-02]
 [ 1.26017313e+00 -5.89217041e-01]
 [-2.66492596e-02  1.58396005e+00]]

# visualize the dataset

plt.scatter(x[y==0,0], x[y==0,1])
plt.scatter(x[y==1,0], x[y==1,1])
plt.show()

# train with logistic regression
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=27)
from sklearn.linear_model import LogisticRegression
log = LogisticRegression()
log.fit(x_train, y_train)
log.score(x_test, y_test)

F:\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
0.87

# train with a support vector machine (SVM)

from sklearn.svm import SVC
svm = SVC()
svm.fit(x_train, y_train)
svm.score(x_test, y_test)

F:\Anaconda3\lib\site-packages\sklearn\svm\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
0.93

# train with a decision tree

from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
dt.score(x_test, y_test)

0.89

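# Manual hard voting: with three classifiers, predict class 1 whenever at least
# two of them predict 1 (i.e. the sum of the three 0/1 predictions is >= 2).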
predict_log = log.predict(x_test)
predict_svm = svm.predict(x_test)
predict_dt = dt.predict(x_test)
y_predict = np.array((predict_log + predict_svm + predict_dt)>=2, dtype='int')
print(y_predict)

[0 1 1 1 1 0 0 1 0 1 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 1 1 1 0 1 1 0 0 1 0 0
 0 1 0 0 1 1 0 1 0 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 1 1 1 1 1 0 0 1 0 0 0 0 1
 1 1 0 0 1 0 1 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1]

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predict)

0.93

# sklearn provides a ready-made ensemble (voting) classifier
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators=[
    ('log', LogisticRegression()),
    ('svm', SVC()),
    ('dt', DecisionTreeClassifier())
], voting='hard')   # hard voting: every classifier's vote carries equal weight
voting_clf.fit(x_train, y_train)
voting_clf.score(x_test, y_test)

F:\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
F:\Anaconda3\lib\site-packages\sklearn\svm\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
0.93

# soft voting: instead of giving every classifier an equal vote, average the predicted class probabilities,
# so a classifier that is more confident (more reliable) on a sample contributes more to the final decision.
# This requires every estimator to support predict_proba, which is why SVC is created with probability=True.
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators=[
    ('log', LogisticRegression()),
    ('svm', SVC(probability=True)),
    ('dt', DecisionTreeClassifier(random_state=666))
], voting='soft')
voting_clf.fit(x_train, y_train)
voting_clf.score(x_test, y_test)

F:\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
F:\Anaconda3\lib\site-packages\sklearn\svm\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
0.93
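
# A quick check of what soft voting computes (a sketch, not from the original post):
# with the default equal weights, it averages the sub-estimators' predicted class
# probabilities and takes the argmax.
probas = np.mean([est.predict_proba(x_test) for est in voting_clf.estimators_], axis=0)
manual_soft = np.argmax(probas, axis=1)
print(accuracy_score(y_test, manual_soft))   # should reproduce voting_clf.score(x_test, y_test)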

# Bagging vs. pasting: bagging samples with replacement, pasting samples without replacement; bagging is used far more often in practice.
# n_estimators sets the number of sub-models, max_samples sets how many samples each sub-model is trained on, and bootstrap chooses whether sampling is with replacement.
# To build hundreds or thousands of sub-models, decision trees are the first choice: as non-parametric learners they readily produce diverse sub-models.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(DecisionTreeClassifier(), n_estimators=5000, max_samples=100, bootstrap=True)
bagging.fit(x_train, y_train)
bagging.score(x_test, y_test)

0.93
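
# Pasting (sampling without replacement) is the same call with bootstrap=False;
# a minimal sketch, not shown in the original post:
pasting = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                            max_samples=100, bootstrap=False)
pasting.fit(x_train, y_train)
pasting.score(x_test, y_test)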

# OOB means out-of-bag: with sampling with replacement, some samples may never be drawn; probabilistically about 37% of the samples are never used, and these are the out-of-bag samples.
# They can serve as a built-in validation set: pass oob_score=True to BaggingClassifier, fit on the full dataset, and read oob_score_ to get the validation score.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(DecisionTreeClassifier(), n_estimators=5000, max_samples=100, bootstrap=True, oob_score=True)
bagging.fit(x, y)
bagging.oob_score_

0.92
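
# Where the ~37% figure comes from (a quick check, not in the original post):
# when each bootstrap sample is as large as the dataset, the probability that a
# given sample is never drawn is (1 - 1/n)**n, which tends to 1/e ≈ 0.368.
n = 500
print((1 - 1/n) ** n)   # ≈ 0.3675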

# When there are many features (e.g. images), we can also sample the features at random, so each sub-model sees only a subset of the features; this is called random subspaces.
# Sampling both the samples and the features is called random patches.
# max_features sets how many features each sub-model uses, and bootstrap_features sets whether feature sampling is with replacement.
# Bagging is easy to parallelize: pass n_jobs=-1 to BaggingClassifier to use all CPU cores.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(DecisionTreeClassifier(), n_estimators=5000, max_samples=100, bootstrap=True, oob_score=True,
                           n_jobs=-1, max_features=1, bootstrap_features=True)  
bagging.fit(x, y)
bagging.oob_score_

0.856
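
# Random subspaces in isolation would keep all the samples and only sample the
# features; a sketch (not in the original post), bearing in mind this toy dataset
# has just two features:
subspaces = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                              max_samples=1.0, bootstrap=False,
                              max_features=1, bootstrap_features=True, n_jobs=-1)
subspaces.fit(x_train, y_train)
subspaces.score(x_test, y_test)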

# Random forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500, random_state=666, oob_score=True, n_jobs=-1)
rf.fit(x,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=True, random_state=666, verbose=0, warm_start=False)

rf.oob_score_

0.896

# Extra-Trees (extremely randomized trees) adds extra randomness: it trains faster and suppresses overfitting, at the cost of higher bias.
# 1. A random forest uses bagging, so each tree is built on a bootstrap sample, whereas Extra-Trees by default trains every tree on the full training set;
# 2. A random forest searches for the best split within a random subset of features, whereas Extra-Trees picks the split thresholds completely at random.
from sklearn.ensemble import ExtraTreesClassifier
ef = ExtraTreesClassifier(n_estimators=500, random_state=666, bootstrap=True, oob_score=True, n_jobs=-1)
ef.fit(x, y)

ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
           oob_score=True, random_state=666, verbose=0, warm_start=False)

# Regression: decision trees can solve regression problems, so their ensemble (forest) counterparts can too
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
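
# A minimal usage sketch (not in the original post): the regressors share the same
# interface as their classifier counterparts. make_regression is used here purely
# as an assumed toy dataset.
from sklearn.datasets import make_regression
x_reg, y_reg = make_regression(n_samples=500, n_features=5, noise=10, random_state=42)
rf_reg = RandomForestRegressor(n_estimators=100, oob_score=True, n_jobs=-1, random_state=42)
rf_reg.fit(x_reg, y_reg)
print(rf_reg.oob_score_)   # R^2 measured on the out-of-bag samples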

# Boosting is another ensemble strategy: it also combines multiple models, but each new model tries to improve on the mistakes of the ensemble so far.
# In AdaBoost, for example, the samples that the current model misclassifies are given larger weights, so the next round of training focuses on them; this is repeated until enough sub-models have been trained.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=500)
ada.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=500, random_state=None)

ada.score(x_test, y_test)

0.856

# The idea of gradient boosting: train a model m1, which leaves errors e1; train a second model m2 to fit e1, which in turn leaves errors e2; and so on. The final prediction is the sum of all the models.
# Because each model corrects the residual errors of the one before it, the ensemble's predictions keep improving.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
gra = GradientBoostingClassifier(max_depth=2, n_estimators=30)
gra.fit(x_train, y_train)
gra.score(x_test, y_test)

0.912
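
# A minimal sketch (not from the original post) of the m1/e1/m2/e2 idea, using
# regression trees on an assumed toy regression dataset; each tree fits the
# residual left by the sum of the previous trees.
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
x_reg, y_reg = make_regression(n_samples=500, n_features=5, noise=10, random_state=42)
residual = y_reg.copy()
trees = []
for _ in range(3):
    tree = DecisionTreeRegressor(max_depth=2)
    tree.fit(x_reg, residual)                   # fit the current residual e_i
    residual = residual - tree.predict(x_reg)   # e_{i+1} = e_i - m_{i+1}(x)
    trees.append(tree)
y_sum = sum(tree.predict(x_reg) for tree in trees)   # final prediction = sum of all models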