Build and compare the performance of ensemble classifiers on the breast cancer dataset.
# Load the dataset
from sklearn.datasets import load_breast_cancer
cancer=load_breast_cancer()
# Split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3, random_state=42)
# Build decision tree, Bagging, AdaBoost, and random forest models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
clf_tree = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=1)
clf_bagging = BaggingClassifier(n_estimators=100, random_state=1)
clf_adaboost = AdaBoostClassifier(n_estimators=100, random_state=1)
clf_randomforest = RandomForestClassifier(n_estimators=100, random_state=1)
clf_tree = clf_tree.fit(X_train, y_train)
clf_bagging = clf_bagging.fit(X_train, y_train)
clf_adaboost = clf_adaboost.fit(X_train, y_train)
clf_randomforest = clf_randomforest.fit(X_train, y_train)
# Evaluate accuracy on the test set
tree_score = clf_tree.score(X_test, y_test)
bagging_score = clf_bagging.score(X_test, y_test)
adaboost_score = clf_adaboost.score(X_test, y_test)
randomforest_score = clf_randomforest.score(X_test, y_test)
print("tree_score:%s" % (tree_score))
print("bagging_score:%s" % (bagging_score))
print("adaboost_score:%s" % (adaboost_score))
print("randomforest_score:%s" % (randomforest_score))
# The code below compares the feature importances of the decision tree and the random forest.
# A forest's importances are computed by summing the importances over all trees in the forest
# and averaging them; in general, the feature importances reported by a random forest are more
# reliable than those of a single tree. (A quick check of this averaging follows the printouts below.)
# Print the feature importance arrays
print(clf_tree.feature_importances_)
print(clf_randomforest.feature_importances_)
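# A minimal sanity check (a sketch, not part of the original workflow): scikit-learn exposes the
# individual trees of the forest via the estimators_ attribute, and the forest's importances are,
# up to a final renormalization, the average of the per-tree importances.
import numpy as np  # numpy is imported again below for the plotting code
manual_importances = np.mean(
    [tree.feature_importances_ for tree in clf_randomforest.estimators_], axis=0)
# Expected to print a value very close to 0
print(np.abs(manual_importances - clf_randomforest.feature_importances_).max())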
# Define a horizontal bar chart for visualizing feature importances
import numpy as np
import matplotlib.pyplot as plt
def plot_feature_importances_cancer(model, title='Feature importances'):
    n_features = cancer.data.shape[1]
    plt.figure(figsize=(10, 7.5))
    plt.title(title)
    # One horizontal bar per feature, labelled with the feature name
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
# Plot the feature importances of the decision tree and the random forest
plot_feature_importances_cancer(clf_tree,title='tree')
plt.show()
plot_feature_importances_cancer(clf_randomforest,title='randomforest')
plt.show()
# Compared with the single tree, many more features have nonzero importance in the random forest.
# Like the decision tree, the random forest assigns a large importance to the "worst radius" feature,
# but overall it actually selects "worst perimeter" as the most informative feature. Because of the
# randomness used when building the forest, the algorithm has to consider many possible explanations,
# so the random forest captures the characteristics of the data more broadly than a single tree.
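# To inspect this ranking numerically (a small sketch; the exact ordering depends on the
# random_state values chosen above), the features can be sorted by their importance scores:
for name, model in [("tree", clf_tree), ("random forest", clf_randomforest)]:
    top = np.argsort(model.feature_importances_)[::-1][:3]
    print(name, "top features:", list(cancer.feature_names[top]))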