An example of the decision tree algorithm applied to the breast cancer dataset is given below.
# Load the breast cancer dataset
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
# Split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)
# Train a decision tree classifier
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Evaluate accuracy on the training and test sets
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print("train_score: %s" % train_score)
print("test_score: %s" % test_score)
# Next, taking the tree depth max_depth as an example, this section plots the relationship between model depth and model performance, so the effect of the parameter value on model accuracy can be seen intuitively.
# Define a helper that trains a tree of depth d and returns its training and test scores
def score(d):
    clf = DecisionTreeClassifier(max_depth=d, random_state=3)  # fix random_state for reproducibility
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    return train_score, test_score
depths = range(2, 15)
scores = [score(d) for d in depths]
train_scores = [s[0] for s in scores]
test_scores = [s[1] for s in scores]
# Plot training and test accuracy against tree depth
import matplotlib.pyplot as plt
plt.figure()
plt.grid()
plt.xlabel('max depth of decision tree')
plt.ylabel('score')
plt.plot(depths, train_scores, '.g-', label='train score')
plt.plot(depths, test_scores, '.r--', label='test score')
plt.legend()
plt.show()
# As the tree depth increases, the training-set accuracy (green solid line) keeps rising, while the test-set accuracy (red dashed line) rises and then falls, peaking at a depth of 3; deeper trees fit the training data ever more closely but generalize worse, a typical sign of overfitting.
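Note that choosing max_depth by inspecting the test-set curve effectively tunes the parameter on the test data. A more principled alternative, sketched below under the same training split (the parameter grid and cv=5 are illustrative choices, not part of the original example), is to select the depth by cross-validation on the training set and then evaluate only once on the test set.
# Sketch: select max_depth by 5-fold cross-validation on the training set
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth': range(2, 15)}
grid = GridSearchCV(DecisionTreeClassifier(random_state=3), param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)           # depth chosen by cross-validation
print(grid.score(X_test, y_test))  # single final evaluation on the test set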