# 使用决策树算法针对乳腺癌数据集建立分类模型:首先按照7:3的比例将数据划分为训练集和测试集,在此基础上分别用混淆矩阵、准确率、精确率、召回率、F值、分类报告这六种形式对该模型的分类效果进行评估。
# Imports for loading the data, splitting it, and building the tree.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the breast cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()

# Hold out 30% of the samples for testing (a 7:3 train/test split);
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.3,
    random_state=42,
)

# Build the decision tree and train it on the training portion.
# fit() returns the estimator itself, so construction and training
# can be chained into a single expression.
clf_tree = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    random_state=1,
).fit(X_train, y_train)

# Use the trained model to predict labels for the held-out test samples.
y_pred = clf_tree.predict(X_test)
# Evaluation metrics, all computed on the held-out test set.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# Confusion matrix of the true test labels against the predictions.
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

# Accuracy over both classes.
overall_accuracy = accuracy_score(y_test, y_pred)
print("accuracy of malignant and benign:%s" % overall_accuracy)

# Precision per class: pos_label=0 scores label 0 (reported as malignant);
# omitting pos_label uses the scorer's default positive label (reported
# as benign).
malignant_precision = precision_score(y_test, y_pred, pos_label=0)
print("precision of malignant:%s" % malignant_precision)
benign_precision = precision_score(y_test, y_pred)
print("precision of benign:%s" % benign_precision)

# Recall per class, using the same labelling as for precision.
malignant_recall = recall_score(y_test, y_pred, pos_label=0)
print("recall of malignant:%s" % malignant_recall)
benign_recall = recall_score(y_test, y_pred)
print("recall of benign:%s" % benign_recall)
# Cross-validation evaluation.
# Evaluate the model on the full dataset with 10-fold cross-validation
# (cv=10). cross_val_score returns an array of per-fold scores; because
# scoring='f1' is selected, each entry is the F1 score of one fold, and
# score.mean() gives the average of the ten F1 values.
#
# NOTE: the old `sklearn.cross_validation` module was removed in
# scikit-learn 0.20; cross_val_score is imported from
# `sklearn.model_selection`, the same module this script already uses
# for train_test_split.
from sklearn.model_selection import cross_val_score

score = cross_val_score(
    clf_tree,
    cancer.data,
    cancer.target,
    cv=10,
    scoring='f1',
)
print(score)
print(score.mean())