import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from  scipy.stats import chi2_contingency
import numpy as np
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
# Load the raw dataset; the relative path is resolved against the current working directory.
data=pd.read_csv("mushrooms.csv")
# Helper function that prints a model's evaluation results
def print_score(classifier,X_train,y_train,X_test,y_test,train=True):
    """Print evaluation metrics for an already fitted classifier.

    Prints the accuracy score, the classification report and the confusion
    matrix for either the training split or the test split. For the training
    split it additionally prints the mean and standard deviation of a
    10-fold cross-validated accuracy.

    Args:
        classifier: Fitted estimator exposing ``predict``; it is also handed
            to ``cross_val_score`` when ``train`` is truthy.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        train: Truthy to report on the training split, falsy to report on
            the test split.

    Returns:
        None. All results are written to standard output.
    """
    if train:
        header, features, labels = "Training results:\n", X_train, y_train
    else:
        header, features, labels = "Test results:\n", X_test, y_test

    print(header)

    # Predict once and reuse the result for all three metrics instead of
    # re-running the model for every report.
    predictions = classifier.predict(features)
    print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(labels, predictions)))
    print('Classification Report:\n{}\n'.format(classification_report(labels, predictions)))
    print('Confusion Matrix:\n{}\n'.format(confusion_matrix(labels, predictions)))

    if train:
        # The cross-validation scores used to be computed and then thrown
        # away; report them so the expensive 10-fold run is actually useful.
        res = cross_val_score(classifier, X_train, y_train, cv=10, n_jobs=-1, scoring='accuracy')
        print('Average Accuracy:\t{0:.4f}\n'.format(res.mean()))
        print('Standard Deviation:\t{0:.4f}'.format(res.std()))
# Encode the data: replace every column's categorical values with integer codes.
# The single encoder is refitted for each column, so after the loop it only
# holds the mapping of the last column it processed.
labelenconder = LabelEncoder()
for col in data.columns:
    data[col] = labelenconder.fit_transform(data[col])

# Build the target vector from the encoded "class" column and relabel it so
# that the two classes are -1 and 1 instead of 0 and 1.
Encoder_y = LabelEncoder()
y = Encoder_y.fit_transform(data["class"].values)
y = np.where(y == 0, -1, y)
# Disabled experiment, kept as a bare string literal so that it never executes:
# a chi-square test of every encoded column against the "class" column that
# prints the contingency table, the column name and the p-value whenever the
# p-value is above 0.05.
# NOTE(review): if this is re-enabled as written, its `x=[]` / `y=[]` lists
# rebind the module-level label vector `y` that the train/test split further
# down relies on.
'''
#进行卡方检验，来判断定类变量之间是不是显著相关
kf_data=np.array(data)
tu= np.array(data['class'])
for col in data.columns:
    ndata=np.array(data[col])
    c=np.unique(ndata)
    x=[]
    y=[]
    for i in range(0,len(c)):
        count1=0
        count2=0
        for j in range(0,len(ndata)):
            if ndata[j]==i:
                if tu[j]==0:
                    count1+=1
                else:
                    count2+=1
        x.append(count1)
        y.append(count2)
    d=np.array([x, y])
    kf = chi2_contingency(d)
    if kf[1] > 0.05:
        print(d)
        print(col)
        print("%.4f" % kf[1])
'''


# Drop the target column together with the attributes that the earlier
# attribute-selection step found to have little discriminative power.
x = data.drop(
    columns=[
        "class",
        "cap-shape",
        "cap-surface",
        "cap-color",
        "gill-attachment",
        "gill-spacing",
        "stalk-shape",
        "stalk-root",
        "veil-type",
        "veil-color",
    ]
)

# Dummy-encode the remaining attributes. Integer codes imply an ordering and a
# distance between categories (e.g. numbering months 1-12 suggests December is
# far from January although they are adjacent), so each category gets its own
# indicator column instead.
x = pd.get_dummies(x, columns=x.columns, drop_first=True)

# Split the data into a training set and a test set of equal size.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.5,
    random_state=42,
)


# Model experiments. Every algorithm below is wrapped in a triple-quoted string
# so that it is inert; remove the quotes around one section to run it.
# Sections, in order: SVM (RBF kernel), KNN, Gaussian naive Bayes, decision tree.
# NOTE(review): the title line of the naive Bayes section has no leading '#',
# so it has to be commented out or deleted when that section is enabled.
'''
#下面是算法部分，将想要使用的算法前的注释引号去掉即可使用
#支持向量机
classifier = SVC(kernel='rbf',random_state=42)
classifier.fit(x_train,y_train)
print_score(classifier,x_train,y_train,x_test,y_test,train=False)'''

'''
#KNN
classifier= KNN()
classifier.fit(x_train,y_train)
print_score(classifier,x_train,y_train,x_test,y_test,train=False)'''
'''
朴素贝叶斯
from sklearn.naive_bayes import GaussianNB as NB
classifier = NB()
classifier.fit(x_train,y_train)
print_score(classifier,x_train,y_train,x_test,y_test,train=False)'''

'''#决策树
from sklearn.tree import DecisionTreeClassifier as DT
classifier = DT(criterion='entropy',random_state=42)
res = cross_val_score(classifier,x_train, y_train, cv=10, n_jobs=-1, scoring='roc_auc')
print(res)'''
# Compute the AUC value on the test set (disabled).
# NOTE(review): the decision tree section above only cross-validates and never
# calls fit, so these lines need a classifier fitted by one of the other sections.
#y0=classifier.predict(x_test)
#print(roc_auc_score(y_test,y0))