首先对波士顿房价数据集做线性回归,发现模型效果一般。当线性回归模型过于简单而导致欠拟合时,可以通过增加多项式特征,让线性回归模型更好地拟合数据。在scikit-learn中,线性回归由类sklearn.linear_model.LinearRegression实现,多项式特征由类sklearn.preprocessing.PolynomialFeatures生成。需要用一个管道把这两个类串联起来,即使用sklearn.pipeline.Pipeline类。
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
# Load the Boston housing data set.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm which scikit-learn version this script is pinned to.
boston = load_boston()

# Hold out 30% of the samples as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    boston.data,
    boston.target,
    random_state=3,
    test_size=0.3,
)
print(X_train.shape, X_test.shape, sep="\n")

# Baseline model: plain linear regression fitted on the raw features.
clf = LinearRegression().fit(X_train, y_train)

# Score the baseline on both splits. For a regressor, score() reports the
# coefficient of determination (R^2), not a classification accuracy.
train_score, test_score = clf.score(X_train, y_train), clf.score(X_test, y_test)
print("train_score:%s" % (train_score), "test_score:%s" % (test_score), sep="\n")
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# Model refinement: polynomial feature expansion followed by linear regression.
def polymodel(degree=1):
    """Build an unfitted polynomial-regression pipeline.

    The pipeline expands the inputs into polynomial features, standardises
    them, and fits an ordinary least-squares linear regression on the result.

    Args:
        degree: Degree of the polynomial feature expansion. The default of 1
            keeps the original features only.

    Returns:
        A ``Pipeline`` with the steps ``'poly_features'``, ``'scaler'`` and
        ``'linear_regression'``, in that order.

    Note:
        Scaling used to be requested with ``LinearRegression(normalize=True)``.
        That parameter was deprecated in scikit-learn 1.0 and removed in 1.2,
        so scaling is now an explicit pipeline step. Predictions of an
        unregularised least-squares fit do not depend on feature scaling
        (up to numerical precision), but the ``coef_`` of the
        ``'linear_regression'`` step is now expressed for the standardised
        features rather than the raw ones.
    """
    # Imported here so this function does not rely on changes to the file's
    # import section.
    from sklearn.preprocessing import StandardScaler

    # include_bias=False: the intercept is handled by LinearRegression itself.
    poly_features = PolynomialFeatures(degree=degree, include_bias=False)
    # Polynomial terms span very different magnitudes; standardising them
    # keeps the least-squares problem well conditioned.
    scaler = StandardScaler()
    linear_regression = LinearRegression()
    pipeline = Pipeline([
        ('poly_features', poly_features),
        ('scaler', scaler),
        ('linear_regression', linear_regression),
    ])
    return pipeline
# Build the degree-2 polynomial model and fit it on the same training split.
clf_poly = polymodel(degree=2).fit(X_train, y_train)

# Re-evaluate on both splits; these overwrite the baseline scores.
train_score, test_score = (
    clf_poly.score(X_train, y_train),
    clf_poly.score(X_test, y_test),
)
print("poly_train_score:%s" % (train_score), "poly_test_score:%s" % (test_score), sep="\n")
# Author's observation: after increasing the model complexity, the score
# improved on both the training set and the test set. (The model is a
# regressor, so the score is R^2 rather than a classifier's accuracy.)