python数据分析(十三)

# -*- coding: utf-8 -*-

from numpy import *

import pandas as pd

###线性回归####

#读取数据

data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)

data.head()

data.tail()

#画散点图

import seaborn as sns

import matplotlib

%matplotlib inline

sns.pairplot(data, x_vars=['TV','Radio','Newspaper'], y_vars='Sales', size=7, aspect=0.8)

sns.pairplot(data, x_vars=['TV','Radio','Newspaper'], y_vars='Sales', size=7, aspect=0.8, kind='reg')

#计算相关系数矩阵

data.corr()

#构建X、Y数据集

X = data[['TV', 'Radio', 'Newspaper']]

X.head()

y = data['Sales']

y.head()

## Compute the coefficients directly from the closed-form matrix formula
def standRegres(xArr, yArr):
    """Solve ordinary least squares with the normal equation.

    Computes ``ws = (X^T X)^-1 X^T y``. No intercept term is added here;
    the caller must include a constant column in ``xArr`` if one is wanted.

    Parameters
    ----------
    xArr : array-like
        2-D design matrix, one row per sample.
    yArr : array-like
        1-D sequence of target values, one per row of ``xArr``.

    Returns
    -------
    numpy.matrix or None
        Column matrix of coefficients (one row per column of ``xArr``), or
        ``None`` (after printing a message) when ``det(X^T X)`` is exactly 0.
    """
    # `asmatrix` is the function the removed `mat` alias pointed to.
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T  # 1-D input becomes a row matrix; transpose to a column
    xTx = xMat.T * xMat

    # NOTE: the exact comparison only catches a determinant of exactly 0.0;
    # nearly singular matrices still go through the inverse below.
    if linalg.det(xTx) == 0.0:
        # Parenthesised single-argument form prints the same on Python 2 and 3.
        print("This matrix is singular, cannot do inverse")
        return None

    return xTx.I * (xMat.T * yMat)

# Solve for the regression coefficients with the closed-form formula.
# Work on a copy: `X2 = X` would only bind a second name to the same
# DataFrame, so the extra column would also appear in X, which is fitted
# again further down.
X2 = X.copy()

# Constant column so the last coefficient acts as the intercept. A scalar
# is broadcast to every row, so no row count has to be hard-coded.
X2['intercept'] = 1

standRegres(X2, y)

## Solve the same problem with an existing library (scikit-learn)
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X, y)

# Parenthesised single-argument print works on both Python 2 and 3.
print(linreg.intercept_)
print(linreg.coef_)

# Pair each coefficient with the column it was actually fitted on, rather
# than a hard-coded name list that can drift from X's columns. list() is
# needed because zip() is lazy on Python 3.
print(list(zip(X.columns, linreg.coef_)))

## Build the training and test sets
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
# Fall back to the old location only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

linreg.fit(X_train, y_train)

# Results
print(linreg.intercept_)
print(linreg.coef_)

# Label coefficients with the columns of the training frame; list() is
# needed because zip() is lazy on Python 3.
print(list(zip(X_train.columns, linreg.coef_)))

# Predict on the held-out test set.
y_pred = linreg.predict(X_test)

# Error evaluation on the test-set predictions
from sklearn import metrics

# %-formatting inside a single-argument print gives the same "LABEL: value"
# output on Python 2 and Python 3.

# calculate MAE using scikit-learn
print("MAE: %s" % metrics.mean_absolute_error(y_test, y_pred))

# calculate MSE using scikit-learn
print("MSE: %s" % metrics.mean_squared_error(y_test, y_pred))

# calculate RMSE using scikit-learn
# No `import numpy as np` is visible in this file, so `np.sqrt` would raise
# NameError; `sqrt` is in scope via the `from numpy import *` at the top.
print("RMSE: %s" % sqrt(metrics.mean_squared_error(y_test, y_pred)))

## Model comparison: refit without the Newspaper feature
feature_cols = ['TV', 'Radio']

X = data[feature_cols]
y = data.Sales

# Same random_state as the earlier split, so the two models are scored on
# the same train/test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)

# %-formatting inside a single-argument print gives the same "LABEL: value"
# output on Python 2 and Python 3.

# calculate MAE using scikit-learn
print("MAE: %s" % metrics.mean_absolute_error(y_test, y_pred))

# calculate MSE using scikit-learn
print("MSE: %s" % metrics.mean_squared_error(y_test, y_pred))

# calculate RMSE using scikit-learn
# No `import numpy as np` is visible in this file, so `np.sqrt` would raise
# NameError; `sqrt` is in scope via the `from numpy import *` at the top.
print("RMSE: %s" % sqrt(metrics.mean_squared_error(y_test, y_pred)))

你可能感兴趣的:(python数据分析(十三))