二手车交易价格预测(一)

本菜鸟的第一次机器学习实战……

1,加载相关工具包

## 基础工具
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time

warnings.filterwarnings('ignore')
%matplotlib inline

## 模型预测的
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

## 数据降维处理的
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA

import lightgbm as lgb
import xgboost as xgb

## 参数搜索和评价的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

 2,读入数据,并查看相关信息

# Load the train and test sets; both CSV files are read with a single space
# as the field separator.
data_train = pd.read_csv('./used_car_train.csv', sep=' ')
data_test = pd.read_csv('./used_car_test.csv', sep=' ')

# Show every summary explicitly. A bare expression is only echoed when it is
# the last line of a notebook cell, so the intermediate results were silently
# discarded when these statements ran together.
display(data_train.describe())
print(data_train.shape)
print(data_test.shape)
display(data_train.head())
data_train.info()  # info() prints its report itself
print(data_train.columns)
data_test.info()

3,对特征标签进行分析

# Summary statistics of the target column; printed explicitly so the result
# is visible even when this is not the last expression of a notebook cell.
print(data_train['price'].describe())

# Series.skew() is the skewness (偏度) and Series.kurt() the kurtosis (峰度);
# the previous labels (偏值 / 峰值, "peak value") misnamed both statistics.
print('偏度:{}'.format(data_train['price'].skew()))
print('峰度:{}'.format(data_train['price'].kurt()))

# Distribution of the raw price.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the installed seaborn version is confirmed.
sns.distplot(data_train['price'])

二手车交易价格预测(一)_第1张图片

二手车交易价格预测(一)_第2张图片 

 

# Scatter plot of price against each selected feature, one figure per feature.
for feature in ('power', 'kilometer'):
    # Two-column frame holding the target and the feature being plotted.
    data = data_train[['price', feature]]
    data.plot.scatter(x=feature, y='price')

 4,分开数值型和object型的数据

 

# Split the training columns by dtype: everything that is not `object` is
# treated as numerical, the `object` columns as categorical.
non_object_frame = data_train.select_dtypes(exclude='object')
object_frame = data_train.select_dtypes(include='object')

numerical_cols = non_object_frame.columns
categorical_cols = object_frame.columns

for column_index in (numerical_cols, categorical_cols):
    print(column_index)

 二手车交易价格预测(一)_第3张图片

 画图对一些特征进行分析

# `feature_cols` was used here without ever being defined, which raises a
# NameError. Derive it from the numeric columns, dropping the target and
# anything the test set does not contain so both selections below succeed.
# NOTE(review): the original post does not show its feature list; ID-like or
# date columns may still need to be excluded by hand -- confirm.
feature_cols = [
    col for col in numerical_cols
    if col != 'price' and col in data_test.columns
]

# Training features / target and the matching test features.
X_data = data_train[feature_cols]
Y_data = data_train['price']

X_test = data_test[feature_cols]

print('X train shape:', X_data.shape)
print('X test shape:', X_test.shape)

# Box plot of price grouped by bodyType.
data = data_train[['price', 'bodyType']]
plt.subplots(figsize=(8, 6))
sns.boxplot(x='bodyType', y='price', data=data)

# Box plot of price grouped by power (one box per distinct power value).
data = data_train[['price', 'power']]
plt.subplots(figsize=(8, 6))
sns.boxplot(x='power', y='price', data=data)

# Histogram of kilometer. np.arange(0, 20, 2.5) yields the bin edges
# 0, 2.5, ..., 17.5 (the stop value is excluded).
data = data_train['kilometer']
bins = np.arange(0, 20, 2.5)

# Start a fresh figure: plt.hist draws on the current axes, which would
# otherwise be the `power` box plot created just above.
plt.figure()
plt.hist(data, bins=bins)

 再查看相关性

二手车交易价格预测(一)_第4张图片

# `corrmat` was referenced without being defined (NameError). Build the
# pairwise correlation matrix from the numeric columns only, since
# object-typed columns cannot be correlated.
corrmat = data_train.select_dtypes(include='number').corr()

# The ten columns most positively correlated with price (price itself included).
cols = corrmat.nlargest(10, 'price')['price'].index

# Reuse the correlations already computed instead of np.corrcoef on the raw
# values: DataFrame.corr skips missing values pairwise, whereas np.corrcoef
# returns NaN for every entry of a column that contains a NaN.
cm = corrmat.loc[cols, cols].values

plt.figure(figsize=(8, 6))
sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', cmap='Blues',
            annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)

 二手车交易价格预测(一)_第5张图片

 缺失值处理

# Count the missing entries per column once, then derive both views from it.
null_counts = data_train.isnull().sum()
row_count = len(data_train)

# Absolute number of missing values, largest first.
total_missing = null_counts.sort_values(ascending=False)
# Fraction of rows that are missing, largest first, rounded to 3 decimals.
percent = (null_counts / row_count).sort_values(ascending=False).round(3)

# Side-by-side table of counts and fractions; preview the worst columns.
missing_data = pd.concat(
    [total_missing, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head()

 

from scipy.stats import norm
from scipy import stats

# Distribution of the raw price with a fitted normal curve overlaid.
plt.figure(figsize=(8, 6))
sns.distplot(data_train['price'], fit=norm)

# Parameters of the normal distribution fitted to price.
(mu, sigma) = norm.fit(data_train['price'])
print('mu = {:.2f} and sigma = {:.2f}'.format(mu, sigma))

# Raw string: '\m' and '\s' are invalid escape sequences in a normal string
# literal (a SyntaxWarning on recent Python versions). The text passed to
# matplotlib is unchanged.
plt.legend([r'Normal dist. ($\mu = $ {:.2f} and $\sigma = $ {:.2f})'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('price distribution')

# Probability (Q-Q) plot of price, drawn into a new figure.
# The trailing semicolon suppresses the notebook echo of the return value.
fig = plt.figure(figsize=(8, 6))
stats.probplot(data_train['price'], plot=plt);

最后进行log变换

# Log-transform the target in place with log(1 + x); np.log1p is preferred
# over np.log(x + 1).
# NOTE: this overwrites data_train['price'], so running the block a second
# time applies the transform twice.
data_train['price'] = np.log1p(data_train['price'].values)

# Parameters of the normal distribution fitted to the transformed price.
(mu, sigma) = norm.fit(data_train['price'])
print('mu = {:.2f} and sigma = {:.2f}'.format(mu, sigma))

plt.figure(figsize=(8, 6))
sns.distplot(data_train['price'], fit=norm)
# Raw string: '\m' and '\s' are invalid escape sequences in a normal string
# literal (a SyntaxWarning on recent Python versions). The text passed to
# matplotlib is unchanged.
plt.legend([r'Normal dist. ($\mu = $ {:.2f} and $\sigma$ = {:.2f})'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
# The previous title, 'SalePrice distribution', named a column from a
# different dataset; this plot shows the log1p-transformed `price`.
plt.title('log1p(price) distribution')

# Probability (Q-Q) plot of the transformed price.
# The trailing semicolon suppresses the notebook echo of the return value.
plt.figure(figsize=(8, 6))
stats.probplot(data_train['price'], plot=plt);

二手车交易价格预测(一)_第6张图片

 

 参考资料:数据挖掘之房价预测任务_sanjianjixiang的博客-CSDN博客

你可能感兴趣的:(机器学习)