这个流程主要是方便学习数据挖掘的IT人员快速了解数据挖掘的过程以及应该注意的要点
数据来源:Kaggle上有这样一个比赛:城市自行车共享系统使用状况。
https://www.kaggle.com/c/bike-sharing-demand可以下载
有一个博客对这个题目进行了专门的分析(包括题目分析以及特征分析),见如下博客:
https://blog.csdn.net/u013795429/article/details/52092659
数据的处理以及特征处理,见如下博客:
https://blog.csdn.net/mlljava1111/article/details/53694055
https://blog.csdn.net/qiujiahao123/article/details/68927616
特征工程与相关算法的融合如下:
#coding=utf-8
import pandas as pd

# Load the Kaggle bike-sharing training data.
# NOTE: `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the modern equivalent (skip malformed rows
# instead of raising).
data = pd.read_csv("train.csv", header=0, on_bad_lines='skip')
# print(data.head())

# Split the 'datetime' column into separate date and time parts.
temp = pd.DatetimeIndex(data['datetime'])
data['date'] = temp.date
data['time'] = temp.time
# print(data.head())

# Hour-of-day feature, taken directly from the parsed datetime index
# (no need for the old round-trip through `time` strings and pd.to_datetime).
data['hour'] = temp.hour

# Day-of-week categorical feature (0 = Monday ... 6 = Sunday).
data["dayofweek"] = temp.dayofweek

# Elapsed-time feature: whole days since the first record. normalize()
# truncates to midnight, matching the original date-based arithmetic;
# the old `.astype('timedelta64[D]')` unit conversion was removed in pandas 2.0.
data['dateDays'] = (temp.normalize() - temp[0].normalize()).days
# A quick sanity statistic: bike rentals per weekday, split into
# casual (unregistered) and registered users.
byday = data.groupby(data["dayofweek"])
# Rentals by casual (unregistered) users.
byday['casual'].sum().reset_index()
# Rentals by registered users.
# print(byday['casual'].sum().reset_index())
byday['registered'].sum().reset_index()

# Weekends differ from weekdays, so give Saturday and Sunday their own
# indicator columns. Use .loc instead of the original chained indexing
# (data.Saturday[...] = 1), which only raised SettingWithCopyWarning and
# silently stops modifying the frame under pandas copy-on-write.
data['Saturday'] = 0
data.loc[data["dayofweek"] == 5, 'Saturday'] = 1
data['Sunday'] = 0
data.loc[data["dayofweek"] == 6, 'Sunday'] = 1
# print(data.head())

# Drop the raw time fields now replaced by the engineered features.
dataRel = data.drop(['datetime', 'count', 'date', 'time', 'dayofweek'], axis=1)
# print(dataRel.head())
# Feature vectorization
from sklearn.feature_extraction import DictVectorizer

# Continuous-valued features, one dict per row.
featureConCols = ['temp','atemp','humidity','windspeed','dateDays','hour']
dataFeatureCon = dataRel[featureConCols]
# print(dataFeatureCon)
# NOTE(review): filling numeric NaNs with the *string* 'NA' would make
# DictVectorizer one-hot those cells as 'feature=NA' columns; this dataset
# has no missing values, so the fillna is a no-op kept for safety.
dataFeatureCon = dataFeatureCon.fillna('NA')
# to_dict('records') yields one {column: value} dict per row — the input
# DictVectorizer expects (same rows as the old .T.to_dict().values()).
X_dictCon = dataFeatureCon.to_dict('records')
# print(X_dictCon)

# Categorical features, one dict per row.
featureCatCols = ['season','holiday','workingday','weather','Saturday', 'Sunday']
dataFeatureCat = dataRel[featureCatCols]
# print(dataFeatureCat)
dataFeatureCat = dataFeatureCat.fillna('NA')  # in case any were missed
X_dictCat = dataFeatureCat.to_dict('records')
# print(X_dictCat)

# Vectorize: map each feature dict onto a numeric row of a dense array.
# Use one DictVectorizer per feature group — the original reused a single
# instance for both fit_transform calls, so its fitted state (feature
# names, inverse_transform) described only the last group it was fit on.
vec_cat = DictVectorizer(sparse=False)
X_vec_cat = vec_cat.fit_transform(X_dictCat)
# print(X_vec_cat)
vec_con = DictVectorizer(sparse=False)
X_vec_con = vec_con.fit_transform(X_dictCon)
# print(X_vec_con)
#
# Standardize the continuous features so every column has zero mean and
# unit variance — scale-sensitive models such as SVR need this.
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(X_vec_con)
X_vec_con = scaler.transform(X_vec_con)
# print(X_vec_con)

# One-hot encode the categorical features: e.g. colours red/blue/yellow
# become [1, 0, 0] / [0, 1, 0] / [0, 0, 1].
enc = preprocessing.OneHotEncoder()
X_vec_cat = enc.fit_transform(X_vec_cat).toarray()
# print(X_vec_cat)

# Assemble the final design matrix: continuous columns first, then the
# one-hot categorical columns.
import numpy as np
X_vec = np.concatenate((X_vec_con, X_vec_cat), axis=1)
# print(X_vec)

# Target vectors: registered / casual counts as floats, plus the total
# rental count used as the regression target below.
Y_vec_reg = dataRel['registered'].values.astype(float)
Y_vec_cas = dataRel['casual'].values.astype(float)
# print(Y_vec_reg)
# print(Y_vec_cas)
Y = data['count'].values
#print(Y)
"""模型训练,主要采用了4种模型进行训练,
1.岭回归、2.支持向量机、3.随机森林、
4.用网格搜索给随机森林找一组参数,然后在用随机森林预测
"""
# 1.岭回归
from sklearn.model_selection import train_test_split
# from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
print("\n1.岭回归")
x_train,x_test,y_train,y_test = train_test_split(X_vec,Y,test_size=0.25,random_state=1)
clf = LinearRegression()
clf.fit(x_train, y_train)
t=clf.predict(x_test)
print(t)
print("y_test:%s"%y_test)
print("训练集准确率:{},测试集准确率:{}".format(
clf.score(x_train, y_train),
clf.score(x_test, y_test)
))
# 2. Support vector machine — SVR is the *regression* variant of the SVM
# family (SVC is the classifier), so using it here is legitimate.
from sklearn.svm import SVR
print("\n2.支持向量机")
svr_model = SVR(kernel='rbf', C=10, gamma=0.001)
svr_model.fit(x_train, y_train)
svr_pred = svr_model.predict(x_test)
print(svr_pred)
print("y_test:%s" % y_test)
print("训练集准确率:{},测试集准确率:{}".format(
    svr_model.score(x_train, y_train),
    svr_model.score(x_test, y_test)
))
# 3. Random forest regression
print("\n3.随机森林")
from sklearn.ensemble import RandomForestRegressor
# n_jobs=-1 trains the 80 trees on all CPU cores; it does not change the
# fitted model or its predictions, only the wall-clock time.
clf = RandomForestRegressor(n_estimators=80, n_jobs=-1)
clf.fit(x_train, y_train)
t = clf.predict(x_test)
print(t)
print("y_test:%s" % y_test)
print("训练集准确率:{},测试集准确率:{}".format(
    clf.score(x_train, y_train),
    clf.score(x_test, y_test)
))
# 4. Grid-search the forest size, then predict with the best forest found.
print("\n4.用网格搜索给随机森林找一组参数,然后在用随机森林预测")
from sklearn.model_selection import GridSearchCV
params = {"n_estimators": [30, 60, 90]}
scores = ['r2']
for score in scores:
    print(score)
    clf = GridSearchCV(RandomForestRegressor(), params, cv=5, scoring=score)
    clf.fit(x_train, y_train)
    print(clf.best_estimator_)
# Evaluate with the winning model itself. The original re-instantiated a
# RandomForestRegressor by copy-pasting the printed parameters, several of
# which (criterion='mse', max_features='auto', min_impurity_split) have
# since been removed from scikit-learn and now raise TypeError. With the
# default refit=True, best_estimator_ is already refit on all of x_train.
clf = clf.best_estimator_
t = clf.predict(x_test)
print(t)
print("y_test:%s" % y_test)
print("训练集准确率:{},测试集准确率:{}".format(
    clf.score(x_train, y_train),
    clf.score(x_test, y_test)
))
结果:
SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
data.Saturday[data["dayofweek"]==5]=1
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
data.Sunday[data["dayofweek"]==6]=1
[ 16 40 32 ..., 168 129 88]
1.岭回归
[ 174. 68. 392. ..., 114. -70. -26.]
y_test:[308 14 539 ..., 24 14 8]
训练集准确率:0.39493964089432243,测试集准确率:0.4018670152885786
2.支持向量机
[ 161.9917164 64.73611511 258.07775507 ..., 106.7790444 -4.74826863
26.4338403 ]
y_test:[308 14 539 ..., 24 14 8]
训练集准确率:0.27130138131106696,测试集准确率:0.26183605728175696
3.随机森林
[ 309.3 6.3375 570.5375 ..., 27.0625 10.125 14.2875]
y_test:[308 14 539 ..., 24 14 8]
训练集准确率:0.9917779365294469,测试集准确率:0.9491408575667828
4.用网格搜索给随机森林找一组参数,然后在用随机森林预测
r2
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=90, n_jobs=1,
oob_score=False, random_state=None, verbose=0, warm_start=False)
[ 304.53333333 7.82222222 565.82222222 ..., 27.72222222 10.93333333
13.9 ]
y_test:[308 14 539 ..., 24 14 8]
训练集准确率:0.9918493230582454,测试集准确率:0.950078345686705
算法参考:
https://blog.csdn.net/shawroad88/article/details/87277407
https://blog.csdn.net/wong2016/article/details/87916292 (不一样的实现,可以看看)
gridSearchCV(网格搜索)的参数、方法及示例:
https://blog.csdn.net/weixin_41988628/article/details/83098130
数据挖掘一般流程及模型整理:
https://mp.weixin.qq.com/s/mtU-58ZPW9ruOj7H16zpVQ
https://blog.csdn.net/scheezer/article/details/82794757
参考:
https://blog.csdn.net/mlljava1111/article/details/53694055
https://blog.csdn.net/qq_41185868/article/details/81711462
https://blog.csdn.net/shawroad88/article/details/87277407