--- Following another article, using DecisionTree for regression prediction of bike-sharing rental counts
Step 1: Go to the dataset page and download Bike-Sharing-Dataset.zip.
We use hour.csv; strip its first line (the column header) to get hour_noheader.csv.
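A minimal header-stripping sketch in Python (the paths match the example location used later in this post; adjust to your setup):
# Strip the header line from hour.csv (a minimal sketch; adjust paths as needed)
with open('/Users/gao/Bike-Sharing-Dataset/hour.csv') as src, \
     open('/Users/gao/Bike-Sharing-Dataset/hour_noheader.csv', 'w') as dst:
    next(src)  # skip the column-name line
    for line in src:
        dst.write(line)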
Step 2: Load the data and cache it
# Initialize PySpark
from pyspark import SparkContext, SparkConf
import matplotlib.pyplot as plt
import numpy as np
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

# Spark config: stop any existing context (e.g. in a notebook), then start a local one
sc.stop()
conf = SparkConf().setMaster('local[4]').setAppName('BikeSharing_app')
sc = SparkContext(conf=conf)
path = '/Users/gao/Bike-Sharing-Dataset/hour_noheader.csv'
raw_data = sc.textFile(path)
num_data = raw_data.count()
records = raw_data.map(lambda x:x.split(','))
first = records.first()
print(first)
print(num_data)
# Cache the RDD, because it will be read multiple times
records.cache()
Step 3: Extract features with one-hot encoding
# Define a function that maps each distinct value of a column to a unique index
def get_mapping(rdd, idx):
    return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()
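For intuition, applying it to a single column shows the category-to-index map it builds; the exact indices depend on RDD ordering, so the output below is only illustrative:
# e.g. the 'season' column (index 2) has four category values
print(get_mapping(records, 2))
# illustrative output: {'1': 0, '3': 1, '2': 2, '4': 3}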
# Build one mapping per categorical column (columns 2 through 9)
mappings = [get_mapping(records,i) for i in np.arange(2,10)]
# Count the resulting feature dimensions
cat_len = sum(map(len, mappings))
num_len = len(records.first()[10:14])
total_len = num_len + cat_len
print( "Feature vector length for categorical features: %d" % cat_len)
print( "Feature vector length for numerical features: %d" % num_len)
print( "Total feature vector length: %d" % total_len)
# Extract features: one-hot encode the 8 categorical columns, then append the 4 numerical columns
def extract_features(record):
    cat_vec = np.zeros(cat_len)
    i = 0
    step = 0
    for field in record[2:10]:   # all 8 categorical columns, matching `mappings`
        m = mappings[i]
        idx = m[field]
        cat_vec[idx + step] = 1  # set the one-hot position within this column's block
        i = i + 1
        step = step + len(m)     # advance past this column's block
    num_vec = np.array([float(field) for field in record[10:14]])
    return np.concatenate((cat_vec, num_vec))
# Extract the target variable (cnt, the last column)
def extract_label(record):
    return float(record[-1])
data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r)))
first_point = data.first()
print ("Raw data: " + str(first[2:]))
print ("Label: " + str(first_point.label))
print ("Linear Model feature vector:\n" + str(first_point.features))
print ("Linear Model feature vector length: " + str(len(first_point.features)))
Step 4: Inspect the label distribution and apply a log transform
# Plot a histogram of the target variable to check whether it is roughly normal
targets = records.map(lambda r: float(r[-1])).collect()
plt.hist(targets, bins=40, color='lightblue', density=True)
fig = plt.gcf()
fig.set_size_inches(16, 10)
# Log-transform the target and re-plot the histogram to inspect the new distribution
log_targets = records.map(lambda r: np.log(float(r[-1]))).collect()
plt.hist(log_targets, bins=40, color='lightblue', density=True)
fig = plt.gcf()
fig.set_size_inches(16, 10)
data_log = data.map(lambda lp: LabeledPoint(np.log(lp.label), lp.features))  # log-transform the labels
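Note that a model trained on data_log predicts on the log scale, so its outputs must be exponentiated back before being compared with raw counts. A sketch using the linear model imported above (the iterations and step values are illustrative):
# Train on log labels, then map predictions back to the count scale with np.exp
model_log = LinearRegressionWithSGD.train(data_log, iterations=10, step=0.1)
preds_vs_actual = data_log.map(
    lambda lp: (np.exp(model_log.predict(lp.features)), np.exp(lp.label)))
print(preds_vs_actual.take(3))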
Step 5: Split into training and test sets
# Split train/test using the sample and subtractByKey methods.
data_with_idx = data.zipWithIndex().map(lambda point_index: (point_index[1], point_index[0]))
test = data_with_idx.sample(False, 0.2, 42)
train = data_with_idx.subtractByKey(test)
train_data = train.map(lambda index_point:index_point[1])
test_data = test.map(lambda index_point:index_point[1])
train_size = train_data.count()
test_size = test_data.count()
print ("Training data size: %d" % train_size)
print ("Test data size: %d" % test_size)
print ("Total data size: %d " % num_data)
print ("Train + Test size : %d" % (train_size + test_size))
Step 6: Define the model evaluation metric
# Metric: RMSLE (root mean squared log error)
def squared_log_error(pred, actual):
    return (np.log(pred + 1) - np.log(actual + 1)) ** 2

# Train a linear model with the given hyperparameters and return its RMSLE on the test set
def evaluate(train, test, iterations, step, regParam, regType, intercept):
    model = LinearRegressionWithSGD.train(train, iterations, step, regParam=regParam, regType=regType, intercept=intercept)
    tp = test.map(lambda p: (p.label, model.predict(p.features)))
    rmsle = np.sqrt(tp.map(lambda lp: squared_log_error(lp[0], lp[1])).mean())
    return rmsle
Step 7: Tune the parameters and pick the best configuration. Tunable parameters include: iterations, step size, L1/L2 regularization strength, and the intercept.
# Example: tuning the number of iterations
params = [1, 5, 10, 20, 50, 100, 200, 300]
metrics = [evaluate(train_data, test_data, param, 0.01, 0.0, 'l2', False) for param in params]
print( params)
print( metrics)
plt.plot(params, metrics)
fig = plt.gcf()
plt.xscale('log')
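The other hyperparameters can be swept the same way. For example, step size, holding iterations fixed (the candidate values below are illustrative, not tuned):
# Example: tuning the SGD step size, with iterations fixed at 10
params = [0.01, 0.025, 0.05, 0.1, 0.5, 1.0]
metrics = [evaluate(train_data, test_data, 10, param, 0.0, 'l2', False) for param in params]
print(params)
print(metrics)
plt.plot(params, metrics)
plt.xscale('log')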
Step 8: Summarize the best model parameters and results from the sweeps above.
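A closing sketch of retraining with the selected configuration; the hyperparameter values here are placeholders, to be replaced by whatever the sweeps above selected:
# Retrain with the chosen hyperparameters (placeholder values) and report the final RMSLE
best_rmsle = evaluate(train_data, test_data, 100, 0.1, 0.0, 'l2', True)
print("Final model RMSLE: %f" % best_rmsle)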
Done