Spark MLlib 下的逻辑回归二元分类
训练模型
导入必要的包
import numpy as np
import pyspark
from matplotlib import pyplot as plt
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics
演示回归曲线
# Demo of a regression line: exact targets versus noisy observations.
x = np.linspace(0, 1000, 100)
# Noise-free line y = 4x + 5.
y_t = 4 * x + 5
# Add integer noise drawn from [-1000, 1000) to every point.
y_r = y_t + np.random.randint(-1000, 1000, 100)
plt.plot(x, y_t, linestyle="-", color="r")
plt.scatter(x, y_r)

初始化spark的上下文对象
# Reuse the active SparkContext if there is one, so that re-running this cell
# does not fail; constructing SparkContext directly raises when a context is
# already running. The configuration is only applied when a new context is
# created.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster("local[*]").setAppName("StumbleuponAnalysis")
)
准备数据
def extract_features(fields, categories_dict, end):
    """Build the feature vector for one parsed row.

    The result is a one-hot encoding of the category stored in ``fields[3]``
    followed by the numeric columns ``fields[4:end]``.

    Args:
        fields: Column values of a single row, as strings.
        categories_dict: Mapping from category name to its one-hot position.
        end: Exclusive end index of the numeric columns to include.

    Returns:
        A 1-D NumPy array holding ``len(categories_dict)`` one-hot slots
        followed by the numeric values.

    Raises:
        ValueError: If a numeric column is neither ``"?"`` nor parseable as
            a float.
    """
    category_features = np.zeros(len(categories_dict))
    # A category that is missing from the dictionary (e.g. one that only
    # occurs in a different file) keeps an all-zero one-hot segment instead
    # of raising KeyError.
    category_id = categories_dict.get(fields[3])
    if category_id is not None:
        category_features[category_id] = 1
    # "?" marks a missing value in the data and is replaced by 0.0.
    numerical_features = [0.0 if f == "?" else float(f) for f in fields[4:end]]
    return np.concatenate((category_features, numerical_features))
def parpare_data(sc, scale, path="file:/home/zh123/.jupyter/workspace/stumbleupon/train.tsv"):
    """Load the labelled TSV file and turn it into standardized LabeledPoints.

    Args:
        sc: SparkContext used to read the file.
        scale: Weights handed to ``randomSplit`` to partition the data.
        path: Location of the tab-separated training file. Defaults to the
            path this notebook was written against.

    Returns:
        A tuple ``(splits, categories_dict)``: the list of RDDs produced by
        ``randomSplit(scale)`` and the category-name -> index mapping used
        for the one-hot encoding.
    """
    raw_lines_and_header = sc.textFile(path)
    # The first line is the header; drop every line equal to it.
    header_line = raw_lines_and_header.first()
    raw_non_header_data = raw_lines_and_header.filter(lambda l: l != header_line)
    # Strip the double quotes around values, then split on tabs.
    raw_non_quot_lines = raw_non_header_data.map(lambda l: l.replace("\"", ""))
    raw_data = raw_non_quot_lines.map(lambda l: l.split("\t"))
    print("数据长度:", raw_data.count())
    # Give every distinct category (column 3) its own index.
    categories_dict = raw_data.map(lambda field: field[3]).distinct().zipWithIndex().collectAsMap()
    # The label is the last column; features stop just before it.
    label_rdd = raw_data.map(lambda fields: float(fields[-1]))
    feature_rdd = raw_data.map(lambda fields: extract_features(fields, categories_dict, len(fields) - 1))
    # Standardize the features with a scaler fitted on this file.
    std_scaler = StandardScaler(withMean=True, withStd=True).fit(feature_rdd)
    scaler_features = std_scaler.transform(feature_rdd)
    label_point = label_rdd.zip(scaler_features)
    label_point_rdd = label_point.map(lambda r: LabeledPoint(r[0], r[1]))
    return label_point_rdd.randomSplit(scale), categories_dict
模型评估
定义评估模型AUC值的函数
def evaluate_model(model, validation_data):
    """Return the area under the ROC curve of ``model`` on ``validation_data``.

    Args:
        model: Trained model exposing ``predict`` for an RDD of features.
        validation_data: RDD whose items expose ``features`` and ``label``.

    Returns:
        The ``areaUnderROC`` value reported by BinaryClassificationMetrics.
    """
    features = validation_data.map(lambda point: point.features)
    labels = validation_data.map(lambda point: point.label)
    # NOTE(review): presumably predict() yields thresholded 0/1 classes unless
    # the model's threshold was cleared, so the AUC is based on hard
    # predictions rather than raw scores - confirm this is intended.
    scores = model.predict(features).map(lambda score: float(score))
    score_and_label = scores.zip(labels)
    return BinaryClassificationMetrics(score_and_label).areaUnderROC
定义综合模型评估函数
import time
def train_evaluate_model(train_data, validation_data, numIterations, stepSize, miniBatchFraction):
    """Train one logistic-regression model and score it on the validation set.

    Args:
        train_data: RDD of LabeledPoint used for training.
        validation_data: RDD of LabeledPoint passed to ``evaluate_model``.
        numIterations: Number of SGD iterations.
        stepSize: SGD step size.
        miniBatchFraction: Fraction of the data used per iteration.

    Returns:
        The tuple ``(model, AUC, duration, numIterations, stepSize,
        miniBatchFraction)``, where ``duration`` is the training time in
        seconds and ``AUC`` is the value returned by ``evaluate_model``.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    model = LogisticRegressionWithSGD.train(
        train_data,
        iterations=numIterations,
        step=stepSize,
        miniBatchFraction=miniBatchFraction,
    )
    duration = time.perf_counter() - start_time
    AUC = evaluate_model(model, validation_data)
    return (model, AUC, duration, numIterations, stepSize, miniBatchFraction)
定义评估参数的函数
import pandas as pd
def evaluate_parameter(train_data, validation_data, numIterationsList, stepSizeList, miniBatchFractionList):
    """Train and evaluate a model for every combination of the given values.

    Args:
        train_data: Training RDD forwarded to ``train_evaluate_model``.
        validation_data: Validation RDD forwarded to ``train_evaluate_model``.
        numIterationsList: Candidate iteration counts.
        stepSizeList: Candidate step sizes.
        miniBatchFractionList: Candidate mini-batch fractions.

    Returns:
        A DataFrame with one row per combination and the columns ``Model``,
        ``AUC``, ``Duration``, ``numIterations``, ``stepSize`` and
        ``miniBatchFraction``. When exactly one list has more than one value,
        that list is used as the index; otherwise the default integer index
        is used.
    """
    columns = ["Model", "AUC", "Duration", "numIterations", "stepSize", "miniBatchFraction"]
    metrics = [
        train_evaluate_model(train_data, validation_data, numIterations, stepSize, miniBatchFraction)
        for numIterations in numIterationsList
        for stepSize in stepSizeList
        for miniBatchFraction in miniBatchFractionList
    ]
    # Only a single varying parameter can label the rows one-to-one. With
    # several varying lists (or an empty grid) the list length differs from
    # the number of rows, which would make pandas reject the index.
    varying = [
        values
        for values in (numIterationsList, stepSizeList, miniBatchFractionList)
        if len(values) > 1
    ]
    index = None
    if len(varying) == 1 and len(varying[0]) == len(metrics):
        index = varying[0]
    return pd.DataFrame(metrics, index=index, columns=columns)
获取训练数据,验证数据,测试数据
# Prepare the data and split it with weights 8:1:1 into training, validation
# and test sets; the category dictionary is kept for encoding other files.
(train_data, validation_data, test_data), categories_dict = parpare_data(sc, scale=[8, 1, 1])
# Mark every split to be kept by Spark once it has been computed.
train_data.persist()
validation_data.persist()
test_data.persist()
数据长度: 7395
PythonRDD[4739] at RDD at PythonRDD.scala:52
评估 numIterations参数影响
训练模型并获取评估参数表
# Sweep numIterations over 1, 6, ..., 46 with stepSize and miniBatchFraction fixed.
evaluate_table = evaluate_parameter(
    train_data,
    validation_data,
    numIterationsList=list(range(1, 50, 5)),
    stepSizeList=[10],
    miniBatchFractionList=[1],
)
evaluate_table
| | Model | AUC | Duration | numIterations | stepSize | miniBatchFraction |
|---|---|---|---|---|---|---|
| 1 | (weights=[0.6677226910837364,-0.69951944405741... | 0.664205 | 0.542155 | 1 | 10 | 1 |
| 6 | (weights=[0.28810190368216665,-0.3890579409906... | 0.603375 | 0.149749 | 6 | 10 | 1 |
| 11 | (weights=[0.2982103093226861,-0.30009276222335... | 0.637453 | 0.186136 | 11 | 10 | 1 |
| 16 | (weights=[0.2590246366263148,-0.27478234116180... | 0.690569 | 0.213902 | 16 | 10 | 1 |
| 21 | (weights=[0.25133027462275814,-0.2542369719546... | 0.696628 | 0.267709 | 21 | 10 | 1 |
| 26 | (weights=[0.24840617513903634,-0.2527605271207... | 0.697719 | 0.317076 | 26 | 10 | 1 |
| 31 | (weights=[0.2480626698782132,-0.25281749529624... | 0.693588 | 0.355656 | 31 | 10 | 1 |
| 36 | (weights=[0.24788753296317756,-0.2530393653347... | 0.693588 | 0.488446 | 36 | 10 | 1 |
| 41 | (weights=[0.24788753296317756,-0.2530393653347... | 0.693588 | 0.362525 | 41 | 10 | 1 |
| 46 | (weights=[0.24788753296317756,-0.2530393653347... | 0.693588 | 0.378403 | 46 | 10 | 1 |
根据评估参数表绘制图像
# Bars show the AUC per parameter value (left axis); the red line shows the
# training duration on a second y-axis sharing the same x-axis.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(
    evaluate_table.index,
    evaluate_table["AUC"],
    width=4,
    color="c",
    tick_label=evaluate_table.index,
    label="AUC",
)
ax.set_ylim(0.6, 0.7)
ax2 = ax.twinx()
ax2.plot(
    evaluate_table.index,
    evaluate_table["Duration"],
    color="r",
    marker="o",
    label="Duration",
)
ax.grid()
# Anchor the shared legend at the upper-right corner of the axes.
fig.legend(loc="upper right", bbox_to_anchor=(1, 1), bbox_transform=ax.transAxes)

评估 stepSize 参数的影响
训练模型并获取评估参数表
# Sweep stepSize over 10, 25, ..., 190 with numIterations and miniBatchFraction fixed.
evaluate_table = evaluate_parameter(
    train_data,
    validation_data,
    numIterationsList=[26],
    stepSizeList=list(range(10, 200, 15)),
    miniBatchFractionList=[1],
)
evaluate_table
| | Model | AUC | Duration | numIterations | stepSize | miniBatchFraction |
|---|---|---|---|---|---|---|
| 10 | (weights=[0.24840617513903634,-0.2527605271207... | 0.697719 | 0.306683 | 26 | 10 | 1 |
| 25 | (weights=[0.40103746760777653,-0.4924966686183... | 0.591412 | 0.305612 | 26 | 25 | 1 |
| 40 | (weights=[0.5409425093445586,-0.77344879343874... | 0.564893 | 0.311465 | 26 | 40 | 1 |
| 55 | (weights=[0.6844234097438462,-1.09699570420703... | 0.559457 | 0.418840 | 26 | 55 | 1 |
| 70 | (weights=[0.8379207450635585,-1.43000712772985... | 0.557723 | 0.299107 | 26 | 70 | 1 |
| 85 | (weights=[1.0323510305921046,-1.76105166506314... | 0.571635 | 0.288278 | 26 | 85 | 1 |
| 100 | (weights=[1.313234120315815,-2.091223074965485... | 0.590554 | 0.304034 | 26 | 100 | 1 |
| 115 | (weights=[1.5106494358271485,-2.37554034126727... | 0.590554 | 0.288630 | 26 | 115 | 1 |
| 130 | (weights=[1.6808460801490464,-2.64560901166279... | 0.586638 | 0.323949 | 26 | 130 | 1 |
| 145 | (weights=[1.846760000240688,-2.914826089181457... | 0.585547 | 0.307586 | 26 | 145 | 1 |
| 160 | (weights=[2.0073226982616266,-3.18046915476317... | 0.581202 | 0.305315 | 26 | 160 | 1 |
| 175 | (weights=[2.1580796544605683,-3.43464112632351... | 0.570992 | 0.295500 | 26 | 175 | 1 |
| 190 | (weights=[2.295776697917227,-3.674935300385708... | 0.565770 | 0.337451 | 26 | 190 | 1 |
根据评估参数表绘制图像
# Bars show the AUC per step size (left axis); the red line shows the training
# duration on a second y-axis sharing the same x-axis.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(evaluate_table.index, evaluate_table["AUC"], color="c", tick_label=evaluate_table.index, label="AUC", width=6)
# Derive the y-range from the data: a fixed lower bound of 0.6 hides every bar
# whose AUC is below it, which is most of this sweep.
ax.set_ylim(evaluate_table["AUC"].min() - 0.01, evaluate_table["AUC"].max() + 0.01)
ax2 = ax.twinx()
ax2.plot(evaluate_table.index, evaluate_table["Duration"], c="r", label="Duration", marker="o")
# Anchor the shared legend at the upper-right corner of the axes.
fig.legend(loc=1, bbox_to_anchor=(1, 1), bbox_transform=ax.transAxes)

评估miniBatchFraction 参数影响
训练模型并获取评估参数表
# Sweep miniBatchFraction over five evenly spaced values in [0.1, 1] with
# numIterations and stepSize fixed.
evaluate_table = evaluate_parameter(
    train_data,
    validation_data,
    numIterationsList=[26],
    stepSizeList=[10],
    miniBatchFractionList=np.linspace(0.1, 1, 5),
)
evaluate_table
| | Model | AUC | Duration | numIterations | stepSize | miniBatchFraction |
|---|---|---|---|---|---|---|
| 0.100 | (weights=[0.22432239986157868,-0.2165393087222... | 0.682073 | 0.293671 | 26 | 10 | 0.100 |
| 0.325 | (weights=[0.25329319340814027,-0.2708727029103... | 0.702727 | 0.273905 | 26 | 10 | 0.325 |
| 0.550 | (weights=[0.24474754141432709,-0.2484500877818... | 0.693803 | 0.276777 | 26 | 10 | 0.550 |
| 0.775 | (weights=[0.25171480871609914,-0.2515106513891... | 0.702064 | 0.292244 | 26 | 10 | 0.775 |
| 1.000 | (weights=[0.24840617513903634,-0.2527605271207... | 0.697719 | 0.280513 | 26 | 10 | 1.000 |
根据评估参数表绘制图像
# Bars show the AUC per mini-batch fraction (left axis); the red line shows
# the training duration on a second y-axis sharing the same x-axis.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(
    evaluate_table.index,
    evaluate_table["AUC"],
    width=0.1,
    color="c",
    tick_label=evaluate_table.index,
    label="AUC",
)
ax.set_ylim(0.6, 0.75)
ax2 = ax.twinx()
ax2.plot(
    evaluate_table.index,
    evaluate_table["Duration"],
    color="r",
    marker="o",
    label="Duration",
)
# Anchor the shared legend at the upper-right corner of the axes.
fig.legend(loc="upper right", bbox_to_anchor=(1, 1), bbox_transform=ax.transAxes)

测试模型
导入测试集
def loadTestData(sc, path="file:/home/zh123/.jupyter/workspace/stumbleupon/test.tsv"):
    """Load the unlabelled TSV file as ``(url, standardized features)`` pairs.

    Categories are encoded with the module-level ``categories_dict``.

    Args:
        sc: SparkContext used to read the file.
        path: Location of the tab-separated test file. Defaults to the path
            this notebook was written against.

    Returns:
        An RDD of ``(url, feature_vector)`` tuples, where the URL is the
        first column of each row.
    """
    raw_lines_and_header = sc.textFile(path)
    # The first line is the header; drop every line equal to it.
    header_line = raw_lines_and_header.first()
    raw_non_header_data = raw_lines_and_header.filter(lambda l: l != header_line)
    # Strip the double quotes around values, then split on tabs.
    raw_non_quot_lines = raw_non_header_data.map(lambda l: l.replace("\"", ""))
    raw_data = raw_non_quot_lines.map(lambda l: l.split("\t"))
    print("数据长度:", raw_data.count())
    web_url_rdd = raw_data.map(lambda fields: fields[0])
    # Every column from index 4 onward is used as a feature here (no label
    # column is excluded, unlike the training loader).
    feature_rdd = raw_data.map(lambda fields: extract_features(fields, categories_dict, len(fields)))
    # NOTE(review): the scaler is fitted on this file itself, so the features
    # are standardized with different statistics than the training data -
    # confirm whether the training scaler should be reused instead.
    std_scaler = StandardScaler(withMean=True, withStd=True).fit(feature_rdd)
    scaler_features = std_scaler.transform(feature_rdd)
    test_point_rdd = web_url_rdd.zip(scaler_features)
    return test_point_rdd
# Load the test file as (url, feature vector) pairs.
test_file_data = loadTestData(sc)
# Show the first pair as a quick sanity check.
test_file_data.first()
数据长度: 3171
('http://www.lynnskitchenadventures.com/2009/04/homemade-enchilada-sauce.html',
DenseVector([-0.355, -0.2496, -0.7015, -0.3917, -0.1041, -0.2274, -0.21, -0.059, -0.1056, 0.0, 0.0, 2.3909, -0.2594, -0.1983, 0.1364, -0.021, -0.3888, 0.3429, -0.4867, -0.3604, -0.3208, 0.342, 0.0, 0.2093, -0.1513, -0.1, -0.0436, 0.7933, 0.7491, -0.7269, -0.2042, -0.0052, -0.2303, -0.5689, 0.406, -0.2558]))
加载最终的模型
# Take the model from the first row of the current evaluation table whose AUC
# equals the table's maximum AUC.
model = (
    evaluate_table
    .loc[evaluate_table["AUC"] == evaluate_table["AUC"].max(), "Model"]
    .values[0]
)
使用模型进行预测
# Predict on 10 randomly chosen test pages. takeSample() returns exactly the
# requested number of rows and needs no hard-coded total row count, unlike
# weighting randomSplit() by the dataset size.
for url, features in test_file_data.takeSample(False, 10):
    print(url, bool(model.predict(features)))
http://www.youbeauty.com/body-fitness/dressing-for-your-body-type?page=2 False
http://www.couponingncooking.com/2012/03/super-easy-whole-chicken-in-crock-pot.html True
http://www.rsvlts.com/2012/08/04/inside-the-london-olympics-week-one-62-high-quality-photos/ False
http://backtoherroots.com/2011/08/04/90-second-nutella-chocolate-cake/ True
http://cathlincooks.blogspot.com/ True
http://www.cheapcooking.com/articles/healthy-school-lunch-ideas.htm True
http://www.ted.com/index.php/talks/hans_rosling_shows_the_best_stats_you_ve_ever_seen.html False
http://www.break.com/index/hot-girls-risky-business-fail.html True
http://www.salon.com/2010/04/03/toasted_peeps_brulee_recipe/ True
http://www.joepastry.com/category/pastry/charlotte/ True
http://www.behance.net/leon_farrant/frame/2878481 True
http://www.wimp.com/pageturner/ False