The LogisticRegression algorithm in PySpark

import sys
import time
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkConf,SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.feature import StandardScaler

def extract_features(field, categoriesMap, featureEnd):
    # One-hot encode the category column (index 3) using categoriesMap
    categoryIdx = categoriesMap[field[3]]
    categoryFeatures = np.zeros(len(categoriesMap))
    categoryFeatures[categoryIdx] = 1
    # Convert the remaining numeric columns, treating "?" as 0
    numericalFeatures = [convert_float(x) for x in field[4:featureEnd]]
    return np.concatenate((categoryFeatures, numericalFeatures))

def extract_label(field):
    # The label is the last column of the row
    return float(field[-1])

def convert_float(x):
    # Missing values are marked "?" in the data set; map them to 0
    return 0 if x == "?" else float(x)
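A quick illustration of these helpers on a hypothetical row (the category map and the row values below are made up for the example; the real categoriesMap is built further down):

# Hypothetical data: field[3] is the category, field[4:-1] are numeric columns
# (with "?" marking missing values) and field[-1] is the label
toyCategoriesMap = {"business": 0, "sports": 1}
toyRow = ["http://example.com", "1234", "{...}", "business", "0.5", "?", "2.3", "1"]
print(extract_features(toyRow, toyCategoriesMap, len(toyRow) - 1))   # roughly [1. 0. 0.5 0. 2.3]
print(extract_label(toyRow))                                         # 1.0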

# Choose the data path depending on whether Spark runs locally or on a cluster
global Path
if sc.master[0:5] == 'local':
    Path = 'file:/home/swt/pythonwork/PythonProject/'
else:
    Path = "hdfs://localhost:9000/user/swt/"

# The steps below could be wrapped into a prepare_data(sc) function
print('load data...')
rawDataWithHeader = sc.textFile(Path+'data/train.tsv')
header = rawDataWithHeader.first()
rawData = rawDataWithHeader.filter(lambda x:x != header)
rData = rawData.map(lambda x:x.replace("\"",""))
lines = rData.map(lambda x:x.split("\t"))
print("is "+str(lines.count()))

# Build a map from each category string (column 3) to a numeric index
categoriesMap = lines.map(lambda fields:fields[3]).distinct().zipWithIndex().collectAsMap()
# Extract the label values
labelRDD = lines.map(lambda r:extract_label(r))
print(labelRDD.take(3))

[0.0, 1.0, 1.0]
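categoriesMap maps every distinct category string in column 3 to a consecutive index, which extract_features uses for the one-hot encoding. A toy illustration of the distinct().zipWithIndex().collectAsMap() pattern (category names are made up, and the index order is not guaranteed):

toyMap = sc.parallelize(["business", "sports", "business", "health"]).distinct().zipWithIndex().collectAsMap()
print(toyMap)   # e.g. {'business': 0, 'sports': 1, 'health': 2}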
# Extract the feature vectors
featureRDD = lines.map(lambda r:extract_features(r,categoriesMap,len(r)-1))

# Standardize the features (zero mean, unit variance per column)
stdScaler = StandardScaler(withMean=True,withStd=True).fit(featureRDD)
ScalerFeatureRDD = stdScaler.transform(featureRDD)
# Zip labels and standardized features back together
labelpoint = labelRDD.zip(ScalerFeatureRDD)
print('label',labelpoint.take(2))
# As the output shows, the features are still plain DenseVectors at this point
label [(0.0, DenseVector([2.7207, -0.2327, -0.6808, -0.3818, -0.1019, -0.2205, -0.2042, -0.0649, -0.0991, -0.0233, -0.0285, -0.4464, -0.271, -0.2017, 1.1376, -0.0819, 1.0251, -0.0559, -0.4689, -0.3543, -0.3175, 0.3385, 0.0, 0.8288, -0.1473, 0.2296, -0.1416, 0.7902, 0.7172, -0.298, -0.2035, -0.033, -0.0488, 0.9401, -0.1087, -0.2788])), (1.0, DenseVector([-0.3675, -0.2327, -0.6808, -0.3818, -0.1019, -0.2205, -0.2042, -0.0649, -0.0991, -0.0233, -0.0285, 2.2397, -0.271, -0.2017, 0.4887, 0.1063, 0.1959, 0.509, 1.2695, 1.3097, -0.3132, 0.3385, 0.0, 1.0202, -0.1473, -0.5771, -0.0975, 0.7902, 0.7172, 0.4866, -0.2035, -0.0838, 0.0459, 1.2494, 0.0489, 0.3058]))]
# Convert each (label, features) pair into a LabeledPoint
labelpointRDD = labelpoint.map(lambda r:LabeledPoint(r[0],r[1]))
print('labelRDD',labelpointRDD.take(3))
# Now in LabeledPoint format
labelRDD [LabeledPoint(0.0, [2.7207366564548514,-0.23272797709480803,-0.6807527904251456,-0.38181322324318134,-0.10189469097220732,-0.22052688457880879,-0.20418221057887365,-0.06487757239262681,-0.09914991930875496,-0.02326210589837061,-0.028494000387023734,-0.4464212047941535,-0.2709990696925828,-0.2016540523193296,1.137647336497678,-0.08193557169294771,1.0251398128933331,-0.05586356442541689,-0.4688932531289357,-0.3543053263079386,-0.3175352172363148,0.3384507982396541,0.0,0.828822173315322,-0.14726894334628504,0.22963982357813484,-0.14162596909880876,0.7902380499177364,0.7171947294529865,-0.29799681649642257,-0.2034625779299476,-0.03296720969690391,-0.04878112975579913,0.9400699751165439,-0.10869848852526258,-0.2788207823137022]), LabeledPoint(1.0, [-0.3674978139186906,-0.23272797709480803,-0.6807527904251456,-0.38181322324318134,-0.10189469097220732,-0.22052688457880879,-0.20418221057887365,-0.06487757239262681,-0.09914991930875496,-0.02326210589837061,-0.028494000387023734,2.2397340510665176,-0.2709990696925828,-0.2016540523193296,0.4886859904169113,0.10628363705145247,0.19588566290866805,0.5089868068250981,1.2694691632834691,1.3097138984590067,-0.31317609057749013,0.3384507982396541,0.0,1.020243830531209,-0.14726894334628504,-0.5770724205625781,-0.09745981080144801,0.7902380499177364,0.7171947294529865,0.4865822517691842,-0.2034625779299476,-0.08378163520013758,0.04594422902162049,1.2493695598285408,0.04885342046314602,0.3057802219012584]), LabeledPoint(1.0, [-0.3674978139186906,-0.23272797709480803,-0.6807527904251456,-0.38181322324318134,-0.10189469097220732,-0.22052688457880879,-0.20418221057887365,-0.06487757239262681,-0.09914991930875496,-0.02326210589837061,-0.028494000387023734,-0.4464212047941535,3.6895505753205593,-0.2016540523193296,1.7637001514533053,-0.04396165033238008,0.46169187416320356,0.7334297297958666,0.29269849146593974,-0.09123800981506591,-0.3032188826367953,0.3384507982396541,0.0,0.3866538001107552,-0.14726894334628504,-0.14053895060796384,-0.08084807670648379,0.7902380499177364,0.7171947294529865,1.2221251282681904,-0.2034625779299476,-0.39171029351574205,0.4415619039155495,1.8679687292525349,-0.03381270192906146,-0.5503866803183464])]
# Split the data into training, validation and test sets (the weights are normalized, so roughly 80/10/10)
(trainData,validationData,testData) = labelpointRDD.randomSplit([8,1,1])
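randomSplit also accepts an optional seed, which makes the split reproducible across runs; a minimal variant (not used in the run shown here):

# Same 80/10/10 split, but reproducible thanks to the fixed seed
(trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1], seed=42)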

# Cache the three subsets so they are not recomputed for every action
trainData.persist()
validationData.persist()
testData.persist()

# Train the model and score the validation set
start_time = time.time()
# train(data, iterations=15, step=10, miniBatchFraction=0.5)
model = LogisticRegressionWithSGD.train(trainData, 15, 10, 0.5)
print('training took %.2f seconds' % (time.time() - start_time))
score = model.predict(validationData.map(lambda p: p.features))
score = score.map(lambda x: float(x))

scoreAndLabels=score.zip(validationData.map(lambda p:p.label))

# Compute the AUC; BinaryClassificationMetrics expects (score, label) pairs
metrics = BinaryClassificationMetrics(scoreAndLabels)
AUC = metrics.areaUnderROC
print('auc',AUC)
auc 0.6603715728715729
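Note that model.predict returns hard 0/1 labels by default, while BinaryClassificationMetrics is normally fed continuous scores. As an optional refinement (a sketch reusing the same model and validationData), the threshold can be cleared so predict returns raw probabilities and the ROC curve has more than one operating point:

model.clearThreshold()   # predict() now returns raw probabilities instead of 0/1
rawScore = model.predict(validationData.map(lambda p: p.features)).map(lambda x: float(x))
rawScoreAndLabels = rawScore.zip(validationData.map(lambda p: p.label))
print('auc (raw scores)', BinaryClassificationMetrics(rawScoreAndLabels).areaUnderROC)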
