import sys
import time
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkConf,SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.feature import StandardScaler
def extract_features(fleld, catedoriesMap, featureEnd):
    """Build the feature vector for one record.

    Concatenates a one-hot encoding of the category column (index 3) with
    the numeric columns 4..featureEnd-1 ("?" values are converted to 0 by
    convert_float).

    fleld         -- list of string fields for one record
    catedoriesMap -- dict mapping category name -> one-hot index
    featureEnd    -- exclusive end index of the numeric feature columns
    """
    # One-hot encode the categorical column.
    category_idx = catedoriesMap[fleld[3]]
    category_features = np.zeros(len(catedoriesMap))
    category_features[category_idx] = 1
    # Use a distinct loop variable: the original comprehension shadowed the
    # `fleld` parameter, which was confusing even though it happened to work.
    numerical_features = [convert_float(value) for value in fleld[4:featureEnd]]
    return np.concatenate((category_features, numerical_features))
def extract_label(field):
    """Return the last column of a record as a float label (0.0 / 1.0)."""
    return float(field[-1])
def convert_float(x):
    """Convert a raw dataset field to float.

    The train.tsv data uses "?" to mark missing values; return 0.0 for
    those so the return type is consistently float (the original returned
    the int 0 for "?").
    """
    return 0.0 if x == "?" else float(x)
# Acquire the SparkContext. The original script assumed a pyspark shell where
# `sc` is predefined; getOrCreate() reuses an existing context in the shell
# and creates one when run as a standalone script.
sc = SparkContext.getOrCreate(SparkConf())

# Data location depends on whether we run locally or against HDFS.
if sc.master[0:5] == 'local':
    Path = 'file:/home/swt/pythonwork/PythonProject/'
else:
    Path = "hdfs://localhost:9000/user/swt/"

print('load data...')
# Load the TSV, drop the header row, strip quote characters and split into fields.
rawDataWithHeader = sc.textFile(Path + 'data/train.tsv')
header = rawDataWithHeader.first()
rawData = rawDataWithHeader.filter(lambda x: x != header)
rData = rawData.map(lambda x: x.replace("\"", ""))
lines = rData.map(lambda x: x.split("\t"))
print("is " + str(lines.count()))
# NOTE(review): the pasted REPL output that was embedded here (the literal
# DenseVector/LabeledPoint dumps) has been removed — it was not valid code.

# Map each distinct category name (column 3) to a one-hot index.
categoriesMap = lines.map(lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()

labelRDD = lines.map(lambda r: extract_label(r))
print(labelRDD.take(3))

# Build feature vectors and standardize them to zero mean / unit variance
# before SGD training (SGD is sensitive to feature scale).
featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
ScalerFeatureRDD = stdScaler.transform(featureRDD)

# Pair each label with its scaled feature vector, then wrap as LabeledPoint.
labelpoint = labelRDD.zip(ScalerFeatureRDD)
print('label', labelpoint.take(2))
labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
print('labelRDD', labelpointRDD.take(3))
# 80/10/10 random split for train / validation / test; persist each split
# since they are reused across actions.
(trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
trainData.persist()
validationData.persist()
testData.persist()

start_time = time.time()
# Positional args: iterations=15, step=10, miniBatchFraction=0.5.
# NOTE(review): step=10 is an unusually large SGD step size — confirm it is
# intentional and not a transposed hyperparameter.
model = LogisticRegressionWithSGD.train(trainData, 15, 10, 0.5)

# Score the validation set and pair predictions with true labels.
score = model.predict(validationData.map(lambda p: p.features))
score = score.map(lambda x: float(x))
scoreAndLabels = score.zip(validationData.map(lambda p: p.label))

# BinaryClassificationMetrics is already imported at the top of the file;
# the duplicate in-line import and the pasted `auc ...` console output
# (which was a SyntaxError) have been removed.
metrics = BinaryClassificationMetrics(scoreAndLabels)
AUC = metrics.areaUnderROC
print('auc', AUC)