ML之LogisticRegression

ML 逻辑回归 (Logistic Regression)

1. 数据输入:

tips.csv

1,1,1
1,1.1,0.9
1,1,1.2
2,10,11
2,9,10
2,10,12
3,50,52
3,49,50
3,48,49

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline, PipelineModel

# Load the headerless CSV: the first column is the label, the rest are features.
# NOTE(review): assumes `sqlContext` and `dataPath` are defined by the surrounding
# Spark session setup — confirm in context.
data = sqlContext.read.load(dataPath, format='com.databricks.spark.csv', inferSchema='true')
data = data.withColumnRenamed('_c0', 'label')

# Feature columns are '_c1' .. '_cN'. Build the list directly instead of
# concatenating a comma-separated string and splitting it back apart.
all_feats = ['_c' + str(i) for i in range(1, len(data.columns))]

# Assemble every feature column into a single 'features' vector column.
assemblerAllFeatures = VectorAssembler(inputCols=all_feats, outputCol='features')
pipeline = Pipeline(stages=[assemblerAllFeatures])
pipelineModel = pipeline.fit(data)
output = pipelineModel.transform(data)
df = output.select('label', 'features')


2. 训练模型

def logisticRegression(df, arguments):
    """Fit a logistic regression model on *df*.

    Only supports binary classification.

    :param df: DataFrame with 'label' and 'features' columns.
    :param arguments: object whose optional string attributes ``maxIter``,
        ``regParam`` and ``elasticNetParam`` override the defaults below;
        a ``None`` attribute keeps the default.
    :return: the fitted ``LogisticRegressionModel``.
    """
    from pyspark.ml.classification import LogisticRegression

    maxIter = 100
    regParam = 0.0
    elasticNetParam = 0.0
    if arguments.maxIter is not None:
        # maxIter is an iteration count — coerce to int, not float.
        # (float() first so inputs like "100.0" still parse.)
        maxIter = int(float(arguments.maxIter))
    if arguments.regParam is not None:
        regParam = float(arguments.regParam)
    if arguments.elasticNetParam is not None:
        elasticNetParam = float(arguments.elasticNetParam)

    lr = LogisticRegression(maxIter=maxIter,
                            regParam=regParam,
                            elasticNetParam=elasticNetParam)
    return lr.fit(df)


# Persist the trained model, overwriting any existing model at the path.
# NOTE(review): `arguments` and `model` come from the surrounding context —
# `model` is presumably the return value of logisticRegression(); confirm.
modelPath = arguments.modelPath
model.write().overwrite().save(modelPath)

3. 预测输入数据

# Build a single-row DataFrame whose 'features' column is a dense vector parsed
# from the comma-separated string `dataSet` (e.g. "1,1.2").
# NOTE(review): assumes `sc` (SparkContext), `Row`, `Vectors` and `dataSet`
# are in scope from the surrounding article — confirm in context.
df=sc.parallelize([Row(features=Vectors.dense([float(x) for x in dataSet.split(',')]))]).toDF()

预测:

from pyspark.ml.classification import LogisticRegressionModel

# Reload the persisted model from disk.
model = LogisticRegressionModel.load(modelPath)

# Score the single-row prediction DataFrame built above (bound to `df`,
# not the training `data` — the original text transformed the wrong variable).
result = model.transform(df).head()
str_value = str(result.prediction)

# Context manager guarantees the file is closed even if write() raises.
with open("/tmp/foo.txt", "w") as fo:
    fo.write(str_value)


你可能感兴趣的:(hadoop/hive)