import findspark
findspark.init()
from pyspark.sql.session import SparkSession
spark = SparkSession.builder.appName('LIXU').getOrCreate()
1. Load the data
Download the data from: http://www.tomdrabas.com/data/LearningPySpark/births_transformed.csv.gz
import pyspark.sql.types as typ
labels = [
('INFANT_ALIVE_AT_REPORT', typ.IntegerType()),
('BIRTH_PLACE', typ.StringType()),
('MOTHER_AGE_YEARS', typ.IntegerType()),
('FATHER_COMBINED_AGE', typ.IntegerType()),
('CIG_BEFORE', typ.IntegerType()),
('CIG_1_TRI', typ.IntegerType()),
('CIG_2_TRI', typ.IntegerType()),
('CIG_3_TRI', typ.IntegerType()),
('MOTHER_HEIGHT_IN', typ.IntegerType()),
('MOTHER_PRE_WEIGHT', typ.IntegerType()),
('MOTHER_DELIVERY_WEIGHT', typ.IntegerType()),
('MOTHER_WEIGHT_GAIN', typ.IntegerType()),
('DIABETES_PRE', typ.IntegerType()),
('DIABETES_GEST', typ.IntegerType()),
('HYP_TENS_PRE', typ.IntegerType()),
('HYP_TENS_GEST', typ.IntegerType()),
('PREV_BIRTH_PRETERM', typ.IntegerType())
]
schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])
births = spark.read.csv('./births_transformed.csv.gz', header=True, schema=schema)
births.head(1)
[Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', MOTHER_AGE_YEARS=29, FATHER_COMBINED_AGE=99, CIG_BEFORE=0, CIG_1_TRI=0, CIG_2_TRI=0, CIG_3_TRI=0, MOTHER_HEIGHT_IN=99, MOTHER_PRE_WEIGHT=999, MOTHER_DELIVERY_WEIGHT=999, MOTHER_WEIGHT_GAIN=99, DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=0)]
2. Create the transformers
import pyspark.ml.feature as ft
births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(typ.IntegerType()))  # cast the string code (e.g. '1') to an integer
encoder = ft.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')  # one-hot encode the birth place (an estimator since Spark 3.0; the Pipeline fits it automatically)
featuresCreator = ft.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')  # assemble all feature columns plus the encoded vector into a single 'features' column
3. Create the logistic regression model
import pyspark.ml.classification as cl
logistic = cl.LogisticRegression(maxIter = 10, regParam = 0.01, labelCol = 'INFANT_ALIVE_AT_REPORT')
4. Create the pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = [encoder, featuresCreator, logistic])
5. Train the model
births_train, births_test = births.randomSplit([0.7, 0.3], seed = 666)
model = pipeline.fit(births_train)
test_model = model.transform(births_test)
test_model.take(1)
[Row(INFANT_ALIVE_AT_REPORT=0, BIRTH_PLACE='1', MOTHER_AGE_YEARS=13, FATHER_COMBINED_AGE=99, CIG_BEFORE=0, CIG_1_TRI=0, CIG_2_TRI=0, CIG_3_TRI=0, MOTHER_HEIGHT_IN=66, MOTHER_PRE_WEIGHT=133, MOTHER_DELIVERY_WEIGHT=135, MOTHER_WEIGHT_GAIN=2, DIABETES_PRE=0, DIABETES_GEST=0, HYP_TENS_PRE=0, HYP_TENS_GEST=0, PREV_BIRTH_PRETERM=0, BIRTH_PLACE_INT=1, BIRTH_PLACE_VEC=SparseVector(9, {1: 1.0}), features=SparseVector(24, {0: 13.0, 1: 99.0, 6: 66.0, 7: 133.0, 8: 135.0, 9: 2.0, 16: 1.0}), rawPrediction=DenseVector([1.0573, -1.0573]), probability=DenseVector([0.7422, 0.2578]), prediction=0.0)]
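The transformed test set keeps every input column and appends rawPrediction, probability, and prediction. The wide Row above is hard to read; a narrower view of just the prediction columns (an optional inspection step, not in the original flow):
test_model.select('INFANT_ALIVE_AT_REPORT', 'probability', 'prediction').show(3, truncate=False)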
6. Evaluate the model
import pyspark.ml.evaluation as ev
evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol = 'probability', labelCol = 'INFANT_ALIVE_AT_REPORT')
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))
0.7401301847095617
0.7139354342365674
7. Save and load the model
pipelinePath = './infant_oneHotEncoder_Logistic_Pipeline'
pipeline.write().overwrite().save(pipelinePath)
loadedPipeline = Pipeline.load(pipelinePath)
loadedPipeline.fit(births_train).transform(births_test).take(1)
from pyspark.ml import PipelineModel
modelPath = './infant_oneHotEncoder_Logistic_PipelineModel'
model.write().overwrite().save(modelPath)
loadedPipelineModel = PipelineModel.load(modelPath)
test_loadedModel = loadedPipelineModel.transform(births_test)
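As a quick sanity check, the reloaded model should reproduce the earlier AUC exactly, since transform is deterministic; reusing the evaluator defined above:
print(evaluator.evaluate(test_loadedModel, {evaluator.metricName: 'areaUnderROC'}))  # should match the 0.7401... printed earlier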
With the pipeline in place, the next step is hyper-parameter tuning.
1. Grid search
import pyspark.ml.tuning as tune
logistic = cl.LogisticRegression(labelCol="INFANT_ALIVE_AT_REPORT")
grid = (tune.ParamGridBuilder()
        .addGrid(logistic.maxIter, [2, 10, 30])
        .addGrid(logistic.regParam, [0.01, 0.05, 0.3])
        .build())
evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol = 'probability', labelCol = 'INFANT_ALIVE_AT_REPORT')
cv = tune.CrossValidator(estimator = logistic, estimatorParamMaps = grid, evaluator = evaluator)
pipeline = Pipeline(stages = [encoder, featuresCreator])
data_transformer = pipeline.fit(births_train)
cvModel = cv.fit(data_transformer.transform(births_train))
data_test = data_transformer.transform(births_test)
results = cvModel.transform(data_test)
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))
results = [
    ([{key.name: value} for key, value in params.items()], metric)
    for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
]
print(sorted(results, key=lambda el: el[1], reverse=True)[0])
0.7404799467361349
0.7158426790526992
([{'regParam': 0.01}, {'maxIter': 30}], 0.7384569581670912)
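Sorting avgMetrics by hand is only needed to inspect the whole grid; CrossValidatorModel also keeps the winning model directly, already refit on the full training set. A minimal alternative:
bestModel = cvModel.bestModel  # logistic regression refit with the best parameter combination
best_results = bestModel.transform(data_test)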
2. Train-validation split
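TrainValidationSplit evaluates each parameter combination against a single random train/validation split rather than k folds, so it is cheaper than CrossValidator. The code for this step does not appear above, only its two metric printouts; a minimal reconstruction, assuming the same evaluator and TrainValidationSplit's default 75/25 trainRatio, with a ChiSqSelector feeding the logistic regression (the selector and numTopFeatures=5 follow the Learning PySpark book's example and are assumptions here):
selector = ft.ChiSqSelector(
    numTopFeatures=5,
    featuresCol=featuresCreator.getOutputCol(),
    outputCol='selectedFeatures',
    labelCol='INFANT_ALIVE_AT_REPORT')  # keep only the 5 features most related to the label
logistic = cl.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT', featuresCol='selectedFeatures')
grid = (tune.ParamGridBuilder()  # rebuild the grid against the new estimator instance
        .addGrid(logistic.maxIter, [2, 10, 30])
        .addGrid(logistic.regParam, [0.01, 0.05, 0.3])
        .build())
tvs = tune.TrainValidationSplit(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)
data_transformer = Pipeline(stages=[encoder, featuresCreator, selector]).fit(births_train)
tvsModel = tvs.fit(data_transformer.transform(births_train))
results = tvsModel.transform(data_transformer.transform(births_test))
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))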
0.7294296314442145
0.703775950281647
Beyond logistic regression, the same pipeline works with other models. First, a random forest classifier; the label is cast to DoubleType, since the classifier expects a numeric (double) label.
import pyspark.sql.functions as func
births = births.withColumn("INFANT_ALIVE_AT_REPORT", func.col("INFANT_ALIVE_AT_REPORT").cast(typ.DoubleType()))
births_train, births_test = births.randomSplit([0.7, 0.3], seed = 666)
classifier = cl.RandomForestClassifier(numTrees=5, maxDepth=5, labelCol='INFANT_ALIVE_AT_REPORT')
pipeline = Pipeline(stages = [encoder, featuresCreator, classifier])
model = pipeline.fit(births_train)
test = model.transform(births_test)
evaluator = ev.BinaryClassificationEvaluator(labelCol='INFANT_ALIVE_AT_REPORT')
print(evaluator.evaluate(test, {evaluator.metricName: "areaUnderROC"}))
print(evaluator.evaluate(test, {evaluator.metricName: "areaUnderPR"}))
0.7671165748668931
0.7367360611074735
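The fitted forest (the last pipeline stage) also exposes per-feature importances, which hint at which inputs drive the prediction; a brief sketch:
rf_model = model.stages[-1]  # the fitted RandomForestClassificationModel
print(rf_model.featureImportances)  # SparseVector indexed by assembled feature position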
import pyspark.ml.tuning as tune
clf = cl.RandomForestClassifier(labelCol="INFANT_ALIVE_AT_REPORT")
grid = (tune.ParamGridBuilder()
        .addGrid(clf.numTrees, [2, 5, 8])
        .addGrid(clf.maxDepth, [3, 5, 7])
        .build())
evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol = 'probability', labelCol = 'INFANT_ALIVE_AT_REPORT')
cv = tune.CrossValidator(estimator = clf, estimatorParamMaps = grid, evaluator = evaluator)
pipeline = Pipeline(stages = [encoder, featuresCreator])
data_transformer = pipeline.fit(births_train)
cvModel = cv.fit(data_transformer.transform(births_train))
data_test = data_transformer.transform(births_test)
results = cvModel.transform(data_test)
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))
results = [
    ([{key.name: value} for key, value in params.items()], metric)
    for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
]
print(sorted(results, key=lambda el: el[1], reverse=True)[0])
0.7780783438870142
0.758032350342556
([{'maxDepth': 7}, {'numTrees': 8}], 0.7751605479831878)
The pipeline approach extends to unsupervised learning as well; here, k-means clustering on the births data.
import pyspark.ml.clustering as clus
kmeans = clus.KMeans(k=5, featuresCol='features')
pipeline = Pipeline(stages = [encoder, featuresCreator, kmeans])
model = pipeline.fit(births_train)
test = model.transform(births_test)
test.groupBy('prediction').agg({'*' : 'count', 'MOTHER_HEIGHT_IN' : 'avg'}).collect()
[Row(prediction=1, avg(MOTHER_HEIGHT_IN)=67.69473684210526, count(1)=475),
Row(prediction=3, avg(MOTHER_HEIGHT_IN)=66.64658634538152, count(1)=249),
Row(prediction=4, avg(MOTHER_HEIGHT_IN)=64.43472584856397, count(1)=2298),
Row(prediction=2, avg(MOTHER_HEIGHT_IN)=83.91154791154791, count(1)=407),
Row(prediction=0, avg(MOTHER_HEIGHT_IN)=64.31597357170618, count(1)=10292)]
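To see what drives the clusters, the fitted KMeansModel (the last pipeline stage) exposes its centroids; a small sketch:
kmeans_model = model.stages[-1]  # the fitted KMeansModel
for i, center in enumerate(kmeans_model.clusterCenters()):
    print(i, center[:3])  # first few coordinates of each centroid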
Clustering also applies to text. The following fits an LDA topic model with two topics on a handful of short documents.
text_data = spark.createDataFrame([
['''To make a computer do anything, you have to write a
computer program. To write a computer program, you have
to tell the computer, step by step, exactly what you want
it to do. The computer then "executes" the program,
following each step mechanically, to accomplish the end
goal. When you are telling the computer what to do, you
also get to choose how it's going to do it. That's where
computer algorithms come in. The algorithm is the basic
technique used to get the job done. Let's follow an
example to help get an understanding of the algorithm
concept.'''],
['''Laptop computers use batteries to run while not
connected to mains. When we overcharge or overheat
lithium ion batteries, the materials inside start to
break down and produce bubbles of oxygen, carbon dioxide,
and other gases. Pressure builds up, and the hot battery
swells from a rectangle into a pillow shape. Sometimes
the phone involved will operate afterwards. Other times
it will die. And occasionally—kapow! To see what's
happening inside the battery when it swells, the CLS team
used an x-ray technology called computed tomography.'''],
['''This technology describes a technique where touch
sensors can be placed around any side of a device
allowing for new input sources. The patent also notes
that physical buttons (such as the volume controls) could
be replaced by these embedded touch sensors. In essence
Apple could drop the current buttons and move towards
touch-enabled areas on the device for the existing UI. It
could also open up areas for new UI paradigms, such as
using the back of the smartphone for quick scrolling or
page turning.'''],
['''The National Park Service is a proud protector of
America’s lands. Preserving our land not only safeguards
the natural environment, but it also protects the
stories, cultures, and histories of our ancestors. As we
face the increasingly dire consequences of climate
change, it is imperative that we continue to expand
America’s protected lands under the oversight of the
National Park Service. Doing so combats climate change
and allows all American’s to visit, explore, and learn
from these treasured places for generations to come. It
is critical that President Obama acts swiftly to preserve
land that is at risk of external threats before the end
of his term as it has become blatantly clear that the
next administration will not hold the same value for our
environment over the next four years.'''],
['''The National Park Foundation, the official charitable
partner of the National Park Service, enriches America’s
national parks and programs through the support of
private citizens, park lovers, stewards of nature,
history enthusiasts, and wilderness adventurers.
Chartered by Congress in 1967, the Foundation grew out of
a legacy of park protection that began over a century
ago, when ordinary citizens took action to establish and
protect our national parks. Today, the National Park
Foundation carries on the tradition of early park
advocates, big thinkers, doers and dreamers—from John
Muir and Ansel Adams to President Theodore Roosevelt.'''],
['''Australia has over 500 national parks. Over 28
million hectares of land is designated as national
parkland, accounting for almost four per cent of
Australia's land areas. In addition, a further six per
cent of Australia is protected and includes state
forests, nature parks and conservation reserves.National
parks are usually large areas of land that are protected
because they have unspoilt landscapes and a diverse
number of native plants and animals. This means that
commercial activities such as farming are prohibited and
human activity is strictly monitored.''']
], ['documents'])
tokenizer = ft.RegexTokenizer(inputCol='documents', outputCol='input_arr', pattern=r'\s+|[,.\"]')  # split on whitespace and punctuation
stopwords = ft.StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='input_stop')
countVectorizer = ft.CountVectorizer(inputCol=stopwords.getOutputCol(), outputCol='input_indexed')  # term-frequency vectors
tokenized = stopwords.transform(tokenizer.transform(text_data))  # tokenize, then drop stop words
countVectorizer.fit(tokenized).transform(tokenized).select('input_indexed').take(2)
clustering = clus.LDA(k=2, optimizer='online', featuresCol=countVectorizer.getOutputCol())
pipeline = Pipeline(stages = [tokenizer, stopwords, countVectorizer, clustering])
topics = pipeline.fit(text_data).transform(text_data)
topics.select('topicDistribution').collect()
[Row(topicDistribution=DenseVector([0.7338, 0.2662])),
Row(topicDistribution=DenseVector([0.0127, 0.9873])),
Row(topicDistribution=DenseVector([0.0191, 0.9809])),
Row(topicDistribution=DenseVector([0.9886, 0.0114])),
Row(topicDistribution=DenseVector([0.9896, 0.0104])),
Row(topicDistribution=DenseVector([0.9794, 0.0206]))]
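To inspect the topics themselves, fit the pipeline once more keeping the model, and map LDA's term indices back through the CountVectorizer vocabulary; a minimal sketch (variable names are illustrative):
lda_model = pipeline.fit(text_data)
vocab = lda_model.stages[2].vocabulary  # CountVectorizerModel vocabulary
for row in lda_model.stages[3].describeTopics(5).collect():
    print([vocab[i] for i in row['termIndices']])  # top-5 terms per topic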
Finally, a regression example: predict MOTHER_WEIGHT_GAIN with a gradient-boosted-trees regressor.
features = ['MOTHER_AGE_YEARS','MOTHER_HEIGHT_IN',
'MOTHER_PRE_WEIGHT','DIABETES_PRE',
'DIABETES_GEST','HYP_TENS_PRE',
'HYP_TENS_GEST', 'PREV_BIRTH_PRETERM',
'CIG_BEFORE','CIG_1_TRI', 'CIG_2_TRI',
'CIG_3_TRI'
]
featuresCreator = ft.VectorAssembler(inputCols=features[1:], outputCol='features')  # note: features[1:] omits MOTHER_AGE_YEARS
selector = ft.ChiSqSelector(numTopFeatures=6, outputCol='selectedFeatures', labelCol='MOTHER_WEIGHT_GAIN')
import pyspark.ml.regression as reg
regressor = reg.GBTRegressor(maxIter=15, maxDepth=3, labelCol='MOTHER_WEIGHT_GAIN')  # featuresCol defaults to 'features', so the selector's 'selectedFeatures' output is not actually consumed here
pipeline = Pipeline(stages = [featuresCreator, selector, regressor])
weightGain = pipeline.fit(births_train)
evaluator = ev.RegressionEvaluator(predictionCol="prediction", labelCol="MOTHER_WEIGHT_GAIN")
print(evaluator.evaluate(weightGain.transform(births_test),{evaluator.metricName: 'r2'}))
0.4886585890292995
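The same evaluator can report error metrics as well as R²; for instance, the root-mean-squared error:
print(evaluator.evaluate(weightGain.transform(births_test), {evaluator.metricName: 'rmse'}))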