OneHot编码 → 标准化 → 主成分分析（PCA）→ 聚类（KMeans）

2019独角兽企业重金招聘Python工程师标准>>>

1.导入包

import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.Dataset

import org.apache.spark.sql.Row

import org.apache.spark.sql.DataFrame

import org.apache.spark.sql.Column

import org.apache.spark.sql.DataFrameReader

import org.apache.spark.rdd.RDD

import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

import org.apache.spark.sql.Encoder

import org.apache.spark.sql.functions._

import org.apache.spark.sql.DataFrameStatFunctions

import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.feature.StringIndexer

import org.apache.spark.ml.feature.OneHotEncoder

import org.apache.spark.ml.feature.VectorAssembler

import org.apache.spark.ml.feature.MinMaxScaler

import org.apache.spark.ml.feature.StandardScaler

import org.apache.spark.ml.feature.PCA

import org.apache.spark.ml.clustering.KMeans

2.导入数据

// Build the SparkSession used by the rest of this script.
// NOTE(review): "spark.some.config.option" looks like a placeholder setting — confirm it is needed.
val spark = {
  val sessionBuilder = SparkSession.builder().appName("Spark SQL basic example")
  sessionBuilder.config("spark.some.config.option", "some-value").getOrCreate()
}

// For implicit conversions like converting RDDs to DataFrames

import spark.implicits._

// Load the Affairs CSV from HDFS with the "header" option enabled.
// No schema is supplied, so every column is read as a string.
val data: DataFrame = {
  val affairsCsvPath = "hdfs://ns1/datafile/wangxiao/Affairs.csv"
  val csvReader = spark.read.format("csv").option("header", true)
  csvReader.load(affairsCsvPath)
}

data: org.apache.spark.sql.DataFrame = [affairs: string, gender: string ... 7 more fields]

// Mark the raw DataFrame for caching; written with () because the call has a side effect,
// matching the dataDF.cache() call later in the script.
data.cache()

res0: data.type = [affairs: string, gender: string ... 7 more fields]

// Print the first 10 rows of the raw (all-string) DataFrame.
data.limit(10).show()

+-------+------+---+------------+--------+-------------+---------+----------+------+

|affairs|gender|age|yearsmarried|children|religiousness|education|occupation|rating|

+-------+------+---+------------+--------+-------------+---------+----------+------+

| 0| male| 37| 10| no| 3| 18| 7| 4|

| 0|female| 27| 4| no| 4| 14| 6| 4|

| 0|female| 32| 15| yes| 1| 12| 1| 4|

| 0| male| 57| 15| yes| 5| 18| 6| 5|

| 0| male| 22| 0.75| no| 2| 17| 6| 3|

| 0|female| 32| 1.5| no| 2| 17| 5| 5|

| 0|female| 22| 0.75| no| 2| 12| 1| 3|

| 0| male| 57| 15| yes| 2| 14| 4| 4|

| 0|female| 32| 15| yes| 4| 16| 1| 2|

| 0| male| 22| 1.5| no| 4| 14| 4| 5|

+-------+------+---+------------+--------+-------------+---------+----------+------+

// Cast the numeric columns to Double and the categorical columns to String.
// Numeric columns are selected first and String columns last, so the two groups stay separate.
val data1 = {
  val numericColumns = Seq(
    "affairs", "age", "yearsmarried", "religiousness", "education", "occupation", "rating")
  val categoricalColumns = Seq("gender", "children")
  // Same cast expressions and column order as the original hand-written select list.
  val castColumns =
    numericColumns.map(name => data(name).cast("Double")) ++
      categoricalColumns.map(name => data(name).cast("String"))
  data.select(castColumns: _*)
}

data1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 7 more fields]

// Print the schema of the cast DataFrame.
data1.printSchema()

root

|-- affairs: double (nullable = true)

|-- age: double (nullable = true)

|-- yearsmarried: double (nullable = true)

|-- religiousness: double (nullable = true)

|-- education: double (nullable = true)

|-- occupation: double (nullable = true)

|-- rating: double (nullable = true)

|-- gender: string (nullable = true)

|-- children: string (nullable = true)

// Print the first 10 rows after casting; show() keeps its parentheses because it is
// side-effecting, consistent with the earlier data.limit(10).show() call.
data1.limit(10).show()

+-------+----+------------+-------------+---------+----------+------+------+--------+

|affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children|

+-------+----+------------+-------------+---------+----------+------+------+--------+

| 0.0|37.0| 10.0| 3.0| 18.0| 7.0| 4.0| male| no|

| 0.0|27.0| 4.0| 4.0| 14.0| 6.0| 4.0|female| no|

| 0.0|32.0| 15.0| 1.0| 12.0| 1.0| 4.0|female| yes|

| 0.0|57.0| 15.0| 5.0| 18.0| 6.0| 5.0| male| yes|

| 0.0|22.0| 0.75| 2.0| 17.0| 6.0| 3.0| male| no|

| 0.0|32.0| 1.5| 2.0| 17.0| 5.0| 5.0|female| no|

| 0.0|22.0| 0.75| 2.0| 12.0| 1.0| 3.0|female| no|

| 0.0|57.0| 15.0| 2.0| 14.0| 4.0| 4.0| male| yes|

| 0.0|32.0| 15.0| 4.0| 16.0| 1.0| 2.0|female| yes|

| 0.0|22.0| 1.5| 4.0| 14.0| 4.0| 5.0| male| no|

+-------+----+------------+-------------+---------+----------+------+------+--------+

// Alias the cast DataFrame under the name used by the indexing/encoding steps below.
val dataDF = data1

dataDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 7 more fields]

// Cache dataDF; it is used both to fit the gender indexer and to be transformed by it.
dataDF.cache()

res4: dataDF.type = [affairs: double, age: double ... 7 more fields]

3.字符转换成数字索引，OneHot编码，注意setDropLast设置为false

字符转换成数字索引（StringIndexer）

// Fit a StringIndexer on dataDF that maps the "gender" strings to numeric indices
// written to a new "genderIndex" column.
val indexer = {
  val genderIndexer = new StringIndexer()
  genderIndexer.setInputCol("gender")
  genderIndexer.setOutputCol("genderIndex")
  genderIndexer.fit(dataDF)
}

indexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_27dba613193a

// Apply the fitted indexer, appending the "genderIndex" column to dataDF.
val indexed = indexer.transform(dataDF)

indexed: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 8 more fields]

// One-hot encode "genderIndex" into the vector column "genderVec".
// setDropLast(false) keeps a vector slot for every category instead of dropping the last one.
// NOTE(review): this encoder is later used via transform() with no fit(), which matches the
// Spark 2.x OneHotEncoder API — confirm the Spark version before upgrading.
val encoder = {
  val genderEncoder = new OneHotEncoder()
  genderEncoder.setInputCol("genderIndex")
  genderEncoder.setOutputCol("genderVec")
  genderEncoder.setDropLast(false)
  genderEncoder
}

encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_155a53de3aef

// Apply the encoder, appending the one-hot "genderVec" column.
val encoded = encoder.transform(indexed)

encoded: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 9 more fields]

// Display the gender-encoded rows (the output below is limited to the top 20 rows).
encoded.show()

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+

|affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children|genderIndex| genderVec|

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+

| 0.0|37.0| 10.0| 3.0| 18.0| 7.0| 4.0| male| no| 1.0|(2,[1],[1.0])|

| 0.0|27.0| 4.0| 4.0| 14.0| 6.0| 4.0|female| no| 0.0|(2,[0],[1.0])|

| 0.0|32.0| 15.0| 1.0| 12.0| 1.0| 4.0|female| yes| 0.0|(2,[0],[1.0])|

| 0.0|57.0| 15.0| 5.0| 18.0| 6.0| 5.0| male| yes| 1.0|(2,[1],[1.0])|

| 0.0|22.0| 0.75| 2.0| 17.0| 6.0| 3.0| male| no| 1.0|(2,[1],[1.0])|

| 0.0|32.0| 1.5| 2.0| 17.0| 5.0| 5.0|female| no| 0.0|(2,[0],[1.0])|

| 0.0|22.0| 0.75| 2.0| 12.0| 1.0| 3.0|female| no| 0.0|(2,[0],[1.0])|

| 0.0|57.0| 15.0| 2.0| 14.0| 4.0| 4.0| male| yes| 1.0|(2,[1],[1.0])|

| 0.0|32.0| 15.0| 4.0| 16.0| 1.0| 2.0|female| yes| 0.0|(2,[0],[1.0])|

| 0.0|22.0| 1.5| 4.0| 14.0| 4.0| 5.0| male| no| 1.0|(2,[1],[1.0])|

| 0.0|37.0| 15.0| 2.0| 20.0| 7.0| 2.0| male| yes| 1.0|(2,[1],[1.0])|

| 0.0|27.0| 4.0| 4.0| 18.0| 6.0| 4.0| male| yes| 1.0|(2,[1],[1.0])|

| 0.0|47.0| 15.0| 5.0| 17.0| 6.0| 4.0| male| yes| 1.0|(2,[1],[1.0])|

| 0.0|22.0| 1.5| 2.0| 17.0| 5.0| 4.0|female| no| 0.0|(2,[0],[1.0])|

| 0.0|27.0| 4.0| 4.0| 14.0| 5.0| 4.0|female| no| 0.0|(2,[0],[1.0])|

| 0.0|37.0| 15.0| 1.0| 17.0| 5.0| 5.0|female| yes| 0.0|(2,[0],[1.0])|

| 0.0|37.0| 15.0| 2.0| 18.0| 4.0| 3.0|female| yes| 0.0|(2,[0],[1.0])|

| 0.0|22.0| 0.75| 3.0| 16.0| 5.0| 4.0|female| no| 0.0|(2,[0],[1.0])|

| 0.0|22.0| 1.5| 2.0| 16.0| 5.0| 5.0|female| no| 0.0|(2,[0],[1.0])|

| 0.0|27.0| 10.0| 2.0| 14.0| 1.0| 5.0|female| yes| 0.0|(2,[0],[1.0])|

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+

only showing top 20 rows

// Fit a second StringIndexer, this time mapping the "children" strings to numeric
// indices in "childrenIndex"; it is fitted on the gender-encoded DataFrame.
val indexer1 = {
  val childrenIndexer = new StringIndexer()
  childrenIndexer.setInputCol("children")
  childrenIndexer.setOutputCol("childrenIndex")
  childrenIndexer.fit(encoded)
}

indexer1: org.apache.spark.ml.feature.StringIndexerModel = strIdx_55db099c07b7

// Apply the fitted indexer, appending the "childrenIndex" column.
val indexed1 = indexer1.transform(encoded)

indexed1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 10 more fields]

// One-hot encode "childrenIndex" into "childrenVec"; setDropLast(false) again keeps a
// vector slot for every category.
val encoder1 = {
  val childrenEncoder = new OneHotEncoder()
  childrenEncoder.setInputCol("childrenIndex")
  childrenEncoder.setOutputCol("childrenVec")
  childrenEncoder.setDropLast(false)
  childrenEncoder
}

// Apply the encoder, appending the "childrenVec" column.
val encoded1 = encoder1.transform(indexed1)

encoded1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 11 more fields]

// Display the rows with both categorical columns encoded (top 20 rows shown below).
encoded1.show()

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+

|affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children|genderIndex| genderVec|childrenIndex| childrenVec|

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+

| 0.0|37.0| 10.0| 3.0| 18.0| 7.0| 4.0| male| no| 1.0|(2,[1],[1.0])| 1.0|(2,[1],[1.0])|

| 0.0|27.0| 4.0| 4.0| 14.0| 6.0| 4.0|female| no| 0.0|(2,[0],[1.0])| 1.0|(2,[1],[1.0])|

| 0.0|32.0| 15.0| 1.0| 12.0| 1.0| 4.0|female| yes| 0.0|(2,[0],[1.0])| 0.0|(2,[0],[1.0])|

| 0.0|57.0| 15.0| 5.0| 18.0| 6.0| 5.0| male| yes| 1.0|(2,[1],[1.0])| 0.0|(2,[0],[1.0])|

| 0.0|22.0| 0.75| 2.0| 17.0| 6.0| 3.0| male| no| 1.0|(2,[1],[1.0])| 1.0|(2,[1],[1.0])|

| 0.0|32.0| 1.5| 2.0| 17.0| 5.0| 5.0|female| no| 0.0|(2,[0],[1.0])| 1.0|(2,[1],[1.0])|

| 0.0|22.0| 0.75| 2.0| 12.0| 1.0| 3.0|female| no| 0.0|(2,[0],[1.0])| 1.0|(2,[1],[1.0])|

| 0.0|57.0| 15.0| 2.0| 14.0| 4.0| 4.0| male| yes| 1.0|(2,[1],[1.0])| 0.0|(2,[0],[1.0])|

| 0.0|32.0| 15.0| 4.0| 16.0| 1.0| 2.0|female| yes| 0.0|(2,[0],[1.0])| 0.0|(2,[0],[1.0])|

| 0.0|22.0| 1.5| 4.0| 14.0| 4.0| 5.0| male| no| 1.0|(2,[1],[1.0])| 1.0|(2,[1],[1.0])|

| 0.0|37.0| 15.0| 2.0| 20.0| 7.0| 2.0| male| yes| 1.0|(2,[1],[1.0])| 0.0|(2,[0],[1.0])|

| 0.0|27.0| 4.0| 4.0| 18.0| 6.0| 4.0| male| yes| 1.0|(2,[1],[1.0])| 0.0|(2,[0],[1.0])|

| 0.0|47.0| 15.0| 5.0| 17.0| 6.0| 4.0| male| yes| 1.0|(2,[1],[1.0])| 0.0|(2,[0],[1.0])|

| 0.0|22.0| 1.5| 2.0| 17.0| 5.0| 4.0|female| no| 0.0|(2,[0],[1.0])| 1.0|(2,[1],[1.0])|

| 0.0|27.0| 4.0| 4.0| 14.0| 5.0| 4.0|female| no| 0.0|(2,[0],[1.0])| 1.0|(2,[1],[1.0])|

| 0.0|37.0| 15.0| 1.0| 17.0| 5.0| 5.0|female| yes| 0.0|(2,[0],[1.0])| 0.0|(2,[0],[1.0])|

| 0.0|37.0| 15.0| 2.0| 18.0| 4.0| 3.0|female| yes| 0.0|(2,[0],[1.0])| 0.0|(2,[0],[1.0])|

| 0.0|22.0| 0.75| 3.0| 16.0| 5.0| 4.0|female| no| 0.0|(2,[0],[1.0])| 1.0|(2,[1],[1.0])|

| 0.0|22.0| 1.5| 2.0| 16.0| 5.0| 5.0|female| no| 0.0|(2,[0],[1.0])| 1.0|(2,[1],[1.0])|

| 0.0|27.0| 10.0| 2.0| 14.0| 1.0| 5.0|female| yes| 0.0|(2,[0],[1.0])| 0.0|(2,[0],[1.0])|

+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+

only showing top 20 rows

// Alias the fully encoded DataFrame (gender and children both indexed and one-hot encoded).
val encodeDF: DataFrame = encoded1

encodeDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 11 more fields]

// Display the encoded DataFrame.
encodeDF.show()