字符转换成数字索引
val indexer = new StringIndexer().setInputCol("gender").setOutputCol("genderIndex").fit(dataDF)
indexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_27dba613193a
val indexed = indexer.transform(dataDF)
indexed: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 8 more fields]
// OneHot编码,注意setDropLast设置为false
val encoder = new OneHotEncoder().setInputCol("genderIndex").setOutputCol("genderVec").setDropLast(false)
encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_155a53de3aef
val encoded = encoder.transform(indexed)
encoded: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 9 more fields]
encoded.show()
+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+
|affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children|genderIndex|    genderVec|
+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+
|    0.0|37.0|        10.0|          3.0|     18.0|       7.0|   4.0|  male|      no|        1.0|(2,[1],[1.0])|
|    0.0|27.0|         4.0|          4.0|     14.0|       6.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|
|    0.0|32.0|        15.0|          1.0|     12.0|       1.0|   4.0|female|     yes|        0.0|(2,[0],[1.0])|
|    0.0|57.0|        15.0|          5.0|     18.0|       6.0|   5.0|  male|     yes|        1.0|(2,[1],[1.0])|
|    0.0|22.0|        0.75|          2.0|     17.0|       6.0|   3.0|  male|      no|        1.0|(2,[1],[1.0])|
|    0.0|32.0|         1.5|          2.0|     17.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|
|    0.0|22.0|        0.75|          2.0|     12.0|       1.0|   3.0|female|      no|        0.0|(2,[0],[1.0])|
|    0.0|57.0|        15.0|          2.0|     14.0|       4.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|
|    0.0|32.0|        15.0|          4.0|     16.0|       1.0|   2.0|female|     yes|        0.0|(2,[0],[1.0])|
|    0.0|22.0|         1.5|          4.0|     14.0|       4.0|   5.0|  male|      no|        1.0|(2,[1],[1.0])|
|    0.0|37.0|        15.0|          2.0|     20.0|       7.0|   2.0|  male|     yes|        1.0|(2,[1],[1.0])|
|    0.0|27.0|         4.0|          4.0|     18.0|       6.0|
   4.0|  male|     yes|        1.0|(2,[1],[1.0])|
|    0.0|47.0|        15.0|          5.0|     17.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|
|    0.0|22.0|         1.5|          2.0|     17.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|
|    0.0|27.0|         4.0|          4.0|     14.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|
|    0.0|37.0|        15.0|          1.0|     17.0|       5.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|
|    0.0|37.0|        15.0|          2.0|     18.0|       4.0|   3.0|female|     yes|        0.0|(2,[0],[1.0])|
|    0.0|22.0|        0.75|          3.0|     16.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|
|    0.0|22.0|         1.5|          2.0|     16.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|
|    0.0|27.0|        10.0|          2.0|     14.0|       1.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|
+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+
only showing top 20 rows
val indexer1 = new StringIndexer().setInputCol("children").setOutputCol("childrenIndex").fit(encoded)
indexer1: org.apache.spark.ml.feature.StringIndexerModel = strIdx_55db099c07b7
val indexed1 = indexer1.transform(encoded)
indexed1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 10 more fields]
val encoder1 = new OneHotEncoder().setInputCol("childrenIndex").setOutputCol("childrenVec").setDropLast(false)
val encoded1 = encoder1.transform(indexed1)
encoded1: org.apache.spark.sql.DataFrame = [affairs: double, age: double ...
11 more fields]
encoded1.show()
+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+
|affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children|genderIndex|    genderVec|childrenIndex|  childrenVec|
+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+
|    0.0|37.0|        10.0|          3.0|     18.0|       7.0|   4.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|27.0|         4.0|          4.0|     14.0|       6.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|32.0|        15.0|          1.0|     12.0|       1.0|   4.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|57.0|        15.0|          5.0|     18.0|       6.0|   5.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|22.0|        0.75|          2.0|     17.0|       6.0|   3.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|32.0|         1.5|          2.0|     17.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|22.0|        0.75|          2.0|     12.0|       1.0|   3.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|57.0|        15.0|          2.0|     14.0|       4.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|32.0|        15.0|          4.0|     16.0|       1.0|   2.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|22.0|         1.5|          4.0|     14.0|       4.0|   5.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|37.0|        15.0|          2.0|     20.0|       7.0|   2.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|27.0|         4.0|          4.0|     18.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|47.0|        15.0|          5.0|     17.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|22.0|         1.5|          2.0|     17.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|27.0|
         4.0|          4.0|     14.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|37.0|        15.0|          1.0|     17.0|       5.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|37.0|        15.0|          2.0|     18.0|       4.0|   3.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|22.0|        0.75|          3.0|     16.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|22.0|         1.5|          2.0|     16.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|27.0|        10.0|          2.0|     14.0|       1.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|
+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+
only showing top 20 rows
val encodeDF: DataFrame = encoded1
encodeDF: org.apache.spark.sql.DataFrame = [affairs: double, age: double ... 11 more fields]
encodeDF.show()
+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+
|affairs| age|yearsmarried|religiousness|education|occupation|rating|gender|children|genderIndex|    genderVec|childrenIndex|  childrenVec|
+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+
|    0.0|37.0|        10.0|          3.0|     18.0|       7.0|   4.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|27.0|         4.0|          4.0|     14.0|       6.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|32.0|        15.0|          1.0|     12.0|       1.0|   4.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|57.0|        15.0|          5.0|     18.0|       6.0|   5.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|22.0|        0.75|          2.0|     17.0|       6.0|   3.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|32.0|         1.5|          2.0|     17.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[
1.0])|          1.0|(2,[1],[1.0])|
|    0.0|22.0|        0.75|          2.0|     12.0|       1.0|   3.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|57.0|        15.0|          2.0|     14.0|       4.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|32.0|        15.0|          4.0|     16.0|       1.0|   2.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|22.0|         1.5|          4.0|     14.0|       4.0|   5.0|  male|      no|        1.0|(2,[1],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|37.0|        15.0|          2.0|     20.0|       7.0|   2.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|27.0|         4.0|          4.0|     18.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|47.0|        15.0|          5.0|     17.0|       6.0|   4.0|  male|     yes|        1.0|(2,[1],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|22.0|         1.5|          2.0|     17.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|27.0|         4.0|          4.0|     14.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|37.0|        15.0|          1.0|     17.0|       5.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|37.0|        15.0|          2.0|     18.0|       4.0|   3.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|
|    0.0|22.0|        0.75|          3.0|     16.0|       5.0|   4.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|22.0|         1.5|          2.0|     16.0|       5.0|   5.0|female|      no|        0.0|(2,[0],[1.0])|          1.0|(2,[1],[1.0])|
|    0.0|27.0|        10.0|          2.0|     14.0|       1.0|   5.0|female|     yes|        0.0|(2,[0],[1.0])|          0.0|(2,[0],[1.0])|
+-------+----+------------+-------------+---------+----------+------+------+--------+-----------+-------------+-------------+-------------+
only showing top 20 rows
encodeDF.printSchema()
root
 |-- affairs: double (nullable = true)
 |-- age: double (nullable = true)
 |-- yearsmarried: double (nullable = true)
 |-- religiousness: double (nullable = true)
 |-- education: double (nullable = true)
 |-- occupation: double
(nullable = true)
 |-- rating: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- children: string (nullable = true)
 |-- genderIndex: double (nullable = true)
 |-- genderVec: vector (nullable = true)
 |-- childrenIndex: double (nullable = true)
 |-- childrenVec: vector (nullable = true)
|