import spark.implicits._
// Sample single-column DataFrame: every row of CST_NO holds one comma-separated string.
val data_csv = Seq("ke,sun", "tian,sun").toDF("CST_NO")
+--------+
| CST_NO|
+--------+
| ke,sun|
|tian,sun|
+--------+
将 CST_NO 列转换为 List,并取第一行按逗号拆分:
// Collect the CST_NO column to the driver as a local list, one element per row.
// data_csv is the DataFrame with the CST_NO column defined earlier in this file.
val neg_tmp: List[Any] = data_csv.select("CST_NO").collect().map(_.get(0)).toList
println(neg_tmp.length)
// Take the first row, neg_tmp(0), and split its comma-separated value into tokens.
// An empty result set or a null first cell yields an empty array instead of throwing.
val neg_list: Array[String] = neg_tmp.headOption.flatMap(Option(_)) match {
  case Some(first) => first.toString.split(",")
  case None        => Array.empty[String]
}
// Printing an Array directly only shows its JVM identity, so join the elements first.
println(neg_list.mkString(", "))
结果:
neg_tmp: List[Any] = List(ke,sun, tian,sun)
2
neg_list: Array[String] = Array(ke, sun)
参考博客: 点击传送
List去重
1, 最简单直接办法是用distinct
scala> val l = List(1,2,3,3,4,4,5,5,6,6,6,8,9)
l: List[Int] = List(1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 8, 9)
scala> l.distinct
res32: List[Int] = List(1, 2, 3, 4, 5, 6, 8, 9)
2, toSet(先转为 Set 去重,再转回 List;注意结果不保证保留原有顺序)
scala> l.toSet.toList
res33: List[Int] = List(5, 1, 6, 9, 2, 3, 8, 4)
参考博客: 点击传送
// Note: every value in the row must have the same type, so convert everything to String
// (null is the exception, as it carries no meaning here); otherwise toDF fails.
val lst = List[String]("57.54", "trusfortMeans", null, "20190720", "5852.00", null, null, "25.77", null)
val name_list = List("idm", "CO", "distrn", "dayId", "Ant", "CLP", "CAC", "PE_num", "CE")
import org.apache.spark.sql.functions._
import org.apache.spark.ml._
// Wrap the whole row in a one-element list so toDF produces one row with a single array column.
// df stays a var because a later step in this file reassigns it.
var df = List(lst.toArray).toDF("features")
// df: org.apache.spark.sql.DataFrame = [features: array<string>]
df.show()
+--------------------+
| features|
+--------------------+
|[57.54, trusfortM...|
+--------------------+
// name_list holds the column names; lst holds the values of a single row.
// Note: every value in the row must have the same type, so convert everything to String
// (null is the exception, as it carries no meaning here); otherwise toDF fails.
val lst = List[String]("57.54", "trusfortMeans", null, "20190720", "5852.00", null, null, "25.77", null)
val name_list = List("idm", "CO", "distrn", "dayId", "Ant", "CLP", "CAC", "PE_num", "CE")
import org.apache.spark.sql.functions._
import org.apache.spark.ml._
// Wrap the whole row in a one-element list so toDF produces one row with a single array column.
// df is a var because it is replaced by the expanded frame below.
var df = List(lst.toArray).toDF("features")
// df: org.apache.spark.sql.DataFrame = [features: array<string>]
df.show()
// +--------------------+
// | features|
// +--------------------+
// |[57.54, trusfortM...|
// +--------------------+
// `elements` must contain exactly one name per entry of the `features` array; a mismatch
// would otherwise drop values or index past the end of the array, so fail fast here.
val elements = name_list.toArray
require(elements.length == lst.length,
  s"expected ${lst.length} column names but got ${elements.length}")
// Build one column expression per name: features(idx) AS alias.
val sqlExpr = elements.zipWithIndex.map { case (alias, idx) => col("features").getItem(idx).as(alias) }
// Replace df with the expanded frame: one named column per array element.
df = df.select(sqlExpr: _*)
df.show()
df: org.apache.spark.sql.DataFrame = [features: array<string>]
+--------------------+
| features|
+--------------------+
|[57.54, trusfortM...|
+--------------------+
df: org.apache.spark.sql.DataFrame = [idm: string, CO: string ... 7 more fields]
+-----+-------------+------+--------+-------+----+----+------+----+
| idm| CO|distrn| dayId| Ant| CLP| CAC|PE_num| CE|
+-----+-------------+------+--------+-------+----+----+------+----+
|57.54|trusfortMeans| null|20190720|5852.00|null|null| 25.77|null|
+-----+-------------+------+--------+-------+----+----+------+----+
参考链接:点击传送