import spark.implicits._
val data1 = Seq(
("0", "ming", "tj","2019-09-06 17:15:15", "2002", "192.196", "win7", "bai"),
("1", "ming", "tj","2019-09-07 16:15:15", "4004", "192.194", "win7", "wang"),
("0", "ming", "tj","2019-09-08 05:15:15", "7007", "192.195", "ios", "wang"),
("0", "ming", "ln","2019-09-08 05:15:15", "7007", "192.195", "ios", "wang"),
("0", "li", "hlj","2019-09-06 17:15:15", "2002", "192.196", "win7", "bai"),
("1", "li", "hlj","2019-09-06 17:15:15", "2002", "192.196", "win7", "bai"),
("0", "li", "hlj","2019-09-07 16:15:15", "4004", "192.194", "win7", "wang"),
("0", "li", "ln","2019-09-08 05:15:15", "7007", "192.195", "ios", "wang"),
("1", "tian", "hlj","2019-09-08 13:15:15", "8008", "192.194", "win7", "zhu"),
("0", "tian", "hlj","2019-09-08 19:15:15", "9009", "192.196", "mac", "bai"),
("0", "xixi", "ln","2019-09-08 19:15:15", "9009", "192.196", "mac", "bai"),
("1", "xixi", "jl","2019-09-08 19:15:15", "9009", "192.196", "mac", "bai"),
("0", "haha", "hegang","2019-09-08 15:15:15", "10010", "192.192", "ios", "wei")
).toDF("label", "name", "live", "START_TIME", "AMOUNT", "CLIENT_IP", "CLIENT_MAC", "PAYER_CODE")
data1.show()
Result:
+-----+----+------+-------------------+------+---------+----------+----------+
|label|name| live| START_TIME|AMOUNT|CLIENT_IP|CLIENT_MAC|PAYER_CODE|
+-----+----+------+-------------------+------+---------+----------+----------+
| 0|ming| tj|2019-09-06 17:15:15| 2002| 192.196| win7| bai|
| 1|ming| tj|2019-09-07 16:15:15| 4004| 192.194| win7| wang|
| 0|ming| tj|2019-09-08 05:15:15| 7007| 192.195| ios| wang|
| 0|ming| ln|2019-09-08 05:15:15| 7007| 192.195| ios| wang|
| 0| li| hlj|2019-09-06 17:15:15| 2002| 192.196| win7| bai|
| 1| li| hlj|2019-09-06 17:15:15| 2002| 192.196| win7| bai|
| 0| li| hlj|2019-09-07 16:15:15| 4004| 192.194| win7| wang|
| 0| li| ln|2019-09-08 05:15:15| 7007| 192.195| ios| wang|
| 1|tian| hlj|2019-09-08 13:15:15| 8008| 192.194| win7| zhu|
| 0|tian| hlj|2019-09-08 19:15:15| 9009| 192.196| mac| bai|
| 0|xixi| ln|2019-09-08 19:15:15| 9009| 192.196| mac| bai|
| 1|xixi| jl|2019-09-08 19:15:15| 9009| 192.196| mac| bai|
| 0|haha|hegang|2019-09-08 15:15:15| 10010| 192.192| ios| wei|
+-----+----+------+-------------------+------+---------+----------+----------+
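Before selecting anything, note the schema: toDF over tuples of strings yields all-string columns, AMOUNT included (the da3 output further down confirms [AMOUNT: string]). A minimal sanity-check sketch; the abbreviated output below is the shape spark-shell prints:
data1.printSchema()
// root
//  |-- label: string (nullable = true)
//  |-- name: string (nullable = true)
//  |-- live: string (nullable = true)
//  ... (the remaining five columns are likewise string)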
val da1 = data1.select("label")
da1.show()
Result:
da1: org.apache.spark.sql.DataFrame = [label: string]
+-----+
|label|
+-----+
| 0|
| 1|
| 0|
| 0|
| 0|
| 1|
| 0|
| 0|
| 1|
| 0|
| 0|
| 1|
| 0|
+-----+
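For reference, select accepts several equivalent ways of naming a single column. A minimal sketch, assuming the standard org.apache.spark.sql.functions.col helper (the $ interpolator comes from the spark.implicits._ import above); all four calls produce the same one-column frame:
import org.apache.spark.sql.functions.col
data1.select("label")        // String overload
data1.select(col("label"))   // unresolved Column via functions.col
data1.select($"label")       // $ interpolator from spark.implicits._
data1.select(data1("label")) // Column bound to this specific DataFrame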
val sef = Seq("label", "AMOUNT")
(An Array or ArrayBuffer works just as well.)
Method 1:
select(sef.head, sef.tail: _*)
Method 2:
select(sef.map(data1.col(_)): _*)
val sef = Seq("label", "AMOUNT")
val da3 = data1.select(sef.head, sef.tail: _*)
da3.show()
Result:
da3: org.apache.spark.sql.DataFrame = [label: string, AMOUNT: string]
+-----+------+
|label|AMOUNT|
+-----+------+
| 0| 2002|
| 1| 4004|
| 0| 7007|
| 0| 7007|
| 0| 2002|
| 1| 2002|
| 0| 4004|
| 0| 7007|
| 1| 8008|
| 0| 9009|
| 0| 9009|
| 1| 9009|
| 0| 10010|
+-----+------+
Or:
data1.select(sef.map(data1.col(_)): _*).show
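A variant of Method 2 that does not need a reference to the DataFrame itself, sketched with the standard org.apache.spark.sql.functions.col helper:
import org.apache.spark.sql.functions.col
// col(...) builds an unresolved Column, so the same Seq of names works against any DataFrame:
data1.select(sef.map(col): _*).show()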
def select(cols: org.apache.spark.sql.Column*): org.apache.spark.sql.DataFrame = { /* compiled code */ }
When the arguments are plain strings rather than Columns, the matching overload is select(col: String, cols: String*), so the call must take the shape (col = "first name", cols = Seq("remaining names"): _*); this is why Method 1 splits the Seq into head and tail.
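To make the overload resolution concrete, a short sketch against the sef defined above (an illustration, not from the original post):
// Dataset defines two relevant select overloads:
//   def select(cols: Column*): DataFrame
//   def select(col: String, cols: String*): DataFrame
// There is no select(cols: String*), so this line would not compile:
// data1.select(sef: _*)
// Split head/tail to hit the String overload:
data1.select(sef.head, sef.tail: _*)
// or map the names to Columns to hit the Column varargs overload:
data1.select(sef.map(data1.col(_)): _*)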
Reference: URL 1