(四)DataFrame的常用函数

[hadoop@hadoop000 bin]$ ./spark-shell --master local[2] --jars ~/software/mysql-connector-java-5.1.27.jar
scala> case class Student (id:String, name:String, phone:String, email:String)
scala> val students = sc.textFile("file:///home/hadoop/data/student.data").map(_.split("\\|")).map(x=>(Student(x(0),x(1),x(2),x(3)))).toDF()
students: org.apache.spark.sql.DataFrame = [id: string, name: string ... 2 more fields]
//1.显示表中数据,默认只显示前20行,字段内容超过20个字符的会被截断并以“...”结尾
scala> students.show
+---+--------+--------------+--------------------+
| id|    name|         phone|               email|
+---+--------+--------------+--------------------+
|  1|   Burke|1-300-746-8446|ullamcorper.velit...|
|  2|   Kamal|1-668-571-5046|pede.Suspendisse@...|
|  3|    Olga|1-956-311-1686|Aenean.eget,netus...|
|  4|   Belle|1-246-894-6340|vitae.aliquet.nec...|
|  5|  Trevor|1-300-527-4967|dapibus.id@acturp...|
|  6|  Laurel|1-691-379-9921|adipiscing@consec...|
|  7|    Sara|1-608-140-1995|Donec.nibh@enimEt...|
|  8|  Kaseem|1-881-586-2689|cursus.et.magna@e...|
|  9|     Lev|1-916-367-5608|Vivamus.nisi@ipsu...|
| 10|    Maya|1-271-683-2698|accumsan.convalli...|
| 11|     Emi|1-467-270-1337|        [email protected]|
| 12|   Caleb|1-683-212-0896|Suspendisse@Quisq...|
| 13|Florence|1-603-575-2444|sit.amet.dapibus@...|
| 14|   Anika|1-856-828-7883|euismod@ligulaeli...|
| 15|   Tarik|1-398-171-2268|[email protected]|
| 16|   Amena|1-878-250-3129|lorem.lucrus.ut@s...|
| 17| Blossom|1-154-406-9596|Nunc.commodo.auct...|
| 18|     Guy|1-869-521-3230|senectus.et.netus...|
| 19| Malachi|1-608-637-2772|Proin.mi.Aliquam@...|
| 20|  Edward|1-711-710-6552|lectus@aliquetlib...|
+---+--------+--------------+--------------------+
only showing top 20 rows
//2.不截断字段内容,完整显示每一列的值(show(false)只是关闭截断,默认仍然只显示前20行)
scala> students.show(false)
+---+--------+--------------+-----------------------------------------+
|id |name    |phone         |email                                    |
+---+--------+--------------+-----------------------------------------+
|1  |Burke   |1-300-746-8446|[email protected]|
|2  |Kamal   |1-668-571-5046|[email protected]        |
|3  |Olga    |1-956-311-1686|Aenean.eget,[email protected]   |
|4  |Belle   |1-246-894-6340|[email protected]            |
|5  |Trevor  |1-300-527-4967|[email protected]           |
|6  |Laurel  |1-691-379-9921|[email protected]         |
|7  |Sara    |1-608-140-1995|[email protected]        |
|8  |Kaseem  |1-881-586-2689|[email protected]              |
|9  |Lev     |1-916-367-5608|[email protected]              |
|10 |Maya    |1-271-683-2698|[email protected] |
|11 |Emi     |1-467-270-1337|[email protected]                             |
|12 |Caleb   |1-683-212-0896|[email protected]                  |
|13 |Florence|1-603-575-2444|[email protected]   |
|14 |Anika   |1-856-828-7883|[email protected]                 |
|15 |Tarik   |1-398-171-2268|[email protected]                     |
|16 |Amena   |1-878-250-3129|[email protected]          |
|17 |Blossom |1-154-406-9596|[email protected]        |
|18 |Guy     |1-869-521-3230|[email protected]       |
|19 |Malachi |1-608-637-2772|[email protected]             |
|20 |Edward  |1-711-710-6552|[email protected]               |
+---+--------+--------------+-----------------------------------------+
only showing top 20 rows
//3.显示前5条数据,并且不截断字段内容
scala> students.show(5,false)
+---+------+--------------+-----------------------------------------+
|id |name  |phone         |email                                    |
+---+------+--------------+-----------------------------------------+
|1  |Burke |1-300-746-8446|[email protected]|
|2  |Kamal |1-668-571-5046|[email protected]        |
|3  |Olga  |1-956-311-1686|Aenean.eget,[email protected]   |
|4  |Belle |1-246-894-6340|[email protected]            |
|5  |Trevor|1-300-527-4967|[email protected]           |
+---+------+--------------+-----------------------------------------+
only showing top 5 rows
//4.返回第一条数据(类型为Row)
scala> students.head
res9: org.apache.spark.sql.Row = [1,Burke,1-300-746-8446,[email protected]]
//5.以Array[Row]的形式返回前5条数据
scala> students.head(5)
res10: Array[org.apache.spark.sql.Row] = Array([1,Burke,1-300-746-8446,[email protected]], [2,Kamal,1-668-571-5046,[email protected]], [3,Olga,1-956-311-1686,Aenean.eget,[email protected]], [4,Belle,1-246-894-6340,[email protected]], [5,Trevor,1-300-527-4967,[email protected]])
//6.取出前5条数据并逐行打印
scala> students.head(5).foreach(println)
[1,Burke,1-300-746-8446,[email protected]]
[2,Kamal,1-668-571-5046,[email protected]]
[3,Olga,1-956-311-1686,Aenean.eget,[email protected]]
[4,Belle,1-246-894-6340,[email protected]]
[5,Trevor,1-300-527-4967,[email protected]]
//7.返回第一条数据(first与head的结果相同)
scala> students.first
res12: org.apache.spark.sql.Row = [1,Burke,1-300-746-8446,[email protected]]
//8.显示“id”和“name”列
scala> students.select("id","name").show
+---+--------+
| id|    name|
+---+--------+
|  1|   Burke|
|  2|   Kamal|
|  3|    Olga|
|  4|   Belle|
|  5|  Trevor|
|  6|  Laurel|
|  7|    Sara|
|  8|  Kaseem|
|  9|     Lev|
| 10|    Maya|
| 11|     Emi|
| 12|   Caleb|
| 13|Florence|
| 14|   Anika|
| 15|   Tarik|
| 16|   Amena|
| 17| Blossom|
| 18|     Guy|
| 19| Malachi|
| 20|  Edward|
+---+--------+
only showing top 20 rows
//9.筛选id<5的数据
scala> students.filter("id<5").show
+---+-----+--------------+--------------------+
| id| name|         phone|               email|
+---+-----+--------------+--------------------+
|  1|Burke|1-300-746-8446|ullamcorper.velit...|
|  2|Kamal|1-668-571-5046|pede.Suspendisse@...|
|  3| Olga|1-956-311-1686|Aenean.eget,netus...|
|  4|Belle|1-246-894-6340|vitae.aliquet.nec...|
+---+-----+--------------+--------------------+
//10.筛选name为空字符串的数据
scala> students.filter("name=''").show
+---+----+--------------+--------------------+
| id|name|         phone|               email|
+---+----+--------------+--------------------+
| 21|    |1-711-710-6552|lecrus@aliquetlib...|
| 22|    |1-711-710-6552|lecrus@aliquetlib...|
+---+----+--------------+--------------------+
//11.筛选name为空字符串或者为字符串'NULL'的数据(注意:这里匹配的是内容为"NULL"的字符串,不是SQL中的null值;判断真正的null应使用 name is null)
scala> students.filter("name=''or name='NULL'").show
+---+----+--------------+--------------------+
| id|name|         phone|               email|
+---+----+--------------+--------------------+
| 21|    |1-711-710-6552|lecrus@aliquetlib...|
| 22|    |1-711-710-6552|lecrus@aliquetlib...|
| 23|NULL|1-711-710-6552|lecrus@aliquetlib...|
+---+----+--------------+--------------------+
//12.筛选出name第一个字母为“M”的数据
scala> students.filter("name like 'M%'").show
+---+-------+--------------+--------------------+
| id|   name|         phone|               email|
+---+-------+--------------+--------------------+
| 10|   Maya|1-271-683-2698|accumsan.convalli...|
| 19|Malachi|1-608-637-2772|Proin.mi.Aliquam@...|
+---+-------+--------------+--------------------+
//13.用substr筛选出name第一个字母为“M”的数据(substr的起始位置从1开始计数,此处写0与写1结果相同)
scala> students.filter("substr(name,0,1)='M'").show
+---+-------+--------------+--------------------+
| id|   name|         phone|               email|
+---+-------+--------------+--------------------+
| 10|   Maya|1-271-683-2698|accumsan.convalli...|
| 19|Malachi|1-608-637-2772|Proin.mi.Aliquam@...|
+---+-------+--------------+--------------------+
//14.筛选出name前三个字母为“Mal”的数据
scala> students.filter("substr(name,0,3)='Mal'").show
+---+-------+--------------+--------------------+
| id|   name|         phone|               email|
+---+-------+--------------+--------------------+
| 19|Malachi|1-608-637-2772|Proin.mi.Aliquam@...|
+---+-------+--------------+--------------------+
//15.按name排序,默认是升序
scala> students.sort($"name").show
+---+--------+--------------+--------------------+
| id|    name|         phone|               email|
+---+--------+--------------+--------------------+
| 21|        |1-711-710-6552|lecrus@aliquetlib...|
| 22|        |1-711-710-6552|lecrus@aliquetlib...|
| 16|   Amena|1-878-250-3129|lorem.lucrus.ut@s...|
| 14|   Anika|1-856-828-7883|euismod@ligulaeli...|
|  4|   Belle|1-246-894-6340|vitae.aliquet.nec...|
| 17| Blossom|1-154-406-9596|Nunc.commodo.auct...|
|  1|   Burke|1-300-746-8446|ullamcorper.velit...|
| 12|   Caleb|1-683-212-0896|Suspendisse@Quisq...|
| 20|  Edward|1-711-710-6552|lectus@aliquetlib...|
| 11|     Emi|1-467-270-1337|        [email protected]|
| 13|Florence|1-603-575-2444|sit.amet.dapibus@...|
| 18|     Guy|1-869-521-3230|senectus.et.netus...|
|  2|   Kamal|1-668-571-5046|pede.Suspendisse@...|
|  8|  Kaseem|1-881-586-2689|cursus.et.magna@e...|
|  6|  Laurel|1-691-379-9921|adipiscing@consec...|
|  9|     Lev|1-916-367-5608|Vivamus.nisi@ipsu...|
| 19| Malachi|1-608-637-2772|Proin.mi.Aliquam@...|
| 10|    Maya|1-271-683-2698|accumsan.convalli...|
| 23|    NULL|1-711-710-6552|lecrus@aliquetlib...|
|  3|    Olga|1-956-311-1686|Aenean.eget,netus...|
+---+--------+--------------+--------------------+
only showing top 20 rows
//16.按name降序排列
scala> students.sort($"name".desc).show(23,false)
+---+--------+--------------+-----------------------------------------+
|id |name    |phone         |email                                    |
+---+--------+--------------+-----------------------------------------+
|5  |Trevor  |1-300-527-4967|[email protected]           |
|15 |Tarik   |1-398-171-2268|[email protected]                     |
|7  |Sara    |1-608-140-1995|[email protected]        |
|3  |Olga    |1-956-311-1686|Aenean.eget,[email protected]   |
|23 |NULL    |1-711-710-6552|[email protected]               |
|10 |Maya    |1-271-683-2698|[email protected] |
|19 |Malachi |1-608-637-2772|[email protected]             |
|9  |Lev     |1-916-367-5608|[email protected]              |
|6  |Laurel  |1-691-379-9921|[email protected]         |
|8  |Kaseem  |1-881-586-2689|[email protected]              |
|2  |Kamal   |1-668-571-5046|[email protected]        |
|18 |Guy     |1-869-521-3230|[email protected]       |
|13 |Florence|1-603-575-2444|[email protected]   |
|11 |Emi     |1-467-270-1337|[email protected]                             |
|20 |Edward  |1-711-710-6552|[email protected]               |
|12 |Caleb   |1-683-212-0896|[email protected]                  |
|1  |Burke   |1-300-746-8446|[email protected]|
|17 |Blossom |1-154-406-9596|[email protected]        |
|4  |Belle   |1-246-894-6340|[email protected]            |
|14 |Anika   |1-856-828-7883|[email protected]                 |
|16 |Amena   |1-878-250-3129|[email protected]          |
|21 |        |1-711-710-6552|[email protected]               |
|22 |        |1-711-710-6552|[email protected]               |
+---+--------+--------------+-----------------------------------------+
//17.按name降序排列,name相同的情况下按id降序排列
scala> students.sort($"name".desc,$"id".desc).show(23,false)
+---+--------+--------------+-----------------------------------------+
|id |name    |phone         |email                                    |
+---+--------+--------------+-----------------------------------------+
|5  |Trevor  |1-300-527-4967|[email protected]           |
|15 |Tarik   |1-398-171-2268|[email protected]                     |
|7  |Sara    |1-608-140-1995|[email protected]        |
|3  |Olga    |1-956-311-1686|Aenean.eget,[email protected]   |
|23 |NULL    |1-711-710-6552|[email protected]               |
|10 |Maya    |1-271-683-2698|[email protected] |
|19 |Malachi |1-608-637-2772|[email protected]             |
|9  |Lev     |1-916-367-5608|[email protected]              |
|6  |Laurel  |1-691-379-9921|[email protected]         |
|8  |Kaseem  |1-881-586-2689|[email protected]              |
|2  |Kamal   |1-668-571-5046|[email protected]        |
|18 |Guy     |1-869-521-3230|[email protected]       |
|13 |Florence|1-603-575-2444|[email protected]   |
|11 |Emi     |1-467-270-1337|[email protected]                             |
|20 |Edward  |1-711-710-6552|[email protected]               |
|12 |Caleb   |1-683-212-0896|[email protected]                  |
|1  |Burke   |1-300-746-8446|[email protected]|
|17 |Blossom |1-154-406-9596|[email protected]        |
|4  |Belle   |1-246-894-6340|[email protected]            |
|14 |Anika   |1-856-828-7883|[email protected]                 |
|16 |Amena   |1-878-250-3129|[email protected]          |
|22 |        |1-711-710-6552|[email protected]               |
|21 |        |1-711-710-6552|[email protected]               |
+---+--------+--------------+-----------------------------------------+
//18.用as为列指定别名(只影响本次查询的结果,不会修改原DataFrame的列名)
scala> students.select($"name".as("new_name")).show(5,false)
+--------+
|new_name|
+--------+
|Burke   |
|Kamal   |
|Olga    |
|Belle   |
|Trevor  |
+--------+
only showing top 5 rows
//19.join的用法
scala> val students1 = spark.sparkContext.textFile("file:///home/hadoop/data/student.data").map(_.split("\\|")).map(x =>(Student(x(0),x(1),x(2),x(3)))).toDF()
students1: org.apache.spark.sql.DataFrame = [id: string, name: string ... 2 more fields]

scala> val students2 = spark.sparkContext.textFile("file:///home/hadoop/data/student1.data").map(_.split("\\|")).map(x =>(Student(x(0),x(1),x(2),x(3)))).toDF()
students2: org.apache.spark.sql.DataFrame = [id: string, name: string ... 2 more fields]
//join的第三个参数用于指定连接类型;如果省略该参数,默认就是inner join
scala> students1.join(students2,students1.col("id")===students2.col("id"),"inner").show()
+---+--------+--------------+--------------------+---+--------+--------------+--------------------+
| id|    name|         phone|               email| id|    name|         phone|               email|
+---+--------+--------------+--------------------+---+--------+--------------+--------------------+
| 15|   Tarik|1-398-171-2268|[email protected]| 15|   Tarik|1-398-171-2268|[email protected]|
| 22|        |1-711-710-6552|lecrus@aliquetlib...| 22|        |1-711-710-6552|lecrus@aliquetlib...|
| 16|   Amena|1-878-250-3129|lorem.lucrus.ut@s...| 16|   Amena|1-878-250-3129|lorem.lucrus.ut@s...|
| 18|     Guy|1-869-521-3230|senectus.et.netus...| 18|     Guy|1-869-521-3230|senectus.et.netus...|
| 17| Blossom|1-154-406-9596|Nunc.commodo.auct...| 17| Blossom|1-154-406-9596|Nunc.commodo.auct...|
| 19| Malachi|1-608-637-2772|Proin.mi.Aliquam@...| 19| Malachi|1-608-637-2772|Proin.mi.Aliquam@...|
| 23|    NULL|1-711-710-6552|lecrus@aliquetlib...| 23|    NULL|1-711-710-6552|lecrus@aliquetlib...|
| 20|  Edward|1-711-710-6552|lectus@aliquetlib...| 20|  Edward|1-711-710-6552|lectus@aliquetlib...|
| 12|   Caleb|1-683-212-0896|Suspendisse@Quisq...| 12|   Caleb|1-683-212-0896|Suspendisse@Quisq...|
| 13|Florence|1-603-575-2444|sit.amet.dapibus@...| 13|Florence|1-603-575-2444|sit.amet.dapibus@...|
| 14|   Anika|1-856-828-7883|euismod@ligulaeli...| 14|   Anika|1-856-828-7883|euismod@ligulaeli...|
| 21|        |1-711-710-6552|lecrus@aliquetlib...| 21|        |1-711-710-6552|lecrus@aliquetlib...|
+---+--------+--------------+--------------------+---+--------+--------------+--------------------+

你可能感兴趣的:((四)DataFrame的常用函数)