Spark: Getting Started with Spark Through a Demo

RDD

  • Word count (wordCount)
  • File: wc.txt
[hadoop@hadoop001 spark-test-data]$ cat wc.txt 
hello,world,hello
hello,world
welcome

scala> val file = sc.textFile("file:///home/hadoop/data/spark-test-data/wc.txt")
file: org.apache.spark.rdd.RDD[String] = file:///home/hadoop/data/spark-test-data/wc.txt MapPartitionsRDD[13] at textFile at <console>:24

scala> val words=file.flatMap(lines=>lines.split(","))
words: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[14] at flatMap at <console>:30
scala> val wordsM=words.map(word=>(word,1))
wordsM: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[15] at map at <console>:32

scala> val wordsc=wordsM.reduceByKey((x,y)=>x+y)
wordsc: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[16] at reduceByKey at <console>:34

scala> wordsc.collect
res5: Array[(String, Int)] = Array((hello,3), (welcome,1), (world,2))
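
The same job can also be chained into a single expression; a minimal sketch (same wc.txt path as above, with a sortBy added only so the most frequent words print first):

val counts = sc.textFile("file:///home/hadoop/data/spark-test-data/wc.txt")
  .flatMap(_.split(","))             // split each line into words
  .map(word => (word, 1))            // pair each word with a count of 1
  .reduceByKey(_ + _)                // sum the counts per word
  .sortBy(_._2, ascending = false)   // optional: most frequent words first
counts.collect.foreach(println)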

DataFrame

  • File: test2.json
[hadoop@hadoop001 spark-test-data]$ cat test2.json 
{"name":"Yin", "age":18,"address":{"city":"Columbus","state":"Ohio"}}
{"name":"Michael","age":16, "address":{"city":null, "state":"California"}}
[hadoop@hadoop001 spark-test-data]$ 

scala> val testDf=spark.read.format("json").load("/home/hadoop/data/spark-test-data/test2.json")
testDf: org.apache.spark.sql.DataFrame = [address: struct<city: string, state: string>, age: bigint ... 1 more field]

scala> testDf.printSchema
root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)


scala> testDf.show
+-----------------+---+-------+
|          address|age|   name|
+-----------------+---+-------+
|  [Columbus,Ohio]| 18|    Yin|
|[null,California]| 16|Michael|
+-----------------+---+-------+


scala> testDf.select("name")
res17: org.apache.spark.sql.DataFrame = [name: string]

scala> testDf.select("name").show
+-------+
|   name|
+-------+
|    Yin|
|Michael|
+-------+
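
Since address is a nested struct, its fields can also be selected and filtered with dot notation; a small sketch against the same testDf (filter value chosen only for illustration):

testDf.select("name", "address.city", "address.state").show()
testDf.filter("address.state = 'Ohio'").select("name").show()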

Running SQL Queries Programmatically

  • Run SQL programmatically against a registered temp view

scala> testDf.createOrReplaceTempView("people")

scala> val sqlDF = spark.sql("SELECT * FROM people")
sqlDF: org.apache.spark.sql.DataFrame = [address: struct<city: string, state: string>, age: bigint ... 1 more field]

scala> sqlDF.show
+-----------------+---+-------+
|          address|age|   name|
+-----------------+---+-------+
|  [Columbus,Ohio]| 18|    Yin|
|[null,California]| 16|Michael|
+-----------------+---+-------+
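
Any SQL can be run against the registered view, not just SELECT *; a small sketch (filter values chosen only for illustration):

spark.sql("SELECT name, age FROM people WHERE age >= 18").show()
spark.sql("SELECT name, address.city FROM people WHERE address.state = 'Ohio'").show()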



RDD => DataFrame: the reflection approach

  • Data: student.data
[hadoop@hadoop001 spark-test-data]$ cat student.data 
1|Burke|1-300-746-8446|[email protected]
2|Kamal|1-668-571-5046|[email protected]
3|Olga|1-956-311-1686|[email protected]
4|Belle|1-246-894-6340|[email protected]
5|Trevor|1-300-527-4967|[email protected]
6|Laurel|1-691-379-9921|[email protected]
7|Sara|1-608-140-1995|[email protected]
8|Kaseem|1-881-586-2689|[email protected]
9|Lev|1-916-367-5608|[email protected]
10|Maya|1-271-683-2698|[email protected]
11|Emi|1-467-270-1337|[email protected]
12|Caleb|1-683-212-0896|[email protected]
13|Florence|1-603-575-2444|[email protected]
14|Anika|1-856-828-7883|[email protected]
15|Tarik|1-398-171-2268|[email protected]
16|Amena|1-878-250-3129|[email protected]
17|Blossom|1-154-406-9596|[email protected]
18|Guy|1-869-521-3230|[email protected]
19|Malachi|1-608-637-2772|[email protected]
20|Edward|1-711-710-6552|[email protected]
21||1-711-710-6552|[email protected]
22||1-711-710-6552|[email protected]
23|NULL|1-711-710-6552|[email protected]

  • First attempt: val studentDs=rdd.map(lines=>lines.split("|")). The unescaped | is a regex metacharacter, so split breaks the line at every character and the columns below come out as garbage.
scala> val rdd = spark.sparkContext.textFile("file:////home/hadoop/data/spark-test-data/student.data")
rdd: org.apache.spark.rdd.RDD[String] = file:////home/hadoop/data/spark-test-data/student.data MapPartitionsRDD[72] at textFile at <console>:23

scala> val studentDs=rdd.map(lines=>lines.split("|"))
studentDs: org.apache.spark.sql.Dataset[Array[String]] = [value: array<string>]

scala> case class Student(id:Int,name:String,phone:String,emal:String)
defined class Student

scala> val studentDfF = studentDs.map(info=>Student(info(0).toInt,info(1),info(2),info(3))).toDF()
studentDfF: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]

scala> studentDfF.printSchema
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- emal: string (nullable = true)


scala> studentDfF.show
+---+----+-----+----+
| id|name|phone|emal|
+---+----+-----+----+
|  1|   ||    B|   u|
|  2|   ||    K|   a|
|  3|   ||    O|   l|
|  4|   ||    B|   e|
|  5|   ||    T|   r|
|  6|   ||    L|   a|
|  7|   ||    S|   a|
|  8|   ||    K|   a|
|  9|   ||    L|   e|
|  1|   0|    ||   M|
|  1|   1|    ||   E|
|  1|   2|    ||   C|
|  1|   3|    ||   F|
|  1|   4|    ||   A|
|  1|   5|    ||   T|
|  1|   6|    ||   A|
|  1|   7|    ||   B|
|  1|   8|    ||   G|
|  1|   9|    ||   M|
|  2|   0|    ||   E|
+---+----+-----+----+
only showing top 20 rows
  • Second attempt: rdd.map(lines=>lines.split("\\|")). Escaping the pipe makes split treat | as a literal delimiter, and the rows parse correctly.
scala> val studentDs=rdd.map(lines=>lines.split("\\|"))
studentDs: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[10] at map at <console>:28

scala> val studentDF = studentDs.map(info=>Student(info(0).toInt,info(1),info(2),info(3))).toDF()
studentDF: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]

scala> studentDF.show
+---+--------+--------------+--------------------+
| id|    name|         phone|                emal|
+---+--------+--------------+--------------------+
|  1|   Burke|1-300-746-8446|ullamcorper.velit...|
|  2|   Kamal|1-668-571-5046|pede.Suspendisse@...|
|  3|    Olga|1-956-311-1686|Aenean.eget.metus...|
|  4|   Belle|1-246-894-6340|vitae.aliquet.nec...|
|  5|  Trevor|1-300-527-4967|dapibus.id@acturp...|
|  6|  Laurel|1-691-379-9921|adipiscing@consec...|
|  7|    Sara|1-608-140-1995|Donec.nibh@enimEt...|
|  8|  Kaseem|1-881-586-2689|cursus.et.magna@e...|
|  9|     Lev|1-916-367-5608|Vivamus.nisi@ipsu...|
| 10|    Maya|1-271-683-2698|accumsan.convalli...|
| 11|     Emi|1-467-270-1337|        [email protected]|
| 12|   Caleb|1-683-212-0896|Suspendisse@Quisq...|
| 13|Florence|1-603-575-2444|sit.amet.dapibus@...|
| 14|   Anika|1-856-828-7883|euismod@ligulaeli...|
| 15|   Tarik|1-398-171-2268|[email protected]|
| 16|   Amena|1-878-250-3129|lorem.luctus.ut@s...|
| 17| Blossom|1-154-406-9596|Nunc.commodo.auct...|
| 18|     Guy|1-869-521-3230|senectus.et.netus...|
| 19| Malachi|1-608-637-2772|Proin.mi.Aliquam@...|
| 20|  Edward|1-711-710-6552|lectus@aliquetlib...|
+---+--------+--------------+--------------------+
only showing top 20 rows
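
The last few lines of student.data have empty or literal "NULL" names. A sketch of how those rows could be cleaned up during the same reflection-based conversion (assuming the Student case class and the studentDs from the second attempt above):

val cleanedDF = studentDs
  .filter(_.length == 4)                                    // keep only lines that split into 4 fields
  .map(info => Student(info(0).toInt, info(1), info(2), info(3)))
  .toDF()
  .filter("name <> '' AND name <> 'NULL'")                  // drop empty and literal-NULL names
cleanedDF.show(false)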

RDD => DataFrame: the programmatic (schema) approach

  • Data: people.txt
[hadoop@hadoop001 spark-test-data]$ cat people.txt 
 Burke	,12
   Kamal	,13
    Olga	,14
   Belle	,15
  Trevor	,16
  Laurel	,17
    Sara	,18
  Kaseem	,19
     Lev	,20
    Maya	,21
     Emi	,22
   Caleb	,23
Florence	,24
   Anika	,25
   Tarik	,26
   Amena	,27
 Blossom	,28
     Guy	,29
 Malachi	,30
  Edward	,31

  • Steps
scala> val rdd = spark.sparkContext.textFile("file:///home/hadoop/data/spark-test-data/people.txt")
rdd: org.apache.spark.rdd.RDD[String] = file:///home/hadoop/data/spark-test-data/people.txt MapPartitionsRDD[16] at textFile at <console>:26


scala> import org.apache.spark.sql.types._
import org.apache.spark.sql.types._

scala> import org.apache.spark.sql.Row
import org.apache.spark.sql.Row

scala> val rowRDD = rdd.map(_.split(",")).map(attributes => Row(attributes(0).trim, attributes(1).trim.toInt))
rowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[33] at map at <console>:33

scala> val schema = StructType(Array(StructField("name", StringType, true),
     |       StructField("age", IntegerType, true)))
schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,IntegerType,true))

scala> val pepleRDD = spark.createDataFrame(rowRDD,schema)
pepleRDD: org.apache.spark.sql.DataFrame = [name: string, age: int]

scala> pepleRDD.show
+--------+---+
|    name|age|
+--------+---+
|   Burke| 12|
|   Kamal| 13|
|    Olga| 14|
|   Belle| 15|
|  Trevor| 16|
|  Laurel| 17|
|    Sara| 18|
|  Kaseem| 19|
|     Lev| 20|
|    Maya| 21|
|     Emi| 22|
|   Caleb| 23|
|Florence| 24|
|   Anika| 25|
|   Tarik| 26|
|   Amena| 27|
| Blossom| 28|
|     Guy| 29|
| Malachi| 30|
|  Edward| 31|
+--------+---+
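
Once the schema is attached, the usual DataFrame operations and SQL apply to the result; a small sketch continuing from the transcript above (the view name is chosen here only for illustration):

pepleRDD.filter("age > 20").orderBy("age").show()
pepleRDD.createOrReplaceTempView("people_txt")
spark.sql("SELECT name FROM people_txt WHERE age BETWEEN 20 AND 25").show()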

Parquet

  • File: users.parquet
scala> val userDF=spark.read.load("/home/hadoop/data/spark-test-data/resources/users.parquet")
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
userDF: org.apache.spark.sql.DataFrame = [name: string, favorite_color: string ... 1 more field]

scala> userDF.printSchema
root
 |-- name: string (nullable = true)
 |-- favorite_color: string (nullable = true)
 |-- favorite_numbers: array (nullable = true)
 |    |-- element: integer (containsNull = true)


scala> userDF.show
18/11/08 02:34:45 WARN ParquetRecordReader: Can not initialize counter due to context is not a instance of TaskInputOutputContext, but is org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+
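
Because Parquet is also the default write format, the DataFrame can be written back out with or without an explicit format; a sketch, with the output path being a placeholder:

userDF.write.parquet("file:///home/hadoop/data/spark-test-data/output/users_copy")
// equivalent to: userDF.write.format("parquet").save("file:///home/hadoop/data/spark-test-data/output/users_copy")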

CSV

scala> val salesDFCsv = spark.read.format("csv").option("sep", ",").option("inferSchema", "true").option("header", "true").load("/home/hadoop/data/spark-test-data/resources/sales.csv")
salesDFCsv: org.apache.spark.sql.DataFrame = [transactionId: int, customerId: int ... 2 more fields]

scala> salesDFCsv.show
+-------------+----------+------+----------+
|transactionId|customerId|itemId|amountPaid|
+-------------+----------+------+----------+
|          111|         1|     1|     100.0|
|          112|         2|     2|     505.0|
|          113|         3|     3|     510.0|
|          114|         4|     4|     600.0|
|          115|         1|     2|     500.0|
|          116|         1|     2|     500.0|
|          117|         1|     2|     500.0|
|          118|         1|     2|     500.0|
|          119|         2|     3|     500.0|
|          120|         1|     2|     500.0|
|          121|         1|     4|     500.0|
|          122|         1|     2|     500.0|
|          123|         1|     4|     500.0|
|          124|         1|     2|     500.0|
+-------------+----------+------+----------+
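
Writing CSV uses the same option mechanism as reading it; a sketch (the output path is a placeholder):

salesDFCsv.write
  .format("csv")
  .option("header", "true")      // write a header row
  .option("sep", ",")            // field separator
  .save("file:///home/hadoop/data/spark-test-data/output/sales_out")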

Spark with Hive

  • Preparation
  1. spark-env.sh

Add the following line to spark-env.sh under $SPARK_HOME/conf:

export SPARK_CLASSPATH=$HIVE_HOME/lib/mysql-connector-java-5.1.47.jar

  2. Copy hive-site.xml from $HIVE_HOME/conf to $SPARK_HOME/conf.
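
A quick sanity check after the configuration above (a sketch; the database name is whatever exists in your metastore): if hive-site.xml has been picked up, the Hive databases and tables should be visible from spark-shell.

spark.sql("show databases").show()
spark.catalog.listTables("ruoze_d5").show()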
  • Reading from Hive
    • Method 1: spark.table
scala> val hiveDF=spark.table("emp_p")
hiveDF: org.apache.spark.sql.DataFrame = [empno: int, ename: string ... 6 more fields]

scala> hiveDF.show
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
+-----+------+---------+----+----------+-------+------+------+
|empno| ename|      job| mgr|  hiredate|    sal|  comm|deptno|
+-----+------+---------+----+----------+-------+------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17|  800.0|  null|    20|
| 7499| ALLEN| SALESMAN|7698| 1981-2-20| 1600.0| 300.0|    30|
| 7521|  WARD| SALESMAN|7698| 1981-2-22| 1250.0| 500.0|    30|
| 7566| JONES|  MANAGER|7839|  1981-4-2| 2975.0|  null|    20|
| 7654|MARTIN| SALESMAN|7698| 1981-9-28| 1250.0|1400.0|    30|
| 7698| BLAKE|  MANAGER|7839|  1981-5-1| 2850.0|  null|    30|
| 7782| CLARK|  MANAGER|7839|  1981-6-9| 2450.0|  null|    10|
| 7788| SCOTT|  ANALYST|7566| 1987-4-19| 3000.0|  null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17| 5000.0|  null|    10|
| 7844|TURNER| SALESMAN|7698|  1981-9-8| 1500.0|   0.0|    30|
| 7876| ADAMS|    CLERK|7788| 1987-5-23| 1100.0|  null|    20|
| 7900| JAMES|    CLERK|7698| 1981-12-3|  950.0|  null|    30|
| 7902|  FORD|  ANALYST|7566| 1981-12-3| 3000.0|  null|    20|
| 7934|MILLER|    CLERK|7782| 1982-1-23| 1300.0|  null|    10|
| 8888|  HIVE|  PROGRAM|7839| 1988-1-23|10300.0|  null|  null|
+-----+------+---------+----+----------+-------+------+------+

  • Method 2: spark.sql("select * from emp_p").show

scala> spark.sql("select * from emp_p").show
+-----+------+---------+----+----------+-------+------+------+
|empno| ename|      job| mgr|  hiredate|    sal|  comm|deptno|
+-----+------+---------+----+----------+-------+------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17|  800.0|  null|    20|
| 7499| ALLEN| SALESMAN|7698| 1981-2-20| 1600.0| 300.0|    30|
| 7521|  WARD| SALESMAN|7698| 1981-2-22| 1250.0| 500.0|    30|
| 7566| JONES|  MANAGER|7839|  1981-4-2| 2975.0|  null|    20|
| 7654|MARTIN| SALESMAN|7698| 1981-9-28| 1250.0|1400.0|    30|
| 7698| BLAKE|  MANAGER|7839|  1981-5-1| 2850.0|  null|    30|
| 7782| CLARK|  MANAGER|7839|  1981-6-9| 2450.0|  null|    10|
| 7788| SCOTT|  ANALYST|7566| 1987-4-19| 3000.0|  null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17| 5000.0|  null|    10|
| 7844|TURNER| SALESMAN|7698|  1981-9-8| 1500.0|   0.0|    30|
| 7876| ADAMS|    CLERK|7788| 1987-5-23| 1100.0|  null|    20|
| 7900| JAMES|    CLERK|7698| 1981-12-3|  950.0|  null|    30|
| 7902|  FORD|  ANALYST|7566| 1981-12-3| 3000.0|  null|    20|
| 7934|MILLER|    CLERK|7782| 1982-1-23| 1300.0|  null|    10|
| 8888|  HIVE|  PROGRAM|7839| 1988-1-23|10300.0|  null|  null|
+-----+------+---------+----+----------+-------+------+------+

  • Method 3: the sql(...) shortcut available in spark-shell

scala> sql("select * from emp_p").show
+-----+------+---------+----+----------+-------+------+------+
|empno| ename|      job| mgr|  hiredate|    sal|  comm|deptno|
+-----+------+---------+----+----------+-------+------+------+
| 7369| SMITH|    CLERK|7902|1980-12-17|  800.0|  null|    20|
| 7499| ALLEN| SALESMAN|7698| 1981-2-20| 1600.0| 300.0|    30|
| 7521|  WARD| SALESMAN|7698| 1981-2-22| 1250.0| 500.0|    30|
| 7566| JONES|  MANAGER|7839|  1981-4-2| 2975.0|  null|    20|
| 7654|MARTIN| SALESMAN|7698| 1981-9-28| 1250.0|1400.0|    30|
| 7698| BLAKE|  MANAGER|7839|  1981-5-1| 2850.0|  null|    30|
| 7782| CLARK|  MANAGER|7839|  1981-6-9| 2450.0|  null|    10|
| 7788| SCOTT|  ANALYST|7566| 1987-4-19| 3000.0|  null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17| 5000.0|  null|    10|
| 7844|TURNER| SALESMAN|7698|  1981-9-8| 1500.0|   0.0|    30|
| 7876| ADAMS|    CLERK|7788| 1987-5-23| 1100.0|  null|    20|
| 7900| JAMES|    CLERK|7698| 1981-12-3|  950.0|  null|    30|
| 7902|  FORD|  ANALYST|7566| 1981-12-3| 3000.0|  null|    20|
| 7934|MILLER|    CLERK|7782| 1982-1-23| 1300.0|  null|    10|
| 8888|  HIVE|  PROGRAM|7839| 1988-1-23|10300.0|  null|  null|
+-----+------+---------+----+----------+-------+------+------+


  • Writing to Hive

Note: the default storage format is Parquet.

scala> val usersPath = "file:///home/hadoop/data/spark-test-data/resources/users.parquet"
usersPath: String = file:///home/hadoop/data/spark-test-data/resources/users.parquet

scala> val usersDF = spark.read.load(usersPath)

scala> usersDF.write.format("parquet").saveAsTable("ruoze_d5.namesPartByColor3")
18/11/08 07:53:25 WARN ParquetRecordReader: Can not initialize counter due to context is not a instance of TaskInputOutputContext, but is org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

hive (ruoze_d5)> select * from namespartbycolor3;
OK
namespartbycolor3.name	namespartbycolor3.favorite_color	namespartbycolor3.favorite_numbers
Alyssa	NULL	[3,9,15,20]
Ben	red	NULL
Time taken: 0.771 seconds, Fetched: 2 row(s)

Note

When Spark writes to Hive, the target database must be the one configured through hive-site.xml; otherwise, even after the write completes, the table shows up in Hive but contains no data.

Writing a partitioned table

scala> peopleDF.write.partitionBy("age").saveAsTable("ruoze_d5.spark_people")
[hadoop@hadoop001 ~]$ hadoop fs -ls /user/hive/warehouse/ruoze_d5.db/spark_people
18/11/08 10:51:13 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 4 items
-rw-r--r--   3 hadoop supergroup          0 2018-11-08 10:49 /user/hive/warehouse/ruoze_d5.db/spark_people/_SUCCESS
drwxr-xr-x   - hadoop supergroup          0 2018-11-08 10:48 /user/hive/warehouse/ruoze_d5.db/spark_people/age=19
drwxr-xr-x   - hadoop supergroup          0 2018-11-08 10:49 /user/hive/warehouse/ruoze_d5.db/spark_people/age=30
drwxr-xr-x   - hadoop supergroup          0 2018-11-08 10:48 /user/hive/warehouse/ruoze_d5.db/spark_people/age=__HIVE_DEFAULT_PARTITION__
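
peopleDF is not defined earlier in this transcript. A plausible sketch that matches the partitions listed above (age=19, age=30, and a null age) reads Spark's bundled people.json example; treat the path below as an assumption about this environment:

// assumed path to Spark's bundled example file; adjust for your installation
val peopleDF = spark.read.json("file:///home/hadoop/app/spark/examples/src/main/resources/people.json")
peopleDF.write.partitionBy("age").saveAsTable("ruoze_d5.spark_people")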

Spark with MySQL

  • Reading from MySQL

scala> val mysqlDF = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306").option("dbtable", "sqoop.emp").option("user", "root").option("password", "123456").option("driver", "com.mysql.jdbc.Driver").load()
mysqlDF: org.apache.spark.sql.DataFrame = [empno: decimal(4,0), ename: string ... 6 more fields]

scala> mysqlDF.show
+-----+--------+---------+----+--------------------+--------+--------+------+
|empno|   ename|      job| mgr|            hiredate|     sal|    comm|deptno|
+-----+--------+---------+----+--------------------+--------+--------+------+
| 1111|     lxp|       bb|8888|2018-12-12 00:00:...|16000.00|30000.00|    40|
| 7369|   SMITH|    CLERK|7902|1980-12-17 00:00:...|  800.00|    null|    20|
| 7499|   ALLEN| SALESMAN|7698|1981-02-20 00:00:...| 1600.00|  300.00|    30|
| 7521|    WARD| SALESMAN|7698|1981-02-22 00:00:...| 1250.00|  500.00|    30|
| 7566|   JONES|  MANAGER|7839|1981-04-02 00:00:...| 2975.00|    null|    20|
| 7654|  MARTIN| SALESMAN|7698|1981-09-28 00:00:...| 1250.00| 1400.00|    30|
| 7698|   BLAKE|  MANAGER|7839|1981-05-01 00:00:...| 2850.00|    null|    30|
| 7782|   CLARK|  MANAGER|7839|1981-06-09 00:00:...| 2450.00|    null|    10|
| 7788|   SCOTT|  ANALYST|7566|1982-12-09 00:00:...| 3000.00|    null|    20|
| 7839|    KING|PRESIDENT|null|1981-11-17 00:00:...| 5000.00|    null|    10|
| 7844|  TURNER| SALESMAN|7698|1981-09-08 00:00:...| 1500.00|    0.00|    30|
| 7876|   ADAMS|    CLERK|7788|1983-01-12 00:00:...| 1100.00|    null|    20|
| 7900|   JAMES|    CLERK|7698|1981-12-03 00:00:...|  950.00|    null|    30|
| 7902|    FORD|  ANALYST|7566|1981-12-03 00:00:...| 3000.00|    null|    20|
| 7934|  MILLER|    CLERK|7782|1982-01-23 00:00:...| 1300.00|    null|    10|
| 8888|xiaoming|     null|null|                null|    null|    null|  null|
| 9999|     lxp|       bb|9999|2018-12-12 00:00:...|16000.00|30000.00|    40|
+-----+--------+---------+----+--------------------+--------+--------+------+

  • Writing to MySQL
scala>  val mysqlDF = peopleDF.write.format("jdbc").option("url", "jdbc:mysql://localhost:3306").option("dbtable", "sqoop.spark_people").option("user", "root").option("password", "123456").option("driver", "com.mysql.jdbc.Driver").save()
mysqlDF: Unit = ()

mysql> select * from spark_people;
+------+---------+
| age  | name    |
+------+---------+
| NULL | Michael |
|   30 | Andy    |
|   19 | Justin  |
+------+---------+
3 rows in set (0.00 sec)
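
By default write.save() fails if the JDBC target table already exists; a save mode controls that behaviour. A sketch reusing the connection options from the example above:

import org.apache.spark.sql.SaveMode

peopleDF.write
  .format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306")
  .option("dbtable", "sqoop.spark_people")
  .option("user", "root")
  .option("password", "123456")
  .option("driver", "com.mysql.jdbc.Driver")
  .mode(SaveMode.Append)          // or SaveMode.Overwrite to replace the table contents
  .save()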

