[hadoop@hadoop001 spark-test-data]$ cat wc.txt
hello,world,hello
hello,world
welcome
scala> val file = sc.textFile("file:///home/hadoop/data/spark-test-data/wc.txt")
file: org.apache.spark.rdd.RDD[String] = file:///home/hadoop/data/spark-test-data/wc.txt MapPartitionsRDD[13] at textFile at <console>:24
scala> val words=file.flatMap(lines=>lines.split(","))
words: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[14] at flatMap at <console>:30
scala> val wordsM=words.map(word=>(word,1))
wordsM: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[15] at map at <console>:32
scala> val wordsc=wordsM.reduceByKey((x,y)=>x+y)
wordsc: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[16] at reduceByKey at <console>:34
scala> wordsc.collect
res5: Array[(String, Int)] = Array((hello,3), (welcome,1), (world,2))
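The same word count can be written as one chained expression (a minimal sketch; note that collect() pulls all results to the driver, so it is only suitable for small outputs):

// Chained form of the word count above.
val counts = sc.textFile("file:///home/hadoop/data/spark-test-data/wc.txt")
  .flatMap(_.split(","))
  .map((_, 1))
  .reduceByKey(_ + _)
counts.collect().foreach(println)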
[hadoop@hadoop001 spark-test-data]$ cat test2.json
{"name":"Yin", "age":18,"address":{"city":"Columbus","state":"Ohio"}}
{"name":"Michael","age":16, "address":{"city":null, "state":"California"}}
[hadoop@hadoop001 spark-test-data]$
scala> val testDf=spark.read.format("json").load("/home/hadoop/data/spark-test-data/test2.json")
testDf: org.apache.spark.sql.DataFrame = [address: struct<city: string, state: string>, age: bigint ... 1 more field]
scala> testDf.printSchema
root
|-- address: struct (nullable = true)
| |-- city: string (nullable = true)
| |-- state: string (nullable = true)
|-- age: long (nullable = true)
|-- name: string (nullable = true)
scala> testDf.show
+-----------------+---+-------+
| address|age| name|
+-----------------+---+-------+
| [Columbus,Ohio]| 18| Yin|
|[null,California]| 16|Michael|
+-----------------+---+-------+
scala> testDf.select("name")
res17: org.apache.spark.sql.DataFrame = [name: string]
scala> testDf.select("name").show
+-------+
| name|
+-------+
| Yin|
|Michael|
+-------+
scala> testDf.createOrReplaceTempView("people")
scala> val sqlDF = spark.sql("SELECT * FROM people")
sqlDF: org.apache.spark.sql.DataFrame = [address: struct<city: string, state: string>, age: bigint ... 1 more field]
scala> sqlDF.show
+-----------------+---+-------+
| address|age| name|
+-----------------+---+-------+
| [Columbus,Ohio]| 18| Yin|
|[null,California]| 16|Michael|
+-----------------+---+-------+
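Nested fields of the address struct can be pulled out with dot notation, either on the DataFrame or in SQL against the temp view registered above (a minimal sketch):

// Select a top-level column and a nested struct field.
testDf.select("name", "address.city").show()
// Equivalent SQL against the "people" temp view.
spark.sql("SELECT name, address.city FROM people").show()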
scala>
RDD => DataFrame: the reflection approach (case class)
[hadoop@hadoop001 spark-test-data]$ cat student.data
1|Burke|1-300-746-8446|[email protected]
2|Kamal|1-668-571-5046|[email protected]
3|Olga|1-956-311-1686|[email protected]
4|Belle|1-246-894-6340|[email protected]
5|Trevor|1-300-527-4967|[email protected]
6|Laurel|1-691-379-9921|[email protected]
7|Sara|1-608-140-1995|[email protected]
8|Kaseem|1-881-586-2689|[email protected]
9|Lev|1-916-367-5608|[email protected]
10|Maya|1-271-683-2698|[email protected]
11|Emi|1-467-270-1337|[email protected]
12|Caleb|1-683-212-0896|[email protected]
13|Florence|1-603-575-2444|[email protected]
14|Anika|1-856-828-7883|[email protected]
15|Tarik|1-398-171-2268|[email protected]
16|Amena|1-878-250-3129|[email protected]
17|Blossom|1-154-406-9596|[email protected]
18|Guy|1-869-521-3230|[email protected]
19|Malachi|1-608-637-2772|[email protected]
20|Edward|1-711-710-6552|[email protected]
21||1-711-710-6552|[email protected]
22||1-711-710-6552|[email protected]
23|NULL|1-711-710-6552|[email protected]
scala> val rdd = spark.sparkContext.textFile("file:///home/hadoop/data/spark-test-data/student.data")
rdd: org.apache.spark.rdd.RDD[String] = file:///home/hadoop/data/spark-test-data/student.data MapPartitionsRDD[72] at textFile at <console>:23
scala> val studentDs=rdd.map(lines=>lines.split("|"))
studentDs: org.apache.spark.sql.Dataset[Array[String]] = [value: array<string>]
scala> case class Student(id:Int,name:String,phone:String,emal:String)
defined class Student
scala> val studentDfF = studentDs.map(info=>Student(info(0).toInt,info(1),info(2),info(3))).toDF()
studentDfF: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]
scala> studentDfF.printSchema
root
|-- id: integer (nullable = true)
|-- name: string (nullable = true)
|-- phone: string (nullable = true)
|-- emal: string (nullable = true)
scala> studentDfF.show
+---+----+-----+----+
| id|name|phone|emal|
+---+----+-----+----+
| 1| || B| u|
| 2| || K| a|
| 3| || O| l|
| 4| || B| e|
| 5| || T| r|
| 6| || L| a|
| 7| || S| a|
| 8| || K| a|
| 9| || L| e|
| 1| 0| || M|
| 1| 1| || E|
| 1| 2| || C|
| 1| 3| || F|
| 1| 4| || A|
| 1| 5| || T|
| 1| 6| || A|
| 1| 7| || B|
| 1| 8| || G|
| 1| 9| || M|
| 2| 0| || E|
+---+----+-----+----+
only showing top 20 rows
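The mangled rows above are caused by String.split taking a regular expression: an unescaped "|" is the regex alternation operator with empty branches, so it matches at every position and the line is split into single characters. Escaping it fixes the problem, as the corrected run below shows; a quick illustration:

// "|" is a regex metacharacter, so this splits at every position (one element per character):
"1|Burke|1-300-746-8446".split("|")    // Array(1, |, B, u, r, k, e, ...)
// Escape it (or use java.util.regex.Pattern.quote("|")) to split on the literal pipe:
"1|Burke|1-300-746-8446".split("\\|")  // Array(1, Burke, 1-300-746-8446)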
scala> val studentDs=rdd.map(lines=>lines.split("\\|"))
studentDs: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[10] at map at <console>:28
scala> val studentDF = studentDs.map(info=>Student(info(0).toInt,info(1),info(2),info(3))).toDF()
studentDF: org.apache.spark.sql.DataFrame = [id: int, name: string ... 2 more fields]
scala> studentDF.show
+---+--------+--------------+--------------------+
| id| name| phone| emal|
+---+--------+--------------+--------------------+
| 1| Burke|1-300-746-8446|ullamcorper.velit...|
| 2| Kamal|1-668-571-5046|pede.Suspendisse@...|
| 3| Olga|1-956-311-1686|Aenean.eget.metus...|
| 4| Belle|1-246-894-6340|vitae.aliquet.nec...|
| 5| Trevor|1-300-527-4967|dapibus.id@acturp...|
| 6| Laurel|1-691-379-9921|adipiscing@consec...|
| 7| Sara|1-608-140-1995|Donec.nibh@enimEt...|
| 8| Kaseem|1-881-586-2689|cursus.et.magna@e...|
| 9| Lev|1-916-367-5608|Vivamus.nisi@ipsu...|
| 10| Maya|1-271-683-2698|accumsan.convalli...|
| 11| Emi|1-467-270-1337| [email protected]|
| 12| Caleb|1-683-212-0896|Suspendisse@Quisq...|
| 13|Florence|1-603-575-2444|sit.amet.dapibus@...|
| 14| Anika|1-856-828-7883|euismod@ligulaeli...|
| 15| Tarik|1-398-171-2268|[email protected]|
| 16| Amena|1-878-250-3129|lorem.luctus.ut@s...|
| 17| Blossom|1-154-406-9596|Nunc.commodo.auct...|
| 18| Guy|1-869-521-3230|senectus.et.netus...|
| 19| Malachi|1-608-637-2772|Proin.mi.Aliquam@...|
| 20| Edward|1-711-710-6552|lectus@aliquetlib...|
+---+--------+--------------+--------------------+
only showing top 20 rows
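With the Student case class already defined, the same lines can also be mapped into a typed Dataset[Student] instead of a DataFrame (a minimal sketch; toDS() comes from spark.implicits._, which spark-shell imports automatically):

// Build a typed Dataset and use a typed filter on it.
val studentDS = rdd
  .map(_.split("\\|"))
  .map(info => Student(info(0).toInt, info(1), info(2), info(3)))
  .toDS()
studentDS.filter(_.id <= 5).show()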
RDD => DataFrame: the programmatic approach (specifying the schema)
[hadoop@hadoop001 spark-test-data]$ cat people.txt
Burke ,12
Kamal ,13
Olga ,14
Belle ,15
Trevor ,16
Laurel ,17
Sara ,18
Kaseem ,19
Lev ,20
Maya ,21
Emi ,22
Caleb ,23
Florence ,24
Anika ,25
Tarik ,26
Amena ,27
Blossom ,28
Guy ,29
Malachi ,30
Edward ,31
scala> val rdd = spark.sparkContext.textFile("file:///home/hadoop/data/spark-test-data/people.txt")
rdd: org.apache.spark.rdd.RDD[String] = file:///home/hadoop/data/spark-test-data/people.txt MapPartitionsRDD[16] at textFile at <console>:26
scala> import org.apache.spark.sql.types._
import org.apache.spark.sql.types._
scala> import org.apache.spark.sql.Row
import org.apache.spark.sql.Row
scala> val rowRDD = rdd.map(_.split(",")).map(attributes => Row(attributes(0).trim, attributes(1).trim.toInt))
rowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[33] at map at <console>:33
scala> val schema = StructType(Array(StructField("name", StringType, true),
| StructField("age", IntegerType, true)))
schema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(age,IntegerType,true))
scala> val peopleRDD = spark.createDataFrame(rowRDD,schema)
peopleRDD: org.apache.spark.sql.DataFrame = [name: string, age: int]
scala> peopleRDD.show
+--------+---+
| name|age|
+--------+---+
| Burke| 12|
| Kamal| 13|
| Olga| 14|
| Belle| 15|
| Trevor| 16|
| Laurel| 17|
| Sara| 18|
| Kaseem| 19|
| Lev| 20|
| Maya| 21|
| Emi| 22|
| Caleb| 23|
|Florence| 24|
| Anika| 25|
| Tarik| 26|
| Amena| 27|
| Blossom| 28|
| Guy| 29|
| Malachi| 30|
| Edward| 31|
+--------+---+
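The schema can also be built from a DDL-style string instead of spelling out each StructField, which is convenient for wide tables (a sketch assuming Spark 2.3+, where StructType.fromDDL is available; schema2/peopleDF2 are illustrative names):

import org.apache.spark.sql.types.StructType

// Same schema as above, expressed as a DDL string.
val schema2 = StructType.fromDDL("name STRING, age INT")
val peopleDF2 = spark.createDataFrame(rowRDD, schema2)
peopleDF2.printSchema()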
scala> val userDF=spark.read.load("/home/hadoop/data/spark-test-data/resources/users.parquet")
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
userDF: org.apache.spark.sql.DataFrame = [name: string, favorite_color: string ... 1 more field]
scala> userDF.printSchema
root
|-- name: string (nullable = true)
|-- favorite_color: string (nullable = true)
|-- favorite_numbers: array (nullable = true)
| |-- element: integer (containsNull = true)
scala> userDF.show
18/11/08 02:34:45 WARN ParquetRecordReader: Can not initialize counter due to context is not a instance of TaskInputOutputContext, but is org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
+------+--------------+----------------+
| name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa| null| [3, 9, 15, 20]|
| Ben| red| []|
+------+--------------+----------------+
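Selected columns can be written straight back out in Parquet, the default format (a sketch; the output path is illustrative):

// Keep only two columns and write them as Parquet.
userDF.select("name", "favorite_color")
  .write
  .mode("overwrite")
  .parquet("file:///home/hadoop/data/spark-test-data/output/namesAndFavColors.parquet")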
scala> val salesDFCsv = spark.read.format("csv").option("sep", ",").option("inferSchema", "true").option("header", "true").load("/home/hadoop/data/spark-test-data/resources/sales.csv")
salesDFCsv: org.apache.spark.sql.DataFrame = [transactionId: int, customerId: int ... 2 more fields]
scala> salesDFCsv.show
+-------------+----------+------+----------+
|transactionId|customerId|itemId|amountPaid|
+-------------+----------+------+----------+
| 111| 1| 1| 100.0|
| 112| 2| 2| 505.0|
| 113| 3| 3| 510.0|
| 114| 4| 4| 600.0|
| 115| 1| 2| 500.0|
| 116| 1| 2| 500.0|
| 117| 1| 2| 500.0|
| 118| 1| 2| 500.0|
| 119| 2| 3| 500.0|
| 120| 1| 2| 500.0|
| 121| 1| 4| 500.0|
| 122| 1| 2| 500.0|
| 123| 1| 4| 500.0|
| 124| 1| 2| 500.0|
+-------------+----------+------+----------+
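Writing CSV works the same way, with the separator and header controlled by options (a sketch; the output path is illustrative):

// Write the sales DataFrame back out as CSV with a header row.
salesDFCsv.write
  .format("csv")
  .option("sep", ",")
  .option("header", "true")
  .mode("overwrite")
  .save("file:///home/hadoop/data/spark-test-data/output/sales_out")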
Add the following line to spark-env.sh under $SPARK_HOME/conf:
export SPARK_CLASSPATH=$HIVE_HOME/lib/mysql-connector-java-5.1.47.jar
Then copy hive-site.xml into $SPARK_HOME/conf.
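spark-shell already provides a Hive-enabled SparkSession when Spark is built with Hive support; in a standalone application you would build it yourself, roughly as follows (a sketch; the app name is illustrative):

import org.apache.spark.sql.SparkSession

// Hive support must be enabled explicitly outside spark-shell;
// hive-site.xml on the classpath points Spark at the right metastore.
val spark = SparkSession.builder()
  .appName("HiveAccessExample")
  .enableHiveSupport()
  .getOrCreate()
spark.table("emp_p").show()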
scala> val hiveDF=spark.table("emp_p")
hiveDF: org.apache.spark.sql.DataFrame = [empno: int, ename: string ... 6 more fields]
scala> hiveDF.show
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
+-----+------+---------+----+----------+-------+------+------+
|empno| ename| job| mgr| hiredate| sal| comm|deptno|
+-----+------+---------+----+----------+-------+------+------+
| 7369| SMITH| CLERK|7902|1980-12-17| 800.0| null| 20|
| 7499| ALLEN| SALESMAN|7698| 1981-2-20| 1600.0| 300.0| 30|
| 7521| WARD| SALESMAN|7698| 1981-2-22| 1250.0| 500.0| 30|
| 7566| JONES| MANAGER|7839| 1981-4-2| 2975.0| null| 20|
| 7654|MARTIN| SALESMAN|7698| 1981-9-28| 1250.0|1400.0| 30|
| 7698| BLAKE| MANAGER|7839| 1981-5-1| 2850.0| null| 30|
| 7782| CLARK| MANAGER|7839| 1981-6-9| 2450.0| null| 10|
| 7788| SCOTT| ANALYST|7566| 1987-4-19| 3000.0| null| 20|
| 7839| KING|PRESIDENT|null|1981-11-17| 5000.0| null| 10|
| 7844|TURNER| SALESMAN|7698| 1981-9-8| 1500.0| 0.0| 30|
| 7876| ADAMS| CLERK|7788| 1987-5-23| 1100.0| null| 20|
| 7900| JAMES| CLERK|7698| 1981-12-3| 950.0| null| 30|
| 7902| FORD| ANALYST|7566| 1981-12-3| 3000.0| null| 20|
| 7934|MILLER| CLERK|7782| 1982-1-23| 1300.0| null| 10|
| 8888| HIVE| PROGRAM|7839| 1988-1-23|10300.0| null| null|
+-----+------+---------+----+----------+-------+------+------+
The same query can be issued through spark.sql directly:
scala> spark.sql("select * from emp_p").show
+-----+------+---------+----+----------+-------+------+------+
|empno| ename| job| mgr| hiredate| sal| comm|deptno|
+-----+------+---------+----+----------+-------+------+------+
| 7369| SMITH| CLERK|7902|1980-12-17| 800.0| null| 20|
| 7499| ALLEN| SALESMAN|7698| 1981-2-20| 1600.0| 300.0| 30|
| 7521| WARD| SALESMAN|7698| 1981-2-22| 1250.0| 500.0| 30|
| 7566| JONES| MANAGER|7839| 1981-4-2| 2975.0| null| 20|
| 7654|MARTIN| SALESMAN|7698| 1981-9-28| 1250.0|1400.0| 30|
| 7698| BLAKE| MANAGER|7839| 1981-5-1| 2850.0| null| 30|
| 7782| CLARK| MANAGER|7839| 1981-6-9| 2450.0| null| 10|
| 7788| SCOTT| ANALYST|7566| 1987-4-19| 3000.0| null| 20|
| 7839| KING|PRESIDENT|null|1981-11-17| 5000.0| null| 10|
| 7844|TURNER| SALESMAN|7698| 1981-9-8| 1500.0| 0.0| 30|
| 7876| ADAMS| CLERK|7788| 1987-5-23| 1100.0| null| 20|
| 7900| JAMES| CLERK|7698| 1981-12-3| 950.0| null| 30|
| 7902| FORD| ANALYST|7566| 1981-12-3| 3000.0| null| 20|
| 7934|MILLER| CLERK|7782| 1982-1-23| 1300.0| null| 10|
| 8888| HIVE| PROGRAM|7839| 1988-1-23|10300.0| null| null|
+-----+------+---------+----+----------+-------+------+------+
scala> sql("select * from emp_p").show
+-----+------+---------+----+----------+-------+------+------+
|empno| ename| job| mgr| hiredate| sal| comm|deptno|
+-----+------+---------+----+----------+-------+------+------+
| 7369| SMITH| CLERK|7902|1980-12-17| 800.0| null| 20|
| 7499| ALLEN| SALESMAN|7698| 1981-2-20| 1600.0| 300.0| 30|
| 7521| WARD| SALESMAN|7698| 1981-2-22| 1250.0| 500.0| 30|
| 7566| JONES| MANAGER|7839| 1981-4-2| 2975.0| null| 20|
| 7654|MARTIN| SALESMAN|7698| 1981-9-28| 1250.0|1400.0| 30|
| 7698| BLAKE| MANAGER|7839| 1981-5-1| 2850.0| null| 30|
| 7782| CLARK| MANAGER|7839| 1981-6-9| 2450.0| null| 10|
| 7788| SCOTT| ANALYST|7566| 1987-4-19| 3000.0| null| 20|
| 7839| KING|PRESIDENT|null|1981-11-17| 5000.0| null| 10|
| 7844|TURNER| SALESMAN|7698| 1981-9-8| 1500.0| 0.0| 30|
| 7876| ADAMS| CLERK|7788| 1987-5-23| 1100.0| null| 20|
| 7900| JAMES| CLERK|7698| 1981-12-3| 950.0| null| 30|
| 7902| FORD| ANALYST|7566| 1981-12-3| 3000.0| null| 20|
| 7934|MILLER| CLERK|7782| 1982-1-23| 1300.0| null| 10|
| 8888| HIVE| PROGRAM|7839| 1988-1-23|10300.0| null| null|
+-----+------+---------+----+----------+-------+------+------+
Note: the default storage format is Parquet.
scala> val usersPath = "file:///home/hadoop/data/spark-test-data/resources/users.parquet"
usersPath: String = file:///home/hadoop/data/spark-test-data/resources/users.parquet
scala> val usersDF = spark.read.load(usersPath)
scala> usersDF.write.format("parquet").saveAsTable("ruoze_d5.namesPartByColor3")
18/11/08 07:53:25 WARN ParquetRecordReader: Can not initialize counter due to context is not a instance of TaskInputOutputContext, but is org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
hive (ruoze_d5)> select * from namespartbycolor3;
OK
namespartbycolor3.name namespartbycolor3.favorite_color namespartbycolor3.favorite_numbers
Alyssa NULL [3,9,15,20]
Ben red NULL
Time taken: 0.771 seconds, Fetched: 2 row(s)
When Spark writes to Hive, the target database must match the one configured in hive-site.xml; otherwise, even after the write completes, the table shows up in Hive but contains no data.
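Note that the peopleDF written below is not built from the people.txt shown earlier: judging by the age=19/30/null partitions and the MySQL rows further down, it presumably comes from the people.json shipped with the Spark examples. A sketch of how such a DataFrame could be loaded (the path is hypothetical and depends on the Spark installation):

// people.json from the Spark examples contains Michael (no age), Andy (30) and Justin (19).
val peopleDF = spark.read.json("file:///home/hadoop/app/spark/examples/src/main/resources/people.json")
peopleDF.show()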
scala> peopleDF.write.partitionBy("age").saveAsTable("ruoze_d5.spark_people")
[hadoop@hadoop001 ~]$ hadoop fs -ls /user/hive/warehouse/ruoze_d5.db/spark_people
18/11/08 10:51:13 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 4 items
-rw-r--r-- 3 hadoop supergroup 0 2018-11-08 10:49 /user/hive/warehouse/ruoze_d5.db/spark_people/_SUCCESS
drwxr-xr-x - hadoop supergroup 0 2018-11-08 10:48 /user/hive/warehouse/ruoze_d5.db/spark_people/age=19
drwxr-xr-x - hadoop supergroup 0 2018-11-08 10:49 /user/hive/warehouse/ruoze_d5.db/spark_people/age=30
drwxr-xr-x - hadoop supergroup 0 2018-11-08 10:48 /user/hive/warehouse/ruoze_d5.db/spark_people/age=__HIVE_DEFAULT_PARTITION__
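When the data is read back, Spark discovers the age=... partition directories automatically, whether you go through the table name or read the warehouse path directly (a minimal sketch):

// Read via the Hive table name...
spark.table("ruoze_d5.spark_people").show()
// ...or straight from the partitioned directory; the age column is inferred from the paths.
spark.read.parquet("/user/hive/warehouse/ruoze_d5.db/spark_people").show()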
scala> val mysqlDF = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306").option("dbtable", "sqoop.emp").option("user", "root").option("password", "123456").option("driver", "com.mysql.jdbc.Driver").load()
mysqlDF: org.apache.spark.sql.DataFrame = [empno: decimal(4,0), ename: string ... 6 more fields]
scala> mysqlDF.show
+-----+--------+---------+----+--------------------+--------+--------+------+
|empno| ename| job| mgr| hiredate| sal| comm|deptno|
+-----+--------+---------+----+--------------------+--------+--------+------+
| 1111| lxp| bb|8888|2018-12-12 00:00:...|16000.00|30000.00| 40|
| 7369| SMITH| CLERK|7902|1980-12-17 00:00:...| 800.00| null| 20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20 00:00:...| 1600.00| 300.00| 30|
| 7521| WARD| SALESMAN|7698|1981-02-22 00:00:...| 1250.00| 500.00| 30|
| 7566| JONES| MANAGER|7839|1981-04-02 00:00:...| 2975.00| null| 20|
| 7654| MARTIN| SALESMAN|7698|1981-09-28 00:00:...| 1250.00| 1400.00| 30|
| 7698| BLAKE| MANAGER|7839|1981-05-01 00:00:...| 2850.00| null| 30|
| 7782| CLARK| MANAGER|7839|1981-06-09 00:00:...| 2450.00| null| 10|
| 7788| SCOTT| ANALYST|7566|1982-12-09 00:00:...| 3000.00| null| 20|
| 7839| KING|PRESIDENT|null|1981-11-17 00:00:...| 5000.00| null| 10|
| 7844| TURNER| SALESMAN|7698|1981-09-08 00:00:...| 1500.00| 0.00| 30|
| 7876| ADAMS| CLERK|7788|1983-01-12 00:00:...| 1100.00| null| 20|
| 7900| JAMES| CLERK|7698|1981-12-03 00:00:...| 950.00| null| 30|
| 7902| FORD| ANALYST|7566|1981-12-03 00:00:...| 3000.00| null| 20|
| 7934| MILLER| CLERK|7782|1982-01-23 00:00:...| 1300.00| null| 10|
| 8888|xiaoming| null|null| null| null| null| null|
| 9999| lxp| bb|9999|2018-12-12 00:00:...|16000.00|30000.00| 40|
+-----+--------+---------+----+--------------------+--------+--------+------+
scala> peopleDF.write.format("jdbc").option("url", "jdbc:mysql://localhost:3306").option("dbtable", "sqoop.spark_people").option("user", "root").option("password", "123456").option("driver", "com.mysql.jdbc.Driver").save()
mysql> select * from spark_people;
+------+---------+
| age | name |
+------+---------+
| NULL | Michael |
| 30 | Andy |
| 19 | Justin |
+------+---------+
3 rows in set (0.00 sec)
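By default the JDBC writer fails if the target table already exists; a save mode controls that behavior (a sketch reusing the connection options above):

import org.apache.spark.sql.SaveMode

// Append to (or overwrite) an existing MySQL table instead of failing when it exists.
peopleDF.write
  .format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306")
  .option("dbtable", "sqoop.spark_people")
  .option("user", "root")
  .option("password", "123456")
  .option("driver", "com.mysql.jdbc.Driver")
  .mode(SaveMode.Append)   // or SaveMode.Overwrite
  .save()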