Contents
- Text file input and output
- JSON/CSV file input and output
- SequenceFile input and output
- Saving a SequenceFile
- Reading a SequenceFile
- Object file input and output
- Hadoop input and output
- MySQL input and output
Text file input and output
Reading a text file
scala> sc.textFile("./wc.txt")
res4: org.apache.spark.rdd.RDD[String] = ./wc.txt MapPartitionsRDD[5] at textFile at <console>:25
Saving a text file
scala> res4.saveAsTextFile("./test")
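Note that textFile is lazy: the RDD above is only materialized when an action such as count or collect runs. saveAsTextFile also writes a directory (./test here) containing part-* files rather than a single file, so the result can be read back with a wildcard path. A minimal round trip, assuming the ./test directory written above:
scala> sc.textFile("./test/part-*").count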
JSON/CSV file input and output
Input and output for these structured formats still goes through plain text file input and output. Spark Core has no built-in support for parsing or serializing JSON or CSV; that logic has to be written by the user to fit the data. Note: if a JSON file is to be read across multiple partitions, it should generally contain one JSON object per line. If a JSON record spans multiple lines, the whole file must be read in as a unit and parsed as a whole.
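A minimal sketch of this do-it-yourself parsing, assuming a one-JSON-object-per-line file named people.json and a comma-separated people.csv (the file names are illustrative, and scala.util.parsing.json is used only because it needs no extra setup in older Scala versions; a library such as json4s or Jackson is more common in practice):
import scala.util.parsing.json.JSON

// One complete JSON object per line, so each line can be parsed independently
val jsonLines = sc.textFile("./people.json")
val parsed = jsonLines.map(line => JSON.parseFull(line))   // Option[Any] per line
parsed.collect.foreach(println)

// CSV works the same way: read as text, then split each line by hand
val csvLines = sc.textFile("./people.csv")
val fields = csvLines.map(_.split(",").map(_.trim))
fields.collect.foreach(arr => println(arr.mkString("|")))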
SequenceFile input and output
Saving a SequenceFile
scala> val rdd =sc.parallelize(List((1,"aa"),(2,"bb"),(4,"cc")))
rdd: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[7] at parallelize at <console>:24
scala> rdd.saveAsSequenceFile("./seq")
Reading a SequenceFile
scala> sc.sequenceFile[Int,String]("./seq").collect
res8: Array[(Int, String)] = Array((2,bb), (4,cc), (1,aa))
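saveAsSequenceFile is only defined for RDDs of key-value pairs, and the key and value types must be convertible to Hadoop Writable types (Int, String and the other common primitives are converted implicitly). When reading, the type parameters select the converters, and an optional minPartitions argument controls how many splits are requested; a sketch, with the partition count chosen only for illustration:
scala> sc.sequenceFile[Int,String]("./seq", 2).collect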
Object file input and output
Saving an object file
scala> val rdd =sc.parallelize(List((1,"aa"),(2,"bb"),(4,"cc")))
rdd: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[11] at parallelize at <console>:24
scala> rdd.saveAsObjectFile("./obj")
Reading an object file
scala> sc.objectFile[(Int,String)]("obj").collect
res10: Array[(Int, String)] = Array((2,bb), (4,cc), (1,aa))
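Object files are written with standard Java serialization, so the element type is not limited to key-value pairs; any RDD of serializable elements can be saved this way, at the cost of tying the output to the Java serialized form of the classes. A minimal sketch with a plain Int RDD (the path is illustrative):
scala> sc.parallelize(1 to 5).saveAsObjectFile("./intObj")
scala> sc.objectFile[Int]("./intObj").collect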
Hadoop input and output
Reading from Hadoop
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.{SparkConf, SparkContext}

object ReadHadoopFile {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("HadoopFileApp")
    val sc = new SparkContext(sparkConf)
    // Read with the new Hadoop API: the key is the byte offset of each line, the value is the line text
    val input = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](
      "/output/part*",
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text])
    println("Number of records: " + input.count)
    input.foreach(print(_))
    input.first   // triggers one more action; the result is not used here
    sc.stop()
  }
}
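Spark also keeps a parallel method for the old mapred API, hadoopFile; a minimal sketch reading the same path (note that TextInputFormat must now come from org.apache.hadoop.mapred):
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat

val oldApiInput = sc.hadoopFile[LongWritable, Text, TextInputFormat]("/output/part*")
println(oldApiInput.count)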
Saving to Hadoop
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.spark.{SparkConf, SparkContext}

object WriteHadoopFile {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("HadoopFileApp")
    val sc = new SparkContext(sparkConf)
    val initialRDD = sc.parallelize(Array(("hadoop", 30), ("hive", 71), ("cat", 11)))
    // Write with the old Hadoop API: key class, value class, then the OutputFormat
    initialRDD.saveAsHadoopFile("/output/",
      classOf[Text],
      classOf[LongWritable],
      classOf[TextOutputFormat[Text, LongWritable]])
    sc.stop()
  }
}
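The new-API counterpart is saveAsNewAPIHadoopFile, whose OutputFormat comes from org.apache.hadoop.mapreduce; a minimal sketch under the same assumptions, with an illustrative output path and IntWritable chosen to match the Int values:
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat

initialRDD.saveAsNewAPIHadoopFile("/output-new/",
  classOf[Text],
  classOf[IntWritable],
  classOf[TextOutputFormat[Text, IntWritable]])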
MySQL input and output
Reading data from MySQL
import java.sql.DriverManager
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}

object ReadMySQL {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("MySQLApp")
    val sc = new SparkContext(sparkConf)
    val rdd = new JdbcRDD(
      sc,
      () => {
        // Connection factory: runs on the executors, once per partition
        Class.forName("com.mysql.jdbc.Driver").newInstance()
        DriverManager.getConnection("jdbc:mysql://linux01:3306/company", "root", "123456")
      },
      "select * from staff where id >= ? and id <= ?;",
      1,    // lower bound of the id range
      100,  // upper bound of the id range
      1,    // number of partitions
      r => (r.getString(1), r.getString(2), r.getString(3))).cache()
    println(rdd.count())
    rdd.foreach(println(_))
    sc.stop()
  }
}
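The three numeric arguments are the lower bound, upper bound, and partition count: JdbcRDD splits the [1, 100] id range into that many sub-ranges and binds each sub-range's endpoints to the two ? placeholders, so the query must contain exactly two bind parameters on a numeric column for this partitioning to work.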
Writing data to MySQL
import java.sql.DriverManager
import org.apache.spark.{SparkConf, SparkContext}

object WriteMySQL {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("MySQLApp")
    val sc = new SparkContext(sparkConf)
    val data = sc.parallelize(List(("Irelia", "Female"), ("Ezreal", "Male"), ("Alistar", "Female")))
    // One connection per partition, created on the executors inside insertData
    data.foreachPartition(insertData)
    sc.stop()
  }

  def insertData(iterator: Iterator[(String, String)]): Unit = {
    val conn = DriverManager.getConnection("jdbc:mysql://linux01:3306/company", "root", "123456")
    iterator.foreach(data => {
      val ps = conn.prepareStatement("insert into staff(name, sex) values (?, ?)")
      ps.setString(1, data._1)
      ps.setString(2, data._2)
      ps.executeUpdate()
      ps.close()
    })
    conn.close()
  }
}
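The write goes through foreachPartition so that each partition opens a single JDBC connection instead of one per record, and the connection is created inside insertData on the executors because java.sql.Connection is not serializable and cannot be shipped from the driver. A further refinement, sketched here under the same table and column names, is to reuse one PreparedStatement per partition and batch the rows with the standard JDBC addBatch/executeBatch calls:
def insertDataBatched(iterator: Iterator[(String, String)]): Unit = {
  val conn = DriverManager.getConnection("jdbc:mysql://linux01:3306/company", "root", "123456")
  val ps = conn.prepareStatement("insert into staff(name, sex) values (?, ?)")
  iterator.foreach { case (name, sex) =>
    ps.setString(1, name)
    ps.setString(2, sex)
    ps.addBatch()        // queue the row instead of executing it immediately
  }
  ps.executeBatch()      // send all queued rows in one round trip
  ps.close()
  conn.close()
}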