val part = sc.textFile("file:///root/software/spark-2.2.0-bin-hadoop2.6/README.md")
val part = sc.textFile("hdfs://master:8020/sougou/README.md")
<=> (equivalent to)
val part = sc.textFile("/sougou/README.md")
Notes on the textFile argument (see the sketch below):
If the argument is a file in the local file system, that file is loaded;
If the argument is a directory in the local file system, every file under that directory is loaded.
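A minimal sketch of the two cases, assuming a hypothetical local directory /root/data/logs that contains several text files:
// Hypothetical paths, for illustration only
val oneFile = sc.textFile("file:///root/data/logs/app.log")   // loads a single file
val allFiles = sc.textFile("file:///root/data/logs")          // loads every file in the directory
allFiles.count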
parallelize
val rdd1 = sc.parallelize(1 to 10)
// or create it from a List
val list = List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
val rdd1 = sc.parallelize(list)
val rdd = sc.makeRDD(1 to 4, 1)
var rdd = sc.makeRDD(Array(("A","1"),("B","1"),("C","3")), 2)
val rdd = sc.makeRDD(Seq(10,4,2,12,3))
filter
val lines = sc.textFile("hdfs://master:8020/sougou/README.md")
val linesWithSpark = lines.filter(x => x.contains("Spark"))
map
val data = 1 to 10
val rdd1 = sc.parallelize(data, 5)
val rdd2 = rdd1.map( x => x + 10)
rdd2.collect
val lines = sc.textFile("hdfs://master:8020/sougou/README.md")
val linesSplit = lines.map(_.split(" "))
linesSplit.collect
flatMap
val lines = sc.textFile("hdfs://master:8020/sougou/README.md")
val linesSplit = lines.flatMap(_.split(" "))
linesSplit.collect
groupByKey
val rdd = sc.makeRDD(Array(("A",1),("A",0),("B",1),("B",2),("C",3)), 2)
val groups = rdd.groupByKey()
groups.collect
reduceByKey
val rdd = sc.makeRDD(Array(("A",1),("A",0),("B",1),("B",2),("C",3)), 2)
val reduces = rdd.reduceByKey((a,b)=>a+b)
reduces.collect
reduce
val rdd = sc.makeRDD(Array(("A",1),("A",0),("B",1),("B",2),("C",3)), 2)
rdd.reduce((a, b)=>(a._1+b._1, a._2 + b._2))
val lines = sc.textFile("hdfs://master:8020/sougou/README.md")
val linesLen = lines.map(x => x.length)
linesLen.reduce((x,y)=>x+y)
rdd.persist(StorageLevel.MEMORY_ONLY) <=> rdd.cache()  (requires import org.apache.spark.storage.StorageLevel)
rdd.unpersist()
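A minimal caching sketch for the spark-shell, reusing the README.md path from the earlier examples:
import org.apache.spark.storage.StorageLevel
val lines = sc.textFile("hdfs://master:8020/sougou/README.md")
lines.persist(StorageLevel.MEMORY_ONLY)   // same effect as lines.cache()
lines.count                               // the first action materializes and caches the RDD
lines.count                               // subsequent actions read the cached blocks
lines.unpersist()                         // release the cached blocks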
Method 1: specify the number of partitions when the RDD is created;
Method 2: reset the number of partitions with repartition;
Method 3: use one of the built-in partitioners, HashPartitioner or RangePartitioner;
Method 4: define a custom partitioner by extending org.apache.spark.Partitioner.
Methods 1-3 are sketched below; method 4 is shown in the example that follows.
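A minimal sketch of methods 1-3; the partition counts are only for illustration:
import org.apache.spark.HashPartitioner
// Method 1: the number of partitions is given at creation time
val rdd = sc.parallelize(1 to 100, 4)
rdd.partitions.size                      // 4
// Method 2: repartition to a different number of partitions
val rdd8 = rdd.repartition(8)
rdd8.partitions.size                     // 8
// Method 3: apply a built-in partitioner to a key-value RDD
val pairs = rdd.map(x => (x, 1)).partitionBy(new HashPartitioner(3))
pairs.partitions.size                    // 3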
An example of a user-defined partitioner (method 4) for a Spark RDD:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.Partitioner

// Custom partitioner: routes each key to a partition based on its integer value
class UdfPartitioners(numParts: Int) extends Partitioner {
  override def numPartitions: Int = numParts
  override def getPartition(key: Any): Int = {
    key.toString.toInt % numParts
  }
}

object UdfPartitioners {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("UdfPartitioners")
    val sc = new SparkContext(conf)
    val data = sc.parallelize(1 to 10, 5)
    data.map((_, 1)).
      partitionBy(new UdfPartitioners(10)).
      map(_._1).
      saveAsTextFile("file:///root/partitions")
  }
}
A key-value pair RDD is an RDD in which every element is a (key, value) pair.
val lines = sc.textFile("hdfs://master:8020/sougou/README.md")
val linesKV = lines.flatMap(_.split(" ")).map(x=>(x,1)).reduceByKey((x,y)=>x+y).take(10)
linesKV.foreach(println)
val list = List('A','B','C','D','E')
val rdd1 = sc.parallelize(list)
val rdd2 = rdd1.map(x=>(x, 1))
rdd2.foreach(println)
reduceByKey(func): merges the values that share the same key using the function func;
groupByKey(): groups the values that share the same key;
reduceByKey is equivalent to the combination of groupByKey and map. For example:
val list = List("AB","BB","BC","CD","AE")
val rdd1 = sc.parallelize(list)
val rdd2 = rdd1.map(x=>(x, 1))
rdd2.reduceByKey(_ + _).foreach(println)
rdd2.groupByKey().map(x => (x._1, x._2.sum)).foreach(println)
keys: returns the keys of a key-value RDD;
values: returns the values of a key-value RDD;
sortByKey: returns an RDD sorted by key, ascending by default; sortByKey(false) sorts in descending order;
sortBy: can sort either by key or by value, as in the following example (a keys/values sketch follows it):
val list = Array(("AB",23),("BB",16),("BC",18),("CD",34),("AE",98))
val rdd1 = sc.parallelize(list)
rdd1.sortBy(_._1, true).foreach(println)
rdd1.sortByKey().foreach(println)
rdd1.sortBy(_._2, false).foreach(println)
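A quick sketch of keys and values on the same rdd1 (print order may vary across partitions):
rdd1.keys.foreach(println)     // the keys: AB, BB, BC, CD, AE
rdd1.values.foreach(println)   // the values: 23, 16, 18, 34, 98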
mapValues(func): applies a function to every value of a key-value RDD while leaving the keys unchanged:
val list = Array(("AB",23),("BB",16),("BC",18),("CD",34),("AE",98))
val rdd1 = sc.parallelize(list)
rdd1.mapValues(x => x * 10).foreach(println)
join(): an inner join; for two input datasets of types (K, V1) and (K, V2), only the keys that appear in both datasets are kept,
producing a dataset of type (K, (V1, V2));
val rdd1 = sc.parallelize(Array(("spark",23),("hadoop",16),("hadoop",18)))
val rdd2 = sc.parallelize(Array(("spark",24),("fast",34)))
val rdd3 = rdd1.join(rdd2)
rdd3.foreach(println)
combineByKey:
combineByKey(createCombiner, mergeValue, mergeCombiners, partitioner, mapSideCombine)
Suppose we have some sales data in key-value form, i.e. <company, monthly income>, and we want to use combineByKey to compute each company's total income and its average monthly income. The implementation is as follows:
package sparkstudy.StatisticWord

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object CombineByKey {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("CombineByKey")
    val sc = new SparkContext(conf)
    val data = Array(
      ("company-1", 88), ("company-1", 96), ("company-1", 85),
      ("company-2", 94), ("company-2", 86), ("company-2", 74),
      ("company-3", 86), ("company-3", 88), ("company-3", 92)
    )
    val rdd = sc.parallelize(data, 3)

    // Result: (total income, average monthly income)
    {
      val res = rdd.combineByKey(
        (income) => (income, 1),
        (acc: (Int, Int), income) => (acc._1 + income, acc._2 + 1),
        (acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2)
      ).map({
        case (key, value) => (key, value._1, value._1 / value._2.toFloat)
      })
      res.foreach(println)
    }

    // Result: (total income, average monthly income, total income with each month's income multiplied by 10)
    {
      val res = rdd.combineByKey(
        (income) => (income, 1, income * 10),
        (acc: (Int, Int, Int), income) => (acc._1 + income, acc._2 + 1, acc._3 + income * 10),
        (acc1: (Int, Int, Int), acc2: (Int, Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2, acc1._3 + acc2._3)
      ).map({
        case (key, value) => (key, value._1, value._1 / value._2.toFloat, value._3)
      })
      res.foreach(println)
    }
  }
}
Correspondence between the combineByKey parameters and the first call in the code above:
Parameter | Value in the code |
---|---|
createCombiner | (income) => (income, 1) |
mergeValue | (acc: (Int, Int), income) => (acc._1 + income, acc._2 + 1) |
mergeCombiners | (acc1:(Int, Int), acc2:(Int,Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2) |
partitioner | HashPartitioner (the default) |
mapSideCombine | true (the default) |
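For reference, a hedged sketch of the same aggregation with the partitioner and mapSideCombine passed explicitly (the five-argument overload of combineByKey), reusing the rdd from the example above, e.g. in the spark-shell:
import org.apache.spark.HashPartitioner
val res2 = rdd.combineByKey(
  (income: Int) => (income, 1),
  (acc: (Int, Int), income: Int) => (acc._1 + income, acc._2 + 1),
  (acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2),
  new HashPartitioner(3),
  mapSideCombine = true
)
res2.map { case (key, (total, months)) => (key, total, total / months.toFloat) }.foreach(println)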
Creating an RDD by reading data from a file: textFile
val part = sc.textFile("/sougou/README.md")
Writing an RDD to local files: saveAsTextFile
The argument to saveAsTextFile is a directory name, not a file name; each RDD partition produces one output file. If the directory passed as the argument already exists, Spark throws an error. A sketch follows below.
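A minimal sketch of writing to the local file system; file:///root/output/readme-copy is a hypothetical output directory that must not exist before the call:
val lines = sc.textFile("file:///root/software/spark-2.2.0-bin-hadoop2.6/README.md", 2)
lines.saveAsTextFile("file:///root/output/readme-copy")   // one part-NNNNN file per partition, plus _SUCCESS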
Creating an RDD by reading data from HDFS: textFile
val lines1 = sc.textFile("hdfs://master:8020/sougou/README.md")
val lines2 = sc.textFile("/sougou/README.md")
On a distributed file system, the two forms above are equivalent.
Writing an RDD to HDFS: saveAsTextFile
val lines = sc.textFile("hdfs://master:8020/sougou/README.md", 3)
lines.saveAsTextFile("/data/abc")
The following 4 files are generated under the HDFS directory /data/abc:
0 2018-09-21 23:06 /root/data/abc/_SUCCESS
1417 2018-09-21 23:06 /root/data/abc/part-00000
1157 2018-09-21 23:06 /root/data/abc/part-00001
1235 2018-09-21 23:06 /root/data/abc/part-00002
Reading JSON data
val lines = sc.textFile("hdfs://master:8020/data/people.json", 1)
lines.foreach(println)
Parsing JSON data
See the following code:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import scala.util.parsing.json.JSON

object JsonParse {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("JsonParse")
    val sc = new SparkContext(conf)
    val jsonStr = sc.textFile("hdfs://master:8020/data/people.json")
    // JSON.parseFull returns Some(result) on success and None on failure
    val parseRes = jsonStr.map(x => JSON.parseFull(x))
    parseRes.foreach({ r => r match {
      case Some(map: Map[String, Any]) => println(map)
      case None => println("Parse failed, " + r)
      case other => println("Unknown data structure: " + other)
    }})
  }
}
Create a table and insert data via the hbase shell
create 'student','info'
put 'student','1','info:name','ZhangSan'
put 'student','1','info:gender','F'
put 'student','1','info:age','23'
put 'student','2','info:name','LiSi'
put 'student','2','info:gender','M'
put 'student','2','info:age','28'
Reading data from HBase
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object ReadHBase {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("ReadHBase")
    val sc = new SparkContext(conf)
    // set the table to query
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set(TableInputFormat.INPUT_TABLE, "student")
    val stuRDD = sc.newAPIHadoopRDD(hbaseConf,
      classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])
    val count = stuRDD.count()
    println("Students RDD Count:" + count)
    stuRDD.cache()
    stuRDD.foreach({
      case (_, result) => {
        val key = Bytes.toString(result.getRow)
        val name = Bytes.toString(result.getValue("info".getBytes, "name".getBytes))
        val gender = Bytes.toString(result.getValue("info".getBytes, "gender".getBytes))
        val age = Bytes.toString(result.getValue("info".getBytes, "age".getBytes))
        println("key: " + key + ", name = " + name + ", gender = " + gender + ", age = " + age)
      }
    })
  }
}
Writing data to HBase
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark._

object WriteHBase {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("WriteHBase")
    val sc = new SparkContext(conf)
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, "student")
    hbaseConf.set("mapreduce.output.fileoutputformat.outputdir", "/tmp")
    val job = Job.getInstance(hbaseConf)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Result])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    // Each input string "rowkey,name,gender,age" becomes one Put against the info column family
    val rddStr = sc.makeRDD(Array("3,WangWu,M,26", "4,ZhaoLiu,F,27"))
    val rdd = rddStr.map(_.split(",")).map(arr => {
      val put = new Put(Bytes.toBytes(arr(0)))
      put.add(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(arr(1)))
      put.add(Bytes.toBytes("info"), Bytes.toBytes("gender"), Bytes.toBytes(arr(2)))
      put.add(Bytes.toBytes("info"), Bytes.toBytes("age"), Bytes.toBytes(arr(3)))
      (new ImmutableBytesWritable, put)
    })
    rdd.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}
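After the job has run, the write can be verified from the hbase shell; rows 3 and 4 should now appear alongside the two rows inserted earlier:
scan 'student'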