import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer

val conf = new SparkConf().setAppName("hh")
conf.setMaster("local[3]")
val sc = new SparkContext(conf)
val data = sc.textFile("/home/hadoop4/Desktop/i.txt")
.map(_.split("\t")).map(_.map(_.toDouble))
.map(f => ("k"+f(0),f(1)))
// variance
//data:RDD[(String,Double)]
val dataArr = data.map(f=>(f._1,ArrayBuffer(f._2)))
//dataArr RDD[(String,ArrayBuffer[Double])]
dataArr.collect().foreach(println(_))
//output
(k1.0,ArrayBuffer(2.0))
(k1.0,ArrayBuffer(4.0))
(k4.0,ArrayBuffer(5.0))
(k4.0,ArrayBuffer(7.0))
(k7.0,ArrayBuffer(8.0))
(k10.0,ArrayBuffer(11.0))
(k10.0,ArrayBuffer(13.0))
(k10.0,ArrayBuffer(1.0))
(k1.0,ArrayBuffer(100.0))
(k10.0,ArrayBuffer(11.0))
(k10.0,ArrayBuffer(11.0))
(k1.0,ArrayBuffer(2.0))
(k4.0,ArrayBuffer(7.0))
val dataArrRed = dataArr.reduceByKey((x,y)=>x++=y)
//dataArrRed: RDD[(String,ArrayBuffer[Double])]
dataArrRed.collect().foreach(println(_))
//output
(k1.0,ArrayBuffer(2.0, 4.0, 100.0, 2.0))
(k7.0,ArrayBuffer(8.0))
(k10.0,ArrayBuffer(11.0, 13.0, 1.0, 11.0, 11.0))
(k4.0,ArrayBuffer(5.0, 7.0, 7.0))
val dataARM = dataArrRed.collect().map(
f=>(f._1,sc.makeRDD(f._2,2)))
val dataARMM = dataARM.map(
f=>(f._1,(f._2.variance(),f._2.max(),f._2.min())))
dataARMM.foreach(println(_))
sc.stop()
//output
(k1.0,(1777.0,100.0,2.0))
(k7.0,(0.0,8.0,8.0))
(k10.0,(18.24,13.0,1.0))
(k4.0,(0.8888888888888888,7.0,5.0))
Because the results have to be collected to the driver after the reduceByKey, the job fails when spark.driver.maxResultSize is too small. It defaults to 1g, which is not enough once the data gets large, so I raised spark.driver.maxResultSize to 10g.
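For reference, a minimal sketch of raising this limit (the 10g value is just what worked for me, not a general recommendation):
//Set before the SparkContext is created
val conf = new SparkConf()
.setAppName("hh")
.set("spark.driver.maxResultSize", "10g")
//or on the command line: spark-submit --conf spark.driver.maxResultSize=10g ...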
This approach still has a problem, though: how can the collect be avoided?
Solution: the question I asked on Stack Overflow: how to avoid collect when using statistic stat
data:
1 2 3
1 4 5
4 5 6
4 7 8
7 8 9
10 11 12
10 13 14
10 1 2
1 100 100
10 11 2
10 11 2
1 2 5
4 7 6
val dataString = xx.trim //trim removes leading and trailing whitespace
val dataArray = dataString.split("\\n")
.map(_.split("\\s+")) //\\s matches whitespace (space, carriage return, newline, etc.); + means one or more
.map(_.map(_.toDouble))
.map(f=>("k"+f(0),f(1)))
val data = sc.parallelize(dataArray)
//data:RDD[(String,Double)]
import org.apache.spark.util.StatCounter

val dataStats = data.aggregateByKey(new StatCounter()
)({(s,v)=>s.merge(v)},{(s,t)=>s.merge(t)})
//or, slightly shorter but perhaps over-tricky:
val dataStats = data.aggregateByKey(new StatCounter())(_ merge _, _ merge _)
//dataStats: RDD[(String, StatCounter)]
//Re-format to the OP's format and print
val result = dataStats.map(f=>(f._1,(f._2.variance, f._2.max, f._2.min)))
result.foreach(println(_))
//result: RDD[(String,(Double,Double,Double))]
//output
(k1.0,(1776.9999999999998,100.0,2.0))
(k7.0,(0.0,8.0,8.0))
(k10.0,(18.240000000000002,13.0,1.0))
(k4.0,(0.888888888888889,7.0,5.0))
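Since the StatCounter built per key also tracks count, mean, sum and stdev, other statistics come out of the same single pass at no extra cost; a small sketch using the same dataStats:
//More per-key statistics from the same dataStats, no further shuffle needed
val moreStats = dataStats.map { case (k, s) => (k, (s.count, s.mean, s.sum, s.stdev)) }
moreStats.foreach(println(_))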
Version with two columns:
val dataArray = dataString.split("\\n")
.map(_.split("\\s+")).map(_.map(_.toDouble))
.map(f=>("k"+f(0),Array(f(1),f(2))))
val data = sc.parallelize(dataArray)
//data:RDD[(String,Array[Double])]
val dataStats = data.aggregateByKey(
Array(new StatCounter(), new StatCounter())
)({(s,v)=>Array(s(0) merge v(0), s(1) merge v(1))},
{(s,t)=>Array(s(0) merge t(0), s(1) merge t(1))})
//dataStats: RDD[(String, Array[StatCounter])]
val result = dataStats.map(f=>(f._1,
(f._2(0).variance, f._2(0).max, f._2(0).min),
(f._2(1).variance, f._2(1).max, f._2(1).min)))
result.foreach(println(_))
//result: RDD[(String,(Double,Double,Double),(Double,Double,Double))]
//output
(k1.0,(1777.0,100.0,2.0),(1716.6875,100.0,3.0))
(k7.0,(0.0,8.0,8.0),(0.0,9.0,9.0))
(k10.0,(18.240000000000002,13.0,1.0),(29.440000000000005,14.0,2.0))
(k4.0,(0.8888888888888888,7.0,5.0),(0.8888888888888888,8.0,6.0))
n-column version:
val n = 2
val dataStats = data.aggregateByKey(List.fill(n)(new StatCounter()))(
{(s,v)=>(s zip v).map{case (si, vi) => si merge vi}},
{(s,t)=>(s zip t).map{case (si, ti) => si merge ti}})
val result = dataStats.map(f=>(f._1, f._2.map(x=>(x.variance, x.max, x.min))))
result.foreach(println(_))
The output is the same as above, just with n columns per key; change n as needed. Note that it will break if any row's array has fewer than n elements. One way to guard against such ragged rows is sketched below.
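A minimal sketch, assuming data is the RDD[(String, Array[Double])] from above; the filter step is my addition, not part of the original answer:
//Drop rows with fewer than n values before aggregating
//(alternatively, pad short rows with a sentinel value and handle it later)
val cleaned = data.filter { case (_, values) => values.length >= n }
val cleanedStats = cleaned.aggregateByKey(List.fill(n)(new StatCounter()))(
{(s,v)=>(s zip v).map{case (si, vi) => si merge vi}},
{(s,t)=>(s zip t).map{case (si, ti) => si merge ti}})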
import org.apache.spark.rdd.RDD

//Per-key mean, again via aggregateByKey + StatCounter, so nothing is collected to the driver
def mosMean(data:RDD[(String,Double)]): RDD[(String,Double)] = {
val dataStats = data.aggregateByKey(new StatCounter()
)(_ merge _,_ merge _)
val result = dataStats.map(f=>(f._1,f._2.mean))
result
}
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{HTable, Put}
import org.apache.hadoop.hbase.util.Bytes

//Write the (rowKey, value) pairs to HBase, opening one HTable per partition
def saveToHbase(result:RDD[(String,Double)], tablename:String, column:String, item:String*) = {
result.foreachPartition { x =>
val myConf = HBaseConfiguration.create()
myConf.set("hbase.zookeeper.quorum",
"compute000,compute001,compute002,compute003,compute004," +
"compute005,compute006,compute007,compute008,compute009,compute010," +
"compute011,compute012,compute013,compute014,compute015,compute016," +
"compute017,compute018,compute019,compute020,compute021,compute022," +
"compute023,compute024,compute025,compute026,compute027,compute028," +
"compute029,compute030,compute031,compute032,compute033,compute034," +
"compute035,compute036,compute037,compute038")
myConf.set("hbase.master", "10.10.10.10:60000")
myConf.set("hbase.zookeeper.property.clientPort", "2181")
myConf.set("hbase.defaults.for.version.skip", "true")
val myTable = new HTable(myConf, tablename)
myTable.setAutoFlush(false, false)
myTable.setWriteBufferSize(3 * 1024 * 1024)
x.foreach { f =>
val p = new Put(Bytes.toBytes(f._1))
for (k <- 0 until item.length) {
p.addColumn(Bytes.toBytes(column), Bytes.toBytes(item(k)), Bytes.toBytes(f._2.toString))
}
myTable.put(p)
}
myTable.flushCommits()
}
}
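A sketch of how the two helpers could be wired together; the table, column-family and qualifier names below are placeholders for illustration, not from the original job:
//Hypothetical end-to-end usage, with data: RDD[(String,Double)] as in the one-column example
val means = mosMean(data)
saveToHbase(means, "stats_table", "cf", "mean")
sc.stop()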