Spark: Inverted Index, Median, and CountOnce

Inverted Index
Input:
id1 spark hadoop
id2 scala spark
id3 java hadoop spark
id4 scala java
id5 the spark
id6 the hadoop and spark
id7 hadoop Flink storm spark
Output:
Flink id7
scala id2 id4
spark id1 id2 id3 id5 id6 id7
hadoop id1 id3 id6 id7
java id3 id4
and id6
storm id7
the id5 id6
Code (Scala):

package com.dt.spark.cores.scala

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object InvertedIndex {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("inverted index").setMaster("local")
    val sc = new SparkContext(conf)

    // Each input line is expected to be "id<TAB>word word ...".
    val data = sc.textFile("E:\\workspases\\data\\invertedIndex.txt")

    val keydata = data.map(_.split("\t")).map(fields => (fields(0), fields(1)))

    // For each document, emit one (word, id) pair per distinct word.
    // LinkedHashMap deduplicates words repeated within the same line.
    val inverting = keydata.flatMap { case (id, text) =>
      val list = new mutable.LinkedHashMap[String, String]()
      val words = text.split(" ").iterator
      while (words.hasNext) {
        list.put(words.next(), id)
      }
      list
    }

    // Concatenate the ids of all documents that contain each word.
    inverting.reduceByKey(_ + " " + _).map(pair => pair._1 + "\t" + pair._2).foreach(println)
    sc.stop()
  }
}
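
For comparison, here is a minimal alternative sketch of the same job, emitting (word, id) pairs directly and grouping them. It assumes the same tab-separated input and reuses the data RDD from above; groupByKey stands in for the string-concatenating reduceByKey:

    // Emit one (word, id) pair per distinct word in each document.
    val pairs = data.map(_.split("\t")).flatMap { case Array(id, text) =>
      text.split(" ").distinct.map(word => (word, id))
    }
    // Group all document ids per word and print each posting list.
    pairs.groupByKey()
      .map { case (word, ids) => word + "\t" + ids.mkString(" ") }
      .foreach(println)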

Median

The idea is to avoid sorting the whole dataset: hash every number into a bucket keyed by value / 4, count the elements per bucket in ascending key order, scan those counts until the running total reaches the median's position, and then sort only inside that single bucket.

package com.dt.spark.cores.scala

import org.apache.spark.{SparkConf, SparkContext}

object Median {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Median").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")

    val data = sc.textFile("E:\\workspases\\data\\Median.txt")

    val words = data.flatMap(_.split(" ")).map(_.toInt)

    // Pair each number with its bucket key (value / 4), sorted by bucket.
    val number = words.map(word => (word / 4, word)).sortByKey()
    // Count how many numbers fall into each bucket, in ascending key order.
    val pairCount = words.map(word => (word / 4, 1)).reduceByKey(_ + _).sortByKey()

    val count = words.count().toInt
    // 1-based position of the median (the lower median for an even count).
    val mid = if (count % 2 != 0) count / 2 + 1 else count / 2

    // Collect the per-bucket counts once; they arrive sorted by bucket key.
    val counts = pairCount.collect()

    // Scan buckets until the running total reaches the median's position.
    var running = 0
    var before = 0      // total count of all buckets preceding the median bucket
    var bucketKey = 0   // key of the bucket containing the median
    var found = false
    for ((key, c) <- counts if !found) {
      before = running
      running += c
      if (running >= mid) {
        bucketKey = key
        found = true
      }
    }

    // 1-based offset of the median inside its bucket.
    val offsetInBucket = mid - before

    // Sort only the median's bucket and pick the element at that offset.
    val median = number.filter(_._1 == bucketKey).takeOrdered(offsetInBucket)
    println(median(offsetInBucket - 1)._2)
    sc.stop()
  }
}
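
Bucketing spares a full sort of the data, but when the dataset is small enough a global sort plus zipWithIndex is simpler. A minimal sketch under that assumption, reusing the words RDD from above (not the author's method):

    // Sort all numbers, attach positions, and key by position.
    val sorted = words.sortBy(identity).zipWithIndex().map { case (v, idx) => (idx, v) }
    val n = sorted.count()
    // (n - 1) / 2 is the 0-based position of the (lower) median.
    println(sorted.lookup((n - 1) / 2).head)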

CountOnce
Problem description:
Suppose HDFS stores data blocks identified by an integer ID and keeps two replicas of each block, so two machines hold the same data. Each ID is an integer below one billion. If one replica is lost, we need to find out which block it belonged to.
Given a list of block IDs captured at some point in time, can we quickly find the ID that appears only once in the list, i.e., the ID of the failed block?
Restated: given an array in which exactly one value appears once and every other value appears twice, find the value that appears once.
Input:
5
3
2
2
3
5
7
6
6
9
9
10
11
16
11
10
16

Output:
7

Code:

package com.dt.spark.cores.scala

import org.apache.spark.{SparkConf, SparkContext}

object CountOnce {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("CountOnce").setMaster("local")
    val sc = new SparkContext(conf)

    val data = sc.textFile("E:\\workspases\\data\\CountOnce.txt")
    val word = data.map(_.toInt)

    // XOR all IDs together: duplicates cancel, leaving the ID that occurs once.
    val result = word.reduce(_ ^ _)
    println(result)
    sc.stop()
  }
}
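
The reduce works because XOR is associative and commutative, with a ^ a == 0 and a ^ 0 == a: every ID that appears twice cancels itself out, and only the ID that appears once survives. A quick local check of the idea:

    // Pairs cancel under XOR, so only the unpaired value remains.
    val ids = Seq(5, 3, 2, 2, 3, 5, 7)
    assert(ids.reduce(_ ^ _) == 7)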
