练习一:求最大最小值
在文件中获取数据求浮点数和整数的最大值和最小值
1,2.33,4,1.55,2.56,
55,55,55,23.77,1.88987,0.3324,
22.567,5.5567,7.8895,33
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Exercise 1: report the largest and smallest number found in a
 * comma-separated text file (integers and decimals mixed).
 *
 * NOTE(review): the original listing was cut off in the middle of the
 * minimum comparison. The minimum computation and the final report line
 * below are a reconstruction of the evident intent; confirm the exact
 * output format against the author's full version.
 */
object demo01 {
  def main(args: Array[String]): Unit = {
    println("求最大最小值")
    val conf = new SparkConf().setMaster("local").setAppName("minmax")
    val sc = new SparkContext(conf)
    try {
      val data = sc.textFile("D:\\resource\\minmax.txt", 1)

      // textFile already yields one record per line, so only the commas need
      // splitting. Blank tokens (empty lines, stray separators) are dropped
      // so that toDouble never sees an empty string.
      val numbers = data
        .flatMap(_.split(","))
        .map(_.trim)
        .filter(_.nonEmpty)
        .map(_.toDouble)
        .cache() // reused by the emptiness check and both actions below

      if (numbers.isEmpty()) {
        println("文件中没有数据")
      } else {
        // Use RDD actions rather than mutating driver-side vars inside
        // foreach: updates made inside a closure are not guaranteed to be
        // visible on the driver.
        val maxn = numbers.max()
        val minn = numbers.min()
        println(s"最大值:${maxn},最小值:${minn}")
      }
    } finally {
      sc.stop()
    }
  }
}
练习二:求爷孙关系
janet winnie
winnie poul
sam dida
helen janet
helen jack
jack salon
salon rose
rose tom
jack nicole
sam flitter
janet sam
sam sisi
/**
 * Exercise 2: print every grandparent/grandchild pair found in a file of
 * "parent child" lines.
 *
 * A person may have several children, so the relation is joined with itself
 * instead of being collected into a Map (a Map keeps a single child per
 * parent and would silently drop pairs).
 *
 * @param sc active Spark context used to read the relation file
 */
def ancesor(sc: SparkContext): Unit = {
  val data = sc.textFile("D:\\resource\\ancesor.txt", 2)

  // (parent, child); lines without at least two fields are skipped.
  val cps = data
    .map(_.split(" "))
    .filter(_.length >= 2)
    .map(fields => (fields(0), fields(1)))

  // Re-key each pair by the child so it can be matched with the rows where
  // that same person appears as a parent: (person, grandparent).
  val byChild = cps.map { case (parent, child) => (child, parent) }

  // join on the middle generation -> (person, (grandparent, grandchild))
  val pairs = byChild.join(cps).values

  // Collect first so the output is printed by the driver.
  pairs.collect().foreach { case (grand, grandchild) =>
    println("祖辈:" + grand + ",孙辈:" + grandchild)
  }
}
练习三:排序
对csv文件进行排序
name,score
helen,40
tom,50
mary,69
ben,60
sasa,70
marier,76
dida,78
/**
 * Exercise 3a: sort a CSV file (columns: name, score) by score, highest first.
 */
object demo01 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sort1").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val sqc = new SQLContext(sc)
      // The first CSV row holds the column names; header=true reads them.
      // inferSchema=true makes `score` numeric; without it the column is a
      // string and "order by score" would compare text ("100" < "40").
      val data2 = sqc.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv("d:\\demo.csv")
      data2.createOrReplaceTempView("tb_score")
      sqc.sql("select * from tb_score order by score desc").show()
    } finally {
      sc.stop()
    }
  }
}
对普通文件进行排序
hello 2
say 4
dida 5
discuss 6
subway 10
/**
 * Exercise 3b: sort a plain text file of "word count" lines by the count,
 * once through the DataFrame API and once through SQL.
 */
object demo01 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sort2").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      val sqc = new SQLContext(sc)
      val data = sc.textFile("d:\\demo2.txt").map { line =>
        val fields = line.split(" ")
        (fields(0), fields(1).toInt)
      }
      // Build the DataFrame once and reuse it for both approaches.
      val df = sqc.createDataFrame(data).toDF("name", "num")

      // Method 1: built-in DataFrame API. orderBy on a column name sorts in
      // ASCENDING order.
      df.orderBy("num").show()

      // Method 2: register a temporary view and sort with SQL, where the
      // direction can be chosen with asc/desc (ascending when omitted).
      df.createOrReplaceTempView("tb_s")
      sqc.sql("select * from tb_s order by num").show()
    } finally {
      sc.stop()
    }
  }
}
对json文件进行排序
{"id":1, "name":"leo", "age":18}
{"id":2, "name":"jack", "age":19}
{"id":3, "name":"marry", "age":17}
/**
 * Exercise 3c: load a JSON-lines file, expose it as the view `demo3`, and
 * show its rows ordered by `id`.
 */
object demo01 {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("json")
    val sqlContext = new SQLContext(new SparkContext(sparkConf))

    // Register the parsed JSON so it can be queried by name.
    val people = sqlContext.read.json("d:/demo3.json")
    people.createOrReplaceTempView("demo3")

    val ordered = sqlContext.sql("select id,name,age from demo3 order by id")
    ordered.show()
  }
}
练习四:二次排序
hello 2 23
baby 2 44
hello 1 22
hello 3 55
nice 2 58
kitty 3 66
apple 1 44
使用SparkSQL
/**
 * Exercise 4 (Spark SQL variant): secondary sort of "name class score"
 * lines, by class ascending and then by score descending.
 */
object demo01 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("second")
    val sc = new SparkContext(conf)
    try {
      val sqc = new SQLContext(sc)
      val data = sc.textFile("d:/demo4.txt").map { line =>
        val fields = line.split(" ")
        // Parse the sort keys as numbers; left as strings they would be
        // ordered lexicographically ("10" before "2").
        (fields(0), fields(1).toInt, fields(2).toInt)
      }
      sqc.createDataFrame(data).toDF("name", "class", "score")
        .createOrReplaceTempView("demo4")
      sqc.sql("select * from demo4 as t order by class,score desc").show()
    } finally {
      sc.stop()
    }
  }
}
使用Spark
// Sort key for the secondary sort. Note the class header: it mixes in
// Ordered (so the key can be compared) and Serializable.
/**
 * Composite key ordered by `clss` ascending, ties broken by `score`
 * descending.
 *
 * @param clss  primary sort field
 * @param score secondary sort field (higher scores sort first)
 */
class record(val clss: Int, val score: Int) extends Ordered[record] with Serializable {
  def compare(other: record): Int =
    java.lang.Integer.compare(clss, other.clss) match {
      // Same class: operands are swapped so that larger scores come first.
      case 0       => java.lang.Integer.compare(other.score, score)
      case byClass => byClass
    }
}
// Spark driver that performs the secondary sort using the `record` key.
/**
 * Exercise 4 (RDD variant): sort "name class score" lines by class ascending
 * then score descending, and print each line with spaces replaced by commas.
 */
object demo01 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("second")
    val sc = new SparkContext(conf)
    try {
      val data = sc.textFile("d:/demo4.txt").map { line =>
        val ele = line.split(" ")
        // Key: (class, score) wrapped in the custom ordering; value: raw line.
        (new record(ele(1).toInt, ele(2).toInt), line)
      }
      val result = data.sortByKey(true).map { case (_, line) => line.replace(" ", ",") }
      // collect() returns the rows to the driver in sorted order; printing
      // from inside RDD.foreach gives no ordering guarantee across partitions.
      result.collect().foreach(println)
    } finally {
      sc.stop()
    }
  }
}
练习五:倒排索引
搜索引擎的索引法则
/**
 * Exercise 5: build an inverted index (word -> comma-separated list of the
 * files containing it) over every file in a directory.
 */
object demo01 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("second")
    val sc = new SparkContext(conf)
    try {
      sc.wholeTextFiles("D:\\resource\\daopai", 2)
        .flatMap { case (path, content) =>
          // File name without its last four characters.
          // NOTE(review): presumably strips a ".txt"-style extension; confirm
          // that every input file has a three-letter extension.
          val file = path.split("/").last.dropRight(4)
          // Split on any whitespace so that both \n and \r\n line endings,
          // tabs and repeated spaces are handled; drop empty tokens.
          content.split("\\s+").filter(_.nonEmpty).map(word => (word, file))
        }
        .groupByKey()
        .map { case (word, files) => (word, files.toList.distinct.mkString(",")) }
        // Bring the index back to the driver before printing.
        .collect()
        .foreach(println)
    } finally {
      sc.stop()
    }
  }
}
练习六:??