Spark: count how many times each word appears in a file, then sort by count in descending order and take the top N

Data

A,Dataset,is,a,distributed,collection,of,data,Dataset,is,a,new,interface,added,in,Spark,1,6,that,provides,the,benefits,of,RDDs,strong,typing,ability,to,use,powerful,lambda,functions,with,the,benefits,of,Spark,SQL’s,optimized,execution,engine,A,Dataset,can,be,constructed,from,JVM,objects,and,then,manipulated,using,functional,transformations,map,flatMap,filter,etc,,The,Dataset,API,is,available,in,Scala,and,Java,Python,does,not,have,the,support,for,the,Dataset,API,But,due,to,Python’s,dynamic,nature,many,of,the,benefits,of,the,Dataset,API,are,already,available,i,e,you,can,access,the,field,of,a,row,by,name,naturally,row,columnName,The,case,for,R,is,similar,A,DataFrame,is,a,Dataset,organized,into,named,columns,It,is,conceptually,equivalent,to,a,table,in,a,relational,database,or,a,data,frame,in,R/Python,but,with,richer,optimizations,under,the,hood,DataFrames,can,be,constructed,from,a,wide,array,of,sources,such,as:,structured,data,files,tables,in,Hive,external,databases,or,existing,RDDs,The,DataFrame,API,is,available,in,Scala,Java,Python,and,R,In,Scala,and,Java,a,DataFrame,is,represented,by,a,Dataset,of,Rows,In,the,Scala,API,DataFrame,is,simply,a,type,alias,of,Dataset[Row],While,in,Java,API,users,need,to,use,Dataset,to,represent,a,DataFrame,Throughout,this,document,we,will,often,refer,to,Scala/Java,Datasets,of,Rows,as,DataFrames

Operations


scala> val file = spark.sparkContext.textFile("file:///home/hadoop/data/spark-test-data/wc3.txt")
file: org.apache.spark.rdd.RDD[String] = file:///home/hadoop/data/spark-test-data/wc3.txt MapPartitionsRDD[58] at textFile at <console>:27

scala> val rowRDD = file.flatMap(_.split(",")).map(x=>(x,1)).reduceByKey((x,y)=>(x+y))
rowRDD: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[61] at reduceByKey at <console>:29
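
For reference, the same pair RDD can be written a bit more compactly; a minimal sketch of an equivalent chain (same logic, only the lambda syntax differs):

// Equivalent to the command above: split on commas, pair each word with 1,
// then sum the counts per word (`_ + _` is shorthand for (x, y) => x + y).
val rowRDD = file.flatMap(_.split(","))
  .map(word => (word, 1))
  .reduceByKey(_ + _)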

scala> rowRDD.map(_._2).sortBy(x=>x,false).foreach(println)
12
10
9
9
8
7
6
6
5
4
4
4
3
3
3
3
3
3
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
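
The command above keeps only the counts (_._2), so the words themselves are dropped from the output. The command below sorts the full (word, count) pairs with sortBy. An alternative that produces the same descending order, assuming the same rowRDD, is to swap each pair so the count becomes the key and sort with sortByKey; a sketch:

// Swap (word, count) to (count, word), sort by the numeric key in descending
// order, then map back to restore the (word, count) shape before printing.
rowRDD.map { case (word, count) => (count, word) }
  .sortByKey(ascending = false)
  .map { case (count, word) => (word, count) }
  .foreach(println)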

scala> rowRDD.sortBy(_._2,false).foreach(println)
(a,12)
(of,10)
(the,9)
(is,9)
(Dataset,8)
(in,7)
(API,6)
(to,6)
(DataFrame,5)
(Java,4)
(Scala,4)
(and,4)
(The,3)
(A,3)
(can,3)
(data,3)
(available,3)
(benefits,3)
(Python,2)
(Spark,2)
(for,2)
(constructed,2)
(be,2)
(row,2)
(Rows,2)
(by,2)
(R,2)
(with,2)
(In,2)
(from,2)
(use,2)
(or,2)
(RDDs,2)
(DataFrames,2)
(table,1)
(organized,1)
(under,1)
(e,1)
(this,1)
(existing,1)
(represented,1)
(dynamic,1)
(have,1)
(field,1)
(added,1)
(i,1)
(such,1)
(type,1)
(represent,1)
(we,1)
(optimizations,1)
(Hive,1)
(SQL’s,1)
(already,1)
(transformations,1)
(engine,1)
(named,1)
(are,1)
(typing,1)
(not,1)
(provides,1)
(refer,1)
(R/Python,1)
(While,1)
(richer,1)
(as:,1)
(then,1)
(lambda,1)
(Dataset[Row],1)
(as,1)
(into,1)
(naturally,1)
(It,1)
(document,1)
(but,1)
(functional,1)
(array,1)
(JVM,1)
(collection,1)
(conceptually,1)
(But,1)
(flatMap,1)
(equivalent,1)
(database,1)
(6,1)
(simply,1)
(Scala/Java,1)
(optimized,1)
(external,1)
(sources,1)
(ability,1)
(relational,1)
(databases,1)
(map,1)
(tables,1)
(wide,1)
(new,1)
(execution,1)
(using,1)
(support,1)
(Python’s,1)
(distributed,1)
(columns,1)
(similar,1)
(Dataset,1)
(1,1)
(alias,1)
(due,1)
(powerful,1)
(columnName,1)
(interface,1)
(etc,1)
(nature,1)
(you,1)
(files,1)
(name,1)
(need,1)
(that,1)
(filter,1)
(Datasets,1)
(many,1)
(strong,1)
(will,1)
(,1)
(frame,1)
(does,1)
(access,1)
(structured,1)
(objects,1)
(often,1)
(case,1)
(hood,1)
(functions,1)
(users,1)
(manipulated,1)
(Throughout,1)
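
The title asks for only the top results rather than the full listing above. A minimal end-to-end sketch that keeps just the highest counts (the topN cutoff and the filter on empty tokens are additions here, not part of the original transcript):

// Self-contained sketch for spark-shell, where `spark` is the active SparkSession.
// topN is a hypothetical cutoff; the original title leaves the number unspecified.
val topN = 10

val counts = spark.sparkContext
  .textFile("file:///home/hadoop/data/spark-test-data/wc3.txt")
  .flatMap(_.split(","))
  .filter(_.nonEmpty)            // drop the empty token produced by ",," in the data
  .map(word => (word, 1))
  .reduceByKey(_ + _)

// take(n) collects only the first n pairs to the driver, which also avoids
// foreach(println) printing on the executors when running against a cluster.
counts.sortBy(_._2, ascending = false).take(topN).foreach(println)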
