Standalone version
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

public class WordCount {
    public static void main(String[] args) throws IOException {
        // Count words in each of the five input files, then merge the per-file counts.
        Map<String, Integer>[] maps = new HashMap[5];
        int a = 0;
        for (int i = 1; i <= 5; i++) {
            maps[a++] = document(i);
        }
        Map<String, Integer> merged = merge(maps);
        Set<Map.Entry<String, Integer>> entrySet = merged.entrySet();
        entrySet.forEach(System.out::println);
    }

    // Count the tab-separated words in a single input file.
    public static Map<String, Integer> document(int i) throws IOException {
        Map<String, Integer> map = new HashMap<>();
        try (BufferedReader br = new BufferedReader(
                new FileReader("D:\\bd_example\\data\\wordcount\\" + i + ".txt"))) {
            String s;
            while ((s = br.readLine()) != null) {
                String[] split = s.split("\t");
                for (String string : split) {
                    if (map.containsKey(string)) {
                        map.put(string, map.get(string) + 1);
                    } else {
                        map.put(string, 1);
                    }
                }
            }
        }
        return map;
    }

    // Merge several per-file word-count maps into one, summing the counts per word.
    @SafeVarargs
    public static Map<String, Integer> merge(Map<String, Integer>... maps) {
        Map<String, Integer> map = new HashMap<>();
        for (Map<String, Integer> currentMap : maps) {
            for (String k : currentMap.keySet()) {
                Integer oldCount = currentMap.get(k);
                if (map.containsKey(k)) {
                    map.put(k, map.get(k) + oldCount);
                } else {
                    map.put(k, oldCount);
                }
            }
        }
        return map;
    }
}
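On Java 8+, the containsKey/put counting pattern used in document() and merge() above can be collapsed into a single call with Map.merge. A minimal sketch of the equivalent counting step:

// Equivalent to the if/else counting above: insert 1 when the word is absent,
// otherwise add 1 to the existing count (Java 8+ Map.merge).
map.merge(string, 1, Integer::sum);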
MapReduce
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WC {
    // Mapper and Reducer must be static nested classes (or top-level classes):
    // Hadoop instantiates them by reflection and cannot construct non-static inner classes.
    public static class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Emit (word, 1) for every tab-separated word on the line.
            String[] words = value.toString().split("\t");
            for (String s : words) {
                context.write(new Text(s), new IntWritable(1));
            }
        }
    }

    public static class WCReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum all counts emitted for this word.
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WC.class);
        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
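Because the word-count reducer only sums integers (an associative and commutative operation), it can also be registered as a combiner to pre-aggregate counts on the map side. A minimal sketch of the extra driver line, an optional optimization not present in the original job above:

// Optional: reuse the reducer as a combiner so each map task pre-sums its own
// (word, 1) pairs, reducing the data shuffled to the reducers.
// Add this next to the other job.set* calls in main().
job.setCombinerClass(WCReduce.class);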
Hive
create database wc;
use wc;

-- Each input line is loaded as a single value of the `word` column
-- (using "\n" as the field delimiter keeps the whole line in one field).
create table wordcount(word string)
row format delimited fields terminated by "\n"
location '/wordcount';

load data local inpath "/home/fanger/example/wordcount" into table wordcount;

select * from wordcount;

-- Explode each line into individual words, then count occurrences per word.
select
    a.word as word,
    count(*) as num
from (
    select
        wc.word as word
    from wordcount
    lateral view explode(split(word, "\t")) wc as word
) a
group by a.word;
Spark
import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class _01sparkCount {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local[*]");
        conf.setAppName(_01sparkCount.class.getSimpleName());
        JavaSparkContext jsc = new JavaSparkContext(conf);

        JavaRDD<String> text = jsc.textFile("D:\\bd_example\\data\\wordcount\\");
        System.out.println(text.getNumPartitions());

        // Anonymous-class version: split lines into words, map to (word, 1), sum by key.
        JavaRDD<String> lines = text.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String s) throws Exception {
                return Arrays.asList(s.split("\t")).iterator();
            }
        });
        JavaPairRDD<String, Integer> maps = lines.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        });
        JavaPairRDD<String, Integer> reduces = maps.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        reduces.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> t) throws Exception {
                System.out.println(t._1 + "---->" + t._2);
            }
        });

        // Same pipeline written with Java 8 lambdas.
        jsc.textFile("D:\\bd_example\\data\\wordcount\\")
                .flatMap(x -> Arrays.asList(x.split("\t")).iterator())
                .mapToPair(v -> new Tuple2<>(v, 1))
                .reduceByKey((v1, v2) -> v1 + v2)
                .foreach(t -> System.out.println(t._1 + "\t" + t._2));

        jsc.stop();
    }
}
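The foreach/println calls above are convenient for a local run; on a cluster you would normally persist the result instead. A minimal sketch, where the output directory is only a placeholder and must not already exist:

// Write the (word, count) pairs out as text files, one part file per partition,
// instead of printing them on the executors.
reduces.saveAsTextFile("D:\\bd_example\\output\\wordcount");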
Scala
// Pure Scala collections version: split, flatten, pair, group, count, sort.
val array2 = Array("a b c", "a c d e s", "a d e g")
println(array2
  .map(x => x.split(" "))
  .flatten
  .map(x => (x, 1))
  .groupBy(x => x._1)
  .map(x => (x._1, x._2.length))
  .toList
  .sortWith((x, y) => x._1 > y._1))

// Same logic written with placeholder syntax.
val stringToInt = array2
  .flatMap(_.split(" "))
  .map((_, 1))
  .groupBy(_._1)
  .map(t => (t._1, t._2.length))
  .toList
  .sortWith((x, y) => x._1 > y._1)
println(stringToInt)

// Spark version (separate snippet): parallelize the sample data and count per key.
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setAppName(AggreatBy.getClass.getSimpleName).setMaster("local[1]")
val sc = new SparkContext(conf)
val array2 = Array("a b c", "a c d e s", "a d e g")
val valuerdd: RDD[String] = sc.parallelize(array2)
val value = valuerdd.flatMap(_.split("\\s+")).map((_, 1))
// countByKey brings the per-word totals back to the driver as a local Map.
val stringToLong: collection.Map[String, Long] = value.countByKey()
for ((k, v) <- stringToLong) {
  println(s"${k},${v}")
}