A few MapReduce examples, using Hadoop 2.6.5.
1. Word count
The input format is as follows (word and count, separated by a tab):
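A few made-up input lines in that format, purely for illustration (the words and counts are hypothetical):

hello   3
world   2
hello   5
world   1

With this input, the job below would emit "hello 8" and "world 3": the reducer sums every count recorded for the same word.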
package com.mr.test;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MRTest {

    public static class C01Mapper extends Mapper<Object, Text, Text, IntWritable> {

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] line = value.toString().split("\t");
            // Emit (word, count) only for well-formed two-column lines.
            if (line.length == 2) {
                context.write(new Text(line[0]), new IntWritable(Integer.parseInt(line[1])));
            }
        }
    }

    public static class C01Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum all counts seen for the same word.
            int i = 0;
            for (IntWritable value : values) {
                i += value.get();
            }
            context.write(key, new IntWritable(i));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Arguments: args[0] = input path, args[1] = output path,
        // args[2] = split size in MB (unitmb), args[3] = number of reducers.
        String in = args[0];
        String out = args[1];
        int unitmb = Integer.parseInt(args[2]);
        int nreducer = Integer.parseInt(args[3]);

        Configuration conf = new Configuration();
        // Pin the input splits to roughly unitmb megabytes each.
        conf.set("mapreduce.input.fileinputformat.split.maxsize", String.valueOf(unitmb * 1024 * 1024));
        conf.set("mapreduce.input.fileinputformat.split.minsize", String.valueOf(unitmb * 1024 * 1024));
        conf.set("mapreduce.input.fileinputformat.split.minsize.per.node", String.valueOf(unitmb * 1024 * 1024));
        conf.set("mapreduce.input.fileinputformat.split.minsize.per.rack", String.valueOf(unitmb * 1024 * 1024));

        Job job = Job.getInstance(conf);
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        job.setMapperClass(C01Mapper.class);
        job.setReducerClass(C01Reducer.class);
        job.setNumReduceTasks(nreducer);
        // The reducer doubles as a combiner because summing is associative.
        job.setCombinerClass(C01Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setJarByClass(MRTest.class);
        job.waitForCompletion(true);
    }
}
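By analogy with the run commands of the later examples, launching this job would look roughly like the line below (the jar name and the two paths are assumptions, not from the original):

hadoop jar ~/c01mrtest.jar com.mr.test.MRTest TestData/wordcount.txt /DataWorld/wordresult 128 1

The last two arguments cap each input split at 128 MB and request a single reducer.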
2. Users' dwell time on websites
Input format (user, milliseconds, website, separated by tabs):
A 100 baidu.com
B 900 google.com
C 515 sohu.com
D 618 sina.com
E 791 google.com
B 121 baidu.com
C 915 google.com
D 112 sohu.com
E 628 sina.com
A 681 google.com
C 121 baidu.com
D 215 google.com
E 812 sohu.com
A 128 sina.com
B 291 google.com
package com.mr.test;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MRWeb {

    public static class C02Mapper extends Mapper<Object, Text, Text, Text> {
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] line = value.toString().split("\t");
            // Format check: user, time, website.
            if (line.length == 3) {
                String name = line[0];
                String time = line[1];
                String website = line[2];
                // Composite key (name + time) enables a secondary sort on time.
                context.write(new Text(name + "\t" + time), new Text(time + "\t" + website));
            }
        }
    }

    public static class C02Partitioner extends Partitioner<Text, Text> {

        @Override
        public int getPartition(Text key, Text value, int number) {
            // Partition on the user name alone, so every record of one user
            // reaches the same reducer.
            String name = key.toString().split("\t")[0];
            int hash = name.hashCode();
            return Math.abs(hash % number);
        }
    }

    public static class C02Sort extends WritableComparator {
        // This constructor is required; 'true' makes WritableComparator
        // deserialize Text instances for compare().
        protected C02Sort() {
            super(Text.class, true);
        }

        @Override
        public int compare(WritableComparable w1, WritableComparable w2) {
            Text h1 = new Text(((Text) w1).toString().split("\t")[0]);
            Text h2 = new Text(((Text) w2).toString().split("\t")[0]);
            IntWritable m1 = new IntWritable(Integer.parseInt(((Text) w1).toString().split("\t")[1]));
            IntWritable m2 = new IntWritable(Integer.parseInt(((Text) w2).toString().split("\t")[1]));

            // Sort by name ascending, then by time descending.
            int result;
            if (h1.equals(h2)) {
                result = m2.compareTo(m1);
            } else {
                result = h1.compareTo(h2);
            }
            return result;
        }
    }

    public static class C02Group extends WritableComparator {
        protected C02Group() {
            super(Text.class, true);
        }

        @Override
        public int compare(WritableComparable w1, WritableComparable w2) {
            // Group by the user name alone, ignoring the time part of the key.
            Text h1 = new Text(((Text) w1).toString().split("\t")[0]);
            Text h2 = new Text(((Text) w2).toString().split("\t")[0]);
            return h1.compareTo(h2);
        }
    }

    public static class C02Reducer extends Reducer<Text, Text, IntWritable, Text> {

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            String name = key.toString().split("\t")[0];
            // Grouping and sorting are already done; just number and emit.
            for (Text value : values) {
                count++;
                StringBuilder buffer = new StringBuilder();
                buffer.append(name);
                buffer.append("\t");
                buffer.append(value.toString());
                context.write(new IntWritable(count), new Text(buffer.toString()));
            }
        }
    }

    public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
        // Arguments: args[0] = input path, args[1] = output path,
        // args[2] = split size in MB (unitmb), args[3] = number of reducers.
        if (args.length != 4) {
            System.out.println("usage: <in> <out> <unitmb> <nreducer>");
            System.exit(0);
        }

        String in = args[0];
        String out = args[1];
        int unitmb = Integer.parseInt(args[2]);
        int nreducer = Integer.parseInt(args[3]);

        Configuration conf = new Configuration();
        conf.set("mapreduce.input.fileinputformat.split.maxsize", String.valueOf(unitmb * 1024 * 1024));
        conf.set("mapreduce.input.fileinputformat.split.minsize", String.valueOf(unitmb * 1024 * 1024));
        conf.set("mapreduce.input.fileinputformat.split.minsize.per.node", String.valueOf(unitmb * 1024 * 1024));
        conf.set("mapreduce.input.fileinputformat.split.minsize.per.rack", String.valueOf(unitmb * 1024 * 1024));

        Job job = Job.getInstance(conf);
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        job.setMapperClass(C02Mapper.class);
        job.setReducerClass(C02Reducer.class);
        job.setNumReduceTasks(nreducer);
        job.setPartitionerClass(C02Partitioner.class);
        job.setGroupingComparatorClass(C02Group.class);
        job.setSortComparatorClass(C02Sort.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        job.setJarByClass(MRWeb.class);
        job.waitForCompletion(true);
    }
}
Run: hadoop jar ~/c02mrtest.jar com.mr.test.MRWeb TestData/webcount.txt /DataWorld/webresult 128 1
The result looks like this:
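Derived by hand from the sample input rather than copied from a run: with the single reducer requested above, keys sort by user name, each user's visits are ordered by dwell time descending, and the counter numbers them within each group, so the output should be:

1   A   681 google.com
2   A   128 sina.com
3   A   100 baidu.com
1   B   900 google.com
2   B   291 google.com
3   B   121 baidu.com
1   C   915 google.com
2   C   515 sohu.com
3   C   121 baidu.com
1   D   618 sina.com
2   D   215 google.com
3   D   112 sohu.com
1   E   812 sohu.com
2   E   791 google.com
3   E   628 sina.com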
3. Parsing JSON arrays
Input format (an ID, a tab, then a JSON array):
1 [{"name":"A","age":16,"maths":100}]
2 [{"name":"B","age":17,"maths":97}]
3 [{"name":"C","age":18,"maths":89}]
4 [{"name":"D","age":15,"maths":98}]
5 [{"name":"E","age":19,"maths":100}]
package com.mr.test;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

public class MRString {

    public static class C03Mapper extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] line = value.toString().split("\t");
            if (line.length == 2) {
                String c = line[0];
                String j = line[1];
                JSONArray jsonArray = JSONArray.fromObject(j);
                int size = jsonArray.size();
                // Emit one record per element of the JSON array.
                for (int i = 0; i < size; i++) {
                    String name = "";
                    String age = "";
                    String maths = "";
                    JSONObject jsonObject = jsonArray.getJSONObject(i);
                    if (jsonObject.containsKey("name")) {
                        name = jsonObject.getString("name");
                    }
                    if (jsonObject.containsKey("age")) {
                        age = jsonObject.getString("age");
                    }
                    if (jsonObject.containsKey("maths")) {
                        maths = jsonObject.getString("maths");
                    }
                    StringBuilder buffer = new StringBuilder();
                    buffer.append(name);
                    buffer.append("\t");
                    buffer.append(age);
                    buffer.append("\t");
                    buffer.append(maths);
                    context.write(new Text(c), new Text(buffer.toString()));
                }
            }
        }
    }

    public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
        // Arguments: args[0] = input path, args[1] = output path,
        // args[2] = split size in MB (unitmb).
        if (args.length != 3) {
            System.out.println("usage: <in> <out> <unitmb>");
            System.exit(0);
        }

        String in = args[0];
        String out = args[1];
        int unitmb = Integer.parseInt(args[2]);

        Configuration conf = new Configuration();
        conf.set("mapreduce.input.fileinputformat.split.maxsize", String.valueOf(unitmb * 1024 * 1024));
        conf.set("mapreduce.input.fileinputformat.split.minsize", String.valueOf(unitmb * 1024 * 1024));
        conf.set("mapreduce.input.fileinputformat.split.minsize.per.node", String.valueOf(unitmb * 1024 * 1024));
        conf.set("mapreduce.input.fileinputformat.split.minsize.per.rack", String.valueOf(unitmb * 1024 * 1024));

        Job job = Job.getInstance(conf);
        // These paths are in HDFS; the jars are shipped to each task's classpath.
        job.addFileToClassPath(new Path("TestData/json-lib-2.4-jdk15.jar"));
        job.addFileToClassPath(new Path("TestData/ezmorph-1.0.6.jar"));
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        job.setMapperClass(C03Mapper.class);
        // Required when there is no reducer: map output goes straight to HDFS.
        job.setNumReduceTasks(0);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setJarByClass(MRString.class);
        job.waitForCompletion(true);
    }
}
Run: hadoop jar ~/c03mrtest.jar com.mr.test.MRString TestData/jsonarray.txt /DataWorld/jsonoutput 128
Result:
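Again derived by hand rather than copied from a run: with zero reducers each map task writes its output directly, so for the five sample lines the part-m-00000 file should contain the ID followed by the extracted fields:

1   A   16  100
2   B   17  97
3   C   18  89
4   D   15  98
5   E   19  100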
One more point worth noting in this example (the directories passed to Path are HDFS directories):

job.addFileToClassPath(new Path("TestData/json-lib-2.4-jdk15.jar")); // download: http://json-lib.sourceforge.net/
job.addFileToClassPath(new Path("TestData/ezmorph-1.0.6.jar")); // download: http://ezmorph.sourceforge.net/

These two calls dynamically add the jars used for JSON parsing to the job; the server's own classpath cannot reach these files. When coding on the Windows client, json-lib-2.4-jdk15.jar was imported for the sake of syntax convenience, but ezmorph-1.0.6.jar (a runtime dependency of json-lib) was not.

In other words, jar files can be added to a program dynamically, as long as you know where they live in HDFS.
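Since those Path arguments resolve inside HDFS, the jars have to be uploaded there first. Assuming both jars sit in the current local directory (the local location is an assumption), the upload would look like:

hadoop fs -put json-lib-2.4-jdk15.jar TestData/
hadoop fs -put ezmorph-1.0.6.jar TestData/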