A solution to a problem from the October 2017 big data competition
Data: data.txt contains 5,000,000 integers, each in the range 1-1000.
[root@master ~]# head data.txt
851,944,745,729,855,954,179,124,753
710,990,349,777,36,970,688,463,450
385,178,816,361,797,792,960,708,459
840,128,997,610,722,180,887,635,205
840,623,804,327,645,379,986,489,420
210,181,356,411,909,498,654,59,288
568,828,352,420,829,629,183,993,308
858,118,598,561,227,828,158,38,203
96,187,22,654,300,194,97,990,509
508,728,10,569,726,588,822,219,571
How the data was generated:
[root@master ~]# cat genNum.sh
#!/bin/bash
# Append 5,000,000 random integers in [1,1000] to ./data.txt, 9 per line.
# Note: $RANDOM ranges over 0-32767, so the modulo slightly biases the
# distribution; that is acceptable for test data.
count=0
for k in $(seq 1 555556)
do
    # first 8 numbers of the line, each followed by a comma
    for j in $(seq 1 8)
    do
        num=$((RANDOM % 1000 + 1))
        printf '%s,' "$num" >> ./data.txt
        count=$((count + 1))
        if [ $count -eq 5000000 ]; then break; fi
    done
    if [ $count -eq 5000000 ]; then break; fi
    # the 9th number ends the line
    num=$((RANDOM % 1000 + 1))
    printf '%s\n' "$num" >> ./data.txt
    count=$((count + 1))
done
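Running the script and sanity-checking the result: converting every comma to a newline leaves one number per line, so wc -l should report exactly 5000000.
[root@master ~]# bash genNum.sh
[root@master ~]# tr ',' '\n' < data.txt | wc -l
5000000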
Requirement: use a MapReduce program to remove from data.txt every number that starts with the digit 2 (i.e. 2, 20-29, and 200-299). The plan: the map phase splits each line on commas, drops the 2-prefixed numbers, and emits each surviving number with a count of 1; the reduce phase sums the counts, so the final output is a per-number frequency table containing no 2-prefixed numbers.
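For cross-checking the job's output later, the same filter-and-count can be done on one machine with standard tools (a quick sketch, not part of the graded answer):
[root@master ~]# tr ',' '\n' < data.txt | grep -v '^2' | sort -n | uniq -c | head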
WordMapper.java
package com.jsit;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);
    private final Text outKey = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // The input is comma-separated, so split on "," (not on spaces).
        String[] numbers = value.toString().split(",");
        for (String number : numbers) {
            // This is the actual removal step: skip any number whose
            // decimal form starts with the digit 2.
            if (number.startsWith("2")) {
                continue;
            }
            outKey.set(number);
            context.write(outKey, ONE);
        }
    }
}
WordReducer.java
package com.jsit;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordReducer extends Reducer<Text, IntWritable, Text, LongWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the 1s emitted by the mapper: the number of occurrences
        // of this (already filtered) number.
        long count = 0;
        for (IntWritable v : values) {
            count += v.get();
        }
        context.write(key, new LongWritable(count));
    }
}
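One optional refinement, not in the original answer: WordReducer cannot be reused as a combiner, because a combiner must emit the map output types (Text, IntWritable) while WordReducer emits (Text, LongWritable). If map-side pre-aggregation is wanted, a separate combiner is needed; a minimal sketch:
WordCombiner.java (hypothetical)
package com.jsit;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Pre-sum counts on the map side; output types match the map output.
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
It would be enabled in the driver with job.setCombinerClass(WordCombiner.class);.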
Test.java
package com.jsit;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Test {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Test.class);

        job.setMapperClass(WordMapper.class);
        job.setReducerClass(WordReducer.class);

        // The map output types (Text, IntWritable) differ from the final
        // output types (Text, LongWritable), so both pairs are declared.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Local paths for a local-runner test; the output directory must
        // not already exist, or the job will fail on startup.
        FileInputFormat.setInputPaths(job, "e:/temp/data.txt");
        FileOutputFormat.setOutputPath(job, new Path("e:/temp/out/"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
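The e:/temp paths run the job with Hadoop's local runner. To run it on the cluster from the transcripts above, upload the data to HDFS, point the two paths at HDFS locations, package the classes into a jar, and submit it. A sketch of that workflow, assuming the paths in Test are changed to /input/data.txt and /out, and that the jar is named filter.jar (both names are illustrative):
[root@master ~]# hdfs dfs -mkdir -p /input
[root@master ~]# hdfs dfs -put data.txt /input/
[root@master ~]# hadoop jar filter.jar com.jsit.Test
[root@master ~]# hdfs dfs -cat /out/part-r-00000 | cut -f1 | grep -c '^2'
0
The final command counts output keys that still start with 2; it should print 0.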