Problem:
user1 1:00 u1 1.5
user2 2:00 u2 0.5
user3 3:00 u3 0.1
user4 4:00 u1 1.4
user5 5:00 u4 1.3
user6 6:00 u4 1.9
user7 7:00 u5 2.4
user8 8:00 u1 0.1
user9 9:00 u6 0.6
user10 10:00 u1 0.5
user11 1:00 u2 0.2
user12 3:00 u4 0.9
user13 4:00 u2 9.1
user14 6:00 u1 6.1
user15 5:00 u5 5.1
user10 10:00 u2 0.4
user10 10:00 u3 0.4
user3 10:00 u2 0.4
user4 10:00 u2 0.4
Use Hadoop to analyze massive log files. Each log line records the following fields:
TableName, Time, User, TimeSpan (the time cost of the access)
Write a MapReduce program to find which table is accessed most frequently during a peak time slot (e.g. 10:00 AM), which user accesses that table most often during that slot, and that user's total time cost.
Approach: the problem splits into two parts.
Part 1: for a given time slot, which table is accessed most frequently?
Method: use (table name, time slot) as the key, count the records in each group to get the access count, and take the maximum. The output is split into separate files by time slot.
Part 2: use (table name, time slot, user) as the key, count the records in each group while summing the time cost; the group with the largest count gives the answer.
Given the limited sample data, the focus here is on the approach; the results below are for reference only.
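Before the full jobs, here is a minimal sketch (illustration only, hypothetical class name) of how one log line splits into the four fields and how the two composite keys are built:
public class KeyDesignSketch {
public static void main(String[] args) {
//one log line: TableName \t Time \t User \t TimeSpan
String line = "user4\t10:00\tu2\t0.4";
String[] fields = line.split("\t");
//part 1 key: (table name, time slot) -> count records per group
String part1Key = fields[0] + "\t" + fields[1];//"user4\t10:00"
//part 2 key: (table name, time slot, user) -> count records and sum TimeSpan per group
String part2Key = part1Key + "\t" + fields[2];//"user4\t10:00\tu2"
double timeSpan = Double.parseDouble(fields[3]);//0.4
System.out.println(part1Key + " | " + part2Key + " | " + timeSpan);
}
}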
Step 1 code:
Partitioner code:
/**
* @author: lpj
* @date: March 16, 2018 10:13:24 PM
* @Description:
*/
package lpj.reduceWorkbean;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
*
*/
public class MyPatitionerAccess extends Partitioner<Text, NullWritable> {
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Partitioner#getPartition(java.lang.Object, java.lang.Object, int)
*/
@Override
public int getPartition(Text key, NullWritable value, int numPartitions) {
//key format: tableName \t timeSlot, e.g. user1 \t 1:00
String[] reads = key.toString().split("\t");//route each record to a partition by time slot
//time slots "1:00" through "19:00" map to partitions 0 through 18; anything else falls into the last partition
for (int hour = 1; hour <= 19; hour++) {
if (reads[1].equals(hour + ":00")) {
return hour - 1;
}
}
return 19;
}
}
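As a quick local sanity check (a hypothetical test snippet, not part of the original post), the partitioner can be called directly; a key in the 10:00 slot should go to partition 9, i.e. output file part-r-00009:
package lpj.reduceWorkbean;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
public class PartitionerCheck {
public static void main(String[] args) {
MyPatitionerAccess partitioner = new MyPatitionerAccess();
//keys have the shape "tableName \t timeSlot", as emitted by the mapper below
int p = partitioner.getPartition(new Text("user10\t10:00"), NullWritable.get(), 20);
System.out.println(p);//expected: 9, so 10:00 records end up in part-r-00009
}
}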
Driver code:
/**
* @author: lpj
* @date: March 16, 2018 7:16:47 PM
* @Description:
*/
package lpj.reduceWork;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import lpj.reduceWorkbean.MyPatitionerAccess;
/**
*
*/
public class FrequentAccessMR {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// conf.addResource("hdfs-site.xml");//load the cluster configuration file
// System.setProperty("HADOOP_USER_NAME", "hadoop");//run against the cluster as user hadoop
FileSystem fs = FileSystem.get(conf);//defaults to the local file system
Job job = Job.getInstance(conf);
job.setJarByClass(FrequentAccessMR.class);
job.setMapperClass(FrequentAccessMR_Mapper.class);
job.setReducerClass(FrequentAccessMR_Reducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setPartitionerClass(MyPatitionerAccess.class);//set the custom partitioner
job.setNumReduceTasks(20);//20 reduce tasks, one output file per time slot
Path inputPath = new Path("d:/a/homework8.txt");
Path outputPath = new Path("d:/a/homework8");
if (fs.exists(outputPath)) {//delete any previous output so the job can run again
fs.delete(outputPath, true);
}
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
boolean isdone = job.waitForCompletion(true);
System.exit(isdone ? 0 : 1);
}
public static class FrequentAccessMR_Mapper extends Mapper<LongWritable, Text, Text, NullWritable>{
Text kout = new Text();
Text valueout = new Text();
@Override
protected void map(LongWritable key, Text value,Context context)throws IOException, InterruptedException {
//input line format: user4 10:00 u2 0.4 (tab-separated)
String [] reads = value.toString().trim().split("\t");
String kk = reads[0] + "\t" + reads[1];//combine table name and time slot as the key
kout.set(kk);
context.write(kout, NullWritable.get());
}
}
public static class FrequentAccessMR_Reducer extends Reducer<Text, NullWritable, Text, NullWritable>{
Text kout = new Text();
Text valueout = new Text();
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context)throws IOException, InterruptedException {
int accessNum = 0;
//count the number of accesses for each (table name, time slot) group
for(NullWritable vin : values){
accessNum ++;
}
String kk = key.toString() + "\t" + accessNum;
kout.set(kk);
context.write(kout, NullWritable.get());
}
}
}
The job produces 20 output files, one per time slot.
The file for the 10:00 slot contains:
user10 10:00 5
user3 10:00 1
user4 10:00 1
So in the 10:00 slot, table user10 has the highest access count.
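Instead of picking the maximum by eye, the most frequently accessed table in the 10:00 slot could also be read off programmatically (a hypothetical helper; the file name part-r-00009 follows from the 20-partition setup above):
import java.io.BufferedReader;
import java.io.FileReader;
public class PickMostAccessed {
public static void main(String[] args) throws Exception {
String file = "d:/a/homework8/part-r-00009";//output file for partition 9, the 10:00 slot
String bestTable = null;
int bestCount = -1;
try (BufferedReader br = new BufferedReader(new FileReader(file))) {
String line;
while ((line = br.readLine()) != null) {
String[] f = line.split("\t");//tableName \t timeSlot \t accessCount
int count = Integer.parseInt(f[2]);
if (count > bestCount) {
bestCount = count;
bestTable = f[0];
}
}
}
System.out.println(bestTable + " is accessed " + bestCount + " times in the 10:00 slot");
}
}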
Step 2: filter for table name user10 and time slot 10:00, use (table name, time slot, user) as the key, and for each group count the accesses and sum the time cost.
/**
* @author: lpj
* @date: March 16, 2018 7:16:47 PM
* @Description:
*/
package lpj.reduceWork;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import lpj.reduceWorkbean.MyPatitionerAccess;
/**
*
*/
public class FrequentAccessMR2 {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// conf.addResource("hdfs-site.xml");//load the cluster configuration file
// System.setProperty("HADOOP_USER_NAME", "hadoop");//run against the cluster as user hadoop
FileSystem fs = FileSystem.get(conf);//defaults to the local file system
Job job = Job.getInstance(conf);
job.setJarByClass(FrequentAccessMR2.class);
job.setMapperClass(FrequentAccessMR_Mapper.class);
job.setReducerClass(FrequentAccessMR_Reducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setPartitionerClass(MyPatitionerAccess.class);//set the custom partitioner (with the default single reduce task this has no effect)
Path inputPath = new Path("d:/a/homework8.txt");
Path outputPath = new Path("d:/a/homework8_2");
if (fs.exists(outputPath)) {//delete any previous output so the job can run again
fs.delete(outputPath, true);
}
FileInputFormat.setInputPaths(job, inputPath);
FileOutputFormat.setOutputPath(job, outputPath);
boolean isdone = job.waitForCompletion(true);
System.exit(isdone ? 0 : 1);
}
public static class FrequentAccessMR_Mapper extends Mapper<LongWritable, Text, Text, Text>{
Text kout = new Text();
Text valueout = new Text();
@Override
protected void map(LongWritable key, Text value,Context context)throws IOException, InterruptedException {
//input line format: user4 10:00 u2 0.4 (tab-separated)
String [] reads = value.toString().trim().split("\t");
if (reads[0].equals("user10") && reads[1].equals("10:00")) {
kout.set(reads[0] + "\t" + reads[1] + "\t" + reads[2]);//key: table name + time slot + user
valueout.set(reads[3]);//value: the time cost of this record
context.write(kout, valueout);
}
}
}
public static class FrequentAccessMR_Reducer extends Reducer<Text, Text, Text, Text>{
Text kout = new Text();
Text valueout = new Text();
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException {
int accessNum = 0;
double sumtime = 0;
//count this user's accesses and sum the time cost
for(Text vin : values){
accessNum ++;
sumtime += Double.parseDouble(vin.toString());
}
String kk = key.toString() + "\t" + accessNum;
kout.set(kk);
valueout.set(sumtime+"");
context.write(kout, valueout);
}
}
}
The result is:
user10 10:00 u1 3 0.8
user10 10:00 u2 1 0.4
user10 10:00 u3 1 0.4
So user u1 accesses table user10 most frequently in the 10:00 slot, with a total time cost of 0.8.
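For a data set this small, the step-2 aggregation can also be cross-checked without MapReduce using a plain HashMap pass over the same input file (a hypothetical sanity-check sketch; the numbers depend on the actual contents of homework8.txt):
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.HashMap;
import java.util.Map;
public class Step2CrossCheck {
public static void main(String[] args) throws Exception {
Map<String, Integer> counts = new HashMap<>();
Map<String, Double> times = new HashMap<>();
try (BufferedReader br = new BufferedReader(new FileReader("d:/a/homework8.txt"))) {
String line;
while ((line = br.readLine()) != null) {
String[] f = line.trim().split("\t");//TableName \t Time \t User \t TimeSpan
if (f[0].equals("user10") && f[1].equals("10:00")) {
counts.merge(f[2], 1, Integer::sum);//access count per user
times.merge(f[2], Double.parseDouble(f[3]), Double::sum);//total time cost per user
}
}
}
for (String user : counts.keySet()) {
System.out.println(user + "\t" + counts.get(user) + "\t" + times.get(user));
}
}
}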