MapReduce Case 8: Finding the Most Frequently Accessed Table, Its Most Frequent User, and That User's Total Time Spent

Problem:

user1	1:00	u1	1.5
user2	2:00	u2	0.5
user3	3:00	u3	0.1
user4	4:00	u1	1.4
user5	5:00	u4	1.3
user6	6:00	u4	1.9
user7	7:00	u5	2.4
user8	8:00	u1	0.1
user9	9:00	u6	0.6
user10	10:00	u1	0.5
user11	1:00	u2	0.2
user12	3:00	u4	0.9
user13	4:00	u2	9.1
user14	6:00	u1	6.1
user15	5:00	u5	5.1
user10	10:00	u2	0.4
user10	10:00	u3	0.4
user3	10:00	u2	0.4
user4	10:00	u2	0.4


Use Hadoop to analyze massive log files. Each log line records the following fields:
TableName, Time, User, TimeSpan (time spent)

Write a MapReduce program that determines, for a peak time slot (e.g., 10:00 AM), which table is accessed most frequently, which user accesses that table most often during that slot, and that user's total time spent.

Approach: the problem splits into two parts.

Part 1: for a given time slot, which table is accessed most frequently?

Handling: use the table name plus the time slot as the key, count the records in each group (that count is the number of accesses), and take the maximum. The output is split into a separate file per time slot.

Part 2: use the table name, time slot, and user as the key, count the records in each group while summing the time spent, and take the group with the largest count to get the answer.
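To make the key design concrete, here is how a single input line is mapped in each of the two steps (illustrative only, using the tab-separated layout of the sample data above):

Input line:              user10	10:00	u2	0.4
Step 1 map output key:   user10	10:00	(value: NullWritable, one per access)
Step 2 map output key:   user10	10:00	u2	(value: 0.4, the time spent)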

Because of the limited sample data given here, the focus is on the solution approach; the results below are for reference only.

Step 1 code:

Partitioner code:

/**
 * @author: lpj   
 * @date: 2018-03-16 22:13:24
 * @Description:
 */
package lpj.reduceWorkbean;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Routes each record to a partition based on its time slot,
 * so that every time slot ends up in its own output file.
 */
public class MyPatitionerAccess extends Partitioner<Text, NullWritable> {

	/* (non-Javadoc)
	 * @see org.apache.hadoop.mapreduce.Partitioner#getPartition(java.lang.Object, java.lang.Object, int)
	 */
	@Override
	public int getPartition(Text key, NullWritable value, int numPartitions) {
		//key format: tableName \t time, e.g. "user1	1:00"
		String[] reads = key.toString().split("\t");
		//map the hours 1:00-19:00 to partitions 0-18; everything else goes to partition 19
		try {
			int hour = Integer.parseInt(reads[1].split(":")[0]);
			if (hour >= 1 && hour <= 19) {
				return hour - 1;
			}
		} catch (NumberFormatException e) {
			//fall through to the default partition
		}
		return 19;
	}

}

Main job code:

/**
 * @author: lpj   
 * @date: 2018-03-16 19:16:47
 * @Description:
 */
package lpj.reduceWork;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import lpj.reduceWorkbean.MyPatitionerAccess;
/**
 * Step 1: count how many times each table is accessed in each time slot.
 */
public class FrequentAccessMR {
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
//		conf.addResource("hdfs-site.xml");//use a configuration file
//		System.setProperty("HADOOP_USER_NAME", "hadoop");//run against the cluster
		FileSystem fs = FileSystem.get(conf);//defaults to the local file system
		
		Job job = Job.getInstance(conf);
		job.setJarByClass(FrequentAccessMR.class);
		job.setMapperClass(FrequentAccessMR_Mapper.class);
		job.setReducerClass(FrequentAccessMR_Reducer.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		job.setPartitionerClass(MyPatitionerAccess.class);//custom partitioner: one partition per time slot
		job.setNumReduceTasks(20);//number of reducers, and therefore output files
		
		Path inputPath = new Path("d:/a/homework8.txt");
		Path outputPath = new Path("d:/a/homework8");
		if (fs.exists(outputPath)) {//delete the output directory if it already exists
			fs.delete(outputPath, true);
		}
		
		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);
		boolean isdone = job.waitForCompletion(true);
		System.exit(isdone ? 0 : 1);
	}
	
	public static class FrequentAccessMR_Mapper extends Mapper<LongWritable, Text, Text, NullWritable>{
		Text kout = new Text();
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			//e.g. user4	10:00	u2	0.4
			String[] reads = value.toString().trim().split("\t");
			String kk = reads[0] + "\t" + reads[1];//combine table name and time slot as the key
			kout.set(kk);
			context.write(kout, NullWritable.get());
		}
	}
	public static class FrequentAccessMR_Reducer extends Reducer<Text, NullWritable, Text, NullWritable>{
		Text kout = new Text();
		@Override
		protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
			int accessNum = 0;
			//count the accesses of each table in each time slot
			for(NullWritable vin : values){
				accessNum ++;
			}
			String kk = key.toString() + "\t" + accessNum;
			kout.set(kk);
			context.write(kout, NullWritable.get());
		}
		
	}

}

The job writes 20 output files, one per time slot.

The file for the 10:00 slot contains:

user10	10:00	5
user3	10:00	1
user4	10:00	1

So within the 10:00 time slot, the table user10 has the highest access count.
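Here the maximum is simply read off by eye. As an optional helper, the minimal sketch below scans one step 1 partition file and prints the line with the highest access count. The class name PickMaxAccess and the path part-r-00009 (the partition index for 10:00 under the partitioner above, assuming default output file naming) are my own additions, not part of the original solution.

package lpj.reduceWork;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

/** Standalone helper (hypothetical): find the most-accessed table in one step 1 partition file. */
public class PickMaxAccess {
	public static void main(String[] args) throws IOException {
		String path = "d:/a/homework8/part-r-00009";//assumed name of the 10:00 partition output
		String maxLine = null;
		int maxCount = -1;
		try (BufferedReader br = new BufferedReader(new FileReader(path))) {
			String line;
			while ((line = br.readLine()) != null) {
				String[] fields = line.split("\t");//tableName \t timeSlot \t accessCount
				int count = Integer.parseInt(fields[2]);
				if (count > maxCount) {//keep the line with the largest access count
					maxCount = count;
					maxLine = line;
				}
			}
		}
		System.out.println(maxLine);//expected here: user10	10:00	5
	}
}

A similar scan over the step 2 output (where the access count is the fourth column) would pick the most frequent user automatically.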

Step 2: keep only records with table name user10 and time slot 10:00, use table name + time slot + user as the key, and for each group count the accesses and accumulate the time spent.

/**
 * @author: lpj   
 * @date: 2018-03-16 19:16:47
 * @Description:
 */
package lpj.reduceWork;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import lpj.reduceWorkbean.MyPatitionerAccess;
/**
 * Step 2: for table user10 in the 10:00 slot, count each user's accesses and sum their time spent.
 */
public class FrequentAccessMR2 {
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
//		conf.addResource("hdfs-site.xml");//use a configuration file
//		System.setProperty("HADOOP_USER_NAME", "hadoop");//run against the cluster
		FileSystem fs = FileSystem.get(conf);//defaults to the local file system
		
		Job job = Job.getInstance(conf);
		job.setJarByClass(FrequentAccessMR2.class);
		job.setMapperClass(FrequentAccessMR_Mapper.class);
		job.setReducerClass(FrequentAccessMR_Reducer.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		job.setPartitionerClass(MyPatitionerAccess.class);//kept from step 1; with the default single reduce task it has no practical effect here
		
		Path inputPath = new Path("d:/a/homework8.txt");
		Path outputPath = new Path("d:/a/homework8_2");
		if (fs.exists(outputPath)) {//delete the output directory if it already exists
			fs.delete(outputPath, true);
		}
		
		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);
		boolean isdone = job.waitForCompletion(true);
		System.exit(isdone ? 0 : 1);
	}
	
	public static class FrequentAccessMR_Mapper extends Mapper<LongWritable, Text, Text, Text>{
		Text kout = new Text();
		Text valueout = new Text();
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			//e.g. user4	10:00	u2	0.4
			String[] reads = value.toString().trim().split("\t");
			if (reads[0].equals("user10") && reads[1].equals("10:00")) {//keep only the most-accessed table in the peak slot
				kout.set(reads[0] + "\t" + reads[1] + "\t" + reads[2]);//group by table name, time slot, and user
				valueout.set(reads[3]);//the time spent is the value
				context.write(kout, valueout);
			}
		}
	}
	public static class FrequentAccessMR_Reducer extends Reducer<Text, Text, Text, Text>{
		Text kout = new Text();
		Text valueout = new Text();
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
			int accessNum = 0;
			double sumtime = 0;
			//count each user's accesses and accumulate their time spent
			for(Text vin : values){
				accessNum ++;
				sumtime += Double.parseDouble(vin.toString());
			}
			String kk = key.toString() + "\t" + accessNum;
			kout.set(kk);
			valueout.set(sumtime + "");
			context.write(kout, valueout);
		}
		
	}

}

The result:

user10	10:00	u1	3	0.8
user10	10:00	u2	1	0.4
user10	10:00	u3	1	0.4

That is, user u1 accesses the table most frequently, with a total time spent of 0.8.

