Output the login records of the website's users.
The service backend keeps three log files, namely:
The full user list file, user-log:
u_id name
1 小红
2 小行
3 小通
4 小闪
5 小镇
6 小振
7 小秀
8 小微
9 小懂
10 小明
11 小刚
12 小举
13 小黑
14 小白
15 小鹏
16 小习
The user gender file, sex-log:
sex_id sex
0 不知道
1 男
2 女
The login-time record file, login-log:
u_id sex_id login_time
1 1 2017-04-17 08:16:20
2 2 2017-04-15 06:18:20
3 1 2017-04-16 05:16:24
4 2 2017-04-14 03:18:20
5 1 2017-04-13 02:16:25
6 2 2017-04-13 01:15:20
7 1 2017-04-12 08:16:34
8 2 2017-04-11 09:16:20
9 0 2017-04-10 05:16:50
The final result produced by the MapReduce job should look like this:
1 小红 男 2017-04-17 08:16:20
2 小行 女 2017-04-15 06:18:20
...
The requirement itself is simple: read login-log line by line, split each line on the space character " ", use u_id to fetch the name from user-log and sex_id to fetch the readable sex from sex-log, then pack the split fields together with the looked-up values into a subclass of Writable. Emitting one such object per record yields the final result.
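Before wiring this into MapReduce, the per-line join can be sketched in plain Java. This is only an illustration, not the job code: the class name JoinSketch and the hard-coded maps are made up here, standing in for user-log and sex-log already loaded into memory.
import java.util.HashMap;
import java.util.Map;
public class JoinSketch {
public static void main(String[] args) {
Map<Integer, String> userMap = new HashMap<>();
userMap.put(1, "小红");
Map<Integer, String> sexMap = new HashMap<>();
sexMap.put(1, "男");
// one line of login-log: "u_id sex_id login_time"
String line = "1 1 2017-04-17 08:16:20";
// limit the split to 3 pieces so the timestamp, which itself contains a space, stays whole
String[] split = line.split(" ", 3);
String name = userMap.get(Integer.parseInt(split[0]));
String sex = sexMap.get(Integer.parseInt(split[1]));
System.out.println(split[0] + " " + name + " " + sex + " " + split[2]);
// prints: 1 小红 男 2017-04-17 08:16:20
}
}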
My Writable implementation class, Person:
package com.ljy.demo15;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class Person implements Writable{
private int id;
private String sex;
private String name;
private String loginfo;
public Person() {
}
public Person(int id, String sex, String name, String loginfo) {
this.id = id;
this.sex = sex;
this.name = name;
this.loginfo = loginfo;
}
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getSex() {
return sex;
}
public void setSex(String sex) {
this.sex = sex;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getLoginfo() {
return loginfo;
}
public void setLoginfo(String loginfo) {
this.loginfo = loginfo;
}
@Override
public void readFields(DataInput in) throws IOException {
this.id = in.readInt();
this.sex = in.readUTF();
this.name = in.readUTF();
this.loginfo = in.readUTF();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(id);
out.writeUTF(sex);
out.writeUTF(name);
out.writeUTF(loginfo);
}
@Override
public String toString() {
return id+" "+sex+" "+name+" "+loginfo;
}
}
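A quick way to sanity-check the write/readFields pair before running the job is a local round trip. This is only a sketch (the class name PersonSerializationCheck is made up); the point is that readFields must consume the fields in exactly the order write produced them:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
public class PersonSerializationCheck {
public static void main(String[] args) throws Exception {
Person original = new Person(1, "男", "小红", "2017-04-17 08:16:20");
// serialize with write()
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
original.write(new DataOutputStream(bytes));
// deserialize with readFields(), reading the fields back in the same order
Person copy = new Person();
copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
System.out.println(copy); // 1 男 小红 2017-04-17 08:16:20
}
}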
My custom Mapper class:
static class DemoMapper extends Mapper<LongWritable, Text, Person, NullWritable> {
// cached contents of sex-log
private Map<Integer, String> sex_map = new HashMap<>();
// cached contents of user-log
private Map<Integer, String> user_map = new HashMap<>();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] split = line.split(" ", 3); // limit to 3 so the timestamp, which contains a space, stays intact
String id = split[0];
String sexid = split[1];
String loginfo = split[2];
// look up the name by u_id
String name = user_map.get(Integer.parseInt(id));
// look up the readable sex by sex_id
String sex = sex_map.get(Integer.parseInt(sexid));
// wrap the parsed and looked-up fields in the Writable implementation
Person p = new Person(Integer.parseInt(id), sex, name, loginfo);
context.write(p, NullWritable.get()); // NullWritable.get() instead of null so the value can be serialized
}
/**
* Load the two small lookup tables from the distributed cache.
*
* @param context
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
for (Path path : paths) {
String name = path.getName();
// read sex-log and cache it in a map
if (name.equals("sex-log")) {
BufferedReader br = new BufferedReader(new FileReader(path.toString()));
String line = null;
while ((line = br.readLine()) != null) {
String[] sexs = line.split(" ");
sex_map.put(Integer.parseInt(sexs[0]), sexs[1]);
}
br.close();
}
// read user-log and cache it in a map
if (name.equals("user-log")) {
BufferedReader br = new BufferedReader(new FileReader(path.toString()));
String line = null;
while ((line = br.readLine()) != null) {
String[] users = line.split(" ");
user_map.put(Integer.parseInt(users[0]), users[1]);
}
br.close();
}
}
}
}
The main method of my Driver:
public static void main(String[] args) {
Configuration conf = new Configuration();
try {
Job job = Job.getInstance(conf, "demo15");
job.setJarByClass(DemoDriver.class);
job.setMapperClass(DemoMapper.class);
job.setMapOutputKeyClass(Person.class); // must match the Mapper's declared output key type
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Person.class);
job.setOutputValueClass(NullWritable.class);
// where sex-log and user-log are stored on HDFS (registered as cache files)
job.setCacheFiles(new URI[] { new URI("/test/demo15/cache/sex-log"), new URI("/test/demo15/cache/user-log") });
FileInputFormat.addInputPath(job, new Path("/test/demo15/input"));
Path path = new Path("/test/demo15/output");
FileOutputFormat.setOutputPath(job, path);
FileSystem fs = FileSystem.get(new URI("hdfs://192.168.93.111:9000"), conf);
if (fs.exists(path)) {
fs.delete(path, true);
}
int res = job.waitForCompletion(true) ? 0 : 1; // exit 0 on success
System.exit(res);
} catch (IOException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
} catch (URISyntaxException e) {
e.printStackTrace();
}
}
I assumed this would produce the final result without a hitch, but when I packaged the jar and ran it on Linux, the following error appeared:
18/05/09 23:25:09 INFO mapreduce.Job: map 0% reduce 0%
18/05/09 23:25:24 INFO mapreduce.Job: Task Id : attempt_1525876857912_0001_m_000000_0, Status : FAILED
Error: java.io.IOException: Initialization of all the collectors failed. Error in last collector was :class com.ljy.demo15.Person
at org.apache.hadoop.mapred.MapTask.createSortingCollector(MapTask.java:415)
at org.apache.hadoop.mapred.MapTask.access$100(MapTask.java:81)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.<init>(MapTask.java:698)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:770)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.lang.ClassCastException: class com.ljy.demo15.Person
at java.lang.Class.asSubclass(Class.java:3208)
at org.apache.hadoop.mapred.JobConf.getOutputKeyComparator(JobConf.java:887)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.init(MapTask.java:1004)
at org.apache.hadoop.mapred.MapTask.createSortingCollector(MapTask.java:402)
... 9 more
Since I am new to this, the error message did not point at a specific line of my code, so I searched online. Every answer I found said the wrong Text had been imported: it should be the Text under org.apache.hadoop.io, but some other package's Text had been pulled in. I assumed I had made the same mistake, yet after checking repeatedly, mine really was org.apache.hadoop.io.Text.
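For reference, the import that should be there, versus the kind of Text an IDE can auto-import by mistake (org.w3c.dom.Text is only one example of a wrong candidate):
// the Text that Hadoop MapReduce expects
import org.apache.hadoop.io.Text;
// an easy-to-pick wrong candidate from an IDE's auto-import list, e.g.:
// import org.w3c.dom.Text;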
After many pages of Google results I still had nothing, so I decided to put the problem aside; I could not afford to lose more time and had other things to do. As it happened, I had bought the book Data Algorithms that day, and its first chapter covers secondary sort, noting that MapReduce sorts map output keys by default. Suddenly it clicked: back when I learned TreeMap and TreeSet, a custom class could not be sorted automatically unless it provided a way to compare two objects, and my custom Person, used above as the map output key, needs exactly that kind of comparison. Writable only covers Hadoop's serialization and deserialization; to be comparable, the class has to implement the WritableComparable interface and override its compareTo method. Sure enough, after changing the code I got the correct result.
The final Person class:
package com.ljy.demo15;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class Person implements WritableComparable<Person> {
private int id;
private String sex;
private String name;
private String loginfo;
public Person() {
}
public Person(int id, String sex, String name, String loginfo) {
this.id = id;
this.sex = sex;
this.name = name;
this.loginfo = loginfo;
}
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getSex() {
return sex;
}
public void setSex(String sex) {
this.sex = sex;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getLoginfo() {
return loginfo;
}
public void setLoginfo(String loginfo) {
this.loginfo = loginfo;
}
@Override
public void readFields(DataInput in) throws IOException {
this.id = in.readInt();
this.sex = in.readUTF();
this.name = in.readUTF();
this.loginfo = in.readUTF();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(id);
out.writeUTF(sex);
out.writeUTF(name);
out.writeUTF(loginfo);
}
@Override
public String toString() {
return id+" "+sex+" "+name+" "+loginfo;
}
@Override
public int compareTo(Person o) {
int res = o.getId()-this.id;
return res==0?1:res; // returning 0 would mean the two keys are equal, and equal keys would be merged; so when ids tie, keep both in arrival order
}
}
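To see what this compareTo gives, here is a small local check (just a sketch, the class name CompareCheck is made up): Person objects sort with the larger id first, and two objects with the same id never compare as equal, so neither key is merged away.
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class CompareCheck {
public static void main(String[] args) {
List<Person> people = new ArrayList<>();
people.add(new Person(1, "男", "小红", "2017-04-17 08:16:20"));
people.add(new Person(3, "男", "小通", "2017-04-16 05:16:24"));
people.add(new Person(2, "女", "小行", "2017-04-15 06:18:20"));
Collections.sort(people); // uses Person.compareTo
for (Person p : people) {
System.out.println(p); // larger id prints first (descending order)
}
// same id still compares as "not equal", so both records would survive as map keys
Person a = new Person(1, "男", "小红", "t1");
Person b = new Person(1, "男", "小红", "t2");
System.out.println(a.compareTo(b)); // 1, never 0
}
}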
The final Driver class:
package com.ljy.demo15;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
@SuppressWarnings("deprecation")
public class DemoDriver {
static class DemoMapper extends Mapper<LongWritable, Text, Person, NullWritable> {
// cached contents of the gender lookup file
private Map<Integer, String> sex_map = new HashMap<>();
// cached contents of the user lookup file
private Map<Integer, String> user_map = new HashMap<>();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] split = line.split(" ", 3); // limit to 3 so the timestamp, which contains a space, stays intact
String id = split[0];
String sexid = split[1];
String loginfo = split[2];
String name = user_map.get(Integer.parseInt(id));
String sex = sex_map.get(Integer.parseInt(sexid));
Person p = new Person(Integer.parseInt(id), sex, name, loginfo);
context.write(p, NullWritable.get()); // NullWritable.get() instead of null so the value can be serialized
}
/**
* Load the two small lookup tables from the distributed cache.
*
* @param context
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
for (Path path : paths) {
String name = path.getName();
if (name.equals("t_sex")) {
BufferedReader br = new BufferedReader(new FileReader(path.toString()));
String line = null;
while ((line = br.readLine()) != null) {
String[] sexs = line.split(" ");
sex_map.put(Integer.parseInt(sexs[0]), sexs[1]);
}
br.close();
}
if (name.equals("t_user")) {
BufferedReader br = new BufferedReader(new FileReader(path.toString()));
String line = null;
while ((line = br.readLine()) != null) {
String[] users = line.split(" ");
user_map.put(Integer.parseInt(users[0]), users[1]);
}
br.close();
}
}
}
}
public static void main(String[] args) {
Configuration conf = new Configuration();
try {
Job job = Job.getInstance(conf, "demo15");
job.setJarByClass(DemoDriver.class);
job.setMapperClass(DemoMapper.class);
job.setMapOutputKeyClass(Person.class); // must match the Mapper's declared output key type
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Person.class);
job.setOutputValueClass(NullWritable.class);
job.setCacheFiles(new URI[] { new URI("/test/demo15/cache/t_sex"), new URI("/test/demo15/cache/t_user") });
FileInputFormat.addInputPath(job, new Path("/test/demo15/input"));
Path path = new Path("/test/demo15/output");
FileOutputFormat.setOutputPath(job, path);
FileSystem fs = FileSystem.get(new URI("hdfs://192.168.93.111:9000"), conf);
if (fs.exists(path)) {
fs.delete(path, true);
}
int res = job.waitForCompletion(true) ? 0 : 1; // exit 0 on success
System.exit(res);
} catch (IOException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
} catch (URISyntaxException e) {
e.printStackTrace();
}
}
}
Summary: when a custom serialized object is used as a MapReduce output key, it must implement the WritableComparable interface and override the compareTo method.
The above only records my learning process; it is not production practice. In real production, with a very large user base, reading user-log entirely into memory as my code does is a poor approach: the program loses Hadoop's scalability and can easily run out of memory.