The following is QQ friend-list data. The user before the colon owns the list, and the users after the colon are that user's friends (the friend relationships in this data are one-directional).
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J
Question: which pairs of users have common friends, and who are their common friends?
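For example, A's friend list is B,C,D,E,F,O and B's friend list is A,C,E,K, so the pair A-B has the common friends C and E. The problem is solved in two MapReduce steps: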
Step 1: invert the lists. For each friend in a user's list, the mapper emits (friend, user); the reducer then joins, per friend, all users whose lists contain that friend, separated by "-".
Step 2: read step 1's output; for each friend, sort its user list and emit every pair of users with that friend as the value; the reducer then collects all common friends per pair.
public class ComonsFriendsStepOne extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
Configuration conf = super.getConf();
Job job = Job.getInstance(conf, ComonsFriendsStepOne.class.getSimpleName());
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\共同好友\\input"));
job.setMapperClass(ComonsFriendsStepOneMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(ComonsFriendsStepOneReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\共同好友\\output"));
boolean b = job.waitForCompletion(true);
return b?0:1;
}
public static class ComonsFriendsStepOneMapper extends Mapper<LongWritable,Text,Text,Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split(":");
String person = split[0];
String[] friends = split[1].split(",");
for (String friend : friends) {
context.write(new Text(friend),new Text(person));
}
}
}
public static class ComonsFriendsStepOneReducer extends Reducer<Text,Text,Text,Text>{
@Override
protected void reduce(Text friend, Iterable<Text> persons, Context context) throws IOException, InterruptedException {
StringBuffer buffer = new StringBuffer();
for (Text person : persons) {
buffer.append(person).append("-");
}
context.write(friend,new Text(buffer.toString()));
}
}
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
ToolRunner.run(configuration,new ComonsFriendsStepOne(),args);
}
}
public class ComonsFriendsStepTwo extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(super.getConf(), ComonsFriendsStepTwo.class.getSimpleName());
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\共同好友\\output"));
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setMapperClass(ComonsFriendStepTwoMapper.class);
job.setReducerClass(ComonsFriendStepTwoReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\共同好友\\outstep2"));
boolean b = job.waitForCompletion(true);
return b?0:1;
}
public static class ComonsFriendStepTwoMapper extends Mapper<LongWritable,Text,Text,Text>{
/**
 * Input lines are the output of step one (a friend, then the users who list that friend), e.g.:
 * A F-D-O-I-H-B-K-G-C-
 * B E-A-J-F-
 * C K-A-B-E-F-G-H-
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
String friends = split[0];
String[] persons = split[1].split("-");
//sort so that each pair is always emitted in the same order, avoiding both c-b and b-c
Arrays.sort(persons);
for(int i =0;i< persons.length -1 ;i++){
for(int j = i+1;j<persons.length;j++){
context.write(new Text(persons[i]+"-"+persons[j]),new Text(friends));
}
}
}
}
public static class ComonsFriendStepTwoReducer extends Reducer<Text,Text,Text,Text>{
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
StringBuffer buffer = new StringBuffer();
for (Text value : values) {
buffer.append(value.toString()+"\t");
}
context.write(key,new Text(buffer.toString()));
}
}
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration(),new ComonsFriendsStepTwo(),args);
}
}
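With the sample data above, each output line of step two is a user pair followed by that pair's common friends (tab-separated); for instance the line for A-B contains C and E. The order of the friends within a line depends on the shuffle.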
There is a large amount of text (documents, web pages) for which a search index needs to be built.
First, read out the full content of every document; the key is the word combined with the document name and the value is 1, so the data is organized in the following form.
Map-side output:
hello--a.txt 1
hello--a.txt 1
hello--a.txt 1
Reduce-side output:
hello--a.txt 3
public class IndexCreate extends Configured implements Tool {
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration(),new IndexCreate(),args);
}
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(super.getConf(), IndexCreate.class.getSimpleName());
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\倒排索引\\input"));
job.setMapperClass(IndexCreateMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(IndexCreateReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\倒排索引\\outindex"));
boolean bool = job.waitForCompletion(true);
return bool?0:1;
}
public static class IndexCreateMapper extends Mapper<LongWritable,Text,Text,IntWritable>{
Text text = new Text();
IntWritable v = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//get the file split this record belongs to
FileSplit fileSplit = (FileSplit) context.getInputSplit();
//get the file name from the file split
String name = fileSplit.getPath().getName();
String line = value.toString();
String[] split = line.split(" ");
//emit word--filename as the key and 1 as the value
for (String word : split) {
text.set(word+"--"+name);
context.write(text,v);
}
}
}
public static class IndexCreateReducer extends Reducer<Text,IntWritable,Text,IntWritable>{
IntWritable value = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for (IntWritable value : values) {
count += value.get();
}
value.set(count);
context.write(key,value);
}
}
}
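The output above is still keyed by word--filename, with one count per file. If a full inverted-index line per word is wanted (the word followed by file:count postings), a second, very similar job can merge it. The mapper and reducer below are only a hedged sketch: the class names are made up, the driver and imports are omitted as in the classes above, and it assumes the text output of the first job as its input.
public static class IndexMergeMapper extends Mapper<LongWritable,Text,Text,Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//split a line like "hello--a.txt<TAB>3" into "hello--a.txt" and "3"
String[] parts = value.toString().split("\t");
//split "hello--a.txt" into the word and the file name
String[] wordAndFile = parts[0].split("--");
context.write(new Text(wordAndFile[0]),new Text(wordAndFile[1]+":"+parts[1]));
}
}
public static class IndexMergeReducer extends Reducer<Text,Text,Text,Text>{
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
//concatenate all file:count postings for this word
StringBuilder postings = new StringBuilder();
for (Text v : values) {
postings.append(v.toString()).append(" ");
}
context.write(key,new Text(postings.toString().trim()));
}
}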
Small files hurt efficiency on both HDFS and MapReduce, yet in practice large numbers of small files are often unavoidable, so a corresponding solution is needed.
Small-file optimization essentially comes down to the following approaches:
1. At data-collection time, merge small files or small batches of data into large files before uploading them to HDFS.
2. Before business processing, run a MapReduce program on HDFS to merge the small files.
3. At MapReduce processing time, use CombineTextInputFormat to improve efficiency (a configuration sketch follows below).
This section implements the second approach.
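For reference, the third approach usually needs only a couple of extra lines in the driver. This is a minimal sketch, assuming CombineTextInputFormat from org.apache.hadoop.mapreduce.lib.input and an illustrative 4 MB ceiling on the combined split size; it is not the approach implemented in this section.
//inside the run() method of a driver such as the ones above
job.setInputFormatClass(CombineTextInputFormat.class);
//pack many small files into splits of at most about 4 MB (the value is illustrative)
CombineTextInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024);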
The core mechanism of the program:
Define a custom InputFormat.
Override its RecordReader so that one complete file is read per call and wrapped as a single key/value pair.
On the output side, use SequenceFileOutputFormat to write the merged file.
public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable>{
/*
Report the file as not splittable, so that each file is processed as one complete record
*/
@Override
protected boolean isSplitable(JobContext context, Path file) {
return false;
}
@Override
public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException,InterruptedException {
WholeFileRecordReader reader = new WholeFileRecordReader();
reader.initialize(split, context);
return reader;
}
}
/**
 * Core working logic of the RecordReader:
 * nextKeyValue() reads the data and builds the key and value that will be returned;
 * getCurrentKey() and getCurrentValue() then return the key and value built above.
 */
public class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {
private FileSplit fileSplit;
private Configuration conf;
private BytesWritable value = new BytesWritable();
private boolean processed = false;
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
this.fileSplit = (FileSplit) split;
this.conf = context.getConfiguration();
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (!processed) {
byte[] contents = new byte[(int) fileSplit.getLength()];
Path file = fileSplit.getPath();
FileSystem fs = file.getFileSystem(conf);
FSDataInputStream in = null;
try {
in = fs.open(file);
IOUtils.readFully(in, contents, 0, contents.length);
value.set(contents, 0, contents.length);
} finally {
IOUtils.closeStream(in);
}
processed = true;
return true;
}
return false;
}
@Override
public NullWritable getCurrentKey() throws IOException,
InterruptedException {
return NullWritable.get();
}
@Override
public BytesWritable getCurrentValue() throws IOException,
InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException {
return processed ? 1.0f : 0.0f;
}
@Override
public void close() throws IOException {
}
}
public class SmallFilesToSequenceFileConverter extends Configured implements Tool {
static class SequenceFileMapper extends
Mapper<NullWritable, BytesWritable, Text, BytesWritable> {
private Text filenameKey;
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
InputSplit split = context.getInputSplit();
Path path = ((FileSplit) split).getPath();
filenameKey = new Text(path.toString());
}
@Override
protected void map(NullWritable key, BytesWritable value,
Context context) throws IOException, InterruptedException {
context.write(filenameKey, value);
}
}
@Override
public int run(String[] args) throws Exception {
Configuration conf = super.getConf();
Job job = Job.getInstance(conf,"combine small files to sequencefile");
job.setJarByClass(SmallFilesToSequenceFileConverter.class);
job.setInputFormatClass(WholeFileInputFormat.class);
WholeFileInputFormat.addInputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义inputformat_小文件合并\\input"));
job.setOutputFormatClass(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setOutputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义inputformat_小文件合并\\output"));
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class);
job.setMapperClass(SequenceFileMapper.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new SmallFilesToSequenceFileConverter(),
args);
System.exit(exitCode);
}
}
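To check the merge result, the SequenceFile written by the job can be read back with SequenceFile.Reader. The class below is a minimal sketch, assuming the path of the merged part file is passed as the first argument (imports from org.apache.hadoop.conf, org.apache.hadoop.fs and org.apache.hadoop.io are omitted as in the classes above); it is not part of the course code.
public class SequenceFileInspector {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//path of the merged SequenceFile, e.g. <output dir>/part-r-00000
Path seqPath = new Path(args[0]);
try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(seqPath))) {
Text key = new Text(); //original file path written by SequenceFileMapper
BytesWritable value = new BytesWritable(); //raw content of that file
while (reader.next(key, value)) {
System.out.println(key + " -> " + value.getLength() + " bytes");
}
}
}
}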
We now have some order-comment data. The requirement is to separate the good reviews from the bad reviews and write them to different directories; see the data in the resource folder. The ninth field of each record indicates the rating: 0 = good review, 1 = neutral review, 2 = bad review.
The key point of the program is that, within a single MapReduce job, two kinds of results must be written to different directories depending on the data. This kind of flexible output requirement can be met with a custom OutputFormat.
1. Access external resources from inside MapReduce.
2. Define a custom OutputFormat, override its RecordWriter, and override write(), the method that actually writes the data.
public class MyOutPutFormat extends FileOutputFormat<Text,NullWritable> {
@Override
public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
FileSystem fs = FileSystem.get(context.getConfiguration());
Path enhancePath = new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义outputformat\\out1\\1.txt");
Path toCrawlPath = new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义outputformat\\out2\\2.txt");
FSDataOutputStream enhanceOut = fs.create(enhancePath);
FSDataOutputStream toCrawlOut = fs.create(toCrawlPath);
return new MyRecordWriter(enhanceOut,toCrawlOut);
}
static class MyRecordWriter extends RecordWriter<Text, NullWritable>{
FSDataOutputStream enhanceOut = null;
FSDataOutputStream toCrawlOut = null;
public MyRecordWriter(FSDataOutputStream enhanceOut, FSDataOutputStream toCrawlOut) {
this.enhanceOut = enhanceOut;
this.toCrawlOut = toCrawlOut;
}
@Override
public void write(Text key, NullWritable value) throws IOException, InterruptedException {
if (key.toString().split("\t")[9].equals("0")){
//field 9 is "0" (good review): write the record to out2/2.txt (toCrawlOut)
toCrawlOut.write(key.toString().getBytes());
toCrawlOut.write("\r\n".getBytes());
}else{
//field 9 is "1" or "2" (neutral or bad review): write the record to out1/1.txt (enhanceOut)
enhanceOut.write(key.toString().getBytes());
enhanceOut.write("\r\n".getBytes());
}
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
if(toCrawlOut!=null){
toCrawlOut.close();
}
if(enhanceOut!=null){
enhanceOut.close();
}
}
}
}
public class MyOwnOutputFormatMain extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
Configuration conf = super.getConf();
Job job = Job.getInstance(conf, MyOwnOutputFormatMain.class.getSimpleName());
job.setJarByClass(MyOwnOutputFormatMain.class);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义outputformat\\input"));
job.setMapperClass(MyOwnMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputFormatClass(MyOutPutFormat.class);
//set an output directory; the job's _SUCCESS marker file will be written to it
MyOutPutFormat.setOutputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义outputformat\\out2"));
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
boolean b = job.waitForCompletion(true);
return b?0:1;
}
public static class MyOwnMapper extends Mapper<LongWritable,Text,Text,NullWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
String commentStatus = split[9];
context.write(value,NullWritable.get());
}
}
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
ToolRunner.run(configuration,new MyOwnOutputFormatMain(),args);
}
}
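As the code is written, after the job finishes out2/2.txt holds the good reviews (field 9 equal to "0"), out1/1.txt holds the neutral and bad reviews, and out2 additionally contains the job's _SUCCESS marker file.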
1. Use "order id and amount" together as the key; the map output can then be partitioned by order id and sorted by amount before it is sent to the reducers.
2. On the reduce side, use a GroupingComparator to group the k/v pairs with the same order id into one reduce call; the first key of the group is then the maximum.
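As an illustration (the records below are made up, but follow the layout the mapper expects: the order id in field 0 and the amount in field 2):
Order_0000001	Pdt_01	222.8
Order_0000001	Pdt_05	25.8
Order_0000002	Pdt_03	522.8
The map key is an (orderId, price) bean. The partitioner sends both Order_0000001 records to the same reducer, the sort puts 222.8 before 25.8, and the grouping comparator merges the two records into a single reduce group, so the first key of the group, which the reducer writes out, is the maximum for that order: Order_0000001 222.8.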
public class OrderBean implements WritableComparable<OrderBean> {
private String orderId;
private Double price;
@Override
public int compareTo(OrderBean o) {
//compare by order id first
int i = this.orderId.compareTo(o.orderId);
if(i==0){
//if the order ids are equal, compare the amounts; the larger amount sorts first
i = - this.price.compareTo(o.price);
}
return i;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(orderId);
out.writeDouble(price);
}
@Override
public void readFields(DataInput in) throws IOException {
this.orderId = in.readUTF();
this.price = in.readDouble();
}
public OrderBean() {
}
public OrderBean(String orderId, Double price) {
this.orderId = orderId;
this.price = price;
}
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public Double getPrice() {
return price;
}
public void setPrice(Double price) {
this.price = price;
}
@Override
public String toString() {
return orderId +"\t"+price;
}
}
public class OrderPartition extends Partitioner<OrderBean,NullWritable> {
@Override
public int getPartition(OrderBean orderBean, NullWritable nullWritable, int numReduceTasks) {
//custom partitioner: send records with the same order id to the same reducer
return (orderBean.getOrderId().hashCode() & Integer.MAX_VALUE)%numReduceTasks;
}
}
public class MyGroupIngCompactor extends WritableComparator {
//register OrderBean with this grouping comparator:
//the comparator groups records whose key is of type OrderBean;
//pass the class of the key bean and true so the framework creates instances via reflection
public MyGroupIngCompactor() {
super(OrderBean.class,true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
OrderBean first = (OrderBean) a;
OrderBean second = (OrderBean) b;
return first.getOrderId().compareTo(second.getOrderId());
}
}
public class GroupingCompactorMain extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(super.getConf(), GroupingCompactorMain.class.getSimpleName());
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义groupingComparator\\input"));
job.setMapperClass(MyGroupingMapper.class);
job.setMapOutputKeyClass(OrderBean.class);
job.setMapOutputValueClass(NullWritable.class);
job.setPartitionerClass(OrderPartition.class);
job.setGroupingComparatorClass(MyGroupIngCompactor.class);
job.setReducerClass(MyGroupingReducer.class);
job.setOutputKeyClass(OrderBean.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(2);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job,new Path("file:///F:\\传智播客大数据离线阶段课程资料\\5、大数据离线第五天\\自定义groupingComparator\\output"));
boolean b = job.waitForCompletion(true);
return b?0:1;
}
public static class MyGroupingMapper extends Mapper<LongWritable,Text,OrderBean,NullWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
OrderBean orderBean = new OrderBean(split[0], Double.valueOf(split[2]));
context.write(orderBean,NullWritable.get());
}
}
public static class MyGroupingReducer extends Reducer<OrderBean,NullWritable,OrderBean,NullWritable>{
@Override
protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key,NullWritable.get());
}
}
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration(),new GroupingCompactorMain(),args);
}
}
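When several MapReduce jobs depend on one another (for example, a step-two job that consumes the output of a step-one job, as in the common-friends example above), ControlledJob and JobControl from org.apache.hadoop.mapreduce.lib.jobcontrol can chain them. The fragment below is assumed to live inside a driver's run() method in which job1, job2 and job3 have already been fully configured: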
ControlledJob cJob1 = new ControlledJob(job1.getConfiguration());
ControlledJob cJob2 = new ControlledJob(job2.getConfiguration());
ControlledJob cJob3 = new ControlledJob(job3.getConfiguration());
cJob1.setJob(job1);
cJob2.setJob(job2);
cJob3.setJob(job3);
// set up the dependencies between the jobs
cJob2.addDependingJob(cJob1);
cJob3.addDependingJob(cJob2);
JobControl jobControl = new JobControl("RecommendationJob");
jobControl.addJob(cJob1);
jobControl.addJob(cJob2);
jobControl.addJob(cJob3);
// run the jobs added to the JobControl in a new thread, then wait for all of them to finish
Thread jobControlThread = new Thread(jobControl);
jobControlThread.start();
while (!jobControl.allFinished()) {
Thread.sleep(500);
}
jobControl.stop();
return 0;
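Note that the fragment returns 0 unconditionally; after allFinished() returns, jobControl.getFailedJobList() could be inspected to decide whether the run actually succeeded (worth verifying against the JobControl API of the Hadoop version in use).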