Hadoop:MapReduce编程之统计二手房数目

要求:统计出上海各个地区二手房的数目,其中将浦东的二手房单独由一个Reduce计算

分析:由于浦东区的二手房数目需要单独统计,因此需要设置分区器,同时设置ReduceTask为2
代码实现:

package com.miao.secondhouse;

import com.miao.partition.UserPartition;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * @ClassName SecondHouseNum
 * @Description  TODO 统计上海各个地区二手房数目,将浦东的二手房单独由一个Reduce计算
 * @Date 2021-04-27 17:47:53
 * @Create By     Miao
 */
/**
 * MapReduce job that counts second-hand houses per Shanghai district,
 * routing the "浦东" (Pudong) district to its own dedicated reducer
 * via {@link UserPartition} and two reduce tasks.
 *
 * <p>Input CSV columns: name, layout, area, district, floor/orientation,
 * total price, unit price, build year (district is column index 3).
 */
public class SecondHouseNum extends Configured implements Tool {

    // Default paths used when no command-line arguments are supplied.
    private static final String DEFAULT_INPUT = "D:\\Study\\idea\\MavenProject\\secondhouse.csv";
    private static final String DEFAULT_OUTPUT = "D:\\Study\\idea\\MavenProject\\output\\three";

    /**
     * Builds, configures and submits the job.
     *
     * @param args optional: args[0] = input path, args[1] = output path;
     *             falls back to the hard-coded defaults when absent
     * @return 0 on success, -1 on failure
     * @throws Exception if job configuration or submission fails
     */
    public int run(String[] args) throws Exception {
        /**
         * step1: build the Job
         */
        // Instantiate the MapReduce Job object
        Job job = Job.getInstance(this.getConf(), "second house num");
        // Set the class whose jar should be shipped to the cluster
        job.setJarByClass(SecondHouseNum.class);

        /**
         * step2: configure the Job
         */
        // Input: set the input format (TextInputFormat is the default; explicit for clarity)
        job.setInputFormatClass(TextInputFormat.class);
        // Input source: the CSV file holding the second-hand house records.
        // Prefer the first CLI argument; keep the old hard-coded path as fallback.
        Path inputPath = new Path(args.length >= 1 ? args[0] : DEFAULT_INPUT);
        TextInputFormat.setInputPaths(job, inputPath);

        // Map: mapper class and its output key/value (K2/V2) types
        job.setMapperClass(SecondMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Shuffle: custom partitioner that isolates Pudong into partition 0
        job.setPartitionerClass(UserPartition.class);

        // Reduce: reducer class and its output key/value (K3/V3) types
        job.setReducerClass(SecondReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Two reduce tasks: one for Pudong, one for every other district
        job.setNumReduceTasks(2);

        // Output: set the output format (TextOutputFormat is the default)
        job.setOutputFormatClass(TextOutputFormat.class);
        // Output path for the per-district counts; CLI argument preferred.
        Path outputPath = new Path(args.length >= 2 ? args[1] : DEFAULT_OUTPUT);
        // MapReduce refuses to overwrite an existing output dir — delete it first
        FileSystem fs = FileSystem.get(this.getConf());
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        TextOutputFormat.setOutputPath(job, outputPath);

        /**
         * step3: submit the Job
         */
        return job.waitForCompletion(true) ? 0 : -1;
    }

    /**
     * Program entry point: delegates to {@link ToolRunner} so that generic
     * Hadoop options (-D, -conf, ...) are parsed before run() is invoked.
     */
    public static void main(String[] args) throws Exception {
        // Build the configuration object
        Configuration conf = new Configuration();
        // ToolRunner invokes this class's run() with the remaining args
        int status = ToolRunner.run(conf, new SecondHouseNum(), args);
        // Propagate the job status as the process exit code
        System.exit(status);
    }

    /**
     * Mapper: emits (district, 1) for every input record.
     * Generic types fixed — the raw {@code extends Mapper} in the original
     * made the @Override on map() a compile error (raw type erases the
     * framework method to map(Object, Object, Context)).
     */
    public static class SecondMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        // Key2: the district name
        private final Text outputKey = new Text();
        // Value2: constant count of 1 per record
        private final IntWritable outputValue = new IntWritable(1);

        // CSV columns: name, layout, area, district, floor/orientation, total, unit price, year
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            // Skip malformed/short lines (e.g. a header row or truncated record)
            if (fields.length < 4) {
                return;
            }
            // District (column index 3) becomes Key2
            this.outputKey.set(fields[3]);
            context.write(this.outputKey, this.outputValue);
        }
    }

    /**
     * Reducer: sums the 1s per district to produce the house count.
     * Generic types fixed for the same @Override reason as the mapper.
     */
    public static class SecondReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        // Value3: total count for the district
        private final IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            this.outputValue.set(sum);
            context.write(key, this.outputValue);
        }
    }
}

自定义分区器

代码实现:

package com.miao.partition;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * @ClassName UserPartition
 * @Description TODO 自定义分区器
 *  	extends Partitioner
 *  	必须指定Key2和Value2的类型,重写getPartition方法必须指定类型
 * @Date 2021-04-27 17:47:53
 * @Create By     Miao
 */
 
/**
 * Custom partitioner: routes the "浦东" (Pudong) district to partition 0
 * so a dedicated reducer handles it; every other district goes to
 * partition 1. Requires the job to run with exactly 2 reduce tasks.
 *
 * <p>Generic types fixed — the raw {@code extends Partitioner} in the
 * original made the @Override on the typed getPartition() a compile
 * error (raw type erases it to getPartition(Object, Object, int)).
 */
public class UserPartition extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text k2, IntWritable v2, int numPartition) {
        // Constant-first equals() avoids an NPE if the key were ever null
        String region = k2.toString();
        if ("浦东".equals(region)) {
            return 0; // dedicated reducer for Pudong
        } else {
            return 1; // all remaining districts share the second reducer
        }
    }
}

你可能感兴趣的:(Hadoop,mapreduce,hadoop)