Data fields: date, county, state, fips (the county FIPS code), cases (cumulative confirmed cases), deaths (cumulative deaths).
Problem: for the US on 2021-01-28, compute each state's total confirmed cases and total deaths (the cumulative counts as of that date).
Analysis:
In MapReduce, data is passed around as key-value pairs. When several values need to be emitted together as the key or the value, it is convenient to wrap them in a bean object, which makes setting and reading those values much easier.
Besides a no-arg constructor, an all-args constructor, getters/setters and toString(), the bean must implement Hadoop's Writable serialization interface; the no-arg constructor is required because Hadoop creates bean instances by reflection during deserialization.
It also helps to add a set(...) method that assigns all fields at once, much like the all-args constructor, so the individual setters do not have to be called one by one (the example below does this).
Note: if the custom object is used as the key, it must additionally implement the Comparable contract, i.e. WritableComparable.
package covid.Bean;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class StateBean implements Writable {
// number of confirmed cases
private long Cases;
// number of deaths
private long deaths;
public StateBean() {
}
public StateBean(long cases, long deaths) {
Cases = cases;
this.deaths = deaths;
}
public void set(long cases, long deaths) {
Cases = cases;
this.deaths = deaths;
}
public long getCases() {
return Cases;
}
public void setCases(long cases) {
Cases = cases;
}
public long getDeaths() {
return deaths;
}
public void setDeaths(long deaths) {
this.deaths = deaths;
}
@Override
public String toString() {
return Cases +"\t"+deaths;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeLong(Cases);
dataOutput.writeLong(deaths);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.Cases = dataInput.readLong();
this.deaths = dataInput.readLong();
}
}
package covid.Bean;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class mapper extends Mapper<LongWritable, Text,Text,StateBean> {
Text outputKey = new Text();
StateBean outputvalue = new StateBean();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// skip the header line (its byte offset is 0)
if (key.get() > 0){
String[] sl = value.toString().split(",");
// use the state field as the key so the reducer can group by state and sum
outputKey.set(sl[1]);
// the bean stores cases and deaths as long, so the string fields must be parsed
outputvalue.setCases(Long.parseLong(sl[3]));
outputvalue.setDeaths(Long.parseLong(sl[4]));
context.write(outputKey,outputvalue);
}
}
}
package covid.Bean;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class reducer extends Reducer<Text,StateBean,Text,StateBean> {
StateBean outputValue = new StateBean();
@Override
protected void reduce(Text key, Iterable<StateBean> values, Context context) throws IOException, InterruptedException {
long TotalCases = 0;
long TotalDeaths = 0;
for (StateBean value:values){
TotalCases += value.getCases();
TotalDeaths += value.getDeaths();
}
outputValue.set(TotalCases,TotalDeaths);
context.write(key,outputValue);
}
}
The driver below is boilerplate; it becomes routine after writing it a few times.
package covid.Bean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class driver {
public static void main(String[] args) throws Exception {
// configuration object
Configuration conf = new Configuration();
// create the job instance
Job job = Job.getInstance(conf, driver.class.getSimpleName());
job.setJarByClass(driver.class);
// wire up the mapper and its output types
job.setMapperClass(mapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(StateBean.class);
// wire up the reducer and the final output types
job.setReducerClass(reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(StateBean.class);
// input path
FileInputFormat.setInputPaths(job,new Path("E:\\InAndOut\\hadoop\\Input\\USCovid\\us-states.csv"));
// output path
Path out = new Path("E:\\InAndOut\\hadoop\\Output\\Bean");
FileOutputFormat.setOutputPath(job,out);
// delete the output directory if it already exists
FileSystem fs = FileSystem.get(conf);
if (fs.exists(out)){
fs.delete(out,true);
}
// submit the job and wait for completion
boolean b = job.waitForCompletion(true);
System.exit(b?0:1);
}
}
Summary: the logic of this case is simple; the main takeaway is how to define and use a custom serializable object.
Problem: for 2021-01-28, sort the states by their cumulative confirmed case count in descending order.
Analysis:
The input here is the output of case 1, i.e. lines of the form state \t cases \t deaths.
When implementing WritableComparable, the generic type must be declared, e.g. public class sortBean implements WritableComparable<sortBean>.
The WritableComparable interface is simply Writable and Comparable rolled into one.
Custom sort rule: compareTo returns 0 for equal, a negative number for less-than, and a positive number for greater-than. The trick for descending order: when the current object is greater, deliberately return a negative number.
this.Cases is the current object's value; o.getCases() is the other object's value. (An alternative formulation with Long.compare is sketched below.)
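As a side note (not part of the original write-up), the same descending rule can be expressed with Long.compare, which avoids any risk of overflow from subtracting two longs. A minimal sketch, shown as a standalone java.util.Comparator over the sortBean defined below (the class name is made up for illustration):
package covid.sort;
import java.util.Comparator;
// Sketch only: "more cases first", expressed with Long.compare instead of subtraction.
public class descendingCasesComparator implements Comparator<sortBean> {
    @Override
    public int compare(sortBean a, sortBean b) {
        // comparing b against a yields descending order by case count
        return Long.compare(b.getCases(), a.getCases());
    }
}
Inside sortBean itself, the equivalent one-liner for compareTo would be return Long.compare(o.getCases(), this.getCases());.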
package covid.sort;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class sortBean implements WritableComparable<sortBean> {
// number of confirmed cases
private long Cases;
// number of deaths
private long deaths;
public sortBean() {
}
public sortBean(long cases, long deaths) {
Cases = cases;
this.deaths = deaths;
}
public void set(long cases, long deaths) {
Cases = cases;
this.deaths = deaths;
}
public long getCases() {
return Cases;
}
public void setCases(long cases) {
Cases = cases;
}
public long getDeaths() {
return deaths;
}
public void setDeaths(long deaths) {
this.deaths = deaths;
}
@Override
public String toString() {
return Cases + "\t" + deaths;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeLong(Cases);
dataOutput.writeLong(deaths);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.Cases = dataInput.readLong();
this.deaths = dataInput.readLong();
}
// custom sort rule (descending by case count)
@Override
public int compareTo(sortBean o) {
return this.Cases - o.getCases() > 0 ? -1 : (this.Cases - o.getCases() < 0 ? 1 : 0);
}
}
package covid.sort;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class mapper extends Mapper<LongWritable, Text, sortBean,Text> {
sortBean outKey = new sortBean();
Text outValue = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] fields = value.toString().split("\t");
outKey.set(Long.parseLong(fields[1]),Long.parseLong(fields[2]));
outValue.set(fields[0]);
context.write(outKey,outValue);
}
}
Key point: reduce-side grouping is driven by the key's comparator (compareTo, or a custom grouping comparator), not by hashCode/equals. With this custom sortBean key, whose compareTo only looks at the case count, two records share a group only when their case counts are identical, so in practice almost every key-value pair ends up as its own group.
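A quick way to convince yourself of this (a hypothetical standalone check, not part of the job): two sortBean keys with the same case count compare as equal, which is exactly the condition under which the reduce side merges keys into one group.
package covid.sort;
// Hypothetical check: the numbers below are made up purely to illustrate the comparison.
public class groupingCheck {
    public static void main(String[] args) {
        sortBean a = new sortBean(100, 5);  // pretend state A: 100 cases, 5 deaths
        sortBean b = new sortBean(100, 9);  // pretend state B: same case count
        // compareTo only looks at the case count, so this prints 0,
        // meaning the default reduce-side grouping would put both keys in one group
        System.out.println(a.compareTo(b));
    }
}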
package covid.sort;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class reducer extends Reducer<sortBean, Text,Text,sortBean> {
@Override
protected void reduce(sortBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
// After the shuffle sort, the reduce side groups records by comparing keys with the
// key's comparator (sortBean.compareTo by default), so keys share a group only when
// their case counts are tied; normally every state therefore forms its own group.
// To get "state \t cases \t deaths" lines, swap key and value when writing; looping
// over the values also keeps every state even if two happened to tie on case count.
for (Text outKey : values) {
context.write(outKey,key);
}
}
}
package covid.sort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class driver {
public static void main(String[] args) throws Exception {
// configuration object
Configuration conf = new Configuration();
// create the job instance
Job job = Job.getInstance(conf, driver.class.getSimpleName());
job.setJarByClass(driver.class);
// wire up the mapper and its output types
job.setMapperClass(mapper.class);
job.setMapOutputKeyClass(sortBean.class);
job.setMapOutputValueClass(Text.class);
// wire up the reducer and the final output types
job.setReducerClass(reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(sortBean.class);
// input path
FileInputFormat.setInputPaths(job,new Path("E:\\InAndOut\\hadoop\\Input\\sortData"));
// output path
Path out = new Path("E:\\InAndOut\\hadoop\\Output\\sort");
FileOutputFormat.setOutputPath(job,out);
// delete the output directory if it already exists
FileSystem fs = FileSystem.get(conf);
if (fs.exists(out)){
fs.delete(out,true);
}
// submit the job and wait for completion
boolean b = job.waitForCompletion(true);
System.exit(b?0:1);
}
}
Problem: write the US COVID data for different states to different output files, so that all the counties belonging to one state end up in the same result file.
Analysis:
Different output files --> more than one reduce task --> the default is a single reduce task, so how do we get several? --> set it explicitly with job.setNumReduceTasks(N) --> multiple reduce tasks mean the data must be partitioned --> what is the default partitioning rule? HashPartitioner --> does the default rule satisfy the business requirement? --> if it does, use it as is; if not, write a custom Partitioner (a reference sketch of the default follows, then the custom one).
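For reference, the stock HashPartitioner behaves roughly as sketched below: it takes the key's hash, forces it non-negative, and takes the remainder modulo the number of reduce tasks, which scatters states across files instead of giving each chosen state its own file; that is why a custom partitioner is needed here. (The class below is only an illustrative sketch, not code from the original notes.)
import org.apache.hadoop.mapreduce.Partitioner;
// Sketch of the default partitioning rule: hash the key, then take the remainder
// modulo the number of reduce tasks.
public class hashPartitionerSketch<K, V> extends Partitioner<K, V> {
    @Override
    public int getPartition(K key, V value, int numReduceTasks) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}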
package covid.partition;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import java.util.HashMap;
public class StatePartitioner extends Partitioner<Text,Text> {
// Mock dictionary of the US states of interest. In a real system this could be looked up from Redis; if the data set is small, an in-memory collection like this is enough.
public static HashMap<String,Integer> stateMap = new HashMap<String,Integer>();
static{
stateMap.put("Alabama", 0);
stateMap.put("Arkansas", 1);
stateMap.put("California", 2);
stateMap.put("Florida", 3);
stateMap.put("Indiana", 4);
}
@Override
public int getPartition(Text key, Text value, int i) {
Integer code = stateMap.get(key.toString());
// If the key is one of the listed states, return its partition number; otherwise return 5, the catch-all partition.
if (code!=null){
return code;
}
return 5;
}
}
The input data is the same as in case 1.
package covid.partition;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class mapper extends Mapper<LongWritable, Text,Text,Text> {
Text outkey = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// skip the header line, as in case 1
if (key.get() > 0){
String[] sl = value.toString().split(",");
outkey.set(sl[1]);
context.write(outkey,value);
}
}
}
package covid.partition;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class reducer extends Reducer<Text, Text,Text, NullWritable> {
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
for (Text value:values){
context.write(value,NullWritable.get());
}
}
}
package covid.partition;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class driver {
public static void main(String[] args) throws Exception {
// configuration object
Configuration conf = new Configuration();
// create the job instance (use this package's driver class, not the one from covid.sort)
Job job = Job.getInstance(conf, driver.class.getSimpleName());
job.setJarByClass(driver.class);
// wire up the mapper and its output types
job.setMapperClass(mapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// wire up the reducer and the final output types (the reducer emits Text keys and NullWritable values)
job.setReducerClass(reducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// number of reduce tasks (= number of partitions) and the custom partitioner
job.setNumReduceTasks(6);
job.setPartitionerClass(StatePartitioner.class);
// input path
FileInputFormat.setInputPaths(job,new Path("E:\\InAndOut\\hadoop\\Input\\USCovid\\us-states.csv"));
// output path
Path out = new Path("E:\\InAndOut\\hadoop\\Output\\statePartitioner");
FileOutputFormat.setOutputPath(job,out);
// delete the output directory if it already exists
FileSystem fs = FileSystem.get(conf);
if (fs.exists(out)){
fs.delete(out,true);
}
// submit the job and wait for completion
boolean b = job.waitForCompletion(true);
System.exit(b?0:1);
}
}
Grouping overview: on the reduce side, MapReduce groups the sorted key-value pairs by comparing keys; all pairs whose keys compare as equal are handed to a single reduce() call.
Custom grouping rule: extend WritableComparator, register the key class in the constructor, override compare() with the desired rule, and attach it to the job with job.setGroupingComparatorClass(...).
Problem: for 2021-01-28, find which county has the most confirmed cases in each state.
Analysis: pack state, county and cases into a custom key; sort by state ascending and, within a state, by cases descending; then group by state only, so that the first key of each group is the county with the most cases in that state, and the reducer simply writes that first key.
package covid.group;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class groupBean implements WritableComparable<groupBean> {
// state name
private String state;
// county name (note: the field identifier is "country" throughout this class)
private String country;
// cumulative confirmed cases
private long cases;
public groupBean() {
}
public groupBean(String state, String country, long cases) {
this.state = state;
this.country = country;
this.cases = cases;
}
public void set(String state, String country, long cases) {
this.state = state;
this.country = country;
this.cases = cases;
}
public String getState() {
return state;
}
public void setState(String state) {
this.state = state;
}
public String getCountry() {
return country;
}
public void setCountry(String country) {
this.country = country;
}
public long getCases() {
return cases;
}
public void setCases(long cases) {
this.cases = cases;
}
@Override
public String toString() {
return state +'\t' + country + '\t' + cases;
}
// Sort by state in ascending order; when the states are equal, sort by cases in descending order.
@Override
public int compareTo(groupBean o) {
int result;
int i =state.compareTo(o.getState());
if (i > 0){
result = 1;
}else if (i<0){
result = -1;
}else {
result = cases>o.getCases()?-1:1;
}
return result;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(state);
dataOutput.writeUTF(country);
dataOutput.writeLong(cases);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.state = dataInput.readUTF();
this.country = dataInput.readUTF();
this.cases = dataInput.readLong();
}
}
In this mapper all the needed fields are packed into the key, because custom sorting and custom grouping can only look at data that lives in the key.
package covid.group;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class mapper extends Mapper<LongWritable, Text,groupBean, NullWritable> {
groupBean outputKey = new groupBean();
NullWritable outputValue = NullWritable.get();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
if (key.get()>0){
String[] sl = value.toString().split(",");
outputKey.set(sl[2],sl[1],Long.parseLong(sl[4]));
context.write(outputKey,outputValue);
}
}
}
Key point of the grouping comparator below: its constructor must call super(groupBean.class, true), which registers the key type and tells the comparator to create key instances for comparison.
package covid.group;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class groupingComparator extends WritableComparator {
public groupingComparator() {
super(groupBean.class,true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
// cast the generic WritableComparable arguments to the concrete key type
groupBean aBean = (groupBean) a;
groupBean bBean = (groupBean) b;
return aBean.getState().compareTo(bBean.getState());
}
}
Thought: grouping is based on just one field of the key, yet reduce() receives a single key plus an iterator over the values. How does each value stay matched with its own key? Hadoop reuses the key object: as the values iterator advances, the framework refills that key instance with the key that belongs to the current value, and the key visible before iterating is simply the first key of the group (see the sketch below).
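One way to observe the pairing (a hypothetical variant of the reducer below, not part of the final job): iterate the values and write the key on every step. Because the key object is refilled for each value, the emitted keys change county by county within the state, in descending order of cases.
package covid.group;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
// Hypothetical illustration: writing the key on every iteration shows that the
// (reused) key object is updated to match the value currently being iterated.
public class peekReducer extends Reducer<groupBean, NullWritable, groupBean, NullWritable> {
    @Override
    protected void reduce(groupBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        for (NullWritable ignored : values) {
            // each pass writes a different county of the same state
            context.write(key, NullWritable.get());
        }
    }
}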
package covid.group;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class reducer extends Reducer<groupBean, NullWritable,groupBean,NullWritable> {
@Override
protected void reduce(groupBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
// Do not iterate the values: at this point the key is the first key of the group, i.e. the record of the county with the most confirmed cases in this state.
context.write(key,NullWritable.get());
}
}
The custom grouping rule must be registered on the job in the driver class (setGroupingComparatorClass below).
package covid.group;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class driver {
public static void main(String[] args) throws Exception {
// configuration object
Configuration conf = new Configuration();
// create the job instance (use this package's driver class, not the one from covid.sort)
Job job = Job.getInstance(conf, driver.class.getSimpleName());
job.setJarByClass(driver.class);
// wire up the mapper and its output types
job.setMapperClass(mapper.class);
job.setMapOutputKeyClass(groupBean.class);
job.setMapOutputValueClass(NullWritable.class);
// wire up the reducer and the final output types
job.setReducerClass(reducer.class);
job.setOutputKeyClass(groupBean.class);
job.setOutputValueClass(NullWritable.class);
// register the custom grouping comparator
job.setGroupingComparatorClass(groupingComparator.class);
// input path
FileInputFormat.setInputPaths(job,new Path("E:\\InAndOut\\hadoop\\Input\\USCovid\\us-counties.csv"));
// output path
Path out = new Path("E:\\InAndOut\\hadoop\\Output\\group");
FileOutputFormat.setOutputPath(job,out);
// delete the output directory if it already exists
FileSystem fs = FileSystem.get(conf);
if (fs.exists(out)){
fs.delete(out,true);
}
// submit the job and wait for completion
boolean b = job.waitForCompletion(true);
System.exit(b?0:1);
}
}
----------------------------- Summarized from my study of the 黑马程序员 (itheima) course materials.