ecar168.csv (car sales data table):
Field | Data Type | Description |
---|---|---|
ranking | String | Ranking |
manufacturer | String | Manufacturer |
vehicle_type | String | Vehicle model |
monthly_sales_volume | String | Monthly sales volume |
accumulated_this_year | String | Year-to-date total |
last_month | String | Previous month's sales |
chain_ratio | String | Month-over-month change |
corresponding_period_of_last_yea | String | Same period last year |
year_on_year | String | Year-over-year change |
series | String | Series |
month | String | Statistics month |
url_href | String | Source URL |
url_title | String | Source page title |
url_car_detail | String | Car detail URL |
"1,东风日产,日产轩逸,56201,421871,57525,-2.30%,48483,15.92%,全部,2020年10月,http://www.ecar168.cn/xiaoliang/phb_cx/37.htm,2020年10月全部车型销量排行榜,http://data.ecar168.cn/car/216/"
"2,长城汽车,哈弗H6,52734,266231,40475,30.29%,40623,29.81%,全部,2020年10月,http://www.ecar168.cn/xiaoliang/phb_cx/37.htm,2020年10月全部车型销量排行榜,http://data.ecar168.cn/car/658/"
"3,上汽大众,大众朗逸,40984,332685,39595,3.51%,40608,0.93%,全部,2020年10月,http://www.ecar168.cn/xiaoliang/phb_cx/37.htm,2020年10月全部车型销量排行榜,http://data.ecar168.cn/car/465/"
"4,一汽大众,大众宝来,37944,251470,37959,-0.04%,36799,3.11%,全部,2020年10月,http://www.ecar168.cn/xiaoliang/phb_cx/37.htm,2020年10月全部车型销量排行榜,http://data.ecar168.cn/car/477/"
"5,一汽大众,大众速腾,36380,238334,33219,9.52%,36015,1.01%,全部,2020年10月,http://www.ecar168.cn/xiaoliang/phb_cx/37.htm,2020年10月全部车型销量排行榜,http://data.ecar168.cn/car/203/"
ecar168_car_detail.csv (car detail table):
Field | Data Type | Description |
---|---|---|
vehicle_type | String | Vehicle model |
url_href | String | Car detail URL (join key) |
price | String | Price |
car_type | String | Car category |
time_to_market | String | Launch year |
displacement | String | Displacement |
transmission_case | String | Transmission type |
oil_consumption | String | Fuel consumption |
"大众速腾,http://data.ecar168.cn/car/203/,13.18-19.68万,紧凑型,2019,1.2L、1.4L,手动、自动,5.4-5.6L(综合)"
"大众宝来,http://data.ecar168.cn/car/477/,9.88-15.6万,紧凑型,2019,1.4L、1.5L,手动、自动,5.6-5.9L(综合)"
"大众朗逸,http://data.ecar168.cn/car/465/,9.99-16.59万,紧凑型,2020,1.2L、1.4L、1.5L,手动、自动,5.1-5.7L(综合)"
"哈弗H6,http://data.ecar168.cn/car/658/,9.8-14.1万,SUV,2020,1.5L、2.0L,手动、自动,6.6-6.9L(综合)"
"长安CS75,http://data.ecar168.cn/car/1133/,9.58-14.68万,SUV,2019,1.5L,手动、自动,6.5-7.0L(综合)"
Working on the master node, write MapReduce code to deduplicate the data in the ecar168.csv text file and write the processed data to the /output/DupRemoval/ path on HDFS. (1 point)
package com.saddam.bigdata.platform.DupRemoval;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class DupRemovalMapper extends Mapper<LongWritable, Text, Text, Text> {
    Text outKey = new Text();
    Text outValue = new Text();

    /**
     * Input key: byte offset of the line; output key: the whole line.
     * Input value: the whole line; output value: the whole line.
     * Example input line: 1,东风日产,日产轩逸,56201,421871,57525,-2.30%,48483,15.92%,全部,2020年10月,http://www.ecar168.cn/xiaoliang/phb_cx/37.htm,2020年10月全部车型销量排行榜,http://data.ecar168.cn/car/216/
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString(); // read one line
        // Use the whole line as the key: identical lines then meet in the same reduce call.
        outKey.set(line);
        outValue.set(line);
        context.write(outKey, outValue);
    }
}
package com.saddam.bigdata.platform.DupRemoval;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class DupRemovalReducer extends Reducer<Text, Text, NullWritable, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // All duplicates of a line share the same key, so writing the key once deduplicates them.
        context.write(NullWritable.get(), key);
    }
}
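Since the reducer never reads the shuffled values, a slightly leaner variant (an optional sketch, not part of the graded answer; the class name DupRemovalMapper2 is made up) can emit NullWritable values to shrink the shuffle; the paired reducer would then be a Reducer<Text, NullWritable, NullWritable, Text> that writes each key once.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

// Sketch: same dedup result, but with an empty shuffle payload.
public class DupRemovalMapper2 extends Mapper<LongWritable, Text, Text, NullWritable> {
    private final Text outKey = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        outKey.set(value.toString());               // the whole line is the dedup key
        context.write(outKey, NullWritable.get()); // nothing useful to carry besides the key
    }
}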
package com.saddam.bigdata.platform.DupRemoval;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DupRemovalMapReduce {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "DupRemoval");
        job.setJarByClass(DupRemovalMapReduce.class);
        job.setMapperClass(DupRemovalMapper.class);
        job.setReducerClass(DupRemovalReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        // HDFS paths per the task; the exact input location of ecar168.csv is assumed,
        // adjust it to wherever the file was uploaded.
        String inputPath = "hdfs://master:9000/data/ecar168.csv";
        String outputPath = "hdfs://master:9000/output/DupRemoval";
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        boolean b = job.waitForCompletion(true);
        if (!b) {
            System.out.println("DupRemoval job failed!");
        }
    }
}
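One practical note: Hadoop aborts a job whose output directory already exists. When re-running, either remove it first with hdfs dfs -rm -r, or add a guard like the following before setOutputPath (an optional sketch; it assumes the additional import org.apache.hadoop.fs.FileSystem):

// Optional guard: delete a stale output directory so reruns do not abort.
FileSystem fs = FileSystem.get(conf);
Path out = new Path(outputPath);
if (fs.exists(out)) {
    fs.delete(out, true); // true = recursive delete
}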
Working on the master node, write MapReduce code that cleans the month field of the /output/DupRemoval/part-r-00000 data on HDFS into the uniform yyyy-mm format (e.g. 2012-04), keeps the six fields month (cleaned), manufacturer, vehicle_type, monthly_sales_volume, series and url_car_detail, and writes the result to the /output/dataclean/ path on HDFS. (2 points)
package com.saddam.bigdata.platform.CarData_3000.Clean3000;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class Clean3Mapper extends Mapper<LongWritable, Text, Text, Text> {
    Text outKey = new Text();
    Text outValue = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] word = value.toString().split(",");
        try {
            // word[10] looks like "2020年10月" or "2012年4月"
            String word10 = word[10].replaceAll("年", "-"); // -> "2020-10月"
            word10 = word10.replaceAll("月", "");            // -> "2020-10"
            String year = word10.substring(0, 5);  // "2020-" (deliberately keeps the dash)
            String month = word10.substring(5);    // "10" or "4"
            if (month.length() == 1) {
                month = "0" + month;               // zero-pad single-digit months
                word10 = year + month;             // -> "2012-04"
            }
            outKey.set(word10);
            // month(cleaned), manufacturer, vehicle_type, monthly_sales_volume, series, url_car_detail
            outValue.set(word10 + "," + word[1] + "," + word[2] + "," + word[3] + "," + word[9] + "," + word[13]);
            context.write(outKey, outValue);
        } catch (Exception e) {
            // skip malformed or short lines
        }
    }
}
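A quick standalone check of the date logic (a throwaway sketch; the NormalizeDemo class name is made up) confirms both the padded and unpadded cases:

// Throwaway sketch to sanity-check the month normalization.
public class NormalizeDemo {
    static String normalize(String raw) {
        String s = raw.replaceAll("年", "-").replaceAll("月", "");
        String year = s.substring(0, 5);   // e.g. "2012-"
        String month = s.substring(5);     // e.g. "4"
        return month.length() == 1 ? year + "0" + month : s;
    }

    public static void main(String[] args) {
        System.out.println(normalize("2012年4月"));  // 2012-04
        System.out.println(normalize("2020年10月")); // 2020-10
    }
}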
package com.saddam.bigdata.platform.CarData_3000.Clean3000;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class Clean3Reducer extends Reducer<Text, Text, NullWritable, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Pass-through: the mapper already built the final line; keying by month only groups the output.
        for (Text value : values) {
            context.write(NullWritable.get(), value);
        }
    }
}
package com.saddam.bigdata.platform.CarData_3000.Clean3000;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Clean3Driver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "carclean");
        job.setJarByClass(Clean3Driver.class);
        job.setMapperClass(Clean3Mapper.class);
        job.setReducerClass(Clean3Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        // Read the dedup output from the previous task; write the cleaned data per the requirement.
        String inputPath = "hdfs://master:9000/output/DupRemoval/part-r-00000";
        String outputPath = "hdfs://master:9000/output/dataclean";
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
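One extra driver line worth considering (standard Job API, optional): the next task locates the cleaned data by the literal filename part-r-00000, which presumes a single reducer; pinning the count makes that assumption explicit:

job.setNumReduceTasks(1); // one reducer -> exactly one part-r-00000 for the downstream join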
Working on the master node, write MapReduce code that performs the following operations: (3 points)
Clean the oil_consumption field down to its numeric values only, compute the average, keep 2 decimal places, and output it as a new oil_consumption_average field (see the sketch after these requirements).
Replace the "手动、自动" (manual and automatic) category in the transmission_case field with "全动".
Join the /data/ecar168_car_detail.csv data on HDFS with the cleaned /output/dataclean/part-r-00000 data, as follows:
Use /data/ecar168_car_detail.csv as the reference side and merge the /output/dataclean/part-r-00000 records against it.
Output the fields in this order: date, manufacturer, vehicle_type, monthly_sales_volume, series, url_car_detail, price, car_type, displacement, transmission_case, oil_consumption_average.
Output path: hdfs://master:9000/output/carjoindetail/
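The fuel-consumption cleaning boils down to stripping "(综合)" and "L" and averaging the two range endpoints. A small standalone sketch (OilAverageDemo and parseOilAverage are made-up names; the same logic appears inside the mapper below) that keeps the required 2 decimals:

// Throwaway sketch for the oil_consumption_average cleaning.
public class OilAverageDemo {
    static String parseOilAverage(String raw) {
        // String.replace matches literally, e.g. "5.4-5.6L(综合)" -> "5.4-5.6"
        String s = raw.replace("(综合)", "").replace("L", "");
        if (s.indexOf("-") > 0) { // a range: average the two endpoints
            String[] bounds = s.split("-");
            float avg = (Float.parseFloat(bounds[0]) + Float.parseFloat(bounds[1])) / 2;
            return String.format("%.2f", avg); // keep 2 decimal places as required
        }
        return s; // already a single number
    }

    public static void main(String[] args) {
        System.out.println(parseOilAverage("6.6-6.9L(综合)")); // 6.75
        System.out.println(parseOilAverage("5.1-5.7L(综合)")); // 5.40
    }
}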
package com.saddam.bigdata.platform.CarData_3000.Join3000;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
public class Join3Mapper extends Mapper<LongWritable, Text, Text, MapWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Find out which input file this split belongs to.
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String filename = inputSplit.getPath().getName();
        // Split the current line into fields.
        String[] words = value.toString().split(",");
        // A fresh MapWritable per record, so fields from a previous record cannot leak in.
        MapWritable mapWritable = new MapWritable();
        Text textKey;
        if (filename.equals("part-r-00000")) {
            // Cleaned sales data, e.g.: 2019-11,华晨中华,中华H530,84,全部,http://data.ecar168.cn/car/778/
            // Join key: vehicle model + detail URL, shared with the detail file.
            textKey = new Text(words[2] + words[5]);
            mapWritable.put(new Text("flag"), new Text("car"));
            mapWritable.put(new Text("month"), new Text(words[0]));
            mapWritable.put(new Text("manufacturer"), new Text(words[1]));
            mapWritable.put(new Text("vehicle_type"), new Text(words[2]));
            mapWritable.put(new Text("monthly_sales_volume"), new Text(words[3]));
            mapWritable.put(new Text("series"), new Text(words[4]));
            mapWritable.put(new Text("url_car_detail"), new Text(words[5]));
            context.write(textKey, mapWritable);
        } else {
            // Detail data, e.g.: 大众速腾,http://data.ecar168.cn/car/203/,13.18-19.68万,紧凑型,2019,1.2L、1.4L,手动、自动,5.4-5.6L(综合)
            textKey = new Text(words[0] + words[1]);
            // Clean the fuel consumption: "5.4-5.6L(综合)" -> "5.4-5.6"
            // (String.replace matches literally; replaceAll would read the parentheses as a regex group)
            String oil = words[7].replace("(综合)", "").replace("L", "");
            // indexOf returns -1 when "-" is absent, i.e. the value is already a single number.
            if (oil.indexOf("-") > 0) {
                String[] bounds = oil.split("-");
                float low = Float.parseFloat(bounds[0]);
                float high = Float.parseFloat(bounds[1]);
                oil = String.format("%.2f", (low + high) / 2); // average, 2 decimal places
            }
            mapWritable.put(new Text("flag"), new Text("detail"));
            mapWritable.put(new Text("vehicle_type"), new Text(words[0]));
            mapWritable.put(new Text("url_href"), new Text(words[1]));
            mapWritable.put(new Text("price"), new Text(words[2]));
            mapWritable.put(new Text("car_type"), new Text(words[3]));
            mapWritable.put(new Text("time_to_market"), new Text(words[4]));
            mapWritable.put(new Text("displacement"), new Text(words[5]));
            mapWritable.put(new Text("transmission_case"), new Text(words[6]));
            mapWritable.put(new Text("oil_consumption_average"), new Text(oil));
            context.write(textKey, mapWritable);
        }
    }
}
package com.saddam.bigdata.platform.CarData_3000.Join3000;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class Join3Reducer extends Reducer<Text, MapWritable, NullWritable, Text> {
    @Override
    protected void reduce(Text key, Iterable<MapWritable> values, Context context) throws IOException, InterruptedException {
        boolean hasDetail = false;
        List<Map<String, String>> sales = new ArrayList<>();
        Map<String, String> detail = new HashMap<>();
        for (MapWritable mapWritable : values) {
            String flag = mapWritable.get(new Text("flag")).toString();
            if (flag.equals("car")) {
                // One entry per monthly sales record for this model.
                Map<String, String> car = new HashMap<>();
                car.put("month", mapWritable.get(new Text("month")).toString());
                car.put("manufacturer", mapWritable.get(new Text("manufacturer")).toString());
                car.put("vehicle_type", mapWritable.get(new Text("vehicle_type")).toString());
                car.put("monthly_sales_volume", mapWritable.get(new Text("monthly_sales_volume")).toString());
                car.put("series", mapWritable.get(new Text("series")).toString());
                car.put("url_car_detail", mapWritable.get(new Text("url_car_detail")).toString());
                sales.add(car);
            }
            if (flag.equals("detail")) {
                // At most one detail record per key; it is the reference side of the join.
                hasDetail = true;
                detail.put("price", mapWritable.get(new Text("price")).toString());
                detail.put("car_type", mapWritable.get(new Text("car_type")).toString());
                detail.put("displacement", mapWritable.get(new Text("displacement")).toString());
                detail.put("transmission_case", mapWritable.get(new Text("transmission_case")).toString());
                detail.put("oil_consumption_average", mapWritable.get(new Text("oil_consumption_average")).toString());
            }
        }
        try {
            if (hasDetail) {
                String price = detail.get("price").replaceAll(" ", "");
                String transmissionCase = detail.get("transmission_case");
                if ("手动、自动".equals(transmissionCase)) {
                    transmissionCase = "全动"; // required category replacement
                }
                String detailPart = price + "," + detail.get("car_type") + "," + detail.get("displacement") + "," + transmissionCase + "," + detail.get("oil_consumption_average");
                for (Map<String, String> car : sales) {
                    // date,manufacturer,vehicle_type,monthly_sales_volume,series,url_car_detail,price,car_type,displacement,transmission_case,oil_consumption_average
                    String output = car.get("month") + "," + car.get("manufacturer") + "," + car.get("vehicle_type") + "," + car.get("monthly_sales_volume") + "," + car.get("series") + "," + car.get("url_car_detail") + "," + detailPart;
                    context.write(NullWritable.get(), new Text(output));
                }
            }
        } catch (Exception e) {
            // skip keys with incomplete records
        }
    }
}
package com.saddam.bigdata.platform.CarData_3000.Join3000;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Join3Driver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "carjoindetail");
        job.setJarByClass(Join3Driver.class);
        job.setMapperClass(Join3Mapper.class);
        job.setReducerClass(Join3Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MapWritable.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        // Load both data sources at once: the cleaned sales data and the detail table.
        String inputPath1 = "hdfs://master:9000/output/dataclean/part-r-00000";
        String inputPath2 = "hdfs://master:9000/data/ecar168_car_detail.csv";
        String outputPath = "hdfs://master:9000/output/carjoindetail";
        FileInputFormat.setInputPaths(job, new Path(inputPath1), new Path(inputPath2));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        boolean b = job.waitForCompletion(true);
        if (!b) {
            System.out.println("carjoindetail job failed!");
        }
    }
}
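Checking the split filename inside one mapper works, but Hadoop also provides MultipleInputs, which binds a separate mapper class to each input path. A sketch of the driver-side wiring (SalesMapper and DetailMapper are hypothetical split-out mappers, not classes defined above):

// Sketch: per-path mappers instead of filename checks inside one mapper.
// Requires: org.apache.hadoop.mapreduce.lib.input.MultipleInputs
//           org.apache.hadoop.mapreduce.lib.input.TextInputFormat
MultipleInputs.addInputPath(job, new Path(inputPath1), TextInputFormat.class, SalesMapper.class);
MultipleInputs.addInputPath(job, new Path(inputPath2), TextInputFormat.class, DetailMapper.class);
FileOutputFormat.setOutputPath(job, new Path(outputPath));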
Sample output (6-field cleaned format):
2007-08,哈飞汽车,赛豹V,287,全部,https://www.16888.com/125890/
2007-08,天津一汽,威姿,145,全部,https://www.16888.com/57968/
2007-09,天津一汽,威姿,287,全部,https://www.16888.com/57968/
2007-09,哈飞汽车,赛豹V,330,全部,https://www.16888.com/125890/
2007-10,哈飞汽车,赛豹V,576,全部,https://www.16888.com/125890/
2007-10,广汽三菱,欧蓝德,2,全部,https://www.16888.com/127531/
2007-10,天津一汽,威姿,203,全部,https://www.16888.com/57968/
2007-11,哈飞汽车,赛豹V,439,全部,https://www.16888.com/125890/
2007-11,广汽三菱,欧蓝德,87,全部,https://www.16888.com/127531/
2007-11,天津一汽,威姿,206,全部,https://www.16888.com/57968/
2007-12,广汽三菱,欧蓝德,7,全部,https://www.16888.com/127531/
2007-12,华晨中华,中华酷宝,182,全部,https://www.16888.com/58080/
2007-12,天津一汽,威姿,80,全部,https://www.16888.com/57968/
2007-12,哈飞汽车,赛豹V,387,全部,https://www.16888.com/125890/
2008-01,华晨中华,中华酷宝,487,全部,https://www.16888.com/58080/
2008-01,天津一汽,威姿,80,全部,https://www.16888.com/57968/
2008-01,哈飞汽车,赛豹V,540,全部,https://www.16888.com/125890/
2008-02,华晨中华,中华酷宝,132,全部,https://www.16888.com/58080/
2008-02,天津一汽,威姿,363,全部,https://www.16888.com/57968/
2008-02,哈飞汽车,赛豹V,465,全部,https://www.16888.com/125890/
Sample output (11-field joined format):
2020-10,长安标致雪铁龙,DS 4S,0,全部,http://data.ecar168.cn/car/1498/,17.19-22.99万,紧凑型,1.6L,自动,6.3
2020-10,长安标致雪铁龙,DS 6,0,全部,http://data.ecar168.cn/car/1298/,20.69-27.29万,SUV,1.6L,自动,6.7
2020-10,长安标致雪铁龙,DS 7,5,全部,http://data.ecar168.cn/car/1864/,20.89-31.99万,SUV,1.6L,自动,6.2
2020-10,广汽菲克,Jeep大指挥官PHEV,30,美系,http://data.ecar168.cn/car/2222/,30.98-33.68万,SUV,2.0L,自动,1.6
2020-10,广汽菲克,Jeep大指挥官PHEV,30,全部,http://data.ecar168.cn/car/2222/,30.98-33.68万,SUV,2.0L,自动,1.6
2020-10,广汽菲克,Jeep指南者,2092,美系,http://data.ecar168.cn/car/1576/,15.58-22.98万,SUV,1.3L,自动,6.8
2020-04,广汽菲克,Jeep指南者,2020,美系,http://data.ecar168.cn/car/1576/,15.58-22.98万,SUV,1.3L,自动,6.8
2020-10,广汽菲克,Jeep指南者,2092,全部,http://data.ecar168.cn/car/1576/,15.58-22.98万,SUV,1.3L,自动,6.8
2020-04,广汽菲克,Jeep指南者,2020,全部,http://data.ecar168.cn/car/1576/,15.58-22.98万,SUV,1.3L,自动,6.8
2020-04,广汽菲克,Jeep自由光,510,美系,http://data.ecar168.cn/car/1451/,17.98-31.98万,SUV,2.0L,自动,8.5
2020-10,广汽菲克,Jeep自由光,1259,美系,http://data.ecar168.cn/car/1451/,17.98-31.98万,SUV,2.0L,自动,8.5
2020-10,广汽菲克,Jeep自由光,1259,全部,http://data.ecar168.cn/car/1451/,17.98-31.98万,SUV,2.0L,自动,8.5