2019年安徽省大数据网络赛数据预处理(三)

数据

zhan.txt
zhan.txt其实就是预处理(二)所得到的数据,因为不想覆盖原来的数据,所以将其重命名为zhan.txt。

"uid":"131192622122401792"  "platform":"Android"   "app_version":"1007030202"   "pid":"5616"   "cityid":"626"  
"uid":"131192622122401792"  "platform":"Android"   "app_version":"1007030202"   "pid":"5616"   "cityid":"626"  
"uid":"131192622122401792"  "platform":"Android"   "app_version":"1007030202"   "pid":"5616"   "cityid":"626"  
"uid":"142873087346606080"  "platform":"Android"   "app_version":"1007090002"   "pid":"5057"   "cityid":"86"  
"uid":"142873087346606080"  "platform":"Android"   "app_version":"1007090002"   "pid":"5057"   "cityid":"86"  
"uid":"142873087346606080"  "platform":"Android"   "app_version":"1007090002"   "pid":"5057"   "cityid":"86"  
"uid":"142873087346606080"  "platform":"Android"   "app_version":"1007090002"   "pid":"5057"   "cityid":"86"  
"uid":"142873087346606080"  "platform":"Android"   "app_version":"1007090002"   "pid":"5057"   "cityid":"86"  
"uid":"142873087346606080"  "platform":"Android"   "app_version":"1007090002"   "pid":"5057"   "cityid":"86"  
"uid":"142873087346606080"  "platform":"Android"   "app_version":"1007090002"   "pid":"5057"   "cityid":"86"  
"uid":"142873087346606080"  "platform":"Android"   "app_version":"1007090002"   "pid":"5057"   "cityid":"86"  
"uid":"161350486564405248"  "platform":"Android"   "app_version":"1007060402"   "pid":"8888"   "cityid":"1750"  
"uid":"161350486564405248"  "platform":"Android"   "app_version":"1007060402"   "pid":"8888"   "cityid":"1750"

cityid.txt文件的数据如下:

1701|桐城市|桐城市|安徽|中国|安庆市|华东地区|四线城市|31.05228|116.93861
1702|宿松县|宿松县|安徽|中国|安庆市|华东地区|四线城市|30.151213|116.1142
1703|枞阳县|枞阳县|安徽|中国|安庆市|华东地区|四线城市|30.69371|117.21059
1704|太湖县|太湖县|安徽|中国|安庆市|华东地区|四线城市|30.420059|116.26508
1705|怀宁县|怀宁县|安徽|中国|安庆市|华东地区|四线城市|30.409006|116.64709
1706|岳西县|岳西县|安徽|中国|安庆市|华东地区|四线城市|30.857161|116.35818
1707|望江县|望江县|安徽|中国|安庆市|华东地区|四线城市|30.123537|116.67433
1708|潜山县|潜山县|安徽|中国|安庆市|华东地区|四线城市|30.630346|116.5672
5317|迎江区|迎江区|安徽|中国|安庆市|华东地区|四线城市|30.511548|117.09115
5318|大观区|大观区|安徽|中国|安庆市|华东地区|四线城市|30.553957|117.02167
1691|怀远县|怀远县|安徽|中国|蚌埠市|华东地区|四线城市|32.95665|117.19356
1692|固镇县|固镇县|安徽|中国|蚌埠市|华东地区|四线城市|33.314575|117.31171
1693|五河县|五河县|安徽|中国|蚌埠市|华东地区|四线城市|33.139736|117.88253

题目要求

此为数据处理的常用方式:对两个数据集进行关联。将log.log文件处理后所得文件(即zhan.txt)中cityid的值,与cityid.txt中的城市编号进行关联匹配,把城市编码替换为城市名称后输出。

代码

package com.mr2;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class preThree {
    public static class MyMapper extends Mapper<LongWritable,Text,Text,Text>
    {
    	/*
    	 * zhan.txt文件数据:
    	 * "uid":"479489006"  "platform":"Android"   "app_version":"1007090002"   "pid":"5599"   "cityid":"5491"  
           "uid":"479489006"  "platform":"Android"   "app_version":"1007090002"   "pid":"5599"   "cityid":"5491"  
    	 * cityid.txt文件数据:
    	 * 1701|桐城市|桐城市|安徽|中国|安庆市|华东地区|四线城市|31.05228|116.93861
         * 1702|宿松县|宿松县|安徽|中国|安庆市|华东地区|四线城市|30.151213|116.1142
        */
    	private FileSplit inputsplit;    //记录分片信息
    	protected void map(LongWritable key,Text value,Context context) throws IOException,InterruptedException
    	{
    		inputsplit = (FileSplit)context.getInputSplit();    //获取分片信息
    		String filename = inputsplit.getPath().getName();   //获取文件名称
    	    if(filename.contains("zhan"))   	//判断输入的路径信息
    		{
    			String s = value.toString();
    			String[] split = s.split("  ");
    			//String[] split = s.split("\\s+");
    			/*提取出两文件连接的key*/
    			String[] m = split[4].split(":");
    			if(m[1].length()>0) //利用if()过滤掉cityid为空的数据,防止下面数组下标越界
    			{
    				String joinkey = m[1].substring(1,m[1].length()-1); //将提取的joinkey与在cityid文件里的一致
        			//String joinKey = split[4].substring("cityid".length()+4,split[4].length()-1);
        			//取出value,并打上标记
            		String joinvalue = "zhan"+split[0]+"  "+split[1]+"  "+split[2]+"  "+split[3];
            		context.write(new Text(joinkey),new Text(joinvalue));
    			}
        		
    		}
    		if(filename.contains("city"))
    		{
    			String s = value.toString();
    			String[] split = s.split("\\|");
    			context.write(new Text(split[0]),new Text("cityid"+split[5]));
    		}
    	}
    }
   public static class MyReduce extends Reducer<Text,Text,Text,Text>
    {
    	protected void reduce(Text k2,Iterable<Text>v2,Context context) throws IOException,InterruptedException
    	{
    		Vector<String> vecA = new Vector<String>();
    		Vector<String> vecB = new Vector<String>();
    		for(Text value : v2)
    		{
    			String line = value.toString();
    			if(line.startsWith("zhan"))
    			{
    		     vecA.add(line.substring("zhan".length()));  //将"zhan"文件对应joinvalue加入vacA,注意不要写成line.substring(4)因为还有双引号
    			}
    		     if(line.startsWith("cityid"))
    		     {
    		    	// vecB.add(line.substring(6));//将"cityid"文件对应joinvalue加入vecB
    		    	 vecB.add(line.substring("cityid".length()));
    		     }
    		}
    		for(String s1 : vecA)  //利用两个for()循环进行笛卡儿积
    		{
    			for(String s2 : vecB)
    			{
    				context.write(new Text(s1),new Text("\"cityid\""+":"+"\""+s2+"\""));  
    			}
    		}	
     	}
   }
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException
	{
		// TODO Auto-generated method stub
	   Configuration conf = new Configuration();
 	   Job job = Job.getInstance(conf,preThree.class.getSimpleName());
 	   job.setJarByClass(preThree.class);
 	   job.setMapperClass(MyMapper.class);
 	   job.setReducerClass(MyReduce.class);
 	   job.setMapOutputKeyClass(Text.class);
 	   job.setMapOutputValueClass(Text.class);
 	   job.setOutputKeyClass(Text.class);
 	   job.setOutputValueClass(Text.class);
 	   //输入一个路径,将两个文件放在一个文件夹下,在执行hadoop jar命令的时候可以精确到文件夹的名字
 	   //如/Mrtwo,而不用精确到具体文件/Mrtwo/zhan.txt /Mrtwo/log.log如何使用后者则需要给出两个输入路径
 	   FileInputFormat.addInputPath(job,new Path(args[0]));
 	   FileOutputFormat.setOutputPath(job,new Path(args[1]));
 	   job.waitForCompletion(true);	 
	}
}



注意
从两个文件中提取joinkey时,一定要确保两边的格式一致,否则经过reduce()函数之后不会有任何输出。我就被这个问题困扰了很久:取zhan.txt的joinkey时带了双引号,如"9541",而取cityid.txt的joinkey时却没有带双引号,如9541。

结果

2019年安徽省大数据网络赛数据预处理(三)_第1张图片

你可能感兴趣的:(MapReduce)