hive-sqoop案例

部分原始数据

27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127

110.52.250.126 - - [30/May/2013:17:38:20 +0800] "GET /data/cache/style_1_widthauto.css?y7a HTTP/1.1" 200 1292

27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/hot_1.gif HTTP/1.1" 200 680

27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/hot_2.gif HTTP/1.1" 200 682

一、初始准备

hive (default)> create database if not exists hm;

hive (default)> use hm;

hive (hm)> dfs -mkdir -p /hm/log/160630;

hive (hm)> dfs -put /home/hadoop/hivetest/access_2013_05_30.log /hm/log/160630

 

二、mapreduce程序对原数据进行清理

package com.ibeifeng.hadoop.action.hdfs;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Locale;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Cleaner extends Configured implements Tool{

 static class MyMapper extends Mapper{
  LogParser parser =new Cleaner().new LogParser();
  
  Text mapOutValue = new Text();
  @Override
  protected void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
   String line = value.toString();
   String[] parsed = parser.parse(line);
   
   final String ip = parsed[0];
   final String longtime = parsed[1];
   String url = parsed[2]; 
   
   //过滤所有静态的资源请求
   if(url.startsWith("GET /static")||url.startsWith("GET /uc_server")){
    return;
   }
   //截取中间的网址部分
   if(url.startsWith("GET")){
    url = url.substring("GET ".length()+1, url.length()-" HTTP/1.1".length());
   }
   if(url.startsWith("POST")){
    url = url.substring("POST ".length()+1, url.length()-" HTTP/1.1".length());
   }
   
   mapOutValue.set(ip+"\t"+longtime+"\t"+url);
   context.write(key, mapOutValue);
  }
  
 }
 
 static class MyReducer extends Reducer{

  @Override
  protected void reduce(LongWritable value, Iterable values,Context context)
    throws IOException, InterruptedException {
   for (Text text : values) {
    context.write(text, NullWritable.get());
   }
  }
  
 }
 
 class LogParser{
  //27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
  public final SimpleDateFormat FORMAT = new SimpleDateFormat("d/MMMyyyy:HH:mm:ss", Locale.ENGLISH);
  public final SimpleDateFormat FORMAT1 = new SimpleDateFormat("yyyyMMddHHmmss");
  
  /**
   * 解析日志的行记录
   * @param line 输入的行记录  27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
   * @return 解析成 110.52.250.126 20130530173820 data/cache/style_1_widthauto.css?y7a
   */
  public String[] parse(String line){
   
   String ip=parseIP(line);
   
   String time ;
   try {
    time = parseTime(line);
   } catch (ParseException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
    time = "null";
   }
   
   String url = parseURL(line);
   
   String status = parseStatus(line);
   
   String traffic = parseTraffic(line);
   
   return new String[]{ip , time ,url , status , traffic};
  }

  private String parseTraffic(String line) {
   String trim = line.substring(line.lastIndexOf("\"")+1).trim();
   return trim.split(" ")[1];
  }

  private String parseStatus(String line) {
   String trim = line.substring(line.lastIndexOf("\"")+1).trim();
   return trim.split(" ")[0];
  }

  private String parseURL(String line) {
   final int first = line.indexOf("\"");
   final int last = line.lastIndexOf("\"");
   String url = line.substring(first+1, last);
   return url;
  }

  private String parseTime(String line) throws ParseException {
   // TODO Auto-generated method stub
   int first = line.indexOf("[");
   int last = line.indexOf("+0800]");
   String time = line.substring(first+1 ,last).trim();
   
   return FORMAT1.format(FORMAT.parse(time));
  }

  private String parseIP(String line) {
   // TODO Auto-generated method stub
   String ip = line.split("- -")[0].trim();
   return ip;
  }
  
  
  
 }
 public int run(String[] args) throws Exception {
  Configuration conf = getConf();

  Job job = Job.getInstance(conf, Cleaner.class.getSimpleName());
  job.setJarByClass(Cleaner.class);
  
  FileInputFormat.addInputPath(job, new Path(args[0]));
  
  job.setMapperClass(MyMapper.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(Text.class);
  
  job.setReducerClass(MyReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NullWritable.class);
  
  FileSystem fileSystem = FileSystem.newInstance(conf);
  Path outPath = new Path(args[1]);
  if(fileSystem.exists(outPath)){
   fileSystem.delete(outPath, true);
  }
  
  FileOutputFormat.setOutputPath(job, outPath);
  boolean success = job.waitForCompletion(true);

  return success ? 0 : 1;
 }

 public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  int status = ToolRunner.run(conf, new Cleaner(), args);
  if(status == 0){
   System.out.println("成功!");
  }else{
   System.out.println("失败!");
  }
  System.exit(status);
 }

}

使用mapreduce进行数据清理

 

[hadoop@bf1 hadoop-2.5.0-cdh5.3.6]$ bin/hadoop jar /home/hadoop/hivetest/cleaner.jar /hm/log/160630 /hm/cleaned/160630

三、创建hive外部表

-- External table over the cleaned log files: one row per request with the
-- tab-separated columns ip, logtime and url.
-- The table is partitioned by logdate; partitions are attached with
-- ALTER TABLE ... ADD PARTITION pointing at a directory under /hm/cleaned.
CREATE EXTERNAL TABLE hmbbs (ip string, logtime string, url string)

PARTITIONED BY (logdate string)

ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'

LOCATION '/hm/cleaned';

添加分区

hive (hm)> alter table hmbbs add partition (logdate = "160630") location "/hm/cleaned/160630";

四、开始分析

1、计算160630的pv

-- Page-view count for partition 160630, stored as a one-row, one-column table.
drop table if exists pv_160630;

create table pv_160630

row format delimited fields terminated by '\t'

as

-- logdate is a string partition column, so compare it with a string literal;
-- the alias gives the result column a real name instead of the generated _c0.
select count(*) as pv from hmbbs where logdate='160630';

2、计算点击次数超过20的前20个vip(按ip统计)

-- Top 20 client IPs of partition 160630 with more than 20 hits, most active first.
drop table if exists vip_160630;

create table vip_160630

row format delimited fields terminated by '\t'

as

-- The constant column is aliased so the table does not get a generated _c0 column.
select 160630 as logdate,ip,count(*) as hits from hmbbs

-- logdate is a string partition column, so compare it with a string literal.
where logdate='160630'

group by ip

having hits >20

order by hits desc limit 20;

五、在mysql中创建数据库及表

[hadoop@bf1 ~]$ mysql -uroot -p123456

mysql> show databases;

mysql> create database if not exists hm;

mysql> use hm;

mysql> create table if not exists pv(pv int);

 

六、利用sqoop将hive中的数据导出到mysql

# Export the one-row pv_160630 Hive table (tab-delimited files in the warehouse
# directory) into the MySQL table hm.pv using a single map task.
# The continuation lines must follow each other directly: a blank line after a
# trailing backslash ends the command.
bin/sqoop export \
--connect jdbc:mysql://bf1:3306/hm \
--username root \
--password 123456 \
--table pv \
--export-dir "/user/hive/warehouse/hm.db/pv_160630" \
--fields-terminated-by '\t' \
-m 1

你可能感兴趣的:(hive)