A sample of the raw data:
27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
110.52.250.126 - - [30/May/2013:17:38:20 +0800] "GET /data/cache/style_1_widthauto.css?y7a HTTP/1.1" 200 1292
27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/hot_1.gif HTTP/1.1" 200 680
27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/hot_2.gif HTTP/1.1" 200 682
一、Initial preparation
hive (default)> create database if not exists hm;
hive (default)> use hm;
hive (hm)> dfs -mkdir -p /hm/log/160630;
hive (hm)> dfs -put /home/hadoop/hivetest/access_2013_05_30.log /hm/log/160630;
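To confirm the log file landed in HDFS, you can list the target directory from the same Hive session (a quick sanity check using the paths above):

hive (hm)> dfs -ls /hm/log/160630;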
二、Cleaning the raw data with a MapReduce program
package com.ibeifeng.hadoop.action.hdfs;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Locale;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Cleaner extends Configured implements Tool {

    static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        LogParser parser = new Cleaner().new LogParser();
        Text mapOutValue = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] parsed = parser.parse(line);
            final String ip = parsed[0];
            final String longtime = parsed[1];
            String url = parsed[2];
            // Drop all requests for static resources
            if (url.startsWith("GET /static") || url.startsWith("GET /uc_server")) {
                return;
            }
            // Keep only the URL part between the HTTP method and the protocol
            if (url.startsWith("GET")) {
                url = url.substring("GET ".length() + 1, url.length() - " HTTP/1.1".length());
            }
            if (url.startsWith("POST")) {
                url = url.substring("POST ".length() + 1, url.length() - " HTTP/1.1".length());
            }
            mapOutValue.set(ip + "\t" + longtime + "\t" + url);
            context.write(key, mapOutValue);
        }
    }

    static class MyReducer extends Reducer<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void reduce(LongWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text text : values) {
                context.write(text, NullWritable.get());
            }
        }
    }

    class LogParser {
        // 27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
        public final SimpleDateFormat FORMAT = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
        public final SimpleDateFormat FORMAT1 = new SimpleDateFormat("yyyyMMddHHmmss");

        /**
         * Parse one log line.
         * @param line input record, e.g. 27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
         * @return parsed fields, e.g. 110.52.250.126  20130530173820  data/cache/style_1_widthauto.css?y7a
         */
        public String[] parse(String line) {
            String ip = parseIP(line);
            String time;
            try {
                time = parseTime(line);
            } catch (ParseException e) {
                e.printStackTrace();
                time = "null";
            }
            String url = parseURL(line);
            String status = parseStatus(line);
            String traffic = parseTraffic(line);
            return new String[] { ip, time, url, status, traffic };
        }

        private String parseTraffic(String line) {
            // Everything after the closing quote: "<status> <bytes>"
            String trim = line.substring(line.lastIndexOf("\"") + 1).trim();
            return trim.split(" ")[1];
        }

        private String parseStatus(String line) {
            String trim = line.substring(line.lastIndexOf("\"") + 1).trim();
            return trim.split(" ")[0];
        }

        private String parseURL(String line) {
            // The full request string between the quotes
            final int first = line.indexOf("\"");
            final int last = line.lastIndexOf("\"");
            return line.substring(first + 1, last);
        }

        private String parseTime(String line) throws ParseException {
            int first = line.indexOf("[");
            int last = line.indexOf("+0800]");
            String time = line.substring(first + 1, last).trim();
            return FORMAT1.format(FORMAT.parse(time));
        }

        private String parseIP(String line) {
            return line.split("- -")[0].trim();
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, Cleaner.class.getSimpleName());
        job.setJarByClass(Cleaner.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Delete the output directory if it already exists
        FileSystem fileSystem = FileSystem.newInstance(conf);
        Path outPath = new Path(args[1]);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);
        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int status = ToolRunner.run(conf, new Cleaner(), args);
        System.out.println(status == 0 ? "Success!" : "Failed!");
        System.exit(status);
    }
}
Run the MapReduce job to clean the data:
[hadoop@bf1 hadoop-2.5.0-cdh5.3.6]$ bin/hadoop jar /home/hadoop/hivetest/cleaner.jar /hm/log/160630 /hm/cleaned/160630
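As a quick sanity check of the cleaned output, you can print a few records; this assumes the default single-reducer output file name part-r-00000, so adjust if your job writes a different name:

[hadoop@bf1 hadoop-2.5.0-cdh5.3.6]$ bin/hdfs dfs -cat /hm/cleaned/160630/part-r-00000 | head -n 5

Each record should be ip, time, and url separated by tabs, matching the table definition in the next step.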
三、Create the Hive table
CREATE EXTERNAL TABLE hmbbs (ip string, logtime string, url string)
PARTITIONED BY (logdate string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/hm/cleaned';
Add a partition:
hive (hm)> alter table hmbbs add partition (logdate = "160630") location "/hm/cleaned/160630";
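To verify that the partition is registered and the cleaned data is visible to the table:

hive (hm)> show partitions hmbbs;
hive (hm)> select * from hmbbs where logdate='160630' limit 3;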
四、Analysis
1. Compute the PV (page views) for 160630
drop table if exists pv_160630;
create table pv_160630
row format delimited fields terminated by '\t'
as select count(*) from hmbbs where logdate='160630';
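The CTAS result can be checked in Hive, and it is written under the warehouse directory, which is the same path the Sqoop export in step six reads from (assuming the default warehouse location):

hive (hm)> select * from pv_160630;
hive (hm)> dfs -ls /user/hive/warehouse/hm.db/pv_160630;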
2. Find the top 20 IPs with more than 20 hits (the VIP users)
drop table if exists vip_160630;
create table vip_160630
row format delimited fields terminated by '\t'
as select '160630', ip, count(*) as hits
from hmbbs
where logdate='160630'
group by ip
having hits > 20
order by hits desc
limit 20;
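The resulting top-20 list can be inspected directly in Hive before any export:

hive (hm)> select * from vip_160630;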
五、Create the database and table in MySQL
[hadoop@bf1 ~]$ mysql -uroot -p123456
mysql> show databases;
mysql> create database if not exists hm;
mysql> use hm;
mysql> create table if not exists pv(pv int);
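Only a pv table is created here because only pv_160630 is exported below. If you also wanted to export vip_160630, a matching MySQL table would be needed; a possible definition (the column names and types are assumptions, not part of the original setup):

mysql> create table if not exists vip(logdate varchar(10), ip varchar(20), hits int);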
六、Export data from Hive to MySQL with Sqoop
bin/sqoop export \
  --connect jdbc:mysql://bf1:3306/hm \
  --username root \
  --password 123456 \
  --table pv \
  --export-dir "/user/hive/warehouse/hm.db/pv_160630" \
  --fields-terminated-by '\t' \
  -m 1
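After the export finishes, the page-view count should be queryable in MySQL:

mysql> use hm;
mysql> select * from pv;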