1. Structure of the target tables that the logs are loaded into
1.1 Target table in the Hive warehouse
CREATE TABLE `yemao_log`(
`id` int,
`time` int,
`url_from` string,
`url_current` string,
`url_to` string,
`options` string,
`uid` int,
`new_visitor` string,
`province` string,
`city` string,
`site` string,
`device` string,
`phone` string,
`token` string,
`dorm` string,
`order_phone` string,
`order_dormitory` string,
`order_amount` string,
`order_id` int,
`uname` string,
`site_id` int,
`address` string,
`dorm_id` int,
`dormentry_id` int,
`rid` int,
`cart_quantity` string)
PARTITIONED BY (
`log_date` int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://Master:9000/user/hive/warehouse/yemao_log'
TBLPROPERTIES (
'transient_lastDdlTime'='1447308813');
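After a day's load, the partition can be spot-checked from the Hive CLI. A minimal sketch, assuming the log_date=20151203 partition has already been loaded (the date is only an example):
/home/spark/opt/hive-1.2.1/bin/hive -e "show partitions yemao_log; select count(*) from yemao_log where log_date=20151203;"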
1.2 Current table in the MySQL database; in practice it is just a staging table
CREATE TABLE `yemao_log` (
`id` varchar(8000) DEFAULT NULL,
`time` varchar(8000) DEFAULT NULL,
`url_from` text,
`url_current` text,
`url_to` text,
`options` text,
`uid` text,
`new_visitor` text,
`province` text,
`city` text,
`site` text,
`device` text,
`phone` text,
`token` text,
`dorm` text,
`order_phone` text,
`order_dormitory` text,
`order_amount` text,
`order_id` text,
`uname` text,
`site_id` text,
`address` text,
`dorm_id` text,
`dormentry_id` text,
`rid` text,
`cart_quantity` text
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
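All columns are loose varchar/text, presumably so that nothing in the comma-delimited export from Hive can fail a type conversion at load time. A quick sanity check after an export, reusing the connection details from the scripts in section 3:
/usr/local/mysql/bin/mysql -h120.55.189.188 -udatawarehouse -pdatawarehouse2015 -e "select count(*) from logdata.yemao_log;"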
1.3 History table in the MySQL database; this is the table actually in use
CREATE TABLE `yemao_loghis` (
`id` varchar(8000) DEFAULT NULL,
`time` varchar(8000) DEFAULT NULL,
`url_from` text,
`url_current` text,
`url_to` text,
`options` text,
`uid` text,
`new_visitor` text,
`province` text,
`city` text,
`site` text,
`device` text,
`phone` text,
`token` text,
`dorm` text,
`order_phone` text,
`order_dormitory` text,
`order_amount` text,
`order_id` text,
`uname` text,
`site_id` text,
`address` text,
`dorm_id` text,
`dormentry_id` text,
`rid` text,
`cart_quantity` text,
`log_date` int(11) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8
/*!50100 PARTITION BY LIST (log_date)
(PARTITION p0 VALUES IN (0) ENGINE = InnoDB,
PARTITION p20151109 VALUES IN (20151109) ENGINE = InnoDB,
PARTITION p20151110 VALUES IN (20151110) ENGINE = InnoDB,
PARTITION p20151111 VALUES IN (20151111) ENGINE = InnoDB,
PARTITION p20151112 VALUES IN (20151112) ENGINE = InnoDB,
PARTITION p20151113 VALUES IN (20151113) ENGINE = InnoDB,
PARTITION p20151114 VALUES IN (20151114) ENGINE = InnoDB,
PARTITION p20151115 VALUES IN (20151115) ENGINE = InnoDB,
PARTITION p20151116 VALUES IN (20151116) ENGINE = InnoDB,
PARTITION p20151117 VALUES IN (20151117) ENGINE = InnoDB,
PARTITION p20151118 VALUES IN (20151118) ENGINE = InnoDB,
PARTITION p20151119 VALUES IN (20151119) ENGINE = InnoDB,
PARTITION p20151120 VALUES IN (20151120) ENGINE = InnoDB,
PARTITION p20151121 VALUES IN (20151121) ENGINE = InnoDB,
PARTITION p20151122 VALUES IN (20151122) ENGINE = InnoDB,
PARTITION p20151123 VALUES IN (20151123) ENGINE = InnoDB,
PARTITION p20151124 VALUES IN (20151124) ENGINE = InnoDB,
PARTITION p20151125 VALUES IN (20151125) ENGINE = InnoDB,
PARTITION p20151126 VALUES IN (20151126) ENGINE = InnoDB,
PARTITION p20151127 VALUES IN (20151127) ENGINE = InnoDB,
PARTITION p20151128 VALUES IN (20151128) ENGINE = InnoDB,
PARTITION p20151129 VALUES IN (20151129) ENGINE = InnoDB,
PARTITION p20151130 VALUES IN (20151130) ENGINE = InnoDB,
PARTITION p20151201 VALUES IN (20151201) ENGINE = InnoDB,
PARTITION p20151202 VALUES IN (20151202) ENGINE = InnoDB,
PARTITION p20151203 VALUES IN (20151203) ENGINE = InnoDB) */;
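The day-level LIST partitions are managed by the stored procedure in section 2. Done by hand, the maintenance amounts to the following sketch (20151204 is only an example date):
/usr/local/mysql/bin/mysql -h120.55.189.188 -udatawarehouse -pdatawarehouse2015 -e "ALTER TABLE logdata.yemao_loghis DROP PARTITION p20151204; ALTER TABLE logdata.yemao_loghis ADD PARTITION (PARTITION p20151204 VALUES IN (20151204));"
Note that DROP PARTITION raises an error when the partition does not exist, which is why the procedure checks information_schema.PARTITIONS before dropping.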
2. The stored procedure that processes the data
CREATE DEFINER=`datahs`@`%` PROCEDURE `p_ymlog_maintain`(IN `v_log_date` int)
BEGIN
DECLARE v_partition_exists INT;
-- Does the history table already have this day's partition?
SELECT
count(1) INTO v_partition_exists
FROM
information_schema.`PARTITIONS`
WHERE
TABLE_SCHEMA = 'logdata'
AND table_name = 'yemao_loghis'
AND partition_name = concat('p',v_log_date);
-- If so, drop it first so the load can be re-run for the same day.
IF v_partition_exists = 1 THEN
SET @exec_sql=concat("ALTER TABLE logdata.yemao_loghis DROP PARTITION p",v_log_date);
PREPARE stmt FROM @exec_sql;
EXECUTE stmt;
END IF;
-- (Re)create the day's LIST partition.
SET @exec_sql=concat("ALTER TABLE logdata.yemao_loghis ADD PARTITION (PARTITION p",v_log_date," VALUES IN (",v_log_date,"));");
PREPARE stmt FROM @exec_sql;
EXECUTE stmt;
-- Copy the staged rows into the history table, stamping them with the log date.
SET @exec_sql=concat("INSERT INTO logdata.yemao_loghis (
id,
time,
url_from,
url_current,
url_to,
options,
uid,
new_visitor,
province,
city,
site,
device,
phone,
token,
dorm,
order_phone,
order_dormitory,
order_amount,
order_id,
uname,
site_id,
address,
dorm_id,
dormentry_id,
rid,
cart_quantity,
log_date
) SELECT
a.id,
a.time,
a.url_from,
a.url_current,
a.url_to,
a.options,
a.uid,
a.new_visitor,
a.province,
a.city,
a.site,
a.device,
a.phone,
a.token,
a.dorm,
a.order_phone,
a.order_dormitory,
a.order_amount,
a.order_id,
a.uname,
a.site_id,
a.address,
a.dorm_id,
a.dormentry_id,
a.rid,
a.cart_quantity,
",v_log_date," log_date
FROM
logdata.yemao_log a
WHERE
id <> 'id';");
-- The id <> 'id' filter drops the CSV header line that mongoexport writes.
PREPARE stmt FROM @exec_sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- Empty the staging table ready for the next day's export.
TRUNCATE TABLE logdata.yemao_log;
END
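The scripts in section 3 call the procedure once per day, but it can also be invoked by hand to reprocess whatever currently sits in the staging table (20151203 is only an example):
/usr/local/mysql/bin/mysql -h120.55.189.188 -udatawarehouse -pdatawarehouse2015 -e "call logdata.p_ymlog_maintain(20151203);"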
3. Shell scripts for processing and loading the data
The automatically scheduled script:
ymlog_proc.sh
#!/bin/bash
# Yesterday's date in YYYYMMDD form, e.g. 20151203.
export yesterday=`date -d last-day +%Y%m%d`
cd /home/spark/opt/Log_Data/yemao
# Unpack each of yesterday's archives, keep only the JSON-array lines from the
# extracted logs, and remove the .log files before the next archive is unpacked.
for tar in yemao*$yesterday.tar.gz;
do
tar xvf $tar;
grep -h "\[{.*}\]" *.log >> yemaolog;
rm -rf /home/spark/opt/Log_Data/yemao/*.log
done
# Trim the wrapper: drop the first character (the leading '[') and the last
# two characters of every line, leaving one bare JSON document per line.
sed -i 's/^.//' yemaolog
sed -i 's/..$//' yemaolog
# Load the normalized JSON into MongoDB, then export the needed fields as CSV.
/home/spark/opt/mongodb-2.7.0/bin/mongoimport -d yemao -c yemao_log_$yesterday --drop /home/spark/opt/Log_Data/yemao/yemaolog
/home/spark/opt/mongodb-2.7.0/bin/mongoexport -d yemao -c yemao_log_$yesterday --csv -f id,time,url_from,url_current,url_to,options,uid,new_visitor,province,city,site,device,phone,token,dorm,order_phone,order_dormitory,order_amount,order_id,uname,site_id,address,dorm_id,dormentry_id,rid,cart_quantity -o /home/spark/opt/Log_Data/yemao/yemao.dat
# Rebuild yesterday's Hive partition and load the CSV into it.
/home/spark/opt/hive-1.2.1/bin/hive -e "alter table yemao_log drop if exists partition (log_date=$yesterday);alter table yemao_log add if not exists partition (log_date=$yesterday);load data local inpath '/home/spark/opt/Log_Data/yemao/yemao.dat' into table yemao_log partition (log_date=$yesterday);"
# Alternative load path via mysqlimport, kept for reference:
#/usr/local/mysql/bin/mysqlimport -h120.55.189.188 -udatawarehouse -pdatawarehouse2015 --fields-terminated-by=, --lines-terminated-by=\n logdata /home/spark/opt/Log_Data/yemao/yemao.dat --local
# Export yesterday's Hive partition into the MySQL staging table, then call the
# stored procedure to move the rows into the partitioned history table.
/home/spark/opt/sqoop-1.4.6/bin/sqoop export --connect jdbc:mysql://120.55.189.188:3306/logdata --username datawarehouse --password datawarehouse2015 --table yemao_log --export-dir /user/hive/warehouse/yemao_log/log_date=$yesterday --input-fields-terminated-by ',';
/usr/local/mysql/bin/mysql -h120.55.189.188 -udatawarehouse -pdatawarehouse2015 -e "call logdata.p_ymlog_maintain($yesterday);"
# Clean up intermediate files (yemao_log.java is code-generated by Sqoop).
rm -rf /home/spark/opt/Log_Data/yemao/yemao.dat
rm -rf /home/spark/opt/Log_Data/yemao/yemaolog
rm -rf /home/spark/opt/Log_Data/yemao/yemao_log.java
The manually triggered version of the same pipeline:
ymlog_proc_manual.sh
#!/bin/bash
#export yesterday=`date -d last-day +%Y%m%d`
echo -n "please enter a day to process (YYYYMMDD): "
read yesterday
cd /home/spark/opt/Log_Data/yemao
for tar in yemao*$yesterday.tar.gz;
do
tar xvf $tar;
grep -h "\[{.*}\]" *.log >> yemaolog;
rm -rf /home/spark/opt/Log_Data/yemao/*.log
done
sed -i 's/^.//' yemaolog
sed -i 's/..$//' yemaolog
/home/spark/opt/mongodb-2.7.0/bin/mongoimport -d yemao -c yemao_log_$yesterday --drop /home/spark/opt/Log_Data/yemao/yemaolog
/home/spark/opt/mongodb-2.7.0/bin/mongoexport -d yemao -c yemao_log_$yesterday --csv -f id,time,url_from,url_current,url_to,options,uid,new_visitor,province,city,site,device,phone,token,dorm,order_phone,order_dormitory,order_amount,order_id,uname,site_id,address,dorm_id,dormentry_id,rid,cart_quantity -o /home/spark/opt/Log_Data/yemao/yemao.dat
/home/spark/opt/hive-1.2.1/bin/hive -e "alter table yemao_log drop if exists partition (log_date=$yesterday);alter table yemao_log add if not exists partition (log_date=$yesterday);load data local inpath '/home/spark/opt/Log_Data/yemao/yemao.dat' into table yemao_log partition (log_date=$yesterday);"
#/usr/local/mysql/bin/mysqlimport -h120.55.189.188 -udatawarehouse -pdatawarehouse2015 --fields-terminated-by=, --lines-terminated-by=\n logdata /home/spark/opt/Log_Data/yemao/yemao.dat --local
/home/spark/opt/sqoop-1.4.6/bin/sqoop export --connect jdbc:mysql://120.55.189.188:3306/logdata --username datawarehouse --password datawarehouse2015 --table yemao_log --export-dir /user/hive/warehouse/yemao_log/log_date=$yesterday --input-fields-terminated-by ',';
/usr/local/mysql/bin/mysql -h120.55.189.188 -udatawarehouse -pdatawarehouse2015 -e "call logdata.p_ymlog_maintain($yesterday);"
rm -rf /home/spark/opt/Log_Data/yemao/yemao.dat
rm -rf /home/spark/opt/Log_Data/yemao/yemaolog
rm -rf /home/spark/opt/Log_Data/yemao/yemao_log.java
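An example session, assuming the manual script lives next to ymlog_proc.sh (the path and date are illustrative):
[spark@Master ~]$ sh /home/spark/opt/Log_Data/ymlog_proc_manual.sh
please enter a day to process (YYYYMMDD): 20151203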
4. Setting up the crontab schedule (the job runs every day at 06:00)
[spark@Master ~]$ crontab -l
0 6 * * * sh /home/spark/opt/Log_Data/ymlog_proc.sh
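The entry can be added with crontab -e, or appended non-interactively with the usual idiom (which assumes the user already has a crontab installed):
(crontab -l; echo "0 6 * * * sh /home/spark/opt/Log_Data/ymlog_proc.sh") | crontab -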
5. Process flow overview
The user-behavior data produced by the tracking points embedded in the business systems is stored and shipped over as JSON. The pipeline first cleans the raw log data into standard JSON, one document per line; then loads the file into MongoDB; and finally loads the required fields into Hive and into MySQL as needed.
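As a hypothetical illustration of the normalization step, the pipeline below mimics what the grep and sed lines in the scripts do; the sample line is made up, and is assumed to end with one extra character after the closing bracket, matching the script's s/..$//:
echo '[{"id":"1","uid":"42"}] ' | grep "\[{.*}\]" | sed 's/^.//' | sed 's/..$//'
# prints: {"id":"1","uid":"42"}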