Log Processing Solution for the User Behavior Analysis Business System

1. Target table structures for the log data
1.1 Target table in the Hive database
CREATE TABLE `yemao_log`(
  `id` int, 
  `time` int, 
  `url_from` string, 
  `url_current` string, 
  `url_to` string, 
  `options` string, 
  `uid` int, 
  `new_visitor` string, 
  `province` string, 
  `city` string, 
  `site` string, 
  `device` string, 
  `phone` string, 
  `token` string, 
  `dorm` string, 
  `order_phone` string, 
  `order_dormitory` string, 
  `order_amount` string, 
  `order_id` int, 
  `uname` string, 
  `site_id` int, 
  `address` string, 
  `dorm_id` int, 
  `dormentry_id` int, 
  `rid` int, 
  `cart_quantity` string)
PARTITIONED BY ( 
  `log_date` int)
ROW FORMAT DELIMITED 
  FIELDS TERMINATED BY ',' 
  LINES TERMINATED BY '\n' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.mapred.TextInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://Master:9000/user/hive/warehouse/yemao_log'
TBLPROPERTIES (
  'transient_lastDdlTime'='1447308813');
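As a quick sanity check after a day's load, a partition-pruned count can be run from the Hive CLI. This is only an illustrative query; the date 20151203 is an example value taken from the partition list below:

-- Count the rows loaded into one day's partition (pruned on log_date).
SELECT count(*) FROM yemao_log WHERE log_date = 20151203;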

1.2 Current table in the MySQL database, which is effectively a temporary staging table
CREATE TABLE `yemao_log` (
  `id` varchar(8000) DEFAULT NULL,
  `time` varchar(8000) DEFAULT NULL,
  `url_from` text,
  `url_current` text,
  `url_to` text,
  `options` text,
  `uid` text,
  `new_visitor` text,
  `province` text,
  `city` text,
  `site` text,
  `device` text,
  `phone` text,
  `token` text,
  `dorm` text,
  `order_phone` text,
  `order_dormitory` text,
  `order_amount` text,
  `order_id` text,
  `uname` text,
  `site_id` text,
  `address` text,
  `dorm_id` text,
  `dormentry_id` text,
  `rid` text,
  `cart_quantity` text
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

1.3 History table in the MySQL database, which is the table actually used for queries
CREATE TABLE `yemao_loghis` (
  `id` varchar(8000) DEFAULT NULL,
  `time` varchar(8000) DEFAULT NULL,
  `url_from` text,
  `url_current` text,
  `url_to` text,
  `options` text,
  `uid` text,
  `new_visitor` text,
  `province` text,
  `city` text,
  `site` text,
  `device` text,
  `phone` text,
  `token` text,
  `dorm` text,
  `order_phone` text,
  `order_dormitory` text,
  `order_amount` text,
  `order_id` text,
  `uname` text,
  `site_id` text,
  `address` text,
  `dorm_id` text,
  `dormentry_id` text,
  `rid` text,
  `cart_quantity` text,
  `log_date` int(11) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8
/*!50100 PARTITION BY LIST (log_date)
(PARTITION p0 VALUES IN (0) ENGINE = InnoDB,
 PARTITION p20151109 VALUES IN (20151109) ENGINE = InnoDB,
 PARTITION p20151110 VALUES IN (20151110) ENGINE = InnoDB,
 PARTITION p20151111 VALUES IN (20151111) ENGINE = InnoDB,
 PARTITION p20151112 VALUES IN (20151112) ENGINE = InnoDB,
 PARTITION p20151113 VALUES IN (20151113) ENGINE = InnoDB,
 PARTITION p20151114 VALUES IN (20151114) ENGINE = InnoDB,
 PARTITION p20151115 VALUES IN (20151115) ENGINE = InnoDB,
 PARTITION p20151116 VALUES IN (20151116) ENGINE = InnoDB,
 PARTITION p20151117 VALUES IN (20151117) ENGINE = InnoDB,
 PARTITION p20151118 VALUES IN (20151118) ENGINE = InnoDB,
 PARTITION p20151119 VALUES IN (20151119) ENGINE = InnoDB,
 PARTITION p20151120 VALUES IN (20151120) ENGINE = InnoDB,
 PARTITION p20151121 VALUES IN (20151121) ENGINE = InnoDB,
 PARTITION p20151122 VALUES IN (20151122) ENGINE = InnoDB,
 PARTITION p20151123 VALUES IN (20151123) ENGINE = InnoDB,
 PARTITION p20151124 VALUES IN (20151124) ENGINE = InnoDB,
 PARTITION p20151125 VALUES IN (20151125) ENGINE = InnoDB,
 PARTITION p20151126 VALUES IN (20151126) ENGINE = InnoDB,
 PARTITION p20151127 VALUES IN (20151127) ENGINE = InnoDB,
 PARTITION p20151128 VALUES IN (20151128) ENGINE = InnoDB,
 PARTITION p20151129 VALUES IN (20151129) ENGINE = InnoDB,
 PARTITION p20151130 VALUES IN (20151130) ENGINE = InnoDB,
 PARTITION p20151201 VALUES IN (20151201) ENGINE = InnoDB,
 PARTITION p20151202 VALUES IN (20151202) ENGINE = InnoDB,
 PARTITION p20151203 VALUES IN (20151203) ENGINE = InnoDB) */;
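Because yemao_loghis is LIST-partitioned on log_date, a query that filters on log_date only touches the matching partition. This can be verified with EXPLAIN PARTITIONS (available in MySQL 5.1 through 5.6; the date below is an example value):

-- The partitions column of the plan should list only p20151203.
EXPLAIN PARTITIONS
SELECT count(*) FROM logdata.yemao_loghis WHERE log_date = 20151203;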

2. Stored procedure for data processing
p_ymlog_maintain rebuilds the partition for the given day in yemao_loghis, copies that day's rows over from the staging table, and finally truncates the staging table.
CREATE DEFINER=`datahs`@`%` PROCEDURE `p_ymlog_maintain`(IN `v_log_date` int)
BEGIN
	DECLARE
		v_partition_exists INT;

-- Check whether the partition for the given day already exists.
SELECT
	count(1) INTO v_partition_exists
FROM
	information_schema.`PARTITIONS`
WHERE
	TABLE_SCHEMA = 'logdata'
AND table_name = 'yemao_loghis'
AND partition_name = concat('p',v_log_date);

IF v_partition_exists = 1 THEN
SET @exec_sql=concat("ALTER TABLE logdata.yemao_loghis DROP PARTITION p",v_log_date);
PREPARE stmt FROM @exec_sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
END IF;


SET @exec_sql=concat("ALTER TABLE logdata.yemao_loghis ADD PARTITION (PARTITION p",v_log_date,"	VALUES IN (",v_log_date,"));");
PREPARE stmt FROM @exec_sql; 
EXECUTE stmt; 

SET @exec_sql=concat("INSERT INTO logdata.yemao_loghis (
	id,
	time,
	url_from,
	url_current,
	url_to,
	options,
	uid,
	new_visitor,
	province,
	city,
	site,
	device,
	phone,
	token,
	dorm,
	order_phone,
	order_dormitory,
	order_amount,
	order_id,
	uname,
	site_id,
	address,
	dorm_id,
	dormentry_id,
	rid,
	cart_quantity,
	log_date
) SELECT
	a.id,
	a.time,
	a.url_from,
	a.url_current,
	a.url_to,
	a.options,
	a.uid,
	a.new_visitor,
	a.province,
	a.city,
	a.site,
	a.device,
	a.phone,
	a.token,
	a.dorm,
	a.order_phone,
	a.order_dormitory,
	a.order_amount,
	a.order_id,
	a.uname,
	a.site_id,
	a.address,
	a.dorm_id,
	a.dormentry_id,
	a.rid,
	a.cart_quantity,
  ",v_log_date," log_date
FROM
	logdata.yemao_log a
WHERE
	id <> 'id';");
PREPARE stmt FROM @exec_sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;

-- Empty the staging table for the next run.
TRUNCATE TABLE logdata.yemao_log;

END
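A typical invocation, after a day's data has been exported into the staging table (the date is an example value):

-- Move the staged 2015-12-03 rows into partition p20151203 of yemao_loghis.
CALL logdata.p_ymlog_maintain(20151203);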

3. Shell scripts for processing and loading the data
Automatically scheduled script:
ymlog_proc.sh
#!/bin/bash
# Daily job: unpack yesterday's log archives, normalize the records to JSON,
# stage them in MongoDB, export to CSV, load into Hive, then sync to MySQL.
export yesterday=`date -d last-day +%Y%m%d`

cd /home/spark/opt/Log_Data/yemao
# Unpack each archive for the day and collect the lines that carry JSON payloads.
for tar in yemao*$yesterday.tar.gz;
do
tar xvf "$tar";
grep -h "\[{.*}\]" *.log >> yemaolog;
rm -rf /home/spark/opt/Log_Data/yemao/*.log
done
# Strip the leading '[' and the last two characters of each line so that every
# record becomes a bare JSON object that mongoimport can parse.
sed -i 's/^.//' yemaolog
sed -i 's/..$//' yemaolog
# Stage the day's records in MongoDB (one collection per day), then export the
# required fields as CSV; note that mongoexport writes a header line.
/home/spark/opt/mongodb-2.7.0/bin/mongoimport -d yemao -c yemao_log_$yesterday --drop /home/spark/opt/Log_Data/yemao/yemaolog
/home/spark/opt/mongodb-2.7.0/bin/mongoexport -d yemao -c yemao_log_$yesterday --csv -f id,time,url_from,url_current,url_to,options,uid,new_visitor,province,city,site,device,phone,token,dorm,order_phone,order_dormitory,order_amount,order_id,uname,site_id,address,dorm_id,dormentry_id,rid,cart_quantity -o /home/spark/opt/Log_Data/yemao/yemao.dat
# Rebuild the day's Hive partition and load the CSV into it.
/home/spark/opt/hive-1.2.1/bin/hive -e "alter table yemao_log drop if exists partition (log_date=$yesterday);alter table yemao_log add if not exists partition (log_date=$yesterday);load data local inpath '/home/spark/opt/Log_Data/yemao/yemao.dat' into table yemao_log partition (log_date=$yesterday);"
#/usr/local/mysql/bin/mysqlimport -h120.55.189.188 -udatawarehouse -pdatawarehouse2015 --fields-terminated-by=, --lines-terminated-by=\n logdata /home/spark/opt/Log_Data/yemao/yemao.dat --local
# Export the day's Hive partition into the MySQL staging table, then call the
# stored procedure to move the rows into the partitioned history table.
/home/spark/opt/sqoop-1.4.6/bin/sqoop export --connect jdbc:mysql://120.55.189.188:3306/logdata --username datawarehouse --password datawarehouse2015 --table yemao_log --export-dir /user/hive/warehouse/yemao_log/log_date=$yesterday --input-fields-terminated-by ',';
/usr/local/mysql/bin/mysql -h120.55.189.188 -udatawarehouse -pdatawarehouse2015 -e "call logdata.p_ymlog_maintain($yesterday);"
# Clean up intermediate files (yemao_log.java is a Sqoop codegen artifact).
rm -rf /home/spark/opt/Log_Data/yemao/yemao.dat
rm -rf /home/spark/opt/Log_Data/yemao/yemaolog
rm -rf /home/spark/opt/Log_Data/yemao/yemao_log.java


Manually triggered script:
ymlog_proc_manual.sh
#!/bin/bash
# Same pipeline as ymlog_proc.sh, except that the processing date is read from
# stdin instead of being derived from yesterday's date.
#export yesterday=`date -d last-day +%Y%m%d`
echo -n "please enter a day to run (YYYYMMDD): "
read yesterday

cd /home/spark/opt/Log_Data/yemao
for tar in yemao*$yesterday.tar.gz; 
do
tar xvf "$tar";
grep  -h "\[{.*}\]" *.log >> yemaolog;
rm -rf /home/spark/opt/Log_Data/yemao/*.log
done
sed -i 's/^.//' yemaolog
sed -i 's/..$//' yemaolog
/home/spark/opt/mongodb-2.7.0/bin/mongoimport -d yemao -c yemao_log_$yesterday --drop /home/spark/opt/Log_Data/yemao/yemaolog
/home/spark/opt/mongodb-2.7.0/bin/mongoexport -d yemao -c yemao_log_$yesterday --csv -f id,time,url_from,url_current,url_to,options,uid,new_visitor,province,city,site,device,phone,token,dorm,order_phone,order_dormitory,order_amount,order_id,uname,site_id,address,dorm_id,dormentry_id,rid,cart_quantity -o /home/spark/opt/Log_Data/yemao/yemao.dat
/home/spark/opt/hive-1.2.1/bin/hive -e "alter table yemao_log drop if exists partition (log_date=$yesterday);alter table yemao_log add if not exists partition (log_date=$yesterday);load data local inpath '/home/spark/opt/Log_Data/yemao/yemao.dat' into table yemao_log partition (log_date=$yesterday);"
#/usr/local/mysql/bin/mysqlimport -h120.55.189.188 -udatawarehouse -pdatawarehouse2015 --fields-terminated-by=, --lines-terminated-by=\n logdata /home/spark/opt/Log_Data/yemao/yemao.dat --local
/home/spark/opt/sqoop-1.4.6/bin/sqoop export --connect jdbc:mysql://120.55.189.188:3306/logdata --username datawarehouse --password datawarehouse2015 --table yemao_log --export-dir /user/hive/warehouse/yemao_log/log_date=$yesterday --input-fields-terminated-by ',';
/usr/local/mysql/bin/mysql -h120.55.189.188 -udatawarehouse -pdatawarehouse2015 -e "call logdata.p_ymlog_maintain($yesterday);"
rm -rf /home/spark/opt/Log_Data/yemao/yemao.dat
rm -rf /home/spark/opt/Log_Data/yemao/yemaolog
rm -rf /home/spark/opt/Log_Data/yemao/yemao_log.java


4. Crontab scheduling
The job is scheduled to run automatically every day at 06:00:
[spark@Master ~]$ crontab -l
0 6 * * * sh /home/spark/opt/Log_Data/ymlog_proc.sh

5. Process flow
The user behavior data produced by the tracking points in the business system is saved and delivered as JSON. The pipeline first cleans the raw log data into standard JSON (the sed steps above strip the surrounding brackets); the file is then loaded into MongoDB; finally, the required fields are exported and loaded into Hive and MySQL as needed. Because mongoexport writes a CSV header line into the export file, the stored procedure filters it out with the predicate id <> 'id' when moving rows into the history table.
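As an end-to-end sanity check, the row count of one day's partition can be compared across the two stores. This is only a sketch; the date is an example value:

-- On Hive (e.g. via hive -e): rows loaded for the day.
SELECT count(*) FROM yemao_log WHERE log_date = 20151203;
-- On MySQL: rows moved into the same day's partition of the history table.
SELECT count(*) FROM logdata.yemao_loghis WHERE log_date = 20151203;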
