基于Hadoop平台的Pig语言对Apache日志系统的分析

阅读更多
Pig脚本如下:
-- Register the jar that provides the custom myudfs.* functions used below.
REGISTER myudfs.jar;

-- NOTE(review): the DayExtractor alias is declared here but never referenced in
-- this script; the date conversion below calls myudfs.DateExtractor directly.
DEFINE DayExtractor org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM-dd hh:mm:ss');

-- Load the access log as ten chararray columns. PigStorage() is called without
-- an argument, so its default field delimiter applies (tab, per the Pig docs).
raw_lines = LOAD '/user/input/test/hp_analytics.ifensi.com-access_log.log'
    USING PigStorage()
    AS (l1:chararray, l2:chararray, l3:chararray, l4:chararray, l5:chararray,
        l6:chararray, l7:chararray, l8:chararray, l9:chararray, l10:chararray);

-- Pull the fields of interest out of the raw columns:
--   l1  -> split on the first comma into (ip, otherargs)
--   l4  -> characters 1..20 (presumably skips a leading '[' -- confirm against the log format)
--   l5  -> second space-separated token inside the double-quoted request line
--   l8, l9, l10 -> value with its first and last character stripped
parsed = FOREACH raw_lines GENERATE
    FLATTEN(STRSPLIT(l1, '\\,', 2)) AS (ip, otherargs),
    SUBSTRING(l4, 1, 21) AS date,
    FLATTEN(REGEX_EXTRACT_ALL(l5, '\\"[^ ]* ([^ ]*) [^\\"]*\\"')) AS url,
    FLATTEN(REGEX_EXTRACT_ALL(l8, '.{1}(.*).{1}')) AS referer,
    FLATTEN(REGEX_EXTRACT_ALL(l9, '.{1}(.*).{1}')) AS useragent,
    FLATTEN(REGEX_EXTRACT_ALL(l10, '.{1}(.*).{1}')) AS vuid;

-- Keep only records whose tenth column starts with the literal 'vuid'.
tracked = FILTER parsed BY SUBSTRING(vuid, 0, 4) == 'vuid';

-- Convert the date through the custom UDF, split the URL on the first '?' into
-- (cmd, args), and drop the first five characters of vuid.
requests = FOREACH tracked GENERATE
    ip,
    myudfs.DateExtractor(date) AS date,
    FLATTEN(STRSPLIT(url, '\\?', 2)) AS (cmd, args),
    referer,
    useragent,
    FLATTEN(REGEX_EXTRACT_ALL(vuid, '.{5}(.*)')) AS vuid;

-- Route records by the requested path; records matching neither are dropped.
SPLIT requests INTO
    ihm IF cmd == '/__ihm.gif',
    ia IF cmd == '/__ia.gif';
-- ia process block
-- Handles the records routed to `ia`: splits the date value on the first '|'
-- into (date, time) and separates the leading version=... parameter from the
-- remainder of the query string.
ia_hits = FOREACH ia GENERATE
    vuid,
    ip,
    FLATTEN(STRSPLIT(date, '\\|', 2)) AS (date, time),
    FLATTEN(REGEX_EXTRACT_ALL(args, 'version=([^&]*)&(.*)')) AS (ia_version, ia_other),
    referer,
    useragent;

-- Each supported version has its own parameter list, so handle them separately.
SPLIT ia_hits INTO
    ia_version1 IF ia_version == '1.0',
    ia_version2 IF ia_version == '1.1';

-- Version 1.0: twelve query parameters, matched in this fixed order.
ia_v1_fields = FOREACH ia_version1 GENERATE
    vuid,
    ip,
    date,
    time,
    FLATTEN(REGEX_EXTRACT_ALL(ia_other, 'browser=([^&]*)&browser_version=([^&]*)&operation_system=([^&]*)&operation_system_version=([^&]*)&flash_version=([^&]*)&java_enabled=([^&]*)&language=([^&]*)&screen_colors=([^&]*)&screen_resolution=([^&]*)&referrer=([^&]*)&tourl=([^&]*)&vuid=([^&]*)'))
        AS (ia_browser, ia_browser_version, ia_operation_system, ia_operation_system_version,
            ia_flash_version, ia_java_enabled, ia_language, ia_screen_colors,
            ia_screen_resolution, ia_referrer, ia_tourl, ia_vuid),
    referer,
    useragent;

ia_v1_out = FOREACH ia_v1_fields GENERATE
    vuid, ip, date, time,
    ia_browser, ia_browser_version,
    ia_operation_system, ia_operation_system_version,
    ia_flash_version, ia_java_enabled, ia_language,
    ia_screen_colors, ia_screen_resolution,
    ia_referrer, ia_tourl, ia_vuid,
    referer, useragent;

STORE ia_v1_out INTO '/test/output/data/ia/ia_version1' USING PigStorage();

-- Version 1.1: same leading parameters plus title, muid, mfid, musername and memail.
ia_v2_fields = FOREACH ia_version2 GENERATE
    vuid,
    ip,
    date,
    time,
    FLATTEN(REGEX_EXTRACT_ALL(ia_other, 'browser=([^&]*)&browser_version=([^&]*)&operation_system=([^&]*)&operation_system_version=([^&]*)&flash_version=([^&]*)&java_enabled=([^&]*)&language=([^&]*)&screen_colors=([^&]*)&screen_resolution=([^&]*)&referrer=([^&]*)&tourl=([^&]*)&title=([^&]*)&vuid=([^&]*)&muid=([^&]*)&mfid=([^&]*)&musername=([^&]*)&memail=([^&]*)'))
        AS (ia_browser, ia_browser_version, ia_operation_system, ia_operation_system_version,
            ia_flash_version, ia_java_enabled, ia_language, ia_screen_colors,
            ia_screen_resolution, ia_referrer, ia_tourl, ia_title, ia_vuid,
            ia_muid, ia_mfid, ia_musername, ia_memail),
    referer,
    useragent;

-- The stored column order puts referer/useragent before the member fields.
ia_v2_out = FOREACH ia_v2_fields GENERATE
    vuid, ip, date, time,
    ia_browser, ia_browser_version,
    ia_operation_system, ia_operation_system_version,
    ia_flash_version, ia_java_enabled, ia_language,
    ia_screen_colors, ia_screen_resolution,
    ia_referrer, ia_tourl, ia_title, ia_vuid,
    referer, useragent,
    ia_muid, ia_mfid, ia_musername, ia_memail;

STORE ia_v2_out INTO '/test/output/data/ia/ia_version2' USING PigStorage();
-- ihm process block
-- Handles the records routed to `ihm`: splits the date value on the first '|'
-- into (date, time) and separates the leading version=... parameter from the
-- remainder of the query string.
ihm_hits = FOREACH ihm GENERATE
    vuid,
    ip,
    FLATTEN(STRSPLIT(date, '\\|', 2)) AS (date, time),
    FLATTEN(REGEX_EXTRACT_ALL(args, 'version=([^&]*)&(.*)')) AS (ihm_version, ihm_other),
    referer,
    useragent;

-- Extract vuid, url, width, x and y from the remaining parameters, in that fixed order.
ihm_fields = FOREACH ihm_hits GENERATE
    vuid,
    ip,
    date,
    time,
    ihm_version,
    FLATTEN(REGEX_EXTRACT_ALL(ihm_other, 'vuid=([^&]*)&url=([^&]*)&width=([^&]*)&x=([^&]*)&y=(.*)'))
        AS (ihm_vuid, ihm_url, ihm_width, ihm_x, ihm_y),
    referer,
    useragent;

-- Final layout: ihm_version is dropped and ihm_width moves after the x/y values.
ihm_out = FOREACH ihm_fields GENERATE
    vuid, ip, date, time,
    ihm_vuid, ihm_url, ihm_x, ihm_y, ihm_width,
    referer, useragent;

STORE ihm_out INTO '/test/output/data/ihm' USING PigStorage();
附件为部分日志文件:
  • logs.rar (2.4 KB)
  • 下载次数: 36

你可能感兴趣的:(Apache,Hadoop,Flash,HP,脚本)