pig处理json格式数据,取top100

在如下数据类型中
文件A:SequenceFile 格式文件,key 是 long 型时间戳,value 是一个 Text 类型字符串(JSON 格式)
20140902112312 {"app":"da","data":{"ip":"1032096474"}}


文件B:同样是 SequenceFile,key 是 Text 型,对应文件A 的 JSON 中 data.vid 字段(上面的示例行省略了该字段),value 是一个 Text 类型字符串(也是 JSON 格式)
1CH3vFsRCmQaVQG9J8yH {"age_group":"45-54"}


文件C:referrer_top100.txt 记录排名前100的referrer

求:找出 referrer 属于 top100 的所有记录对应的 vid,以及这些 vid 在文件B 中的属性。


pig脚本一:

-- Script 1: join the top-100 list against the JSON log records (file A), then
-- join the matching vids against the per-vid attribute records (file B).
REGISTER /Users/shuguo/pig/contrib/piggybank/java/piggybank.jar;
DEFINE SequenceFileLoader org.apache.pig.piggybank.storage.SequenceFileLoader();

-- File C: one top-100 entry per line.
-- NOTE(review): the original loaded the whole '/tmp' directory and later stored
-- the result into '/tmp' as well; STORE fails when the output directory already
-- exists. The two paths below are placeholders -- adjust to the real locations.
domain = LOAD '/tmp/input/referrer_top100.txt' USING PigStorage('\t') AS (
 domain_text:chararray
);

-- File A: SequenceFile of (timestamp, JSON text). The loader reads binary
-- key/value pairs, so no field delimiter argument is passed.
domain_cookie = LOAD '/tmp/input/gnome_simple.data' USING SequenceFileLoader() AS (
 time:chararray,
 log_text:chararray
);

-- File B: SequenceFile of (vid, JSON text with the vid's attributes).
cookie = LOAD '/tmp/input/buzz_cookie_simple.data' USING SequenceFileLoader() AS (
 vid:chararray,
 cookie_value:chararray
);

-- Dump only the JSON text to a plain-text file so that JsonLoader can parse it.
-- rmf removes the intermediate output of a previous run (no error if absent).
domain_cookie_a = FOREACH domain_cookie GENERATE log_text;
rmf /tmp/input/domain_cookie.json;
STORE domain_cookie_a INTO '/tmp/input/domain_cookie.json' USING PigStorage('\t');

-- The schema string must be a single-line literal, and the nested JSON object
-- is declared as a tuple using the name:(fields) form.
log_domain_cookie = LOAD '/tmp/input/domain_cookie.json' USING JsonLoader('app:chararray, data:(ip:chararray, keyword:chararray, referrer:chararray, title:chararray, ua:chararray, url:chararray, uuid:chararray, vid:chararray), time:chararray, type:chararray');

-- vid and title live inside the nested data tuple, so they are projected with
-- the tuple dereference operator.
-- NOTE(review): the stated task ranks by referrer, but the join below uses
-- title -- confirm which field the top-100 list really refers to.
domain_cookie_b = FOREACH log_domain_cookie GENERATE
 data.vid AS vid,
 data.title AS title;

domain_cookie_c = DISTINCT domain_cookie_b;

-- Keep only the (vid, title) pairs whose title appears in the top-100 list.
-- After a JOIN, fields are qualified by the *input* relation name only.
domain_cookie_find = JOIN domain_cookie_c BY title, domain BY domain_text;
domain_cookie_result = FOREACH domain_cookie_find GENERATE
 domain::domain_text AS domain,
 domain_cookie_c::vid AS vid;

-- Attach the attribute JSON of every matching vid.
domain_result = JOIN domain_cookie_result BY vid, cookie BY vid;
domain_result = FOREACH domain_result GENERATE
 domain_cookie_result::domain,
 domain_cookie_result::vid,
 cookie::cookie_value;
STORE domain_result INTO '/tmp/output/referrer_top100_vid' USING PigStorage('\t');


pig脚本二:

-- Script 2: same task using elephant-bird's JsonStringToMap, without writing an
-- intermediate JSON file.
-- Parameters: $INFILE1 = file A (log records), $INFILE2 = top-100 list,
--             $INFILE3 = file B (per-vid attributes), $OUTFILE = result path.
register /opt/pig/contrib/piggybank/java/piggybank.jar;
REGISTER /home/code/opensource/elephant-bird/pig/target/elephant-bird-pig-4.6-SNAPSHOT-jar-with-dependencies.jar;

DEFINE SequenceFileLoader org.apache.pig.piggybank.storage.SequenceFileLoader();
DEFINE JsonStringToMap com.twitter.elephantbird.pig.piggybank.JsonStringToMap();

-- File A: (timestamp, JSON text); parse the JSON text into a map.
genome_raw = LOAD '$INFILE1' using SequenceFileLoader() as (key:long, value:chararray);
genome_parsed = FOREACH genome_raw GENERATE JsonStringToMap(value) as genome_data:map[];

-- Parse the nested "data" object into its own map. The result is named
-- explicitly because the next statement refers to it as `json`.
genome_data = FOREACH genome_parsed GENERATE JsonStringToMap(genome_data#'data') as json:map[];
top_100_domain = LOAD '$INFILE2' as (domain:chararray);

-- Per record: the vid and the scheme+host prefix of its url.
-- REGEX_EXTRACT yields a single chararray, so there is nothing to FLATTEN.
cookie_domain = FOREACH genome_data GENERATE
    json#'vid' as vid:chararray,
    REGEX_EXTRACT(json#'url', '(http://[^/]+).*', 1) as d:chararray;

-- Keep records whose url prefix is in the top-100 list; 'replicated' names the
-- map-side join strategy, with top_100_domain (listed last) as the replicated input.
cookie_with_domain = JOIN cookie_domain BY d,top_100_domain BY  domain USING 'replicated';
cookie_with_domain_se = FOREACH cookie_with_domain GENERATE vid, domain;

-- Collect, for each vid, the bag of matching top-100 entries.
cookie_with_domain_gr = GROUP cookie_with_domain_se BY vid;
cookie_with_domain_re = FOREACH cookie_with_domain_gr GENERATE $0 as vid,$1.domain as domain;

-- File B: read key and value as text and parse the value JSON into a map.
cookie_info_raw = LOAD '$INFILE3' USING com.twitter.elephantbird.pig.load.SequenceFileLoader (
    '-c com.twitter.elephantbird.pig.util.TextConverter',
    '-c com.twitter.elephantbird.pig.util.TextConverter'
) AS  (
	key:chararray,
	value:chararray
);
cookie_info = FOREACH cookie_info_raw GENERATE JsonStringToMap(value) as cookie:map[];

-- Attach the attributes of every matching vid.
-- NOTE(review): the join uses the b_id entry of the value JSON rather than the
-- SequenceFile key -- presumably both carry the vid; verify against the data.
cookie_join = JOIN  cookie_with_domain_re BY vid, cookie_info BY cookie#'b_id';
cookie_result = FOREACH cookie_join GENERATE cookie#'b_id',cookie#'age_group',cookie#'country_city',cookie#'device',cookie#'gender',cookie#'interests',domain;

STORE cookie_result INTO  '$OUTFILE';

你可能感兴趣的:(json,pig)