医疗人群数据包

需求介绍


  • 医疗人群数据包
  • http://jira.dev.zamplus.com/browse/ZAMPDMP-1358

  1. 需要用到的数据详见附件
  2. 生成三个数据包(zid 包)
    i. 包1:所有搜索过 sheet1 所包含关键词的zid
    ii. 包2:访问过 sheet2 + sheet3 所有domain 的zid
    iii. 包3:包1+包2去重
  3. 附件

SQL 统计

-- Lookup tables for the two matching lists, one string per row in column d.
--   tmp.domain  is filled from /home/wankun/domain.txt
--   tmp.keyword is filled from /home/wankun/keywords.txt
-- IF NOT EXISTS and OVERWRITE make this setup safe to re-run: a second run
-- neither fails on the existing tables nor appends the same file again.
create table if not exists tmp.domain (d string)
location '/bh/warehouse/dmp/tmp/domain';

create table if not exists tmp.keyword (d string)
location '/bh/warehouse/dmp/tmp/keyword';

load data local inpath '/home/wankun/domain.txt'
overwrite into table tmp.domain;

load data local inpath '/home/wankun/keywords.txt'
overwrite into table tmp.keyword;

-- Result tables for the three zid packages, one zid per row in column d.
--   tmp.zid1 : package 1 (keyword search match)
--   tmp.zid2 : package 2 (domain visit match)
--   tmp.zid3 : package 3 (package 1 and package 2 combined, de-duplicated)
-- IF NOT EXISTS lets the script be re-run without failing on existing tables.
create table if not exists tmp.zid1 (d string)
location '/bh/warehouse/dmp/tmp/zid1';

create table if not exists tmp.zid2 (d string)
location '/bh/warehouse/dmp/tmp/zid2';

create table if not exists tmp.zid3 (d string)
location '/bh/warehouse/dmp/tmp/zid3';

-- Package 1: distinct zids with at least one search term containing a keyword
-- from tmp.keyword, for partitions 20150608 through 20150615 inclusive.
--
-- The result is written to tmp.zid1 first because the package 3 statement reads
-- from tmp.zid1. Writing only to the local directory left that table empty.
-- The local export below is kept so the file output is still produced.
--
-- Steps, innermost first:
--   1. take the keys of the search map for every row in the date range
--   2. explode the keys to one row per (zid, search term) and URL-decode the term
--   3. pair every term with every keyword and keep pairs where the keyword
--      occurs as a substring of the decoded term
-- NOTE(review): year, month and day are presumably zero-padded strings - confirm.
-- NOTE(review): a blank line in keywords.txt would presumably match every term - confirm the file has none.
insert overwrite table tmp.zid1
select distinct searches.zid
from (
    select
        zid,
        java_method('java.net.URLDecoder', 'decode', raw_term, 'UTF-8') as decoded_term
    from (
        select
            zid,
            map_keys(search) as search_keys
        from insight_cdr_v2
        where concat(year, month, day) >= '20150608'
          and concat(year, month, day) <= '20150615'
    ) daily
    lateral view explode(search_keys) search_list as raw_term
) searches
cross join tmp.keyword kw
where instr(searches.decoded_term, kw.d) > 0;

-- Export package 1 to the local file system from the table populated above.
insert overwrite local directory '/home/wankun/tmp/zids1'
select d
from tmp.zid1;

-- Package 2: distinct zids with at least one top_site key containing a domain
-- from tmp.domain, for partitions 20150608 through 20150615 inclusive.
--
-- The result is written to tmp.zid2 first because the package 3 statement reads
-- from tmp.zid2. Writing only to the local directory left that table empty.
-- The local export below is kept so the file output is still produced.
--
-- Steps, innermost first:
--   1. take the keys of the top_site map for every row in the date range
--   2. explode the keys to one row per (zid, site)
--   3. pair every site with every domain and keep pairs where the domain
--      occurs as a substring of the site
-- NOTE(review): year, month and day are presumably zero-padded strings - confirm.
insert overwrite table tmp.zid2
select distinct visits.zid
from (
    select
        zid,
        ts
    from (
        select
            zid,
            map_keys(top_site) as site_keys
        from insight_cdr_v2
        where concat(year, month, day) >= '20150608'
          and concat(year, month, day) <= '20150615'
    ) daily
    lateral view explode(site_keys) site_list as ts
) visits
cross join tmp.domain dom
where instr(visits.ts, dom.d) > 0;

-- Export package 2 to the local file system from the table populated above.
insert overwrite local directory '/home/wankun/tmp/zids2'
select d
from tmp.zid2;

-- Package 3: every zid that appears in package 1 or package 2, each listed once.
-- UNION ALL simply stacks both tables and the outer DISTINCT removes duplicates.
insert overwrite table tmp.zid3
select distinct d
from (
    select d from tmp.zid1
    union all
    select d from tmp.zid2
) all_zids;

你可能感兴趣的:(医疗人群数据包)