这里根据请求的页面URL来分组(clustered)是为了方便统计每个页面的PV >> create table ods_pageviews(session string,ip string,viewtime string,visitpage string, staytime string,step string) partitioned by(inputDate string) clustered by(visitpage) sorted by(viewtime) into 4 buckets row format delimited fields terminated by ‘ ’;
>> insert into table ods_pageviews partition(inputDate='2016-05-17') select pv.session,pv.ip,concat(pv.requestdate,"-",pv.requesttime),pv.visitpage,pv.staytime,pv.step from pageviews as pv where pv.inputDate='2016-05-17';
创建PageViews事实表的时间维度表并从当天的事实表里导入数据
>>create table ods_dim_pageviews_time(time string,year string,month string,day string,hour string,minutes string,seconds string) partitioned by(inputDate String) clustered by(year,month,day) sorted by(time) into 4 buckets row format delimited fields terminated by ' ';
>> insert overwrite table ods_dim_pageviews_time partition(inputDate='2016-05-17') select distinct pv.viewtime, substring(pv.viewtime,0,4),substring(pv.viewtime,6,2),substring(pv.viewtime,9,2),substring(pv.viewtime,12,2),substring(pv.viewtime,15,2),substring(pv.viewtime,18,2) from ods_pageviews as pv;
创建PageViews事实表的URL维度表并从当天的事实表里导入数据 >> create table ods_dim_pageviews_url(visitpage string,host string,path string,query string) partitioned by(inputDate string) clustered by(visitpage) sorted by(visitpage) into 4 buckets row format delimited fields terminated by ' ';
>> insert into table ods_dim_pageviews_url partition(inputDate='2016-05-17') select distinct pv.visitpage,b.host,b.path,b.query from pageviews pv lateral view parse_url_tuple(concat('https://localhost',pv.visitpage),'HOST','PATH','QUERY') b as host,path,query;
查询每天PV总数前20的页面 >> select op.visitpage as path,count(*) as num from ods_pageviews as op join ods_dim_pageviews_url as opurl on (op.visitpage = opurl.visitpage) join ods_dim_pageviews_time as optime on (optime.time = op.viewtime) where optime.year='2013' and optime.month='09' and optime.day='19' group by op.visitpage sort by num desc limit 20;
>> insert into table ods_visits partition(inputDate='2016-05-18') select vi.session,concat(vi.startdate,"-",vi.starttime),concat(vi.enddate,"-",vi.endtime),vi.entrypage,vi.leavepage,vi.viewpagenum,vi.ip,vi.referal from visitsinfo as vi where vi.inputDate='2016-05-18';
创建Visits事实表的时间维度表并从当天的事实表里导入数据
>>create table ods_dim_visits_time(time string,year string,month string,day string,hour string,minutes string,seconds string) partitioned by(inputDate String) clustered by(year,month,day) sorted by(time) into 4 buckets row format delimited fields terminated by ' ';
将“访问时间”和“离开时间”两列的值合并后再放入时间维度表中,减少数据的冗余 >>insert overwrite table ods_dim_visits_time partition(inputDate='2016-05-18') select distinct ov.timeparam, substring(ov.timeparam,0,4),substring(ov.timeparam,6,2),substring(ov.timeparam,9,2),substring(ov.timeparam,12,2),substring(ov.timeparam,15,2),substring(ov.timeparam,18,2) from (select ov1.entrytime as timeparam from ods_visits as ov1 union select ov2.leavetime as timeparam from ods_visits as ov2) as ov;
创建visits事实表的URL维度表并从当天的事实表里导入数据 >> create table ods_dim_visits_url(pageurl string,host string,path string,query string) partitioned by(inputDate string) clustered by(pageurl) sorted by(pageurl) into 4 buckets row format delimited fields terminated by ' ';
将每个session的进入页面和离开页面的URL合并后存入到URL维度表中 >>insert into table ods_dim_visits_url partition(inputDate='2016-05-18') select distinct ov.pageurl,b.host,b.path,b.query from (select ov1.entrypage as pageurl from ods_visits as ov1 union select ov2.leavepage as pageurl from ods_visits as ov2 ) as ov lateral view parse_url_tuple(concat('https://localhost',ov.pageurl),'HOST','PATH','QUERY') b as host,path,query;
将每个session从哪个外站进入当前网站的信息存入到URL维度表中 >>insert into table ods_dim_visits_url partition(inputDate='2016-05-18') select distinct ov.referal,b.host,b.path,b.query from ods_visits as ov lateral view parse_url_tuple(ov.referal,'HOST','PATH','QUERY') b as host,path,query;
统计每个页面的跳出人数(事实上真正有价值的统计应该是统计页面的跳出率,但为了简单示范,作者在这里简化成统计跳出人数) >> select ov.leavepage as jumpPage, count(*) as jumpNum from ods_visits as ov group by ov.leavepage order by jumpNum desc;
首先我们要创建业务页面的PV的临时信息表,临时表和里面的数据会在session结束的时候清理掉 >> create temporary table transactionpageviews(url string,views int) row format delimited fields terminated by ' ';
先统计业务页面的总PV然后按转换步骤顺序插入每个页面的PV信息到transactionpageviews表中 >> insert into table transactionpageviews select opurl.path as path,count(*) as num from ods_pageviews as op join ods_dim_pageviews_url as opurl on (op.visitpage = opurl.visitpage) join ods_dim_pageviews_time as optime on (optime.time = op.viewtime) where optime.year='2013' and optime.month='09' and optime.day='19' and opurl.path='/index' group by opurl.path;
>> insert into table transactionpageviews select opurl.path as path,count(*) as num from ods_pageviews as op join ods_dim_pageviews_url as opurl on (op.visitpage = opurl.visitpage) join ods_dim_pageviews_time as optime on (optime.time = op.viewtime) where optime.year='2013' and optime.month='09' and optime.day='19' and opurl.path='/detail' group by opurl.path;
>> insert into table transactionpageviews select opurl.path as path,count(*) as num from ods_pageviews as op join ods_dim_pageviews_url as opurl on (op.visitpage = opurl.visitpage) join ods_dim_pageviews_time as optime on (optime.time = op.viewtime) where optime.year='2013' and optime.month='09' and optime.day='19' and opurl.path='/createOrder' group by opurl.path;
>> insert into table transactionpageviews select opurl.path as path,count(*) as num from ods_pageviews as op join ods_dim_pageviews_url as opurl on (op.visitpage = opurl.visitpage) join ods_dim_pageviews_time as optime on (optime.time = op.viewtime) where optime.year='2013' and optime.month='09' and optime.day='19' and opurl.path='/confirmOrder' group by opurl.path;
计算业务页面之间的转换率 >> select row_number() over() as rownum,a.url as url, a.views as pageViews,b.views as lastPageViews,a.views/b.views as transferRation from (select row_number() over() as rownum,views,url from transactionpageviews) as a left join (select row_number() over() as rownum,views,url from transactionpageviews) as b on (a.rownum = b.rownum-1 );
#!/bin/bash
CURDATE=$(date +%y-%m-%d)
CURDATEHIVE=$(date +%Y-%m-%d)
/home/ymh/apps/hadoop-2.6.4/bin/hdfs dfs -df /flume/events/$CURDATE
if [[ 1 -ne $? ]]
then
/home/ymh/apps/hadoop-2.6.4/bin/hadoop jar /export/data/mydata/clickstream.jar com.guludada.clickstream.logClean
fi
if [[ 1 -ne $? ]]
then
/home/ymh/apps/hadoop-2.6.4/bin/hadoop jar /export/data/mydata/clickstream.jar com.guludada.clickstream.logSession
fi
if [[ 1 -ne $? ]]
then
/home/ymh/apps/hadoop-2.6.4/bin/hadoop jar /export/data/mydata/clickstream.jar com.guludada.clickstream.PageViews
fi
#Load today's data
if [[ 1 -ne $? ]]
then
/home/ymh/apps/hadoop-2.6.4/bin/hdfs dfs -chmod 777 /clickstream/pageviews/$CURDATE/
echo "load data inpath '/clickstream/pageviews/$CURDATE/' into table pageviews partition(inputDate='$CURDATEHIVE');" | /apps/apache-hive-1.2.1-bin/bin/beeline -u jdbc:hive2://localhost:10000
fi
#Create fact table and its dimension tables
if [[ 1 -ne $? ]]
then
echo "insert into table ods_pageviews partition(inputDate='$CURDATEHIVE') select pv.session,pv.ip,concat(pv.requestdate,'-',pv.requesttime) as viewtime,pv.visitpage,pv.staytime,pv.step from pageviews as pv where pv.inputDate='$CURDATEHIVE';" | /apps/apache-hive-1.2.1-bin/bin/beeline -u jdbc:hive2://localhost:10000
fi
if [[ 1 -ne $? ]]
then
echo "insert into table ods_dim_pageviews_time partition(inputDate='$CURDATEHIVE') select distinct pv.viewtime, substring(pv.viewtime,0,4),substring(pv.viewtime,6,2),substring(pv.viewtime,9,2),substring(pv.viewtime,12,2),substring(pv.viewtime,15,2),substring(pv.viewtime,18,2) from ods_pageviews as pv;" | /apps/apache-hive-1.2.1-bin/bin/beeline -u jdbc:hive2://localhost:10000
fi
if [[ 1 -ne $? ]]
then
echo "insert into table ods_dim_pageviews_url partition(inputDate='$CURDATEHIVE') select distinct pv.visitpage,b.host,b.path,b.query from pageviews pv lateral view parse_url_tuple(concat('https://localhost',pv.visitpage),'HOST','PATH','QUERY') b as host,path,query;" | /apps/apache-hive-1.2.1-bin/bin/beeline -u jdbc:hive2://localhost:10000
fi
public class ReverseWords {
/**
* 题目:颠倒一个句子中的词的顺序。比如: I am a student颠倒后变成:student a am I.词以空格分隔。
* 要求:
* 1.实现速度最快,移动最少
* 2.不能使用String的方法如split,indexOf等等。
* 解答:两次翻转。
*/
publ
oracle外部表是只允许只读访问,不能进行DML操作,不能创建索引,可以对外部表进行的查询,连接,排序,创建视图和创建同义词操作。
you can select, join, or sort external table data. You can also create views and synonyms for external tables. Ho
发现一个老外写的不错的jquery插件,可以实现将HTML
表格导出为excel,pdf等格式,
地址在:
https://github.com/kayalshri/
下面看个例子,实现导出表格到excel,pdf
<html>
<head>
<title>Export html table to excel an