网站流量日志复杂分析(二)

要求:
网站流量日志复杂分析(二)_第1张图片
字段解释:
网站流量日志复杂分析(二)_第2张图片

  1. 创建表:

    -- Parquet copy of the yhd_log tracking data, partitioned by day
    -- (partition values look like '20150828').
    -- * The table lives in db_track, like every other table in this walkthrough.
    -- * `date` is a reserved keyword from Hive 1.2 on, so it is back-quoted.
    -- * No ROW FORMAT DELIMITED clause: a text field delimiter has no meaning
    --   for a Parquet table, and some engines reject the combination.
    -- All columns are kept as string, exactly as in the original definition.
    USE db_track ;
    DROP TABLE IF EXISTS yhd_log_parquet ;
    CREATE TABLE yhd_log_parquet(
    id                     string,
    url                    string,
    referer                string,
    keyword                string,
    type                   string,
    guid                   string,
    pageId                 string,
    moduleId               string,
    linkId                 string,
    attachedInfo           string,
    sessionId              string,
    trackerU               string,
    trackerType            string,
    ip                     string,
    trackerSrc             string,
    cookie                 string,
    orderCode              string,
    trackTime              string,
    endUserId              string,
    firstLink              string,
    sessionViewNo          string,
    productId              string,
    curMerchantId          string,
    provinceId             string,
    cityId                 string,
    fee                    string,
    edmActivity            string,
    edmEmail               string,
    edmJobId               string,
    ieVersion              string,
    platform               string,
    internalKeyword        string,
    resultSum              string,
    currentPage            string,
    linkPosition           string,
    buttonPosition         string
    )
    PARTITIONED BY (`date` string)
    STORED AS PARQUET ;

    网站流量日志复杂分析(二)_第3张图片

  2. 加载数据

    -- Copy yhd_log into the Parquet table, one output partition per `date` value.
    -- * Dynamic partitioning is switched on explicitly; nonstrict mode is needed
    --   because no partition value is given statically.
    -- * INSERT OVERWRITE (instead of INSERT INTO) makes the load re-runnable:
    --   running it twice no longer duplicates the rows of a partition.
    -- NOTE(review): SELECT * assumes yhd_log has the same columns in the same
    -- order as yhd_log_parquet, with the partition column last — confirm
    -- against the yhd_log definition.
    set parquet.compression = SNAPPY ;
    set hive.exec.dynamic.partition=true;
    set hive.exec.dynamic.partition.mode=nonstrict;
    INSERT OVERWRITE TABLE
      yhd_log_parquet
    PARTITION (`date`)
    SELECT
      *
    FROM
      yhd_log ;
  3. 查看

    1. 使用指令 desc formatted yhd_log_parquet; 可以查看表的详细信息,其中 Location 一项即数据在 HDFS 上的保存位置
      网站流量日志复杂分析(二)_第4张图片
    2. HDFS 上查看文件大小:
      -- Hive CLI dfs command: report disk usage (human-readable, -h) for the
      -- date=20150828 partition directory of yhd_log_parquet.
      dfs -du -h /user/hive/warehouse/db_track.db/yhd_log_parquet/date=20150828 ;
      这里写图片描述
  4. 创建会话信息表SESSION_INFO

    -- Session summary table, filled from the two temporary tables further down.
    -- Stored as tab-delimited text and partitioned by day.
    -- `date` is a reserved keyword from Hive 1.2 on, so it is back-quoted.
    -- NOTE(review): pv and stay_time hold numbers but are declared as string;
    -- the types are left untouched here so the table schema stays the same.
    USE db_track ;
    DROP TABLE IF EXISTS session_info ;
    CREATE TABLE session_info(
    session_id           string,
    guid                 string,
    trackerU             string,
    landing_url          string,
    landing_url_ref      string,
    user_id              string,
    pv                   string,
    stay_time            string,
    min_trackTime        string,
    ip                   string,
    provinceId           string
    )
    PARTITIONED BY (`date` string)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE ;

    网站流量日志复杂分析(二)_第5张图片

  5. 从原数据表中获取数据插入到会话信息表

    1. 临时表1

      1. 创建表

        -- Temporary table 1: one row per sessionId for the 20150828 partition.
        --   pv            = number of rows with a non-NULL url
        --   stay_time     = latest tracktime minus earliest tracktime, in seconds
        --                   (unix_timestamp() returns seconds)
        --   min_trackTime = earliest tracktime, used later to find the landing row
        -- trackerU, landing_url and landing_url_ref are not computed here; they
        -- come from the join with tmp_track_url.
        -- `date` is a reserved keyword from Hive 1.2 on, so it is back-quoted.
        USE db_track ;
        DROP TABLE IF EXISTS tmp_session_info ;
        CREATE TABLE tmp_session_info
        AS
        SELECT
          sessionId      AS session_id ,
          MAX(guid)      AS guid ,
          MAX(enduserid) AS user_id ,
          COUNT(url)     AS pv ,
          unix_timestamp(MAX(tracktime)) - unix_timestamp(MIN(tracktime)) AS stay_time ,
          MIN(tracktime) AS min_trackTime ,
          MAX(ip)        AS ip ,
          MAX(provinceid) AS provinceId
        FROM
          yhd_log_parquet
        WHERE
          `date` = '20150828'
        GROUP BY
          sessionId ;

        网站流量日志复杂分析(二)_第6张图片

      2. 查询一下数量

        -- Row count of tmp_session_info (one row per session).
        SELECT
          COUNT(1)
        FROM
          tmp_session_info ;

        这里写图片描述

      3. 按照pv数排序

        -- Ten sessions with the highest pv. session_id is added as a secondary
        -- sort key so that the result is stable when several sessions tie on pv.
        SELECT
          session_id ,
          pv
        FROM
          tmp_session_info
        ORDER BY
          pv DESC ,
          session_id
        LIMIT 10 ;

        网站流量日志复杂分析(二)_第7张图片

    2. 临时表2

      1. 创建表

        -- Temporary table 2: per-hit sessionid, tracktime, trackeru, url and
        -- referer for the 20150828 partition. It is joined with tmp_session_info
        -- on (sessionid, tracktime) to pick up the values of each session's
        -- earliest hit.
        -- `date` is a reserved keyword from Hive 1.2 on, so it is back-quoted.
        USE db_track ;
        DROP TABLE IF EXISTS tmp_track_url ;
        CREATE TABLE tmp_track_url
        AS
        SELECT
          sessionid ,
          tracktime ,
          trackeru ,
          url ,
          referer
        FROM
          yhd_log_parquet
        WHERE
          `date` = '20150828' ;

        网站流量日志复杂分析(二)_第8张图片

      2. 两张临时表join插入会话信息表

        -- Fill the 20150828 partition of session_info: the per-session
        -- aggregates from tmp_session_info plus trackerU / url / referer of the
        -- hit whose tracktime equals the session's earliest tracktime.
        -- * stay_time is written as-is. tmp_session_info derives it from
        --   unix_timestamp(), which already returns seconds, so the former
        --   "/ 1000" shrank every value by a factor of 1000.
        -- * GROUP BY + MAX() collapses the join result to one row per session.
        -- * `date` is a reserved keyword from Hive 1.2 on, so it is back-quoted.
        -- NOTE(review): if several hits of a session share the earliest
        -- tracktime, the three MAX(b.*) values may come from different rows —
        -- confirm whether that matters for the landing-page columns.
        INSERT OVERWRITE TABLE session_info PARTITION(`date` = '20150828')
        SELECT
          a.session_id         AS session_id ,
          MAX(a.guid)          AS guid ,
          MAX(b.trackeru)      AS trackerU ,
          MAX(b.url)           AS landing_url ,
          MAX(b.referer)       AS landing_url_ref ,
          MAX(a.user_id)       AS user_id ,
          MAX(a.pv)            AS pv ,
          MAX(a.stay_time)     AS stay_time ,
          MAX(a.min_trackTime) AS min_trackTime ,
          MAX(a.ip)            AS ip ,
          MAX(a.provinceId)    AS provinceId
        FROM
          tmp_session_info a
        JOIN
          tmp_track_url b
        ON
          a.session_id = b.sessionid
          AND
          a.min_trackTime = b.tracktime
        GROUP BY
          a.session_id ;
      3. 查看总数:select count(1) from session_info;
        这里写图片描述

  6. 按照要求分析数据

    1. 创建每日流量分析表

      -- Daily traffic summary for the 20150828 partition of session_info:
      --   uv            distinct guid
      --   pv            total page views (sum of per-session pv)
      --   login_users   distinct non-blank user_id
      --   visit_users   distinct guid of sessions whose user_id is NULL or blank
      --   avg_stay_time average of per-session stay_time
      --   second_rate   share of sessions with pv >= 2
      --   ip_num        distinct ip
      --   session_num   number of sessions
      -- pv and stay_time are string columns in session_info, so they are cast
      -- explicitly instead of relying on implicit string-to-number conversion;
      -- the summed pv is therefore an integer rather than a double.
      -- `date` is a reserved keyword from Hive 1.2 on, so it is back-quoted.
      USE db_track ;
      DROP TABLE IF EXISTS tmp_visit_daily ;
      CREATE TABLE tmp_visit_daily
      AS
      SELECT
        `date` ,
        COUNT(DISTINCT guid) AS uv ,
        SUM(CAST(pv AS BIGINT)) AS pv ,
        COUNT(DISTINCT CASE WHEN length(trim(user_id)) > 0 THEN user_id ELSE NULL END) AS login_users ,
        COUNT(DISTINCT CASE WHEN user_id IS NULL OR length(trim(user_id)) = 0 THEN guid ELSE NULL END) AS visit_users ,
        AVG(CAST(stay_time AS DOUBLE)) AS avg_stay_time ,
        COUNT(CASE WHEN CAST(pv AS BIGINT) >= 2 THEN session_id ELSE NULL END) / COUNT(session_id) AS second_rate ,
        COUNT(DISTINCT ip) AS ip_num ,
        COUNT(session_id) AS session_num
      FROM
        session_info
      WHERE
        `date` = '20150828'
      GROUP BY
        `date` ;
    2. 查询结果
      这里写图片描述

你可能感兴趣的:(笔记,大数据学习)