54.Spark大型电商项目-用户访问session分析-top10活跃session之分组取TopN算法获取top10活跃session

目录

重命名数据表top10_category_session

代码

Top10Session.java

ITop10SessionDAO.java

Top10SessionDAOImpl.java

DAOFactory.java

UserVisitSessionAnalyzeSpark.java


本篇文章记录用户访问session分析-top10活跃session之分组取TopN算法获取top10活跃session。

重命名数据表top10_category_session

alter table top10_category_session rename to top10_session;

代码

domain

Top10Session.java

package graduation.java.domain;

/**
 * FileName: Top10Session
 * Author:   hadoop
 * Email:    [email protected]
 * Date:     19-3-24 4:59 PM
 * Description:
 *
 * Entity holding one "top-10 session for a top-10 category" record:
 * the task id, the category id, the session id and that session's click count.
 */
public class Top10Session {

    /** Id of the analysis task this record belongs to. */
    private long taskid;

    /** Id of the category the session clicked. */
    private long categoryid;

    /** Id of the session. */
    private String sessionid;

    /** Number of clicks recorded for this session on the category. */
    private long clickCount;

    /** @return the task id */
    public long getTaskid() {
        return this.taskid;
    }

    /** @param taskid the task id to store */
    public void setTaskid(long taskid) {
        this.taskid = taskid;
    }

    /** @return the category id */
    public long getCategoryid() {
        return this.categoryid;
    }

    /** @param categoryid the category id to store */
    public void setCategoryid(long categoryid) {
        this.categoryid = categoryid;
    }

    /** @return the session id, or {@code null} if it was never set */
    public String getSessionid() {
        return this.sessionid;
    }

    /** @param sessionid the session id to store */
    public void setSessionid(String sessionid) {
        this.sessionid = sessionid;
    }

    /** @return the click count */
    public long getClickCount() {
        return this.clickCount;
    }

    /** @param clickCount the click count to store */
    public void setClickCount(long clickCount) {
        this.clickCount = clickCount;
    }

    /**
     * Renders all four fields in the form
     * {@code Top10Session{taskid=..., categoryid=..., sessionid='...', clickCount=...}}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Top10Session{");
        text.append("taskid=").append(taskid);
        text.append(", categoryid=").append(categoryid);
        text.append(", sessionid='").append(sessionid).append('\'');
        text.append(", clickCount=").append(clickCount);
        text.append('}');
        return text.toString();
    }
}

dao

ITop10SessionDAO.java

package graduation.java.dao;

import graduation.java.domain.Top10Session;

/**
 * FileName: ITop10SessionDAO
 * Author:   hadoop
 * Email:    [email protected]
 * Date:     19-3-24 5:02 PM
 * Description:
 * DAO interface for the per-category top-10 session click records.
 */
public interface ITop10SessionDAO {

    /**
     * Persists one top-10 session record.
     *
     * @param top10Session the record to insert
     */
    void insert(Top10Session top10Session);

}

impl

Top10SessionDAOImpl.java

package graduation.java.impl;

import graduation.java.dao.ITop10SessionDAO;
import graduation.java.domain.Top10Session;
import graduation.java.jdbc.JDBCHelper;

/**
 * FileName: Top10SessionDAOImpl
 * Author:   hadoop
 * Email:    [email protected]
 * Date:     19-3-24 5:04 PM
 * Description:
 * JDBC-backed implementation of {@link ITop10SessionDAO}.
 */
public class Top10SessionDAOImpl implements ITop10SessionDAO {

    /**
     * Inserts one row into {@code top10_session} through {@link JDBCHelper}.
     * The four placeholders are bound, in order, to the record's task id,
     * category id, session id and click count.
     *
     * @param top10Session the record whose fields are written
     */
    @Override
    public void insert(Top10Session top10Session) {
        // Bind values are read from the record before the helper is looked up.
        final Object[] bindValues = {
                top10Session.getTaskid(),
                top10Session.getCategoryid(),
                top10Session.getSessionid(),
                top10Session.getClickCount()
        };

        JDBCHelper.getInstance()
                .executeUpdate("insert into top10_session values(?,?,?,?)", bindValues);
    }
}

factory

DAOFactory.java

/**
     * Creates the DAO used to manage top-10 session records.
     *
     * @return a new {@link Top10SessionDAOImpl} instance on every call
     */
    public static Top10SessionDAOImpl getTop10SessionDAO(){
        return new Top10SessionDAOImpl();
    }

Spark

UserVisitSessionAnalyzeSpark.java

**
     * 获取top10活跃session
     * @param sc
     * @param taskId
     * @param top10CategoryList
     * @param sessionid2detailRDD
     */

    private static void getTop10Session(JavaSparkContext sc, long taskId,
                                        List> top10CategoryList,
                                        JavaPairRDD sessionid2detailRDD) {

        List> top10CategoryIdList = new ArrayList>();

        for (Tuple2 category: top10CategoryList){
            long categoryId = Long.valueOf(StringUtils.getFieldFromConcatString(category._2,"\\|",Constants.FIELD_CATEGORY_ID));
            top10CategoryIdList.add(new Tuple2(categoryId,categoryId));
        }

        JavaPairRDD top10CategoryIdRDD = sc.parallelizePairs(top10CategoryIdList);

        /**
         * 第二步:计算top10热门品类被各session点击的次数
         */
        JavaPairRDD> sessionid2detailsRDD =
                sessionid2detailRDD.groupByKey();

        JavaPairRDD categoryid2sessionCountRDD = sessionid2detailsRDD.flatMapToPair(
                new PairFlatMapFunction>, Long, String>() {
                    private static final long serialVersionUID = 1L;
                    @Override
                    public Iterator> call(Tuple2> tuple) throws Exception {
                        String sessionid = tuple._1;
                        Iterator iterator = tuple._2.iterator();
                        Map categoryCountMap = new HashMap();
                        while (iterator.hasNext()){
                            Row row = iterator.next();
                            if (row.getLong(6) != Long.MAX_VALUE){
                                long categoryid = row.getLong(6);
                                Long count = categoryCountMap.get(categoryid);
                                if (count == null){
                                    count = 0L;
                                }
                                count++;
                                categoryCountMap.put(categoryid,count);
                            }
                        }
                        //返回结果,格式
                        List> list = new ArrayList>();
                        for (Map.Entry categoryCountEntry: categoryCountMap.entrySet()){
                            long categoryid = categoryCountEntry.getKey();
                            long count = categoryCountEntry.getValue();
                            String value = sessionid +"," + count;
                            list.add(new Tuple2(categoryid,value));
                        }
                        return  list.iterator();
                    }
                }
        );

        //获取到top10热门品类,被各个session点击的次数

        JavaPairRDD top10CategorySessionCountRDD = top10CategoryIdRDD
                .join(categoryid2sessionCountRDD)
                .mapToPair(new PairFunction>, Long, String>() {

                    private static final long serialVersionUID = 1L;
                    @Override
                    public Tuple2 call(Tuple2> tuple) throws Exception {
                        return new Tuple2(tuple._1,tuple._2._2);
                    }
                });

        /**
         * 第三步:分组获取TopN算法实现,获取每个品类的top10活跃用户
         */

        JavaPairRDD> top10CategorySessionCountsRDD = top10CategorySessionCountRDD.groupByKey();

        JavaPairRDD top10SessionRDD = top10CategorySessionCountsRDD.flatMapToPair(
                new PairFlatMapFunction>, String, String>() {
                    private static final long serialVersionUID = 1L;
                    @Override
                    public Iterator> call(Tuple2> tuple) throws Exception {
                        long categoryid = tuple._1;
                        Iterator iterator = tuple._2.iterator();

                        //定义获取top10session的排序数组
                        String[] top10Sessions = new String[10];

                        while (iterator.hasNext()){
                            String sessionCount = iterator.next();
                            long count = Long.valueOf(sessionCount.split(",")[1]);

                            //遍历数组
                            for (int i = 0; i < top10Sessions.length; i++){
                                //如果当前i为是null,那么直接将i位数据赋值为当前的sessionCount
                                if (top10Sessions[i] == null){
                                    top10Sessions[i] = sessionCount;
                                    break;
                                } else{
                                    long _count = Long.valueOf(top10Sessions[i].split(",")[1]);
                                    //插入排序,将数据进行排序
                                    if(count > _count){
                                        for (int j = 9 ; j > i; j--){
                                            top10Sessions[j] = top10Sessions[j-1];
                                        }
                                        top10Sessions[i] = sessionCount;
                                        break;
                                    }
                                    //比较小,继续外层for循环
                                }
                            }
                        }

                        //将数据写入Mysql
                        List> list = new ArrayList>();

                        for (String sessionCount : top10Sessions){
                            String sessionid = sessionCount.split(",")[0];
                            long count = Long.valueOf(sessionCount.split(",")[1]);

                            //将top10session写入MySQL
                            Top10Session top10Session = new Top10Session();
                            top10Session.setTaskid(taskId);
                            top10Session.setCategoryid(categoryid);
                            top10Session.setSessionid(sessionid);
                            top10Session.setClickCount(count);
                            ITop10SessionDAO top10SessionDAO = DAOFactory.getTop10SessionDAO();
                            top10SessionDAO.insert(top10Session);

                            list.add(new Tuple2(sessionid,sessionid));

                        }

                        return list.iterator();
                    }
                }
        );

        /**
         * 第四步:获取top10活跃session的明细数据,并写入MySQL
         */
        JavaPairRDD> sessionDetailRDD =
                top10SessionRDD.join(sessionid2detailRDD);
        sessionDetailRDD.foreach(new VoidFunction>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2> tuple) throws Exception {
                Row row = tuple._2._2;
                SessionDetail sessionDetail = new SessionDetail();
                sessionDetail.setTaskid(taskId);
                sessionDetail.setUserid(row.getLong(1));
                sessionDetail.setSessionid(row.getString(2));
                sessionDetail.setPageid(row.getLong(3));
                sessionDetail.setActionTime(row.getString(4));
                sessionDetail.setSeachKeyWord(row.getString(5));
                sessionDetail.setClickCategoryId(row.getLong(6));
                sessionDetail.setClickProductId(row.getLong(7));
                sessionDetail.setOrderCategoryIds(row.getString(8));
                sessionDetail.setOrderProductIds(row.getString(9));
                sessionDetail.setPayCategoryIds(row.getString(10));
                sessionDetail.setPayProductIds(row.getString(11));

                ISessionDetailDAO sessionDetailDAO = DAOFactory.getSessionDetailDAO();
                sessionDetailDAO.insert(sessionDetail);
            }
        });

    }

 

你可能感兴趣的:(大数据,spark,电商用户行为分析)