PySpark Exercises (Study Notes)

1. Word Count

import findspark
findspark.init()
from pyspark import SparkConf,SparkContext

if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster('local')
    conf.setAppName('test')
    sc = SparkContext(conf=conf)  # pass the conf, otherwise the settings above are ignored
    lines = sc.textFile('./worldcount.txt')
    words = lines.flatMap(lambda line:line.split(' '))
    pair_words = words.map(lambda word:(word,1))
    reduce_result = pair_words.reduceByKey(lambda v1,v2:v1+v2)
    result = reduce_result.sortBy(lambda tp:tp[1])
    result.foreach(print)

Output: (screenshot omitted)
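In local mode foreach(print) happens to print to the console, but on a cluster it runs on the executors, not the driver. A minimal variant sketch (assuming the same ./worldcount.txt) that brings the counts back to the driver with collect():

import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('wordcount_collect')
sc = SparkContext(conf=conf)

# same pipeline as above; collect() returns a plain Python list on the driver
counts = (sc.textFile('./worldcount.txt')
            .flatMap(lambda line: line.split(' '))
            .map(lambda word: (word, 1))
            .reduceByKey(lambda v1, v2: v1 + v2)
            .sortBy(lambda tp: tp[1], ascending=False)
            .collect())
for word, count in counts:
    print(word, count)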

2. Mock Production Data

# Mock production data and append it to pvuvData under the data directory

import findspark
findspark.init()

import random
import time
import sys

def mock(path):
    date = time.strftime('%Y-%m-%d')

    # ip = '192.168.' + str(random.randint(0,255))+ '.' + str(random.randint(0,255))
    # ip = '%s%s%s%s'%('192.168.',str(random.randint(0,255)),'.',str(random.randint(0,255)))

    parts = ['192.168', str(random.randint(0, 255)), str(random.randint(0, 255))]
    ip = '.'.join(parts)  # e.g. 192.168.23.187; avoids shadowing the built-in name `list`

    # 5-digit user ID
    userID = getUserId()

    locations = ['beijing', 'shenzhen', 'shanghai', 'guangzhou', 'chongqing', 'hangzhou']
    # random.randint(0, 5) is inclusive of both 0 and 5
    location = locations[random.randint(0, 5)]

    for j in range(0, random.randint(1, 10)):
        websites = ['www.baidu.com', 'www.xiaomi.com', 'www.jd.com', 'www.taobao.com', 'www.qq.com', 'www.360.com', 'www.dangdang.com']
        website = websites[random.randint(0, 6)]

        operations = ['register', 'view', 'login', 'logout', 'buy', 'comment', 'jump']
        operation = operations[random.randint(0, 6)]

        oneInfo = date + '\t' + ip + '\t' + 'uid' + userID + '\t' + location + '\t' + website + '\t' + operation
        print(oneInfo)
        writeLogToFile(path, oneInfo)

def getUserId():
    id = str(random.randint(0, 99999))
    tmpStr = ''  # must be empty, otherwise every user ID carries a stray leading space
    if len(id) < 5:
        for i in range(0, (5 - len(id))):
            tmpStr += '0'
    return tmpStr + id

def writeLogToFile(path, log):
    # 'r': read; 'w': write; 'a': append
    # 'r+' == r + w (read/write; raises IOError if the file does not exist)
    # 'w+' == w + r (read/write; creates the file if missing, truncates it otherwise)
    # 'a+' == a + r (append/read; creates the file if missing)
    # For binary files, add a 'b': 'rb', 'wb', 'ab', 'rb+', 'wb+', 'ab+'
    with open(path, 'a+') as f:
        f.write(log + '\n')  # write() is the idiomatic call for a single string

if __name__ == '__main__':
    # output path, e.g. ./pvuvdata in the current directory
    outputPath = sys.argv[1]
    for i in range(1, 10000):
        mock(outputPath)
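A small refactor sketch (not part of the original script): random.choice picks an element directly, and str.zfill pads with leading zeros, which would replace the manual padding loop in getUserId():

import random

locations = ['beijing', 'shenzhen', 'shanghai', 'guangzhou', 'chongqing', 'hangzhou']
location = random.choice(locations)  # equivalent to locations[random.randint(0, 5)]

userID = str(random.randint(0, 99999)).zfill(5)  # e.g. '00042'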

3. Counting PV, UV, and More

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# ==================================================
# @Created   : 2022/1/5 19:52
# @Author    : 72106949
# @Objective : TODO
# @File      : pv_uv_count.py
# =================================================

import findspark
findspark.init()
from pyspark import SparkConf,SparkContext

def get_top2_local(x):
    site = x[0]
    local_Iterable = x[1]

    local_dic = {}
    for local in local_Iterable:
        if local in local_dic:
            local_dic[local] += 1
        else:
            local_dic[local] = 1

    # items() yields the (key, value) pairs; sort by value, reverse=True means descending
    sorted_list = sorted(local_dic.items(), key=lambda kv: kv[1], reverse=True)
    return_list = []
    if len(sorted_list) >= 2:
        for i in range(0, 2):
            return_list.append(sorted_list[i])
    else:
        # fewer than 2 entries: just keep them all
        return_list = sorted_list

    return site, return_list

def get_hottest_operation(x):
    site = x[0]
    operator_iterable = x[1]

    operator_dic = {}
    for operator in operator_iterable:
        if operator in operator_dic:
            operator_dic[operator] += 1
        else:
            operator_dic[operator] = 1

    sorted_list = sorted(operator_dic.items(), key=lambda kv: kv[1], reverse=True)
    operator_list = []
    if len(sorted_list) >= 1:
        # only the single hottest operation is kept
        operator_list.append(sorted_list[0])

    return site, operator_list

def get_uid_site_count(x):
    uid = x[0]
    site_iterable = x[1]

    site_dic = {}
    for site in site_iterable:
        if site in site_dic:
            site_dic[site] += 1
        else:
            site_dic[site] = 1

    return_list = []
    for site,count in site_dic.items():
        return_list.append((site,(uid,count)))

    return return_list

def get_active_uid_top3(x):
    site = x[0]
    uid_count_iterable = x[1]
    top3_uid = ['', '', '']
    for tp in uid_count_iterable:
        uid = tp[0]
        count = tp[1]
        for i in range(0, len(top3_uid)):
            if top3_uid[i] == '':
                top3_uid[i] = tp
                break
            elif count > top3_uid[i][1]:
                # shift the smaller entries down one slot, then insert
                for j in range(2, i, -1):
                    top3_uid[j] = top3_uid[j - 1]
                top3_uid[i] = tp
                break
    return site, top3_uid


if __name__=='__main__':
    conf = SparkConf()
    conf.setAppName("pv_uv_count")
    conf.setMaster("local")
    sc = SparkContext(conf=conf)
    datas = sc.textFile("./pvuvdata")

    # 1. PV (page views): the number of page hits per site
    # datas = sc.textFile("./pvuvdata")
    lines_pv = datas.map(lambda line:(line.split("\t")[4],1))
    datasReduceByKey = lines_pv.reduceByKey(lambda v1,v2:v1+v2)
    datasSortBy = datasReduceByKey.sortBy(lambda tp:tp[1],ascending=False)
    datasSortBy.foreach(print)

    # 2. UV (unique visitors): distinct visitors (IP addresses) per site -- note the deduplication!
    # datas = sc.textFile("./pvuvdata")
    lines_uv = datas.map(lambda line:line.split("\t")[1]+"_"+line.split("\t")[4]).distinct()
    pair_lines_uv = lines_uv.map(lambda one:(one.split("_")[1],1))
    datasReduceByKey_uv = pair_lines_uv.reduceByKey(lambda v1,v2:v1+v2)
    datasSortBy_uv = datasReduceByKey_uv.sortBy(lambda tp:tp[1],ascending=False)
    datasSortBy_uv.foreach(print)

    # 3. UV excluding a given region, using filter() -- e.g. all UV outside beijing
    # datas = sc.textFile("./pvuvdata")
    datas_filter = datas.filter(lambda ones:ones.split("\t")[3] != "beijing")
    lines_uv2 = datas_filter.map(lambda line:line.split("\t")[1]+"_"+line.split("\t")[4]).distinct()
    pair_lines_uv2 = lines_uv2.map(lambda one:(one.split("_")[1],1))
    datasReduceByKey_uv2 = pair_lines_uv2.reduceByKey(lambda v1,v2:v1+v2)
    datasSortBy_uv2 = datasReduceByKey_uv2.sortBy(lambda tp:tp[1],ascending=False)
    datasSortBy_uv2.foreach(print)

    # 4. Top-2 most active regions per site
    # datas = sc.textFile("./pvuvdata")
    site_local = datas.map(lambda one:(one.split("\t")[4] , one.split("\t")[3]))
    site_local_Iterable = site_local.groupByKey()
    sorted_result = site_local_Iterable.map(lambda x:get_top2_local(x))
    sorted_result.foreach(print)

    # 5. The hottest operation per site
    # datas = sc.textFile("./pvuvdata")
    site_operator = datas.map(lambda one:(one.split("\t")[4], one.split("\t")[5]))
    site_operator_Iterable = site_operator.groupByKey()
    hottest_result = site_operator_Iterable.map(lambda x:get_hottest_operation(x))
    hottest_result.foreach(print)

    # 6. Top-3 most active users per site
    # datas = sc.textFile("./pvuvdata")
    uid_site = datas.map(lambda line:(line.split("\t")[2],line.split("\t")[4]))
    uid_siteIterable = uid_site.groupByKey()
    uid_site_count = uid_siteIterable.flatMap(lambda x:get_uid_site_count(x))
    top3_uid_info = uid_site_count.groupByKey().map(lambda x:get_active_uid_top3(x))
    top3_uid_info.foreach(print)
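The helpers can be sanity-checked on the driver without Spark; a quick test of get_top2_local with hypothetical sample values:

sample = ('www.baidu.com', ['beijing', 'beijing', 'shenzhen', 'hangzhou', 'shenzhen', 'beijing'])
print(get_top2_local(sample))
# expected: ('www.baidu.com', [('beijing', 3), ('shenzhen', 2)])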

Another approach:

# Top-3 regions by visit count per site
def get_top3_local(one):
    site = one[0]
    local_iterable = one[1]

    local_dic = {}

    for local in local_iterable:
        if local in local_dic:
            local_dic[local] += 1
        else:
            local_dic[local] = 1

    new_list = sorted(local_dic.items(),key=lambda tp:tp[1],reverse=True)

    return_list = []
    if len(new_list) > 3:
        for i in range(3):
            return_list.append(new_list[i])
    else:
        return_list = new_list

    return site,return_list


def get_all_info(info):
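    # note: this assumes every site has at least three regions; with fewer,
    # the fixed-index accesses below would raise an IndexError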
    return_list = []
    site = info[0]
    return_list.append(site + "_" + info[1][0][0] + "_" + str(info[1][0][1]))
    return_list.append(site + "_" + info[1][1][0] + "_" + str(info[1][1][1]))
    return_list.append(site + "_" + info[1][2][0] + "_" + str(info[1][2][1]))
    return return_list

if __name__=='__main__':
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("pv_uv_test")
    sc = SparkContext(conf=conf)
    lines = sc.textFile("./pvuvdata")

    # Top-3 regions by visit count per site
    lines.map(lambda line:(line.split("\t")[4],line.split("\t")[3])) \
    .groupByKey() \
    .map(lambda tp:get_top3_local(tp)) \
    .flatMap(lambda info:get_all_info(info)) \
    .foreach(print)

Output: (screenshot omitted)
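These two helpers can also be exercised locally with hypothetical sample values:

sample = ('www.jd.com', ['beijing', 'shenzhen', 'beijing', 'hangzhou', 'beijing', 'shenzhen'])
print(get_all_info(get_top3_local(sample)))
# expected: ['www.jd.com_beijing_3', 'www.jd.com_shenzhen_2', 'www.jd.com_hangzhou_1']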
Writing it with this data flow:

rdd1:(uid,site) -> rdd2:(uid,[site,site…]) -> (site,(uid,count)) ->[site,[(uid,count)…]] -> …

def get_top3_uidcnt(info):
    site = info[0]
    uidcnt_iterable = info[1]
    top3_info = ["","",""]
    for tp in uidcnt_iterable:
        uid = tp[0]
        cnt = tp[1]
        for i in range(len(top3_info)):
            if top3_info[i] == "":
                top3_info[i]= tp
                break
            elif top3_info[i][1] < cnt:
                for j in range(2,i,-1):
                    top3_info[j] = top3_info[j-1]
                top3_info[i] = tp
                break

    return (site,top3_info)

if __name__=='__main__':
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("pv_uv_test")
    sc = SparkContext(conf=conf)
    lines = sc.textFile("./pvuvdata")

# rdd1:(uid,site) -> rdd2:(uid,[site,site.....]) -> (site,(uid,count)) ->[site,[(uid,count)....]] -> ....
    lines.map(lambda line:(line.split("\t")[2],line.split("\t")[4])) \
    .groupByKey() \
    .flatMap(lambda info:get_uid_site_count(info)) \
    .groupByKey() \
    .map(lambda info:get_top3_uidcnt(info)) \
    .foreach(print)

Output: (screenshot omitted)
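get_top3_uidcnt can likewise be tested locally with hypothetical uid/count pairs:

sample = ('www.taobao.com', [('uid00001', 5), ('uid00002', 9), ('uid00003', 2), ('uid00004', 7)])
print(get_top3_uidcnt(sample))
# expected: ('www.taobao.com', [('uid00002', 9), ('uid00004', 7), ('uid00001', 5)])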
Producing the output with this data flow:

rdd1:(uid_site,1) -> rdd2:(uid_site,totalcount) -> (site,(uid,count)) ->[site,[(uid,count)…]] -> …

if __name__=='__main__':
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("pv_uv_test")
    sc = SparkContext(conf=conf)
    lines = sc.textFile("./pvuvdata")

    # rdd1:(uid_site,1) -> rdd2:(uid_site,totalcount) -> (site,(uid,count)) ->[site,[(uid,count)....]] -> ....
    lines.map(lambda line:(line.split("\t")[2] + "_"+ line.split("\t")[4],1)) \
    .reduceByKey(lambda v1,v2:v1+v2) \
    .map(lambda tp:(tp[0].split("_")[1],(tp[0].split("_")[0],tp[1]))) \
    .groupByKey() \
    .map(lambda info:get_top3_uidcnt(info)) \
    .foreach(print)

Output: (screenshot omitted)
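As a closing note, the manual insertion logic in get_top3_uidcnt could be replaced by sorting the grouped values with mapValues; a compact sketch, assuming the same lines RDD read from ./pvuvdata:

top3 = lines.map(lambda line:(line.split("\t")[2] + "_" + line.split("\t")[4],1)) \
    .reduceByKey(lambda v1,v2:v1+v2) \
    .map(lambda tp:(tp[0].split("_")[1],(tp[0].split("_")[0],tp[1]))) \
    .groupByKey() \
    .mapValues(lambda uid_counts: sorted(uid_counts, key=lambda t: t[1], reverse=True)[:3])
top3.foreach(print)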
