1. Word Count
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster('local')
    conf.setAppName('test')
    sc = SparkContext(conf=conf)  # pass the conf, otherwise the settings above are ignored
    lines = sc.textFile('./worldcount.txt')
    # split each line into words, emit (word, 1), then sum the counts per word
    words = lines.flatMap(lambda line: line.split(' '))
    pair_words = words.map(lambda word: (word, 1))
    reduce_result = pair_words.reduceByKey(lambda v1, v2: v1 + v2)
    result = reduce_result.sortBy(lambda tp: tp[1])
    result.foreach(print)
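    # Sketch (assumption, not part of the original): foreach(print) runs on the
    # executors; in local mode the output appears in this console, but on a cluster
    # it would land in executor logs. collect() brings the counts back to the
    # driver instead (fine for a small word list).
    for word, count in result.collect():
        print(word, count)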
2. Generating Mock Data
# Generate mock log data and append it to pvuvData under data/
import findspark
findspark.init()
import random
import time
import sys

def mock(path):
    date = time.strftime('%Y-%m-%d')
    # ip = '192.168.' + str(random.randint(0, 255)) + '.' + str(random.randint(0, 255))
    # ip = '%s%s%s%s' % ('192.168.', str(random.randint(0, 255)), '.', str(random.randint(0, 255)))
    parts = ['192.168', str(random.randint(0, 255)), str(random.randint(0, 255))]
    ip = '.'.join(parts)
    # 5-digit userID
    userID = getUserId()
    locations = ['beijing', 'shenzhen', 'shanghai', 'guangzhou', 'chongqing', 'hangzhou']
    # random.randint(0, 5) includes both 0 and 5
    location = locations[random.randint(0, 5)]
    for j in range(0, random.randint(1, 10)):
        websites = ['www.baidu.com', 'www.xiaomi.com', 'www.jd.com', 'www.taobao.com', 'www.qq.com', 'www.360.com', 'www.dangdang.com']
        website = websites[random.randint(0, 6)]
        operations = ['register', 'view', 'login', 'logout', 'buy', 'comment', 'jump']
        operation = operations[random.randint(0, 6)]
        oneInfo = date + '\t' + ip + '\t' + 'uid' + userID + '\t' + location + '\t' + website + '\t' + operation
        print(oneInfo)
        writeLogToFile(path, oneInfo)
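# Sketch (assumption, not part of the original): random.choice picks one element in
# a single call, equivalent to seq[random.randint(0, len(seq) - 1)] used above, e.g.
#   location = random.choice(locations)
#   website = random.choice(websites)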
def getUserId():
    id = str(random.randint(0, 99999))
    tmpStr = ''  # start empty so the ID has no stray leading space
    if len(id) < 5:
        for i in range(0, (5 - len(id))):
            tmpStr += '0'
    return tmpStr + id
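# Sketch (assumption, not part of the original): str.zfill left-pads with zeros to a
# given width, so the padding loop above collapses to one line. Hypothetical name.
def getUserIdZfill():
    return str(random.randint(0, 99999)).zfill(5)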
def writeLogToFile(path, log):
    # 'r': read; 'w': write (truncate); 'a': append
    # 'r+' == r + w (read/write; raises IOError if the file does not exist)
    # 'w+' == w + r (read/write; creates the file if it does not exist)
    # 'a+' == a + r (append/read; creates the file if it does not exist)
    # For binary files, add a 'b': 'rb', 'wb', 'ab', 'rb+', 'wb+', 'ab+'
    with open(path, 'a+') as f:
        f.writelines(log + '\n')
if __name__ == '__main__':
    # output path comes from the first command-line argument, e.g. ./pvuvdata
    outputPath = sys.argv[1]
    for i in range(1, 10000):
        mock(outputPath)
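# Usage sketch (assumption: the script is saved as mock_data.py, a hypothetical name):
#   python mock_data.py ./pvuvdata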
3. Counting PV, UV, and Related Metrics
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ==================================================
# @Created : 2022/1/5 19:52
# @Author : 72106949
# @Objective : TODO
# @File : pv_uv_count.py
# =================================================
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext
def get_top2_local(x):
    site = x[0]
    local_Iterable = x[1]
    local_dic = {}
    for local in local_Iterable:
        if local in local_dic:
            local_dic[local] += 1
        else:
            local_dic[local] = 1
    # items() yields the (key, value) pairs; sort by value, reverse=True means descending
    sorted_list = sorted(local_dic.items(), key=lambda x: x[1], reverse=True)
    return_list = []
    if len(sorted_list) >= 2:  # at least 2 regions: take the top 2
        for i in range(0, 2):
            return_list.append(sorted_list[i])
    else:  # fewer than 2: return the whole sorted list
        return_list = sorted_list
    return return_list
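# Sketch (assumption, not part of the original): collections.Counter can replace the
# manual dictionary counting; most_common(2) sorts by count and already handles the
# fewer-than-2 case. Function name is hypothetical.
from collections import Counter
def get_top2_local_counter(x):
    # x is (site, iterable_of_regions), as produced by groupByKey
    return Counter(x[1]).most_common(2)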
def get_hottest_operation(x):
    site = x[0]
    operator_iterable = x[1]
    operator_dic = {}
    for operator in operator_iterable:
        if operator in operator_dic:
            operator_dic[operator] += 1
        else:
            operator_dic[operator] = 1
    sorted_list = sorted(operator_dic.items(), key=lambda x: x[1], reverse=True)
    # only the single hottest operation is needed; [:1] also covers an empty list
    return sorted_list[:1]
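# Sketch (assumption, not part of the original): since only the single hottest
# operation is kept, the full sort can be skipped; max() finds the top
# (operation, count) pair directly. Hypothetical name.
def get_hottest_operation_max(x):
    operator_dic = {}
    for operator in x[1]:
        operator_dic[operator] = operator_dic.get(operator, 0) + 1
    # guard: return [] if a site somehow has no operations
    return [max(operator_dic.items(), key=lambda kv: kv[1])] if operator_dic else []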
def get_uid_site_count(x):
    uid = x[0]
    site_iterable = x[1]
    site_dic = {}
    for site in site_iterable:
        if site in site_dic:
            site_dic[site] += 1
        else:
            site_dic[site] = 1
    # re-key by site so the next groupByKey gathers the (uid, count) pairs per site
    return_list = []
    for site, count in site_dic.items():
        return_list.append((site, (uid, count)))
    return return_list
def get_active_uid_top3(x):
    site = x[0]
    uid_count_iterable = x[1]
    # insertion into a fixed-size top-3 list, kept in descending order of count
    top3_uid = ['', '', '']
    for tp in uid_count_iterable:
        uid = tp[0]
        count = tp[1]
        for i in range(0, len(top3_uid)):
            if top3_uid[i] == '':
                top3_uid[i] = tp
                break
            elif count > top3_uid[i][1]:
                # shift lower entries down one slot, then insert
                for j in range(2, i, -1):
                    top3_uid[j] = top3_uid[j - 1]
                top3_uid[i] = tp
                break
    # include the site so the printed result is identifiable
    return site, top3_uid
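# Sketch (assumption, not part of the original): heapq.nlargest returns the n largest
# items by a key function, replacing the manual shift-and-insert logic. Hypothetical name.
import heapq
def get_active_uid_top3_heap(x):
    site, uid_count_iterable = x
    # top 3 (uid, count) pairs by count, largest first
    return site, heapq.nlargest(3, uid_count_iterable, key=lambda tp: tp[1])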
if __name__ == '__main__':
    conf = SparkConf()
    conf.setAppName("pv_uv_count")
    conf.setMaster("local")
    sc = SparkContext(conf=conf)
    datas = sc.textFile("./pvuvdata")
    # 1. pv -- page views: the total number of hits per site
    lines_pv = datas.map(lambda line: (line.split("\t")[4], 1))
    datasReduceByKey = lines_pv.reduceByKey(lambda v1, v2: v1 + v2)
    datasSortBy = datasReduceByKey.sortBy(lambda tp: tp[1], ascending=False)
    datasSortBy.foreach(print)
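    # Sketch (assumption, not part of the original): the same pv counts can be pulled
    # straight to the driver with countByValue, which returns a {value: count} dict;
    # fine while the number of distinct sites is small.
    pv_counts = datas.map(lambda line: line.split("\t")[4]).countByValue()
    for site, cnt in sorted(pv_counts.items(), key=lambda kv: kv[1], reverse=True):
        print(site, cnt)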
    # 2. uv -- unique visitors: people (approximated by ip) who visited each site; dedupe with distinct()!
    lines_uv = datas.map(lambda line: line.split("\t")[1] + "_" + line.split("\t")[4]).distinct()
    pair_lines_uv = lines_uv.map(lambda one: (one.split("_")[1], 1))
    datasReduceByKey_uv = pair_lines_uv.reduceByKey(lambda v1, v2: v1 + v2)
    datasSortBy_uv = datasReduceByKey_uv.sortBy(lambda tp: tp[1], ascending=False)
    datasSortBy_uv.foreach(print)
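    # Sketch (assumption, not part of the original): joining ip and site with "_"
    # relies on neither field containing an underscore; tuple keys avoid that
    # assumption entirely.
    uv_tuple = datas.map(lambda line: (line.split("\t")[1], line.split("\t")[4])).distinct() \
        .map(lambda tp: (tp[1], 1)) \
        .reduceByKey(lambda v1, v2: v1 + v2) \
        .sortBy(lambda tp: tp[1], ascending=False)
    uv_tuple.foreach(print)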
    # 3. uv outside a given region -- use filter to drop that region, e.g. all uv except beijing
    datas_filter = datas.filter(lambda ones: ones.split("\t")[3] != "beijing")  # keep every region except beijing
    lines_uv2 = datas_filter.map(lambda line: line.split("\t")[1] + "_" + line.split("\t")[4]).distinct()
    pair_lines_uv2 = lines_uv2.map(lambda one: (one.split("_")[1], 1))
    datasReduceByKey_uv2 = pair_lines_uv2.reduceByKey(lambda v1, v2: v1 + v2)
    datasSortBy_uv2 = datasReduceByKey_uv2.sortBy(lambda tp: tp[1], ascending=False)
    datasSortBy_uv2.foreach(print)
    # 4. top 2 most active regions for each site
    site_local = datas.map(lambda one: (one.split("\t")[4], one.split("\t")[3]))
    site_local_Iterable = site_local.groupByKey()
    sorted_result = site_local_Iterable.map(lambda x: get_top2_local(x))
    sorted_result.foreach(print)
    # 5. hottest operation for each site
    site_operator = datas.map(lambda one: (one.split("\t")[4], one.split("\t")[5]))
    site_operator_Iterable = site_operator.groupByKey()
    hottest_result = site_operator_Iterable.map(lambda x: get_hottest_operation(x))
    hottest_result.foreach(print)
    # 6. top 3 most active users for each site
    uid_site = datas.map(lambda line: (line.split("\t")[2], line.split("\t")[4]))
    uid_siteIterable = uid_site.groupByKey()
    uid_site_count = uid_siteIterable.flatMap(lambda x: get_uid_site_count(x))
    top3_uid_info = uid_site_count.groupByKey().map(lambda x: get_active_uid_top3(x))
    top3_uid_info.foreach(print)
An alternative approach:
# top 3 regions by visit count for each site
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext

def get_top3_local(one):
    site = one[0]
    local_iterable = one[1]
    local_dic = {}
    for local in local_iterable:
        if local in local_dic:
            local_dic[local] += 1
        else:
            local_dic[local] = 1
    new_list = sorted(local_dic.items(), key=lambda tp: tp[1], reverse=True)
    return_list = []
    if len(new_list) > 3:
        for i in range(3):
            return_list.append(new_list[i])
    else:
        return_list = new_list
    return site, return_list
def get_all_info(info):
    # flatten (site, [(local, count), ...]) into "site_local_count" strings;
    # looping avoids an IndexError when a site has fewer than 3 regions
    return_list = []
    site = info[0]
    for local, count in info[1]:
        return_list.append(site + "_" + local + "_" + str(count))
    return return_list
if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("pv_uv_test")
    sc = SparkContext(conf=conf)
    lines = sc.textFile("./pvuvdata")
    # top 3 regions by visit count for each site
    lines.map(lambda line: (line.split("\t")[4], line.split("\t")[3])) \
        .groupByKey() \
        .map(lambda tp: get_top3_local(tp)) \
        .flatMap(lambda info: get_all_info(info)) \
        .foreach(print)
def get_top3_uidcnt(info):
    site = info[0]
    uidcnt_iterable = info[1]
    top3_info = ["", "", ""]
    for tp in uidcnt_iterable:
        uid = tp[0]
        cnt = tp[1]
        for i in range(len(top3_info)):
            if top3_info[i] == "":
                top3_info[i] = tp
                break
            elif top3_info[i][1] < cnt:
                # shift lower entries down one slot, then insert
                for j in range(2, i, -1):
                    top3_info[j] = top3_info[j - 1]
                top3_info[i] = tp
                break
    return (site, top3_info)

def get_site_uidcnt(info):
    # (uid, [site, ...]) -> [(site, (uid, count)), ...]; same role as get_uid_site_count above
    uid = info[0]
    site_dic = {}
    for site in info[1]:
        site_dic[site] = site_dic.get(site, 0) + 1
    return [(site, (uid, count)) for site, count in site_dic.items()]
if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("pv_uv_test")
    sc = SparkContext(conf=conf)
    lines = sc.textFile("./pvuvdata")
    # rdd1: (uid, site) -> rdd2: (uid, [site, site, ...]) -> (site, (uid, count)) -> (site, [(uid, count), ...]) -> top 3
    lines.map(lambda line: (line.split("\t")[2], line.split("\t")[4])) \
        .groupByKey() \
        .flatMap(lambda info: get_site_uidcnt(info)) \
        .groupByKey() \
        .map(lambda info: get_top3_uidcnt(info)) \
        .foreach(print)
if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("pv_uv_test")
    sc = SparkContext(conf=conf)
    lines = sc.textFile("./pvuvdata")
    # rdd1: (uid_site, 1) -> rdd2: (uid_site, totalcount) -> (site, (uid, count)) -> (site, [(uid, count), ...]) -> top 3
    lines.map(lambda line: (line.split("\t")[2] + "_" + line.split("\t")[4], 1)) \
        .reduceByKey(lambda v1, v2: v1 + v2) \
        .map(lambda tp: (tp[0].split("_")[1], (tp[0].split("_")[0], tp[1]))) \
        .groupByKey() \
        .map(lambda info: get_top3_uidcnt(info)) \
        .foreach(print)
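    # Sketch (assumption, not part of the original): to persist the result instead of
    # printing it, build the same pipeline without foreach and write it out with
    # saveAsTextFile; the output directory is hypothetical and must not already exist.
    top3 = lines.map(lambda line: (line.split("\t")[2] + "_" + line.split("\t")[4], 1)) \
        .reduceByKey(lambda v1, v2: v1 + v2) \
        .map(lambda tp: (tp[0].split("_")[1], (tp[0].split("_")[0], tp[1]))) \
        .groupByKey() \
        .map(lambda info: get_top3_uidcnt(info))
    top3.saveAsTextFile("./top3_uid_output")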