陌陌 (Momo) - PySpark

Implements the analysis requirements on the Momo (陌陌) case-study data with Spark SQL (PySpark).
Requirements

(Figure 1: requirements overview)

Field descriptions

(Figure 2: field descriptions of the source table)
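The fields referenced later (msg_time, sender_name, sender_account, receiver_account, sender_gps, sender_os, sender_phonetype, receiver_os, receiver_phonetype, ...) can also be checked directly against the Hive table. A minimal sketch, assuming the same metastore settings and source table (db_msg.tb_msg_source) as the main script below:

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master('local[*]')
         .config('spark.sql.warehouse.dir', 'hdfs://node1:8020/user/hive/warehouse')
         .config('hive.metastore.uris', 'thrift://node1:9083')
         .enableHiveSupport()
         .getOrCreate())

# Print every column name and type of the source message table
spark.table('db_msg.tb_msg_source').printSchema()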

Code implementation
# coding: utf-8
import os

os.environ['JAVA_HOME'] = '/export/server/jdk1.8.0_241'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/envs/pyspark_env/bin/python'

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, FloatType, IntegerType, DateType
from pyspark.sql import functions as F
import warnings

warnings.filterwarnings('ignore')

if __name__ == '__main__':
    spark = SparkSession.builder.\
        master("local[*]").\
        config('spark.sql.shuffle.partitions', '4').\
        config('spark.sql.warehouse.dir', 'hdfs://node1:8020/user/hive/warehouse').\
        config('hive.metastore.uris', 'thrift://node1:9083').\
        enableHiveSupport().\
        getOrCreate()

    print('--------------------')

    # Read the source message table from Hive
    data = spark.sql('select * from db_msg.tb_msg_source')
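    # Optional sanity check (not in the original): confirm the source table is visible
    # through the configured metastore before querying it.
    # spark.sql('show tables in db_msg').show()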

    # Preview the data; the console cannot display every column, so describe only the fields used below
    data.describe('msg_time', 'sender_name', 'sender_account',
                  'receiver_account', 'sender_gps', 'receiver_os',
                  'receiver_phonetype').show()
    print(data.columns)

    # UDF that extracts the hour from the msg_time string
    def mm_hour(msg_time):
        return pd.to_datetime(msg_time).hour

    udf1 = F.udf(mm_hour, IntegerType())
    data = data.withColumn('hour_info', udf1('msg_time'))

    # Verify that the new column was added
    data.select(data['msg_time'], data['hour_info']).show(3)
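    # Note (not in the original): Spark's built-in F.hour() over a timestamp-cast column
    # would avoid the per-row pandas call made by the UDF above, e.g.
    # data = data.withColumn('hour_info', F.hour(F.col('msg_time').cast('timestamp')))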

    # data.persist()  # caching would help since `data` is reused below, but may stall a low-spec machine

    # Requirement: total number of messages today
    today_message_number = data.count()
    print('统计今日总消息量{}'.format(today_message_number))

    # Requirement: per-hour message count and distinct sending/receiving user counts
    df_msg_hour = data.groupby('hour_info').\
        agg(
            F.count('msg_time').alias('消息量'),
            F.countDistinct('sender_account').alias('发送用户数'),
            F.countDistinct('receiver_account').alias('接受用户数')
        )
    df_msg_hour.show(24)
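    # The aggregates in this script are only printed. If a result should be written back
    # to Hive instead, the DataFrame writer can be used; the target table name below is
    # only an example, not part of the original requirements.
    # df_msg_hour.write.mode('overwrite').saveAsTable('db_msg.tb_rs_hour_msg_cnt')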

    # Requirement: message volume by region (not implemented in the original script; a sketch follows)
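    # A minimal sketch for the per-region count, assuming each distinct sender_gps value
    # is treated as the region key:
    msg_region = data.groupby('sender_gps').\
        agg(F.count('msg_time').alias('消息量')).\
        sort(['消息量'], ascending=False)
    msg_region.show(10, truncate=False)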

    # Requirement: top-10 users by number of messages sent
    msg_sender_user = data.groupby('sender_account').\
        agg(F.count('msg_time').alias('消息量')).\
        sort(['消息量'], ascending=False)
    msg_sender_user.show(10)

    # Requirement: top-10 users by number of messages received
    msg_receiver_user = data.groupby('receiver_account').\
        agg(F.count('msg_time').alias('消息量')).\
        sort(['消息量'], ascending=False)
    msg_receiver_user.show(10)

    # Requirement: phone model distribution (distinct senders per model)
    phonetype_sender_user = data.groupby('sender_phonetype').\
        agg(F.countDistinct('sender_account').alias('数量')).\
        sort(['数量'], ascending=False)
    phonetype_sender_user.show(truncate=False)

    # Requirement: operating system distribution (distinct senders per OS)
    sender_os_cnt = data.groupby('sender_os').\
        agg(F.countDistinct('sender_account').alias('数量')).\
        sort(['数量'], ascending=False)
    sender_os_cnt.show(truncate=False)
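    # Not in the original script: release the Spark session once the last result has been printed.
    spark.stop()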
