Environment: CentOS 7, Spark 3.1.2
Note: the Spark cluster setup follows the materials provided by 黑马程序员 (itheima)
Data: from the Alibaba Cloud Tianchi competition
import pandas as pd
import numpy as np
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, StringType
import pyspark.sql.functions as func
# Create the Spark application in yarn mode and set driver/executor parameters to make full use of the cluster
spark = (SparkSession.builder
         .appName('天猫复购率')
         .master('yarn')
         .config('spark.executor.instances', '8')  # note: 'spark.num.executors' is not a valid config key
         .config('spark.executor.memory', '6g')
         .config('spark.executor.cores', '1')
         .config('spark.driver.memory', '10g')
         .config('spark.default.parallelism', 8)
         .getOrCreate())
This approach is extremely easy to pick up: the code is almost identical to pandas. The prerequisite is a Spark version that ships pyspark.pandas (it was merged in from Koalas in Spark 3.2); if your version does not have it, skip ahead to the second feature-engineering approach.
#***************** Read MySQL tables via JDBC (in production the data usually lives in a database) *****************
# Requires the MySQL JDBC driver jar, placed under JAVA_HOME/jre/lib/ext; the host below is masked
url = 'jdbc:mysql://192.***.***.***:3306/tiaomao_Repurchase?useSSL=false&useUnicode=true'
auth_mysql = {"user": "root", "password": "123456"}
df_train = spark.read.jdbc(url=url, table='train_format1', properties=auth_mysql)
# Convert to a pyspark.pandas DataFrame so the pandas API can be used. Note this is not a pandas DataFrame; they are two different things
df_train = df_train.pandas_api()
df_test = spark.read.jdbc(url=url, table='test_format1', properties=auth_mysql)
df_test = df_test.pandas_api()
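If you would rather not drop the connector jar into the JRE directory, another option is to ship it with the application through the spark.jars config when building the session; a minimal sketch (the jar path is a placeholder):
# Alternative sketch: distribute the MySQL connector with the application instead of
# installing it under jre/lib/ext. spark.jars copies the jar to the driver and executors.
# spark = SparkSession.builder.appName('天猫复购率') \
#     .config('spark.jars', '/path/to/mysql-connector-java-8.0.xx.jar') \
#     .getOrCreate()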
# ***************** Read from Hive *****************
# Prerequisite: Spark must be configured to integrate with Hive (and Hive itself must be installed)
# df_test = spark.sql('select * from tiaomao_Repurchase.test_format1')
# df_test = df_test.pandas_api()
user_info = spark.read.jdbc(url=url, table='user_info_format1', properties=auth_mysql)
user_info = user_info.pandas_api()
#***************** Read from a CSV file *****************
# Option 1: since the job runs in yarn mode, upload the file to HDFS first (otherwise the other nodes in the cluster cannot read it), then read it with the pandas API
user_log = ps.read_csv('hdfs:///competiton/tiaomao/user_log_format1.csv')
# Alternatively, change master('yarn') to master('local[*]') and read the CSV from the local Linux filesystem
# user_log = ps.read_csv('./competiton/tiaomao/user_log_format1.csv')
# Option 2: read as a pyspark.sql DataFrame first, then convert it to a pyspark.pandas DataFrame
# user_log = spark.read.options(header='True', inferSchema='True', delimiter=',').csv("hdfs:///competiton/tiaomao/user_log_format1.csv")
# user_log = user_log.pandas_api()
# ***************** Data exploration *****************
# The checks below were already run once and are commented out: without caching, every print makes Spark recompute the whole lineage from scratch
# print(df_test.shape, df_train.shape)
# print(user_info.shape)
# print(user_log.shape)
# print(user_info.info())
# print(user_info.head(10))
# print(user_log.isnull().sum(axis=0))
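If you do want to run these checks repeatedly, a cheap fix is to cache the DataFrames first so each action reuses the cached data instead of recomputing the lineage; a minimal sketch with the pandas-on-Spark API:
# Caching sketch: persist in memory so repeated actions (shape, head, isnull, ...)
# do not re-read from MySQL/HDFS every time. Unpersist when exploration is done.
# user_info = user_info.spark.cache()
# user_log = user_log.spark.cache()
# ... run the prints above ...
# user_log.spark.unpersist()
# user_info.spark.unpersist()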
# ***************** Feature engineering *****************
# Handle null values
user_info['age_range'] = user_info['age_range'].replace(0.0, -1)  # inplace=True is not supported here
user_info['gender'] = user_info['gender'].replace(2.0, -1)
user_info['age_range'] = user_info['age_range'].fillna(-1)
user_info['gender'] = user_info['gender'].fillna(-1)
# Add the age_range and gender features
df_train = ps.merge(df_train, user_info, on="user_id", how="left")
# Add the total_logs feature
total_logs_temp = user_log.groupby([user_log["user_id"], user_log["seller_id"]]).count().reset_index()[
["user_id", "seller_id", "item_id"]]
total_logs_temp.rename(columns={"seller_id": "merchant_id", "item_id": "total_logs"}, inplace=True)
df_train = ps.merge(df_train, total_logs_temp, on=["user_id", "merchant_id"], how="left")
# Add the unique_item_ids feature
# Deduplicate on the "user_id", "seller_id", "item_id" dimensions
unique_item_ids_temp = user_log.groupby([user_log["user_id"], user_log["seller_id"], user_log["item_id"]]).count().reset_index()[
["user_id", "seller_id", "item_id"]]
unique_item_ids_temp1 = unique_item_ids_temp.groupby(
[unique_item_ids_temp["user_id"], unique_item_ids_temp["seller_id"]]).count().reset_index()
unique_item_ids_temp1.rename(columns={"seller_id": "merchant_id", "item_id": "unique_item_ids"}, inplace=True)
df_train = ps.merge(df_train, unique_item_ids_temp1, on=["user_id", "merchant_id"], how="left")
# Build the categories feature
# Deduplicate on the "user_id", "seller_id", "cat_id" dimensions
categories_temp = user_log.groupby([user_log["user_id"], user_log["seller_id"], user_log["cat_id"]]).count().reset_index()[
["user_id", "seller_id", "cat_id"]]
categories_temp1 = categories_temp.groupby(
[categories_temp["user_id"], categories_temp["seller_id"]]).count().reset_index()
categories_temp1.rename(columns={"seller_id": "merchant_id", "cat_id": "categories"}, inplace=True)
df_train = ps.merge(df_train, categories_temp1, on=["user_id", "merchant_id"], how="left")
# Build the browse_days feature
# Deduplicate on the "user_id", "seller_id", "time_stamp" dimensions
browse_days_temp = user_log.groupby([user_log["user_id"], user_log["seller_id"], user_log["time_stamp"]]).count().reset_index()[
["user_id", "seller_id", "time_stamp"]]
browse_days_temp1 = browse_days_temp.groupby(
[browse_days_temp["user_id"], browse_days_temp["seller_id"]]).count().reset_index()
browse_days_temp1.rename(columns={"seller_id": "merchant_id", "time_stamp": "browse_days"}, inplace=True)
df_train = ps.merge(df_train, browse_days_temp1, on=["user_id", "merchant_id"], how="left")
# Build the one_clicks, shopping_carts, purchase_times and favourite_times features
one_clicks_temp = user_log.groupby([user_log["user_id"], user_log["seller_id"], user_log["action_type"]]).count().reset_index()[
["user_id", "seller_id", "action_type", "item_id"]]
one_clicks_temp.rename(columns={"seller_id": "merchant_id", "item_id": "times"}, inplace=True)
one_clicks_temp["one_clicks"] = one_clicks_temp["action_type"] == 0
one_clicks_temp["one_clicks"] = one_clicks_temp["one_clicks"] * one_clicks_temp["times"]
one_clicks_temp["shopping_carts"] = one_clicks_temp["action_type"] == 1
one_clicks_temp["shopping_carts"] = one_clicks_temp["shopping_carts"] * one_clicks_temp["times"]
one_clicks_temp["purchase_times"] = one_clicks_temp["action_type"] == 2
one_clicks_temp["purchase_times"] = one_clicks_temp["purchase_times"] * one_clicks_temp["times"]
one_clicks_temp["favourite_times"] = one_clicks_temp["action_type"] == 3
one_clicks_temp["favourite_times"] = one_clicks_temp["favourite_times"] * one_clicks_temp["times"]
four_features = one_clicks_temp.groupby(
[one_clicks_temp["user_id"], one_clicks_temp["merchant_id"]]).sum().reset_index()
four_features = four_features.drop(["action_type", "times"], axis=1)
df_train = ps.merge(df_train, four_features, on=["user_id", "merchant_id"], how="left")
df_train = df_train.to_pandas()  # Finally the result can be converted back to pandas and fed straight into a model. Note that this collects everything onto the driver, so increase the driver memory; if memory is still insufficient, write the result to a database or to CSV instead
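If the driver cannot hold the collected result, write the features out from the executors instead; a minimal sketch (the output path and table name are placeholders):
# Sketch: persist the engineered features without collecting them to the driver.
# Option 1: write to HDFS as CSV straight from the pandas-on-Spark DataFrame
# df_train.to_csv('hdfs:///competiton/tiaomao/train_features')
# Option 2: convert back to a Spark DataFrame and write it to MySQL over JDBC
# df_train.to_spark().write.jdbc(url=url, table='train_features', mode='overwrite', properties=auth_mysql)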
This approach has a steeper learning curve, but it gives a deeper grasp of Spark and lets you handle trickier problems. It also matters in practice because some companies are still on Spark 2, where pyspark.pandas is not available.
# ***************** Read MySQL tables via JDBC (in production the data usually lives in a database) *****************
url = 'jdbc:mysql://192.***.***.***:3306/tiaomao_Repurchase?useSSL=false&useUnicode=true'
auth_mysql = {"user": "root", "password": "123456"}
df_train = spark.read.jdbc(url=url, table='train_format1', properties=auth_mysql)
df_test = spark.read.jdbc(url=url, table='test_format1', properties=auth_mysql)
user_info = spark.read.jdbc(url=url, table='user_info_format1', properties=auth_mysql)
# ***************** Read from a CSV file *****************
user_log = spark.read.options(header='True', inferSchema='True', delimiter=',').csv(
"hdfs:///competiton/tiaomao/user_log_format1.csv")
# Use udf + withColumn to replace pandas' apply(lambda x: f(x))
def replace_value1(x):
    if x == 0:
        return -1
    else:
        return x
# Specify the return type when wrapping the function as a udf
replace_UDF1 = udf(lambda z: replace_value1(z), IntegerType())
def replace_value2(x):
    if x == 2:
        return -1
    else:
        return x
replace_UDF2 = udf(lambda z: replace_value2(z), IntegerType())  # fixed: this must wrap replace_value2, not replace_value1
user_info=user_info.withColumn("age_range", replace_UDF1(col("age_range")))
user_info=user_info.withColumn("gender", replace_UDF2(col("gender")))
user_info = user_info.fillna({'age_range': -1, 'gender': -1})
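As a side note, the same replacement can be expressed with Spark's built-in when/otherwise instead of a Python UDF, which avoids per-row Python serialization; a minimal equivalent sketch:
# Equivalent sketch without a UDF: when/otherwise is evaluated inside the JVM.
# user_info = user_info.withColumn('age_range', func.when(col('age_range') == 0, -1).otherwise(col('age_range'))) \
#     .withColumn('gender', func.when(col('gender') == 2, -1).otherwise(col('gender'))) \
#     .fillna({'age_range': -1, 'gender': -1})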
# Add the age_range and gender features
df_train = df_train.join(user_info, on='user_id', how='left')  # join on the column name so user_id is not duplicated; otherwise the later joins on user_id become ambiguous
# Add the total_logs feature
# total_logs_temp = user_log.groupby(["user_id", "seller_id"]).agg({'user_id': 'count'})  # with this form it is awkward to rename the aggregated column
total_logs_temp = user_log.groupby(["user_id", "seller_id"]).agg(func.count('item_id').alias('total_logs'))
total_logs_temp = total_logs_temp.withColumnRenamed('seller_id', 'merchant_id')
df_train = df_train.join(total_logs_temp, on=["user_id", "merchant_id"], how="left")
# Add the unique_item_ids feature
# Deduplicate on the "user_id", "seller_id", "item_id" dimensions
unique_item_ids_temp = user_log.select('user_id', 'seller_id', 'item_id').dropDuplicates()
unique_item_ids_temp1 = unique_item_ids_temp.groupby(["user_id", "seller_id"]).agg(
func.count('item_id').alias('unique_item_ids'))
unique_item_ids_temp1 = unique_item_ids_temp1.withColumnRenamed('seller_id', 'merchant_id')
df_train = df_train.join(unique_item_ids_temp1, on=["user_id", "merchant_id"], how="left")
# Build the categories feature
# Deduplicate on the "user_id", "seller_id", "cat_id" dimensions
categories_temp = user_log.select('user_id', 'seller_id', 'cat_id').dropDuplicates()
categories_temp1 = categories_temp.groupby(["user_id", "seller_id"]).agg(func.count('cat_id').alias('categories'))
categories_temp1 = categories_temp1.withColumnRenamed('seller_id', 'merchant_id')
df_train = df_train.join(categories_temp1, on=["user_id", "merchant_id"], how="left")
# Build the browse_days feature
# Deduplicate on the "user_id", "seller_id", "time_stamp" dimensions
browse_days_temp = user_log.select('user_id', 'seller_id', 'time_stamp').dropDuplicates()
browse_days_temp1 = browse_days_temp.groupby(["user_id", "seller_id"]).agg(
func.count('time_stamp').alias('browse_days'))
browse_days_temp1 = browse_days_temp1.withColumnRenamed('seller_id', 'merchant_id')
df_train = df_train.join(browse_days_temp1, on=["user_id", "merchant_id"], how="left")
# Build the one_clicks, shopping_carts, purchase_times and favourite_times features
one_clicks_temp = user_log.groupby(["user_id", "seller_id", "action_type"]).agg(
func.count('item_id').alias('times'))
one_clicks_temp = one_clicks_temp.withColumnRenamed('seller_id', 'merchant_id')
def udf_func1(times, action_type, action_type_value):
    if action_type == action_type_value:
        return times
    else:
        return 0
# Here an rdd map is used instead, which is also a chance to get familiar with rdds; personally I find this map easier to remember
one_clicks_temp_rdd = one_clicks_temp.rdd.map(lambda x: (
x['user_id'], x['merchant_id'], udf_func1(x['times'], x['action_type'], 0),
udf_func1(x['times'], x['action_type'], 1), udf_func1(x['times'], x['action_type'], 2),
udf_func1(x['times'], x['action_type'], 3)))
one_clicks_temp = spark.createDataFrame(one_clicks_temp_rdd, ['user_id', 'merchant_id','one_clicks','shopping_carts','purchase_times','favourite_times'])
# Alternatively, handle it with udf + withColumn as before
# udf1 = udf(udf_func1, IntegerType())
# one_clicks_temp = one_clicks_temp.withColumn('action_type_value0', func.lit(0))
# one_clicks_temp = one_clicks_temp.withColumn('action_type_value1', func.lit(1))
# one_clicks_temp = one_clicks_temp.withColumn('action_type_value2', func.lit(2))
# one_clicks_temp = one_clicks_temp.withColumn('action_type_value3', func.lit(3))
#
# one_clicks_temp = one_clicks_temp.withColumn('one_clicks',udf1(col('times'), col('action_type'), col('action_type_value0')))
# one_clicks_temp = one_clicks_temp.withColumn('shopping_carts',udf1(col('times'), col('action_type'), col('action_type_value1')))
# one_clicks_temp = one_clicks_temp.withColumn('purchase_times',udf1(col('times'), col('action_type'), col('action_type_value2')))
# one_clicks_temp = one_clicks_temp.withColumn('favourite_times',udf1(col('times'), col('action_type'), col('action_type_value3')))
four_features = one_clicks_temp.groupby(["user_id", "merchant_id"]).agg(
func.sum('one_clicks').alias('one_clicks'), func.sum('shopping_carts').alias('shopping_carts'),
func.sum('purchase_times').alias('purchase_times'), func.sum('favourite_times').alias('favourite_times'))
df_train = df_train.join(four_features, on=["user_id", "merchant_id"], how="left")
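The same four counters can also be produced in one pass, without the rdd round-trip or a UDF, by pivoting on action_type; a sketch:
# Pivot sketch: the four action codes become four columns; missing combinations are
# null after the pivot, so fill them with 0 and rename to the feature names above.
# four_features = (user_log.groupby('user_id', 'seller_id')
#                  .pivot('action_type', [0, 1, 2, 3])
#                  .agg(func.count('item_id'))
#                  .fillna(0)
#                  .withColumnRenamed('seller_id', 'merchant_id')
#                  .withColumnRenamed('0', 'one_clicks')
#                  .withColumnRenamed('1', 'shopping_carts')
#                  .withColumnRenamed('2', 'purchase_times')
#                  .withColumnRenamed('3', 'favourite_times'))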
df_train = df_train.toPandas()  # a pyspark.sql DataFrame uses toPandas(); to_pandas() only exists on the pandas-on-Spark DataFrame
There is not much to learn here, because these are all simple transformations and the data volume is not large: you could just write the SQL directly in Hive/MySQL, and Spark SQL may even have some small syntax gaps by comparison. This approach suits specific scenarios that call for quick development.
For example: you need to run the same piece of SQL many times but adjust the statement dynamically on each run, or you need custom udf functions used together with rdd operators.
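For completeness, a minimal sketch of that workflow, reusing the user_log DataFrame from above: register it as a temp view, register a Python udf, and build the SQL string dynamically before each run.
# Spark SQL sketch: temp view + registered udf + dynamically built SQL.
user_log.createOrReplaceTempView('user_log')
# the registered udf can then be called inside the SQL text, e.g. replace_zero(age_range)
spark.udf.register('replace_zero', lambda x: -1 if x == 0 else x, IntegerType())
action_type = 2  # adjusted between runs
purchase_counts = spark.sql(f"""
    SELECT user_id, seller_id AS merchant_id, COUNT(item_id) AS times
    FROM user_log
    WHERE action_type = {action_type}
    GROUP BY user_id, seller_id
""")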
I did not go further with the modeling; for a quick test, plain LightGBM is enough. The focus of this article is how to use the Spark compute framework to run feature engineering fast, as close to real enterprise practice as possible. When I am faced with billions or tens of billions of rows, I do not want to be helpless.
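For reference, a minimal quick-test sketch with LightGBM on the pandas DataFrame produced above; it assumes the label column from train_format1 is named 'label' and is not tuned at all.
# Quick-test sketch (untuned): train LightGBM on the engineered features.
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

X = df_train.drop(columns=['user_id', 'merchant_id', 'label'])  # keep only the engineered features
y = df_train['label']
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model = lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05)
model.fit(X_tr, y_tr)
print(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))  # LightGBM handles the remaining NaNs natively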
I built the Spark environment in virtual machines, and I assumed that even on VMs a parallel Spark job would surely beat pandas running locally on Windows; in fact pandas turned out to be about 3x faster than that Spark setup. On a real cluster, though, Spark is genuinely fast.
Next I plan to install PySpark locally on Windows. Polars and Alibaba Cloud's Mars are also said to perform well, but Spark still carries more weight, and most cloud data platforms support it, for example Alibaba Cloud MaxCompute.
When getting started with Spark, I suggest learning the theory before diving into practice. If you ignore how it actually executes, it is easy to fall into traps: constant errors are only a minor problem, but a mistake that raises no error and goes unnoticed can cause serious damage, for example corrupting hundreds of millions of rows of production data in one go.