数据下载地址:https://www.kaggle.com/c/ga-customer-revenue-prediction
1. 定义问题
根据渠道、日期、媒体信息、国家等信息预测顾客销售额综合的对数函数。
y u s e r = ∑ i = 1 n t r a n s a c t i o n y_{user} = \sum_{i=1}^{n}transaction yuser=i=1∑ntransaction
t a r g e t = l n ( y u s e r + 1 ) target = ln(y_{user} +1) target=ln(yuser+1)
2. 数据准备
import pandas as pd
from pandas.io.json import json_normalize
import json
json_col = ['device','geoNetwork','totals','trafficSource']
data = pd.read_csv('E:\\study\\data\\GA customer revenue\\train.csv',sep=',',header=0,\
converters={column:json.loads for column in json_col})
#json转成多列,json_normalize对象是dict,json_loads对象是字符串
for col in json_col:
data_col=json_normalize(data[col])
data_col.columns = [f"{sub_col}" for sub_col in data_col.columns]
data = data.drop(col,axis=1).merge(data_col,right_index=True,left_index=True)
# 处理时间 strp对象是字符串 所以不采用这个方法选择年月了
data['date'] = data['date'].astype(str)
data['date'] = data['date'].apply(lambda x:x[:4]+"-"+x[4:6]+"-"+x[6:8])
data['date'] = pd.to_datetime(data['date']) #时间戳
data['month'] = data['date'].apply(lambda x:x.strftime('%Y-%m'))
data['week'] = data['date'].dt.weekday
data['transactionRevenue']=data['transactionRevenue'].astype(float).fillna(0)
for r in data.columns:
a = data[r].value_counts()
if len(a)<2:
print(r)
print(a)
print('----')
# 值是常数的列去掉,剩38列
data = data.drop(['socialEngagementType','browserSize','browserVersion','flashVersion',\
'language','mobileDeviceBranding','mobileDeviceInfo','mobileDeviceMarketingName',\
'mobileInputSelector','operatingSystemVersion','screenResolution','screenColors','cityId',\
'latitude','longitude','networkLocation','visits','adwordsClickInfo.criteriaParameters','campaignCode'],axis=1)
# 缺失值个数
for r in data.columns:
a = len(data[r][pd.isnull(data[r])])/len(data)
if a>0.8:
print(r)
print(a)
print('----')
# 缺失大于90%的字段删掉,剩31列 不要把revenue删掉了
data = data.drop(['adContent','adwordsClickInfo.adNetworkType','adwordsClickInfo.gclId','adwordsClickInfo.isVideoAd','adwordsClickInfo.page','adwordsClickInfo.slot'],axis=1)
3. 探索数据
x_val = []
for r in data.columns:
a = data[r].value_counts()
if len(a)<15:
x_val.append(r)
print(a)
访问量统计结果如下:
访问量主要渠道来源是Organic Search, Social, Direct和Referral。桌面设备具有绝对优势,非移动端访问量明显大于移动端。访问量主要来源地区是美国、亚洲及欧洲。媒体中,organic及referral访问量大。一周中,周五访问量最小。
for x in x_val:
b = data['transactionRevenue'].groupby(data[x]).sum()
print(b)
data['transactionRevenue'].groupby(data['medium']).sum().plot('bar')
收入统计结果如下:
收入主要来源渠道是Direct, Organic Search和Referral,Social带来的访问量大但收入一般。桌面设备带来的收入明显大于移动设备。收入主要来源地区是美国、亚洲。媒体中,organic及referral带来的收入较多。一周中,周五及周六收入较少。
4. 建立模型
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
for x in x_val[:-1]:#week本身就是整数
data[x+'_code']=label.fit_transform(data[x].values.astype(str))
x_val_code=[]
for x in x_val[:-1]:
x_val_code.append(str(x+"_code"))
data_ = pd.concat([data['week'],data[x_val_code]],axis=1)
# 建立模型 考虑sgd随机梯度回归
from sklearn.linear_model import SGDRegressor
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
import numpy as np
target = np.log1p(data['transactionRevenue'])
x_train,x_test,y_train,y_test = train_test_split(data_,target,test_size=0.3)
dtree = SGDRegressor()
dtree.fit(data_,target)