淘宝购买预测——logistic回归,决策树,随机森林

数据是用户在商品全集上的移动端行为数据(D),表名为tianchi_fresh_comp_train_user_2w,包含如下字段:
user_id : 用户标识
item_id : 商品标识
behavior_type : 用户对商品的行为类型,包括浏览、收藏、加购物车、购买,对应取值分别是1、2、3、4
user_geohash : 用户位置的空间标识,可以为空,由经纬度通过保密的算法生成
item_category : 商品分类标识
time : 行为时间,精确到小时级别

1、预测效果并不好,可能是因为特征只使用了前一天四种行为的次数;如果使用前一周或更多天的数据,并按时间远近加权(离得越近权重越高,越远权重越低),效果应该会更好。
2、用户位置(user_geohash)和商品分类(item_category)信息暂时还不知道该如何利用。

# -*-coding:utf-8 -*-
__author__ = 'Bai'
import datetime
import math
import os

import numpy as np
os.chdir('C:/Bai/taobao/fresh_comp_offline')
# Read every line of the behavior log into memory. The with-block closes the
# file handle as soon as the lines are loaded (the handle used to stay open
# for the whole run).
with open('tianchi_fresh_comp_train_user.csv') as f:
    context = f.readlines()

## feature
# u_dic[k] maps (user_id, item_id, day) -> number of times behavior k occurred,
# where k = behavior_type - 1 (0 browse, 1 favorite, 2 add-to-cart, 3 purchase).
# The key's day is the calendar day AFTER the logged one, encoded as the int
# MMDD, so looking up a key for day D yields the behavior counts of day D-1.
u_dic = [{} for _ in range(4)]
for line in context:
    row = line.rstrip('\r\n')
    if not row:
        continue  # tolerate blank lines instead of failing on the time parse
    fields = row.split(',')
    if fields[0] == 'user_id':
        continue  # header row
    # The last column is split on both '-' and ' '; pieces [1] and [2] are the
    # month and day. Piece [0] is assumed to be the four-digit year
    # (time column shaped like "YYYY-MM-DD HH") -- TODO confirm.
    stamp = fields[-1].replace(' ', '-').split('-')
    logged = datetime.date(int(stamp[0]), int(stamp[1]), int(stamp[2]))
    # Real date arithmetic rolls over every month end, not only Nov 30.
    following = logged + datetime.timedelta(days=1)
    day = following.month * 100 + following.day
    uid = (fields[0], fields[1], day)    # user_id, item_id, day
    behavior = int(fields[2]) - 1
    counts = u_dic[behavior]
    counts[uid] = counts.get(uid, 0) + 1

## label
# u_buy maps (user_id, item_id, day) -> 1 if the user purchased the item on
# that day (behavior_type '4'), else 0. Every key that has any logged behavior
# on that day gets an entry. u_buy_18 holds the same labels restricted to
# day 1218 and is used by the evaluation at the end of the script.
u_buy = {}
u_buy_18 = {}
for line in context:
    row = line.rstrip('\r\n')
    if not row:
        continue  # tolerate blank lines
    fields = row.split(',')
    if fields[0] == 'user_id':
        continue  # header row
    stamp = fields[-1].replace(' ', '-').split('-')
    day = int(stamp[1] + stamp[2])       # MMDD as an int, e.g. 1218
    uid = (fields[0], fields[1], day)
    bought = 1 if fields[2] == '4' else 0
    # Keep the label at 1 once a purchase has been seen. Plain assignment let
    # a later browse/favorite/cart row for the same key reset it to 0, which
    # made the label depend on the order of rows in the file.
    u_buy[uid] = max(u_buy.get(uid, 0), bought)

    if day == 1218:
        u_buy_18[uid] = max(u_buy_18.get(uid, 0), bought)

## get train X, Y
# One training sample per key in u_buy: the feature row is the four behavior
# counts stored under the same key in u_dic[0..3] (0 when the key is absent),
# and the target is the key's purchase label.
# NOTE(review): keys for day 1218 are included here and are also the samples
# scored in the evaluation section further down, so that evaluation is run on
# data the model was trained on.
X = []
Y = []
for key, label in u_buy.items():
    X.append([u_dic[k].get(key, 0) for k in range(4)])
    Y.append(label)

## training
# Exactly one classifier is active at a time; to try another one, comment out
# the active `model = ...` pair and uncomment one of the alternatives below.

## logistic regression (active)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

## decision tree (alternative)
#from sklearn.tree import DecisionTreeClassifier
#model = DecisionTreeClassifier(max_depth=4)

## random forest (alternative)
#from sklearn.ensemble import RandomForestClassifier
#model = RandomForestClassifier()

## SVM (alternative; only the import was sketched, no model is constructed)
#from sklearn import svm

# Fit the selected classifier on the behavior-count rows X and labels Y.
model.fit(X, Y)

## Evaluate the model: behavior counts of the 17th -> purchases on the 18th.
# Y18 maps every (user_id, item_id, 1218) key in u_buy to the predicted label.
eval_keys = [u for u in u_buy if u[2] == 1218]
# Feature row per key: the four behavior counts stored under that key
# (0 when a behavior was never logged for it).
eval_rows = [[u_dic[k].get(u, 0) for k in range(4)] for u in eval_keys]

Y18 = {}
if eval_rows:
    # predict() takes a 2-D (n_samples, n_features) input; a bare 1-D sample
    # is rejected by newer scikit-learn releases. One batched call also avoids
    # a separate predict() call per key.
    predicted = model.predict(eval_rows)
    for u, label in zip(eval_keys, predicted):
        Y18[u] = int(label)
# To get probabilities instead, use model.predict_proba(eval_rows); its
# columns follow model.classes_, i.e. [[p(0), p(1)]] for labels 0 and 1.

# a: number of keys predicted as purchases
# b: predicted purchases that are real purchases on day 1218 (true positives)
# c: number of real purchases on day 1218
a = 0
b = 0
c = 0
for u in Y18:
    if Y18[u] == 1:
        a += 1
        if u_buy_18.get(u) == 1:
            b += 1
for u in u_buy_18:
    if u_buy_18[u] == 1:
        c += 1
a = float(a)
b = float(b)
c = float(c)
# Formatted explicitly so the output is the same under Python 2 and Python 3.
print('%s %s %s' % (a, b, c))

# Each ratio falls back to 0.0 when its denominator is zero (no predicted
# positives, no actual positives, or both metrics zero) instead of raising
# ZeroDivisionError.
precision = b / a if a else 0.0
recall = b / c if c else 0.0
if precision + recall:
    F1 = 2 * precision * recall / (precision + recall)
else:
    F1 = 0.0
print(F1)

你可能感兴趣的:(数据挖掘)