Python 淘宝购物数据分析(2)

淘宝大数据比赛,让我重新提升了自己的思维方式。

插件和代码前前后后写了六个版本,但结果最好的其实是我的第一版和第二版代码,这让我很惊讶。

但这也说明了一个问题:当你对编程语言已经比较熟悉时,你缺少的往往是其他方面的知识:

  1. 首先是数学知识。在分析用户行为时,我们知道浏览次数和购买次数之间是有一定规律的。这方面我找数学系的同学请教了一下,得到的结论是:可以进行线性拟合,这是最简单的做法,但得到的结果不一定准确,所以他们推荐我使用高斯分布来做。不过由于自己是单枪匹马,最后还是选了比较简单的线性拟合。
  2. 其次是心理学。我们能够从数据中发现:那些经常在淘宝买东西的人,如果每隔一段时间就购买同一种商品,说明这个人属于“死宅”一类,因为这些东西一般在我们身边就能买到。还有浏览次数和购买之间的关系、多天浏览和购买之间的关系,都可以用数学来解答,用心理学来分析。
先意淫这些吧,下面贴上三个版本的代码:
第一版,简单推测浏览十五次会购买一次:
import time

from itertools import groupby

# Input file: one "user_id,brand_id,type,visit_time" row per line.
INPUT_PATH = 't_alibaba_data.csv'
# Output file: one "user_id   brand,brand,..." line per user, appended.
OUTPUT_PATH = 'taobao.txt'
# A brand is predicted once it has at least this many type-0 (view) rows ...
VIEW_THRESHOLD = 15
# ... or at least this many type-1 (purchase) rows.
PURCHASE_THRESHOLD = 1


def parse_record(line):
    """Parse one ``user_id,brand_id,type,visit_time`` row.

    Returns a ``(user_id, brand_id, action_type, visit_time)`` tuple in which
    the ids and the time stay strings and the action type is an ``int``.
    Returns ``None`` for rows that do not have exactly four fields or whose
    type is not numeric (blank lines, a header row) so callers can skip them.

    Splitting on commas replaces the original fixed-offset slicing, which
    only worked for single-digit types and lines ending in exactly two
    characters (CRLF).
    """
    fields = line.strip().split(',')
    if len(fields) != 4 or not fields[2].isdigit():
        return None
    user_id, brand_id, action, visit_time = fields
    return user_id, brand_id, int(action), visit_time


def count_brand_actions(records):
    """Yield ``(user_id, brand_id, counts)`` for each user/brand run.

    A run is a maximal stretch of adjacent records sharing user and brand,
    which mirrors the original row-by-row comparison (the input is presumably
    sorted by user and brand - verify against the data file).  ``counts`` is
    ``[type0, type1, type2, other]``; every row of the run is counted,
    including the last one, and the counters start from zero for every run.
    """
    for (user_id, brand_id), run in groupby(records, key=lambda rec: rec[:2]):
        counts = [0, 0, 0, 0]
        for record in run:
            # Types 0, 1 and 2 get their own slot; anything else shares slot 3.
            counts[min(record[2], 3)] += 1
        yield user_id, brand_id, counts


def predict_purchases(records, verbose=False):
    """Return ``[(user_id, [brand_id, ...]), ...]`` in input order.

    A brand is selected when it was viewed at least ``VIEW_THRESHOLD`` times
    or purchased at least ``PURCHASE_THRESHOLD`` times.  Every user gets an
    entry, even when no brand qualifies.  With ``verbose`` set, the per-brand
    counters are printed as a progress trace.
    """
    predictions = []
    for user_id, brand_id, counts in count_brand_actions(records):
        if verbose:
            print('%s %s %d %d %d %d' % ((user_id, brand_id) + tuple(counts)))
        if not predictions or predictions[-1][0] != user_id:
            predictions.append((user_id, []))
        if counts[0] >= VIEW_THRESHOLD or counts[1] >= PURCHASE_THRESHOLD:
            predictions[-1][1].append(brand_id)
    return predictions


def format_prediction(user_id, brand_ids):
    """Render one output line: user id, three spaces, then ``brand,`` items."""
    return user_id + '   ' + ''.join(brand + ',' for brand in brand_ids) + '\n'


def main():
    """Read the click log, apply the view/purchase rule, append the result."""
    with open(INPUT_PATH, 'r') as source:
        records = [rec for rec in map(parse_record, source) if rec is not None]
    predictions = predict_purchases(records, verbose=True)
    # Append mode is kept from the original script.
    with open(OUTPUT_PATH, 'a') as output:
        output.writelines(
            format_prediction(user_id, brand_ids)
            for user_id, brand_ids in predictions
        )


if __name__ == '__main__':
    main()
    



第二版,观察时间和购买行为:
#coding:utf-8
import time

from itertools import groupby

# Input file: one "user_id,brand_id,type,visit_time" row per line.
INPUT_PATH = 't_alibaba_data.csv'
# Output file: one "user_id   brand,brand,..." line per user, appended.
OUTPUT_PATH = 'taobao.txt'
# Number of type-0 (view) rows used by the view-based rules.
VIEW_THRESHOLD = 15
# Number of visit-time changes that marks a brand as viewed on several days.
TIME_CHANGE_THRESHOLD = 2


def parse_record(line):
    """Parse one ``user_id,brand_id,type,visit_time`` row.

    Returns a ``(user_id, brand_id, action_type, visit_time)`` tuple in which
    the ids and the time stay strings and the action type is an ``int``.
    Returns ``None`` for rows that do not have exactly four fields or whose
    type is not numeric (blank lines, a header row) so callers can skip them.

    Splitting on commas replaces the original fixed-offset slicing, which
    only worked for single-digit types and lines ending in exactly two
    characters (CRLF).
    """
    fields = line.strip().split(',')
    if len(fields) != 4 or not fields[2].isdigit():
        return None
    user_id, brand_id, action, visit_time = fields
    return user_id, brand_id, int(action), visit_time


def summarize_brands(records):
    """Yield ``(user_id, brand_id, counts, time_changes)`` per user/brand run.

    A run is a maximal stretch of adjacent records sharing user and brand
    (the input is presumably sorted by user and brand - verify against the
    data file).  ``counts`` is ``[type0, type1, type2, other]`` over every row
    of the run.  ``time_changes`` is the number of adjacent row pairs inside
    the run whose visit time differs; it is evaluated for every row, not only
    for rows whose type is outside 0-2 as the original ``elif`` chain did.
    """
    for (user_id, brand_id), run in groupby(records, key=lambda rec: rec[:2]):
        counts = [0, 0, 0, 0]
        time_changes = 0
        previous_time = None
        for record in run:
            # Types 0, 1 and 2 get their own slot; anything else shares slot 3.
            counts[min(record[2], 3)] += 1
            if previous_time is not None and record[3] != previous_time:
                time_changes += 1
            previous_time = record[3]
        yield user_id, brand_id, counts, time_changes


def is_candidate(counts, time_changes):
    """Tell whether a brand should be predicted for its user.

    Without any purchase, a brand qualifies when it was viewed at least
    ``VIEW_THRESHOLD`` times, viewed across at least
    ``TIME_CHANGE_THRESHOLD`` visit-time changes, or has a type-2 row.
    With purchases, it qualifies only when it was viewed more than
    ``VIEW_THRESHOLD`` times and purchased at least twice.
    """
    views, purchases, type2 = counts[0], counts[1], counts[2]
    if purchases == 0:
        return (
            views >= VIEW_THRESHOLD
            or time_changes >= TIME_CHANGE_THRESHOLD
            or type2 >= 1
        )
    return views > VIEW_THRESHOLD and purchases >= 2


def predict_purchases(records, verbose=False):
    """Return ``[(user_id, [brand_id, ...]), ...]`` in input order.

    Each qualifying brand is listed once per run, even when several rules
    match.  Every user gets an entry, even when no brand qualifies.  With
    ``verbose`` set, the per-brand counters are printed as a progress trace.

    NOTE(review): the original also tested ``len(b_id) <= 3``, i.e. the row
    count of the whole data set, which is practically never true.  It
    presumably meant "the user touched at most three brands"; that rule is
    not implemented here because enabling it would change the predictions -
    confirm the intent before adding it.
    """
    predictions = []
    for user_id, brand_id, counts, time_changes in summarize_brands(records):
        if verbose:
            print('%s %s %d %d %d %d %d'
                  % ((user_id, brand_id) + tuple(counts) + (time_changes,)))
        if not predictions or predictions[-1][0] != user_id:
            predictions.append((user_id, []))
        if is_candidate(counts, time_changes):
            predictions[-1][1].append(brand_id)
    return predictions


def format_prediction(user_id, brand_ids):
    """Render one output line: user id, three spaces, then ``brand,`` items."""
    return user_id + '   ' + ''.join(brand + ',' for brand in brand_ids) + '\n'


def main():
    """Read the click log, apply the selection rules, append the result."""
    with open(INPUT_PATH, 'r') as source:
        records = [rec for rec in map(parse_record, source) if rec is not None]
    predictions = predict_purchases(records, verbose=True)
    # Append mode is kept from the original script.
    with open(OUTPUT_PATH, 'a') as output:
        output.writelines(
            format_prediction(user_id, brand_ids)
            for user_id, brand_ids in predictions
        )


if __name__ == '__main__':
    main()



第三版,使用数学分析:
#coding:utf-8
import time
import numpy as np
from scipy import optimize
from math import sqrt

from itertools import groupby

# Input file: one "user_id,brand_id,type,visit_time" row per line.
INPUT_PATH = 't_alibaba_data.csv'


def parse_record(line):
    """Parse one ``user_id,brand_id,type,visit_time`` row.

    Returns a ``(user_id, brand_id, action_type, visit_time)`` tuple in which
    the ids and the time stay strings and the action type is an ``int``.
    Returns ``None`` for rows that do not have exactly four fields or whose
    type is not numeric (blank lines, a header row) so callers can skip them.

    Splitting on commas replaces the original fixed-offset slicing, which
    only worked for single-digit types and lines ending in exactly two
    characters (CRLF).
    """
    fields = line.strip().split(',')
    if len(fields) != 4 or not fields[2].isdigit():
        return None
    user_id, brand_id, action, visit_time = fields
    return user_id, brand_id, int(action), visit_time


def collect_brand_stats(records):
    """Return five parallel lists with one entry per user/brand run.

    The lists are, in order: type-0 (view) counts, type-1 (purchase) counts,
    type-2 (favourite) counts, counts of every other type (cart), and the
    number of adjacent row pairs inside the run whose visit time differs.

    A run is a maximal stretch of adjacent records sharing user and brand
    (the input is presumably sorted by user and brand - verify against the
    data file).  Every row of every run is counted and every run is reported,
    including the last run of each user, with counters starting from zero.
    """
    columns = ([], [], [], [], [])
    for _, run in groupby(records, key=lambda rec: rec[:2]):
        counts = [0, 0, 0, 0]
        time_changes = 0
        previous_time = None
        for record in run:
            # Types 0, 1 and 2 get their own slot; anything else shares slot 3.
            counts[min(record[2], 3)] += 1
            if previous_time is not None and record[3] != previous_time:
                time_changes += 1
            previous_time = record[3]
        for column, value in zip(columns, counts + [time_changes]):
            column.append(value)
    return columns


def fit_line(x, y):
    """Least-squares fit of ``y ~ k * x + b``; return ``(k, b)``.

    Uses ``scipy.optimize.leastsq`` starting from ``k=1, b=0``.  ``b`` is the
    ordinary intercept (the original script fitted ``k * x - b`` and therefore
    printed the intercept with its sign flipped).

    Raises:
        ValueError: if ``x`` and ``y`` differ in length or hold fewer than
            two points, which is too few to determine two parameters.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size != y.size:
        raise ValueError('x and y must have the same length')
    if x.size < 2:
        raise ValueError('need at least two points to fit a line')

    def residuals(params):
        k, b = params
        return y - (k * x + b)

    solution = optimize.leastsq(residuals, [1.0, 0.0])
    k, b = solution[0]
    return k, b


def main():
    """Fit view counts against each of the other per-brand statistics."""
    with open(INPUT_PATH, 'r') as source:
        records = [rec for rec in map(parse_record, source) if rec is not None]
    views, purchases, favourites, carts, time_changes = \
        collect_brand_stats(records)
    if len(views) < 2:
        print('not enough user/brand groups to fit a line')
        return

    # NOTE(review): all four fits use the view counts as the dependent
    # variable, exactly as the original did, although three of the original
    # section headings spoke of "purchases" - confirm which was intended.

    # Views as a function of purchases.
    k, b = fit_line(purchases, views)
    print('K= %s b= %s' % (k, b))

    # Views as a function of favourites.
    k22, b22 = fit_line(favourites, views)
    print('Kt_num22= %s b22= %s' % (k22, b22))

    # Views as a function of cart additions.
    k33, b33 = fit_line(carts, views)
    print('kt_num33= %s b33= %s' % (k33, b33))

    # Views as a function of visit-time changes.
    k44, b44 = fit_line(time_changes, views)
    print('ktime= %s b44= %s' % (k44, b44))


if __name__ == '__main__':
    main()
#def sim_pearson()



你可能感兴趣的:(python)