Vegetable Market Price Analysis: Python, pandas, the Apriori Algorithm, and Data Preprocessing

1. Install the Python packages

   1. numpy
   2. pandas

2. Data preprocessing

   1. The spreadsheet has an odd layout, so the vegetables and the meat/poultry/eggs are processed separately and then merged (a toy sketch of this reshaping follows this outline)
   2. Columns with missing values are simply dropped

3. Encoding and analysis

# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd

# Vegetable prices: drop the meat/poultry/egg columns, remove duplicate and
# incomplete rows, drop the category header rows ("蔬菜类"), and pivot to one
# column per vegetable indexed by date.
data = pd.read_excel("data.xls", encoding='utf-8')  # .astype('float')  # note: newer pandas versions may not accept the encoding argument here
data.drop(["肉食禽蛋", "批发价格"], axis=1, inplace=True)
data = data.drop_duplicates().dropna()
data = data.dropna(axis=1)
index2keep = data["蔬菜名"] != "蔬菜类"
data = data[index2keep]
data = data.pivot(index='日期', columns='蔬菜名', values='价格')

# Meat/poultry/egg prices: the same treatment on the other pair of columns.
data2 = pd.read_excel("data.xls")  # .astype('float')
data2.drop(["蔬菜名", "价格"], axis=1, inplace=True)
data2 = data2.drop_duplicates().dropna()
index2keep = data2["肉食禽蛋"] != "肉食禽蛋类"
data2 = data2[index2keep]
data2 = data2.pivot(index='日期', columns='肉食禽蛋', values='批发价格')
data2.head(6)

# Merge the two wide tables on the date index, drop two very sparse columns,
# and drop any column with fewer than 10 non-missing values.
data = pd.merge(data, data2, on="日期")
data.head(10)
data.drop(["冻芋头", "北瓜"], axis=1, inplace=True)
data.dropna(thresh=10, axis=1, inplace=True)
data.head()
data.dtypes
# data.to_csv("data1.csv",encoding='utf-8')
#
# # help(pd.concat)
# data=pd.concat([data,data2],keys="日期")
# data.head(100)
# data = pd.merge(data,data2,left_on="日期")


# # Open data1.csv in Excel
# ## Replace the garbled missing-value cells with empty cells (Ctrl + F)
# > If the file cannot be read back in, rename the .csv to .txt, open it in Notepad, save it as [UTF-8](https://jingyan.baidu.com/article/9faa7231ae80c2473c28cb85.html), and then rename it back.
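# > Alternatively, the same cleanup can be done in pandas instead of Excel. The
# > following is only a sketch: it assumes the garbled missing-value cells are
# > non-numeric strings, and "data1_clean.csv" is a hypothetical output name.

raw = pd.read_csv("data1.csv", index_col="日期", encoding='utf-8')  # try encoding='gbk' if utf-8 fails (assumption about the export encoding)
cleaned = raw.apply(pd.to_numeric, errors='coerce')  # non-numeric (garbled) cells become NaN
cleaned.to_csv("data1_clean.csv", encoding='utf-8')  # hypothetical file name, not used by the cells below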

# In[2]:


columns = data.columns
print(columns)
# columns=columns.insert(0,"date")
print(columns.shape)
data2=pd.read_csv("data1.csv",index_col ="日期",encoding = 'utf-8').astype('float')
data2.columns = columns
data2.describe()


# In[19]:


temp = data2.isnull().any()  # whether each column contains any missing value
dataDropped = data2.dropna(axis=1)


# ## There are 61 kinds of goods in all, and 38 of them have complete records.
# ## So we drop the columns that contain NaN values.
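
# A quick sanity check of these counts (the exact numbers depend on the source
# file; 61 and 38 are the values observed for this dataset):
print(data2.shape[1])               # total number of goods (columns)
print(data2.notnull().all().sum())  # goods with no missing values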

# In[20]:


dataDroppedNameList = dataDropped.columns


# In[21]:


dataDroppedNameList


# # Diff and encode
# 1. Take the day-over-day difference of the price DataFrame.
# 2. Encode an increase in column c as c*2 and a decrease as c*2+1, where c is the column index.
# 
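
# A toy illustration of this encoding (made-up prices). For the column with
# index c == 3, an increase is coded 6 (= 3*2) and a decrease 7 (= 3*2+1):
toy = pd.Series([10.0, 12.0, 11.5])
c = 3
toyDiff = toy.diff()
toyCodes = toyDiff.copy()
toyCodes[toyDiff > 0] = c * 2      # increase -> 6
toyCodes[toyDiff < 0] = c * 2 + 1  # decrease -> 7
print(toyCodes.tolist())           # [nan, 6.0, 7.0]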

# In[22]:


dataDiff = dataDropped.diff()


# In[23]:


dataDiff.head(10)


# In[24]:


# Encode: an increase in column i becomes code i*2, a decrease becomes i*2+1;
# unchanged days keep the 0 from the diff and are filtered out later (note that
# for column index 0 the increase code is also 0, so the zero filter drops it too).
dataEncode = dataDiff.copy()
for i in range(len(dataDroppedNameList)):
    col = dataDroppedNameList[i]
    increasingIndex = dataEncode[col] > 0
    decreasingIndex = dataEncode[col] < 0
    dataEncode.loc[increasingIndex, col] = float(i * 2)
    dataEncode.loc[decreasingIndex, col] = float(i * 2 + 1)


# In[42]:


# Drop the first date (its diff row is all NaN) and cast the codes to int.
dataEncode.drop(['2008/9/29'], inplace=True)
dataEncode = dataEncode.astype('int')


# In[33]:


# Transpose so that each column of Encode.xlsx is one date (one transaction).
dataEncode = dataEncode.T
dataEncode.to_excel("Encode.xlsx")


# In[43]:


print(dataEncode)


# In[60]:


# list = [1,2,3,0,2,0,0,303]


import numpy as np
import pandas as pd

def zerofilter(listArr):
    # drop the zeros (days with no price change) from one transaction
    ans = []
    for i in listArr:
        if i != 0:
            ans.append(i)
    return ans

def loadDataSet():
    # read the encoded table back in; each column (one date) becomes one transaction
    data = pd.read_excel("Encode.xlsx", index_col=0)  # the first column holds the good names, not codes
    transactions = []
    for i in data.columns:
        dataRow = list(data[i])
        dataRow = zerofilter(dataRow)
        transactions.append(dataRow)
    # print(transactions)
    return transactions  # e.g. [[1,2,5],[2,4],[2,3],[1,2,4],[1,3],[2,3],[1,3],[1,2,3,5],[1,2,3]]
# 1. Build the candidate 1-item sets C1
def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])

    C1.sort()
    return list(map(frozenset, C1))

# Filter the candidate sets Ck down to the frequent itemsets Lk
# D: the raw dataset (a list of transactions)
# Ck: the candidate k-item sets
# minSupport: the minimum support threshold
def scanD(D, Ck, minSupport):
    # count, for each candidate, how many transactions contain it
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                if can not in ssCnt.keys(): ssCnt[can] = 1
                else: ssCnt[can] += 1

    numItems = float(len(D))
    Lk = []             # frequent itemsets built from the candidates in Ck
    supportData = {}    # support value of every candidate in Ck
    # compute each candidate's support; supportData maps candidate -> support
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            Lk.append(key)
        supportData[key] = support
    return Lk, supportData

# Join step: merge the frequent (k-1)-item sets in Lk-1 to build the candidate k-item sets Ck
def aprioriGen(Lk_1, k):
    Ck = []
    lenLk = len(Lk_1)
    for i in range(lenLk):
        L1 = list(Lk_1[i])[:k - 2]
        L1.sort()
        for j in range(i + 1, lenLk):
            # when the first k-2 items are identical, merge the two sets
            L2 = list(Lk_1[j])[:k - 2]
            L2.sort()
            if L1 == L2:
                Ck.append(Lk_1[i] | Lk_1[j])

    return Ck

def apriori(dataSet, minSupport = 0.5):
    C1 = createC1(dataSet)
    L1, supportData = scanD(dataSet, C1, minSupport)
    L = [L1]
    k = 2
    while (len(L[k-2]) > 0):
        Lk_1 = L[k-2]
        Ck = aprioriGen(Lk_1, k)
#         print("ck:",Ck)
        Lk, supK = scanD(dataSet, Ck, minSupport)
        supportData.update(supK)
        print("lk:", Lk)
        L.append(Lk)
        k += 1

    return L, supportData

def generateRules(L, supportData, minConf=0.7):
    # L: list of frequent itemsets; supportData: their support values; minConf: minimum confidence threshold
    bigRuleList = []  # holds every association rule found
    for i in range(1, len(L)):  # start from L[1]: only itemsets with two or more items can yield rules,
        # since a single-item set has nothing to associate with
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            # for each frequent itemset, build the list H1 of single-item consequents
            if (i > 1):
                # itemsets with more than two items: try to merge consequents further
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:  # two-item sets: the consequent has exactly one item
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList

# Decode an item code back into "<good name>  up" / "<good name>  down"
def getinfo(num):
    good = dataDroppedNameList[int(num / 2)]
    if num % 2:   # odd code -> price decrease
        return good + "  down"
    else:         # even code -> price increase
        return good + "  up"

# Generate candidate rules: compute each rule's confidence and keep the rules
# that meet the minimum confidence threshold
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    prunedH = []  # consequents of the rules that satisfy the minimum confidence
    for conseq in H:  # iterate over every candidate consequent in H
        conf = supportData[freqSet] / supportData[freqSet - conseq]  # conf(A->B) = support(A∪B) / support(A)
        if conf >= minConf:
#             print (freqSet-conseq,'-->',conseq,'conf:',conf)
            cause = list(freqSet - conseq)
#             print(cause[0])
            result = list(conseq)
#             print(result[0])
#             print('-----')
            print(getinfo(cause[0]), "--->>>", getinfo(result[0]))
#             print(type(freqSet-conseq))
            # a rule that meets the minimum confidence is printed and recorded
            brl.append((freqSet - conseq, conseq, conf))  # brl is the bigRuleList passed in from generateRules
            prunedH.append(conseq)  # keep the consequent so it can be extended later
    return prunedH

# Try to build rules with larger consequents by merging single-item consequents
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    # freqSet: a frequent itemset; H: the items that may appear on the right-hand side of a rule
    m = len(H[0])
    if (len(freqSet) > (m + 1)):  # the itemset is large enough to hold a consequent of size m+1
        Hmp1 = aprioriGen(H, m + 1)  # merge consequents that share their first m-1 items
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)  # keep only the confident ones
        if (len(Hmp1) > 1):
            # more than one confident rule remains, so recurse to try even larger consequents
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)

# Run Apriori on the encoded transactions and print the discovered rules
dataset = loadDataSet()
L, supportData = apriori(dataset, minSupport=0.05)
print("------------------------")
# print(L)
print("----------generateRules-----------")
rules=generateRules(L,supportData,minConf=0.4)


# In[50]:


# Python program to understand frozenset() function 

# tuple of numbers 
nu = (1, 2, 3, 4, 5, 6, 7, 8, 9) 

# converting tuple to frozenset 
fnum = frozenset(nu) 

# printing details 
print("frozenset Object is : ", fnum) 
list(fnum)

 
