全国高校计算大赛 模拟赛 第三阶段 Python 版 题解

 标签推荐

#这回只有IDE下的,好气
import pandas as pd

class Task:
    """Tag-recommendation task driven by an FP-tree built from the training tags."""

    def func(self):
        """Build the FP-tree, recommend tags for every test user and write a CSV.

        Reads the train/test tag lists via ``loadSimpDat``, builds an FP-tree
        with a minimum support of 5, and for each test user collects the tags
        found in the conditional pattern bases of that user's tags.  The
        result is printed and written to ``src/step1/user_recommand.csv``
        (space separated, columns ``id`` and ``recommand_tags``).

        Returns:
            None.
        """
        tf_train_list, tf_test_list = loadSimpDat()
        initSet = createInitSet(tf_train_list)
        myFPtree, myHeaderTab = createFPtree(initSet, 5)  # minimum support

        # Candidate tags are the names of the root's direct children.
        # createFPtree returns (None, None) when nothing reaches the minimum
        # support; in that case no tag qualifies and every recommendation is
        # empty instead of crashing on ``None.children``.
        if myFPtree is None:
            tag = set()
        else:
            tag = {child.name for child in myFPtree.children.values()}

        recommend_tag = []
        for user_tags in tf_test_list:
            user_recommend_tag = []
            for user_tag in user_tags:
                if user_tag in tag:
                    # The keys of the conditional pattern base are frozensets
                    # of tags; their string form is scanned for "<...>" spans.
                    # NOTE(review): presumably every tag in the data is
                    # written as "<tag>" - confirm against the CSV files.
                    cond_keys = str(findPrefixPath(user_tag, myHeaderTab).keys())
                    user_recommend_tag.extend(
                        txt_wrap_by("<", ">", cond_keys, []))
                # Runs after every tag of the user (as in the original):
                # de-duplicate, then keep positions 1..14 of the resulting
                # list.  Set order is arbitrary, so the dropped first element
                # is arbitrary as well.
                user_recommend_tag = list(set(user_recommend_tag))[1:15:1]
            recommend_tag.append(','.join(str(t) for t in user_recommend_tag))

        print(recommend_tag)
        # One 1-based id per test row.  The count used to be hard-coded to 20,
        # which made the DataFrame constructor fail for any other row count.
        user = [str(x + 1) for x in range(len(recommend_tag))]
        df = pd.DataFrame({'id': user, 'recommand_tags': recommend_tag})
        df.to_csv('src/step1/user_recommand.csv', sep=' ', index=False)


class treeNode:
    """One node of the FP-tree."""

    def __init__(self, nameValue, numOccur, parentNode):
        # Item name stored in this node and its occurrence counter.
        self.name = nameValue
        self.count = numOccur
        # Parent node, used to walk back towards the root.
        self.parent = parentNode
        # Child nodes; other code in this file keys them by item name.
        self.children = {}
        # Link to the next node carrying the same item (None if there is none).
        self.nodeLink = None

    def inc(self, numOccur):
        """Add ``numOccur`` to this node's counter."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print this subtree for debugging, one node per line.

        Each nesting level is indented by two more spaces.  Returns a
        one-element list holding this node's name.
        """
        indent = '  ' * ind
        print(indent, self.name, ' ', self.count)
        for node in self.children.values():
            node.disp(ind + 1)
        return [self.name]
# To reach every occurrence of an item in the FP-tree quickly, each item has a
# header entry.  Besides pointing at the first node of that item, the entry
# also stores the item's total count in the data set.
def updateHeader(nodeToTest, targetNode):
    """Append ``targetNode`` to the end of a node-link chain.

    Follows ``nodeLink`` from ``nodeToTest`` until the last node of the chain
    and makes that node point at ``targetNode``.

    Args:
        nodeToTest: first node of the chain (must not be None).
        targetNode: node to attach at the end of the chain.
    """
    tail = nodeToTest
    # Identity test: ``!= None`` would dispatch to a user-defined __ne__.
    while tail.nodeLink is not None:
        tail = tail.nodeLink
    tail.nodeLink = targetNode
def updateFPtree(items, inTree, headerTable, count):
    """Insert one ordered transaction into the FP-tree.

    Walks down from ``inTree`` following ``items`` in order.  An existing
    child has its counter increased by ``count``; a missing child is created
    and appended to the node-link chain of its item in ``headerTable``.

    Implemented as a loop rather than recursion so that long transactions
    neither hit the recursion limit nor copy the item list at every level.
    An empty ``items`` sequence is a no-op.

    Args:
        items: item names of one transaction, already ordered.
        inTree: node under which the path is inserted.
        headerTable: mapping ``item -> [total_count, first_node_or_None]``.
        count: number of occurrences of this transaction.
    """
    node = inTree
    for item in items:
        if item in node.children:
            # The item is already a child of the current node: just count it.
            node.children[item].inc(count)
        else:
            # Start a new branch ...
            child = treeNode(item, count, node)
            node.children[item] = child
            # ... and append it to the linked list of nodes for this item.
            entry = headerTable[item]
            if entry[1] is None:
                entry[1] = child
            else:
                updateHeader(entry[1], child)
        node = node.children[item]

# Build the FP-tree; the default minimum support is 1.
def createFPtree(dataSet, minSup=1):
    """Build an FP-tree and its header table.

    This single definition replaces an earlier truncated duplicate that was
    shadowed by the later one and deleted dictionary keys while iterating.

    Args:
        dataSet: mapping ``transaction -> count`` where each transaction is
            an iterable of hashable items (``createInitSet`` produces
            frozensets).
        minSup: minimum total count an item needs in order to be kept.

    Returns:
        ``(root, headerTable)`` where ``headerTable`` maps each frequent item
        to ``[total_count, first_node]``, or ``(None, None)`` when no item
        reaches ``minSup``.
    """
    # First pass: total count of every item over all transactions.
    support = {}
    for trans, trans_count in dataSet.items():
        for item in trans:
            support[item] = support.get(item, 0) + trans_count

    # Keep only items meeting the minimum support; element: [count, node].
    headerTable = {
        item: [total, None]
        for item, total in support.items()
        if total >= minSup
    }
    if not headerTable:
        return None, None

    retTree = treeNode('Null Set', 1, None)
    # Second pass: insert every transaction, restricted to frequent items.
    for tranSet, count in dataSet.items():
        frequent = [item for item in tranSet if item in headerTable]
        if frequent:
            # Most frequent first.  Ties are broken on the item's string form
            # so that every transaction uses the same global order; relying
            # on set iteration order could order equally frequent items
            # differently from one transaction to the next.
            frequent.sort(key=lambda item: (-headerTable[item][0], str(item)))
            updateFPtree(frequent, retTree, headerTable, count)
    return retTree, headerTable

# Collect the path that ends at a given node (used for conditional pattern
# bases) by walking back towards the root.
def ascendFPtree(leafNode, prefixPath):
    """Append the names on the path from ``leafNode`` up to the root.

    Names are appended to ``prefixPath`` in place, starting with
    ``leafNode`` itself; the node whose ``parent`` is None (the root) is not
    included.  Uses a loop instead of recursion so that deep paths cannot
    exhaust the recursion limit.

    Args:
        leafNode: node to start from; needs ``name`` and ``parent``.
        prefixPath: list that receives the names.
    """
    node = leafNode
    while node.parent is not None:
        prefixPath.append(node.name)
        node = node.parent
# Conditional pattern base
def findPrefixPath(basePat, myHeaderTab):
    """Return the conditional pattern base of ``basePat``.

    Every node of ``basePat`` is visited through the node-link chain that
    starts in the header table.  For each node the path up to the root is
    collected and, when it holds more than the node itself, stored without
    ``basePat``.

    Args:
        basePat: item whose prefix paths are wanted.
        myHeaderTab: mapping ``item -> [total_count, first_node_or_None]``.

    Returns:
        dict mapping ``frozenset(prefix items)`` to the summed count of the
        ``basePat`` nodes that have that prefix.
    """
    condPats = {}
    node = myHeaderTab[basePat][1]  # first node of basePat in the tree
    while node is not None:
        prefixPath = []
        # prefixPath comes back leaf-first: node itself, then its ancestors.
        ascendFPtree(node, prefixPath)
        if len(prefixPath) > 1:
            key = frozenset(prefixPath[1:])
            # Accumulate instead of overwrite: two nodes whose prefixes hold
            # the same items must both contribute their counts.
            condPats[key] = condPats.get(key, 0) + node.count
        node = node.nodeLink  # next node carrying basePat
    return condPats

def mineFPtree(inTree, headerTable, minSup, preFix, freqItemList):
    """Recursively collect frequent item sets into ``freqItemList``.

    For every item of ``headerTable`` (highest total count first) the item is
    added to a copy of ``preFix``, the resulting set is appended to
    ``freqItemList``, and a conditional FP-tree is built from the item's
    conditional pattern base and mined in turn.

    Args:
        inTree: not used by this function.
        headerTable: mapping ``item -> [total_count, first_node]``.
        minSup: minimum support handed to ``createFPtree``.
        preFix: set of items already fixed on this branch of the search.
        freqItemList: output list, extended in place.
    """
    # Items ordered by their total count, largest first.
    by_count = sorted(headerTable, key=lambda item: headerTable[item][0], reverse=True)
    for basePat in by_count:
        grown = preFix.copy()
        grown.add(basePat)
        freqItemList.append(grown)
        # Conditional pattern base of the current item ...
        pattern_bases = findPrefixPath(basePat, headerTable)
        # ... and the conditional FP-tree built from it.
        cond_tree, cond_header = createFPtree(pattern_bases, minSup)
        if cond_header is None:
            continue
        mineFPtree(cond_tree, cond_header, minSup, grown, freqItemList)


# Data set
def loadSimpDat():
    """Load the training and test tag lists from the two CSV files.

    Reads ``src/step1/tag_cooccurrence.csv`` and ``src/step1/user_tag.csv``
    and splits the second column of every row on commas.

    The column is selected by position with ``iloc``; the previous
    ``df.loc[i][1]`` depended on the deprecated positional fallback of
    ``Series.__getitem__`` for an integer key on a non-integer index.

    Returns:
        ``(tf_train_list, tf_test_list)``, each a list with one list of
        strings per CSV row.
    """
    df_train = pd.read_csv('src/step1/tag_cooccurrence.csv')
    df_test = pd.read_csv('src/step1/user_tag.csv')

    tf_train_list = [cell.split(',') for cell in df_train.iloc[:, 1]]
    tf_test_list = [cell.split(',') for cell in df_test.iloc[:, 1]]
    return tf_train_list, tf_test_list

# Turn the transactions into the form  element : count
def createInitSet(dataSet):
    """Count how often each distinct transaction occurs.

    Every transaction is converted to a ``frozenset`` so that it can be used
    as a dictionary key; transactions holding the same items share one key.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable items.

    Returns:
        dict mapping ``frozenset(transaction)`` to its number of occurrences.
    """
    retDict = {}
    for trans in dataSet:
        key = frozenset(trans)
        retDict[key] = retDict.get(key, 0) + 1
    return retDict

def txt_wrap_by(start_str, end, txt, user_tag_list):
    """Collect every ``start_str ... end`` span of ``txt``.

    Scans ``txt`` from left to right.  Each span, delimiters included and
    stripped of surrounding whitespace, is appended to ``user_tag_list``.
    Scanning stops at the first ``start_str`` that has no matching ``end``.

    The scan is a loop: the earlier recursive version recursed forever on an
    unterminated span (the "not found" result -1 was hidden by adding
    ``len(end)``) and used one recursion level per match.

    Args:
        start_str: opening delimiter.
        end: closing delimiter; must be non-empty once a span is found.
        txt: text to scan.
        user_tag_list: list that receives the matches (modified in place).

    Returns:
        ``user_tag_list``, the same list object that was passed in.

    Raises:
        ValueError: if a span starts and ``end`` is empty, because the scan
            could not advance.
    """
    pos = 0
    while True:
        start = txt.find(start_str, pos)
        if start < 0:
            break
        if not end:
            raise ValueError("end must be a non-empty string")
        # Search from ``start`` itself, exactly as the original did.
        stop = txt.find(end, start)
        if stop < 0:
            break  # unterminated span: nothing more to extract
        stop += len(end)
        user_tag_list.append(txt[start:stop].strip())
        pos = stop
    return user_tag_list

# Script entry: build the task and run it as soon as the file is executed.
# func() has no return statement, so b is always None.
a = Task()
b = a.func()

 

你可能感兴趣的:(笔记,python)