# Python基础笔记(2)

import numpy as np
import csv


# The modules used by this script (numpy, csv) are imported above.


def get_alldata(filename, colnames=("Clothing ID", "Recommended IND",
                                    "Positive Feedback Count", "Class Name")):
    """Load selected columns of a CSV file into a 2-D array of strings.

    The first record of the file is treated as the header. Every header
    field whose name appears in ``colnames`` is selected, and each following
    record is reduced to those fields.

    Args:
        filename: Path of the CSV file to read.
        colnames: Names of the columns to keep. Defaults to the four columns
            used by the rest of this script.

    Returns:
        A numpy array with one row per data record and one column per
        selected header field. Columns appear in the order they occur in the
        file, not the order given in ``colnames``. An empty array is returned
        when the file has no data records.

    Raises:
        IndexError: If a non-empty record is shorter than the header position
            of a selected column.
    """
    wanted = set(colnames)
    # newline='' lets the csv module do its own line-ending handling.
    with open(filename, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file)
        # Parse the header with the csv reader as well, so quoted names match
        # and no character is lost when the line has no trailing newline.
        header = next(reader, None)
        if header is None:
            return np.array([])
        # enumerate() gives each column its own position, even when a header
        # name is repeated.
        colindex_lst = [idx for idx, name in enumerate(header) if name in wanted]
        # csv.reader yields [] for blank lines; skip them instead of failing.
        dataset = [[row[idx] for idx in colindex_lst] for row in reader if row]
    return np.array(dataset)


def get_id_count_arr(dataset, threshold=400):
    """Return the IDs that occur in more than ``threshold`` rows.

    The ID of a row is its first field (the Clothing ID when the rows come
    from ``get_alldata`` with its default columns). Occurrences are tallied
    in a dict in one pass, which avoids calling ``list.count`` once per ID.

    Args:
        dataset: Iterable of rows; only ``row[0]`` is read.
        threshold: An ID is kept when its number of rows is strictly greater
            than this value. Defaults to 400.

    Returns:
        A numpy array of the qualifying IDs, in order of first appearance.
    """
    occurrences = {}
    for row in dataset:
        key = row[0]
        # Start from 0 and add 1 for every row, so the stored value is the
        # real number of rows seen for this ID.
        occurrences[key] = occurrences.get(key, 0) + 1
    # Strict comparison: an ID with exactly `threshold` rows is not kept.
    frequent_ids = [key for key, seen in occurrences.items() if seen > threshold]
    return np.array(frequent_ids)


def cal_recom_num(dataset, id_lst):
    """Compute, for each ID, the share of its rows that are recommended.

    A row belongs to an ID when its first field equals that ID, and it counts
    as recommended when its second field equals the string ``'1'``. The
    dataset is scanned once, with per-ID tallies kept in dicts.

    Args:
        dataset: Iterable of rows; ``row[0]`` and ``row[1]`` are read.
        id_lst: Sequence of IDs to report on.

    Returns:
        A list of floats aligned with ``id_lst``: recommended rows divided by
        all rows of that ID. An ID with no rows in ``dataset`` yields 0.0
        instead of a division by zero.
    """
    total_rows = dict.fromkeys(id_lst, 0)
    recommended_rows = dict.fromkeys(id_lst, 0)
    for row in dataset:
        key = row[0]
        if key in total_rows:
            total_rows[key] += 1
            if row[1] == '1':
                recommended_rows[key] += 1
    return [
        recommended_rows[key] / total_rows[key] if total_rows[key] else 0.0
        for key in id_lst
    ]


def cal_pos_num(dataset, id_lst):
    """Sum the positive-feedback counts and pick the class name for each ID.

    A row belongs to an ID when its first field equals that ID. Its third
    field is converted with ``int`` and added to the ID's total; its fourth
    field is recorded as the ID's class name. The dataset is scanned once.

    Args:
        dataset: Iterable of rows; ``row[0]``, ``row[2]`` and ``row[3]`` are
            read.
        id_lst: Sequence of IDs to report on.

    Returns:
        A tuple ``(id_pos_sum_lst, id_name_lst)`` of two lists aligned with
        ``id_lst``: the summed counts, and the class name taken from the last
        matching row. An ID with no rows gets a sum of 0 and an empty name.

    Raises:
        ValueError: If the third field of a matching row is not an integer.
    """
    pos_totals = dict.fromkeys(id_lst, 0)
    # Each ID starts with its own empty name, so an ID without rows cannot
    # pick up the class name of the ID processed before it.
    class_names = dict.fromkeys(id_lst, '')
    for row in dataset:
        key = row[0]
        if key in pos_totals:
            pos_totals[key] += int(row[2])
            class_names[key] = row[3]
    id_pos_sum_lst = [pos_totals[key] for key in id_lst]
    id_name_lst = [class_names[key] for key in id_lst]
    return id_pos_sum_lst, id_name_lst


if __name__ == "__main__":
    # Script entry point: load the review data, select the frequently
    # reviewed Clothing IDs, compute their statistics and print one line
    # per ID.
    filename = "Z:\\womens_clothing_e-commerce_reviews.csv"
    dataset = get_alldata(filename)
    print("数据集dataset的维度是: {}".format(dataset.shape))

    id_count_lst = get_id_count_arr(dataset)
    print("评论次数大于400的唯一Clothing ID号有{}个,列表是{}".format(len(id_count_lst), id_count_lst))

    recom_ratio_lst = cal_recom_num(dataset, id_count_lst)
    id_pos_sum_lst, id_name_lst = cal_pos_num(dataset, id_count_lst)

    # Stack the four per-ID sequences and transpose, so that each row of the
    # result holds the values belonging to one Clothing ID.
    id_data_arrs = np.array((id_count_lst, id_name_lst, recom_ratio_lst, id_pos_sum_lst)).T
    row_template = "Clothing ID为 {} ,服装类型为 {},被推荐的占比为: {},正反馈的总计数为: {}"
    for clothing_id, class_name, recom_ratio, pos_sum in id_data_arrs:
        print(row_template.format(clothing_id, class_name, recom_ratio, pos_sum))

 

# 你可能感兴趣的:(Python)