import numpy as np
import csv
# Modules used by this script: numpy for array handling, csv for parsing the data file.
def get_alldata(filename, encoding=None):
    """Load the four columns used by the analysis from a CSV file.

    The first row of the file is treated as the header. The columns
    ``Clothing ID``, ``Recommended IND``, ``Positive Feedback Count`` and
    ``Class Name`` are located by name and extracted in exactly that order,
    regardless of where they sit in the file, because the other functions in
    this script address them by position (0, 1, 2 and 3).

    Args:
        filename: Path of the CSV file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        The result of ``np.array`` applied to the list of extracted rows.
        Every extracted value is a string as produced by ``csv.reader``;
        each row has four entries.

    Raises:
        ValueError: If the header does not contain every required column.
        IndexError: If a non-empty data row is too short to contain one of
            the required columns.
    """
    wanted_columns = (
        "Clothing ID",
        "Recommended IND",
        "Positive Feedback Count",
        "Class Name",
    )
    # newline="" lets the csv module do its own newline handling, which is
    # required for quoted fields that contain line breaks.
    with open(filename, 'r', encoding=encoding, newline="") as csv_file:
        reader = csv.reader(csv_file)
        # Parse the header with the csv reader too, so quoted header cells
        # and the line terminator are handled correctly.
        header = next(reader, [])
        missing = [name for name in wanted_columns if name not in header]
        if missing:
            raise ValueError(
                "required column(s) not found in {!r}: {}".format(
                    filename, ", ".join(missing)
                )
            )
        # Index list follows wanted_columns, not the file's column order.
        col_indexes = [header.index(name) for name in wanted_columns]
        # csv.reader yields [] for blank lines; skip those.
        dataset = [[row[i] for i in col_indexes] for row in reader if row]
    # Kept from the original implementation: echo the extracted rows.
    print(dataset)
    return np.array(dataset)
def get_id_count_arr(dataset, threshold=400):
    """Return the IDs that occur more than ``threshold`` times.

    The ID of a row is its first element (``row[0]``). Occurrences are
    tallied in a dictionary in a single pass over ``dataset``.

    Args:
        dataset: Iterable of rows; only ``row[0]`` is read and it must be
            hashable.
        threshold: An ID is kept only when it appears strictly more than
            this many times. Defaults to 400.

    Returns:
        ``np.ndarray`` of the selected IDs, in the order in which each ID
        first appears in ``dataset``. When nothing qualifies this is
        ``np.array([])``.
    """
    occurrences = {}
    for row in dataset:
        key = row[0]
        # Count real occurrences (the first sighting counts as 1).
        occurrences[key] = occurrences.get(key, 0) + 1
    frequent_ids = [key for key, seen in occurrences.items() if seen > threshold]
    return np.array(frequent_ids)
def cal_recom_num(dataset, id_lst):
    """Compute, for each ID, the share of its rows that are recommended.

    A row belongs to an ID when ``row[0]`` equals that ID, and it counts as
    recommended when ``row[1]`` equals the string ``'1'``. All IDs are
    tallied in one pass over ``dataset``.

    Args:
        dataset: Iterable of rows; ``row[0]`` is the ID and ``row[1]`` the
            recommendation flag.
        id_lst: Sequence of hashable IDs to report on.

    Returns:
        List of floats, one per entry of ``id_lst`` and in the same order:
        recommended rows divided by total rows for that ID. An ID with no
        rows in ``dataset`` yields ``float("nan")`` because the ratio is
        undefined.
    """
    wanted = set(id_lst)
    total_rows = dict.fromkeys(wanted, 0)
    recommended_rows = dict.fromkeys(wanted, 0)
    for row in dataset:
        key = row[0]
        if key not in wanted:
            continue
        total_rows[key] += 1
        if row[1] == '1':
            recommended_rows[key] += 1

    id_recom_ratio_lst = []
    for key in id_lst:
        if total_rows[key]:
            id_recom_ratio_lst.append(recommended_rows[key] / total_rows[key])
        else:
            # No rows for this ID: avoid dividing by zero.
            id_recom_ratio_lst.append(float("nan"))
    return id_recom_ratio_lst
def cal_pos_num(dataset, id_lst):
    """Sum the positive-feedback counts and pick the class name per ID.

    A row belongs to an ID when ``row[0]`` equals that ID. For matching rows
    ``int(row[2])`` is added to the ID's total and ``row[3]`` is recorded as
    its class name; when an ID has several rows, the name from the last one
    wins. Rows whose ID is not in ``id_lst`` are ignored and never converted.

    Args:
        dataset: Iterable of rows; ``row[0]`` is the ID, ``row[2]`` the
            feedback count (convertible with ``int``) and ``row[3]`` the
            class name.
        id_lst: Sequence of hashable IDs to report on.

    Returns:
        Tuple ``(id_pos_sum_lst, id_name_lst)``, both ordered like
        ``id_lst``: the feedback totals as ints and the class names. An ID
        with no rows in ``dataset`` gets a total of ``0`` and the name
        ``''``.

    Raises:
        ValueError: If ``row[2]`` of a matching row cannot be converted by
            ``int``.
    """
    wanted = set(id_lst)
    feedback_totals = {}
    class_names = {}
    for row in dataset:
        key = row[0]
        if key not in wanted:
            continue
        feedback_totals[key] = feedback_totals.get(key, 0) + int(row[2])
        class_names[key] = row[3]

    id_pos_sum_lst = [feedback_totals.get(key, 0) for key in id_lst]
    # Default to '' so an ID without rows never inherits another ID's name.
    id_name_lst = [class_names.get(key, '') for key in id_lst]
    return id_pos_sum_lst, id_name_lst
if __name__ == "__main__":
    # Script entry point: load the data, select the frequently reviewed
    # Clothing IDs, compute the per-ID statistics and print a report.
    filename = "Z:\\womens_clothing_e-commerce_reviews.csv"
    dataset = get_alldata(filename)
    shape_message = "数据集dataset的维度是: {}"
    print(shape_message.format(dataset.shape))

    id_count_lst = get_id_count_arr(dataset)
    ids_message = "评论次数大于400的唯一Clothing ID号有{}个,列表是{}"
    print(ids_message.format(len(id_count_lst), id_count_lst))

    # Per-ID statistics, each ordered like id_count_lst.
    recom_ratio_lst = cal_recom_num(dataset, id_count_lst)
    id_pos_sum_lst, id_name_lst = cal_pos_num(dataset, id_count_lst)

    # Stack the four sequences as rows, then transpose so each row of the
    # result describes one Clothing ID.
    stacked = np.array((id_count_lst, id_name_lst, recom_ratio_lst, id_pos_sum_lst))
    id_data_arrs = stacked.T

    report_line = "Clothing ID为 {} ,服装类型为 {},被推荐的占比为: {},正反馈的总计数为: {}"
    for clothing_id, class_name, recom_ratio, pos_total in id_data_arrs:
        print(report_line.format(clothing_id, class_name, recom_ratio, pos_total))