We use Problem A03 from the 2022 National Service Outsourcing Competition as the example for demonstrating the derived-feature computation.
The main task is to find sales anomalies and price anomalies among the products. The data consist of four months of product records (over 17 million rows) and four months of shop records (over 600,000 rows), and the evaluation emphasises time and space complexity as well as the anomaly detection rate and accuracy. We use the shop-level analysis to support the product-level anomaly detection and to improve its credibility and accuracy.
Shop data (partial): https://pan.baidu.com/s/1iAp-s2JwG_YTB35BevMNyQ  access code: jhnb
Our current work is the feature-mining part of the abnormal-shop detection shown in the figure above.
The figure above shows the detailed pipeline for abnormal-shop detection; this post covers its second step, computing derived variables. Some of the derived features involve the shop's category, but the category field in the shop data has missing values (for the earlier data preprocessing and data-overview steps, see: https://blog.csdn.net/Hjh1906008151/article/details/124313507). Because the affected derived features are fairly important and nearly 10% of the category values are missing, we split the task into two parts.
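For reference, the missing share of the category field can be checked directly. A minimal sketch, assuming the shop table is the shop.tsv loaded below and the category column is MAIN_BUSINESS as in the later code:

import pandas as pd

# Rough check of the missing share in the category field (expected to be close to 10% here)
shop_df = pd.read_csv("../Distribution testing/shop.tsv", sep="\t", encoding="utf-8")
print(shop_df["MAIN_BUSINESS"].isna().mean())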
This part draws on feature engineering for natural language processing and on composite evaluation methods; it is mainly a record of some pandas usage:
import numpy as np
import pandas as pd
import re
import jieba
import gensim
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
read_file = r"../Distribution testing/shop.tsv"
write_file = r""
def clear_character(sentence):
    # Keep only Chinese characters; strip letters, digits and other symbols
    pattern = re.compile('[^\u4e00-\u9fa5]')
    line = re.sub(pattern, '', sentence)
    new_sentence = ''.join(line.split())
    return new_sentence
def make_bigrams(texts):
    bigram = gensim.models.Phrases(texts, min_count=3, threshold=100)  # higher threshold, fewer phrases
    # trigram = gensim.models.Phrases(bigram[texts], threshold=1)
    # Phraser is a faster, lighter way to apply the detected bigrams/trigrams
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    # trigram_mod = gensim.models.phrases.Phraser(trigram)
    return [bigram_mod[doc] for doc in texts]
def data_cleaning(df):
    train_text = [clear_character(data) for data in df["SHOP_NAME"]]
    # Tokenise. StanfordCoreNLP is not a good fit here: it is awkward to install for
    # beginners and far too slow, so jieba is used instead.
    train_seg_text = [jieba.lcut(s) for s in train_text]
    # Stop-word removal is skipped: analysing the raw word-frequency counts turned out to work better
    # train_st_text = [drop_stopwords(s, stopwords) for s in train_seg_text]
    # Build the bigram/trigram model; only bigrams are used here for now
    data_words_bigrams = make_bigrams(train_seg_text)
    return data_words_bigrams
def Normalization(data):
    # Min-max scaling is not a good fit here, so each column is z-score standardised instead
    # min_max = MinMaxScaler(feature_range=(0, 1))
    # data[["ITEMDESC_SCORE", "SERVICE_SCORE", "DELIVERY_SCORE"]] = min_max.fit_transform(data[["ITEMDESC_SCORE", "SERVICE_SCORE", "DELIVERY_SCORE"]])
    # data = data.T
    scaled_columns = []
    for i in data.columns:
        scaled_columns.append(StandardScaler().fit_transform(np.array(data[i]).reshape(-1, 1)))
    return np.hstack(scaled_columns)  # shape: (n_samples, n_features)
def E_j_fun(data, rows, columns):  # entropy of each indicator (entropy-weight method)
    E = np.array([[None] * columns for i in range(rows)])  # empty matrix
    for i in range(rows):
        for j in range(columns):
            if data[i][j] == 0:
                e_ij = 0.0
            else:
                P_ij = data[i][j] / data.sum(axis=0)[j]  # proportion within the column (column sum)
                e_ij = (-1 / np.log(rows)) * P_ij * np.log(P_ij)
            E[i][j] = e_ij
    # print(E)
    E_j = E.sum(axis=0)  # information entropy of each indicator (sum over the column)
    return E_j
def critic(data, rows, columns, our_weight):
    # CRITIC-style weighting; data has shape (n_features, n_samples) = (columns, rows)
    Z_ij = np.array([[None] * rows for i in range(columns)])
    data_std = np.std(data, axis=1, ddof=1)  # sample standard deviation (n - 1) of each feature
    # print(data_std)
    data_rela = np.corrcoef(data)
    data_rela = data_rela.sum(axis=1)
    # print(data_std, "\n", data_rela)
    C_i = data_rela * data_std  # element-wise product
    W_i = C_i / sum(C_i)
    W_i = W_i * our_weight  # blend in the manual weights
    print(W_i)
    for i in range(columns):
        for j in range(rows):
            Z_ij[i][j] = data[i][j] * W_i[i]
    ret = Z_ij.sum(axis=0)  # weighted composite score of each sample
    return ret
def get_benrate(series, numerator='SHOP_SALES_AMOUNT', denominator='SHOP_SALES_VOLUME'):
    # Safe division: return 0 instead of raising when the divisor is 0.
    # The column names can be overridden, so the same helper also works for the ratio features below.
    if series[denominator] == 0:
        return 0
    return series[numerator] / series[denominator]
def shop(file, weight_crdict, weight_reputation, keyword, keyword2):
    # Number the rows first, for later use
    file["index"] = [i for i in range(file.shape[0])]
    # Months in operation, counted up to September 2021
    # 13311 of 654400 rows have a missing open date (under 5%), so they are simply dropped
    file1 = file.dropna(axis=0, how='any', subset=["SHOP_OPEN_DATE"])
    year = [(2021 - int(x[:4])) * 12 + 9 - int(x[5:7]) for x in file1["SHOP_OPEN_DATE"]]
    file1["year"] = year
    file = pd.merge(file, file1, how="outer")
    # Keyword statistics on the shop name
    word_cut = data_cleaning(file)
    # word = pd.DataFrame(word_cut)
    # word.replace(to_replace='None', value=np.nan).dropna(axis=1, how='all')
    # print(word)
    # word.to_csv("切词结果.csv", encoding="utf-8", header=0, index=0)
    # for x in word_cut:
    #     print(x, "\t", set(x), "\t", set(keyword), "\t", 1 - int(set(keyword).isdisjoint(set(x))))
    # +1 if the name contains a "trusted" keyword, -1 if it contains a "distrusted" one
    trust1 = [1 - int(set(keyword).isdisjoint(set(x))) for x in word_cut]  # memory blew up here
    trust2 = [-1 + int(set(keyword2).isdisjoint(set(x))) for x in word_cut]
    trust = [i + j for i, j in zip(trust1, trust2)]
    # trust = []
    # with open("切词结果.csv", 'r', encoding='utf-8', errors='ignore') as f:
    #     for line in f:
    #         print(line)
    #         final_list = list()
    #         for row in line:
    #             final_list.append(row.split(','))
    #         print(final_list)
    file["trust"] = trust
    # Average unit price; also a chance to practise apply
    file['ave_price'] = 0  # handle zeros
    file['ave_price'] = file.apply(get_benrate, axis=1)
    # Credit score
    file2 = file.dropna(axis=0, how='any', subset=["trust", "year"])
    Standard_data = Normalization(file2[["trust", "year"]]).T
    Credit_Score = critic(Standard_data, Standard_data.shape[1], Standard_data.shape[0], weight_crdict)
    Credit_Score = (Credit_Score - min(Credit_Score)) / (max(Credit_Score) - min(Credit_Score)) * 100
    file2["Credit_Score"] = Credit_Score
    file = pd.merge(file, file2, how="outer")
    # Reputation score
    # 2157 of 654400 rows have missing score fields (under 0.5%), so they are simply dropped
    file3 = file.dropna(axis=0, how='any', subset=["ITEMDESC_SCORE", "SERVICE_SCORE", "DELIVERY_SCORE"])
    Standard_data = Normalization(file3[["ITEMDESC_SCORE", "SERVICE_SCORE", "DELIVERY_SCORE"]]).T
    Reputation_Score = critic(Standard_data, Standard_data.shape[1], Standard_data.shape[0], weight_reputation)
    Reputation_Score = (Reputation_Score - min(Reputation_Score)) / (
            max(Reputation_Score) - min(Reputation_Score)) * 100
    file3["Reputation_Score"] = Reputation_Score
    file = pd.merge(file, file3, how="outer")
    # Visualise and save
    print(file)
    index = [i for i in range(file.shape[0])]
    file["index1"] = index
    file.plot.scatter(x='Credit_Score', y='index1', s=2, c="pink")
    plt.show()
    print(file.value_counts(["Credit_Score"]))
    file.plot.scatter(x='Reputation_Score', y='index1', s=2, c="lightskyblue")
    plt.show()
    file.plot.scatter(x='Reputation_Score', y='index1', s=2, c="lightcoral")
    plt.show()
    file.plot.scatter(x='Reputation_Score', y='index1', s=2, c="mediumspringgreen")
    plt.show()
    print(file.value_counts(["Reputation_Score"]))
    file = file.drop(labels='index1', axis=1)
    file.to_csv("店铺数据.csv", encoding="utf-8")
def main():
    file = pd.read_csv(read_file, sep="\t", encoding="utf-8")
    weight_crdict = [5, 4]  # manual weights for trust and year
    weight_reputation = [3, 1, 1]  # manual weights for the item-description, service and delivery scores
    keyword = ["旗舰店", "官方", "直销店", "直销", "厂家直销", "直营店"]
    # shop names containing these words are treated as trustworthy
    keyword2 = ["小店", "折扣店", "特卖"]
    # shop names containing these words are treated as untrustworthy
    shop(file, weight_crdict, weight_reputation, keyword, keyword2)
if __name__ == '__main__':
    main()
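To make the keyword-based trust flag concrete, here is a minimal sketch (not part of the original script) that runs the same data_cleaning and keyword check on two made-up shop names:

# Sanity check of the keyword-based trust flag on made-up shop names (illustration only)
demo = pd.DataFrame({"SHOP_NAME": ["某某官方旗舰店", "某某特卖小店"]})
good = ["旗舰店", "官方", "直销店", "直销", "厂家直销", "直营店"]
bad = ["小店", "折扣店", "特卖"]
for tokens in data_cleaning(demo):
    flag = int(not set(good).isdisjoint(tokens)) - int(not set(bad).isdisjoint(tokens))
    print(tokens, flag)  # expect +1 and -1 respectively, provided jieba splits off the keywords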
The composite evaluation used here is explained in this earlier post: https://blog.csdn.net/Hjh1906008151/article/details/123433270
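As a quick illustration of how Normalization and critic combine into a 0-100 score, here is a small sketch on made-up values (the numbers are illustrative only; the [5, 4] weights mirror the trust/year weights in main()):

# Toy composite score: two features for five fictitious shops, manual weights [5, 4]
toy = pd.DataFrame({"trust": [1, 0, -1, 1, 0], "year": [24, 3, 1, 48, 12]})
std_data = Normalization(toy).T  # shape: (n_features, n_samples)
score = critic(std_data, std_data.shape[1], std_data.shape[0], [5, 4])
score = (score - min(score)) / (max(score) - min(score)) * 100  # rescale to 0-100
print(score)  # one composite score per shop; higher means more credible under these weights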
Overview of the composite-score distributions:
For the missing-value imputation, see this post: https://blog.csdn.net/Hjh1906008151/article/details/124338450
It was also mentioned in the data preprocessing post above, but the random forest's performance there was underwhelming, so it is not recommended for this step.
def new_rate_feature():
    df_fill_file = pd.read_csv("填补后店铺数据(字符串).csv").drop(axis=1, columns="Unnamed: 0")
    # Save the per-category statistics as an intermediate result while we are at it
    df_mean = df_fill_file.groupby("MAIN_BUSINESS")[["SHOP_SALES_VOLUME", "SHOP_SALES_AMOUNT", "ave_price"]].mean()
    df_std = df_fill_file.groupby("MAIN_BUSINESS")[["SHOP_SALES_VOLUME", "SHOP_SALES_AMOUNT", "ave_price"]].std()
    shop_cri = df_fill_file.value_counts("MAIN_BUSINESS")
    shop_cri = pd.concat([df_mean, df_std, shop_cri], axis=1)
    shop_cri.columns = ["ave_SALES_VOLUME", "ave_SALES_AMOUNT", "ave_price",
                        "std_SALES_VOLUME", "std_SALES_AMOUNT", "std_price", "count"]
    shop_cri.sort_values("count").to_csv("店铺平均值.csv", encoding="utf-8-sig")
    # Each shop's value relative to the mean of its category (MAIN_BUSINESS)
    df_fill_file = pd.merge(df_fill_file, df_mean, left_on="MAIN_BUSINESS", right_index=True,
                            how='outer').sort_values(by='index')
    df_fill_file[['Rate_volumn', 'Rate_amount', 'Rate_price']] = 0
    df_fill_file['Rate_volumn'] = df_fill_file.apply(get_benrate, axis=1,
                                                     args=("SHOP_SALES_VOLUME_x", "SHOP_SALES_VOLUME_y"))
    df_fill_file['Rate_amount'] = df_fill_file.apply(get_benrate, axis=1,
                                                     args=("SHOP_SALES_AMOUNT_x", "SHOP_SALES_AMOUNT_y"))
    df_fill_file['Rate_price'] = df_fill_file.apply(get_benrate, axis=1, args=("ave_price_x", "ave_price_y"))
    df_fill_file = df_fill_file.drop(columns=["SHOP_SALES_VOLUME_y", "SHOP_SALES_AMOUNT_y", "ave_price_y"], axis=1)
    # Growth relative to the previous record (the rows are sorted by shop and month below)
    col_temp = ['USER_ID_s', 'Growth_volumn_s', 'Growth_amount_s', 'Growth_price_s', 'Growth_reputation_s',
                'Growth_credit_s']
    col_target = ['USER_ID', 'SHOP_SALES_VOLUME_x', 'SHOP_SALES_AMOUNT_x', 'ave_price_x', 'Reputation_Score',
                  'Credit_Score']
    col_new = ['Growth_volumn', 'Growth_amount', 'Growth_price', 'Growth_reputation', 'Growth_credit']
    df_fill_file = df_fill_file.sort_values(by=['USER_ID', 'DATA_MONTH'], ascending=[False, True],
                                            ignore_index=True)  # also resets the index
    df_fill_file[col_temp] = df_fill_file[col_target].shift(1)  # previous record's values
    for i in range(len(col_new)):
        df_fill_file[col_new[i]] = df_fill_file.apply(growth, axis=1, args=(col_target[i + 1], col_temp[i + 1]))
    df_fill_file = df_fill_file.drop(axis=1, columns=col_temp).sort_values(by=["index"], ignore_index=True)
    df_fill_file.to_csv("filled_data(Rate).csv", encoding='utf-8-sig')
    return df_fill_file
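The loop above relies on a growth helper that is not shown in this post. A minimal sketch, assuming it is meant to compute the relative change from the previous record with a zero-safe fallback:

# Hypothetical helper (not shown in the original post): relative change versus the
# previous record's value, falling back to 0 when that value is 0 or missing.
def growth(series, col_now, col_prev):
    prev = series[col_prev]
    if pd.isna(prev) or prev == 0:
        return 0
    return (series[col_now] - prev) / prev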