Applying pandas to Data Analysis (Outlier Detection), Using Derived Feature Calculation as an Example (with a Case from the 2022 National Service Outsourcing Competition)

  We use problem A03 from the 2022 National Service Outsourcing Competition as the running example for the derived-feature calculation process.
  The main task of the problem is to identify sales-volume and price anomalies in products. The competition provides four months of product data (more than 17 million rows) and four months of shop data (more than 600 thousand rows), and the evaluation emphasizes time and space complexity as well as the detection rate and accuracy of the anomalies. We use shop-level analysis to support the product-level anomaly detection and improve its credibility and accuracy.
  Shop data download link: https://pan.baidu.com/s/1iAp-s2JwG_YTB35BevMNyQ (extraction code: jhnb)
[Figure 1: overall anomaly-detection pipeline]
  Our current work is the feature-mining part of the anomalous-shop detection shown in Figure 1.
[Figure 2: detailed workflow of anomalous-shop detection]
  Figure 2 shows the detailed workflow of anomalous-shop detection; what we are doing now is its second step: derived-variable calculation. Some of the derived features involve the shop's business category, but that field contains missing values (for the earlier data preprocessing and data preview steps, see: https://blog.csdn.net/Hjh1906008151/article/details/124313507). Since those features are relatively important and nearly 10% of the category values are missing, we split the task into two parts.

Derived features that do not require missing-value imputation

  This part draws on feature-engineering ideas from natural language processing together with composite-evaluation methods; it is mainly a record of some pandas usage:

import numpy as np
import pandas as pd
import re
import jieba
import gensim
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

read_file = r"../Distribution testing/shop.tsv"
write_file = r""



def clear_character(sentence):
    pattern = re.compile('[^\u4e00-\u9fa5]')    # strip everything that is not a Chinese character (letters, digits, punctuation)
    line = re.sub(pattern, '', sentence)
    new_sentence = ''.join(line.split())
    return new_sentence

def make_bigrams(texts):
    bigram = gensim.models.Phrases(texts, min_count=3, threshold=100)  # higher threshold fewer phrases.
    # trigram = gensim.models.Phrases(bigram[texts], threshold=1)
    # Phraser wraps the trained Phrases model for faster detection of bigrams/trigrams
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    # trigram_mod = gensim.models.phrases.Phraser(trigram)
    return [bigram_mod[doc] for doc in texts]


def data_cleaning(df):
    train_text = [clear_character(data) for data in df["SHOP_NAME"]]
    # word segmentation with jieba; StanfordCoreNLP is awkward to install for beginners and too slow here
    train_seg_text = [jieba.lcut(s) for s in train_text]
    # stop-word removal skipped: analyzing the raw word-frequency results turned out to work better
    # train_st_text = [drop_stopwords(s, stopwords) for s in train_seg_text]
    # build bigram/trigram models (two-word / three-word phrases)
    # for now we only use bigrams
    data_words_bigrams = make_bigrams(train_seg_text)
    return data_words_bigrams


def Normalization(data):
    # min-max scaling is not appropriate here, so use z-score standardization instead
    # min_max = MinMaxScaler(feature_range=(0, 1))
    # data[["ITEMDESC_SCORE", "SERVICE_SCORE", "DELIVERY_SCORE"]] = min_max.fit_transform(data[["ITEMDESC_SCORE", "SERVICE_SCORE", "DELIVERY_SCORE"]])
    # data = data.T
    for i in data.columns:
        temp = StandardScaler().fit_transform(pd.DataFrame(np.array(data[i]).reshape(-1, 1)))
        try:
            ret = np.hstack((ret, temp))
        except NameError:   # first column: ret does not exist yet
            ret = temp
    return ret

def E_j_fun(data, rows, columns):  # entropy of each indicator (entropy-weight method; defined but not called in this script)
    E = np.array([[None] * columns for i in range(rows)])   # empty rows x columns matrix
    for i in range(rows):
        for j in range(columns):
            if data[i][j] == 0:
                e_ij = 0.0
            else:
                P_ij = data[i][j] / data.sum(axis=0)[j]  # share of the element within its column (column sum)
                e_ij = (-1 / np.log(rows)) * P_ij * np.log(P_ij)
            E[i][j] = e_ij
    # print(E)
    E_j = E.sum(axis=0)       # information entropy of each column (indicator)
    return E_j

def critic(data, rows, columns, our_weight):
    # data has shape (n_indicators, n_samples); here rows = n_samples and columns = n_indicators
    Z_ij = np.array([[None] * rows for i in range(columns)])
    data_std = np.std(data, axis=1, ddof=1)     # sample standard deviation of each indicator (ddof=1)
    # print(data_std)
    data_rela = np.corrcoef(data)
    data_rela = data_rela.sum(axis=1)           # sum of each indicator's correlations with all indicators
    # print(data_std, "\n", data_rela)
    C_i = data_rela * data_std                  # element-wise product: information content of each indicator
    W_i = C_i / sum(C_i)                        # objective weights
    W_i = W_i * our_weight                      # combined with the manually assigned weights
    print(W_i)
    for i in range(columns):
        for j in range(rows):
            Z_ij[i][j] = data[i][j] * W_i[i]
    ret = Z_ij.sum(axis=0)                      # weighted sum over indicators -> one composite score per sample
    return ret


def get_benrate(series, num_col='SHOP_SALES_AMOUNT', den_col='SHOP_SALES_VOLUME'):
    # safe division: return 0 when the divisor is 0 so apply() never raises ZeroDivisionError;
    # the column names can be overridden, so the same helper also serves the per-category ratios below
    numerator = series[num_col]
    denominator = series[den_col]
    if denominator == 0:
        return 0
    else:
        return numerator / denominator


def shop(file, weight_crdict, weight_reputation, keyword, keyword2):

    # number the rows first so the original order can be restored later
    file["index"] = [i for i in range(file.shape[0])]
    # compute shop age in months (relative to September 2021)
    # 13,311 of the 654,400 rows have a missing SHOP_OPEN_DATE (under 5%), so we simply drop them
    file1 = file.dropna(axis=0, how='any', subset=["SHOP_OPEN_DATE"])
    year = [(2021 - int(x[:4])) * 12 + 9 - int(x[5:7]) for x in file1["SHOP_OPEN_DATE"]]
    file1["year"] = year
    file = pd.merge(file, file1, how="outer")   # outer merge brings the dropped rows back with year = NaN

    # count trust-related keywords in shop names
    word_cut = data_cleaning(file)
    # word = pd.DataFrame(word_cut)
    # word.replace(to_replace='None', value=np.nan).dropna(axis=1, how='all')
    # print(word)
    # word.to_csv("切词结果.csv", encoding="utf-8", header=0, index=0)
    # for x in word_cut:
    #     print(x, "\t", set(x), "\t", set(keyword), "\t",1-int(set(keyword).isdisjoint(set(x))))
    # 1 / 0 if the name does / does not contain a trusted keyword; -1 / 0 for distrusted keywords
    trust1 = [1 - int(set(keyword).isdisjoint(set(x))) for x in word_cut]  # memory blew up here at one point
    trust2 = [-1 + int(set(keyword2).isdisjoint(set(x))) for x in word_cut]
    trust = [i + j for i, j in zip(trust1, trust2)]
    # trust = []
    # with open("切词结果.csv", 'r', encoding='utf-8', errors='ignore') as f:
    #     for line in f:
    #         print(line)
    #         final_list = list()
    #         for row in line:
    #             final_list.append(row.split(','))
    #         print(final_list)
    file["trust"] = trust

    # average unit price; also a chance to practice apply()
    file['ave_price'] = 0   # initialize the column; get_benrate handles division by zero
    file['ave_price'] = file.apply(get_benrate, axis=1)

    # credibility score (composite of trust and shop age)
    file2 = file.dropna(axis=0, how='any', subset=["trust", "year"])
    Standard_data = Normalization(file2[["trust", "year"]]).T
    Credit_Score = critic(Standard_data, Standard_data.shape[1], Standard_data.shape[0], weight_crdict)
    Credit_Score = (Credit_Score - min(Credit_Score)) / (max(Credit_Score) - min(Credit_Score)) * 100
    file2["Credit_Score"] = Credit_Score
    file = pd.merge(file, file2, how="outer")
    # reputation score (composite of the three platform scores)
    # 2,157 of the 654,400 rows have missing score fields (under 0.5%), so we drop them as well
    file3 = file.dropna(axis=0, how='any', subset=["ITEMDESC_SCORE", "SERVICE_SCORE", "DELIVERY_SCORE"])
    Standard_data = Normalization(file3[["ITEMDESC_SCORE", "SERVICE_SCORE", "DELIVERY_SCORE"]]).T
    Reputation_Score = critic(Standard_data, Standard_data.shape[1], Standard_data.shape[0], weight_reputation)
    Reputation_Score = (Reputation_Score - min(Reputation_Score)) / (
                max(Reputation_Score) - min(Reputation_Score)) * 100
    file3["Reputation_Score"] = Reputation_Score
    file = pd.merge(file, file3, how="outer")

    # visualization and saving
    print(file)
    index = [i for i in range(file.shape[0])]
    file["index1"] = index
    file.plot.scatter(x='Credit_Score', y='index1', s=2, c="pink")
    plt.show()
    print(file.value_counts(["Credit_Score"]))
    file.plot.scatter(x='Reputation_Score', y='index1', s=2, c="lightskyblue")
    plt.show()
    print(file.value_counts(["Reputation_Score"]))
    file = file.drop(labels='index1', axis=1)

    file.to_csv("店铺数据.csv", encoding="utf-8")

def main():
    file = pd.read_csv(read_file, sep="\t", encoding="utf-8")
    weight_crdict = [5, 4]      # manual weights for trust and year
    weight_reputation = [3, 1, 1]   # manual weights for the item-description, service and delivery scores
    keyword = ["旗舰店", "官方", "直销店", "直销", "厂家直销", "直营店"]
    # shop names containing these terms are treated as more trustworthy
    keyword2 = ["小店", "折扣店", "特卖"]
    # shop names containing these terms are treated as less trustworthy
    shop(file, weight_crdict, weight_reputation, keyword, keyword2)

if __name__ == '__main__':
    main()

  The composite-evaluation approach used here is described in this earlier post: https://blog.csdn.net/Hjh1906008151/article/details/123433270
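  For reference, the composite score computed by the code above (Normalization(), critic(), and the final min-max rescaling) can be summarized as follows, where z_{ni} is the standardized value of indicator i for shop n, σ_i its sample standard deviation, r_{ij} the correlation between indicators i and j, and w_i^manual the hand-assigned weight:

$$
C_i = \sigma_i \sum_j r_{ij}, \qquad
W_i = \frac{C_i}{\sum_k C_k}\, w_i^{\mathrm{manual}}, \qquad
S_n = \sum_i W_i\, z_{ni}, \qquad
\mathrm{Score}_n = 100 \cdot \frac{S_n - \min_m S_m}{\max_m S_m - \min_m S_m}
$$

  Note that this differs slightly from textbook CRITIC, which uses $\sum_j (1 - r_{ij})$; the formula above follows the code as written.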
  Overview of the composite-score distributions:
[Figure 3: composite-score distribution (1)]
[Figure 4: composite-score distribution (2)]

Derived features that require missing-value imputation

  The missing-value imputation itself is covered in this post: https://blog.csdn.net/Hjh1906008151/article/details/124338450
  It was also mentioned in the data-preprocessing post above, but random-forest imputation performed poorly here, so we do not recommend using it for this step.
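  For completeness, here is a minimal sketch of a simple fallback imputation (an illustration only, not the method from the linked post; it assumes the USER_ID / MAIN_BUSINESS column names used in this article): fill a shop's missing category from its own rows in other months, and otherwise with the overall mode.

def fill_main_business(df):
    # illustrative helper (hypothetical): fill a shop's missing MAIN_BUSINESS from its own non-missing months first
    df["MAIN_BUSINESS"] = df.groupby("USER_ID")["MAIN_BUSINESS"].transform(
        lambda s: s.fillna(s.mode().iloc[0]) if s.notna().any() else s)
    # fall back to the overall mode for shops that have no category in any month
    df["MAIN_BUSINESS"] = df["MAIN_BUSINESS"].fillna(df["MAIN_BUSINESS"].mode().iloc[0])
    return df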

def new_rate_feature():
    df_fill_file = pd.read_csv("填补后店铺数据(字符串).csv").drop(axis=1, columns="Unnamed: 0")

    # save the per-category statistics as an intermediate result while we are at it
    df_mean = df_fill_file.groupby("MAIN_BUSINESS")[["SHOP_SALES_VOLUME", "SHOP_SALES_AMOUNT", "ave_price"]].mean()
    df_std = df_fill_file.groupby("MAIN_BUSINESS")[["SHOP_SALES_VOLUME", "SHOP_SALES_AMOUNT", "ave_price"]].std()
    shop_cri = df_fill_file.value_counts("MAIN_BUSINESS")
    shop_cri = pd.concat([df_mean, df_std, shop_cri], axis=1)
    shop_cri.columns = ["ave_SALES_VOLUME", "ave_SALES_AMOUNT", "ave_price", "std_SALES_VOLUME", "std_SALES_AMOUNT", "std_price", "count"]
    shop_cri.sort_values("count").to_csv("店铺平均值.csv", encoding="utf-8-sig")

    # ratio of each shop's values to the mean of its business category
    df_fill_file = pd.merge(df_fill_file, df_mean, left_on="MAIN_BUSINESS", right_index=True, how='outer').sort_values(by='index')
    df_fill_file[['Rate_volumn', 'Rate_amount', 'Rate_price']] = 0
    df_fill_file['Rate_volumn'] = df_fill_file.apply(get_benrate, axis=1,
                                                     args=("SHOP_SALES_VOLUME_x", "SHOP_SALES_VOLUME_y"))
    df_fill_file['Rate_amount'] = df_fill_file.apply(get_benrate, axis=1,
                                                     args=("SHOP_SALES_AMOUNT_x", "SHOP_SALES_AMOUNT_y"))
    df_fill_file['Rate_price'] = df_fill_file.apply(get_benrate, axis=1, args=("ave_price_x", "ave_price_y"))
    df_fill_file = df_fill_file.drop(columns=["SHOP_SALES_VOLUME_y", "SHOP_SALES_AMOUNT_y", "ave_price_y"], axis=1)

    # growth ratios over time: compare each row with the previous period via shift(1)
    col_temp = ['USER_ID_s', 'Growth_volumn_s', 'Growth_amount_s', 'Growth_price_s', 'Growth_reputation_s',
                'Growth_credit_s']
    col_target = ['USER_ID', 'SHOP_SALES_VOLUME_x', 'SHOP_SALES_AMOUNT_x', 'ave_price_x', 'Reputation_Score', 'Credit_Score']
    col_new = ['Growth_volumn', 'Growth_amount', 'Growth_price', 'Growth_reputation', 'Growth_credit']
    df_fill_file = df_fill_file.sort_values(by=['USER_ID', 'DATA_MONTH'], ascending=[False, True], ignore_index=True)  # sort by shop and month, resetting the index
    df_fill_file[col_temp] = df_fill_file[col_target].shift(1)  # previous row's values; the shifted USER_ID lets growth() detect shop boundaries
    for i in range(len(col_new)):
        df_fill_file[col_new[i]] = df_fill_file.apply(growth, axis=1, args=(col_target[i + 1], col_temp[i + 1]))
    df_fill_file = df_fill_file.drop(axis=1, columns=col_temp).sort_values(by=["index"], ignore_index=True)

    df_fill_file.to_csv("filled_data(Rate).csv", encoding='utf-8-sig')
    return df_fill_file
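
  The growth helper called above is not defined in this excerpt. Based on how it is called (each target column compared with its shifted previous-row value, with the shifted USER_ID kept alongside), it presumably computes a period-over-period ratio within the same shop. A minimal hypothetical sketch, which may differ from the original implementation:

def growth(series, cur_col, prev_col):
    # hypothetical sketch, not the author's original: period-over-period ratio with guards
    # return 0 when the previous row belongs to a different shop, is missing, or is zero
    if series['USER_ID'] != series['USER_ID_s'] or pd.isna(series[prev_col]) or series[prev_col] == 0:
        return 0
    return series[cur_col] / series[prev_col]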
