Pearson相关性分析 & plot绘图(相关性系数柱状图、绘制非空值数量柱状图)

Pearson相关性分析 & plot绘图(相关性系数柱状图、绘制非空值数量柱状图)

1.Pearson相关性分析

  • Pearson相关性分析是一种用于衡量两个变量之间线性关系强度的统计方法,相关系数的取值范围为-1到1:相关系数为1表示完全正相关,-1表示完全负相关,0表示不存在线性关系。对Pearson相关系数做显著性检验时,假设数据近似服从正态分布;此外,该系数对异常值敏感。

2.Pearson相关性分析实例

# 计算pearsonr相关系数
def calculate_pearsonr(pd, target_col="目标变量"):
    """Rank features by absolute Pearson correlation with the target column.

    For every column other than ``target_col``, rows where the feature is
    missing are dropped and the Pearson coefficient between the remaining
    feature values and the matching target values is computed.

    Args:
        pd: A pandas ``DataFrame`` holding the feature columns and the
            target column. The name shadows the usual pandas alias; it is
            kept so existing callers keep working.
        target_col: Name of the target column. Defaults to ``"目标变量"``.

    Returns:
        A list of ``(feature_name, abs_coefficient)`` tuples sorted by
        coefficient in descending order. Features with fewer than two
        non-missing values, an undefined (NaN) coefficient, or an absolute
        coefficient not greater than 0.0001 are omitted.
    """
    # Imported locally so the function is self-contained: the surrounding
    # file never imports numpy or scipy at module level.
    import numpy as np
    from scipy.stats import pearsonr

    min_abs_coefficient = 0.0001  # coefficients this close to zero are dropped

    frame = pd
    target = frame[target_col]
    abs_coefficients = {}
    non_null_counts = {}

    for feature in frame.columns:
        if feature == target_col:
            continue

        column = frame[feature]
        # notna() recognises NaN, None and NA alike; a positional boolean
        # mask keeps feature and target values paired row by row.
        present = column.notna().to_numpy()
        non_null_counts[feature] = int(present.sum())
        if non_null_counts[feature] < 2:
            # Pearson's r needs at least two observations.
            continue

        feature_values = np.array(column[present].tolist())
        target_values = np.array(target[present].tolist())
        coefficient = pearsonr(feature_values, target_values)[0]
        if np.isnan(coefficient):
            # Undefined correlation, e.g. constant input or NaN in target.
            continue

        magnitude = abs(coefficient)
        if magnitude > min_abs_coefficient:
            abs_coefficients[feature] = magnitude

    dp_ca = sorted(
        abs_coefficients.items(),
        key=lambda item: item[1],
        reverse=True)
    print("pearsonr-相关系数:",dp_ca)

    dp_ca_Nempty = [(name, non_null_counts[name]) for name, _ in dp_ca]
    print("非空值的数量:",dp_ca_Nempty)

    return dp_ca

import matplotlib.pyplot as plt
def plot1(dp_ca):
    """Draw a bar chart with one bar per feature from ``dp_ca``.

    ``dp_ca`` is converted with ``dict()``, so it is expected to be a list
    of ``(feature, coefficient)`` tuples. The chart is written to
    ``./Pearson.jpeg`` and then shown on screen.
    """
    coefficients = dict(dp_ca)
    feature_names = list(coefficients)
    bar_heights = list(coefficients.values())

    # One large figure holding a single axes.
    _fig, ax = plt.subplots(figsize=(16, 10))

    # Bar chart of the correlation coefficients.
    ax.bar(feature_names, bar_heights)
    ax.set(
        title='Correlation between Feature and 目标变量',
        xlabel='Features',
        ylabel='Correlation Coefficient',
    )

    # Tilt the x tick labels by 45 degrees, anchored at their right end,
    # and set their font size to 10.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.tick_params(axis='x', labelsize=10)

    # Tighten the layout, save the image, then display it.
    plt.tight_layout()
    plt.savefig("./Pearson.jpeg")
    plt.show()

if __name__ == '__main__':
    # pandas is imported here because the file never imports it at module
    # level, so the read_excel call below would otherwise raise NameError.
    import pandas as pd

    # Load the spreadsheet, rank the features by correlation with the
    # target column, then plot the ranking.
    frame = pd.read_excel("./filename.xlsx")
    dp_ca = calculate_pearsonr(frame)
    plot1(dp_ca)

Pearson相关性分析& plot绘图(相关性系数柱状图、绘制非空值数量柱状图)_第1张图片

3.plot绘图(相关性系数柱状图、绘制非空值数量柱状图)

import matplotlib.pyplot as plt

# Sample data: (feature, correlation coefficient) and
# (feature, number of non-empty values) pairs.
dp_ca = [('feature1', 0.8), ('feature2', 0.6), ('feature3', 0.4),('feature4', 0.77), ('feature5', 0.2), ('feature6', 0.4)]
dp_ca_Nempty = [('feature1', 100), ('feature3', 50), ('feature2', 20),('feature4', 70), ('feature5', 10), ('feature6', 26)]

# Convert the tuple lists to dictionaries.
dp_ca_dict = dict(dp_ca)
dp_ca_Nempty_dict = dict(dp_ca_Nempty)

# Create two side-by-side subplots.
fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Bar chart of the correlation coefficients.
axs[0].bar(dp_ca_dict.keys(), dp_ca_dict.values())
axs[0].set_title('Pearson correlation coefficients')
axs[0].set_xlabel('Features')
axs[0].set_ylabel('Correlation coefficient')

# Bar chart of the number of non-empty values.
axs[1].bar(dp_ca_Nempty_dict.keys(), dp_ca_Nempty_dict.values())
axs[1].set_title('Number of non-empty values')
axs[1].set_xlabel('Features')
axs[1].set_ylabel('Count')

# Rotate the x tick labels by 45 degrees (anchored at their right end) and
# set their font size to 10 on BOTH subplots. plt.xticks() only affects the
# current axes (the last subplot), which left the first chart unrotated.
for ax in axs:
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.tick_params(axis='x', labelsize=10)

# Tighten the layout and display the figure.
plt.tight_layout()
plt.show()

Pearson相关性分析& plot绘图(相关性系数柱状图、绘制非空值数量柱状图)_第2张图片

你可能感兴趣的:(基础知识,python,numpy,机器学习)