相关性热图 (Python)

相关性热图 (Python)_第1张图片

随手记录, 避免以后重复工作.

任务:

  1. 计算两个二维数组 (DataFrame 形式) 每一列之间的相关性;
  2. 并画出热图;
  3. 标记显著程度 (p值小于0.05标记为 *, 小于0.01标记为**).

参考:

  • scipy.stats.spearmanr
  • scipy.stats.pearsonr

Functions: 计算相关性 (Spearman&Pearson)

import numpy as np
import pandas as pd
# compute correlations
from scipy.stats import spearmanr, pearsonr
from scipy.spatial.distance import cdist

def calc_spearman(df1, df2):
    """Compute Spearman rank correlations between the columns of two tables.

    Parameters
    ----------
    df1, df2 : DataFrame or array-like
        Two-dimensional data with the same number of rows. Each input is
        passed through ``pd.DataFrame``, so plain arrays are accepted.

    Returns
    -------
    corr, pval : DataFrame
        ``n1 x n2`` tables of correlation coefficients and p-values, indexed
        by the columns of ``df1`` (rows) and the columns of ``df2`` (columns).
    """
    df1 = pd.DataFrame(df1)
    df2 = pd.DataFrame(df2)
    n1 = df1.shape[1]
    n2 = df2.shape[1]
    corr0, pval0 = spearmanr(df1.values, df2.values)
    if np.ndim(corr0) == 0:
        # With only two variables in total (one column on each side),
        # spearmanr returns scalars instead of a matrix.
        cross_corr = np.array([[corr0]])
        cross_pval = np.array([[pval0]])
    else:
        # The full result is (n1 + n2) x (n1 + n2); keep only the
        # df1-vs-df2 block in the upper-right corner.
        cross_corr = corr0[:n1, n1:]
        cross_pval = pval0[:n1, n1:]
    corr = pd.DataFrame(cross_corr, index=df1.columns, columns=df2.columns)
    pval = pd.DataFrame(cross_pval, index=df1.columns, columns=df2.columns)
    return corr, pval

# 简便法, 但是不能获取 pvalue
# from scipy.spatial.distance import cdist
# def calc_pearson(df1, df2):
#     corr0 = 1 - cdist(df1.values.T, df2.values.T, metric='correlation')
#     corr = pd.DataFrame(corr0, index=df1.columns, columns=df2.columns)
#     return corr

def calc_pearson(df1, df2):
    """Compute Pearson correlations between the columns of two tables.

    Parameters
    ----------
    df1, df2 : DataFrame or array-like
        Two-dimensional data with the same number of rows. Each input is
        passed through ``pd.DataFrame``, so plain arrays are accepted.

    Returns
    -------
    corr, pval : DataFrame
        ``n1 x n2`` tables of correlation coefficients and p-values, indexed
        by the columns of ``df1`` (rows) and the columns of ``df2`` (columns).
    """
    df1 = pd.DataFrame(df1)
    df2 = pd.DataFrame(df2)
    n1 = df1.shape[1]
    n2 = df2.shape[1]
    # Extract the underlying arrays once instead of on every loop iteration.
    x = df1.values
    y = df2.values
    corr0, pval0 = np.zeros((n1, n2)), np.zeros((n1, n2))
    for row in range(n1):
        for col in range(n2):
            # pearsonr works on one pair of 1-D vectors at a time.
            _corr, _p = pearsonr(x[:, row], y[:, col])
            corr0[row, col] = _corr
            pval0[row, col] = _p
    # n1 x n2
    corr = pd.DataFrame(corr0, index=df1.columns, columns=df2.columns)
    pval = pd.DataFrame(pval0, index=df1.columns, columns=df2.columns)
    return corr, pval

Functions: 画出热图, 并且标记显著程度

import matplotlib.pyplot as plt
import seaborn as sns

def pvalue_marker(pval, corr=None, only_pos=False):
    """Turn a table of p-values into significance markers.

    Parameters
    ----------
    pval : DataFrame
        Table of p-values.
    corr : DataFrame, optional
        Correlation coefficients matching ``pval``; only used when
        ``only_pos`` is true.
    only_pos : bool
        If true, mark only positive correlations (requires ``corr``).

    Returns
    -------
    DataFrame
        Same index/columns as ``pval``; each cell is ``'**'`` for
        ``p < 0.01``, ``'*'`` for ``p < 0.05`` and ``''`` otherwise
        (NaN p-values get no marker).
    """
    if only_pos:  # mark positive correlations only
        if corr is None:
            print('correlations `corr` is not provided, '
                  'negative correlations cannot be filtered!')
        else:
            # Adding 1 to the p-value of every negative correlation pushes it
            # above both thresholds, so those cells end up unmarked.
            pval = pval + (corr < 0).astype(float)
    p = pval.values
    # Vectorised thresholds; avoids DataFrame.applymap, which is deprecated
    # in recent pandas releases.
    stars = np.where(p < 0.01, '**', np.where(p < 0.05, '*', ''))
    pval_marker = pd.DataFrame(
        stars, index=pval.index, columns=pval.columns, dtype=object)
    return pval_marker

def plot_heatmap(
    mat, cmap='RdBu_r',
    xlabel='column', ylabel='row',
    tt='',
    fp=None,
    **kwds
):
    """Draw ``mat`` as a seaborn heatmap and optionally save the figure.

    Parameters
    ----------
    mat : 2-D data accepted by ``sns.heatmap``
        Values to plot (e.g. a correlation table).
    cmap : str
        Colormap forwarded to ``sns.heatmap``.
    xlabel, ylabel : str
        Axis labels.
    tt : str
        Axes title.
    fp : path-like, optional
        If given, the figure is saved there with ``bbox_inches='tight'``.
    **kwds
        Extra keyword arguments forwarded to ``sns.heatmap`` (e.g.
        ``annot``, ``fmt``, ``vmin``, ``vmax``). ``ax`` and ``cbar_kws`` may
        be supplied here; by default a new figure is created and the
        colorbar is shrunk to half height.

    Returns
    -------
    The matplotlib axes the heatmap was drawn on.
    """
    # Honour a caller-supplied axes instead of failing with a duplicate
    # keyword error; otherwise create a fresh figure.
    ax = kwds.pop('ax', None)
    if ax is None:
        _, ax = plt.subplots()
    # Same reasoning for the colorbar options: only fill in the default.
    kwds.setdefault('cbar_kws', {'shrink': 0.5})
    sns.heatmap(mat, ax=ax, cmap=cmap, **kwds)
    ax.set_title(tt)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if fp is not None:
        ax.figure.savefig(fp, bbox_inches='tight')
    return ax
    
'''
Examples
--------
>>> corr, pval = calc_pearson(df1, df2)
>>> pval_marker = pvalue_marker(pval, corr, only_pos=True)
>>> tt = 'Pearson correlations'
>>> plot_heatmap(
...     corr, xlabel='df2', ylabel='df1',
...     tt=tt, cmap='RdBu_r', # vmax=0.75, vmin=-0.1,
...     annot=pval_marker, fmt='s',
...     fp=f'PsCorr-{tt}.pdf',
... )

'''

例子

构造两个有点相关性的随机矩阵:

# Example data: df1 is pure Gaussian noise (40 samples x 9 variables).
df1 = pd.DataFrame(np.random.randn(40, 9))
# Column j of df2 mixes column j of df1 with 0.6 times column j + 1, so each
# df2 column correlates with two neighbouring df1 columns.
df2 = df1.iloc[:, :-1] + 0.6 * df1.iloc[:, 1:].values
# Add a little independent noise on top.
df2 = df2 + 0.2 * np.random.randn(*df2.shape)

相关性热图 (Python)_第2张图片
Spearman correlations

# Spearman example: use the rank-based estimator that matches the title, and
# mark significance on positive correlations only.
corr, pval = calc_spearman(df1, df2)
pval_marker = pvalue_marker(pval, corr, only_pos=True)
tt = 'Spearman correlations'
plot_heatmap(
    corr, xlabel='df2', ylabel='df1',
    tt=tt, cmap='RdBu_r', #vmax=0.75, vmin=-0.1,
    annot=pval_marker, fmt='s',
)

相关性热图 (Python)_第3张图片

Pearson correlations

# Pearson example: mark significance on positive correlations only.
corr, pval = calc_pearson(df1, df2)
pval_marker = pvalue_marker(pval, corr, only_pos=True)
tt = 'Pearson correlations'
plot_heatmap(
    corr, xlabel='df2', ylabel='df1',
    tt=tt, cmap='RdBu_r', #vmax=0.75, vmin=-0.1,
    annot=pval_marker, fmt='s',
)

相关性热图 (Python)_第4张图片
only_pos 这个参数为 False 时, 会同时标记显著的正相关和负相关.

# With only_pos=False both significant positive and significant negative
# correlations receive a marker. Reuses corr, pval and tt from the preceding
# example.
pval_marker = pvalue_marker(pval, corr, only_pos=False)
plot_heatmap(
    corr, xlabel='df2', ylabel='df1',
    tt=tt, cmap='RdBu_r', #vmax=0.75, vmin=-0.1,
    annot=pval_marker, fmt='s',
)

相关性热图 (Python)_第5张图片

你可能感兴趣的:(数学girl的代码小记,python,数据可视化)