随手记录, 避免以后重复工作.
任务: 计算两个矩阵 (`DataFrame` 形式) 每一列之间的相关性, 并用热图可视化; 同时在热图上标记显著性 (p-value 小于 0.05 标记为 `*`, 小于 0.01 标记为 `**`). 参考:
import numpy as np
import pandas as pd
# compute correlations
from scipy.stats import spearmanr, pearsonr
from scipy.spatial.distance import cdist
def calc_spearman(df1, df2):
    """Compute Spearman rank correlations between the columns of two tables.

    Parameters
    ----------
    df1, df2 : DataFrame or array-like
        Two-dimensional data with observations in rows. Both are passed
        through ``pd.DataFrame`` and handed to ``scipy.stats.spearmanr``
        together, so they must have the same number of rows.

    Returns
    -------
    corr : pd.DataFrame
        Correlation coefficients; rows are labelled by the columns of
        ``df1`` and columns by the columns of ``df2``.
    pval : pd.DataFrame
        The matching p-values, same layout as ``corr``.
    """
    df1 = pd.DataFrame(df1)
    df2 = pd.DataFrame(df2)
    n1 = df1.shape[1]

    corr0, pval0 = spearmanr(df1.values, df2.values)
    if np.ndim(corr0) == 0:
        # With exactly two variables in total, spearmanr returns scalars
        # instead of a 2 x 2 matrix. Rebuild the matrix so the slicing below
        # works; the diagonal entries are placeholders that are sliced away.
        corr0 = np.array([[1.0, corr0], [corr0, 1.0]])
        pval0 = np.array([[0.0, pval0], [pval0, 0.0]])

    # The full matrices are (n1 + n2) x (n1 + n2); keep the df1-vs-df2 block.
    # Slicing with `n1:` (not `-n2:`) avoids selecting everything when n2 == 0.
    corr = pd.DataFrame(corr0[:n1, n1:], index=df1.columns, columns=df2.columns)
    pval = pd.DataFrame(pval0[:n1, n1:], index=df1.columns, columns=df2.columns)
    return corr, pval
# Shortcut via `cdist`: simpler, but it cannot provide p-values
# from scipy.spatial.distance import cdist
# def calc_pearson(df1, df2):
# corr0 = 1 - cdist(df1.values.T, df2.values.T, metric='correlation')
# corr = pd.DataFrame(corr0, index=df1.columns, columns=df2.columns)
# return corr
def calc_pearson(df1, df2):
    """Compute Pearson correlations between the columns of two tables.

    Every column of ``df1`` is paired with every column of ``df2`` and
    passed to ``scipy.stats.pearsonr``.

    Parameters
    ----------
    df1, df2 : DataFrame or array-like
        Two-dimensional data with observations in rows; each is passed
        through ``pd.DataFrame``.

    Returns
    -------
    corr : pd.DataFrame
        ``n1 x n2`` correlation coefficients; rows are labelled by the
        columns of ``df1`` and columns by the columns of ``df2``.
    pval : pd.DataFrame
        The matching p-values, same layout as ``corr``.
    """
    left = pd.DataFrame(df1)
    right = pd.DataFrame(df2)

    # Transpose so that iterating yields one variable (column) at a time.
    left_vars = left.values.T
    right_vars = right.values.T

    shape = (left.shape[1], right.shape[1])
    corr0 = np.zeros(shape)
    pval0 = np.zeros(shape)
    for i, x in enumerate(left_vars):
        for j, y in enumerate(right_vars):
            corr0[i, j], pval0[i, j] = pearsonr(x, y)

    labels = dict(index=left.columns, columns=right.columns)
    return pd.DataFrame(corr0, **labels), pd.DataFrame(pval0, **labels)
import matplotlib.pyplot as plt
import seaborn as sns
def pvalue_marker(pval, corr=None, only_pos=False):
    """Turn a table of p-values into significance markers for annotation.

    Parameters
    ----------
    pval : pd.DataFrame
        Table of p-values.
    corr : pd.DataFrame, optional
        Correlation coefficients aligned with ``pval``. Only used when
        ``only_pos`` is true.
    only_pos : bool, default False
        If true, cells whose correlation is negative never receive a
        marker. When ``corr`` is not given, a message is printed and all
        cells are marked by p-value alone.

    Returns
    -------
    pd.DataFrame
        Same layout as ``pval``; each cell is ``'**'`` (p < 0.01), ``'*'``
        (p < 0.05) or ``''``.
    """
    if only_pos:  # mark positive correlations only
        if corr is None:
            print('correlations `corr` is not provided, '
                  'negative correlations cannot be filtered!')
        else:
            # Adding 1 to the p-value of every negative correlation pushes
            # it to >= 1, so it can never fall below a threshold.
            pval = pval + (corr < 0).astype(float)

    def _marker(p):
        if p < 0.01:
            return '**'
        if p < 0.05:
            return '*'
        return ''

    # `DataFrame.applymap` is deprecated since pandas 2.1 in favour of
    # `DataFrame.map`. Prefer `map` when the installed pandas has it. The
    # lookup is done on the class so that a column named "map" cannot
    # shadow the method.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
    return elementwise(pval, _marker)
def plot_heatmap(
        mat, cmap='RdBu_r',
        xlabel='column', ylabel='row',
        tt='',
        fp=None,
        **kwds
):
    """Draw ``mat`` as a seaborn heatmap on a new figure.

    Parameters
    ----------
    mat
        Data forwarded to ``sns.heatmap``.
    cmap : str, default 'RdBu_r'
        Colormap forwarded to ``sns.heatmap``.
    xlabel, ylabel : str
        Axis labels.
    tt : str, default ''
        Axes title.
    fp : optional
        If not None, the figure is saved to this location with
        ``bbox_inches='tight'``.
    **kwds
        Extra keyword arguments forwarded to ``sns.heatmap``
        (e.g. ``annot``, ``fmt``, ``vmin``, ``vmax``).

    Returns
    -------
    The matplotlib axes the heatmap was drawn on.
    """
    figure, axes = plt.subplots()
    colorbar_opts = {'shrink': 0.5}
    sns.heatmap(mat, ax=axes, cmap=cmap, cbar_kws=colorbar_opts, **kwds)

    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_title(tt)

    if fp is None:
        return axes
    figure.savefig(fp, bbox_inches='tight')
    return axes
'''
Examples
--------
>>> corr, pval = calc_pearson(df1, df2)
>>> pval_marker = pvalue_marker(pval, corr, only_pos=True)
>>> tt = 'Pearson correlations'
>>> plot_heatmap(
... corr, xlabel='df2', ylabel='df1',
... tt=tt, cmap='RdBu_r', # vmax=0.75, vmin=-0.1,
... annot=pval_marker, fmt='s',
... fp=f'PsCorr-{tt}.pdf',
... )
'''
构造两个有点相关性的随机矩阵:
# Toy data: 40 observations of 9 standard-normal variables.
df1 = pd.DataFrame(np.random.randn(40, 9))
# Column j of df2 mixes columns j and j + 1 of df1 (weight 0.6 on the latter),
# so df2 has 8 columns that are correlated with their df1 neighbours.
df2 = df1.iloc[:, :-1] + 0.6 * df1.iloc[:, 1:].values
# Add a little Gaussian noise on top.
df2 = df2 + 0.2 * np.random.randn(*df2.shape)
# Spearman correlations between the columns of df1 and df2.
# (Originally this called calc_pearson although the plot is titled Spearman.)
corr, pval = calc_spearman(df1, df2)
# only_pos=True: only significant *positive* correlations get a marker.
pval_marker = pvalue_marker(pval, corr, only_pos=True)
tt = 'Spearman correlations'
plot_heatmap(
    corr, xlabel='df2', ylabel='df1',
    tt=tt, cmap='RdBu_r',  # vmax=0.75, vmin=-0.1,
    annot=pval_marker, fmt='s',
)
Pearson correlations
# Pearson correlations between the columns of df1 and df2.
corr, pval = calc_pearson(df1, df2)
# only_pos=True: only significant *positive* correlations get a marker.
# (The bare name `only_pos` was never defined at this point.)
pval_marker = pvalue_marker(pval, corr, only_pos=True)
tt = 'Pearson correlations'
plot_heatmap(
    corr, xlabel='df2', ylabel='df1',
    tt=tt, cmap='RdBu_r',  # vmax=0.75, vmin=-0.1,
    annot=pval_marker, fmt='s',
)
`only_pos` 参数为 False 时, 会同时标记显著的正相关和负相关.
# Mark significant correlations of either sign (positive and negative).
pval_marker = pvalue_marker(pval, corr, only_pos=False)
plot_heatmap(
    corr,
    xlabel='df2',
    ylabel='df1',
    tt=tt,
    cmap='RdBu_r',  # vmax=0.75, vmin=-0.1,
    annot=pval_marker,
    fmt='s',
)