绘制火山图:输入是两个 DataFrame,行是样本名,列是基因名。使用 t 检验(T-test)判断每个基因的表达上调/下调情况并绘图。
def minmax_scale(data):
    """Rescale ``data`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        data: Numeric values. Plain sequences (list, tuple) are converted to
            a float NumPy array; objects that already carry a ``dtype``
            (NumPy arrays, pandas Series) are used as they are.

    Returns:
        The rescaled values. When every value is identical the range is
        zero, so zeros are returned instead of dividing by zero.
    """
    import numpy as np

    values = data if hasattr(data, "dtype") else np.asarray(data, dtype=float)
    min_val = np.min(values)
    span = np.max(values) - min_val
    if span == 0:
        # Constant input: (x - min) / 0 would turn every element into NaN.
        # (x - min) is already all zeros; multiplying by 0.0 just makes it float.
        return (values - min_val) * 0.0
    return (values - min_val) / span
# t-test: decide whether a gene's expression is up- or down-regulated
def gene_exp(group1, group2, is_scale=False, alpha=0.05):
    """Compare one gene's expression between two sample groups.

    Runs a two-sample t-test with unequal variances (``equal_var=False``)
    and computes ``log2(mean(group2) / mean(group1))``.

    Args:
        group1: Expression values of the first group.
        group2: Expression values of the second group.
        is_scale: When true, each group is passed through ``minmax_scale``
            before testing.
        alpha: Significance level the p-value is compared against.

    Returns:
        A tuple ``(label, fold_change, p_value)`` where ``label`` is
        ``'Up'`` (significant and log2 fold change > 0), ``'Down'``
        (significant and log2 fold change < 0) or ``'Unknown'`` otherwise.
        ``fold_change`` may be NaN or infinite, e.g. when a group mean is 0.
    """
    from scipy import stats
    import numpy as np

    if is_scale:
        # NOTE(review): the two groups are rescaled independently, so both
        # end up spanning the same range before they are compared --
        # confirm this is the intended normalisation.
        group1 = minmax_scale(group1)
        group2 = minmax_scale(group2)

    # Differential expression test on two independent samples.
    _, p_value = stats.ttest_ind(group1, group2, equal_var=False)

    # log2 fold change of group2 relative to group1.
    fold_change = np.log2(np.mean(group2) / np.mean(group1))

    if p_value < alpha:
        if fold_change > 0:
            return 'Up', fold_change, p_value
        if fold_change < 0:
            return 'Down', fold_change, p_value
    # Not significant, zero change, or NaN (all NaN comparisons are False).
    return 'Unknown', fold_change, p_value
# Build the list of point colours for the plot
def clst(lst):
    """Translate regulation labels into matplotlib colour codes.

    ``'Up'`` becomes ``'r'``, ``'Down'`` becomes ``'g'`` and every other
    value becomes ``'k'``. The result has one entry per input element, in
    the same order.
    """
    def _colour(tag):
        if tag == 'Up':
            return 'r'
        return 'g' if tag == 'Down' else 'k'

    return [_colour(tag) for tag in lst]
def data_exp(df1, df2, is_scale=False):
    """Run ``gene_exp`` on every column of ``df1`` against the same column of ``df2``.

    Args:
        df1: DataFrame for group 1; rows are samples, columns are genes.
        df2: DataFrame for group 2 with the same columns as ``df1``; the
            number of rows (samples) may differ.
        is_scale: Forwarded to ``gene_exp``.

    Returns:
        Three parallel lists ``(labels, fold_changes, p_values)``. Genes
        whose fold change or p-value is NaN or infinite are printed but left
        out, so the lists can be shorter than ``df1.columns`` and are not
        index-aligned with it.
    """
    import numpy as np

    rmk_lst = []
    fc_lst = []
    pv_lst = []
    for k in df1.columns:
        rmk, fc, pv = gene_exp(list(df1[k]), list(df2[k]), is_scale=is_scale)

        # Column names may be plain strings or tuples (multi-level columns).
        # Joining a plain string would split it into single characters.
        if isinstance(k, str):
            name = k
        elif isinstance(k, (tuple, list)):
            name = ':'.join(str(part) for part in k)
        else:
            name = str(k)
        print('\t'.join([name, rmk, str(fc), str(pv)]))

        # NaN and inf can appear (e.g. zero mean or constant data); skip them.
        if not (np.isfinite(fc) and np.isfinite(pv)):
            continue
        rmk_lst.append(rmk)
        fc_lst.append(fc)
        pv_lst.append(pv)
    return rmk_lst, fc_lst, pv_lst
def plot_volcano(fold_change, p_value, title, pltcolor=False, regulation=None, alpha=0.05):
    """Draw a volcano plot of log2 fold change against -log10(p-value).

    Args:
        fold_change: Sequence of log2 fold changes (x axis).
        p_value: Sequence of p-values, one per fold change.
        title: Text inserted into the plot title.
        pltcolor: When true, colour each point using ``regulation``.
        regulation: Sequence of labels understood by ``clst``, one per
            point. Only used when ``pltcolor`` is true.
        alpha: Significance level; a dashed horizontal line is drawn at
            ``-log10(alpha)``.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()

    # Clamp p-values to machine epsilon so that p == 0 does not become inf.
    neg_log_pval = -np.log10(np.maximum(p_value, np.finfo(float).eps))

    if pltcolor:
        # Colour points by up-/down-regulation label.
        colors = clst([] if regulation is None else regulation)
        ax.scatter(fold_change, neg_log_pval, c=colors)
    else:
        ax.scatter(fold_change, neg_log_pval)

    ax.set_xlabel('Fold Change (log2)')
    ax.set_ylabel('-log10(p-value)')
    ax.set_title('Volcano Plot[%s]' % title)

    # Horizontal line marking the significance threshold.
    ax.axhline(-np.log10(alpha), color='gray', linestyle='--')

    plt.show()
使用方法:
# Get the up/down-regulation labels, fold changes and p-values:
rmklst, fclst, pvlst = data_exp(data_df1, data_df2, is_scale=True)
# Draw the volcano plot
plot_volcano(list(fclst), list(pvlst), title="mytitle", pltcolor=True, regulation=rmklst)
代码问题:左右标签没有对齐
import pandas as pd
import matplotlib.pyplot as plt
# Example DataFrame 1
data1 = {'城市': ['北京', '上海', '广州', '深圳', '成都'],
         '人口': [2154, 2423, 1404, 1303, 1682]}
df1 = pd.DataFrame(data1)
# Example DataFrame 2
data2 = {'城市': ['纽约', '伦敦', '巴黎', '东京', '新加坡'],
         '人口': [8537, 9304, 2141, 13929, 5894]}
df2 = pd.DataFrame(data2)
# Helper values
n = len(df1)  # number of rows
width = 0.35  # thickness of each bar
# Create the figure and the main axis
fig, ax = plt.subplots(figsize=(10, 6))
# Bars for DataFrame 1 (positive direction)
ax.barh(df1.index, df1['人口'], height=width, color='blue', label='DataFrame 1')
# Bars for DataFrame 2 (negated, so they extend in the opposite direction)
ax.barh(df2.index, -df2['人口'], height=width, color='red', label='DataFrame 2')
# Left-hand y ticks and labels (DataFrame 1)
ax.set_yticks(df1.index)
ax.set_yticklabels(df1['城市'])
# Right-hand y ticks and labels (DataFrame 2).
# The labels keep the same order as the bars drawn at df2.index; reversing
# them would attach each name to the wrong bar.
ax2 = ax.twinx()
ax2.set_yticks(df2.index)
ax2.set_yticklabels(df2['城市'])
# A twin axis has its own y-limits; copy the main axis limits so that the
# ticks on both sides sit at the same height as the bars.
ax2.set_ylim(ax.get_ylim())
# Tick appearance on both sides
ax.tick_params(axis="y", direction="inout", length=6, pad=10)  # left labels
ax2.tick_params(axis="y", direction="inout", length=6, pad=10)  # right labels
# Show the legend
ax.legend()
# Show the chart
plt.show()
自定义划分的bin区间
import numpy as np
import matplotlib.pyplot as plt
def histplot(df0, df1, label, xmax=None, nrows=8, ncols=5):
    """Plot overlaid histograms of every column of two DataFrames on a grid.

    Each column of ``df0`` gets one subplot showing its distribution and the
    distribution of the same column in ``df1``, using fixed bin edges
    ``0, 0.1, ..., 1.0``.

    Args:
        df0: First DataFrame; its columns decide which subplots are drawn.
        df1: Second DataFrame; must contain the same columns.
        label: Legend prefix; the two series are labelled ``label + '-0'``
            and ``label + '-1'``.
        xmax: If given, the x axis of every subplot is limited to
            ``(0, xmax)``.
        nrows: Number of subplot rows.
        ncols: Number of subplot columns. ``nrows * ncols`` must be at least
            the number of columns in ``df0``.
    """
    # Manually chosen bin edges; adjust as needed.
    bins = np.arange(0, 1.1, 0.1)
    # squeeze=False keeps axs two-dimensional even for a single row/column.
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 20),
                            sharex=True, sharey=True, squeeze=False)
    for i, idx_name in enumerate(df0.columns):
        row, col = divmod(i, ncols)
        ax = axs[row, col]
        # Both series share the same edges so their bars line up.
        ax.hist(df0[idx_name], bins=bins, label=label+'-0')
        ax.hist(df1[idx_name], bins=bins, alpha=0.5, label=label+'-1')
        ax.set_title(idx_name, fontsize=8)
        ax.grid(axis="y")
        ax.legend()
        if xmax is not None:
            ax.set_xlim(0, xmax)
    plt.show()
# Example call; data_df0 and data_df1 are not defined in the visible code and
# must be provided by the caller.
histplot(data_df0, data_df1, 'label')
def bins_lst(lst1, lst2, n=10):
    """Build ``n`` equal-width histogram bins covering both inputs.

    Args:
        lst1: First collection of numbers.
        lst2: Second collection of numbers.
        n: Number of bins.

    Returns:
        A NumPy array of ``n + 1`` increasing bin edges. The first edge is
        the combined minimum and the last edge is the combined maximum, so
        no value falls outside the bins. If all values are equal the range
        is widened by 0.5 on each side.

    Raises:
        ValueError: If both inputs are empty (raised by ``min``).
    """
    import numpy as np

    merged = list(lst1) + list(lst2)
    low = float(min(merged))
    high = float(max(merged))
    if low == high:
        # Zero-width range: widen it so the edges are strictly increasing.
        low -= 0.5
        high += 0.5
    # linspace hits both end points exactly; a rounded step fed to arange
    # can collapse to 0 for narrow ranges or leave data outside the edges.
    return np.linspace(low, high, n + 1)
def histplot_bin(hl_df, pc_df, label, nrows=2, hsize=6, nbin=10, xmax=None, ncols=5):
    """Plot overlaid histograms per column, with bin edges computed per column.

    For every column of ``hl_df`` the bin edges are obtained from
    ``bins_lst`` over the values of that column in both DataFrames, and both
    distributions are drawn in the same subplot.

    Args:
        hl_df: First DataFrame; its columns decide which subplots are drawn.
        pc_df: Second DataFrame; must contain the same columns.
        label: Legend prefix; the two series are labelled ``label + '-0'``
            and ``label + '-1'``.
        nrows: Number of subplot rows.
        hsize: Figure height in inches (the width is fixed at 9).
        nbin: Number of bins requested from ``bins_lst``.
        xmax: If given, the x axis of every subplot is limited to
            ``(0, xmax)``.
        ncols: Number of subplot columns. ``nrows * ncols`` must be at least
            the number of columns in ``hl_df``.
    """
    # squeeze=False keeps axs two-dimensional, so nrows=1 also works with
    # the [row, col] indexing below.
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(9, hsize),
                            squeeze=False)
    for i, idx_name in enumerate(hl_df.columns):
        row, col = divmod(i, ncols)
        ax = axs[row, col]
        # Shared edges for both series so their bars line up.
        bins = bins_lst(hl_df[idx_name], pc_df[idx_name], n=nbin)
        ax.hist(hl_df[idx_name], bins=bins, label=label+'-0')
        ax.hist(pc_df[idx_name], bins=bins, alpha=0.5, label=label+'-1')
        ax.set_title(idx_name, fontsize=8)
        ax.grid(axis="y")
        if xmax is not None:
            ax.set_xlim(0, xmax)
        ax.legend()
    plt.show()
# Usage
histplot_bin(df1[features], df2[features], label="label", nrows=3, hsize=8)