# plotnine: ggplot2 in Python
import plotnine as pn
from plotnine import data
import numpy as np
import pandas as pd
# statistical analysis
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols, glm, poisson
import copy
# suppress pandas warnings (e.g., SettingWithCopyWarning about views vs. copies of a DataFrame)
import warnings
warnings.filterwarnings("ignore")
# make Jupyter display every result in a cell automatically, without explicit print calls
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
Take a subset df of mtcars, with 32 records and 6 variables:
df = data.mtcars[["wt", "mpg", "cyl", "vs", "am", "gear"]]
df.shape
df.dtypes
print(df.head())
(32, 6)
wt float64
mpg float64
cyl int64
vs int64
am int64
gear int64
dtype: object
wt mpg cyl vs am gear
0 2.620 21.0 6 0 1 4
1 2.875 21.0 6 0 1 4
2 2.320 22.8 4 1 1 4
3 3.215 21.4 6 1 0 3
4 3.440 18.7 8 0 0 3
Convert vs, am, and gear from numeric variables to string-typed categorical variables:
df["vs"] = df["vs"].astype(str)
df["am"] = df["am"].astype(str)
df["gear"] = df["gear"].astype(str)
df.dtypes
wt float64
mpg float64
cyl int64
vs object
am object
gear object
dtype: object
Variable distributions
# common summary statistics for the continuous variables
print(df.describe())
# categories and frequencies of the categorical variables
df["vs"].value_counts()
df["am"].value_counts()
df["gear"].value_counts()
wt mpg cyl
count 32.000000 32.000000 32.000000
mean 3.217250 20.090625 6.187500
std 0.978457 6.026948 1.785922
min 1.513000 10.400000 4.000000
25% 2.581250 15.425000 4.000000
50% 3.325000 19.200000 6.000000
75% 3.610000 22.800000 8.000000
max 5.424000 33.900000 8.000000
0 18
1 14
Name: vs, dtype: int64
0 19
1 13
Name: am, dtype: int64
3 15
4 12
5 5
Name: gear, dtype: int64
plotnine is a Python plotting library that mimics the syntax and visual style of ggplot2; anyone familiar with R's ggplot2 can pick it up quickly (see the official documentation).
Its usage differs from R's ggplot2 in two main ways:
- In the mapping argument of ggplot(), column names must be quoted strings: R's aes(x = mpg, y = wt) becomes aes(x="mpg", y="wt") in plotnine.
- A plot built across multiple lines must either join the lines with \ or wrap the whole statement in parentheses.
A few simple examples follow. First, a scatter plot with a fitted regression line and its 95% confidence band:
pn.ggplot(data = df, mapping=pn.aes(x="mpg", y="wt")) + \
pn.geom_point() + \
pn.geom_smooth(method="lm") + \
pn.theme_classic()
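The same plot written in the parenthesized style mentioned above, instead of backslash continuation:
(
    pn.ggplot(data=df, mapping=pn.aes(x="mpg", y="wt"))
    + pn.geom_point()
    + pn.geom_smooth(method="lm")
    + pn.theme_classic()
)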
Grouped by vs, draw scatter points and per-group regression lines with their 95% confidence bands:
pn.ggplot(data = df, mapping=pn.aes(x="mpg", y="wt", color="vs")) + \
pn.geom_point() + \
pn.geom_smooth(method="lm") + \
pn.theme_classic()
Facet by the variables vs and gear:
pn.ggplot(data = df, mapping=pn.aes(x="mpg", y="wt")) + \
pn.facet_grid("vs ~ gear") + \
pn.geom_point() + \
pn.geom_smooth(method="lm") + \
pn.theme_xkcd()
Missing-value counts per variable:
df.apply(lambda x: sum(x.isnull())).sort_values()
wt 0
mpg 0
cyl 0
vs 0
am 0
gear 0
dtype: int64
Distribution summaries for the continuous and categorical variables:
print(df.describe())
# count: number of non-missing values of the categorical variable
# unique: number of distinct values
# top: the most frequent value
# freq: how many times the most frequent value occurs
print(df.describe(include="object"))
wt mpg cyl
count 32.000000 32.000000 32.000000
mean 3.217250 20.090625 6.187500
std 0.978457 6.026948 1.785922
min 1.513000 10.400000 4.000000
25% 2.581250 15.425000 4.000000
50% 3.325000 19.200000 6.000000
75% 3.610000 22.800000 8.000000
max 5.424000 33.900000 8.000000
vs am gear
count 32 32 32
unique 2 2 3
top 0 0 3
freq 18 19 15
# frequencies
df["vs"].value_counts()
# proportions
df["vs"].value_counts(normalize=True)
0 18
1 14
Name: vs, dtype: int64
0 0.5625
1 0.4375
Name: vs, dtype: float64
# variance (note: np.var/np.std default to ddof=0, the population formulas;
# pandas .var()/.std() default to the sample formulas, ddof=1)
np.var(df["wt"])
# standard deviation
np.std(df["wt"])
0.927460875
0.9630477013107918
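For comparison, the sample versions (ddof=1), which match pandas' .var() and .std() defaults, can be requested explicitly:
np.var(df["wt"], ddof=1)  # sample variance
np.std(df["wt"], ddof=1)  # sample standard deviation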
# mode
stats.mode(df["wt"])
ModeResult(mode=array([3.44]), count=array([3]))
# skewness
stats.skew(df["wt"])
0.44378553550607736
# kurtosis
stats.kurtosis(df["wt"])
0.1724705401587343
Standard error of the mean of a normally distributed sample, and its 95% confidence interval:
# standard error of the mean
se = np.std(df["wt"]) / np.sqrt(len(df["wt"]))
se
# lower and upper limits of the interval
np.mean(df["wt"]) - 1.96 * se
np.mean(df["wt"]) + 1.96 * se
0.17024439005074438
2.883570995500541
3.550929004499459
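The same quantities can also be obtained from scipy; a minimal sketch. Note that stats.sem uses ddof=1 by default, so the numbers differ slightly from the manual calculation above:
se1 = stats.sem(df["wt"])                                    # standard error of the mean, ddof=1
stats.norm.interval(0.95, loc=np.mean(df["wt"]), scale=se1)  # normal-approximation 95% CI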
Shapiro-Wilk test: p = 0.09, so at the 0.05 significance level we fail to reject the null hypothesis; no departure of wt from normality was detected.
stats.shapiro(df["wt"])
ShapiroResult(statistic=0.9432578682899475, pvalue=0.09265592694282532)
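A Q-Q plot is a common visual complement to the Shapiro-Wilk test; a minimal sketch using statsmodels:
# points lying close to the reference line suggest approximate normality
sm.qqplot(df["wt"], line="s")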
Bartlett's test: p = 0.38, no evidence of unequal variances.
stats.bartlett(df.loc[df["vs"] == "0", "wt"], df.loc[df["vs"] == "1", "wt"])
BartlettResult(statistic=0.7611752294629192, pvalue=0.38296098768025166)
Independent-samples t-test: p = 0.001, the two group means differ significantly.
# the variance test above showed equal variances, hence equal_var=True; if variances were
# unequal, use Welch's corrected t-test (equal_var=False) or a rank-sum test instead
stats.ttest_ind(df.loc[df["vs"] == "0", "wt"], df.loc[df["vs"] == "1", "wt"], equal_var=True)
Ttest_indResult(statistic=3.653533152238974, pvalue=0.0009798492309250216)
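Had the variance test failed, the Welch correction mentioned in the comment above is a one-argument change:
# Welch's t-test: does not assume equal group variances
stats.ttest_ind(df.loc[df["vs"] == "0", "wt"], df.loc[df["vs"] == "1", "wt"],
                equal_var=False)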
# build an artificial paired grouping; note that .loc slicing is label-inclusive,
# so row 16 first receives "0" and is then overwritten with "1", leaving 16 rows per group
df_copy = copy.deepcopy(df)
df_copy.loc[:16, "group"] = "0"
df_copy.loc[16:, "group"] = "1"
df_copy["group"].value_counts()
0 16
1 16
Name: group, dtype: int64
stats.ttest_rel(df_copy.loc[df_copy["group"] == "0", "wt"],
df_copy.loc[df_copy["group"] == "1", "wt"])
Ttest_relResult(statistic=2.0782957471723527, pvalue=0.05526298801644364)
On independent versus related samples, an example: an RCT (randomized controlled trial) studies the antihypertensive effect of drug A versus drug B in elderly patients. Participants are randomized to group A or group B, each group takes its drug for 14 days, and blood pressure is measured at baseline (enrollment) and on day 14. Within group A (or group B), to test whether blood pressure differs significantly before versus after treatment, use the paired-samples t-test (equivalent to testing whether the mean of the before-after differences deviates significantly from 0). To compare the effect of A versus B, first compute each group's before-after change, then run an independent-samples t-test on the two groups' changes (alternatives include analysis of covariance with baseline blood pressure as a covariate).
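A hypothetical sketch of the analyses just described (the data and the column names arm, bp_baseline, and bp_day14 are invented for illustration):
rng = np.random.default_rng(0)
rct = pd.DataFrame({
    "arm": ["A"] * 10 + ["B"] * 10,          # treatment arm (hypothetical)
    "bp_baseline": rng.normal(150, 10, 20),  # blood pressure at enrollment
    "bp_day14": rng.normal(140, 10, 20),     # blood pressure on day 14
})
# within one arm: paired t-test on before vs. after
grp_a = rct[rct["arm"] == "A"]
stats.ttest_rel(grp_a["bp_baseline"], grp_a["bp_day14"])
# between arms: independent t-test on the change scores
rct["change"] = rct["bp_day14"] - rct["bp_baseline"]
stats.ttest_ind(rct.loc[rct["arm"] == "A", "change"],
                rct.loc[rct["arm"] == "B", "change"])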
Nonparametric counterpart for two independent samples (Wilcoxon rank-sum test):
stats.ranksums(df.loc[df["vs"] == "0", "wt"], df.loc[df["vs"] == "1", "wt"])
RanksumsResult(statistic=3.2668698585096214, pvalue=0.0010874365767340446)
Nonparametric counterpart for two related samples (Wilcoxon signed-rank test):
stats.wilcoxon(df_copy.loc[df_copy["group"] == "0", "wt"],
               df_copy.loc[df_copy["group"] == "1", "wt"])
WilcoxonResult(statistic=27.0, pvalue=0.033538818359375)
Fit a regression model first, then run ANOVA on it to obtain the analysis-of-variance table.
p = 0.0003: the means of the three gear groups differ significantly.
print(sm.stats.anova_lm(ols("wt ~ C(gear)", data = df).fit()))
df sum_sq mean_sq F PR(>F)
C(gear) 2.0 12.878947 6.439473 11.115889 0.000261
Residual 29.0 16.799801 0.579303 NaN NaN
Following up with Tukey HSD multiple comparisons, groups 3 vs. 4 and 3 vs. 5 differ significantly:
mc = sm.stats.multicomp.MultiComparison(df["wt"], df["gear"])
print(mc.tukeyhsd())
Multiple Comparison of Means - Tukey HSD, FWER=0.05
====================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------
3 4 -1.2759 0.001 -2.0038 -0.5481 True
3 5 -1.26 0.0089 -2.2305 -0.2895 True
4 5 0.0159 0.9 -0.9844 1.0163 False
----------------------------------------------------
Choosing the test for a contingency table (n = total sample size, E = expected frequency of a cell): by the usual convention, use the ordinary chi-square test when n >= 40 and every E >= 5; the continuity-corrected chi-square test when n >= 40 but some cell has 1 <= E < 5; and Fisher's exact test when n < 40 or any E < 1.
print(pd.crosstab(df["vs"], df["am"]))
am 0 1
vs
0 12 6
1 7 7
Since n = 32 < 40, Fisher's exact test should be used directly; the corrected chi-square test (correction=True) is run here only to demonstrate its usage. The return values are the test statistic, the p-value, the degrees of freedom, and the expected frequencies:
stats.chi2_contingency(pd.crosstab(df["vs"], df["am"]), correction = True)
(0.34753550543024225,
0.5555115470131495,
1,
array([[10.6875, 7.3125],
[ 8.3125, 5.6875]]))
Fisher's exact test: p = 0.47, no significant association was found between the two variables (the return values are the odds ratio and the p-value):
stats.fisher_exact(pd.crosstab(df["vs"], df["am"]))
(2.0, 0.4726974416017807)
Pearson and Spearman correlations between wt and mpg (returning the correlation coefficient and its p-value):
stats.pearsonr(df["wt"], df["mpg"])
(-0.8676593765172278, 1.2939587013505124e-10)
stats.spearmanr(df["wt"], df["mpg"])
SpearmanrResult(correlation=-0.8864220332702978, pvalue=1.487594858127497e-11)
lm.summary()
returns extensive output; each variable's regression coefficient appears under coef. For example, mpg's coefficient is -0.1191 with p < 0.05, meaning that with cyl held constant, each 1-unit increase in mpg is associated with a 0.12-unit (95% CI: -0.18 to -0.06) decrease in wt.
lm = ols("wt ~ mpg + cyl", df).fit()
print(lm.summary())
OLS Regression Results
==============================================================================
Dep. Variable: wt R-squared: 0.760
Model: OLS Adj. R-squared: 0.743
Method: Least Squares F-statistic: 45.82
Date: Sun, 19 Jul 2020 Prob (F-statistic): 1.05e-09
Time: 17:06:33 Log-Likelihood: -21.393
No. Observations: 32 AIC: 48.79
Df Residuals: 29 BIC: 53.18
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 5.0760 1.117 4.544 0.000 2.791 7.361
mpg -0.1191 0.028 -4.216 0.000 -0.177 -0.061
cyl 0.0863 0.095 0.905 0.373 -0.109 0.281
==============================================================================
Omnibus: 5.603 Durbin-Watson: 0.987
Prob(Omnibus): 0.061 Jarque-Bera (JB): 4.488
Skew: 0.910 Prob(JB): 0.106
Kurtosis: 3.234 Cond. No. 277.
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
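The pieces of the summary table can also be extracted programmatically:
lm.params      # coefficient point estimates
lm.conf_int()  # 95% confidence intervals
lm.pvalues     # p-values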
Fitted values of wt from the regression:
lm.predict(df)
0 3.092788
1 3.092788
2 2.705930
3 3.045155
4 3.539185
5 3.438123
6 4.063142
7 2.515401
8 2.705930
9 3.307134
10 3.473847
11 3.813072
12 3.705899
13 3.955969
14 4.527559
15 4.527559
16 4.015510
17 1.562751
18 1.800914
19 1.384130
20 2.860736
21 3.920245
22 3.955969
23 4.182224
24 3.479645
25 2.170065
26 2.324871
27 1.800914
28 3.884521
29 3.247593
30 3.979786
31 2.872644
dtype: float64
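predict also accepts new observations as long as the columns used in the formula are present; a sketch with made-up values:
# predicted wt for a hypothetical car with mpg = 20 and 6 cylinders
lm.predict(pd.DataFrame({"mpg": [20.0], "cyl": [6]}))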
Categorical predictors are wrapped in C(); the reference argument sets the reference level.
Here the coefficient 0.0738 means that with mpg held constant, the mean wt of the vs = 1 group is 0.07 units higher than that of the vs = 0 group, but the p-value is not significant. (In general, check the p-value first: if it is far above the chosen significance level, e.g. p = 0.65, there is no need to interpret the coefficient.)
lm = ols("wt ~ mpg + C(vs, Treatment(reference=0))", df).fit()
print(lm.summary())
OLS Regression Results
==============================================================================
Dep. Variable: wt R-squared: 0.754
Model: OLS Adj. R-squared: 0.737
Method: Least Squares F-statistic: 44.36
Date: Sun, 19 Jul 2020 Prob (F-statistic): 1.51e-09
Time: 17:06:33 Log-Likelihood: -21.786
No. Observations: 32 AIC: 49.57
Df Residuals: 29 BIC: 53.97
Df Model: 2
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------------------
Intercept 6.0973 0.353 17.274 0.000 5.375 6.819
C(vs, Treatment(reference=0))[T.1] 0.0738 0.239 0.308 0.760 -0.416 0.563
mpg -0.1450 0.020 -7.243 0.000 -0.186 -0.104
==============================================================================
Omnibus: 5.895 Durbin-Watson: 1.108
Prob(Omnibus): 0.052 Jarque-Bera (JB): 4.733
Skew: 0.932 Prob(JB): 0.0938
Kurtosis: 3.279 Cond. No. 89.3
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Transforming the dependent variable: this is equivalent to first computing log_wt = log(wt) and then regressing log_wt on the other variables.
The mpg coefficient of -0.0495 can be interpreted as: each 1-unit increase in mpg changes wt by $[e^{-0.05} - 1] \times 100\% \approx -4.8\%$ (a decrease of about 4.8%); the p-value is significant.
lm = ols("np.log(wt) ~ mpg + C(vs, Treatment(reference=0))", df).fit()
print(lm.summary())
OLS Regression Results
==============================================================================
Dep. Variable: np.log(wt) R-squared: 0.812
Model: OLS Adj. R-squared: 0.799
Method: Least Squares F-statistic: 62.66
Date: Sun, 19 Jul 2020 Prob (F-statistic): 2.97e-11
Time: 17:06:33 Log-Likelihood: 18.563
No. Observations: 32 AIC: -31.13
Df Residuals: 29 BIC: -26.73
Df Model: 2
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------------------
Intercept 2.0995 0.100 20.988 0.000 1.895 2.304
C(vs, Treatment(reference=0))[T.1] 0.0371 0.068 0.547 0.588 -0.102 0.176
mpg -0.0495 0.006 -8.724 0.000 -0.061 -0.038
==============================================================================
Omnibus: 2.197 Durbin-Watson: 1.607
Prob(Omnibus): 0.333 Jarque-Bera (JB): 1.872
Skew: 0.471 Prob(JB): 0.392
Kurtosis: 2.282 Cond. No. 89.3
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
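That percentage interpretation can be computed directly from the fitted coefficient:
# percentage change in wt per 1-unit increase in mpg under the log-outcome model
(np.exp(lm.params["mpg"]) - 1) * 100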
Transforming an independent variable: this is equivalent to first computing mpg2 = mpg squared and then using mpg2 in the regression.
The fitted equation can be written as $\hat{wt} = 4.53 - 0.003 \times mpg^2 - 0.10 \times vs$.
lm = ols("wt ~ np.square(mpg) + C(vs, Treatment(reference=0))", df).fit()
print(lm.summary())
OLS Regression Results
==============================================================================
Dep. Variable: wt R-squared: 0.680
Model: OLS Adj. R-squared: 0.658
Method: Least Squares F-statistic: 30.78
Date: Sun, 19 Jul 2020 Prob (F-statistic): 6.75e-08
Time: 17:06:33 Log-Likelihood: -25.982
No. Observations: 32 AIC: 57.96
Df Residuals: 29 BIC: 62.36
Df Model: 2
Covariance Type: nonrobust
======================================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------------------
Intercept 4.5263 0.198 22.906 0.000 4.122 4.930
C(vs, Treatment(reference=0))[T.1] -0.0965 0.265 -0.364 0.718 -0.638 0.445
np.square(mpg) -0.0029 0.000 -5.803 0.000 -0.004 -0.002
==============================================================================
Omnibus: 6.860 Durbin-Watson: 0.883
Prob(Omnibus): 0.032 Jarque-Bera (JB): 5.754
Skew: 1.027 Prob(JB): 0.0563
Kurtosis: 3.307 Cond. No. 1.35e+03
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.35e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Binomial specifies a binomially distributed dependent variable, and logit specifies the link function: $logit(p) = \ln\left(\frac{p}{1 - p}\right)$.
# note: recent statsmodels versions expect a link *instance* with a capitalized class
# name, as below; older versions accepted the lowercase links.logit class
logit = glm("vs ~ mpg + C(am, Treatment(reference=0))", df,
            family=sm.families.Binomial(sm.families.links.Logit())).fit()
print(logit.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: ['vs[0]', 'vs[1]'] No. Observations: 32
Model: GLM Df Residuals: 29
Model Family: Binomial Df Model: 2
Link Function: logit Scale: 1.0000
Method: IRLS Log-Likelihood: -10.323
Date: Sun, 19 Jul 2020 Deviance: 20.646
Time: 17:06:33 Pearson chi2: 20.2
No. Iterations: 6
Covariance Type: nonrobust
======================================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------------------
Intercept 12.7051 4.625 2.747 0.006 3.640 21.770
C(am, Treatment(reference=0))[T.1] 3.0073 1.599 1.880 0.060 -0.128 6.142
mpg -0.6809 0.252 -2.698 0.007 -1.176 -0.186
======================================================================================================
In logistic regression, exponentiating a coefficient gives the OR (odds ratio); when the outcome is rare (incidence < 10%), the OR can serve as an estimate of the RR to quantify the effect of an exposure. For example, am's coefficient of 3.0073 means that with mpg held constant, the odds of the outcome in the am = 1 group are $e^{3.01} = 20.23$ times those in the am = 0 group, with a marginally non-significant p-value (p = 0.06). (Since the outcome rate of vs here is over 40%, using the logistic OR to quantify the exposure effect is inappropriate; a log-binomial model could instead be used to estimate the prevalence ratio, PR.)
np.exp(logit.params[1:])
C(am, Treatment(reference=0))[T.1] 20.232169
mpg 0.506151
dtype: float64
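The 95% confidence intervals of the ORs follow from exponentiating the coefficient intervals:
# 95% CIs for the odds ratios (dropping the intercept row)
np.exp(logit.conf_int().iloc[1:])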
The dependent variable in Poisson regression is the number of events per unit of time or space, a count (non-negative integer).
# generate a Poisson-distributed outcome "count" with mean 5
# (no random seed is fixed, so the values below will vary between runs)
df["count"] = np.random.poisson(lam=5, size=32)
df["count"].value_counts()
5 11
4 5
3 4
6 3
2 3
10 2
8 2
13 1
7 1
Name: count, dtype: int64
poi = poisson("count ~ mpg + C(vs, Treatment(reference=0))", df).fit()
print(poi.summary())
Optimization terminated successfully.
Current function value: 2.174190
Iterations 5
Poisson Regression Results
==============================================================================
Dep. Variable: count No. Observations: 32
Model: Poisson Df Residuals: 29
Method: MLE Df Model: 2
Date: Sun, 19 Jul 2020 Pseudo R-squ.: 0.02153
Time: 17:06:33 Log-Likelihood: -69.574
converged: True LL-Null: -71.105
Covariance Type: nonrobust LLR p-value: 0.2163
======================================================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------------------------------
Intercept 2.0939 0.313 6.691 0.000 1.481 2.707
C(vs, Treatment(reference=0))[T.1] -0.0290 0.211 -0.137 0.891 -0.443 0.385
mpg -0.0218 0.018 -1.199 0.230 -0.057 0.014
======================================================================================================
In Poisson regression, exponentiating a coefficient gives the RR (relative risk), which quantifies the effect of an exposure in association studies.
For example, vs's coefficient of -0.0290 means that with mpg held constant, the event rate of the vs = 1 group is $e^{-0.029} = 0.97$ times that of the vs = 0 group, but the p-value is not significant (p = 0.89).
np.exp(poi.params[1:])
C(vs, Treatment(reference=0))[T.1] 0.971449
mpg 0.978414
dtype: float64
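Since Poisson outcomes are defined per unit of time or space, real analyses often model a rate by supplying an exposure; a hypothetical sketch, assuming statsmodels' exposure keyword (which adds log(exposure) to the linear predictor with coefficient 1) and a follow-up time column invented for illustration:
# hypothetical follow-up time per record
df["time"] = np.random.uniform(1, 3, size=32)
# exponentiated coefficients are now rate ratios per unit time
poi_rate = poisson("count ~ mpg + C(vs, Treatment(reference=0))", df,
                   exposure=df["time"]).fit()
print(poi_rate.summary())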