《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析

第05章 探索性数据分析
5.1 概括性统计


>>> import pandas as pd
>>> import numpy as np
>>> fueleco = pd.read_csv("data/vehicles.csv.zip")
>>> fueleco
       barrels08  barrelsA08  ...  phevHwy  phevComb
0      15.695714         0.0  ...        0         0
1      29.964545         0.0  ...        0         0
2      12.207778         0.0  ...        0         0
3      29.964545         0.0  ...        0         0
4      17.347895         0.0  ...        0         0
...          ...         ...  ...      ...       ...
39096  14.982273         0.0  ...        0         0
39097  14.330870         0.0  ...        0         0
39098  15.695714         0.0  ...        0         0
39099  15.695714         0.0  ...        0         0
39100  18.311667         0.0  ...        0         0


>>> fueleco.mean()  
barrels08         17.442712
barrelsA08         0.219276
charge120          0.000000
charge240          0.029630
city08            18.077799
youSaveSpend   -3459.572645
charge240b         0.005869
phevCity           0.094703
phevHwy            0.094269
phevComb           0.094141
Length: 60, dtype: float64
>>> fueleco.std()  
barrels08          4.580230
barrelsA08         1.143837
charge120          0.000000
charge240          0.487408
city08             6.970672
youSaveSpend    3010.284617
charge240b         0.165399
phevCity           2.279478
phevHwy            2.191115
phevComb           2.226500
Length: 60, dtype: float64
>>> fueleco.quantile(
...     [0, 0.25, 0.5, 0.75, 1]
... )  
      barrels08  barrelsA08  ...  phevHwy  phevComb
0.00   0.060000    0.000000  ...      0.0       0.0
0.25  14.330870    0.000000  ...      0.0       0.0
0.50  17.347895    0.000000  ...      0.0       0.0
0.75  20.115000    0.000000  ...      0.0       0.0
1.00  47.087143   18.311667  ...     81.0      88.0


>>> fueleco.describe()  
         barrels08   barrelsA08  ...      phevHwy     phevComb
count  39101.00...  39101.00...  ...  39101.00...  39101.00...
mean     17.442712     0.219276  ...     0.094269     0.094141
std       4.580230     1.143837  ...     2.191115     2.226500
min       0.060000     0.000000  ...     0.000000     0.000000
25%      14.330870     0.000000  ...     0.000000     0.000000
50%      17.347895     0.000000  ...     0.000000     0.000000
75%      20.115000     0.000000  ...     0.000000     0.000000
max      47.087143    18.311667  ...    81.000000    88.000000


>>> fueleco.describe(include=object)  
              drive eng_dscr  ...   modifiedOn startStop
count         37912    23431  ...        39101      7405
unique            7      545  ...           68         2
top     Front-Wh...    (FFS)  ...  Tue Jan ...         N
freq          13653     8827  ...        29438      5176



>>> fueleco.describe().T
                count         mean  ...       75%          max
barrels08     39101.0    17.442712  ...    20.115    47.087143
barrelsA08    39101.0     0.219276  ...     0.000    18.311667
charge120     39101.0     0.000000  ...     0.000     0.000000
charge240     39101.0     0.029630  ...     0.000    12.000000
city08        39101.0    18.077799  ...    20.000   150.000000
...               ...          ...  ...       ...          ...
youSaveSpend  39101.0 -3459.572645  ... -1500.000  5250.000000
charge240b    39101.0     0.005869  ...     0.000     7.000000
phevCity      39101.0     0.094703  ...     0.000    97.000000
phevHwy       39101.0     0.094269  ...     0.000    81.000000
phevComb      39101.0     0.094141  ...     0.000    88.000000

5.2 列的类型


>>> fueleco.dtypes
barrels08     float64
barrelsA08    float64
charge120     float64
charge240     float64
city08          int64
modifiedOn     object
startStop      object
phevCity        int64
phevHwy         int64
phevComb        int64
Length: 83, dtype: object


>>> fueleco.dtypes.value_counts()
float64    32
int64      27
object     23
bool        1
dtype: int64



>>> fueleco.select_dtypes("int64").describe().T
                count         mean  ...     75%     max
city08        39101.0    18.077799  ...    20.0   150.0
cityA08       39101.0     0.569883  ...     0.0   145.0
co2           39101.0    72.538989  ...    -1.0   847.0
co2A          39101.0     5.543950  ...    -1.0   713.0
comb08        39101.0    20.323828  ...    23.0   136.0
...               ...          ...  ...     ...     ...
year          39101.0  2000.635406  ...  2010.0  2018.0
youSaveSpend  39101.0 -3459.572645  ... -1500.0  5250.0
phevCity      39101.0     0.094703  ...     0.0    97.0
phevHwy       39101.0     0.094269  ...     0.0    81.0
phevComb      39101.0     0.094141  ...     0.0    88.0


>>> np.iinfo(np.int8)
iinfo(min=-128, max=127, dtype=int8)
>>> np.iinfo(np.int16)
iinfo(min=-32768, max=32767, dtype=int16)
>>> fueleco[["city08", "comb08"]].info(memory_usage="deep")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   city08  39101 non-null  int64
 1   comb08  39101 non-null  int64
dtypes: int64(2)
memory usage: 611.1 KB
>>> (
...     fueleco[["city08", "comb08"]]
...     .assign(
...         city08=fueleco.city08.astype(np.int16),
...         comb08=fueleco.comb08.astype(np.int16),
...     )
...     .info(memory_usage="deep")
... )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   city08  39101 non-null  int16
 1   comb08  39101 non-null  int16
dtypes: int16(2)
memory usage: 152.9 KB



>>> fueleco.make.nunique()
>>> fueleco.model.nunique()
>>> fueleco[["make"]].info(memory_usage="deep")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   make    39101 non-null  object
dtypes: object(1)
memory usage: 2.4 MB
>>> (
...     fueleco[["make"]]
...     .assign(make=fueleco.make.astype("category"))
...     .info(memory_usage="deep")
... )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   make    39101 non-null  category
dtypes: category(1)
memory usage: 90.4 KB
>>> fueleco[["model"]].info(memory_usage="deep")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   model   39101 non-null  object
dtypes: object(1)
memory usage: 2.5 MB
>>> (
...     fueleco[["model"]]
...     .assign(model=fueleco.model.astype("category"))
...     .info(memory_usage="deep")
... )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   model   39101 non-null  category
dtypes: category(1)
memory usage: 496.7 KB

5.3 分类数据



>>> fueleco.select_dtypes(object).columns
Index(['drive', 'eng_dscr', 'fuelType', 'fuelType1', 'make', 'model',
       'mpgData', 'trany', 'VClass', 'guzzler', 'trans_dscr', 'tCharger',
       'sCharger', 'atvType', 'fuelType2', 'rangeA', 'evMotor', 'mfrCode',
       'c240Dscr', 'c240bDscr', 'createdOn', 'modifiedOn', 'startStop'],
      dtype='object')


>>> fueleco.drive.nunique()


>>> fueleco.drive.sample(5, random_state=42)
4217     4-Wheel ...
1736     4-Wheel ...
36029    Rear-Whe...
37631    Front-Wh...
1668     Rear-Whe...
Name: drive, dtype: object


>>> fueleco.drive.isna().sum()
>>> fueleco.drive.isna().mean() * 100


>>> fueleco.drive.value_counts()
Front-Wheel Drive             13653
Rear-Wheel Drive              13284
4-Wheel or All-Wheel Drive     6648
All-Wheel Drive                2401
4-Wheel Drive                  1221
2-Wheel Drive                   507
Part-time 4-Wheel Drive         198
Name: drive, dtype: int64


>>> top_n = fueleco.make.value_counts().index[:6]
>>> (
...     fueleco.assign(
...         make=fueleco.make.where(
...             fueleco.make.isin(top_n), "Other"
...         )
...     ).make.value_counts()
... )
Other        23211
Chevrolet     3900
Ford          3208
Dodge         2557
GMC           2442
Toyota        1976
BMW           1807
Name: make, dtype: int64


>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots(figsize=(10, 8))
>>> top_n = fueleco.make.value_counts().index[:6]
>>> (
...     fueleco.assign(  
...         make=fueleco.make.where(
...             fueleco.make.isin(top_n), "Other"
...         )
...     )
...     .make.value_counts()
...     .plot.bar(ax=ax)
... )
>>> fig.savefig("c5-catpan.png", dpi=300)
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第1张图片


>>> import seaborn as sns
>>> fig, ax = plt.subplots(figsize=(10, 8))
>>> top_n = fueleco.make.value_counts().index[:6]
>>> sns.countplot(
...     y="make",  
...     data=(
...         fueleco.assign(
...             make=fueleco.make.where(
...                 fueleco.make.isin(top_n), "Other"
...             )
...         )
...     ),
... )
>>> fig.savefig("c5-catsns.png", dpi=300) 
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第2张图片



>>> fueleco[fueleco.drive.isna()]
       barrels08  barrelsA08  ...  phevHwy  phevComb
7138    0.240000         0.0  ...        0         0
8144    0.312000         0.0  ...        0         0
8147    0.270000         0.0  ...        0         0
18215  15.695714         0.0  ...        0         0
18216  14.982273         0.0  ...        0         0
...          ...         ...  ...      ...       ...
23023   0.240000         0.0  ...        0         0
23024   0.546000         0.0  ...        0         0
23026   0.426000         0.0  ...        0         0
23031   0.426000         0.0  ...        0         0
23034   0.204000         0.0  ...        0         0


>>> fueleco.drive.value_counts(dropna=False)
Front-Wheel Drive             13653
Rear-Wheel Drive              13284
4-Wheel or All-Wheel Drive     6648
All-Wheel Drive                2401
4-Wheel Drive                  1221
NaN                            1189
2-Wheel Drive                   507
Part-time 4-Wheel Drive         198
Name: drive, dtype: int64



>>> fueleco.rangeA.value_counts()
290        74
270        56
280        53
310        41
277        38
328         1
250/370     1
362/537     1
310/370     1
340-350     1
Name: rangeA, Length: 216, dtype: int64


>>> (
...     fueleco.rangeA.str.extract(r"([^0-9.])")
...     .dropna()
...     .apply(lambda row: "".join(row), axis=1)
...     .value_counts()
... )
/    280
-     71
Name: rangeA, dtype: int64


>>> set(fueleco.rangeA.apply(type))
{<class 'str'>, <class 'float'>}


>>> fueleco.rangeA.isna().sum()


>>> (
...     fueleco.rangeA.fillna("0")
...     .str.replace("-", "/")
...     .str.split("/", expand=True)
...     .astype(float)
...     .mean(axis=1)
... )
0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
39096    0.0
39097    0.0
39098    0.0
39099    0.0
39100    0.0
Length: 39101, dtype: float64


>>> (
...     fueleco.rangeA.fillna("0")
...     .str.replace("-", "/")
...     .str.split("/", expand=True)
...     .astype(float)
...     .mean(axis=1)
...     .pipe(lambda ser_: pd.cut(ser_, 10))
...     .value_counts()
... )
(-0.45, 44.95]     37688
(269.7, 314.65]      559
(314.65, 359.6]      352
(359.6, 404.55]      205
(224.75, 269.7]      181
(404.55, 449.5]       82
(89.9, 134.85]        12
(179.8, 224.75]        9
(44.95, 89.9]          8
(134.85, 179.8]        5
dtype: int64


>>> (
...     fueleco.rangeA.fillna("0")
...     .str.replace("-", "/")
...     .str.split("/", expand=True)
...     .astype(float)
...     .mean(axis=1)
...     .pipe(lambda ser_: pd.qcut(ser_, 10))
...     .value_counts()
... )
Traceback (most recent call last):
ValueError: Bin edges must be unique: array([  0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
         0. , 449.5]).
>>> (
...     fueleco.city08.pipe(
...         lambda ser: pd.qcut(ser, q=10)
...     ).value_counts()
... )
(5.999, 13.0]    5939
(19.0, 21.0]     4477
(14.0, 15.0]     4381
(17.0, 18.0]     3912
(16.0, 17.0]     3881
(15.0, 16.0]     3855
(21.0, 24.0]     3676
(24.0, 150.0]    3235
(13.0, 14.0]     2898
(18.0, 19.0]     2847
Name: city08, dtype: int64

5.4 连续型数据


>>> fueleco.select_dtypes("number")
       barrels08  barrelsA08  ...  phevHwy  phevComb
0      15.695714         0.0  ...        0         0
1      29.964545         0.0  ...        0         0
2      12.207778         0.0  ...        0         0
3      29.964545         0.0  ...        0         0
4      17.347895         0.0  ...        0         0
...          ...         ...  ...      ...       ...
39096  14.982273         0.0  ...        0         0
39097  14.330870         0.0  ...        0         0
39098  15.695714         0.0  ...        0         0
39099  15.695714         0.0  ...        0         0
39100  18.311667         0.0  ...        0         0


>>> fueleco.city08.sample(5, random_state=42)
4217     11
1736     21
36029    16
37631    16
1668     17
Name: city08, dtype: int64


>>> fueleco.city08.isna().sum()
>>> fueleco.city08.isna().mean() * 100


>>> fueleco.city08.describe()
count    39101.000000
mean        18.077799
std          6.970672
min          6.000000
25%         15.000000
50%         17.000000
75%         20.000000
max        150.000000
Name: city08, dtype: float64


>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots(figsize=(10, 8))
>>> fueleco.city08.hist(ax=ax)
>>> fig.savefig(
...     "c5-conthistpan.png", dpi=300
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第3张图片


>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots(figsize=(10, 8))
>>> fueleco.city08.hist(ax=ax, bins=30)
>>> fig.savefig(
...     "c5-conthistpanbins.png", dpi=300
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第4张图片


>>> fig, ax = plt.subplots(figsize=(10, 8))
>>> sns.distplot(fueleco.city08, rug=True, ax=ax)
>>> fig.savefig(
...     "c5-conthistsns.png", dpi=300
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第5张图片



>>> fig, axs = plt.subplots(nrows=3, figsize=(10, 8))
>>> sns.boxplot(fueleco.city08, ax=axs[0])
>>> sns.violinplot(fueleco.city08, ax=axs[1])
>>> sns.boxenplot(fueleco.city08, ax=axs[2])
>>> fig.savefig("c5-contothersns.png", dpi=300)
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第6张图片
箱线图(boxplot)、小提琴图(violin plot)和 boxen 图(boxen plot)

如果想检查数据是否服从正态分布,可以使用 Kolmogorov-Smirnov 检验。该检验会给出一个 p 值:如果 p < 0.05,则可以认为数据不服从正态分布:

>>> from scipy import stats
>>> stats.kstest(fueleco.city08, cdf="norm")
KstestResult(statistic=0.9999999990134123, pvalue=0.0)


>>> from scipy import stats
>>> fig, ax = plt.subplots(figsize=(10, 8))
>>> stats.probplot(fueleco.city08, plot=ax)
>>> fig.savefig("c5-conprob.png", dpi=300)
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第7张图片

5.5 在不同类别间比较连续值


>>> mask = fueleco.make.isin(
...     ["Ford", "Honda", "Tesla", "BMW"]
... )
>>> fueleco[mask].groupby("make").city08.agg(
...     ["mean", "std"]
... )
            mean       std
BMW    17.817377  7.372907
Ford   16.853803  6.701029
Honda  24.372973  9.154064
Tesla  92.826087  5.538970


>>> g = sns.catplot(
...     x="make", y="city08", data=fueleco[mask], kind="box"
... )
>>> g.ax.figure.savefig("c5-catbox.png", dpi=300)
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第8张图片



>>> mask = fueleco.make.isin(
...     ["Ford", "Honda", "Tesla", "BMW"]
... )
>>> (fueleco[mask].groupby("make").city08.count())
BMW      1807
Ford     3208
Honda     925
Tesla      46
Name: city08, dtype: int64


>>> g = sns.catplot(
...     x="make", y="city08", data=fueleco[mask], kind="box"
... )
>>> sns.swarmplot(
...     x="make",
...     y="city08", 
...     data=fueleco[mask],
...     color="k",
...     size=1,
...     ax=g.ax,
... )
>>> g.ax.figure.savefig(
...     "c5-catbox2.png", dpi=300
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第9张图片


>>> g = sns.catplot(
...     x="make",
...     y="city08",
...     data=fueleco[mask],
...     kind="box",
...     col="year",
...     col_order=[2012, 2014, 2016, 2018],
...     col_wrap=2,
... )
>>> g.axes[0].figure.savefig(
...     "c5-catboxcol.png", dpi=300
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第10张图片


>>> g = sns.catplot(
...     x="make",
...     y="city08", 
...     data=fueleco[mask],
...     kind="box",
...     hue="year",
...     hue_order=[2012, 2014, 2016, 2018],
... )
>>> g.ax.figure.savefig(
...     "c5-catboxhue.png", dpi=300
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第11张图片


>>> mask = fueleco.make.isin(
...     ["Ford", "Honda", "Tesla", "BMW"]
... )
>>> (
...     fueleco[mask]
...     .groupby("make")
...     .city08.agg(["mean", "std"])
...     .style.background_gradient(cmap="RdBu", axis=0)
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第12张图片

5.6 比较两个连续型数据列


>>> fueleco.city08.cov(fueleco.highway08)
>>> fueleco.city08.cov(fueleco.comb08)
>>> fueleco.city08.cov(fueleco.cylinders)


>>> fueleco.city08.corr(fueleco.highway08)
>>> fueleco.city08.corr(fueleco.cylinders)


>>> import seaborn as sns
>>> fig, ax = plt.subplots(figsize=(8, 8))
>>> corr = fueleco[
...     ["city08", "highway08", "cylinders"]
... ].corr()
>>> mask = np.zeros_like(corr, dtype=np.bool)
>>> mask[np.triu_indices_from(mask)] = True
>>> sns.heatmap(
...     corr,
...     mask=mask,
...     fmt=".2f",
...     annot=True,
...     ax=ax,
...     cmap="RdBu",
...     vmin=-1,
...     vmax=1,
...     square=True,
... )
>>> fig.savefig(
...     "c5-heatmap.png", dpi=300, bbox_inches="tight"
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第13张图片


>>> fig, ax = plt.subplots(figsize=(8, 8))
>>> fueleco.plot.scatter(
...     x="city08", y="highway08", alpha=0.1, ax=ax
... )
>>> fig.savefig(
...     "c5-scatpan.png", dpi=300, bbox_inches="tight"
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第14张图片
>>> fig, ax = plt.subplots(figsize=(8, 8))
>>> fueleco.plot.scatter(
...     x="city08", y="cylinders", alpha=0.1, ax=ax
... )
>>> fig.savefig(
...     "c5-scatpan-cyl.png", dpi=300, bbox_inches="tight"
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第15张图片


>>> fueleco.cylinders.isna().sum()
>>> fig, ax = plt.subplots(figsize=(8, 8))
>>> (
...     fueleco.assign(
...         cylinders=fueleco.cylinders.fillna(0)
...     ).plot.scatter(
...         x="city08", y="cylinders", alpha=0.1, ax=ax
...     )
... )
>>> fig.savefig(
...     "c5-scatpan-cyl0.png", dpi=300, bbox_inches="tight"
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第16张图片


>>> res = sns.lmplot(
...     x="city08", y="highway08", data=fueleco
... )
>>> res.fig.savefig(
...     "c5-lmplot.png", dpi=300, bbox_inches="tight"
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第17张图片


>>> res = sns.relplot(
...     x="city08",
...     y="highway08",
...     data=fueleco.assign(
...         cylinders=fueleco.cylinders.fillna(0)
...     ),
...     hue="year",
...     size="barrels08",
...     alpha=0.5,
...     height=8,
... )
>>> res.fig.savefig(
...     "c5-relplot2.png", dpi=300, bbox_inches="tight"
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第18张图片


>>> res = sns.relplot(
...     x="city08",
...     y="highway08",
...     data=fueleco.assign(
...         cylinders=fueleco.cylinders.fillna(0)
...     ),
...     hue="year",
...     size="barrels08",
...     alpha=0.5,
...     height=8,
...     col="make",
...     col_order=["Ford", "Tesla"],
... )
>>> res.fig.savefig(
...     "c5-relplot3.png", dpi=300, bbox_inches="tight"
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第19张图片


>>> fueleco.city08.corr(
...     fueleco.barrels08, method="spearman"
... )

5.7 比较分类值


>>> def generalize(ser, match_name, default):
...     """Collapse the string values of ``ser`` into coarser labels.
...
...     ``match_name`` is iterated as ``(match, name)`` pairs, in order.
...     For each pair, entries whose current value contains ``match``
...     (as reported by ``ser.str.contains``) are replaced by ``name``.
...     Entries that no pair matched are replaced by ``default``.
...     Returns the result of these ``where`` calls.
...     """
...     # Running union of every mask seen so far; None until the first pair.
...     seen = None
...     for match, name in match_name:
...         # Matching runs on the already-relabelled series, so values
...         # written by earlier pairs are visible to later patterns.
...         # NOTE(review): assumes the mask is purely boolean (no missing
...         # values in ser) so that ~mask below is valid -- confirm.
...         mask = ser.str.contains(match)
...         if seen is None:
...             seen = mask
...         else:
...             seen |= mask
...         # Keep non-matching entries, overwrite matching ones with name.
...         ser = ser.where(~mask, name)
...     # Anything never matched by any pair becomes default.
...     # NOTE(review): with an empty match_name, seen is still None here --
...     # confirm that case is never used.
...     ser = ser.where(seen, default)
...     return ser
>>> makes = ["Ford", "Tesla", "BMW", "Toyota"]
>>> data = fueleco[fueleco.make.isin(makes)].assign(
...     SClass=lambda df_: generalize(
...         df_.VClass,
...         [
...             ("Seaters", "Car"),
...             ("Car", "Car"),
...             ("Utility", "SUV"),
...             ("Truck", "Truck"),
...             ("Van", "Van"),
...             ("van", "Van"),
...             ("Wagon", "Wagon"),
...         ],
...         "other",
...     )
... )


>>> data.groupby(["make", "SClass"]).size().unstack()
SClass     Car    SUV  ...  Wagon  other
make                   ...              
BMW     1557.0  158.0  ...   92.0    NaN
Ford    1075.0  372.0  ...  155.0  234.0
Tesla     36.0   10.0  ...    NaN    NaN
Toyota   773.0  376.0  ...  132.0  123.0


>>> pd.crosstab(data.make, data.SClass)
SClass   Car  SUV  ...  Wagon  other
make               ...
BMW     1557  158  ...     92      0
Ford    1075  372  ...    155    234
Tesla     36   10  ...      0      0
Toyota   773  376  ...    132    123


>>> pd.crosstab(
...     [data.year, data.make], [data.SClass, data.VClass]
... )
SClass               Car             ...                       other
VClass      Compact Cars Large Cars  ... Special Purpose Vehicle 4WD
year make                            ...
1984 BMW               6          0  ...            0
     Ford             33          3  ...           21
     Toyota           13          0  ...            3
1985 BMW               7          0  ...            0
     Ford             31          2  ...            9
...                  ...        ...  ...          ...
2017 Tesla             0          8  ...            0
     Toyota            3          0  ...            0
2018 BMW              37         12  ...            0
     Ford              0          0  ...            0
     Toyota            4          0  ...            0

使用 Cramér's V 统计量衡量两个分类变量之间的关联程度:

>>> import scipy.stats as ss
>>> import numpy as np
>>> def cramers_v(x, y):
...     """Return a Cramér's V association statistic for ``x`` and ``y``.
...
...     Cross-tabulates ``x`` against ``y``, takes the first element of
...     ``ss.chi2_contingency`` on that table as the chi-squared value, and
...     returns the square root of the corrected phi-squared divided by the
...     smaller of the corrected row/column counts minus one.
...     NOTE(review): the corrections look like the bias-corrected variant
...     of Cramér's V -- confirm against a reference.
...     """
...     # Contingency table of co-occurrence counts for x and y.
...     confusion_matrix = pd.crosstab(x, y)
...     chi2 = ss.chi2_contingency(confusion_matrix)[0]
...     # Grand total of the table (sum over columns, then over that result).
...     n = confusion_matrix.sum().sum()
...     phi2 = chi2 / n
...     r, k = confusion_matrix.shape
...     # Subtract the correction term and clamp at zero so the square root
...     # below never receives a negative numerator.
...     phi2corr = max(
...         0, phi2 - ((k - 1) * (r - 1)) / (n - 1)
...     )
...     # Corrected row and column counts.
...     rcorr = r - ((r - 1) ** 2) / (n - 1)
...     kcorr = k - ((k - 1) ** 2) / (n - 1)
...     # NOTE(review): the denominator is zero when the table has a single
...     # row or a single column -- confirm callers never pass such data.
...     return np.sqrt(
...         phi2corr / min((kcorr - 1), (rcorr - 1))
...     )
>>> cramers_v(data.make, data.SClass)


>>> data.make.corr(data.SClass, cramers_v)


>>> fig, ax = plt.subplots(figsize=(10, 8))
>>> (
...     data.pipe(
...         lambda df_: pd.crosstab(df_.make, df_.SClass)
...     ).plot.bar(ax=ax)
... )
>>> fig.savefig("c5-bar.png", dpi=300, bbox_inches="tight")
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第20张图片


>>> res = sns.catplot(
...     kind="count", x="make", hue="SClass", data=data
... )
>>> res.fig.savefig(
...     "c5-barsns.png", dpi=300, bbox_inches="tight"
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第21张图片


>>> fig, ax = plt.subplots(figsize=(10, 8))
>>> (
...     data.pipe(
...         lambda df_: pd.crosstab(df_.make, df_.SClass)
...     )
...     .pipe(lambda df_: df_.div(df_.sum(axis=1), axis=0))
...     .plot.bar(stacked=True, ax=ax)
... )
>>> fig.savefig(
...     "c5-barstacked.png", dpi=300, bbox_inches="tight"
... )
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第22张图片

5.8 使用Pandas的profiling库

使用pip install pandas-profiling安装profiling库。使用ProfileReport创建一个HTML报告:

>>> import pandas_profiling as pp
>>> pp.ProfileReport(fueleco)
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第23张图片
《Pandas 1.x Cookbook · 第二版》第05章 探索性数据分析_第24张图片


>>> report = pp.ProfileReport(fueleco)
>>> report.to_file("fuel.html")

