象写 SQL 那样用 Pandas 分析数据

Comparison with sql

  • 简介
  • jupyter notebook
    • 代码
    • 输出
  • 参考

简介

过去若干年, 数据分析的第一步是将数据解析并存储到数据库表中, 然后用 SQL 来分析.
现在我用的更多的是用 python 将数据读入内存, 保存到 Pandas DataFrame 中, 用 Pandas 来分析.

其实 Pandas 所用的基本方法和 SQL 是差不多的, Pandas 有篇对比 Pandas 和 SQL 的文章
https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_sql.html 写的很好, 通俗易懂, 我就不在这里逐字翻译了, 随手用 jupyter notebook 写下了相关的例子.

jupyter notebook

代码

"""
refer to https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_sql.html
"""
import pandas as pd
import numpy as np


url = (
    "https://raw.githubusercontent.com/pandas-dev"
    "/pandas/main/pandas/tests/io/data/csv/tips.csv"
)

sqls = [
    "SELECT total_bill, tip, smoker, time FROM tips",
    "SELECT *, tip/total_bill as tip_rate FROM tips",
    "SELECT * FROM tips WHERE total_bill > 10 and time = 'Dinner'",
    "SELECT * FROM tips WHERE size >= 5 OR total_bill > 45",
    "SELECT sex, count(*) FROM tips GROUP BY sex",
    "SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day",
    "SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day",
    "SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key",
    "SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.key = df2.key",
    "SELECT * FROM df1 RIGHT OUTER JOIN df2 ON df1.key = df2.key",
    "SELECT * FROM df1 FULL OUTER JOIN df2 ON df1.key = df2.key",
    "SELECT city, rank FROM df1 UNION ALL SELECT city, rank FROM df2",
    "SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5",
    "DELETE FROM tips WHERE tip > 9"
]

cmds = [
    'tips[["total_bill", "tip", "smoker", "time"]]',
    'tips.assign(tip_rate=tips["tip"] / tips["total_bill"])',
    'tips[(tips["time"] == "Dinner") & (tips["tip"] > 5.00)]',
    'tips[(tips["size"] >= 5) | (tips["total_bill"] > 45)]',
    'tips.groupby("sex").size()',
    'tips.groupby("day").agg({"tip": np.mean, "day": np.size})',
    'tips.groupby(["smoker", "day"]).agg({"tip": [np.size, np.mean]})',
    'pd.merge(df1, df2, on="key")',
    'pd.merge(df1, df2, on="key", how="left")',
    'pd.merge(df1, df2, on="key", how="right")',
    'pd.merge(df1, df2, on="key", how="outer")',
    'pd.concat([df1, df2]).drop_duplicates()',
    'tips.nlargest(10 + 5, columns="tip").tail(10)',
    'tips.loc[tips["tip"] <= 9]'
]

seps = "-"*60

df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)})
df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)})

df3 = pd.DataFrame(
    {"city": ["Chicago", "San Francisco", "New York City"], "rank": range(1, 4)}
)

df4 = pd.DataFrame(
    {"city": ["Chicago", "Boston", "Los Angeles"], "rank": [1, 4, 5]}
)

tips = pd.read_csv(url)
print(f"DataFrame\n{seps}")
print(tips)


for i in range(0, len(sqls)):
    print(f"\n{i}. {sqls[i]} \n{seps}\n {cmds[i]} \n{seps}")
    eval(f"print({cmds[i]})")

输出

DataFrame
------------------------------------------------------------
     total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]

0. SELECT total_bill, tip, smoker, time FROM tips
------------------------------------------------------------
 tips[["total_bill", "tip", "smoker", "time"]] 
------------------------------------------------------------
     total_bill   tip smoker    time
0         16.99  1.01     No  Dinner
1         10.34  1.66     No  Dinner
2         21.01  3.50     No  Dinner
3         23.68  3.31     No  Dinner
4         24.59  3.61     No  Dinner
..          ...   ...    ...     ...
239       29.03  5.92     No  Dinner
240       27.18  2.00    Yes  Dinner
241       22.67  2.00    Yes  Dinner
242       17.82  1.75     No  Dinner
243       18.78  3.00     No  Dinner

[244 rows x 4 columns]

1. SELECT *, tip/total_bill as tip_rate FROM tips 
------------------------------------------------------------
 tips.assign(tip_rate=tips["tip"] / tips["total_bill"]) 
------------------------------------------------------------
     total_bill   tip     sex smoker   day    time  size  tip_rate
0         16.99  1.01  Female     No   Sun  Dinner     2  0.059447
1         10.34  1.66    Male     No   Sun  Dinner     3  0.160542
2         21.01  3.50    Male     No   Sun  Dinner     3  0.166587
3         23.68  3.31    Male     No   Sun  Dinner     2  0.139780
4         24.59  3.61  Female     No   Sun  Dinner     4  0.146808
..          ...   ...     ...    ...   ...     ...   ...       ...
239       29.03  5.92    Male     No   Sat  Dinner     3  0.203927
240       27.18  2.00  Female    Yes   Sat  Dinner     2  0.073584
241       22.67  2.00    Male    Yes   Sat  Dinner     2  0.088222
242       17.82  1.75    Male     No   Sat  Dinner     2  0.098204
243       18.78  3.00  Female     No  Thur  Dinner     2  0.159744

[244 rows x 8 columns]

2. SELECT * FROM tips WHERE total_bill > 10 and time = 'Dinner'
------------------------------------------------------------
 tips[(tips["time"] == "Dinner") & (tips["tip"] > 5.00)] 
------------------------------------------------------------
     total_bill    tip     sex smoker  day    time  size
23        39.42   7.58    Male     No  Sat  Dinner     4
44        30.40   5.60    Male     No  Sun  Dinner     4
47        32.40   6.00    Male     No  Sun  Dinner     4
52        34.81   5.20  Female     No  Sun  Dinner     4
59        48.27   6.73    Male     No  Sat  Dinner     4
116       29.93   5.07    Male     No  Sun  Dinner     4
155       29.85   5.14  Female     No  Sun  Dinner     5
170       50.81  10.00    Male    Yes  Sat  Dinner     3
172        7.25   5.15    Male    Yes  Sun  Dinner     2
181       23.33   5.65    Male    Yes  Sun  Dinner     2
183       23.17   6.50    Male    Yes  Sun  Dinner     4
211       25.89   5.16    Male    Yes  Sat  Dinner     4
212       48.33   9.00    Male     No  Sat  Dinner     4
214       28.17   6.50  Female    Yes  Sat  Dinner     3
239       29.03   5.92    Male     No  Sat  Dinner     3

3. SELECT * FROM tips WHERE size >= 5 OR total_bill > 45
------------------------------------------------------------
 tips[(tips["size"] >= 5) | (tips["total_bill"] > 45)]
------------------------------------------------------------
     total_bill    tip     sex smoker   day    time  size
59        48.27   6.73    Male     No   Sat  Dinner     4
125       29.80   4.20  Female     No  Thur   Lunch     6
141       34.30   6.70    Male     No  Thur   Lunch     6
142       41.19   5.00    Male     No  Thur   Lunch     5
143       27.05   5.00  Female     No  Thur   Lunch     6
155       29.85   5.14  Female     No   Sun  Dinner     5
156       48.17   5.00    Male     No   Sun  Dinner     6
170       50.81  10.00    Male    Yes   Sat  Dinner     3
182       45.35   3.50    Male    Yes   Sun  Dinner     3
185       20.69   5.00    Male     No   Sun  Dinner     5
187       30.46   2.00    Male    Yes   Sun  Dinner     5
212       48.33   9.00    Male     No   Sat  Dinner     4
216       28.15   3.00    Male    Yes   Sat  Dinner     5

4. SELECT sex, count(*) FROM tips GROUP BY sex
------------------------------------------------------------
 tips.groupby("sex").size() 
------------------------------------------------------------
sex
Female     87
Male      157
dtype: int64

5. SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day 
------------------------------------------------------------
 tips.groupby("day").agg({"tip": np.mean, "day": np.size})
------------------------------------------------------------
           tip  day
day                
Fri   2.734737   19
Sat   2.993103   87
Sun   3.255132   76
Thur  2.771452   62

6. SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day 
------------------------------------------------------------
 tips.groupby(["smoker", "day"]).agg({"tip": [np.size, np.mean]})
------------------------------------------------------------
             tip          
            size      mean
smoker day
No     Fri     4  2.812500
       Sat    45  3.102889
       Sun    57  3.167895
       Thur   45  2.673778
Yes    Fri    15  2.714000
       Sat    42  2.875476
       Sun    19  3.516842
       Thur   17  3.030000

7. SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key
------------------------------------------------------------
 pd.merge(df1, df2, on="key") 
------------------------------------------------------------
  key   value_x   value_y
0   B  0.421341 -0.306186
1   D -0.213784  0.187269
2   D -0.213784 -1.267491

8. SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.key = df2.key 
------------------------------------------------------------
 pd.merge(df1, df2, on="key", how="left") 
------------------------------------------------------------
  key   value_x   value_y
0   A -0.274974       NaN
1   B  0.421341 -0.306186
2   C  0.473855       NaN
3   D -0.213784  0.187269
4   D -0.213784 -1.267491

9. SELECT * FROM df1 RIGHT OUTER JOIN df2 ON df1.key = df2.key 
------------------------------------------------------------
 pd.merge(df1, df2, on="key", how="right") 
------------------------------------------------------------
  key   value_x   value_y
0   B  0.421341 -0.306186
1   D -0.213784  0.187269
2   D -0.213784 -1.267491
3   E       NaN -0.191261

10. SELECT * FROM df1 FULL OUTER JOIN df2 ON df1.key = df2.key 
------------------------------------------------------------
 pd.merge(df1, df2, on="key", how="outer") 
------------------------------------------------------------
  key   value_x   value_y
0   A -0.274974       NaN
1   B  0.421341 -0.306186
2   C  0.473855       NaN
3   D -0.213784  0.187269
4   D -0.213784 -1.267491
5   E       NaN -0.191261

11. SELECT city, rank FROM df1 UNION ALL SELECT city, rank FROM df2 
------------------------------------------------------------
 pd.concat([df1, df2]).drop_duplicates()
------------------------------------------------------------
  key     value
0   A -0.274974
1   B  0.421341
2   C  0.473855
3   D -0.213784
0   B -0.306186
1   D  0.187269
2   D -1.267491
3   E -0.191261

12. SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5 
------------------------------------------------------------
 tips.nlargest(10 + 5, columns="tip").tail(10) 
------------------------------------------------------------
     total_bill   tip     sex smoker   day    time  size
183       23.17  6.50    Male    Yes   Sun  Dinner     4
214       28.17  6.50  Female    Yes   Sat  Dinner     3
47        32.40  6.00    Male     No   Sun  Dinner     4
239       29.03  5.92    Male     No   Sat  Dinner     3
88        24.71  5.85    Male     No  Thur   Lunch     2
181       23.33  5.65    Male    Yes   Sun  Dinner     2
44        30.40  5.60    Male     No   Sun  Dinner     4
52        34.81  5.20  Female     No   Sun  Dinner     4
85        34.83  5.17  Female     No  Thur   Lunch     4
211       25.89  5.16    Male    Yes   Sat  Dinner     4

13. DELETE FROM tips WHERE tip > 9
------------------------------------------------------------
 tips.loc[tips["tip"] <= 9] 
------------------------------------------------------------
     total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[243 rows x 7 columns]
1

参考

  • https://deepnote.com/blog/query-pandas-dataframes-with-sql
  • https://towardsdatascience.com/query-pandas-dataframe-with-sql-2bb7a509793d
  • https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_sql.html
  • https://towardsdatascience.com/pandas-equivalent-of-10-useful-sql-queries-f79428e60bd9

你可能感兴趣的:(pandas,sql,python)