过去若干年, 数据分析的第一步是将数据解析并存储到数据库表中, 然后用 SQL 来分析.
现在我用的更多的是用 python 将数据读入内存, 保存到 Pandas DataFrame 中, 用 Pandas 来分析.
其实 Pandas 所用的基本方法和 SQL 是差不多的, Pandas 有篇对比 Pandas 和 SQL 的文章
https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_sql.html 写的很好, 通俗易懂, 我就不在这里逐字翻译了, 随手用 jupyter notebook 写下了相关的例子.
"""
refer to https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_sql.html
"""
import pandas as pd
import numpy as np
url = (
"https://raw.githubusercontent.com/pandas-dev"
"/pandas/main/pandas/tests/io/data/csv/tips.csv"
)
sqls = [
"SELECT total_bill, tip, smoker, time FROM tips",
"SELECT *, tip/total_bill as tip_rate FROM tips",
"SELECT * FROM tips WHERE total_bill > 10 and time = 'Dinner'",
"SELECT * FROM tips WHERE size >= 5 OR total_bill > 45",
"SELECT sex, count(*) FROM tips GROUP BY sex",
"SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day",
"SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day",
"SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key",
"SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.key = df2.key",
"SELECT * FROM df1 RIGHT OUTER JOIN df2 ON df1.key = df2.key",
"SELECT * FROM df1 FULL OUTER JOIN df2 ON df1.key = df2.key",
"SELECT city, rank FROM df1 UNION ALL SELECT city, rank FROM df2",
"SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5",
"DELETE FROM tips WHERE tip > 9"
]
cmds = [
'tips[["total_bill", "tip", "smoker", "time"]]',
'tips.assign(tip_rate=tips["tip"] / tips["total_bill"])',
'tips[(tips["time"] == "Dinner") & (tips["tip"] > 5.00)]',
'tips[(tips["size"] >= 5) | (tips["total_bill"] > 45)]',
'tips.groupby("sex").size()',
'tips.groupby("day").agg({"tip": np.mean, "day": np.size})',
'tips.groupby(["smoker", "day"]).agg({"tip": [np.size, np.mean]})',
'pd.merge(df1, df2, on="key")',
'pd.merge(df1, df2, on="key", how="left")',
'pd.merge(df1, df2, on="key", how="right")',
'pd.merge(df1, df2, on="key", how="outer")',
'pd.concat([df1, df2]).drop_duplicates()',
'tips.nlargest(10 + 5, columns="tip").tail(10)',
'tips.loc[tips["tip"] <= 9]'
]
seps = "-"*60
df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)})
df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)})
df3 = pd.DataFrame(
{"city": ["Chicago", "San Francisco", "New York City"], "rank": range(1, 4)}
)
df4 = pd.DataFrame(
{"city": ["Chicago", "Boston", "Los Angeles"], "rank": [1, 4, 5]}
)
tips = pd.read_csv(url)
print(f"DataFrame\n{seps}")
print(tips)
for i in range(0, len(sqls)):
print(f"\n{i}. {sqls[i]} \n{seps}\n {cmds[i]} \n{seps}")
eval(f"print({cmds[i]})")
DataFrame
------------------------------------------------------------
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
.. ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3
240 27.18 2.00 Female Yes Sat Dinner 2
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2
[244 rows x 7 columns]
0. SELECT total_bill, tip, smoker, time FROM tips
------------------------------------------------------------
tips[["total_bill", "tip", "smoker", "time"]]
------------------------------------------------------------
total_bill tip smoker time
0 16.99 1.01 No Dinner
1 10.34 1.66 No Dinner
2 21.01 3.50 No Dinner
3 23.68 3.31 No Dinner
4 24.59 3.61 No Dinner
.. ... ... ... ...
239 29.03 5.92 No Dinner
240 27.18 2.00 Yes Dinner
241 22.67 2.00 Yes Dinner
242 17.82 1.75 No Dinner
243 18.78 3.00 No Dinner
[244 rows x 4 columns]
1. SELECT *, tip/total_bill as tip_rate FROM tips
------------------------------------------------------------
tips.assign(tip_rate=tips["tip"] / tips["total_bill"])
------------------------------------------------------------
total_bill tip sex smoker day time size tip_rate
0 16.99 1.01 Female No Sun Dinner 2 0.059447
1 10.34 1.66 Male No Sun Dinner 3 0.160542
2 21.01 3.50 Male No Sun Dinner 3 0.166587
3 23.68 3.31 Male No Sun Dinner 2 0.139780
4 24.59 3.61 Female No Sun Dinner 4 0.146808
.. ... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3 0.203927
240 27.18 2.00 Female Yes Sat Dinner 2 0.073584
241 22.67 2.00 Male Yes Sat Dinner 2 0.088222
242 17.82 1.75 Male No Sat Dinner 2 0.098204
243 18.78 3.00 Female No Thur Dinner 2 0.159744
[244 rows x 8 columns]
2. SELECT * FROM tips WHERE total_bill > 10 and time = 'Dinner'
------------------------------------------------------------
tips[(tips["time"] == "Dinner") & (tips["tip"] > 5.00)]
------------------------------------------------------------
total_bill tip sex smoker day time size
23 39.42 7.58 Male No Sat Dinner 4
44 30.40 5.60 Male No Sun Dinner 4
47 32.40 6.00 Male No Sun Dinner 4
52 34.81 5.20 Female No Sun Dinner 4
59 48.27 6.73 Male No Sat Dinner 4
116 29.93 5.07 Male No Sun Dinner 4
155 29.85 5.14 Female No Sun Dinner 5
170 50.81 10.00 Male Yes Sat Dinner 3
172 7.25 5.15 Male Yes Sun Dinner 2
181 23.33 5.65 Male Yes Sun Dinner 2
183 23.17 6.50 Male Yes Sun Dinner 4
211 25.89 5.16 Male Yes Sat Dinner 4
212 48.33 9.00 Male No Sat Dinner 4
214 28.17 6.50 Female Yes Sat Dinner 3
239 29.03 5.92 Male No Sat Dinner 3
3. SELECT * FROM tips WHERE size >= 5 OR total_bill > 45
------------------------------------------------------------
tips[(tips["size"] >= 5) | (tips["total_bill"] > 45)]
------------------------------------------------------------
total_bill tip sex smoker day time size
59 48.27 6.73 Male No Sat Dinner 4
125 29.80 4.20 Female No Thur Lunch 6
141 34.30 6.70 Male No Thur Lunch 6
142 41.19 5.00 Male No Thur Lunch 5
143 27.05 5.00 Female No Thur Lunch 6
155 29.85 5.14 Female No Sun Dinner 5
156 48.17 5.00 Male No Sun Dinner 6
170 50.81 10.00 Male Yes Sat Dinner 3
182 45.35 3.50 Male Yes Sun Dinner 3
185 20.69 5.00 Male No Sun Dinner 5
187 30.46 2.00 Male Yes Sun Dinner 5
212 48.33 9.00 Male No Sat Dinner 4
216 28.15 3.00 Male Yes Sat Dinner 5
4. SELECT sex, count(*) FROM tips GROUP BY sex
------------------------------------------------------------
tips.groupby("sex").size()
------------------------------------------------------------
sex
Female 87
Male 157
dtype: int64
5. SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day
------------------------------------------------------------
tips.groupby("day").agg({"tip": np.mean, "day": np.size})
------------------------------------------------------------
tip day
day
Fri 2.734737 19
Sat 2.993103 87
Sun 3.255132 76
Thur 2.771452 62
6. SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day
------------------------------------------------------------
tips.groupby(["smoker", "day"]).agg({"tip": [np.size, np.mean]})
------------------------------------------------------------
tip
size mean
smoker day
No Fri 4 2.812500
Sat 45 3.102889
Sun 57 3.167895
Thur 45 2.673778
Yes Fri 15 2.714000
Sat 42 2.875476
Sun 19 3.516842
Thur 17 3.030000
7. SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key
------------------------------------------------------------
pd.merge(df1, df2, on="key")
------------------------------------------------------------
key value_x value_y
0 B 0.421341 -0.306186
1 D -0.213784 0.187269
2 D -0.213784 -1.267491
8. SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.key = df2.key
------------------------------------------------------------
pd.merge(df1, df2, on="key", how="left")
------------------------------------------------------------
key value_x value_y
0 A -0.274974 NaN
1 B 0.421341 -0.306186
2 C 0.473855 NaN
3 D -0.213784 0.187269
4 D -0.213784 -1.267491
9. SELECT * FROM df1 RIGHT OUTER JOIN df2 ON df1.key = df2.key
------------------------------------------------------------
pd.merge(df1, df2, on="key", how="right")
------------------------------------------------------------
key value_x value_y
0 B 0.421341 -0.306186
1 D -0.213784 0.187269
2 D -0.213784 -1.267491
3 E NaN -0.191261
10. SELECT * FROM df1 FULL OUTER JOIN df2 ON df1.key = df2.key
------------------------------------------------------------
pd.merge(df1, df2, on="key", how="outer")
------------------------------------------------------------
key value_x value_y
0 A -0.274974 NaN
1 B 0.421341 -0.306186
2 C 0.473855 NaN
3 D -0.213784 0.187269
4 D -0.213784 -1.267491
5 E NaN -0.191261
11. SELECT city, rank FROM df1 UNION ALL SELECT city, rank FROM df2
------------------------------------------------------------
pd.concat([df1, df2]).drop_duplicates()
------------------------------------------------------------
key value
0 A -0.274974
1 B 0.421341
2 C 0.473855
3 D -0.213784
0 B -0.306186
1 D 0.187269
2 D -1.267491
3 E -0.191261
12. SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5
------------------------------------------------------------
tips.nlargest(10 + 5, columns="tip").tail(10)
------------------------------------------------------------
total_bill tip sex smoker day time size
183 23.17 6.50 Male Yes Sun Dinner 4
214 28.17 6.50 Female Yes Sat Dinner 3
47 32.40 6.00 Male No Sun Dinner 4
239 29.03 5.92 Male No Sat Dinner 3
88 24.71 5.85 Male No Thur Lunch 2
181 23.33 5.65 Male Yes Sun Dinner 2
44 30.40 5.60 Male No Sun Dinner 4
52 34.81 5.20 Female No Sun Dinner 4
85 34.83 5.17 Female No Thur Lunch 4
211 25.89 5.16 Male Yes Sat Dinner 4
13. DELETE FROM tips WHERE tip > 9
------------------------------------------------------------
tips.loc[tips["tip"] <= 9]
------------------------------------------------------------
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
.. ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3
240 27.18 2.00 Female Yes Sat Dinner 2
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2
[243 rows x 7 columns]
1