19. Data Wrangling (4)
Author: Chris Albon
Translator: 飞龙
License: CC BY-NC-SA 4.0
Join And Merge pandas Dataframes
# Import modules
import pandas as pd
from IPython.display import display
from IPython.display import Image
# Create the first dataframe
raw_data = {
'subject_id': ['1', '2', '3', '4', '5'],
'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}
df_a = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
df_a
|   | subject_id | first_name | last_name |
| --- | --- | --- | --- |
| 0 | 1 | Alex | Anderson |
| 1 | 2 | Amy | Ackerman |
| 2 | 3 | Allen | Ali |
| 3 | 4 | Alice | Aoni |
| 4 | 5 | Ayoung | Atiches |
# Create the second dataframe
raw_data = {
'subject_id': ['4', '5', '6', '7', '8'],
'first_name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
'last_name': ['Bonder', 'Black', 'Balwner', 'Brice', 'Btisan']}
df_b = pd.DataFrame(raw_data, columns = ['subject_id', 'first_name', 'last_name'])
df_b
|   | subject_id | first_name | last_name |
| --- | --- | --- | --- |
| 0 | 4 | Billy | Bonder |
| 1 | 5 | Brian | Black |
| 2 | 6 | Bran | Balwner |
| 3 | 7 | Bryce | Brice |
| 4 | 8 | Betty | Btisan |
# Create the third dataframe
raw_data = {
'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}
df_n = pd.DataFrame(raw_data, columns = ['subject_id','test_id'])
df_n
|   | subject_id | test_id |
| --- | --- | --- |
| 0 | 1 | 51 |
| 1 | 2 | 15 |
| 2 | 3 | 15 |
| 3 | 4 | 61 |
| 4 | 5 | 16 |
| 5 | 7 | 14 |
| 6 | 8 | 15 |
| 7 | 9 | 1 |
| 8 | 10 | 61 |
| 9 | 11 | 16 |
# Join the two dataframes along rows
df_new = pd.concat([df_a, df_b])
df_new
|   | subject_id | first_name | last_name |
| --- | --- | --- | --- |
| 0 | 1 | Alex | Anderson |
| 1 | 2 | Amy | Ackerman |
| 2 | 3 | Allen | Ali |
| 3 | 4 | Alice | Aoni |
| 4 | 5 | Ayoung | Atiches |
| 0 | 4 | Billy | Bonder |
| 1 | 5 | Brian | Black |
| 2 | 6 | Bran | Balwner |
| 3 | 7 | Bryce | Brice |
| 4 | 8 | Betty | Btisan |
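Because `concat` keeps each input's original index, the result above repeats the row labels 0-4. A minimal sketch of getting a fresh 0-9 index instead, using the same df_a and df_b (the variable name df_reindexed is just illustrative):
# Reset the row labels while concatenating
df_reindexed = pd.concat([df_a, df_b], ignore_index=True)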
# Join the two dataframes along columns
pd.concat([df_a, df_b], axis=1)
|   | subject_id | first_name | last_name | subject_id | first_name | last_name |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | 1 | Alex | Anderson | 4 | Billy | Bonder |
| 1 | 2 | Amy | Ackerman | 5 | Brian | Black |
| 2 | 3 | Allen | Ali | 6 | Bran | Balwner |
| 3 | 4 | Alice | Aoni | 7 | Bryce | Brice |
| 4 | 5 | Ayoung | Atiches | 8 | Betty | Btisan |
# Merge the two dataframes on subject_id
pd.merge(df_new, df_n, on='subject_id')
|   | subject_id | first_name | last_name | test_id |
| --- | --- | --- | --- | --- |
| 0 | 1 | Alex | Anderson | 51 |
| 1 | 2 | Amy | Ackerman | 15 |
| 2 | 3 | Allen | Ali | 15 |
| 3 | 4 | Alice | Aoni | 61 |
| 4 | 4 | Billy | Bonder | 61 |
| 5 | 5 | Ayoung | Atiches | 16 |
| 6 | 5 | Brian | Black | 16 |
| 7 | 7 | Bryce | Brice | 14 |
| 8 | 8 | Betty | Btisan | 15 |
# Merge the two dataframes using the subject_id key from both the left and right dataframes
pd.merge(df_new, df_n, left_on='subject_id', right_on='subject_id')
|   | subject_id | first_name | last_name | test_id |
| --- | --- | --- | --- | --- |
| 0 | 1 | Alex | Anderson | 51 |
| 1 | 2 | Amy | Ackerman | 15 |
| 2 | 3 | Allen | Ali | 15 |
| 3 | 4 | Alice | Aoni | 61 |
| 4 | 4 | Billy | Bonder | 61 |
| 5 | 5 | Ayoung | Atiches | 16 |
| 6 | 5 | Brian | Black | 16 |
| 7 | 7 | Bryce | Brice | 14 |
| 8 | 8 | Betty | Btisan | 15 |
Merge with an outer join.
"Full outer join produces the set of all records in Table A and Table B, with matching records from both sides where available. If there is no match, the missing side will contain null." -- [source](http://blog.codinghorror.com/a-visual-explanation-of-sql-joins/)
pd.merge(df_a, df_b, on='subject_id', how='outer')
|   | subject_id | first_name_x | last_name_x | first_name_y | last_name_y |
| --- | --- | --- | --- | --- | --- |
| 0 | 1 | Alex | Anderson | NaN | NaN |
| 1 | 2 | Amy | Ackerman | NaN | NaN |
| 2 | 3 | Allen | Ali | NaN | NaN |
| 3 | 4 | Alice | Aoni | Billy | Bonder |
| 4 | 5 | Ayoung | Atiches | Brian | Black |
| 5 | 6 | NaN | NaN | Bran | Balwner |
| 6 | 7 | NaN | NaN | Bryce | Brice |
| 7 | 8 | NaN | NaN | Betty | Btisan |
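If you also want to know which side each row came from, `merge` accepts an `indicator` flag; a minimal sketch with the same inputs:
# Adds a _merge column whose values are 'left_only', 'right_only', or 'both'
pd.merge(df_a, df_b, on='subject_id', how='outer', indicator=True)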
Merge with an inner join.
"Inner join produces only the set of records that match in both Table A and Table B." -- source
pd.merge(df_a, df_b, on='subject_id', how='inner')
|   | subject_id | first_name_x | last_name_x | first_name_y | last_name_y |
| --- | --- | --- | --- | --- | --- |
| 0 | 4 | Alice | Aoni | Billy | Bonder |
| 1 | 5 | Ayoung | Atiches | Brian | Black |
# Merge with a right join
pd.merge(df_a, df_b, on='subject_id', how='right')
|   | subject_id | first_name_x | last_name_x | first_name_y | last_name_y |
| --- | --- | --- | --- | --- | --- |
| 0 | 4 | Alice | Aoni | Billy | Bonder |
| 1 | 5 | Ayoung | Atiches | Brian | Black |
| 2 | 6 | NaN | NaN | Bran | Balwner |
| 3 | 7 | NaN | NaN | Bryce | Brice |
| 4 | 8 | NaN | NaN | Betty | Btisan |
Merge with a left join.
"Left outer join produces a complete set of records from Table A, with the matching records (where available) in Table B. If there is no match, the right side will contain null." -- source
pd.merge(df_a, df_b, on='subject_id', how='left')
|   | subject_id | first_name_x | last_name_x | first_name_y | last_name_y |
| --- | --- | --- | --- | --- | --- |
| 0 | 1 | Alex | Anderson | NaN | NaN |
| 1 | 2 | Amy | Ackerman | NaN | NaN |
| 2 | 3 | Allen | Ali | NaN | NaN |
| 3 | 4 | Alice | Aoni | Billy | Bonder |
| 4 | 5 | Ayoung | Atiches | Brian | Black |
# Merge while adding a suffix to duplicate column names
pd.merge(df_a, df_b, on='subject_id', how='left', suffixes=('_left', '_right'))
|   | subject_id | first_name_left | last_name_left | first_name_right | last_name_right |
| --- | --- | --- | --- | --- | --- |
| 0 | 1 | Alex | Anderson | NaN | NaN |
| 1 | 2 | Amy | Ackerman | NaN | NaN |
| 2 | 3 | Allen | Ali | NaN | NaN |
| 3 | 4 | Alice | Aoni | Billy | Bonder |
| 4 | 5 | Ayoung | Atiches | Brian | Black |
# Merge based on indexes
pd.merge(df_a, df_b, right_index=True, left_index=True)
|   | subject_id_x | first_name_x | last_name_x | subject_id_y | first_name_y | last_name_y |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | 1 | Alex | Anderson | 4 | Billy | Bonder |
| 1 | 2 | Amy | Ackerman | 5 | Brian | Black |
| 2 | 3 | Allen | Ali | 6 | Bran | Balwner |
| 3 | 4 | Alice | Aoni | 7 | Bryce | Brice |
| 4 | 5 | Ayoung | Atiches | 8 | Betty | Btisan |
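`DataFrame.join` is a shorthand for this index-on-index case; a minimal sketch with the same dataframes (the `_a`/`_b` suffixes are just illustrative names for the overlapping columns):
# Join on the index, renaming the overlapping columns
df_a.join(df_b, lsuffix='_a', rsuffix='_b')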
List Unique Values In A pandas Column
Special thanks to Bob Haffner for pointing out a better way of doing it.
# Import modules
import pandas as pd
# Set ipython's max row display
pd.set_option('display.max_rows', 1000)
# Set ipython's max column display
pd.set_option('display.max_columns', 50)
# Create an example dataframe
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'year': [2012, 2012, 2013, 2014, 2014],
'reports': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
|   | name | reports | year |
| --- | --- | --- | --- |
| Cochice | Jason | 4 | 2012 |
| Pima | Molly | 24 | 2012 |
| Santa Cruz | Tina | 31 | 2013 |
| Maricopa | Jake | 2 | 2014 |
| Yuma | Amy | 3 | 2014 |
# List the unique values in df['name']
df.name.unique()
# array(['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], dtype=object)
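If you also need to know how often each distinct value occurs, `value_counts` is the usual companion to `unique`; a quick sketch on the same column:
# Count how many times each unique value appears
df['name'].value_counts()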
Load A JSON File Into pandas
# Load library
import pandas as pd
# Create URL to JSON file (alternatively this can be a filepath)
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/data.json'
# Load the JSON file into a dataframe
df = pd.read_json(url, orient='columns')
# View the first ten rows
df.head(10)
|   | category | datetime | integer |
| --- | --- | --- | --- |
| 0 | 0 | 2015-01-01 00:00:00 | 5 |
| 1 | 0 | 2015-01-01 00:00:01 | 5 |
| 10 | 0 | 2015-01-01 00:00:10 | 5 |
| 11 | 0 | 2015-01-01 00:00:11 | 5 |
| 12 | 0 | 2015-01-01 00:00:12 | 8 |
| 13 | 0 | 2015-01-01 00:00:13 | 9 |
| 14 | 0 | 2015-01-01 00:00:14 | 8 |
| 15 | 0 | 2015-01-01 00:00:15 | 8 |
| 16 | 0 | 2015-01-01 00:00:16 | 2 |
| 17 | 0 | 2015-01-01 00:00:17 | 1 |
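`read_json` works best when the JSON is already table-shaped. For nested JSON, one common approach (a sketch only, using a hypothetical nested.json file) is to load it with the standard json module and flatten it with `pandas.json_normalize`, available in recent pandas versions:
import json
# Load the raw, possibly nested JSON records (hypothetical file)
with open('nested.json') as f:
    records = json.load(f)
# Flatten nested objects into dotted column names, e.g. address.city
df_flat = pd.json_normalize(records)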
Load An Excel File Into pandas
# Load library
import pandas as pd
# Create URL to Excel file (alternatively this can be a filepath)
url = 'https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/data.xlsx'
# Load the first sheet of the Excel file into a dataframe
# (recent pandas uses the keyword sheet_name rather than the old sheetname)
df = pd.read_excel(url, sheet_name=0, header=1)
# View the first ten rows
df.head(10)
|   | 5 | 2015-01-01 00:00:00 | 0 |
| --- | --- | --- | --- |
| 0 | 5 | 2015-01-01 00:00:01 | 0 |
| 1 | 9 | 2015-01-01 00:00:02 | 0 |
| 2 | 6 | 2015-01-01 00:00:03 | 0 |
| 3 | 6 | 2015-01-01 00:00:04 | 0 |
| 4 | 9 | 2015-01-01 00:00:05 | 0 |
| 5 | 7 | 2015-01-01 00:00:06 | 0 |
| 6 | 1 | 2015-01-01 00:00:07 | 0 |
| 7 | 6 | 2015-01-01 00:00:08 | 0 |
| 8 | 9 | 2015-01-01 00:00:09 | 0 |
| 9 | 5 | 2015-01-01 00:00:10 | 0 |
Load An Excel Spreadsheet As A pandas Dataframe
# Import modules
import pandas as pd
# Load the excel file and assign it to xls_file
xls_file = pd.ExcelFile('../data/example.xls')
xls_file
#
# View the spreadsheet's sheet names
xls_file.sheet_names
# ['Sheet1']
# Load the xls file's Sheet1 as a dataframe
df = xls_file.parse('Sheet1')
df
|   | year | deaths_attacker | deaths_defender | soldiers_attacker | soldiers_defender | wounded_attacker | wounded_defender |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 1945 | 425 | 423 | 2532 | 37235 | 41 | 14 |
| 1 | 1956 | 242 | 264 | 6346 | 2523 | 214 | 1424 |
| 2 | 1964 | 323 | 1231 | 3341 | 2133 | 131 | 131 |
| 3 | 1969 | 223 | 23 | 6732 | 1245 | 12 | 12 |
| 4 | 1971 | 783 | 23 | 12563 | 2671 | 123 | 34 |
| 5 | 1981 | 436 | 42 | 2356 | 7832 | 124 | 124 |
| 6 | 1982 | 324 | 124 | 253 | 2622 | 264 | 1124 |
| 7 | 1992 | 3321 | 631 | 5277 | 3331 | 311 | 1431 |
| 8 | 1999 | 262 | 232 | 2732 | 2522 | 132 | 122 |
| 9 | 2004 | 843 | 213 | 6278 | 26773 | 623 | 2563 |
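If the workbook had more than one sheet, `read_excel` can load them all at once; a minimal sketch with the same file path:
# sheet_name=None returns a dict mapping each sheet name to a dataframe
all_sheets = pd.read_excel('../data/example.xls', sheet_name=None)
list(all_sheets.keys())  # e.g. ['Sheet1']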
Load A CSV File Into pandas
# Import modules
import pandas as pd
import numpy as np
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'last_name': ['Miller', 'Jacobson', ".", 'Milner', 'Cooze'],
'age': [42, 52, 36, 24, 73],
'preTestScore': [4, 24, 31, ".", "."],
'postTestScore': ["25,000", "94,000", 57, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df
|   | first_name | last_name | age | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42 | 4 | 25,000 |
| 1 | Molly | Jacobson | 52 | 24 | 94,000 |
| 2 | Tina | . | 36 | 31 | 57 |
| 3 | Jake | Milner | 24 | . | 62 |
| 4 | Amy | Cooze | 73 | . | 70 |
# Save the dataframe as a csv in the working directory
df.to_csv('pandas_dataframe_importing_csv/example.csv')
# Load the csv back into a dataframe (note the saved index comes back as 'Unnamed: 0')
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv')
df
|   | Unnamed: 0 | first_name | last_name | age | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | 0 | Jason | Miller | 42 | 4 | 25,000 |
| 1 | 1 | Molly | Jacobson | 52 | 24 | 94,000 |
| 2 | 2 | Tina | . | 36 | 31 | 57 |
| 3 | 3 | Jake | Milner | 24 | . | 62 |
| 4 | 4 | Amy | Cooze | 73 | . | 70 |
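The 'Unnamed: 0' column is just the old index that `to_csv` wrote out. Two common ways around it are to pass `index=False` when saving, or to treat that first column as the index when reading; a minimal sketch of the latter, which leaves the file used in the rest of this recipe unchanged:
# Read the existing file, using its first column as the index
pd.read_csv('pandas_dataframe_importing_csv/example.csv', index_col=0)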
# Load a csv with no headers
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', header=None)
df
|   | 0 | 1 | 2 | 3 | 4 | 5 |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | NaN | first_name | last_name | age | preTestScore | postTestScore |
| 1 | 0.0 | Jason | Miller | 42 | 4 | 25,000 |
| 2 | 1.0 | Molly | Jacobson | 52 | 24 | 94,000 |
| 3 | 2.0 | Tina | . | 36 | 31 | 57 |
| 4 | 3.0 | Jake | Milner | 24 | . | 62 |
| 5 | 4.0 | Amy | Cooze | 73 | . | 70 |
# Specify column names while loading the csv
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'])
df
|   | UID | First Name | Last Name | Age | Pre-Test Score | Post-Test Score |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | NaN | first_name | last_name | age | preTestScore | postTestScore |
| 1 | 0.0 | Jason | Miller | 42 | 4 | 25,000 |
| 2 | 1.0 | Molly | Jacobson | 52 | 24 | 94,000 |
| 3 | 2.0 | Tina | . | 36 | 31 | 57 |
| 4 | 3.0 | Jake | Milner | 24 | . | 62 |
| 5 | 4.0 | Amy | Cooze | 73 | . | 70 |
# Load the csv while setting the index column to UID
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', index_col='UID', names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'])
df
| UID | First Name | Last Name | Age | Pre-Test Score | Post-Test Score |
| --- | --- | --- | --- | --- | --- |
| NaN | first_name | last_name | age | preTestScore | postTestScore |
| 0.0 | Jason | Miller | 42 | 4 | 25,000 |
| 1.0 | Molly | Jacobson | 52 | 24 | 94,000 |
| 2.0 | Tina | . | 36 | 31 | 57 |
| 3.0 | Jake | Milner | 24 | . | 62 |
| 4.0 | Amy | Cooze | 73 | . | 70 |
# Load the csv while setting the index columns to First Name and Last Name
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', index_col=['First Name', 'Last Name'], names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'])
df
| First Name | Last Name | UID | Age | Pre-Test Score | Post-Test Score |
| --- | --- | --- | --- | --- | --- |
| first_name | last_name | NaN | age | preTestScore | postTestScore |
| Jason | Miller | 0.0 | 42 | 4 | 25,000 |
| Molly | Jacobson | 1.0 | 52 | 24 | 94,000 |
| Tina | . | 2.0 | 36 | 31 | 57 |
| Jake | Milner | 3.0 | 24 | . | 62 |
| Amy | Cooze | 4.0 | 73 | . | 70 |
# Load the csv while specifying '.' as missing values
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', na_values=['.'])
pd.isnull(df)
|   | Unnamed: 0 | first_name | last_name | age | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False |
| 2 | False | False | True | False | False | False |
| 3 | False | False | False | False | True | False |
| 4 | False | False | False | False | True | False |
# Load the csv while specifying '.' and 'NA' as missing values in the Last Name column and '.' as missing values in the Pre-Test Score column
# (note: the dict keys must match the file's actual column names; here they don't, which is why the '.' values survive in the output below)
sentinels = {'Last Name': ['.', 'NA'], 'Pre-Test Score': ['.']}
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', na_values=sentinels)
df
|   | Unnamed: 0 | first_name | last_name | age | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | 0 | Jason | Miller | 42 | 4 | 25,000 |
| 1 | 1 | Molly | Jacobson | 52 | 24 | 94,000 |
| 2 | 2 | Tina | . | 36 | 31 | 57 |
| 3 | 3 | Jake | Milner | 24 | . | 62 |
| 4 | 4 | Amy | Cooze | 73 | . | 70 |
# Skip the top 3 rows while loading the csv
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', na_values=sentinels, skiprows=3)
df
|   | 2 | Tina | . | 36 | 31 | 57 |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | 3 | Jake | Milner | 24 | . | 62 |
| 1 | 4 | Amy | Cooze | 73 | . | 70 |
# Load the csv while interpreting ',' in number strings as a thousands separator
df = pd.read_csv('pandas_dataframe_importing_csv/example.csv', thousands=',')
df
|   | Unnamed: 0 | first_name | last_name | age | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | 0 | Jason | Miller | 42 | 4 | 25000 |
| 1 | 1 | Molly | Jacobson | 52 | 24 | 94000 |
| 2 | 2 | Tina | . | 36 | 31 | 57 |
| 3 | 3 | Jake | Milner | 24 | . | 62 |
| 4 | 4 | Amy | Cooze | 73 | . | 70 |
Long To Wide Format
# Import modules
import pandas as pd
raw_data = {'patient': [1, 1, 1, 2, 2],
'obs': [1, 2, 3, 1, 2],
'treatment': [0, 1, 0, 1, 0],
'score': [6252, 24243, 2345, 2342, 23525]}
df = pd.DataFrame(raw_data, columns = ['patient', 'obs', 'treatment', 'score'])
df
|   | patient | obs | treatment | score |
| --- | --- | --- | --- | --- |
| 0 | 1 | 1 | 0 | 6252 |
| 1 | 1 | 2 | 1 | 24243 |
| 2 | 1 | 3 | 0 | 2345 |
| 3 | 2 | 1 | 1 | 2342 |
| 4 | 2 | 2 | 0 | 23525 |
Make a "wide" dataframe.
Now we will create a "wide" dataframe with the rows indexed by patient number, the columns by observation number, and the cell values being the score values.
df.pivot(index='patient', columns='obs', values='score')
| obs | 1 | 2 | 3 |
| --- | --- | --- | --- |
| patient |  |  |  |
| 1 | 6252.0 | 24243.0 | 2345.0 |
| 2 | 2342.0 | 23525.0 | NaN |
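`pivot` raises an error if any (patient, obs) pair appears more than once; in that case `pivot_table` with an aggregation function is the usual fallback. A minimal sketch on the same df (mean is just an illustrative choice):
# Aggregate duplicate (patient, obs) pairs with their mean instead of erroring
df.pivot_table(index='patient', columns='obs', values='score', aggfunc='mean')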
Lower-Case Column Names In A pandas Dataframe
# Import modules
import pandas as pd
# Set ipython's max row display
pd.set_option('display.max_rows', 1000)
# Set ipython's max column display
pd.set_option('display.max_columns', 50)
# Create an example dataframe
data = {'NAME': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'YEAR': [2012, 2012, 2013, 2014, 2014],
'REPORTS': [4, 24, 31, 2, 3]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])
df
|   | NAME | REPORTS | YEAR |
| --- | --- | --- | --- |
| Cochice | Jason | 4 | 2012 |
| Pima | Molly | 24 | 2012 |
| Santa Cruz | Tina | 31 | 2013 |
| Maricopa | Jake | 2 | 2014 |
| Yuma | Amy | 3 | 2014 |
# Lower-case the column names
# Map the lowering function to all column names
df.columns = map(str.lower, df.columns)
df
|   | name | reports | year |
| --- | --- | --- | --- |
| Cochice | Jason | 4 | 2012 |
| Pima | Molly | 24 | 2012 |
| Santa Cruz | Tina | 31 | 2013 |
| Maricopa | Jake | 2 | 2014 |
| Yuma | Amy | 3 | 2014 |
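An equivalent that avoids reassigning `.columns` is `rename` with a callable; a minimal sketch:
# Apply str.lower to every column label, returning a new dataframe
df = df.rename(columns=str.lower)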
Create New Columns Using Functions
# Import modules
import pandas as pd
# Example dataframe
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'],
'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['regiment', 'company', 'name', 'preTestScore', 'postTestScore'])
df
|   | regiment | company | name | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- |
| 0 | Nighthawks | 1st | Miller | 4 | 25 |
| 1 | Nighthawks | 1st | Jacobson | 24 | 94 |
| 2 | Nighthawks | 2nd | Ali | 31 | 57 |
| 3 | Nighthawks | 2nd | Milner | 2 | 62 |
| 4 | Dragoons | 1st | Cooze | 3 | 70 |
| 5 | Dragoons | 1st | Jacon | 4 | 25 |
| 6 | Dragoons | 2nd | Ryaner | 24 | 94 |
| 7 | Dragoons | 2nd | Sone | 31 | 57 |
| 8 | Scouts | 1st | Sloan | 2 | 62 |
| 9 | Scouts | 1st | Piger | 3 | 70 |
| 10 | Scouts | 2nd | Riani | 2 | 62 |
| 11 | Scouts | 2nd | Ali | 3 | 70 |
# Create a function that takes two inputs, pre and post
def pre_post_difference(pre, post):
    # Return the difference between the two
    return post - pre
# Create a variable that is the output of the function
df['score_change'] = pre_post_difference(df['preTestScore'], df['postTestScore'])
# View the dataframe
df
|   | regiment | company | name | preTestScore | postTestScore | score_change |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | Nighthawks | 1st | Miller | 4 | 25 | 21 |
| 1 | Nighthawks | 1st | Jacobson | 24 | 94 | 70 |
| 2 | Nighthawks | 2nd | Ali | 31 | 57 | 26 |
| 3 | Nighthawks | 2nd | Milner | 2 | 62 | 60 |
| 4 | Dragoons | 1st | Cooze | 3 | 70 | 67 |
| 5 | Dragoons | 1st | Jacon | 4 | 25 | 21 |
| 6 | Dragoons | 2nd | Ryaner | 24 | 94 | 70 |
| 7 | Dragoons | 2nd | Sone | 31 | 57 | 26 |
| 8 | Scouts | 1st | Sloan | 2 | 62 | 60 |
| 9 | Scouts | 1st | Piger | 3 | 70 | 67 |
| 10 | Scouts | 2nd | Riani | 2 | 62 | 60 |
| 11 | Scouts | 2nd | Ali | 3 | 70 | 67 |
# Create a function that takes one input, x
def score_multipler_2x_and_3x(x):
    # Return two things, x doubled and x tripled
    return x*2, x*3
# Create two new variables that take the two outputs of the function
df['post_score_x2'], df['post_score_x3'] = zip(*df['postTestScore'].map(score_multipler_2x_and_3x))
df
|   | regiment | company | name | preTestScore | postTestScore | score_change | post_score_x2 | post_score_x3 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | Nighthawks | 1st | Miller | 4 | 25 | 21 | 50 | 75 |
| 1 | Nighthawks | 1st | Jacobson | 24 | 94 | 70 | 188 | 282 |
| 2 | Nighthawks | 2nd | Ali | 31 | 57 | 26 | 114 | 171 |
| 3 | Nighthawks | 2nd | Milner | 2 | 62 | 60 | 124 | 186 |
| 4 | Dragoons | 1st | Cooze | 3 | 70 | 67 | 140 | 210 |
| 5 | Dragoons | 1st | Jacon | 4 | 25 | 21 | 50 | 75 |
| 6 | Dragoons | 2nd | Ryaner | 24 | 94 | 70 | 188 | 282 |
| 7 | Dragoons | 2nd | Sone | 31 | 57 | 26 | 114 | 171 |
| 8 | Scouts | 1st | Sloan | 2 | 62 | 60 | 124 | 186 |
| 9 | Scouts | 1st | Piger | 3 | 70 | 67 | 140 | 210 |
| 10 | Scouts | 2nd | Riani | 2 | 62 | 60 | 124 | 186 |
| 11 | Scouts | 2nd | Ali | 3 | 70 | 67 | 140 | 210 |
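The `zip(*...)` idiom above unpacks a Series of tuples into two columns. In recent pandas an alternative sketch uses `DataFrame.apply` with `result_type='expand'` (same df and function as above; the variable name new_cols is just illustrative):
# Apply the function row by row; result_type='expand' turns each returned tuple into columns
new_cols = df.apply(lambda row: score_multipler_2x_and_3x(row['postTestScore']),
                    axis=1, result_type='expand')
new_cols.columns = ['post_score_x2', 'post_score_x3']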
Map External Values To Dataframe Values
# Import modules
import pandas as pd
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
'age': [42, 52, 36, 24, 73],
'city': ['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston']}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'city'])
df
|   | first_name | last_name | age | city |
| --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42 | San Francisco |
| 1 | Molly | Jacobson | 52 | Baltimore |
| 2 | Tina | Ali | 36 | Miami |
| 3 | Jake | Milner | 24 | Douglas |
| 4 | Amy | Cooze | 73 | Boston |
# Create a dictionary of values
city_to_state = { 'San Francisco' : 'California',
'Baltimore' : 'Maryland',
'Miami' : 'Florida',
'Douglas' : 'Arizona',
'Boston' : 'Massachusetts'}
df['state'] = df['city'].map(city_to_state)
df
|   | first_name | last_name | age | city | state |
| --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42 | San Francisco | California |
| 1 | Molly | Jacobson | 52 | Baltimore | Maryland |
| 2 | Tina | Ali | 36 | Miami | Florida |
| 3 | Jake | Milner | 24 | Douglas | Arizona |
| 4 | Amy | Cooze | 73 | Boston | Massachusetts |
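`map` returns NaN for any city that is missing from the dictionary; a minimal sketch of substituting a default label instead (the 'Unknown' value is just illustrative):
# Cities absent from city_to_state become NaN, so fill them with a default
df['state'] = df['city'].map(city_to_state).fillna('Unknown')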
Missing Data In pandas Dataframes
# Import modules
import pandas as pd
import numpy as np
raw_data = {'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
'age': [42, np.nan, 36, 24, 73],
'sex': ['m', np.nan, 'f', 'm', 'f'],
'preTestScore': [4, np.nan, np.nan, 2, 3],
'postTestScore': [25, np.nan, np.nan, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'sex', 'preTestScore', 'postTestScore'])
df
|   | first_name | last_name | age | sex | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | Tina | Ali | 36.0 | f | NaN | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 |
# Drop rows with missing values
df_no_missing = df.dropna()
df_no_missing
|   | first_name | last_name | age | sex | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 |
# Drop rows where all cells in that row are NA
df_cleaned = df.dropna(how='all')
df_cleaned
|   | first_name | last_name | age | sex | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 |
| 2 | Tina | Ali | 36.0 | f | NaN | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 |
# Create a new column full of missing values
df['location'] = np.nan
df
|   | first_name | last_name | age | sex | preTestScore | postTestScore | location |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 | NaN |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | Tina | Ali | 36.0 | f | NaN | NaN | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 | NaN |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 | NaN |
# Drop columns that only contain missing values
df.dropna(axis=1, how='all')
|   | first_name | last_name | age | sex | preTestScore | postTestScore |
| --- | --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | Tina | Ali | 36.0 | f | NaN | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 |
# Drop rows that contain fewer than five non-missing values
# This is really mostly useful for time series
df.dropna(thresh=5)
|   | first_name | last_name | age | sex | preTestScore | postTestScore | location |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 | NaN |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 | NaN |
# Fill in missing data with zeros
df.fillna(0)
|   | first_name | last_name | age | sex | preTestScore | postTestScore | location |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 | 0.0 |
| 1 | 0 | 0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 |
| 2 | Tina | Ali | 36.0 | f | 0.0 | 0.0 | 0.0 |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 | 0.0 |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 | 0.0 |
# Fill in missing values in preTestScore with the mean of preTestScore
# inplace=True means the change is saved to df right away
df["preTestScore"].fillna(df["preTestScore"].mean(), inplace=True)
df
|   | first_name | last_name | age | sex | preTestScore | postTestScore | location |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 | NaN |
| 1 | NaN | NaN | NaN | NaN | 3.0 | NaN | NaN |
| 2 | Tina | Ali | 36.0 | f | 3.0 | NaN | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 | NaN |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 | NaN |
# Fill in missing values in postTestScore with each sex's mean of postTestScore
df["postTestScore"].fillna(df.groupby("sex")["postTestScore"].transform("mean"), inplace=True)
df
|   | first_name | last_name | age | sex | preTestScore | postTestScore | location |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 | NaN |
| 1 | NaN | NaN | NaN | NaN | 3.0 | NaN | NaN |
| 2 | Tina | Ali | 36.0 | f | 3.0 | 70.0 | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 | NaN |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 | NaN |
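For ordered data, another common strategy is to fill gaps from neighbouring rows rather than from a group statistic; a minimal sketch of two alternatives on the same column (shown without inplace, so the recipe's df is left untouched):
# Carry the last observed value forward into the gaps
df['postTestScore'].ffill()
# Or interpolate linearly between the observed values
df['postTestScore'].interpolate()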
# Select rows where age is not NaN and sex is not NaN
df[df['age'].notnull() & df['sex'].notnull()]
|   | first_name | last_name | age | sex | preTestScore | postTestScore | location |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | Jason | Miller | 42.0 | m | 4.0 | 25.0 | NaN |
| 2 | Tina | Ali | 36.0 | f | 3.0 | 70.0 | NaN |
| 3 | Jake | Milner | 24.0 | m | 2.0 | 62.0 | NaN |
| 4 | Amy | Cooze | 73.0 | f | 3.0 | 70.0 | NaN |
Moving Averages In pandas
# Import modules
import pandas as pd
# Create data
data = {'score': [1,1,1,2,2,2,3,3,3]}
# Create a dataframe
df = pd.DataFrame(data)
# View the dataframe
df
|   | score |
| --- | --- |
| 0 | 1 |
| 1 | 1 |
| 2 | 1 |
| 3 | 2 |
| 4 | 2 |
| 5 | 2 |
| 6 | 3 |
| 7 | 3 |
| 8 | 3 |
# Calculate the moving average. That is, take the first two values, average them,
# then drop the first and add the third, and so on.
df.rolling(window=2).mean()
|   | score |
| --- | --- |
| 0 | NaN |
| 1 | 1.0 |
| 2 | 1.0 |
| 3 | 1.5 |
| 4 | 2.0 |
| 5 | 2.0 |
| 6 | 2.5 |
| 7 | 3.0 |
| 8 | 3.0 |
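The NaN in the first row appears because a window of 2 is not yet full there. A minimal sketch of two common variations (both are standard rolling/expanding options):
# Emit a value as soon as at least one observation is in the window
df.rolling(window=2, min_periods=1).mean()
# Or take a cumulative (expanding) mean over all rows seen so far
df.expanding().mean()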