示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# NumPy ufuncs can be applied directly to a DataFrame.
# Draw a 5x4 block of standard-normal samples and shift every value by -1.
shifted = np.random.randn(5, 4) - 1
df = pd.DataFrame(shifted)
print(df)
# np.abs is applied element-wise to the whole frame.
abs_df = np.abs(df)
print(abs_df)
指定轴的方向,默认axis=0,方向是列
指定轴方向,axis=1,方向是行
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Reduce rows or columns with DataFrame.apply.
df = pd.DataFrame(np.random.randn(5, 4) - 1)
print(df)


def _largest(values):
    """Return the maximum of the row or column handed over by apply."""
    return values.max()


# Default axis=0: the function receives one column at a time.
print(df.apply(_largest))
# axis=1: the function receives one row at a time.
print(df.apply(_largest, axis=1))
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Apply a function to every single element of a DataFrame.
df = pd.DataFrame(np.random.randn(5, 4) - 1)
print(df)


def f2(x):
    """Format one numeric value as a string with two decimal places."""
    return '%.2f' % x


# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; prefer the new name when this pandas version has it and
# fall back to applymap on older releases.
if hasattr(pd.DataFrame, 'map'):
    formatted = df.map(f2)
else:
    formatted = df.applymap(f2)
print(formatted)
格式:sort_index()
含义:排序默认使用升序排序,ascending=False 为降序排序
Series操作
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Series whose index labels are drawn at random from 0..4 (repeats possible).
s4 = pd.Series(range(10, 15), index=np.random.randint(5, size=5))
print(s4)
# sort_index returns a new, sorted Series and leaves s4 untouched, so the
# result has to be captured and printed to be visible in a script.
s4_isort = s4.sort_index()
# Example index order after sorting: 0 0 1 3 3 (labels are random).
print(s4_isort)
DataFrame操作时注意轴方向
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# DataFrame with randomly drawn (possibly repeated) row and column labels.
values = np.random.randn(3, 5)
row_labels = np.random.randint(0, 3, size=3)
col_labels = np.random.randint(0, 5, size=5)
df4 = pd.DataFrame(values, index=row_labels, columns=col_labels)
print(df4)
# Sort along the column axis, largest label first.
df4_isort = df4.sort_index(axis='columns', ascending=False)
# Example column order: 4 2 1 1 0 (labels are random, so this varies).
print(df4_isort)
格式:sort_values(by='column name')
含义:根据某个唯一的列名进行排序,如果有其他相同列名则报错。
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# DataFrame with random row labels and a random ordering of columns 0..4.
# sort_values(by=3) needs the label 3 to exist exactly once: a missing label
# raises KeyError and a repeated one raises ValueError.  Drawing the column
# labels independently at random made this example fail on many runs, so a
# permutation is used to keep every label unique.
df4 = pd.DataFrame(np.random.randn(3, 5),
                   index=np.random.randint(3, size=3),
                   columns=np.random.permutation(5))
print(df4)
# Sort the rows by the values in column 3, largest first.
df4_vsort = df4.sort_values(by=3, ascending=False)
print(df4_vsort)
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Four rows: one random row followed by three literal rows, some with NaN.
rows = [
    np.random.randn(3),
    [1., 2., np.nan],
    [np.nan, 4., np.nan],
    [1., 2., 3.],
]
df_data = pd.DataFrame(rows)
# head() returns the first five rows by default, which is all four here.
print(df_data.head(n=5))
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Sample frame containing three NaN entries.
random_row = np.random.randn(3)
df_data = pd.DataFrame([random_row,
                        [1., 2., np.nan],
                        [np.nan, 4., np.nan],
                        [1., 2., 3.]])
# isnull() yields a boolean frame that is True wherever a value is missing.
missing_mask = df_data.isnull()
print(missing_mask)
根据axis轴方向,丢弃包含NaN的行或列。
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Sample frame containing three NaN entries.
df_data = pd.DataFrame([
    np.random.randn(3),
    [1., 2., np.nan],
    [np.nan, 4., np.nan],
    [1., 2., 3.],
])
# Default axis=0: drop every row that contains at least one NaN.
rows_kept = df_data.dropna()
print(rows_kept)
# axis=1: drop every column that contains at least one NaN.
cols_kept = df_data.dropna(axis=1)
print(cols_kept)
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Sample frame containing three NaN entries.
first_row = np.random.randn(3)
remaining_rows = [[1., 2., np.nan], [np.nan, 4., np.nan], [1., 2., 3.]]
df_data = pd.DataFrame([first_row] + remaining_rows)
# Replace every missing value with the sentinel -100.0.
print(df_data.fillna(value=-100.))
pd.merge
根据单个或多个键将不同DataFrame的行连接起来
类似数据库的连接操作
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Two frames sharing a 'key' column; the data columns hold random ints in [0, 10).
left_keys = ['b', 'b', 'a', 'c', 'a', 'a', 'b']
right_keys = ['a', 'b', 'd']
df_obj1 = pd.DataFrame({'key': left_keys,
                        'data1': np.random.randint(0, 10, size=7)})
df_obj2 = pd.DataFrame({'key': right_keys,
                        'data2': np.random.randint(0, 10, size=3)})
for frame in (df_obj1, df_obj2):
    print(frame)
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Left frame: seven rows keyed by 'a'/'b'/'c'; right frame: one row per key.
df_obj1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': np.random.randint(0, 10, 7),
})
df_obj2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'data2': np.random.randint(0, 10, 3),
})
# With no key given, merge joins on the column names the frames share.
merged = pd.merge(df_obj1, df_obj2)
print(merged)
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Two frames that share the 'key' column.
left_data = np.random.randint(0, 10, 7)
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1': left_data})
right_data = np.random.randint(0, 10, 3)
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data2': right_data})
# `on` names the join key explicitly.
joined_on_key = pd.merge(df_obj1, df_obj2, on='key')
print(joined_on_key)
默认是“内连接”(inner),即结果中的键是交集
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Build the two frames with a common 'key' column first ...
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1': np.random.randint(0, 10, 7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data2': np.random.randint(0, 10, 3)})
# ... then rename the key on each side so the frames no longer share a name.
left_renames = {'key': 'key1'}
right_renames = {'key': 'key2'}
df_obj1 = df_obj1.rename(columns=left_renames)
df_obj2 = df_obj2.rename(columns=right_renames)
# left_on / right_on name the join key of the left and right frame separately.
joined = pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2')
print(joined)
how指定连接方式
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Two frames whose key columns are renamed to 'key1' and 'key2'.
df_obj1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': np.random.randint(0, 10, 7),
})
df_obj2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'data2': np.random.randint(0, 10, 3),
})
df_obj1 = df_obj1.rename(columns={'key': 'key1'})
df_obj2 = df_obj2.rename(columns={'key': 'key2'})
# Outer join: keep the keys of both sides; unmatched cells become NaN.
outer_joined = pd.merge(df_obj1, df_obj2,
                        left_on='key1', right_on='key2', how='outer')
print(outer_joined)
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Two frames whose key columns are renamed to 'key1' and 'key2'.
left_keys = ['b', 'b', 'a', 'c', 'a', 'a', 'b']
df_obj1 = pd.DataFrame({'key': left_keys,
                        'data1': np.random.randint(0, 10, 7)})
right_keys = ['a', 'b', 'd']
df_obj2 = pd.DataFrame({'key': right_keys,
                        'data2': np.random.randint(0, 10, 3)})
df_obj1 = df_obj1.rename(columns={'key': 'key1'})
df_obj2 = df_obj2.rename(columns={'key': 'key2'})
# Left join: keep every row of the left frame.
join_kind = 'left'
print(pd.merge(df_obj1, df_obj2,
               left_on='key1', right_on='key2', how=join_kind))
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Two frames whose key columns are renamed to 'key1' and 'key2'.
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1': np.random.randint(0, 10, size=7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data2': np.random.randint(0, 10, size=3)})
df_obj1 = df_obj1.rename(columns={'key': 'key1'})
df_obj2 = df_obj2.rename(columns={'key': 'key2'})
# Right join: keep every row of the right frame.
merge_options = {'left_on': 'key1', 'right_on': 'key2', 'how': 'right'}
right_joined = pd.merge(df_obj1, df_obj2, **merge_options)
print(right_joined)
格式:suffixes,默认为_x, _y
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Handling a non-key column name ('data') that exists in both frames.
df_obj1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data': np.random.randint(0, 10, 7),
})
df_obj2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'data': np.random.randint(0, 10, 3),
})
# The suffixes are appended to the clashing column names of each side.
side_suffixes = ('_left', '_right')
print(pd.merge(df_obj1, df_obj2, on='key', suffixes=side_suffixes))
格式:left_index=True或right_index=True
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Joining a column of the left frame against the index of the right frame.
df_obj1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': np.random.randint(0, 10, 7),
})
right_values = {'data2': np.random.randint(0, 10, 3)}
df_obj2 = pd.DataFrame(right_values, index=['a', 'b', 'd'])
# right_index=True makes the right frame's index act as its join key.
index_joined = pd.merge(df_obj1, df_obj2, left_on='key', right_index=True)
print(index_joined)
格式:np.concatenate
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Two 3x4 integer arrays with random entries in [0, 10).
shape = (3, 4)
arr1 = np.random.randint(0, 10, shape)
arr2 = np.random.randint(0, 10, shape)
for arr in (arr1, arr2):
    print(arr)
# Default axis=0 stacks the arrays vertically, giving a 6x4 result.
stacked_rows = np.concatenate((arr1, arr2))
print(stacked_rows)
# axis=1 joins them side by side, giving a 3x8 result.
stacked_cols = np.concatenate((arr1, arr2), axis=1)
print(stacked_cols)
指定轴方向,默认axis=0
join指定合并方式,默认为outer
Series合并时查看行索引有无重复
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Case 1: the three Series have disjoint index labels (0-4, 5-8, 9-11).
ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(5))
ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(5, 9))
ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(9, 12))
pieces = [ser_obj1, ser_obj2, ser_obj3]
for piece in pieces:
    print(piece)
# Default axis=0 appends the Series end to end.
print(pd.concat(pieces))
# axis=1 places each Series in its own column, aligned on the index.
print(pd.concat(pieces, axis=1))
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Case 2: the three Series have overlapping index labels.
ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(0, 5))
ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(0, 4))
ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(0, 3))
print(ser_obj1)
print(ser_obj2)
print(ser_obj3)
# Concatenating along axis=0 keeps the repeated labels as they are.
combined = pd.concat((ser_obj1, ser_obj2, ser_obj3))
print(combined)
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# A 3x2 frame (columns A, B) and a 2x2 frame (columns C, D) whose row
# labels partly overlap.
df_obj1 = pd.DataFrame(np.random.randint(0, 10, (3, 2)),
                       index=list('abc'), columns=list('AB'))
df_obj2 = pd.DataFrame(np.random.randint(0, 10, (2, 2)),
                       index=list('ab'), columns=list('CD'))
print(df_obj1)
print(df_obj2)
frames = [df_obj1, df_obj2]
# Default: stack the rows and take the union of the columns (outer join).
print(pd.concat(frames))
# axis=1 with join='inner': place side by side, keeping only shared row labels.
print(pd.concat(frames, axis=1, join='inner'))
将列索引旋转为行索引,完成层级索引
DataFrame->Series
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# 5x2 frame of random ints in [0, 10).
raw = np.random.randint(0, 10, (5, 2))
df_obj = pd.DataFrame(raw, columns=['data1', 'data2'])
print(df_obj)
# stack() moves the column labels into an inner index level, producing a
# Series with a two-level index.
stacked = df_obj.stack()
print(stacked)
将层级索引展开
Series->DataFrame
默认操作内层索引,即level=-1
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# 5x2 frame of random ints, stacked into a Series with a two-level index.
column_names = ['data1', 'data2']
df_obj = pd.DataFrame(np.random.randint(0, 10, (5, 2)), columns=column_names)
print(df_obj)
stacked = df_obj.stack()
# Default: unstack the innermost level, restoring the original layout.
restored = stacked.unstack()
print(restored)
# level=0 unstacks the outer level instead, so the old row labels become columns.
by_outer_level = stacked.unstack(level=0)
print(by_outer_level)
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Eight rows: 'a' x4 then 'b' x4, paired with random ints in [0, 4).
labels = ['a'] * 4 + ['b'] * 4
df_obj = pd.DataFrame({'data1': labels,
                       'data2': np.random.randint(0, 4, 8)})
print(df_obj)
# duplicated() flags each row that repeats an earlier row.
dup_mask = df_obj.duplicated()
print(dup_mask)
默认判断全部列,可指定按某些列判断
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Eight rows: 'a' x4 then 'b' x4, paired with random ints in [0, 4).
df_obj = pd.DataFrame({
    'data1': ['a'] * 4 + ['b'] * 4,
    'data2': np.random.randint(0, 4, 8),
})
print(df_obj)
# Default: a row is a duplicate only if all of its columns match an earlier row.
print(df_obj.drop_duplicates())
# Judge duplicates by the 'data2' column alone.
print(df_obj.drop_duplicates(subset='data2'))
Series的map方法将传入的函数应用到每个元素上进行转换
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Ten random ints in [0, 10).
ser_obj = pd.Series(np.random.randint(0, 10, 10))
print(ser_obj)


def _square(value):
    """Return the square of a single element."""
    return value ** 2


# Series.map calls the function once per element.
print(ser_obj.map(_square))
replace根据值的内容进行替换
示例代码:
# 导入numpy,别名np
import numpy as np
# 导入pandas,别名 pd
import pandas as pd
# Ten random ints in [0, 10).
random_ints = np.random.randint(0, 10, 10)
ser_obj = pd.Series(random_ints)
print(ser_obj)
# One value replaced by one value.
print(ser_obj.replace(to_replace=1, value=-100))
# Several values replaced by the same value.
print(ser_obj.replace(to_replace=[6, 8], value=-100))
# Several values replaced pairwise: 4 -> -100, 7 -> -200.
print(ser_obj.replace(to_replace=[4, 7], value=[-100, -200]))