obj=pd.Series([4,7,-5,3]) # 创建Series对象
obj.values # 返回值 array([4,7,-5,3])
obj.index # 返回索引
可以指定index
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
由字典类型数据生成
sdata = {
'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
data = {
'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
指定 index与columns
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
index=['one', 'two', 'three', 'four',
'five', 'six'])
基础操作:选取行与列!
frame2.columns
frame2['state'] # 法一:读取某一列
frame2.year # 法二:读取某一列
frame2.loc['three'] # 读取某一行
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj
new_obj = obj.drop('c') # 删除Series中的值
new_obj
obj.drop(['d', 'c']) # 删除Series中的2个值
数据预处理操作:删除df的行与列
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
# 删除df的行
data.drop(['Colorado', 'Ohio'])
# 删除df的列
data.drop('two', axis=1)
data.drop(['two', 'four'], axis='columns')
# 使用inplace=True原地删除并覆盖原对象(注意:此处是从Series对象obj中删除索引'c',并非删除df的列)
obj.drop('c', inplace=True)
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data.loc['Colorado', ['two', 'three']]
data.iloc[2, [3, 0, 1]]
data.iloc[2]
data.iloc[[1, 2], [3, 0, 1]]
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
np.abs(frame)
f = lambda x: x.max() - x.min()
frame.apply(f) # 对列使用函数
frame.apply(f, axis='columns') # 对行使用函数
def f(x):
    """Return the min and max of *x* as a Series labeled 'min'/'max'.

    Intended for use with ``DataFrame.apply``: applied to each column, it
    yields a summary DataFrame with rows 'min' and 'max'.
    """
    # Original paste lost the indentation of this line, which is a syntax
    # error in Python; restored here.
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)
format = lambda x: '%.2f' % x
frame.applymap(format)
frame['e'].map(format)
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
index=['three', 'one'],
columns=['d', 'a', 'b', 'c'])
frame.sort_index() # 按index值排序
frame.sort_index(axis=1) # 按columns值排序,默认升序
frame.sort_index(axis=1, ascending=False) # 按columns值排序,设为降序
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values() # NaN值放到末尾
frame = pd.DataFrame({
'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
frame.sort_values(by='b') # 按b列升序排列
frame.sort_values(by=['a', 'b'])
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
[np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
df.sum() # 求每一列的和
df.sum(axis='columns') # 求每一行的和
df.mean(axis='columns', skipna=False) # 不排除NaN
df.idxmax() # 找到每一列最大值对应的index
df.describe() # 按列汇总统计
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
obj.value_counts() # 统计每个值的取值次数
pd.value_counts(obj.values, sort=False) # sort=False:不按频次排序(默认会按频次降序排列)
# 读取分隔好的数据:read_csv默认分隔符是逗号(','),read_table默认分隔符是制表符('\t')
pd.read_csv()
pd.read_table()
df = pd.read_csv('examples/ex1.csv')
df # 返回一个dataframe对象
pd.read_table('examples/ex1.csv', sep=',')
# 读取无表头的文件(header默认为'infer',自动把首行当作表头;文件无表头时需显式传header=None)
pd.read_csv('examples/ex2.csv', header=None) # 默认列名
pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message']) # 自己指定列名
# 指定某一列为index
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('examples/ex2.csv', names=names, index_col='message')
# 分层索引
parsed = pd.read_csv('examples/csv_mindex.csv',
index_col=['key1', 'key2'])
parsed
跳过行:先跳过,再生成df
pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])
替代NaN值
result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])
result
sentinels = {
'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('examples/ex5.csv', na_values=sentinels)
设置显示的行数
pd.options.display.max_rows = 10
result = pd.read_csv('examples/ex6.csv')
result
更多这样使用:
pd.read_csv('examples/ex6.csv', nrows=5)
将数据写入文本
data = pd.read_csv('examples/ex5.csv')
data
data.to_csv('examples/out.csv')
data.to_csv(sys.stdout, index=False, header=False)
JSON数据
obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
{"name": "Katie", "age": 38,
"pets": ["Sixes", "Stache", "Cisco"]}]
}
"""
import json
result = json.loads(obj)
result
asjson = json.dumps(result)
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
siblings
data = pd.read_json('examples/example.json')
data
print(data.to_json())
print(data.to_json(orient='records'))
frame = pd.read_csv('examples/ex1.csv')
frame
frame.to_pickle('examples/frame_pickle')
pd.read_pickle('examples/frame_pickle')
注意:pickle仅被推荐为短期的存储格式
# 生成一个实例
xlsx = pd.ExcelFile('examples/ex1.xlsx')
pd.read_excel(xlsx, 'Sheet1')
frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')
frame
学习中…
学习中…
import numpy as np
import pandas as pd
创建一个含有缺失值的Series
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
0 aardvark
1 artichoke
2 NaN
3 avocado
dtype: object
删除缺失值
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()
0 1.0
2 3.5
4 7.0
dtype: float64
删除含有缺失值的行
from numpy import nan as NA
data = pd.DataFrame([[1., 6.5, 3.],
[1., NA, NA],
[NA, NA, NA],
[NA, 6.5, 3.]])
cleaned = data.dropna() # 删除含有NA的行
cleaned
0 1 2
0 1.0 6.5 3.0
删除全为缺失值的行
data.dropna(how='all') # 删除全为缺失值的行
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0
删除全为缺失值的列
data[4]=NA
data.dropna(axis=1,how='all')# 删除全为缺失值的列
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
0 1 2
0 -0.810186 NaN NaN
1 -1.647230 NaN NaN
2 0.048366 NaN -0.409267
3 1.240861 NaN 0.442658
4 -0.353365 2.785994 0.231522
5 1.184725 -1.566254 -0.840143
6 -0.827289 -0.212981 0.576380
df.dropna()
df.dropna(thresh=2) # 保留至少含有2个非缺失值的行,删除其余行
填充缺失值
df.fillna(0)
df.fillna({
1: 0.5, 2: 0})
_ = df.fillna(0, inplace=True)
df
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
df.fillna(method='ffill')
df.fillna(method='ffill', limit=2)
使用均值填充缺失值
# Fill missing values with the series mean.
# Fixed typo: original read `ata = ...`, which is a NameError / leaves the
# following line operating on the previous, unrelated `data` DataFrame.
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())
data = pd.DataFrame({
'k1': ['one', 'two'] * 3 + ['two'],
'k2': [1, 1, 2, 3, 3, 4, 4]})
data
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
6 two 4
# 返回一个布尔值series,反映每一行与之前的行是否重复
data.duplicated()
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
删除重复的行
data.drop_duplicates()
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
data['v1'] = range(7)
# 按照k1列重复值删除行
data.drop_duplicates(['k1'])
k1 k2 v1
0 one 1 0
1 two 1 1
data.drop_duplicates(['k1','k2'])
k1 k2 v1
0 one 1 0
1 two 1 1
2 one 2 2
3 two 3 3
4 one 3 4
5 two 4 5
数值替代
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data.replace(-999, np.nan)
data.replace([-999, -1000], np.nan)
data.replace([-999, -1000], [np.nan, 0])
data.replace({
-999: np.nan, -1000: 0})
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=['Ohio', 'Colorado', 'New York'],
columns=['one', 'two', 'three', 'four'])
data.rename(index=str.title, columns=str.upper)
data.rename(index={
'OHIO': 'INDIANA'},
columns={
'three': 'peekaboo'})
data.rename(index={
'OHIO': 'INDIANA'}, inplace=True)
随机抽样
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
df.take(sampler)
df.sample(n=3)
0 1 2 3
2 8 9 10 11
3 12 13 14 15
0 0 1 2 3
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws
见课本