**
**
import pandas as pd
import numpy as np
1.Series
# Build a Series from a plain list; the NaN entry is kept as a missing value.
raw_values = [1, 3, 6, np.nan, 44, 1]
s = pd.Series(raw_values)
print(s)         # index labels are shown on the left, values on the right
print(s.values)  # the underlying values
print(s.index)   # the index object
# The index defaults to 0..n-1, but custom labels can be supplied instead.
s2 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])  # labels a, b, c, d
print(s2)
结果
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
[ 1. 3. 6. nan 44. 1.]
RangeIndex(start=0, stop=6, step=1)
a 1
b 2
c 3
d 4
dtype: int64
举例:各个州的人口数量
# Population figures keyed by state name.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
s3 = pd.Series(sdata)
print(s3)
# Build a second Series over an explicit label list: labels missing from
# sdata become NaN, and keys of sdata that are not listed are left out.
states = ['California', 'Ohio', 'Oregon', 'Texas']
s4 = pd.Series(data=sdata, index=states)
print(s4)  # 'California' has no entry in sdata, so its value is missing
print(s4.isnull())  # flag missing values (notnull() gives the inverse)
# Both the Series itself and its index carry a `name` attribute.
s4.index.name = 'state'
s4.name = 'population'
print(s4)
结果
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
California True
Ohio False
Oregon False
Texas False
dtype: bool
state
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
Name: population, dtype: float64
2.DataFrame
# Column-oriented dict: each key becomes one DataFrame column.
data1 = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [str(year) for year in range(2000, 2006)],  # years kept as strings
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame1 = pd.DataFrame(data=data1)
print(frame1)
state_column = frame1['state']  # select the 'state' column as a Series
print(state_column)
结果
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2003 2.4
4 Nevada 2004 2.9
5 Nevada 2005 3.2
0 Ohio
1 Ohio
2 Ohio
3 Nevada
4 Nevada
5 Nevada
Name: state, dtype: object
# Six consecutive daily timestamps starting at 2018-08-19.
# (An explicit range works too: pd.date_range('2018-08-19', '2018-08-24').)
data2 = pd.date_range(start='2018-08-19', periods=6)
# np.random.randn(d0, ..., dn) draws samples from the standard normal
# distribution; np.random.rand(d0, ..., dn) draws samples from [0, 1).
# The arguments (6, 4) request 6 rows by 4 columns.
random_matrix = np.random.randn(6, 4)
frame2 = pd.DataFrame(random_matrix, index=data2, columns=list('abcd'))
print(frame2)
# A DataFrame can be indexed by row and by column; it can be thought of as
# a dict of Series that share one index.
print(frame2['b'])
# Without explicit labels, rows and columns are numbered from 0.
frame3 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(frame3)
# Another way: a dict mixing lists, scalars, Series and arrays.
# Scalar entries ('B' and 'F') are repeated for every row.
frame4_columns = {
    'A': [1, 2, 3, 4],
    'B': pd.Timestamp('20180819'),
    'C': pd.Series([1, 6, 9, 10], dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
}
frame4 = pd.DataFrame(frame4_columns)
print(frame4)
print(frame4.index)
print(frame4.columns)
print(frame4.values)
# Summary statistics.
print(frame4.describe())
# Transpose the data (rows become columns); shown in two equivalent spellings.
print(frame4.T)
print(frame4.transpose())
# sort_index(axis=0) orders by the row labels, sort_index(axis=1) orders by
# the column labels. ascending defaults to True; pass False to reverse.
print(frame4.sort_index(axis=0, ascending=True))   # row labels, ascending
print(frame4.sort_index(axis=1, ascending=False))  # column labels, descending
print(frame4.sort_values(by='C', ascending=False))  # rows by column C, descending
结果
a b c d
2018-08-19 0.354866 0.015070 -0.627576 -0.495415
2018-08-20 2.900697 0.721100 -1.177336 2.631225
2018-08-21 2.587944 0.169646 -0.733831 -0.341553
2018-08-22 0.054560 -0.677785 1.023810 -0.290743
2018-08-23 -2.276774 -0.953618 -1.275852 0.469861
2018-08-24 1.005236 -0.088767 1.049968 1.524960
2018-08-19 0.015070
2018-08-20 0.721100
2018-08-21 0.169646
2018-08-22 -0.677785
2018-08-23 -0.953618
2018-08-24 -0.088767
Freq: D, Name: b, dtype: float64
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
A B C D E F
0 1 2018-08-19 1.0 3 test foo
1 2 2018-08-19 6.0 3 train foo
2 3 2018-08-19 9.0 3 test foo
3 4 2018-08-19 10.0 3 train foo
RangeIndex(start=0, stop=4, step=1)
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
[[1 Timestamp('2018-08-19 00:00:00') 1.0 3 'test' 'foo']
[2 Timestamp('2018-08-19 00:00:00') 6.0 3 'train' 'foo']
[3 Timestamp('2018-08-19 00:00:00') 9.0 3 'test' 'foo']
[4 Timestamp('2018-08-19 00:00:00') 10.0 3 'train' 'foo']]
A C D
count 4.000000 4.000000 4.0
mean 2.500000 6.500000 3.0
std 1.290994 4.041452 0.0
min 1.000000 1.000000 3.0
25% 1.750000 4.750000 3.0
50% 2.500000 7.500000 3.0
75% 3.250000 9.250000 3.0
max 4.000000 10.000000 3.0
0 ... 3
A 1 ... 4
B 2018-08-19 00:00:00 ... 2018-08-19 00:00:00
C 1 ... 10
D 3 ... 3
E test ... train
F foo ... foo
[6 rows x 4 columns]
0 ... 3
A 1 ... 4
B 2018-08-19 00:00:00 ... 2018-08-19 00:00:00
C 1 ... 10
D 3 ... 3
E test ... train
F foo ... foo
[6 rows x 4 columns]
A B C D E F
0 1 2018-08-19 1.0 3 test foo
1 2 2018-08-19 6.0 3 train foo
2 3 2018-08-19 9.0 3 test foo
3 4 2018-08-19 10.0 3 train foo
F E D C B A
0 foo test 3 1.0 2018-08-19 1
1 foo train 3 6.0 2018-08-19 2
2 foo test 3 9.0 2018-08-19 3
3 foo train 3 10.0 2018-08-19 4
A B C D E F
3 4 2018-08-19 10.0 3 train foo
2 3 2018-08-19 9.0 3 test foo
1 2 2018-08-19 6.0 3 train foo
0 1 2018-08-19 1.0 3 test foo
3.pandas选择数据
实战筛选数据
# A 6x4 grid holding 0..23, indexed by six consecutive days.
dates = pd.date_range(start='20180819', periods=6)
grid = np.arange(24).reshape(6, 4)
df = pd.DataFrame(grid, index=dates, columns=list('ABCD'))
print(df)
A B C D
2018-08-19 0 1 2 3
2018-08-20 4 5 6 7
2018-08-21 8 9 10 11
2018-08-22 12 13 14 15
2018-08-23 16 17 18 19
2018-08-24 20 21 22 23
# Select column A (attribute access, df.A, works as well).
column_a = df['A']
print(column_a)
# Slicing with [] selects a span of rows.
# Here: the first three rows.
first_three_rows = df[0:3]
print(first_three_rows)
2018-08-19 0
2018-08-20 4
2018-08-21 8
2018-08-22 12
2018-08-23 16
2018-08-24 20
Freq: D, Name: A, dtype: int32
A B C D
2018-08-19 0 1 2 3
2018-08-20 4 5 6 7
2018-08-21 8 9 10 11
# Label-based selection with .loc: pick specific rows and/or columns.
# One row, addressed by its date label.
row_for_day = df.loc['20180819']
print(row_for_day)
# All rows, columns A through C (label slices include both endpoints).
columns_a_to_c = df.loc[:, 'A':'C']
print(columns_a_to_c)
# Row and columns at the same time.
day_a_and_b = df.loc['20180819', ['A', 'B']]
print(day_a_and_b)
A 0
B 1
C 2
D 3
Name: 2018-08-19 00:00:00, dtype: int32
A B C
2018-08-19 0 1 2
2018-08-20 4 5 6
2018-08-21 8 9 10
2018-08-22 12 13 14
2018-08-23 16 17 18
2018-08-24 20 21 22
A 0
B 1
Name: 2018-08-19 00:00:00, dtype: int32
# Position-based selection with .iloc: fetch values at specific positions.
single_cell = df.iloc[3, 1]
print(single_cell)
sub_block = df.iloc[3:5, 1:3]
print(sub_block)
# Non-contiguous rows picked by an explicit position list.
picked_rows = df.iloc[[1, 3, 5], 1:3]
print(picked_rows)
# Mixed: a row slice combined with a column position list.
mixed_selection = df.iloc[:3, [0, 2]]
print(mixed_selection)
# Boolean filtering: keep the rows where column A is greater than 8.
a_above_8 = df.A > 8
print(df[a_above_8])
13
B C
2018-08-22 13 14
2018-08-23 17 18
B C
2018-08-20 5 6
2018-08-22 13 14
2018-08-24 21 22
A C
2018-08-19 0 2
2018-08-20 4 6
2018-08-21 8 10
A B C D
2018-08-22 12 13 14 15
2018-08-23 16 17 18 19
2018-08-24 20 21 22 23
4.pandas设置值
# Build the sample data: a 6x4 integer grid indexed by six consecutive days.
dates2 = pd.date_range('20180819', periods=6)
df2 = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates2, columns=['A', 'B', 'C', 'D'])
print(df2)
# Set single cells by position (iloc) and by label (loc).
df2.iloc[2, 2] = 111
df2.loc['20180820', 'B'] = 2222
print(df2)
# Conditional assignment: set column B to 0 on every row where A > 4.
# A single .loc call writes into df2 itself. The chained form
# `df2.B[df2.A > 4] = 0` assigns through an intermediate object and is not
# guaranteed to update df2 (it is a no-op under pandas Copy-on-Write).
df2.loc[df2.A > 4, 'B'] = 0
print(df2)
# Set a whole column at once: the new column F is filled with NaN.
df2['F'] = np.nan
print(df2)
# Add a Series as a new column. Values are aligned on index labels, not on
# position: this Series starts one day later than df2, so 2018-08-19 gets
# NaN and the value labelled 2018-08-25 is dropped.
df2['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20180820', periods=6))
print(df2)
# Set one cell addressed by row label and column label.
df2.loc['20180820', 'A'] = 67  # same cell as df2.iloc[1, 0]
print(df2)
# Overwrite one entire row with NaN. NaN is a float, so the integer columns
# are cast to float explicitly first instead of relying on implicit
# upcasting during assignment, which newer pandas versions deprecate.
df2 = df2.astype('float64')
df2.iloc[1] = np.nan  # equivalent to df2.iloc[1, :] = np.nan
print(df2)
结果
A B C D
2018-08-19 0 1 2 3
2018-08-20 4 5 6 7
2018-08-21 8 9 10 11
2018-08-22 12 13 14 15
2018-08-23 16 17 18 19
2018-08-24 20 21 22 23
A B C D
2018-08-19 0 1 2 3
2018-08-20 4 2222 6 7
2018-08-21 8 9 111 11
2018-08-22 12 13 14 15
2018-08-23 16 17 18 19
2018-08-24 20 21 22 23
A B C D
2018-08-19 0 1 2 3
2018-08-20 4 2222 6 7
2018-08-21 8 0 111 11
2018-08-22 12 0 14 15
2018-08-23 16 0 18 19
2018-08-24 20 0 22 23
A B C D F
2018-08-19 0 1 2 3 NaN
2018-08-20 4 2222 6 7 NaN
2018-08-21 8 0 111 11 NaN
2018-08-22 12 0 14 15 NaN
2018-08-23 16 0 18 19 NaN
2018-08-24 20 0 22 23 NaN
A B C D F E
2018-08-19 0 1 2 3 NaN NaN
2018-08-20 4 2222 6 7 NaN 1.0
2018-08-21 8 0 111 11 NaN 2.0
2018-08-22 12 0 14 15 NaN 3.0
2018-08-23 16 0 18 19 NaN 4.0
2018-08-24 20 0 22 23 NaN 5.0
A B C D F E
2018-08-19 0 1 2 3 NaN NaN
2018-08-20 67 2222 6 7 NaN 1.0
2018-08-21 8 0 111 11 NaN 2.0
2018-08-22 12 0 14 15 NaN 3.0
2018-08-23 16 0 18 19 NaN 4.0
2018-08-24 20 0 22 23 NaN 5.0
A B C D F E
2018-08-19 0.0 1.0 2.0 3.0 NaN NaN
2018-08-20 NaN NaN NaN NaN NaN NaN
2018-08-21 8.0 0.0 111.0 11.0 NaN 2.0
2018-08-22 12.0 0.0 14.0 15.0 NaN 3.0
2018-08-23 16.0 0.0 18.0 19.0 NaN 4.0
2018-08-24 20.0 0.0 22.0 23.0 NaN 5.0
Process finished with exit code 0
5.pandas处理丢失数据
# Build a 6x4 integer grid, then punch two NaN holes into it.
dates3 = pd.date_range('20180819', periods=6)
df3 = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates3, columns=['A', 'B', 'C', 'D'])
print(df3)
# NaN is a float and cannot live in an integer column. Cast the two target
# columns (B and C) to float explicitly instead of relying on implicit
# upcasting during assignment, which newer pandas versions deprecate.
# Columns A and D keep their integer dtype.
df3 = df3.astype({'B': 'float64', 'C': 'float64'})
df3.iloc[0, 1] = np.nan  # row 2018-08-19, column B
df3.iloc[1, 2] = np.nan  # row 2018-08-20, column C
print(df3)
A B C D
2018-08-19 0 1 2 3
2018-08-20 4 5 6 7
2018-08-21 8 9 10 11
2018-08-22 12 13 14 15
2018-08-23 16 17 18 19
2018-08-24 20 21 22 23
A B C D
2018-08-19 0 NaN 2.0 3
2018-08-20 4 5.0 NaN 7
2018-08-21 8 9.0 10.0 11
2018-08-22 12 13.0 14.0 15
2018-08-23 16 17.0 18.0 19
2018-08-24 20 21.0 22.0 23
# Drop the rows or columns that contain NaN.
print(df3.dropna())  # by default, rows containing NaN are dropped
# The same call with the defaults spelled out:
#   axis=0 operates on rows, axis=1 operates on columns;
#   how='any' drops as soon as one NaN is present,
#   how='all' drops only when every value is NaN.
rows_without_nan = df3.dropna(axis=0, how='any')
print(rows_without_nan)
A B C D
2018-08-21 8 9.0 10.0 11
2018-08-22 12 13.0 14.0 15
2018-08-23 16 17.0 18.0 19
2018-08-24 20 21.0 22.0 23
A B C D
2018-08-21 8 9.0 10.0 11
2018-08-22 12 13.0 14.0 15
2018-08-23 16 17.0 18.0 19
2018-08-24 20 21.0 22.0 23
# Drop every column that contains at least one NaN.
columns_without_nan = df3.dropna(axis=1, how='any')
print(columns_without_nan)
A D
2018-08-19 0 3
2018-08-20 4 7
2018-08-21 8 11
2018-08-22 12 15
2018-08-23 16 19
2018-08-24 20 23
# Replace NaN with 0 (any other fill value works the same way).
filled = df3.fillna(value=0)
print(filled)
A B C D
2018-08-19 0 0.0 2.0 3
2018-08-20 4 5.0 0.0 7
2018-08-21 8 9.0 10.0 11
2018-08-22 12 13.0 14.0 15
2018-08-23 16 17.0 18.0 19
2018-08-24 20 21.0 22.0 23
# Element-wise mask: True where a value is missing.
null_mask = df3.isnull()
print(null_mask)
A B C D
2018-08-19 False True False False
2018-08-20 False False True False
2018-08-21 False False False False
2018-08-22 False False False False
2018-08-23 False False False False
2018-08-24 False False False False
# Element-wise mask: True where a value is NaN (same result as isnull()).
na_mask = df3.isna()
print(na_mask)
A B C D
2018-08-19 False True False False
2018-08-20 False False True False
2018-08-21 False False False False
2018-08-22 False False False False
2018-08-23 False False False False
2018-08-24 False False False False
# Per-column check: True for each column that holds at least one NaN.
columns_with_nan = df3.isnull().any()
print(columns_with_nan)
A False
B True
C True
D False
dtype: bool
# Whole-frame check: True if any value anywhere in df3 is NaN.
# isnull() already yields booleans, so comparing against True is redundant;
# reducing the underlying boolean array gives a single scalar directly.
print(df3.isnull().values.any())
True
后续再更新