一.Pandas常用数据类型
1.一维数组
# Imports so this first snippet runs on its own, like every later section.
import numpy as np
import pandas as pd

# A Series is pandas' one-dimensional array: an index plus the values.
# np.nan marks a missing entry.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)
2.时间序列矩阵
# pd.date_range(start=None, end=None, periods=None, freq='D', tz=None,
#               normalize=False, name=None, closed=None, **kwargs)
#   start / end : first and last timestamp of the range
#   periods     : how many timestamps to generate
#   freq        : spacing between timestamps; the default 'D' is one day
# NOTE(review): recent pandas versions spell the `closed` option
# `inclusive` -- check the documentation of the installed version.

# Six consecutive days (default daily spacing).
dates = pd.date_range('20220920', periods=6)
print(dates)

# Yearly spacing, anchored on the last day of each year.
# An offset object is used instead of the string alias 'A', which recent
# pandas versions deprecate; the generated dates are the same.
date1 = pd.date_range(start='20190101', periods=6, freq=pd.offsets.YearEnd())
print(date1)

# Yearly spacing, anchored on the first day of each year
# (offset object instead of the deprecated alias 'AS').
date2 = pd.date_range(start='20190101', periods=6, freq=pd.offsets.YearBegin())
print(date2)
3.二维数组
# `index` supplies the row labels, `columns` the column labels.
# Values are 6x4 standard-normal draws; rows are labelled with `dates`.
df = pd.DataFrame(
    np.random.randn(6, 4),
    columns=list('abcd'),
    index=dates,
)
print(df)
# 3x4 frame holding 0..11 with default integer row and column labels.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df1)
# Row labels.
print(df1.index)
# Column labels.
print(df1.columns)
# Summary statistics for each column.
print(df1.describe())
# Transpose (rows become columns).
print(df1.transpose())
# Sort by column label; ascending=False gives descending order.
print(df1.sort_index(axis='columns', ascending=False))
二.数据相关操作
1.设置值
import numpy as np
import pandas as pd
dates = pd.date_range('20220920', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
# Two equivalent ways to select column 'a'.
print(df['a'], df.a)
# Set a single cell by integer position (row 2, column 2).
df.iloc[2, 2] = 111
# Set a single cell by label (row 2022-09-22, column 'b').
df.loc['20220922', 'b'] = 2222
# Conditional assignment: every column of the rows where a > 0 becomes 3333.
df[df.a > 0] = 3333
# Add a new column filled with NaN.
df['f'] = np.nan
# Add a column from a Series. Assignment aligns on the index, so the Series
# must carry the same dates as df; a Series indexed by other dates would
# leave the whole column NaN.
df['e'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)
2.处理丢失的数据
import numpy as np
import pandas as pd
dates = pd.date_range('20220920', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
# Knock out two cells to simulate missing data.
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
# axis=0 drops rows; how='any' drops a row if it holds at least one NaN.
print(df.dropna(axis=0, how='any'))
# how='all' drops a row only when every value in it is NaN.
print(df.dropna(axis=0, how='all'))
# Replace every NaN with 0.
print(df.fillna(0))
# Boolean mask, True where a value is missing. isnull must be *called*;
# printing `df.isnull` without parentheses only shows the bound method.
null_mask = df.isnull()
print(null_mask)
# Single flag: does the frame contain any missing value at all?
has_null = bool(null_mask.values.any())
print(has_null)
3.合并
import numpy as np
import pandas as pd
# Three 3x4 frames filled with 0, 1 and 2 respectively.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
print(df1)
print(df2)
print(df3)
# Stack vertically; ignore_index=True discards the old row labels and
# renumbers the result 0..n-1.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
# Place side by side.
res1 = pd.concat([df1, df2, df3], axis=1)
# Two frames whose columns and row labels only partly overlap.
df4 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df5 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
# join='outer' keeps the union of the columns (missing cells become NaN).
res2 = pd.concat([df4, df5], join='outer', ignore_index=True)
# join='inner' keeps only the columns both frames share.
res3 = pd.concat([df4, df5], join='inner')
# Side-by-side concat restricted to the row labels of the left frame.
# The old `join_axes=` argument no longer exists in pd.concat, so the right
# frame is reindexed to the left frame's index before concatenating.
rse4 = pd.concat([df4, df5.reindex(df4.index)], axis=1)
# DataFrame.append was removed in pandas 2.0; pd.concat does the same
# row-wise stacking.
res5 = pd.concat([df1, df2], ignore_index=True)
res6 = pd.concat([df1, df2, df3], ignore_index=True)
# Append a single row given as a Series: turn it into a one-row frame first.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res7 = pd.concat([df1, s1.to_frame().T], ignore_index=True)
4.merge合并
import numpy as np
import pandas as pd
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
# Merge on the single shared column 'key'.
res = pd.merge(left, right, on='key')
print(res)

left1 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                      'key2': ['K0', 'K1', 'K0', 'K1'],
                      'A': ['A0', 'A1', 'A2', 'A3'],
                      'B': ['B0', 'B1', 'B2', 'B3']})
right1 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                       'key2': ['K0', 'K0', 'K0', 'K0'],
                       'C': ['C0', 'C1', 'C2', 'C3'],
                       'D': ['D0', 'D1', 'D2', 'D3']})
# Merge on two key columns. The default is how='inner'; 'outer' keeps key
# pairs that appear in either frame. The two-key frames left1/right1 must
# be used here: left/right have no 'key1'/'key2' columns.
res1 = pd.merge(left1, right1, on=['key1', 'key2'], how='outer')
print(res1)

# indicator= adds a column telling where each row came from
# (left_only / right_only / both).
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
res2 = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res2)

# suffixes= disambiguates the non-key column name both frames share ('age').
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
res3 = pd.merge(boys, girls, on='k', suffixes=('_boy', '_girl'), how='inner')
print(res3)
5.pandas与matplotlib结合
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Running total of 1000 standard-normal draws, indexed 0..999.
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()
# Same idea for four columns A-D.
data1 = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=['A', 'B', 'C', 'D'],
).cumsum()
# Alternative left disabled: inspect the Series and draw all columns as lines.
# print(data.head())
# data1.plot()
# plt.show()
# Scatter A against B, then overlay A against C on the same axes.
ax = data1.plot.scatter(x='A', y='B', color='DarkBlue', label='Class 1')
data1.plot.scatter(x='A', y='C', color='DarkGreen', label='Class 2', ax=ax)
plt.show()