Pandas入门操作

一.Pandas常用数据类型

1.一维数组

# Imports are included here so this first snippet runs on its own; the
# original used pd and np before any import appeared.
import numpy as np
import pandas as pd

# A Series is pandas' one-dimensional array: an index plus the values.
# np.nan marks a missing entry in the data.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)

2.时间序列索引(DatetimeIndex)

# pd.date_range builds a sequence of evenly spaced dates.
#   start / end : first and last date of the range
#   periods     : how many dates to generate
#   freq        : spacing between consecutive dates; one day when omitted
# Six consecutive days starting at 2022-09-20 (default daily spacing).
dates = pd.date_range('20220920', periods=6)
print(dates)

# Yearly spacing, last day of each year.
# An offset object is used instead of the 'A' string alias: pandas 2.2
# deprecated that alias, while the offset classes work on old and new releases.
date1 = pd.date_range(start='20190101', periods=6, freq=pd.offsets.YearEnd())
print(date1)
# Yearly spacing, first day of each year (replaces the deprecated 'AS' alias).
date2 = pd.date_range(start='20190101', periods=6, freq=pd.offsets.YearBegin())
print(date2)

3.二维数组

# `index` supplies the row labels and `columns` the column labels.
# Relies on `dates` created by the date_range snippet above.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=list('abcd'),
)
print(df)

# A 3x4 frame holding 0..11; no labels are given, so rows and columns
# both get default integer labels.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df1)

# Row labels (index) first, then column labels.
print(df1.index, df1.columns, sep='\n')

# Summary statistics for each column.
print(df1.describe())

# Transpose: rows become columns and columns become rows.
print(df1.transpose())

# Sort by column label; ascending=False gives descending order.
print(df1.sort_index(axis='columns', ascending=False))

二.数据相关操作

1.设置值

import numpy as np

import pandas as pd

dates = pd.date_range('20220920', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
# A column can be read by key or by attribute.
print(df['a'], df.a)


# Set a single cell by integer position (third row, third column).
df.iloc[2, 2] = 111

# Set a single cell by label (row 2022-09-22, column 'b').
df.loc['20220922', 'b'] = 2222

# Conditional assignment: every row whose 'a' value is positive is
# overwritten with 3333 in all columns.
df[df.a > 0] = 3333

# Add columns (not rows). 'f' is NaN everywhere.
df['f'] = np.nan
# Assigning a Series aligns on the index, so it must carry the same dates as
# df. The original indexed it from 2022-10-01, which matched no row and left
# column 'e' entirely NaN.
df['e'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)

2.处理丢失的数据

import numpy as np

import pandas as pd

dates = pd.date_range('20220920', periods=6)
# The data is created as float because NaN cannot live in an integer column;
# recent pandas releases warn about the implicit int -> float upcast that the
# assignments below would otherwise trigger.
df = pd.DataFrame(np.arange(24, dtype=float).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan

# axis=0 drops rows; how='any' drops a row that has at least one NaN.
print(df.dropna(axis=0, how='any'))

# how='all' drops a row only when every value in it is NaN.
print(df.dropna(axis=0, how='all'))

# Replace every NaN with 0.
print(df.fillna(0))

# Element-wise mask: True where the value is NaN. isnull has to be called;
# printing the bare attribute only shows the method object.
print(df.isnull())
# True when the frame contains at least one NaN.
print(df.isnull().values.any())

3.合并

import numpy as np

import pandas as pd

# Three 3x4 frames filled with 0, 1 and 2, sharing the same column labels.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

print(df1)
print(df2)
print(df3)

# Stack vertically; ignore_index=True discards the original row labels and
# renumbers the result from 0.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

# Place side by side.
res1 = pd.concat([df1, df2, df3], axis=1)


# Two frames whose columns and row labels only partly overlap.
df4 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df5 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])

# join='outer' keeps the union of the columns; cells with no data become NaN.
res2 = pd.concat([df4, df5], join='outer', ignore_index=True)
# join='inner' keeps only the columns present in both frames.
res3 = pd.concat([df4, df5], join='inner')

# Side-by-side concat that keeps only df4's row labels.
# The join_axes argument no longer exists in pd.concat, so df5 is reindexed
# to df4's rows first. The original also passed df1.index here, although the
# frames being joined are df4 and df5.
res4 = pd.concat([df4, df5.reindex(df4.index)], axis=1)

# Appending rows: DataFrame.append was removed in pandas 2.0, and pd.concat
# is its replacement.
res5 = pd.concat([df1, df2], ignore_index=True)
res6 = pd.concat([df1, df2, df3], ignore_index=True)

# Append a single row held in a Series. The labels must match the columns;
# the original list was missing a comma ('b''c'), giving three labels for
# four values.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res7 = pd.concat([df1, s1.to_frame().T], ignore_index=True)


4.merge合并

import numpy as np

import pandas as pd

left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})

right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})

print(left)

# Merge the two frames on the shared 'key' column.
res = pd.merge(left, right, on='key')
print(res)


# Frames with a two-column key.
left1 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                      'key2': ['K0', 'K1', 'K0', 'K1'],
                      'A': ['A0', 'A1', 'A2', 'A3'],
                      'B': ['B0', 'B1', 'B2', 'B3']})

right1 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                       'key2': ['K0', 'K0', 'K0', 'K0'],
                       'C': ['C0', 'C1', 'C2', 'C3'],
                       'D': ['D0', 'D1', 'D2', 'D3']})

# merge defaults to how='inner'; how='outer' also keeps key pairs that occur
# in only one frame. The two-key frames left1/right1 must be used here:
# left/right have no 'key1' or 'key2' column.
res1 = pd.merge(left1, right1, on=['key1', 'key2'], how='outer')
print(res1)


df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})

df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})

# indicator adds a column (named by the string given) recording whether each
# row came from the left frame only, the right frame only, or both.
res2 = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')

print(res2)


boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})

girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})

# Both frames have an 'age' column; suffixes tell the two copies apart.
res3 = pd.merge(boys, girls, on='k', suffixes=('_boy', '_girl'), how='inner')

print(res3)

5.pandas与matplotlib结合

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

# A Series of 1000 random draws, turned into a running total.
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()

# The same idea with four columns A-D.
data1 = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD"),
).cumsum()
# Alternative kept from the original as a hint: print(data.head()) to peek at
# the data, or data1.plot() followed by plt.show() to plot the whole frame.

# Two scatter plots on the same axes: B against A, then C against A.
# Passing ax= to the second call draws it onto the first plot.
ax = data1.plot.scatter(x='A', y='B', color='DarkBlue', label='Class 1')
data1.plot.scatter(x='A', y='C', color='DarkGreen', label='Class 2', ax=ax)
plt.show()

你可能感兴趣的:(pandas,python,数据分析)