# pandas学习

import pandas as pd
import numpy as np

#Series是一个一维的数据结构
# s = pd.Series([1,2,3,'ss','213'])
# print(s)

#pandas会默认使用0-n来作为index,我们可以指定index
# s = pd.Series([1,2,3,'ss','213'],index=['a','b','c','d','e'])
# print(s)

#可以使用字典构造Series
# dict1 = {'beijing':50000,'haerbin':20000,'hangzhou':30000,'shenyang':25000,'shuzhou':None}
# apt = pd.Series(dict1)
# print(apt)
#
# #可以使用index访问数据
# print(apt['beijing'])
# print(apt[['beijing','shenyang']])
# print(apt[apt >25000])
#
# #可以使用下标赋值
# apt['beijing'] = 49000
# print(apt)
# apt[apt <30000] = 30000
# print(apt)
# #平方
# print(apt**2)
#求和
# cars = pd.Series({'beijing':40000,'shanghai':38000,'hangzhou':40000,'shenyang':20000})
# # print(apt+cars)
#
# #数据缺失
# print('tianjin' in cars)
# print('beijing' in cars)
# print(apt.notnull())
# print(apt.isnull())

#数据结构Dataframe,类比成excel表格
# data ={
#     'city':['beijing','shanghai','shenyang','tianjin','haerbin'],
#     'year':[2000,2001,2002,2003,2004],
#     'pop':[2000,2500,1900,1600,1300]
#        }
# data_frame = pd.DataFrame(data)
# print(data_frame)
#指定列的顺序;data中没有'age'这一列,会自动添加一列空值
# data_frame = pd.DataFrame(data,columns=['city','pop','year','age'])
# print(data_frame)
#columns指的是列名,index指的是行标签
# data_frame = pd.DataFrame(data,columns=['city','pop','year','age'],index=['one','two','three','four','five'])
# # print(data_frame)
#
# #从DataFrame中选择数据
# # print(data_frame['city'])
# # print(data_frame.city) #读取列
# #读取行
# # print(data_frame.ix['one'])
#
# #DataFrame元素赋值
# # data_frame['pop']['one'] = 2100
# # print(data_frame)
#
# #DataFrame给一整列赋值
# data_frame['age'] = 5000
# print(data_frame)
#
# #DataFrame给一行赋值
# data_frame.ix['six'] = ['wuhan',2000,2005,5000]
# print(data_frame)
#
# #DataFrame支持numpy中的各种函数
# # data_frame.age = np.arange(6)
# # data_frame.ix['seven'] = np.arange(4)
# # print(data_frame)
#
# #可以使用pd中的Series来指定index和对应的values进行修改
# val = pd.Series([6000,4900,4800],index=['one','four','six'])
# data_frame['age'] = val
# print(data_frame)   #没有指定的部分会用空值把原来的数字替换掉
#
# #DataFrame可以转置
# # print(data_frame.T)
#
# #DataFrame也支持切片
# print(data_frame['city'][0:2])
# print(data_frame.ix['one'][0:2])
#
# #DataFrame可以给行和列取名字
# data_frame.index.name = 'index'
# # data_frame.columns.name = 'columns'
# print(data_frame)
#
# #index object
#
# #index的值不能更改,也支持数组操作
# index = data_frame.index
# print(index)
# print(index[0:3])
#
# #如何对Series进行切片:Series除了用下标进行切片外,还可以用index值进行切片
# data1 = pd.Series([1,2,3],index = ['r',"a",'c'])
# print(data1)
# print(data1[0:1])
# print(data1["r":"a"]) #这种切片前后都包含到了

#DataFrame切片

# frame = pd.DataFrame(np.arange(9).reshape(3,3),index=['a','b','c'],columns = ['one','two','three'])
# print(frame)
# print(frame[0:2])
# print(frame.ix[0:2])

#可以先按行切,再按列切
# print(frame.ix[['a',"b"],['one','two']])
# print(frame[frame.one == 3]) #打印one的值为3的行

#对列进行切片
# print(frame.ix[:,"one":"two"])

#重新设置index
# print(frame.reindex(['a','1','b']))

#设置默认填充参数
# print(frame.reindex(['a','v','c'],fill_value = 0))

#根据前面或者后面行进行填充
#根据前面的数值进行填充
# frame = pd.DataFrame(['blue','yellow','red'],index = [0,2,4])
# print(frame.reindex(range(7),method='ffill'))
# #根据后面的数值进行填充
# print(frame.reindex(np.arange(6),method ='bfill'))

#reindex可以设置成更改columns
# frame = pd.DataFrame(np.arange(9).reshape(3,3),index=['a','b','c'],columns = ['one','two','three'])
# # print(frame.reindex(columns = ['two','one','three','four'],fill_value= 0))
# print(frame)
# #使用drop删除Series和DataFrame中的index
# print(frame.drop('a'))
# print(frame.drop(columns='one'))
# print(frame.drop(index=['a','b']))
# print(frame.drop('one',axis=1))
#
# #hierarchical(分层级)的Series
# data = pd.Series(np.random.randn(10),index=[["a","a","a","b","b","b","c","c","d","d"],[1,2,3,1,2,3,1,2,1,2]])
# print(data)
# print(data['b'][1:2])
# print(data['b':'d'])
# print(data[1:4])
# #转成DataFrame
# print(data.unstack())
# #将DataFrame转回
# print(data.unstack().stack())
#
# #concatenate将两张表上下连接
# #在concatenate的时候可以给每一个部分加上一个key
# df1 = pd.DataFrame({'apts':[50000,40000],"cars":[10000,20000]},index = ['shanghai','beijing'])
# df2 = pd.DataFrame({'apts':[30000,20000],'cars':[30000,40000]},index=['guangzhou','shenzhen'])
# print(pd.concat([df1,df2]))
# #指定key进行拼接
# #指定key拼接后变成了分层级的index
# print(pd.concat([df1,df2],keys=['x','y']))
#
# #可以设置axis=1变成左右拼接
# # df3 = pd.DataFrame({'salaries':[12222,11111]},index=['shanghai','beijing'])
# # print(pd.concat([df1,df3],axis=1))
#
# #可以配置join=‘inner’参数,去掉不匹配的项
# df4 = pd.DataFrame({'salaries':[11111,2222]},index = ['tianjin','beijing'])
# print(pd.concat([df1,df4],axis=1,join='inner'))
#
# #append 简单粗暴的上下拼接,不会对重复进行处理
# print(df1.append(df2))
# print(df1.append(df4))

#将Series和DataFrame进行拼接
# Series可以看成是单列或者单行的DataFrame
# df3 = pd.DataFrame({'salaries':[12222,11111]},index=['shanghai','beijing'])
# # ser1 = pd.Series([11111],index=['chongqing'])
# # print(pd.concat([df3,ser1]))
# ser1 = pd.Series([1111,22222],index = ['shanghai','beijing'],name='cars')
# print(pd.concat([df3,ser1],axis=1))

#使用Merge拼接两张表
#使用参数on=''来控制以哪列进行拼接,用how='outer'(不匹配的都留下),'left'(遇到不匹配的内容,保留左表的行),'right'(遇到不匹配的内容,保留右表的行)
# df1 = pd.DataFrame({'apts':[5000,60000,7000,8000],'cars':[2000,3000,4000,5000],'citys':['beijing','chongqing','shanghai','shenyang']})
# df2 =pd.DataFrame({'house':[3000,4000,5000,6000],'citys':['beijing','wuhan','shenyang','guangzhou']})
# print(pd.merge(df1,df2,on='citys'))
# print(pd.merge(df1,df2,on='citys',how='outer'))
# print(pd.merge(df1,df2,on='citys',how='right'))
# print(pd.merge(df1,df2,on='citys',how='left'))

#join on index  #可以用how参数
# df1 = pd.DataFrame({'apts':[5000,60000,7000,8000],'cars':[2000,3000,4000,5000]},index=['beijing','chongqing','shanghai','shenyang'])
# df2 =pd.DataFrame({'house':[3000,4000,5000,6000]},index = ['beijing','wuhan','shenyang','guangzhou'])

# print(df1.join(df2))   #留下df1的index,把df2的内容添加到右边
# print(df1.join(df2,how = 'outer')) #使用outer参数保留全部,使用left保留df1的index,使用right保留df2的index

#使用merge同样可以实现join的功能
# print(pd.merge(df1,df2,left_index=True,right_index=True,how='outer'))

#groupby 一般和aggregate一起使用

# data = pd.DataFrame({'salaries':[2000,4000,6000,8000,10000,5000],'name':['lin','lin','lin','bob','bob','bob'],'year':[2000,2001,2002,2000,2001,2002]})
#
# print(data.groupby(['name','year']).sum()) #求每年的和
# print(data.groupby(['name','year']).max())#求每年的最大值
# print(data.groupby(['name','year']).size())#求每个分组(name,year)的数据条数
# print(data.groupby(['name','year']).describe()) #详细信息

# Read the CSV file 'bikes.csv' (expected in the working directory).
# Fields are ';'-separated and the text is decoded as latin1; the 'Date'
# column is parsed as day-first dates and becomes the index.
bikes = pd.read_csv(
    'bikes.csv',
    sep=';',
    encoding='latin1',
    index_col='Date',
    parse_dates=['Date'],
    dayfirst=True,
)

# print(bikes.head())
# dropna removes entries containing NaN: rows by default, columns with axis=1.
# With how='all' a column is dropped only when every value in it is NaN.
bike = bikes.dropna(how='all', axis=1)
# print(bike.head())

# Disabled example: riders per day of the week for the single 'Berri 1' column.
# bike_berri_row = bike['Berri 1']
# print(bike_berri_row.head())
# print(bike_berri_row.index.weekday)  # .weekday gives the day of the week of each date
# weekday = pd.Series(bike_berri_row.index.weekday,index=bike_berri_row.index,name='weekday')
# bike_berri_row_week=pd.concat([bike_berri_row,weekday],axis=1)
# print(bike_berri_row_week)
# print(bike_berri_row_week.groupby(['weekday']).sum())
#
# Add up all remaining columns for each date (presumably one column per
# route), then total those per-date sums for each day of the week.
daily_total = bike.sum(axis=1)
bike = daily_total.to_frame()
# print(bike)
# Day-of-week number for every date (pandas: Monday=0 ... Sunday=6),
# aligned on the same date index so it can be concatenated as a column.
weekday = pd.Series(bike.index.weekday, index=bike.index, name='weekday')
# print(weekday)
bike_week = pd.concat([bike, weekday], axis=1)
print(bike_week)
weekday_total = bike_week.groupby(['weekday']).sum()
print(weekday_total)

# 你可能感兴趣的:(pandas学习)