pip install pandas
import pandas as pd
pd.Series([4,7,-5,3]) #S注意大小写
pd.Series([4,7,-5,3],index=['a','b','c','d'])
pd.Series({'a':1, 'b':2})
pd.Series(0, index=['a','b','c','d'])
运行实例:
>>> import pandas as pd
>>> pd.Series(range(10,14,1)) #创建Series,默认以0,1,2...为索引
0 10
1 11
2 12
3 13
dtype: int64
>>> pd.Series(range(10,14,1),index=list('abcd')) #创建series列表 以a,b,c,d为序列
a 10
b 11
c 12
d 13
dtype: int64
values 属性和index属性:
#【调用方式】
>>> import pandas as pd
>>> a=pd.Series(range(10,14,1),index=list('abcd'))
>>> a
a 10
b 11
c 12
d 13
dtype: int64
>>> a['a']
10
>>> a['d']
13
>>> a[0] #也可用位置索引
10
>>> a[3]
13
#【获取索引】
>>> a.index
Index(['a', 'b', 'c', 'd'], dtype='object')
#【获取值】
>>> a.values
array([10, 11, 12, 13], dtype=int64)
#【获取单个索引、值】
>>> a.index[0]
'a'
>>> a.values[1]
11
sr表示:series
示例:
>>> import numpy as np
>>> import pandas as pd
#【用np创建series列表】
>>> a=pd.Series(np.arange(4))
0 0
1 1
2 2
3 3
dtype: int32
#【与标量运算】
>>> a+2
0 2
1 3
2 4
3 5
dtype: int32
#【两个Series运算】
>>> a+a
0 0
1 2
2 4
3 6
#【从字典创建】
>>> b=pd.Series({'a':1,'b':3,'c':5})
>>> b
a 1
b 3
c 5
dtype: int64
#【in运算】
>>> 'a' in b #'a'是否在b里
True
>>> b.get('ccc',default=0) #获取键对应值,如果键不存在,返回默认值0
0
#【键索引】
>>> b['a']
1
>>> b[['a','c']]
a 1
c 5
dtype: int64
#【键切片】
>>> b['a':'c']
a 1
b 3
c 5
dtype: int64
整数索引(键)的pandas对象比较难
例:
>>> sr= pd.Series(np.arange(4.))
>>> sr
0 0.0
1 1.0
2 2.0
3 3.0
dtype: float64
>>> sr[-1] #将报错
Traceback (most recent call last):
File "" , line 1, in <module>
....
KeyError: -1
>>> e=pd.Series({1:5,2:4,3:3,4:2,5:1})
>>> e
1 5
2 4
3 3
4 2
5 1
dtype: int64
>>> e[5] #5指的是键5,而不是下标5
1
>>> e[0] #希望用下标0进行索引,它将报错
Traceback (most recent call last):
File "" , line 1, in <module>
....
KeyError: 0
#【重点:指定用loc(键),iloc(下标)索引】
>>> e.iloc[1] #指定下标索引
4
>>> e.loc[1] #指定键索引
5
pandas在运算时,会按索引进行对齐然后计算。如果存在不同的索引,则结果的索引是两个操作数索引的并集。
sr1 = pd.Series([12,23,34], index=['a','d','c'])
sr2 = pd.Series([11,20,10], index=['d','c','a'])
sr1+sr2
'''
结果:
a 22
c 54
d 34
'''
sr3 = pd.Series([11,20,10,14], index=['d','c','a','b'])
sr1+sr3
'''
结果:
a 22.0
b NaN
c 54.0
d 34.0
dtype: float64
'''
如何在两个Series对象相加时将缺失值设为0?
sr1.add(sr3, fill_value=0)
灵活的算术方法: add, sub, div, mul
sr1.sub(sr2,fill_value=11111)
>>> s1=pd.Series(range(4),index=list('acde'))
>>> s2=pd.Series(range(3),index=list('cdf'))
>>> s1
a 0
c 1
d 2
e 3
dtype: int64
>>> s2
c 0
d 1
f 2
dtype: int64
>>> s1+s2
a NaN
c 1.0
d 3.0
e NaN
f NaN
dtype: float64
>>> s1.add(s2,fill_value=8888)
a 8888.0
c 1.0
d 3.0
e 8891.0
f 8890.0
dtype: float64
缺失数据:使用NaN (Not a Number)来表示缺失数据。其值等于np.nan。
内置的None值也会被当做NaN处理。
处理缺失数据的相关方法:
import pandas as pd
>>> s1=pd.Series(range(4),index=list('acde'))
>>> s2=pd.Series(range(3),index=list('cdf'))
>>> s1+s2
a NaN
c 1.0
d 3.0
e NaN
f NaN
dtype: float64
>>> c=_ #【_表示上一步的结果,】
>>> c
a NaN
c 1.0
d 3.0
e NaN
f NaN
dtype: float64
#-----------【扔掉NaN】------
>>> c.dropna()
c 1.0
d 3.0
dtype: float64
#----【同义:挑出 非isnull 的值】------
>>> c[~c.isnull()]
c 1.0
d 3.0
dtype: float64
#----【同义:挑出notnull】---
>>> c[c.notnull()]
c 1.0
d 3.0
dtype: float64
#--------【填充nan】------
>>> c
a NaN
c 1.0
d 3.0
e NaN
f NaN
>>> c.fillna(000)
a 0.0
c 1.0
d 3.0
e 0.0
f 0.0
dtype: float64
pd.DataFrame({'one':[1,2,3],'two':[7,8,9]})
b=pd.DataFrame({'one':pd.Series(['a','b','c'],index=[1,2,3]),'two':pd.Series(range(4,7,1),index=list('abc'))})
…
实例:
>>> import pandas as pd
#--------【不指定索引index】--------------
>>> a=pd.DataFrame({'one':[1,2,3],'two':[7,8,9]})
>>> a
one two
0 1 7
1 2 8
2 3 9
#--------【指定索引index,所有列共用一个索引,内部用range,index快捷创建值,和索引】----------------
>>> b=pd.DataFrame({'one':pd.Series(['a','b','c'],index=[1,2,3]),'two':pd.Series
(range(4,7,1),index=list('abc'))})
>>> b
one two
1 a NaN
2 b NaN
3 c NaN
a NaN 4.0
b NaN 5.0
c NaN 6.0
#------------【指定索引index2】---------------
>>> c=pd.DataFrame({'one':pd.Series([1,2,3],index=['a','b','c']),'two':pd.Series
(range(4,8,1),index=list('bacd'))})
>>> c
one two
a 1.0 5
b 2.0 4
c 3.0 6
d NaN 7
读取:pd.read_csv('filename.csv')
保存:df.to_csv('new.csv')
首先:进入csv目录方法有两种:
然后:进入python输入如下
import pandas as pd
#---【读取方式1】---
pd.read_csv('jr2.csv')
id date open close high low volume code
0 0 2007-3-1 20.123 21.569 30.123 30.123 159899 15223
1 1 2007-3-1 21.123 22.569 31.123 31.123 159900 15223
.....
18 18 2008-9-8 38.123 39.569 48.123 48.123 159917 15224
#-------【读取方式2】---
>>> x=open('jr2.csv')
>>> pd.read_csv(x) #返回列表省略
#-----------【保存CSV】--------
>>> a=pd.read_csv('jr2.csv') #先读取
>>> a.to_csv('newjr.csv') #然后保存成新的文件
>>> import pandas as pd
>>> a=pd.read_csv('jr2.csv')
>>> a.to_csv('newjr.csv')
#【获取行索引】
>>> a.index
RangeIndex(start=0, stop=19, step=1)
#【获取列索引】
>>> a.columns
Index(['id', 'date', 'open', 'close', 'high', 'low', 'volume', 'code'], dtype='object')
#【返回值】
>>> a.values
array([[0, '2007-3-1', 20.123, 21.569, 30.123, 30.123, 159899, 15223],
[1, '2007-3-1', 21.123, 22.569, 31.123, 31.123, 159900, 15223],
...
]],dtype=object)
#【概述】给出数量、平均值、标准差、最小、最大,第1/4位的数,中位数等信息
>>> a=pd.read_csv('jr.csv')
>>> a.describe()
id open close ... low volume c
ode
count 19.000000 19.000000 19.000000 ... 19.000000 19.000000 19.000
000
mean 9.000000 29.123000 30.569000 ... 39.123000 159908.000000 15223.631
579
std 5.627314 5.627314 5.627314 ... 5.627314 5.627314 0.495
595
min 0.000000 20.123000 21.569000 ... 30.123000 159899.000000 15223.000
000
25% 4.500000 24.623000 26.069000 ... 34.623000 159903.500000 15223.000
000
50% 9.000000 29.123000 30.569000 ... 39.123000 159908.000000 15224.000
000
75% 13.500000 33.623000 35.069000 ... 43.623000 159912.500000 15224.000
000
max 18.000000 38.123000 39.569000 ... 48.123000 159917.000000 15224.000
000
[8 rows x 7 columns]
>>>
#【给第一列索引标题一个名字】
>>> a.index.name='n_id'
>>> a #注意,第一列名字多了一个n_id名字
id date open close high low volume code
n_id
0 0 2007-3-1 20.123 21.569 30.123 30.123 159899 15223
1 1 2007-3-1 21.123 22.569 31.123 31.123 159900 15223
按上一列:
#【给指定列的标题换个新名字】注意,第2、3列标题
>>> a.rename(columns={'close':'new_close','open':'new_open'})
id date new_open new_close high low volume code
n_id
0 0 2007-3-1 20.123 21.569 30.123 30.123 159899 15223
表格内容:
id date new_open new_close high low volume code
0 0 2007-3-1 20.123 21.569 30.123 30.123 159899 15223
1 1 2007-3-1 21.123 22.569 31.123 31.123 159900 15223
2 2 2007-3-1 22.123 23.569 32.123 32.123 159901 15223
3 3 2007-3-1 23.123 24.569 33.123 33.123 159902 15223
4 4 2007-4-1 24.123 25.569 34.123 34.123 159903 15223
5 5 2007-4-2 25.123 26.569 35.123 35.123 159904 15224
6 6 2007-4-3 26.123 27.569 36.123 36.123 159905 15223
7 7 2007-4-4 27.123 28.569 37.123 37.123 159906 15224
8 8 2007-4-5 28.123 29.569 38.123 38.123 159907 15223
9 9 2007-4-6 29.123 30.569 39.123 39.123 159908 15224
10 10 2007-4-7 30.123 31.569 40.123 40.123 159909 15224
11 11 2008-9-1 31.123 32.569 41.123 41.123 159910 15224
12 12 2008-9-2 32.123 33.569 42.123 42.123 159911 15224
13 13 2008-9-3 33.123 34.569 43.123 43.123 159912 15224
14 14 2008-9-4 34.123 35.569 44.123 44.123 159913 15224
15 15 2008-9-5 35.123 36.569 45.123 45.123 159914 15224
16 16 2008-9-6 36.123 37.569 46.123 46.123 159915 15224
17 17 2008-9-7 37.123 38.569 47.123 47.123 159916 15224
18 18 2008-9-8 38.123 39.569 48.123 48.123 159917 15224
#【date列的第3个值】
>>> a['date'][2]
'2007-3-1'
#【拿date、open两列】
>>> a[['date','open']]
date open
0 2007-3-1 20.123
1 2007-3-1 21.123
...
18 2008-9-8 38.123
#【取0-9行所有】
>>> a[0:10]
id date open close high low volume code
0 0 2007-3-1 20.123 21.569 30.123 30.123 159899 15223
1 1 2007-3-1 21.123 22.569 31.123 31.123 159900 15223
...
9 9 2007-4-6 29.123 30.569 39.123 39.123 159908 15224
#【取前3行,第close,open列所有值】
>>> a[0:3][['close','open']]
close open
0 21.569 20.123
1 22.569 21.123
2 23.569 22.123
df.loc[:,['a','b']] 和 df.iloc[0:2,0:2]
#【行按索引,取所有行,date,close列】
>>> a.loc[:,['date','close']]
date close
0 2007-3-1 21.569
1 2007-3-1 22.569
2 2007-3-1 23.569
...
18 2008-9-8 39.569
#【行按索引,取1到3行,date,close列】
>>> a.loc[1:3,['date','close']]
date close
1 2007-3-1 22.569
2 2007-3-1 23.569
3 2007-3-1 24.569
#【行列都按下标(位置),取第0-2行,第2-3列的值(切片不含末端)】
>>> a.iloc[:3,2:4]
open close
0 20.123 21.569
1 21.123 22.569
2 22.123 23.569
通过布尔值过滤:
df[df['A']>0]
df[df['A'].isin([1,3,5])]
df[df<0] = 0
#【close列哪些大于20】
>>> a['close']>20
0 True
1 True
...
5 True
#【★取出open列大于18的所有值】
>>> a[a['open']>18]
id date open close high low volume code
0 0 2007-3-1 20.123 21.569 30.123 30.123 159899 15223
1 1 2007-3-1 21.123 22.569 31.123 31.123 159900 15223
...
5 5 2007-4-2 25.123 26.569 35.123 35.123 159904 15224
#【把所有大于30的数找出来】
>>> import pandas as pd
>>> a=pd.read_csv('jr.csv')
>>> a2=a.loc[:,'open':'volume'].copy() #取出除时间外所有列(因为第一列时间是字符串,数据类型不同)
>>> a2
open close high low volume
0 20.123 21.569 30.123 30.123 159899
1 21.123 22.569 31.123 31.123 159900
....
18 38.123 39.569 48.123 48.123 159917
>>> a2>30 #看看哪些大于30,返回布尔
open close high low volume
0 False False True True True
1 False False True True True
...
18 True True True True True
>>> a2[a2>30] #只显示大于30的数
open close high low volume
0 NaN NaN 30.123 30.123 159899
1 NaN NaN 31.123 31.123 159900
2 NaN NaN 32.123 32.123 159901
3 NaN NaN 33.123 33.123 159902
...
17 37.123 38.569 47.123 47.123 159916
18 38.123 39.569 48.123 48.123 159917
#-----【把不大于30的数改为0(大于30的数保留)】---------
>>> a2[a2>30].fillna(0) #用fillna()原因见上一步,NaN
open close high low volume
0 0.000 0.000 30.123 30.123 159899
1 0.000 0.000 31.123 31.123 159900
...
17 37.123 38.569 47.123 47.123 159916
18 38.123 39.569 48.123 48.123 159917
#-------------【挑出含指定内容的行】------
>>> a[a['date'].isin(['2007-3-1','2007-4-2'])]
id date open close high low volume code
0 0 2007-3-1 20.123 21.569 30.123 30.123 159899 15223
1 1 2007-3-1 21.123 22.569 31.123 31.123 159900 15223
2 2 2007-3-1 22.123 23.569 32.123 32.123 159901 15223
3 3 2007-3-1 23.123 24.569 33.123 33.123 159902 15223
5 5 2007-4-2 25.123 26.569 35.123 35.123 159904 15224
#------【笨方法】
>>> a[(a['date']=='2007-3-1') | (a['date']=='2007-4-2')]
id date open close high low volume code
0 0 2007-3-1 20.123 21.569 30.123 30.123 159899 15223
1 1 2007-3-1 21.123 22.569 31.123 31.123 159900 15223
2 2 2007-3-1 22.123 23.569 32.123 32.123 159901 15223
3 3 2007-3-1 23.123 24.569 33.123 33.123 159902 15223
5 5 2007-4-2 25.123 26.569 35.123 35.123 159904 15224
#【当所有数据类型统一没有‘字符串’时,大于指定值的所有数据都变成0】
>>> a2[a2>30]=0
>>> a2
open close high low volume
0 20.123 21.569 0.0 0.0 0
1 21.123 22.569 0.0 0.0 0
2 22.123 23.569 0.0 0.0 0
3 23.123 24.569 0.0 0.0 0
4 24.123 25.569 0.0 0.0 0
5 25.123 26.569 0.0 0.0 0
6 26.123 27.569 0.0 0.0 0
7 27.123 28.569 0.0 0.0 0
8 28.123 29.569 0.0 0.0 0
9 29.123 0.000 0.0 0.0 0
#【两个Df可直接相加,标题相同的列相加后对齐,没有的列数据做NaN处理】
>>> a+a2
close code date high id low open volume
0 43.138 NaN NaN 30.123 NaN 30.123 40.246 159899
1 45.138 NaN NaN 31.123 NaN 31.123 42.246 159900
2 47.138 NaN NaN 32.123 NaN 32.123 44.246 159901
3 49.138 NaN NaN 33.123 NaN 33.123 46.246 159902
2)缺失数据处理
#接上一步a+a2
>>> b=_
#【isnull()】
>>> b.isnull()
close code date high id low open volume
0 False True True False True False False False
1 False True True False True False False False
...
#【notnull()】
>>> b.notnull()
close code date high id low open volume
0 True False False True False True True True
1 True False False True False True True True
#【fillna()】
>>> b
close code date high id low open volume
0 43.138 NaN NaN 30.123 NaN 30.123 40.246 159899
1 45.138 NaN NaN 31.123 NaN 31.123 42.246 159900
2 47.138 NaN NaN 32.123 NaN 32.123 44.246 159901
3 49.138 NaN NaN 33.123 NaN 33.123 46.246 159902
#【nan填充为0.001】
>>> b.fillna(0.001)
close code date high id low open volume
0 43.138 0.001 0.001 30.123 0.001 30.123 40.246 159899
1 45.138 0.001 0.001 31.123 0.001 31.123 42.246 159900
2 47.138 0.001 0.001 32.123 0.001 32.123 44.246 159901
3 49.138 0.001 0.001 33.123 0.001 33.123 46.246 159902
#【dropna()】
>>> b.dropna() #不填参数意思是:把含有nan的行数据全部删除,所以结果将为空表
Empty DataFrame
Columns: [close, code, date, high, id, low, open, volume]
Index: []
#【加参数:把一行所有数据都为NaN的数据删除】,结果都在,因为没一行是全为NaN的
>>> b.dropna(how='all')
close code date high id low open volume
0 43.138 NaN NaN 30.123 NaN 30.123 40.246 159899
1 45.138 NaN NaN 31.123 NaN 31.123 42.246 159900
2 47.138 NaN NaN 32.123 NaN 32.123 44.246 159901
...
18 39.569 NaN NaN 48.123 NaN 48.123 38.123 159917
#【axis=0表示看行,=1看列】
>>> b.dropna(how='all',axis=1) #结果删除了整列全为NaN的列
close high low open volume
0 43.138 30.123 30.123 40.246 159899
1 45.138 31.123 31.123 42.246 159900
...
18 39.569 48.123 48.123 38.123 159917
#按索引列降序排列
>>> b.sort_index(ascending=False)
close code date high id low open volume
18 39.569 NaN NaN 48.123 NaN 48.123 38.123 159917
17 38.569 NaN NaN 47.123 NaN 47.123 37.123 159916
...
1 45.138 NaN NaN 31.123 NaN 31.123 42.246 159900
0 43.138 NaN NaN 30.123 NaN 30.123 40.246 159899
>>> b.sort_index(ascending=False,axis=1) # 按标题降序排列
volume open low id high date code close
0 159899 40.246 30.123 NaN 30.123 NaN NaN 43.138
1 159900 42.246 31.123 NaN 31.123 NaN NaN 45.138
>>> import pandas as pd
>>> a=pd.read_csv('jr.csv')
>>> a
id date open close high low volume code
0 0 2007-3-1 20.123 21.569 30.123 30.123 159899 15223
1 1 2007-3-1 21.123 22.569 31.123 31.123 159900 15223
2 2 2007-3-1 22.123 23.569 32.123 32.123 159901 15223
3 3 2007-3-1 23.123 24.569 33.123 33.123 159902 15223
...
#【close值降序排列】(看close列,从大到小,原,从小到大)
>>> a.sort_values('close',ascending=False)
id date open close high low volume code
18 18 2008-9-8 38.123 39.569 48.123 48.123 159917 15224
17 17 2008-9-7 37.123 38.569 47.123 47.123 159916 15224
16 16 2008-9-6 36.123 37.569 46.123 46.123 159915 15224
15 15 2008-9-5 35.123 36.569 45.123 45.123 159914 15224
14 14 2008-9-4 34.123 35.569 44.123 44.123 159913 15224
NumPy的通用函数同样适用于pandas
>>> b=a.loc[:,'open':'code']
>>> b
open close high low volume code
0 20.123 21.569 30.123 30.123 159899 15223
1 21.123 22.569 31.123 31.123 159900 15223
2 22.123 23.569 32.123 32.123 159901 15223
3 23.123 24.569 33.123 33.123 159902 15223
>>> b.abs()#numpy函数的使用
open close high low volume code
0 20.123 21.569 30.123 30.123 159899.0 15223.0
1 21.123 22.569 31.123 31.123 159900.0 15223.0
2 22.123 23.569 32.123 32.123 159901.0 15223.0
3 23.123 24.569 33.123 33.123 159902.0 15223.0
4 24.123 25.569 34.123 34.123 159903.0 15223.0
5 25.123 26.569 35.123 35.123 159904.0 15224.0
#【applymap(func)将函数应用在DataFrame各个元素上】
>>> b.applymap(lambda x:x+1) #此处是用lambda(λ一个匿名函数),也可用普通函数直接写进来
open close high low volume code
0 21.123 22.569 31.123 31.123 159900 15224
1 22.123 23.569 32.123 32.123 159901 15224
2 23.123 24.569 33.123 33.123 159902 15224
层次化索引是Pandas的一项重要功能,它使我们能够在一个轴上拥有多个索引级别。
例: data=pd.Series(np.random.rand(9),index=[['a','a','a','b','b','b','c','c','c'],[1,2,3,1,2,3,1,2,3]])
>>> import numpy as np
#【创建一个多层索引的列表】
>>> data=pd.Series(np.random.rand(9),index=[['a','a','a','b','b','b','c','c','c'
],[1,2,3,1,2,3,1,2,3]])
>>> data
a 1 0.970743
2 0.048131
3 0.360269
b 1 0.311093
2 0.866132
3 0.591954
c 1 0.991314
2 0.141202
3 0.960226
dtype: float64
>>> data['a'] #索引
1 0.970743
2 0.048131
3 0.360269
dtype: float64
>>> data['a'][2] #索引多层
0.04813066871483096
例
#用sep参数自定义分隔符(read_table通用)
pd.read_csv('jr.csv',sep='\t')
#sep参数也可以用正则表达式(\s+ 表示任意长度空白),正则建议写成原始字符串r'...'
pd.read_csv('jr.csv',sep=r'\s+')
#【如果文件第一行不是标题名】
pd.read_csv('jr.csv',sep='\s+',header=None) #注意:header不能传布尔值(如False),无标题行时应使用header=None
#【第一行没有标题,自取一个】
pd.read_csv('jr.csv',sep='\s+',header=None,names=['id','date',...])
#【指定date列为索引】
>>> a2=pd.read_csv('jr.csv',index_col='date')
>>> a2
id open close high low volume code
date
2007-3-1 0 20.123 21.569 30.123 30.123 159899 15223
2007-3-1 1 21.123 22.569 31.123 31.123 159900 15223
2007-3-1 2 22.123 23.569 32.123 32.123 159901 15223
2007-3-1 3 23.123 24.569 33.123 33.123 159902 15223
2007-4-1 4 24.123 25.569 34.123 34.123 159903 15223
2007-4-2 5 25.123 26.569 35.123 35.123 159904 15224
#【把时间转换成专用time类型】
>>> a2=pd.read_csv('jr.csv',parse_dates=['date'])
#【把指定符号按非数NaN处理】
>>> a2=pd.read_csv('jr.csv',na_values=['none','null','nan'...])
#【防止写新文件时自动加入新索引列】index=False
>>> a.to_csv('new2.csv',index=False)
>>> import pandas as pd
#【返回19.1.1-19.2.28间的所有工作日】
>>> pd.date_range('2019-01-01','2019-02-28',freq='B')
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-07', '2019-01-08', '2019-01-09', '2019-01-10',
'2019-01-11', '2019-01-14', '2019-01-15', '2019-01-16',
'2019-01-17', '2019-01-18', '2019-01-21', '2019-01-22',
'2019-01-23', '2019-01-24', '2019-01-25', '2019-01-28',
'2019-01-29', '2019-01-30', '2019-01-31', '2019-02-01',
'2019-02-04', '2019-02-05', '2019-02-06', '2019-02-07',
'2019-02-08', '2019-02-11', '2019-02-12', '2019-02-13',
'2019-02-14', '2019-02-15', '2019-02-18', '2019-02-19',
'2019-02-20', '2019-02-21', '2019-02-22', '2019-02-25',
'2019-02-26', '2019-02-27', '2019-02-28'],
dtype='datetime64[ns]', freq='B')
#【每半个月返回一次】
>>> pd.date_range('2019-01-01','2019-12-28',freq='SM')
DatetimeIndex(['2019-01-15', '2019-01-31', '2019-02-15', '2019-02-28',
'2019-03-15', '2019-03-31', '2019-04-15', '2019-04-30',
'2019-05-15', '2019-05-31', '2019-06-15', '2019-06-30',
'2019-07-15', '2019-07-31', '2019-08-15', '2019-08-31',
'2019-09-15', '2019-09-30', '2019-10-15', '2019-10-31',
'2019-11-15', '2019-11-30', '2019-12-15'],
dtype='datetime64[ns]', freq='SM-15')
#【不指截止日期,按时间长度,往后产生20天(freq默认是天)】
>>> pd.date_range('2019-01-01',periods=20)
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06', '2019-01-07', '2019-01-08',
'2019-01-09', '2019-01-10', '2019-01-11', '2019-01-12',
'2019-01-13', '2019-01-14', '2019-01-15', '2019-01-16',
'2019-01-17', '2019-01-18', '2019-01-19', '2019-01-20'],
dtype='datetime64[ns]', freq='D')
时间序列就是以时间对象为索引的Series或DataFrame。
datetime对象作为索引时是存储在DatetimeIndex对象中的。
>>> import pandas as pd
#【读取表格,让date列作为行索引,并把时间解析为时间对象】
>>> a=pd.read_csv('jr.csv',index_col='date',parse_dates=['date'])
>>> a
id open close high low volume code
date
2007-03-01 0 20.123 21.569 30.123 30.123 159899 15223
2007-03-01 1 21.123 22.569 31.123 31.123 159900 15223
2007-03-01 2 22.123 23.569 32.123 32.123 159901 15223
2007-03-01 3 23.123 24.569 33.123 33.123 159902 15223
2007-04-01 4 24.123 25.569 34.123 34.123 159903 15223
2007-04-02 5 25.123 26.569 35.123 35.123 159904 15224
2007-04-03 6 26.123 27.569 36.123 36.123 159905 15223
2007-04-04 7 27.123 28.569 37.123 37.123 159906 15224
2007-04-05 8 28.123 29.569 38.123 38.123 159907 15223
...
>>> type(a.index[0]) #显示索引的类型(时间对象)
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
#【返回所有2008年的数据】
>>> a.loc['2008'] #旧版pandas可写a['2008'];pandas 2.0起按行做部分日期字符串索引需用.loc
id open close high low volume code
date
2008-09-01 11 31.123 32.569 41.123 41.123 159910 15224
2008-09-02 12 32.123 33.569 42.123 42.123 159911 15224
2008-09-03 13 33.123 34.569 43.123 43.123 159912 15224
2008-09-04 14 34.123 35.569 44.123 44.123 159913 15224
2008-09-05 15 35.123 36.569 45.123 45.123 159914 15224
2008-09-06 16 36.123 37.569 46.123 46.123 159915 15224
...
#【返回所有2007年4月数据】
>>> a.loc['2007-04'] #旧版pandas可写a['2007-04'];pandas 2.0起按行做部分日期字符串索引需用.loc
id open close high low volume code
date
2007-04-01 4 24.123 25.569 34.123 34.123 159903 15223
2007-04-02 5 25.123 26.569 35.123 35.123 159904 15224
2007-04-03 6 26.123 27.569 36.123 36.123 159905 15223
2007-04-04 7 27.123 28.569 37.123 37.123 159906 15224
2007-04-05 8 28.123 29.569 38.123 38.123 159907 15223
2007-04-06 9 29.123 30.569 39.123 39.123 159908 15224
2007-04-07 10 30.123 31.569 40.123 40.123 159909 15224
#【返回2007-3月到4月数据】
>>> a['2007-03':'2007-04']
id open close high low volume code
date
2007-03-01 0 20.123 21.569 30.123 30.123 159899 15223
2007-03-01 1 21.123 22.569 31.123 31.123 159900 15223
2007-03-01 2 22.123 23.569 32.123 32.123 159901 15223
2007-03-01 3 23.123 24.569 33.123 33.123 159902 15223
2007-04-01 4 24.123 25.569 34.123 34.123 159903 15223
2007-04-02 5 25.123 26.569 35.123 35.123 159904 15224
2007-04-03 6 26.123 27.569 36.123 36.123 159905 15223
2007-04-04 7 27.123 28.569 37.123 37.123 159906 15224
2007-04-05 8 28.123 29.569 38.123 38.123 159907 15223
2007-04-06 9 29.123 30.569 39.123 39.123 159908 15224
2007-04-07 10 30.123 31.569 40.123 40.123 159909 15224