import pandas as pd
from pandas import Series,DataFrame
import numpy as np
#Series一维数组型对象,包含数据标签,称为索引
def sseries():
    """Walk through basic pandas ``Series`` usage, printing each result.

    A Series is a one-dimensional array-like object whose values carry
    data labels, called the index.  The demo covers construction, label
    and boolean selection, element-wise arithmetic, membership tests,
    building a Series from a dict, missing-data checks, adding two
    Series with different labels, and the ``name`` / ``index``
    attributes.

    Returns:
        None.  All results are written to standard output.
    """
    obj = pd.Series([4, 7, -5, 3])
    # When a Series is printed, the index is on the left and the values
    # on the right; both parts are also available as attributes.
    print(obj.values)
    print(obj.index)

    # Pass ``index`` to give every data point its own label.
    obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
    print(obj2.index)
    print(obj2['a'])
    print(obj2['b'])
    print(obj2[obj2 > 2])
    print(obj2 * 2)
    print(np.exp(obj2))
    # ``in`` tests against the index labels.
    print('b' in obj2)
    print('e' in obj2)

    # A dict can be turned into a Series directly.
    sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
    obj3 = pd.Series(sdata)
    print(obj3)

    # 'California' is not a key of sdata and 'Utah' is not in states, so
    # the explicit index and the dict keys only partially overlap.
    states = ['California', 'Ohio', 'Oregon', 'Texas']
    obj4 = pd.Series(sdata, index=states)
    print(obj4)

    # isnull / notnull detect missing ("NA") data.
    print(pd.isnull(obj4))
    print(pd.notnull(obj4))
    print(obj4.isnull())

    print(obj3)
    print(obj4)
    print(obj3 + obj4)

    # Both the Series itself and its index have a ``name`` attribute.
    obj4.name = 'population'
    obj4.index.name = 'state'
    print(obj4)

    # The index can be replaced in place by assignment.
    print(obj)
    obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
    print(obj)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    sseries()
[ 4 7 -5 3]
RangeIndex(start=0, stop=4, step=1)
Index(['d', 'b', 'a', 'c'], dtype='object')
Index(['d', 'b', 'a', 'c'], dtype='object')
-5
7
d 4
b 7
c 3
dtype: int64
d 8
b 14
a -10
c 6
dtype: int64
d 54.598150
b 1096.633158
a 0.006738
c 20.085537
dtype: float64
True
False
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
California True
Ohio False
Oregon False
Texas False
dtype: bool
California False
Ohio True
Oregon True
Texas True
dtype: bool
California True
Ohio False
Oregon False
Texas False
dtype: bool
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
California NaN
Ohio 70000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64
state
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
Name: population, dtype: float64
0 4
1 7
2 -5
3 3
dtype: int64
Bob 4
Steve 7
Jeff -5
Ryan 3
dtype: int64
C:\Users\37596>G:\数据分析\利用python进行数据分析\pd0531.py
[ 4 7 -5 3]
RangeIndex(start=0, stop=4, step=1)
Index(['d', 'b', 'a', 'c'], dtype='object')
-5
7
d 4
b 7
c 3
dtype: int64
d 8
b 14
a -10
c 6
dtype: int64
d 54.598150
b 1096.633158
a 0.006738
c 20.085537
dtype: float64
True
False
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
California True
Ohio False
Oregon False
Texas False
dtype: bool
California False
Ohio True
Oregon True
Texas True
dtype: bool
California True
Ohio False
Oregon False
Texas False
dtype: bool
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
California NaN
Ohio 70000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64
state
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
Name: population, dtype: float64
0 4
1 7
2 -5
3 3
dtype: int64
Bob 4
Steve 7
Jeff -5
Ryan 3
dtype: int64
import pymysql as MySQLdb
import pandas as pd
import numpy as np
# DataFrame demo: construction, column access and assignment, nested-dict
# input, and axis names.  Every step prints its result.

# Build a DataFrame from a dict of equal-length lists.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
print(frame)
# A bare ``frame.head()`` is only displayed by an interactive session; in a
# script the result has to be printed explicitly.
print(frame.head())

# ``columns`` fixes the column order.
print(pd.DataFrame(data, columns=['year', 'state', 'pop']))

# 'debt' is not a key of ``data``; it is requested as an extra column here.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
print(frame2)
print(frame2.columns)

# A column can be retrieved by key or by attribute.
print(frame2['state'])
print(frame2.year)

# Rows can be selected by position or by label with the ``loc`` attribute.
print(frame2.loc['three'])

# Columns can be modified by assignment: a scalar, an array of matching
# length, or a Series whose labels cover only some of the rows.
frame2['debt'] = 16.5
print(frame2)
frame2['debt'] = np.arange(6.)
print(frame2)
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)

# Assigning to a column name that does not exist creates the column.
frame2['eastern'] = frame2.state == 'Ohio'
print(frame2)

# ``del`` removes a column.
del frame2['eastern']
print(frame2.columns)

# A dict of dicts: outer keys become columns, inner keys become row labels.
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
# NOTE(review): the row order produced from the inner keys appears to differ
# between pandas versions; sort so the positional slices further down always
# see the years in ascending order.
frame3 = pd.DataFrame(pop).sort_index()
print(frame3)
print(frame3.T)

# NOTE: pd.DataFrame(pop, index=[2001, 2002, 2003]) failed with
# "'list' object has no attribute 'astype'" when this was written; the cause
# was not tracked down.  frame3.reindex([2001, 2002, 2003]) is an
# alternative that does not pass ``index`` to the constructor.

# A dict of Series.  ``.iloc`` makes it explicit that the slices are
# positional even though the row labels are integers.
pdate = {'Ohio': frame3['Ohio'].iloc[:-1],
         'Nevada': frame3['Nevada'].iloc[:2]}
print(pd.DataFrame(pdate))

# Valid inputs to the DataFrame constructor: see p. 133 of the book.
frame3.index.name = 'year'
frame3.columns.name = 'state'
print(frame3)

# ``values`` returns the data as a two-dimensional array.
print(frame3.values)
print(frame2.values)
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
5 Nevada 2003 3.2
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9
5 2003 Nevada 3.2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
six 2003 Nevada 3.2 NaN
Index(['year', 'state', 'pop', 'debt'], dtype='object')
one Ohio
two Ohio
three Ohio
four Nevada
five Nevada
six Nevada
Name: state, dtype: object
one 2000
two 2001
three 2002
four 2001
five 2002
six 2003
Name: year, dtype: int64
year 2002
state Ohio
pop 3.6
debt NaN
Name: three, dtype: object
year state pop debt
one 2000 Ohio 1.5 16.5
two 2001 Ohio 1.7 16.5
three 2002 Ohio 3.6 16.5
four 2001 Nevada 2.4 16.5
five 2002 Nevada 2.9 16.5
six 2003 Nevada 3.2 16.5
year state pop debt
one 2000 Ohio 1.5 0.0
two 2001 Ohio 1.7 1.0
three 2002 Ohio 3.6 2.0
four 2001 Nevada 2.4 3.0
five 2002 Nevada 2.9 4.0
six 2003 Nevada 3.2 5.0
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7
six 2003 Nevada 3.2 NaN
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 -1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 -1.5 False
five 2002 Nevada 2.9 -1.7 False
six 2003 Nevada 3.2 NaN False
Index(['year', 'state', 'pop', 'debt'], dtype='object')
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
2000 2001 2002
Nevada NaN 2.4 2.9
Ohio 1.5 1.7 3.6
Ohio Nevada
2000 1.5 NaN
2001 1.7 2.4
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
[[nan 1.5]
[2.4 1.7]
[2.9 3.6]]
[[2000 'Ohio' 1.5 nan]
[2001 'Ohio' 1.7 -1.2]
[2002 'Ohio' 3.6 nan]
[2001 'Nevada' 2.4 -1.5]
[2002 'Nevada' 2.9 -1.7]
[2003 'Nevada' 3.2 nan]]
# Index-object demo.  ``pop`` and ``frame3`` are rebuilt first; neither is
# used again within this section.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(pop)

# The labels passed when building a Series are exposed as ``.index``.
obj = pd.Series(range(3), index=list('abc'))
index = obj.index
print(index)
# An Index can be sliced like a sequence.
print(index[1:])

# An Index can also be created on its own and handed to a Series.
labels = pd.Index(np.arange(3))
print(labels)
obj2 = pd.Series([1.5, -2.5, 0.0], index=labels)
print(obj2)
# Identity check: is the Series' index the very object that was passed in?
print(obj2.index is labels)
Index(['a', 'b', 'c'], dtype='object')
Index(['b', 'c'], dtype='object')
Int64Index([0, 1, 2], dtype='int64')
0 1.5
1 -2.5
2 0.0
dtype: float64
True
表5-2 一些索引对象的方法和属性 p135
方法 | 描述 |
---|---|
append | 将额外的索引对象粘贴到原索引后，产生一个新的索引 |
difference | 差集 |
intersection | 交集 |
union | 并集 |
isin | 计算表示每一个值是否在传值容器中的布尔数组 |
delete | 将位置i的元素删除,并产生新的索引 |
drop | 根据传参删除指定索引值,并产生新的索引 |
insert | 在位置i插入元素,并产生新的索引 |
is_monotonic | 如果索引序列递增则返回TRUE |
is_unique | 如果索引序列唯一则返回TRUE |
unique | 计算索引的唯一值序列 |
# reindex demo: conform a Series / DataFrame to a new set of labels.
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(obj)

# reindex rebuilds the index; labels that were not present ('e' here) are
# filled in as missing values.
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj2)

# For ordered data such as time series, method='ffill' forward-fills the
# gaps introduced by the new labels.
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
print(obj3.reindex(range(6), method='ffill'))

# reindex can change the row index, the columns, or both at once.
frame = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'b', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
print(frame)
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)

states = ['Texas', 'Utah', 'California']
# The result of reindex is a new object; print it so the script shows it.
print(frame.reindex(columns=states))

# Row 'c' and column 'Utah' do not exist in ``frame``.  Passing missing
# labels to ``.loc`` was deprecated (the recorded FutureWarning says it will
# raise KeyError), so reindex both axes instead; ``.loc`` with a list is only
# safe when every label already exists.
relabeled = frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)
print(relabeled)
# Table 5-3: arguments of the reindex method.
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
0 blue
2 purple
4 yellow
dtype: object
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
Ohio Texas California
a 0 1 2
b 3 4 5
d 6 7 8
Ohio Texas California
a 0.0 1.0 2.0
b 3.0 4.0 5.0
c NaN NaN NaN
d 6.0 7.0 8.0
G:\数据分析\0531.py:26: FutureWarning:
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.
See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
print(frame.loc[['a','b','c','d'],states])
Texas Utah California
a 1.0 NaN 2.0
b 4.0 NaN 5.0
c NaN NaN NaN
d 7.0 NaN 8.0