python模块pandas的基本使用实例

  • pandas用处
  1. pandas为处理数据分析任务而创建 
  2. 适合处理序列数据、表格数据等具良好结构的数据
  3. 主要数据结构:Series、DataFrame、Panel(导入方式:from pandas import Series, DataFrame;注意Panel已在pandas 0.25版本中被移除)
  4. Series:一维数组,类似list,不过数组中元素数据类型相同,且索引不限于0~n(默认)
  5. DataFrame:二维表格型数据结构,即键值对型。Series的容器。可以像操作SQL一样。
  6. Panel:三维数组。DataFrame的容器

通过values和index可以分别获得值和索引,也可以把dict{}字典类型转化成Series

  • Series一维数组Series( [1,2] , index=['a','b'] ) 

1、创建

from pandas import Series,DataFrame
import  pandas as pd
import  numpy as np  #后面的示例中用到np.random、np.arange

>>>obj = Series([4,7,-5,3])
>>>obj
0 4
1 7
2 -5
3 3

>>>obj.values
array([4,7,-5,3])
>>>obj.index
Int64Index([0,1,2,3])

>>>Series([4,7,-5,3],index=['d','b','a','c'])
d 4
b 7
a -5
c 3

#字典->Series
sdata={'Ohio':35000,'Texas':71000,'Oregon':16000,'Utah':5000}
>>>Series(sdata)
Ohio   35000
Texas  71000
Oregon 16000
Utah   5000

2、层次化索引:给index传入多个等长的列表,即可得到多层索引,例如data = Series(np.random.randn(10), index = [['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd' ],[1,2,3,1,2,3,1,2,2,3]]) 

data = Series(np.random.randn(10), index = [['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd' ],[1,2,3,1,2,3,1,2,2,3]])
>>>data
a  1    0.169239
   2    0.689271
   3    0.879309
b  1   -0.699176
   2    0.260446
   3   -0.321751
c  1    0.893105
   2    0.757505
d  2   -1.223344
   3   -0.802812
dtype: float64

#索引方式
In[3]:data['b':'d']
Out[3]:
b  1   -0.699176
   2    0.260446
   3   -0.321751
c  1    0.893105
   2    0.757505
d  2   -1.223344
   3   -0.802812
dtype: float64

#内层选取
In[4]:data[:, 2]
Out[4]:
a    0.689271
b    0.260446
c    0.757505
d   -1.223344
dtype: float64

数据重塑:将Series转化成DataFrame: 

data = Series(np.random.randn(10), index = [['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd' ],[1,2,3,1,2,3,1,2,2,3]])

>>>data.unstack()
        1	        2    	    3
a	-0.711240	1.636465	-2.023830
b	1.301891	1.236052	-0.515719
c	1.961935	-0.406532	NaN
d	NaN	        0.107543	0.086120

3、排序

①s.sort_index():按索引排序

obj = Series(range(4), index=['d','a','b','c'])
>>> obj.sort_index()
a    1
b    2
c    3
d    0
dtype: int64

② s.sort_values():按值排序

>>> obj.sort_values()
d    0
a    1
b    2
c    3
dtype: int64

4、删除某行drop('a') #drop()返回新对象,原对象不改变

series=Series([4.5,7.2,-5.3,3.6], index=['d','b','a','c'])
>>>series.drop('c')
d    4.5
b    7.2
a   -5.3
dtype: float64
  •  DataFrame表格型:可看成多个Series组成列,行索引也可自定义

1、创建

①由字典创建DataFrame(dict,index=[]) 

#DataFrame创建:
dictionary = {'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
         'year':[2000,2001,2002,2001,2002],
         'pop':[1.5,1.7,3.6,2.4,2.9]}
frame = DataFrame(dictionary)

#创建时通过index指定行索引(行名):
frame=DataFrame(dictionary,index=['one','two','three','four','five'])

#添加、修改:
frame['add']=[0,0,0,0,0]

#添加Series类型:
value = Series([1,3,1,4,6],index = ['one','two','three','four','five'])
frame['add1'] = value

②指定矩阵后自定义行列索引DataFrame(np.arange(6).reshape((2,3)),index=['1','2'],columns=['a','b','c']) 

>>>frame = DataFrame(np.arange(8).reshape((2,4)),index=['three', 'one'],columns=['d','a','b','c'])

       d  a  b  c
three  0  1  2  3
one    4  5  6  7

③由pd.read_csv(url,header=0)读取.csv格式数据返回DataFrame类型

# Reading a csv into Pandas.
# 如果数据集中有中文的话,最好在里面加上 encoding = 'gbk' ,以避免乱码问题。后面的导出数据的时候也一样。
df = pd.read_csv('uk_rain_2014.csv', header=0)

#查看前n行
df.head(5)

#查看后n行
df.tail(5)

#查看总行数
len(df)

#修改列名
#我们通常使用列的名字来在 Pandas 中查找列。这一点很好而且易于使用,但是有时列名太长,我们需要缩短列名
df.columns = ['water_year','rain_octsep','outflow_octsep','rain_decfeb', 'outflow_decfeb', 'rain_junaug', 'outflow_junaug']

2、排序

①按行索引排序DataFrame.sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, by=None):默认按行索引升序

d=DataFrame(np.arange(12).reshape(3,4),columns=['b','a','d','c'],index=['one','two','three'])
>>> d
       b  a   d   c
one    0  1   2   3
two    4  5   6   7
three  8  9  10  11

>>> d.sort_index()
       b  a   d   c
one    0  1   2   3
three  8  9  10  11
two    4  5   6   7
#one three two按字母排序

②按列索引排序DataFrame.sort_index(axis=1, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, by=None):按列索引(列名)升序

d=DataFrame(np.arange(12).reshape(3,4),columns=['b','a','d','c'],index=['one','two','three'])
>>> d
       b  a   d   c
one    0  1   2   3
two    4  5   6   7
three  8  9  10  11

>>> d.sort_index(axis=1)
       a  b   c   d
one    1  0   3   2
two    5  4   7   6
three  9  8  11  10

③按某列值排序

DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last') #必须指定by,即按哪行或哪列的值排序

>>> frame=pd.DataFrame([[2,4,1,5],[3,1,4,5],[5,1,4,2]],columns=['b','a','d','c'],index=['one','two','three'])
>>> frame
       b  a  d  c
one    2  4  1  5
two    3  1  4  5
three  5  1  4  2

#按a列从小到大排
>>> frame.sort_values(by='a')
       b  a  d  c
two    3  1  4  5
three  5  1  4  2
one    2  4  1  5

#先a列再c列从小到大排
>>> frame.sort_values(by=['a','c'])
       b  a  d  c
three  5  1  4  2
two    3  1  4  5
one    2  4  1  5

④按某行值排序

DataFrame.sort_values(by, axis=1, ascending=True, inplace=False, kind='quicksort', na_position='last') #必须指定by,即按哪行或哪列的值排序

>>> frame=pd.DataFrame([[2,4,1,5],[3,1,4,5],[5,1,4,2]],columns=['b','a','d','c'],index=['one','two','three'])
>>> frame
       b  a  d  c
one    2  4  1  5
two    3  1  4  5
three  5  1  4  2

#按two行从小到大排
>>> frame.sort_values(by='two',axis=1)
        a  b  d  c
one    4  2  1  5
two    1  3  4  5
three  1  5  4  2

3、删除行或列df.drop(['oh','te'],axis=1)

>>>df = DataFrame(np.arange(9).reshape(3,3), index=['a','c','d'], columns=['oh','te','ca'])
   oh  te  ca
a   0   1   2
c   3   4   5
d   6   7   8

#删除某行
>>>df.drop('a')
   oh  te  ca
c   3   4   5
d   6   7   8

#删除多列
>>>df.drop(['oh','te'],axis=1)
   ca
a   2
c   5
d   8

4、运算:对应位置运算,没有的用NaN代替,或自定义填充

①df1+df2 (NaN填充)

df1 = DataFrame(np.arange(12.).reshape((3,4)),columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4,5)),columns=list('abcde'))

>>>df1+df2
    a   b   c   d   e
0   0   2   4   6 NaN
1   9  11  13  15 NaN
2  18  20  22  24 NaN
3 NaN NaN NaN NaN NaN

②df1.add(df2,fill_value=0) #可自定义填充值:只有一方缺失的位置先用fill_value填充再运算(双方都缺失的位置仍为NaN)

>>>df1.add(df2, fill_value=0)
    a   b   c   d   e
0   0   2   4   6   4
1   9  11  13  15   9
2  18  20  22  24  14
3  15  16  17  18  19

5、各行的查重df.duplicated()与去重df.drop_duplicates() #返回新对象,原对象不改变

>>>df = DataFrame({'k1':['one']*3 + ['two']*4, 'k2':[1,1,2,3,3,4,4]})
    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4

>>>df.duplicated()
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

>>>df.drop_duplicates()
    k1  k2
0  one   1
2  one   2
3  two   3
5  two   4

 

你可能感兴趣的:(python)