将鱼图像数据进行操作,使用numpy知识
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#使用 %matplotlib inline 之后可以不用调用show方法,图像会直接嵌入显示在notebook的输出里面
fish = plt.imread('./fish.png')
plt.imshow(fish)
fish.shape
(243, 326, 3)
#把图片变成灰色的(这里只取了第0个通道,也就是R通道,做简化演示;严格的灰度化应该对RGB三个通道做加权平均)
fish1 = fish[::,::,0]
fish1.shape
(243, 326)
plt.imshow(fish1,cmap = "gray")
#灰度化处理 (本质 就是降维) 黑白照片就是二维数据,彩色照片三维或者多维的!!!
#人脸识别的
#使用一个叫opencv扩展库 计算机视觉库(书专门介绍 opencv的)
import cv2
sanpang = cv2.imread("./cv2_change_head/j.jpg")
#matplotlib按RGB顺序显示,而cv2在读数据的时候通道顺序是BGR,所以显示之前要把最后一维反转
plt.imshow(sanpang[::,::,::-1])
# Classifier object dedicated to locating faces in an image.
cascade = cv2.CascadeClassifier()
# Load the ready-made Haar cascade. load() returns False when the XML file is
# missing or unreadable; fail here with a clear message instead of letting
# detectMultiScale fail later on an empty classifier.
if not cascade.load("./cv2_change_head/haarcascade_frontalface_default.xml"):
    raise OSError("failed to load ./cv2_change_head/haarcascade_frontalface_default.xml")
#使用人脸识别的类进行识别
face = cascade.detectMultiScale(sanpang)
face
array([[225, 76, 72, 72]], dtype=int32)
dog = cv2.imread("./cv2_change_head/dog.jpg")
# detectMultiScale yields one (x, y, width, height) rectangle per detected
# face, e.g. [[225, 76, 72, 72]] for this picture.
for (x, y, w, h) in face:
    # Resize the patch to each detected rectangle instead of a hard-coded
    # 72x72, so the paste still fits when the detection size differs or is
    # not square. cv2.resize takes dsize as (width, height).
    patch = cv2.resize(dog, (int(w), int(h)))
    # Image arrays are indexed [row, column], i.e. [y, x].
    sanpang[y:y + h, x:x + w] = patch
# cv2 stores channels as BGR; reverse the last axis for matplotlib (RGB).
plt.imshow(sanpang[::, ::, ::-1])
Pandas的数据结构
导入pandas:
三剑客
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import numpy as np
1、Series
Series是一种类似于一维数组的对象,由下面两个部分组成:
- values:一组数据(ndarray类型)
- index:相关的数据索引标签
#Series 其实是对ndarray的一个封装(包装)
#index: 索引
#values:值,是一个(一维的ndarray)
1)Series的创建
两种创建方式:
(1) 由列表或numpy数组创建
默认索引为0到N-1的整数型索引
nd = np.array([1,2,3])
nd
array([1, 2, 3])
s = Series([1,2,3])
s
0 1
1 2
2 3
dtype: int64
s = Series(nd, index=list("abc"))
s[0]
#注意index 索引传值的时候是一个list
1
s = Series(nd, index = ["a","b","c"])
s
a 1
b 2
c 3
dtype: int32
#咱们的索引值可不可以相同
s = Series(nd, index = list("AAA"))
s
A 1
A 2
A 3
dtype: int32
s["A"]
A 1
A 2
A 3
dtype: int32
#当索引值相同的时候,使用默认的索引拿数据的时候会出现问题!!!!! 如果自己定义索引不相同的时候,是可以使用默认的索引的!!!!!
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
2559 return self._engine.get_value(s, k,
-> 2560 tz=getattr(series.dtype, 'tz', None))
2561 except KeyError as e1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine._get_loc_duplicates()
TypeError: '<' not supported between instances of 'str' and 'int'
During handling of the above exception, another exception occurred:
IndexError Traceback (most recent call last)
in ()
----> 1 s[0]
C:\anaconda\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
621 key = com._apply_if_callable(key, self)
622 try:
--> 623 result = self.index.get_value(self, key)
624
625 if not is_scalar(result):
C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
2578 # python 3
2579 if is_scalar(key): # pragma: no cover
-> 2580 raise IndexError(key)
2581 raise InvalidIndexError(key)
2582
IndexError: 0
s = Series(data = np.random.randint(0,100,size = 10), index = list("abcdefghtq"))
s
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
in ()
----> 1 s = Series(data = np.random.randint(0,100,size = 10), index = list("abcdefghtqw"))
2 s
C:\anaconda\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
264 raise_cast_failure=True)
265
--> 266 data = SingleBlockManager(data, index, fastpath=True)
267
268 generic.NDFrame.__init__(self, data, fastpath=True)
C:\anaconda\lib\site-packages\pandas\core\internals.py in __init__(self, block, axis, do_integrity_check, fastpath)
4400 if not isinstance(block, Block):
4401 block = make_block(block, placement=slice(0, len(axis)), ndim=1,
-> 4402 fastpath=True)
4403
4404 self.blocks = [block]
C:\anaconda\lib\site-packages\pandas\core\internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)
2955 placement=placement, dtype=dtype)
2956
-> 2957 return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
2958
2959 # TODO: flexible with index=None and/or items=None
C:\anaconda\lib\site-packages\pandas\core\internals.py in __init__(self, values, placement, ndim, fastpath)
118 raise ValueError('Wrong number of items passed %d, placement '
119 'implies %d' % (len(self.values),
--> 120 len(self.mgr_locs)))
121
122 @property
ValueError: Wrong number of items passed 10, placement implies 11
还可以通过设置index参数指定索引
s
a 32
b 11
c 73
d 13
e 34
f 4
g 67
h 76
t 62
q 76
dtype: int32
s.index = list("ABCDEFGHTQ")
s
A 32
B 11
C 73
D 13
E 34
F 4
G 67
H 76
T 62
Q 76
dtype: int32
#可以不可以单个的去修改索引
s.index[0] = ["Y"]
s
#Series 索引值,不能是对他单个修改的
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
in ()
1 #可以不可以单个的去修改索引
----> 2 s.index[0] = ["Y"]
3 s
4 #Series 索引值,不能是对他单个修改的
C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)
1722
1723 def __setitem__(self, key, value):
-> 1724 raise TypeError("Index does not support mutable operations")
1725
1726 def __getitem__(self, key):
TypeError: Index does not support mutable operations
特别地,由ndarray创建的是引用,而不是副本。对Series元素的改变也会改变原来的ndarray对象中的元素。(列表没有这种情况)
nd = np.array([0,2,4,6])
s = Series(nd, index = list("ABCD"))
s
A 0
B 2
C 4
D 6
dtype: int32
s['C'] = 16
s
A 0
B 2
C 16
D 6
dtype: int32
nd
array([ 0, 2, 16, 6])
(2) 由字典创建
s = Series(data = {"a":10,"pi":3.14,"e":2.713,"g":0.618}, index =["a","pi","e","g","kk"])
s
#假如使用字典创建的时候,index可以比字典的键多,多出来的索引对应的数据会补上NaN
a 10.000
pi 3.140
e 2.713
g 0.618
kk NaN
dtype: float64
============================================
练习1:
使用多种方法创建以下Series,命名为s1:
语文 150
数学 150
英语 150
理综 300
============================================
s1 = Series(data = {"语文":93,"数学":79,"英语":120,"理综":20})
s1
数学 79
理综 20
英语 120
语文 93
dtype: int64
2)Series的索引和切片
可以使用中括号取单个索引(此时返回的是元素类型),或者中括号里一个列表取多个索引(此时返回的仍然是一个Series类型)。分为显式索引和隐式索引:
(1) 显式索引:
- 使用index中的元素作为索引值
- 使用.loc[](推荐)
注意,此时是闭区间
s
a 10.000
pi 3.140
e 2.713
g 0.618
kk NaN
dtype: float64
s["e"]
#返回的是float
2.713
s["a","g"]
#这种写法是不对的!!!
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
2565 try:
-> 2566 return libts.get_value_box(s, key)
2567 except IndexError:
pandas/_libs/tslib.pyx in pandas._libs.tslib.get_value_box()
pandas/_libs/tslib.pyx in pandas._libs.tslib.get_value_box()
TypeError: 'tuple' object cannot be interpreted as an integer
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
in ()
----> 1 s["a","g"]
C:\anaconda\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
621 key = com._apply_if_callable(key, self)
622 try:
--> 623 result = self.index.get_value(self, key)
624
625 if not is_scalar(result):
C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
2572 raise InvalidIndexError(key)
2573 else:
-> 2574 raise e1
2575 except Exception: # pragma: no cover
2576 raise e1
C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
2558 try:
2559 return self._engine.get_value(s, k,
-> 2560 tz=getattr(series.dtype, 'tz', None))
2561 except KeyError as e1:
2562 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: ('a', 'g')
s[["a","g"]]
#Series
a 10.000
g 0.618
dtype: float64
s1 = s.loc[["a","g"]]
s1
#使用loc取多个的值
a 10.000
g 0.618
dtype: float64
#取单个的值
s.loc["a"]
#float
10.0
type(s.loc[["a"]])
#Series
pandas.core.series.Series
(2) 隐式索引:
- 使用整数作为索引值
- 使用.iloc[](推荐)
注意,此时是半开区间
s
a 10.000
pi 3.140
e 2.713
g 0.618
kk NaN
dtype: float64
s[0]
10.0
s.iloc[0]
#在使用iloc的时候,必须传的值是隐藏起来的索引值(也就是整型的位置索引)
10.0
s.iloc[[0,1,2]]
#取多个值的时候,加两个中括号
a 10.000
pi 3.140
e 2.713
dtype: float64
#切片
s["a":"g"]
#左闭右闭
a 10.000
pi 3.140
e 2.713
g 0.618
dtype: float64
s.loc["a":"g"]
a 10.000
pi 3.140
e 2.713
g 0.618
dtype: float64
s.iloc[0:3]
#在使用iloc的时候,左闭右开
a 10.000
pi 3.140
e 2.713
dtype: float64
============================================
练习2:
使用多种方法对练习1创建的Series s1进行索引和切片:
索引:
数学 150
切片:
语文 150
数学 150
英语 150
============================================
3)Series的基本概念
可以把Series看成一个定长的有序字典
可以通过shape,size,index,values等得到series的属性
s
a 10.000
pi 3.140
e 2.713
g 0.618
kk NaN
dtype: float64
s.shape
(5,)
s.size
5
s.index
Index(['a', 'pi', 'e', 'g', 'kk'], dtype='object')
s.values
#打印出来的数据是一个ndarray
numpy.ndarray
可以通过head(),tail()快速查看Series对象的样式
#扩展
data = pd.read_csv("./president_heights.csv")
type(data)
pandas.core.frame.DataFrame
data
order | name | height(cm) | |
---|---|---|---|
0 | 1 | George Washington | 189 |
1 | 2 | John Adams | 170 |
2 | 3 | Thomas Jefferson | 189 |
3 | 4 | James Madison | 163 |
4 | 5 | James Monroe | 183 |
5 | 6 | John Quincy Adams | 171 |
6 | 7 | Andrew Jackson | 185 |
7 | 8 | Martin Van Buren | 168 |
8 | 9 | William Henry Harrison | 173 |
9 | 10 | John Tyler | 183 |
10 | 11 | James K. Polk | 173 |
11 | 12 | Zachary Taylor | 173 |
12 | 13 | Millard Fillmore | 175 |
13 | 14 | Franklin Pierce | 178 |
14 | 15 | James Buchanan | 183 |
15 | 16 | Abraham Lincoln | 193 |
16 | 17 | Andrew Johnson | 178 |
17 | 18 | Ulysses S. Grant | 173 |
18 | 19 | Rutherford B. Hayes | 174 |
19 | 20 | James A. Garfield | 183 |
20 | 21 | Chester A. Arthur | 183 |
21 | 23 | Benjamin Harrison | 168 |
22 | 25 | William McKinley | 170 |
23 | 26 | Theodore Roosevelt | 178 |
24 | 27 | William Howard Taft | 182 |
25 | 28 | Woodrow Wilson | 180 |
26 | 29 | Warren G. Harding | 183 |
27 | 30 | Calvin Coolidge | 178 |
28 | 31 | Herbert Hoover | 182 |
29 | 32 | Franklin D. Roosevelt | 188 |
30 | 33 | Harry S. Truman | 175 |
31 | 34 | Dwight D. Eisenhower | 179 |
32 | 35 | John F. Kennedy | 183 |
33 | 36 | Lyndon B. Johnson | 193 |
34 | 37 | Richard Nixon | 182 |
35 | 38 | Gerald Ford | 183 |
36 | 39 | Jimmy Carter | 177 |
37 | 40 | Ronald Reagan | 185 |
38 | 41 | George H. W. Bush | 188 |
39 | 42 | Bill Clinton | 188 |
40 | 43 | George W. Bush | 182 |
41 | 44 | Barack Obama | 185 |
#一个DataFrame就是由多个Series组成的!!!
s_height = data['height(cm)']
type(s_height)
pandas.core.series.Series
s_height.head(2)
#head方法 取数据的前五条,而且还可以传参自定义出来的数据
0 189
1 170
Name: height(cm), dtype: int64
s_height.tail()
#tail方法,取最后的五条数据
37 185
38 188
39 188
40 182
41 185
Name: height(cm), dtype: int64
当索引没有对应的值时,可能出现缺失数据显示NaN(not a number)的情况
s = Series(data = {"a":10,"b":20,"c":30}, index =list("abcd"))
s
a 10.0
b 20.0
c 30.0
d NaN
dtype: float64
可以使用pd.isnull(),pd.notnull(),或自带isnull(),notnull()函数检测缺失数据
#后面会用到
pd.isnull(s)
a False
b False
c False
d True
dtype: bool
ind = s.isnull()
ind
a False
b False
c False
d True
dtype: bool
#使用ind给空值赋值,后面会用到
s[ind] = 1000
s
a 10.0
b 20.0
c 30.0
d 1000.0
dtype: float64
pd.notnull(s)
a True
b True
c True
d True
dtype: bool
s.notnull()
a True
b True
c True
d True
dtype: bool
Series对象本身及其索引都有一个name属性
s =Series(data = np.random.randint(0,150,size = 5), index = ["张三","李四","Lisa","Sara","Jack"])
s
张三 36
李四 83
Lisa 67
Sara 110
Jack 58
dtype: int32
s.name = "Python"
s
张三 36
李四 83
Lisa 67
Sara 110
Jack 58
Name: Python, dtype: int32
s =Series(data = np.random.randint(0,150,size = 5), index = ["张三","李四","Lisa","Sara","Jack"], name = "Math")
s
张三 72
李四 40
Lisa 69
Sara 27
Jack 8
Name: Math, dtype: int32
#扩展
df = pd.read_csv("./president_heights.csv")
s2 = df["order"]
s2
0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
8 9
9 10
10 11
11 12
12 13
13 14
14 15
15 16
16 17
17 18
18 19
19 20
20 21
21 23
22 25
23 26
24 27
25 28
26 29
27 30
28 31
29 32
30 33
31 34
32 35
33 36
34 37
35 38
36 39
37 40
38 41
39 42
40 43
41 44
Name: order, dtype: int64
4)Series的运算
(1) 适用于numpy的数组运算也适用于Series
s
张三 72
李四 40
Lisa 69
Sara 27
Jack 8
Name: Math, dtype: int32
s2 = s + 50
s2
张三 122
李四 90
Lisa 119
Sara 77
Jack 58
Name: Math, dtype: int32
s.add(20)
张三 92
李四 60
Lisa 89
Sara 47
Jack 28
Name: Math, dtype: int32
(2) Series之间的运算
-
在运算中自动对齐不同索引的数据
-
如果索引不对应,则补NaN
-
注意:要想保留所有的index,则需要使用.add()函数
s3 = s1.add(s2,fill_value = 1)
s3
A 114.0
B 48.0
C 26.0
Jack 59.0
Lisa 120.0
Sara 103.0
张三 123.0
李四 91.0
dtype: float64
s1 = Series(np.random.randint(0,150,size =4), index = ["A","B","C","Sara"], name = "数学")
s1
A 113
B 47
C 25
Sara 26
Name: 数学, dtype: int32
s2
张三 122
李四 90
Lisa 119
Sara 77
Jack 58
Name: Math, dtype: int32
np.nan
nan
113 + np.nan
nan
s1 + s2
#s1 里面有A 值 113 s2没有A值 Nan
A NaN
B NaN
C NaN
Jack NaN
Lisa NaN
Sara 103.0
张三 NaN
李四 NaN
dtype: float64
============================================
练习3:
-
想一想Series运算和ndarray运算的规则有什么不同?
-
新建另一个索引包含“文综”的Series s2,并与s1进行多种算术操作。思考如何保存所有数据。
============================================
nd1 = np.array([0,1,2])
nd2 = np.array([4,5,6])
nd1 + nd2
array([4, 6, 8])
2、DataFrame
DataFrame是一个【表格型】的数据结构,可以看做是【由Series组成的字典】(共用同一个索引)。DataFrame由按一定顺序排列的多列数据组成。设计初衷是将Series的使用场景从一维拓展到多维。DataFrame既有行索引,也有列索引。
- 行索引:index
- 列索引:columns
- 值:values(numpy的二维数组)
#重点
1)DataFrame的创建
最常用的方法是传递一个字典来创建。DataFrame以字典的键作为每一【列】的名称,以字典的值(一个数组)作为每一列。
此外,DataFrame会自动加上每一行的索引(和Series一样)。
同Series一样,若传入的列与字典的键不匹配,则相应的值为NaN。
import pandas as pd
from pandas import Series,DataFrame
#创建 第一种写法
df1 = DataFrame(data = {"Python":[99,101,120,98], "数学":[120,136,141,123],"语文":[98,78,99,101]}, index = list("abcd"))
df1
#这种情况下,如果行索引的个数比数据的行数多,就会报错
Python | 数学 | 语文 | |
---|---|---|---|
a | 99 | 120 | 98 |
b | 101 | 136 | 78 |
c | 120 | 141 | 99 |
d | 98 | 123 | 101 |
df1 = DataFrame(data = {"Python":[99,101,120,98], "数学":[120,136,141,123],"语文":[98,78,99,101]},index = list("abcd"),
columns = ["Python","数学","语文","英语"])
df1
Python | 数学 | 语文 | 英语 | |
---|---|---|---|---|
a | 99 | 120 | 98 | NaN |
b | 101 | 136 | 78 | NaN |
c | 120 | 141 | 99 | NaN |
d | 98 | 123 | 101 | NaN |
#列更加重要点
DataFrame属性:values、columns、index、shape
df1.values
array([[99, 120, 98, nan],
[101, 136, 78, nan],
[120, 141, 99, nan],
[98, 123, 101, nan]], dtype=object)
df1.columns
#列索引
Index(['Python', '数学', '语文', '英语'], dtype='object')
df1.index
Index(['a', 'b', 'c', 'd'], dtype='object')
df1.shape
(4, 4)
import numpy as np
#第二种写法
df2 = DataFrame(data = np.random.randint(0,150,size = (4,4)), index = list("abcd"), columns = ["Python","Java","PHP","Html"])
df2
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 48 | 98 | 37 | 124 |
b | 28 | 71 | 23 | 148 |
c | 68 | 66 | 127 | 13 |
d | 70 | 28 | 74 | 83 |
============================================
练习4:
根据以下考试成绩表,创建一个DataFrame,命名为df:
张三 李四
语文 150 0
数学 150 0
英语 150 0
理综 300 0
============================================
2)DataFrame的索引
(1) 对列进行索引
- 通过类似字典的方式
- 通过属性的方式
可以将DataFrame的列获取为一个Series。返回的Series拥有原DataFrame相同的索引,且name属性也已经设置好了,就是相应的列名。
df2
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 48 | 98 | 37 | 124 |
b | 28 | 71 | 23 | 148 |
c | 68 | 66 | 127 | 13 |
d | 70 | 28 | 74 | 83 |
df2["Python"]
a 48
b 28
c 68
d 70
Name: Python, dtype: int32
df2.Python
#columns 列名 属性名
a 48
b 28
c 68
d 70
Name: Python, dtype: int32
df2[["Python","Java"]]
Python | Java | |
---|---|---|
a | 48 | 98 |
b | 28 | 71 |
c | 68 | 66 |
d | 70 | 28 |
#让你拿出来一行数据 a DataFrame 是无法通过中括号来直接获取行数据的
df2["a"]
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2524 try:
-> 2525 return self._engine.get_loc(key)
2526 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'a'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
in ()
1 #让你拿出来一行数据 a
----> 2 df2["a"]
C:\anaconda\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2137 return self._getitem_multilevel(key)
2138 else:
-> 2139 return self._getitem_column(key)
2140
2141 def _getitem_column(self, key):
C:\anaconda\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2144 # get column
2145 if self.columns.is_unique:
-> 2146 return self._get_item_cache(key)
2147
2148 # duplicate columns & possible reduce dimensionality
C:\anaconda\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1840 res = cache.get(item)
1841 if res is None:
-> 1842 values = self._data.get(item)
1843 res = self._box_item_values(item, values)
1844 cache[item] = res
C:\anaconda\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3841
3842 if not isna(item):
-> 3843 loc = self.items.get_loc(item)
3844 else:
3845 indexer = np.arange(len(self.items))[isna(self.items)]
C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2525 return self._engine.get_loc(key)
2526 except KeyError:
-> 2527 return self._engine.get_loc(self._maybe_cast_indexer(key))
2528
2529 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'a'
#切片
df2["a":"c"]
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 48 | 98 | 37 | 124 |
b | 28 | 71 | 23 | 148 |
c | 68 | 66 | 127 | 13 |
(2) 对行进行索引
- 使用.loc[]加index来进行行索引
- 使用.iloc[]加整数来进行行索引
同样返回一个Series,index为原来的columns。
df2.loc["a"]
#Series
Python 48
Java 98
PHP 37
Html 124
Name: a, dtype: int32
df2.loc[["a"]]
#DataFrame数据
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 48 | 98 | 37 | 124 |
df2.iloc[0]
Python 48
Java 98
PHP 37
Html 124
Name: a, dtype: int32
df2.iloc[[1]]
Python | Java | PHP | Html | |
---|---|---|---|---|
b | 28 | 71 | 23 | 148 |
df2.iloc[[1,2]]
Python | Java | PHP | Html | |
---|---|---|---|---|
b | 28 | 71 | 23 | 148 |
c | 68 | 66 | 127 | 13 |
df2.iloc[0:3]
#左闭右开
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 48 | 98 | 37 | 124 |
b | 28 | 71 | 23 | 148 |
c | 68 | 66 | 127 | 13 |
df2.loc["a"]["Java"]
98
df2.loc['a',"Java"]
#忘记这种情况吧!!!!!!
98
df2["Java"]["a"]
98
df2["Java","a"]
#总结统一一下:直接用中括号(不通过loc/iloc)取单个数据的时候,行和列不能写在同一个中括号里面
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2524 try:
-> 2525 return self._engine.get_loc(key)
2526 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: ('Java', 'a')
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
in ()
----> 1 df2["Java","a"]
C:\anaconda\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2137 return self._getitem_multilevel(key)
2138 else:
-> 2139 return self._getitem_column(key)
2140
2141 def _getitem_column(self, key):
C:\anaconda\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2144 # get column
2145 if self.columns.is_unique:
-> 2146 return self._get_item_cache(key)
2147
2148 # duplicate columns & possible reduce dimensionality
C:\anaconda\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1840 res = cache.get(item)
1841 if res is None:
-> 1842 values = self._data.get(item)
1843 res = self._box_item_values(item, values)
1844 cache[item] = res
C:\anaconda\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3841
3842 if not isna(item):
-> 3843 loc = self.items.get_loc(item)
3844 else:
3845 indexer = np.arange(len(self.items))[isna(self.items)]
C:\anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2525 return self._engine.get_loc(key)
2526 except KeyError:
-> 2527 return self._engine.get_loc(self._maybe_cast_indexer(key))
2528
2529 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: ('Java', 'a')
(3) 对元素索引的方法
- 使用列索引
- 使用行索引(iloc[3,1]相当于两个参数;iloc[[3,3]] 里面的[3,3]看做一个参数)
- 使用values属性(二维numpy数组)
df2["Java"]["a":"c"]
#左闭右闭
a 98
b 71
c 66
Name: Java, dtype: int32
df2.iloc[1:3]["Html"]
#左闭右开
b 148
c 13
Name: Html, dtype: int32
df2.loc["a","Python"]
48
df2.loc[["a","b"], "Python"]
#loc是一个非常特殊的方法
a 48
b 28
Name: Python, dtype: int32
df2.loc['a':"b", "Python"]
a 48
b 28
Name: Python, dtype: int32
df2.iloc[0:2,"Python"]
#不行啦,和loc不一样!!!!!
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
in ()
----> 1 df2.iloc[0:2,"Python"]
C:\anaconda\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1365 except (KeyError, IndexError):
1366 pass
-> 1367 return self._getitem_tuple(key)
1368 else:
1369 # we by definition only have the 0th axis
C:\anaconda\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
1735 def _getitem_tuple(self, tup):
1736
-> 1737 self._has_valid_tuple(tup)
1738 try:
1739 return self._getitem_lowerdim(tup)
C:\anaconda\lib\site-packages\pandas\core\indexing.py in _has_valid_tuple(self, key)
205 raise ValueError("Location based indexing can only have "
206 "[{types}] types"
--> 207 .format(types=self._valid_types))
208
209 def _should_validate_iterable(self, axis=None):
ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types
# Assignment: update a single cell with one .loc[row, column] call.
# The chained form df2["Python"]["b"] += 50 first extracts a column and then
# writes into that intermediate object; pandas may warn about it
# (SettingWithCopyWarning) and the write is not guaranteed to reach df2.
df2.loc["b", "Python"] += 50
df2
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 48 | 98 | 37 | 124 |
b | 78 | 71 | 23 | 148 |
c | 68 | 66 | 127 | 13 |
d | 70 | 28 | 74 | 83 |
df2.loc["a":"c", "Java"] += 20
df2
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 48 | 138 | 37 | 124 |
b | 78 | 111 | 23 | 148 |
c | 68 | 106 | 127 | 13 |
d | 70 | 28 | 74 | 83 |
【注意】
直接用中括号时:
- 索引表示的是列索引
- 切片表示的是行切片
df2["a":"b"]
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 48 | 138 | 37 | 124 |
b | 78 | 111 | 23 | 148 |
df2["Python"]
a 48
b 78
c 68
d 70
Name: Python, dtype: int32
============================================
练习5:
使用多种方法对练习4创建的df进行索引和切片,并比较其中的区别
============================================
3)DataFrame的运算
(1) DataFrame之间的运算
同Series一样:
- 在运算中自动对齐不同索引的数据
- 如果索引不对应,则补NaN
df1
Python | 数学 | 语文 | 英语 | |
---|---|---|---|---|
a | 99 | 120 | 98 | NaN |
b | 101 | 136 | 78 | NaN |
c | 120 | 141 | 99 | NaN |
d | 98 | 123 | 101 | NaN |
df2
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 48 | 138 | 37 | 124 |
b | 78 | 111 | 23 | 148 |
c | 68 | 106 | 127 | 13 |
d | 70 | 28 | 74 | 83 |
df1 + df2
Html | Java | PHP | Python | 数学 | 英语 | 语文 | |
---|---|---|---|---|---|---|---|
a | NaN | NaN | NaN | 147 | NaN | NaN | NaN |
b | NaN | NaN | NaN | 179 | NaN | NaN | NaN |
c | NaN | NaN | NaN | 188 | NaN | NaN | NaN |
d | NaN | NaN | NaN | 168 | NaN | NaN | NaN |
df1.add(df2, fill_value = 0)
Html | Java | PHP | Python | 数学 | 英语 | 语文 | |
---|---|---|---|---|---|---|---|
a | 125.0 | 139.0 | 38.0 | 147 | 121.0 | NaN | 99.0 |
b | 149.0 | 112.0 | 24.0 | 179 | 137.0 | NaN | 79.0 |
c | 14.0 | 107.0 | 128.0 | 188 | 142.0 | NaN | 100.0 |
d | 84.0 | 29.0 | 75.0 | 168 | 124.0 | NaN | 102.0 |
df1 = DataFrame(np.random.randint(0,150,size = (4,2)),
index = list("cdef"),
columns = ["Python","Java"])
df1
Python | Java | |
---|---|---|
c | 82 | 122 |
d | 128 | 130 |
e | 15 | 126 |
f | 74 | 133 |
df2
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 48 | 138 | 37 | 124 |
b | 78 | 111 | 23 | 148 |
c | 68 | 106 | 127 | 13 |
d | 70 | 28 | 74 | 83 |
df1.add(df2, axis = "index", fill_value = 0)
#注意:两个DataFrame相加时行索引和列索引都会自动对齐,所以这里传入的axis参数看不出任何效果;axis只有在DataFrame与Series运算时才起作用
Html | Java | PHP | Python | |
---|---|---|---|---|
a | 124.0 | 138.0 | 37.0 | 48.0 |
b | 148.0 | 111.0 | 23.0 | 78.0 |
c | 13.0 | 228.0 | 127.0 | 150.0 |
d | 83.0 | 158.0 | 74.0 | 198.0 |
e | NaN | 126.0 | NaN | 15.0 |
f | NaN | 133.0 | NaN | 74.0 |
创建DataFrame df1 不同人员的各科目成绩,月考一
创建DataFrame df2 不同人员的各科目成绩,月考二
有新学生转入
下面是Python 操作符与pandas操作函数的对应表:
Python Operator | Pandas Method(s) |
---|---|
+ |
add() |
- |
sub() , subtract() |
* |
mul() , multiply() |
/ |
truediv() , div() , divide() |
// |
floordiv() |
% |
mod() |
** |
pow() |
(2) Series与DataFrame之间的运算
【重要】
-
使用Python操作符:以行为单位操作(参数必须是行),对所有行都有效。(类似于numpy中二维数组与一维数组的运算,但可能出现NaN)
-
使用pandas操作函数:
axis=0:以列为单位操作(参数必须是列),对所有列都有效。 axis=1:以行为单位操作(参数必须是行),对所有行都有效。
df2
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 48 | 138 | 37 | 124 |
b | 78 | 111 | 23 | 148 |
c | 68 | 106 | 127 | 13 |
d | 70 | 28 | 74 | 83 |
s_row = df2.loc['c']
s_row
#Series
Python 68
Java 106
PHP 127
Html 13
Name: c, dtype: int32
s_columns = df2["Python"]
s_columns
a 48
b 78
c 68
d 70
Name: Python, dtype: int32
df2
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 48 | 138 | 37 | 124 |
b | 78 | 111 | 23 | 148 |
c | 68 | 106 | 127 | 13 |
d | 70 | 28 | 74 | 83 |
df2.add(s_columns,axis = 0)
#s_columns
#a 48
# b 78
# c 68
# d 70
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 96 | 186 | 85 | 172 |
b | 156 | 189 | 101 | 226 |
c | 136 | 174 | 195 | 81 |
d | 140 | 98 | 144 | 153 |
df2.add(s_columns,axis = "index")
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 96 | 186 | 85 | 172 |
b | 156 | 189 | 101 | 226 |
c | 136 | 174 | 195 | 81 |
d | 140 | 98 | 144 | 153 |
df2.add(s_row,axis = "columns")
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 116 | 244 | 164 | 137 |
b | 146 | 217 | 150 | 161 |
c | 136 | 212 | 254 | 26 |
d | 138 | 134 | 201 | 96 |
df2 + s_row
Python | Java | PHP | Html | |
---|---|---|---|---|
a | 116 | 244 | 164 | 137 |
b | 146 | 217 | 150 | 161 |
c | 136 | 212 | 254 | 26 |
d | 138 | 134 | 201 | 96 |
#DataFrame和Series进行运算的时候要严格注意 axis
#Series是一维的数据,DataFrame是二维的数据;如果Series的索引和axis指定方向上的索引对不齐,结果里就会出现NaN
============================================
练习6:
-
假设练习4创建的df是期中考试成绩,ddd2是期末考试成绩,请自由创建ddd2,并将其与df相加,求期中期末平均值。
-
假设张三期中考试数学被发现作弊,要记为0分,如何实现?
-
李四因为举报张三作弊立功,期中考试所有科目加100分,如何实现?
-
后来老师发现有一道题出错了,为了安抚学生情绪,给每位学生每个科目都加10分,如何实现?
============================================