本文为《利用Python进行数据分析》的部分读书笔记
import numpy as np
ndarray包含的每一个元素均为相同类型
numpy 支持的数据类型比 Python 内置的类型要多很多,基本上可以和 C 语言的数据类型对应上,其中部分类型对应为 Python 内置的类型.
下表给出部分常见的数据类型
类型 | 类型代码 |
---|---|
int8, uint8 | i1, u1 |
int16, uint16 | i2, u2 |
int32, uint32 | i4, u4 |
int64, uint64 | i8, u8 |
float16 | f2 |
float32 | f4 / f |
float64 | f8 / d |
bool | ? |
ndarray.astype方法可以显式地转换数组的数据类型。使用astype时总是生成一个新的数组,即使你传入的dtype与之前一样:
>>> arr = np.arange(10)
>>> arr.dtype
dtype('int32')
>>> float_arr = arr.astype(np.float64)
>>> float_arr.dtype
dtype('float64')
>>> float_arr = arr.astype('f8')
>>> float_arr.dtype
dtype('float64')
>>> np.array(['1.23', '3.3']).astype('f4')
array([1.23, 3.3 ], dtype=float32)
>>> arr1 = np.array(range(10))
>>> arr1
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
>>> arr2 = np.array([[1, 2], [3, 4]])
>>> arr2
array([[1, 2],
[3, 4]])
>>> arr2 = np.array([[1, 2], [3, 4]], dtype=np.float64)
>>> arr2
array([[1., 2.],
[3., 4.]])
>>> np.arange(10)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
>>> np.arange(0, 12, 4)
array([0, 4, 8])
>>> np.linspace(0, 12, 4)
array([ 0., 4., 8., 12.])
>>> a
array([[0, 1, 0, 1],
[0, 0, 1, 1],
[0, 1, 0, 1],
[0, 1, 0, 0]])
>>> b=np.asarray(a)
>>> b
array([[0, 1, 0, 1],
[0, 0, 1, 1],
[0, 1, 0, 1],
[0, 1, 0, 0]])
>>> b[0,0]=1
>>> a
array([[1, 1, 0, 1],
[0, 0, 1, 1],
[0, 1, 0, 1],
[0, 1, 0, 0]])
>>> np.asarray([1,2])
array([1, 2])
>>> np.asarray([1,2])
array([1, 2])
>>> np.ones(10)
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
>>> np.ones((3, 6))
array([[1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1.]])
>>> np.full(5, 7)
array([7, 7, 7, 7, 7])
>>> np.full((2,2), 7)
array([[7, 7],
[7, 7]])
>>> np.ones_like(a)
array([[1, 1, 1, 1],
[1, 1, 1, 1],
[1, 1, 1, 1],
[1, 1, 1, 1]])
>>> np.full_like(a, 9)
array([[9, 9, 9, 9],
[9, 9, 9, 9],
[9, 9, 9, 9],
[9, 9, 9, 9]])
>>> np.eye(3)
array([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.]])
>>> np.identity(3)
array([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.]])
>>> arr = np.arange(10).reshape(2,5)
>>> arr
array([[0, 1, 2, 3, 4],
[5, 6, 7, 8, 9]])
>>> arr * arr
array([[ 0, 1, 4, 9, 16],
[25, 36, 49, 64, 81]])
>>> arr - arr
array([[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]])
>>> arr * 8
array([[ 0, 8, 16, 24, 32],
[40, 48, 56, 64, 72]])
>>> arr ** 0.5
array([[0. , 1. , 1.41421356, 1.73205081, 2. ],
[2.23606798, 2.44948974, 2.64575131, 2.82842712, 3. ]])
>>> arr1
array([[0, 1, 2, 3, 4],
[5, 6, 7, 8, 9]])
>>> arr2
array([[0, 1, 3, 4, 4],
[5, 6, 7, 7, 7]])
>>> arr1 == arr2
array([[ True, True, False, False, True],
[ True, True, True, False, False]])
>>> arr = np.arange(10)
>>> arr[5: 8] = 12
>>> arr
array([ 0, 1, 2, 3, 4, 12, 12, 12, 8, 9])
>>> arr_slice = arr[5: 8]
>>> arr_slice
array([12, 12, 12])
>>> arr_slice[1] = 10
>>> arr
array([ 0, 1, 2, 3, 4, 12, 10, 12, 8, 9])
>>> arr_slice[:] = 13
>>> arr
array([ 0, 1, 2, 3, 4, 13, 13, 13, 8, 9])
>>> arr
array([[0, 1, 2, 3, 4],
[5, 6, 7, 8, 9]])
>>> arr[0][2]
2
>>> arr[0, 2]
2
>>> arr[0] = 5
>>> arr
array([[5, 5, 5, 5, 5],
[5, 6, 7, 8, 9]])
>>> arr2d
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
>>> arr2d[:2] #沿着轴0进行切片
array([[1, 2, 3],
[4, 5, 6]])
>>> arr2d[:2, 1:] #进行多组切片
array([[2, 3],
[5, 6]])
>>> arr2d[1, 1:]
array([5, 6])
>>> arr2d[:, 1:]
array([[2, 3],
[5, 6],
[8, 9]])
>>> a
array([0, 1, 2, 3, 4, 5, 6])
>>> data = np.random.randn(7, 4)
>>> data
array([[ 0.61831111, -0.90884626, -0.10049213, 0.78700141],
[-1.45371672, -1.26565266, -0.97361111, -1.82156543],
[ 0.33910806, -0.25890023, -0.42405515, -0.60169062],
[ 0.34129922, 0.16233639, -0.10267673, 0.11954078],
[-0.3281058 , -0.34615303, -0.64944325, -0.64894346],
[-0.41454747, -0.50209097, -0.15752718, 1.61812454],
[ 0.88890955, 1.56256735, 0.76624777, 0.70458894]])
>>> a > 3
array([False, False, False, False, True, True, True])
>>> data[a > 3]
array([[-0.3281058 , -0.34615303, -0.64944325, -0.64894346],
[-0.41454747, -0.50209097, -0.15752718, 1.61812454],
[ 0.88890955, 1.56256735, 0.76624777, 0.70458894]])
>>> data[a > 3, 2:]
array([[-0.64944325, -0.64894346],
[-0.15752718, 1.61812454],
[ 0.76624777, 0.70458894]])
>>> data[data < 0] = 0
>>> data
array([[0.61831111, 0. , 0. , 0.78700141],
[0. , 0. , 0. , 0. ],
[0.33910806, 0. , 0. , 0. ],
[0.34129922, 0.16233639, 0. , 0.11954078],
[0. , 0. , 0. , 0. ],
[0. , 0. , 0. , 1.61812454],
[0.88890955, 1.56256735, 0.76624777, 0.70458894]])
>>> data[~(a > 3)] #对条件取反
array([[ 0.61831111, -0.90884626, -0.10049213, 0.78700141],
[-1.45371672, -1.26565266, -0.97361111, -1.82156543],
[ 0.33910806, -0.25890023, -0.42405515, -0.60169062],
[ 0.34129922, 0.16233639, -0.10267673, 0.11954078]])
>>> mask = (a < 3) | (a > 5)
>>> mask
array([ True, True, True, False, False, False, True])
>>> mask = (a > 2) & (a < 4)
>>> mask
array([False, False, False, True, False, False, False])
>>> arr
array([[0., 0., 0., 0.],
[1., 1., 1., 1.],
[2., 2., 2., 2.],
[3., 3., 3., 3.],
[4., 4., 4., 4.],
[5., 5., 5., 5.],
[6., 6., 6., 6.],
[7., 7., 7., 7.]])
>>> arr[[4, 3, 0, 6]]
array([[4., 4., 4., 4.],
[3., 3., 3., 3.],
[0., 0., 0., 0.],
[6., 6., 6., 6.]])
>>> arr[[-3, -5, -7]]
array([[5., 5., 5., 5.],
[3., 3., 3., 3.],
[1., 1., 1., 1.]])
>>> arr=np.arange(32).reshape(8,4)
>>> arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11],
[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23],
[24, 25, 26, 27],
[28, 29, 30, 31]])
>>> arr[:, [0,3,1,2]]
array([[ 0, 3, 1, 2],
[ 4, 7, 5, 6],
[ 8, 11, 9, 10],
[12, 15, 13, 14],
[16, 19, 17, 18],
[20, 23, 21, 22],
[24, 27, 25, 26],
[28, 31, 29, 30]])
>>> arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11],
[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23],
[24, 25, 26, 27],
[28, 29, 30, 31]])
>>> arr[[1,5,7,-2],[0,3,1,2]]
array([ 4, 23, 29, 26])
>>> arr
array([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14]])
>>> arr.T
array([[ 0, 5, 10],
[ 1, 6, 11],
[ 2, 7, 12],
[ 3, 8, 13],
[ 4, 9, 14]])
>>> arr.T[0,0]=1
>>> arr
array([[ 1, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14]])
>>> arr
array([[ 1, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14]])
>>> arr.transpose((1,0))
array([[ 1, 5, 10],
[ 1, 6, 11],
[ 2, 7, 12],
[ 3, 8, 13],
[ 4, 9, 14]])
>>> arr.transpose((1,0))[0,0] = 2
>>> arr
array([[ 2, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14]])
通用函数就是对一些简单函数的向量化封装,通常比纯Python的等价实现快上一到两个数量级。
函数名 | 描述 |
---|---|
np.abs | |
np.sqrt | |
np.square | |
np.exp | $e^x$ |
np.abs | |
np.sin, np.cos | |
np.log | |
np.std | |
np.mean | |
np.sort | 返回已经排序好的数组的拷贝 |
np.unique | 返回数组中唯一值排序后形成的数组 |
np.ceil | 返回大于等于输入的最小整数 |
np.pad | 对数组进行填补(可用于卷积层的计算中) |
函数名 | 描述 |
---|---|
np.maximum | 逐个元素计算最大值 |
np.minimum | 逐个元素计算最小值 |
np.meshgrid | 接收两个一维数组,根据两个数组的所有(x, y)对生成两个二维矩阵(分别为x坐标和y坐标) |
np.where | 三元表达式x if condition else y的向量化版本 |
np.concatenate, np.vstack, np.hstack | 连接数组 |
np.split | 分隔数组 |
>>> a
array([[1, 2, 3],
[4, 5, 6]])
>>> b
array([[ 7, 8, 9],
[10, 11, 12]])
>>> np.concatenate([a, b], axis=0)
array([[ 1, 2, 3],
[ 4, 5, 6],
[ 7, 8, 9],
[10, 11, 12]])
>>> np.vstack((a, b))
array([[ 1, 2, 3],
[ 4, 5, 6],
[ 7, 8, 9],
[10, 11, 12]])
>>> np.concatenate([a, b], axis=1)
array([[ 1, 2, 3, 7, 8, 9],
[ 4, 5, 6, 10, 11, 12]])
>>> np.hstack((a, b))
array([[ 1, 2, 3, 7, 8, 9],
[ 4, 5, 6, 10, 11, 12]])
>>> arr
array([[ 1.05317489, 0.1942955 ],
[ 1.35835198, -0.28694235],
[ 0.44143825, -0.76987271],
[-0.97456841, 0.70900741],
[-0.10578273, 0.77677573]])
>>> a, b, c = np.split(arr, [1, 3]) # 1,3为数组拆分时的索引位置
>>> a
array([[1.05317489, 0.1942955 ]])
>>> b
array([[ 1.35835198, -0.28694235],
[ 0.44143825, -0.76987271]])
>>> c
array([[-0.97456841, 0.70900741],
[-0.10578273, 0.77677573]])
>>> x = np.random.randn(8)
>>> y = np.random.randn(8)
>>> x
array([ 0.76959912, 1.67965149, -1.32080017, 2.30574518, 0.10331594,
-0.64966056, -1.07463182, 0.94832031])
>>> y
array([ 0.73280851, -0.58093964, 1.03949011, 0.3253899 , 0.97249721,
-0.56702881, 2.50223921, -1.41706575])
>>> np.maximum(x,y)
array([ 0.76959912, 1.67965149, 1.03949011, 2.30574518, 0.97249721,
-0.56702881, 2.50223921, 0.94832031])
# xs和ys两个数组即为生成点的x和y坐标
>>> points = np.arange(5)
>>> xs, ys = np.meshgrid(points, points)
>>> xs
array([[0, 1, 2, 3, 4],
[0, 1, 2, 3, 4],
[0, 1, 2, 3, 4],
[0, 1, 2, 3, 4],
[0, 1, 2, 3, 4]])
>>> ys
array([[0, 0, 0, 0, 0],
[1, 1, 1, 1, 1],
[2, 2, 2, 2, 2],
[3, 3, 3, 3, 3],
[4, 4, 4, 4, 4]])
>>> x
array([1.1, 1.2, 1.3, 1.4, 1.5])
>>> y
array([2.1, 2.2, 2.3, 2.4, 2.5])
>>> cond
array([ True, False, True, True, False])
>>> np.where(cond, x, y)
array([1.1, 2.2, 1.3, 1.4, 2.5])
>>> arr = np.random.randn(4,4)
>>> arr
array([[ 0.95044471, 0.62067361, -0.795527 , 0.35033725],
[-1.27284505, 0.69694069, -0.04915669, 0.39890032],
[ 0.98207569, 0.10176966, -0.71230618, 0.90359928],
[ 0.15749577, 0.33156875, -0.63446873, -0.57748146]])
>>> np.where(arr > 0, 2, -2)
array([[ 2, 2, -2, 2],
[-2, 2, -2, 2],
[ 2, 2, -2, 2],
[ 2, 2, -2, -2]])
>>> np.where(arr > 0, 2, arr) # 将arr中的正值替换为2
array([[ 2. , 2. , -0.795527 , 2. ],
[-1.27284505, 2. , -0.04915669, 2. ],
[ 2. , 2. , -0.71230618, 2. ],
[ 2. , 2. , -0.63446873, -0.57748146]])
方法 | 描述 |
---|---|
arr.sum | |
arr.mean | |
arr.std | |
arr.var | |
arr.min, arr.max | |
arr.argmin, arr.argmax | |
arr.cumsum | 从0开始元素累积和 |
arr.cumprod | 从1开始元素累积积 |
arr.any | 检查数组中是否至少有一个是True |
arr.all | 检查数组中是否每个值都是True |
arr.sort | 对原数组进行原地(in-place)排序,如果想用降序,排序后用arr[::-1]即可 |
arr.reshape | 返回改变形状后的数组;在可能的情况下返回视图而不复制任何数据,否则返回拷贝 |
>>> arr = np.arange(12).reshape((3,4))
>>> arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
>>> other_array = np.arange(12)
>>> arr.reshape(other_array.shape)
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
>>> arr.reshape((4, -1)) # -1表示维度通过数据进行判断
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11]])
>>> arr
array([[-0.01061292, -1.20747302, -0.20352168],
[-1.69323959, -1.23248814, -0.95252552]])
>>> arr.sum()
-5.299860871950473
>>> arr.sum(axis=0)
array([-1.70385251, -2.43996116, -1.1560472 ])
>>> arr.sum(0)
array([-1.70385251, -2.43996116, -1.1560472 ])
# 计算大于5的元素的个数
>>> arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
>>> (arr > 5).sum()
6
>>> arr
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
>>> arr.cumsum()
array([ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45], dtype=int32)
>>> arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
>>> arr.cumsum(axis=0)
array([[ 0, 1, 2, 3],
[ 4, 6, 8, 10],
[12, 15, 18, 21]], dtype=int32)
>>> arr.cumsum(0)
array([[ 0, 1, 2, 3],
[ 4, 6, 8, 10],
[12, 15, 18, 21]], dtype=int32)
函数 | 描述 |
---|---|
np.save | |
np.savez | |
np.savez_compressed | |
np.load |
>>> arr = np.arange(10)
>>> np.save('saved_arr', arr)
>>> np.load('saved_arr.npy')
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
>>> np.savez('saved_array.npz', a=arr, b=arr)
>>> arch = np.load('saved_array.npz')
>>> arch['a'], arch['b']
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
>>> np.savez_compressed('compressed_array.npz', a=arr, b=arr)
函数 | 描述 |
---|---|
np.dot(x, y) / x @ y | 点积 |
np.diag | 将一个方阵的对角元素作为一维数组返回,或者将一维数组转换成一个方阵 |
np.trace | 计算对角元素和 |
np.linalg.det | 计算矩阵行列式 |
np.linalg.eig | 计算方阵的特征值和特征向量 |
np.linalg.inv | 计算方阵的逆矩阵 |
np.linalg.pinv | 计算矩阵的Moore-Penrose伪逆 |
np.linalg.qr | 计算QR分解 |
np.linalg.svd | 计算奇异值分解(SVD) |
np.linalg.solve | 求解线性方程组Ax = b中的x, 其中A是方阵 |
np.linalg.lstsq | 计算Ax = b的最小二乘解 |
函数 | 描述 |
---|---|
np.random.seed | 更改numpy的随机数种子 |
np.random.shuffle | 随机排列一个序列 |
np.random.randn | 从标准正态分布中抽取样本 |
np.random.randint | 根据给定范围抽取随机整数 |
np.random.choice | 从整数或序列中抽样 |
np.random.rand | 从均匀分布中抽取样本 |
np.random.binomial | 从二项分布中抽取样本 |
np.random.normal | 从正态分布中抽取样本 |
np.random.beta | 从beta分布中抽取样本 |
np.random.chisquare | 从卡方分布中抽取样本 |
np.random.gamma | 从gamma分布中抽取样本 |
np.random.uniform | 从均匀分布[0, 1)中抽取样本 |
>>> np.random.seed(1234)
为了避免全局状态,可以使用np.random.RandomState创建一个随机数生成器,使数据独立于其他的随机数状态:
>>> np.random.seed(1234)
>>> rng = np.random.RandomState(1234)
>>> rng.randn(10)
array([ 0.47143516, -1.19097569, 1.43270697, -0.3126519 , -0.72058873,
0.88716294, 0.85958841, -0.6365235 , 0.01569637, -2.24268495])
>>> np.random.normal(size=(2,2))
array([[ 0.40545341, 0.28909194],
[ 1.32115819, -1.54690555]])
>>> np.random.normal(size=2)
array([-0.20264632, -0.65596934])
>>> np.random.randint(0,2,size=10)
array([1, 0, 1, 1, 1, 1, 0, 1, 0, 1])
该函数的解释转自链接
def choice(a, size=None, replace=True, p=None)
表示从a中随机选取size个数
replace 代表的意思是抽样之后还放不放回去,如果是False的话,那么同一次挑选出来的数都不一样,如果是True的话, 有可能会出现重复的,因为前面抽的放回去了。
p表示每个元素被抽取的概率,如果没有指定,a中所有元素被选取的概率是相等的。
Numpy从最后开始往前逐个比较两个数组的维度大小,若两者对应维度相同,或其中一个(或二者都)等于1,则继续比较,直至最前面的维度。若不满足这两个条件,程序报错。之后,广播会在缺失的或长度为1的轴上进行。
>>> arr
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11]])
>>> demeaned = arr.mean(0)
>>> demeaned
array([4.5, 5.5, 6.5])
>>> arr.shape
(4, 3)
>>> demeaned.shape
(3,)
>>> arr - demeaned # 在轴0上进行广播,在每一列减去该列的均值
array([[-4.5, -4.5, -4.5],
[-1.5, -1.5, -1.5],
[ 1.5, 1.5, 1.5],
[ 4.5, 4.5, 4.5]])
>>> arr
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11]])
>>> row_mean = arr.mean(1)
>>> row_mean.shape
(4,)
>>> arr - row_mean # 注意这里(4,3) 和 (4,) 是无法进行广播的
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
ValueError: operands could not be broadcast together with shapes (4,3) (4,)
>>> row_mean = row_mean.reshape(4, 1) # (4,3) 和 (4, 1) 可以进行广播
>>> arr - row_mean
array([[-1., 0., 1.],
[-1., 0., 1.],
[-1., 0., 1.],
[-1., 0., 1.]])
在上述例子中,除了reshape之外,Numpy也提供了一种通过索引插入新轴的特殊语法。使用np.newaxis属性和“完整”切片来插入新轴
>>> arr = np.zeros((4, 4))
>>> arr_3d = arr[:, np.newaxis, :]
>>> arr_3d.shape
(4, 1, 4)
# 在轴2上减去均值
>>> arr = np.arange(60).reshape(3, 4, 5)
>>> means = arr.mean(2)
>>> means.shape
(3, 4)
>>> arr - means[:, :, np.newaxis]
array([[[-2., -1., 0., 1., 2.],
[-2., -1., 0., 1., 2.],
[-2., -1., 0., 1., 2.],
[-2., -1., 0., 1., 2.]],
[[-2., -1., 0., 1., 2.],
[-2., -1., 0., 1., 2.],
[-2., -1., 0., 1., 2.],
[-2., -1., 0., 1., 2.]],
[[-2., -1., 0., 1., 2.],
[-2., -1., 0., 1., 2.],
[-2., -1., 0., 1., 2.],
[-2., -1., 0., 1., 2.]]])
# 推广到在任意轴向上减去平均值
def demean_axis(arr, axis=0):
    """Subtract the mean taken along ``axis`` from ``arr``.

    Args:
        arr: NumPy array of any dimensionality.
        axis: Integer axis along which the mean is computed and removed.
            Negative values count from the last axis. Defaults to 0.

    Returns:
        A new array with the same shape as ``arr`` whose mean along
        ``axis`` is zero (up to floating-point error).
    """
    means = arr.mean(axis)
    # Keep every axis as a full slice, and re-insert a length-1 axis at the
    # reduced position so that ``means`` broadcasts against ``arr``.
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # Multidimensional indexing needs a tuple: a plain list is interpreted as
    # an array index by modern NumPy and raises instead of slicing.
    return arr - means[tuple(indexer)]
以上几个主题先留个位置,等用到的时候再更新吧