对《利用Python进行数据分析》(Wes McKinney著)一书第四章“NumPy基础”进行代码实验。原书采用的是Python 2.7,而我采用的是Python 3.7,并在PyCharm中调试,因此对书中源代码进行了一定的修改。每步打印结果(除“随机”相关外)均与原文校验对照一致(输出结果写在注释中,简单的输出就没写结果)。代码全部手工敲写,供参考。
PDF文档和数据集参见:《利用Python进行数据分析》第二章:引言中的分析代码(含PDF和数据集下载链接)。
因为代码过长,放在一个代码段中显得冗长,因此进行了拆分;下面引入的库在每个代码段中都可能需要重新引入。
# -*- coding:utf-8 -*-
import numpy as np
from numpy.random import randn
# Build a 2x3 array directly from a nested list and try element-wise maths.
data = np.array([
    [0.9526, -0.246, -0.8856],
    [0.5639, 0.22379, 0.9104],
])
print(data)
print(data * 10)    # every element is multiplied by the scalar
print(data + data)  # element-wise sum of two equally shaped arrays
print(data.shape)   # dimensions of the array
print(data.dtype)   # element type of the array
# Source lists: a flat one that mixes ints with a float, and a nested one
# made of two equal-length rows.
data1 = [6, 7.5, 8, 0, 1]
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]

arr1 = np.array(data1)
arr2 = np.array(data2)

print(arr1)        # [6.  7.5 8.  0.  1. ]
print(arr2)
print(arr2.ndim)   # 2
print(arr2.shape)  # (2, 4)
print(arr1.dtype)  # float64
print(arr2.dtype)  # int32 in the author's run; presumably platform dependent - TODO confirm
print(np.zeros(10))  # [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

# The shape is one tuple argument; do not write np.zeros(3, 6).
print(np.zeros((3, 6)))
# [[0. 0. 0. 0. 0. 0.]
#  [0. 0. 0. 0. 0. 0.]
#  [0. 0. 0. 0. 0. 0.]]

# np.empty returns uninitialised storage, so the printed numbers are garbage
# (values such as 6.23042070e-307 in the author's run).
print(np.empty((2, 3, 2)))

print(np.arange(15))  # [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
# Ask for a specific element type when the array is created.
arr1 = np.array([1, 2, 3], dtype=np.float64)
arr2 = np.array([1, 2, 3], dtype=np.int32)
print(arr1.dtype)  # float64
print(arr2.dtype)  # int32

# astype converts to another dtype and returns the converted array.
arr = np.array([1, 2, 3, 4, 5])
print(arr.dtype)  # int32 in the author's run
float_arr = arr.astype(np.float64)
print(float_arr.dtype)  # float64

# Converting floats to integers drops the fractional part.
arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])
print(arr)
print(arr.astype(np.int32))  # [ 3 -1 -2  0 12 10]

# Byte strings that spell numbers can be converted to floats.
# np.bytes_ is used because the old alias np.string_ was removed in NumPy 2.0.
numeric_strings = np.array(["1.25", "-9.6", "42"], dtype=np.bytes_)
print(numeric_strings.astype(float))  # [ 1.25 -9.6  42.  ]

# np.empty does not initialise its storage, so the printed values are
# arbitrary (the author happened to see [1 2 3 4 5 6 7 8]).
empty_unit32 = np.empty(8, dtype='u4')
print(empty_unit32)
# Arithmetic between equally shaped arrays, or an array and a scalar,
# is applied element by element.
arr = np.array([
    [1.0, 2.0, 3.0],
    [4.0, 5.0, 6.0],
])
print(arr)

print(arr * arr)
# [[ 1.  4.  9.]
#  [16. 25. 36.]]

print(arr - arr)

print(1 / arr)
# [[1.         0.5        0.33333333]
#  [0.25       0.2        0.16666667]]

print(arr ** 0.5)
# [[1.         1.41421356 1.73205081]
#  [2.         2.23606798 2.44948974]]
# Basic indexing and slicing of a one-dimensional array.
arr = np.arange(10)
print(arr[5])
print(arr[5:8])
arr[5:8] = 12  # assigning a scalar to a slice sets every element of the slice
print(arr)  # -> [ 0  1  2  3  4 12 12 12  8  9]
# Writing through arr_slice changes arr itself, as the next two prints show.
arr_slice = arr[5:8]
arr_slice[1] = 12345
print(arr)  # [    0     1     2     3     4    12 12345    12     8     9]
arr_slice[:] = 64
print(arr)  # [ 0  1  2  3  4 64 64 64  8  9]
# Indexing a 2-D array: one index selects a row; two indices select a scalar.
arr2d = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
print(arr2d[2])     # [7 8 9]
print(arr2d[0][2])  # 3
print(arr2d[0, 2])  # 3 - same element, comma-separated form

# A 2x2x3 array: indexing the first axis yields a 2x3 sub-array.
arr3d = np.array([
    [[1, 2, 3], [4, 5, 6]],
    [[7, 8, 9], [10, 11, 12]],
])
print(arr3d)
print(arr3d.shape)  # (2, 2, 3)
print(arr3d[0])

# Keep a copy of the first sub-array, overwrite it with a scalar, then restore.
old_values = arr3d[0].copy()
arr3d[0] = 42
print(arr3d)
# [[[42 42 42]
#   [42 42 42]]
#  [[ 7  8  9]
#   [10 11 12]]]
arr3d[0] = old_values
print(arr3d)
# Slice indexing. This section reuses two arrays built earlier in the file:
#   arr   -> [ 0  1  2  3  4 64 64 64  8  9]  (left over from the basic indexing section)
#   arr2d -> np.array([[1,2,3],[4,5,6],[7,8,9]])
print(arr[1:6])  # -> [ 1  2  3  4 64]
print(arr2d[:2])  # -> [[1 2 3], [4 5 6]]
print(arr2d[:2, 1:])  # -> [[2 3], [5 6]]
print(arr2d[2, :1])  # -> [7]  (a slice keeps the axis, so this is a 1-element array)
print(arr2d[1, :2])  # -> [4 5]
print(arr2d[:, :1])  # a bare colon selects the whole axis -> [[1], [4], [7]]
arr2d[:2, 1:] = 0  # assigning to a slice expression writes into the selected region
print(arr2d)  # -> [[1 0 0], [4 0 0], [7 8 9]]
# Boolean indexing: one name per row of a random 7x4 table.
names = np.array(
    ['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe']
)
data = randn(7, 4)
print(data)  # random values, different on every run

# Comparing the array with a string yields one boolean per element.
print(names == 'Bob')  # [ True False False  True False False False]

# The boolean array selects rows 0 and 3 (the 'Bob' rows).
print(data[names == 'Bob'])
# It can be combined with a column slice or a single column index.
print(data[names == 'Bob', 2:])  # last two columns of the 'Bob' rows
print(data[names == 'Bob', 3])   # column 3 of the 'Bob' rows

print(names != 'Bob')  # [False  True  True False  True  True  True]
# In Python 3 a boolean array is negated with "~" rather than "-";
# the selection is the same as with != above (the five non-'Bob' rows).
print(data[~(names == 'Bob')])

# Conditions are combined with | (or); rows 0, 2, 3 and 4 are selected.
mask = (names == 'Bob') | (names == 'Will')
print(mask)  # [ True False  True  True  True False False]
print(data[mask])

# Set every negative entry to 0.
data[data < 0] = 0
print(data)

# A one-dimensional boolean array assigns to whole rows:
# every row whose name is not 'Joe' becomes 7.
data[names != 'Joe'] = 7
print(data)
# Fancy indexing: fill an 8x4 array so that every row holds its own index.
arr = np.empty((8, 4))
for i in range(8):
    arr[i] = i  # the scalar is written to all four columns of row i
print(arr)
# [[0. 0. 0. 0.]
#  [1. 1. 1. 1.]
#  [2. 2. 2. 2.]
#  [3. 3. 3. 3.]
#  [4. 4. 4. 4.]
#  [5. 5. 5. 5.]
#  [6. 6. 6. 6.]
#  [7. 7. 7. 7.]]

# A list of row numbers picks those rows in the order given.
print(arr[[4, 3, 0, 6]])
# [[4. 4. 4. 4.]
#  [3. 3. 3. 3.]
#  [0. 0. 0. 0.]
#  [6. 6. 6. 6.]]

# Negative numbers count from the end.
print(arr[[-3, -5, -7]])
# [[5. 5. 5. 5.]
#  [3. 3. 3. 3.]
#  [1. 1. 1. 1.]]
# Fancy indexing with two index lists on an 8x4 array holding 0..31.
arr = np.arange(32).reshape(8, 4)
print(arr)
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]
#  [12 13 14 15]
#  [16 17 18 19]
#  [20 21 22 23]
#  [24 25 26 27]
#  [28 29 30 31]]

# Two lists are paired up: elements (1,0), (5,3), (7,1) and (2,2).
print(arr[[1, 5, 7, 2], [0, 3, 1, 2]])  # [ 4 23 29 10]

# To get the rectangular region instead, pick the rows first and then
# reorder the columns ...
print(arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]])
# [[ 4  7  5  6]
#  [20 23 21 22]
#  [28 31 29 30]
#  [ 8 11  9 10]]

# ... or let np.ix_ build the indexer, which gives the same result.
print(arr[np.ix_([1, 5, 7, 2], [0, 3, 1, 2])])
# [[ 4  7  5  6]
#  [20 23 21 22]
#  [28 31 29 30]
#  [ 8 11  9 10]]
# Transposing a 3x5 array.
arr = np.arange(15).reshape(3, 5)
print(arr)
# [[ 0  1  2  3  4]
#  [ 5  6  7  8  9]
#  [10 11 12 13 14]]
print(arr.T)  # transposed array
# [[ 0  5 10]
#  [ 1  6 11]
#  [ 2  7 12]
#  [ 3  8 13]
#  [ 4  9 14]]

# Inner product of a random 6x3 matrix with itself (3x3 result).
arr = np.random.randn(6, 3)
print(np.dot(arr.T, arr))

arr = np.arange(16).reshape(2, 2, 4)
print(arr)
# [[[ 0  1  2  3]
#   [ 4  5  6  7]]
#  [[ 8  9 10 11]
#   [12 13 14 15]]]

# For higher-dimensional arrays transpose takes a tuple of axis numbers;
# (1, 0, 2) exchanges the first two axes and keeps the last one in place.
print(arr.transpose((1, 0, 2)))
# [[[ 0  1  2  3]
#   [ 8  9 10 11]]
#  [[ 4  5  6  7]
#   [12 13 14 15]]]

# swapaxes exchanges just the two axes named.
print(arr.swapaxes(1, 2))
# [[[ 0  4]
#   [ 1  5]
#   [ 2  6]
#   [ 3  7]]
#  [[ 8 12]
#   [ 9 13]
#   [10 14]
#   [11 15]]]
# Universal functions operate element by element.
arr = np.arange(10)

print(np.sqrt(arr))  # square root of every element
# [0.         1.         1.41421356 1.73205081 2.         2.23606798
#  2.44948974 2.64575131 2.82842712 3.        ]

print(np.exp(arr))  # e raised to every element
# [1.00000000e+00 2.71828183e+00 7.38905610e+00 2.00855369e+01
#  5.45981500e+01 1.48413159e+02 4.03428793e+02 1.09663316e+03
#  2.98095799e+03 8.10308393e+03]

# Two random vectors of length 8 (values differ between runs).
x = randn(8)
y = randn(8)
print(x)
print(y)

# Element-wise maximum: each position holds the larger of x[i] and y[i].
print(np.maximum(x, y))

arr = randn(7) * 5
print(arr)

# np.modf returns a tuple of two arrays: the fractional parts and the
# integral parts of the input, e.g. -1.04444876 -> (-0.04444876, -1.).
print(np.modf(arr))
import matplotlib.pyplot as plt

# Evaluate sqrt(x^2 + y^2) on a regular grid and show it as a grey image.
points = np.arange(-5, 5, 0.01)  # 1000 equally spaced points
xs, ys = np.meshgrid(points, points)
z = np.sqrt(xs**2 + ys**2)
print(z)
# [[7.07106781 7.06400028 7.05693985 ... 7.04988652 7.05693985 7.06400028]
#  [7.06400028 7.05692568 7.04985815 ... 7.04279774 7.04985815 7.05692568]
#  [7.05693985 7.04985815 7.04278354 ... 7.03571603 7.04278354 7.04985815]
#  ...
#  [7.04988652 7.04279774 7.03571603 ... 7.0286414  7.03571603 7.04279774]
#  [7.05693985 7.04985815 7.04278354 ... 7.03571603 7.04278354 7.04985815]
#  [7.06400028 7.05692568 7.04985815 ... 7.04279774 7.04985815 7.05692568]]

plt.imshow(z, cmap=plt.cm.gray)
plt.colorbar()
# Title typos fixed ("sqart" / "gid values"); the $...$ part is meant for
# matplotlib's math rendering.
plt.title(r"Image plot of $\sqrt{x^2 + y^2}$ for a grid of values")
plt.show()
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])

# Choose per position: take the value from xarr where cond is true,
# otherwise the value from yarr.
result = [(x if c else y) for x, y, c in zip(xarr, yarr, cond)]
print(result)  # values 1.1, 2.2, 1.3, 1.4, 2.5

# The same selection written with np.where.
result = np.where(cond, xarr, yarr)
print(result)  # [1.1 2.2 1.3 1.4 2.5]

arr = randn(4, 4)
print(arr)  # random values, different on every run

# Replace positive values with 2 and all other values with -2.
print(np.where(arr > 0, 2, -2))

# Replace positive values with 2 and leave the other values untouched.
print(np.where(arr > 0, 2, arr))
cond1 = np.array([True, False, False, True, True, False])
cond2 = np.array([True, True, False, False, False, True])

# Encode each pair of flags as a number with an explicit loop:
# both true -> 0, only cond1 -> 1, only cond2 -> 2, neither -> 3.
result = []
for c1, c2 in zip(cond1, cond2):
    if c1 and c2:
        result.append(0)
    elif c1:
        result.append(1)
    elif c2:
        result.append(2)
    else:
        result.append(3)
print(result)  # [0, 2, 3, 1, 1, 2]

# Nested np.where calls express the same if/elif/else chain.
print(np.where(cond1 & cond2, 0,
               np.where(cond1, 1,
                        np.where(cond2, 2, 3))))  # [0 2 3 1 1 2]
# Aggregations on a random 5x4 array (printed numbers differ between runs).
arr = np.random.randn(5, 4)
print(arr)
print(arr.mean())    # mean of all elements (method form)
print(np.mean(arr))  # same value through the top-level function
print(arr.sum())     # sum of all elements
print(arr.mean(axis=1))  # five values: one mean for each row
print(arr.sum(0))        # four values: one sum for each column

arr = np.array([
    [0, 1, 2],
    [3, 4, 5],
    [6, 7, 8],
])
print(arr)
print(arr.cumsum(0))   # running sum along axis 0
print(arr.cumprod(1))  # running product along axis 1

# Booleans count as 1/0, so summing a comparison counts the matches.
arr = randn(100)
print((arr > 0).sum())  # number of positive values

bools = np.array([False, False, True, False])
print(bools.any())  # True  - at least one element is True
print(bools.all())  # False - not every element is True
# In-place sorting (random inputs, so printed values differ between runs).
arr = randn(8)
print(arr)
# arr.sort() reorders the array itself, whereas sorted(arr) leaves it alone
# and returns a new list.
arr.sort()
print(arr)  # ascending order

arr = randn(5, 3)
print(arr)
arr.sort(axis=1)  # sort along one axis: here the values inside each row
print(arr)

# Read an empirical quantile off a sorted sample.
large_arr = randn(1000)
large_arr.sort()
print(large_arr[int(0.05 * len(large_arr))])  # 5% quantile
# Set-style operations on arrays.
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
print(np.unique(names))  # ['Bob' 'Joe' 'Will']
ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])
print(np.unique(ints))  # [1 2 3 4]
print(sorted(set(names)))  # pure-Python counterpart: Bob, Joe, Will

# Membership test: one boolean per element of `values`.
# np.isin replaces np.in1d, which is deprecated in NumPy 2.0.
values = np.array([6, 0, 0, 3, 2, 5, 6])
vaules = values  # alias kept for the original (misspelled) variable name
print(np.isin(values, [2, 3, 6]))  # [ True False False  True  True False  True]
arr = np.arange(10)

# np.save appends the .npy extension when the name does not have it.
np.save("some_array", arr)
ret = np.load("some_array.npy")
print(ret)  # [0 1 2 3 4 5 6 7 8 9]

# np.savez stores several arrays in one archive; each keyword argument
# becomes the key under which that array is saved.
np.savez("array_achive.npz", a=arr, b=arr)
# Loading an .npz keeps the archive file open, so use it as a context
# manager to make sure it is closed again.
with np.load("array_achive.npz") as arch:
    print(arch['b'])  # [0 1 2 3 4 5 6 7 8 9]
# Read a comma-separated text file into a 2-D float array.
arr = np.loadtxt(
    "python_data/ch04/array_ex.txt",
    delimiter=',',
)
print(arr)
# Output with the book's data file:
# [[ 0.580052  0.18673   1.040717  1.134411]
#  [ 0.194163 -0.636917 -0.938659  0.124094]
#  [-0.12641   0.268607 -0.695724  0.047428]
#  [-1.484413  0.004176 -0.744203  0.005487]
#  [ 2.302869  0.200131  1.670238 -1.88109 ]
#  [-0.19323   1.047233  0.482803  0.960334]]
# Matrix multiplication of a 2x3 matrix with a 3x2 matrix.
x = np.array([
    [1., 2., 3.],
    [4., 5., 6.],
])
y = np.array([
    [6., 23.],
    [-1, 7],
    [8, 9],
])
print(x)
# [[1. 2. 3.]
#  [4. 5. 6.]]
print(y)
# [[ 6. 23.]
#  [-1.  7.]
#  [ 8.  9.]]

print(x.dot(y))  # equivalent to np.dot(x, y)
# [[ 28.  64.]
#  [ 67. 181.]]

# Multiplying by a vector of ones adds up each row.
print(np.dot(x, np.ones(3)))  # [ 6. 15.]
from numpy.linalg import inv, qr

# Form X^T X from a random 5x5 matrix, then invert and factorise it.
X = randn(5, 5)
mat = X.T.dot(X)
print(inv(mat))          # matrix inverse
t = mat.dot(inv(mat))    # matrix times its inverse (not printed)
q, r = qr(mat)           # QR decomposition
from random import normalvariate

# A 4x4 sample from the standard normal distribution.
samples = np.random.normal(size=(4, 4))
print(samples)

# Draw N normal variates twice: one at a time with the standard library
# (Python 3 has no xrange, so range is used), then in one NumPy call.
N = 100000
samples = [normalvariate(0, 1) for _ in range(N)]
ret = np.random.normal(size=N)
import random

# Random walk in plain Python: start at 0 and move +1 or -1 with equal
# probability on every step, recording the position after each step.
position = 0
walk = [position]
steps = 1000
for _ in range(steps):
    step = 1 if random.randint(0, 1) else -1
    position += step
    walk.append(position)

# `plt` is matplotlib.pyplot, imported earlier in this file.
plt.plot(walk)
plt.title("Random walk with +1/-1 steps")
plt.show()
# The same walk, vectorised: draw all coin flips at once, map them to
# +1/-1 steps and accumulate.
nsteps = 1000
draws = np.random.randint(0, 2, size=nsteps)
steps = np.where(draws > 0, 1, -1)  # 1 -> +1, 0 -> -1
walk = np.cumsum(steps)
print(walk)
print(walk.min())
print(walk.max())
# Index of the first step at which the walk is more than 10 away from the
# origin. argmax yields 0 when no element is True, so 0 is also what is
# printed if the walk never gets that far.
print((np.abs(walk) > 10).argmax())
# Simulate many random walks at once: one walk per row.
nwalks = 5000
nsteps = 1000
draws = np.random.randint(0, 2, size=(nwalks, nsteps))  # each draw is 0 or 1
steps = np.where(draws > 0, 1, -1)
walks = steps.cumsum(1)  # accumulate along each row
print(walks)

# Extremes over all simulated walks (the 2-D `walks`, not a single walk).
print(walks.max())
print(walks.min())

# Which walks ever reach a distance of 30 from the origin? The same ">= 30"
# test is used here and for the crossing time below so the two agree.
hits30 = (np.abs(walks) >= 30).any(1)
print(hits30)
# Number of walks that reached 30.
print(hits30.sum())

# For the walks that did reach 30 (in absolute value), argmax along the row
# gives the index of the first step at which that happened.
crossing_times = (np.abs(walks[hits30]) >= 30).argmax(1)
print(crossing_times.mean())