python数据分析numpy

numpy

一、numpy创建数组

1. 导numpy包

import numpy as np

2. 用numpy创建数组3种方式

# Way 1: build an array from a Python list.
data = np.array([1, 2, 3])
print(f"data:{data}")	# data:[1 2 3]
print(type(data))		# <class 'numpy.ndarray'>
# Way 2: np.arange(10) produces the integers 0..9.
data1 = np.arange(10)
print(f"data1:{data1}") # data1:[0 1 2 3 4 5 6 7 8 9]
# Way 3: build an array from a range object.
data2 = np.array(range(10))
print(f"data2:{data2}") # data2:[0 1 2 3 4 5 6 7 8 9]

3. numpy数组元素的类型

# 'i4' requests a 4-byte signed integer element type.
data3 = np.array([1, 2, 3], dtype='i4')
print(data3)		# [1 2 3]
print(type(data3))	# <class 'numpy.ndarray'>
print(data3.dtype)	# int32

# 'i1' is a 1-byte signed integer; the converted array is bound to data4.
data4 = data3.astype('i1')	# change the element type
print(data4.dtype)			# int8

详情见 numpy数据类型

int8 ‘i1’ int16 ‘i2’ int32 ‘i4’ int64 ‘i8’
uint8 uint16 uint32 uint64
float16 float32 float64(注意:numpy没有float8类型)

下面虽然不是numpy数据类型,但可以直接填在dtype=右边

int 默认整数类型与平台有关:在Windows(NumPy 2.0之前)上为int32,即’i4’;在64位Linux/macOS以及NumPy 2.0及之后的64位平台上为int64,即’i8’
float 默认为float64,即’f8’
bool 列表中元素0为False,其它为True

4. numpy.round()用法

import random  # random.random() is used below, so the stdlib module must be in scope

# Nine random floats in [0, 1); the values differ on every run.
data5 = np.array([random.random() for _ in range(9)])
print(data5)
# e.g. [0.6853715, 0.40404027, 0.7960919, 0.1866534, 0.59791729, 0.64194195, 0.57661904, 0.41688956, 0.02173108]
print(data5.dtype)	# float64

# 1. ndarray.round(): round to 2 decimals through the array method
data6 = data5.round(2)
print(data6)	# e.g. [0.69, 0.4, 0.8, 0.19, 0.6, 0.64, 0.58, 0.42, 0.02]

# 2. np.round(): the module-level function gives the same result
data7 = np.round(data5, 2)
print(data7)	# e.g. [0.69, 0.4, 0.8, 0.19, 0.6, 0.64, 0.58, 0.42, 0.02]

二、shape()和reshape()用法

1. numpy二维数组

import numpy as np

# 24 consecutive integers arranged as 4 rows x 6 columns.
t1 = np.arange(24).reshape(4, 6)
"""     [[0, 1, 2, 3, 4, 5], 
         [6, 7, 8, 9, 10, 11],
         [12, 13, 14, 15, 16, 17],
         [18, 19, 20, 21, 22, 23]]      """
print(np.shape(t1))		# (4, 6)

# A 1-D array of length 6 is added to every row of t1.
t2 = np.array(range(6))		# [0, 1, 2, 3, 4, 5]
print(t1 + t2)
"""		[[ 0  2  4  6  8 10]
 		 [ 6  8 10 12 14 16]    第1列加0,第2列加1,第3列加2...
 		 [12 14 16 18 20 22]
 		 [18 20 22 24 26 28]]	"""

# A (1, 6) array behaves the same way as the 1-D array above.
t3 = np.arange(6).reshape(1, 6)  # [[0, 1, 2, 3, 4, 5]]
print(t1 + t3)
"""		[[ 0,  2,  4,  6,  8, 10],
       	 [ 6,  8, 10, 12, 14, 16],   跟上面一样的结果
         [12, 14, 16, 18, 20, 22],
         [18, 20, 22, 24, 26, 28]]	"""

# A (4, 1) array is added to every column of t1, one value per row.
t4 = np.arange(4).reshape(4, 1)
"""     [[0],
         [1],
         [2],
         [3]]    """
print(t1 + t4)
"""		[[ 0  1  2  3  4  5]
 		 [ 7  8  9 10 11 12]    第1行加0,第2行加1,第3行加2... 
 		 [14 15 16 17 18 19]
 		 [21 22 23 24 25 26]]	"""

2. numpy三维数组

# Base 3-D array of shape (3, 3, 2).
t5 = np.arange(18).reshape(3, 3, 2)

t6 = np.arange(9).reshape(3, 3, 1)
t7 = np.arange(3).reshape(3, 1, 1)
# t5 + t6 and t5 + t7 both work: every size-1 axis is stretched to match t5

t8 = np.arange(6).reshape(1, 3, 2)
t9 = np.arange(2).reshape(1, 1, 2)
# t5 + t8 and t5 + t9 both work

t10 = np.arange(6).reshape(3, 2)
# t5 + t10 works: shape (3, 2) lines up with the trailing two axes of (3, 3, 2)
  1. 数组行和列相同,相同行列元素计算
    • [1, 2, 3] + [4, 5, 6] = [5, 7, 9]
  2. 数组与一个数字计算,数组每行每列都与该数字计算
    • [1, 2, 3] + 1 = [2, 3, 4]
  3. 二维数组可以与行相同,列为1的二维数组计算,结果是行计算
    • (4, 6) + (4, 1) = (4, 6) 详见 t1 + t4
  4. 二维数组可以与列相同,行为1的二维数组计算,结果是列计算
    • (4, 6) + (1, 6) = (4, 6) 详见 t1 + t3
  5. 二维数组可以与一维数组计算(一维数组长度须等于二维数组的列数),结果是列计算
    • (4, 6) + [0, 1, 2, 3, 4, 5] = (4, 6) 详见 t1 + t2
  6. 三维数组情况见上面

三、numpy数组取值

import numpy as np

# Read a comma-separated text file into an array of 4-byte integers.
us_file_address = "D:\\python_class_project\\dataAnalysis\\file\\us"
data = np.loadtxt(us_file_address, delimiter=",", dtype="i4")

print(data)
print('*' * 40)
# Select a single row
newData = data[0]

# Select consecutive rows
newData = data[1:3]   # half-open range [1, 3): rows 1 and 2

# Select non-consecutive rows
newData = data[[0, 2]]

# Select a single column
newData = data[:, 0]

# Select consecutive columns
newData = data[:, 2:]

# Select non-consecutive columns
newData = data[:, [0, 2, 3]]

# Select a single element (row 2, column 3)
newData = data[2, 3]

# Select a rectangular block (rows 2 onward, columns 0..2)
newData = data[2:, 0:3]

# Select several individual elements: the index lists are paired up,
# so this picks the elements at (0, 0) and (3, 3)
newData = data[[0, 3], [0, 3]]

四、numpy数组赋值

1. 导入数据

import numpy as np

# Read the comma-separated sample file as 4-byte integers.
us_file_address = "D:\\python_class_project\\dataAnalysis\\file\\us"
data = np.loadtxt(us_file_address, delimiter=",", dtype="i4")
print(data)
"""		[[ 145   64   78   20]
 		 [ 241  111   45   20]
 		 [1111   54   10   36]
 		 [  45    2    3    0]
 		 [  77   45   23   12]
 		 [ 359   72   26   44]]		"""

2. 改变数组的元素

# Work on a copy so the original data stays intact.
newData = np.copy(data)
newData[:, [0, 2]] = 0  # set every element of columns 0 and 2 to 0
print(newData)
"""		[[  0  64   0  20]
 		 [  0 111   0  20]
 		 [  0  54   0  36]
 		 [  0   2   0   0]
 		 [  0  45   0  12]
 		 [  0  72   0  44]]		"""

3. 布尔索引

newData = np.copy(data)
# Comparing an array with a scalar yields a boolean array of the same shape.
print(newData > 10)
""" [[ True  True  True  True]
     [ True  True  True  True]
     [ True  True False  True]
     [ True False False False]
     [ True  True  True  True]
     [ True  True  True  True]]     """
# Assign through the boolean mask: only elements greater than 10 are overwritten.
newData[newData > 10] = 1
print(newData)
""" [[ 1  1  1  1]
     [ 1  1  1  1]
     [ 1  1 10  1]
     [ 1  2  3  0]
     [ 1  1  1  1]
     [ 1  1  1  1]]     """

4. numpy中三元运算符

# np.where(cond, a, b) is an element-wise ternary: a where cond holds, b elsewhere.
newData = np.copy(data)
newData = np.where(newData > 5, 100, 0)
print(newData)
"""    [[100 100 100 100]
        [100 100 100 100]
        [100 100 100 100]
        [100   0   0   0]
        [100 100 100 100]
        [100 100 100 100]]      """

# The np.where call above is equivalent to the mask assignments below.
# Start again from a fresh copy of the original data; otherwise the masks
# would be applied to the already-thresholded result and show nothing.
newData = np.copy(data)
# Compute the mask once, before any write, so the second assignment cannot
# be affected by the values written by the first one.
greater_than_5 = newData > 5
newData[greater_than_5] = 100
newData[~greater_than_5] = 0
print(newData)
"""    [[100 100 100 100]
        [100 100 100 100]
        [100 100 100 100]
        [100   0   0   0]
        [100 100 100 100]
        [100 100 100 100]]     """

5. numpy中的clip(裁剪)

newData = np.copy(data)
newData = newData.clip(10, 100)
# values below 10 become 10, values above 100 become 100, values in between are unchanged
print(newData)
"""     [[100  64  78  20]
         [100 100  45  20]
         [100  54  10  36]
         [ 45  10  10  10]
         [ 77  45  23  12]
         [100  72  26  44]]     """

五、numpy数组中nan元素

import numpy as np

us_file_address = "D:\\python_class_project\\dataAnalysis\\file\\us"
data = np.loadtxt(us_file_address, delimiter=",", dtype="i4")

# NaN is a floating-point value, so convert the integer array to float first.
data = data.astype(float)
data[2, 2:] = np.nan  # row 2, columns 2 onward
print(data)
"""		[[ 145.,   64.,   78.,   20.],
       	 [ 241.,  111.,   45.,   20.],
       	 [1111.,   54.,   nan,   nan],
         [  45.,    2.,    3.,    0.],
         [  77.,   45.,   23.,   12.],
         [ 359.,   72.,   26.,   44.]]		"""

1. nan != nan

# NaN never compares equal to itself, so `data != data` is True exactly at the NaN positions
print(data != data)
"""  	[[False, False, False, False],
         [False, False, False, False],
         [False, False,  True,  True],
         [False, False, False, False],
         [False, False, False, False],
         [False, False, False, False]]		"""

2. 确定nan个数

# Method 1: count the True entries of the self-inequality mask
np.count_nonzero(data != data)  # 2

# Method 2: np.isnan builds the same boolean mask explicitly
np.isnan(data)
print(np.isnan(data))
"""		[[False, False, False, False],
       	 [False, False, False, False],
         [False, False,  True,  True],
         [False, False, False, False],
         [False, False, False, False],
         [False, False, False, False]]		"""
np.count_nonzero(np.isnan(data))  # 2

六、numpy数组常用函数

import numpy as np

us_file_address = "D:\\python_class_project\\dataAnalysis\\file\\us"
data = np.loadtxt(us_file_address, delimiter=",", dtype="i4")

# Convert to float so that NaN can be stored, then blank out two cells of row 2.
data = data.astype(float)
data[2, 2:] = np.nan
# print(data)
"""     [[ 145.   64.   78.   20.]
         [ 241.  111.   45.   20.]
         [1111.   54.   nan   nan]
         [  45.    2.    3.    0.]
         [  77.   45.   23.   12.]
         [ 359.   72.   26.   44.]]"""

1. sum() 求和

# NaN propagates: any sum that includes a NaN is NaN (np.nansum skips NaN instead).
print(data.sum())  # nan
print(data.sum(axis=0))  # [1978.  348.   nan   nan]         sum of each column
print(data.sum(axis=1))  # [307. 417.  nan  50. 157. 501.]   sum of each row

2. mean() 均值

# As with sum(), a NaN anywhere in the averaged slice makes the mean NaN.
print(data.mean())  # nan
print(data.mean(axis=0))  # [329.66666667  58.  nan  nan]   mean of each column
print(data.mean(axis=1))  # [76.75 104.25    nan  12.5   39.25 125.25]  mean of each row

3. np.median() 中值

# Median of each column; the result is not printed here, the expected value is shown on the right.
np.median(data, axis=0)  # [193.  59.  nan  nan]

4. max() 最大值

# Maximum of each column; columns containing NaN yield NaN.
data.max(axis=0)  # [1111.,  111.,   nan,   nan]

5. min() 最小值

# Minimum of each column; columns containing NaN yield NaN.
data.min(axis=0)  # [45.,  2., nan, nan]

6. np.ptp() 极差(最大值和最小值的差值)

# Peak-to-peak range (max - min) of each column.
np.ptp(data, axis=0)  # [1066.,  109.,   nan,   nan]

7. std() 标准差

# Standard deviation of each column.
data.std(axis=0)  # [364.73308353,  32.56275992, nan, nan]

8. 将数组nan改为该列均值

def fun(data):
    """Replace every NaN in a 2-D float array with its column's mean, in place.

    The mean of a column is taken over its non-NaN entries only. Columns
    without NaN are left untouched. A column made up entirely of NaN is also
    left as is, because it has no valid values to average.

    Args:
        data: 2-D NumPy float array; it is modified in place.

    Returns:
        None.
    """
    for i in range(data.shape[1]):
        column = data[:, i]  # basic slicing gives a view, so writes land in ``data``
        nan_mask = np.isnan(column)
        # Skip clean columns, and all-NaN columns where the mean would be 0 / 0.
        if not nan_mask.any() or nan_mask.all():
            continue
        column[nan_mask] = column[~nan_mask].mean()


# Before: row 2 still holds NaN in columns 2 and 3.
print(data)
"""     [[ 145.   64.   78.   20.]
         [ 241.  111.   45.   20.]
         [1111.   54.   nan   nan]
         [  45.    2.    3.    0.]
         [  77.   45.   23.   12.]
         [ 359.   72.   26.   44.]]"""
# fun() modifies data in place, so the same array is printed again afterwards.
fun(data)
print(data)
"""     [[ 145.    64.    78.    20. ]
         [ 241.   111.    45.    20. ]
         [1111.    54.    35.    19.2]
         [  45.     2.     3.     0. ]
         [  77.    45.    23.    12. ]
         [ 359.    72.    26.    44. ]]      """

七、numpy数组拼接

import numpy as np

# Two 2x2 arrays used by the stacking examples.
t1 = np.arange(1, 5).reshape(2, 2)  # [[1, 2], [3, 4]]
t2 = np.array(range(10, 18, 2)).reshape(2, 2)  # [[10, 12], [14, 16]]

1. 竖直拼接

# vstack places the second array's rows below the first array's rows.
t3 = np.vstack((t1, t2))
print(t3)
"""     [[ 1  2]
         [ 3  4]
         [10 12]
         [14 16]]    """

# The order of the tuple decides which array ends up on top.
t4 = np.vstack((t2, t1))
print(t4)
"""     [[10 12]
         [14 16]
         [ 1  2]
         [ 3  4]]   """

2. 水平拼接

# hstack places the second array's columns to the right of the first array's columns.
t5 = np.hstack((t1, t2))
print(t5)
"""     [[ 1  2 10 12]
         [ 3  4 14 16]]     """

3. 数组的行交换

data = np.arange(24).reshape(4, 6)
print(data)
"""     [[ 0  1  2  3  4  5]
         [ 6  7  8  9 10 11]
         [12 13 14 15 16 17]
         [18 19 20 21 22 23]]       """
# Swap rows 1 and 2: the right-hand side is copied out by the list index
# before the assignment writes anything.
data[[1, 2]] = data[[2, 1]]
print(data)
"""     [[ 0  1  2  3  4  5]
         [12 13 14 15 16 17]
         [ 6  7  8  9 10 11]
         [18 19 20 21 22 23]]       """

4. 数组的列交换

np.random.seed(10)     # fix the random seed so the output is reproducible
# 4x4 array of random integers drawn from [1, 10)
newData = np.random.randint(1, 10, (4, 4))
print(newData)
"""     [[5 1 2 1]
         [2 9 1 9]
         [7 5 4 1]
         [5 7 9 2]]"""
# Swap columns 0 and 3 with the same list-index technique used for rows.
newData[:, [0, 3]] = newData[:, [3, 0]]
print(newData)
"""     [[1 1 2 5]
         [9 9 1 2]
         [1 5 4 7]
         [2 7 9 5]]     """

你可能感兴趣的:(numpy,python,数据分析)