目录
1 数组详解
2 转化详解
在数据预处理中,经常需要在各种数据结构之间相互转化,例如元组、列表、numpy.array、字典、张量 tensor、dataframe。
本文代码较多,文字较少,注释大多是运行结果。需要代码解释的读者请仔细阅读代码,文字只是辅助。
列表 list
可以随时添加和删除其中的元素
# A list literal; elements can be plain values or objects such as tensors.
x = [1, 2, 3, 4]
x2 = [tf.constant(1)]*4  # list repetition: the same scalar tensor referenced 4 times
print(x) # [1, 2, 3, 4]
print(x2) # list of 4 scalar tensors, each shown like <tf.Tensor: shape=(), dtype=int32, numpy=1> (exact repr depends on the TF version)
基本操作:
# Two nested lists used as input for the append/extend demos.
y1 = [[1, 2], [3, 4], [5, 6]]
y2 = [[11, 22], [33, 44], [55, 66]]

# append: each call adds its argument as ONE new element,
# so the result gains an extra level of nesting.
x1 = []
for chunk in (y1, y2):
    x1.append(chunk)
print(x1)  # [[[1, 2], [3, 4], [5, 6]], [[11, 22], [33, 44], [55, 66]]]

# extend: each call adds the ELEMENTS of its argument,
# so the nesting level stays the same.
x2 = []
for chunk in (y1, y2):
    x2.extend(chunk)
print(x2)  # [[1, 2], [3, 4], [5, 6], [11, 22], [33, 44], [55, 66]]

# replace: strip the spaces from every string in the list.
x3 = ['as', 'as as', 'sdd']
no_spaces = [word.replace(' ', '') for word in x3]
print(no_spaces)  # ['as', 'asas', 'sdd']
元组 tuple
一旦初始化就不能修改,tensor.shape就用tuple表示。
# Tuples are written with parentheses; elements can be plain values or tensors.
x = (1, 2, 3, 4)
x2 = (tf.constant(1), tf.constant(1))
x3 = (1, 2, 3, 4)
print(x) # (1, 2, 3, 4)
print(x2) # tuple of 2 scalar tensors, each shown like <tf.Tensor: shape=(), dtype=int32, numpy=1> (exact repr depends on the TF version)
print(x3[:]) # (1, 2, 3, 4) -- a tuple can be sliced; the full slice prints the same elements
# A one-element tuple must be written with a trailing comma;
# parentheses alone only group an expression.
x = ()  # empty tuple
x1 = (1)  # just the int 1, not a tuple
x2 = (1,)  # one-element tuple
print(x) # ()
print(x1) # 1
print(x2) # (1,)
字典 dict
# A dict maps keys to values; lookups go by key, never by position.
x = {1: 'Zarten_1', 2: 'Zarten_2', 3: 'Zarten_3'}
print(x)
print(x[1])  # Zarten_1
# 0 is not a key of x, so the lookup raises KeyError. The error is caught
# here so the demonstration does not abort the rest of the snippet.
try:
    print(x[0])
except KeyError as err:
    print(f'KeyError: {err}')  # KeyError: 0
# A dict cannot be sliced directly, e.g.:
# print(x[:])  # TypeError: unhashable type: 'slice'
# Convert the values (or keys) to a list first, then slice the list.
print(list(x.values())[:])  # ['Zarten_1', 'Zarten_2', 'Zarten_3']
print(x.keys())  # dict_keys([1, 2, 3])
print(x.items())  # view of the (key, value) pairs
numpy.array
可存储不同类型(dtype)的数据,但同一个数组内所有元素的类型必须一致。
import numpy as np
# Build the same 1-D array from three different sources.
x = np.array([1, 2, 3, 4])  # from a list
x1 = np.array((1, 2, 3, 4))  # from a tuple
x2 = tf.constant([1, 2, 3, 4]).numpy()  # from a tensor via .numpy()
# Print a slice of x (positions 1 and 2), then the three full arrays.
for shown in (x[1:3], x, x1, x2):
    print(shown)
张量 tensor
各大机器学习框架常用的数据结构。
# tf.constant accepts a list, a tuple or a NumPy array as its value.
x = tf.constant([1, 2, 3, 4])
x2 = tf.constant((1, 2, 3, 4))
x3 = tf.constant(np.array([1, 2, 3, 4]))
print(x) # tf.Tensor([1 2 3 4], shape=(4,), dtype=int32)
print(x2) # tf.Tensor([1 2 3 4], shape=(4,), dtype=int32)
print(x3) # tf.Tensor([1 2 3 4], shape=(4,), dtype=int32) -- as recorded; NOTE(review): dtype presumably follows the NumPy array and may be int64 on other platforms, confirm
print(x[:2]) # tf.Tensor([1 2], shape=(2,), dtype=int32)
# (1) is just the int 1, so x2 below is a scalar tensor; [1] and (1,) give 1-element vectors.
x = tf.constant([1])
x2 = tf.constant((1))
x3 = tf.constant((1,))
print(x) # tf.Tensor([1], shape=(1,), dtype=int32)
print(x2) # tf.Tensor(1, shape=(), dtype=int32)
print(x3) # tf.Tensor([1], shape=(1,), dtype=int32)
基本操作
# Sample tensors; the trailing comment on each line is the tensor's shape.
x = tf.constant([1, 2, 3])  # (3,)
x2 = tf.constant([[1, 2, 3], [4, 5, 6], [11, 22, 33]])  # (3, 3)
x3 = tf.constant(1)  # ()
x4 = tf.constant([[1, 2, 3]])  # (1, 3)
x5 = tf.constant([[[1, 2], [3, 4], [11, 22]],
                  [[5, 6], [7, 8], [44, 55]]])  # (2, 3, 2)

# Broadcasting and reshaping. reshape must keep the total number of
# elements, so the (1,) targets are built from the scalar x3 rather than
# from the 3-element x.
y1 = tf.broadcast_to(x, shape=[3, 3])  # (3, 3)
y11 = tf.reshape(x3, [1])  # (1,)
y12 = tf.reshape(x3, (1,))  # (1,)
y2 = tf.reshape(x, shape=[1, 3])  # (1, 3)
y3 = tf.reshape(y1, shape=[3, 3, 1])  # (3, 3, 1)
y4 = tf.reshape(y3, shape=[-1, 9])  # (1, 9); only the shape changes, the element count does not

# Stacking and unstacking.
# The inputs get their own names so that x and x4 above stay available
# for the padding and squeeze examples further down.
scalars = [tf.constant(1), tf.constant(2), tf.constant(3), tf.constant(4)]
y5 = tf.stack(scalars)  # tf.Tensor([1 2 3 4], shape=(4,), dtype=int32)
y2 = tf.stack(x5, axis=1)  # returns a tensor of shape (3, 2, 2)
# y2:
# tf.Tensor(
# [[[ 1  2]
#   [ 5  6]]
#  [[ 3  4]
#   [ 7  8]]
#  [[11 22]
#   [44 55]]], shape=(3, 2, 2), dtype=int32)

x_rand = tf.random.uniform(shape=[20, 80, 9])
y1 = tf.unstack(x_rand, axis=1)  # already a Python list: 80 tensors of shape (20, 9)
y2 = tf.stack(y1, axis=1)  # stacked back to (20, 80, 9)
print(len(y1))  # 80
print(y2.shape)  # (20, 80, 9)

# Padding: pads with 0 by default; constant_values sets another fill value.
y6 = tf.pad(x, [[2, 4]])  # (9,): 2 zeros before and 4 zeros after
y7 = tf.pad(x2, [[0, 0], [0, 1]])  # (3, 4): dim 0 unpadded; dim 1 gets 0 before and 1 after

# Squeeze: drop a dimension of size 1.
y8 = tf.squeeze(x4, axis=[0])  # [[1, 2, 3]] with shape (1, 3) => [1, 2, 3] with shape (3,)
dataframe
pandas 数据类型DataFrame,可以看成excel表。它含有一组有序的列,每列可以是不同的值类型(数值、字符串、布尔型等),DataFrame既有行索引(index)也有列索引(column)。 DataFrame的创建有多种方式,不过最重要的还是根据dict进行创建,以及读取csv或者txt文件来创建。
# Build a DataFrame from a dict: each key is a column name and each list
# holds that column's values.
people = {'name': ['Tom', 'Jone', 'Marry'],
          'age': [20, 18, 19],
          'income': [1000, 3000, 2000]}
# Explicit row labels; without index= the default labels 0, 1, 2, ... are used.
row_labels = ['person1', 'person2', 'person3']
df1 = pd.DataFrame(people, index=row_labels)
print(df1)
"""
out:
name age income
person1 Tom 20 1000
person2 Jone 18 3000
person3 Marry 19 2000
"""
# Both reads below load the same CSV file with every column parsed as str.
csv_path = '../datasets/X-IIoTID.csv'
dataframe1 = pd.read_csv(csv_path, dtype=str)
print(dataframe1.head(5))
"""
out:
Date Timestamp ... class2 class3
0 9/01/2020 1578540956 ... Reconnaissance Attack
1 13/01/2020 1578871873 ... Normal Normal
2 9/01/2020 1578522486 ... Normal Normal
3 27/02/2020 1582757640 ... Normal Normal
4 16/12/2019 1576452612 ... Normal Normal
[5 rows x 68 columns]
"""
# header=None: do not treat the first line as column names; columns are
# labelled with integers and the header line becomes data row 0.
dataframe2 = pd.read_csv(csv_path, dtype=str, header=None)
# dataframe2.drop([0], axis=0) would remove row 0, i.e. the original column names.
print(dataframe2.head(5))
"""
out:
0 1 ... 66 67
0 Date Timestamp ... class2 class3
1 9/01/2020 1578540956 ... Reconnaissance Attack
2 13/01/2020 1578871873 ... Normal Normal
3 9/01/2020 1578522486 ... Normal Normal
4 27/02/2020 1582757640 ... Normal Normal
[5 rows x 68 columns]
"""
# DataFrame attributes: row labels, column labels and the underlying values.
print(df1.index) # Index(['person1', 'person2', 'person3'], dtype='object')
print(df1.columns) # Index(['name', 'age', 'income'], dtype='object')
print(df1.values) # ->np.ndarray
# Select rows: slicing with [] works on rows; [:1] is the first row.
print(df1[:1])
# Select columns: pass a list of column names (feature names).
# 'name' is listed twice here, so that column is printed twice.
print(df1[['name', 'name']])
print(dataframe2[[0, 67]]) # uses the automatic integer column names; columns cannot be picked with a slice this way
list、tuple、字典
普通的 list、tuple(元素不是键值对)无法直接转化成字典;list 与 tuple 之间可以互相转换,字典的 keys、values 可以转成 list。
# Conversions between list, tuple and dict.
x = [1, 2, 3, 4]
y = (1, 2, 3, 4)
z = {10: '1.1', 20: '2.1', 30: '3.1', 40: '4.1'}
x1 = tuple(x)  # list => tuple
y2 = list(y)  # tuple => list
z1 = list(z)  # iterating a dict yields its keys only
z2 = list(z.values())  # dict values => list (was misspelled `valuas`, which raised AttributeError)
print(z1)  # [10, 20, 30, 40]
list、tensor、array、dataframe
# Sample data: a list, a tensor, a NumPy array and a DataFrame.
x = [1, 2, 3, 4]
m = tf.constant([1, 2, 3, 4])
n = np.array([1, 2, 3, 4])
df = pd.DataFrame({'c1': [1, 2, 3],
                   'c2': [20, 18, 19],
                   'c3': [1000, 3000, 2000]},
                  index=['r1', 'r2', 'r3'])
# tensor => array
m1 = m.numpy()
# tensor => list: go through the NumPy array obtained above.
# list(m1) keeps the elements as NumPy scalars; m1.tolist() would give plain Python ints.
m2 = list(m1)
# array => list
n1 = n.tolist()  # plain Python ints
n2 = list(n)  # NumPy scalars
# array => dataframe
# Kept under its own name; it used to be stored in df_array and was
# overwritten by the next conversion before it could be used.
array_df = pd.DataFrame(n)  # row and column labels are the integers 0, 1, 2, ...
# dataframe => array
df_array = np.array(df)  # 2-D array without the row/column labels
# dataframe => list (via the array; each item is one row as a NumPy array)
print(list(df_array)[0])  # [1 20 1000]
print(list(df_array)[0][0])  # 1
# dataframe => tensor
df_tensor = tf.constant(df)  # shape=[3, 3]
dataframe、dict
dataframe 转字典有多种方法,目标是获取 array 或 list,方便后续转换。
# DataFrame.to_dict with different orientations.
print(df.to_dict('dict'))  # {'c1': {'r1': 1, 'r2': 2, 'r3': 3}, 'c2': {'r1': 20, 'r2': 18, 'r3': 19}, 'c3': {'r1': 1000, 'r2': 3000, 'r3': 2000}}
print(df.to_dict('list'))  # {'c1': [1, 2, 3], 'c2': [20, 18, 19], 'c3': [1000, 3000, 2000]}
print(df.to_dict('split'))  # {'index': ['r1', 'r2', 'r3'], 'columns': ['c1', 'c2', 'c3'], 'data': [[1, 20, 1000], [2, 18, 3000], [3, 19, 2000]]}
print(df.to_dict('split')['data'])  # only the row data, as a list of lists
print(df.to_dict('index'))  # {'r1': {'c1': 1, 'c2': 20, 'c3': 1000}, 'r2': {'c1': 2, 'c2': 18, 'c3': 3000}, 'r3': {'c1': 3, 'c2': 19, 'c3': 2000}}
# dict(df) maps each column name to its column; slicing that dict with
# tf.data yields one dict per row.
df_dict = dict(df)
ds = tf.data.Dataset.from_tensor_slices(df_dict)
# The loop body must be indented; the step counter was unused, so the
# dataset is iterated directly.
for item in ds:
    print(item.values())  # dict_values holding one scalar tensor per column
    print(tf.stack(list(item.values())))  # first row: tf.Tensor([ 1 20 1000], shape=(3,), dtype=int64)
如有补充,欢迎留言。