mapping = {
    "Freezing": 0,
    "Warm": 1,
    "Cold": 2,
    "Boiling Hot": 3,
    "Hot": 4,
    "Lava Hot": 5
}
Now we can read the dataset and easily convert these categories into numbers.
import pandas as pd
df = pd.read_csv("../input/cat_train.csv")
df.loc[:, "ord_2"] = df.ord_2.map(mapping)
Value counts before mapping:
df.ord_2.value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64
Value counts after mapping:
0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: ord_2, dtype: int64
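If you ever need to go back from the integers to the original category names, one minimal sketch (reusing the `mapping` dictionary defined above; the `ord_2_decoded` column name is just illustrative) is to invert the dictionary and map again:

# invert the hand-crafted mapping: integer code -> category name
inverse_mapping = {v: k for k, v in mapping.items()}
# map the encoded column back to strings; rows that were missing stay NaN
df.loc[:, "ord_2_decoded"] = df.ord_2.map(inverse_mapping)

The same kind of encoding can also be produced with scikit-learn's LabelEncoder, as shown next. Note that LabelEncoder assigns codes in sorted order of the labels, so the integers will generally differ from the hand-crafted ordinal mapping above.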
import pandas as pd
from sklearn import preprocessing
df = pd.read_csv("../input/cat_train.csv")

# fill missing values with a placeholder category before encoding
df.loc[:, "ord_2"] = df.ord_2.fillna("NONE")

lbl_enc = preprocessing.LabelEncoder()
df.loc[:, "ord_2"] = lbl_enc.fit_transform(df.ord_2.values)
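To inspect the mapping that the encoder has learned, the fitted LabelEncoder exposes its classes and an inverse transform; a minimal sketch reusing `lbl_enc` from the snippet above:

# classes_ lists the original labels; a label's position is its assigned integer code
print(lbl_enc.classes_)

# build an explicit label -> code dictionary from the fitted encoder
learned_mapping = dict(zip(lbl_enc.classes_, range(len(lbl_enc.classes_))))
print(learned_mapping)

# inverse_transform maps integer codes back to the original strings
print(lbl_enc.inverse_transform([0, 1, 2]))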
import numpy as np

# a small binary matrix stored as a dense numpy array
example = np.array(
    [
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 1]
    ]
)

# total number of bytes used by the dense representation
print(example.nbytes)
import numpy as np
from scipy import sparse

# the same binary matrix as before
example = np.array(
    [
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 1]
    ]
)

# convert the dense array to a compressed sparse row (CSR) matrix
sparse_example = sparse.csr_matrix(example)

# only the non-zero values are stored in .data
print(sparse_example.data.nbytes)
This prints 32, which is far less than our dense array! The total size of a sparse CSR matrix is the sum of three values:
print(
    sparse_example.data.nbytes +
    sparse_example.indptr.nbytes +
    sparse_example.indices.nbytes
)
import numpy as np
from scipy import sparse
n_rows = 10000
n_cols = 100000
example = np.random.binomial(1, p=0.05, size=(n_rows, n_cols))
print(f"Size of dense array: {example.nbytes}")
sparse_example = sparse.csr_matrix(example)
print(f"Size of sparse array: {sparse_example.data.nbytes}")
full_size = (
    sparse_example.data.nbytes +
    sparse_example.indptr.nbytes +
    sparse_example.indices.nbytes
)
print(f"Full size of sparse array: {full_size}")
This will print:
Size of dense array: 8000000000
Size of sparse array: 399932496
Full size of sparse array: 599938748
Each sample has 3 features, but after one-hot encoding the vectors have size 6, so we end up with 6 features instead of 3, and in this 3x6 array there are still only 3 ones. Computing its size with numpy is very similar to the binarization size-calculation script; all you need to change is the array. Let's take a look at the code.
import numpy as np
from scipy import sparse

example = np.array(
    [
        [0, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0]
    ]
)
print(f"Size of dense array: {example.nbytes}")

sparse_example = sparse.csr_matrix(example)
print(f"Size of sparse array: {sparse_example.data.nbytes}")

full_size = (
    sparse_example.data.nbytes +
    sparse_example.indptr.nbytes +
    sparse_example.indices.nbytes
)
print(f"Full size of sparse array: {full_size}")
The printed memory sizes are:
Size of dense array: 144
Size of sparse array: 24
Full size of sparse array: 52
import numpy as np
from sklearn import preprocessing

# create a random 1-d array with 1,000,000 integers between 0 and 999
example = np.random.randint(1000, size=1000000)

# one-hot encode to a dense array
ohe = preprocessing.OneHotEncoder(sparse=False)
ohe_example = ohe.fit_transform(example.reshape(-1, 1))
print(f"Size of dense array: {ohe_example.nbytes}")

# one-hot encode to a sparse CSR matrix
ohe = preprocessing.OneHotEncoder(sparse=True)
ohe_example = ohe.fit_transform(example.reshape(-1, 1))
print(f"Size of sparse array: {ohe_example.data.nbytes}")

full_size = (
    ohe_example.data.nbytes +
    ohe_example.indptr.nbytes +
    ohe_example.indices.nbytes
)
print(f"Full size of sparse array: {full_size}")
The output printed by the code above:
Size of dense array: 8000000000
Size of sparse array: 8000000
Full size of sparse array: 16000004
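Note that in newer scikit-learn releases (1.2 and later) the `sparse` argument of OneHotEncoder was renamed to `sparse_output`; with such a version the two encoders above would be created roughly like this:

# scikit-learn >= 1.2: use sparse_output instead of sparse
ohe_dense = preprocessing.OneHotEncoder(sparse_output=False)
ohe_sparse = preprocessing.OneHotEncoder(sparse_output=True)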
In [X]: df[df.ord_2 == "Boiling Hot"].shape
Out[X]: (84790, 25)
In [ X ]: df . groupby ([ "ord_2" ])[ "id" ]. count ()Out [ X ]:ord_2Boiling Hot 84790Cold 97822Freezing 142726Hot 67508Lava Hot 64840Warm 124239Name : id , dtype : int64
In [ X ]: df . groupby ([ "ord_2" ])[ "id" ]. transform ( "count" )Out [ X ]:0 67508.01 124239.02 142726.03 64840.04 97822.0.599995 142726.0599996 84790.0599997 142726.0599998 124239.0599999 84790.0Name : id , Length : 600000 , dtype : float64
In [X]: df.groupby(
   ...:     [
   ...:         "ord_1",
   ...:         "ord_2"
   ...:     ]
   ...: )["id"].count().reset_index(name="count")
Out[X]:
          ord_1        ord_2  count
0   Contributor  Boiling Hot  15634
1   Contributor         Cold  17734
2   Contributor     Freezing  26082
3   Contributor          Hot  12428
4   Contributor     Lava Hot  11919
5   Contributor         Warm  22774
6        Expert  Boiling Hot  19477
7        Expert         Cold  22956
8        Expert     Freezing  33249
9        Expert          Hot  15792
10       Expert     Lava Hot  15078
11       Expert         Warm  28900
12  Grandmaster  Boiling Hot  13623
13  Grandmaster         Cold  15464
14  Grandmaster     Freezing  22818
15  Grandmaster          Hot  10805
16  Grandmaster     Lava Hot  10363
17  Grandmaster         Warm  19899
18       Master  Boiling Hot  10800
...
In [ X ]: df [ "new_feature" ] = (. : df . ord_1 . astype ( str ). : + "_". : + df . ord_2 . astype ( str ). : )In [ X ]: df . new_featureOut [ X ]:0 Contributor_Hot1 Grandmaster_Warm2 nan_Freezing3 Novice_Lava Hot4 Grandmaster_Cold.599999 Contributor_Boiling HotName : new_feature , Length : 600000 , dtype : object
In [ X ]: df [ "new_feature" ] = (. : df . ord_1 . astype ( str ). : + "_". : + df . ord_2 . astype ( str ). : + "_". : + df . ord_3 . astype ( str ). : )In [ X ]: df . new_featureOut [ X ]:0 Contributor_Hot_c1 Grandmaster_Warm_e2 nan_Freezing_n3 Novice_Lava Hot_a4 Grandmaster_Cold_h.599999 Contributor_Boiling Hot_bName : new_feature , Length : 600000 , dtype : object
In [X]: df.ord_2.value_counts()
Out[X]:
Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64
After filling in the NaN values, it becomes:
In [X]: df.ord_2.fillna("NONE").value_counts()
Out[X]:
Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
NONE            18075
Name: ord_2, dtype: int64
import pandas as pd
from sklearn import preprocessing

train = pd.read_csv("../input/cat_train.csv")
test = pd.read_csv("../input/cat_test.csv")

# create a fake target column for test data since this column does not exist there
test.loc[:, "target"] = -1

# concatenate train and test so the encoder sees every category in both sets
data = pd.concat([train, test]).reset_index(drop=True)

# all columns are features except id and target
features = [x for x in train.columns if x not in ["id", "target"]]

# label-encode every feature, treating missing values as a "NONE" category
for feat in features:
    lbl_enc = preprocessing.LabelEncoder()
    temp_col = data[feat].fillna("NONE").astype(str).values
    data.loc[:, feat] = lbl_enc.fit_transform(temp_col)

# split back: rows with a real target are train, rows with the fake target (-1) are test
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)
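As a usage sketch, the label-encoded features can now be fed to a tree-based model, which can consume integer codes directly (the model choice below is only illustrative, assuming a binary `target` column):

from sklearn import ensemble

# tree-based models can work with integer-encoded categories directly
model = ensemble.RandomForestClassifier(n_jobs=-1)
model.fit(train[features].values, train.target.values)

# probability of the positive class for the test rows (which carry the fake target of -1)
preds = model.predict_proba(test[features].values)[:, 1]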