3-3 OneHot编码

3.3 OneHot编码

请参考《数据准备和特征工程》中的相关章节,调试如下代码。

3-3 OneHot编码_第1张图片

基础知识

import pandas as pd

# Single categorical column used to demonstrate dummy-variable encoding.
g = pd.DataFrame(
    ["man", "woman", "woman", "man", "woman"],
    columns=["gender"],
)
g
gender
0 man
1 woman
2 woman
3 man
4 woman
# Convert the categorical feature into dummy (indicator) variables.
# dtype="uint8" pins the 0/1 integer output: since pandas 2.0 the default
# dummy dtype is bool, which would display True/False instead of 1/0.
pd.get_dummies(g, dtype="uint8")
gender_man gender_woman
0 1 0
1 0 1
2 0 1
3 1 0
4 0 1
# Two categorical features; each tuple is one sample (gene_seg, dis).
df = pd.DataFrame(
    [("A", "gall"), ("B", "hyp"), ("B", "gall"), ("A", "hyp"), ("A", "hyp")],
    columns=["gene_seg", "dis"],
)
df
gene_seg dis
0 A gall
1 B hyp
2 B gall
3 A hyp
4 A hyp
# Encode every categorical column at once. dtype="uint8" keeps the 0/1
# integer output; pandas 2.0+ would otherwise return bool (True/False).
pd.get_dummies(df, dtype="uint8")
gene_seg_A gene_seg_B dis_gall dis_hyp
0 1 0 1 0
1 0 1 0 1
2 0 1 1 0
3 1 0 0 1
4 1 0 0 1
# One row per person: (name, color).
persons = pd.DataFrame(
    [
        ("Newton", "white"),
        ("Andrew Ng", "yellow"),
        ("Jodan", "black"),
        ("Bill Gates", "white"),
    ],
    columns=["name", "color"],
)
persons
name color
0 Newton white
1 Andrew Ng yellow
2 Jodan black
3 Bill Gates white
# drop_first=True removes the redundant first dummy column.
# dtype="uint8" pins the 0/1 integer output: since pandas 2.0 the default
# dummy dtype is bool, which would display True/False instead of 1/0.
df_dum = pd.get_dummies(persons['color'], drop_first=True, dtype="uint8")

# left_index / right_index: join the two frames on their row indexes.
persons.merge(df_dum, left_index=True, right_index=True)
name color white yellow
0 Newton white 1 0
1 Andrew Ng yellow 0 1
2 Jodan black 0 0
3 Bill Gates white 1 0
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the "color" column (selected as a one-column frame),
# encode it, and convert the encoded result to a dense array for display.
ohe = OneHotEncoder()
color_frame = persons[['color']]
features = ohe.fit(color_frame).transform(color_frame)
features.toarray()
array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])
# Drop the first one-hot column, which is redundant given the remaining ones.
features[:, 1:].toarray()
array([[1., 0.],
       [0., 1.],
       [0., 0.],
       [1., 0.]])

项目案例

# Sample data for the project example; each tuple is one row
# (color, size, price, classlabel).
df = pd.DataFrame(
    [
        ("green", "M", 29.9, "class1"),
        ("red", "L", 69.9, "class2"),
        ("blue", "XL", 99.9, "class1"),
        ("red", "L", 59.9, "class1"),
    ],
    columns=["color", "size", "price", "classlabel"],
)
df
color size price classlabel
0 green M 29.9 class1
1 red L 69.9 class2
2 blue XL 99.9 class1
3 red L 59.9 class1
# Ordinal encoding for the "size" feature: larger sizes get larger integers.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}

# Series.map() looks each value up in the dict and returns its integer code.
# Unlike replace(), a value that is missing from the dict becomes NaN instead
# of being left unchanged.
df['size'] = df['size'].map(size_mapping)
df
color size price classlabel
0 green 1 29.9 class1
1 red 2 69.9 class2
2 blue 3 99.9 class1
3 red 2 59.9 class1
from sklearn.preprocessing import OneHotEncoder

# Fit a one-hot encoder on the nominal "color" feature.
ohe = OneHotEncoder()
fs = ohe.fit_transform(df[['color']])

# Drop the first category's column (redundant once the others are known) and
# label the remaining columns from the categories the encoder actually
# learned, so the names cannot drift out of sync with the data.
color_names = [f"color_{category}" for category in ohe.categories_[0][1:]]
fs_ohe = pd.DataFrame(fs.toarray()[:, 1:], columns=color_names, index=df.index)

# Concatenate df and fs_ohe column-wise; sharing df's index keeps rows aligned.
df = pd.concat([df, fs_ohe], axis=1)
df
color size price classlabel color_green color_red
0 green 1 29.9 class1 1.0 0.0
1 red 2 69.9 class2 0.0 1.0
2 blue 3 99.9 class1 0.0 0.0
3 red 2 59.9 class1 0.0 1.0

动手练习

# Re-fit the existing encoder on the "classlabel" column.
cl = ohe.fit_transform(df[['classlabel']])

# Drop the last category's column (redundant for a two-class label) and label
# the remaining column from the categories the encoder actually learned.
class_names = [f"classlabel_{category}" for category in ohe.categories_[0][:-1]]
cl_ohe = pd.DataFrame(cl.toarray()[:, :-1], columns=class_names, index=df.index)

# Concatenate df and cl_ohe column-wise; sharing df's index keeps rows aligned.
df = pd.concat([df, cl_ohe], axis=1)
df
color size price classlabel color_green color_red classlabel_class1
0 green 1 29.9 class1 1.0 0.0 1.0
1 red 2 69.9 class2 0.0 1.0 0.0
2 blue 3 99.9 class1 0.0 0.0 1.0
3 red 2 59.9 class1 0.0 1.0 1.0
# Preview the raw file with read_csv's default settings: the first data row is
# consumed as the header, which reveals that the file has no header row.
pd.read_csv("/home/aistudio/data/data20513/breast-cancer.data").head()
no-recurrence-events 30-39 premeno 30-34 0-2 no 3 left left_low no.1
0 no-recurrence-events 40-49 premeno 20-24 0-2 no 2 right right_up no
1 no-recurrence-events 40-49 premeno 20-24 0-2 no 2 left left_low no
2 no-recurrence-events 60-69 ge40 15-19 0-2 no 2 right left_up no
3 no-recurrence-events 40-49 premeno 0-4 0-2 no 2 right right_low no
4 no-recurrence-events 60-69 ge40 15-19 0-2 no 2 left left_low no
# The preview above shows the file has no header row, so read it with
# header=None; .iloc[:, 1:] then discards the first column.
# NOTE(review): in the preview the discarded column holds the
# "no-recurrence-events" values, and the code further down uses the last
# remaining column as the label — confirm this choice of target is intended.
df = pd.read_csv("/home/aistudio/data/data20513/breast-cancer.data", header=None).iloc[:, 1:]

print(df.shape)
df.head()
(286, 9)
1 2 3 4 5 6 7 8 9
0 30-39 premeno 30-34 0-2 no 3 left left_low no
1 40-49 premeno 20-24 0-2 no 2 right right_up no
2 40-49 premeno 20-24 0-2 no 2 left left_low no
3 60-69 ge40 15-19 0-2 no 2 right left_up no
4 40-49 premeno 0-4 0-2 no 2 right right_low no
# Work on the underlying array of values from here on.
dataset = df.values
print(dataset)

# The first 8 columns are the independent variables X (cast to str);
# the column at index 8 is the sample label, i.e. the dependent variable Y.
X = dataset[:, :8].astype(str)
Y = dataset[:, 8]
[['30-39' 'premeno' '30-34' ... 'left' 'left_low' 'no']
 ['40-49' 'premeno' '20-24' ... 'right' 'right_up' 'no']
 ['40-49' 'premeno' '20-24' ... 'left' 'left_low' 'no']
 ...
 ['60-69' 'ge40' '20-24' ... 'right' 'left_up' 'no']
 ['40-49' 'ge40' '30-34' ... 'left' 'left_low' 'no']
 ['50-59' 'ge40' '30-34' ... 'left' 'left_low' 'no']]
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# One-hot encode every column of X and stack the results side by side.
encoded_columns = []
for i in range(X.shape[1]):
    # LabelEncoder takes a 1-D array and maps each category to an integer code.
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(X[:, i])
    # OneHotEncoder expects a 2-D (n_samples, n_features) array, so reshape
    # the codes into a single column.
    feature = feature.reshape(X.shape[0], 1)

    # Densify with .toarray() instead of passing sparse=False: that keyword
    # was renamed to sparse_output in scikit-learn 1.2 and later removed,
    # whereas .toarray() on the default sparse result works on every version.
    onehot_encoder = OneHotEncoder()
    feature = onehot_encoder.fit_transform(feature).toarray()
    encoded_columns.append(feature)

# Concatenate once at the end rather than re-copying the array each iteration.
encoded_x = np.concatenate(encoded_columns, axis=1)

# The 8 original feature columns expand to 41 one-hot columns.
print("X shape: : ", encoded_x.shape)
X shape: :  (286, 41)
# Map the label values in Y to integer codes.
label_encoder = LabelEncoder()
feature = label_encoder.fit_transform(Y)

# OneHotEncoder expects a 2-D (n_samples, n_features) array, so reshape the
# codes into a single column. Densify with .toarray() instead of passing
# sparse=False: that keyword was renamed to sparse_output in scikit-learn 1.2
# and later removed, whereas .toarray() works on every version.
onehot_encoder = OneHotEncoder()
feature = onehot_encoder.fit_transform(feature.reshape(Y.shape[0], 1)).toarray()

print("Y shape: : ", feature.shape)
Y shape: :  (286, 2)

你可能感兴趣的:(CH3-数据准备和特征工程,python,数据挖掘,机器学习)