3.3 OneHot编码
请参考《数据准备和特征工程》中的相关章节,调试如下代码。
基础知识
import pandas as pd
# Toy frame with a single string-valued column, "gender", used to
# demonstrate one-hot encoding.
g = pd.DataFrame(
    ["man", "woman", "woman", "man", "woman"],
    columns=["gender"],
)
g
|   | gender |
|---|--------|
| 0 | man |
| 1 | woman |
| 2 | woman |
| 3 | man |
| 4 | woman |
# One-hot encode g: each distinct value of "gender" becomes its own
# indicator column named "<column>_<value>".
pd.get_dummies(g)
|   | gender_man | gender_woman |
|---|------------|--------------|
| 0 | 1 | 0 |
| 1 | 0 | 1 |
| 2 | 0 | 1 |
| 3 | 1 | 0 |
| 4 | 0 | 1 |
# Toy frame with two string-valued columns, "gene_seg" and "dis",
# written row by row.
df = pd.DataFrame(
    [
        ("A", "gall"),
        ("B", "hyp"),
        ("B", "gall"),
        ("A", "hyp"),
        ("A", "hyp"),
    ],
    columns=["gene_seg", "dis"],
)
df
|   | gene_seg | dis |
|---|----------|-----|
| 0 | A | gall |
| 1 | B | hyp |
| 2 | B | gall |
| 3 | A | hyp |
| 4 | A | hyp |
# One-hot encode both string columns of df at once; every
# (column, value) pair gets its own indicator column.
pd.get_dummies(df)
|   | gene_seg_A | gene_seg_B | dis_gall | dis_hyp |
|---|------------|------------|----------|---------|
| 0 | 1 | 0 | 1 | 0 |
| 1 | 0 | 1 | 0 | 1 |
| 2 | 0 | 1 | 1 | 0 |
| 3 | 1 | 0 | 0 | 1 |
| 4 | 1 | 0 | 0 | 1 |
# Four-row toy frame with a "name" column and a three-level "color" column.
persons = pd.DataFrame(
    [
        ("Newton", "white"),
        ("Andrew Ng", "yellow"),
        ("Jodan", "black"),
        ("Bill Gates", "white"),
    ],
    columns=["name", "color"],
)
persons
|   | name | color |
|---|------|-------|
| 0 | Newton | white |
| 1 | Andrew Ng | yellow |
| 2 | Jodan | black |
| 3 | Bill Gates | white |
# Encode "color" as indicator columns; drop_first=True omits the first
# level, so a row with zeros in every remaining column has that level.
df_dum = pd.get_dummies(persons["color"], drop_first=True)
# Attach the indicator columns to the original rows by matching on the index.
pd.merge(persons, df_dum, left_index=True, right_index=True)
|   | name | color | white | yellow |
|---|------|-------|-------|--------|
| 0 | Newton | white | 1 | 0 |
| 1 | Andrew Ng | yellow | 0 | 1 |
| 2 | Jodan | black | 0 | 0 |
| 3 | Bill Gates | white | 1 | 0 |
from sklearn.preprocessing import OneHotEncoder
# Learn the categories of the "color" column, then encode that same column.
# The double brackets keep the selection two-dimensional.
ohe = OneHotEncoder()
ohe.fit(persons[["color"]])
features = ohe.transform(persons[["color"]])
# Convert the encoder output to a dense array for display.
features.toarray()
array([[0., 1., 0.],
[0., 0., 1.],
[1., 0., 0.],
[0., 1., 0.]])
# Drop the first indicator column of the dense one-hot matrix, keeping
# the other two.
features.toarray()[:, 1:]
array([[1., 0.],
[0., 1.],
[0., 0.],
[1., 0.]])
项目案例
# Small product table for the worked example, written row by row.
df = pd.DataFrame(
    [
        ("green", "M", 29.9, "class1"),
        ("red", "L", 69.9, "class2"),
        ("blue", "XL", 99.9, "class1"),
        ("red", "L", 59.9, "class1"),
    ],
    columns=["color", "size", "price", "classlabel"],
)
df
|   | color | size | price | classlabel |
|---|-------|------|-------|------------|
| 0 | green | M | 29.9 | class1 |
| 1 | red | L | 69.9 | class2 |
| 2 | blue | XL | 99.9 | class1 |
| 3 | red | L | 59.9 | class1 |
# Replace each size label with an integer through an explicit lookup
# table (M=1, L=2, XL=3) instead of one-hot encoding it.
size_mapping = dict(XL=3, L=2, M=1)
df["size"] = df["size"].map(size_mapping)
df
|   | color | size | price | classlabel |
|---|-------|------|-------|------------|
| 0 | green | 1 | 29.9 | class1 |
| 1 | red | 2 | 69.9 | class2 |
| 2 | blue | 3 | 99.9 | class1 |
| 3 | red | 2 | 59.9 | class1 |
from sklearn.preprocessing import OneHotEncoder
# One-hot encode "color", discard the first indicator column, and append
# the remaining two columns to df.
ohe = OneHotEncoder()
fs = ohe.fit_transform(df[["color"]])
# NOTE(review): the hard-coded names assume the encoder's columns come out
# in the order blue, green, red, which the displayed result is consistent
# with. Confirm if the data changes.
fs_ohe = pd.DataFrame(
    fs.toarray()[:, 1:],
    columns=["color_green", "color_red"],
)
df = pd.concat([df, fs_ohe], axis="columns")
df
|   | color | size | price | classlabel | color_green | color_red |
|---|-------|------|-------|------------|-------------|-----------|
| 0 | green | 1 | 29.9 | class1 | 1.0 | 0.0 |
| 1 | red | 2 | 69.9 | class2 | 0.0 | 1.0 |
| 2 | blue | 3 | 99.9 | class1 | 0.0 | 0.0 |
| 3 | red | 2 | 59.9 | class1 | 0.0 | 1.0 |
动手练习
# Exercise: refit the existing encoder on "classlabel", keep every
# indicator column except the last, and append the result to df.
cl = ohe.fit_transform(df[["classlabel"]])
cl_ohe = pd.DataFrame(
    cl.toarray()[:, :-1],
    columns=["classlabel_class1"],
)
df = pd.concat([df, cl_ohe], axis="columns")
df
|   | color | size | price | classlabel | color_green | color_red | classlabel_class1 |
|---|-------|------|-------|------------|-------------|-----------|-------------------|
| 0 | green | 1 | 29.9 | class1 | 1.0 | 0.0 | 1.0 |
| 1 | red | 2 | 69.9 | class2 | 0.0 | 1.0 | 0.0 |
| 2 | blue | 3 | 99.9 | class1 | 0.0 | 0.0 | 1.0 |
| 3 | red | 2 | 59.9 | class1 | 0.0 | 1.0 | 1.0 |
# Preview the raw file with pandas' default header handling. No header=None
# is passed here, so the first record of the file ends up as the column
# names in the displayed result.
pd.read_csv("/home/aistudio/data/data20513/breast-cancer.data").head()
|   | no-recurrence-events | 30-39 | premeno | 30-34 | 0-2 | no | 3 | left | left_low | no.1 |
|---|----------------------|-------|---------|-------|-----|----|---|------|----------|------|
| 0 | no-recurrence-events | 40-49 | premeno | 20-24 | 0-2 | no | 2 | right | right_up | no |
| 1 | no-recurrence-events | 40-49 | premeno | 20-24 | 0-2 | no | 2 | left | left_low | no |
| 2 | no-recurrence-events | 60-69 | ge40 | 15-19 | 0-2 | no | 2 | right | left_up | no |
| 3 | no-recurrence-events | 40-49 | premeno | 0-4 | 0-2 | no | 2 | right | right_low | no |
| 4 | no-recurrence-events | 60-69 | ge40 | 15-19 | 0-2 | no | 2 | left | left_low | no |
# Re-read the file with header=None so the first record is kept as data,
# then discard column 0 and keep columns 1 onward.
# NOTE(review): in the earlier preview column 0 holds values such as
# "no-recurrence-events", which looks like the class label. Confirm that
# dropping it here is intended.
df = pd.read_csv(
    "/home/aistudio/data/data20513/breast-cancer.data",
    header=None,
).iloc[:, 1:]
print(df.shape)
df.head()
(286, 9)
|   | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 30-39 | premeno | 30-34 | 0-2 | no | 3 | left | left_low | no |
| 1 | 40-49 | premeno | 20-24 | 0-2 | no | 2 | right | right_up | no |
| 2 | 40-49 | premeno | 20-24 | 0-2 | no | 2 | left | left_low | no |
| 3 | 60-69 | ge40 | 15-19 | 0-2 | no | 2 | right | left_up | no |
| 4 | 40-49 | premeno | 0-4 | 0-2 | no | 2 | right | right_low | no |
# Pull the frame's values into a NumPy array and split it by position:
# the first eight columns become X (cast to str), the ninth becomes Y.
# NOTE(review): Y is the last column of df, not the column 0 that was
# dropped when df was loaded. Confirm this is the intended target.
dataset = df.values
print(dataset)
X = dataset[:, :8].astype(str)
Y = dataset[:, 8]
[['30-39' 'premeno' '30-34' ... 'left' 'left_low' 'no']
['40-49' 'premeno' '20-24' ... 'right' 'right_up' 'no']
['40-49' 'premeno' '20-24' ... 'left' 'left_low' 'no']
...
['60-69' 'ge40' '20-24' ... 'right' 'left_up' 'no']
['40-49' 'ge40' '30-34' ... 'left' 'left_low' 'no']
['50-59' 'ge40' '30-34' ... 'left' 'left_low' 'no']]
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# One-hot encode every column of X independently and stack the resulting
# indicator blocks side by side into one dense matrix, encoded_x.
encoded_columns = []
for i in range(X.shape[1]):
    # Map the string categories of column i to integer codes.
    label_encoder = LabelEncoder()
    feature = label_encoder.fit_transform(X[:, i])
    # OneHotEncoder expects 2-D input: one row per sample, one column.
    feature = feature.reshape(X.shape[0], 1)
    # Use the encoder's default (sparse) output and densify explicitly.
    # The `sparse=False` constructor argument is no longer accepted by
    # recent scikit-learn releases, whereas `.toarray()` works everywhere.
    onehot_encoder = OneHotEncoder()
    feature = onehot_encoder.fit_transform(feature).toarray()
    encoded_columns.append(feature)
# Concatenate once at the end instead of re-copying the growing matrix on
# every iteration.
encoded_x = np.concatenate(encoded_columns, axis=1)
print("X shape: : ", encoded_x.shape)
X shape: : (286, 41)
# One-hot encode Y: integer-code its categories, reshape to a single
# column, then expand into one indicator column per category.
label_encoder = LabelEncoder()
feature = label_encoder.fit_transform(Y)
# Use the encoder's default (sparse) output and densify explicitly.
# The `sparse=False` constructor argument is no longer accepted by recent
# scikit-learn releases, whereas `.toarray()` works everywhere.
onehot_encoder = OneHotEncoder()
feature = onehot_encoder.fit_transform(feature.reshape(Y.shape[0], 1)).toarray()
print("Y shape: : ", feature.shape)
Y shape: : (286, 2)