# One-hot encode every column listed in one_hot_feature and append the encoded
# columns to the existing train_x / test_x feature matrices.
enc = OneHotEncoder()
for feature in one_hot_feature:
    # Fit on `data` so that train and test are encoded with the same category
    # set (presumably `data` holds both the train and test rows -- confirm).
    enc.fit(data[feature].values.reshape(-1, 1))
    # reshape(-1, 1) turns the 1-D column into the (n_samples, 1) layout
    # that the encoder was fitted with.
    train_a = enc.transform(train[feature].values.reshape(-1, 1))
    test_a = enc.transform(test[feature].values.reshape(-1, 1))
    # Append the new indicator columns on the right of the matrices built so far.
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
下面是实例:
data
k1 k2 k3
0 1 2 3
1 7 5 9
2 8 6 10
>>> enc.fit(data['k1'].reshape(-1,1)) # 这里只提取‘k1’:[1,7,8]这一列出来做fit
>>> aa=enc.transform(pd.Series([1,7,3]).reshape(-1,1))#这里就是将[1,7,3]转换为(3,1)的形式再做transform。因为fit时k1是以(3,1)即单列的形式传入的,
# 所以这里要transform的数据也必须是只有一列的(n,1)形式
>>> print(aa)
(0, 0) 1.0
(1, 1) 1.0
>>> aa.toarray
<bound method _cs_matrix.toarray of <3x3 sparse matrix of type '<class 'numpy.float64'>'
with 2 stored elements in Compressed Sparse Row format>>
>>> aa.toarray() # toarray()把稀疏矩阵转换成普通的(稠密)numpy数组,要善用这个!!
array([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 0.]])
>>> aa=enc.transform(pd.Series([15,0,3]).values.reshape(-1,1))
Traceback (most recent call last):
File "...\Python\lib\site-packages\sklearn\preprocessing\data.py", line 2046, in _transform
"during transform." % X.ravel()[~mask])
ValueError: unknown categorical feature present [15] during transform.
>>> aa=enc.transform(pd.Series([10,0,3]).values.reshape(-1,1))
Traceback (most recent call last):
File "...\Python\lib\site-packages\sklearn\preprocessing\data.py", line 2046, in _transform
"during transform." % X.ravel()[~mask])
ValueError: unknown categorical feature present [10] during transform.
>>> aa=enc.transform(pd.Series([1,0,3]).values.reshape(-1,1))
>>> aa.toarray()
array([[1., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]])
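这里出现两个一样的错误,细致分析,得到这样的结论:在这个版本的OneHotEncoder(n_values='auto'、handle_unknown='error')中,fit时会记下该列的最大值(这里k1的最大值是8)。要transform的对象X不要求包含fit对象F的全部元素;X中不超过该最大值但没有在F中出现过的值(如0、3)不会报错,只是被编码成全0的一行;而超过该最大值的值(如10、15)才会抛出unknown categorical feature错误,与该值在X中的位置无关。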
上面的是对一列做fit的,接下来看看对nxn的做fit。
enc.fit(data)
>>> cc=enc.transform(pd.Series([1,5,2]).reshape(-1,1))
Warning (from warnings module):
File "__main__", line 1
FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
Traceback (most recent call last):
File "", line 1, in
cc=enc.transform(pd.Series([1,5,2]).reshape(-1,1))
File "...\Python\lib\site-packages\sklearn\preprocessing\data.py", line 2075, in transform
self.categorical_features, copy=True)
File "...\Python\lib\site-packages\sklearn\preprocessing\data.py", line 1812, in _transform_selected
return transform(X)
File "...\Python\lib\site-packages\sklearn\preprocessing\data.py", line 2032, in _transform
% (indices.shape[0] - 1, n_features))
ValueError: X has different shape than during fitting. Expected 3, got 1.
>>> cc=enc.transform(pd.Series([1,5,2],[5,6,7],[8,9,10]))
Traceback (most recent call last):
File "", line 1, in
cc=enc.transform(pd.Series([1,5,2],[5,6,7],[8,9,10]))
File "...\Python\lib\site-packages\pandas\core\series.py", line 177, in __init__
dtype = self._validate_dtype(dtype)
File "...\Python\lib\site-packages\pandas\core\generic.py", line 152, in _validate_dtype
dtype = pandas_dtype(dtype)
File "...\Python\lib\site-packages\pandas\core\dtypes\common.py", line 1951, in pandas_dtype
npdtype = np.dtype(dtype)
TypeError: data type not understood
>>> cc=enc.transform(pd.DataFrame([1,5,2],[5,6,7],[8,9,10]))
Traceback (most recent call last):
File "...\Python\lib\site-packages\pandas\core\internals.py", line 4622, in create_block_manager_from_blocks
placement=slice(0, len(axes[0])))]
File "...\Python\lib\site-packages\pandas\core\internals.py", line 2957, in make_block
return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
File "...\Python\lib\site-packages\pandas\core\internals.py", line 120, in __init__
len(self.mgr_locs)))
ValueError: Wrong number of items passed 1, placement implies 3
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "", line 1, in
cc=enc.transform(pd.DataFrame([1,5,2],[5,6,7],[8,9,10]))
File "...\Python\lib\site-packages\pandas\core\frame.py", line 385, in __init__
copy=copy)
File "...\Python\lib\site-packages\pandas\core\frame.py", line 533, in _init_ndarray
return create_block_manager_from_blocks([values], [columns, index])
File "...\Python\lib\site-packages\pandas\core\internals.py", line 4631, in create_block_manager_from_blocks
construction_error(tot_items, blocks[0].shape[1:], axes, e)
File "...\Python\lib\site-packages\pandas\core\internals.py", line 4608, in construction_error
passed, implied))
ValueError: Shape of passed values is (1, 3), indices imply (3, 3)
>>> cc=enc.transform(pd.Series([1,5,2]))
Traceback (most recent call last):
File "", line 1, in
cc=enc.transform(pd.Series([1,5,2]))
File "...\Python\lib\site-packages\sklearn\preprocessing\data.py", line 2075, in transform
self.categorical_features, copy=True)
File "...\Python\lib\site-packages\sklearn\preprocessing\data.py", line 1809, in _transform_selected
X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)
File "...\Python\lib\site-packages\sklearn\utils\validation.py", line 441, in check_array
"if it contains a single sample.".format(array))
ValueError: Expected 2D array, got 1D array instead:
array=[1. 5. 2.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
>>> cc=enc.transform(pd.Series([1,5,2]).reshape(-1,1))
Traceback (most recent call last):
File "", line 1, in
cc=enc.transform(pd.Series([1,5,2]).reshape(-1,1))
File "...\Python\lib\site-packages\sklearn\preprocessing\data.py", line 2075, in transform
self.categorical_features, copy=True)
File " ...\Python\lib\site-packages\sklearn\preprocessing\data.py", line 1812, in _transform_selected
return transform(X)
File "...\Python\lib\site-packages\sklearn\preprocessing\data.py", line 2032, in _transform
% (indices.shape[0] - 1, n_features))
ValueError: X has different shape than during fitting. Expected 3, got 1.
>>> cc=enc.transform(pd.Series([1,5,2]).reshape(-1,3))
>>> cc.toarray()
array([[1., 0., 0., 0., 1., 0., 0., 0., 0.]])
>>> cc=enc.transform(pd.Series([1,2,5]).reshape(-1,3))
>>> cc.toarray()
array([[1., 0., 0., 1., 0., 0., 0., 0., 0.]])
>>> cc=enc.transform(pd.Series([1,2,3]).reshape(-1,3))
>>> cc.toarray()
array([[1., 0., 0., 1., 0., 0., 1., 0., 0.]])
>>> cc=enc.transform(pd.Series([8,6,10]).reshape(-1,3))
>>> cc.toarray()
array([[0., 0., 1., 0., 0., 1., 0., 0., 1.]])
上面的那么多错误都是因为没理解深刻的,细心想想之后就会明白:fit整个data,此时data是'k1':[1,7,8], 'k2':[2,5,6], 'k3':[3,9,10]三列的数据,自然transform的对象就是要有三列以对应fit的对象。至此,就对应到所谓的OneHotEncoder数据扩维的观点
# OneHotEncoder expands a categorical column into one indicator column per category.
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe.fit([[1],[2],[3],[4]])
# transform() takes ONE 2-D array-like of shape (n_samples, n_features), so the
# four samples are wrapped in a single outer list instead of being passed as
# four separate positional arguments.
ohe.transform([[2],[3],[1],[4]]).toarray()
输出:[ [0,1,0,0] , [0,0,1,0] , [1,0,0,0] ,[0,0,0,1] ]
下面说明一下OneHotEncoder之后的应用:
train_x= sparse.hstack((train_x, train_a))
test_x = sparse.hstack((test_x, test_a))
在上面的分析中,的确可以看到OneHotEncoder的数据扩维的性质,接下来看看独热编码之后数据具体变成什么样。
enc.fit(data['k1'].reshape(-1,1))
>>> ll=enc.transform(pd.Series([1,7,8]).reshape(-1,1))
Warning (from warnings module):
File "__main__", line 1
FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
>>> ll=enc.transform(pd.Series([1,7,8]).values.reshape(-1,1))
>>> ll.toarray()
array([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.]])
>>> ll=enc.transform(pd.Series([1,1,1]).values.reshape(-1,1))#这里的reshape(-1,1)可以理解为将[1,1,1]转换为[1],[1],[1]共3行1列的数据,
#所以对于单独的[1]做transform结果就是[1,0,0]
>>> ll.toarray()
array([[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.]])
# // 上面的可以不管,大意了
# // 上面的可以不管,大意了
>>> ll=enc.transform(pd.Series([2,5,6]).values.reshape(-1,1)) # 这里大意了,重复定义了ll 所以这里就ll对应k2,下面的lll才对应k1
>>> ll.toarray()
array([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.]])
>>> enc.fit(data['k1'].reshape(-1,1))
Warning (from warnings module):
File "__main__", line 1
FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
handle_unknown='error', n_values='auto', sparse=True)
>>> lll=enc.transform(pd.Series([1,7,8]).values.reshape(-1,1))
>>> lll.toarray()
array([[1., 0., 0.],
[0., 1., 0.],
[0., 0., 1.]])
>>> from scipy import sparse
>>> rt = sparse.hstack((lll,ll)) # 这里应用 稀疏合并 (sparse.hstack)就是为了看看实际应用中对所编码的特征转换后的结果是怎么个样子的
>>> rt
<3x6 sparse matrix of type '<class 'numpy.float64'>'
with 6 stored elements in COOrdinate format>
>>> rt.toarray() # 具体应用可以看最上面给出的代码,这里就是用一个例子做结果分析。
array([[1., 0., 0., 1., 0., 0.], # 这里可以看到把各特征分别transform再横向拼接之后的结果(这里拼了两个特征,每个有3种取值,所以是3x6)。
[0., 1., 0., 0., 1., 0.], # 假设要fit的对象X是 n x m 的,且第j列有 k_j 个不同取值,transform得到的结果Y的size就是 n x (k_1+...+k_m) 的,
[0., 0., 1., 0., 0., 1.]]) #所以特征取值很多时Y会变得非常大,因此OneHotEncoder经常跟pca联合起来使用
>>> data = pd.DataFrame({'name':['joe','john','bob','kevin'],
'age':[10,20,30,10],
'sex':['male','male','female','male']})
>>> data['age']=LabelEncoder().fit_transform(data['age'])
>>> data # 可以看出LabelEncoder会自动对元素进行编号:先把所有不同取值排序(字符串按字典序,数字按大小),再依次编号为0,1,2,...
age name sex
0 0 joe male
1 1 john male
2 2 bob female
3 0 kevin male
>>> data['name']=LabelEncoder().fit_transform(data['name'])
>>> data['sex']=LabelEncoder().fit_transform(data['sex'])
>>> data # 对data全体进行LabelEncoder之后的数据
age name sex
0 0 1 1
1 1 2 1
2 2 0 0
3 0 3 1
>>> enc=OneHotEncoder()
>>> from scipy import sparse
>>> trainx=data['age'] #这里是data[ ],,
>>> for feature in ['age','name','sex']:
enc.fit(data[feature].values.reshape(-1,1))
kkk1=enc.transform(data[feature].values.reshape(-1,1))
trainx =sparse.hstack((trainx,kkk1))
OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
handle_unknown='error', n_values='auto', sparse=True)
Traceback (most recent call last):
File "", line 4, in
trainx =sparse.hstack((trainx,kkk1))
File "...\Python\lib\site-packages\scipy\sparse\construct.py", line 464, in hstack
return bmat([blocks], format=format, dtype=dtype)
File "...\Python\lib\site-packages\scipy\sparse\construct.py", line 585, in bmat
raise ValueError(msg)
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 4, expected 1.#从上面可以看出因为
#trainx=data['age']是一维的Series,被hstack当成1行4列(shape为(1,4))的块,而kkk1有4行,行数对不上,所以出现此错误
>>> trainx=data[['age']] #这里是data[[ ]],
>>> for feature in ['age','name','sex']:
enc.fit(data[feature].values.reshape(-1,1))
kkk1=enc.transform(data[feature].values.reshape(-1,1))
trainx =sparse.hstack((trainx,kkk1))
OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
handle_unknown='error', n_values='auto', sparse=True)
OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
handle_unknown='error', n_values='auto', sparse=True)
OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
handle_unknown='error', n_values='auto', sparse=True)
>>> trainx.toarray()
array([[0., 1., 0., 0., 0., 1., 0., 0., 0., 1.],
[1., 0., 1., 0., 0., 0., 1., 0., 0., 1.],
[2., 0., 0., 1., 1., 0., 0., 0., 1., 0.],
[0., 1., 0., 0., 0., 0., 0., 1., 0., 1.]])
>>> trainx=data[['age']]
>>> type(trainx)
<class 'pandas.core.frame.DataFrame'>
>>> trainx=data['age']
>>> type(trainx)
<class 'pandas.core.series.Series'>
接下来分析一下上面的那个trainx.toarray()结果:
第一列就是trainx的初始值data[['age']] ,接下来就是循环对应的三个feature。
第一个feature:age,transform的data['age']=[0,1,2,0],而将其reshape(-1,1),就变成[[0],[1],[2],[0]],此时独热编码是将每一行看做一次编码
[0]编码得到的结果就是[1,0,0](因为0在age这个特征变量中的三个特征值中为第一个)。[1]编码得到的结果就是[0,1,0](因为1在age这个特征变量中的三个特征值中为第二个)。[2]编码得到的结果就是[0,0,1],再来个[0]就得到[1,0,0],所以结果是[[1,0,0],[0,1,0],[0,0,1],[1,0,0]],对应trainx.toarray()的第2~4列。
第二个feature:name,transform的data['name']=[1,2,0,3],按上面的分析,最后得到的结果就是[[0,1,0,0],[0,0,1,0],[1,0,0,0],[0,0,0,1]],对应trainx.toarray()的第5~8列。
第三个feature:sex,transform的data['sex']=[1,1,0,1], 结果是[[0,1],[0,1],[1,0],[0,1]],对应trainx.toarray()的最后两列。