【机器学习手册】【5】为特征编码,分类

前言

最近几天很难受,之前天天写博客写得烦了。今天想起来这些天日记没写,是因为没写日记所以烦?怎么可能,是因为没有反思,没有考虑自己是否在做对的事。我不知道怎么才算是休息,因为我不爱打游戏,喜欢的游戏比如坎巴拉都是几个人一起玩才有意思。昨天出门玩了漂移板,什么时候才能耍帅啊!


对无序标签编码

import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from fancyimpute import KNN
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer


# Nominal (unordered) categorical feature: one state name per sample,
# stored as a 5x1 column of strings.
feature_matrix = np.array(
    [
        ["Texas"],
        ["California"],
        ["Texas"],
        ["Delaware"],
        ["Texas"],
    ]
)


# Create the one-hot encoder.
coder = LabelBinarizer()

# Fit the encoder on the feature and encode it in a single step.
encoded = coder.fit_transform(feature_matrix)

# Label the columns with the classes the encoder actually learned instead of a
# hand-typed list: `classes_` holds the label of each output column, so the
# header cannot drift out of sync with the data if the categories change.
# NOTE: with exactly two distinct classes LabelBinarizer emits a single
# column, so this labelling assumes three or more categories, as here.
dataframe = pd.DataFrame(encoded, columns=coder.classes_)

print(dataframe)

   California  Delaware  Texas
0           0         0      1   
1           1         0      0   
2           0         0      1   
3           0         1      0   
4           0         0      1   

对有序标签编码

import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from fancyimpute import KNN
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer


# Ordinal feature: the categories carry a natural order (low < medium < high).
dataframe = pd.DataFrame(
    {"Score": ["low", "high", "low", "high", "low", "medium"]}
)

# Mapping from each category to its integer rank.
mapper = {"low": 1, "medium": 2, "high": 3}

# Encode with Series.map rather than Series.replace: map builds a new integer
# Series directly from the lookup table, whereas replace on an object column
# depends on implicit object->int downcasting (deprecated in recent pandas).
# Any category missing from `mapper` becomes NaN instead of slipping through
# as an unencoded string.
encoded = dataframe["Score"].map(mapper)
print(encoded)


# The original frame is left unchanged; the encoding returned a new Series.
print(dataframe)

0    1
1    3
2    1
3    3
4    1
5    2
Name: Score, dtype: int64        
    Score
0     low
1    high
2     low
3    high
4     low
5  medium

对字典编码

import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from fancyimpute import KNN
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer


# One dict per sample: keys are feature names, values are numeric.
# A key absent from a sample shows up as 0 in that sample's row.
datadict = [
    {"red": 2, "blue": 4},
    {"red": 4, "blue": 3},
    {"red": 1, "yellow": 2},
    {"red": 2, "yellow": 2},
]


# Build the vectorizer; sparse=False requests a dense array
# instead of a sparse matrix.
dv = DictVectorizer(sparse=False)

# Fit on the dicts and convert them to a feature matrix
# (one column per distinct key).
feature_array = dv.fit_transform(datadict)
print(feature_array)

[[4. 2. 0.]
 [3. 4. 0.]
 [0. 1. 2.]
 [0. 2. 2.]]

缺失值填充

import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from fancyimpute import KNN
from sklearn.covariance import EllipticEnvelope
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier


# Training data, 4 rows x 3 columns: column 0 is the class label to learn,
# the remaining columns are the predictor features.
feature_matrix = np.array(
    [
        [0, 2.1, 1.45],
        [1, 1.18, 1.33],
        [0, 1.22, 1.27],
        [1, -0.22, 1.45],
    ]
)


# KNN classifier: 3 neighbours, votes weighted by distance.
c = KNeighborsClassifier(n_neighbors=3, weights="distance")

# Train the model: predictors are every column but the first,
# the target is the first column.
predictors = feature_matrix[:, 1:]
labels = feature_matrix[:, 0]
model = c.fit(predictors, labels)


# Rows whose first column (the class) is missing.
poor_matrix = np.array([[np.nan, 0.87, 1.3], [np.nan, -0.67, -0.23]])


# Predict the missing class from the known feature columns.
known_features = poor_matrix[:, 1:]
value = model.predict(known_features)
print(value)
print('--------------------------------------')

# Put the predicted class back in front of the known feature columns.
connected = np.hstack((value[:, np.newaxis], known_features))
print(connected)
print('--------------------------------------')

# The imputed rows could then be stacked on top of the training matrix with
# np.vstack((connected, feature_matrix)); that step is not executed here.

[1. 1.]
--------------------------------------  
[[ 1.    0.87  1.3 ]
 [ 1.   -0.67 -0.23]]
--------------------------------------  

你可能感兴趣的:(python,机器学习,分类,python)