LabelEncoder 特征编码

1. 连续编码(标签编码:把每个类别映射为一个整数)

import pandas as pd
from sklearn.preprocessing import LabelEncoder


class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Every selected column is encoded independently with a fresh
    ``LabelEncoder``. The encoders are created and fitted inside
    ``transform`` and are not stored, so the integer codes depend only on
    the data passed to that particular call.
    """

    def __init__(self, columns=None):
        # Column names to encode; None means "encode every column of X".
        self.columns = columns

    def fit(self, X, y=None):
        """Return ``self`` without learning anything.

        Present only so ``fit(...).transform(...)`` can be chained; all
        fitting happens inside ``transform``.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Columns listed in ``self.columns`` are replaced by the output of
        ``LabelEncoder().fit_transform``. When ``self.columns`` is None,
        every column of ``X`` is encoded. ``X`` itself is not modified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent that exists in both old and new pandas versions.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Encode ``X``; same result as ``transform(X)`` because ``fit`` is a no-op."""
        return self.transform(X)


if __name__ == "__main__":
    # Demo: label-encode every column of a small fruit table and print it.
    fruit_data = pd.DataFrame(
        {
            'fruit': ['apple', 'orange', 'pear', 'orange'],
            'color': ['red', 'orange', 'green', 'green'],
        }
    )

    # No column list is given, so all columns are encoded.
    encoder = MultiColumnLabelEncoder()
    data_encode = encoder.fit_transform(fruit_data)
    print(data_encode)

[图片:LabelEncoder 特征编码 第1张图片]

2. one-hot编码

import pandas as pd

def one_hot_encoding(df, nan_as_category=True):
    """One-hot encode every object-dtype column of ``df``.

    ``nan_as_category`` is forwarded to ``pd.get_dummies`` as ``dummy_na``
    and decides whether missing values get an indicator column of their own.

    Returns a tuple ``(encoded, new_columns)``: the encoded DataFrame and
    the list of column names in it that were not present in ``df``.
    """
    columns_before = df.columns.tolist()

    # Only columns whose dtype compares equal to 'object' are encoded.
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(
        df,
        columns=object_columns,
        dummy_na=nan_as_category,
    )

    added_columns = []
    for name in encoded.columns:
        if name not in columns_before:
            added_columns.append(name)
    return encoded, added_columns

if __name__ == "__main__":
    # Demo: one-hot encode a single-column fruit table, then print the
    # encoded frame followed by the names of the columns that were added.
    fruit_data = pd.DataFrame(
        {
            'fruit': ['apple', 'orange', 'pear', 'orange'],
            # 'color': ['red', 'orange', 'green', 'green']
        }
    )

    encoding_result = one_hot_encoding(fruit_data)
    data_encode = encoding_result[0]
    cols_new = encoding_result[1]
    print(data_encode)
    print(cols_new)

[图片:LabelEncoder 特征编码 第2张图片]

# One-hot encode a single column.
# NOTE(review): `ser` and `df` are not defined in this snippet; presumably
# `ser` is the column to encode and `df` the frame it belongs to (confirm).
dummies_Timestamp = pd.get_dummies(ser, prefix='Timestamp')
# Append the dummy columns to `df` as new columns (axis=1).
df = pd.concat([df, dummies_Timestamp], axis=1)

你可能感兴趣的:(机器学习,机器学习)