sklearn 常用api(一)

import numpy as np
np.set_printoptions(precision=4)

类别的编码

将类别变为从0开始计数;

from sklearn.preprocessing import LabelEncoder
>>> y = np.array(['r', 'g', 'g', 'b'])
>>> enc = LabelEncoder()

>>> enc.fit(y)
>>> y = enc.transform(y)
>>> y
array([2, 1, 1, 0], dtype=int32)

>>> enc.transform(['r', 'g', 'b'])
array([2, 1, 0])

或者fit、transform两步并作一步fit_transform

>>> y = np.array(['r', 'g', 'g', 'b'])
>>> enc = LabelEncoder()
>>> y = enc.fit_transform(y)
>>> y
array([2, 1, 1, 0], dtype=int32)

自然也有求逆的转换 (inverse_transform)(我们为了操作的 方便,将原本内含丰富的类别标签转换为以0开始的数字型标签,操作完成之后,我们希望能转换为原始的标签形态):

>>> enc.inverse_transform(y)
array(['r', 'g', 'g', 'b'], dtype='<U1')

clone:同时保存原始和fitted后的模型

from sklearn.base import clone
self.classifiers_ = []
for clf in self.classifiers:
    fitted_clf = clone(clf).fit(X, self.enc.transform(y))
    self.classifiers_.append(fitted_clf)

clf.predict_proba()

from sklearn.naive_bayes import BernoulliNB
>>> classes = ['one','two','three','one','three']
>>> feature = [[0,1,1,0],[0,1,0,1],[1,1,0,0],[0,0,0,0],[0,1,1,1]]

>>> clf = BernoulliNB()
>>> clf.fit(feature, classes)
>>> P = clf.predict_proba(feature)
>>> P
[[ 0.5428  0.3619  0.0953]
 [ 0.1958  0.3916  0.4126]
 [ 0.2835  0.5671  0.1494]
 [ 0.7154  0.159   0.1256]
 [ 0.2467  0.4934  0.2599]]

>>> P.shape
(5, 3)
                                 # n_samples * n_classes

>>> P.dot(np.ones(len(np.unique(classes))))                              
[ 1.  1.  1.  1.  1.]           
                        # P的行和为1
                        # 也即P的每一行代表当前样本属于每一类的概率分布

scikit-learn uses the predict_proba method (if applicable) to compute the ROC AUC score. 关于ROC AUC 理解及实践的详细信息,见[ROC曲线与AUC区域的理解与实践],[分类模型的精确率(precision)与召回率(recall)(Python)]

fit

模型(model)从训练数据集中学习得到(模型所需)参数的过程即为 fit

class MajVoteClf(BaseEstimator, ClassifierMixin):
    """Skeleton of a majority-vote ensemble classifier.

    Parameters
    ----------
    classifiers : list
        Unfitted base classifiers to be ensembled. Kept as-is in
        ``__init__`` (sklearn convention: no validation/fitting there).
    """

    def __init__(self, classifiers):
        self.classifiers = classifiers

    def fit(self, X, y):
        """Fit a clone of every base classifier on (X, label-encoded y).

        Returns
        -------
        self : MajVoteClf
            Returned so calls can be chained (sklearn estimator contract).
        """
        self.enc = LabelEncoder()
        self.enc.fit(y)
        # BUGFIX: the learned attribute is `classes_` (trailing underscore);
        # `self.enc.classes` would raise AttributeError at runtime.
        self.classes = self.enc.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            # clone() leaves the original unfitted estimator untouched,
            # so both the raw and the fitted models remain available.
            fitted_clf = clone(clf).fit(X, self.enc.transform(y))
            self.classifiers_.append(fitted_clf)
        # BUGFIX: sklearn's fit() contract requires returning self.
        return self

_name_estimators

from sklearn.pipeline import _name_estimators
>>> from sklearn.pipeline import _name_estimators
>>> help(_name_estimators)

    Generate names for estimators.(及参数信息)

以二元tuple构成的list的形式返回,每一个list中的元素(二元tuple)对应于一个classifier。

>>> from sklearn.svm import SVC
>>> from sklearn.linear_model import LogisticRegression

>>> _name_estimators([SVC(), LogisticRegression()])

[
(
    'logisticregression', 
    LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0)
), 

(
    'svc', 
    SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
)
]

接下来,我们将这些二元tuple构成的list转换为dict类型(非常实用的一种转换):

>>> named_clfs = {k: v for k, v in _name_estimators([LogisticRegression(), SVC()])}

{

    'logisticregression': 
    LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0),

    'svc': 
    SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)

}

你可能感兴趣的:(sklearn 常用api(一))