# Source: scikit-learn/base.py at 7b136e92acf49d46251479b75c88cba63... (signature=b75bd62a847d521364573a908c49e265)

"""Base classes for all estimators."""

# Author: Gael Varoquaux

# License: BSD 3 clause

import copy

import warnings

from collections import defaultdict

import platform

import numpy as np

from scipy import sparse

from .externals import six

from .utils.fixes import signature

from .utils import _IS_32BIT

from . import __version__

##############################################################################

def _first_and_last_element(arr):

"""Returns first and last element of numpy array or sparse matrix."""

if isinstance(arr, np.ndarray) or hasattr(arr, 'data'):

# numpy array or sparse matrix with .data attribute

data = arr.data if sparse.issparse(arr) else arr

return data.flat[0], data.flat[-1]

else:

# Sparse matrices without .data attribute. Only dok_matrix at

# the time of writing, in this case indexing is fast

return arr[0, 0], arr[-1, -1]

def clone(estimator, safe=True):
    """Construct a new estimator with the same parameters.

    Clone does a deep copy of the model in an estimator
    without actually copying attached data. It yields a new estimator
    with the same parameters that has not been fit on any data.

    Parameters
    ----------
    estimator : estimator object, or list, tuple or set of objects
        The estimator or group of estimators to be cloned.

    safe : boolean, optional
        If safe is false, clone will fall back to a deep copy on objects
        that are not estimators.

    Returns
    -------
    new_object : same type as ``estimator``
        A new object built by calling the estimator's class with cloned
        parameters, or a container of the same type holding the clones.

    Raises
    ------
    TypeError
        If ``safe`` is true and ``estimator`` is not an estimator instance
        (it has no ``get_params`` attribute, or it is a class).
    RuntimeError
        If the constructor does not store a parameter as the very object
        it was given.
    """
    estimator_type = type(estimator)
    # XXX: not handling dictionaries
    if estimator_type in (list, tuple, set, frozenset):
        return estimator_type([clone(e, safe=safe) for e in estimator])

    # A class exposes ``get_params`` as a plain function, so it would pass
    # the hasattr check and then fail on the unbound call below. Treat
    # classes like any other non-estimator object.
    is_class = isinstance(estimator, type)
    if is_class or not hasattr(estimator, 'get_params'):
        if not safe:
            return copy.deepcopy(estimator)
        if is_class:
            raise TypeError("Cannot clone object. You should provide an "
                            "instance of scikit-learn estimator instead "
                            "of a class.")
        raise TypeError("Cannot clone object '%s' (type %s): "
                        "it does not seem to be a scikit-learn estimator "
                        "as it does not implement a 'get_params' methods."
                        % (repr(estimator), type(estimator)))

    klass = estimator.__class__
    new_object_params = estimator.get_params(deep=False)
    # Only values of existing keys are replaced, so updating the mapping
    # while iterating over it is safe.
    for name, param in new_object_params.items():
        new_object_params[name] = clone(param, safe=False)
    new_object = klass(**new_object_params)
    params_set = new_object.get_params(deep=False)

    # quick sanity check of the parameters of the clone
    for name in new_object_params:
        param1 = new_object_params[name]
        param2 = params_set[name]
        if param1 is not param2:
            raise RuntimeError('Cannot clone object %s, as the constructor '
                               'either does not set or modifies parameter %s' %
                               (estimator, name))
    return new_object

###############################################################################

def _pprint(params, offset=0, printer=repr):

"""Pretty print the dictionary 'params'

Parameters

----------

params : dict

The dictionary to pretty print

offset : int

The offset in characters to add at the begin of each line.

printer : callable

The function to convert entries to strings, typically

the builtin str or repr

"""

# Do a multi-line justified repr:

options = np.get_printoptions()

np.set_printoptions(precision=5, threshold=64, edgeitems=2)

params_list = list()

this_line_length = offset

line_sep = ',\n' + (1 + offset // 2) * ' '

for i, (k, v) in enumerate(sorted(six.iteritems(params))):

if type(v) is float:

# use str for representing floating point numbers

# this way we get consistent representation across

# architectures and versions.

this_repr = '%s=%s' % (k, str(v))

else:

# use repr of the rest

this_repr = '%s=%s' % (k, printer(v))

if len(this_repr) > 500:

this_repr = this_repr[:300] + '...' + this_repr[-100:]

if i > 0:

if (this_line_length + len(this_repr) >= 75 or '\n' in this_repr):

params_list.append(line_sep)

this_line_length = len(line_sep)

else:

params_list.append(', ')

this_line_length += 2

params_list.append(this_repr)

this_line_length += len(this_repr)

np.set_printoptions(**options)

lines = ''.join(params_list)

# Strip trailing space to avoid nightmare in doctests

lines = '\n'.join(l.rstrip(' ') for l in lines.split('\n'))

return lines

###############################################################################

class BaseEstimator(object):
    """Base class for all estimators in scikit-learn

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """

    @classmethod
    def _get_param_names(cls):
        """Get parameter names for the estimator

        Returns
        -------
        names : list of str
            Sorted names of the ``__init__`` parameters, excluding ``self``
            and any ``**kwargs`` catch-all. Empty when the class does not
            define its own constructor.

        Raises
        ------
        RuntimeError
            If the constructor accepts ``*args``.
        """
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
        if init is object.__init__:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [p for p in init_signature.parameters.values()
                      if p.name != 'self' and p.kind != p.VAR_KEYWORD]
        for p in parameters:
            if p.kind == p.VAR_POSITIONAL:
                raise RuntimeError("scikit-learn estimators should always "
                                   "specify their parameters in the signature"
                                   " of their __init__ (no varargs)."
                                   " %s with constructor %s doesn't "
                                   " follow this convention."
                                   % (cls, init_signature))
        # Extract and sort argument names excluding 'self'
        return sorted([p.name for p in parameters])

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values. Parameters of nested
            objects appear under ``<name>__<sub_name>`` keys.
        """
        out = dict()
        for key in self._get_param_names():
            # a parameter missing from the instance is reported as None
            value = getattr(self, key, None)
            if deep and hasattr(value, 'get_params'):
                deep_items = value.get_params().items()
                out.update((key + '__' + k, val) for k, val in deep_items)
            out[key] = value
        return out

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method works on simple estimators as well as on nested objects
        (such as pipelines). The latter have parameters of the form
        ``<component>__<parameter>`` so that it's possible to update each
        component of a nested object.

        Parameters
        ----------
        **params : dict
            Parameter names mapped to their new values.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If a name (or the prefix before ``__``) is not a parameter of
            this estimator.
        """
        if not params:
            # Simple optimization to gain speed (inspect is slow)
            return self
        valid_params = self.get_params(deep=True)

        nested_params = defaultdict(dict)  # grouped by prefix
        for key, value in params.items():
            key, delim, sub_key = key.partition('__')
            if key not in valid_params:
                raise ValueError('Invalid parameter %s for estimator %s. '
                                 'Check the list of available parameters '
                                 'with `estimator.get_params().keys()`.' %
                                 (key, self))

            if delim:
                nested_params[key][sub_key] = value
            else:
                setattr(self, key, value)
                # remember the new value so that nested updates given in
                # the same call are applied to the replacement object
                valid_params[key] = value

        for key, sub_params in nested_params.items():
            valid_params[key].set_params(**sub_params)

        return self

    def __repr__(self):
        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False),
                                               offset=len(class_name),),)

    def __getstate__(self):
        """Return the state to pickle, tagged with the library version.

        The version tag is only added for classes whose module name starts
        with ``'sklearn.'``.
        """
        try:
            state = super(BaseEstimator, self).__getstate__()
        except AttributeError:
            # No ``__getstate__`` further up the MRO (Python < 3.11)
            state = None
        if state is None:
            # Python 3.11+ provides ``object.__getstate__``, which returns
            # None for an instance whose ``__dict__`` is empty.
            state = self.__dict__.copy()

        if type(self).__module__.startswith('sklearn.'):
            return dict(state.items(), _sklearn_version=__version__)
        else:
            return state

    def __setstate__(self, state):
        """Restore pickled state, warning when the version tag differs."""
        if type(self).__module__.startswith('sklearn.'):
            # states without a tag are reported as "pre-0.18"
            pickle_version = state.pop("_sklearn_version", "pre-0.18")
            if pickle_version != __version__:
                warnings.warn(
                    "Trying to unpickle estimator {0} from version {1} when "
                    "using version {2}. This might lead to breaking code or "
                    "invalid results. Use at your own risk.".format(
                        self.__class__.__name__, pickle_version, __version__),
                    UserWarning)
        try:
            super(BaseEstimator, self).__setstate__(state)
        except AttributeError:
            self.__dict__.update(state)

###############################################################################

class ClassifierMixin(object):
    """Mixin class for all classifiers in scikit-learn."""

    # tag read by ``is_classifier``
    _estimator_type = "classifier"

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Test samples.

        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) wrt. y.
        """
        # imported inside the method rather than at module level
        from .metrics import accuracy_score

        y_pred = self.predict(X)
        return accuracy_score(y, y_pred, sample_weight=sample_weight)

###############################################################################

class RegressorMixin(object):
    """Mixin class for all regression estimators in scikit-learn."""

    # tag read by ``is_regressor``
    _estimator_type = "regressor"

    def score(self, X, y, sample_weight=None):
        """Return the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as (1 - u/v), where u is the residual
        sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
        sum of squares ((y_true - y_true.mean()) ** 2).sum().
        The best possible score is 1.0 and it can be negative (because the
        model can be arbitrarily worse). A constant model that always
        predicts the expected value of y, disregarding the input features,
        would get a R^2 score of 0.0.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Test samples. For some estimators this may be a
            precomputed kernel matrix instead, shape = (n_samples,
            n_samples_fitted), where n_samples_fitted is the number of
            samples used in the fitting for the estimator.

        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True values for X.

        sample_weight : array-like, shape = [n_samples], optional
            Sample weights.

        Returns
        -------
        score : float
            R^2 of self.predict(X) wrt. y.
        """
        # imported inside the method rather than at module level
        from .metrics import r2_score

        y_pred = self.predict(X)
        return r2_score(y, y_pred, sample_weight=sample_weight,
                        multioutput='variance_weighted')

###############################################################################

class ClusterMixin(object):
    """Mixin class for all cluster estimators in scikit-learn."""

    _estimator_type = "clusterer"

    def fit_predict(self, X, y=None):
        """Perform clustering on X and return the cluster labels.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Input data.

        y : Ignored
            not used, present for API consistency by convention.

        Returns
        -------
        labels : ndarray, shape (n_samples,)
            cluster labels, read from the ``labels_`` attribute after
            fitting.
        """
        # Generic fallback: fit, then read the labels stored by ``fit``.
        # Subclasses may override this when a cheaper route exists.
        self.fit(X)
        labels = self.labels_
        return labels

class BiclusterMixin(object):
    """Mixin class for all bicluster estimators in scikit-learn"""

    @property
    def biclusters_(self):
        """Convenient way to get row and column indicators together.

        Returns the ``rows_`` and ``columns_`` members.
        """
        return self.rows_, self.columns_

    def get_indices(self, i):
        """Row and column indices of the i'th bicluster.

        Only works if ``rows_`` and ``columns_`` attributes exist.

        Parameters
        ----------
        i : int
            The index of the cluster.

        Returns
        -------
        row_ind : np.array, dtype=np.intp
            Indices of rows in the dataset that belong to the bicluster.
        col_ind : np.array, dtype=np.intp
            Indices of columns in the dataset that belong to the bicluster.
        """
        row_indicator = self.rows_[i]
        col_indicator = self.columns_[i]
        # np.nonzero returns one index array per dimension; keep the first
        row_ind = np.nonzero(row_indicator)[0]
        col_ind = np.nonzero(col_indicator)[0]
        return row_ind, col_ind

    def get_shape(self, i):
        """Shape of the i'th bicluster.

        Parameters
        ----------
        i : int
            The index of the cluster.

        Returns
        -------
        shape : (int, int)
            Number of rows and columns (resp.) in the bicluster.
        """
        return tuple(map(len, self.get_indices(i)))

    def get_submatrix(self, i, data):
        """Return the submatrix corresponding to bicluster `i`.

        Parameters
        ----------
        i : int
            The index of the cluster.
        data : array
            The data.

        Returns
        -------
        submatrix : array
            The submatrix corresponding to bicluster i.

        Notes
        -----
        Works with sparse matrices. Only works if ``rows_`` and
        ``columns_`` attributes exist.
        """
        # imported inside the method rather than at module level
        from .utils.validation import check_array

        validated = check_array(data, accept_sparse='csr')
        rows, cols = self.get_indices(i)
        # a column vector of row indices combined with the column indices
        # selects the full rows-by-cols block
        return validated[rows[:, np.newaxis], cols]

###############################################################################

class TransformerMixin(object):
    """Mixin class for all transformers in scikit-learn."""

    def fit_transform(self, X, y=None, **fit_params):
        """Fit to data, then transform it.

        Fits transformer to X and y with optional parameters fit_params
        and returns a transformed version of X.

        Parameters
        ----------
        X : numpy array of shape [n_samples, n_features]
            Training set.

        y : numpy array of shape [n_samples]
            Target values.

        **fit_params : dict
            Extra keyword arguments forwarded to ``fit``.

        Returns
        -------
        X_new : numpy array of shape [n_samples, n_features_new]
            Transformed array.
        """
        # Generic fallback: subclasses may override this when fitting and
        # transforming can be done more efficiently in one pass.
        if y is not None:
            # supervised transformation: ``fit`` receives X and y
            fitted = self.fit(X, y, **fit_params)
        else:
            # unsupervised transformation: ``fit`` receives X only
            fitted = self.fit(X, **fit_params)
        return fitted.transform(X)

class DensityMixin(object):
    """Mixin class for all density estimators in scikit-learn."""

    _estimator_type = "DensityEstimator"

    def score(self, X, y=None):
        """Return the score of the model on the data X.

        This default implementation does nothing and returns None.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        score : float
        """
        return None

class OutlierMixin(object):
    """Mixin class for all outlier detection estimators in scikit-learn."""

    # tag read by ``is_outlier_detector``
    _estimator_type = "outlier_detector"

    def fit_predict(self, X, y=None):
        """Perform fit on X and return labels for X.

        Returns -1 for outliers and 1 for inliers.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Input data.

        y : Ignored
            not used, present for API consistency by convention.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            1 for inliers, -1 for outliers.
        """
        # override for transductive outlier detectors like LocalOutlierFactor
        fitted = self.fit(X)
        return fitted.predict(X)

###############################################################################

class MetaEstimatorMixin(object):
    """Mixin class for all meta estimators in scikit-learn."""
    # Deliberately empty: for the moment this class only serves as a tag.

class _UnstableArchMixin(object):
    """Mark estimators that are non-deterministic on 32bit or PowerPC"""

    def _more_tags(self):
        """Return the ``non_deterministic`` tag for the current platform."""
        # truthy on 32bit builds, or when the machine name starts with
        # 'ppc' / 'powerpc'
        unstable_arch = (
            _IS_32BIT or platform.machine().startswith(('ppc', 'powerpc')))
        return {'non_deterministic': unstable_arch}

def is_classifier(estimator):
    """Return True if the given estimator is (probably) a classifier.

    The check only compares the estimator's ``_estimator_type`` attribute
    with ``"classifier"``.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is a classifier and False otherwise.
    """
    estimator_kind = getattr(estimator, "_estimator_type", None)
    return estimator_kind == "classifier"

def is_regressor(estimator):
    """Return True if the given estimator is (probably) a regressor.

    The check only compares the estimator's ``_estimator_type`` attribute
    with ``"regressor"``.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is a regressor and False otherwise.
    """
    estimator_kind = getattr(estimator, "_estimator_type", None)
    return estimator_kind == "regressor"

def is_outlier_detector(estimator):
    """Return True if the given estimator is (probably) an outlier detector.

    The check only compares the estimator's ``_estimator_type`` attribute
    with ``"outlier_detector"``.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is an outlier detector and False otherwise.
    """
    estimator_kind = getattr(estimator, "_estimator_type", None)
    return estimator_kind == "outlier_detector"

# You may also be interested in: signature=b75bd62a847d521364573a908c49e265, scikit-learn/base.py at 7b136e92acf49d46251479b75c88cba63...