想了解这个的原因,是因为Gbdt不能设置这个参数,但是GBDT是最常用的分类器了...
查了一会儿还想了下原理,想知道原理上为啥gbdt不能设置class_weight... 然后发现虽然sklearn没有这个选项,但是LightGBM是有的啊...所以应该是可以实现,但是Sklearn只是没有实现而已。。。
scale_pos_weight
, default=1.0
, type=double
https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.md
然后,class_weight='balanced'的实现,根据我跟sklearn svc的源码,原理应该就是改变sample weight
有结论之后可以不看了,然后下面比较无聊,是跟代码过程,记录是因为学到几个函数的用法
LabelEncoder 等
计算class weight的代码:
def compute_class_weight(class_weight, classes, y):
"""Estimate class weights for unbalanced datasets.
Parameters
----------
class_weight : dict, 'balanced' or None
If 'balanced', class weights will be given by
``n_samples / (n_classes * np.bincount(y))``.
If a dictionary is given, keys are classes and values
are corresponding class weights.
If None is given, the class weights will be uniform.
classes : ndarray
Array of the classes occurring in the data, as given by
``np.unique(y_org)`` with ``y_org`` the original class labels.
y : array-like, shape (n_samples,)
Array of original class labels per sample;
Returns
-------
class_weight_vect : ndarray, shape (n_classes,)
Array with class_weight_vect[i] the weight for i-th class
References
----------
The "balanced" heuristic is inspired by
Logistic Regression in Rare Events Data, King, Zen, 2001.
"""
# Import error caused by circular imports.
from ..preprocessing import LabelEncoder
if set(y) - set(classes):
raise ValueError("classes should include all valid labels that can "
"be in y")
if class_weight is None or len(class_weight) == 0:
# uniform class weights
weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
elif class_weight in ['auto', 'balanced']:
# Find the weight of each class as present in y.
le = LabelEncoder()
y_ind = le.fit_transform(y)
if not all(np.in1d(classes, le.classes_)):
raise ValueError("classes should have valid labels that are in y")
# inversely proportional to the number of samples in the class
if class_weight == 'auto':
recip_freq = 1. / bincount(y_ind)
weight = recip_freq[le.transform(classes)] / np.mean(recip_freq)
warnings.warn("The class_weight='auto' heuristic is deprecated in"
" 0.17 in favor of a new heuristic "
"class_weight='balanced'. 'auto' will be removed in"
" 0.19", DeprecationWarning)
else:
recip_freq = len(y) / (len(le.classes_) *
bincount(y_ind).astype(np.float64))
weight = recip_freq[le.transform(classes)]
else:
# user-defined dictionary
weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
if not isinstance(class_weight, dict):
raise ValueError("class_weight must be dict, 'balanced', or None,"
" got: %r" % class_weight)
for c in class_weight:
i = np.searchsorted(classes, c)
if i >= len(classes) or classes[i] != c:
raise ValueError("Class label %d not present." % c)
else:
weight[i] = class_weight[c]
return weight
def compute_sample_weight(class_weight, y, indices=None):
"""Estimate sample weights by class for unbalanced datasets.
Parameters
----------
class_weight : dict, list of dicts, "balanced", or None, optional
Weights associated with classes in the form ``{class_label: weight}``.
If not given, all classes are supposed to have weight one. For
multi-output problems, a list of dicts can be provided in the same
order as the columns of y.
The "balanced" mode uses the values of y to automatically adjust
weights inversely proportional to class frequencies in the input data:
``n_samples / (n_classes * np.bincount(y))``.
For multi-output, the weights of each column of y will be multiplied.
y : array-like, shape = [n_samples] or [n_samples, n_outputs]
Array of original class labels per sample.
indices : array-like, shape (n_subsample,), or None
Array of indices to be used in a subsample. Can be of length less than
n_samples in the case of a subsample, or equal to n_samples in the
case of a bootstrap subsample with repeated indices. If None, the
sample weight will be calculated over the full sample. Only "auto" is
supported for class_weight if this is provided.
Returns
-------
sample_weight_vect : ndarray, shape (n_samples,)
Array with sample weights as applied to the original y
"""
y = np.atleast_1d(y)
if y.ndim == 1:
y = np.reshape(y, (-1, 1))
n_outputs = y.shape[1]
if isinstance(class_weight, six.string_types):
if class_weight not in ['balanced', 'auto']:
raise ValueError('The only valid preset for class_weight is '
'"balanced". Given "%s".' % class_weight)
elif (indices is not None and
not isinstance(class_weight, six.string_types)):
raise ValueError('The only valid class_weight for subsampling is '
'"balanced". Given "%s".' % class_weight)
elif n_outputs > 1:
if (not hasattr(class_weight, "__iter__") or
isinstance(class_weight, dict)):
raise ValueError("For multi-output, class_weight should be a "
"list of dicts, or a valid string.")
if len(class_weight) != n_outputs:
raise ValueError("For multi-output, number of elements in "
"class_weight should match number of outputs.")
expanded_class_weight = []
for k in range(n_outputs):
y_full = y[:, k]
classes_full = np.unique(y_full)
classes_missing = None
if class_weight in ['balanced', 'auto'] or n_outputs == 1:
class_weight_k = class_weight
else:
class_weight_k = class_weight[k]
if indices is not None:
# Get class weights for the subsample, covering all classes in
# case some labels that were present in the original data are
# missing from the sample.
y_subsample = y[indices, k]
classes_subsample = np.unique(y_subsample)
weight_k = np.choose(np.searchsorted(classes_subsample,
classes_full),
compute_class_weight(class_weight_k,
classes_subsample,
y_subsample),
mode='clip')
classes_missing = set(classes_full) - set(classes_subsample)
else:
weight_k = compute_class_weight(class_weight_k,
classes_full,
y_full)
weight_k = weight_k[np.searchsorted(classes_full, y_full)]
if classes_missing:
# Make missing classes' weight zero
weight_k[in1d(y_full, list(classes_missing))] = 0.
expanded_class_weight.append(weight_k)
expanded_class_weight = np.prod(expanded_class_weight,
axis=0,
dtype=np.float64)
return expanded_class_weight