According to the paper, the landmarking methods are too time-consuming and are therefore not counted among the metafeatures:
EXCLUDE_META_FEATURES_CLASSIFICATION
Out[5]:
{
'Landmark1NN',
'LandmarkDecisionNodeLearner',
'LandmarkDecisionTree',
'LandmarkLDA',
'LandmarkNaiveBayes',
'LandmarkRandomNodeLearner',
'PCA',
'PCAFractionOfComponentsFor95PercentVariance',
'PCAKurtosisFirstPC',
'PCASkewnessFirstPC'}
EXCLUDE_META_FEATURES_REGRESSION
Out[6]:
{
'ClassEntropy',
'ClassOccurences',
'ClassProbabilityMax',
'ClassProbabilityMean',
'ClassProbabilityMin',
'ClassProbabilitySTD',
'Landmark1NN',
'LandmarkDecisionNodeLearner',
'LandmarkDecisionTree',
'LandmarkLDA',
'LandmarkNaiveBayes',
'LandmarkRandomNodeLearner',
'NumberOfClasses',
'PCA',
'PCAFractionOfComponentsFor95PercentVariance',
'PCAKurtosisFirstPC',
'PCASkewnessFirstPC'}
Regression tasks have no discrete labels, so the class-related metafeatures (together with NumberOfClasses) are excluded as well; the snippet after this list double-checks the difference:
'ClassEntropy',
'ClassOccurences',
'ClassProbabilityMax',
'ClassProbabilityMean',
'ClassProbabilityMin',
'ClassProbabilitySTD',
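A minimal check of that claim, assuming the two constants can be imported from autosklearn.smbo as in the console dumps above (they could just as well be pasted in literally):

from autosklearn.smbo import (EXCLUDE_META_FEATURES_CLASSIFICATION,
                              EXCLUDE_META_FEATURES_REGRESSION)

# Metafeatures excluded for regression but not for classification:
# the six Class* metafeatures plus NumberOfClasses.
extra_for_regression = (EXCLUDE_META_FEATURES_REGRESSION
                        - EXCLUDE_META_FEATURES_CLASSIFICATION)
print(sorted(extra_for_regression))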
In calculate_all_metafeatures_encoded_labels, calculate.update(npy_metafeatures) restricts the computation to exactly the npy_metafeatures; in calculate_all_metafeatures_with_labels, those same npy_metafeatures are instead added to the set of metafeatures that will not be computed. In other words, only the npy_metafeatures require the feature matrix to be transformed by the DataPreprocessor (one-hot encoded) first.
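To make the contrast concrete, here is a sketch of the two wrappers as suggested by the fragments above; the import path and the copy.deepcopy detail are assumptions about the auto-sklearn version discussed here and may differ in other releases:

import copy

# Assumed import path for the module that defines the wrappers.
from autosklearn.metalearning.metafeatures.metafeatures import (
    calculate_all_metafeatures, npy_metafeatures)


def calculate_all_metafeatures_with_labels(X, y, categorical, dataset_name,
                                           calculate=None, dont_calculate=None):
    # Raw path: skip everything in npy_metafeatures, i.e. compute only the
    # general metafeatures that work on the untransformed data.
    dont_calculate = copy.deepcopy(dont_calculate) if dont_calculate else set()
    dont_calculate.update(npy_metafeatures)
    return calculate_all_metafeatures(X, y, categorical, dataset_name,
                                      calculate=calculate,
                                      dont_calculate=dont_calculate)


def calculate_all_metafeatures_encoded_labels(X, y, categorical, dataset_name,
                                              calculate=None, dont_calculate=None):
    # Encoded path: compute only npy_metafeatures, the ones that need the
    # DataPreprocessor transform (one-hot encoding) first.
    calculate = set()
    calculate.update(npy_metafeatures)
    return calculate_all_metafeatures(X, y, categorical, dataset_name,
                                      calculate=calculate,
                                      dont_calculate=dont_calculate)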
The logic auto-sklearn (ASKL) uses to compute dataset metafeatures actually lives in smbo.py. There are two functions: one is used when X, y do not need to be encoded and collects only the general metafeatures; the other works on the encoded data.
# metalearning helpers
def _calculate_metafeatures(data_feat_type, data_info_task, basename,
                            x_train, y_train, watcher, logger):
    # == Calculate metafeatures
    task_name = 'CalculateMetafeatures'
    watcher.start_task(task_name)
    categorical = [True if feat_type.lower() in ['categorical'] else False
                   for feat_type in data_feat_type]

    EXCLUDE_META_FEATURES = EXCLUDE_META_FEATURES_CLASSIFICATION \
        if data_info_task in CLASSIFICATION_TASKS else EXCLUDE_META_FEATURES_REGRESSION

    if data_info_task in [MULTICLASS_CLASSIFICATION, BINARY_CLASSIFICATION,
                          MULTILABEL_CLASSIFICATION, REGRESSION,
                          MULTIOUTPUT_REGRESSION]:
        logger.info('Start calculating metafeatures for %s', basename)
        result = calculate_all_metafeatures_with_labels(
            x_train, y_train, categorical=categorical,
            dataset_name=basename,
            dont_calculate=EXCLUDE_META_FEATURES, )
        for key in list(result.metafeature_values.keys()):
            if result.metafeature_values[key].type_ != 'METAFEATURE':
                del result.metafeature_values[key]
    else:
        result = None
        logger.info('Metafeatures not calculated')
    watcher.stop_task(task_name)
    logger.info(
        'Calculating Metafeatures (categorical attributes) took %5.2f',
        watcher.wall_elapsed(task_name))
    return result


def _calculate_metafeatures_encoded(basename, x_train, y_train, watcher,
                                    task, logger):
    EXCLUDE_META_FEATURES = EXCLUDE_META_FEATURES_CLASSIFICATION \
        if task in CLASSIFICATION_TASKS else EXCLUDE_META_FEATURES_REGRESSION

    task_name = 'CalculateMetafeaturesEncoded'
    watcher.start_task(task_name)
    result = calculate_all_metafeatures_encoded_labels(
        x_train, y_train, categorical=[False] * x_train.shape[1],
        dataset_name=basename, dont_calculate=EXCLUDE_META_FEATURES)
    for key in list(result.metafeature_values.keys()):
        if result.metafeature_values[key].type_ != 'METAFEATURE':
            del result.metafeature_values[key]
    watcher.stop_task(task_name)
    logger.info(
        'Calculating Metafeatures (encoded attributes) took %5.2fsec',
        watcher.wall_elapsed(task_name))
    return result
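Finally, a hedged usage sketch of the two helpers with a toy dataset. The imports follow the version quoted above and may differ in other releases; the _Watcher stub is purely illustrative and only implements the three methods the helpers actually call (auto-sklearn passes its own stopwatch object here):

import logging
import time

import numpy as np

from autosklearn.constants import BINARY_CLASSIFICATION
from autosklearn.smbo import (_calculate_metafeatures,
                              _calculate_metafeatures_encoded)


class _Watcher:
    """Minimal stand-in for auto-sklearn's stopwatch object (illustrative)."""
    def __init__(self):
        self._start, self._elapsed = {}, {}

    def start_task(self, name):
        self._start[name] = time.time()

    def stop_task(self, name):
        self._elapsed[name] = time.time() - self._start[name]

    def wall_elapsed(self, name):
        return self._elapsed.get(name, 0.0)


rng = np.random.RandomState(0)
X = rng.rand(100, 4)
y = rng.randint(0, 2, size=100)
watcher, logger = _Watcher(), logging.getLogger('metafeature-demo')

# Raw path: general metafeatures only (npy_metafeatures are skipped).
general = _calculate_metafeatures(
    data_feat_type=['numerical'] * X.shape[1],
    data_info_task=BINARY_CLASSIFICATION,
    basename='demo', x_train=X, y_train=y,
    watcher=watcher, logger=logger)

# Encoded path: only the metafeatures that need the encoded feature matrix.
encoded = _calculate_metafeatures_encoded(
    basename='demo', x_train=X, y_train=y,
    watcher=watcher, task=BINARY_CLASSIFICATION, logger=logger)

print(sorted(general.metafeature_values))
print(sorted(encoded.metafeature_values))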