For background, see the blog post "AutoGluon Tabular 表数据全流程自动机器学习 AutoML"; that author gives a decent summary but never goes down to the code level. As a running example, we pass custom hyperparameter search spaces like this:
hyperparams = {'NN': {'num_epochs': 10, 'activation': 'relu', 'dropout_prob': ag.Real(0.0, 0.5)},
               'GBM': {'num_boost_round': 1000, 'learning_rate': ag.Real(0.01, 0.1, log=True)}}
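These spaces go straight into fit. A minimal sketch of the call, assuming the 0.0.x-era TabularPrediction API this walkthrough traces (the CSV path and label name are hypothetical):

import autogluon as ag
from autogluon import TabularPrediction as task

train_data = task.Dataset(file_path='train.csv')  # hypothetical dataset
predictor = task.fit(train_data=train_data, label='class',
                     hyperparameters=hyperparams,
                     hyperparameter_tune=True, time_limits=120)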
Step into autogluon.task.tabular_prediction.tabular_prediction.TabularPrediction#fit; the first thing that greets us is:
learner = Learner(path_context=output_directory, label=label, problem_type=problem_type,
                  objective_func=eval_metric, stopping_metric=stopping_metric,
                  id_columns=id_columns, feature_generator=feature_generator, trainer_type=trainer_type,
                  label_count_threshold=label_count_threshold, random_seed=random_seed)
learner.fit(X=train_data, X_test=tuning_data, scheduler_options=scheduler_options,
            hyperparameter_tune=hyperparameter_tune, feature_prune=feature_prune,
            holdout_frac=holdout_frac, num_bagging_folds=num_bagging_folds,
            num_bagging_sets=num_bagging_sets, stack_ensemble_levels=stack_ensemble_levels,
            hyperparameters=hyperparameters, ag_args_fit=ag_args_fit,
            excluded_model_types=excluded_model_types, time_limit=time_limits_orig,
            save_data=cache_data, save_bagged_folds=save_bagged_folds, verbosity=verbosity)
Learner here is autogluon.utils.tabular.ml.learner.default_learner.DefaultLearner (see its __init__); the docstring reads: "Learner encompasses full problem, loading initial data, feature generation, model training, model prediction".
Step into autogluon.utils.tabular.ml.learner.default_learner.DefaultLearner#fit:

X, y, X_test, y_test, holdout_frac, num_bagging_folds = \
    self.general_data_processing(X, X_test, holdout_frac, num_bagging_folds)
Step into general_data_processing in the same file. The very first line is worth flagging — won't a deep copy blow up memory on large datasets?

X = copy.deepcopy(X)
Rows with missing labels are collected:

missinglabel_inds = [index for index, x in X[self.label].isna().iteritems() if x]

and the handling strategy is simply to drop them:

X = X.drop(missinglabel_inds, axis=0)
Worth bookmarking: get_problem_type in the same file. It infers among three problem types — MULTICLASS, BINARY, and REGRESSION — though in fact there are four: there is also a rarely-seen SOFTCLASS. See autogluon/utils/tabular/ml/constants.py:5.
With the labels handled, feature processing begins. If X_test was provided, it is stacked together with X so feature engineering runs over both frames at once — a standard move, but one to watch for data leakage (a leakage-safe sketch follows the snippet):

X_super = pd.concat([X, X_test], ignore_index=True)
... processing ...
X = X_super.head(len(X)).set_index(X.index)
X_test = X_super.tail(len(X_test)).set_index(X_test.index)
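For contrast, the leakage-safe version of this pattern fits all statistics on the training frame only and merely applies them to the test frame; a minimal sklearn sketch (illustration only, numeric_cols is a hypothetical list of numeric columns, not AutoGluon code):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X[numeric_cols])            # statistics from train only
X[numeric_cols] = scaler.transform(X[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])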
Now for the data processing proper:

X = self.feature_generator.fit_transform(X, banned_features=self.submission_columns, drop_duplicates=False)

self.feature_generator comes from autogluon.utils.tabular.features.auto_ml_feature_generator.AutoMLFeatureGenerator; step into it.
Worth bookmarking: the get_feature_types function in this file, which detects date features and text features — a trick worth borrowing. A rough reconstruction of the idea is sketched below.
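The gist (my own reconstruction, not AG's exact code): try parsing object columns as datetimes, and treat mostly-unique, multi-word object columns as text:

import pandas as pd

def guess_feature_types(df, text_unique_ratio=0.9):
    """Rough reconstruction of date/text detection; not AG's actual code."""
    dates, texts = [], []
    for col in df.select_dtypes(include='object'):
        s = df[col].dropna().astype(str)
        parsed = pd.to_datetime(s, errors='coerce')
        if parsed.notna().mean() > 0.99:                  # almost every value parses as a date
            dates.append(col)
        elif (s.nunique() / max(len(s), 1) > text_unique_ratio
              and s.str.contains(' ').mean() > 0.5):      # mostly unique, multi-word strings
            texts.append(col)
    return {'datetime': dates, 'text': texts}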
In the minimize_categorical_memory_usage function, ordinal encoding is done in this curious way (object columns have already been converted to category dtype before this point):

for column in cat_columns:
    new_categories = list(range(len(X_features[column].cat.categories.values)))
    X_features[column].cat.rename_categories(new_categories, inplace=True)
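A self-contained demo of the trick: renaming a category dtype's categories to 0..n-1 relabels the values without touching the underlying integer codes, so memory stays at the compact category representation (my own illustration):

import pandas as pd

s = pd.Series(['cat', 'dog', 'cat', 'bird'], dtype='category')
print(list(s.cat.categories))  # ['bird', 'cat', 'dog']
print(list(s.cat.codes))       # [1, 2, 1, 0] -- compact int8 codes under the hood

# Relabel the categories as 0..n-1; the codes array is untouched.
s = s.cat.rename_categories(list(range(len(s.cat.categories))))
print(list(s))                 # [1, 2, 1, 0]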
Pop back up the stack to autogluon/utils/tabular/ml/learner/default_learner.py:66, where

self.trainer_type
<class 'autogluon.utils.tabular.ml.trainer.auto_trainer.AutoTrainer'>

Step into autogluon.utils.tabular.ml.trainer.auto_trainer.AutoTrainer#train.
First the hyperparameters are processed (nothing obviously interesting happens here):

self.hyperparameters = self._process_hyperparameters(hyperparameters=hyperparameters, ag_args_fit=ag_args_fit, excluded_model_types=excluded_model_types)

Then the models are fetched:

models = self.get_models(hyperparameters=self.hyperparameters, hyperparameter_tune=hyperparameter_tune, level=0)
get_models delegates to autogluon.utils.tabular.ml.trainer.model_presets.presets.get_preset_models, with level_key = 'default'. I suspect each model entry is just a kwargs-style dict — printing it:

model
{'num_epochs': 10, 'activation': 'relu', 'dropout_prob': Real: lower=0.0, upper=0.5, 'AG_args': {'model_type': 'NN'}}
Sure enough, at autogluon/utils/tabular/ml/trainer/model_presets/presets.py:129:

model_names_set.add(name)
model_params = copy.deepcopy(model)
model_params.pop(AG_ARGS)

and model_init is the actual model instance:

model_init = model_type(path=path, name=name, problem_type=problem_type, objective_func=objective_func, stopping_metric=stopping_metric, num_classes=num_classes, hyperparameters=model_params)
Step into autogluon.utils.tabular.ml.models.abstract.abstract_model.AbstractModel#__init__. Worth bookmarking: TabularNN lives at autogluon.utils.tabular.ml.models.tabular_nn.tabular_nn_model.TabularNeuralNetModel.
Step into autogluon.utils.tabular.ml.trainer.abstract_trainer.AbstractTrainer#stack_new_level. Yet another round of data processing? Its purpose isn't obvious at this point:

X_train_init = self.get_inputs_to_stacker(X, level_start=0, level_end=level, fit=True)
return self.train_multi(X_train=X_train_init, y_train=y, X_test=X_test, y_test=y_test, models=models, hyperparameter_tune=hyperparameter_tune, feature_prune=feature_prune, level=level, stack_name=stack_name, kfolds=kfolds, n_repeats=n_repeats, time_limit=time_limit)
Step into train_multi in the same file, which nests like matryoshka dolls: train_multi → train_multi_initial → train_multi_fold → train_single_full, which finally calls:
hpo_models, hpo_model_performances, hpo_results = model.hyperparameter_tune(X_train=X_train, X_test=X_test, Y_train=y_train, Y_test=y_test, scheduler_options=(self.scheduler_func, self.scheduler_options), verbosity=self.verbosity)

The model (autogluon.utils.tabular.ml.models.lgb.lgb_model.LGBModel) comes with its own hyperparameter_tune method. At this point:

self.scheduler_func
<class 'autogluon.scheduler.fifo.FIFOScheduler'>
self.scheduler_options
{'resource': {'num_cpus': 12, 'num_gpus': 0}, 'searcher': 'random', 'search_options': {}, 'checkpoint': None, 'resume': False, 'num_trials': 5, 'time_out': 27.0, 'reward_attr': 'validation_performance', 'time_attr': 'epoch', 'visualizer': 'none', 'dist_ip_addrs': []}
Let's look at LGBM first (it is also the highest-priority model — the paper keeps bragging about how great its TabularNN is, yet in the code it's not given top priority; actions speak louder than words).
Step into autogluon.utils.tabular.ml.models.lgb.lgb_model.LGBModel#hyperparameter_tune. This block sanity-checks the min_data_in_leaf search space against the training-set size:

if isinstance(params_copy['min_data_in_leaf'], Int):
    upper_minleaf = params_copy['min_data_in_leaf'].upper
    if upper_minleaf > X_train.shape[0]:  # TODO: this min_data_in_leaf adjustment based on sample size may not be necessary
        upper_minleaf = max(1, int(X_train.shape[0] / 5.0))
        lower_minleaf = params_copy['min_data_in_leaf'].lower
        if lower_minleaf > upper_minleaf:
            lower_minleaf = max(1, int(upper_minleaf / 3.0))
        params_copy['min_data_in_leaf'] = Int(lower=lower_minleaf, upper=upper_minleaf)
The hyperparameter search (HPO) and training flow is where it gets interesting: at the very end of hyperparameter_tune, a scheduler.run() is invoked. Step in (F7). run itself ends in a loop that keeps calling self.schedule_next(); step in again (F7).
The config here is proposed by random search (AG also implements other searchers, e.g. skopt):

config = self.searcher.get_config(**extra_kwargs)

and schedule_next ends with this pair of lines:

task = self._create_new_task(config, resources=resources)
self.add_job(task, **extra_kwargs)
task
Task (task_id: 0,
fn: <function lgb_trial at 0x7f4de2db31e0>,
args: {args: {'util_args': {'dataset_train_filename': 'dataset_train.bin', 'dataset_val_filename': 'dataset_val.b.., config: {'feature_fraction': 1.0, 'learning_rate': 0.0316227766, 'min_data_in_leaf': 20, 'num_leaves': 31}, },
resource: DistributedResource(
Node = Remote REMOTE_ID: 0,
<Remote: 'inproc://192.168.1.106/2563/1' processes=1 threads=12, memory=16.68 GB>
nCPUs = 12, CPU_IDs = {[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}))
type(task)
<class 'autogluon.core.task.Task'>
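Conceptually, the FIFOScheduler plus random searcher boils down to a loop like this (a deliberately simplified sketch of the idea, not AG's actual classes — in AG the trial function is lgb_trial, tasks run on distributed workers, and the reward attribute is 'validation_performance'):

import random

def random_searcher(space):
    """Sample one config from a dict of lists (categorical) or (low, high) ranges."""
    config = {}
    for name, spec in space.items():
        if isinstance(spec, list):
            config[name] = random.choice(spec)        # categorical choice
        else:
            low, high = spec
            config[name] = random.uniform(low, high)  # numeric range
    return config

def fifo_run(trial_fn, space, num_trials):
    """Run num_trials tasks one after another, tracking the best reward."""
    best_reward, best_config = float('-inf'), None
    for task_id in range(num_trials):
        config = random_searcher(space)  # the searcher proposes a config
        reward = trial_fn(config)        # the scheduled task runs the trial
        if reward > best_reward:
            best_reward, best_config = reward, config
    return best_reward, best_config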
Presumably this ends up invoking autogluon.utils.tabular.ml.models.lgb.hyperparameters.lgb_trial.lgb_trial. Go to autogluon/utils/tabular/ml/models/lgb/hyperparameters/lgb_trial.py:19 and set a breakpoint (F9):
args.keys()
dict_keys(['util_args', 'num_boost_round', 'num_threads', 'objective', 'verbose', 'boosting_type', 'two_round', 'learning_rate', 'feature_fraction', 'min_data_in_leaf', 'num_leaves', 'seed_value', 'task_id'])
At a glance, num_boost_round, learning_rate, feature_fraction, min_data_in_leaf and num_leaves are all standard LGBM hyperparameters, with a few bookkeeping extras such as task_id mixed in.
Step into autogluon.utils.tabular.ml.models.abstract.model_trial.prepare_inputs — a rather slick bit of juggling:

type(args["util_args"])
<class 'autogluon.utils.edict.EasyDict'>
args["util_args"].model
<autogluon.utils.tabular.ml.models.lgb.lgb_model.LGBModel object at 0x7f4db0840860>

Finally it calls autogluon.utils.tabular.ml.models.abstract.model_trial.fit_and_save_model.
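As the name suggests, this is the usual fit-then-persist trial pattern; a minimal sketch of what such a function typically does (my own reconstruction with a generic sklearn-style model, not the actual body):

import pickle
import time

def fit_and_save_trial(model, X_train, y_train, X_val, y_val, save_path):
    """Fit one trial's model, score it on the validation fold, persist it."""
    start = time.time()
    model.fit(X_train, y_train)
    score = model.score(X_val, y_val)  # reported back as the trial's reward
    with open(save_path, 'wb') as f:
        pickle.dump(model, f)          # saved so the best trial can be reloaded
    return score, time.time() - start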
My head is spinning; let's set aside LGBM and its convoluted flow and jump straight to TabularNN.
Set a breakpoint at autogluon/scheduler/fifo.py:235; once LGBM's 5 trials have all finished, Run To Cursor to autogluon/scheduler/fifo.py:300 and print:

task.fn
<function tabular_nn_trial at 0x7f616248f730>
Double-tap Shift (Search Everywhere) to look up tabular_nn_trial, land in autogluon.utils.tabular.ml.models.tabular_nn.tabular_nn_trial.tabular_nn_trial, and set a breakpoint inside the function. (Run To Cursor will not reach it: much like HpBandSter, AG runs worker and master in separate processes/threads.) Re-run the code until execution stops inside tabular_nn_trial:
train_dataset = TabularNNDataset.load(util_args.train_path)
train_dataset.feature_groups
{'vector': ['age', 'fnlwgt', 'education-num', 'hours-per-week', 'capital-gain', 'capital-loss', 'sex'], 'embed': ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country'], 'language': []}
Let's first look at TabularNN's data processing (I'm particularly curious how it handles data skew).
Set a breakpoint at autogluon/utils/tabular/ml/models/tabular_nn/tabular_nn_model.py:452, inside TabularNeuralNetModel#generate_datasets, and step in (F7). It turns out TabularNN does not implement its own preprocess function; it uses the parent class's.
Step into process_train_data in the same file — this function is where the real preprocessing happens. The first step is determining feature types; step into _get_types_of_features. There are five feature types in total:

types_of_features = {'continuous': [], 'skewed': [], 'onehot': [], 'embed': [], 'language': []}
# continuous = numeric features to rescale
# skewed = features to which we will apply power (i.e. log / box-cox) transform before normalization
# onehot = features to one-hot encode (unknown categories for these features encountered at test-time are encoded as all zeros). We one-hot encode any features encountered that only have two unique values.
for feature in self.features:
    feature_data = df[feature]  # pd.Series
    num_unique_vals = len(feature_data.unique())
    if num_unique_vals == 2:  # will be onehot encoded regardless of proc.embed_min_categories value
        types_of_features['onehot'].append(feature)
    elif feature in continuous_featnames:
        if np.abs(feature_data.skew()) > skew_threshold:
            types_of_features['skewed'].append(feature)
        else:
            types_of_features['continuous'].append(feature)
    elif feature in categorical_featnames:
        if num_unique_vals >= embed_min_categories:  # sufficiently many categories to warrant learned embedding dedicated to this feature
            types_of_features['embed'].append(feature)
        else:
            types_of_features['onehot'].append(feature)
    elif feature in language_featnames:
        types_of_features['language'].append(feature)
return types_of_features
In this run, skew_threshold = 0.99 and embed_min_categories = 4.
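To make the skew branch concrete, a quick illustration of pandas' .skew() against that 0.99 threshold (toy data of my own):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
symmetric = pd.Series(rng.normal(size=10_000))     # skew ~ 0 -> 'continuous'
lognormal = pd.Series(rng.lognormal(size=10_000))  # heavy right tail -> 'skewed'

print(abs(symmetric.skew()) > 0.99)  # False: gets plain standardization
print(abs(lognormal.skew()) > 0.99)  # True: gets the quantile transform below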
Once the feature types are identified, a ColumnTransformer is constructed. In this run, impute_strategy = 'median' and max_category_levels = 100. Here is the code:
def _create_preprocessor(self, impute_strategy, max_category_levels):
    """ Defines data encoders used to preprocess different data types and creates instance variable which is sklearn ColumnTransformer object """
    if self.processor is not None:
        Warning("Attempting to process training data for TabularNeuralNetModel, but previously already did this.")
    continuous_features = self.types_of_features['continuous']
    skewed_features = self.types_of_features['skewed']
    onehot_features = self.types_of_features['onehot']
    embed_features = self.types_of_features['embed']
    language_features = self.types_of_features['language']
    transformers = []  # order of various column transformers in this list is important!
    if len(continuous_features) > 0:
        continuous_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=impute_strategy)),
            ('scaler', StandardScaler())])
        transformers.append(('continuous', continuous_transformer, continuous_features))
    if len(skewed_features) > 0:
        power_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=impute_strategy)),
            ('quantile', QuantileTransformer(output_distribution='normal'))])  # Or output_distribution = 'uniform'
        # TODO: remove old code: ('power', PowerTransformer(method=self.params['proc.power_transform_method']))
        transformers.append(('skewed', power_transformer, skewed_features))
    if len(onehot_features) > 0:
        onehot_transformer = Pipeline(steps=[
            # TODO: Consider avoiding converting to string for improved memory efficiency
            ('to_str', FunctionTransformer(self.convert_df_dtype_to_str)),
            ('imputer', SimpleImputer(strategy='constant', fill_value=self.unique_category_str)),
            ('onehot', OneHotMergeRaresHandleUnknownEncoder(max_levels=max_category_levels, sparse=False))])  # test-time unknown values will be encoded as all zeros vector
        transformers.append(('onehot', onehot_transformer, onehot_features))
    if len(embed_features) > 0:  # Ordinal transformer applied to convert to-be-embedded categorical features to integer levels
        ordinal_transformer = Pipeline(steps=[
            ('to_str', FunctionTransformer(self.convert_df_dtype_to_str)),
            ('imputer', SimpleImputer(strategy='constant', fill_value=self.unique_category_str)),
            ('ordinal', OrdinalMergeRaresHandleUnknownEncoder(max_levels=max_category_levels))])  # returns 0-n when max_category_levels = n-1. category n is reserved for unknown test-time categories.
        transformers.append(('ordinal', ordinal_transformer, embed_features))
    if len(language_features) > 0:
        raise NotImplementedError("language_features cannot be used at the moment")
    return ColumnTransformer(transformers=transformers)  # numeric features are processed in the same order as in numeric_features vector, so feature-names remain the same.
QuantileTransformer is used rather than PowerTransformer, yet the variable is still named power_transformer — the leftover TODO above tells the story: the PowerTransformer step was ripped out but the name stayed behind.
Going through them one by one: continuous features get a median SimpleImputer plus z-score scaling; skewed features are reshaped into a normal distribution via QuantileTransformer(output_distribution='normal'); onehot features go through the in-house OneHotMergeRaresHandleUnknownEncoder; embed features go through the in-house OrdinalMergeRaresHandleUnknownEncoder. Both custom encoders take max_levels=max_category_levels (100). Let's dig into these two.
Frankly the implementation feels sloppy. The max_levels idea is strikingly similar to auto-sklearn 2.0's Category Coalescence / Minority Coalescer, except that ASKL thinks in ratios (minimum percentage of samples ∈ [0.0001, 0.5]) while AG caps the absolute number of levels, hard-coded to max_category_levels = 100. The core idea is sketched below.
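The shared idea of both encoders — keep the max_levels most frequent categories, collapse everything else into one rare bucket, and reserve that same bucket for unseen test-time values — can be sketched like this (my simplification, not the actual encoder classes):

from collections import Counter

def fit_rare_merging_codes(values, max_levels=100):
    """Map the max_levels most frequent categories to codes 0..k-1."""
    top = [cat for cat, _ in Counter(values).most_common(max_levels)]
    return {cat: code for code, cat in enumerate(top)}

def transform_with_codes(values, codes):
    unknown = len(codes)  # shared bucket for merged-rare and unseen categories
    return [codes.get(v, unknown) for v in values]

codes = fit_rare_merging_codes(['a', 'a', 'b', 'c'], max_levels=2)
print(transform_with_codes(['a', 'b', 'c', 'zzz'], codes))  # [0, 1, 2, 2]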
Next:

self.feature_arraycol_map = self._get_feature_arraycol_map(max_category_levels=max_category_levels)

This builds an "OrderedDict of feature-name -> list of column-indices in df corresponding to this feature":

{'age': [0], 'fnlwgt': [1], 'education-num': [2], 'hours-per-week': [3], 'capital-gain': [4], 'capital-loss': [5], 'sex': [6, 7], 'workclass': [8], 'education': [9], 'marital-status': [10], 'occupation': [11], 'relationship': [12], 'race': [13], 'native-country': [14]}

A whole dedicated function just to compute this one-to-many mapping seems needlessly roundabout.
That's it for TabularNN's data-processing code; on to the training code, via the call chain:
autogluon.utils.tabular.ml.models.abstract.model_trial.fit_and_save_model
autogluon.utils.tabular.ml.models.abstract.abstract_model.AbstractModel#fit
autogluon.utils.tabular.ml.models.tabular_nn.tabular_nn_model.TabularNeuralNetModel#_fit
Step into get_net:

self.model = EmbedNet(train_dataset=train_dataset, params=params, num_net_outputs=self.num_net_outputs, ctx=self.ctx)

where params is:
{'num_epochs': 10, 'epochs_wo_improve': 20, 'seed_value': None, 'proc.embed_min_categories': 4, 'proc.impute_strategy': 'median', 'proc.max_category_levels': 100, 'proc.skew_threshold': 0.99, 'network_type': 'widedeep', 'layers': None, 'numeric_embed_dim': None, 'activation': 'relu', 'max_layer_width': 2056, 'embedding_size_factor': 1.0, 'embed_exponent': 0.56, 'max_embedding_dim': 100, 'y_range': None, 'y_range_extend': 0.05, 'use_batchnorm': True, 'dropout_prob': 0.25, 'batch_size': 512, 'loss_function': None, 'optimizer': 'adam', 'learning_rate': 0.0003, 'weight_decay': 1e-06, 'clip_gradient': 100.0, 'momentum': 0.9, 'lr_scheduler': None, 'base_lr': 3e-05, 'target_lr': 1.0, 'lr_decay': 0.1, 'warmup_epochs': 10, 'use_ngram_features': False}
Step into EmbedNet's constructor. train_dataset.getNumCategoriesEmbeddings() counts the cardinality of each categorical feature, and getEmbedSizes computes each feature's post-embedding dimension from those counts. (Note to self: look into MLBox's EntityCoding for comparison.)
def getEmbedSizes(train_dataset, params, num_categs_per_feature):
    """ Returns list of embedding sizes for each categorical variable.
        Selects this adaptively based on training_dataset.
        Note: Assumes there is at least one embed feature.
    """
    max_embedding_dim = params['max_embedding_dim']
    embed_exponent = params['embed_exponent']
    size_factor = params['embedding_size_factor']
    embed_dims = [int(size_factor * max(2, min(max_embedding_dim,
                                               1.6 * num_categs_per_feature[i] ** embed_exponent)))
                  for i in range(len(num_categs_per_feature))]
    return embed_dims
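Plugging in this run's defaults (size_factor = 1.0, embed_exponent = 0.56, max_embedding_dim = 100) and the cardinalities that can be read off the Embedding input sizes in the network printout below reproduces exactly the embedding dimensions we will see there:

# Worked example with this run's params (matches the EmbedNet printout below):
num_categs = [7, 14, 6, 14, 7, 6, 6]  # per-feature cardinalities
dims = [int(1.0 * max(2, min(100, 1.6 * n ** 0.56))) for n in num_categs]
print(dims)  # [4, 7, 4, 7, 4, 4, 4]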
Then, in autogluon.utils.tabular.ml.models.tabular_nn.tabular_nn_model.TabularNeuralNetModel#set_net_defaults:

vector_dim = train_dataset.dataset._data[train_dataset.vectordata_index].shape[1]  # total dimensionality of vector features
prop_vector_features = train_dataset.num_vector_features() / float(train_dataset.num_features)  # Fraction of features that are numeric
min_numeric_embed_dim = 32
max_numeric_embed_dim = params['max_layer_width']
params['numeric_embed_dim'] = int(min(max_numeric_embed_dim, max(min_numeric_embed_dim,
                                      params['layers'][0] * prop_vector_features * np.log10(vector_dim + 10))))

Here params['layers'] is [256, 128].
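Sanity-checking the numeric_embed_dim arithmetic with this run's numbers (8 vector columns, since 'sex' is one-hot encoded into 2; 7 of the 14 features are in the 'vector' group):

import numpy as np

vector_dim = 8                   # see feature_arraycol_map above: columns 0..7
prop_vector_features = 7 / 14.0  # 7 vector features out of 14 total
raw = 256 * prop_vector_features * np.log10(vector_dim + 10)  # layers[0] = 256
print(int(min(2056, max(32, raw))))  # 160 -> the Dense(8 -> 160) block below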
At autogluon/utils/tabular/ml/models/tabular_nn/tabular_nn_model.py:328, printing self.model gives:
EmbedNet(
(numeric_block): NumericBlock(
(body): Dense(8 -> 160, Activation(relu))
)
(embed_blocks): HybridSequential(
(0): EmbedBlock(
(body): Embedding(7 -> 4, float32)
)
(1): EmbedBlock(
(body): Embedding(14 -> 7, float32)
)
(2): EmbedBlock(
(body): Embedding(6 -> 4, float32)
)
(3): EmbedBlock(
(body): Embedding(14 -> 7, float32)
)
(4): EmbedBlock(
(body): Embedding(7 -> 4, float32)
)
(5): EmbedBlock(
(body): Embedding(6 -> 4, float32)
)
(6): EmbedBlock(
(body): Embedding(6 -> 4, float32)
)
)
(output_block): WideAndDeepBlock(
(deep): FeedforwardBlock(
(body): HybridSequential(
(0): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=194)
(1): Dropout(p = 0.25, axes=())
(2): Dense(194 -> 256, Activation(relu))
(3): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=256)
(4): Dropout(p = 0.25, axes=())
(5): Dense(256 -> 128, Activation(relu))
(6): BatchNorm(axis=1, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=128)
(7): Dropout(p = 0.25, axes=())
(8): Dense(128 -> 2, linear)
)
)
(wide): Dense(194 -> 2, linear)
)
)
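The widths check out: the deep block's input is the 160-dim numeric-block output concatenated with the embedding outputs 4+7+4+7+4+4+4 = 34, giving the in_channels=194 seen in its first BatchNorm; the same 194-dim vector also feeds the wide Dense(194 -> 2) head.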