# python-autosklearn-多进程 (auto-sklearn multiprocessing example)

import multiprocessing
import shutil
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
from autosklearn.metrics import accuracy
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.constants import MULTICLASS_CLASSIFICATION

# Scratch directories shared by every auto-sklearn instance in this example.
tmp_folder = '/tmp/autosklearn_parallel_2_example_tmp'
output_folder = '/tmp/autosklearn_parallel_2_example_out'


# Start from a clean slate, but only in the parent process. Under the
# "spawn" and "forkserver" start methods every worker re-imports this module;
# an unguarded cleanup would then wipe the shared folders while sibling
# workers are still writing to them.
if multiprocessing.current_process().name == 'MainProcess':
    for dir_ in (tmp_folder, output_folder):
        try:
            # Recursively delete the folder and everything below it.
            shutil.rmtree(dir_)
        except OSError:
            # Best effort: the folder may simply not exist yet.
            pass


def _spawn_classifier(X_train, y_train, seed, dataset_name):
    """Run one auto-sklearn instance; intended to execute in a subprocess.

    auto-sklearn does not take care of spawning worker processes. This
    function is the body of one such worker: it builds a single
    AutoSklearnClassifier in shared mode and fits it on the training data.

    Args:
        X_train: Training features passed to ``automl.fit``.
        y_train: Training targets passed to ``automl.fit``.
        seed: Seed of this instance; every worker must use a different one.
        dataset_name: Name forwarded to ``automl.fit``.
    """
    # Use the initial configurations from meta-learning only in the worker
    # with seed 0. This prevents auto-sklearn from evaluating the same
    # configurations in every spawned process.
    if seed == 0:
        initial_configurations_via_metalearning = 25
        smac_scenario_args = {}
    else:
        initial_configurations_via_metalearning = 0
        smac_scenario_args = {'initial_incumbent': 'RANDOM'}

    # Arguments which are different to other runs of auto-sklearn:
    # 1. all classifiers write to the same output directory
    # 2. shared_mode is set to True, this enables sharing of data between
    # models.
    # 3. all instances of the AutoSklearnClassifier must have a different seed!
    automl = AutoSklearnClassifier(
        # Time budget for searching for suitable models.
        time_left_for_this_task=60,
        # Time budget for each individual machine learning model.
        per_run_time_limit=15,
        # Memory limit for each machine learning algorithm.
        ml_memory_limit=1024,
        # Run SMAC in shared-model mode.
        shared_mode=True,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        # Keep tmp_folder after termination; the ensemble step reads from it.
        delete_tmp_folder_after_terminate=False,
        # Number of models added to the ensemble by ensemble selection;
        # 0 here because the ensemble is built later by the caller.
        ensemble_size=0,
        # Number of configurations used to initialise the hyperparameter
        # optimisation algorithm.
        initial_configurations_via_metalearning=(
            initial_configurations_via_metalearning
        ),
        seed=seed,
        # Additional arguments inserted into the SMAC scenario.
        smac_scenario_args=smac_scenario_args,
    )
    automl.fit(X_train, y_train, dataset_name=dataset_name)


def get_spawn_classifier(X_train, y_train):
    """Return a worker callable bound to the given training data.

    The result is called as ``worker(seed, dataset_name)`` and is meant to be
    used as the ``target`` of a ``multiprocessing.Process``.

    A ``functools.partial`` over a module-level function is returned instead
    of a nested closure: closures cannot be pickled, so they fail as process
    targets under the "spawn" and "forkserver" start methods.

    Args:
        X_train: Training features handed to every worker.
        y_train: Training targets handed to every worker.

    Returns:
        A picklable callable accepting ``(seed, dataset_name)``.
    """
    import functools

    return functools.partial(_spawn_classifier, X_train, y_train)


def main():
    """Fit four auto-sklearn workers in parallel, then build an ensemble.

    Loads the breast cancer dataset, splits it into train and test parts,
    starts four worker processes that each run one auto-sklearn instance on
    the shared folders, waits for them to finish, builds an ensemble from
    their models, and prints the models and the test accuracy.
    """
    # One name for every step, so the ensemble is built under the same
    # dataset name the workers were fitted with.
    dataset_name = 'breast_cancer'

    X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = \
        sklearn.model_selection.train_test_split(X, y, random_state=1)

    processes = []
    spawn_classifier = get_spawn_classifier(X_train, y_train)
    # Start the worker processes; the loop index doubles as the unique seed.
    for i in range(4):
        p = multiprocessing.Process(
            target=spawn_classifier,
            args=(i, dataset_name),
        )
        p.start()
        processes.append(p)
    # Wait until every worker has finished before building the ensemble.
    for p in processes:
        p.join()

    print('Starting to build an ensemble!')
    automl = AutoSklearnClassifier(
        # Time budget for searching for suitable models.
        time_left_for_this_task=30,
        # Time budget for each individual machine learning model.
        per_run_time_limit=15,
        # Memory limit for each machine learning algorithm.
        ml_memory_limit=1024,
        # Run SMAC in shared-model mode.
        shared_mode=True,
        # Number of models added to the ensemble by ensemble selection.
        ensemble_size=50,
        # Only consider this many best models when building the ensemble.
        ensemble_nbest=200,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        # Number of configurations used to initialise the hyperparameter
        # optimisation algorithm.
        initial_configurations_via_metalearning=0,
        seed=1,
    )

    # NOTE(review): the breast cancer targets are binary, yet the task is
    # declared as MULTICLASS_CLASSIFICATION - confirm this is intended.
    automl.fit_ensemble(
        y_train,
        task=MULTICLASS_CLASSIFICATION,
        metric=accuracy,
        # Numeric precision of the data.
        precision='32',
        dataset_name=dataset_name,
        # Overall size of the ensemble.
        ensemble_size=20,
        # Number of best models considered for the ensemble.
        ensemble_nbest=50,
    )

    predictions = automl.predict(X_test)
    print(automl.show_models())
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))


# Run the example only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

# 你可能感兴趣的:(机器学习)