A boosting method that uses decision trees as its base functions is called a boosting tree. A boosting tree is fit with the forward stagewise algorithm: at each round a new regression tree is learned from the residuals of the current model, and the trees are combined in an additive model to produce the final prediction. Regression problems use the squared-error loss; classification problems use the exponential loss.
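Written out (a standard formulation, added here for clarity), with $T(x;\Theta_m)$ denoting the $m$-th regression tree, the boosting tree model and its forward stagewise update are

$$f_M(x)=\sum_{m=1}^{M}T(x;\Theta_m),\qquad f_m(x)=f_{m-1}(x)+T(x;\Theta_m),$$

and under squared-error loss the $m$-th tree is simply fit to the current residuals $r_{mi}=y_i-f_{m-1}(x_i)$.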
The exponential loss is minimized to learn, step by step, a linear combination of binary base functions $\{\varphi_j\}_{j=1}^b$ whose outputs are $1$ and $-1$. It is very sensitive to outliers and noise, and is the loss used by the AdaBoost algorithm.
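Concretely (standard AdaBoost notation, not from the original post), the learned model and the loss being minimized are

$$f(x)=\sum_{j=1}^{b}\alpha_j\,\varphi_j(x),\qquad L(y,f(x))=\exp\bigl(-y\,f(x)\bigr),\quad y\in\{-1,+1\}.$$

Because the loss grows exponentially in the negative margin $-y\,f(x)$, badly misclassified outliers receive exponentially large weight, which is why the exponential loss is sensitive to noise.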
The HIGGS dataset contains 11 million examples with 28 features each. It is a binary classification problem: distinguishing a signal process that produces Higgs bosons from a background process that does not.
Downloading the data
Download and decompress the data, read it with pandas read_csv() and convert it to an array with .as_matrix() (the pandas API of that era; newer pandas uses .values), then save it as a compressed numpy file for later use.
# tf.__version__ == 1.9.0
import gzip
import os
import tempfile
import urllib.request

import numpy as np
import pandas as pd
import tensorflow as tf

URL_ROOT = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280"
INPUT_FILE = "HIGGS.csv.gz"
NPZ_FILE = "HIGGS.csv.gz.npz"  # numpy compressed file to contain "data" array.


def _download_higgs_data_and_save_npz(data_dir):
  """Download higgs data and store as a numpy compressed file."""
  input_url = os.path.join(URL_ROOT, INPUT_FILE)
  np_filename = os.path.join(data_dir, NPZ_FILE)
  if tf.gfile.Exists(np_filename):
    raise ValueError("data_dir already has the processed data file: {}".format(
        np_filename))
  if not tf.gfile.Exists(data_dir):
    tf.gfile.MkDir(data_dir)
  # 2.8 GB to download.
  try:
    tf.logging.info("Data downloading...")
    temp_filename, _ = urllib.request.urlretrieve(input_url)
    # Reading and parsing 11 million csv lines takes 2~3 minutes.
    tf.logging.info("Data processing... taking multiple minutes...")
    with gzip.open(temp_filename, "rb") as csv_file:
      data = pd.read_csv(
          csv_file,
          dtype=np.float32,
          names=["c%02d" % i for i in range(29)]  # label + 28 features.
      ).as_matrix()
  finally:
    tf.gfile.Remove(temp_filename)
  # Writing to temporary location then copy to the data_dir (0.8 GB).
  f = tempfile.NamedTemporaryFile()
  np.savez_compressed(f, data=data)
  tf.gfile.Copy(f.name, np_filename)
  tf.logging.info("Data saved to: {}".format(np_filename))
Reading the data:
NPZ_FILE = "HIGGS.csv.gz.npz" # numpy compressed file containing "data" array
def read_higgs_data(data_dir, train_start, train_count, eval_start, eval_count):
"""Reads higgs data from csv and returns train and eval data.
Args:
data_dir: A string, the directory of higgs dataset.
train_start: An integer, the start index of train examples within the data.
train_count: An integer, the number of train examples within the data.
eval_start: An integer, the start index of eval examples within the data.
eval_count: An integer, the number of eval examples within the data.
Returns:
Numpy array of train data and eval data.
"""
npz_filename = os.path.join(data_dir, NPZ_FILE)
print("read data from %s..." % (npz_filename))
try:
# gfile allows numpy to read data from network data sources as well.
with tf.gfile.Open(npz_filename, "rb") as npz_file:
with np.load(npz_file) as npz:
data = npz["data"]
except tf.errors.NotFoundError as e:
raise RuntimeError(
"Error loading data; use data_download.py to prepare the data.\n{}: {}"
.format(type(e).__name__, e))
return (data[train_start:train_start+train_count],
data[eval_start:eval_start+eval_count])
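For example, reproducing the default split (first 1 million examples for training, last 1 million of the 11 million for evaluation; the path and indices are illustrative):

train_data, eval_data = read_higgs_data(
    "/tmp/higgs_data", train_start=0, train_count=1000000,
    eval_start=10000000, eval_count=1000000)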
Data preprocessing
Build the training input_fn from data stored as numpy arrays:
def make_inputs_from_np_arrays(features_np, label_np):
  """Makes and returns input_fn and feature_columns from numpy arrays.

  The generated input_fn will return tf.data.Dataset of feature dictionary and
  a label, and feature_columns will consist of the list of
  tf.feature_column.BucketizedColumn.

  Note, for in-memory training, tf.data.Dataset should contain the whole data
  as a single tensor. Don't use batch.

  Args:
    features_np: A numpy ndarray (shape=[batch_size, num_features]) for
        float32 features.
    label_np: A numpy ndarray (shape=[batch_size, 1]) for labels.

  Returns:
    input_fn: A function returning a Dataset of feature dict and label.
    feature_names: A list of feature names.
    feature_column: A list of tf.feature_column.BucketizedColumn.
  """
  # Number of features.
  num_features = features_np.shape[1]
  # Split the raw features column-wise into num_features arrays; each list
  # element holds one feature column.
  features_np_list = np.split(features_np, num_features, axis=1)
  # 1-based feature names.
  feature_names = ["feature_%02d" % (i + 1) for i in range(num_features)]

  # Create source feature_columns and bucketized_columns.
  def get_bucket_boundaries(feature):
    """Returns bucket boundaries for feature by percentiles."""
    return np.unique(np.percentile(feature, range(0, 100))).tolist()

  # Continuous features as numeric columns of the given dtype, with a
  # default value for missing entries.
  source_columns = [
      tf.feature_column.numeric_column(
          feature_name, dtype=tf.float32,
          # Although higgs data have no missing values, in general, default
          # could be set as 0 or some reasonable value for missing values.
          default_value=0.0)
      for feature_name in feature_names
  ]
  # Bucketize (discretize) the continuous features; the output is one-hot
  # encoded over the buckets.
  bucketized_columns = [
      tf.feature_column.bucketized_column(
          source_columns[i],
          boundaries=get_bucket_boundaries(features_np_list[i]))
      for i in range(num_features)
  ]

  # Make an input_fn that extracts source features.
  def input_fn():
    """Returns a Dataset zipping a {feature_name: numpy array} feature dict
    with label_np."""
    features = {
        feature_name: tf.constant(features_np_list[i])
        for i, feature_name in enumerate(feature_names)
    }
    return tf.data.Dataset.zip((tf.data.Dataset.from_tensors(features),
                                tf.data.Dataset.from_tensors(label_np),))

  return input_fn, feature_names, bucketized_columns
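For intuition, here is what the percentile bucketing does on a toy column (a standalone sketch, not part of the original script):

import numpy as np

rng = np.random.RandomState(0)
feature = rng.normal(size=(1000, 1)).astype(np.float32)
# Up to 100 unique percentile values -> roughly equal-frequency buckets.
boundaries = np.unique(np.percentile(feature, range(0, 100))).tolist()
print(len(boundaries))  # ~100 boundaries for a continuous column.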
Similarly, build the eval-set input_fn. Unlike the training input, evaluation streams the data in batches of 1000 rather than as one single tensor:
def make_eval_inputs_from_np_arrays(features_np, label_np):
  """Makes eval input as streaming batches."""
  num_features = features_np.shape[1]
  features_np_list = np.split(features_np, num_features, axis=1)
  # 1-based feature names.
  feature_names = ["feature_%02d" % (i + 1) for i in range(num_features)]

  def input_fn():
    features = {
        feature_name: tf.constant(features_np_list[i])
        for i, feature_name in enumerate(feature_names)
    }
    return tf.data.Dataset.zip((
        tf.data.Dataset.from_tensor_slices(features),
        tf.data.Dataset.from_tensor_slices(label_np),)).batch(1000)

  return input_fn
Training:
By default, the first 1 million of the 11 million examples are used for training, and the last 1 million are used for evaluation. The train/eval data can be selected as index ranges with the flags --train_start, --train_count, --eval_start, and --eval_count.
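The real script defines these flags with the absl flags machinery; as a minimal stand-in, an argparse sketch that produces a compatible flags_obj (names and defaults follow the text above; the learning_rate default is an assumption):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--data_dir", default="/tmp/higgs_data")
parser.add_argument("--model_dir", default="/tmp/higgs_model")
parser.add_argument("--export_dir", default=None)
parser.add_argument("--train_start", type=int, default=0)
parser.add_argument("--train_count", type=int, default=1000000)
parser.add_argument("--eval_start", type=int, default=10000000)
parser.add_argument("--eval_count", type=int, default=1000000)
parser.add_argument("--n_trees", type=int, default=100)
parser.add_argument("--max_depth", type=int, default=6)
parser.add_argument("--learning_rate", type=float, default=0.1)  # assumption
flags_obj = parser.parse_args()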
def train_boosted_trees(flags_obj):
  """Train boosted_trees estimator on HIGGS data.

  Args:
    flags_obj: An object containing parsed flag values.
  """
  # Clean up the model directory if present.
  if tf.gfile.Exists(flags_obj.model_dir):
    tf.gfile.DeleteRecursively(flags_obj.model_dir)

  # Load the data and split it into train and eval sets by index ranges.
  tf.logging.info("## Data loading...")
  train_data, eval_data = read_higgs_data(
      flags_obj.data_dir, flags_obj.train_start, flags_obj.train_count,
      flags_obj.eval_start, flags_obj.eval_count)
  tf.logging.info("## Data loaded; train: {}{}, eval: {}{}".format(
      train_data.dtype, train_data.shape, eval_data.dtype, eval_data.shape))

  # Build the training input_fn (a function returning a Dataset of a feature
  # dict and a label), the bucketized feature_columns, and feature_names.
  # Data consists of one label column followed by 28 feature columns.
  train_input_fn, feature_names, feature_columns = make_inputs_from_np_arrays(
      features_np=train_data[:, 1:], label_np=train_data[:, 0:1])
  # Build the eval-set input_fn.
  eval_input_fn = make_eval_inputs_from_np_arrays(
      features_np=eval_data[:, 1:], label_np=eval_data[:, 0:1])
  tf.logging.info("## Features prepared. Training starts...")

  # Create benchmark logger to log info about the training and metric values.
  run_params = {
      "model_name": "boosted_trees",
      "dataset_name": "higgs",
      "train_start": flags_obj.train_start,
      "train_count": flags_obj.train_count,
      "eval_start": flags_obj.eval_start,
      "eval_count": flags_obj.eval_count,
      "n_trees": flags_obj.n_trees,
      "max_depth": flags_obj.max_depth,
  }
  tf.logging.info("run params:\n %s", run_params)

  # Though BoostedTreesClassifier is under tf.estimator, faster in-memory
  # training is yet provided as a contrib library.
  # train_input_fn yields the training data with labels; feature_columns are
  # the bucketized_columns; n_trees is the number of boosted trees and
  # max_depth is the maximum depth of each tree.
  classifier = tf.contrib.estimator.boosted_trees_classifier_train_in_memory(
      train_input_fn,
      feature_columns,
      model_dir=flags_obj.model_dir or None,
      n_trees=flags_obj.n_trees,
      max_depth=flags_obj.max_depth,
      learning_rate=flags_obj.learning_rate)

  # Evaluate the model on the eval set.
  eval_results = classifier.evaluate(eval_input_fn)
  # Benchmark the evaluation results.
  # benchmark_logger.log_evaluation_result(eval_results)
  tf.logging.info("Benchmark metric: %s", eval_results)
Model evaluation
With the default parameters, the final accuracy is around 74%, and the loss on the eval set is about 0.516.
Since n_trees=100 and max_depth=6 were specified, training stops automatically at global_step=600: in-memory training grows the ensemble one tree layer per step, so 100 trees × 6 layers = 600 steps.
Log output:
Benchmark metric: {'accuracy': 0.738377, 'accuracy_baseline': 0.529618, 'auc': 0.8194437, 'auc_precision_recall': 0.83378166, 'average_loss': 0.5168768, 'label/mean': 0.529618, 'loss': 0.51687634, 'precision': 0.74909055, 'prediction/mean': 0.52907485, 'recall': 0.76087105, 'global_step': 600}
Exporting the model (export_savedmodel writes a timestamped SavedModel subdirectory under export_dir):
if flags_obj.export_dir is not None:
  classifier.export_savedmodel(
      flags_obj.export_dir,
      _make_csv_serving_input_receiver_fn(
          column_names=feature_names,
          # columns are all floats.
          column_defaults=[[0.0]] * len(feature_names)))
Code from: https://github.com/tensorflow/models/tree/r1.9.0/official/boosted_trees
To run predictions with the exported model, use saved_model_cli.
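A rough sketch of inspecting and querying the export; the timestamped directory name, the signature name, and the input key depend on the actual export, so inspect first with show before running:

# List the signatures and their input/output tensors.
saved_model_cli show --dir /tmp/higgs_export/<timestamp> --all

# Feed one CSV line of the 28 features (input key taken from the show output).
saved_model_cli run --dir /tmp/higgs_export/<timestamp> \
    --tag_set serve --signature_def predict \
    --input_exprs 'inputs=["0.1,0.2,...(28 comma-separated floats)"]'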