Original tutorial: Overview - AutoKeras
AutoKeras supports the following tasks:
Image classification
Image regression
Text classification
Text regression
Structured data classification
Structured data regression
import os
import numpy as np
import pandas as pd  # used by the structured data and time series examples below
import tensorflow as tf
from sklearn.datasets import fetch_california_housing, load_files
import autokeras as ak
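Only the text, structured-data, and time-series tasks are walked through in these notes. The image tasks use the same API shape; a minimal sketch with ak.ImageClassifier on MNIST (added here for completeness, not part of the original tutorial):
# Quick image-classification sketch on MNIST.
(img_train, img_labels), _ = tf.keras.datasets.mnist.load_data()
img_clf = ak.ImageClassifier(overwrite=True, max_trials=1)  # a single trial as a demo
img_clf.fit(img_train, img_labels, epochs=1)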
Getting-started demo: IMDB sentiment classification
# Load the dataset
dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz",
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)
# set path to dataset
IMDB_DATADIR = os.path.join(os.path.dirname(dataset), "aclImdb")
classes = ["pos", "neg"]
train_data = load_files(
    os.path.join(IMDB_DATADIR, "train"), shuffle=True, categories=classes
)
test_data = load_files(
    os.path.join(IMDB_DATADIR, "test"), shuffle=False, categories=classes
)
x_train = np.array(train_data.data)
y_train = np.array(train_data.target)
x_test = np.array(test_data.data)
y_test = np.array(test_data.target)
print(x_train.shape) # (25000,)
print(y_train.shape)  # (25000,)
print(x_train[0][:50]) # this film was just brilliant casting
# Training
# Initialize the text classifier.
clf = ak.TextClassifier(
    overwrite=True, max_trials=1
)  # It only tries 1 model as a quick demo.
# Feed the text classifier with training data.
clf.fit(x_train, y_train, epochs=2)
# Prediction
# Predict with the best model.
predicted_y = clf.predict(x_test)
# Evaluation
# Evaluate the best model with testing data.
print(clf.evaluate(x_test, y_test))
# Adjust the validation split
clf.fit(
    x_train,
    y_train,
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
)
# Use a custom validation set
split = 5000
x_val = x_train[split:]
y_val = y_train[split:]
x_train = x_train[:split]
y_train = y_train[:split]
clf.fit(
    x_train,
    y_train,
    epochs=2,
    # Use your own validation set.
    validation_data=(x_val, y_val),
)
# Sentence encoding options (block_type: "sequence" or "ngram")
input_node = ak.TextInput()
output_node = ak.TextBlock(block_type="ngram")(input_node)
output_node = ak.ClassificationHead()(output_node)
clf = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=1
)
clf.fit(x_train, y_train, epochs=2)
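The block above pins the encoder to the n-gram (bag-of-words) path. Per the heading, block_type can instead be set to "sequence" so the search works with word order preserved; a variant added here for comparison, assuming the installed AutoKeras version accepts that value:
input_node = ak.TextInput()
output_node = ak.TextBlock(block_type="sequence")(input_node)  # sequence encoding
output_node = ak.ClassificationHead()(output_node)
seq_clf = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=1
)
seq_clf.fit(x_train, y_train, epochs=2)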
# Customization
input_node = ak.TextInput()
output_node = ak.TextToIntSequence()(input_node)
output_node = ak.Embedding()(output_node)
# Use separable Conv layers in Keras.
output_node = ak.ConvBlock(separable=True)(output_node)
output_node = ak.ClassificationHead()(output_node)
clf = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=1
)
clf.fit(x_train, y_train, epochs=2)
# Titanic example
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"
train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)
# Build the model
# Initialize the structured data classifier.
clf = ak.StructuredDataClassifier(
    overwrite=True, max_trials=3
)  # It tries 3 different models.
# Training
# Feed the structured data classifier with training data.
clf.fit(
    # The path to the train.csv file.
    train_file_path,
    # The name of the label column.
    "survived",
    epochs=10,
)
# Prediction
# Predict with the best model.
predicted_y = clf.predict(test_file_path)
# Evaluation
# Evaluate the best model with testing data.
print(clf.evaluate(test_file_path, "survived"))
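The remaining Titanic snippets operate on in-memory arrays rather than on the CSV paths. These notes skip the loading step; a minimal sketch following the upstream tutorial's pandas pattern:
x_train = pd.read_csv(train_file_path)
y_train = x_train.pop("survived").to_numpy()
x_train = x_train.to_numpy()
x_test = pd.read_csv(test_file_path)
y_test = x_test.pop("survived").to_numpy()
x_test = x_test.to_numpy()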
# Convert numpy.ndarray to tf.data.Dataset
# np.unicode was removed from NumPy; cast with str instead. x_train / x_test
# are assumed to be numpy object arrays here (see the loading sketch above).
train_set = tf.data.Dataset.from_tensor_slices((x_train.astype(str), y_train))
test_set = tf.data.Dataset.from_tensor_slices((x_test.astype(str), y_test))
# Specify feature columns and types
# Initialize the structured data classifier.
clf = ak.StructuredDataClassifier(
    column_names=[
        "sex",
        "age",
        "n_siblings_spouses",
        "parch",
        "fare",
        "class",
        "deck",
        "embark_town",
        "alone",
    ],
    column_types={"sex": "categorical", "fare": "numerical"},
    max_trials=10,  # It tries 10 different models.
    overwrite=True,
)
# Validation split
clf.fit(
    x_train,
    y_train,
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
    epochs=10,
)
# Custom validation set
split = 500
x_val = x_train[split:]
y_val = y_train[split:]
x_train = x_train[:split]
y_train = y_train[:split]
clf.fit(
    x_train,
    y_train,
    # Use your own validation set.
    validation_data=(x_val, y_val),
    epochs=10,
)
# Customization
input_node = ak.StructuredDataInput()
output_node = ak.CategoricalToNumerical()(input_node)
output_node = ak.DenseBlock()(output_node)
output_node = ak.ClassificationHead()(output_node)
clf = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=1
)
clf.fit(x_train, y_train, epochs=1)
clf.predict(x_train)
# Export and inspect the model
model = clf.export_model()
model.summary()
print(x_train.dtype)
# A numpy array of dtype object (mixed types) is not supported;
# convert it to unicode strings first (np.unicode is gone from NumPy, use str).
model.predict(x_train.astype(str))
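The exported model is a plain Keras model, so it can be saved and reloaded; passing ak.CUSTOM_OBJECTS lets Keras deserialize AutoKeras's preprocessing layers (pattern from the AutoKeras export docs):
model.save("model_autokeras", save_format="tf")
loaded_model = tf.keras.models.load_model(
    "model_autokeras", custom_objects=ak.CUSTOM_OBJECTS
)
print(loaded_model.predict(x_train.astype(str)))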
# California housing data
# https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset
# Load the dataset
house_dataset = fetch_california_housing()
df = pd.DataFrame(
    np.concatenate(
        (house_dataset.data, house_dataset.target.reshape(-1, 1)), axis=1
    ),
    columns=house_dataset.feature_names + ["Price"],
)
train_size = int(df.shape[0] * 0.9)
df[:train_size].to_csv("train.csv", index=False)
df[train_size:].to_csv("eval.csv", index=False)
train_file_path = "train.csv"
test_file_path = "eval.csv"
# Build the model
# Initialize the structured data regressor.
reg = ak.StructuredDataRegressor(
    overwrite=True, max_trials=3
)  # It tries 3 different models.
# Training
# Feed the structured data regressor with training data.
reg.fit(
    # The path to the train.csv file.
    train_file_path,
    # The name of the label column.
    "Price",
    epochs=10,
)
# Prediction
# Predict with the best model.
predicted_y = reg.predict(test_file_path)
# Evaluation
# Evaluate the best model with testing data.
print(reg.evaluate(test_file_path, "Price"))
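Beyond evaluate(), the predictions can be checked against the held-out prices directly; a small added sketch (mean_squared_error is scikit-learn's):
from sklearn.metrics import mean_squared_error
y_true = pd.read_csv(test_file_path)["Price"].to_numpy()
print(mean_squared_error(y_true, predicted_y.flatten()))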
# The remaining features (validation split, custom validation data, custom search space, etc.) mirror the structured data classification section above and are not repeated here.
# UCI air quality dataset
dataset = tf.keras.utils.get_file(
    fname="AirQualityUCI.csv",
    origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00360/"
    "AirQualityUCI.zip",
    extract=True,
)
# Preprocessing
dataset = pd.read_csv(dataset, sep=";")
# Drop the two trailing empty columns and any rows with missing values.
dataset = dataset[dataset.columns[:-2]]
dataset = dataset.dropna()
# The CSV uses commas as decimal separators; normalize them to dots.
dataset = dataset.replace(",", ".", regex=True)
val_split = int(len(dataset) * 0.7)
data_train = dataset[:val_split]
validation_data = dataset[val_split:]
feature_cols = [
    "CO(GT)",
    "PT08.S1(CO)",
    "NMHC(GT)",
    "C6H6(GT)",
    "PT08.S2(NMHC)",
    "NOx(GT)",
    "PT08.S3(NOx)",
    "NO2(GT)",
    "PT08.S4(NO2)",
    "PT08.S5(O3)",
    "T",
    "RH",
]
data_x = data_train[feature_cols].astype("float64")
data_x_val = validation_data[feature_cols].astype("float64")
# Data with train data and the unseen data from subsequent time steps.
data_x_test = dataset[feature_cols].astype("float64")
data_y = data_train["AH"].astype("float64")
data_y_val = validation_data["AH"].astype("float64")
print(data_x.shape) # (6549, 12)
print(data_y.shape) # (6549,)
# Forecasting parameters
predict_from = 1  # start forecasting 1 step after the end of the training data
predict_until = 10  # forecast up to 10 steps ahead
lookback = 3  # each prediction looks at a window of 3 past time steps
# Build the model
clf = ak.TimeseriesForecaster(
    lookback=lookback,
    predict_from=predict_from,
    predict_until=predict_until,
    max_trials=1,
    objective="val_loss",
)
# Training
# Train the TimeseriesForecaster with the training data.
clf.fit(
    x=data_x,
    y=data_y,
    validation_data=(data_x_val, data_y_val),
    batch_size=32,
    epochs=10,
)
# Prediction
# Predict with the best model (the input includes the original training data).
predictions = clf.predict(data_x_test)
print(predictions.shape)
# Evaluation
# Evaluate the best model with testing data.
print(clf.evaluate(data_x_val, data_y_val))
TRAINS automatically records comprehensive information about AutoKeras experiments: source control for the code, the execution environment, hyperparameters, and more.
from tensorflow import keras
# Use separate log directories so the two runs' event files do not mix.
tensorboard_callback_train = keras.callbacks.TensorBoard(log_dir="log/train")
tensorboard_callback_test = keras.callbacks.TensorBoard(log_dir="log/test")
clf.fit(x_train, y_train, epochs=2, callbacks=[tensorboard_callback_train])
clf.fit(x_test, y_test, epochs=2, callbacks=[tensorboard_callback_test])
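The recorded scalars can then be viewed locally (TRAINS is also designed to pick up TensorBoard output automatically once a task has been initialized, as set up below):
# In a shell: tensorboard --logdir log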
Setup references:
1. TRAINS - AutoKeras
2. First Steps | ClearML
from trains import Task
from tensorflow import keras
import os
import numpy as np
import tensorflow as tf
from sklearn.datasets import load_files
import autokeras as ak
task = Task.init(project_name="autokeras", task_name="autokeras titanic test")
# Titanic example
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"
train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)
# Build the model
# Initialize the structured data classifier.
clf = ak.StructuredDataClassifier(
    overwrite=True, max_trials=3
)  # It tries 3 different models.
tensorboard_callback_train = keras.callbacks.TensorBoard(log_dir='log')
# Training
# Feed the structured data classifier with training data.
clf.fit(
    # The path to the train.csv file.
    train_file_path,
    # The name of the label column.
    "survived",
    epochs=10,
    callbacks=[tensorboard_callback_train],
)
# Prediction
# Predict with the best model.
predicted_y = clf.predict(test_file_path)
# Evaluation
# Evaluate the best model with testing data.
print(clf.evaluate(test_file_path, "survived"))