…continued from the previous part.
Below we use a CNN model to make financial predictions based on the results of the earlier Gramian Angular Field (GAF) Python practice, in which the time-series data was already converted into image data.
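The GAF conversion itself was covered in that earlier practice and is not repeated here. As a quick reminder, one price window becomes one date-named PNG roughly as follows (a minimal sketch, assuming the pyts library was used for the GAF step; the window data and file name are placeholders):
import numpy as np
import matplotlib.pyplot as plt
from pyts.image import GramianAngularField

window = np.random.rand(1, 20)                    # placeholder: one 20-step price window
gaf = GramianAngularField(image_size=20, method='summation')
image = gaf.fit_transform(window)[0]              # 20x20 GAF matrix with values in [-1, 1]
plt.imsave('2020_01_15.png', image)               # date-named PNG, the convention the code below expects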
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import Conv2D, BatchNormalization, Dropout, Flatten, Dense
from typing import Generator, List
import pandas as pd
import datetime as dt
import glob
import os
# Chunk a DataFrame so that consecutive chunks overlap (share most of their rows)
def chunker(seq: pd.DataFrame, size: int, loops: int) -> Generator:
    """
    :param seq: DataFrame to split
    :param size: number of rows per chunk
    :param loops: number of overlapping chunks to generate
    :return: generator of DataFrames with overlapping indices
    """
    rem = (seq.shape[0] - size)
    rem_split = rem // loops
    for i in range(loops):
        yield seq.iloc[(i * rem_split): -(rem - (i * rem_split))]
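To make the overlap concrete, here is a small hypothetical demonstration of chunker on toy data (not part of the original pipeline):
toy = pd.DataFrame({'x': range(10)})
for chunk in chunker(toy, size=8, loops=2):
    print(chunk.index.tolist())
# [0, 1, 2, 3, 4, 5, 6, 7]
# [1, 2, 3, 4, 5, 6, 7, 8]   <- consecutive chunks share all but one row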
def ensemble_data(networks_chunks: int, path: str) -> List[pd.DataFrame]:
    """
    :param networks_chunks: number of CNN networks (= number of chunks)
    :param path: root folder containing the LONG and SHORT image subfolders
    :return: list of DataFrames with overlapping indices
    """
    dataframes = []
    for sub_folder in ['LONG', 'SHORT']:
        images = glob.glob(path + '/{}/*.png'.format(sub_folder))  # Paths to the images
        # File names such as '2020_01_15.png' encode the date of each image
        dates = [img.split('/')[-1].split('\\')[-1].split('.')[0].replace('_', '-') for img in images]
        data_slice = pd.DataFrame({'Images': images, 'Labels': [sub_folder] * len(images), 'Dates': dates})
        data_slice['Dates'] = pd.to_datetime(data_slice['Dates'])
        dataframes.append(data_slice)
    data = pd.concat(dataframes)
    data.sort_values(by='Dates', inplace=True)
    del data['Dates']
    size = (data.shape[0] // 5) * 4  # each chunk holds 80% of the rows
    loops = networks_chunks
    return list(chunker(data, size, loops))
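For reference, ensemble_data assumes a directory layout along these lines (hypothetical file names; each PNG is named after its trading date):
TRAIN/
    LONG/
        2019_01_02.png
        ...
    SHORT/
        2019_01_03.png
        ...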
# Configuration for the ensemble of CNNs trained on GAF images labeled LONG and SHORT
PATH = "G:\\financial_data"
IMAGES_PATH = os.path.join(PATH, 'TRAIN')
REPO = os.path.join(PATH, 'Models')
PATH_DOC = os.path.join(PATH, 'Documents')
PATH_OUT = os.path.join(PATH, 'Output')
EPOCHS = 5
SPLIT = 0.30
LR = 0.001
TIMESTAMP = dt.datetime.now().strftime("%Y%m%d%H%M%S")
We use an ensemble: three CNN models are trained and their scores averaged. Each CNN has eight weight layers (seven convolutional plus one dense output layer for the binary prediction); the activation function is ReLU, a dropout rate of 0.4 regularizes the network, and batch normalization is included.
cnn_networks = 3
model = []
for j in range(cnn_networks):
    model.append(
        tf.keras.models.Sequential([
            # First convolution block
            Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(255, 255, 3)),
            BatchNormalization(),
            Conv2D(32, kernel_size=(3, 3), activation='relu'),
            BatchNormalization(),
            Conv2D(32, kernel_size=(3, 3), strides=2, padding='same', activation='relu'),
            BatchNormalization(),
            Dropout(0.4),
            # Second convolution block
            Conv2D(64, kernel_size=(3, 3), activation='relu'),
            BatchNormalization(),
            Conv2D(64, kernel_size=(3, 3), activation='relu'),
            BatchNormalization(),
            Conv2D(64, kernel_size=(3, 3), strides=2, padding='same', activation='relu'),
            BatchNormalization(),
            Dropout(0.4),
            # Third convolution block
            Conv2D(128, kernel_size=4, activation='relu'),
            BatchNormalization(),
            Flatten(),
            Dropout(0.4),
            # Output layer: sigmoid for binary LONG/SHORT classification
            Dense(1, activation='sigmoid')]
        ))
    # Compile each model
    model[j].compile(optimizer=Adam(learning_rate=LR), loss='binary_crossentropy', metrics=['acc'])
Each model is compiled with the Adam optimizer and a learning rate of 0.001.
# All images will be rescaled by 1./255
train_validate_datagen = ImageDataGenerator(rescale=1/255, validation_split=SPLIT) # set validation split
test_datagen = ImageDataGenerator(rescale=1/255)
data_chunks = ensemble_data(cnn_networks, IMAGES_PATH)
for j in range(cnn_networks):
    print('Net : {}'.format(j + 1))
    # Hold out the last 60 images of each chunk as the test set
    df_train = data_chunks[j].iloc[:-60]
    df_test = data_chunks[j].iloc[-60:]
    train_generator = train_validate_datagen.flow_from_dataframe(
        dataframe=df_train,
        directory=IMAGES_PATH,
        target_size=(255, 255),
        x_col='Images',
        y_col='Labels',
        batch_size=32,
        class_mode='binary',
        subset='training')
    validation_generator = train_validate_datagen.flow_from_dataframe(
        dataframe=df_train,
        directory=IMAGES_PATH,
        target_size=(255, 255),
        x_col='Images',
        y_col='Labels',
        batch_size=32,
        class_mode='binary',
        subset='validation')
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=df_test,
        x_col='Images',
        y_col='Labels',
        directory=IMAGES_PATH,
        target_size=(255, 255),
        class_mode='binary')
Net : 1
Found 3495 validated image filenames belonging to 2 classes.
Found 1497 validated image filenames belonging to 2 classes.
Found 60 validated image filenames belonging to 2 classes.
Net : 2
Found 3495 validated image filenames belonging to 2 classes.
Found 1497 validated image filenames belonging to 2 classes.
Found 60 validated image filenames belonging to 2 classes.
Net : 3
Found 3495 validated image filenames belonging to 2 classes.
Found 1497 validated image filenames belonging to 2 classes.
Found 60 validated image filenames belonging to 2 classes.
The ensemble_data function defined earlier splits the data into as many overlapping chunks as there are CNN networks, i.e. three. Keras's ImageDataGenerator then rescales the pixel values, and flow_from_dataframe resizes each image to 255×255 and yields the training, validation, and test batches.
    steps_per_epoch = train_generator.n // train_generator.batch_size
    validation_steps = validation_generator.n // validation_generator.batch_size
    # Halve the learning rate whenever validation accuracy plateaus for 3 epochs
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', patience=3, verbose=0, factor=0.5, min_lr=0.00001)
    history = model[j].fit(train_generator,
                           epochs=EPOCHS,
                           steps_per_epoch=steps_per_epoch,
                           validation_data=validation_generator,
                           callbacks=[learning_rate_reduction],
                           verbose=0)
    print('CNN Model {0:d}: '
          'Epochs={1:d}, '
          'Training Accuracy={2:.5f}, '
          'Validation Accuracy={3:.5f}'.format(j + 1,
                                               EPOCHS,
                                               max(history.history['acc']),
                                               max(history.history['val_acc'])))
    scores = model[j].evaluate(test_generator, steps=5)
    print("{0}: {1:.2f}%".format(model[j].metrics_names[1], scores[1] * 100))
    # Capture the model summary as text for the log file
    string_list = []
    model[j].summary(print_fn=lambda x: string_list.append(x))
    summary = "\n".join(string_list)
    logging = ['{0}: {1}'.format(key, val[-1]) for key, val in history.history.items()]
    log = 'Results:\n' + '\n'.join(logging)
    model[j].save(os.path.join(REPO, 'computer_vision_model_{0}_{1}_of_{2}.h5'.format(TIMESTAMP, j + 1, cnn_networks)))
    # Write hyper-parameters, architecture summary, and training log to a text file
    with open(os.path.join(REPO, 'computer_vision_summary_{0}_{1}_of_{2}.txt'.format(TIMESTAMP, j + 1, cnn_networks)), 'w') as f:
        f.write("EPOCHS: {0}\nSteps per epoch: {1}\nValidation steps: {2}\nVal Split: {3}\nLearning RT: {4}\n\n\n{5}"
                "\n\n=========TRAINING LOG========\n{6}".format(EPOCHS, steps_per_epoch, validation_steps, SPLIT,
                                                                LR, summary, log))
CNN Model 3: Epochs=5, Training Accuracy=0.55270, Validation Accuracy=0.53173
During model fitting, ReduceLROnPlateau is registered as a callback: when validation accuracy fails to improve for several epochs, it reduces the learning rate to squeeze out further performance.
In the end, the model achieves an accuracy of just over 53%, which is not a great result by itself. In stock trading, however, consistently sustaining an accuracy above 50% over the long run is already very encouraging. One could also add layers to the convolutional network and fold in fundamental analysis, risk factors, scenario analysis, ESG scores, and the like, so that the model can cope with more complex situations.
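Finally, note that the "average score" of the ensemble mentioned at the start of this section is not computed in the listing above. A minimal sketch of that last step might look as follows (an assumption, not the original code; the test generator would need shuffle=False so that the three prediction arrays align row by row):
import numpy as np
avg_probs = np.mean([m.predict(test_generator) for m in model], axis=0)
ensemble_pred = (avg_probs > 0.5).astype(int)     # map back to labels via test_generator.class_indices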