Code Review #3 - Modeling#
import warnings
warnings.filterwarnings('ignore')
import sys, os
from time import time
from pathlib import Path
from itertools import product
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dot, BatchNormalization
from tensorflow.keras.models import Model
from sklearn.preprocessing import quantile_transform
from scipy.stats import spearmanr
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from utils import MultipleTimeSeriesCV, format_time
idx = pd.IndexSlice
sns.set_style('whitegrid')
np.random.seed(42)
results_path = Path('KR2_results', 'asset_pricing')
if not results_path.exists():
    results_path.mkdir(parents=True)
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    print('Using GPU')
    tf.config.experimental.set_memory_growth(gpu_devices[0], True)
else:
    print('Using CPU')
characteristics = ['beta', 'betasq', 'chmom', 'krwvol', 'idiovol', 'ill', 'indmom',
                   'maxret', 'mom12m', 'mom1m', 'mom36m', 'mvel', 'retvol', 'turn', 'turn_std']
Load Data#
with pd.HDFStore(results_path / 'autoencoder.h5') as store:
    print(store.info())
data = (pd.read_hdf(results_path / 'autoencoder.h5', 'returns')
        .stack(dropna=False)
        .to_frame('returns')
        .loc[idx['2006':, :], :])
with pd.HDFStore(results_path / 'autoencoder.h5') as store:
    keys = [k[1:] for k in store.keys() if k[1:].startswith('factor')]
    for key in keys:
        data[key.split('/')[-1]] = store[key].squeeze()
characteristics = data.drop('returns', axis=1).columns.tolist()  # overwrite the hand-coded list with the factors actually loaded
data['returns_fwd'] = data.returns.unstack('ticker').shift(-1).stack()  # one-period-ahead return, used as the prediction target
Rank-normalize characteristics#
data.loc[:, characteristics] = (data.loc[:, characteristics]
                                .groupby(level='date')
                                .apply(lambda x: pd.DataFrame(quantile_transform(x,
                                                                                 copy=True,
                                                                                 n_quantiles=x.shape[0]),
                                                              columns=characteristics,
                                                              index=x.index.get_level_values('ticker')))
                                .mul(2).sub(1))
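Each date's cross-section is thus mapped to uniform quantiles and rescaled from [0, 1] to [-1, 1]. A toy example of what the transform does to one cross-section (illustrative only):
toy = pd.DataFrame({'beta': [0.5, 1.2, 0.9, 2.0],
                    'mom1m': [0.02, -0.01, 0.05, 0.00]})
ranked = pd.DataFrame(quantile_transform(toy, n_quantiles=len(toy)),
                      columns=toy.columns).mul(2).sub(1)
print(ranked)  # each column now spans [-1, 1] with the original ordering preserved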
Architecture#
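The network mirrors the conditional autoencoder for asset pricing of Gu, Kelly, and Xiu: a beta network maps each stock's lagged characteristics to factor loadings, a factor network forms characteristic-managed portfolios from the cross-section of current returns (the pfo_factor layer below) and compresses them into a small number of latent factors, and the output layer multiplies the two,

$$\hat{r}_{i,t+1} = \beta(z_{i,t})^\top f_t,$$

so the MSE loss fits this product to the one-period-ahead returns (returns_fwd).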
Key parameters#
n_factors = 3
n_characteristics = len(characteristics)
n_tickers = len(data.index.unique('ticker'))
Input Layer#
input_beta = Input((n_tickers, n_characteristics), name='input_beta')
input_factor = Input((n_tickers,), name='input_factor')
Stock Characteristics Network#
hidden_layer = Dense(units=8, activation='relu', name='hidden_layer')(input_beta)
batch_norm = BatchNormalization(name='batch_norm')(hidden_layer)
output_beta = Dense(units=n_factors, name='output_beta')(batch_norm)
Factor Network#
pfo_factor = Dense(units=n_characteristics, name='pfo_factor')(input_factor)
output_factor = Dense(units=n_factors, name='output_factor')(pfo_factor)
Output Layer#
output = Dot(axes=(2,1), name='output_layer')([output_beta, output_factor])
Compile Model#
model = Model(inputs=[input_beta, input_factor], outputs=output)
model.compile(loss='mse', optimizer='adam')
Automate model generation#
def make_model(hidden_units=8, n_factors=3):
    input_beta = Input((n_tickers, n_characteristics), name='input_beta')
    input_factor = Input((n_tickers,), name='input_factor')
    hidden_layer = Dense(units=hidden_units, activation='relu', name='hidden_layer')(input_beta)
    batch_norm = BatchNormalization(name='batch_norm')(hidden_layer)
    output_beta = Dense(units=n_factors, name='output_beta')(batch_norm)
    pfo_factor = Dense(units=n_characteristics, name='pfo_factor')(input_factor)
    output_factor = Dense(units=n_factors, name='output_factor')(pfo_factor)
    output = Dot(axes=(2, 1), name='output_layer')([output_beta, output_factor])
    model = Model(inputs=[input_beta, input_factor], outputs=output)
    model.compile(loss='mse', optimizer='adam')
    return model
Model Summary#
model.summary()
Model Visualization#
Train Model#
Cross-validation parameters#
YEAR = 52  # weekly observations per year
cv = MultipleTimeSeriesCV(n_splits=5,
                          train_period_length=15 * YEAR,
                          test_period_length=1 * YEAR,
                          lookahead=1)
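With weekly observations (YEAR = 52), each of the 5 folds trains on 15 years of data and validates on the following 52 weeks; lookahead=1 tells MultipleTimeSeriesCV that the label looks one period ahead, so the split leaves a gap that keeps training labels from overlapping the validation window.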
def get_train_valid_data(data, train_idx, val_idx):
    train, val = data.iloc[train_idx], data.iloc[val_idx]
    X1_train = train.loc[:, characteristics].values.reshape(-1, n_tickers, n_characteristics)
    X1_val = val.loc[:, characteristics].values.reshape(-1, n_tickers, n_characteristics)
    X2_train = train.loc[:, 'returns'].unstack('ticker')
    X2_val = val.loc[:, 'returns'].unstack('ticker')
    y_train = train.returns_fwd.unstack('ticker')
    y_val = val.returns_fwd.unstack('ticker')
    return X1_train, X2_train, y_train, X1_val, X2_val, y_val
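The reshape assumes every date carries the full ticker universe (missing values were padded upstream). A quick shape check on one fold, purely illustrative:
train_idx, val_idx = next(iter(cv.split(data)))
X1, X2, y, *_ = get_train_valid_data(data, train_idx, val_idx)
print(X1.shape)  # (n_train_weeks, n_tickers, n_characteristics) -> beta network input
print(X2.shape)  # (n_train_weeks, n_tickers) -> factor network input (current returns)
print(y.shape)   # (n_train_weeks, n_tickers) -> one-period-ahead returns (target)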
Hyperparameter Options#
factor_opts = [2, 3, 4, 5, 6]
unit_opts = [8, 16, 32]
param_grid = list(product(unit_opts, factor_opts))
Run Cross-Validation#
batch_size = 64
cols = ['units', 'n_factors', 'fold', 'epoch', 'ic_mean',
        'ic_daily_mean', 'ic_daily_std', 'ic_daily_median']
import gc
from tensorflow.keras import backend as k
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.utils import Sequence
# Define a custom data generator to feed data to the model batch by batch
class DataGenerator(Sequence):
    def __init__(self, X1, X2, y, batch_size):
        self.X1 = X1
        self.X2 = X2
        self.y = y
        self.batch_size = batch_size

    def __len__(self):
        # Number of batches per epoch
        return int(np.ceil(len(self.X1) / float(self.batch_size)))

    def __getitem__(self, idx):
        # Positional slice of one batch from both inputs and the targets
        batch_X1 = self.X1[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_X2 = self.X2[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]
        return [batch_X1, batch_X2], batch_y
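Indexing the generator yields one batch in the ([X1, X2], y) form the two-input model expects; a quick check, reusing the fold data from the shape check above:
gen = DataGenerator(X1, X2, y, batch_size=batch_size)
(b_X1, b_X2), b_y = gen[0]
print(len(gen))    # number of batches per epoch: ceil(len(X1) / batch_size)
print(b_X1.shape)  # (batch_size, n_tickers, n_characteristics)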
# Work around Keras memory leaks during long training loops
class ClearMemory(Callback):
    def on_epoch_end(self, epoch, logs=None):
        gc.collect()
        k.clear_session()
from tensorflow.keras.callbacks import EarlyStopping
# Define the early stopping callback (applied in the prediction loop below)
early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min')
start = time()
for units, n_factors in param_grid:
    print(f'units: {units}, n_factors: {n_factors}')
    scores = []
    for fold, (train_idx, val_idx) in enumerate(cv.split(data)):
        X1_train, X2_train, y_train, X1_val, X2_val, y_val = get_train_valid_data(data,
                                                                                  train_idx,
                                                                                  val_idx)
        train_gen = DataGenerator(X1_train, X2_train, y_train, batch_size=batch_size)
        val_gen = DataGenerator(X1_val, X2_val, y_val, batch_size=batch_size)
        model = make_model(hidden_units=units, n_factors=n_factors)
        # Train one epoch at a time so the validation IC can be scored after every epoch
        for epoch in range(250):
            model.fit(train_gen,
                      validation_data=val_gen,
                      epochs=epoch + 1,
                      initial_epoch=epoch,
                      verbose=0,
                      shuffle=True,
                      callbacks=[ClearMemory()])
            y_pred = model.predict(val_gen)
            y_true = y_val.stack().values
            date_index = y_val.stack().index
            result = (pd.DataFrame({'y_pred': y_pred.reshape(-1),
                                    'y_true': y_true},
                                   index=date_index)
                      .replace(-2, np.nan)  # drop entries padded with the -2 missing-return sentinel
                      .dropna())
            # IC: Spearman rank correlation of predictions vs. realized forward returns
            r0 = spearmanr(result.y_true, result.y_pred)[0]
            r1 = result.groupby(level='date').apply(lambda x: spearmanr(x.y_pred, x.y_true)[0])
            scores.append([units, n_factors, fold, epoch, r0,
                           r1.mean(), r1.std(), r1.median()])
            if epoch % 50 == 0:
                print(f'{format_time(time()-start)} | {n_factors} | {units:02} | {fold:02}-{epoch:03} | '
                      f'{r0:6.2%} | {r1.mean():6.2%} | {r1.median():6.2%}')
    scores = pd.DataFrame(scores, columns=cols)
    scores.to_hdf(results_path / 'scores.h5', f'{units}/{n_factors}')
Evaluate Results#
scores = []
with pd.HDFStore(results_path / 'scores.h5') as store:
    for key in store.keys():
        scores.append(store[key])
print(scores)
scores = pd.concat(scores)
avg = (scores.groupby(['n_factors', 'units', 'epoch'])
       [['ic_mean', 'ic_daily_mean', 'ic_daily_median']]
       .mean()
       .reset_index())
top = (avg.groupby(['n_factors', 'units'])
       .apply(lambda x: x.nlargest(n=5, columns=['ic_daily_median']))
       .reset_index(-1, drop=True))
avg.nlargest(n=50, columns=['ic_daily_median'])
top.nlargest(n=5, columns=['ic_daily_median'])
Generate Predictions#
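The settings below (8 units, 3 factors, epochs 50 through 79) appear to follow the best-performing region of the ranking above.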
n_factors = 3
units = 8
batch_size = 64
first_epoch = 50
last_epoch = 80
predictions = []
for epoch in tqdm(list(range(first_epoch, last_epoch))):
    epoch_preds = []
    for fold, (train_idx, val_idx) in enumerate(cv.split(data)):
        X1_train, X2_train, y_train, X1_val, X2_val, y_val = get_train_valid_data(data,
                                                                                  train_idx,
                                                                                  val_idx)
        train_gen = DataGenerator(X1_train, X2_train, y_train, batch_size=batch_size)
        val_gen = DataGenerator(X1_val, X2_val, y_val, batch_size=batch_size)
        # Re-create the model per fold; otherwise the loop would keep training
        # whatever model was left over from the grid search above
        model = make_model(hidden_units=units, n_factors=n_factors)
        model.fit(train_gen,
                  validation_data=val_gen,
                  epochs=epoch,
                  verbose=0,
                  shuffle=True,
                  callbacks=[ClearMemory(), early_stop])
        epoch_preds.append(pd.Series(model.predict(val_gen).reshape(-1),
                                     index=y_val.stack().index).to_frame(epoch))
    predictions.append(pd.concat(epoch_preds))
predictions_combined = pd.concat(predictions, axis=1).sort_index()
predictions_combined.to_hdf(results_path / 'predictions.h5', 'predictions')
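The stored frame has one column per epoch. One way to use it, not part of the original notebook, is to average those columns into a single signal and check its rank IC per date; a sketch, assuming the objects defined above:
preds = pd.read_hdf(results_path / 'predictions.h5', 'predictions')
signal = preds.mean(axis=1).rename('prediction')  # ensemble across epochs 50-79
ic = (signal.to_frame().join(data.returns_fwd).dropna()
      .groupby(level='date')
      .apply(lambda x: spearmanr(x.prediction, x.returns_fwd)[0]))
print(f'IC by date: mean {ic.mean():.2%} | median {ic.median():.2%}')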