Commit ad5415d: "first upload"
danielzak committed Apr 1, 2016 (1 parent: 2518c2d)

Showing 2 changed files with 408 additions and 0 deletions.
205 changes: 205 additions & 0 deletions ex1-self_learning_quant.py
@@ -0,0 +1,205 @@
from __future__ import print_function

import numpy as np
np.random.seed(1335) # for reproducibility
np.set_printoptions(precision=5, suppress=True, linewidth=150)

import pandas as pd
import backtest as twp
from matplotlib import pyplot as plt

from sklearn import metrics, preprocessing

#Load data
def load_data():
    price = np.arange(200/10.0) #linearly increasing prices
    return price

#Initialize first state, all items are placed deterministically
def init_state(data):
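    #the state is a single row of the scaled feature matrix: [close price, one-step price difference]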

    close = data
    diff = np.diff(data)
    diff = np.insert(diff, 0, 0)

    #--- Preprocess data
    xdata = np.column_stack((close, diff))
    xdata = np.nan_to_num(xdata)
    scaler = preprocessing.StandardScaler()
    xdata = scaler.fit_transform(xdata)

    state = xdata[0:1, :]
    return state, xdata

#Take Action
def take_action(state, xdata, action, signal, time_step):
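    #action codes (as used below): 0 = do nothing, 1 = set the signal to +100 shares (long),
    #2 = set the signal to -100 shares (short), 3 = set the signal to 0 (flat)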
    #this should generate a list of trade signals that at evaluation time are fed to the backtester
    #the backtester should get a list of trade signals and a list of price data for the asset

    #make the necessary adjustments to state and then return it
    time_step += 1

    #if the current iteration is the last state ("terminal state") then set terminal_state to 1
    if time_step == xdata.shape[0]:
        state = xdata[time_step-1:time_step, :]
        terminal_state = 1
        signal.loc[time_step] = 0
        return state, time_step, signal, terminal_state

    #move the market data window one step forward
    state = xdata[time_step-1:time_step, :]
    #take action
    if action != 0:
        if action == 1:
            signal.loc[time_step] = 100
        elif action == 2:
            signal.loc[time_step] = -100
        elif action == 3:
            signal.loc[time_step] = 0
    terminal_state = 0

    return state, time_step, signal, terminal_state

#Get Reward, the reward is returned at the end of an episode
def get_reward(new_state, time_step, action, xdata, signal, terminal_state, epoch=0):
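    #reward scheme below: while the episode runs, a shaped reward is given whenever the signal
    #changes, based on the (scaled) price move over the stretch the previous signal was held
    #plus a holding-time bonus; staying flat two steps in a row costs -10. At the terminal step
    #the reward is the total P&L from a full backtest of the signal series.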
    reward = 0
    signal.fillna(value=0, inplace=True)
    if terminal_state == 0:
        #get the reward for the most recent action
        if signal[time_step] != signal[time_step-1] and terminal_state == 0:
            i = 1
            while signal[time_step-i] == signal[time_step-1-i] and time_step - 1 - i > 0:
                i += 1
            reward = (xdata[time_step-1, 0] - xdata[time_step-i-1, 0]) * signal[time_step-1]*-100 + i*np.abs(signal[time_step-1])/10.0
        if signal[time_step] == 0 and signal[time_step-1] == 0:
            reward -= 10

    #calculate the reward over the whole episode if this is the last iteration in the set
    if terminal_state == 1:
        #run a backtest: send the list of trade signals and the asset data to the backtest function
        bt = twp.Backtest(pd.Series(data=[x[0] for x in xdata]), signal, signalType='shares')
        reward = bt.pnl.iloc[-1]

    return reward

def evaluate_Q(eval_data, eval_model):
    #This function is used to evaluate the performance of the system each epoch, without the influence of epsilon and random actions
    signal = pd.Series(index=np.arange(len(eval_data)))
    state, xdata = init_state(eval_data)
    status = 1
    terminal_state = 0
    time_step = 1
    while(status == 1):
        #We start in state S
        #Run the Q function on S to get predicted reward values for all the possible actions
        qval = eval_model.predict(state.reshape(1,2), batch_size=1)
        action = (np.argmax(qval))
        #Take action, observe new state S'
        new_state, time_step, signal, terminal_state = take_action(state, xdata, action, signal, time_step)
        #Observe reward; get_reward never uses its epoch argument, so it is left at its default here
        eval_reward = get_reward(new_state, time_step, action, xdata, signal, terminal_state)
        state = new_state
        if terminal_state == 1: #terminal state
            status = 0
    return eval_reward

#This neural network is the Q-function, run it like this:
#model.predict(state.reshape(1,2), batch_size=1)
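#Architecture: 2 inputs (scaled close and one-step diff) -> two ReLU hidden layers of 4 units
#-> 4 linear outputs, one estimated Q-value per action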

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import RMSprop

model = Sequential()
model.add(Dense(4, init='lecun_uniform', input_shape=(2,)))
model.add(Activation('relu'))
#model.add(Dropout(0.2)) I'm not using dropout in this example

model.add(Dense(4, init='lecun_uniform'))
model.add(Activation('relu'))
#model.add(Dropout(0.2))

model.add(Dense(4, init='lecun_uniform'))
model.add(Activation('linear')) #linear output so we can have range of real-valued outputs

rms = RMSprop()
model.compile(loss='mse', optimizer=rms)
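
#Quick sanity check: a single (1, 2) state maps to four Q-values, e.g.
#   print(model.predict(np.zeros((1, 2)), batch_size=1).shape)   #-> (1, 4)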


import random, timeit

start_time = timeit.default_timer()

indata = load_data()
epochs = 10
gamma = 0.9 #a high gamma makes a long term reward more valuable
epsilon = 1
learning_progress = [] #stores the evaluation reward after each epoch
h = 0
signal = pd.Series(index=np.arange(len(indata)))
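#Training loop: epsilon-greedy exploration with a one-step Q-learning target,
#   y[action] = reward + gamma * max_a Q(S', a)   (just the reward at the terminal step),
#fitted one sample at a time; the targets for the other actions stay at the network's own predictions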
for i in range(epochs):

    state, xdata = init_state(indata)
    status = 1
    terminal_state = 0
    time_step = 1
    #while learning is still in progress
    while(status == 1):
        #We start in state S
        #Run the Q function on S to get predicted reward values on all the possible actions
        qval = model.predict(state.reshape(1,2), batch_size=1)
        if (random.random() < epsilon) and i != epochs - 1: #maybe choose random action if not the last epoch
            action = np.random.randint(0,4) #assumes 4 different actions
        else: #choose best action from Q(s,a) values
            action = (np.argmax(qval))
        #Take action, observe new state S'
        new_state, time_step, signal, terminal_state = take_action(state, xdata, action, signal, time_step)
        #Observe reward
        reward = get_reward(new_state, time_step, action, xdata, signal, terminal_state, i)
        #Get max_Q(S',a)
        newQ = model.predict(new_state.reshape(1,2), batch_size=1)
        maxQ = np.max(newQ)
        y = np.zeros((1,4))
        y[:] = qval[:]
        if terminal_state == 0: #non-terminal state
            update = (reward + (gamma * maxQ))
        else: #terminal state (means that it is the last state)
            update = reward
        y[0][action] = update #target output
        model.fit(state.reshape(1,2), y, batch_size=1, nb_epoch=1, verbose=0)
        state = new_state
        if terminal_state == 1: #terminal state
            status = 0
    eval_reward = evaluate_Q(indata, model)
    print("Epoch #: %s Reward: %f Epsilon: %f" % (i, eval_reward, epsilon))
    learning_progress.append((eval_reward))
    if epsilon > 0.1:
        epsilon -= (1.0/epochs)

elapsed = np.round(timeit.default_timer() - start_time, decimals=2)
print("Completed in %f" % (elapsed,))

#plot results
bt = twp.Backtest(pd.Series(data=[x[0] for x in xdata]), signal, signalType='shares')
bt.data['delta'] = bt.data['shares'].diff().fillna(0)
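#'delta' is the per-step change in the share position, i.e. the individual trades placed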

print(bt.data)

plt.figure()
bt.plotTrades()
plt.suptitle('epoch' + str(i))
plt.savefig('plt/final_trades'+'.png', bbox_inches='tight', pad_inches=1, dpi=72) #assumes there is a ./plt dir
plt.close('all')

plt.figure()
plt.subplot(3,1,1)
bt.plotTrades()
plt.subplot(3,1,2)
bt.pnl.plot(style='x-')
plt.subplot(3,1,3)
plt.plot(learning_progress)

plt.show()

