05.01 - TIME SERIES PREDICTIONS

Contents

05.01 - TIME SERIES PREDICTIONS#

!wget --no-cache -O init.py -q https://raw.githubusercontent.com/fagonzalezo/ai4eng-unal/main/content/init.py
import init; init.init(force_download=False); init.get_weblink()

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import local.lib.timeseries as ts
%matplotlib inline

The data#

# Load the exchange-rate table (file name suggests EUR/COP) from the local CSV.
d = pd.read_csv("local/data/eurcop.csv")
# Use the parsed Date column as a datetime index so rows can be sliced by date.
d.index = pd.to_datetime(d.Date)
# The Date column now duplicates the index, so drop it.
del(d["Date"])
d.head()

	Rate	High (est)	Low (est)
Date
1999-09-06	2068.55	0.0	0.0
1999-09-07	2078.17	2103.0	2053.7
1999-09-08	2091.05	0.0	0.0
1999-09-09	2093.84	2118.6	2069.4
1999-09-10	2087.55	0.0	0.0

d.plot(figsize=(15,3))

<matplotlib.axes._subplots.AxesSubplot at 0x7fd05ad8cf50>

../_images/3c155b7153dd3fc7f04c23d6b9003312965a3f0a26a6fd0eab2037892279bc3f.png

d[["Rate"]].plot(figsize=(15,3))

<matplotlib.axes._subplots.AxesSubplot at 0x7fd05853d290>

../_images/cb09037ea23a4d23fb200a94d103d229aeeb0ce528822e41408bff62b63946d0.png

d = d[["Rate"]]
d.head(10)

	Rate
Date
1999-09-06	2068.55
1999-09-07	2078.17
1999-09-08	2091.05
1999-09-09	2093.84
1999-09-10	2087.55
1999-09-13	2062.96
1999-09-14	2047.08
1999-09-15	2040.93
1999-09-16	2047.17
1999-09-17	2060.87

A predictive model#

First create a time series dataset with look back#

# Build a look-back table from the Rate series. Judging by the displayed
# result, each row holds the 4 preceding rates (Rate_0 .. Rate_3, oldest
# first) next to the current Rate, and the first 4 dates are dropped because
# they lack a full history.
# NOTE(review): the helper lives in local.lib.timeseries, not visible here;
# the meaning of timelag=0 should be confirmed against its implementation.
dt = ts.timeseries_as_many2one(d, columns=["Rate"], nb_timesteps_in=4, timelag=0)
dt.head()

	Rate_0	Rate_1	Rate_2	Rate_3	Rate
Date
1999-09-10	2068.55	2078.17	2091.05	2093.84	2087.55
1999-09-13	2078.17	2091.05	2093.84	2087.55	2062.96
1999-09-14	2091.05	2093.84	2087.55	2062.96	2047.08
1999-09-15	2093.84	2087.55	2062.96	2047.08	2040.93
1999-09-16	2087.55	2062.96	2047.08	2040.93	2047.17

Split dataset for train and for test#

# Chronological split using partial-string date slicing on the datetime index:
# everything up to the end of 2008 is training data, 2009 onwards is test data.
trds = dt[:"2008"]
tsds = dt["2009":]
# Sanity check: the two partitions should add up to the full dataset.
print (dt.shape, trds.shape, tsds.shape)
# Plot both partitions on one axis to visualise where the split falls.
plt.figure(figsize=(15,3))
plt.plot(trds.index.values, trds.Rate.values, color="black", lw=2, label="train", alpha=.5)
plt.plot(tsds.index.values, tsds.Rate.values, color="red", lw=2, label="test", alpha=.5)
plt.grid();
plt.legend();

(5040, 5) (2691, 5) (2349, 5)

../_images/6f46be5baafb126828c172606e30f4bb8b5621d94253497d455f15f6061df0ef.png

Create `X` and `y` matrices for train and test#

# Features X: every column except "Rate" (i.e. the look-back columns).
# Target y: the "Rate" column itself. Both are extracted as plain numpy arrays.
Xtr, ytr = trds[[i for i in trds.columns if i!="Rate"]].values, trds.Rate.values
Xts, yts = tsds[[i for i in tsds.columns if i!="Rate"]].values, tsds.Rate.values

trds[:5]

	Rate_0	Rate_1	Rate_2	Rate_3	Rate
Date
1999-09-10	2068.55	2078.17	2091.05	2093.84	2087.55
1999-09-13	2078.17	2091.05	2093.84	2087.55	2062.96
1999-09-14	2091.05	2093.84	2087.55	2062.96	2047.08
1999-09-15	2093.84	2087.55	2062.96	2047.08	2040.93
1999-09-16	2087.55	2062.96	2047.08	2040.93	2047.17

print (Xtr[:10])
print (ytr[:10])

[[2068.55 2078.17 2091.05 2093.84]
 [2078.17 2091.05 2093.84 2087.55]
 [2091.05 2093.84 2087.55 2062.96]
 [2093.84 2087.55 2062.96 2047.08]
 [2087.55 2062.96 2047.08 2040.93]
 [2062.96 2047.08 2040.93 2047.17]
 [2047.08 2040.93 2047.17 2060.87]
 [2040.93 2047.17 2060.87 2065.02]
 [2047.17 2060.87 2065.02 2061.61]
 [2060.87 2065.02 2061.61 2080.33]]
[2087.55 2062.96 2047.08 2040.93 2047.17 2060.87 2065.02 2061.61 2080.33
 2085.85]

tsds[:5]

	Rate_0	Rate_1	Rate_2	Rate_3	Rate
Date
2009-01-01	3079.180176	3079.180176	3140.934326	3193.720215	3197.497070
2009-01-02	3079.180176	3140.934326	3193.720215	3197.497070	3093.394775
2009-01-04	3140.934326	3193.720215	3197.497070	3093.394775	3029.256836
2009-01-05	3193.720215	3197.497070	3093.394775	3029.256836	3029.256836
2009-01-06	3197.497070	3093.394775	3029.256836	3029.256836	2914.927246

print (Xts[:10])
print (yts[:20])

[[3079.18017578 3079.18017578 3140.93432617 3193.72021484]
 [3079.18017578 3140.93432617 3193.72021484 3197.49707031]
 [3140.93432617 3193.72021484 3197.49707031 3093.39477539]
 [3193.72021484 3197.49707031 3093.39477539 3029.25683594]
 [3197.49707031 3093.39477539 3029.25683594 3029.25683594]
 [3093.39477539 3029.25683594 3029.25683594 2914.92724609]
 [3029.25683594 3029.25683594 2914.92724609 2969.78344727]
 [3029.25683594 2914.92724609 2969.78344727 2954.19067383]
 [2914.92724609 2969.78344727 2954.19067383 2983.23510742]
 [2969.78344727 2954.19067383 2983.23510742 2923.44677734]]
[3197.49707031 3093.39477539 3029.25683594 3029.25683594 2914.92724609
 2969.78344727 2954.19067383 2983.23510742 2923.44677734 2923.44677734
 2913.42114258 2897.17138672 2885.69604492 2940.19921875 2941.0378418
 2941.0378418  2878.10864258 2848.63378906 2873.06689453 2828.56445312]

convert target into classification task for TREND PREDICTION (1 up, 0 down)#

# Turn the regression target into a binary trend label: 1 when the rate is
# strictly greater than the last look-back value (the final feature column),
# 0 otherwise. Because the comparison is strict, an unchanged rate is
# labelled 0 ("down").
# NOTE(review): this overwrites yts/ytr in place, so re-running the cell would
# compare the already-binary labels against the rates and corrupt them.
yts = (yts>Xts[:,-1]).astype(int)
ytr = (ytr>Xtr[:,-1]).astype(int)
print (ytr[:20])
print (yts[:20])

[0 0 0 0 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 0]
[1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 1 0]

inspect target class distributions#

# The mean of a 0/1 array is the fraction of 1's; report it as a percentage
# to see how balanced the "up" class is in each partition.
print ("1's in train %.2f%s"%(np.mean(ytr)*100, "%"))
print ("1's in test  %.2f%s"%(np.mean(yts)*100, "%"))

1's in train 45.04%
1's in test  41.72%

train a predictive model#

# Several classifier families are imported so that alternatives can be tried
# by swapping the active `estimator` line below; only the random forest is
# used as written.
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# Active model: a small random forest (5 trees, depth up to 30).
estimator = RandomForestClassifier(n_estimators=5, max_depth=30)
#estimator = DecisionTreeClassifier(max_depth=2)
#estimator = LogisticRegression()
#estimator = Pipeline((("pca", PCA(n_components=2)), ("estimator", estimator)))
# Fit on the training partition only; the trailing semicolon suppresses the
# notebook's echo of the fitted estimator.
estimator.fit(Xtr,ytr);

get predictive accuracy in train and test#

# Compare accuracy on the data the model was fitted on against unseen data.
# In the recorded run train accuracy (0.92) is far above test accuracy (0.52),
# i.e. the model does not generalise to the test period.
print ("train accuracy %.2f"%estimator.score(Xtr,ytr))
print ("test accuracy  %.2f"%estimator.score(Xts,yts))

train accuracy 0.92
test accuracy  0.52

inspect confusion matrix#

from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix on the test partition: true labels are passed first and
# predictions second, so rows are true classes and columns are predicted
# classes, matching the axis labels set below.
cm = confusion_matrix(yts, estimator.predict(Xts))
# annot=True with fmt="d" writes the integer count inside each cell.
sns.heatmap(cm,annot=True,cbar=False, fmt="d")
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')

Text(0.5, 1, 'Confusion Matrix')

../_images/4d80176627a83be93b4263b7b68f98ad2411cc941b5ba0e1c99fe31c4cf298db.png

A strategy#

if model predicts 1 (price up) we buy 10 EUR today and sell them tomorrow
if model predicts 0 (price down) we sell 10 EUR today and buy them tomorrow

def trade(d, date_close, op, qty):
    """Return the profit or loss of a one-step trade closed on `date_close`.

    The position is opened at the `Rate` of the row that immediately precedes
    `date_close` in `d` and is closed at the `Rate` of `date_close` itself.

    Parameters
    ----------
    d : DataFrame with a `Rate` column, indexed so that `d.loc[:date_close]`
        selects all rows up to and including the closing date.
    date_close : index label of the row on which the position is closed.
    op : "buy" (buy at the previous row, sell at close) or "sell" (sell at
        the previous row, buy back at close).
    qty : non-negative amount traded.

    Returns
    -------
    (close - open) * qty for a "buy", (open - close) * qty for a "sell".

    Raises
    ------
    AssertionError if `op` is not "buy"/"sell" or `qty` is negative.
    """
    assert op in ["buy", "sell"]
    assert qty>=0

    # The second-to-last row of the slice ending at the closing date is the
    # row just before it, i.e. where the position was opened.
    rows_until_close = d.loc[:date_close]
    open_rate = rows_until_close.iloc[-2].Rate
    close_rate = d.loc[date_close].Rate

    # Profit from the seller's point of view; a buyer earns the opposite.
    delta = (open_rate - close_rate) * qty
    return -delta if op == "buy" else delta

example: a buy operation on 2011-01-03 closed (with a sell operation) on 2011-01-04

trade(tsds, "2011-01-04", "buy", 100)

701.0498039999675

trade(tsds, "2011-01-05", "buy", 100)

-77.17285099997753

tsds["2011-01-02":].iloc[:5]

	Rate_0	Rate_1	Rate_2	Rate_3	Rate
Date
2011-01-02	2528.971680	2528.971680	2618.146240	2567.137939	2520.103760
2011-01-03	2528.971680	2618.146240	2567.137939	2520.103760	2520.103760
2011-01-04	2618.146240	2567.137939	2520.103760	2520.103760	2527.114258
2011-01-05	2567.137939	2520.103760	2520.103760	2527.114258	2526.342529
2011-01-06	2520.103760	2520.103760	2527.114258	2526.342529	2478.751709

yts

array([1, 0, 0, ..., 0, 0, 1])

def compute_pnl(d, y, predictions, qty=10):
    """Simulate one trade per row of `d` and tabulate the resulting PNL.

    For every index entry of `d` except the first (which has no preceding
    row to open a position on), a trade closed on that entry is evaluated
    with `trade`: a prediction of 0 triggers a "sell", anything else a "buy".

    Parameters
    ----------
    d : DataFrame passed through to `trade` (must expose a `Rate` column).
    y : true labels, positionally aligned with the rows of `d`.
    predictions : predicted labels, positionally aligned with the rows of `d`.
    qty : amount traded on every operation.

    Returns
    -------
    DataFrame indexed by `d.index[1:]` with columns `pnl`, `prediction`
    and `y`.
    """
    # Skip the first position everywhere so all columns stay aligned.
    dates = d.index[1:]
    signals = predictions[1:]

    profits = [
        trade(d, close_date, "sell" if signal == 0 else "buy", qty)
        for close_date, signal in zip(dates, signals)
    ]

    # Wrap the list as a single row and transpose to get one column.
    result = pd.DataFrame(np.array([profits]).T, index=dates, columns=["pnl"])
    result["prediction"] = signals
    result["y"] = y[1:]
    return result

# Predict the trend for every test row, then simulate the trading strategy
# over the test period with the default quantity of compute_pnl.
preds = estimator.predict(Xts)
pnl = compute_pnl(tsds, yts, preds)

# Per-trade PNL over time, with the overall sum shown in the title.
pnl.pnl.plot()
plt.title("TOTAL PNL %.2f COP"%pnl.pnl.sum())
plt.ylabel("PNL")
plt.grid();
# Fixed y-range: trades whose PNL falls outside +/-5000 are cut off in the plot.
plt.ylim(-5000,5000);

../_images/1fc250a02a0510ad5a0d64761cc02a08cb7c0bf6e612e864911595148ac866f9.png

def plot_pnlhist(pnl_series, label=""):
    """Plot a 30-bin histogram of PNL values via pyplot and title it.

    The total shown in the title is computed over ALL values; only the
    histogram itself leaves out entries whose absolute value is 50000 or more.

    Parameters
    ----------
    pnl_series : object exposing a numeric `.values` array (e.g. a Series).
    label : text inserted into the plot title.
    """
    values = pnl_series.values
    # Sum before filtering so that outliers still count towards the total.
    grand_total = np.sum(values)
    within_range = values[np.abs(values) < 50000]
    plt.hist(within_range, bins=30)
    plt.title("PNL for %s, total %.2f COP"%(label, grand_total))

# 2x2 grid of PNL histograms: top row splits trades by the real outcome,
# bottom row splits them by the model's prediction.
# preds[1:] is used because compute_pnl drops the first row, so the mask must
# skip the first prediction to line up with the rows of `pnl`.
plt.figure(figsize=(12,8))
plt.subplot(221); plot_pnlhist(pnl[pnl.y==1].pnl, "REAL = 1 (up)"); plt.grid();
plt.subplot(222); plot_pnlhist(pnl[pnl.y==0].pnl, "REAL = 0 (down)"); plt.grid();
plt.subplot(223); plot_pnlhist(pnl[preds[1:]==1].pnl, "PREDS = 1 (up)"); plt.grid();
plt.subplot(224); plot_pnlhist(pnl[preds[1:]==0].pnl, "PREDS = 0 (down)"); plt.grid();

../_images/64718b9245141a56506d170029193563748e53a4552e06c351f8ab471b8134a1.png