from itertools import filterfalse
from math import floor
from typing import Union

import numpy as np
from numpy.lib import stride_tricks
import pandas as pd
from more_itertools import powerset
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler

import utils
from utils import Scaler
from constants import DataDict, AssetID, DATASET_SYMBOLS


class TimeSeriesDataset:
    def __init__(self,
                 X: Union[np.ndarray, pd.DataFrame, pd.Series],
                 y: Union[np.ndarray, pd.DataFrame, pd.Series],
                 period=5,
                 test_size=0.2,
                 scaler: Scaler = None,
                 fit=True,
                 flatten=True,
                 name=None,
                 clone_scaler=False):
        """
        Dataset class for time series data (note: assumes X and y are already aligned)
        :param X: input data
        :param y: target data
        :param period: period of analysis (sliding window length)
        :param test_size: proportion of the dataset to use as the testing set
        :param scaler: scaler to use with the data. Data is not scaled if no scaler is provided (default)
        :param fit: whether to fit the scaler to the training data before transforming
        :param flatten: whether to flatten the sliding windows generated by the dataset
        :param name: name of the dataset
        :param clone_scaler: whether to clone the provided scaler (prevents aliasing when building datasets in loops)
        """
        self.name = name  # set the dataset name
        # clone the scaler if requested (only when one is provided, since clone(None) raises)
        self.scaler = clone(scaler) if clone_scaler and scaler is not None else scaler
        self.period = period
        self.y = y[self.period - 1:]  # drop targets that have no complete window
        # get the index of the end of the training set / start of the testing set
        train_end = test_start = floor(len(X) * (1 - test_size))
        if self.scaler is not None:
            if fit:
                self.scaler.fit(X[:train_end])  # fit the scaler to the training data only
            X = self.scaler.transform(X)  # scale the data
        if isinstance(X, (pd.DataFrame, pd.Series)):  # convert to numpy if DataFrame or Series
            X = X.to_numpy()
        if X.ndim == 1:  # if passed a 1D array for X (occurs if X is a Series)
            X = np.expand_dims(X, axis=1)  # add an extra dimension (required for the sliding window)
        self.features = X.shape[1]  # get the number of features
        # https://stackoverflow.com/questions/43185589/sliding-windows-from-2d-array-that-slides-along-axis-0-or-rows-to-give-a-3d-arra
        # Generate sliding windows
        nd0 = X.shape[0] - self.period + 1
        s0, s1 = X.strides
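        # Each window i is a view of X[i:i + period]: reusing the row stride s0
        # for both of the first two axes makes consecutive windows overlap in
        # memory without copying. The flatten reshape below cannot be expressed
        # as a view on overlapping windows, so it materializes a copy.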
        self.X = stride_tricks.as_strided(X, shape=(nd0, self.period, self.features), strides=(s0, s0, s1))
        if flatten:  # flatten the sliding windows
            self.X = self.X.reshape(-1, self.period * self.features)
        # set training and testing subsets
        self.X_train = self.X[:train_end]
        self.y_train = self.y[:train_end]
        self.X_test = self.X[test_start:]
        self.y_test = self.y[test_start:]
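
# A minimal usage sketch (illustrative only; shapes assume the defaults
# flatten=True and test_size=0.2, with a single feature):
#
#   ds = TimeSeriesDataset(np.arange(100.0), np.arange(100.0), period=5)
#   ds.X.shape        # (96, 5): 100 - period + 1 windows, flattened to period * features
#   ds.y.shape        # (96,):   the first period - 1 targets are dropped for alignment
#   ds.X_train.shape  # (80, 5): floor(100 * (1 - test_size)) windows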


class MultiAssetDataset(TimeSeriesDataset):
    def __init__(self, symbols: list[AssetID], data: DataDict, y, **kwargs):
        """
        Dataset class for time series datasets with multiple assets
        :param symbols: list of tuples (asset type, SYMBOL)
        :param data: dictionary of DataFrames to pull data from, indexed by the entries of symbols
        :param y: target data
        :param kwargs: arbitrary keyword arguments to pass to TimeSeriesDataset
        """
        self.symbols = symbols
        self.dfs = [data[asset_type][symbol] for asset_type, symbol in self.symbols]
        joined = utils.join_datasets(self.dfs)  # join the per-asset frames into one feature matrix
        X, y = utils.align_data(joined, y)  # align the joined features with the targets
        super().__init__(X, y, **kwargs)
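
# Hypothetical usage sketch (symbols index into the nested DataDict as
# data[asset_type][symbol]; ('crypto', 'BTC-USD') is a made-up entry for illustration):
#
#   ds = MultiAssetDataset([('stock', 'SPY.US'), ('crypto', 'BTC-USD')],
#                          returns_data, y_base, period=5, name='SPY + BTC')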


def build_datasets(period=5,
                   rand_features=5,
                   test_size=0.2,
                   zero_col_thresh=1,
                   replace_zero=None) -> list[TimeSeriesDataset]:
    """
    Builds the full suite of datasets for experimentation
    :param period: period for sliding windows
    :param rand_features: number of features to generate for the random sample dataset
    :param test_size: proportion of each dataset to use as the testing set
    :param zero_col_thresh: proportion of a column that must be zero to drop it (forwarded to make_returns_datadict)
    :param replace_zero: value to replace zeros in y_base with. No replacement if None (default)
    :return: list of datasets
    """
    # Load all the data
    raw_data = utils.load_data()
    returns_data = utils.make_returns_datadict(raw_data, zero_col_thresh=zero_col_thresh)
    # Generate the FULL available y set
    y_base = utils.make_returns_series(raw_data['stock']['SPY.US']['close'])
    y_base = y_base.apply(np.sign).shift(-1).iloc[:-1]
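    # y_base[t] is now the sign of SPY's return on day t + 1; the final row is
    # dropped because it has no next-day label.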
    if replace_zero is not None:
        y_base = y_base.replace(0, replace_zero)  # replace 0s (flat days) with the specified value
    # Initialize the list of datasets with just the random normal sample data and SPY alone
    datasets = [
        TimeSeriesDataset(np.random.normal(size=(len(y_base), rand_features)),  # standard normal noise features
                          y_base,
                          period=period,
                          test_size=test_size,
                          scaler=StandardScaler(),
                          name='Normal Sample'),
        TimeSeriesDataset(*utils.align_data(returns_data['stock']['SPY.US'], y_base),
                          period=period,
                          test_size=test_size,
                          scaler=StandardScaler(),
                          name='SPY Only')
    ]
    # MULTI-ASSET DATASET GENERATION
    # Generate the powerset of available asset types (the empty set is filtered out;
    # DATASET_SYMBOLS excludes SPY, which is prepended to every combination below)
    asset_powerset = filterfalse(lambda x: x == (), powerset(DATASET_SYMBOLS.keys()))
    for asset_set in asset_powerset:  # for each non-empty set of asset types
        symbol_list = utils.generate_symbol_list(asset_set)  # (list of symbols, display name)
        symbols, name = symbol_list[0], symbol_list[1]
        # generate and append a new MultiAssetDataset to the dataset list
        datasets.append(MultiAssetDataset([('stock', 'SPY.US')] + symbols,
                                          returns_data,
                                          y_base,
                                          name=name,
                                          period=period,
                                          test_size=test_size,
                                          scaler=StandardScaler(),
                                          clone_scaler=True))
    return datasets
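

if __name__ == '__main__':
    # Minimal smoke test on synthetic data (a sketch only; the real experiments
    # are assumed to construct datasets via build_datasets elsewhere in the project)
    rng = np.random.default_rng(0)
    demo_X = rng.normal(size=(100, 3))  # 100 days of 3 synthetic features
    demo_y = rng.choice([-1.0, 1.0], size=100)  # synthetic up/down labels
    demo = TimeSeriesDataset(demo_X, demo_y, period=5, scaler=StandardScaler(), name='demo')
    print(demo.name, demo.X_train.shape, demo.X_test.shape)  # demo (80, 15) (16, 15)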