Commit 01cd8c20 authored by Jean-Marie Lepioufle

clean code

parent f6029975
......@@ -4,6 +4,15 @@ A package for testing AQ forecasting with different DL models
Package under development: changes may occur at any time.
# install
```bash
cd /tmp
git clone https://git.nilu.no/aqdl/mtsaq_pkg.git
cd mtsaq_pkg
pip3 install -e .
```
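To check the install (a quick sanity check; the module path mirrors the imports used elsewhere in this repo):
```bash
python3 -c "from mtsaq.dictionnary.dictionnary_model import model_dict; print(list(model_dict))"
```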
# jupyter with mtsaq on docker
```bash
docker pull jmll/jupyter.aqdl:0.1
......
```shell
docker pull jmll/jupyter:0.1
docker run -p 8888:8888 jmll/jupyter:0.1
```
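Jupyter then listens on http://localhost:8888; the container log typically prints the tokenized login URL to open in a browser.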
from typing import Dict
from datetime import datetime
from mtsaq.utils import get_random_alphanumeric_string
from mtsaq.utils.utils import get_random_alphanumeric_string
from mtsaq.dictionnary.dictionnary_model import model_dict
import torch
class class_model():
def __init__(self, params: dict):
......@@ -11,8 +13,8 @@ class class_model():
self.filename_core = get_random_alphanumeric_string(20)
def build(self):
if self.name in dictionnary_model:
self.model = dictionnary_model[self.name](**self.params)
if self.name in model_dict:
self.model = model_dict[self.name](**self.params)
self.model.to(self.device)
else:
raise Exception("Model " + self.name +" not found.")
......
......@@ -2,26 +2,56 @@
#import numpy as np
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
import torch
from mtsaq.utils.utils import encode_cos, encode_sin
from mtsaq.dictionnary.dictionnary_scaling import scaling_dict
class class_ts():
def __init__(self, path: str, date: list, id_start: int, id_stop: int, target: list, gap_length: int, horizon_length: int, features: list, history_length: int, ):
def __init__(self, path: str, date: list, id_start: int, id_stop: int, target: list, gap_length: int, horizon_length: int, features: list, features_cyclic: list, history_length: int ):
self.gap_length = gap_length
self.horizon_length = horizon_length
self.history_length = history_length
self.SCALED = False
self.features = features
self.target = target
tmp = pd.read_csv(path).sort_values(by=date)
self.df_features = tmp.loc[id_start: id_stop, features]
self.df_target = tmp.loc[id_start:id_stop, target]
if features_cyclic is not None:
for i in features_cyclic:
tmp[i] = [float(ele) for ele in tmp[i]]
tmp[i+'_sin'] = encode_sin(tmp[i], max(tmp[i]))
tmp[i+'_cos'] = encode_cos(tmp[i], max(tmp[i]))
del tmp[i]
self.features = features + [s + '_sin' for s in features_cyclic] + [s + '_cos' for s in features_cyclic]
else:
self.features = features
self.target = target
self.df_features = tmp.loc[id_start: id_stop, self.features]
self.df_target = tmp.loc[id_start:id_stop, self.target]
del tmp
self.scaling_dict = {
"StandardScaler": StandardScaler(),
"RobustScaler": RobustScaler(),
"MinMaxScaler": MinMaxScaler(),
"MaxAbsScaler": MaxAbsScaler()}
#################################
# !! self.scaling_dict[scaling] is shared (the dict holds single scaler instances); scaling the features is replaced by scaling the target
# TODO: write the scaling functions properly
tmp = self.df_features.mean()
self.mean_features = pd.DataFrame(tmp.values).transpose()
self.mean_features.columns = tmp.index
tmp = self.df_features.std()
self.std_features = pd.DataFrame(tmp.values).transpose()
self.std_features.columns = tmp.index
tmp = self.df_target.mean()
self.mean_target = pd.DataFrame(tmp.values).transpose()
self.mean_target.columns = tmp.index
tmp = self.df_target.std()
self.std_target = pd.DataFrame(tmp.values).transpose()
self.std_target.columns = tmp.index
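# guard: a zero std (constant column) would make standardization divide by zero, so replace it with 1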
for i in self.std_features:
if self.std_features[i].values == 0:
self.std_features[i] = 1
for i in self.std_target:
if self.std_target[i].values == 0:
self.std_target[i] = 1
###################################
# https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset
def __getitem__(self, id):
......@@ -36,22 +66,29 @@ class class_ts():
# https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset
def __len__(self) -> int:
return (
len(self.df_features.index) - self.history_length - self.gap_length - self.horizon_length - 1
)
return (len(self.df_features.index) - self.history_length - self.gap_length - self.horizon_length - 1)
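# Window accounting (a sketch of the assumed indexing; __getitem__ is not shown
# in full here): sample id would read history_length feature rows starting at id,
# skip gap_length rows, then read horizon_length target rows, so __len__ counts
# the start indices for which a complete window fits in the frame.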
def scale(self, scaling = None):
if scaling is not None:
if self.SCALED == False:
self.scaling_fun_features = self.scaling_dict[scaling]
self.scaling_fun_features.fit(self.df_features)
tmp_df = self.scaling_fun_features.transform(self.df_features)
self.df_features = pd.DataFrame(tmp_df, index=self.df_features.index, columns=self.df_features.columns)
del tmp_df
self.scaling_fun_target = self.scaling_dict[scaling]
self.scaling_fun_target.fit(self.df_target)
tmp_df = self.scaling_fun_target.transform(self.df_target)
self.df_target = pd.DataFrame(tmp_df, index=self.df_target.index, columns=self.df_target.columns)
# !! self.scaling_dict[scaling] is shared (the dict holds single scaler instances); consequence: scaling the features is replaced by scaling the target
# TODO: write the scaling functions properly
#self.scalefeat = self.scaling_dict[scaling]
#self.scalefeat.fit(self.df_features)
#tmp_df = self.scalefeat.transform(self.df_features)
#self.df_features = pd.DataFrame(tmp_df, index=self.df_features.index, columns=self.df_features.columns)
#del tmp_df
#self.scaletarg = self.scaling_dict[scaling]
#self.scaletarg.fit(self.df_target)
#tmp_df = self.scaletarg.transform(self.df_target)
#self.df_target = pd.DataFrame(tmp_df, index=self.df_target.index, columns=self.df_target.columns)
#del tmp_df
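# A possible fix (sketch, not part of this commit): store classes rather than
# instances in scaling_dict and instantiate one scaler per role, e.g.
#   scaling_dict = {"StandardScaler": StandardScaler, ...}
#   self.scaling_fun_features = scaling_dict[scaling]()
#   self.scaling_fun_target = scaling_dict[scaling]()
# so fitting the target scaler no longer overwrites the features scaler.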
# so far only StandardScaler is supported here
for i in self.df_features:
self.df_features[i] = self.df_features[i].apply(lambda x: (x - self.mean_features[i])/ self.std_features[i])
for i in self.df_target:
self.df_target[i] = self.df_target[i].apply(lambda x: (x - self.mean_target[i])/ self.std_target[i])
self.SCALED = True
else:
print('df already scaled')
......@@ -59,43 +96,59 @@ class class_ts():
print("scaling parameters required")
def unscale(self, newdata=None, datatype = None):
print(datatype)
if self.SCALED == True:
tmp_df = self.scaling_fun_features.inverse_transform(self.df_features)
self.df_features = pd.DataFrame(tmp_df, index=self.df_features.index, columns=self.df_features.columns)
del tmp_df
tmp_df = self.scaling_fun_target.inverse_transform(self.df_target)
self.df_target = pd.DataFrame(tmp_df, index=self.df_target.index, columns=self.df_target.columns)
self.SCALED = False
del tmp_df
elif newdata is not None:
if newdata is not None:
if isinstance(newdata, torch.Tensor):
newdata_np = newdata.numpy()
tmp = newdata.numpy()
elif isinstance(newdata, pd.Series) or isinstance(newdata, pd.DataFrame):
newdata_np = newdata.values
tmp = np.asarray(newdata)
elif isinstance(newdata, np.ndarray):
newdata_np = newdata
tmp = newdata
else:
print('instance of newdata not known')
if datatype == 'target' :
tmp = self.scaling_fun_target.inverse_transform(newdata_np)
# ! same remark as earlier
#tmp = self.scaletarg.inverse_transform(newdata_np)
# so far only StandardScaler is supported here
for i in range(len(self.target)):
tmp[:,:,i] = tmp[:,:,i]*self.std_target.iloc[i].values + self.mean_target.iloc[i].values
#tmp[i] = tmp[i].apply(lambda x: (x *self.std_target[i] + self.mean_target[i]))
if isinstance(newdata, torch.Tensor):
res = torch.from_numpy(tmp)
elif isinstance(newdata, pd.Series) or isinstance(newdata, pd.DataFrame):
res = pd.DataFrame(tmp_df, index=newdata.index, columns=newdata.columns)
res = pd.DataFrame(tmp, index=newdata.index, columns=newdata.columns)
elif isinstance(newdata, np.ndarray):
res = tmp
if datatype == 'features':
tmp = self.scaling_fun_features.inverse_transform(newdata_np)
else:
print('instance of tmp not known')
elif datatype == 'features':
# ! same remark as earlier
#tmp = self.scalefeat.inverse_transform(newdata_np)
for i in range(len(self.features)):
tmp[:,:,i] = tmp[:,:,i]*self.std_features.iloc[i].values + self.mean_features.iloc[i].values
#tmp[i] = tmp[i].apply(lambda x: (x *self.std_features[i] + self.mean_features[i]))
if isinstance(newdata, torch.Tensor):
res = torch.from_numpy(tmp)
elif isinstance(newdata, pd.Series) or isinstance(newdata, pd.DataFrame):
res = pd.DataFrame(tmp_df, index=newdata.index, columns=newdata.columns)
res = pd.DataFrame(tmp, index=newdata.index, columns=newdata.columns)
elif isinstance(newdata, np.ndarray):
res = tmp
else:
print('instance of tmp not known')
else:
print('datatype must be either "target" or "features"')
return res
elif self.SCALED == True:
# ! same remark as earlier
#tmp = self.scalefeat.inverse_transform(np.asarray(self.df_features))
#self.df_features = pd.DataFrame(tmp, index=self.df_features.index, columns=self.df_features.columns)
#del tmp
#tmp = self.scaletarg.inverse_transform(np.asarray(self.df_target))
#self.df_target = pd.DataFrame(tmp, index=self.df_target.index, columns=self.df_target.columns)
#self.SCALED = False
#del tmp
for i in self.df_features:
self.df_features[i] = self.df_features[i].apply(lambda x: (x *self.std_features[i] + self.mean_features[i]))
for i in self.df_target:
self.df_target[i] = self.df_target[i].apply(lambda x: (x *self.std_target[i] + self.mean_target[i]))
else:
print('df already unscaled')
......@@ -2,8 +2,8 @@
from torch.optim import Adam, SGD
from mtsaq.optim.optim import BertAdam
from torch.nn import MSELoss, SmoothL1Loss, PoissonNLLLoss, L1Loss
from mtsaq.optim.optim import RMSELoss, MAPELoss
from mtsaq.optim.dilate_loss import DilateLoss
from mtsaq.criterion.criterion import RMSELoss, MAPELoss
from mtsaq.criterion.dilate_loss import DilateLoss
optim_dict = {"Adam": Adam, "SGD": SGD, "BertAdam": BertAdam}
......@@ -16,4 +16,4 @@ criterion_dict = {
"DilateLoss": DilateLoss,
"L1": L1Loss}
evaluation_dict = {"NSE": "", "MSE": ""}
#NSE, KGE,...
#from mtsaq.models.transformer.multi_head_base import MultiAttnHeadSimple
from mtsaq.models.transformer.transformer_basic import SimpleTransformer, CustomTransformerDecoder
#from mtsaq.models.transformer.transformer_xl import TransformerXL
#from mtsaq.models.transformer.dummy_torch import DummyTorchModel
from mtsaq.models.lstm.lstm import LSTM_mts
from mtsaq.models.linear_regression.linear_regression import SimpleLinearModel
from mtsaq.models.da_rnn.model import DARNN
from mtsaq.models.autoencoder.basic_ae import AE
#from mtsaq.models.multi_head_base import MultiAttnHeadSimple
from mtsaq.models.transformer_basic import SimpleTransformer #, CustomTransformerDecoder
#from mtsaq.models.transformer_xl import TransformerXL
#from mtsaq.models.dummy_torch import DummyTorchModel
from mtsaq.models.lstm_mts import LSTM_mts
from mtsaq.models.linear_regression import SimpleLinearModel
#from mtsaq.models.da_rnn.model import DARNN
#from mtsaq.models.autoencoder import AE
import torch
"""
Utility dictionaries to map a string to a class
"""
dictionnary_model = {
model_dict = {
#"MultiAttnHeadSimple": MultiAttnHeadSimple,
"SimpleTransformer": SimpleTransformer,
#"TransformerXL": TransformerXL,
#"DummyTorchModel": DummyTorchModel,
"LSTM_mts": LSTM_mts #,
"LSTM_mts": LSTM_mts ,
"SimpleLinearModel": SimpleLinearModel,
#"CustomTransformerDecoder": CustomTransformerDecoder,
#"DARNN": DARNN,
......
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
scaling_dict = {
"StandardScaler": StandardScaler(),
"RobustScaler": RobustScaler(),
"MinMaxScaler": MinMaxScaler(),
"MaxAbsScaler": MaxAbsScaler()}
......@@ -28,7 +28,6 @@ class LSTM_mts(torch.nn.Module):
super().__init__()
self.num_layers = num_layers
self.hidden_size = hidden_size
print(locals().keys())
self.lstm = torch.nn.LSTM(input_size=input_size,
hidden_size = hidden_size,
num_layers = num_layers,
......@@ -42,17 +41,7 @@ class LSTM_mts(torch.nn.Module):
self.init_hidden(batch_size)
def init_hidden(self, batch_size) -> None:
self.hidden = (
torch.zeros(
self.num_layers,
batch_size,
self.hidden_size).to(
self.device),
torch.zeros(
self.num_layers,
batch_size,
self.hidden_size).to(
self.device))
self.hidden = (torch.zeros(self.num_layers,batch_size,self.hidden_size).to(self.device),torch.zeros(self.num_layers,batch_size,self.hidden_size).to(self.device))
def forward(self, x: torch.Tensor) -> torch.Tensor:
batch_size = x.size()[0]
......
......@@ -176,7 +176,7 @@ def greedy_decode(
def generate_square_subsequent_mask(sz: int) -> torch.Tensor:
r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
Unmasked positions are filled with float(0.0).
"""
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
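# e.g. for sz=3 the boolean mask (before the float fill) is lower-triangular,
# so position i may attend to positions <= i:
# [[True, False, False],
#  [True, True,  False],
#  [True, True,  True ]]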
......
from mtsaq.classes import class_model
from torch.utils.data import DataLoader
import torch
def predict(model: class_model,
params: dict,
data_loader: DataLoader) -> float:
nb_target = len(params['target'])
horizon_length = params['horizon_length']
output = []
for features_data, target_data in data_loader:
features_data = features_data.to(model.device)
target_data = target_data.to(model.device)
tmp = model.model(features_data)
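# reshape to (1, horizon_length, nb_target); assumes batch_size=1 in the data loader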
output.append(tmp.view(1,horizon_length,nb_target))
res = torch.cat(output, axis=0).to(model.device).detach()
return res
from mtsaq.classes.class_ts import class_ts
from mtsaq.classes.class_model import class_model
from mtsaq.train.train import single_train, loss
from mtsaq.predict.predict import predict
from torch.utils.data import DataLoader
##############
# Parameters #
##############
# dataset
train_dataset_param = {'path': 'https://git.nilu.no/jmll/dataset/-/raw/master/no2_oslo10_2015_2018.csv', 'date': ['YEAR','MONTH','DAY','HOUR'], 'id_start': 0, 'id_stop': 400, 'scaling': 'StandardScaler'} #8760 = (365*24)
test_dataset_param = {'path': 'https://git.nilu.no/jmll/dataset/-/raw/master/no2_oslo10_2015_2018.csv', 'date': ['YEAR','MONTH','DAY','HOUR'], 'id_start': 401, 'id_stop': 801, 'scaling': 'StandardScaler'} #8760 = (365*24)
train_dataset_param = {'path': 'https://git.nilu.no/jmll/dataset/-/raw/master/no2_oslo10_2015_2018.csv', 'date': ['YEAR','MONTH','DAY','HOUR'], 'id_start': 0, 'id_stop': 399, 'scaling': 'StandardScaler'} #id_stop : 8760 => (365*24)
test_dataset_param = {'path': 'https://git.nilu.no/jmll/dataset/-/raw/master/no2_oslo10_2015_2018.csv', 'date': ['YEAR','MONTH','DAY','HOUR'], 'id_start': 400, 'id_stop': 799, 'scaling': 'StandardScaler'}
validation_dataset_param = {'path': 'https://git.nilu.no/jmll/dataset/-/raw/master/no2_oslo10_2015_2018.csv', 'date': ['YEAR','MONTH','DAY','HOUR'], 'id_start': 800, 'id_stop': 1199, 'scaling': 'StandardScaler'}
# target
target_param = {'target': ['NO2_7'], 'gap_length': 6, 'horizon_length': 1}
target_param = {'target': ['NO2_7'], 'gap_length': 6, 'horizon_length': 3}
# features
features_param = {'features': ['WDAY','YEAR','MONTH','DAY','HOUR', 'NO2_11', 'NO2_163','NO2_464','NO2_504'], 'history_length': 30}
features_param = {'features': ['NO2_11', 'NO2_163','NO2_464','NO2_504'], 'features_cyclic': ['WDAY','YEAR','MONTH','DAY','HOUR'], 'history_length': 30}
# data loader
dataloader_param = {'batch_size': 6}
# model
model_param = {'name': 'LSTM_mts',
'param':{'input_size': len(features_param['features']),
'param':{'input_size': len(features_param['features']) + 2*len(features_param['features_cyclic']),
'hidden_size': 20,
'num_layers': 2,
'bias' : True,
......@@ -35,7 +41,7 @@ training_param = {'criterion':{'name': 'MSE', 'param':{}},
'optimizer': {'name': 'Adam','param':{'lr': 0.3}},
'takes_target': False,
'forward_params': {},
'epochs': 200,
'epochs': 50,
'batch_size':dataloader_param['batch_size']}
# decoder
......@@ -43,21 +49,19 @@ training_param = {'criterion':{'name': 'MSE', 'param':{}},
decoder_param = {}
# eval
eval_param = {'metrics': 'MSE'}
metrics_param = {'name':{'MSE'}}
##############################
# train/test dataset Loader #
##############################
# train
train = class_ts(train_dataset_param['path'], train_dataset_param['date'], train_dataset_param['id_start'], train_dataset_param['id_stop'], target_param['target'], target_param['gap_length'], target_param['horizon_length'], features_param['features'], features_param['history_length'])
train = class_ts(train_dataset_param['path'], train_dataset_param['date'], train_dataset_param['id_start'], train_dataset_param['id_stop'], target_param['target'], target_param['gap_length'], target_param['horizon_length'], features_param['features'], features_param['features_cyclic'], features_param['history_length'])
train.scale(train_dataset_param['scaling'])
#len(train)
train_data_loader = DataLoader(train,batch_size=dataloader_param['batch_size'], shuffle=False,sampler=None,batch_sampler=None,num_workers=0,collate_fn=None,pin_memory=False,drop_last=False,timeout=0,worker_init_fn=None)
#print(list(train_data_loader))
# test
test = class_ts(test_dataset_param['path'], test_dataset_param['date'], test_dataset_param['id_start'], test_dataset_param['id_stop'], target_param['target'], target_param['gap_length'], target_param['horizon_length'], features_param['features'], features_param['history_length'])
test = class_ts(test_dataset_param['path'], test_dataset_param['date'], test_dataset_param['id_start'], test_dataset_param['id_stop'], target_param['target'], target_param['gap_length'], target_param['horizon_length'], features_param['features'], features_param['features_cyclic'], features_param['history_length'])
test.scale(test_dataset_param['scaling'])
test_data_loader = DataLoader(test,batch_size=dataloader_param['batch_size'], shuffle=False,sampler=None,batch_sampler=None,num_workers=0,collate_fn=None,pin_memory=False,drop_last=False,timeout=0,worker_init_fn=None)
......@@ -71,32 +75,33 @@ model_1.build()
# Training model #
##########################
# single_train
#loss = single_train(model_1,training_param,train_data_loader)
session_params = []
for epoch in range(training_param['epochs']):
total_loss = single_train(model_1,training_param,train_data_loader)
print('epoch ' + str(epoch))
print(total_loss)
test = loss(test_data_loader,model_1)
if test < 0.01:
raise('Error validation loss is zero there is a problem with the validator.')
epoch_params = {
'epoch': epoch,
'train_loss': str(total_loss),
'test_loss': str(test)}
single_train(model_1,training_param,train_data_loader)
train_loss = loss(model_1,training_param,train_data_loader)
test_loss = loss(model_1,training_param,test_data_loader)
epoch_params = {'epoch': epoch, 'train_loss': str(train_loss),'test_loss': str(test_loss)}
print(epoch_params)
session_params.append(epoch_params)
#if es:
# if not es.check_loss(model.model, test):
# print('Stopping model now')
# model.model.load_state_dict(torch.load('checkpoint.pth'))
# break
#validation = data_ts(path, date, target, features, id_start = 17522, id_stop = 26282)
##########################
# Validation #
##########################
# validation dataset loader
validation = class_ts(validation_dataset_param['path'], validation_dataset_param['date'], validation_dataset_param['id_start'], validation_dataset_param['id_stop'], target_param['target'], target_param['gap_length'], target_param['horizon_length'], features_param['features'], features_param['features_cyclic'], features_param['history_length'])
validation.scale(validation_dataset_param['scaling'])
validation_data_loader = DataLoader(validation,batch_size=1, shuffle=False,sampler=None,batch_sampler=None,num_workers=0,collate_fn=None,pin_memory=False,drop_last=False,timeout=0,worker_init_fn=None)
# prediction
pred = predict(model_1,target_param,validation_data_loader)
validation.unscale(newdata=pred,datatype="target")
validation.unscale()
# metrics: MSE
import numpy as np
from mtsaq.dictionnary.dictionnary_metrics import criterion_dict
m_fun = criterion_dict['MSE']()
res = 0.0
for i in range(len(pred)):
res = res + sum(((validation[i][1].float()-pred[i].float())**2).numpy())[0]/validation.horizon_length/len(validation.target)
from mtsaq.classes import class_model
from mtsaq.dictionnary.dictionnary_metrics import optim_dict, criterion_dict
from torch.utils.data import DataLoader
import torch
def single_train(model: class_model,
params: dict,
data_loader: DataLoader) -> float:
optim_name = params['optimizer']['name']
optim_param = params['optimizer']['param']
opt = optim_dict[optim_name](model.model.parameters(), **optim_param)
crit_name = params['criterion']['name']
crit_param = params['criterion']['param']
criterion = criterion_dict[crit_name](**crit_param)
takes_target = params['takes_target']
forward_params = params['forward_params']
for features_data, target_data in data_loader:
opt.zero_grad()
features_data = features_data.to(model.device)
target_data = target_data.to(model.device)
if takes_target:
forward_params["t"] = target_data
output = model.model(features_data, **forward_params)
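# keep only the first target channel as labels (assuming target_data is batch x horizon x n_targets)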
labels = target_data[:, :, 0]
loss = criterion(output, labels.float())
loss.backward()
opt.step()
if torch.isnan(loss) or torch.isinf(loss):
raise ValueError("Error: infinite or NaN loss detected")
def loss(model: class_model,
params: dict,
data_loader: DataLoader) -> float:
crit_name = params['criterion']['name']
crit_param = params['criterion']['param']
criterion = criterion_dict[crit_name](**crit_param)
model.model.eval()
with torch.no_grad():
i = 0
running_loss = 0.0
for features_data, target_data in data_loader:
i += 1
features_data = features_data.to(model.device)
target_data = target_data.to(model.device)
output = model.model(features_data.float())
labels = target_data[:, :, 0]
loss = criterion(output, labels.float())
running_loss += loss.item()
total_loss = running_loss / float(i)
model.model.train()
return total_loss
import random
import string
import numpy as np
def get_random_alphanumeric_string(length):
letters_and_digits = string.ascii_letters + string.digits
result_str = ''.join((random.choice(letters_and_digits) for i in range(length)))
return result_str.lower()
def encode_cos(values, max_val):
res = np.cos(2 * np.pi * values/max_val)
return res
def encode_sin(values, max_val):
res = np.sin(2 * np.pi * values/max_val)
return res
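# Usage sketch (illustration only; the demo_* names are not part of the package):
# encode an hour-of-day column onto the unit circle, mirroring how class_ts
# builds the <col>_sin / <col>_cos features for features_cyclic.
demo_hours = np.arange(24, dtype=float)
demo_sin = encode_sin(demo_hours, max(demo_hours))
demo_cos = encode_cos(demo_hours, max(demo_hours))
# hour 23 now lies next to hour 0 on the circle instead of 23 steps away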
......@@ -21,11 +21,12 @@ setup(
author_email="jml@nilu.no",
packages=[
'mtsaq',
'mtsaq.class',
'mtsaq.classes',
'mtsaq.criterion',
"mtsaq.dictionnary",
"mtsaq.models",
"mtsaq.optim",
"mtsaq.train",
"mtsaq.utils"],
license='MIT + Copyright NILU',
description='A package for testing AQ forecasting with different DL models',
......