Commit bee2f12b authored by Jean-Marie Lepioufle's avatar Jean-Marie Lepioufle

version 0.0.5 w/ install&script

parent 0411ef24
^.*\.Rproj$
^\.Rproj\.user$
lib/
Package: cipred
Type: Package
-Version: 0.0.4
+Version: 0.0.5
Authors@R: c(person("Jean-Marie", "Lepioufle", , "jml@nilu.no", role=c("aut","cre")))
Title: cipred
Description: CI prediction.
......
......@@ -92,8 +92,10 @@ pred_rfci_lu2019 <- function(lu2019,dataset,predictors){
# code adapted from function quantForestError() in package forestError,
# https://github.com/benjilu/forestError/blob/master/R/quantforesterror.R
# Like quantForestError(), predict_lu2019() provides the prediction and the quantile representation of Q_alpha
# In addition, the function provides a Q_alpha relative to the prediction (in percent) and the p-value of the observation against the errors related to the specific trees/nodes
# The function fit_lu2019 gathers the ranger model and the terminal nodes obtained during training; together these are called the lu2019 model.
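# Illustrative sketch only (not the package implementation): with `errs` the
# pooled out-of-bag errors of the trees whose terminal nodes contain the test
# point, and `pred`/`obs` the prediction and observation (hypothetical names):
#   q_alpha <- stats::quantile(errs, probs = alpha)  # Q_alpha
#   rel_q   <- 100 * q_alpha / pred                  # relative Q_alpha [%]
#   p_value <- mean(errs <= (obs - pred))            # empirical p-value of obs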
# avoid check note
if(getRversion() >= "2.15.1"){utils::globalVariables(c("tree", "terminal_node",
......@@ -241,7 +243,7 @@ predict_lu2019_prime <- function(lu2019, X.test, Y.test, alpha, n.cores = 1) {
c(alpha, rev(alpha)))
col_names <- c(col_names, rel_interval_col_names)
-# compute Q-values alpha
+# compute relative Q-values alpha
oob_error_stats[, (rel_interval_col_names) := lapply(percentiles, FUN = function(p) {
100*1/pred * purrr::map_dbl(all_errs, ~.x[ceiling(length(.x) * p)])})]
......
# avoid check note
-if(getRversion() >= "2.15.1"){utils::globalVariables(c("method","U","pvalue"))}
+if(getRversion() >= "2.15.1"){utils::globalVariables(c("ID","method","U","pvalue"))}
#' predictive QQ-plot
......@@ -32,12 +32,12 @@ pqq_plot <- function(df,DESKTOP=TRUE,path=tempdir(),name=NULL,width_mm=200,heigh
} else grDevices::dev.new()
-df <- df %>% group_by(method) %>% arrange(pvalue) %>% mutate(U = seq(1:n())/n())
+df <- df %>% group_by(ID, method) %>% arrange(pvalue) %>% mutate(U = seq(1:n())/n())
p <- ggplot2::ggplot(df, ggplot2::aes(x = U, y = pvalue)) +
ggplot2::geom_abline(lty = "dashed", col = "red") +
#ggplot2::geom_point(ggplot2::aes(colour = method,shape = ID)) +
ggplot2::geom_point(ggplot2::aes(colour = method)) +
#ggplot2::scale_colour_gradient(low = "white", high = "black") +
ggplot2::coord_fixed(ratio = 1) +
ggplot2::labs(
x = "U[0,1]",
......
# avoid check note
if(getRversion() >= "2.15.1"){utils::globalVariables(c("obs","method","reu"))}
#' plot relative expanded uncertainty
#' plot_ru
#' @param reu_obj an object of class "reu"
#' @param DQO Data Quality Objective (DQO), in percent; 30 by default
#' @param ylim ylim
#' @param DESKTOP boolean
#' @param path path tempdir() by default
#' @param name name to be added to the filename
#' @param width_mm width in mm, 200 by default
#' @param height_mm height in mm, 200 by default
#' @keywords cipred
#' @export
#' @examples
#' \dontrun{
#' plot_ru()
#' }
#
plot_ru <- function(reu_obj,DQO=30,ylim = c(0,100),DESKTOP=TRUE,path=tempdir(),name=NULL,width_mm=200,height_mm=200){
if (inherits(reu_obj,"reu")){
if (!DESKTOP) {
filename <- normalizePath(file.path(path,paste0("plot_reu_",name,".tiff")),mustWork = FALSE)
grDevices::tiff(filename = filename, width = width_mm, height = height_mm,
units = "mm", pointsize = 12,
compression = "lzw",
bg = "transparent", res = 300)
} else grDevices::dev.new()
p <- ggplot2::ggplot(reu_obj, ggplot2::aes(x = obs, y = reu)) +
ggplot2::geom_point(ggplot2::aes(colour = method),alpha=1/10) +
ggplot2::geom_hline(yintercept = DQO) +
ggplot2::annotate("text", max(reu_obj$obs), DQO, vjust = -1, label = "DQO") +
#ggplot2::geom_smooth(method="loess") + #takes ages on large datasets
ggplot2::ylim(ylim[1],ylim[2]) +
ggplot2::labs(
x = "observation",
y = "ratio [%]"
)
print(p)
if (!DESKTOP) {
grDevices::dev.off()
return(filename)
} else {
return(TRUE)
}
}
}
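# Example (hypothetical `reu_obj` of class "reu"): with DESKTOP = FALSE the call
# writes plot_reu_demo.tiff under tempdir() and returns the file path.
# plot_ru(reu_obj, DQO = 30, DESKTOP = FALSE, name = "demo")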
......@@ -4,15 +4,15 @@ CI prediction.
Methods: Lu2019, Wager2014, Eaamm2010.
-## Installation of Cipred v0.0.4 in base R
+## Installation of Cipred v0.0.5 in base R
```r
setwd("~")
download.file("https://git.nilu.no/rqcr/cipred/-/archive/0.0.4/cipred-0.0.4.tar.gz",destfile="cipred-0.0.4.tar.gz")
untar("cipred-0.0.4.tar.gz")
setwd(file.path("~","cipred-0.0.4"))
source(file.path("~/cipred-0.0.4/lib/buildpkgs.R"))
install.packages(file.path("~","cipred-0.0.4.tar.gz"), repos=NULL, type='source')
unlink(file.path("~/cipred-0.0.4"),recursive=TRUE)
file.remove(file.path("~/cipred-0.0.4.tar.gz"))
download.file("https://git.nilu.no/rqcr/cipred/-/archive/0.0.5/cipred-0.0.5.tar.gz",destfile="cipred-0.0.5.tar.gz")
untar("cipred-0.0.5.tar.gz")
setwd(file.path("~","cipred-0.0.5"))
source(file.path("~/cipred-0.0.5/lib/buildpkgs.R"))
install.packages(file.path("~","cipred-0.0.5.tar.gz"), repos=NULL, type='source')
unlink(file.path("~/cipred-0.0.5"),recursive=TRUE)
file.remove(file.path("~/cipred-0.0.5.tar.gz"))
```
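A minimal usage sketch, assuming the function names used by the demo script shipped with this commit; `mydata`, `target` and `predictors` are placeholders:
```r
# split, fit a ranger model, then add a CI prediction with the Lu2019 method
split_dataset <- cipred::partition(mydata, train=0.8, test=0.2)
model_rf <- cipred::fit_ranger(split_dataset$train, target, predictors)
pred <- cipred::predict_ranger(model_rf, split_dataset$test)
model_lu2019 <- cipred::fit_rfci_lu2019(split_dataset$train, target, predictors)
ci <- cipred::pred_rfci_lu2019_prime(lu2019=model_lu2019, dataset=split_dataset$test,
                                     target=target, predictors=predictors)
```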
library(dplyr)
########################
# PARAMETERS #
########################
element_id <- c("NO2")
# keeping the interesting stations
#station_id <- c("7","464","827","665","9","11","163","504","809")
station_id <- c("7","464","827","665","9","11","163","809")
timeResolution <- "hourly"
precision <- "hourly"
#############################
## Training/testing dataset #
#############################
# Period 2017-2018
fromDate <- "2017/01/01 00:00:00"
toDate <- "2018/12/31 23:00:00"
# luft
tmp <- data.luftkval.oslo10::get_short_data(stations= station_id,
referencetimeFrom=fromDate,
referencetimeTo=toDate,
timeResolution=timeResolution,
elements=element_id)
luft <- friendlyts::ts_spread(df=tmp, col_date=c("date"),
col_key=c("CO_NAME","ID"),col_target="value",
precision=timeResolution,tzone="UTC",
date_type="posixlt")
luft <- friendlyts::check_date(luft,
fromDate=fromDate,
toDate=toDate,
timeResolution=timeResolution,
v=1,
precision=timeResolution,
tzone="UTC",
missingValues=NA,
date_type="posixlt")
# luft qa
luft_qa <- friendlyts::ts_spread(df=tmp, col_date=c("date"),
col_key=c("CO_NAME","ID"),col_target="qualityCode",
precision=timeResolution,tzone="UTC",
date_type="posixlt")
luft_qa <- friendlyts::check_date(luft_qa,
fromDate=fromDate,
toDate=toDate,
timeResolution=timeResolution,
v=1,
precision=timeResolution,
tzone="UTC",
missingValues=NA,
date_type="posixlt")
icol <- friendlyts::get_date_col(precision=timeResolution)
names(luft_qa) <- c(names(luft_qa)[icol],paste0("qa_",names(luft_qa)[-c(icol)]))
#luft <- glow::ts(service=friendly_service,
# fromDate=fromDate,toDate=toDate,precision=timeResolution,timeResolution=timeResolution,
# element_id=element_id,station_id=station_id)
# qa indicator
#luft_qa <- glow::qa(service=friendly_service,
# fromDate=fromDate,toDate=toDate,precision=timeResolution,timeResolution=timeResolution,
# element_id=element_id,station_id=station_id)
study <- friendlyts::bind_cols_fts(luft,luft_qa,precision=precision)
study <- stats::na.omit(study)
########################
## Validation dataset #
########################
# Period 2015-2016
fromDate <- "2015/01/01 00:00:00"
toDate <- "2016/12/31 23:00:00"
# luft
tmp <- data.luftkval.oslo10::get_short_data(stations= station_id,
referencetimeFrom=fromDate,
referencetimeTo=toDate,
timeResolution=timeResolution,
elements=element_id)
luft <- friendlyts::ts_spread(df=tmp, col_date=c("date"),
col_key=c("CO_NAME","ID"),col_target="value",
precision=timeResolution,tzone="UTC",
date_type="posixlt")
luft <- friendlyts::check_date(luft,
fromDate=fromDate,
toDate=toDate,
timeResolution=timeResolution,
v=1,
precision=timeResolution,
tzone="UTC",
missingValues=NA,
date_type="posixlt")
# luft qa
luft_qa <- friendlyts::ts_spread(df=tmp, col_date=c("date"),
col_key=c("CO_NAME","ID"),col_target="qualityCode",
precision=timeResolution,tzone="UTC",
date_type="posixlt")
luft_qa <- friendlyts::check_date(luft_qa,
fromDate=fromDate,
toDate=toDate,
timeResolution=timeResolution,
v=1,
precision=timeResolution,
tzone="UTC",
missingValues=NA,
date_type="posixlt")
icol <- friendlyts::get_date_col(precision=timeResolution)
names(luft_qa) <- c(names(luft_qa)[icol],paste0("qa_",names(luft_qa)[-c(icol)]))
#luft <- glow::ts(service=friendly_service,
# fromDate=fromDate,toDate=toDate,precision=timeResolution,timeResolution=timeResolution,
# element_id=element_id,station_id=station_id)
# qa indicator
#luft_qa <- glow::qa(service=friendly_service,
# fromDate=fromDate,toDate=toDate,precision=timeResolution,timeResolution=timeResolution,
# element_id=element_id,station_id=station_id)
validation <- friendlyts::bind_cols_fts(luft,luft_qa,precision=precision)
validation <- stats::na.omit(validation)
#########################
# Focus on valid data #
#########################
library(dplyr)
# read qcflag nilu meta
path <- try(normalizePath(file.path(.libPaths()[1],"data.luftkval.oslo10","data","qcflag_meta.rda"),mustWork=TRUE))
tmp <- load(file=path)
qcflag_meta <- get(tmp)
known_ok <- qcflag_meta %>% filter(FlagType=="OK") %>% select(ID)
all <- paste0(element_id,"_",station_id)
qa_all <- paste0("qa_",element_id,"_",station_id)
# build a filter expression like "qa_NO2_7 %in% c(...) & qa_NO2_464 %in% c(...)"
cond <- paste(qa_all, "%in%", known_ok,collapse=" & ")
# valid data on train/test dataset
study <- study %>% filter(eval(parse(text=cond))) %>% select(c("WDAY","YEAR","MONTH","DAY","HOUR"),tidyselect::all_of(all),tidyselect::all_of(qa_all)) %>% friendlyts::as_tbl_friendlyts(precision=precision,date_type="friendlyts",CHECKNAS=FALSE)
study <- stats::na.omit(study)
# split 80/20
split_dataset <- cipred::partition(study,train=0.8,test=0.2)
# valid data on validation dataset
validation <- validation %>% filter(eval(parse(text=cond))) %>% select(c("WDAY","YEAR","MONTH","DAY","HOUR"),tidyselect::all_of(all),tidyselect::all_of(qa_all)) %>% friendlyts::as_tbl_friendlyts(precision=precision,date_type="friendlyts",CHECKNAS=FALSE)
validation <- stats::na.omit(validation)
################################################################
# FIT on TRAIN DATASET / PRED ON TEST DATASET for station 11  #
################################################################
perf_testing <- list()
difference_testing <- list()
pred_test_eaam2010_prime <- list()
pred_test_wager2014_prime <- list()
pred_test_lu2019_prime <- list()
p_values_test <- list()
# target and predictors
station <- "11"
target <- paste0(element_id,"_",station)
predictors <- setdiff(names(luft), c(c("WDAY","YEAR","MONTH","DAY","HOUR"),target,names(luft_qa)))
cat('**********\n')
cat(target,'\n')
cat('-> fit ranger model\n')
model_rf <- cipred::fit_ranger(split_dataset$train,target,predictors)
cat('-> Ranger prediction on the test dataset\n')
tmp <- cipred::predict_ranger(model_rf,split_dataset$test)
pred_test <- data.frame(obs=split_dataset$test[[target]],
prediction=tmp$predictions)
cat('-> metrics of Ranger on test dataset\n')
perf_testing[[i_station]] <- sapply(c("rmse","mse","mae","crmse","ncrmse","bias","nbias","r2"),
FUN=function(x)
metrics::metrics(reference=pred_test[["obs"]],client=pred_test[["prediction"]],fun=x) )
perf_testing[[i_station]] <- c(ID= target,perf_testing[[i_station]])
cat('-> difference between obs and prediction on test dataset\n')
difference_testing[[i_station]] <- cbind(data.frame(ID= rep(target,length(pred_test[,"obs"])),obs=pred_test[["obs"]], pred=pred_test[["prediction"]], eps=metrics::differences(reference=pred_test[["obs"]],client=pred_test[["prediction"]],fun="eps")))
cat('-> eaam2010 prediction on the test dataset\n')
pred_test_eaam2010_prime[[i_station]] <- cipred::pred_rfci_eaamm2010_prime(model_rf,
dataset=split_dataset$test,
target=target)
pred_test_eaam2010_prime[[i_station]] <- cbind(data.frame(ID= rep(target,length(pred_test_eaam2010_prime[[i_station]][,"obs"]))),pred_test_eaam2010_prime[[i_station]])
cat('-> wager2014 prediction on the test dataset\n')
pred_test_wager2014_prime[[i_station]] <- cipred::pred_rfci_wager2014_prime(model_rf,
dataset=split_dataset$test,
target=target)
pred_test_wager2014_prime[[i_station]] <- cbind(data.frame(ID= rep(target,length(pred_test_wager2014_prime[[i_station]][,"obs"]))),pred_test_wager2014_prime[[i_station]])
cat('-> lu2019 prediction on the test dataset\n')
model_lu2019 <- cipred::fit_rfci_lu2019(split_dataset$train,target,predictors)
pred_test_lu2019_prime[[i_station]] <- cipred::pred_rfci_lu2019_prime(lu2019=model_lu2019,
dataset=split_dataset$test,
target=target,
predictors=predictors)
pred_test_lu2019_prime[[i_station]] <- cbind(data.frame(ID= rep(target,length(pred_test_lu2019_prime[[i_station]][,"obs"]))),pred_test_lu2019_prime[[i_station]])
cat('-> Gathering p-values\n')
p_values_test[[i_station]] <- data.frame(ID = character(0), pvalue=numeric(0),method=character(0))
p_values_test[[i_station]] <- rbind(p_values_test[[i_station]],data.frame(ID= rep(target,length(pred_test_eaam2010_prime[[i_station]][,"pvalue"])),pvalue=pred_test_eaam2010_prime[[i_station]][,"pvalue"],method=pred_test_eaam2010_prime[[i_station]][,"method"]))
p_values_test[[i_station]] <- rbind(p_values_test[[i_station]],data.frame(ID= rep(target,length(pred_test_wager2014_prime[[i_station]][,"pvalue"])),pvalue=pred_test_wager2014_prime[[i_station]][,"pvalue"],method=pred_test_wager2014_prime[[i_station]][,"method"]))
p_values_test[[i_station]] <- rbind(p_values_test[[i_station]],data.frame(ID= rep(target,length(pred_test_lu2019_prime[[i_station]][,"pvalue"])),pvalue=pred_test_lu2019_prime[[i_station]][,"pvalue"],method=pred_test_lu2019_prime[[i_station]][,"method"]))
perf_testing <- do.call("rbind",perf_testing)
difference_testing <- do.call("rbind",difference_testing)
p_values_test <- do.call("rbind",p_values_test)
##################################
# FIT/PRED ON VALIDATION DATASET #
##################################
perf_validation <- list()
difference_validation <- list()
pred_validation_eaam2010_prime <- list()
pred_validation_wager2014_prime <- list()
pred_validation_lu2019_prime <- list()
p_values_validation <- list()
# target and predictors
station <- "11"
target <- paste0(element_id,"_",station)
predictors <- setdiff(names(luft), c(c("WDAY","YEAR","MONTH","DAY","HOUR"),target,names(luft_qa)))
cat('**********\n')
cat(target,'\n')
cat('-> fit ranger model\n')
model_rf <- cipred::fit_ranger(split_dataset$train,target,predictors)
cat('-> Ranger prediction on the validation dataset\n')
tmp <- cipred::predict_ranger(model_rf,validation)
pred_validation <- data.frame(obs=validation[[target]],
prediction=tmp$predictions)
cat('-> metrics of Ranger on validation dataset\n')
perf_validation[[i_station]] <- sapply(c("rmse","mse","mae","crmse","ncrmse","bias","nbias","r2"),
FUN=function(x)
metrics::metrics(reference=pred_validation[["obs"]],client=pred_validation[["prediction"]],fun=x) )
perf_validation[[i_station]] <- c(ID= target,perf_validation[[i_station]])
cat('-> difference between obs and prediction on validation dataset\n')
difference_validation[[i_station]] <- cbind(data.frame(ID= rep(target,length(pred_validation[,"obs"])),obs=pred_validation[["obs"]], pred=pred_validation[["prediction"]],eps=metrics::differences(reference=pred_validation[["obs"]],client=pred_validation[["prediction"]],fun="eps")))
cat('-> eaam2010 prediction on the validation dataset\n')
pred_validation_eaam2010_prime[[i_station]] <- cipred::pred_rfci_eaamm2010_prime(model_rf,
dataset=validation,
target=target)
pred_validation_eaam2010_prime[[i_station]] <- cbind(data.frame(ID= rep(target,length(pred_validation_eaam2010_prime[[i_station]][,"obs"]))),pred_validation_eaam2010_prime[[i_station]])
cat('-> wager2014 prediction on the validation dataset\n')
pred_validation_wager2014_prime[[i_station]] <- cipred::pred_rfci_wager2014_prime(model_rf,
dataset=validation,
target=target)
pred_validation_wager2014_prime[[i_station]] <- cbind(data.frame(ID= rep(target,length(pred_validation_wager2014_prime[[i_station]][,"obs"]))),pred_validation_wager2014_prime[[i_station]])
cat('-> lu2019 prediction on the validation dataset\n')
model_lu2019 <- cipred::fit_rfci_lu2019(split_dataset$train,target,predictors)
pred_validation_lu2019_prime[[i_station]] <- cipred::pred_rfci_lu2019_prime(lu2019=model_lu2019,
dataset=validation,
target=target,
predictors=predictors)
pred_validation_lu2019_prime[[i_station]] <- cbind(data.frame(ID= rep(target,length(pred_validation_lu2019_prime[[i_station]][,"obs"]))),pred_validation_lu2019_prime[[i_station]])
cat('-> Gathering p-values\n')
p_values_validation[[i_station]] <- data.frame(ID = character(0), pvalue=numeric(0),method=character(0))
p_values_validation[[i_station]] <- rbind(p_values_validation[[i_station]],data.frame(ID= pred_validation_eaam2010_prime[[i_station]][,"ID"],pvalue=pred_validation_eaam2010_prime[[i_station]][,"pvalue"],method=pred_validation_eaam2010_prime[[i_station]][,"method"]))
p_values_validation[[i_station]] <- rbind(p_values_validation[[i_station]],data.frame(ID= pred_validation_wager2014_prime[[i_station]][,"ID"],pvalue=pred_validation_wager2014_prime[[i_station]][,"pvalue"],method=pred_validation_wager2014_prime[[i_station]][,"method"]))
p_values_validation[[i_station]] <- rbind(p_values_validation[[i_station]],data.frame(ID= pred_validation_lu2019_prime[[i_station]][,"ID"],pvalue=pred_validation_lu2019_prime[[i_station]][,"pvalue"],method=pred_validation_lu2019_prime[[i_station]][,"method"]))
perf_validation <- do.call("rbind",perf_validation)
difference_validation <- do.call("rbind",difference_validation)
p_values_validation <- do.call("rbind",p_values_validation)
####################################################################################
#################### CHARTS ##########################################################
####################################################################################
###############################################################
# figure pqqplot including the three methods #
###############################################################
cipred::pqq_plot(df=p_values_test,DESKTOP=FALSE,name="oslo_testing_no2_11", path='~')
cipred::pqq_plot(df=p_values_validation,DESKTOP=FALSE,name="oslo_validation_no2_11", path='~')
#################################
# flattening the lists #
#################################
pred_validation_eaam2010_prime <- do.call("rbind",pred_validation_eaam2010_prime)
pred_validation_wager2014_prime <- do.call("rbind",pred_validation_wager2014_prime)
pred_validation_lu2019_prime <- do.call("rbind",pred_validation_lu2019_prime)
#######################################################################
# Illustrative figure of observation, prediction and CI95% Lu2019 #
#######################################################################
res <- cbind(validation,pred_validation_lu2019_prime %>% filter(ID=="NO2_11"))
res <- plotfts::sub_period(tbl_fts=res,fromDate="2016/12/04 00:00:00",toDate="2016/12/08 00:00:00",timeResolution=timeResolution,tzone="UTC")
res <- friendlyts::dfts(tbl_fts=res,timeResolution=timeResolution,tzone="UTC")
res <- dplyr::select(res,c("date","NO2_11","prediction","lower_pred_0_05","upper_pred_0_05"))
filename_x <- normalizePath(file.path("~","timeseries_NO2_11_lu2019.tiff"),mustWork = FALSE)
grDevices::tiff(filename = filename_x, width=200,height=200,
units = "mm", pointsize = 12,
compression = "lzw",
bg = "transparent", res = 300)
p <- ggplot2::ggplot(res, ggplot2::aes(date)) +
ggplot2::geom_ribbon(ggplot2::aes(ymin = lower_pred_0_05, ymax = upper_pred_0_05), fill = "grey80") +
ggplot2::geom_point(ggplot2::aes(y = NO2_11)) +
ggplot2::geom_line(ggplot2::aes(y = prediction)) +
ggplot2::labs(x = "date",y = "[ug/m^3]")
print(p) # explicit print so the ggplot is written to the tiff device when sourced
grDevices::dev.off()
# plot prediction to highlight the artefact
plot_artefact <- function(dataset,pred,name,training=NULL,subsample=NULL){
filename <- file.path("~",paste0(name,".tiff"))
grDevices::tiff(filename = filename, width=100,height=100,
units = "mm", pointsize = 12,
compression = "lzw",
bg = "transparent", res = 300)
if (!is.null(training)) {
plot( pred, dataset$y, xlim = c(min(dataset$y,pred), max(dataset$y,pred)), ylim = c(min(dataset$y,pred), max(dataset$y,pred)), pch = 19, cex = 0.6, xlab = "prediction", ylab = "output",
panel.first = rect(training[1],-1e6, training[2], 1e6, col='grey', border=NA))
} else if(!is.null(subsample)) {
plot( pred, dataset$y, xlim = c(min(dataset$y,pred), max(dataset$y,pred)), ylim = c(min(dataset$y,pred), max(dataset$y,pred)), pch = 19, cex = 0.6, xlab = "prediction", ylab = "output")
for (i in seq_along(subsample)) {
abline(v=subsample[i],col="grey",lwd=0.05)
}
par(new=TRUE)
plot( pred, dataset$y, xlim = c(min(dataset$y,pred), max(dataset$y,pred)), ylim = c(min(dataset$y,pred), max(dataset$y,pred)), pch = 19, cex = 0.6, xlab = "prediction", ylab = "output")
} else {
plot( pred, dataset$y, xlim = c(min(dataset$y,pred), max(dataset$y,pred)), ylim = c(min(dataset$y,pred), max(dataset$y,pred)), pch = 19, cex = 0.6, xlab = "prediction", ylab = "output")
}
abline(a=0,b=1,lty=2)
grDevices::dev.off()
return(filename)
}
# plot error of prediction to highlight the artefact
plot_error <- function(dataset,pred,name,training=NULL,subsample=NULL){
filename <- file.path("~",paste0(name,".tiff"))
grDevices::tiff(filename = filename, width=100,height=100,
units = "mm", pointsize = 12,
compression = "lzw",
bg = "transparent", res = 300)
if (!is.null(training)) {
plot(pred, (dataset$y-pred), xlim = c(min(dataset$y,pred), max(dataset$y,pred)), ylim = c(min((dataset$y-pred)), max((dataset$y-pred))), col="purple",pch = 19, cex = 0.6, xlab = "prediction", ylab = "error",
panel.first = rect(training[1],-1e6, training[2], 1e6, col='grey', border=NA))
} else if(!is.null(subsample)) {
plot(pred, (dataset$y-pred), xlim = c(min(dataset$y,pred), max(dataset$y,pred)), ylim = c(min((dataset$y-pred)), max((dataset$y-pred))), col="purple",pch = 19, cex = 0.6, xlab = "prediction", ylab = "error")
for (i in seq_along(subsample)) {
abline(v=subsample[i],col="grey",lwd=0.05)
}
par(new=TRUE)
plot(pred, (dataset$y-pred), xlim = c(min(dataset$y,pred), max(dataset$y,pred)), ylim = c(min((dataset$y-pred)), max((dataset$y-pred))), col="purple",pch = 19, cex = 0.6, xlab = "prediction", ylab = "error")
} else {
plot(pred, (dataset$y-pred), xlim = c(min(dataset$y,pred), max(dataset$y,pred)), ylim = c(min((dataset$y-pred)), max((dataset$y-pred))), col="purple",pch = 19, cex = 0.6, xlab = "prediction", ylab = "error")
}
abline(h=0,lty=2)
grDevices::dev.off()
return(filename)
}
#############################
# HIGHLIGHT RFreg artefact #
#############################
# Adaptation from http://freerangestats.info/blog/2016/12/10/extrapolation
set.seed(123) # for reproducibility
library(ranger) # for random forests
# full and perfect dataset (truth: no error)
# assumption: uniform distribution of the values
x0 <- seq(1,300,0.1)
y <- 1/4 * x0
dataset <- data.frame(x0=x0, y=y)
# a subset of the dataset used as training dataset
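# A minimal sketch of a plausible continuation, adapted from the linked post
# (the training cut-off below is hypothetical, not the committed value): train
# on a sub-range only, predict on the full range, and plot with plot_artefact()
# and plot_error() above to expose the random-forest extrapolation artefact.
train <- dataset[dataset$x0 <= 100, ]
fit <- ranger::ranger(y ~ x0, data = train)
pred <- predict(fit, data = dataset)$predictions
plot_artefact(dataset, pred, name = "artefact_rf", training = range(train$y))
plot_error(dataset, pred, name = "error_rf", training = range(train$y))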