Commit 8cc1a255 authored by Jean-Marie Lepioufle's avatar Jean-Marie Lepioufle
Browse files

put on gitlab

parents
Package: fillger
Type: Package
Version: 0.0.1
Authors@R: c(person("Jean-Marie", "Lepioufle", , "jml@nilu.no", role=c("aut","cre")))
Title: fillger
Description: Fill-in the gaps.
First attempts.
Depends:
R (>= 3.4.1)
Imports:
ranger
VignetteBuilder: knitr
License: GPL-3
Encoding: UTF-8
LazyData: false
RoxygenNote: 6.1.1
# Generated by roxygen2: do not edit by hand
damaging <- function(dataset,col,nb_na){
res <- dataset
i <- round(runif(nb_na,1,nrow(dataset)))
j <- round(runif(nb_na,1,ncol(dataset)))
coord <- data.frame(i=i,j=col[j])
dam <-lapply(1:nrow(coord),function(x){
res <- as.data.frame(setNames(replicate((length(colnames(dataset))+1),numeric(0)),c("indice",colnames(dataset))))
res <- rbind(res,cbind(indice=coord[x,"i"],as.data.frame(dataset[coord[x,"i"],colnames(dataset)])))
res[,as.character(coord[x,"j"])] <- NA
return(res)
})
dam_df <- do.call("rbind",dam)
res[dam_df[,"indice"],col] <- dam_df[,col]
original <-lapply(1:nrow(coord),function(x){
res <- data.frame(row=numeric(0),col=character(0),value=numeric(0))
res <- rbind(res,data.frame(row=coord[x,"i"],col=as.character(coord[x,"j"]),value=unlist(dataset[coord[x,"i"],as.character(coord[x,"j"])]),row.names = NULL,stringsAsFactors = FALSE))
return(res)
})
original_df <- do.call("rbind",original)
return(list(original = original_df,
dataset = res ))
}
fill_in_sequential_one <- function(dataset,col){
res <- dataset
nb_na <- 1
while(nb_na<length(col)) {
with_na <- row_with_na(res[,col],nb_na)
indice <- which(sapply(with_na,function(x) !is.null(x)))
if (length(indice)>0) {
nb_round <- 0
while(nb_round<nb_na){
nb_round <- nb_round +1
filled <- lapply(indice,function(x){
# get target
target <- col[with_na[[x]]][nb_round]
# get predictors with non-NA value
predictors <- setdiff(col,target)
predictors <- predictors[which(!is.na(res[x,predictors]))]
# fill the gap
res <- fill_in_row(res,x,target,predictors)
return(res)
})
filled_df <- do.call("rbind",filled)
rm(filled)
# fill-in the dataset
res[filled_df[,"indice"],col] <- filled_df[,col]
}
}
nb_na <- nb_na +1
}
return(res)
}
fill_in_row <- function(dataset,row,target,predictors){
# 80/20 training/testing on dataset without any gaps
nona_dataset <- na.omit(dataset)
split_dataset <- partition(nona_dataset)
# fit model
model <- fit_model(split_dataset$train,target=target,predictors=predictors)
# predict model
pred <- predict_model(model,dataset[row,])
res <- as.data.frame(setNames(replicate((length(colnames(dataset))+1),numeric(0)),c("indice",colnames(dataset))))
res <- rbind(res,cbind(indice=row,as.data.frame(dataset[row,colnames(dataset)],stringsAsFactors = FALSE)))
res[,target] <- pred
return(res)
}
# model function
fit_model <- function(dataset,target,predictors){
predictors_ <- paste(predictors, collapse = "+")
rf_formula <- stats::as.formula(paste(target, predictors_, sep = " ~ "))
res <- ranger::ranger(formula=rf_formula, data=dataset, num.tree=800,importance='impurity',keep.inbag=TRUE)
# rem: importance Variable importance mode, one of 'none', 'impurity', 'impurity_corrected', 'permutation'.
# The 'impurity' measure is the Gini index for classification, the variance of the responses for regression and the sum of test statistics (see \code{splitrule}) for survival.
return(res)
}
predict_model <- function(object,dataset){
res <- stats::predict(object=object ,tibble::as_tibble(dataset),type="response")$prediction
return(res)
}
# partition function
partition <- function(dataset,train=0.8,test=0.2,seed=NULL){
if (!is.null(seed)) set.seed(seed)
fact <- sample(c("train","test"),dim(dataset)[1],prob=c(train,test),replace=TRUE)
res <- split(dataset,fact)
return(res)
}
row_with_na <- function(dataset,nb_na){
nr <- nrow(dataset)
res <- lapply(1:nr,function(x) {
col_na <- which(is.na(dataset[x,]))
if (length(col_na)==nb_na){
return(col_na)
}
})
return(res)
}
get_data <- function(dataset,coord){
tmp <-lapply(1:nrow(coord),function(x){
res <- data.frame(row=numeric(0),col=character(0),value=numeric(0))
res <- rbind(res,data.frame(row=coord[x,"i"],col=as.character(coord[x,"j"]),value=unlist(dataset[coord[x,"i"],as.character(coord[x,"j"])]),row.names = NULL,stringsAsFactors = FALSE))
return(res)
})
tmp_df <- do.call("rbind",tmp)
return(tmp_df)
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment