Rcpp
, increasing the speed by several orders of magnitude, so that 100 million rows of data or even more can be processed normally.
# Create the sample data frame: four numeric columns of 12^5 rows each,
# drawn from uniform, normal, Poisson and chi-squared distributions.
col1 <- runif(12^5, 0, 2)
col2 <- rnorm(12^5, 0, 2)
col3 <- rpois(12^5, 3)
col4 <- rchisq(12^5, 2)
df <- data.frame(col1, col2, col3, col4)
which()
command to select rows, you can achieve one-third the speed of Rcpp
.
# Time the which() approach: start with every row labelled "less than 4",
# then overwrite only the rows whose row sum exceeds 4.
system.time({
  want <- which(rowSums(df) > 4)
  output <- rep("less than 4", times = nrow(df))
  output[want] <- "greater than 4"
})
# = 3 () user system elapsed 0.396 0.074 0.481
apply()
function to implement the same logic and compare it with the vectorized for loop. The result is again faster by orders of magnitude, but it is slower than ifelse()
and the version where the check was done outside the loop. This may be useful, but it may need some ingenuity for complex business logic.
# apply() family: label each row by calling a plain R function on it.
system.time({
  # x is one row of df, passed by apply() as a named numeric vector.
  # Returns "greater_than_4" when the four columns sum to more than 4,
  # otherwise "lesser_than_4".
  myfunc <- function(x) {
    if ((x[["col1"]] + x[["col2"]] + x[["col3"]] + x[["col4"]]) > 4) {
      "greater_than_4"
    } else {
      "lesser_than_4"
    }
  }
  output <- apply(df[, 1:4], 1, FUN = myfunc)  # apply 'myfunc' to every row
  df$output <- output
})
# Byte-code compile myfunc and time the same apply() call with the
# compiled version.
library(compiler)
myFuncCmp <- cmpfun(myfunc)

system.time({
  output <- apply(df[, 1:4], 1, FUN = myFuncCmp)
})
ifelse()
most effective. What if we add another zero? Below we implement the same logic with Rcpp
, with a data set of 100 million rows. We compare Rcpp
and ifelse()
speeds.
# Compile MyFunc.cpp, which exports myFunc() to R, and time the Rcpp version.
library(Rcpp)
sourceCpp("MyFunc.cpp")
system.time(output <- myFunc(df))  # Rcpp
// [[Rcpp::export]]
is required and must be placed immediately before the function you want to call from R.
// MyFunc.cpp #include using namespace Rcpp; // [[Rcpp::export]] CharacterVector myFunc(DataFrame x) { NumericVector col1 = as(x["col1"]); NumericVector col2 = as(x["col2"]); NumericVector col3 = as(x["col3"]); NumericVector col4 = as(x["col4"]); int n = col1.size(); CharacterVector out(n); for (int i=0; i 4){ out[i] = "greater_than_4"; } else { out[i] = "lesser_than_4"; } } return out; }
Rcpp
and ifelse
performance
# Parallel version: compute the row labels on a 4-worker socket cluster.
library(foreach)
library(doSNOW)

cl <- makeCluster(4, type = "SOCK")  # for a 4-core machine
registerDoSNOW(cl)

# Evaluate the condition once, vectorized, outside the parallel loop.
condition <- (df$col1 + df$col2 + df$col3 + df$col4) > 4

# seq_len() keeps the loop empty when df has zero rows.
system.time({
  output <- foreach(i = seq_len(nrow(df)), .combine = c) %dopar% {
    if (condition[i]) {
      "greater_than_4"
    } else {
      "lesser_than_4"
    }
  }
})

# Shut the workers down so they do not linger after the timing run.
stopCluster(cl)

df$output <- output
rm()
as early as possible, especially before long loops. Sometimes calling gc()
at the end of each iteration can help.
data.table()
is a great example because it does not overload the memory. This allows you to speed up operations like merging data.
# NOTE(review): assumes the data.table package is already attached - confirm.
dt <- data.table(df)  # create the data.table

# Time a row-by-row loop that writes the label into a fifth column, col5,
# by reference with `:=`.
system.time({
  # seq_len() keeps the loop empty when dt has zero rows.
  for (i in seq_len(nrow(dt))) {
    if ((dt[i, col1] + dt[i, col2] + dt[i, col3] + dt[i, col4]) > 4) {
      dt[i, col5 := "greater_than_4"]  # assign the output as the 5th column
    } else {
      dt[i, col5 := "lesser_than_4"]  # assign the output as the 5th column
    }
  }
})
data.table()
, byte-code compilation and parallelization, since they will be very different in each particular case, depending on how you use them.
Source: https://habr.com/ru/post/277693/