Rcpp
# , increasing the speed by several orders of magnitude, so that 100 million
# rows of data or even more can be processed normally.

# Demo data: four numeric columns of 12^5 random draws each, combined into the
# data frame `df` that the timing examples operate on.
col1 <- runif(12^5, min = 0, max = 2)
col2 <- rnorm(12^5, mean = 0, sd = 2)
col3 <- rpois(12^5, lambda = 3)
col4 <- rchisq(12^5, df = 2)
df <- data.frame(col1, col2, col3, col4)
# Baseline R loop: reads four cells of `df` per row and writes the label
# straight into column 5 of the data frame, one row at a time.
system.time({
  for (i in seq_len(nrow(df))) { # for every row
    row_total <- df[i, "col1"] + df[i, "col2"] + df[i, "col3"] + df[i, "col4"]
    if (row_total > 4) { # check if the row sum is > 4
      df[i, 5] <- "greater_than_4" # label goes into column 5
    } else {
      df[i, 5] <- "lesser_than_4" # label goes into column 5
    }
  }
})
# Preallocate the result vector so the loop fills it in place instead of
# writing into the data frame on every iteration.
output <- character(nrow(df))

system.time({
  for (i in seq_len(nrow(df))) {
    row_total <- df[i, "col1"] + df[i, "col2"] + df[i, "col3"] + df[i, "col4"]
    if (row_total > 4) {
      output[i] <- "greater_than_4"
    } else {
      output[i] <- "lesser_than_4"
    }
  }
  # Store the labels on the data frame in a single assignment after the loop.
  df$output <- output
})
# Preallocate the result and evaluate the condition once, for all rows, outside
# the loop; the loop body then only looks up a precomputed logical value.
output <- character(nrow(df))
condition <- (df$col1 + df$col2 + df$col3 + df$col4) > 4

system.time({
  for (i in seq_len(nrow(df))) {
    if (condition[i]) {
      output[i] <- "greater_than_4"
    } else {
      output[i] <- "lesser_than_4"
    }
  }
  df$output <- output
})
# Run the loop only over the rows where the condition is TRUE.
# Every row starts out as "lesser_than_4", so rows the loop never visits still
# end up with the correct label.
output <- rep("lesser_than_4", nrow(df))
condition <- (df$col1 + df$col2 + df$col3 + df$col4) > 4

system.time({
  # which() yields the indices of the TRUE entries, so no per-row check of
  # `condition` is needed inside the loop.
  for (i in which(condition)) {
    output[i] <- "greater_than_4"
  }
  df$output <- output
})
ifelse()
. The syntax is similar to the if
# function in MS Excel, but the acceleration is phenomenal, especially since
# there is no preallocation, and the condition is checked every time. This
# seems like a very profitable way to speed up the execution of simple loops.

# Vectorised version: ifelse() labels every row in one call, with no explicit
# loop and no preallocated result vector.
system.time({
  output <- ifelse(
    (df$col1 + df$col2 + df$col3 + df$col4) > 4,
    "greater_than_4",
    "lesser_than_4"
  )
  df$output <- output
})
Source: https://habr.com/ru/post/277681/
All Articles