outfile <- "data/blockshuffle_testing_data.csv.gz"
if(!file.exists(outfile)) {
  libs <- system("pkg-config --libs libzstd", intern = TRUE)
  cflags <- system("pkg-config --cflags libzstd", intern = TRUE)
  Sys.setenv(PKG_CPPFLAGS = "-mavx2 %s %s" | c(cflags, libs))
  Sys.setenv(PKG_LIBS = "-mavx2 %s %s" | c(cflags, libs))
  sourceCpp("blockshuffle_heuristic.cpp", verbose=TRUE, rebuild = TRUE)
  min_shuffleblock_size <- 262144
  
  blocks_df <- lapply(datasets, function(d) {
    tmp <- tempfile()
    data <- read_dataset(d)
    dname<- basename(d) %>% gsub("\\..+", "", .)
    qs2::qd_save(data, file = tmp)
    x <- qs2::qx_dump(tmp)
    r1 <- tibble(dataset = dname, blocks = x$blocks, algo = "qdata")
    qs2::qs_save(data, file = tmp)
    x <- qs2::qx_dump(tmp)
    r2 <- tibble(dataset = dname, blocks = x$blocks, algo = "qs2")
    rbind(r1, r2)
  }) %>% rbindlist
  blocks_df$blocksize <- sapply(blocks_df$blocks, length)
  blocks_df <- filter(blocks_df, blocksize >= min_shuffleblock_size)
  
  gc(full=TRUE)
  compress_levels <- 22:1
  results <- mclapply(compress_levels, function(cl) {
    print(cl)
    output <- shuffle_heuristic(blocks_df$blocks)
    output$no_shuffle_zblocksize <- og_compress(blocks_df$blocks, cl)$size
    output$shuffle_zblocksize <- shuffle_compress(blocks_df$blocks, 8, cl)$size
    output <- output %>% mutate(compress_level = cl)
  }, mc.cores=8, mc.preschedule=FALSE) %>% rbindlist
  results2 <- blocks_df %>% dplyr::select(dataset, algo, blocksize) %>%
    {lapply(1:length(compress_levels), function(i) .)} %>% rbindlist
  results <- cbind(results2, results)
  
  # add block index per dataset
  results <- results %>%
    group_by(dataset, compress_level, algo) %>%
    mutate(index = 1:n()) %>%
    as.data.frame
  fwrite(results, outfile, sep = ",")
} else {
  results <- fread(outfile, data.table=FALSE)
}
## 
## Generated extern "C" functions 
## --------------------------------------------------------
## 
## 
## #include <Rcpp.h>
## #ifdef RCPP_USE_GLOBAL_ROSTREAM
## Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
## Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
## #endif
## 
## // SIMD_test
## std::string SIMD_test();
## RcppExport SEXP sourceCpp_3_SIMD_test() {
## BEGIN_RCPP
##     Rcpp::RObject rcpp_result_gen;
##     rcpp_result_gen = Rcpp::wrap(SIMD_test());
##     return rcpp_result_gen;
## END_RCPP
## }
## // shuffle_compress
## DataFrame shuffle_compress(List blocks, int elementsize, int compress_level);
## RcppExport SEXP sourceCpp_3_shuffle_compress(SEXP blocksSEXP, SEXP elementsizeSEXP, SEXP compress_levelSEXP) {
## BEGIN_RCPP
##     Rcpp::RObject rcpp_result_gen;
##     Rcpp::traits::input_parameter< List >::type blocks(blocksSEXP);
##     Rcpp::traits::input_parameter< int >::type elementsize(elementsizeSEXP);
##     Rcpp::traits::input_parameter< int >::type compress_level(compress_levelSEXP);
##     rcpp_result_gen = Rcpp::wrap(shuffle_compress(blocks, elementsize, compress_level));
##     return rcpp_result_gen;
## END_RCPP
## }
## // og_compress
## DataFrame og_compress(List blocks, int compress_level);
## RcppExport SEXP sourceCpp_3_og_compress(SEXP blocksSEXP, SEXP compress_levelSEXP) {
## BEGIN_RCPP
##     Rcpp::RObject rcpp_result_gen;
##     Rcpp::traits::input_parameter< List >::type blocks(blocksSEXP);
##     Rcpp::traits::input_parameter< int >::type compress_level(compress_levelSEXP);
##     rcpp_result_gen = Rcpp::wrap(og_compress(blocks, compress_level));
##     return rcpp_result_gen;
## END_RCPP
## }
## // shuffle_heuristic
## DataFrame shuffle_heuristic(List blocks);
## RcppExport SEXP sourceCpp_3_shuffle_heuristic(SEXP blocksSEXP) {
## BEGIN_RCPP
##     Rcpp::RObject rcpp_result_gen;
##     Rcpp::traits::input_parameter< List >::type blocks(blocksSEXP);
##     rcpp_result_gen = Rcpp::wrap(shuffle_heuristic(blocks));
##     return rcpp_result_gen;
## END_RCPP
## }
## 
## Generated R functions 
## -------------------------------------------------------
## 
## `.sourceCpp_3_DLLInfo` <- dyn.load('/tmp/Rtmpf23OsJ/sourceCpp-x86_64-pc-linux-gnu-1.0.12/sourcecpp_1e9091a5542cd/sourceCpp_4.so')
## 
## SIMD_test <- Rcpp:::sourceCppFunction(function() {}, FALSE, `.sourceCpp_3_DLLInfo`, 'sourceCpp_3_SIMD_test')
## shuffle_compress <- Rcpp:::sourceCppFunction(function(blocks, elementsize, compress_level) {}, FALSE, `.sourceCpp_3_DLLInfo`, 'sourceCpp_3_shuffle_compress')
## og_compress <- Rcpp:::sourceCppFunction(function(blocks, compress_level) {}, FALSE, `.sourceCpp_3_DLLInfo`, 'sourceCpp_3_og_compress')
## shuffle_heuristic <- Rcpp:::sourceCppFunction(function(blocks) {}, FALSE, `.sourceCpp_3_DLLInfo`, 'sourceCpp_3_shuffle_heuristic')
## 
## rm(`.sourceCpp_3_DLLInfo`)
## 
## Building shared library
## --------------------------------------------------------
## 
## DIR: /tmp/Rtmpf23OsJ/sourceCpp-x86_64-pc-linux-gnu-1.0.12/sourcecpp_1e9091a5542cd
## 
## /usr/lib/R/bin/R CMD SHLIB --preclean -o 'sourceCpp_4.so' 'blockshuffle_heuristic.cpp'
# compare C++ implementation
test_data <- results %>% mutate(improvement = log(no_shuffle_zblocksize/shuffle_zblocksize))
timing_data <- test_data %>% dplyr::select(h1,h2,h3,h4,h5,h6,h7,h8,compress_level) %>% {lapply(1:5, function(i) .)} %>% rbindlist
tic(msg = "R package prediction time")
dtest <- xgb.DMatrix(data = timing_data %>% data.matrix)
r_pred <- predict(bst, dtest)
toc()
## R package prediction time: 3.576 sec elapsed
tic(msg = "Cpp package prediction time")
cpp_pred <- predict_xgboost_cpp(timing_data)
toc()
## Cpp package prediction time: 6.966 sec elapsed
dtest <- xgb.DMatrix(data = test_data %>% dplyr::select(h1,h2,h3,h4,h5,h6,h7,h8,compress_level) %>% data.matrix)
test_data <- test_data %>%
  mutate(r_prediction = predict(bst, dtest)) %>%
  mutate(cpp_prediction = predict_xgboost_cpp(test_data %>% dplyr::select(h1,h2,h3,h4,h5,h6,h7,h8,compress_level)))
# compare predictions
abs(test_data$r_prediction - test_data$cpp_prediction) %>% summary
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000001 0.032395 0.072015 0.119527 0.129750 1.731415
# compare prediction to actual
MAX_COMPRESS <- 16 # don't plot blocks with >16x compression, noisy
test_data2 <- test_data %>%
  filter(algo == "qdata") %>%
  filter(32768*4/(h2 + h4 + h6 + h8) < MAX_COMPRESS)
pal <- palette.colors(palette = "Okabe-Ito")
ggplot(test_data2, aes(x = cpp_prediction, y = improvement, color = dataset)) + 
  geom_abline(aes(slope=1, intercept = 0), lty = 2) + 
  geom_vline(aes(xintercept=0), lty=2, color = "orange") +
  geom_point(shape=21, alpha=0.75) + 
  facet_wrap(~compress_level, ncol=4) +
  scale_color_manual(values = pal) +
  theme_bw(base_size = 12)

# plot cumulative savings
test_data3 <- test_data %>%
  filter(algo == "qdata") %>%
  mutate(predicted_compression = 32768*4/(h2 + h4 + h6 + h8)) %>%
  mutate(do_blockshuffle = predicted_compression < MAX_COMPRESS & cpp_prediction > 0) %>%
  group_by(dataset, compress_level) %>%
  mutate(optimal = cumsum(pmin(no_shuffle_zblocksize, shuffle_zblocksize))) %>%
  mutate(shuffle_heuristic = cumsum(ifelse(do_blockshuffle, shuffle_zblocksize, no_shuffle_zblocksize))) %>%
  mutate(no_shuffle = cumsum(no_shuffle_zblocksize)) %>%
  mutate(heuristic_was_optimal = do_blockshuffle == (shuffle_zblocksize < no_shuffle_zblocksize) ) %>%
  ungroup
test_data3 <- test_data3 %>%
  filter(compress_level %in% c(3,9)) %>%
  dplyr::select(dataset, compress_level, index, optimal, shuffle_heuristic, no_shuffle) %>%
  pivot_longer(c(-index, -dataset, -optimal, -compress_level), names_to = "shuffle_selection", values_to = "cumulative_bytes") %>%
  mutate(inefficiency = (cumulative_bytes - optimal)/max(optimal) )
  
ggplot(test_data3, aes(x = index, y = inefficiency, color = shuffle_selection, lty = factor(compress_level))) + 
  geom_line() + 
  scale_y_continuous(labels = scales::percent) + 
  facet_wrap(~dataset, scales = "free") +
  theme_bw(base_size=12) + 
  labs(x = "Block Index", y = "Inefficiency", lty = "Compress Level", color = "Shuffle Selection")

# compare prediction to actual
MAX_COMPRESS <- 16 # don't plot blocks with >16x compression, noisy
test_data2 <- test_data %>%
  filter(algo == "qs2") %>%
  filter(32768*4/(h2 + h4 + h6 + h8) < MAX_COMPRESS)
pal <- palette.colors(palette = "Okabe-Ito")
ggplot(test_data2, aes(x = cpp_prediction, y = improvement, color = dataset)) + 
  geom_abline(aes(slope=1, intercept = 0), lty = 2) + 
  geom_vline(aes(xintercept=0), lty=2, color = "orange") +
  geom_point(shape=21, alpha=0.75) + 
  facet_wrap(~compress_level, ncol=4) +
  scale_color_manual(values = pal) +
  theme_bw(base_size = 12)

# plot cumulative savings
test_data3 <- test_data %>%
  filter(algo == "qs2") %>%
  mutate(predicted_compression = 32768*4/(h2 + h4 + h6 + h8)) %>%
  mutate(do_blockshuffle = predicted_compression < MAX_COMPRESS & cpp_prediction > 0) %>%
  group_by(dataset, compress_level) %>%
  mutate(optimal = cumsum(pmin(no_shuffle_zblocksize, shuffle_zblocksize))) %>%
  mutate(shuffle_heuristic = cumsum(ifelse(do_blockshuffle, shuffle_zblocksize, no_shuffle_zblocksize))) %>%
  mutate(no_shuffle = cumsum(no_shuffle_zblocksize)) %>%
  mutate(heuristic_was_optimal = do_blockshuffle == (shuffle_zblocksize < no_shuffle_zblocksize) ) %>%
  ungroup
test_data3 <- test_data3 %>%
  filter(compress_level %in% c(3,9)) %>%
  dplyr::select(dataset, compress_level, index, optimal, shuffle_heuristic, no_shuffle) %>%
  pivot_longer(c(-index, -dataset, -optimal, -compress_level), names_to = "shuffle_selection", values_to = "cumulative_bytes") %>%
  mutate(inefficiency = (cumulative_bytes - optimal)/max(optimal) )
  
ggplot(test_data3, aes(x = index, y = inefficiency, color = shuffle_selection, lty = factor(compress_level))) + 
  geom_line() + 
  scale_y_continuous(labels = scales::percent) + 
  facet_wrap(~dataset, scales = "free") +
  theme_bw(base_size=12) + 
  labs(x = "Block Index", y = "Inefficiency", lty = "Compress Level", color = "Shuffle Selection")
