- Introduction
- The power of data.table
- Winning points
- Some examples of advanced usage
Live coding on a case study: sales from one of our clients, an e-commerce fashion business (sample)
2018-05-14
János Divényi
Emarsys
wifi: CEU Guest, pwd: Budapest1991
https://b.socrative.com/login/student/
Room Name: ERUM
Download the data (2 large csv files) from divenyijanos.github.io/pages/talks
library(data.table)  # 1.11.2
library(magrittr)    # 1.5
library(ggplot2)     # 2.2.1
library(dplyr)       # 0.7.4
options(datatable.print.class = TRUE)
sessionInfo()        # R 3.4.2
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base
##
## other attached packages:
## [1] dplyr_0.7.4       ggplot2_2.2.1     magrittr_1.5      data.table_1.11.2
##
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.13     bindr_0.1        knitr_1.20       munsell_0.4.3
##  [5] colorspace_1.3-2 R6_2.2.2         rlang_0.2.0      stringr_1.2.0
##  [9] plyr_1.8.4       tools_3.4.2      grid_3.4.2       gtable_0.2.0
## [13] htmltools_0.3.6  assertthat_0.2.0 yaml_2.1.14      lazyeval_0.2.1
## [17] rprojroot_1.3-2  digest_0.6.12    tibble_1.4.2     bindrcpp_0.2
## [21] glue_1.2.0       evaluate_0.10.1  rmarkdown_1.9    stringi_1.1.6
## [25] compiler_3.4.2   pillar_1.2.2     scales_0.5.0     backports_1.1.2
## [29] pkgconfig_2.0.1
Make sure you have the required packages.
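If anything is missing, here is a minimal install sketch (not from the original slides; the list also covers packages used later in the session, such as microbenchmark and fasttime):

```r
# A sketch: install whatever is missing from the packages used in this workshop
pkgs <- c('data.table', 'magrittr', 'ggplot2', 'dplyr', 'microbenchmark',
          'readr', 'fasttime', 'lubridate', 'tibble', 'scales')
install.packages(setdiff(pkgs, rownames(installed.packages())))
```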
data_file <- '../data/erum_workshop_dt_sales.csv'
sales <- fread(data_file)
plotMicrobenchmark <- function(microbenchmark_result, x) {
    microbenchmark_result$expr <- factor(microbenchmark_result$expr, labels = x)
    p <- ggplot(microbenchmark_result, aes(expr, time/10^6, color = expr)) +
        geom_boxplot() +
        scale_y_log10() +
        scale_color_discrete(guide = guide_legend(title = "Method")) +
        labs(x = '', y = 'time (ms) - log scale')
    print(p)
}
read_benchmark <- microbenchmark::microbenchmark(
    read.csv(data_file),
    readr::read_csv(data_file),
    fread(data_file)
)
plotMicrobenchmark(read_benchmark, c('read.csv', 'read_csv', 'fread'))
Note: readr::read_csv() parses the date column as class Date.
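A quick way to see the difference (a sketch; fread() leaves purchase_date as character, which is why we parse it ourselves below):

```r
# fread() does not guess date types; readr::read_csv() does
class(fread(data_file)$purchase_date)            # "character"
class(readr::read_csv(data_file)$purchase_date)  # "Date"
```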
sales
class(sales)
sales[1:10]
sales[customer_lifecycle_status == 'Lead']
Similar to dplyr, expressions inside the brackets are evaluated in the environment of the data.table itself, so columns can be referenced by their bare names.
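For example (a sketch; the 100 threshold is made up for illustration):

```r
# Bare column names are looked up inside the data.table itself
sales[sales_amount > 100]                 # data.table: filter in i
dplyr::filter(sales, sales_amount > 100)  # the dplyr counterpart
```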
sales[, sum(sales_amount)]
sales[, list(min(purchase_date), max(purchase_date))]
sales[, sum(sales_amount), by = purchase_date]
sales[, sum(sales_amount), by = list(customer_lifecycle_status, purchase_date)]
sales[, sum(sales_amount), by = (purchase_date > '2016-12-31')]
sales[, sum(sales_amount), by = list(in_2017 = purchase_date > '2016-12-31')]
sales[, mean(sales_amount), by = customer_lifecycle_status]
sales[, sales_amount]        # returns vector
sales[, list(sales_amount)]  # returns data.table
sales[, .(sales_amount)]     # shortcut for list()
sales[, c('sales_amount')]   # data.frame-way but returns data.table
sales[, c('contact_id', 'sales_amount')]  # same as with data.frame
sales[, .(sum(sales_amount), sd(sales_amount))]
sales[, .(sum_sales_amount = sum(sales_amount), sd_sales_amount = sd(sales_amount))]
sales[, .(mean_sales = mean(sales_amount)), by = customer_lifecycle_status]
sales[, .N, by = customer_lifecycle_status]
sales[, uniqueN(contact_id), by = customer_lifecycle_status]
sales[, .N, by = contact_id]
sales[purchase_date >= '2017-01-01', .N, by = contact_id]
sales[, .(
    sum_sales = sum(sales_amount),
    num_purchases = .N,
    num_buyers = uniqueN(contact_id)
), by = customer_lifecycle_status]
sales[, table(customer_lifecycle_status)]
sales[, hist(sales_amount)]
sales[, lm(sales_amount ~ customer_lifecycle_status)]
sales[, .(mean_sales = mean(sales_amount)), by = customer_lifecycle_status] %>%
    ggplot(aes(customer_lifecycle_status, mean_sales)) +
    geom_col()
sales[, .(daily_sales = sum(sales_amount)), by = purchase_date][daily_sales > 400000]
sales[, .(daily_sales = sum(sales_amount)), by = purchase_date] %>%
    .[daily_sales > 400000]
sales[, .(max_sales = max(sales_amount)), by = purchase_date] %>%
    .[max_sales > 10000]
sales[, a := 'a']
sales
sales[, a := NULL]
sales
sales[, purchase_date := as.Date(purchase_date)] # DON'T DO! very very slow
Parse the character date with fasttime::fastPOSIXct() first.
date_conversion_benchmark <- microbenchmark::microbenchmark(
    copy(sales)[, purchase_date := as.Date(purchase_date, '%Y-%m-%d')],
    copy(sales)[, purchase_date := as.Date(lubridate::fast_strptime(purchase_date, '%Y-%m-%d'))],
    copy(sales)[, purchase_date := as.Date(fasttime::fastPOSIXct(purchase_date))]
)
plotMicrobenchmark(date_conversion_benchmark, c('direct', 'lubridate', 'fasttime'))
sales_date <- copy(sales)[, purchase_date := as.Date(fasttime::fastPOSIXct(purchase_date))]
# IDate uses integer storage for fast sorting
sales[, purchase_date := as.IDate(fasttime::fastPOSIXct(purchase_date))]
order_benchmark <- microbenchmark::microbenchmark(
    sales_date[order(purchase_date)],
    sales[order(purchase_date)]
)
plotMicrobenchmark(order_benchmark, c('Date', 'IDate'))
sales[, year := year(purchase_date)]
sales[, month := month(purchase_date)]
sales[, .N, by = .(customer_lifecycle_status, year)]
sales[, table(customer_lifecycle_status, year)]
sales[, daily_sum := sum(sales_amount), by = purchase_date]
sales[purchase_date > '2016-12-31', daily_sum_in_2017 := sum(sales_amount), by = purchase_date]
sales[, lifecycle_average_sales := mean(sales_amount), by = customer_lifecycle_status]
sales[, c('year', 'month') := .(year(purchase_date), month(purchase_date))]
sales[, `:=`(year = year(purchase_date), month = month(purchase_date))]
sales[, c('daily_sum', 'daily_sum_in_2017') := NULL]
sales[, lapply(.SD, uniqueN)]
sales[, lapply(.SD, median), .SDcols = c('quantity', 'sales_amount')]
sales[,
    lapply(.SD, function(x) quantile(x, p = 0.75)),
    by = customer_lifecycle_status,
    .SDcols = c('quantity', 'sales_amount')
]
sales[, lapply(.SD, max), by = year, .SDcols = c('sales_amount', 'quantity')]
sales[
    sales_amount == max(sales_amount),
    .(contact_id, sales_amount),
    by = year  # by only applies to the calculation in j!
]
sales[order(-sales_amount), lapply(.SD, head, n = 1), by = year]
sales[order(-sales_amount), .SD[1], by = year]
Replace the negative values with NA:
for (j in c('sales_amount', 'quantity')) {
    set(sales, which(sales[[j]] < 0), j, NA)
}
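A quick sanity check afterwards (a sketch, not in the original code):

```r
# Count the NAs introduced by the replacement above
sales[, lapply(.SD, function(x) sum(is.na(x))), .SDcols = c('sales_amount', 'quantity')]
```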
returnMax <- function(dt, column) {
    dt[, max(get(column))]
}
returnMax(sales, 'purchase_date')
returnMax(sales, 'quantity')
sales[
    order(contact_id, purchase_date),
    days_since_last_purchase := purchase_date - shift(purchase_date),
    by = contact_id
]
sales %>%
    ggplot(aes(is.na(days_since_last_purchase))) +
    geom_bar() +
    labs(x = 'First purchase')
sales[!is.na(days_since_last_purchase)] %>%
    ggplot(aes(days_since_last_purchase)) +
    geom_histogram() +
    labs(x = 'Days since last purchase')
sales[, max(sales_amount), by = .(customer_lifecycle_status, year)] %>%
    dcast(customer_lifecycle_status ~ year)
sales[, max(sales_amount), by = .(customer_lifecycle_status, year)] %>%
    dcast(customer_lifecycle_status ~ year) %>%
    melt(id.vars = 'customer_lifecycle_status')
sales[, lapply(.SD, sum), by = purchase_date, .SDcols = c('sales_amount', 'quantity')] %>%
    melt(id.vars = 'purchase_date') %>%
    ggplot(aes(purchase_date, value)) +
    geom_line() +
    facet_wrap(~ variable, ncol = 1, scales = 'free_y')
sales[, .(yearly_sales = sum(sales_amount)), by = year]
sales[, .(yearly_sales = sum(sales_amount)), keyby = year]
setkey() sorts the data.table on the key columns, by reference.
setkey(sales, customer_lifecycle_status)
sales
sales['Lead']
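You can verify what setkey() stored (a small check, not in the original slides):

```r
key(sales)     # "customer_lifecycle_status"
haskey(sales)  # TRUE
```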
status_sales <- sales[,
    .(sum_sales_by_status = sum(sales_amount)),
    keyby = customer_lifecycle_status
]
status_sales[sales]
merge(sales, status_sales, by = 'customer_lifecycle_status')  # works as well
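Since data.table 1.9.6 the same join also works ad hoc via the on= argument, without setting keys first (a hedged alternative to the keyed version above):

```r
# Equivalent join without relying on keys
status_sales[sales, on = 'customer_lifecycle_status']
```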
https://b.socrative.com/login/student/
Room Name: ERUM
sales_df <- as.data.frame(sales_date)
sales_tibble <- tibble::as.tibble(sales_date)
unique_benchmark <- microbenchmark::microbenchmark(
    length(unique(sales_df$contact_id)),
    summarise(sales_tibble, n_distinct(contact_id)),
    sales[, uniqueN(contact_id)]
)
plotMicrobenchmark(unique_benchmark, x = c('base', 'dplyr', 'data.table'))
aggregate_benchmark <- microbenchmark::microbenchmark(
    aggregate(
        sales_df$sales_amount,
        by = list(sales_df$customer_lifecycle_status, sales_df$purchase_date),
        FUN = mean
    ),
    sales_tibble %>%
        group_by(customer_lifecycle_status, purchase_date) %>%
        summarise(mean(sales_amount)),
    sales[, mean(sales_amount), by = .(customer_lifecycle_status, purchase_date)]
)
plotMicrobenchmark(aggregate_benchmark, x = c('base', 'dplyr', 'data.table'))
new_var_benchmark <- microbenchmark::microbenchmark(
    sales_df$sales_squared <- sales_df$sales_amount^2,
    sales_tibble %<>% mutate(sales_squared = sales_amount^2),
    sales[, sales_squared := sales_amount^2]
)
plotMicrobenchmark(new_var_benchmark, x = c('base', 'dplyr', 'data.table'))
columns <- c('sales_amount', 'quantity')
replace_benchmark <- microbenchmark::microbenchmark(
    sales_df[, columns][sales_df[, columns] < 0] <- NA,
    mutate_at(sales_tibble, columns, funs(replace(., . < 0, NA))),
    for (j in columns) {
        set(sales, which(sales[[j]] < 0), j, NA)
    }
)
plotMicrobenchmark(replace_benchmark, x = c('base', 'dplyr', 'data.table'))
lookup_benchmark <- microbenchmark::microbenchmark(
    sales_df[sales_df$order_id == 2430931, ],
    filter(sales_tibble, order_id == 2430931),
    sales[order_id == 2430931]
)
plotMicrobenchmark(lookup_benchmark, c('base', 'dplyr', 'data.table'))
web <- fread('../data/erum_workshop_dt_web.csv')
web[, event_date := as.IDate(fasttime::fastPOSIXct(event_date))]
web[, .N, by = event_date] %>%
    ggplot(aes(event_date, N)) +
    geom_line()
sales[contact_id == 7226]
web[contact_id == 7226]
sales[, join_date := as.IDate(purchase_date)]
web[, join_date := event_date]
setkey(sales, contact_id, join_date)
setkey(web, contact_id, join_date)
merged <- sales[web, roll = -Inf]  # each web session to the nearest following purchase
merged
merged[contact_id == 7226]
merged[contact_id == 4775737]
merged[, .(conversion_rate = mean(!is.na(purchase_date))), by = event_date] %>%
    ggplot(aes(event_date, conversion_rate)) +
    geom_line() +
    scale_y_continuous(labels = scales::percent)
# each web session to the nearest following purchase within a week
merged <- sales[web, roll = -7]
merged[contact_id == 7226]
merged[contact_id == 4775737]
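To see how the roll argument behaves in isolation, here is a toy illustration (hypothetical data, not part of the workshop files):

```r
# purchases on days 5 and 10; web sessions on days 4, 6 and 20
purchases <- data.table(id = 1L, day = c(5L, 10L), amount = c(50, 80), key = c('id', 'day'))
sessions  <- data.table(id = 1L, day = c(4L, 6L, 20L), key = c('id', 'day'))
# roll = -Inf carries the next purchase backward to each session:
# day 4 -> amount 50 (day 5), day 6 -> 80 (day 10), day 20 -> NA (no later purchase)
purchases[sessions, roll = -Inf]
```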
data_for_plot <- sales[,
    .(daily_sales = sum(sales_amount)),
    by = .(purchase_date, year, month)
][,
    fitted := fitted(lm(log(daily_sales) ~ month)),
    by = year
]
Instead of lm(), you can use any model you want.
ggplot(data_for_plot, aes(month, daily_sales)) +
    geom_point(size = 2, alpha = 0.5) +
    geom_line(aes(y = exp(fitted)), size = 1, color = 'firebrick') +
    scale_x_continuous(breaks = scales::pretty_breaks(6)) +
    facet_wrap(~year)
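The same pattern works with any model that has a fitted() method; for instance a loess smoother (a sketch reusing data_for_plot from above, with a made-up column name for the new fit):

```r
# Swap the linear fit for a loess fit, computed per year
data_for_plot[, fitted_loess := fitted(loess(log(daily_sales) ~ month)), by = year]
```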
createSquares <- function(dt, columns) {
    new_column_names <- paste(columns, 'square', sep = '_')
    dt[, (new_column_names) := lapply(.SD, function(x) x^2), .SDcols = columns]
}
createSquares(sales, c('sales_amount', 'quantity'))
sales
createAvg <- function(dt, column, by = NULL) {
    new_column_name <- paste('avg', column, sep = '_')
    if (!is.null(by)) {
        new_column_name <- paste(new_column_name, 'by', by, sep = '_')
    }
    dt[, (new_column_name) := mean(get(column)), by = by]
}
createAvg(sales, 'sales_amount')
sales
createAvg(sales, 'quantity', by = 'customer_lifecycle_status')
sales
sales[
    year == 2017,
    .(revenue = sum(sales_amount), count = 1),
    by = contact_id
][
    order(-revenue),
    .(
        share_of_revenue = cumsum(revenue)/sum(revenue),
        share_of_contacts = cumsum(count)/.N
    )
][
    seq(1, .N, 100)  # trick to make the plotting faster
] %>%
    ggplot(aes(share_of_contacts, share_of_revenue)) +
    geom_line()
@paljenczy
@MattDowle
Stack Overflow contributors, esp. @DavidArenburg and @akrun
https://b.socrative.com/login/student/
Room Name: ERUM