- Introduction
- The power of data.table
- Winning points
- Some examples for advanced usage
Live coding on a case study: sales from one of our clients, an e-commerce fashion business (sample)
2018-05-14
János Divényi
Emarsys
wifi: CEU Guest, pwd: Budapest1991
https://b.socrative.com/login/student/
Room Name: ERUM
Live coding on a case study: sales from one of our clients, an e-commerce fashion business (sample)
Download the data (2 large csv files) from divenyijanos.github.io/pages/talks
library(data.table) # 1.11.2 library(magrittr) # 1.5 library(ggplot2) # 2.2.2 library(dplyr) # 0.7.4 options(datatable.print.class = TRUE) sessionInfo() # R 3.4.2
## R version 3.4.2 (2017-09-28) ## Platform: x86_64-apple-darwin15.6.0 (64-bit) ## Running under: macOS High Sierra 10.13.4 ## ## Matrix products: default ## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib ## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib ## ## locale: ## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] dplyr_0.7.4 ggplot2_2.2.1 magrittr_1.5 data.table_1.11.2 ## ## loaded via a namespace (and not attached): ## [1] Rcpp_0.12.13 bindr_0.1 knitr_1.20 munsell_0.4.3 ## [5] colorspace_1.3-2 R6_2.2.2 rlang_0.2.0 stringr_1.2.0 ## [9] plyr_1.8.4 tools_3.4.2 grid_3.4.2 gtable_0.2.0 ## [13] htmltools_0.3.6 assertthat_0.2.0 yaml_2.1.14 lazyeval_0.2.1 ## [17] rprojroot_1.3-2 digest_0.6.12 tibble_1.4.2 bindrcpp_0.2 ## [21] glue_1.2.0 evaluate_0.10.1 rmarkdown_1.9 stringi_1.1.6 ## [25] compiler_3.4.2 pillar_1.2.2 scales_0.5.0 backports_1.1.2 ## [29] pkgconfig_2.0.1
Make sure you have the required packages.
data_file <- '../data/erum_workshop_dt_sales.csv' sales <- fread(data_file)
plotMicrobenchmark <- function(microbenchmark_result, x) {
microbenchmark_result$expr <- factor(microbenchmark_result$expr, labels = x)
p <- ggplot(microbenchmark_result, aes(expr, time/10^6, color = expr)) +
geom_boxplot() +
scale_y_log10() +
scale_color_discrete(guide = guide_legend(title = "Method")) +
labs(x = '', y = 'time (ms) - log scale')
print(p)
}
read_benchmark <- microbenchmark::microbenchmark(
read.csv(data_file),
readr::read_csv(data_file),
fread(data_file)
)
plotMicrobenchmark(read_benchmark, c('read.csv', 'read_csv', 'fread'))
Note: readr::read_csv() parses date column as class Date
sales class(sales)
sales[1:10] sales[customer_lifecycle_status == 'Lead']
Similar to dplyr, the environment is the data.table itself.
sales[, sum(sales_amount)] sales[, list(min(purchase_date), max(purchase_date))]
sales[, sum(sales_amount), by = purchase_date] sales[, sum(sales_amount), by = list(customer_lifecycle_status, purchase_date)] sales[, sum(sales_amount), by = (purchase_date > '2016-12-31')] sales[, sum(sales_amount), by = list(in_2017 = purchase_date > '2016-12-31')]
sales[, mean(sales_amount), by = customer_lifecycle_status]
sales[, sales_amount] # returns vector
sales[, list(sales_amount)] # returns data.table
sales[, .(sales_amount)] # shortcut for list()
sales[, c('sales_amount')] # data.frame-way but returns data.table
sales[, c('contact_id', 'sales_amount')] # same as with data.frame
sales[, .(sum(sales_amount), sd(sales_amount))] sales[, .(sum_sales_amount = sum(sales_amount), sd_sales_amount = sd(sales_amount))] sales[, .(mean_sales = mean(sales_amount)), by = customer_lifecycle_status]
sales[, .N, by = customer_lifecycle_status] sales[, uniqueN(contact_id), by = customer_lifecycle_status]
sales[, .N, by = contact_id]
sales[purchase_date >= '2017-01-01', .N, by = contact_id]
sales[,
.(
sum_sales = sum(sales_amount),
num_purchases = .N,
num_buyers = uniqueN(contact_id)
),
by = customer_lifecycle_status
]
sales[, table(customer_lifecycle_status)] sales[, hist(sales_amount)] sales[, lm(sales_amount ~ customer_lifecycle_status)]
sales[, .(mean_sales = mean(sales_amount)), by = customer_lifecycle_status] %>%
ggplot(aes(customer_lifecycle_status, mean_sales)) + geom_col()
sales[, .(daily_sales = sum(sales_amount)), by = purchase_date][daily_sales > 400000]
sales[, .(daily_sales = sum(sales_amount)), by = purchase_date] %>%
.[daily_sales > 400000]
sales[, .(max_sales = max(sales_amount)), by = purchase_date] %>%
.[max_sales > 10000]
sales[, a := 'a'] sales sales[, a := NULL] sales
sales[, purchase_date := as.Date(purchase_date)] # DON'T DO! very very slow
Parse the character date with fasttime:fastPOSIXct() first
date_conversion_benchmark <- microbenchmark::microbenchmark(
copy(sales)[, purchase_date := as.Date(purchase_date, '%Y-%m-%d')],
copy(sales)[, purchase_date := as.Date(lubridate::fast_strptime(purchase_date, '%Y-%m-%d'))],
copy(sales)[, purchase_date := as.Date(fasttime::fastPOSIXct(purchase_date))]
)
plotMicrobenchmark(date_conversion_benchmark, c('direct', 'lubridate', 'fasttime'))
sales_date <- copy(sales)[, purchase_date := as.Date(fasttime::fastPOSIXct(purchase_date))] # integer storage for fast sorting sales[, purchase_date := as.IDate(fasttime::fastPOSIXct(purchase_date))]
order_benchmark <- microbenchmark::microbenchmark(
sales_date[order(purchase_date)],
sales[order(purchase_date)]
)
plotMicrobenchmark(order_benchmark, c('Date', 'IDate'))
sales[, year := year(purchase_date)] sales[, month := month(purchase_date)]
sales[, .N, by = .(customer_lifecycle_status, year)] sales[, table(customer_lifecycle_status, year)]
sales[, daily_sum := sum(sales_amount), by = purchase_date] sales[purchase_date > '2016-12-31', daily_sum_in_2017 := sum(sales_amount), by = purchase_date]
sales[, lifecycle_average_sales := mean(sales_amount), by = customer_lifecycle_status]
sales[, c('year', 'month') := .(year(purchase_date), month(purchase_date))]
sales[, `:=`(year = year(purchase_date), month = month(purchase_date))]
sales[, c('daily_sum', 'daily_sum_in_2017') := NULL]
sales[, lapply(.SD, uniqueN)]
sales[, lapply(.SD, median), .SDcols = c('quantity', 'sales_amount')]
sales[,
lapply(.SD, function(x) quantile(x, p = 0.75)),
by = customer_lifecycle_status,
.SDcols = c('quantity', 'sales_amount')
]
sales[, lapply(.SD, max), by = year, .SDcols = c('sales_amount', 'quantity')]
sales[
sales_amount == max(sales_amount),
.(contact_id, sales_amount),
by = year # by only applies to calculation in j!
]
sales[order(-sales_amount), lapply(.SD, head, n = 1), by = year]
sales[order(-sales_amount), .SD[1], by = year]
Replace zero and negative values to NA
for (j in c('sales_amount', 'quantity')) {
set(sales, which(sales[[j]] < 0), j, NA)
}
returnMax <- function(dt, column) {
dt[, max(get(column))]
}
returnMax(sales, 'purchase_date')
returnMax(sales, 'quantity')
sales[
order(contact_id, purchase_date),
days_since_last_purchase := purchase_date - shift(purchase_date),
by = contact_id
]
sales %>%
ggplot(aes(is.na(days_since_last_purchase))) +
geom_bar() +
labs(x = 'First purchase')
sales[!is.na(days_since_last_purchase)] %>%
ggplot(aes(days_since_last_purchase)) +
geom_histogram() +
labs(x = 'Days since last purchase')
sales[, max(sales_amount), by = .(customer_lifecycle_status, year)] %>%
dcast(customer_lifecycle_status ~ year)
sales[, max(sales_amount), by = .(customer_lifecycle_status, year)] %>%
dcast(customer_lifecycle_status ~ year) %>%
melt(id.vars = 'customer_lifecycle_status')
sales[, lapply(.SD, sum), by = purchase_date, .SDcols = c('sales_amount', 'quantity')] %>%
melt(id.vars = 'purchase_date') %>%
ggplot(aes(purchase_date, value)) +
geom_line() +
facet_wrap(~ variable, ncol = 1, scales = 'free_y')
sales[, .(yearly_sales = sum(sales_amount)), by = year] sales[, .(yearly_sales = sum(sales_amount)), keyby = year]
Sorts on the key by reference
setkey(sales, customer_lifecycle_status) sales sales['Lead']
status_sales <- sales[,
.(sum_sales_by_status = sum(sales_amount)),
keyby = customer_lifecycle_status
]
status_sales[sales]
merge(sales, status_sales, by = 'customer_lifecycle_status') # works as well
https://b.socrative.com/login/student/
Room Name: ERUM
sales_df <- as.data.frame(sales_date) sales_tibble <- tibble::as.tibble(sales_date)
unique_benchmark <- microbenchmark::microbenchmark(
length(unique(sales_df$contact_id)),
summarise(sales_tibble, n_distinct(contact_id)),
sales[, uniqueN(contact_id)]
)
plotMicrobenchmark(unique_benchmark, x = c('base', 'dplyr', 'data.table'))
aggregate_benchmark <- microbenchmark::microbenchmark(
aggregate(
sales_df$sales_amount,
by = list(sales_df$customer_lifecycle_status, sales_df$purchase_date),
FUN = mean
),
sales_tibble %>%
group_by(customer_lifecycle_status, purchase_date) %>%
summarise(mean(sales_amount)),
sales[, mean(sales_amount), by = .(customer_lifecycle_status, purchase_date)]
)
plotMicrobenchmark(aggregate_benchmark, x = c('base', 'dplyr', 'data.table'))
new_var_benchmark <- microbenchmark::microbenchmark(
sales_df$sales_squared <- sales_df$sales_amount^2,
sales_tibble %<>% mutate(sales_squared = sales_amount^2),
sales[, sales_squared := sales_amount^2]
)
plotMicrobenchmark(new_var_benchmark, x = c('base', 'dplyr', 'data.table'))
columns <- c('sales_amount', 'quantity')
replace_benchmark <- microbenchmark::microbenchmark(
sales_df[, columns][sales_df[, columns] < 0] <- NA,
mutate_at(sales_tibble, columns, funs(replace(., . < 0, NA))),
for (j in columns) {
set(sales, which(sales[[j]] < 0), j, NA)
}
)
plotMicrobenchmark(replace_benchmark, x = c('base', 'dplyr', 'data.table'))
lookup_benchmark <- microbenchmark::microbenchmark(
sales_df[sales_df$order_id == 2430931,],
filter(sales, order_id == 2430931),
sales[order_id == 2430931]
)
plotMicrobenchmark(lookup_benchmark, c('base', 'dplyr', 'data.table'))
web <- fread('../data/erum_workshop_dt_web.csv')
web[, event_date := as.IDate(fasttime::fastPOSIXct(event_date))]
web[, .N, by = event_date] %>% ggplot(aes(event_date, N)) + geom_line()
sales[contact_id == 7226] web[contact_id == 7226]
sales[, join_date := as.IDate(purchase_date)] web[, join_date := event_date] setkey(sales, contact_id, join_date) setkey(web, contact_id, join_date) merged <- sales[web, roll = -Inf] # each web session to the nearest following purchase
merged merged[contact_id == 7226] merged[contact_id == 4775737]
merged[, .(conversion_rate = mean(!is.na(purchase_date))), by = event_date] %>%
ggplot(aes(event_date, conversion_rate)) +
geom_line() +
scale_y_continuous(labels = scales::percent)
# each web session to the nearest following purchase within a week merged <- sales[web, roll = -7] merged[contact_id == 7226] merged[contact_id == 4775737]
data_for_plot <- sales[,
.(daily_sales = sum(sales_amount)),
by = .(purchase_date, year, month)
][,
fitted := fitted(lm(log(daily_sales) ~ month)),
by = year
]
Instead of lm() you can use what you want.
ggplot(data_for_plot, aes(month, daily_sales)) +
geom_point(size = 2, alpha = 0.5) +
geom_line(aes(y = exp(fitted)), size = 1, color = 'firebrick') +
scale_x_continuous(breaks = scales::pretty_breaks(6)) +
facet_wrap(~year)
createSquares <- function(dt, columns) {
new_column_names <- paste(columns, 'square', sep = '_')
dt[, (new_column_names) := lapply(.SD, function(x) x^2), .SDcols = columns]
}
createSquares(sales, c('sales_amount', 'quantity'))
sales
createAvg <- function(dt, column, by = NULL) {
new_column_name <- paste('avg', column, sep = '_')
if (!is.null(by)) {
new_column_name <- paste(new_column_name, 'by', by, sep = '_')
}
dt[, (new_column_name) := mean(get(column)), by = by]
}
createAvg(sales, 'sales_amount')
sales
createAvg(sales, 'quantity', by = 'customer_lifecycle_status')
sales
sales[
year == 2017, .(revenue = sum(sales_amount), count = 1), by = contact_id
][
order(-revenue),
.(
share_of_revenue = cumsum(revenue)/sum(revenue),
share_of_contacts = cumsum(count)/.N
)
][
seq(1, .N, 100) # trick to make the plotting faster
] %>%
ggplot(aes(share_of_contacts, share_of_revenue)) + geom_line()
@paljenczy
@MattDowle
Stackoverflow guys, esp. @DavidArenburg, @akrun
https://b.socrative.com/login/student/
Room Name: ERUM