library(readxl)
library(dplyr)
library(pastecs)
library(ggplot2)
library(lubridate)
library(rfm)
library(cluster)
library(factoextra)
library(CLVTools)
library(caret)
Reads data and removes any missing values that might be present.
<- "https://archive.ics.uci.edu/ml/machine-learning-databases/00502/online_retail_II.xlsx"
url <- "online_retail_II.xlsx"
destfile ::curl_download(url, destfile)
curl<- read_excel(destfile)
df <- na.omit(df) df
I will create the following columns for analysis:
and
$InvoiceDateDay <- as_date(df$InvoiceDate)
df$Revenue <- as.numeric(df$Quantity * df$Price)
df
<- df %>%
df filter(Revenue > "0")
<- df %>%
pf group_by(`Customer ID`) %>%
summarise(count = n_distinct(Invoice))
<- ggplot(data = pf, aes(x = reorder(`Customer ID`, -count), y = count, fill = `Customer ID`)) +
pf_bp geom_bar(stat = "identity")
pf_bp
To take a look at the top six spenders, we can run the following code:
<- pf[order(pf$count, decreasing = T), ]
pf
head(pf)
As we can see, the distribution of purchases from customers follow a fairly wide distribution, with the customer #14911 purchasing the most number of orders. However, simply because the customer may purchase the most orders may not necessarily mean they spend the most money.
<- df %>%
tms group_by(`Customer ID`) %>%
summarise(total = sum(Revenue))
<- ggplot(data = tms, aes(x = reorder(`Customer ID`, -total), y = total, fill = `Customer ID`)) +
tms_bp geom_bar(stat = "identity")
tms_bp
<- tms[order(tms$total, decreasing = T), ]
tms
head(tms)
<- as_date("2010-12-09")
analysis_date
<- rfm_table_order(df, `Customer ID`, InvoiceDateDay, Revenue, analysis_date)
rfm_result
rfm_result
rfm_heatmap(rfm_result)
We can notice here that a large amount of monetary value is concentrated in high-frequency-purchasing consumers.
rfm_bar_chart(rfm_result)
Many of the consumers shop frequently, but there are quite a few consumers who have not shopped recently. We should delve deeper to see what types of products these consumers bought and if any promotions, coupons, and advertisements would entice them to revisit the store.
rfm_rm_plot(rfm_result)
It seems like a majority of consumers have shopped within the last 100 days. However, there are a few medium spenders who have have not purchased anything near the last 200 and 300 days.
rfm_fm_plot(rfm_result)
Unsurprisingly, there is a positive correlation between frequency of purchases and monetary value.
<- c("Champions", "Loyal Customers", "Potential Loyalist", "New Customers",
segment_names "Promising", "Need Attention", "About To Sleep", "At Risk", "Can't Lose Them",
"Lost")
<- c(4, 2, 3, 4, 3, 2, 2, 1, 1, 1)
recency_lower <- c(5, 5, 5, 5, 4, 3, 3, 2, 1, 2)
recency_upper <- c(4, 3, 1, 1, 1, 2, 1, 2, 4, 1)
frequency_lower <- c(5, 5, 3, 1, 1, 3, 2, 5, 5, 2)
frequency_upper <- c(4, 3, 1, 1, 1, 2, 1, 2, 4, 1)
monetary_lower <- c(5, 5, 3, 1, 1, 3, 2, 5, 5, 2)
monetary_upper
<- rfm_segment(rfm_result, segment_names, recency_lower, recency_upper, frequency_lower,
segment frequency_upper, monetary_lower, monetary_upper)
%>%
segment group_by(segment) %>%
summarise(count = n_distinct(customer_id))
rfm_plot_median_recency(segment)
<- df %>%
product group_by(Description) %>%
summarise(count = n_distinct(Invoice))
<- product[order(product$count, decreasing = T), ] product
<- c("recency_score", "frequency_score", "monetary_score")
cdf_columns
<- segment %>%
cdf select(all_of(cdf_columns))
<- scale(cdf) cdf
set.seed(123)
fviz_nbclust(cdf, kmeans, method = "wss")
For the elbow method, it seems that the recommendation is 4 clusters.
set.seed(123)
fviz_nbclust(cdf, kmeans, method = "silhouette")
This method shows that 2 clusters maximize the average silhouette values. 3 clusters coming in second and 6 clusters coming in third as the next optimal values.
<- clusGap(cdf, FUN = kmeans, nstart = 25, K.max = 10, B = 50)
gap_stat
fviz_gap_stat(gap_stat)
The gap statistic has been published by R. Tibshirani, G. Walther, and T. Hastie (Standford University, 2001). For this method, the optimal amount of clusters is 6. For this sample, we will use this method and the 6 clusters that recommends.
set.seed(123)
<- kmeans(cdf, 6, nstart = 25)
final_cluster print(final_cluster)
## K-means clustering with 6 clusters of sizes 551, 730, 620, 973, 935, 503
##
## Cluster means:
## recency_score frequency_score monetary_score
## 1 -1.0114978 -0.082082173 0.09271237
## 2 0.6097657 0.008418914 0.04617592
## 3 0.4877222 -0.990474486 -0.99241265
## 4 1.1134929 1.151350302 1.10430293
## 5 -1.1680195 -1.040511389 -1.10804316
## 6 -0.3608584 1.005546631 0.97820379
##
## Clustering vector:
## [1] 1 4 5 6 3 3 3 5 4 4 4 6 4 5 5 5 1 6 1 6 1 6 3 4 1 2 6 4 3 3 1 2 6 5 2 4 1
## [38] 2 3 2 5 5 4 2 6 5 3 6 3 5 4 5 6 6 3 3 4 2 6 1 1 3 4 4 4 3 4 4 1 1 3 5 3 3
## [75] 3 6 2 6 2 2 5 5 5 3 6 6 5 4 4 4 3 4 4 4 4 6 4 6 1 2 5 4 3 2 5 5 4 5 6 1 3
## [112] 2 1 6 6 1 2 5 4 4 1 5 5 6 1 1 2 3 6 4 1 1 5 1 2 5 5 2 2 3 4 5 5 2 1 2 4 2
## [149] 1 6 4 4 2 6 2 6 4 5 6 4 4 3 4 5 3 4 4 6 3 5 4 6 6 2 6 1 4 5 4 5 4 5 2 2 6
## [186] 6 4 1 4 3 3 1 3 6 5 4 5 1 3 2 4 2 1 5 2 1 6 5 4 4 4 4 3 6 3 5 2 2 1 2 5 2
## [223] 6 1 4 4 4 1 4 4 5 4 4 4 4 2 6 3 1 6 4 3 1 3 2 5 1 6 1 5 4 4 4 1 4 6 4 5 1
## [260] 3 6 4 2 5 1 5 5 5 3 5 1 2 4 5 4 6 1 1 3 5 5 3 2 5 3 5 2 2 5 1 1 6 2 1 2 6
## [297] 3 3 1 1 2 1 2 1 4 5 2 1 4 5 5 6 4 1 4 4 6 4 3 4 1 5 2 5 4 5 5 6 6 2 6 3 1
## [334] 5 5 6 4 5 4 1 4 2 1 4 5 4 2 2 1 4 6 2 3 3 2 5 3 5 3 5 3 3 2 1 2 2 2 4 6 4
## [371] 3 2 6 4 1 5 4 2 2 5 6 3 4 4 6 4 5 4 6 4 5 3 5 5 5 3 5 4 4 5 5 5 6 2 5 6 3
## [408] 4 5 4 5 4 4 6 3 5 6 4 1 4 1 2 4 5 6 2 6 3 3 5 1 3 2 1 5 6 3 4 3 5 3 5 3 3
## [445] 6 4 6 6 6 2 1 4 3 1 1 5 1 4 3 2 6 4 3 3 3 5 4 1 1 6 6 3 5 4 6 3 4 5 6 5 6
## [482] 5 1 6 5 1 4 3 4 1 5 3 1 5 4 4 3 3 4 6 3 5 1 2 5 3 4 4 1 6 4 3 3 4 1 4 5 3
## [519] 5 3 6 2 5 6 5 5 4 5 4 4 4 5 1 2 5 4 5 5 6 5 2 5 4 4 2 6 2 4 5 3 3 4 5 1 6
## [556] 4 2 4 6 3 5 4 5 5 3 3 3 1 5 3 4 4 1 3 4 1 5 3 4 5 1 5 6 3 3 6 5 4 4 6 4 4
## [593] 1 1 4 5 1 6 4 4 6 3 5 5 1 5 3 6 3 4 2 1 2 3 6 4 3 5 6 2 4 3 5 3 3 2 3 3 5
## [630] 3 4 2 4 1 6 4 4 3 3 3 4 3 2 4 5 3 5 1 1 1 5 5 5 6 3 4 1 2 2 4 2 2 5 3 1 4
## [667] 3 5 5 3 3 5 6 5 4 4 4 2 6 3 4 4 6 3 2 6 1 1 6 5 6 2 6 1 4 2 2 3 3 2 6 6 3
## [704] 1 4 3 4 6 5 4 2 2 1 5 1 4 2 1 5 5 5 2 3 1 3 6 4 2 1 1 1 1 5 1 4 6 3 1 6 4
## [741] 5 1 6 1 1 2 2 5 2 2 3 1 6 2 5 6 4 4 4 5 3 5 4 5 4 1 6 6 4 1 2 4 5 1 1 1 2
## [778] 6 6 1 2 4 5 3 4 5 4 5 4 4 2 5 3 6 3 6 4 5 5 4 5 3 1 2 5 3 5 4 2 1 6 5 4 4
## [815] 3 2 4 2 1 3 4 1 6 2 2 3 1 3 2 5 6 1 4 4 3 6 5 4 6 4 3 6 6 4 4 2 2 4 4 3 4
## [852] 3 3 3 1 5 1 5 1 1 4 2 4 5 6 3 1 4 5 2 2 1 2 3 5 5 4 4 1 1 2 4 1 2 5 5 1 4
## [889] 3 5 3 1 5 2 4 4 6 5 5 1 3 4 2 1 4 4 4 3 4 5 4 4 1 4 6 2 5 4 4 4 3 4 3 2 5
## [926] 3 5 2 1 6 5 2 3 6 5 5 3 2 4 4 5 2 1 5 2 5 5 2 2 5 6 6 3 6 1 2 5 4 5 4 3 6
## [963] 2 5 1 4 2 3 1 4 6 2 5 6 5 3 3 2 2 5 1 5 4 5 4 6 1 1 6 6 5 5 4 1 4 2 6 4 3
## [1000] 1 3 4 1 5 2 1 6 1 3 2 4 1 5 3 3 5 5 2 5 6 4 6 2 4 2 2 3 3 6 1 6 3 1 2 4 4
## [1037] 2 1 3 5 4 2 3 5 5 2 4 5 2 4 3 3 5 2 4 2 5 4 6 1 4 5 3 3 1 3 4 4 6 1 2 1 4
## [1074] 5 1 5 4 4 6 4 6 4 4 2 5 6 5 1 2 2 4 6 3 5 3 5 2 4 3 5 3 6 2 6 2 1 5 6 4 6
## [1111] 2 2 2 4 5 5 5 2 2 3 4 2 3 3 4 2 2 3 4 3 1 3 3 3 4 3 6 5 4 1 3 4 1 3 4 4 4
## [1148] 4 2 2 1 3 4 4 2 1 1 4 2 4 5 4 1 5 2 5 5 5 2 2 4 5 5 1 4 5 3 6 1 1 4 4 4 5
## [1185] 6 1 4 5 4 5 1 6 2 4 2 4 4 2 3 1 4 5 4 1 4 6 6 2 1 5 1 5 5 2 3 2 5 6 4 3 2
## [1222] 5 4 5 6 6 1 4 1 5 1 4 4 2 6 5 6 5 6 6 4 2 1 3 3 3 5 5 4 5 2 4 5 5 4 4 4 5
## [1259] 1 2 5 1 4 2 3 1 4 5 4 4 2 4 6 4 5 1 1 3 2 4 5 1 5 6 3 4 2 2 1 2 6 5 2 5 3
## [1296] 3 3 4 1 3 2 2 5 3 2 3 6 4 2 5 5 4 4 6 1 2 4 3 4 2 1 6 2 6 1 6 1 2 2 5 1 6
## [1333] 2 3 5 6 3 5 4 5 2 1 6 3 5 2 1 6 4 4 1 3 4 1 3 5 5 3 5 5 2 2 6 3 5 3 4 1 1
## [1370] 1 6 6 4 6 4 3 6 3 4 4 2 2 3 4 2 4 1 6 3 5 4 3 1 2 5 3 1 2 3 2 5 1 4 5 4 5
## [1407] 2 6 5 3 6 3 6 4 6 5 2 3 2 3 5 3 2 5 5 1 1 5 6 5 5 2 5 1 4 5 4 2 4 5 3 5 4
## [1444] 4 2 4 2 6 5 5 4 3 1 2 4 2 2 2 4 4 3 4 6 4 5 2 6 1 2 1 4 1 5 5 2 6 4 4 6 2
## [1481] 5 5 3 2 4 2 4 2 2 2 3 6 2 4 2 5 5 6 4 4 5 6 3 1 1 1 3 5 4 4 2 6 4 5 2 5 2
## [1518] 6 6 3 6 4 3 5 4 5 6 4 6 4 4 4 5 4 2 5 6 4 5 5 5 1 6 6 2 4 4 5 6 2 4 4 5 1
## [1555] 4 6 4 1 5 4 6 2 2 2 2 6 4 6 5 3 4 4 1 6 1 2 2 5 1 3 6 4 5 1 3 2 3 5 1 4 2
## [1592] 2 5 3 6 4 3 1 5 2 5 2 3 5 5 2 4 6 2 3 3 1 3 5 3 2 3 3 2 1 6 4 2 4 4 3 6 5
## [1629] 2 6 5 5 6 6 1 2 5 4 5 6 4 5 4 6 5 2 6 2 2 2 2 4 5 6 5 1 1 4 5 2 4 5 5 4 4
## [1666] 3 6 5 2 6 6 2 4 2 4 2 5 6 2 2 2 6 4 2 5 3 2 4 4 3 4 2 1 5 5 3 3 1 4 1 2 5
## [1703] 5 3 4 4 4 6 4 1 2 5 4 6 3 2 5 4 6 3 4 5 5 5 5 2 4 4 4 5 5 4 5 4 6 1 5 5 3
## [1740] 4 3 5 3 5 5 5 4 5 5 2 4 5 2 5 4 1 2 5 2 1 2 3 5 4 5 4 1 5 2 4 5 4 4 5 4 4
## [1777] 1 5 5 3 5 6 2 2 5 4 5 6 5 3 2 3 4 6 5 6 2 4 6 6 5 5 4 3 4 4 4 4 2 1 1 5 5
## [1814] 1 5 2 4 5 4 5 2 1 2 5 1 1 4 4 6 4 4 5 2 2 2 2 5 4 5 6 4 2 4 4 2 1 6 5 1 2
## [1851] 1 3 3 5 2 6 2 1 4 1 4 6 2 5 2 5 4 3 2 1 4 6 5 5 2 5 5 4 5 4 2 1 1 5 6 3 3
## [1888] 5 6 1 5 3 5 5 2 2 2 4 5 1 3 5 5 4 1 5 2 5 3 4 6 4 1 5 3 2 6 6 2 3 3 4 5 4
## [1925] 4 1 2 2 2 5 2 4 4 4 2 4 4 5 5 5 6 4 6 3 6 4 1 5 1 4 4 4 6 6 3 6 2 2 3 5 4
## [1962] 2 1 1 4 4 2 4 5 2 5 5 1 5 4 5 6 1 2 1 1 2 2 5 2 4 5 2 5 1 6 6 5 6 5 5 3 4
## [1999] 3 2 3 5 1 3 1 6 6 3 4 1 6 2 1 6 3 1 3 4 3 3 5 4 3 5 6 2 3 4 3 1 2 4 2 3 4
## [2036] 1 3 1 5 2 2 1 2 4 2 2 3 2 1 1 2 4 2 3 5 2 3 3 4 5 5 4 5 2 6 5 1 4 6 5 5 3
## [2073] 2 4 6 3 2 5 2 5 5 6 6 5 6 6 6 1 4 5 3 1 4 4 3 5 1 2 1 2 4 4 6 5 4 3 5 4 1
## [2110] 4 5 3 5 3 3 4 1 4 4 5 3 5 6 4 2 4 5 1 3 5 6 1 1 4 4 3 5 6 1 5 5 4 1 3 4 1
## [2147] 4 3 3 1 6 5 5 3 2 2 4 5 4 5 4 4 5 5 1 2 6 6 6 3 6 2 5 1 4 5 5 2 5 4 2 2 4
## [2184] 4 3 5 2 4 5 2 1 4 5 3 5 4 5 3 5 1 5 1 1 1 2 2 4 4 5 5 4 4 2 1 5 5 4 5 2 2
## [2221] 6 5 2 5 5 2 4 5 5 6 3 3 5 5 4 4 2 5 5 2 5 5 4 5 1 5 6 2 4 4 1 4 5 5 5 4 3
## [2258] 4 6 3 5 4 4 5 2 4 5 1 4 3 3 4 4 3 4 2 4 4 5 2 4 4 1 4 5 2 3 1 3 6 5 4 3 1
## [2295] 4 5 2 4 4 5 4 5 1 3 4 2 6 5 3 4 4 4 4 1 4 5 5 2 4 4 3 2 1 3 5 4 5 5 4 5 3
## [2332] 5 6 4 4 5 2 4 1 2 2 5 6 4 5 2 3 6 4 4 3 1 4 6 5 1 6 3 3 1 2 2 1 4 4 5 3 2
## [2369] 3 2 1 2 4 4 3 4 2 5 1 6 1 5 5 6 6 3 2 4 4 2 5 5 5 5 3 3 5 3 5 4 1 2 5 5 5
## [2406] 6 4 5 2 5 4 2 5 1 4 3 5 4 5 6 2 3 3 4 3 3 2 5 4 1 3 6 4 5 5 4 5 4 6 4 6 4
## [2443] 4 5 1 5 6 5 5 6 4 5 1 3 3 6 5 2 3 5 2 1 4 6 5 6 6 2 2 5 6 4 5 6 2 1 1 4 2
## [2480] 4 1 4 4 6 2 5 6 5 6 1 4 1 1 4 4 5 4 5 4 2 2 1 3 3 5 4 5 5 4 5 4 3 4 3 4 6
## [2517] 4 1 3 6 4 5 5 2 5 2 5 4 5 2 1 6 2 2 3 4 2 2 3 4 3 5 5 3 4 3 2 1 3 4 5 2 1
## [2554] 4 6 1 5 3 2 4 1 5 2 2 2 3 3 5 5 5 5 5 3 2 3 5 4 5 5 2 2 1 2 4 3 4 5 4 1 2
## [2591] 2 1 3 2 1 5 4 5 4 2 3 5 2 6 3 6 6 5 6 6 1 4 4 6 3 3 1 2 6 5 2 2 2 2 4 5 2
## [2628] 5 4 6 4 2 4 4 4 4 6 4 2 4 6 3 4 4 3 1 5 4 6 4 2 5 6 1 3 2 2 2 3 6 5 5 2 4
## [2665] 3 5 6 2 6 3 4 2 3 4 5 4 6 5 3 1 6 3 1 1 1 3 4 5 2 4 4 2 1 2 5 1 4 5 2 4 2
## [2702] 1 6 4 6 6 6 5 1 2 4 2 1 2 6 4 3 6 4 4 5 4 6 2 4 1 4 6 5 5 5 2 5 2 3 4 4 5
## [2739] 4 5 3 4 6 2 4 1 1 4 4 5 3 6 4 4 1 1 1 6 6 3 6 1 2 5 4 3 4 4 5 3 3 5 5 6 1
## [2776] 3 4 3 6 2 2 4 4 4 2 4 3 1 2 4 5 4 5 4 3 6 3 1 2 4 4 6 2 5 4 5 3 2 4 4 4 1
## [2813] 3 5 3 6 2 4 1 4 2 2 2 3 1 4 2 5 1 6 5 2 6 1 5 1 4 5 2 1 6 3 5 5 2 5 5 6 5
## [2850] 2 4 5 1 3 5 6 5 4 5 2 6 2 3 3 4 5 4 1 6 5 5 6 4 4 5 6 2 2 2 1 6 6 6 6 2 1
## [2887] 1 3 4 3 4 6 4 2 2 3 3 2 2 5 4 1 2 3 1 5 6 3 6 5 5 2 2 4 2 2 2 2 5 5 4 2 4
## [2924] 4 5 5 3 4 4 3 1 2 5 4 5 4 2 2 2 1 4 5 2 1 3 2 4 3 6 2 3 6 6 1 3 5 5 2 1 5
## [2961] 4 2 1 5 1 1 5 5 5 4 4 4 5 6 2 4 1 1 5 4 2 2 3 3 4 5 6 5 5 3 5 2 5 1 2 6 4
## [2998] 3 4 1 4 5 5 4 4 4 3 3 4 6 3 6 1 4 5 6 6 5 2 2 1 2 5 1 4 3 1 3 5 5 3 4 5 5
## [3035] 4 4 2 3 4 4 1 4 4 3 4 5 4 5 3 2 1 4 4 2 2 5 5 3 3 6 5 1 4 5 2 4 2 4 3 3 4
## [3072] 2 3 5 5 3 4 6 6 4 2 5 4 4 5 2 6 3 3 6 5 4 3 6 2 3 3 5 3 3 5 1 5 5 5 6 5 6
## [3109] 2 6 6 2 3 1 3 5 5 5 2 4 5 1 2 2 2 1 5 5 4 4 3 3 3 3 2 5 5 4 4 6 6 1 4 2 4
## [3146] 4 2 2 4 5 5 1 4 4 4 4 4 1 3 4 2 4 5 6 5 4 6 5 5 3 5 6 6 4 4 6 3 4 6 6 3 3
## [3183] 5 2 6 4 1 4 6 6 5 2 4 3 6 4 3 2 4 5 4 2 3 4 6 4 2 4 4 4 6 2 1 6 6 2 2 4 4
## [3220] 1 4 5 3 5 4 3 2 3 3 1 5 3 1 5 4 2 1 4 5 6 6 1 5 1 5 4 6 3 5 4 2 1 2 1 3 3
## [3257] 3 5 2 5 3 6 3 6 3 5 4 1 3 5 2 2 2 5 4 5 2 6 1 5 2 2 5 4 4 4 6 2 5 1 6 5 2
## [3294] 2 1 6 4 3 1 1 4 5 4 4 2 2 4 4 2 2 5 4 1 6 5 5 5 4 4 1 3 4 1 4 2 2 4 2 3 5
## [3331] 5 2 3 5 2 3 5 2 3 4 6 4 6 4 3 3 6 4 1 2 5 3 4 1 2 2 1 2 5 3 5 5 1 1 4 2 4
## [3368] 1 1 3 2 2 3 1 1 2 2 6 3 2 5 3 2 1 5 2 2 4 3 6 6 1 4 2 1 5 5 1 4 2 6 5 5 5
## [3405] 4 4 3 4 5 5 6 3 5 2 4 4 6 4 4 5 3 2 4 6 5 3 6 4 2 2 1 2 5 3 2 4 5 3 1 3 5
## [3442] 6 1 1 3 3 1 4 2 5 6 1 5 2 5 6 4 4 6 2 2 6 6 1 5 4 5 3 1 1 6 1 1 4 1 5 4 2
## [3479] 5 5 3 5 2 1 5 5 2 6 3 3 5 5 5 6 1 6 4 5 3 1 1 2 2 5 3 6 3 4 4 2 1 3 1 3 4
## [3516] 1 4 5 5 5 4 3 6 1 6 2 5 5 1 4 4 6 4 1 4 2 2 4 2 5 4 1 4 2 5 5 5 3 2 3 2 5
## [3553] 5 2 4 3 5 2 2 5 1 4 1 5 3 2 3 2 5 5 3 1 3 4 1 3 4 4 4 2 5 4 5 1 2 1 2 5 5
## [3590] 1 2 4 2 3 2 4 4 4 3 5 5 4 4 4 6 4 2 2 3 5 1 5 1 3 2 5 4 3 4 4 2 4 1 4 4 6
## [3627] 2 2 2 5 1 6 2 1 2 5 2 5 2 2 4 4 6 4 5 5 4 2 3 3 3 4 3 3 4 2 5 5 3 2 4 3 5
## [3664] 2 5 2 2 6 5 4 5 6 4 2 5 6 2 4 3 2 4 5 3 4 3 2 4 3 6 4 2 3 4 2 3 3 1 1 5 5
## [3701] 5 4 6 4 4 6 2 4 3 4 1 1 6 3 3 4 4 5 5 2 1 3 1 2 5 5 1 3 5 5 5 2 4 1 3 5 3
## [3738] 1 3 1 4 1 4 3 4 6 3 1 2 3 2 4 2 1 2 4 1 4 1 3 3 6 3 6 2 5 5 5 2 4 2 2 4 4
## [3775] 1 6 3 5 3 5 2 4 5 5 2 1 3 5 3 5 4 5 1 1 2 4 6 5 6 6 5 5 2 4 2 5 6 1 4 6 3
## [3812] 5 2 6 1 6 5 5 3 1 6 6 6 4 2 3 5 4 2 1 6 1 6 5 5 2 3 2 6 5 2 4 6 3 4 4 5 2
## [3849] 1 6 2 3 2 5 5 6 1 1 1 6 6 2 6 4 3 4 4 4 5 1 3 5 4 2 4 6 3 3 3 4 2 4 6 2 4
## [3886] 5 4 4 3 5 4 3 2 1 3 4 6 5 4 3 5 4 5 4 3 4 6 5 4 5 4 1 4 4 4 3 3 6 5 3 1 1
## [3923] 5 5 3 1 4 2 4 6 2 2 5 5 2 5 3 3 4 4 5 3 1 4 2 5 1 2 5 5 1 2 5 2 1 5 4 4 5
## [3960] 4 1 4 3 2 5 2 1 4 1 4 2 4 3 3 2 5 4 3 3 6 2 1 3 1 6 5 2 4 5 4 5 3 1 5 6 5
## [3997] 4 6 2 6 1 3 2 4 4 4 2 4 2 4 1 2 3 2 2 2 3 4 5 2 2 5 2 5 4 1 4 2 4 4 5 5 5
## [4034] 2 4 5 4 5 5 3 4 5 5 1 1 4 4 5 5 3 3 1 5 5 2 4 4 5 3 2 3 1 1 3 1 4 4 3 2 5
## [4071] 4 5 5 5 5 4 6 1 2 4 1 2 4 6 5 4 4 2 4 4 3 3 5 3 4 4 1 3 2 5 2 1 4 3 2 2 2
## [4108] 5 1 3 3 2 3 6 6 2 5 2 3 3 5 5 4 4 1 5 5 2 2 3 4 3 5 4 4 5 2 4 5 2 5 1 3 1
## [4145] 1 5 3 4 5 4 5 6 4 2 3 6 4 2 4 5 2 2 4 3 4 2 5 2 3 2 4 5 2 5 4 1 6 2 6 2 2
## [4182] 5 2 4 5 6 5 1 4 3 5 2 3 5 4 4 3 5 5 3 1 5 1 5 2 2 5 1 2 4 4 5 6 6 5 5 2 3
## [4219] 5 3 3 1 1 5 4 4 1 4 5 5 5 1 6 1 4 5 5 1 6 5 5 5 2 1 3 6 5 5 5 2 5 1 5 3 5
## [4256] 1 6 5 5 4 2 3 4 4 4 5 4 2 6 5 2 5 2 2 6 2 2 4 5 2 6 1 5 5 5 3 6 1 2 4 3 1
## [4293] 3 5 1 3 3 1 1 5 1 4 2 3 5 3 5 4 3 5 6 4
##
## Within cluster sum of squares by cluster:
## [1] 394.0904 639.2132 417.7872 418.7616 422.2724 255.2868
## (between_SS / total_SS = 80.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
fviz_cluster(final_cluster, data = cdf)
Now that we have our 6 clusters, we should extract these clusters into our initial data and provide the means of these clusters.
<- final_cluster$cluster
cluster
<- cbind(segment, cluster) %>%
consumer_cluster group_by(cluster) %>%
summarise_all("mean")
print(consumer_cluster)
## # A tibble: 6 x 10
## cluster customer_id segment rfm_score transaction_count recency_days amount
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 15275. NA 190. 45.4 166. 1043.
## 2 2 15488. NA 422. 51.1 30.3 910.
## 3 3 15350. NA 389. 17.1 34.2 291.
## 4 4 15319. NA 512. 253. 12.9 5917.
## 5 5 15432. NA 152. 15.2 214. 241.
## 6 6 15135. NA 300. 147. 81.7 2844.
## # ... with 3 more variables: recency_score <dbl>, frequency_score <dbl>,
## # monetary_score <dbl>
From these means, we can categorize our 6 clusters.
<- clvdata(df, date.format = "ymd", time.unit = "day", name.id = "Customer ID",
clv_df name.date = "InvoiceDateDay", name.price = "Revenue")
summary(clv_df)
## CLV Transaction Data
##
## Time unit Days
## Estimation length 373.0000 Days
## Holdout length -
##
## Transaction Data Summary
## Estimation Holdout Total
## Number of customers - - 4312
## First Transaction in period 2009-12-01 - 2009-12-01
## Last Transaction in period 2010-12-09 - 2010-12-09
## Total # Transactions 16999 - 16999
## Mean # Transactions per cust 3.942 - 3.942
## (SD) 6.020 - 6.020
## Mean Spending per Transaction 519.560 - 519.560
## (SD) 1121.623 - 1121.623
## Total Spending 8832003.274 - 8832003.274
## Total # zero repeaters 1494 - -
## Percentage of zero repeaters 34.647 - -
## Mean Interpurchase time 77.368 - 77.368
## (SD) 63.105 - 63.105
<- gg(clv_df)
est_clv predict(est_clv)
Here we have the predicted value of each customer’s spending. We can further analyze or even cluster these results. Another method that would approximate CLV is calculating the average revenue per using (ARPU).
Let’s filter out all the data that is after January 1st, 2010 to focus on the customers within the year 2009-2010.
<- df %>%
df09 filter(df$InvoiceDateDay < "2010-01-01")
In order to predict which customers from this period will be most likely to be repeat customers in 2010-2011, we could conduct an RFM analysis in order to see which customers are the most frequent shoppers.
<- as_date("2009-12-31")
analysis_date2
<- rfm_table_order(df09, `Customer ID`, InvoiceDateDay, Revenue, analysis_date2)
rfm_result2
rfm_result2
<- rfm_segment(rfm_result2, segment_names, recency_lower, recency_upper,
segment2 frequency_lower, frequency_upper, monetary_lower, monetary_upper)
= c("customer_id", "segment")
shopper_cols
<- segment2 %>%
frequent_shoppers select(all_of(shopper_cols)) %>%
filter(segment == "Champions" | segment == "Loyal Customers")
frequent_shoppers
Here is the list of 2009-2010 customers that we would expect to be repeat customers in 2010-2011. We could also add the list of customers who we labelled as “Potential Loyalists.”
In order to predict future purchases, we could model the probability of buying a product based on the RFM scores of each unique customer. One method would be using a logit model, where we could create a cutoff at probabilities above or below 0.5 for customers that WILL BUY and customers that WILL NOT BUY in the future.
Another technique for this method could be to use machine learning. I could split the data into a 70% to create the model and 30% to verify the accuracy.
If I created a dummy variable where 1 represented bought and 0 represented not bought, the following code would look something like this
inTrain = createDataPartition(y = segment2$transaction_count, p = 0.7, list = F) trainingSet = segment2[inTrain,] testSet = segment2[-inTrain,]
mod = train( transaction_count ~ ., data = trainingSet, method = “glm”, family = “binomial”
To get the predictions, I would run the following:
ptest = predict(mod, newdata = testSet) ptrain = predict(mod, newdata = trainingSet)
And to get the accuracy:
mean(ptrain == trainingSet\(y) mean(ptest == TestSet\)y)
In order to determine if the 10 percent discount on sales will result in greater or lower total revenue, we would need to predict the following:
If the monetary value from this offsets the 10 percent discount on sales, then the company could potentially see greater revenue. This is, of course, assuming other factors remain the same, such as other competing firms do not change their prices, too.
rfm_fm_plot(rfm_result2)
= lm(transaction_count ~ frequency_score + monetary_score + amount, data = segment2)
est_rfm
summary(est_rfm)
##
## Call:
## lm(formula = transaction_count ~ frequency_score + monetary_score +
## amount, data = segment2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -189.25 -12.34 -3.26 7.23 456.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -26.598059 2.530994 -10.509 < 2e-16 ***
## frequency_score 17.141026 0.798433 21.468 < 2e-16 ***
## monetary_score 1.626595 0.836508 1.945 0.0521 .
## amount 0.004585 0.000556 8.247 5.38e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29.7 on 951 degrees of freedom
## Multiple R-squared: 0.4917, Adjusted R-squared: 0.4901
## F-statistic: 306.7 on 3 and 951 DF, p-value: < 2.2e-16
This is just the initial model but I could create and incorporate dummy variables for the country of the consumer and see if the adjusted R squared results in a better explanation of the phenomenon.
Academy, R. (2019). RFM Analysis in R — R-bloggers. Retrieved 2022, from https://www.r-bloggers.com/2019/02/rfm-analysis-in-r/
Chen, D., Guo, K., & Ubakanma, G. (2015). Predicting customer profitability over time based on RFM time series. International Journal of Business Forecasting and Marketing Intelligence, 2(1), 1.
Chen, D., Sain, S. L., & Guo, K. (2012). Data mining for the online retail industry: A case study of RFM model-based customer segmentation using data mining. Journal of Database Marketing & Customer Strategy Management, 19(3), 197–208.
G, D. (N.d.). RFM Customer Segmentation with K Means. RPubs. Retrieved 2022, from https://rpubs.com/DessiG/671942
Zaldivar, E. (2020). Tutorial: RFM Analysis Using R. RPubs. Retrieved 2022, from https://rpubs.com/Eddie_Zaldivar/705462