We can use str()
to look at the format of each column and see a sample of data.
#### Examine transaction data
# str() shows each column's class and a sample of its first values.
str(transactionData)
## Classes 'data.table' and 'data.frame': 12043 obs. of 23 variables:
## $ status : chr "authorized" "authorized" "authorized" "authorized" ...
## $ card_present_flag: int 1 0 1 1 1 NA 1 1 1 NA ...
## $ bpay_biller_code : chr "" "" "" "" ...
## $ account : chr "ACC-1598451071" "ACC-1598451071" "ACC-1222300524" "ACC-1037050564" ...
## $ currency : chr "AUD" "AUD" "AUD" "AUD" ...
## $ long_lat : chr "153.41 -27.95" "153.41 -27.95" "151.23 -33.94" "153.10 -27.66" ...
## $ txn_description : chr "POS" "SALES-POS" "POS" "SALES-POS" ...
## $ merchant_id : chr "81c48296-73be-44a7-befa-d053f48ce7cd" "830a451c-316e-4a6a-bf25-e37caedca49e" "835c231d-8cdf-4e96-859d-e9d571760cf0" "48514682-c78a-4a88-b0da-2d6302e64673" ...
## $ merchant_code : int NA NA NA NA NA NA NA NA NA NA ...
## $ first_name : chr "Diana" "Diana" "Michael" "Rhonda" ...
## $ balance : num 35.39 21.2 5.71 2117.22 17.95 ...
## $ date : chr "8/1/2018" "8/1/2018" "8/1/2018" "8/1/2018" ...
## $ gender : chr "F" "F" "M" "F" ...
## $ age : int 26 26 38 40 26 20 43 43 27 40 ...
## $ merchant_suburb : chr "Ashmore" "Sydney" "Sydney" "Buderim" ...
## $ merchant_state : chr "QLD" "NSW" "NSW" "QLD" ...
## $ extraction : POSIXct, format: "2018-08-01 01:01:15" "2018-08-01 01:13:45" ...
## $ amount : num 16.25 14.19 6.42 40.9 3.25 ...
## $ transaction_id : chr "a623070bfead4541a6b0fff8a09e706c" "13270a2a902145da9db4c951e04b51b9" "feb79e7ecd7048a5a36ec889d1a94270" "2698170da3704fd981b15e64a006079e" ...
## $ country : chr "Australia" "Australia" "Australia" "Australia" ...
## $ customer_id : chr "CUS-2487424745" "CUS-2487424745" "CUS-2142601169" "CUS-1614226872" ...
## $ merchant_long_lat: chr "153.38 -27.99" "151.21 -33.87" "151.21 -33.87" "153.05 -26.68" ...
## $ movement : chr "debit" "debit" "debit" "debit" ...
## - attr(*, ".internal.selfref")=<externalptr>
Next, we need to see if there is any null or blank cell in each variable.
# Count both NA and blank ("") cells in every column except extraction
# (extraction is POSIXct and is handled separately below).
# vapply replaces sapply for a type-stable, always-integer result.
vapply(transactionData[,-"extraction"], function(x) sum(is.na(x)|x == ""), integer(1))
## status card_present_flag bpay_biller_code account
## 0 4326 11158 0
## currency long_lat txn_description merchant_id
## 0 0 0 4326
## merchant_code first_name balance date
## 11160 0 0 0
## gender age merchant_suburb merchant_state
## 0 0 4326 4326
## amount transaction_id country customer_id
## 0 0 0 0
## merchant_long_lat movement
## 4326 0
# Count missing values in extraction. A POSIXct column can contain NA but
# never the empty string, so is.na() alone suffices; the original
# element-wise sapply over as.numeric(extraction) was redundant.
sum(is.na(transactionData$extraction))
## [1] 0
There are 4326 missing values in card_present_flag
column, 11158 missing values in bpay_biller_code
column, 4326 missing values in merchant_id
column, 11160 missing values in merchant_code
column, 4326 missing values in merchant_suburb
column, 4326 missing values in merchant_state
column, and 4326 missing values in merchant_long_lat
column.
#### Convert the categorical columns to factors and parse the date column.
# status, gender, merchant_state and movement are all low-cardinality
# categories, so one loop handles the four factor conversions.
categorical_cols <- c("status", "gender", "merchant_state", "movement")
for (col in categorical_cols) {
  transactionData[[col]] <- as.factor(transactionData[[col]])
}
# date arrives as "m/d/Y" character strings.
transactionData$date <- as.Date(transactionData$date, format = "%m/%d/%Y")
str(transactionData)
## Classes 'data.table' and 'data.frame': 12043 obs. of 23 variables:
## $ status : Factor w/ 2 levels "authorized","posted": 1 1 1 1 1 2 1 1 1 2 ...
## $ card_present_flag: int 1 0 1 1 1 NA 1 1 1 NA ...
## $ bpay_biller_code : chr "" "" "" "" ...
## $ account : chr "ACC-1598451071" "ACC-1598451071" "ACC-1222300524" "ACC-1037050564" ...
## $ currency : chr "AUD" "AUD" "AUD" "AUD" ...
## $ long_lat : chr "153.41 -27.95" "153.41 -27.95" "151.23 -33.94" "153.10 -27.66" ...
## $ txn_description : chr "POS" "SALES-POS" "POS" "SALES-POS" ...
## $ merchant_id : chr "81c48296-73be-44a7-befa-d053f48ce7cd" "830a451c-316e-4a6a-bf25-e37caedca49e" "835c231d-8cdf-4e96-859d-e9d571760cf0" "48514682-c78a-4a88-b0da-2d6302e64673" ...
## $ merchant_code : int NA NA NA NA NA NA NA NA NA NA ...
## $ first_name : chr "Diana" "Diana" "Michael" "Rhonda" ...
## $ balance : num 35.39 21.2 5.71 2117.22 17.95 ...
## $ date : Date, format: "2018-08-01" "2018-08-01" ...
## $ gender : Factor w/ 2 levels "F","M": 1 1 2 1 1 2 1 1 1 2 ...
## $ age : int 26 26 38 40 26 20 43 43 27 40 ...
## $ merchant_suburb : chr "Ashmore" "Sydney" "Sydney" "Buderim" ...
## $ merchant_state : Factor w/ 9 levels "","ACT","NSW",..: 5 3 3 5 5 1 8 8 9 1 ...
## $ extraction : POSIXct, format: "2018-08-01 01:01:15" "2018-08-01 01:13:45" ...
## $ amount : num 16.25 14.19 6.42 40.9 3.25 ...
## $ transaction_id : chr "a623070bfead4541a6b0fff8a09e706c" "13270a2a902145da9db4c951e04b51b9" "feb79e7ecd7048a5a36ec889d1a94270" "2698170da3704fd981b15e64a006079e" ...
## $ country : chr "Australia" "Australia" "Australia" "Australia" ...
## $ customer_id : chr "CUS-2487424745" "CUS-2487424745" "CUS-2142601169" "CUS-1614226872" ...
## $ merchant_long_lat: chr "153.38 -27.99" "151.21 -33.87" "151.21 -33.87" "153.05 -26.68" ...
## $ movement : Factor w/ 2 levels "credit","debit": 2 2 2 2 2 2 2 2 2 2 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Show the share of missing (NA or blank) values as a percentage for the
# selected non-character variables; vapply keeps the result type-stable.
vapply(transactionData[,c("merchant_code","balance","gender","age","merchant_state","amount","movement","long_lat")],
       function(x) sum(is.na(x)|x == "")/length(x)*100, numeric(1))
## merchant_code balance gender age merchant_state
## 92.66794 0.00000 0.00000 0.00000 35.92128
## amount movement long_lat
## 0.00000 0.00000 0.00000
We found that 92.67% of the values in the merchant_code
column and 35.92% of the values in the merchant_state
column are missing. We’ll leave the missing values in merchant_code
and merchant_state
for now, and if necessary, we’ll try to fill them in for future analysis.
#### Find the missing date
# Build the full calendar between the first and last transaction date,
# then keep only the dates that never appear in the data.
date_span <- range(transactionData$date)
DateRange <- seq(date_span[1], date_span[2], by = 1)
DateRange[!DateRange %in% transactionData$date]
## [1] "2018-08-16"
We found one date was not in the dataset. We would go back to this, if this date is important for the analysis.
#### Create month variable
# month.abb maps the month number directly to its English abbreviation,
# replacing the nested ifelse and generalizing beyond Aug/Sep/Oct (the
# original labeled any month other than 8 or 9 as "Oct"; this data only
# spans Aug-Oct, so results are identical here).
transactionData$month <- month.abb[month(transactionData$date)]
table(transactionData$month)
##
## Aug Oct Sep
## 3943 4087 4013
# Sanity check: the month labels cover all 12043 transactions.
sum(table(transactionData$month))
## [1] 12043
We created new variable month
for comparing the transactions on those three months.
#### Find the outliers in amount column
with(transactionData, boxplot(amount))
#### Check the stats of the normal amount data
# stats holds the lower whisker, hinges, median and upper whisker (110).
with(transactionData, boxplot.stats(amount)$stats)
## [1] 0.100 16.000 29.000 53.655 110.000
#### Check the proportion of the outliers
# mean() of a logical vector is the idiomatic form of sum(cond)/length(x);
# 110 is the upper whisker reported by boxplot.stats above.
mean(transactionData$amount > 110)
## [1] 0.153118
#### Check whether the outlier amounts belong to specific customers
with(transactionData, table(customer_id[amount > 110]))
##
## CUS-1005756958 CUS-1117979751 CUS-1140341822 CUS-1147642491 CUS-1196156254
## 29 16 12 31 19
## CUS-1220154422 CUS-1233833708 CUS-1271030853 CUS-127297539 CUS-134193016
## 25 21 24 21 11
## CUS-134833760 CUS-1388323263 CUS-1433879684 CUS-1462656821 CUS-1478398256
## 18 22 11 24 20
## CUS-1499065773 CUS-1604596597 CUS-1609060617 CUS-1614226872 CUS-1617121891
## 16 8 21 20 23
## CUS-164374203 CUS-1646183815 CUS-1646621553 CUS-1654129794 CUS-1669695324
## 26 18 21 19 24
## CUS-1739931018 CUS-1790886359 CUS-1816693151 CUS-1842679196 CUS-1892177589
## 5 18 16 18 20
## CUS-1896554896 CUS-1928710999 CUS-2031327464 CUS-2059096722 CUS-2083971310
## 12 14 27 20 13
## CUS-2110742437 CUS-2142601169 CUS-2155701614 CUS-2178051368 CUS-2206365095
## 5 19 25 7 8
## CUS-2283904812 CUS-2317998716 CUS-2348881191 CUS-2370108457 CUS-2376382098
## 8 7 10 16 17
## CUS-2484453271 CUS-2487424745 CUS-2500783281 CUS-2505971401 CUS-2599279756
## 10 22 27 22 16
## CUS-261674136 CUS-2630892467 CUS-2650223890 CUS-2663907001 CUS-2688605418
## 14 14 24 22 23
## CUS-2695611575 CUS-2738291516 CUS-2819545904 CUS-2977593493 CUS-3026014945
## 23 22 26 15 21
## CUS-3117610635 CUS-3129499595 CUS-3142625864 CUS-3151318058 CUS-3174332735
## 23 30 27 14 16
## CUS-3180318393 CUS-3201519139 CUS-3249305314 CUS-325142416 CUS-3255104878
## 18 23 28 18 18
## CUS-326006476 CUS-331942311 CUS-3325710106 CUS-3336454548 CUS-3378712515
## 18 11 21 13 21
## CUS-3395687666 CUS-3431016847 CUS-3462882033 CUS-3702001629 CUS-3716701010
## 15 13 13 17 25
## CUS-3904958894 CUS-3989008654 CUS-4023861240 CUS-4123612273 CUS-4142663097
## 20 12 19 32 32
## CUS-423725039 CUS-443776336 CUS-495599312 CUS-497688347 CUS-511326734
## 8 16 21 18 18
## CUS-51506836 CUS-527400765 CUS-537508723 CUS-55310383 CUS-586638664
## 12 24 17 25 15
## CUS-72755508 CUS-809013380 CUS-860700529 CUS-880898248 CUS-883482547
## 16 21 19 10 21
# Number of distinct customers among the >110-dollar transactions ...
uniqueN(transactionData$customer_id[transactionData$amount > 110])
## [1] 100
# ... equals the total number of distinct customers, so every customer has
# at least one large transaction and they cannot be dropped as outliers.
uniqueN(transactionData$customer_id)
## [1] 100
From the plot above, we can see that over 15% of transactions exceeded 110 dollars. However, we can’t remove those transactions as outliers, because every customer has multiple transactions over 110 dollars.
# Histogram of transaction amounts with a fitted normal curve overlaid.
# x is reused by the summary statistics below, so its name is kept.
x <- transactionData$amount
h <- hist(x, breaks = 20, col = "grey",
          xlab = "Transaction amount (AUD$)",
          ylab = "The number of transactions")
# Evaluate the normal density on a 40-point grid over the data range,
# then rescale it from a density to expected bin counts.
grid_pts <- seq(min(x), max(x), length = 40)
normal_density <- dnorm(grid_pts, mean = mean(x), sd = sd(x))
scaled_counts <- normal_density * diff(h$mids[1:2]) * length(x)
lines(grid_pts, scaled_counts, col = "orange", lwd = 2)
# Main statistics
# The mean sits far above the median, confirming the strong right skew
# visible in the histogram.
mean(x)
## [1] 187.9336
median(x)
## [1] 29
max(x)
## [1] 8835.98
The distribution of amount
is positively skewed. Half of the amounts are less than 29 dollars; however, the mean of amount
is 187.93 dollars, which indicates that the upper half of the distribution contains much larger values.
# Helper: number of unique values of column `col` restricted to month `m`.
# Removes the subsetting expression that was copy-pasted for Aug/Sep/Oct.
n_unique_in_month <- function(col, m) {
  uniqueN(transactionData[[col]][transactionData$month == m])
}
### The average number of transaction per customer on Aug 2018
total_txn_n_Aug <- n_unique_in_month("transaction_id", "Aug")
total_cust_n_Aug <- n_unique_in_month("customer_id", "Aug")
(avg_txn_n_per_customer_Aug <- total_txn_n_Aug/total_cust_n_Aug)
## [1] 39.43
### The average number of transaction per customer on Sep 2018
total_txn_n_Sep <- n_unique_in_month("transaction_id", "Sep")
total_cust_n_Sep <- n_unique_in_month("customer_id", "Sep")
(avg_txn_n_per_customer_Sep <- total_txn_n_Sep/total_cust_n_Sep)
## [1] 40.13
### The average number of transaction per customer on Oct 2018
total_txn_n_Oct <- n_unique_in_month("transaction_id", "Oct")
total_cust_n_Oct <- n_unique_in_month("customer_id", "Oct")
(avg_txn_n_per_customer_Oct <- total_txn_n_Oct/total_cust_n_Oct)
## [1] 40.87
On average, every customer had about 40 transactions in each month. Now, let’s see whether we can find any interesting behaviours for different lengths of time.
#### Split the time out of the extraction column into the time column
# substr on the printed "YYYY-MM-DD HH:MM:SS" form keeps the HH:MM:SS part.
transactionData$time <- substr(transactionData$extraction,12,19)
#### Derive the day-of-week name from the date
# NOTE(review): weekdays() output is locale-dependent; this analysis
# assumes an English locale ("Monday", ...).
transactionData$day <- weekdays(transactionData$date)
total_txn_n <- uniqueN(transactionData$transaction_id)
total_day_n <- uniqueN(transactionData$date)
avg_txn_n_per_day <- total_txn_n/total_day_n
# Unique transactions per date; summarise() replaces the superseded
# summarise_at()/vars() combination and yields the same (date, n) columns.
txn_n_df <- transactionData %>% group_by(date) %>% summarise(n = uniqueN(transaction_id))
txn_n_df$day <- weekdays(txn_n_df$date)
head(txn_n_df)
#### Plot the transaction volume over the course of an average day
# Normalize by the actual number of customers instead of a hard-coded 100
# (uniqueN(customer_id) was shown above to equal 100, so output is unchanged).
n_customers <- uniqueN(transactionData$customer_id)
barplot(txn_n_df$n/n_customers, names.arg = txn_n_df$date,
        cex.names = 0.6,
        xlab = "Date",
        ylab = "The Average Number of Transactions",
        main = "The Average Number of Daily Transactions per Customer")
We can see a clear pattern in the plot above. Peaks and troughs frequently appeared in the plot. Let’s take a closer look.
# Average transaction count per weekday.
txn_n_df1 <- txn_n_df %>% group_by(day) %>% summarise(n = mean(n))
# Reorder Monday..Sunday by name: match() replaces the fragile magic row
# indices c(2,6,7,5,1,3,4), which silently depended on the alphabetical
# order of English weekday names.
day_order <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
               "Saturday", "Sunday")
txn_n_df1.0 <- txn_n_df1[match(day_order, txn_n_df1$day), ]
#### Plot the transaction volume over days
# Divide by the number of customers (100) rather than hard-coding it.
barplot(txn_n_df1.0$n/uniqueN(transactionData$customer_id), names.arg = txn_n_df1.0$day,
        cex.names = 0.6,
        xlab = "Days",
        ylab = "The Average Number of Transactions",
        main = "The Average Number of Daily Transactions per Customer")
It seems like customers tend to have more transactions between Wednesday and Friday. How about the number of transactions within a day?
# Hour of day as an integer, taken from the HH:MM:SS time column.
transactionData$hour <- as.integer(substr(transactionData$time,1,2))
# Unique transactions per hour; summarise() replaces superseded summarise_at().
txn_n_df2 <- transactionData %>% group_by(hour) %>% summarise(n = uniqueN(transaction_id))
#### Plot the transaction volume over the course of an average day
# Normalize by the number of customers and the number of observed days,
# replacing the hard-coded 100 with uniqueN(customer_id).
barplot(txn_n_df2$n/(uniqueN(transactionData$customer_id)*uniqueN(transactionData$date)), names.arg = txn_n_df2$hour,
        cex.names = 0.6,
        xlab = "Hours",
        ylab = "The Number of Transactions",
        main = "The Number of Hourly Transactions per Customer")
From the plot above, we can see that most customers purchase something around 9:00 am, 11:00 am, and 5:00 pm. Not surprisingly, those times are meal times.
After exploring some insights of the number of transactions, let’s have a look at the transaction spending for different lengths of time.
# Total spending per date; summarise() replaces the superseded summarise_at().
spending_df <- transactionData %>% group_by(date) %>% summarise(spending = sum(amount))
# Join the per-date transaction counts (txn_n_df) to compute average
# spending per transaction for each date.
avg_spending_df <- merge(spending_df,txn_n_df)
avg_spending_df$avg_spending <- avg_spending_df$spending/avg_spending_df$n
head(avg_spending_df)
#### Plot the average daily spending per customer
# Normalize by the number of customers instead of the hard-coded 100.
barplot(avg_spending_df$avg_spending/uniqueN(transactionData$customer_id), names.arg = avg_spending_df$date,
        cex.names = 0.6,
        xlab = "Date",
        ylab = "Average Spending (AUD$)",
        main = "The Average Daily Spending per Customer")
The pattern of average daily spending per customer is similar to that in the average number of daily transactions per customer. Peaks and troughs occurred periodically. Now, let’s have a closer look.
# Total spending per weekday; summarise() replaces superseded summarise_at().
spending_df1 <- transactionData %>% group_by(day) %>% summarise(spending = sum(amount))
# BUG FIX: the original merged with txn_n_df2, which is keyed by *hour*.
# With no column in common, merge() produced a 7x24 cross join, so the
# per-weekday averages subset and plotted below came from a garbled frame.
# Merge with per-weekday transaction totals (derived from txn_n_df) instead.
txn_n_by_day <- txn_n_df %>% group_by(day) %>% summarise(n = sum(n))
avg_spending_df1 <- merge(spending_df1,txn_n_by_day)
avg_spending_df1$avg_spending <- avg_spending_df1$spending/avg_spending_df1$n
avg_spending_df1$avg_spending_per_customer <- avg_spending_df1$avg_spending/100
head(avg_spending_df1)
# Reorder Monday..Sunday by name rather than by magic row indices.
avg_spending_df1.0 <- avg_spending_df1[match(c("Monday", "Tuesday", "Wednesday",
                                               "Thursday", "Friday", "Saturday",
                                               "Sunday"), avg_spending_df1$day), ]
#### Plot the average spending per customer over weekdays
barplot(avg_spending_df1.0$avg_spending_per_customer, names.arg = avg_spending_df1.0$day,
        cex.names = 0.6,
        xlab = "Days",
        ylab = "Average Spending (AUD$)",
        main = "The Average Daily Spending per Customer")
It seems like customers tend to spend more on Monday and Friday and spend the least on weekends. Next, we take a look at the hourly transaction spending.
# Mean spending per transaction in each hour; summarise() replaces the
# superseded summarise_at()/vars() combination.
avg_spending_df2 <- transactionData %>% group_by(hour) %>% summarise(avg_spending = mean(amount))
# NOTE(review): dividing a per-transaction mean by the 100 customers is a
# questionable "per customer" normalization; kept as-is to preserve the
# reported figures — confirm the intended metric.
avg_spending_df2$avg_spending_per_customer <- avg_spending_df2$avg_spending/100
#### Plot the average spending over the course of an average day
barplot(avg_spending_df2$avg_spending_per_customer, names.arg = avg_spending_df2$hour,
        cex.names = 0.6,
        xlab = "Hours",
        ylab = "The Average Spending (AUD$)",
        main = "The Average Hourly Spending per Customer")
Customers tend to spend the most at 1:00 pm and 5:00 pm.
Let’s have a look where the customers are from based on the information in long_lat
column.
# One row per account, carrying the "long lat" string from that account's
# first transaction.
account_loc_inf <- transactionData[match(unique(transactionData$account),transactionData$account), c("account","long_lat")]
# Split "153.41 -27.95" on the space instead of fixed substr() positions;
# fixed positions silently truncate any coordinate whose longitude is not
# exactly 6 characters wide.
coord_parts <- strsplit(account_loc_inf$long_lat, " ")
account_loc_inf$long <- as.numeric(vapply(coord_parts, `[`, character(1), 1))
account_loc_inf$lat <- as.numeric(vapply(coord_parts, `[`, character(1), 2))
head(account_loc_inf)
# Assign `state_name` to every row of loc_inf whose (long, lat) falls inside
# the bounding box `coords` — a one-column data frame holding xmin, ymin,
# xmax, ymax — while rows outside the box keep their existing `states` value.
# Returns the updated states vector; the caller assigns it back.
# (The original named its first parameter `coords_state`, shadowing the
# function itself, and returned the value of an internal assignment; all
# call sites are positional, so renaming the parameter is safe.)
coords_state <- function(coords, loc_inf, state_name){
  bbox <- coords[[1]]  # c(xmin, ymin, xmax, ymax)
  inside <- loc_inf$long >= bbox[1] & loc_inf$long <= bbox[3] &
    loc_inf$lat >= bbox[2] & loc_inf$lat <= bbox[4]
  ifelse(inside, state_name, loc_inf$states)
}
# Bounding boxes (xmin, ymin, xmax, ymax) for each Australian state and
# territory, stored as one-column data frames with coordinate row names so
# coords_state() can read the values via [[1]].
coords_NSW <- as.data.frame(c(140.9993, -37.50503, 153.6299, -28.15703 ),c("xmin","ymin","xmax","ymax"))
coords_VIC <- as.data.frame(c(140.9617, -39.13396, 149.9763, -33.99605 ),c("xmin","ymin","xmax","ymax"))
coords_QLD <- as.data.frame(c(137.9943, -29.17719, 153.5522, -9.229287 ),c("xmin","ymin","xmax","ymax"))
coords_SA <- as.data.frame(c(129.0013, -38.06, 141.003, -25.99638 ),c("xmin","ymin","xmax","ymax"))
coords_WA <- as.data.frame(c(112.9211, -35.12228, 129.0019, -13.74142 ),c("xmin","ymin","xmax","ymax"))
coords_TAS <- as.data.frame(c(143.8353, -43.63203, 148.4472, -39.45196 ),c("xmin","ymin","xmax","ymax"))
coords_NT <- as.data.frame(c(129.0005, -25.99862, 137.9991, -10.97392 ),c("xmin","ymin","xmax","ymax"))
# BUG FIX: the ACT ymin was -148.7628 (the xmin value negated), which made
# the southern bound vacuous and let the final coords_state() pass relabel
# southern NSW accounts as ACT. -35.92053 is the ACT bounding box's
# southern edge.
coords_ACT <- as.data.frame(c(148.7628, -35.92053, 149.3972, -35.12442 ),c("xmin","ymin","xmax","ymax"))
# Initialise every account's state as unknown, then test each bounding box
# in turn. The NSW..ACT order matches the original cascade, so where boxes
# overlap the later state wins exactly as before.
account_loc_inf$states <- NA
state_boxes <- list(NSW = coords_NSW, VIC = coords_VIC, QLD = coords_QLD,
                    SA = coords_SA, WA = coords_WA, TAS = coords_TAS,
                    NT = coords_NT, ACT = coords_ACT)
for (state in names(state_boxes)) {
  account_loc_inf$states <- coords_state(state_boxes[[state]], account_loc_inf, state)
}
head(account_loc_inf)
#### Create a dataframe with the number of customers in each state
state_freq_df <- as.data.frame(table(account_loc_inf$states))
#### Make a bar plot
# Fixed the typo in the plot title ("Cusotmers" -> "Customers").
barplot(state_freq_df$Freq, names.arg = state_freq_df$Var1,
        cex.names = 0.6,
        xlab = "States", ylab = "The number of customers",
        main = "The Number of Customers in Each State"
)
As shown in the plot above, most customers are from the states of Victoria and New South Wales.