Load required libraries and datasets
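
The code for this step is not shown above; here is a minimal sketch of what it could look like, given the functions used later (data.table's fread() and uniqueN(), dplyr's pipelines). The file name below is a placeholder, not the actual source file:

#### Load required libraries
library(data.table)   # fread(), uniqueN(), and the data.table class seen in str() below
library(dplyr)        # %>%, group_by(), summarise_at()

#### Read the transaction dataset (file name is a placeholder)
transactionData <- fread("transactions.csv")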

Exploratory data analysis

Examining transaction data

We can use str() to look at the format of each column and see a sample of data.

#### Examine transaction data
str(transactionData)
## Classes 'data.table' and 'data.frame':   12043 obs. of  23 variables:
##  $ status           : chr  "authorized" "authorized" "authorized" "authorized" ...
##  $ card_present_flag: int  1 0 1 1 1 NA 1 1 1 NA ...
##  $ bpay_biller_code : chr  "" "" "" "" ...
##  $ account          : chr  "ACC-1598451071" "ACC-1598451071" "ACC-1222300524" "ACC-1037050564" ...
##  $ currency         : chr  "AUD" "AUD" "AUD" "AUD" ...
##  $ long_lat         : chr  "153.41 -27.95" "153.41 -27.95" "151.23 -33.94" "153.10 -27.66" ...
##  $ txn_description  : chr  "POS" "SALES-POS" "POS" "SALES-POS" ...
##  $ merchant_id      : chr  "81c48296-73be-44a7-befa-d053f48ce7cd" "830a451c-316e-4a6a-bf25-e37caedca49e" "835c231d-8cdf-4e96-859d-e9d571760cf0" "48514682-c78a-4a88-b0da-2d6302e64673" ...
##  $ merchant_code    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ first_name       : chr  "Diana" "Diana" "Michael" "Rhonda" ...
##  $ balance          : num  35.39 21.2 5.71 2117.22 17.95 ...
##  $ date             : chr  "8/1/2018" "8/1/2018" "8/1/2018" "8/1/2018" ...
##  $ gender           : chr  "F" "F" "M" "F" ...
##  $ age              : int  26 26 38 40 26 20 43 43 27 40 ...
##  $ merchant_suburb  : chr  "Ashmore" "Sydney" "Sydney" "Buderim" ...
##  $ merchant_state   : chr  "QLD" "NSW" "NSW" "QLD" ...
##  $ extraction       : POSIXct, format: "2018-08-01 01:01:15" "2018-08-01 01:13:45" ...
##  $ amount           : num  16.25 14.19 6.42 40.9 3.25 ...
##  $ transaction_id   : chr  "a623070bfead4541a6b0fff8a09e706c" "13270a2a902145da9db4c951e04b51b9" "feb79e7ecd7048a5a36ec889d1a94270" "2698170da3704fd981b15e64a006079e" ...
##  $ country          : chr  "Australia" "Australia" "Australia" "Australia" ...
##  $ customer_id      : chr  "CUS-2487424745" "CUS-2487424745" "CUS-2142601169" "CUS-1614226872" ...
##  $ merchant_long_lat: chr  "153.38 -27.99" "151.21 -33.87" "151.21 -33.87" "153.05 -26.68" ...
##  $ movement         : chr  "debit" "debit" "debit" "debit" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Find the missing values

Next, we check whether any variable contains NA or blank cells.

# Count both NA and blank cells except extraction 
sapply(transactionData[,-"extraction"], function(x) sum(is.na(x)|x == "")) 
##            status card_present_flag  bpay_biller_code           account 
##                 0              4326             11158                 0 
##          currency          long_lat   txn_description       merchant_id 
##                 0                 0                 0              4326 
##     merchant_code        first_name           balance              date 
##             11160                 0                 0                 0 
##            gender               age   merchant_suburb    merchant_state 
##                 0                 0              4326              4326 
##            amount    transaction_id           country       customer_id 
##                 0                 0                 0                 0 
## merchant_long_lat          movement 
##              4326                 0
# Count NA cells for extraction (a POSIXct column, so blank strings cannot occur)
sum(is.na(transactionData$extraction)) 
## [1] 0

The card_present_flag, merchant_id, merchant_suburb, merchant_state, and merchant_long_lat columns each have 4326 missing values, bpay_biller_code has 11158, and merchant_code has 11160.

Change the formats for some variables

#### Convert status column to a factor format
transactionData$status <- as.factor(transactionData$status)
#### Convert date column to a date format
transactionData$date <- as.Date(transactionData$date, format = "%m/%d/%Y")
#### Convert gender column to a factor format
transactionData$gender <- as.factor(transactionData$gender)
#### Convert merchant_state column to a factor format
transactionData$merchant_state <- as.factor(transactionData$merchant_state)
#### Convert movement column to a factor format
transactionData$movement <- as.factor(transactionData$movement)
str(transactionData)
## Classes 'data.table' and 'data.frame':   12043 obs. of  23 variables:
##  $ status           : Factor w/ 2 levels "authorized","posted": 1 1 1 1 1 2 1 1 1 2 ...
##  $ card_present_flag: int  1 0 1 1 1 NA 1 1 1 NA ...
##  $ bpay_biller_code : chr  "" "" "" "" ...
##  $ account          : chr  "ACC-1598451071" "ACC-1598451071" "ACC-1222300524" "ACC-1037050564" ...
##  $ currency         : chr  "AUD" "AUD" "AUD" "AUD" ...
##  $ long_lat         : chr  "153.41 -27.95" "153.41 -27.95" "151.23 -33.94" "153.10 -27.66" ...
##  $ txn_description  : chr  "POS" "SALES-POS" "POS" "SALES-POS" ...
##  $ merchant_id      : chr  "81c48296-73be-44a7-befa-d053f48ce7cd" "830a451c-316e-4a6a-bf25-e37caedca49e" "835c231d-8cdf-4e96-859d-e9d571760cf0" "48514682-c78a-4a88-b0da-2d6302e64673" ...
##  $ merchant_code    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ first_name       : chr  "Diana" "Diana" "Michael" "Rhonda" ...
##  $ balance          : num  35.39 21.2 5.71 2117.22 17.95 ...
##  $ date             : Date, format: "2018-08-01" "2018-08-01" ...
##  $ gender           : Factor w/ 2 levels "F","M": 1 1 2 1 1 2 1 1 1 2 ...
##  $ age              : int  26 26 38 40 26 20 43 43 27 40 ...
##  $ merchant_suburb  : chr  "Ashmore" "Sydney" "Sydney" "Buderim" ...
##  $ merchant_state   : Factor w/ 9 levels "","ACT","NSW",..: 5 3 3 5 5 1 8 8 9 1 ...
##  $ extraction       : POSIXct, format: "2018-08-01 01:01:15" "2018-08-01 01:13:45" ...
##  $ amount           : num  16.25 14.19 6.42 40.9 3.25 ...
##  $ transaction_id   : chr  "a623070bfead4541a6b0fff8a09e706c" "13270a2a902145da9db4c951e04b51b9" "feb79e7ecd7048a5a36ec889d1a94270" "2698170da3704fd981b15e64a006079e" ...
##  $ country          : chr  "Australia" "Australia" "Australia" "Australia" ...
##  $ customer_id      : chr  "CUS-2487424745" "CUS-2487424745" "CUS-2142601169" "CUS-1614226872" ...
##  $ merchant_long_lat: chr  "153.38 -27.99" "151.21 -33.87" "151.21 -33.87" "153.05 -26.68" ...
##  $ movement         : Factor w/ 2 levels "credit","debit": 2 2 2 2 2 2 2 2 2 2 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Show the percentage of missing values for selected variables
sapply(transactionData[,c("merchant_code","balance","gender","age","merchant_state","amount","movement","long_lat")],
function(x) sum(is.na(x)|x == "")/length(x)*100) 
##  merchant_code        balance         gender            age merchant_state 
##       92.66794        0.00000        0.00000        0.00000       35.92128 
##         amount       movement       long_lat 
##        0.00000        0.00000        0.00000

We found that 92.67% of the values in merchant_code and 35.92% of the values in merchant_state are missing. We’ll leave these missing values as they are for now and, if necessary, fill them in for future analysis.
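
One small cleanup we could already apply is recoding the blank level of merchant_state to a proper NA, so that later summaries do not treat "" as a state. A short sketch, not applied to the output below:

#### Optional: recode the blank level of merchant_state as NA (sketch only)
transactionData$merchant_state[transactionData$merchant_state == ""] <- NA
transactionData$merchant_state <- droplevels(transactionData$merchant_state)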

Find the missing date

#### Find the missing date 
DateRange <- seq(range(transactionData$date)[1],range(transactionData$date)[2],by=1)
DateRange[!DateRange %in% transactionData$date]
## [1] "2018-08-16"

We found that one date, 2018-08-16, is missing from the dataset. We will come back to this if the date turns out to be important for the analysis.

Create a new variable

#### Create month variable
transactionData$month <- 
  ifelse(month(transactionData$date) == 8, "Aug", 
         ifelse(month(transactionData$date) == 9, "Sep", "Oct"))
table(transactionData$month)
## 
##  Aug  Oct  Sep 
## 3943 4087 4013
sum(table(transactionData$month))
## [1] 12043

We created a new variable, month, for comparing transactions across the three months.
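
Since table() orders the months alphabetically (Aug, Oct, Sep above), we could optionally store month as a factor with chronological levels. A small sketch, not applied to the output above:

#### Optional: keep the months in chronological order for tables and plots
transactionData$month <- factor(transactionData$month, levels = c("Aug", "Sep", "Oct"))
table(transactionData$month)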

Find the outliers

#### Find the outliers in amount column
boxplot(transactionData$amount)

#### Check the boxplot statistics (whisker extremes, hinges, and median) of the amount data
boxplot.stats(transactionData$amount)$stats
## [1]   0.100  16.000  29.000  53.655 110.000
#### Check the proportion of the outliers
sum(transactionData$amount > 110)/length(transactionData$amount)
## [1] 0.153118
#### Check whether the outlier amounts belong to specific customers
table(transactionData$customer_id[transactionData$amount > 110])
## 
## CUS-1005756958 CUS-1117979751 CUS-1140341822 CUS-1147642491 CUS-1196156254 
##             29             16             12             31             19 
## CUS-1220154422 CUS-1233833708 CUS-1271030853  CUS-127297539  CUS-134193016 
##             25             21             24             21             11 
##  CUS-134833760 CUS-1388323263 CUS-1433879684 CUS-1462656821 CUS-1478398256 
##             18             22             11             24             20 
## CUS-1499065773 CUS-1604596597 CUS-1609060617 CUS-1614226872 CUS-1617121891 
##             16              8             21             20             23 
##  CUS-164374203 CUS-1646183815 CUS-1646621553 CUS-1654129794 CUS-1669695324 
##             26             18             21             19             24 
## CUS-1739931018 CUS-1790886359 CUS-1816693151 CUS-1842679196 CUS-1892177589 
##              5             18             16             18             20 
## CUS-1896554896 CUS-1928710999 CUS-2031327464 CUS-2059096722 CUS-2083971310 
##             12             14             27             20             13 
## CUS-2110742437 CUS-2142601169 CUS-2155701614 CUS-2178051368 CUS-2206365095 
##              5             19             25              7              8 
## CUS-2283904812 CUS-2317998716 CUS-2348881191 CUS-2370108457 CUS-2376382098 
##              8              7             10             16             17 
## CUS-2484453271 CUS-2487424745 CUS-2500783281 CUS-2505971401 CUS-2599279756 
##             10             22             27             22             16 
##  CUS-261674136 CUS-2630892467 CUS-2650223890 CUS-2663907001 CUS-2688605418 
##             14             14             24             22             23 
## CUS-2695611575 CUS-2738291516 CUS-2819545904 CUS-2977593493 CUS-3026014945 
##             23             22             26             15             21 
## CUS-3117610635 CUS-3129499595 CUS-3142625864 CUS-3151318058 CUS-3174332735 
##             23             30             27             14             16 
## CUS-3180318393 CUS-3201519139 CUS-3249305314  CUS-325142416 CUS-3255104878 
##             18             23             28             18             18 
##  CUS-326006476  CUS-331942311 CUS-3325710106 CUS-3336454548 CUS-3378712515 
##             18             11             21             13             21 
## CUS-3395687666 CUS-3431016847 CUS-3462882033 CUS-3702001629 CUS-3716701010 
##             15             13             13             17             25 
## CUS-3904958894 CUS-3989008654 CUS-4023861240 CUS-4123612273 CUS-4142663097 
##             20             12             19             32             32 
##  CUS-423725039  CUS-443776336  CUS-495599312  CUS-497688347  CUS-511326734 
##              8             16             21             18             18 
##   CUS-51506836  CUS-527400765  CUS-537508723   CUS-55310383  CUS-586638664 
##             12             24             17             25             15 
##   CUS-72755508  CUS-809013380  CUS-860700529  CUS-880898248  CUS-883482547 
##             16             21             19             10             21
uniqueN(transactionData$customer_id[transactionData$amount > 110])
## [1] 100
uniqueN(transactionData$customer_id)
## [1] 100

Over 15% of transactions exceed 110 dollars, the boxplot's upper whisker. However, we can't simply remove those transactions as outliers, because every one of the 100 customers has multiple transactions above 110 dollars.
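
For reference, the 110-dollar cut-off used above is the boxplot's upper whisker, i.e. the largest amount no more than 1.5 box-lengths above the upper hinge. A short sketch of that calculation:

#### Reproduce the upper whisker: the largest amount within 1.5 box-lengths of the upper hinge
hinges <- fivenum(transactionData$amount)[c(2, 4)]   # lower and upper hinge
upper_fence <- hinges[2] + 1.5 * diff(hinges)
max(transactionData$amount[transactionData$amount <= upper_fence])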

Insights Exploration

Average transaction amount

# Histogram of transaction amounts
x <- transactionData$amount
h <- hist(x, breaks = 20, col = "grey",
          xlab = "Transaction amount (AUD$)",
          ylab = "The number of transactions")
# Overlay a normal curve for reference, scaled from a density to expected counts per bin
xfit <- seq(min(x), max(x), length = 40)
yfit <- dnorm(xfit, mean = mean(x), sd = sd(x))
yfit <- yfit * diff(h$mids[1:2]) * length(x)
lines(xfit, yfit, col = "orange", lwd = 2)

# Main statistics
mean(x)
## [1] 187.9336
median(x)
## [1] 29
max(x)
## [1] 8835.98

The distribution of amount is positively skewed. Half of the transactions are under 29 dollars, yet the mean is 187.93 dollars and the maximum is 8835.98 dollars, which indicates that a relatively small number of very large transactions pulls the average well above the median.
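
Because of the heavy right tail, plotting the amounts on a log scale makes the bulk of the distribution easier to see; all amounts are positive (minimum 0.10), so the log is well defined. A quick sketch:

#### Optional: histogram of amounts on a log10 scale to spread out the long right tail
hist(log10(transactionData$amount), breaks = 20, col = "grey",
     xlab = "log10(Transaction amount, AUD$)",
     ylab = "The number of transactions")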

The average number of transactions per customer each month

### The average number of transactions per customer in Aug 2018
total_txn_n_Aug <- uniqueN(transactionData$transaction_id[transactionData$month == "Aug"])
total_cust_n_Aug <- uniqueN(transactionData$customer_id[transactionData$month == "Aug"])
(avg_txn_n_per_customer_Aug <- total_txn_n_Aug/total_cust_n_Aug)
## [1] 39.43
### The average number of transactions per customer in Sep 2018
total_txn_n_Sep <- uniqueN(transactionData$transaction_id[transactionData$month == "Sep"])
total_cust_n_Sep <- uniqueN(transactionData$customer_id[transactionData$month == "Sep"])
(avg_txn_n_per_customer_Sep <- total_txn_n_Sep/total_cust_n_Sep)
## [1] 40.13
### The average number of transactions per customer in Oct 2018
total_txn_n_Oct <- uniqueN(transactionData$transaction_id[transactionData$month == "Oct"])
total_cust_n_Oct <- uniqueN(transactionData$customer_id[transactionData$month == "Oct"])
(avg_txn_n_per_customer_Oct <- total_txn_n_Oct/total_cust_n_Oct)
## [1] 40.87

On average, every customer made about 40 transactions each month. Now, let's see whether we can find any interesting behaviour over different time scales.
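
The three per-month figures above can also be computed in a single pipeline with the dplyr and data.table functions already in use. A minimal sketch:

#### Alternative: average number of transactions per customer for all months at once
transactionData %>%
  group_by(month) %>%
  summarise(avg_txn_n_per_customer = uniqueN(transaction_id) / uniqueN(customer_id))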

Segment the dataset by transaction date and time

#### Extract the time of day from the extraction column into a new time column
transactionData$time <- substr(transactionData$extraction,12,19)
#### Extract the day of the week from the date column
transactionData$day <- weekdays(transactionData$date)
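
Since the comparison between weekdays and weekends comes up later, a simple indicator column can also be derived from day. A sketch; it isn't used in the plots below:

#### Optional: flag weekend transactions
transactionData$is_weekend <- transactionData$day %in% c("Saturday", "Sunday")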

Visualise transaction volume over the course of an average day or week

total_txn_n <- uniqueN(transactionData$transaction_id)
total_day_n <- uniqueN(transactionData$date)
avg_txn_n_per_day <- total_txn_n/total_day_n
#### Count the unique transactions on each date and record the day of the week
txn_n_df <- transactionData %>% group_by(date) %>% summarise_at(vars(transaction_id),list(n = uniqueN))
txn_n_df$day <- weekdays(txn_n_df$date)
head(txn_n_df)
#### Plot the daily transaction counts, divided by the 100 customers to get a per-customer average
barplot(txn_n_df$n/100, names.arg = txn_n_df$date,
        cex.names = 0.6,
        xlab = "Date",
        ylab = "The Average Number of Transactions",
        main = "The Average Number of Daily Transactions per Customer")

We can see a clear pattern in the plot above: peaks and troughs recur at regular intervals. Let's take a closer look.

#### Average the daily transaction counts by day of the week
txn_n_df1 <- txn_n_df %>% group_by(day) %>% summarise(n = mean(n))
#### Reorder the rows from alphabetical order to Monday through Sunday
txn_n_df1.0 <- txn_n_df1[c(2,6,7,5,1,3,4),]


#### Plot the average number of transactions per customer by day of the week
barplot(txn_n_df1.0$n/100, names.arg = txn_n_df1.0$day,
        cex.names = 0.6,
        xlab = "Days",
        ylab = "The Average Number of Transactions",
        main = "The Average Number of Daily Transactions per Customer")

It seems that customers tend to make more transactions between Wednesday and Friday. How about the number of transactions within a day?

#### Extract the hour of the day from the time column
transactionData$hour <- as.integer(substr(transactionData$time,1,2))
#### Count the unique transactions in each hour of the day
txn_n_df2 <- transactionData %>% group_by(hour) %>% summarise_at(vars(transaction_id),list(n = uniqueN))

#### Plot the hourly transaction counts, averaged over all dates and the 100 customers
barplot(txn_n_df2$n/(100*uniqueN(transactionData$date)), names.arg = txn_n_df2$hour,
        cex.names = 0.6,
        xlab = "Hours",
        ylab = "The Number of Transactions",
        main = "The Number of Hourly Transactions per Customer")

From the plot above, we can see that most customers make purchases around 9:00 am, 11:00 am, and 5:00 pm. Not surprisingly, those times are close to meal times.

After exploring the number of transactions, let's have a look at transaction spending over the same time scales.

Visualise transaction spending over the course of an average day or week

#### Total spending on each date
spending_df <- transactionData %>% group_by(date) %>%  summarise_at(vars(amount),list(spending = sum))
#### Merge with the daily transaction counts and compute the average spending per transaction
avg_spending_df <- merge(spending_df,txn_n_df)
avg_spending_df$avg_spending <- avg_spending_df$spending/avg_spending_df$n
head(avg_spending_df)
#### Plot the average daily spending, scaled by the 100 customers
barplot(avg_spending_df$avg_spending/100, names.arg = avg_spending_df$date,
        cex.names = 0.6,
        xlab = "Date",
        ylab = "Average Spending (AUD$)",
        main = "The Average Daily Spending per Customer")

The pattern of average daily spending per customer is similar to that of the average number of daily transactions per customer: peaks and troughs occur periodically. Now, let's take a closer look.

#### Total spending by day of the week
spending_df1 <- transactionData %>% group_by(day) %>%  summarise_at(vars(amount),list(spending = sum))
#### Merge with the day-of-week transaction counts (txn_n_df1 shares the day column) and scale by the 100 customers
avg_spending_df1 <- merge(spending_df1,txn_n_df1)
avg_spending_df1$avg_spending <- avg_spending_df1$spending/avg_spending_df1$n
avg_spending_df1$avg_spending_per_customer <- avg_spending_df1$avg_spending/100
head(avg_spending_df1)
#### Reorder the rows from alphabetical order to Monday through Sunday
avg_spending_df1.0 <- avg_spending_df1[c(2,6,7,5,1,3,4),]

#### Plot the average spending per customer by day of the week
barplot(avg_spending_df1.0$avg_spending_per_customer, names.arg = avg_spending_df1.0$day,
        cex.names = 0.6,
        xlab = "Days",
        ylab = "Average Spending (AUD$)",
        main = "The Average Daily Spending per Customer")

It seems that customers tend to spend more on Mondays and Fridays and least on weekends. Next, we take a look at hourly transaction spending.

#### Mean transaction amount in each hour of the day, scaled by the 100 customers
avg_spending_df2 <- transactionData %>% group_by(hour) %>% summarise_at(vars(amount),list(avg_spending = mean))
avg_spending_df2$avg_spending_per_customer <- avg_spending_df2$avg_spending/100


#### Plot the average hourly spending per customer
barplot(avg_spending_df2$avg_spending_per_customer, names.arg = avg_spending_df2$hour,
        cex.names = 0.6,
        xlab = "Hours",
        ylab = "Average Spending (AUD$)",
        main = "The Average Hourly Spending per Customer")

Customers tend to spend the most at 1:00 pm and 5:00 pm.

Location information

Let's have a look at where the customers are from, based on the information in the long_lat column.

Create a new dataframe with only the account number and coordinates

account_loc_inf <- transactionData[match(unique(transactionData$account),transactionData$account), c("account","long_lat")]
account_loc_inf$long <- as.numeric(substr(account_loc_inf$long_lat,1,6))
account_loc_inf$lat <- as.numeric(substr(account_loc_inf$long_lat,8,13))
head(account_loc_inf)
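
The substr() calls above assume every long_lat value has exactly the same width; splitting on the space is a slightly more robust alternative. A sketch producing the same two columns:

#### Alternative: split long_lat on the space instead of relying on fixed character positions
coords <- strsplit(account_loc_inf$long_lat, " ")
account_loc_inf$long <- as.numeric(sapply(coords, `[`, 1))
account_loc_inf$lat  <- as.numeric(sapply(coords, `[`, 2))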

Create a function to find the state given the coordinates

#### Given a state's bounding box, label the accounts whose coordinates fall inside it
coords_state <- function(coords_state, loc_inf, state_name){
        loc_inf$states <- ifelse(loc_inf$long <= coords_state[[1]][3] &   # xmax
                                 loc_inf$long >= coords_state[[1]][1] &   # xmin
                                 loc_inf$lat <=  coords_state[[1]][4] &   # ymax
                                 loc_inf$lat >= coords_state[[1]][2],     # ymin
                                 state_name, loc_inf$states)
        loc_inf$states   # return the updated states vector
}

Define the coordinates for each state

#### Each bounding box is c(xmin, ymin, xmax, ymax), stored as a one-column data frame
coords_NSW <- as.data.frame(c(140.9993, -37.50503, 153.6299, -28.15703 ),c("xmin","ymin","xmax","ymax"))
coords_VIC <- as.data.frame(c(140.9617, -39.13396, 149.9763, -33.99605 ),c("xmin","ymin","xmax","ymax"))
coords_QLD <- as.data.frame(c(137.9943, -29.17719, 153.5522, -9.229287 ),c("xmin","ymin","xmax","ymax"))
coords_SA <- as.data.frame(c(129.0013, -38.06, 141.003, -25.99638 ),c("xmin","ymin","xmax","ymax"))
coords_WA <- as.data.frame(c(112.9211, -35.12228, 129.0019, -13.74142 ),c("xmin","ymin","xmax","ymax"))
coords_TAS <- as.data.frame(c(143.8353, -43.63203, 148.4472, -39.45196 ),c("xmin","ymin","xmax","ymax"))
coords_NT <- as.data.frame(c(129.0005, -25.99862, 137.9991, -10.97392 ),c("xmin","ymin","xmax","ymax"))
coords_ACT <- as.data.frame(c(148.7628, -35.9208, 149.3972, -35.12442 ),c("xmin","ymin","xmax","ymax"))  # ACT latitudes range roughly from -35.92 to -35.12

Update the location information dataframe with states

account_loc_inf$states <- NA

account_loc_inf$states <- coords_state(coords_NSW, account_loc_inf, "NSW")
account_loc_inf$states <- coords_state(coords_VIC, account_loc_inf, "VIC")
account_loc_inf$states <- coords_state(coords_QLD, account_loc_inf, "QLD")
account_loc_inf$states <- coords_state(coords_SA, account_loc_inf, "SA")
account_loc_inf$states <- coords_state(coords_WA, account_loc_inf, "WA")
account_loc_inf$states <- coords_state(coords_TAS, account_loc_inf, "TAS")
account_loc_inf$states <- coords_state(coords_NT, account_loc_inf, "NT")
account_loc_inf$states <- coords_state(coords_ACT, account_loc_inf, "ACT")

head(account_loc_inf)
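
Before plotting, it is worth checking whether any account fell outside all of the bounding boxes and is still NA:

#### Check how many accounts were not matched to any state
sum(is.na(account_loc_inf$states))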

Make a bar plot to display the location distribution of customers

#### Create a dataframe with the number of customers in each state
state_freq_df <- as.data.frame(table(account_loc_inf$states))

#### Make a bar plot 
barplot(state_freq_df$Freq, names.arg = state_freq_df$Var1,
        cex.names = 0.6,
        xlab = "States", ylab = "The number of customers",
        main = "The Number of Customers in Each State"
        )

As the plot above shows, most customers are from Victoria and New South Wales.