Load required libraries and datasets

#### Load the required libraries
library(data.table)
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.3
library(lubridate)
library(tidyverse)
## Warning: package 'purrr' was built under R version 3.5.3
library(modelr)
library(knitr)
library(rpart)
## Warning: package 'rpart' was built under R version 3.5.3
library(dplyr)

#### Load the dataset
filePath <- "C:/Users/User/Desktop/JobSeeking/Project/Forage/ANZ/"
transactionData <- fread(paste0(filePath,"ANZ synthesised transaction dataset.csv"))

Identify the annual salary for each customer

Examining transaction data

We can use str() to look at the format of each column and see a sample of data.

str(transactionData)
## Classes 'data.table' and 'data.frame':   12043 obs. of  23 variables:
##  $ status           : chr  "authorized" "authorized" "authorized" "authorized" ...
##  $ card_present_flag: int  1 0 1 1 1 NA 1 1 1 NA ...
##  $ bpay_biller_code : chr  "" "" "" "" ...
##  $ account          : chr  "ACC-1598451071" "ACC-1598451071" "ACC-1222300524" "ACC-1037050564" ...
##  $ currency         : chr  "AUD" "AUD" "AUD" "AUD" ...
##  $ long_lat         : chr  "153.41 -27.95" "153.41 -27.95" "151.23 -33.94" "153.10 -27.66" ...
##  $ txn_description  : chr  "POS" "SALES-POS" "POS" "SALES-POS" ...
##  $ merchant_id      : chr  "81c48296-73be-44a7-befa-d053f48ce7cd" "830a451c-316e-4a6a-bf25-e37caedca49e" "835c231d-8cdf-4e96-859d-e9d571760cf0" "48514682-c78a-4a88-b0da-2d6302e64673" ...
##  $ merchant_code    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ first_name       : chr  "Diana" "Diana" "Michael" "Rhonda" ...
##  $ balance          : num  35.39 21.2 5.71 2117.22 17.95 ...
##  $ date             : chr  "8/1/2018" "8/1/2018" "8/1/2018" "8/1/2018" ...
##  $ gender           : chr  "F" "F" "M" "F" ...
##  $ age              : int  26 26 38 40 26 20 43 43 27 40 ...
##  $ merchant_suburb  : chr  "Ashmore" "Sydney" "Sydney" "Buderim" ...
##  $ merchant_state   : chr  "QLD" "NSW" "NSW" "QLD" ...
##  $ extraction       : POSIXct, format: "2018-08-01 01:01:15" "2018-08-01 01:13:45" ...
##  $ amount           : num  16.25 14.19 6.42 40.9 3.25 ...
##  $ transaction_id   : chr  "a623070bfead4541a6b0fff8a09e706c" "13270a2a902145da9db4c951e04b51b9" "feb79e7ecd7048a5a36ec889d1a94270" "2698170da3704fd981b15e64a006079e" ...
##  $ country          : chr  "Australia" "Australia" "Australia" "Australia" ...
##  $ customer_id      : chr  "CUS-2487424745" "CUS-2487424745" "CUS-2142601169" "CUS-1614226872" ...
##  $ merchant_long_lat: chr  "153.38 -27.99" "151.21 -33.87" "151.21 -33.87" "153.05 -26.68" ...
##  $ movement         : chr  "debit" "debit" "debit" "debit" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Change the format for some variables

#### Convert status column to a factor format
transactionData$status <- as.factor(transactionData$status)
#### Convert date column to a date format
transactionData$date <- as.Date(transactionData$date, format = "%m/%d/%Y")
#### Convert gender column to a factor format
transactionData$gender <- as.factor(transactionData$gender)
#### Convert merchant_state column to a factor format
transactionData$merchant_state <- as.factor(transactionData$merchant_state)
#### Convert movement column to a factor format
transactionData$movement <- as.factor(transactionData$movement)
str(transactionData)
## Classes 'data.table' and 'data.frame':   12043 obs. of  23 variables:
##  $ status           : Factor w/ 2 levels "authorized","posted": 1 1 1 1 1 2 1 1 1 2 ...
##  $ card_present_flag: int  1 0 1 1 1 NA 1 1 1 NA ...
##  $ bpay_biller_code : chr  "" "" "" "" ...
##  $ account          : chr  "ACC-1598451071" "ACC-1598451071" "ACC-1222300524" "ACC-1037050564" ...
##  $ currency         : chr  "AUD" "AUD" "AUD" "AUD" ...
##  $ long_lat         : chr  "153.41 -27.95" "153.41 -27.95" "151.23 -33.94" "153.10 -27.66" ...
##  $ txn_description  : chr  "POS" "SALES-POS" "POS" "SALES-POS" ...
##  $ merchant_id      : chr  "81c48296-73be-44a7-befa-d053f48ce7cd" "830a451c-316e-4a6a-bf25-e37caedca49e" "835c231d-8cdf-4e96-859d-e9d571760cf0" "48514682-c78a-4a88-b0da-2d6302e64673" ...
##  $ merchant_code    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ first_name       : chr  "Diana" "Diana" "Michael" "Rhonda" ...
##  $ balance          : num  35.39 21.2 5.71 2117.22 17.95 ...
##  $ date             : Date, format: "2018-08-01" "2018-08-01" ...
##  $ gender           : Factor w/ 2 levels "F","M": 1 1 2 1 1 2 1 1 1 2 ...
##  $ age              : int  26 26 38 40 26 20 43 43 27 40 ...
##  $ merchant_suburb  : chr  "Ashmore" "Sydney" "Sydney" "Buderim" ...
##  $ merchant_state   : Factor w/ 9 levels "","ACT","NSW",..: 5 3 3 5 5 1 8 8 9 1 ...
##  $ extraction       : POSIXct, format: "2018-08-01 01:01:15" "2018-08-01 01:13:45" ...
##  $ amount           : num  16.25 14.19 6.42 40.9 3.25 ...
##  $ transaction_id   : chr  "a623070bfead4541a6b0fff8a09e706c" "13270a2a902145da9db4c951e04b51b9" "feb79e7ecd7048a5a36ec889d1a94270" "2698170da3704fd981b15e64a006079e" ...
##  $ country          : chr  "Australia" "Australia" "Australia" "Australia" ...
##  $ customer_id      : chr  "CUS-2487424745" "CUS-2487424745" "CUS-2142601169" "CUS-1614226872" ...
##  $ merchant_long_lat: chr  "153.38 -27.99" "151.21 -33.87" "151.21 -33.87" "153.05 -26.68" ...
##  $ movement         : Factor w/ 2 levels "credit","debit": 2 2 2 2 2 2 2 2 2 2 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Create weekly salary data frame

salary_week_df <- transactionData[movement == "credit",
                                  c("account", "date", "amount", "movement")]
salary_week_df <- salary_week_df %>% group_by(account) %>% 
                  summarise(avg_week_salary = mean(amount))
head(salary_week_df)

We created a salary dataframe with unique account id and the average weekly salary, which we found from the credit movement.

Create annual salary data frame

salary_annual_df <- cbind(salary_week_df,annual_salary = salary_week_df$avg_week_salary*52)
head(salary_annual_df)

Then, we got the annual salary by timing the weekly salary with 52, for there are 52 weeks in a year.

Explore correlations between annual salary and various customer attributes

Create a new transactionData dataframe with annual salary

transactionData1 <- merge(transactionData,salary_annual_df[,-2], by = "account")
str(transactionData1)
## Classes 'data.table' and 'data.frame':   12043 obs. of  24 variables:
##  $ account          : chr  "ACC-1037050564" "ACC-1037050564" "ACC-1037050564" "ACC-1037050564" ...
##  $ status           : Factor w/ 2 levels "authorized","posted": 1 1 2 1 1 1 1 1 1 1 ...
##  $ card_present_flag: int  1 1 NA 1 1 1 1 1 1 1 ...
##  $ bpay_biller_code : chr  "" "" "" "" ...
##  $ currency         : chr  "AUD" "AUD" "AUD" "AUD" ...
##  $ long_lat         : chr  "153.10 -27.66" "153.10 -27.66" "153.10 -27.66" "153.10 -27.66" ...
##  $ txn_description  : chr  "SALES-POS" "POS" "PAYMENT" "SALES-POS" ...
##  $ merchant_id      : chr  "48514682-c78a-4a88-b0da-2d6302e64673" "02d45834-6f65-4f52-9a33-0b242dd7a078" "" "1ab3e343-3775-4d54-9dac-2b0834500025" ...
##  $ merchant_code    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ first_name       : chr  "Rhonda" "Rhonda" "Rhonda" "Rhonda" ...
##  $ balance          : num  2117 2072 2040 2029 1964 ...
##  $ date             : Date, format: "2018-08-01" "2018-08-01" ...
##  $ gender           : Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ age              : int  40 40 40 40 40 40 40 40 40 40 ...
##  $ merchant_suburb  : chr  "Buderim" "North Lakes" "" "Port Macquarie" ...
##  $ merchant_state   : Factor w/ 9 levels "","ACT","NSW",..: 5 5 1 3 5 5 9 5 3 5 ...
##  $ extraction       : POSIXct, format: "2018-08-01 01:38:45" "2018-08-01 09:15:03" ...
##  $ amount           : num  40.9 45.1 32 11.1 65.1 ...
##  $ transaction_id   : chr  "2698170da3704fd981b15e64a006079e" "ad101b96b9d44b968b2519e2bb26a649" "94622881419c44e1b799a52d82572207" "353c30ef66e345059a22305b36f138a2" ...
##  $ country          : chr  "Australia" "Australia" "Australia" "Australia" ...
##  $ customer_id      : chr  "CUS-1614226872" "CUS-1614226872" "CUS-1614226872" "CUS-1614226872" ...
##  $ merchant_long_lat: chr  "153.05 -26.68" "152.99 -27.21" "" "152.92 -31.43" ...
##  $ movement         : Factor w/ 2 levels "credit","debit": 2 2 2 2 2 2 2 2 2 2 ...
##  $ annual_salary    : num  46389 46389 46389 46389 46389 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "account"

We updated the transaction dataframe with the annual salary found the last section. And then, we began to explore the corrleations between annual salary and other customer attributes.

Explore correlations between annual salary and age

cor(transactionData1$annual_salary, transactionData1$age)
## [1] -0.0838749
plot(transactionData1$annual_salary, transactionData1$age,
     xlab = "Annual Salary (AUD$)",
     ylab = "Age",
     xlim = c(28000,60000))
lines(predict(lm(transactionData1$annual_salary~transactionData1$age)),col = "blue") #? trendline 

A weak negative correlation exists between age and annual_salary.

Explore correlations between annual salary and gender

### Scatter plot
plot(transactionData1$gender,transactionData1$annual_salary,
      xlab = "Gender",
      ylab = "Annual Salary (AUD$)",
      ylim = c(28000,500000))

### Chi-squared test for annual salary and gender
chisq.test(transactionData1$annual_salary, transactionData1$gender, correct = FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  transactionData1$annual_salary and transactionData1$gender
## X-squared = 12043, df = 99, p-value < 2.2e-16

We have a high chi-squared and a p-value of less than 0.05 signifance level, so we reject the null hypothesis and conclude that age and annual_salary have a significant relationship.

Explore correlations between annual salary and states

account_loc_inf <- transactionData1[match(unique(transactionData$account),
                                          transactionData1$account), c("account","long_lat")]
account_loc_inf$long <- as.numeric(substr(account_loc_inf$long_lat,1,6))
account_loc_inf$lat <- as.numeric(substr(account_loc_inf$long_lat,8,13))
coords_state <- function(coords_state, loc_inf,var_name,state_name){
        var_name <- ifelse(loc_inf$long <= coords_state[[1]][3] & 
                                 loc_inf$long >= coords_state[[1]][1] &
                                 loc_inf$lat <=  coords_state[[1]][4] & 
                                 loc_inf$lat >= coords_state[[1]][2], 
                                 state_name,var_name)
}
coords_NSW <- as.data.frame(c(140.9993, -37.50503, 153.6299, -28.15703 ),
                            c("xmin","ymin","xmax","ymax"))
coords_VIC <- as.data.frame(c(140.9617, -39.13396, 149.9763, -33.99605 ),
                            c("xmin","ymin","xmax","ymax"))
coords_QLD <- as.data.frame(c(137.9943, -29.17719, 153.5522, -9.229287 ),
                            c("xmin","ymin","xmax","ymax"))
coords_SA <- as.data.frame(c(129.0013, -38.06, 141.003, -25.99638 ),
                           c("xmin","ymin","xmax","ymax"))
coords_WA <- as.data.frame(c(112.9211, -35.12228, 129.0019, -13.74142 ),
                           c("xmin","ymin","xmax","ymax"))
coords_TAS <- as.data.frame(c(143.8353, -43.63203, 148.4472, -39.45196 ),
                            c("xmin","ymin","xmax","ymax"))
coords_NT <- as.data.frame(c(129.0005, -25.99862, 137.9991, -10.97392 ),
                           c("xmin","ymin","xmax","ymax"))
coords_ACT <- as.data.frame(c(148.7628, -148.7628, 149.3972, -35.12442 ),
                            c("xmin","ymin","xmax","ymax"))
account_loc_inf$state <- NA

account_loc_inf$state <- coords_state(coords_NSW, account_loc_inf, 
                                      account_loc_inf$state, "NSW")
account_loc_inf$state <- coords_state(coords_VIC, account_loc_inf, 
                                      account_loc_inf$state, "VIC")
account_loc_inf$state <- coords_state(coords_QLD, account_loc_inf, 
                                      account_loc_inf$state, "QLD")
account_loc_inf$state <- coords_state(coords_SA, account_loc_inf, 
                                      account_loc_inf$state,"SA")
account_loc_inf$state <- coords_state(coords_WA, account_loc_inf, 
                                      account_loc_inf$state, "WA")
account_loc_inf$state <- coords_state(coords_TAS, account_loc_inf, 
                                      account_loc_inf$state, "TAS")
account_loc_inf$state <- coords_state(coords_NT, account_loc_inf, 
                                      account_loc_inf$state, "NT")
account_loc_inf$state <- coords_state(coords_ACT, account_loc_inf, 
                                      account_loc_inf$state, "ACT")

head(account_loc_inf)
#### Create a new transactionData with state variable
transactionData2 <- merge(transactionData1,account_loc_inf[,c(1,5)],by = "account")
transactionData2$state <- as.factor(transactionData2$state)

### Scatter plot
plot(transactionData2$state,transactionData2$annual_salary,
      xlab = "State",
      ylab = "Annual Salary (AUD$)",
      ylim = c(28000,500000))

### Chi-squared test for annual salary and state
chisq.test(transactionData2$annual_salary, transactionData2$state, correct = FALSE)
## Warning in chisq.test(transactionData2$annual_salary, transactionData2$state, :
## Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  transactionData2$annual_salary and transactionData2$state
## X-squared = 83440, df = 686, p-value < 2.2e-16

We have a high chi-squared and a p-value of less than 0.05 signifance level, so we reject the null hypothesis and conclude that state and annual_salary have a significant relationship.

Now, let’s build simple regression models based on the selected attributes.

Build a Simple Regression Model

Build a simple regression model to predict the annual salary for each customer

model1 <-  annual_salary ~ age + gender
summary(lm(formula = model1, data = transactionData2))
## 
## Call:
## lm(formula = model1, data = transactionData2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -92917 -53637 -12854  32641 333039 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 123144.2     2186.9  56.311   <2e-16 ***
## age           -626.4       64.8  -9.667   <2e-16 ***
## genderM      28344.4     1303.2  21.750   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 71430 on 12040 degrees of freedom
## Multiple R-squared:  0.04457,    Adjusted R-squared:  0.04442 
## F-statistic: 280.9 on 2 and 12040 DF,  p-value: < 2.2e-16

The p-values of all attributes in model1 are less than 0.05; however, the R-squared is too small to show it is an accurate model.

model2 <-  annual_salary ~ age + gender + state
summary(lm(formula = model2, data = transactionData2))
## 
## Call:
## lm(formula = model2, data = transactionData2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -102448  -49412  -12313   35770  317533 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  70212.80   11483.86   6.114 1.00e-09 ***
## age           -303.45      68.02  -4.461 8.23e-06 ***
## genderM      33938.28    1332.93  25.461  < 2e-16 ***
## stateNSW     36312.85   11179.16   3.248 0.001164 ** 
## stateNT      86812.25   11735.79   7.397 1.48e-13 ***
## stateQLD     40849.39   11190.35   3.650 0.000263 ***
## stateSA      77286.55   11393.03   6.784 1.23e-11 ***
## stateTAS    -20839.91   13486.84  -1.545 0.122324    
## stateVIC     25763.97   11145.27   2.312 0.020814 *  
## stateWA      49925.10   11242.96   4.441 9.05e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 69910 on 11910 degrees of freedom
##   (123 observations deleted due to missingness)
## Multiple R-squared:  0.08949,    Adjusted R-squared:  0.0888 
## F-statistic: 130.1 on 9 and 11910 DF,  p-value: < 2.2e-16

The p-values of all attributes in model2 are less than 0.05. And the R-squared in model2 is larger than that in model1, but it is still too small to show it is an accurate model. Next, we should try to fit decision-tree models to see whether we could a more accurate model.

Build a Decision-Tree Model

Data partition

set.seed(123)
ind <- sample(2, nrow(transactionData2), replace = T , prob =  c(0.5,0.5))
train <- transactionData2[ind == 1, ]
test <- transactionData2[ind == 2, ]
### Tree Classification
tree1 <- rpart(model1, data = train)
tree2 <- rpart(model2, data = train)

Prediction and Accuarcy

### Prediction
p1 <- predict(tree1, train)
### Root Mean Square Error
sqrt(mean(train$annual_salary - p1)^2)
## [1] 5.801124e-11
### R Square
(cor(train$annual_salary,p1))^2
## [1] 0.4380375
### Prediction
p2 <- predict(tree2, train)
### Root Mean Square Error
sqrt(mean(train$annual_salary - p2)^2)
## [1] 8.736721e-11
### R Square
(cor(train$annual_salary,p2))^2
## [1] 0.7847837

Overall, the decision-tree models have much higher accuracy than the simple regression models have.

The decision-tree model with model2 has the higher accuracy compared to the decision-tree model with model1. As a result, we would recommend the decision-tree model with model2 as our final model for prediction.