#### Load the required libraries
library(data.table)
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.3
library(lubridate)
library(tidyverse)
## Warning: package 'purrr' was built under R version 3.5.3
library(modelr)
library(knitr)
library(rpart)
## Warning: package 'rpart' was built under R version 3.5.3
library(dplyr)
#### Load the dataset
filePath <- "C:/Users/User/Desktop/JobSeeking/Project/Forage/ANZ/"
transactionData <- fread(paste0(filePath,"ANZ synthesised transaction dataset.csv"))
We can use str()
to look at the format of each column and see a sample of data.
str(transactionData)
## Classes 'data.table' and 'data.frame': 12043 obs. of 23 variables:
## $ status : chr "authorized" "authorized" "authorized" "authorized" ...
## $ card_present_flag: int 1 0 1 1 1 NA 1 1 1 NA ...
## $ bpay_biller_code : chr "" "" "" "" ...
## $ account : chr "ACC-1598451071" "ACC-1598451071" "ACC-1222300524" "ACC-1037050564" ...
## $ currency : chr "AUD" "AUD" "AUD" "AUD" ...
## $ long_lat : chr "153.41 -27.95" "153.41 -27.95" "151.23 -33.94" "153.10 -27.66" ...
## $ txn_description : chr "POS" "SALES-POS" "POS" "SALES-POS" ...
## $ merchant_id : chr "81c48296-73be-44a7-befa-d053f48ce7cd" "830a451c-316e-4a6a-bf25-e37caedca49e" "835c231d-8cdf-4e96-859d-e9d571760cf0" "48514682-c78a-4a88-b0da-2d6302e64673" ...
## $ merchant_code : int NA NA NA NA NA NA NA NA NA NA ...
## $ first_name : chr "Diana" "Diana" "Michael" "Rhonda" ...
## $ balance : num 35.39 21.2 5.71 2117.22 17.95 ...
## $ date : chr "8/1/2018" "8/1/2018" "8/1/2018" "8/1/2018" ...
## $ gender : chr "F" "F" "M" "F" ...
## $ age : int 26 26 38 40 26 20 43 43 27 40 ...
## $ merchant_suburb : chr "Ashmore" "Sydney" "Sydney" "Buderim" ...
## $ merchant_state : chr "QLD" "NSW" "NSW" "QLD" ...
## $ extraction : POSIXct, format: "2018-08-01 01:01:15" "2018-08-01 01:13:45" ...
## $ amount : num 16.25 14.19 6.42 40.9 3.25 ...
## $ transaction_id : chr "a623070bfead4541a6b0fff8a09e706c" "13270a2a902145da9db4c951e04b51b9" "feb79e7ecd7048a5a36ec889d1a94270" "2698170da3704fd981b15e64a006079e" ...
## $ country : chr "Australia" "Australia" "Australia" "Australia" ...
## $ customer_id : chr "CUS-2487424745" "CUS-2487424745" "CUS-2142601169" "CUS-1614226872" ...
## $ merchant_long_lat: chr "153.38 -27.99" "151.21 -33.87" "151.21 -33.87" "153.05 -26.68" ...
## $ movement : chr "debit" "debit" "debit" "debit" ...
## - attr(*, ".internal.selfref")=<externalptr>
#### Convert status column to a factor format
transactionData$status <- as.factor(transactionData$status)
#### Convert date column to a date format
transactionData$date <- as.Date(transactionData$date, format = "%m/%d/%Y")
#### Convert gender column to a factor format
transactionData$gender <- as.factor(transactionData$gender)
#### Convert merchant_state column to a factor format
transactionData$merchant_state <- as.factor(transactionData$merchant_state)
#### Convert movement column to a factor format
transactionData$movement <- as.factor(transactionData$movement)
str(transactionData)
## Classes 'data.table' and 'data.frame': 12043 obs. of 23 variables:
## $ status : Factor w/ 2 levels "authorized","posted": 1 1 1 1 1 2 1 1 1 2 ...
## $ card_present_flag: int 1 0 1 1 1 NA 1 1 1 NA ...
## $ bpay_biller_code : chr "" "" "" "" ...
## $ account : chr "ACC-1598451071" "ACC-1598451071" "ACC-1222300524" "ACC-1037050564" ...
## $ currency : chr "AUD" "AUD" "AUD" "AUD" ...
## $ long_lat : chr "153.41 -27.95" "153.41 -27.95" "151.23 -33.94" "153.10 -27.66" ...
## $ txn_description : chr "POS" "SALES-POS" "POS" "SALES-POS" ...
## $ merchant_id : chr "81c48296-73be-44a7-befa-d053f48ce7cd" "830a451c-316e-4a6a-bf25-e37caedca49e" "835c231d-8cdf-4e96-859d-e9d571760cf0" "48514682-c78a-4a88-b0da-2d6302e64673" ...
## $ merchant_code : int NA NA NA NA NA NA NA NA NA NA ...
## $ first_name : chr "Diana" "Diana" "Michael" "Rhonda" ...
## $ balance : num 35.39 21.2 5.71 2117.22 17.95 ...
## $ date : Date, format: "2018-08-01" "2018-08-01" ...
## $ gender : Factor w/ 2 levels "F","M": 1 1 2 1 1 2 1 1 1 2 ...
## $ age : int 26 26 38 40 26 20 43 43 27 40 ...
## $ merchant_suburb : chr "Ashmore" "Sydney" "Sydney" "Buderim" ...
## $ merchant_state : Factor w/ 9 levels "","ACT","NSW",..: 5 3 3 5 5 1 8 8 9 1 ...
## $ extraction : POSIXct, format: "2018-08-01 01:01:15" "2018-08-01 01:13:45" ...
## $ amount : num 16.25 14.19 6.42 40.9 3.25 ...
## $ transaction_id : chr "a623070bfead4541a6b0fff8a09e706c" "13270a2a902145da9db4c951e04b51b9" "feb79e7ecd7048a5a36ec889d1a94270" "2698170da3704fd981b15e64a006079e" ...
## $ country : chr "Australia" "Australia" "Australia" "Australia" ...
## $ customer_id : chr "CUS-2487424745" "CUS-2487424745" "CUS-2142601169" "CUS-1614226872" ...
## $ merchant_long_lat: chr "153.38 -27.99" "151.21 -33.87" "151.21 -33.87" "153.05 -26.68" ...
## $ movement : Factor w/ 2 levels "credit","debit": 2 2 2 2 2 2 2 2 2 2 ...
## - attr(*, ".internal.selfref")=<externalptr>
salary_week_df <- transactionData[movement == "credit",
c("account", "date", "amount", "movement")]
salary_week_df <- salary_week_df %>% group_by(account) %>%
summarise(avg_week_salary = mean(amount))
head(salary_week_df)
We created a salary
dataframe with unique account id and the average weekly salary, which we found from the credit movement.
salary_annual_df <- cbind(salary_week_df,annual_salary = salary_week_df$avg_week_salary*52)
head(salary_annual_df)
Then, we got the annual salary by timing the weekly salary with 52, for there are 52 weeks in a year.
transactionData1 <- merge(transactionData,salary_annual_df[,-2], by = "account")
str(transactionData1)
## Classes 'data.table' and 'data.frame': 12043 obs. of 24 variables:
## $ account : chr "ACC-1037050564" "ACC-1037050564" "ACC-1037050564" "ACC-1037050564" ...
## $ status : Factor w/ 2 levels "authorized","posted": 1 1 2 1 1 1 1 1 1 1 ...
## $ card_present_flag: int 1 1 NA 1 1 1 1 1 1 1 ...
## $ bpay_biller_code : chr "" "" "" "" ...
## $ currency : chr "AUD" "AUD" "AUD" "AUD" ...
## $ long_lat : chr "153.10 -27.66" "153.10 -27.66" "153.10 -27.66" "153.10 -27.66" ...
## $ txn_description : chr "SALES-POS" "POS" "PAYMENT" "SALES-POS" ...
## $ merchant_id : chr "48514682-c78a-4a88-b0da-2d6302e64673" "02d45834-6f65-4f52-9a33-0b242dd7a078" "" "1ab3e343-3775-4d54-9dac-2b0834500025" ...
## $ merchant_code : int NA NA NA NA NA NA NA NA NA NA ...
## $ first_name : chr "Rhonda" "Rhonda" "Rhonda" "Rhonda" ...
## $ balance : num 2117 2072 2040 2029 1964 ...
## $ date : Date, format: "2018-08-01" "2018-08-01" ...
## $ gender : Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
## $ age : int 40 40 40 40 40 40 40 40 40 40 ...
## $ merchant_suburb : chr "Buderim" "North Lakes" "" "Port Macquarie" ...
## $ merchant_state : Factor w/ 9 levels "","ACT","NSW",..: 5 5 1 3 5 5 9 5 3 5 ...
## $ extraction : POSIXct, format: "2018-08-01 01:38:45" "2018-08-01 09:15:03" ...
## $ amount : num 40.9 45.1 32 11.1 65.1 ...
## $ transaction_id : chr "2698170da3704fd981b15e64a006079e" "ad101b96b9d44b968b2519e2bb26a649" "94622881419c44e1b799a52d82572207" "353c30ef66e345059a22305b36f138a2" ...
## $ country : chr "Australia" "Australia" "Australia" "Australia" ...
## $ customer_id : chr "CUS-1614226872" "CUS-1614226872" "CUS-1614226872" "CUS-1614226872" ...
## $ merchant_long_lat: chr "153.05 -26.68" "152.99 -27.21" "" "152.92 -31.43" ...
## $ movement : Factor w/ 2 levels "credit","debit": 2 2 2 2 2 2 2 2 2 2 ...
## $ annual_salary : num 46389 46389 46389 46389 46389 ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "account"
We updated the transaction dataframe with the annual salary found the last section. And then, we began to explore the corrleations between annual salary and other customer attributes.
cor(transactionData1$annual_salary, transactionData1$age)
## [1] -0.0838749
plot(transactionData1$annual_salary, transactionData1$age,
xlab = "Annual Salary (AUD$)",
ylab = "Age",
xlim = c(28000,60000))
lines(predict(lm(transactionData1$annual_salary~transactionData1$age)),col = "blue") #? trendline
A weak negative correlation exists between age
and annual_salary
.
### Scatter plot
plot(transactionData1$gender,transactionData1$annual_salary,
xlab = "Gender",
ylab = "Annual Salary (AUD$)",
ylim = c(28000,500000))
### Chi-squared test for annual salary and gender
chisq.test(transactionData1$annual_salary, transactionData1$gender, correct = FALSE)
##
## Pearson's Chi-squared test
##
## data: transactionData1$annual_salary and transactionData1$gender
## X-squared = 12043, df = 99, p-value < 2.2e-16
We have a high chi-squared and a p-value of less than 0.05 signifance level, so we reject the null hypothesis and conclude that age
and annual_salary
have a significant relationship.
account_loc_inf <- transactionData1[match(unique(transactionData$account),
transactionData1$account), c("account","long_lat")]
account_loc_inf$long <- as.numeric(substr(account_loc_inf$long_lat,1,6))
account_loc_inf$lat <- as.numeric(substr(account_loc_inf$long_lat,8,13))
coords_state <- function(coords_state, loc_inf,var_name,state_name){
var_name <- ifelse(loc_inf$long <= coords_state[[1]][3] &
loc_inf$long >= coords_state[[1]][1] &
loc_inf$lat <= coords_state[[1]][4] &
loc_inf$lat >= coords_state[[1]][2],
state_name,var_name)
}
coords_NSW <- as.data.frame(c(140.9993, -37.50503, 153.6299, -28.15703 ),
c("xmin","ymin","xmax","ymax"))
coords_VIC <- as.data.frame(c(140.9617, -39.13396, 149.9763, -33.99605 ),
c("xmin","ymin","xmax","ymax"))
coords_QLD <- as.data.frame(c(137.9943, -29.17719, 153.5522, -9.229287 ),
c("xmin","ymin","xmax","ymax"))
coords_SA <- as.data.frame(c(129.0013, -38.06, 141.003, -25.99638 ),
c("xmin","ymin","xmax","ymax"))
coords_WA <- as.data.frame(c(112.9211, -35.12228, 129.0019, -13.74142 ),
c("xmin","ymin","xmax","ymax"))
coords_TAS <- as.data.frame(c(143.8353, -43.63203, 148.4472, -39.45196 ),
c("xmin","ymin","xmax","ymax"))
coords_NT <- as.data.frame(c(129.0005, -25.99862, 137.9991, -10.97392 ),
c("xmin","ymin","xmax","ymax"))
coords_ACT <- as.data.frame(c(148.7628, -148.7628, 149.3972, -35.12442 ),
c("xmin","ymin","xmax","ymax"))
account_loc_inf$state <- NA
account_loc_inf$state <- coords_state(coords_NSW, account_loc_inf,
account_loc_inf$state, "NSW")
account_loc_inf$state <- coords_state(coords_VIC, account_loc_inf,
account_loc_inf$state, "VIC")
account_loc_inf$state <- coords_state(coords_QLD, account_loc_inf,
account_loc_inf$state, "QLD")
account_loc_inf$state <- coords_state(coords_SA, account_loc_inf,
account_loc_inf$state,"SA")
account_loc_inf$state <- coords_state(coords_WA, account_loc_inf,
account_loc_inf$state, "WA")
account_loc_inf$state <- coords_state(coords_TAS, account_loc_inf,
account_loc_inf$state, "TAS")
account_loc_inf$state <- coords_state(coords_NT, account_loc_inf,
account_loc_inf$state, "NT")
account_loc_inf$state <- coords_state(coords_ACT, account_loc_inf,
account_loc_inf$state, "ACT")
head(account_loc_inf)
#### Create a new transactionData with state variable
transactionData2 <- merge(transactionData1,account_loc_inf[,c(1,5)],by = "account")
transactionData2$state <- as.factor(transactionData2$state)
### Scatter plot
plot(transactionData2$state,transactionData2$annual_salary,
xlab = "State",
ylab = "Annual Salary (AUD$)",
ylim = c(28000,500000))
### Chi-squared test for annual salary and state
chisq.test(transactionData2$annual_salary, transactionData2$state, correct = FALSE)
## Warning in chisq.test(transactionData2$annual_salary, transactionData2$state, :
## Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: transactionData2$annual_salary and transactionData2$state
## X-squared = 83440, df = 686, p-value < 2.2e-16
We have a high chi-squared and a p-value of less than 0.05 signifance level, so we reject the null hypothesis and conclude that state
and annual_salary
have a significant relationship.
Now, let’s build simple regression models based on the selected attributes.
model1 <- annual_salary ~ age + gender
summary(lm(formula = model1, data = transactionData2))
##
## Call:
## lm(formula = model1, data = transactionData2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -92917 -53637 -12854 32641 333039
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 123144.2 2186.9 56.311 <2e-16 ***
## age -626.4 64.8 -9.667 <2e-16 ***
## genderM 28344.4 1303.2 21.750 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 71430 on 12040 degrees of freedom
## Multiple R-squared: 0.04457, Adjusted R-squared: 0.04442
## F-statistic: 280.9 on 2 and 12040 DF, p-value: < 2.2e-16
The p-values of all attributes in model1 are less than 0.05; however, the R-squared is too small to show it is an accurate model.
model2 <- annual_salary ~ age + gender + state
summary(lm(formula = model2, data = transactionData2))
##
## Call:
## lm(formula = model2, data = transactionData2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -102448 -49412 -12313 35770 317533
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70212.80 11483.86 6.114 1.00e-09 ***
## age -303.45 68.02 -4.461 8.23e-06 ***
## genderM 33938.28 1332.93 25.461 < 2e-16 ***
## stateNSW 36312.85 11179.16 3.248 0.001164 **
## stateNT 86812.25 11735.79 7.397 1.48e-13 ***
## stateQLD 40849.39 11190.35 3.650 0.000263 ***
## stateSA 77286.55 11393.03 6.784 1.23e-11 ***
## stateTAS -20839.91 13486.84 -1.545 0.122324
## stateVIC 25763.97 11145.27 2.312 0.020814 *
## stateWA 49925.10 11242.96 4.441 9.05e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69910 on 11910 degrees of freedom
## (123 observations deleted due to missingness)
## Multiple R-squared: 0.08949, Adjusted R-squared: 0.0888
## F-statistic: 130.1 on 9 and 11910 DF, p-value: < 2.2e-16
The p-values of all attributes in model2 are less than 0.05. And the R-squared in model2 is larger than that in model1, but it is still too small to show it is an accurate model. Next, we should try to fit decision-tree models to see whether we could a more accurate model.
set.seed(123)
ind <- sample(2, nrow(transactionData2), replace = T , prob = c(0.5,0.5))
train <- transactionData2[ind == 1, ]
test <- transactionData2[ind == 2, ]
### Tree Classification
tree1 <- rpart(model1, data = train)
tree2 <- rpart(model2, data = train)
### Prediction
p1 <- predict(tree1, train)
### Root Mean Square Error
sqrt(mean(train$annual_salary - p1)^2)
## [1] 5.801124e-11
### R Square
(cor(train$annual_salary,p1))^2
## [1] 0.4380375
### Prediction
p2 <- predict(tree2, train)
### Root Mean Square Error
sqrt(mean(train$annual_salary - p2)^2)
## [1] 8.736721e-11
### R Square
(cor(train$annual_salary,p2))^2
## [1] 0.7847837
Overall, the decision-tree models have much higher accuracy than the simple regression models have.
The decision-tree model with model2 has the higher accuracy compared to the decision-tree model with model1. As a result, we would recommend the decision-tree model with model2 as our final model for prediction.