Using R to Create Decision Tree Classification

R is a great language for creating decision tree classification for a wide array of applications. Decision trees are a tree-like model in machine learning commonly used in decision analysis. The technique is commonly used in creating strategies for reaching a particular goal based on multi-dimensional datasets.

Decision trees are commonly used for applications such as determining what type of consumer is at higher risk of defaulting on a loan than borrowers of lower risk. What sort of factors impacts whether a company can retain customers, and what type of students are more at risk at dropping out and require mediation based on school attendance, grades, family structure, etc.

Below are the typically libraries for building machine learning analysis are below including decision trees, linear and logistic regression

library(tidyverse)
library(dplyr)
library(broom)
library(yardstick)
library(DescTools)
library(margins)
library(cutpointr)
library(tidyverse)
library(caTools)
library(rsample)
library(ROSE)
library(rpart)
library(rpart.plot)
library(caret)
install.packages("rsample")
install.packages("caTools")
install.packages("ROSE")
install.packages("rpart")
install.packages("rpart.plot")
install.packages("yardstick")
install.packages("DescTools")
install.packages("margins")
install.packages("cutpointr")

The following code block creates regression and decision tree analysis of custom churn predictions.

# Import the customer_churn.csv and explore it.
# Drop all observations with NAs (missing values)


customers <- read.csv('customer_churn.csv')
summary(customers)
str(customers)
customers$customerID <- NULL
customers$tenure <- NULL
sapply(customers, function(x) sum(is.na(x)))
customers <- customers[complete.cases(customers),]


#===================================================================



#  Build a logistic regression model to predict customer churn by using predictor variables (You determine which ones will be included).
# Calculate the Pseudo R2 for the logistic regression.


# Build a logistic regression model to predict customer churn by using predictor variables (You determine which ones will be included).

customers <- customers %>%
 mutate(Churn=if_else(Churn=="No", 0, 1))


str(customers)
         
regression1 <- glm(Churn ~ Contract + MonthlyCharges + TotalCharges + TechSupport + MultipleLines + InternetService, data=customers, family="binomial")


# Calculate the Pseudo R2 for the logistic regression.


regression1 %>%
  PseudoR2()

#  Split data into 70% train and 30% test datasets.
#  Train the same logistic regression on only "train" data.

#  Split data into 70% train and 30% test datasets.


set.seed(645)

customer_split <- initial_split(customers, prop=0.7)
train_customers <- training(customer_split)
test_customers <- testing(customer_split)

# Train the same logistic regression on only "train" data.

regression_train <- glm(Churn ~ Contract + MonthlyCharges + TotalCharges + TechSupport + MultipleLines + InternetService, data=train_customers, family="binomial")
#regression_test <- glm(Churn ~ Contract + MonthlyCharges + TotalCharges + tenure + TechSupport, data=test_customers)



#  For "test" data, make prediction using the logistic regression trained in Question 2.
#  With the cutoff of 0.5, predict a binary classification ("Yes" or "No") based on the cutoff value, 
#  Create a confusion matrix using the prediction result.

#. For "test" data, make prediction using the logistic regression trained in Question 2.

str(regression_train)
prediction <- regression_train %>%
  predict(newdata = test_customers, type = "response")
 
 


# With the cutoff of 0.5, predict a binary classification ("Yes" or "No") based on the cutoff value, 


#train_customers <- train_customers %>%
#  mutate(Churn=if_else(Churn=="0", "No", "Yes"))



The last code creates the decision tree.


train_cust_dtree <- rpart(Churn ~ ., data=train_customers, method = "class")
rpart.plot(train_cust_dtree, cex=0.8)

Check the sensitivity and specificity of the classification tree, we create a confusion matrix for ROC charts. ROC Charts are receiver operating characteristic curves that have the diagnostic ability of a binary classifier system as its threshold.

set.seed(1304)

train_cust_dtree_over <- ovun.sample(Churn ~., data=train_customers, method="over", p = 0.5)$data
train_cust_dtree_under <- ovun.sample(Churn ~., data=train_customers,  method="under", p=0.5)$data
train_cust_dtree_both <- ovun.sample(Churn ~., data=train_customers, method="both", p=0.5)$data

table(train_customers$Churn)
table(train_cust_dtree_over$Churn)
table(train_cust_dtree_under$Churn)
table(train_cust_dtree_both$Churn)

train_cust_dtree_over_A <- rpart(Churn ~ ., data = train_cust_dtree_over, method="class")
rpart.plot(train_cust_dtree_over_A, cex=0.8)

customers_dtree_prob <- train_cust_dtree_over_A %>%
  predict(newdata = test_customers, type = "prob")

#  Create a confusion matrix using the prediction result.


head(customers_dtree_prob)

table(customers_dtree_prob)

customers_dtree_class <- train_cust_dtree_over_A %>%
  predict(newdata = test_customers, type = "class")

table(customers_dtree_class)

test_customers <- test_customers %>%
    mutate(.fitted = customers_dtree_prob[, 2]) %>%
    mutate(predicted_class = customers_dtree_class)


confusionMatrix(as.factor(test_customers$predicted_class), as.factor(test_customers$Churn), positive = "1")
#===================================================================


#  Based on prediction results in Question 3, draw a ROC curve and calculate AUC.


roc <- roc(test_customers, x=.fitted, class=Churn, pos_class=1, neg_clas=0)

plot(roc)
auc(roc)

plot(roc) +
    geom_line(data = roc, color = "red") +
    geom_abline(slope = 1) +
    labs(title = "ROC Curve for Classification Tree")

Leave a Reply