In this exercise, I show how to compare logit and probit models with Hanley and McNeil test for two AUCs. Also, I show the impact of using different cut-offs for predicting the probability of being in one group. A database for the weights of both obese and non-obese groups is simulated

set.seed(123)
n=200
weight.ob=rnorm(n,80,5) ## simulating weight for obese population
weight.nob=rnorm(n,60,5) ## simulating weight for non-obese population
  1. Generate the desity plots for both weight for obese population and weight for non-obese one. Use the logit model to predict the probability of being obese. Plot the predicted values
plot(density(weight.ob),xlim=c(20,120),col="red",ylim=c(0,0.1))
lines(density(weight.nob),col="blue")

## use the logit model to predict the probability of being obese and plot the predicted values
data.obeses=data.frame(obese=c(rep(1,n),rep(0,n)),weight=c(weight.ob,weight.nob))

colors <- c(rep("red",n), rep("blue",n))
plot(data.obeses[,2],data.obeses[,1],col=colors,xlab="Weight", ylab="Probability of being Obese")

model1=glm(obese ~weight,family=binomial(link='logit'),data=data.obeses)
weight=data.frame(data.obeses$weight)
curve(predict(model1,newdata = list(weight=x),type="resp"),add=TRUE)

fit.data=data.frame(weight=weight,fitted=fitted(model1),obese=data.obeses[,1])
head(fit.data)
##   data.obeses.weight    fitted obese
## 1           77.19762 0.9992413     1
## 2           78.84911 0.9998813     1
## 3           87.79354 1.0000000     1
## 4           80.35254 0.9999781     1
## 5           80.64644 0.9999842     1
## 6           88.57532 1.0000000     1
points(fit.data[,1],fit.data[,2],pch=20,col=colors)

  1. Use the caret library in order to compute the confusion matrix and statistics for three different cut-off.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
pred=predict(model1,type="response")


c1=confusionMatrix(data=as.factor(as.numeric(pred>0.5)), ## cut-off=0.5
                reference=as.factor(fit.data[,3]),
                positive = "1")
c1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 196   3
##          1   4 197
##                                           
##                Accuracy : 0.9825          
##                  95% CI : (0.9643, 0.9929)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.965           
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9850          
##             Specificity : 0.9800          
##          Pos Pred Value : 0.9801          
##          Neg Pred Value : 0.9849          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4925          
##    Detection Prevalence : 0.5025          
##       Balanced Accuracy : 0.9825          
##                                           
##        'Positive' Class : 1               
## 
plot(data.obeses[,2],data.obeses[,1],col=colors,xlab="Weight", ylab="Probability of being Obese")
curve(predict(model1,newdata = list(weight=x),type="resp"),add=TRUE)
points(fit.data[,1],fit.data[,2],pch=20,col=colors)
abline(h=0.5)

library(caret)
pred=predict(model1,type="response")


c2=confusionMatrix(data=as.factor(as.numeric(pred>0.2)), ## cut-off=0.2
                reference=as.factor(fit.data[,3]),
                positive = "1")
c2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 193   1
##          1   7 199
##                                          
##                Accuracy : 0.98           
##                  95% CI : (0.961, 0.9913)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.96           
##                                          
##  Mcnemar's Test P-Value : 0.0771         
##                                          
##             Sensitivity : 0.9950         
##             Specificity : 0.9650         
##          Pos Pred Value : 0.9660         
##          Neg Pred Value : 0.9948         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4975         
##    Detection Prevalence : 0.5150         
##       Balanced Accuracy : 0.9800         
##                                          
##        'Positive' Class : 1              
## 
plot(data.obeses[,2],data.obeses[,1],col=colors,xlab="Weight", ylab="Probability of being Obese")
curve(predict(model1,newdata = list(weight=x),type="resp"),add=TRUE)
points(fit.data[,1],fit.data[,2],pch=20,col=colors)
abline(h=0.2)