Binary regression

In this exercise, I show how to compare logit and probit models with Hanley and McNeil test for two AUCs. Also, I show the impact of using different cut-offs for predicting the probability of being in one group. A database for the weights of both obese and non-obese groups is simulated

set.seed(123)
n=200
weight.ob=rnorm(n,80,5) ## simulating weight for obese population
weight.nob=rnorm(n,60,5) ## simulating weight for non-obese population

Generate the desity plots for both weight for obese population and weight for non-obese one. Use the logit model to predict the probability of being obese. Plot the predicted values

plot(density(weight.ob),xlim=c(20,120),col="red",ylim=c(0,0.1))
lines(density(weight.nob),col="blue")

## use the logit model to predict the probability of being obese and plot the predicted values
data.obeses=data.frame(obese=c(rep(1,n),rep(0,n)),weight=c(weight.ob,weight.nob))

colors <- c(rep("red",n), rep("blue",n))
plot(data.obeses[,2],data.obeses[,1],col=colors,xlab="Weight", ylab="Probability of being Obese")

model1=glm(obese ~weight,family=binomial(link='logit'),data=data.obeses)
weight=data.frame(data.obeses$weight)
curve(predict(model1,newdata = list(weight=x),type="resp"),add=TRUE)

fit.data=data.frame(weight=weight,fitted=fitted(model1),obese=data.obeses[,1])
head(fit.data)

##   data.obeses.weight    fitted obese
## 1           77.19762 0.9992413     1
## 2           78.84911 0.9998813     1
## 3           87.79354 1.0000000     1
## 4           80.35254 0.9999781     1
## 5           80.64644 0.9999842     1
## 6           88.57532 1.0000000     1

points(fit.data[,1],fit.data[,2],pch=20,col=colors)

Use the caret library in order to compute the confusion matrix and statistics for three different cut-off.

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

pred=predict(model1,type="response")


c1=confusionMatrix(data=as.factor(as.numeric(pred>0.5)), ## cut-off=0.5
                reference=as.factor(fit.data[,3]),
                positive = "1")
c1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 196   3
##          1   4 197
##                                           
##                Accuracy : 0.9825          
##                  95% CI : (0.9643, 0.9929)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.965           
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9850          
##             Specificity : 0.9800          
##          Pos Pred Value : 0.9801          
##          Neg Pred Value : 0.9849          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4925          
##    Detection Prevalence : 0.5025          
##       Balanced Accuracy : 0.9825          
##                                           
##        'Positive' Class : 1               
##

plot(data.obeses[,2],data.obeses[,1],col=colors,xlab="Weight", ylab="Probability of being Obese")
curve(predict(model1,newdata = list(weight=x),type="resp"),add=TRUE)
points(fit.data[,1],fit.data[,2],pch=20,col=colors)
abline(h=0.5)

library(caret)
pred=predict(model1,type="response")


c2=confusionMatrix(data=as.factor(as.numeric(pred>0.2)), ## cut-off=0.2
                reference=as.factor(fit.data[,3]),
                positive = "1")
c2

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 193   1
##          1   7 199
##                                          
##                Accuracy : 0.98           
##                  95% CI : (0.961, 0.9913)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.96           
##                                          
##  Mcnemar's Test P-Value : 0.0771         
##                                          
##             Sensitivity : 0.9950         
##             Specificity : 0.9650         
##          Pos Pred Value : 0.9660         
##          Neg Pred Value : 0.9948         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4975         
##    Detection Prevalence : 0.5150         
##       Balanced Accuracy : 0.9800         
##                                          
##        'Positive' Class : 1              
##

plot(data.obeses[,2],data.obeses[,1],col=colors,xlab="Weight", ylab="Probability of being Obese")
curve(predict(model1,newdata = list(weight=x),type="resp"),add=TRUE)
points(fit.data[,1],fit.data[,2],pch=20,col=colors)
abline(h=0.2)

library(caret)
pred=predict(model1,type="response")


c3=confusionMatrix(data=as.factor(as.numeric(pred>0.7)), ## cut-off=0.7
                reference=as.factor(fit.data[,3]),
                positive = "1")
c3

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 198   3
##          1   2 197
##                                           
##                Accuracy : 0.9875          
##                  95% CI : (0.9711, 0.9959)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.975           
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9850          
##             Specificity : 0.9900          
##          Pos Pred Value : 0.9899          
##          Neg Pred Value : 0.9851          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4925          
##    Detection Prevalence : 0.4975          
##       Balanced Accuracy : 0.9875          
##                                           
##        'Positive' Class : 1               
##

plot(data.obeses[,2],data.obeses[,1],col=colors,xlab="Weight", ylab="Probability of being Obese")
curve(predict(model1,newdata = list(weight=x),type="resp"),add=TRUE)
points(fit.data[,1],fit.data[,2],pch=20,col=colors)
abline(h=0.7)

Use the pROC library to plot the ROC curve and compute the AUC

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

roc1 <- roc(fit.data[,3],pred)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

plot((1-roc1$specificities),roc1$sensitivities,type="l",
     xlab="1-specificity",ylab="sensitivity")

roc1$auc

## Area under the curve: 0.9991

Use the MKmisc library to compare the ROC curves for logit and probit models and evaluate the Hanley and McNeil test for two AUCs

model2=glm(obese ~ weight,family=binomial(link='probit'),data=data.obeses)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

fit.data2=data.frame(weight=weight,fitted=fitted(model2),obese=data.obeses[,1])


plot(data.obeses[,2],data.obeses[,1],col=colors,xlab="weight", ylab="Condición")
curve(predict(model2,newdata = list(weight=x),type="resp"),add=TRUE)

points(fit.data2[,1],fit.data2[,2],pch=20,col=colors)

pred2=predict(model2,type="response")
roc2 <- roc(fit.data2[,3],pred2)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

roc2$auc

## Area under the curve: 0.9991

library(MKmisc)

pred1=pred
lab1=fit.data[,3]
pred2=pred2
lab2=fit.data2[,3]
AUC.test(pred1, lab1, pred2, lab2, conf.level = 0.95)

## $Variable1
##         AUC          SE      low CI       up CI 
## 0.999125000 0.001483025 0.996218324 1.002031676 
## 
## $Variable2
##         AUC          SE      low CI       up CI 
## 0.999125000 0.001483025 0.996218324 1.002031676 
## 
## $Test
## 
##  Hanley and McNeil test for two AUCs
## 
## data:  pred1 and pred2
## z = 0, p-value = 1
## alternative hypothesis: true difference in AUC is not equal to 0
## 95 percent confidence interval:
##  -0.004110661  0.004110661
## sample estimates:
## Difference in AUC 
##                 0

Binary regression

Eduardo Alarcón-Bustamante

December 10, 2021