In this exercise, I show how to compare logit and probit models using the Hanley and McNeil test for two AUCs, and how the choice of cut-off affects the classification of predicted probabilities of belonging to one group. First, a dataset of weights for an obese and a non-obese group is simulated.
set.seed(123)
n=200
weight.ob=rnorm(n,80,5) ## simulating weight for obese population
weight.nob=rnorm(n,60,5) ## simulating weight for non-obese population
plot(density(weight.ob),xlim=c(20,120),col="red",ylim=c(0,0.1))
lines(density(weight.nob),col="blue")
## use the logit model to predict the probability of being obese and plot the predicted values
data.obeses=data.frame(obese=c(rep(1,n),rep(0,n)),weight=c(weight.ob,weight.nob))
colors <- c(rep("red",n), rep("blue",n))
plot(data.obeses[,2],data.obeses[,1],col=colors,xlab="Weight", ylab="Probability of being Obese")
model1=glm(obese ~weight,family=binomial(link='logit'),data=data.obeses)
weight=data.frame(data.obeses$weight)
curve(predict(model1,newdata = list(weight=x),type="resp"),add=TRUE)
fit.data=data.frame(weight=weight,fitted=fitted(model1),obese=data.obeses[,1])
head(fit.data)
## data.obeses.weight fitted obese
## 1 77.19762 0.9992413 1
## 2 78.84911 0.9998813 1
## 3 87.79354 1.0000000 1
## 4 80.35254 0.9999781 1
## 5 80.64644 0.9999842 1
## 6 88.57532 1.0000000 1
points(fit.data[,1],fit.data[,2],pch=20,col=colors)
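As a quick sanity check (a minimal sketch, not part of the original output; b and p.manual are helper objects introduced here), the fitted curve is simply the logistic transform of the linear predictor, so recomputing it by hand from the estimated coefficients should reproduce fitted(model1).
b=coef(model1)                                       ## intercept and slope of the logit fit
p.manual=plogis(b[1]+b[2]*data.obeses$weight)        ## logistic CDF applied to the linear predictor
all.equal(unname(p.manual),unname(fitted(model1)))   ## expected to return TRUE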
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
pred=predict(model1,type="response")
c1=confusionMatrix(data=as.factor(as.numeric(pred>0.5)), ## cut-off=0.5
reference=as.factor(fit.data[,3]),
positive = "1")
c1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 196 3
## 1 4 197
##
## Accuracy : 0.9825
## 95% CI : (0.9643, 0.9929)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.965
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9850
## Specificity : 0.9800
## Pos Pred Value : 0.9801
## Neg Pred Value : 0.9849
## Prevalence : 0.5000
## Detection Rate : 0.4925
## Detection Prevalence : 0.5025
## Balanced Accuracy : 0.9825
##
## 'Positive' Class : 1
##
plot(data.obeses[,2],data.obeses[,1],col=colors,xlab="Weight", ylab="Probability of being Obese")
curve(predict(model1,newdata = list(weight=x),type="resp"),add=TRUE)
points(fit.data[,1],fit.data[,2],pch=20,col=colors)
abline(h=0.5)
library(caret)
pred=predict(model1,type="response")
c2=confusionMatrix(data=as.factor(as.numeric(pred>0.2)), ## cut-off=0.2
reference=as.factor(fit.data[,3]),
positive = "1")
c2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 193 1
## 1 7 199
##
## Accuracy : 0.98
## 95% CI : (0.961, 0.9913)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.96
##
## Mcnemar's Test P-Value : 0.0771
##
## Sensitivity : 0.9950
## Specificity : 0.9650
## Pos Pred Value : 0.9660
## Neg Pred Value : 0.9948
## Prevalence : 0.5000
## Detection Rate : 0.4975
## Detection Prevalence : 0.5150
## Balanced Accuracy : 0.9800
##
## 'Positive' Class : 1
##
plot(data.obeses[,2],data.obeses[,1],col=colors,xlab="Weight", ylab="Probability of being Obese")
curve(predict(model1,newdata = list(weight=x),type="resp"),add=TRUE)
points(fit.data[,1],fit.data[,2],pch=20,col=colors)
abline(h=0.2)
library(caret)
pred=predict(model1,type="response")
c3=confusionMatrix(data=as.factor(as.numeric(pred>0.7)), ## cut-off=0.7
reference=as.factor(fit.data[,3]),
positive = "1")
c3
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 198 3
## 1 2 197
##
## Accuracy : 0.9875
## 95% CI : (0.9711, 0.9959)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.975
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9850
## Specificity : 0.9900
## Pos Pred Value : 0.9899
## Neg Pred Value : 0.9851
## Prevalence : 0.5000
## Detection Rate : 0.4925
## Detection Prevalence : 0.4975
## Balanced Accuracy : 0.9875
##
## 'Positive' Class : 1
##
plot(data.obeses[,2],data.obeses[,1],col=colors,xlab="Weight", ylab="Probability of being Obese")
curve(predict(model1,newdata = list(weight=x),type="resp"),add=TRUE)
points(fit.data[,1],fit.data[,2],pch=20,col=colors)
abline(h=0.7)
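Instead of repeating the confusion matrix one cut-off at a time, the sensitivity-specificity trade-off can be summarised over a grid of cut-offs. This is a minimal sketch; cutoffs and trade.off are helper objects introduced here, and the grid seq(0.1,0.9,0.1) is an arbitrary choice for illustration.
cutoffs=seq(0.1,0.9,by=0.1)                    ## arbitrary grid of cut-offs
trade.off=t(sapply(cutoffs,function(k){
  cm=table(predicted=factor(as.numeric(pred>k),levels=c(0,1)),
           reference=factor(fit.data[,3],levels=c(0,1)))
  c(cutoff=k,
    sensitivity=cm["1","1"]/sum(cm[,"1"]),     ## true positives / all obese
    specificity=cm["0","0"]/sum(cm[,"0"]))     ## true negatives / all non-obese
}))
trade.off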
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc1 <- roc(fit.data[,3],pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot((1-roc1$specificities),roc1$sensitivities,type="l",
xlab="1-specificity",ylab="sensitivity")
roc1$auc
## Area under the curve: 0.9991
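The AUC can also be read as the probability that a randomly chosen obese subject receives a higher predicted probability than a randomly chosen non-obese subject, with ties counted as one half. A minimal sketch of this check follows; p.case and p.control are helper objects introduced here, and the result should match roc1$auc.
p.case=pred[fit.data[,3]==1]                   ## predictions for the obese group
p.control=pred[fit.data[,3]==0]                ## predictions for the non-obese group
mean(outer(p.case,p.control,">")+0.5*outer(p.case,p.control,"=="))  ## should equal roc1$auc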
model2=glm(obese ~ weight,family=binomial(link='probit'),data=data.obeses)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
fit.data2=data.frame(weight=weight,fitted=fitted(model2),obese=data.obeses[,1])
plot(data.obeses[,2],data.obeses[,1],col=colors,xlab="Weight", ylab="Probability of being Obese")
curve(predict(model2,newdata = list(weight=x),type="resp"),add=TRUE)
points(fit.data2[,1],fit.data2[,2],pch=20,col=colors)
pred2=predict(model2,type="response")
roc2 <- roc(fit.data2[,3],pred2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc2$auc
## Area under the curve: 0.9991
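The probit model replaces the logistic CDF with the standard normal CDF; the warning about fitted probabilities numerically 0 or 1 simply reflects how strongly the two simulated groups are separated. The same kind of sanity check as for the logit model applies (a minimal sketch; b2 and p2.manual are helper objects introduced here). Because both fits are increasing functions of weight, they rank the subjects identically, which is why the two AUCs coincide.
b2=coef(model2)                                      ## intercept and slope of the probit fit
p2.manual=pnorm(b2[1]+b2[2]*data.obeses$weight)      ## standard normal CDF applied to the linear predictor
all.equal(unname(p2.manual),unname(fitted(model2)))  ## expected to return TRUE
cor(pred,pred2,method="spearman")                    ## expected to be 1: identical ranking of subjects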
library(MKmisc)
pred1=pred             ## logit predictions; pred2 (probit) was already computed above
lab1=fit.data[,3]      ## true labels for the logit model
lab2=fit.data2[,3]     ## true labels for the probit model
AUC.test(pred1, lab1, pred2, lab2, conf.level = 0.95)
## $Variable1
## AUC SE low CI up CI
## 0.999125000 0.001483025 0.996218324 1.002031676
##
## $Variable2
## AUC SE low CI up CI
## 0.999125000 0.001483025 0.996218324 1.002031676
##
## $Test
##
## Hanley and McNeil test for two AUCs
##
## data: pred1 and pred2
## z = 0, p-value = 1
## alternative hypothesis: true difference in AUC is not equal to 0
## 95 percent confidence interval:
## -0.004110661 0.004110661
## sample estimates:
## Difference in AUC
## 0
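Since both models rank the subjects identically, the estimated difference in AUC is exactly zero and the Hanley and McNeil test cannot distinguish them. As an optional cross-check not part of the original analysis, pROC also offers a paired DeLong test on the same two ROC objects, which accounts for the fact that both models were fitted to the same subjects.
roc.test(roc1,roc2,method="delong",paired=TRUE)      ## paired DeLong comparison of the two AUCs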