In this file, I start generating the Bayesian Network from our data using the gRain
package.
The first variable we add is age
. Then I add smoking
, which is associated with hypertension.
yn <- c("N","Y")
#this is the true conditional prob table for HTNtreat given HTN status
#condHTNtreat <- t(HTNtreat/rowSums(HTNtreat))
#I adjusted according to David's suggestions
condHTNtreat <- t(matrix(nrow=2,c(100,0,40,60), byrow = TRUE, dimnames=list(yn,yn)))
## Age probabilities are calculated from our cohort
ageLevels <- c("0-20","20-40", "40-55", "55-70","70-90")
ageProbMat <- read.csv(system.file("cpt", "age_probabilities.csv", package="cvdRiskData"))
ageProb <- ageProbMat$Freq
names(ageProb) <- ageProbMat$out
ageProb
## (0,20] (20,40] (40,55] (55,70] (70,90]
## 0.08744536 0.29710261 0.25755875 0.23272094 0.12517234
a <- cptable(~age, values=ageProb*100, levels=ageLevels)
## These conditional probabilities are derived from our cohort
condHTNAgeprob <- read.csv(system.file("cpt", "age_htn_cpt.csv", package="cvdRiskData"), check.names = FALSE)
## Here are the conditional age|htn probabilities
condHTNAgeprob
## (0,20] (20,40] (40,55] (55,70] (70,90]
## 1 0.97665607 0.8516853 0.7206523 0.523628 0.3901981
## 2 0.02334393 0.1483147 0.2793477 0.476372 0.6098019
rownames(condHTNAgeprob) <- c(0,1)
colnames(condHTNAgeprob) <- ageLevels
## Compile our CPTs so far
a.htn <- cptable(~htn+age, values=as.matrix(condHTNAgeprob), levels=yn)
htn.treat <- cptable(~treat+htn, values=condHTNtreat, levels=yn)
##Smoking|age CPT - peaks at 25% smoking for 55-70 category
a.s <- cptable(~smoking+age, values=c(95,5,90,10,85,15,75,25,90,10),levels=yn)
##Smoking htn, if you smoke, you have 67% prob of having hypertension
s.h <- cptable(~htn+smoking, values=c(50,50,33,67))
Now we load in probabilities for race, which was calculated from our patient cohort.
pRaceMat <- read.csv(system.file("cpt", "race_probabilities.csv", package="cvdRiskData"))
pRace <- pRaceMat$Freq
names(pRace) <- pRaceMat$Var1
pRace
## AmInd Asian/PI Black/AfAm White
## 0.005527018 0.191968947 0.053266109 0.749237925
race <- cptable(~race, values=pRace, levels=names(pRace))
#These probabilities are estimated for US patients overall (from CDC report)
bmiLevels <- c("15-18","18-25", "25-31","31+")
bmiProb <- c(0.16, 0.68, 0.10, 0.06)
bmi <- cptable(~bmi, values=bmiProb, levels=bmiLevels)
#conditional CPT for BMI given race
#these values were estimated from CDC report
#If you are Black/AfAm or White, have a higher
#probability of being in a higher BMI category
valuesBMI <- c(97, 3, 98, 1, 98, 2, 99, 1,
88, 12, 92,8, 92, 8, 96, 4,
55, 45, 70, 30, 70, 30, 85, 15,
20, 80, 33, 66, 33, 66, 65, 35)
##t2d.race.bmi is a joint cpt: p(t2d|race,bmi)
t2d.race.bmi <- cptable(~t2d|race:bmi, values=c(valuesBMI), levels=yn)
#t2d.cvd <- cptable(~cvd|t2d, values=c(95, 5, 5, 95), levels= yn)
##incidence of smoking is 15%
smoking <- cptable(~smoking, values=c(85, 15), levels=yn)
These first 2 snps are on the same chromosome and so I will model them as having identical risk, and having both does not increase risk. I am limiting myself to the homozygous variants (excluding the heterozygous variants) in order to simplify things. I also only consider 6 genotypes overall.
Rs10757278 (Chr 9, 22124478). (A;A) 0.78 x risk of heart disease, (A;G) 1.3 x risk, (G;G) 1.6x risk for heart disease. Co-morbidity: diabetes
Rs1333049 (Chr 9, 22125504). (G;G) Normal (WT); (C;G) 1.5 increased risk; (C;C) 1.9x increased risk.
These last 2 snps are on different chromosomes for the first. Again, limiting to the homozygous cases to simplify things.
Rs4665058, 4x risk if have two copies of the SNP (A;A), 2 X (A;C) and (C;C) - wild type. Chr 2, 159333698
rs8055236 (Chr 16, 83178793) (T;T) Normal (WT); (G;T) 1.9x risk, (G;G) 2.2x increased risk
raceNames <- names(pRace)
#Conversion table between aggregate genotype and SNP values
snpLookup <- list("0000" = c("AA", "CC", "CC", "TT"),
"0001" = c("AA", "CC", "CC", "GG"),
"0010" = c("AA", "CC", "AA", "TT"),
"1100" = c("GG", "GG", "CC", "TT"),
"1110" = c("GG", "GG", "AA", "GG"),
"1111" = c("GG", "GG", "AA", "GG"))
snpNames <- c("rs10757278", "rs1333049", "rs4665058", "rs8055236")
snpLookup <- lapply(snpLookup, function(x) {names(x) <- c("rs10757278", "rs1333049", "rs4665058", "rs8055236"); return(x)})
rs10757278race = matrix(nrow=2, byrow = TRUE, data=c(21, 24, 80, 30,
79, 76, 20, 70),
dimnames = list( c("AA","GG"),raceNames))/100
r1 <- rs10757278race
#p("AA"|race) = c(10, 10, 10, 10)
#p("CC"|race) = c(80,80,80,80)
rs4665058race =
matrix(nrow=2, data= c(10, 10, 10, 10, 90,90,90,90), byrow=TRUE,
dimnames= list(c("AA","CC"), raceNames ))/100
r2 <- rs4665058race
#p("GG"|race) = c(62, 65, 20, 62)
#P("TT"|race) = c(38, 35, 80, 38)
rs8055236race = matrix(nrow=2, byrow=TRUE, data=c(62, 65, 20, 40,
38, 35, 80, 60),
dimnames = list(c("GG","TT"), raceNames))/100
r3 <- rs8055236race
genotypes <- c("1111", "1110", "1100", "0010", "0001", "0000")
probList <- list()
probList[["0000"]] <- r1[2,] * r2[2,] * r3[2,]
probList[["1100"]] <- r1[1,] * r2[2,] * r3[2,]
probList[["1110"]] <- r1[1,] * r2[1,] * r3[2,]
probList[["0010"]] <- r1[2,] * r2[1,] * r3[2,]
probList[["0001"]] <- r1[2,] * r2[2,] * r3[1,]
probList[["1111"]] <- r1[1,] * r2[1,] * r3[1,]
cptgenoRace <- do.call(rbind,probList)
cptgenoRace <- cptgenoRace[genotypes,]
race.genotype <- cptable(~genotype|race, values=cptgenoRace, levels=genotypes, normalize=TRUE)
##Gender is unbalanced
gender <- cptable(~gender, c(45,55), levels=c("M","F"))
##Two SNPs are associated with higher cholesterol
tcholStates <- c("<160", "160-199", "200-239","240+")
genotype.tchol <- cptable(~tchol|genotype,
values = c(5,10,30,55,
10,20,30,40,
15,20,45,30,
15,20,45,30,
30,60,5,5,
30,60,5,5), levels=tcholStates)
Finally, we compile our Bayesian Network and save it.
plist <- compileCPT(list(a, a.htn, htn.treat, smoking, s.h, race,
bmi, t2d.race.bmi, race.genotype,
genotype.tchol,
gender))
cvd_bayes_net <- compile(grain(plist))
devtools::use_data(cvd_bayes_net, overwrite = TRUE)
## Saving cvd_bayes_net as cvd_bayes_net.rda to /Users/roshi/Code/cvdRiskData/data