Exercise Results

One of the possible solutions to Stuart Baird exercise.
One of the possible solutions to Stuart Baird exercise.

Plot the Max Log Likelihood for all locus.
data = read.csv('~/StuartsReads.csv', header = F,
                row.names = 1,
                stringsAsFactors = F)
data[1:5,1:5]
require(magrittr)
### Fours Steps

# 1st: form hap.: Write a function ToSortedDiplotypes ->
# Save into SortedDiplotypes

ToSortedDiplotypes <- function(G1, G2){
  G3 = rbind(G1,G2)
  Min = apply(G3, 2, min)
  Max = apply(G3, 2, max)
  G4_aux = data.frame(Min,Max)
  G4 = apply(G4_aux, 1, function(x) paste0(x[1], x[2]))
  return(G4)
}

ToSortedDiplotypes(data[1,],data[2,])

IDS = seq(from = 1, to = (nrow(data)-1), by = 2)

for (rows in IDS){
  print (rows)
  Out = ToSortedDiplotypes(data[rows,], data[rows+1,])  
  if(rows > 1){
    M = rbind(M,Out)
  }
  if (rows == 1){
    M = Out
  }
}
SortedDiplotypes = M

for ( i in 1:ncol(SortedDiplotypes)){
  # for ( i in 1:10){
  TAB = table(SortedDiplotypes[,i])
  NAs = stringr::str_count(names(TAB), '_')
  TAB = TAB[NAs == 0]
  TAB %<>% as.data.frame(.)
  row.names(TAB) = TAB$Var1
  names(TAB) = c('Allele', paste0('A_',i))
  # TAB = TAB[-1]
  if(i == 1){
    SortedDiplotypesTallies = TAB
  }
  if (i > 1){
    SortedDiplotypesTallies = plyr::join( SortedDiplotypesTallies,TAB, type = 'full')
  }
}
SortedDiplotypesTallies[,1:5]


### Step 2
for ( i in 1:ncol(data)){
  data2 = data
  data[data == '_'] <- NA
  TAB = table(data[,i]) %>% prop.table()
  # TAB
  TAB %<>% as.data.frame(.)
  row.names(TAB) = TAB$Var1
  names(TAB) = c('Allele', paste0('A_',i))
  # TAB = TAB[-1]
  if(i == 1){
    StateFrequencies = TAB
  }
  if (i > 1){
    StateFrequencies = plyr::join( StateFrequencies,TAB, type = 'full')
  }
}
row.names(StateFrequencies) = StateFrequencies$Allele
StateFrequencies = StateFrequencies[,-1]
StateFrequencies = StateFrequencies[order(row.names(StateFrequencies)),]

row.names(SortedDiplotypesTallies) = SortedDiplotypesTallies$Allele
SortedDiplotypesTallies = SortedDiplotypesTallies[,-1]
SortedDiplotypesTallies = SortedDiplotypesTallies[order(row.names(SortedDiplotypesTallies)),]

LLforF <- function(p,q, F., P, Q, PQ){
  LL  = P*log(p^2+F.) + Q*log(q^2+F.) + PQ*log(2*p*q* (1-F.))
  return(LL)
}

F.vec = seq(from = 0.01, to = 0.99, by = 0.01)
OUT = matrix(NA, ncol = 3)
for (i in 1:ncol(SortedDiplotypesTallies)){
  for(j in 1:length(F.vec)){
    SortedDiplotypesTallies_aux = SortedDiplotypesTallies[i] %>% na.exclude()
    StateFrequencies_aux = StateFrequencies[i]%>% na.exclude()
    LL = LLforF(p = StateFrequencies_aux[1,1],
                q = StateFrequencies_aux[2,1],
                P = SortedDiplotypesTallies_aux[1,1],
                PQ = SortedDiplotypesTallies_aux[2,1],
                Q = SortedDiplotypesTallies_aux[3,1], F. = F.vec[j] )
    OUT_aux = cbind(i, LL, F.vec[j])
    OUT = rbind(OUT, OUT_aux)
  }
}
dim(OUT )

OUT = data.frame(OUT)
OUT = OUT[-1, ]
head(OUT)
names(OUT) = c('Col', 'LL', 'F')
LL180 = subset(OUT, OUT$Col == 180)
LL180ML = subset(OUT, OUT$LL == max(LL180$LL))

plot(LL180$F,LL180$LL , type = 'l', xlab = 'F', ylab = 'LL', las = 1)
abline(h = max(LL180$LL))
abline(v = max(LL180ML$F))

MLE_SUP = matrix(NA, ncol = 3)
for (i in 1:nrow(SortedDiplotypes)){
  Xx = subset(OUT, OUT$Col == i)
  XxML = subset(OUT, OUT$LL == max(Xx$LL))
  
  Xx_sup = subset(Xx, Xx$LL >= (XxML$LL-2))
  LB = min(Xx_sup$F)
  UB = max(Xx_sup$F)
  MLF = XxML$F
  MLE_SUP_aux = cbind(LB, UB, MLF)
  MLE_SUP = rbind(MLE_SUP, MLE_SUP_aux)
}
MLE_SUP %<>% as.data.frame() %>% na.exclude()
dim(MLE_SUP)
plot(MLE_SUP$MLF, type = 'l', las = 1,
     ylab = 'MLE')
lines(MLE_SUP$LB, col = 'green')
lines(MLE_SUP$UB, col = 'red')
Exercise Results

Table of contents

One of the possible solutions to Stuart Baird exercise.