Chapter 2 Box Problems
## Box A
Problem a Let’s call PGI-2a ‘a’ and PGI-2b ‘b’
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 phenotypes <- c ( "aa" , "ab" , "bb" ) observed <- c ( 35 , 19 , 3 ) total <- sum ( observed) Freq.a <- ( 2 * 35 + 19 ) / ( 2 * total) Freq.b <- ( 19 + 2 * 3 ) / ( 2 * total) expected <- c ( ( Freq.a^ 2 ) * total, 2 * ( Freq.a* Freq.b) * total, ( Freq.b^ 2 ) * total) expected <- round ( expected, 2 ) data.frame( cbind( phenotypes, observed, expected ) )
Problem b
Set up a data.frame to hold the Drosophila mobility and inversion data
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 aGpdh <- c ( rep ( "F" , 4 ) , rep ( "S" , 4 ) ) amy <- rep ( c ( "F" , "F" , "S" , "S" ) , 2 ) NS <- rep ( c ( "non-NS" , "NS" ) , 4 ) counts <- c ( 726 , 90 , 111 , 1 , 172 , 32 , 26 , 0 ) results <- data.frame( cbind( aGpdh, amy, NS, counts) , stringsAsFactors= FALSE ) results$ counts <- as.numeric ( results$ counts) results
1 2 3 4 5 6 7 8 9 10 11 12 total <- sum ( results$ counts) total S.table <- results[ results$ aGpdh== "S" , ] S.freq <- sum ( S.table$ counts) / total S.freq
1 2 3 F.table <- results[ results$ aGpdh== "F" , ] F.freq <- sum ( F.table$ counts) / total F.freq
1 2 3 4 5 S.table <- results[ results$ amy== "S" , ] S.freq <- sum ( S.table$ counts) / total S.freq
1 2 3 F.table <- results[ results$ amy== "F" , ] F.freq <- sum ( F.table$ counts) / total F.freq
1 2 3 4 5 NS.table <- results[ results$ NS== "NS" , ] NS.freq <- sum ( NS.table$ counts) / total NS.freq
1 2 3 nonNS.table <- results[ results$ NS== "non-NS" , ] nonNS.freq <- sum ( nonNS.table$ counts) / total nonNS.freq
## Box C
Problem A
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 m <- 4 i <- c ( 0 , 1 , 2 , 3 , m) s <- c ( 0.5 , 0.5 , 0.5 , 0.5 , 0 ) b <- c ( 0 , 1 , 1.5 , 1 , 0 ) ce <- vector( "numeric" , 5 ) ce
1 2 3 4 5 6 7 8 9 lambda <- 1 ce[ 1 ] <- lambda ce[ 2 ] <- s[ 1 ] * lambda^ ( - 1 ) ce[ 3 ] <- s[ 1 ] * s[ 2 ] * lambda^ ( - 2 ) ce[ 4 ] <- s[ 1 ] * s[ 2 ] * s[ 3 ] * lambda^ ( - 3 ) ce[ 5 ] <- s[ 1 ] * s[ 2 ] * s[ 3 ] * s[ 4 ] * lambda^ ( - 4 ) ce
Problem b
$$N_{t^{*}} = N_0\lambda^{t^{*}}=2N_0$$
$$N_0\lambda^{t^{*}}=2N_0$$
$$\lambda^{t^{*}}=2$$
$${t^{*}}\ln(\lambda)=\ln(2)$$
$$\ln(\lambda)=\frac{\ln(2)}{t^{*}}$$
$$\lambda=e^{(\frac{\ln(2)}{t^{*}})}$$
1 2 3 4 5 t.star <- 173 lambda <- exp ( log ( 2 ) / t.star) lambda
1 2 3 t.star <- 19.8 lambda <- exp ( log ( 2 ) / t.star) lambda
## Box D ##
Problem A Using the table I extract coefficients fromt the “Offspring Frequency” Aa column and apply to the “Frequency of Mating” column to obtain the frequency of heterozygotes in the next generation, Q’:
$$Q’ = \frac{1}{2}(2PQ) + 2PR + \frac{1}{2}Q^2 + \frac{1}{2}(2QR)$$
$$ = PQ + 2PR + \frac{1}{2}Q^2 + QR$$
Factor out Q/2 + R
$$= 2P(\frac{Q}{2} + R) + Q(\frac{Q}{2}+R)$$ $$= (2P + Q)(\frac{Q}{2} + R)$$ $$= 2(P + \frac{Q}{2})( \frac{Q}{2} + R)$$ Given that p = P + Q/2 and q= Q/2+R = 2pq
## Box E ##
Problem A 1 2 3 4 5 6 7 8 9 10 11 total <- 2060 O <- 702/ total A <- 862/ total B <- 365/ total AB <- 131/ total p <- 1- sqrt ( B+ O) q <- 1- sqrt ( A+ O) r <- sqrt ( O) p
Problem B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 theta <- 1- p- q- r theta p.final <- p* ( 1 + theta/ 2 ) q.final <- q* ( 1 + theta/ 2 ) r.final <- ( r+ theta/ 2 ) * ( 1 + theta/ 2 ) p.final q.final r.final p.final+ q.final+ r.final
Problem C 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 O.expected <- r.final^ 2 O.expected* total A.expected <- p.final^ 2 + 2 * p.final* r.final A.expected* total B.expected <- q.final^ 2 + 2 * q.final* r.final B.expected* total AB.expected <- 2* p.final* q.final AB.expected* total O.expected + A.expected + B.expected + AB.expected
Problem D 1 2 3 4 observed <- c ( O, A, B, AB) expected <- c ( O.expected, A.expected, B.expected, AB.expected) chisq.test( observed, p = expected, correct= FALSE )
1 2 chisq.test( observed, p = expected, rescale.p= TRUE , correct= FALSE )
1 2 observed <- c ( 701 , 862 , 365 , 131 ) chisq.test( observed, p = expected, rescale.p= TRUE , correct= FALSE )
Though the \(X^2\) value is correct I cannot modify the degrees of freedom using chisq.test. Try a manual maximum liklihood instead.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 n.A <- 862 n.AB <- 131 n.B <- 365 n.OO<- 702 N <- sum ( n.A, n.AB, n.B, n.OO) p.a <- 0.33 p.b <- 0.33 p.o <- 0.34 num.iter <- 6 results <- data.frame( matrix( nrow= num.iter, ncol= 10 ) ) names ( results) <- c ( "iter" , "Naa" , "Nao" , "Nbb" , "Nbo" , "Nab" , "Noo" , "p.a" , "p.b" , "p.o" ) for ( i in 1 : num.iter) { Naa <- n.A* ( p.a^ 2 / ( p.a^ 2 + 2 * p.a* p.o) ) Nao <- n.A* ( ( 2 * p.a* p.o) / ( p.a^ 2 + 2 * p.a* p.o) ) Nbb <- n.B* ( p.b^ 2 / ( p.b^ 2 + 2 * p.b* p.o) ) Nbo <- n.B* ( ( 2 * p.b* p.o) / ( p.b^ 2 + 2 * p.b* p.o) ) Nab <- n.AB Noo <- n.OO p.a <- ( 2 * Naa + Nao + Nab) / ( 2 * N) p.b <- ( 2 * Nbb + Nbo + Nab) / ( 2 * N) p.o <- ( 2 * Noo + Nao + Nbo) / ( 2 * N) results[ i, ] <- c ( i, Naa, Nao, Nbb, Nbo, Nab, Noo, p.a, p.b, p.o) } results
Converges to 3 significant figures after about 4 iterations.
## Box F ##
Problem A Given:
$$m_n = f_{n-1}$$
$$f_n = \frac{1}{2}(m_{n-1} + f_{n-1})$$
Then:
$$f_n - m_n = \frac{1}{2}(m_{n-1} + f_{n-1}) - f_{n-1}$$
$$f_n - m_n = \frac{1}{2}(m_{n-1} + f_{n-1} - 2f_{n-1})$$
$$f_n - m_n = \frac{1}{2}(m_{n-1} - f_{n-1})$$
$$f_n - m_n = -\frac{1}{2}(f_{n-1} - m_{n-1})$$
Problem B Given the expression for the current generation:
$$\frac{2}{3}(f_n) + \frac{1}{3}(m_n)$$
Substitute in:
$$m_{n} = f_{n-1}$$
$$f_n = \frac{1}{2}(m_{n-1} + f_{n-1})$$
to get:
$$\frac{2}{3}[\frac{1}{2}(m_{n-1}+f_{n-1})]+\frac{1}{3}(f_{n-1})$$
$$\frac{1}{3}(m_{n-1}+f_{n-1})+\frac{1}{3}(f_{n-1})$$
$$\frac{1}{3}m_{n-1}+\frac{2}{3}(f_{n-1})$$
Which is the expression for the previous generation - the same expression as the current generation.
Problem C Set up a vector to handle the frequencies, noting that the vector index will be one off from the generation.
1 2 3 4 5 6 7 8 9 10 11 12 13 m <- vector( mode= "numeric" , length = 7 ) f <- vector( mode= "numeric" , length = 7 ) m[ 1 ] <- 0.2 f[ 1 ] <- 0.8 for ( i in 2 : 7 ) { m[ i] <- f[ i- 1 ] f[ i] <- 0.5 * ( m[ i- 1 ] + f[ i- 1 ] ) } results <- data.frame( cbind( m, f) ) results
1 2 3 4 p <- ( ( 2 / 3 ) * f[ 1 ] + ( 1 / 3 ) * m[ 1 ] ) q <- 1- p p; q
## Box G ##
Problem A
A1 allele frequency $$p_1 = P_{11} + P_{12}$$
A2 allel frequency $$p_2 = P_{21} + P_{22}$$
B1 allele frequency $$q1 = P_{11} + P_{21}$$
B2 allel frequency $$q2 = P_{12} + P_{22}$$
disequilibrium parameter $$D = P_{11}*P_{22} - P_{12}*P_{21}$$
Show that $P_{11} = p_1q_1 + D$
Substitute for $p_1, q_1, D$
$$P_{11} = (P_{11} + P_{12})(P_{11} + p_{21}) + (P_{11}*P_{22} - P_{12}*p_{21})$$
$$P_{11} = P_{11}*P_{11} + P_{11}*p_{21} + P_{12}*P_{11} + P_{12}*p_{21} + P_{11}*P_{22} - P_{12}*p_{21}$$
$$P_{11} = P_{11}*P_{11} + P_{11}*p_{21} + P_{12}*P_{11} + P_{11}*P_{22}$$
$$P_{11} = P_{11}*(P_{11} + p_{21} + P_{12} + P_{22})$$
Noting that $$P_{11} + P_{21} + P_{12} + P_{22} = 1$$
$$P_{11} = P_{11}*1$$
Problem D 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 aGpdh <- c ( rep ( "F" , 4 ) , rep ( "S" , 4 ) ) amy <- rep ( c ( "F" , "F" , "S" , "S" ) , 2 ) NS <- rep ( c ( "non-NS" , "NS" ) , 4 ) counts <- c ( 726 , 90 , 111 , 1 , 172 , 32 , 26 , 0 ) results <- data.frame( cbind( aGpdh, amy, NS, counts) , stringsAsFactors= FALSE ) results.w <- reshape( results, idvar= c ( "amy" , "NS" ) , v.names= "counts" , timevar = "aGpdh" , direction= "wide" ) results.w$ counts.sum <- as.numeric ( results.w$ counts.F) + as.numeric ( results.w$ counts.S) results.w
1 2 3 total <- sum ( results.w$ counts.sum) total
1 2 3 4 5 6 7 8 9 10 S.table <- results.w[ results.w$ amy== "S" , ] S.freq <- sum ( S.table$ counts.sum) / total S.freq F.table <- results.w[ results.w$ amy== "F" , ] F.freq <- sum ( F.table$ counts.sum) / total F.freq
1 2 3 4 5 NS.table <- results.w[ results.w$ NS== "NS" , ] NS.freq <- sum ( NS.table$ counts.sum) / total NS.freq
1 2 3 nonNS.table <- results.w[ results.w$ NS== "non-NS" , ] nonNS.freq <- sum ( nonNS.table$ counts.sum) / total nonNS.freq
1 2 3 4 5 results.w[ results.w$ amy== "F" & results.w$ NS== "non-NS" , ]
1 2 D <- results.w[ results.w$ amy== "F" & results.w$ NS== "non-NS" , "counts.sum" ] / total - F.freq* nonNS.freq D
1 2 3 4 rho <- D/ sqrt ( S.freq* F.freq* NS.freq* nonNS.freq ) rho
1 2 Chi.square <- rho^ 2 * total Chi.square
1 2 observed <- results.w[ , "counts.sum" ] observed
1 2 3 4 5 expected <- c ( ( nonNS.freq* F.freq/ total) , ( NS.freq* F.freq/ total) , ( nonNS.freq* S.freq/ total) , ( NS.freq* S.freq/ total) ) expected
1 chisq.test( observed, p = expected, rescale.p= TRUE , correct= FALSE )
Problem E This is the same as E only now we use aGpdh instead of amy
1 2 3 4 5 6 7 8 results.w <- reshape( results, idvar= c ( "aGpdh" , "NS" ) , v.names= "counts" , timevar = "amy" , direction= "wide" ) results.w$ counts.sum <- as.numeric ( results.w$ counts.F) + as.numeric ( results.w$ counts.S) results.w
1 2 3 total <- sum ( results.w$ counts.sum) total
1 2 3 4 5 S.table <- results.w[ results.w$ aGpdh== "S" , ] S.freq <- sum ( S.table$ counts.sum) / total S.freq
1 2 3 F.table <- results.w[ results.w$ aGpdh== "F" , ] F.freq <- sum ( F.table$ counts.sum) / total F.freq
1 2 3 4 5 NS.table <- results.w[ results.w$ NS== "NS" , ] NS.freq <- sum ( NS.table$ counts.sum) / total NS.freq
1 2 3 nonNS.table <- results.w[ results.w$ NS== "non-NS" , ] nonNS.freq <- sum ( nonNS.table$ counts.sum) / total nonNS.freq
1 2 3 4 5 6 results.w[ results.w$ aGpdh== "F" & results.w$ NS== "non-NS" , ]
1 2 D <- results.w[ results.w$ aGpdh== "F" & results.w$ NS== "non-NS" , "counts.sum" ] / total - F.freq* nonNS.freq D
1 2 3 4 rho <- D/ sqrt ( S.freq* F.freq* NS.freq* nonNS.freq ) rho
1 2 Chi.square <- rho^ 2 * total Chi.square
1 2 observed <- results.w[ , "counts.sum" ] observed
1 2 3 4 5 expected <- c ( ( nonNS.freq* F.freq/ total) , ( NS.freq* F.freq/ total) , ( nonNS.freq* S.freq/ total) , ( NS.freq* S.freq/ total) ) expected
1 chisq.test( observed, p = expected, rescale.p= TRUE , correct= FALSE )
Note that the degrees of freedom of 3 used by R is inappropriate. For 1 degree of freedon (4 - 1 (sample size) -1 (estimating p1) -1 (estimating p2) = 1 ) you must read a p ~0.07 off a chi square table. Do not reject the null hypothesis ( independence or linkage equilibrium) and so conclude linkage equilibrium.
## Box H ##
Problem A For an autosomal gene the paths are:
GC
A E: $(\frac{1}{2})^4*(1+1) = \frac{1}{16}*2 = \frac{8}{64}$
GD
A E: $(\frac{1}{2})^4*(1+1) = \frac{1}{16}*2 = \frac{8}{64}$
GD
B E: $(\frac{1}{2})^4*(1+\frac{1}{4}) = \frac{1}{16}*\frac{5}{4} = \frac{5}{64}$
Total: $\frac{8}{64} + \frac{8}{64} + \frac{5}{64} = \frac{21}{64}$
Problem B For a sex linked gene:
GCA E: CAE are male so this path is not included GDA E: AE are male so this path is not included
GD
B E: $(\frac{1}{2})^{3}*(1+\frac{1}{4}) = \frac{1}{8}*\frac{5}{4} = \frac{5}{32}$
Total: $\displaystyle \frac{5}{32}$