2014-02-01

Population Genetics Chpt 2 Box

Chapter 2 Box Problems

## Box A

Problem a

Let’s call PGI-2a ‘a’ and PGI-2b ‘b’

phenotypes <- c("aa","ab","bb")
observed <- c(35,19,3)
total <- sum(observed)


Freq.a <- (2*35 + 19)/(2*total)
Freq.b <- (19 + 2*3)/(2*total)

expected <- c((Freq.a^2)*total, 2*(Freq.a*Freq.b)*total, (Freq.b^2)*total)
expected <- round(expected, 2)

data.frame( cbind( phenotypes, observed, expected ))

##   phenotypes observed expected
## 1         aa       35    34.74
## 2         ab       19    19.52
## 3         bb        3     2.74

Problem b

Set up a data.frame to hold the Drosophila mobility and inversion data

aGpdh <- c( rep("F",4), rep("S",4))
amy <- rep(c("F","F","S","S"), 2)
NS <- rep( c("non-NS","NS"), 4)
counts <- c(726,90,111,1,172,32,26,0)

results <- data.frame( cbind(aGpdh, amy, NS, counts), stringsAsFactors=FALSE)

results$counts <- as.numeric(results$counts)
results

##   aGpdh amy     NS counts
## 1     F   F non-NS    726
## 2     F   F     NS     90
## 3     F   S non-NS    111
## 4     F   S     NS      1
## 5     S   F non-NS    172
## 6     S   F     NS     32
## 7     S   S non-NS     26
## 8     S   S     NS      0

#calculate the total number of observations
total <- sum(results$counts)
total
## [1] 1158

#part b: calculate the aGpdh (F)ast and (S)low frequencies

S.table <- results[ results$aGpdh=="S",]
S.freq <- sum(S.table$counts)/total
S.freq

## [1] 0.1986183

1
2
3

F.table <- results[ results$aGpdh=="F",]
F.freq <- sum(F.table$counts)/total
F.freq

1	## [1] 0.8013817

#part c: calculate the Amy (F)ast and (S)low frequencies

S.table <- results[ results$amy=="S",]
S.freq <- sum(S.table$counts)/total
S.freq

1	## [1] 0.119171

1
2
3

F.table <- results[ results$amy=="F",]
F.freq <- sum(F.table$counts)/total
F.freq

1	## [1] 0.880829

#part d: calculate the NS inversion bearing frequencies

NS.table <- results[ results$NS=="NS",]
NS.freq <- sum(NS.table$counts)/total
NS.freq

1	## [1] 0.1062176

1
2
3

nonNS.table <- results[ results$NS=="non-NS",]
nonNS.freq <- sum(nonNS.table$counts)/total
nonNS.freq

1	## [1] 0.8937824

## Box C

Problem A

#maximum age = m = 4
m <- 4

# i is the age class
 i <- c(0,1,2,3,m)

#s probability of survival from age class i to i+1
s <- c( 0.5, 0.5, 0.5, 0.5, 0)

# b is the average number of offspring born to an individual in age class i

b <- c(0, 1, 1.5, 1, 0)

#using the characteristic equation, create a vector ce  where each element holds an element of the equation.  As there are 5 elements create a 5 element vector

ce <- vector("numeric", 5)
ce
## [1] 0 0 0 0 0

lambda <- 1 #using lambda = 1


ce[1] <- lambda
ce[2] <- s[1]*lambda^(-1)
ce[3] <- s[1]*s[2]*lambda^(-2)
ce[4] <- s[1]*s[2]*s[3]*lambda^(-3)
ce[5] <- s[1]*s[2]*s[3]*s[4]*lambda^(-4)
ce

1	## [1] 1.0000 0.5000 0.2500 0.1250 0.0625

Problem b

$$N_{t^{*}} = N_0\lambda^{t^{*}}=2N_0$$ $$N_0\lambda^{t^{*}}=2N_0$$ $$\lambda^{t^{*}}=2$$ $${t^{*}}\ln(\lambda)=\ln(2)$$ $$\ln(\lambda)=\frac{\ln(2)}{t^{*}}$$ $$\lambda=e^{(\frac{\ln(2)}{t^{*}})}$$

#Calculate lambda for Sweden and Mexico

t.star <- 173  #Sweden
lambda <- exp(log(2)/t.star)
lambda

1	## [1] 1.004015

1
2
3

t.star <- 19.8  #Mexico
lambda <- exp(log(2)/t.star)
lambda

1	## [1] 1.035627

## Box D ##

Problem A

Using the table I extract coefficients fromt the “Offspring Frequency” Aa column and apply to the “Frequency of Mating” column to obtain the frequency of heterozygotes in the next generation, Q’:

$$Q’ = \frac{1}{2}(2PQ) + 2PR + \frac{1}{2}Q^2 + \frac{1}{2}(2QR)$$

$$ = PQ + 2PR + \frac{1}{2}Q^2 + QR$$

Factor out Q/2 + R

$$= 2P(\frac{Q}{2} + R) + Q(\frac{Q}{2}+R)$$
$$= (2P + Q)(\frac{Q}{2} + R)$$
$$= 2(P + \frac{Q}{2})( \frac{Q}{2} + R)$$
Given that p = P + Q/2 and q= Q/2+R = 2pq

## Box E ##

Problem A

total <- 2060
O <- 702/total
A <- 862/total
B <- 365/total
AB <- 131/total

p <- 1-sqrt(B+O)
q <- 1-sqrt(A+O)
r <- sqrt(O)

p

## [1] 0.2803048

q
## [1] 0.1286658

r
## [1] 0.5837608

#total p+q+r
p+q+r
## [1] 0.9927314

Problem B

theta <- 1-p-q-r
theta
## [1] 0.007268573

p.final <- p*(1+theta/2)
q.final <- q*(1+theta/2)
r.final <- (r+theta/2)*(1+theta/2)

p.final
## [1] 0.2813235

q.final
## [1] 0.1291334

r.final
## [1] 0.5895299

#total p.final+q.final+r.final
p.final+q.final+r.final
## [1] 0.9999868

Problem C

O.expected <- r.final^2 #probability of O
O.expected*total
## [1] 715.9437

A.expected <- p.final^2 + 2*p.final*r.final #probability of A
A.expected*total
## [1] 846.3307

B.expected <- q.final^2 + 2*q.final*r.final  #probability of B
B.expected*total
## [1] 347.9987

AB.expected <- 2*p.final*q.final #probability of AB
AB.expected*total
## [1] 149.6724

#total
O.expected + A.expected + B.expected + AB.expected
## [1] 0.9999736

Problem D

observed <- c(O,A,B,AB)
expected <- c(O.expected,A.expected,B.expected,AB.expected)

chisq.test( observed, p = expected, correct=FALSE )

1	## Error in chisq.test(observed, p = expected, correct = FALSE): probabilities must sum to 1.

expected

1	## [1] 0.34754547 0.41084016 0.16893143 0.07265653

1	sum(expected)

1	## [1] 0.9999736

1 2	#argument rescale.p allows for rescaling the p values so they sum to 1 chisq.test( observed, p = expected, rescale.p=TRUE, correct=FALSE )

1 2	## Warning in chisq.test(observed, p = expected, rescale.p = TRUE, correct = ## FALSE): Chi-squared approximation may be incorrect

## 
## 	Chi-squared test for given probabilities
## 
## data:  observed
## X-squared = 0.0018066, df = 3, p-value = 1

1 2	observed <- c(701,862,365,131) chisq.test( observed, p = expected, rescale.p=TRUE, correct=FALSE )

## 
## 	Chi-squared test for given probabilities
## 
## data:  observed
## X-squared = 3.7634, df = 3, p-value = 0.2882

Though the $X^2$ value is correct I cannot modify the degrees of freedom using chisq.test.
Try a manual maximum liklihood instead.

#lowercase = genotypes
#uppercase = phenotypes


n.A <- 862
n.AB <- 131
n.B <- 365
n.OO<- 702
N <- sum( n.A, n.AB, n.B, n.OO)

#first guess - wildest guess is all alleles about equal

p.a <- 0.33
p.b <- 0.33
p.o <- 0.34

num.iter <- 6 #number of estimate iterations

#Build a data.frame to hold results from each iteration

results <- data.frame( matrix( nrow=num.iter, ncol=10))
names(results) <- c("iter","Naa","Nao","Nbb","Nbo","Nab","Noo","p.a","p.b","p.o")

for( i in 1:num.iter){

    Naa <- n.A*(p.a^2/(p.a^2+2*p.a*p.o))
    Nao <- n.A*((2*p.a*p.o)/(p.a^2+2*p.a*p.o))

    Nbb <- n.B*(p.b^2/(p.b^2+2*p.b*p.o))
    Nbo <- n.B*((2*p.b*p.o)/(p.b^2+2*p.b*p.o))

    Nab <- n.AB
    Noo <- n.OO



#reassignment of frequencies

    p.a <- (2*Naa + Nao + Nab)/(2*N)
    p.b <- (2*Nbb + Nbo + Nab)/(2*N)
    p.o <- (2*Noo + Nao + Nbo)/(2*N)

    results[i,] <- c(i,Naa,Nao,Nbb,Nbo,Nab,Noo,p.a,p.b,p.o)

}

results

##   iter      Naa      Nao       Nbb      Nbo Nab Noo       p.a       p.b
## 1    1 281.6436 580.3564 119.25743 245.7426 131 702 0.3093795 0.1493343
## 2    2 191.5908 670.4092  44.24607 320.7539 131 702 0.2875220 0.1311277
## 3    3 170.9007 691.0993  36.99224 328.0078 131 702 0.2825002 0.1293670
## 4    4 166.9323 695.0677  36.16559 328.8344 131 702 0.2815370 0.1291664
## 5    5 166.2077 695.7923  36.05077 328.9492 131 702 0.2813611 0.1291385
## 6    6 166.0775 695.9225  36.03253 328.9675 131 702 0.2813295 0.1291341
##         p.o
## 1 0.5412862
## 2 0.5813503
## 3 0.5881328
## 4 0.5892966
## 5 0.5895004
## 6 0.5895364

Converges to 3 significant figures after about 4 iterations.

## Box F ##

Problem A

Given:

$$m_n = f_{n-1}$$ $$f_n = \frac{1}{2}(m_{n-1} + f_{n-1})$$ Then: $$f_n - m_n = \frac{1}{2}(m_{n-1} + f_{n-1}) - f_{n-1}$$ $$f_n - m_n = \frac{1}{2}(m_{n-1} + f_{n-1} - 2f_{n-1})$$ $$f_n - m_n = \frac{1}{2}(m_{n-1} - f_{n-1})$$ $$f_n - m_n = -\frac{1}{2}(f_{n-1} - m_{n-1})$$

Problem B

Given the expression for the current generation:

$$\frac{2}{3}(f_n) + \frac{1}{3}(m_n)$$ Substitute in: $$m_{n} = f_{n-1}$$ $$f_n = \frac{1}{2}(m_{n-1} + f_{n-1})$$ to get: $$\frac{2}{3}[\frac{1}{2}(m_{n-1}+f_{n-1})]+\frac{1}{3}(f_{n-1})$$ $$\frac{1}{3}(m_{n-1}+f_{n-1})+\frac{1}{3}(f_{n-1})$$
$$\frac{1}{3}m_{n-1}+\frac{2}{3}(f_{n-1})$$

Which is the expression for the previous generation - the same expression as the current generation.

Problem C

Set up a vector to handle the frequencies, noting that the vector index will be one off from the generation.

m <- vector( mode="numeric", length = 7)
f <- vector( mode="numeric", length = 7)

m[1] <- 0.2
f[1] <- 0.8

for( i in 2:7){
     m[i] <- f[i-1]
     f[i] <- 0.5*(m[i-1] + f[i-1])
}

results <- data.frame( cbind(m, f))
results

##         m        f
## 1 0.20000 0.800000
## 2 0.80000 0.500000
## 3 0.50000 0.650000
## 4 0.65000 0.575000
## 5 0.57500 0.612500
## 6 0.61250 0.593750
## 7 0.59375 0.603125

p <- ((2/3)*f[1] + (1/3)*m[1]) #ultimate frequency in both sexes
q <- 1-p

p; q

1	## [1] 0.6

1	## [1] 0.4

1
2
3

#in females
#AA    Aa      aa
p^2;   2*p*q;  q^2

1	## [1] 0.36

1	## [1] 0.48

1	## [1] 0.16

1
2
3

# in males
# A    a
  p;   q

1	## [1] 0.6

1	## [1] 0.4

## Box G ##

Problem A

A1 allele frequency $$p_1 = P_{11} + P_{12}$$ A2 allel frequency $$p_2 = P_{21} + P_{22}$$ B1 allele frequency $$q1 = P_{11} + P_{21}$$ B2 allel frequency $$q2 = P_{12} + P_{22}$$ disequilibrium parameter $$D = P_{11}*P_{22} - P_{12}*P_{21}$$
Show that $P_{11} = p_1q_1 + D$

Substitute for $p_1, q_1, D$ $$P_{11} = (P_{11} + P_{12})(P_{11} + p_{21}) + (P_{11}*P_{22} - P_{12}*p_{21})$$ $$P_{11} = P_{11}*P_{11} + P_{11}*p_{21} + P_{12}*P_{11} + P_{12}*p_{21} + P_{11}*P_{22} - P_{12}*p_{21}$$ $$P_{11} = P_{11}*P_{11} + P_{11}*p_{21} + P_{12}*P_{11} + P_{11}*P_{22}$$ $$P_{11} = P_{11}*(P_{11} + p_{21} + P_{12} + P_{22})$$ Noting that $$P_{11} + P_{21} + P_{12} + P_{22} = 1$$ $$P_{11} = P_{11}*1$$

Problem D

aGpdh <- c( rep("F",4), rep("S",4))
amy <- rep(c("F","F","S","S"), 2)
NS <- rep( c("non-NS","NS"), 4)
counts <- c(726,90,111,1,172,32,26,0)

results <- data.frame( cbind(aGpdh, amy, NS, counts), stringsAsFactors=FALSE)

# note that we are no longer interested in aGpdh.
# use aGpdh as a "replicates" indicator and reprocess the table to combine replicate
#counts prior to calculating frequencies

results.w <- reshape( results, idvar= c("amy","NS"), v.names="counts", timevar = "aGpdh", direction="wide")

results.w$counts.sum <- as.numeric(results.w$counts.F) + as.numeric(results.w$counts.S)
results.w

##   amy     NS counts.F counts.S counts.sum
## 1   F non-NS      726      172        898
## 2   F     NS       90       32        122
## 3   S non-NS      111       26        137
## 4   S     NS        1        0          1

1
2
3

#calculate the total number of observations
total <- sum(results.w$counts.sum)
total

1	## [1] 1158

#part c: calculate the Amy (F)ast and (S)low frequencies

S.table <- results.w[ results.w$amy=="S",]
S.freq <- sum(S.table$counts.sum)/total
S.freq
## [1] 0.119171

F.table <- results.w[ results.w$amy=="F",]
F.freq <- sum(F.table$counts.sum)/total
F.freq

1	## [1] 0.880829

#part d: calculate the NS inversion bearing frequencies

NS.table <- results.w[ results.w$NS=="NS",]
NS.freq <- sum(NS.table$counts.sum)/total
NS.freq

1	## [1] 0.1062176

1
2
3

nonNS.table <- results.w[ results.w$NS=="non-NS",]
nonNS.freq <- sum(nonNS.table$counts.sum)/total
nonNS.freq

1	## [1] 0.8937824

#D <- P11 - p1q1,  -.011794
# Let's use amy "F" and NS "non-NS" as P11. and calculate D from there
# First, pull ot the relevent row then calculate the ratio

results.w[ results.w$amy=="F" & results.w$NS=="non-NS",]

1 2	## amy NS counts.F counts.S counts.sum ## 1 F non-NS 726 172 898

1 2	D <- results.w[ results.w$amy=="F" & results.w$NS=="non-NS", "counts.sum"]/total - F.freq*nonNS.freq D

1	## [1] -0.0117945

# rho = D/sqrt(p1p2q1q2)

rho <- D/sqrt(S.freq*F.freq*NS.freq*nonNS.freq )
rho

1	## [1] -0.1181502

1 2	Chi.square <- rho^2*total Chi.square

1	## [1] 16.16505

1 2	observed <- results.w[, "counts.sum"] observed

1	## [1] 898 122 137 1

expected <- c( (nonNS.freq*F.freq/total),
	       (NS.freq*F.freq/total),
	       (nonNS.freq*S.freq/total),
	       (NS.freq*S.freq/total)) 
expected

1	## [1] 6.798527e-04 8.079409e-05 9.198007e-05 1.093097e-05

1	chisq.test( observed, p = expected, rescale.p=TRUE, correct=FALSE )

## 
## 	Chi-squared test for given probabilities
## 
## data:  observed
## X-squared = 16.165, df = 3, p-value = 0.001049

Problem E

This is the same as E only now we use aGpdh instead of amy

# note that we are no longer interested in amy.
# use amy as a "replicates" indicator and reprocess the table to combine replicate
#counts prior to calculating frequencies

results.w <- reshape( results, idvar= c("aGpdh","NS"), v.names="counts", timevar = "amy", direction="wide")

results.w$counts.sum <- as.numeric(results.w$counts.F) + as.numeric(results.w$counts.S)
results.w

##   aGpdh     NS counts.F counts.S counts.sum
## 1     F non-NS      726      111        837
## 2     F     NS       90        1         91
## 5     S non-NS      172       26        198
## 6     S     NS       32        0         32

1
2
3

#calculate the total number of observations
total <- sum(results.w$counts.sum)
total

1	## [1] 1158

#part c: calculate the Amy (F)ast and (S)low frequencies

S.table <- results.w[ results.w$aGpdh=="S",]
S.freq <- sum(S.table$counts.sum)/total
S.freq

1	## [1] 0.1986183

1
2
3

F.table <- results.w[ results.w$aGpdh=="F",]
F.freq <- sum(F.table$counts.sum)/total
F.freq

1	## [1] 0.8013817

#part d: calculate the NS inversion bearing frequencies

NS.table <- results.w[ results.w$NS=="NS",]
NS.freq <- sum(NS.table$counts.sum)/total
NS.freq

1	## [1] 0.1062176

1
2
3

nonNS.table <- results.w[ results.w$NS=="non-NS",]
nonNS.freq <- sum(nonNS.table$counts.sum)/total
nonNS.freq

1	## [1] 0.8937824

#D <- P11 - p1q1

# Let's use aGpdh "F" and NS "non-NS" as P11. and calculate D from there
# First, pull ot the relevent row then calculate the ratio

results.w[ results.w$aGpdh=="F" & results.w$NS=="non-NS",]

1 2	## aGpdh NS counts.F counts.S counts.sum ## 1 F non-NS 726 111 837

1 2	D <- results.w[ results.w$aGpdh=="F" & results.w$NS=="non-NS", "counts.sum"]/total - F.freq*nonNS.freq D

1	## [1] 0.006537088

# rho = D/sqrt(p1p2q1q2)

rho <- D/sqrt(S.freq*F.freq*NS.freq*nonNS.freq )
rho

1	## [1] 0.05317908

1 2	Chi.square <- rho^2*total Chi.square

1	## [1] 3.274841

1 2	observed <- results.w[, "counts.sum"] observed

1	## [1] 837 91 198 32

expected <- c( (nonNS.freq*F.freq/total),
	       (NS.freq*F.freq/total),
	       (nonNS.freq*S.freq/total),
	       (NS.freq*S.freq/total)) 
expected

1	## [1] 6.185327e-04 7.350678e-05 1.533001e-04 1.821828e-05

1	chisq.test( observed, p = expected, rescale.p=TRUE, correct=FALSE )

## 
## 	Chi-squared test for given probabilities
## 
## data:  observed
## X-squared = 3.2748, df = 3, p-value = 0.3512

Note that the degrees of freedom of 3 used by R is inappropriate. For 1 degree of freedon (4 - 1 (sample size) -1 (estimating p1) -1 (estimating p2) = 1 ) you must read a p ~0.07 off a chi square table. Do not reject the null hypothesis ( independence or linkage equilibrium) and so conclude linkage equilibrium.

## Box H ##

Problem A

For an autosomal gene the paths are:

GCAE: $(\frac{1}{2})^4*(1+1) = \frac{1}{16}*2 = \frac{8}{64}$
GDAE: $(\frac{1}{2})^4*(1+1) = \frac{1}{16}*2 = \frac{8}{64}$
GDBE: $(\frac{1}{2})^4*(1+\frac{1}{4}) = \frac{1}{16}*\frac{5}{4} = \frac{5}{64}$

Total: $\frac{8}{64} + \frac{8}{64} + \frac{5}{64} = \frac{21}{64}$

Problem B

For a sex linked gene:

GCAE: CAE are male so this path is not included

GDAE: AE are male so this path is not included

GDBE: $(\frac{1}{2})^{3}*(1+\frac{1}{4}) = \frac{1}{8}*\frac{5}{4} = \frac{5}{32}$
Total: $\displaystyle \frac{5}{32}$