Population Genetics Chpt 2 Box

Chapter 2 Box Problems


## Box A

Problem a

Let’s call PGI-2a ‘a’ and PGI-2b ‘b’

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
phenotypes <- c("aa","ab","bb")
observed <- c(35,19,3)
total <- sum(observed)


Freq.a <- (2*35 + 19)/(2*total)
Freq.b <- (19 + 2*3)/(2*total)

expected <- c((Freq.a^2)*total, 2*(Freq.a*Freq.b)*total, (Freq.b^2)*total)
expected <- round(expected, 2)

data.frame( cbind( phenotypes, observed, expected ))

## phenotypes observed expected
## 1 aa 35 34.74
## 2 ab 19 19.52
## 3 bb 3 2.74

Problem b

Set up a data.frame to hold the Drosophila mobility and inversion data

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
aGpdh <- c( rep("F",4), rep("S",4))
amy <- rep(c("F","F","S","S"), 2)
NS <- rep( c("non-NS","NS"), 4)
counts <- c(726,90,111,1,172,32,26,0)

results <- data.frame( cbind(aGpdh, amy, NS, counts), stringsAsFactors=FALSE)

results$counts <- as.numeric(results$counts)
results

## aGpdh amy NS counts
## 1 F F non-NS 726
## 2 F F NS 90
## 3 F S non-NS 111
## 4 F S NS 1
## 5 S F non-NS 172
## 6 S F NS 32
## 7 S S non-NS 26
## 8 S S NS 0
1
2
3
4
5
6
7
8
9
10
11
12
#calculate the total number of observations
total <- sum(results$counts)
total
## [1] 1158

#part b: calculate the aGpdh (F)ast and (S)low frequencies

S.table <- results[ results$aGpdh=="S",]
S.freq <- sum(S.table$counts)/total
S.freq

## [1] 0.1986183
1
2
3
F.table <- results[ results$aGpdh=="F",]
F.freq <- sum(F.table$counts)/total
F.freq
1
## [1] 0.8013817
1
2
3
4
5
#part c: calculate the Amy (F)ast and (S)low frequencies

S.table <- results[ results$amy=="S",]
S.freq <- sum(S.table$counts)/total
S.freq
1
## [1] 0.119171
1
2
3
F.table <- results[ results$amy=="F",]
F.freq <- sum(F.table$counts)/total
F.freq
1
## [1] 0.880829
1
2
3
4
5
#part d: calculate the NS inversion bearing frequencies

NS.table <- results[ results$NS=="NS",]
NS.freq <- sum(NS.table$counts)/total
NS.freq
1
## [1] 0.1062176
1
2
3
nonNS.table <- results[ results$NS=="non-NS",]
nonNS.freq <- sum(nonNS.table$counts)/total
nonNS.freq
1
## [1] 0.8937824

## Box C

Problem A

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#maximum age = m = 4
m <- 4

# i is the age class
i <- c(0,1,2,3,m)

#s probability of survival from age class i to i+1
s <- c( 0.5, 0.5, 0.5, 0.5, 0)

# b is the average number of offspring born to an individual in age class i

b <- c(0, 1, 1.5, 1, 0)

#using the characteristic equation, create a vector ce where each element holds an element of the equation. As there are 5 elements create a 5 element vector

ce <- vector("numeric", 5)
ce
## [1] 0 0 0 0 0
1
2
3
4
5
6
7
8
9
lambda <- 1 #using lambda = 1


ce[1] <- lambda
ce[2] <- s[1]*lambda^(-1)
ce[3] <- s[1]*s[2]*lambda^(-2)
ce[4] <- s[1]*s[2]*s[3]*lambda^(-3)
ce[5] <- s[1]*s[2]*s[3]*s[4]*lambda^(-4)
ce
1
## [1] 1.0000 0.5000 0.2500 0.1250 0.0625

Problem b

$$N_{t^{*}} = N_0\lambda^{t^{*}}=2N_0$$ $$N_0\lambda^{t^{*}}=2N_0$$ $$\lambda^{t^{*}}=2$$ $${t^{*}}\ln(\lambda)=\ln(2)$$ $$\ln(\lambda)=\frac{\ln(2)}{t^{*}}$$ $$\lambda=e^{(\frac{\ln(2)}{t^{*}})}$$
1
2
3
4
5
#Calculate lambda for Sweden and Mexico

t.star <- 173 #Sweden
lambda <- exp(log(2)/t.star)
lambda
1
## [1] 1.004015
1
2
3
t.star <- 19.8  #Mexico
lambda <- exp(log(2)/t.star)
lambda
1
## [1] 1.035627

## Box D ##

Problem A

Using the table I extract coefficients fromt the “Offspring Frequency” Aa column and apply to the “Frequency of Mating” column to obtain the frequency of heterozygotes in the next generation, Q’:

$$Q’ = \frac{1}{2}(2PQ) + 2PR + \frac{1}{2}Q^2 + \frac{1}{2}(2QR)$$

$$ = PQ + 2PR + \frac{1}{2}Q^2 + QR$$

Factor out Q/2 + R

$$= 2P(\frac{Q}{2} + R) + Q(\frac{Q}{2}+R)$$
$$= (2P + Q)(\frac{Q}{2} + R)$$
$$= 2(P + \frac{Q}{2})( \frac{Q}{2} + R)$$
Given that p = P + Q/2 and q= Q/2+R = 2pq


## Box E ##

Problem A

1
2
3
4
5
6
7
8
9
10
11
total <- 2060
O <- 702/total
A <- 862/total
B <- 365/total
AB <- 131/total

p <- 1-sqrt(B+O)
q <- 1-sqrt(A+O)
r <- sqrt(O)

p
1
2
3
4
5
6
7
8
9
10
11
## [1] 0.2803048

q
## [1] 0.1286658

r
## [1] 0.5837608

#total p+q+r
p+q+r
## [1] 0.9927314

Problem B

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
theta <- 1-p-q-r
theta
## [1] 0.007268573

p.final <- p*(1+theta/2)
q.final <- q*(1+theta/2)
r.final <- (r+theta/2)*(1+theta/2)

p.final
## [1] 0.2813235

q.final
## [1] 0.1291334

r.final
## [1] 0.5895299

#total p.final+q.final+r.final
p.final+q.final+r.final
## [1] 0.9999868

Problem C

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
O.expected <- r.final^2 #probability of O
O.expected*total
## [1] 715.9437

A.expected <- p.final^2 + 2*p.final*r.final #probability of A
A.expected*total
## [1] 846.3307

B.expected <- q.final^2 + 2*q.final*r.final #probability of B
B.expected*total
## [1] 347.9987

AB.expected <- 2*p.final*q.final #probability of AB
AB.expected*total
## [1] 149.6724

#total
O.expected + A.expected + B.expected + AB.expected
## [1] 0.9999736

Problem D

1
2
3
4
observed <- c(O,A,B,AB)
expected <- c(O.expected,A.expected,B.expected,AB.expected)

chisq.test( observed, p = expected, correct=FALSE )
1
## Error in chisq.test(observed, p = expected, correct = FALSE): probabilities must sum to 1.
1
expected
1
## [1] 0.34754547 0.41084016 0.16893143 0.07265653
1
sum(expected)
1
## [1] 0.9999736
1
2
#argument rescale.p allows for rescaling the p values so they sum to 1
chisq.test( observed, p = expected, rescale.p=TRUE, correct=FALSE )
1
2
## Warning in chisq.test(observed, p = expected, rescale.p = TRUE, correct =
## FALSE): Chi-squared approximation may be incorrect
1
2
3
4
5
## 
## Chi-squared test for given probabilities
##
## data: observed
## X-squared = 0.0018066, df = 3, p-value = 1
1
2
observed <- c(701,862,365,131)
chisq.test( observed, p = expected, rescale.p=TRUE, correct=FALSE )
1
2
3
4
5
## 
## Chi-squared test for given probabilities
##
## data: observed
## X-squared = 3.7634, df = 3, p-value = 0.2882

Though the \(X^2\) value is correct I cannot modify the degrees of freedom using chisq.test.
Try a manual maximum liklihood instead.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#lowercase = genotypes
#uppercase = phenotypes


n.A <- 862
n.AB <- 131
n.B <- 365
n.OO<- 702
N <- sum( n.A, n.AB, n.B, n.OO)

#first guess - wildest guess is all alleles about equal

p.a <- 0.33
p.b <- 0.33
p.o <- 0.34

num.iter <- 6 #number of estimate iterations

#Build a data.frame to hold results from each iteration

results <- data.frame( matrix( nrow=num.iter, ncol=10))
names(results) <- c("iter","Naa","Nao","Nbb","Nbo","Nab","Noo","p.a","p.b","p.o")

for( i in 1:num.iter){

Naa <- n.A*(p.a^2/(p.a^2+2*p.a*p.o))
Nao <- n.A*((2*p.a*p.o)/(p.a^2+2*p.a*p.o))

Nbb <- n.B*(p.b^2/(p.b^2+2*p.b*p.o))
Nbo <- n.B*((2*p.b*p.o)/(p.b^2+2*p.b*p.o))

Nab <- n.AB
Noo <- n.OO



#reassignment of frequencies

p.a <- (2*Naa + Nao + Nab)/(2*N)
p.b <- (2*Nbb + Nbo + Nab)/(2*N)
p.o <- (2*Noo + Nao + Nbo)/(2*N)

results[i,] <- c(i,Naa,Nao,Nbb,Nbo,Nab,Noo,p.a,p.b,p.o)

}

results

## iter Naa Nao Nbb Nbo Nab Noo p.a p.b
## 1 1 281.6436 580.3564 119.25743 245.7426 131 702 0.3093795 0.1493343
## 2 2 191.5908 670.4092 44.24607 320.7539 131 702 0.2875220 0.1311277
## 3 3 170.9007 691.0993 36.99224 328.0078 131 702 0.2825002 0.1293670
## 4 4 166.9323 695.0677 36.16559 328.8344 131 702 0.2815370 0.1291664
## 5 5 166.2077 695.7923 36.05077 328.9492 131 702 0.2813611 0.1291385
## 6 6 166.0775 695.9225 36.03253 328.9675 131 702 0.2813295 0.1291341
## p.o
## 1 0.5412862
## 2 0.5813503
## 3 0.5881328
## 4 0.5892966
## 5 0.5895004
## 6 0.5895364

Converges to 3 significant figures after about 4 iterations.


## Box F ##

Problem A

Given:

$$m_n = f_{n-1}$$ $$f_n = \frac{1}{2}(m_{n-1} + f_{n-1})$$ Then: $$f_n - m_n = \frac{1}{2}(m_{n-1} + f_{n-1}) - f_{n-1}$$ $$f_n - m_n = \frac{1}{2}(m_{n-1} + f_{n-1} - 2f_{n-1})$$ $$f_n - m_n = \frac{1}{2}(m_{n-1} - f_{n-1})$$ $$f_n - m_n = -\frac{1}{2}(f_{n-1} - m_{n-1})$$

Problem B

Given the expression for the current generation:

$$\frac{2}{3}(f_n) + \frac{1}{3}(m_n)$$ Substitute in: $$m_{n} = f_{n-1}$$ $$f_n = \frac{1}{2}(m_{n-1} + f_{n-1})$$ to get: $$\frac{2}{3}[\frac{1}{2}(m_{n-1}+f_{n-1})]+\frac{1}{3}(f_{n-1})$$ $$\frac{1}{3}(m_{n-1}+f_{n-1})+\frac{1}{3}(f_{n-1})$$
$$\frac{1}{3}m_{n-1}+\frac{2}{3}(f_{n-1})$$

Which is the expression for the previous generation - the same expression as the current generation.

Problem C

Set up a vector to handle the frequencies, noting that the vector index will be one off from the generation.

1
2
3
4
5
6
7
8
9
10
11
12
13
m <- vector( mode="numeric", length = 7)
f <- vector( mode="numeric", length = 7)

m[1] <- 0.2
f[1] <- 0.8

for( i in 2:7){
m[i] <- f[i-1]
f[i] <- 0.5*(m[i-1] + f[i-1])
}

results <- data.frame( cbind(m, f))
results
1
2
3
4
5
6
7
8
##         m        f
## 1 0.20000 0.800000
## 2 0.80000 0.500000
## 3 0.50000 0.650000
## 4 0.65000 0.575000
## 5 0.57500 0.612500
## 6 0.61250 0.593750
## 7 0.59375 0.603125
1
2
3
4
p <- ((2/3)*f[1] + (1/3)*m[1]) #ultimate frequency in both sexes
q <- 1-p

p; q
1
## [1] 0.6
1
## [1] 0.4
1
2
3
#in females
#AA Aa aa
p^2; 2*p*q; q^2
1
## [1] 0.36
1
## [1] 0.48
1
## [1] 0.16
1
2
3
# in males
# A a
p; q
1
## [1] 0.6
1
## [1] 0.4

## Box G ##

Problem A

A1 allele frequency $$p_1 = P_{11} + P_{12}$$ A2 allel frequency $$p_2 = P_{21} + P_{22}$$ B1 allele frequency $$q1 = P_{11} + P_{21}$$ B2 allel frequency $$q2 = P_{12} + P_{22}$$ disequilibrium parameter $$D = P_{11}*P_{22} - P_{12}*P_{21}$$
Show that $P_{11} = p_1q_1 + D$

Substitute for $p_1, q_1, D$ $$P_{11} = (P_{11} + P_{12})(P_{11} + p_{21}) + (P_{11}*P_{22} - P_{12}*p_{21})$$ $$P_{11} = P_{11}*P_{11} + P_{11}*p_{21} + P_{12}*P_{11} + P_{12}*p_{21} + P_{11}*P_{22} - P_{12}*p_{21}$$ $$P_{11} = P_{11}*P_{11} + P_{11}*p_{21} + P_{12}*P_{11} + P_{11}*P_{22}$$ $$P_{11} = P_{11}*(P_{11} + p_{21} + P_{12} + P_{22})$$ Noting that $$P_{11} + P_{21} + P_{12} + P_{22} = 1$$ $$P_{11} = P_{11}*1$$

Problem D

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
aGpdh <- c( rep("F",4), rep("S",4))
amy <- rep(c("F","F","S","S"), 2)
NS <- rep( c("non-NS","NS"), 4)
counts <- c(726,90,111,1,172,32,26,0)

results <- data.frame( cbind(aGpdh, amy, NS, counts), stringsAsFactors=FALSE)

# note that we are no longer interested in aGpdh.
# use aGpdh as a "replicates" indicator and reprocess the table to combine replicate
#counts prior to calculating frequencies

results.w <- reshape( results, idvar= c("amy","NS"), v.names="counts", timevar = "aGpdh", direction="wide")

results.w$counts.sum <- as.numeric(results.w$counts.F) + as.numeric(results.w$counts.S)
results.w
1
2
3
4
5
##   amy     NS counts.F counts.S counts.sum
## 1 F non-NS 726 172 898
## 2 F NS 90 32 122
## 3 S non-NS 111 26 137
## 4 S NS 1 0 1
1
2
3
#calculate the total number of observations
total <- sum(results.w$counts.sum)
total
1
## [1] 1158
1
2
3
4
5
6
7
8
9
10
#part c: calculate the Amy (F)ast and (S)low frequencies

S.table <- results.w[ results.w$amy=="S",]
S.freq <- sum(S.table$counts.sum)/total
S.freq
## [1] 0.119171

F.table <- results.w[ results.w$amy=="F",]
F.freq <- sum(F.table$counts.sum)/total
F.freq
1
## [1] 0.880829
1
2
3
4
5
#part d: calculate the NS inversion bearing frequencies

NS.table <- results.w[ results.w$NS=="NS",]
NS.freq <- sum(NS.table$counts.sum)/total
NS.freq
1
## [1] 0.1062176
1
2
3
nonNS.table <- results.w[ results.w$NS=="non-NS",]
nonNS.freq <- sum(nonNS.table$counts.sum)/total
nonNS.freq
1
## [1] 0.8937824
1
2
3
4
5
#D <- P11 - p1q1,  -.011794
# Let's use amy "F" and NS "non-NS" as P11. and calculate D from there
# First, pull ot the relevent row then calculate the ratio

results.w[ results.w$amy=="F" & results.w$NS=="non-NS",]
1
2
##   amy     NS counts.F counts.S counts.sum
## 1 F non-NS 726 172 898
1
2
D <- results.w[ results.w$amy=="F" & results.w$NS=="non-NS", "counts.sum"]/total - F.freq*nonNS.freq
D
1
## [1] -0.0117945
1
2
3
4
# rho = D/sqrt(p1p2q1q2)

rho <- D/sqrt(S.freq*F.freq*NS.freq*nonNS.freq )
rho
1
## [1] -0.1181502
1
2
Chi.square <- rho^2*total
Chi.square
1
## [1] 16.16505
1
2
observed <- results.w[, "counts.sum"]
observed
1
## [1] 898 122 137   1
1
2
3
4
5
expected <- c( (nonNS.freq*F.freq/total),
(NS.freq*F.freq/total),
(nonNS.freq*S.freq/total),
(NS.freq*S.freq/total))
expected
1
## [1] 6.798527e-04 8.079409e-05 9.198007e-05 1.093097e-05
1
chisq.test( observed, p = expected, rescale.p=TRUE, correct=FALSE )
1
2
3
4
5
## 
## Chi-squared test for given probabilities
##
## data: observed
## X-squared = 16.165, df = 3, p-value = 0.001049

Problem E

This is the same as E only now we use aGpdh instead of amy

1
2
3
4
5
6
7
8
# note that we are no longer interested in amy.
# use amy as a "replicates" indicator and reprocess the table to combine replicate
#counts prior to calculating frequencies

results.w <- reshape( results, idvar= c("aGpdh","NS"), v.names="counts", timevar = "amy", direction="wide")

results.w$counts.sum <- as.numeric(results.w$counts.F) + as.numeric(results.w$counts.S)
results.w
1
2
3
4
5
##   aGpdh     NS counts.F counts.S counts.sum
## 1 F non-NS 726 111 837
## 2 F NS 90 1 91
## 5 S non-NS 172 26 198
## 6 S NS 32 0 32
1
2
3
#calculate the total number of observations
total <- sum(results.w$counts.sum)
total
1
## [1] 1158
1
2
3
4
5
#part c: calculate the Amy (F)ast and (S)low frequencies

S.table <- results.w[ results.w$aGpdh=="S",]
S.freq <- sum(S.table$counts.sum)/total
S.freq
1
## [1] 0.1986183
1
2
3
F.table <- results.w[ results.w$aGpdh=="F",]
F.freq <- sum(F.table$counts.sum)/total
F.freq
1
## [1] 0.8013817
1
2
3
4
5
#part d: calculate the NS inversion bearing frequencies

NS.table <- results.w[ results.w$NS=="NS",]
NS.freq <- sum(NS.table$counts.sum)/total
NS.freq
1
## [1] 0.1062176
1
2
3
nonNS.table <- results.w[ results.w$NS=="non-NS",]
nonNS.freq <- sum(nonNS.table$counts.sum)/total
nonNS.freq
1
## [1] 0.8937824
1
2
3
4
5
6
#D <- P11 - p1q1

# Let's use aGpdh "F" and NS "non-NS" as P11. and calculate D from there
# First, pull ot the relevent row then calculate the ratio

results.w[ results.w$aGpdh=="F" & results.w$NS=="non-NS",]
1
2
##   aGpdh     NS counts.F counts.S counts.sum
## 1 F non-NS 726 111 837
1
2
D <- results.w[ results.w$aGpdh=="F" & results.w$NS=="non-NS", "counts.sum"]/total - F.freq*nonNS.freq
D
1
## [1] 0.006537088
1
2
3
4
# rho = D/sqrt(p1p2q1q2)

rho <- D/sqrt(S.freq*F.freq*NS.freq*nonNS.freq )
rho
1
## [1] 0.05317908
1
2
Chi.square <- rho^2*total
Chi.square
1
## [1] 3.274841
1
2
observed <- results.w[, "counts.sum"]
observed
1
## [1] 837  91 198  32
1
2
3
4
5
expected <- c( (nonNS.freq*F.freq/total),
(NS.freq*F.freq/total),
(nonNS.freq*S.freq/total),
(NS.freq*S.freq/total))
expected
1
## [1] 6.185327e-04 7.350678e-05 1.533001e-04 1.821828e-05
1
chisq.test( observed, p = expected, rescale.p=TRUE, correct=FALSE )
1
2
3
4
5
## 
## Chi-squared test for given probabilities
##
## data: observed
## X-squared = 3.2748, df = 3, p-value = 0.3512

Note that the degrees of freedom of 3 used by R is inappropriate. For 1 degree of freedon (4 - 1 (sample size) -1 (estimating p1) -1 (estimating p2) = 1 ) you must read a p ~0.07 off a chi square table. Do not reject the null hypothesis ( independence or linkage equilibrium) and so conclude linkage equilibrium.


## Box H ##

Problem A

For an autosomal gene the paths are:


GCAE: $(\frac{1}{2})^4*(1+1) = \frac{1}{16}*2 = \frac{8}{64}$
GDAE: $(\frac{1}{2})^4*(1+1) = \frac{1}{16}*2 = \frac{8}{64}$
GDBE: $(\frac{1}{2})^4*(1+\frac{1}{4}) = \frac{1}{16}*\frac{5}{4} = \frac{5}{64}$

Total: $\frac{8}{64} + \frac{8}{64} + \frac{5}{64} = \frac{21}{64}$

Problem B

For a sex linked gene:


GCAE: CAE are male so this path is not included

GDAE: AE are male so this path is not included


GDBE: $(\frac{1}{2})^{3}*(1+\frac{1}{4}) = \frac{1}{8}*\frac{5}{4} = \frac{5}{32}$
Total: $\displaystyle \frac{5}{32}$
Share