Overview

For most of this lab, you should be able to just run the R code. Be aware that you can also make plots of top hits using LocusZoom.

Key Concepts: Interfacing with genetic data in R, PC analysis; logistic and linear regression; adding covariates; visualizing associations

library(qqman)
library(ggplot2)

Load the Data

Import the data

geno <- data.matrix(read.table("geno_lab.txt", header = FALSE))
pheno <- read.table("phenotype.txt", header = TRUE)
snpPos <- read.table("legend_lab.txt", header = TRUE)

How does the genotype data geno look like?

geno[1:6,1:6]
##      V1 V2 V3 V4 V5 V6
## [1,]  2  0  0  0  0 NA
## [2,]  1  0  0  0  0  0
## [3,]  1  0  0  0  0  0
## [4,]  2  0  0  0  0  0
## [5,]  2  0  0  0  0  0
## [6,]  2  0  0  0  0  0

Lets look at the phenotype data pheno.

dim(pheno)
## [1] 6081    3
head(pheno)
##   Y gender ancestry
## 1 1      F European
## 2 1      F European
## 3 1      F European
## 4 1      F European
## 5 1      F European
## 6 1      F European

Metadata on the SNP features snpPos.

dim(snpPos)
## [1] 101   4
head(snpPos)
##   chr position X0 X1
## 1  10   474309  C  T
## 2  10   480773  A  G
## 3  10   483144  A  G
## 4  10   485982  A  G
## 5  10   487331  G  A
## 6  10   494161  C  G

Not all 2 ’s are bad and not all 0’s are good.

Note that the 0, 1, 2 denotes only the number of secondary allele (X1) instead of X0. Does not mean that X1 is deleterious or even the minor allele.

For SNP 90,

table(geno[,90])
## 
##    0    1    2 
##  107  994 4856

For SNP 1,

table(geno[,1])
## 
##    0    1    2 
## 2609 2395  956

Tabular form of the phenotype data.

table(pheno)
## , , ancestry = African
## 
##    gender
## Y     F   M
##   0 796 775
##   1 729 731
## 
## , , ancestry = European
## 
##    gender
## Y     F   M
##   0 780 783
##   1 778 709

Quality Control

  • Missing rate in SNPs
snpMissingRate <- colMeans(is.na(geno))
summary(snpMissingRate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01595 0.01891 0.02023 0.02148 0.02171 0.07252
  • Person missing rate
personMissingRate <- rowMeans(is.na(geno))
summary(personMissingRate)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.009901 0.019802 0.021476 0.029703 0.089109

Number of bad SNPs (As per QC).

numVariantsMissing <- sum(snpMissingRate>0.05)
numVariantsMissing
## [1] 2

Number of bad indivuduals (As per QC).

numIndividualMissing <- sum(personMissingRate>0.05)
numIndividualMissing
## [1] 145

The filtered data

geno <- geno[personMissingRate <= 0.05, snpMissingRate <= 0.05]
snpPos <- snpPos[snpMissingRate <= 0.05,]
pheno <- pheno[personMissingRate <= 0.05,]
dim(geno)
## [1] 5936   99
dim(snpPos)
## [1] 99  4
dim(pheno)
## [1] 5936    3
geno[1:6,1:6]
##      V1 V2 V3 V5 V6 V7
## [1,]  1  0  0  0  0  0
## [2,]  1  0  0  0  0  0
## [3,]  2  0  0  0  0  0
## [4,]  2  0  0  0  0  0
## [5,]  2  0  0  0  0  0
## [6,]  1  0  0  0  0  0

Normalization/Standardization of genotype data

for (j in 1:dim(geno)[2]) {
  temp <- geno[,j]
  geno[,j][is.na(temp)] <- median(temp,na.rm = TRUE)  #replace missing with median
  geno[,j] <- (geno[,j]-mean(geno[,j]))/sd(geno[,j])   #normalization / standarization
}
geno[1:6,1:6]
##             V1         V2         V3         V5         V6         V7
## [1,] 0.3826988 -0.7566315 -0.7502653 -0.7341486 -0.6857083 -0.6592869
## [2,] 0.3826988 -0.7566315 -0.7502653 -0.7341486 -0.6857083 -0.6592869
## [3,] 1.7798085 -0.7566315 -0.7502653 -0.7341486 -0.6857083 -0.6592869
## [4,] 1.7798085 -0.7566315 -0.7502653 -0.7341486 -0.6857083 -0.6592869
## [5,] 1.7798085 -0.7566315 -0.7502653 -0.7341486 -0.6857083 -0.6592869
## [6,] 0.3826988 -0.7566315 -0.7502653 -0.7341486 -0.6857083 -0.6592869

Principal Components Analysis

PC <- prcomp(t(geno))
allDF <- data.frame(ancestry = pheno$ancestry, gender = pheno$gender,
                    Y = pheno$Y, PC$rotation[,1:4])
shuf <- function(df) return(df[sample(1:dim(df)[1], dim(df)[1]),])

ggplot(allDF, aes(x = PC1, y = PC2, color = ancestry, shape = gender, size = Y)) +
  geom_point() + theme_bw()

A clear sign of population stratification in this example.

Preparing the Output data frame

result <- snpPos
result$noadjp <- 0; result$ancestryp <- 0; result$PCp <- 0
head(result)
##   chr position X0 X1 noadjp ancestryp PCp
## 1  10   474309  C  T      0         0   0
## 2  10   480773  A  G      0         0   0
## 3  10   483144  A  G      0         0   0
## 5  10   487331  G  A      0         0   0
## 6  10   494161  C  G      0         0   0
## 7  10   498349  A  C      0         0   0
for (j in 1:dim(geno)[2]) {

  # Fit models
  noadj.mod   <- summary(glm(allDF$Y~geno[,j]+allDF$gender,
                             family=binomial()))$coeff
  ancestry.mod<- summary(glm(allDF$Y~geno[,j]+allDF$gender+allDF$ancestry,
                             family=binomial()))$coeff
  PC.mod      <- summary(glm(allDF$Y~geno[,j]+allDF$gender+allDF$PC1,
                             family=binomial()))$coeff
  
  # Overwrite the results data frame
  result[j,c("noadjp","ancestryp","PCp")] <- c(noadj.mod[2,4],ancestry.mod[2,4],PC.mod[2,4])
}
head(result)
##   chr position X0 X1    noadjp ancestryp       PCp
## 1  10   474309  C  T 0.8441398 0.6996626 0.7530635
## 2  10   480773  A  G 0.5362881 0.5506277 0.4983197
## 3  10   483144  A  G 0.6472804 0.7224431 0.6466385
## 5  10   487331  G  A 0.2916373 0.2352932 0.2104367
## 6  10   494161  C  G 0.5404932 0.5753381 0.5157116
## 7  10   498349  A  C 0.5739175 0.6252326 0.5645841

Are the p-values for the three approaches correlated? Check if they are !

Visualizing the summary statistics

#' ## Look at summary statistics plot
#+ cache = FALSE, message = FALSE, warning = FALSE, eval = TRUE, echo = TRUE, fig.height = 4, fig.width = 4, fig.align = "center"
manhattan(result, chr = "chr", bp = "position", p = "noadjp")
## Warning in manhattan(result, chr = "chr", bp = "position", p = "noadjp"):
## No SNP column found. OK unless you're trying to highlight.