We start by reading the gene counts per sample, metadata, and functional annotations (obtained form EGGNOG).
library(mbtools)
library(data.table)
genes <- fread("zcat < data/function_counts.csv.gz", drop=1)
manifest <- fread("data/manifest.csv")
annotations <- fread("data/annotated/denovo.emapper.annotations",
sep = "\t", na.strings = c("", "NA"))
Stopped early on line 495133. Expected 22 fields but found 1. Consider fill=TRUE and comment.char=. First discarded non-empty line: <<# 495128 queries scanned>>
genes <- genes[annotations, on = c(locus_tag = "#query_name"), nomatch = NULL]
genes[, vendor_observation_id := tstrsplit(sample, "\\.")[[1]]]
head(genes)
Genes are not really universal:
library(ggplot2)
theme_set(theme_minimal())
sample_counts <- genes[, .(N=uniqueN(sample)), by="seed_eggNOG_ortholog"]
ggplot(sample_counts, aes(x=N)) + geom_bar()
A better way is to group by unique KO term assignments:
ko_map <- data.table(KEGG_ko = genes[KEGG_ko != "", KEGG_ko])
ko_map <- ko_map[, .(ko_term = strsplit(KEGG_ko, ",")[[1]]), by=KEGG_ko]
expanded <- ko_map[genes, on="KEGG_ko", allow.cartesian = TRUE, nomatch = 0]
ko <- expanded[, .(
reads = sum(reads),
tpm = sum(tpm)
), by = c("ko_term", "vendor_observation_id")]
concat <- function(x) {
x <- unlist(strsplit(x[!is.na(x)], ","))
paste0(unique(x), collapse=",")
}
anns <- expanded[, .(
EC = concat(EC),
bigg_reaction = concat(BiGG_Reaction),
description = concat(`eggNOG free text desc.`),
CAZy = concat(CAZy),
KEGG_Pathway = concat(KEGG_Pathway),
BRITE = concat(BRITE),
name = concat(Preferred_name)
), by="ko_term"]
setkey(anns, ko_term)
head(ko)
This generalizes better:
ggplot(ko[, .N, by="ko_term"], aes(x=N)) + geom_bar()
Let’s build up the data structures for testing:
counts <- dcast(ko, ko_term ~ vendor_observation_id, value.var = "reads", fill = 0)
ko_terms <- counts[, ko_term]
counts <- as.matrix(counts[, ko_term := NULL])
rownames(counts) <- ko_terms
meta <- manifest[has_close_microbiome == T] %>% as.data.frame()
rownames(meta) <- as.character(meta$vendor_observation_id)
No we can start building the DESeq analysis:
library(futile.logger)
taxa <- matrix(ko_terms, ncol=1)
colnames(taxa) <- "ko_term"
rownames(taxa) <- ko_terms
meta$subset <- factor(meta$subset)
meta$sex <- factor(meta$sex)
flog.threshold(DEBUG)
NULL
ps <- phyloseq(
otu_table(t(round(counts)), taxa_are_rows=FALSE),
tax_table(taxa),
sample_data(meta)
)
tests <- association(
ps,
presence_threshold = 1,
min_abundance = 10,
in_samples = 1,
confounders = c("age", "bmi", "sex"),
variables = "subset",
taxa_rank = "ko_term",
method = "deseq2",
shrink=F
)
bmi <- association(
ps,
presence_threshold = 1,
min_abundance = 10,
in_samples = 1,
confounders = c("age", "sex"),
variables = "bmi",
taxa_rank = "ko_term",
method = "deseq2",
shrink=F
)
tests <- rbind(tests, bmi)
tests <- anns[tests, on="ko_term"]
fwrite(tests[order(padj)], "data/tests_metagenome_ko.csv")
tests[variable == "subset"][order(padj)]
Volcano plots
ggplot(tests, aes(log2FoldChange, y=-log10(pvalue),
shape=padj<0.1, col=variable)) +
geom_point() +
labs(shape="FDR < 0.1", size="FDR < 0.1")
#ggsave("figures/gene_volcano.png", dpi=300, width=4, height=3)
Corr
tests[, "t" := log2FoldChange / lfcSE]
wide <- dcast(tests, ko_term ~ variable, value.var=c("t", "padj"), fill=0)
wide[, "sig" := "none"]
wide[padj_bmi < 0.05, "sig" := "BMI"]
wide[padj_subset < 0.05, "sig" := "weight loss"]
wide[padj_bmi < 0.05 & padj_subset < 0.05, "sig" := "both"]
wide[, "sig" := factor(sig, levels=c("none", "BMI", "weight loss", "both"))]
ggplot(wide, aes(x=t_bmi, y=t_subset, color=sig)) +
geom_vline(xintercept = 0, color="gray30", lty="dashed") +
geom_hline(yintercept = 0, color="gray30", lty="dashed") +
geom_point() + stat_smooth(method="glm", aes(group = 1)) +
labs(x = "t statistic BMI", y = "t statistic weight loss") +
scale_color_manual(values=c(none = "black", BMI = "royalblue", `weight loss`="orange", both = "red"))
ggsave("figures/koterm_t.png", dpi=300, width=5, height=3)
cor.test(~ t_bmi + t_subset, data=wide)
Pearson's product-moment correlation
data: t_bmi and t_subset
t = -16.829, df = 2973, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.3273916 -0.2617626
sample estimates:
cor
-0.2949249
Let’s visualize the significant hits.
sig <- tests[variable == "subset" & padj<0.05, ko_term]
ko_counts <- mbtools::normalize(round(counts))
kos <- rownames(ko_counts)
ko_counts <- as.data.table(ko_counts)[, ko_term := kos]
ko_counts <- melt(setDT(ko_counts), id.vars = "ko_term", variable.name = "vendor_observation_id", value.name = "reads")
sig <- ko_counts[ko_term %chin% sig]
sig <- anns[sig, on="ko_term"]
manifest[, vendor_observation_id := as.character(vendor_observation_id)]
sig <- manifest[corebiome == T][sig, on="vendor_observation_id"]
groups <- fread("data/sig_grouping.csv")
tests <- groups[tests, on="ko_term"]
ggplot(sig, aes(x = bmi, y = reads)) +
geom_point() +
scale_y_log10() +
stat_smooth(method="glm", aes(group = 1), col = "black") +
facet_wrap(~ ko_term, scales = "free_y") +
labs(x="BMI", y="normalized reads") + guides(color = F)
`guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
ggsave("figures/gene_bmi.png", dpi=300, width=12, height=10)
ggplot(sig, aes(x = subset, y = reads, color=subset)) +
geom_jitter(width=0.2) +
scale_y_log10() +
stat_summary(fun.y = "mean", geom = "point", pch=23, stroke = 1, size=3, fill = "white") +
facet_wrap(~ ko_term, scales = "free_y") +
labs(x="", y="normalized reads") + guides(color = F)
`fun.y` is deprecated. Use `fun` instead.`guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
ggsave("figures/gene_abundance.png", dpi=300, width=12, height=10)
pd <- position_dodge(0.3)
ggplot(tests[variable == "subset" & padj<0.05, ],
aes(x=log2FoldChange, y=reorder(group, log2FoldChange), color=log10(baseMean), group=ko_term)) +
geom_vline(xintercept = 0, lty="dashed", color="gray20") +
geom_linerange(aes(xmin=log2FoldChange - lfcSE, xmax=log2FoldChange + lfcSE), position=pd) +
geom_point(size=2, position=pd) +
labs(x = "log2 fold-change", y = "", color = "abundance") +
theme(legend.position="bottom")
ggsave("figures/lfcs.png", dpi=300, width=5, height=5)
fwrite(tests[variable == "subset" & padj<0.05, ], "data/significant_functions.csv")
and the same for CAZy terms
pd <- position_dodge(0.3)
ggplot(tests[variable == "subset" & padj<0.05 & CAZy != "", ],
aes(x=log2FoldChange, y=reorder(CAZy, log2FoldChange), color=log10(baseMean), group=ko_term)) +
geom_vline(xintercept = 0, lty="dashed", color="gray20") +
geom_linerange(aes(xmin=log2FoldChange - lfcSE, xmax=log2FoldChange + lfcSE), position=pd) +
geom_point(size=2, position=pd) +
labs(x = "log2 fold-change", y = "", color = "abundance") +
theme(legend.position="bottom")
ggsave("figures/lfcs_cazy.png", dpi=300, width=3, height=3)
library(vegan)
m <- as(otu_table(ps), "matrix")
perm <- adonis(mbtools::normalize(m) ~ age + sex + bmi + subset, data=as(sample_data(ps), "data.frame"), method="bray")
perm
Call:
adonis(formula = mbtools::normalize(m) ~ age + sex + bmi + subset, data = as(sample_data(ps), "data.frame"), method = "bray")
Permutation: free
Number of permutations: 999
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
age 1 0.04256 0.042564 1.74059 0.06627 0.014 *
sex 1 0.01850 0.018504 0.75668 0.02881 0.829
bmi 1 0.06054 0.060537 2.47556 0.09426 0.005 **
subset 1 0.03157 0.031566 1.29084 0.04915 0.139
Residuals 20 0.48908 0.024454 0.76151
Total 24 0.64225 1.00000
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
rich <- ps %>% rarefy_even_depth() %>% estimate_richness()
You set `rngseed` to FALSE. Make sure you've set & recorded
the random seed of your session for reproducibility.
See `?set.seed`
...
14OTUs were removed because they are no longer
present in any sample after random subsampling
...
rich[["vendor_observation_id"]] <- substr(rownames(rich), 2, 1e6)
sdf <- sample_data(ps) %>% as("data.frame")
sdf$vendor_observation_id <- as.character(sdf$vendor_observation_id)
merged <- merge(rich, sdf, by="vendor_observation_id")
mod <- glm(Observed ~ sex + age + bmi + subset, data=merged)
summary(mod)
Call:
glm(formula = Observed ~ sex + age + bmi + subset, data = merged)
Deviance Residuals:
Min 1Q Median 3Q Max
-662.43 -70.39 50.38 130.30 343.86
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4862.435 466.469 10.424 1.57e-09 ***
sexM -104.165 140.195 -0.743 0.4661
age 6.358 6.562 0.969 0.3441
bmi -18.150 9.657 -1.879 0.0748 .
subsetweight loss -57.203 141.956 -0.403 0.6912
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 65640.23)
Null deviance: 2026646 on 24 degrees of freedom
Residual deviance: 1312805 on 20 degrees of freedom
AIC: 354.67
Number of Fisher Scoring iterations: 2