Replication rates
library(mbtools)
rates <- fread("data/rates.csv")
manifest <- fread("data/manifest.csv")
rates <- rates[manifest, on = c(id = "vendor_observation_id"), allow.cartesian = T]
taxonomy <- fread("data/contigs.classification.names.txt", sep="\t", na.strings=c("NA", "not classified", ""), fill=T)
for (col in c("superkingdom", "phylum", "class", "order", "family", "genus", "species")) {
taxonomy[[col]] <- gsub(":.+$", "", taxonomy[[col]])
}
names(taxonomy)[1] <- "contig"
rates <- taxonomy[rates, on="contig"]
head(rates)
theme_set(theme_minimal())
ggplot(rates, aes(x = subset, y = rate, color = phylum, group = subset)) +
scale_y_log10() +
geom_jitter(width=0.3) +
labs(x = "", y = "replication rate [a.u.]") + theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1))

pj <- position_jitter(width=0.3)
ggplot(rates, aes(x = subset, y = rate, color = phylum, group=subset)) +
scale_y_log10() +
geom_point(position=pj, size=1) + facet_wrap(~ phylum, nrow=1) +
stat_summary(fun.y = "mean", geom = "point", pch=23, stroke = 1, size=2, color="black") +
coord_flip() +
labs(x = "", y = "replication rate [a.u.]") + guides(color = FALSE)
`fun.y` is deprecated. Use `fun` instead.

ggsave("figures/replication.png", width=8, height=1.5, dpi=300)
for (p in rates[!is.na(phylum), unique(phylum)]) {
if (rates[phylum == p, uniqueN(subset) == 2]) {
print(p)
glm(log(rate) ~ age + bmi + subset, data=rates[phylum == p]) %>% summary() %>% print()
}
}
[1] "Bacteroidetes"
Call:
glm(formula = log(rate) ~ age + bmi + subset, data = rates[phylum ==
p])
Deviance Residuals:
Min 1Q Median 3Q Max
-0.3073 -0.1641 -0.0828 0.0452 3.4557
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.1633029 0.0664069 2.459 0.01408 *
age 0.0020464 0.0014240 1.437 0.15098
bmi 0.0008312 0.0014698 0.566 0.57180
subsetweight loss 0.0698759 0.0221775 3.151 0.00167 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.1091428)
Null deviance: 124.15 on 1123 degrees of freedom
Residual deviance: 122.24 on 1120 degrees of freedom
AIC: 706
Number of Fisher Scoring iterations: 2
[1] "Firmicutes"
Call:
glm(formula = log(rate) ~ age + bmi + subset, data = rates[phylum ==
p])
Deviance Residuals:
Min 1Q Median 3Q Max
-0.25251 -0.14101 -0.06097 0.15636 0.63189
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.133337 0.161538 -0.825 0.411498
age 0.010908 0.002733 3.991 0.000141 ***
bmi -0.003015 0.003614 -0.834 0.406494
subsetweight loss 0.064405 0.047429 1.358 0.178171
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.04107161)
Null deviance: 4.0968 on 86 degrees of freedom
Residual deviance: 3.4089 on 83 degrees of freedom
AIC: -24.942
Number of Fisher Scoring iterations: 2
[1] "Verrucomicrobia"
Call:
glm(formula = log(rate) ~ age + bmi + subset, data = rates[phylum ==
p])
Deviance Residuals:
Min 1Q Median 3Q Max
-0.11663 -0.03739 -0.01317 0.02312 0.15550
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -8.327e-01 2.493e-01 -3.341 0.00136 **
age 2.133e-02 4.649e-03 4.589 1.98e-05 ***
bmi -8.290e-17 4.231e-03 0.000 1.00000
subsetweight loss -2.545e-01 7.485e-02 -3.401 0.00113 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.003937753)
Null deviance: 0.40761 on 71 degrees of freedom
Residual deviance: 0.26777 on 68 degrees of freedom
AIC: -188.46
Number of Fisher Scoring iterations: 2
[1] "Actinobacteria"
Call:
glm(formula = log(rate) ~ age + bmi + subset, data = rates[phylum ==
p])
Deviance Residuals:
Min 1Q Median 3Q Max
-0.08196 -0.06402 0.00000 0.01981 0.15352
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.976304 0.336267 2.903 0.0157 *
age 0.004305 0.004324 0.995 0.3430
bmi 0.000000 0.010124 0.000 1.0000
subsetweight loss -1.016904 0.067955 -14.964 3.58e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.007518454)
Null deviance: 1.820978 on 13 degrees of freedom
Residual deviance: 0.075185 on 10 degrees of freedom
AIC: -23.446
Number of Fisher Scoring iterations: 2
print("NA")
[1] "NA"
glm(log(rate) ~ age + bmi + subset, data=rates[is.na(phylum)]) %>% summary()
Call:
glm(formula = log(rate) ~ age + bmi + subset, data = rates[is.na(phylum)])
Deviance Residuals:
Min 1Q Median 3Q Max
-0.25418 -0.10913 -0.05278 0.06350 0.83375
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.273270 0.067254 4.063 5.75e-05 ***
age 0.002490 0.001184 2.103 0.03608 *
bmi -0.004040 0.001422 -2.840 0.00472 **
subsetweight loss 0.036150 0.019988 1.809 0.07121 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.03366417)
Null deviance: 14.981 on 435 degrees of freedom
Residual deviance: 14.543 on 432 degrees of freedom
(1 observation deleted due to missingness)
AIC: -235.32
Number of Fisher Scoring iterations: 2
print("all")
[1] "all"
glm(log(rate) ~ age + bmi + subset, data=rates) %>% summary()
Call:
glm(formula = log(rate) ~ age + bmi + subset, data = rates)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.2951 -0.1515 -0.0763 0.0425 3.4832
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.1760664 0.0486548 3.619 0.000305 ***
age 0.0017205 0.0009379 1.834 0.066768 .
bmi 0.0006676 0.0010573 0.631 0.527882
subsetweight loss 0.0513619 0.0156397 3.284 0.001044 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.08466307)
Null deviance: 148.45 on 1736 degrees of freedom
Residual deviance: 146.72 on 1733 degrees of freedom
(1 observation deleted due to missingness)
AIC: 646.6
Number of Fisher Scoring iterations: 2
library(arivale.data.interface)
gastro <- get_snapshot("assessments_digestive_health", clean=T)
`assessments_digestive_health` is slated for deprecation and will be part of `assessments` with raw data will be available in `assessments_raw`
gastro[, public_client_id := as.integer(public_client_id)]
NAs introduced by coercion
merged <- gastro[, .(public_client_id, assessment_digestion_bowel_movements_enum)][rates, on="public_client_id"]
merged[, "bowel_movement" := assessment_digestion_bowel_movements_enum]
ggplot(merged, aes(x=bowel_movement, y=rate, color=phylum, group=bowel_movement)) +
geom_jitter(width=0.2) + scale_y_log10() +
geom_boxplot(color="black", width=0.2, outlier.color=NA)

glm(log(rate) ~ bowel_movement + sex + age, data=unique(merged[, .(rate, bowel_movement, sex, age)]), family="gaussian") %>% summary()
Call:
glm(formula = log(rate) ~ bowel_movement + sex + age, family = "gaussian",
data = unique(merged[, .(rate, bowel_movement, sex, age)]))
Deviance Residuals:
Min 1Q Median 3Q Max
-0.4127 -0.1609 -0.0772 0.0509 3.5035
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.080222 0.072684 1.104 0.27000
bowel_movement(3) 1-3 times daily 0.049494 0.022931 2.158 0.03115 *
bowel_movement(4) 4+ times daily 0.186934 0.046594 4.012 6.5e-05 ***
sexM 0.033888 0.027568 1.229 0.21928
age 0.004252 0.001512 2.811 0.00504 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 0.08973296)
Null deviance: 85.857 on 935 degrees of freedom
Residual deviance: 83.541 on 931 degrees of freedom
(1 observation deleted due to missingness)
AIC: 406.62
Number of Fisher Scoring iterations: 2
glm(factor(subset) ~ bowel_movement + sex + age, data=unique(merged[, .(public_client_id, bowel_movement, sex, age, subset)]), family="binomial") %>% summary()
Call:
glm(formula = factor(subset) ~ bowel_movement + sex + age, family = "binomial",
data = unique(merged[, .(public_client_id, bowel_movement,
sex, age, subset)]))
Deviance Residuals:
Min 1Q Median 3Q Max
-1.7878 -1.1603 0.7238 0.9273 1.2411
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.563e-01 2.801e+00 -0.163 0.871
bowel_movement(3) 1-3 times daily 7.284e-01 9.248e-01 0.788 0.431
bowel_movement(4) 4+ times daily 1.663e+01 2.400e+03 0.007 0.994
sexM 5.761e-01 1.083e+00 0.532 0.595
age 8.319e-03 5.522e-02 0.151 0.880
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 35.594 on 26 degrees of freedom
Residual deviance: 33.098 on 22 degrees of freedom
AIC: 43.098
Number of Fisher Scoring iterations: 15
LS0tCnRpdGxlOiAiUmVwbGljYXRpb24gcmF0ZXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMgUmVwbGljYXRpb24gcmF0ZXMKCmBgYHtyfQpsaWJyYXJ5KG1idG9vbHMpCgpyYXRlcyA8LSBmcmVhZCgiZGF0YS9yYXRlcy5jc3YiKQptYW5pZmVzdCA8LSBmcmVhZCgiZGF0YS9tYW5pZmVzdC5jc3YiKQpyYXRlcyA8LSByYXRlc1ttYW5pZmVzdCwgb24gPSBjKGlkID0gInZlbmRvcl9vYnNlcnZhdGlvbl9pZCIpLCBhbGxvdy5jYXJ0ZXNpYW4gPSBUXQoKdGF4b25vbXkgPC0gZnJlYWQoImRhdGEvY29udGlncy5jbGFzc2lmaWNhdGlvbi5uYW1lcy50eHQiLCBzZXA9Ilx0IiwgbmEuc3RyaW5ncz1jKCJOQSIsICJub3QgY2xhc3NpZmllZCIsICIiKSwgZmlsbD1UKQpmb3IgKGNvbCBpbiBjKCJzdXBlcmtpbmdkb20iLCAicGh5bHVtIiwgImNsYXNzIiwgIm9yZGVyIiwgImZhbWlseSIsICJnZW51cyIsICJzcGVjaWVzIikpIHsKICAgIHRheG9ub215W1tjb2xdXSA8LSBnc3ViKCI6LiskIiwgIiIsIHRheG9ub215W1tjb2xdXSkKfQpuYW1lcyh0YXhvbm9teSlbMV0gPC0gImNvbnRpZyIKcmF0ZXMgPC0gdGF4b25vbXlbcmF0ZXMsIG9uPSJjb250aWciXQpoZWFkKHJhdGVzKQpgYGAKCmBgYHtyLCBmaWcud2lkdGg9My41LCBmaWcuaGVpZ2h0PTR9CnRoZW1lX3NldCh0aGVtZV9taW5pbWFsKCkpCgpnZ3Bsb3QocmF0ZXMsIGFlcyh4ID0gc3Vic2V0LCB5ID0gcmF0ZSwgY29sb3IgPSBwaHlsdW0sIGdyb3VwID0gc3Vic2V0KSkgKyAKICAgIHNjYWxlX3lfbG9nMTAoKSArCiAgICBnZW9tX2ppdHRlcih3aWR0aD0wLjMpICsKICAgIGxhYnMoeCA9ICIiLCB5ID0gInJlcGxpY2F0aW9uIHJhdGUgW2EudS5dIikgKyB0aGVtZShheGlzLnRleHQueCA9IGVsZW1lbnRfdGV4dChhbmdsZT00NSwgaGp1c3Q9MSwgdmp1c3Q9MSkpCmBgYAoKYGBge3IsIGZpZy53aWR0aD04LCBmaWcuaGVpZ2h0PTEuNX0KcGogPC0gcG9zaXRpb25faml0dGVyKHdpZHRoPTAuMykKZ2dwbG90KHJhdGVzLCBhZXMoeCA9IHN1YnNldCwgeSA9IHJhdGUsIGNvbG9yID0gcGh5bHVtLCBncm91cD1zdWJzZXQpKSArIAogICAgc2NhbGVfeV9sb2cxMCgpICsKICAgIGdlb21fcG9pbnQocG9zaXRpb249cGosIHNpemU9MSkgKyBmYWNldF93cmFwKH4gcGh5bHVtLCBucm93PTEpICsgCiAgICBzdGF0X3N1bW1hcnkoZnVuLnkgPSAibWVhbiIsIGdlb20gPSAicG9pbnQiLCBwY2g9MjMsIHN0cm9rZSA9IDEsIHNpemU9MiwgY29sb3I9ImJsYWNrIikgKwogICAgY29vcmRfZmxpcCgpICsKICAgIGxhYnMoeCA9ICIiLCB5ID0gInJlcGxpY2F0aW9uIHJhdGUgW2EudS5dIikgKyBndWlkZXMoY29sb3IgPSBGQUxTRSkKZ2dzYXZlKCJmaWd1cmVzL3JlcGxpY2F0aW9uLnBuZyIsIHdpZHRoPTgsIGhlaWdodD0xLjUsIGRwaT0zMDApCmBgYAoKYGBge3J9CmZvciAocCBpbiByYXRlc1shaXMubmEocGh5bHVtKSwgdW5pcXVlKHBoeWx1bSldKSB7CiAgICBpZiAocmF0ZXNbcGh5bHVtID09IHAsIHVuaXF1ZU4oc3Vic2V0KSA9PSAyXSkgewogICAgICAgIHByaW50KHApCiAgICAgICAgZ2xtKGxvZyhyYXRlKSB+IGFnZSArIGJtaSArIHN1YnNldCwgZGF0YT1yYXRlc1twaHlsdW0gPT0gcF0pICU+JSBzdW1tYXJ5KCkgJT4lIHByaW50KCkKICAgIH0KfQpwcmludCgiTkEiKQpnbG0obG9nKHJhdGUpIH4gYWdlICsgYm1pICsgc3Vic2V0LCBkYXRhPXJhdGVzW2lzLm5hKHBoeWx1bSldKSAlPiUgc3VtbWFyeSgpCnByaW50KCJhbGwiKQpnbG0obG9nKHJhdGUpIH4gYWdlICsgYm1pICsgc3Vic2V0LCBkYXRhPXJhdGVzKSAlPiUgc3VtbWFyeSgpCmBgYAoKYGBge3J9CmxpYnJhcnkoYXJpdmFsZS5kYXRhLmludGVyZmFjZSkKCgpnYXN0cm8gPC0gZ2V0X3NuYXBzaG90KCJhc3Nlc3NtZW50c19kaWdlc3RpdmVfaGVhbHRoIiwgY2xlYW49VCkKZ2FzdHJvWywgcHVibGljX2NsaWVudF9pZCA6PSBhcy5pbnRlZ2VyKHB1YmxpY19jbGllbnRfaWQpXQptZXJnZWQgPC0gZ2FzdHJvWywgLihwdWJsaWNfY2xpZW50X2lkLCBhc3Nlc3NtZW50X2RpZ2VzdGlvbl9ib3dlbF9tb3ZlbWVudHNfZW51bSldW3JhdGVzLCBvbj0icHVibGljX2NsaWVudF9pZCJdCm1lcmdlZFssICJib3dlbF9tb3ZlbWVudCIgOj0gYXNzZXNzbWVudF9kaWdlc3Rpb25fYm93ZWxfbW92ZW1lbnRzX2VudW1dCmBgYAoKYGBge3J9CmdncGxvdChtZXJnZWQsIGFlcyh4PWJvd2VsX21vdmVtZW50LCB5PXJhdGUsIGNvbG9yPXBoeWx1bSwgZ3JvdXA9Ym93ZWxfbW92ZW1lbnQpKSArIAogICAgZ2VvbV9qaXR0ZXIod2lkdGg9MC4yKSArIHNjYWxlX3lfbG9nMTAoKSArIAogICAgZ2VvbV9ib3hwbG90KGNvbG9yPSJibGFjayIsIHdpZHRoPTAuMiwgb3V0bGllci5jb2xvcj1OQSkKCmdsbShsb2cocmF0ZSkgfiBib3dlbF9tb3ZlbWVudCArIHNleCArIGFnZSwgZGF0YT11bmlxdWUobWVyZ2VkWywgLihyYXRlLCBib3dlbF9tb3ZlbWVudCwgc2V4LCBhZ2UpXSksIGZhbWlseT0iZ2F1c3NpYW4iKSAlPiUgc3VtbWFyeSgpCmBgYApgYGB7cn0KZ2xtKGZhY3RvcihzdWJzZXQpIH4gYm93ZWxfbW92ZW1lbnQgKyBzZXggKyBhZ2UsIGRhdGE9dW5pcXVlKG1lcmdlZFssIC4ocHVibGljX2NsaWVudF9pZCwgYm93ZWxfbW92ZW1lbnQsIHNleCwgYWdlLCBzdWJzZXQpXSksIGZhbWlseT0iYmlub21pYWwiKSAlPiUgc3VtbWFyeSgpCmBgYAoKIyBDb2hvcnQgY29tcG9zaXRpb24KCmBgYHtyfQpsaWJyYXJ5KG1idG9vbHMpCmdlbmVyYSA8LSBmcmVhZCgiZGF0YS9HX2NvdW50cy5jc3YiKQoKbWF0IDwtIGRjYXN0KGdlbmVyYSwgYHNhbXBsZWAgfiBnZW51cywgdmFsdWUudmFyPSJyZWFkcyIpCmlkcyA8LSBtYXRbLCBgc2FtcGxlYF0KbWF0IDwtIGFzLm1hdHJpeChtYXRbLCAiZ2VudXMiIDo9IE5VTExdKQpyb3duYW1lcyhtYXQpIDwtIGlkcwp0YXhhIDwtIG1hdHJpeChjb2xuYW1lcyhtYXQpLCBuY29sPTEpCmNvbG5hbWVzKHRheGEpIDwtICJnZW51cyIKcm93bmFtZXModGF4YSkgPC0gdGF4YVssMV0KcHMgPC0gcGh5bG9zZXEob3R1X3RhYmxlKG1hdCwgdGF4YV9hcmVfcm93cyA9IEYpLCB0YXhfdGFibGUodGF4YSkpCgpwbG90X3RheGEocHMsICJnZW51cyIpCmBgYA==