Replication rates

library(mbtools)

rates <- fread("data/rates.csv")
manifest <- fread("data/manifest.csv")
rates <- rates[manifest, on = c(id = "vendor_observation_id"), allow.cartesian = T]

taxonomy <- fread("data/contigs.classification.names.txt", sep="\t", na.strings=c("NA", "not classified", ""), fill=T)
for (col in c("superkingdom", "phylum", "class", "order", "family", "genus", "species")) {
    taxonomy[[col]] <- gsub(":.+$", "", taxonomy[[col]])
}
names(taxonomy)[1] <- "contig"
rates <- taxonomy[rates, on="contig"]
head(rates)
theme_set(theme_minimal())

ggplot(rates, aes(x = subset, y = rate, color = phylum, group = subset)) + 
    scale_y_log10() +
    geom_jitter(width=0.3) +
    labs(x = "", y = "replication rate [a.u.]") + theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1))

pj <- position_jitter(width=0.3)
ggplot(rates, aes(x = subset, y = rate, color = phylum, group=subset)) + 
    scale_y_log10() +
    geom_point(position=pj, size=1) + facet_wrap(~ phylum, nrow=1) + 
    stat_summary(fun.y = "mean", geom = "point", pch=23, stroke = 1, size=2, color="black") +
    coord_flip() +
    labs(x = "", y = "replication rate [a.u.]") + guides(color = FALSE)
`fun.y` is deprecated. Use `fun` instead.

ggsave("figures/replication.png", width=8, height=1.5, dpi=300)
for (p in rates[!is.na(phylum), unique(phylum)]) {
    if (rates[phylum == p, uniqueN(subset) == 2]) {
        print(p)
        glm(log(rate) ~ age + bmi + subset, data=rates[phylum == p]) %>% summary() %>% print()
    }
}
[1] "Bacteroidetes"

Call:
glm(formula = log(rate) ~ age + bmi + subset, data = rates[phylum == 
    p])

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.3073  -0.1641  -0.0828   0.0452   3.4557  

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)   
(Intercept)       0.1633029  0.0664069   2.459  0.01408 * 
age               0.0020464  0.0014240   1.437  0.15098   
bmi               0.0008312  0.0014698   0.566  0.57180   
subsetweight loss 0.0698759  0.0221775   3.151  0.00167 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.1091428)

    Null deviance: 124.15  on 1123  degrees of freedom
Residual deviance: 122.24  on 1120  degrees of freedom
AIC: 706

Number of Fisher Scoring iterations: 2

[1] "Firmicutes"

Call:
glm(formula = log(rate) ~ age + bmi + subset, data = rates[phylum == 
    p])

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.25251  -0.14101  -0.06097   0.15636   0.63189  

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       -0.133337   0.161538  -0.825 0.411498    
age                0.010908   0.002733   3.991 0.000141 ***
bmi               -0.003015   0.003614  -0.834 0.406494    
subsetweight loss  0.064405   0.047429   1.358 0.178171    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.04107161)

    Null deviance: 4.0968  on 86  degrees of freedom
Residual deviance: 3.4089  on 83  degrees of freedom
AIC: -24.942

Number of Fisher Scoring iterations: 2

[1] "Verrucomicrobia"

Call:
glm(formula = log(rate) ~ age + bmi + subset, data = rates[phylum == 
    p])

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.11663  -0.03739  -0.01317   0.02312   0.15550  

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)       -8.327e-01  2.493e-01  -3.341  0.00136 ** 
age                2.133e-02  4.649e-03   4.589 1.98e-05 ***
bmi               -8.290e-17  4.231e-03   0.000  1.00000    
subsetweight loss -2.545e-01  7.485e-02  -3.401  0.00113 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.003937753)

    Null deviance: 0.40761  on 71  degrees of freedom
Residual deviance: 0.26777  on 68  degrees of freedom
AIC: -188.46

Number of Fisher Scoring iterations: 2

[1] "Actinobacteria"

Call:
glm(formula = log(rate) ~ age + bmi + subset, data = rates[phylum == 
    p])

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.08196  -0.06402   0.00000   0.01981   0.15352  

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)        0.976304   0.336267   2.903   0.0157 *  
age                0.004305   0.004324   0.995   0.3430    
bmi                0.000000   0.010124   0.000   1.0000    
subsetweight loss -1.016904   0.067955 -14.964 3.58e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.007518454)

    Null deviance: 1.820978  on 13  degrees of freedom
Residual deviance: 0.075185  on 10  degrees of freedom
AIC: -23.446

Number of Fisher Scoring iterations: 2
print("NA")
[1] "NA"
glm(log(rate) ~ age + bmi + subset, data=rates[is.na(phylum)]) %>% summary()

Call:
glm(formula = log(rate) ~ age + bmi + subset, data = rates[is.na(phylum)])

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-0.25418  -0.10913  -0.05278   0.06350   0.83375  

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)        0.273270   0.067254   4.063 5.75e-05 ***
age                0.002490   0.001184   2.103  0.03608 *  
bmi               -0.004040   0.001422  -2.840  0.00472 ** 
subsetweight loss  0.036150   0.019988   1.809  0.07121 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.03366417)

    Null deviance: 14.981  on 435  degrees of freedom
Residual deviance: 14.543  on 432  degrees of freedom
  (1 observation deleted due to missingness)
AIC: -235.32

Number of Fisher Scoring iterations: 2
print("all")
[1] "all"
glm(log(rate) ~ age + bmi + subset, data=rates) %>% summary()

Call:
glm(formula = log(rate) ~ age + bmi + subset, data = rates)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.2951  -0.1515  -0.0763   0.0425   3.4832  

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       0.1760664  0.0486548   3.619 0.000305 ***
age               0.0017205  0.0009379   1.834 0.066768 .  
bmi               0.0006676  0.0010573   0.631 0.527882    
subsetweight loss 0.0513619  0.0156397   3.284 0.001044 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.08466307)

    Null deviance: 148.45  on 1736  degrees of freedom
Residual deviance: 146.72  on 1733  degrees of freedom
  (1 observation deleted due to missingness)
AIC: 646.6

Number of Fisher Scoring iterations: 2
library(arivale.data.interface)


gastro <- get_snapshot("assessments_digestive_health", clean=T)
`assessments_digestive_health` is slated for deprecation and will be part of `assessments` with raw data will be available in `assessments_raw`
gastro[, public_client_id := as.integer(public_client_id)]
NAs introduced by coercion
merged <- gastro[, .(public_client_id, assessment_digestion_bowel_movements_enum)][rates, on="public_client_id"]
merged[, "bowel_movement" := assessment_digestion_bowel_movements_enum]
ggplot(merged, aes(x=bowel_movement, y=rate, color=phylum, group=bowel_movement)) + 
    geom_jitter(width=0.2) + scale_y_log10() + 
    geom_boxplot(color="black", width=0.2, outlier.color=NA)


glm(log(rate) ~ bowel_movement + sex + age, data=unique(merged[, .(rate, bowel_movement, sex, age)]), family="gaussian") %>% summary()

Call:
glm(formula = log(rate) ~ bowel_movement + sex + age, family = "gaussian", 
    data = unique(merged[, .(rate, bowel_movement, sex, age)]))

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.4127  -0.1609  -0.0772   0.0509   3.5035  

Coefficients:
                                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)                       0.080222   0.072684   1.104  0.27000    
bowel_movement(3) 1-3 times daily 0.049494   0.022931   2.158  0.03115 *  
bowel_movement(4) 4+ times daily  0.186934   0.046594   4.012  6.5e-05 ***
sexM                              0.033888   0.027568   1.229  0.21928    
age                               0.004252   0.001512   2.811  0.00504 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 0.08973296)

    Null deviance: 85.857  on 935  degrees of freedom
Residual deviance: 83.541  on 931  degrees of freedom
  (1 observation deleted due to missingness)
AIC: 406.62

Number of Fisher Scoring iterations: 2
glm(factor(subset) ~ bowel_movement + sex + age, data=unique(merged[, .(public_client_id, bowel_movement, sex, age, subset)]), family="binomial") %>% summary()

Call:
glm(formula = factor(subset) ~ bowel_movement + sex + age, family = "binomial", 
    data = unique(merged[, .(public_client_id, bowel_movement, 
        sex, age, subset)]))

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.7878  -1.1603   0.7238   0.9273   1.2411  

Coefficients:
                                    Estimate Std. Error z value Pr(>|z|)
(Intercept)                       -4.563e-01  2.801e+00  -0.163    0.871
bowel_movement(3) 1-3 times daily  7.284e-01  9.248e-01   0.788    0.431
bowel_movement(4) 4+ times daily   1.663e+01  2.400e+03   0.007    0.994
sexM                               5.761e-01  1.083e+00   0.532    0.595
age                                8.319e-03  5.522e-02   0.151    0.880

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 35.594  on 26  degrees of freedom
Residual deviance: 33.098  on 22  degrees of freedom
AIC: 43.098

Number of Fisher Scoring iterations: 15

Cohort composition

library(mbtools)
genera <- fread("data/G_counts.csv")

mat <- dcast(genera, `sample` ~ genus, value.var="reads")
ids <- mat[, `sample`]
mat <- as.matrix(mat[, "genus" := NULL])
Column 'genus' does not exist to remove
rownames(mat) <- ids
taxa <- matrix(colnames(mat), ncol=1)
colnames(taxa) <- "genus"
rownames(taxa) <- taxa[,1]
ps <- phyloseq(otu_table(mat, taxa_are_rows = F), tax_table(taxa))

plot_taxa(ps, "genus")

LS0tCnRpdGxlOiAiUmVwbGljYXRpb24gcmF0ZXMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMgUmVwbGljYXRpb24gcmF0ZXMKCmBgYHtyfQpsaWJyYXJ5KG1idG9vbHMpCgpyYXRlcyA8LSBmcmVhZCgiZGF0YS9yYXRlcy5jc3YiKQptYW5pZmVzdCA8LSBmcmVhZCgiZGF0YS9tYW5pZmVzdC5jc3YiKQpyYXRlcyA8LSByYXRlc1ttYW5pZmVzdCwgb24gPSBjKGlkID0gInZlbmRvcl9vYnNlcnZhdGlvbl9pZCIpLCBhbGxvdy5jYXJ0ZXNpYW4gPSBUXQoKdGF4b25vbXkgPC0gZnJlYWQoImRhdGEvY29udGlncy5jbGFzc2lmaWNhdGlvbi5uYW1lcy50eHQiLCBzZXA9Ilx0IiwgbmEuc3RyaW5ncz1jKCJOQSIsICJub3QgY2xhc3NpZmllZCIsICIiKSwgZmlsbD1UKQpmb3IgKGNvbCBpbiBjKCJzdXBlcmtpbmdkb20iLCAicGh5bHVtIiwgImNsYXNzIiwgIm9yZGVyIiwgImZhbWlseSIsICJnZW51cyIsICJzcGVjaWVzIikpIHsKICAgIHRheG9ub215W1tjb2xdXSA8LSBnc3ViKCI6LiskIiwgIiIsIHRheG9ub215W1tjb2xdXSkKfQpuYW1lcyh0YXhvbm9teSlbMV0gPC0gImNvbnRpZyIKcmF0ZXMgPC0gdGF4b25vbXlbcmF0ZXMsIG9uPSJjb250aWciXQpoZWFkKHJhdGVzKQpgYGAKCmBgYHtyLCBmaWcud2lkdGg9My41LCBmaWcuaGVpZ2h0PTR9CnRoZW1lX3NldCh0aGVtZV9taW5pbWFsKCkpCgpnZ3Bsb3QocmF0ZXMsIGFlcyh4ID0gc3Vic2V0LCB5ID0gcmF0ZSwgY29sb3IgPSBwaHlsdW0sIGdyb3VwID0gc3Vic2V0KSkgKyAKICAgIHNjYWxlX3lfbG9nMTAoKSArCiAgICBnZW9tX2ppdHRlcih3aWR0aD0wLjMpICsKICAgIGxhYnMoeCA9ICIiLCB5ID0gInJlcGxpY2F0aW9uIHJhdGUgW2EudS5dIikgKyB0aGVtZShheGlzLnRleHQueCA9IGVsZW1lbnRfdGV4dChhbmdsZT00NSwgaGp1c3Q9MSwgdmp1c3Q9MSkpCmBgYAoKYGBge3IsIGZpZy53aWR0aD04LCBmaWcuaGVpZ2h0PTEuNX0KcGogPC0gcG9zaXRpb25faml0dGVyKHdpZHRoPTAuMykKZ2dwbG90KHJhdGVzLCBhZXMoeCA9IHN1YnNldCwgeSA9IHJhdGUsIGNvbG9yID0gcGh5bHVtLCBncm91cD1zdWJzZXQpKSArIAogICAgc2NhbGVfeV9sb2cxMCgpICsKICAgIGdlb21fcG9pbnQocG9zaXRpb249cGosIHNpemU9MSkgKyBmYWNldF93cmFwKH4gcGh5bHVtLCBucm93PTEpICsgCiAgICBzdGF0X3N1bW1hcnkoZnVuLnkgPSAibWVhbiIsIGdlb20gPSAicG9pbnQiLCBwY2g9MjMsIHN0cm9rZSA9IDEsIHNpemU9MiwgY29sb3I9ImJsYWNrIikgKwogICAgY29vcmRfZmxpcCgpICsKICAgIGxhYnMoeCA9ICIiLCB5ID0gInJlcGxpY2F0aW9uIHJhdGUgW2EudS5dIikgKyBndWlkZXMoY29sb3IgPSBGQUxTRSkKZ2dzYXZlKCJmaWd1cmVzL3JlcGxpY2F0aW9uLnBuZyIsIHdpZHRoPTgsIGhlaWdodD0xLjUsIGRwaT0zMDApCmBgYAoKYGBge3J9CmZvciAocCBpbiByYXRlc1shaXMubmEocGh5bHVtKSwgdW5pcXVlKHBoeWx1bSldKSB7CiAgICBpZiAocmF0ZXNbcGh5bHVtID09IHAsIHVuaXF1ZU4oc3Vic2V0KSA9PSAyXSkgewogICAgICAgIHByaW50KHApCiAgICAgICAgZ2xtKGxvZyhyYXRlKSB+IGFnZSArIGJtaSArIHN1YnNldCwgZGF0YT1yYXRlc1twaHlsdW0gPT0gcF0pICU+JSBzdW1tYXJ5KCkgJT4lIHByaW50KCkKICAgIH0KfQpwcmludCgiTkEiKQpnbG0obG9nKHJhdGUpIH4gYWdlICsgYm1pICsgc3Vic2V0LCBkYXRhPXJhdGVzW2lzLm5hKHBoeWx1bSldKSAlPiUgc3VtbWFyeSgpCnByaW50KCJhbGwiKQpnbG0obG9nKHJhdGUpIH4gYWdlICsgYm1pICsgc3Vic2V0LCBkYXRhPXJhdGVzKSAlPiUgc3VtbWFyeSgpCmBgYAoKYGBge3J9CmxpYnJhcnkoYXJpdmFsZS5kYXRhLmludGVyZmFjZSkKCgpnYXN0cm8gPC0gZ2V0X3NuYXBzaG90KCJhc3Nlc3NtZW50c19kaWdlc3RpdmVfaGVhbHRoIiwgY2xlYW49VCkKZ2FzdHJvWywgcHVibGljX2NsaWVudF9pZCA6PSBhcy5pbnRlZ2VyKHB1YmxpY19jbGllbnRfaWQpXQptZXJnZWQgPC0gZ2FzdHJvWywgLihwdWJsaWNfY2xpZW50X2lkLCBhc3Nlc3NtZW50X2RpZ2VzdGlvbl9ib3dlbF9tb3ZlbWVudHNfZW51bSldW3JhdGVzLCBvbj0icHVibGljX2NsaWVudF9pZCJdCm1lcmdlZFssICJib3dlbF9tb3ZlbWVudCIgOj0gYXNzZXNzbWVudF9kaWdlc3Rpb25fYm93ZWxfbW92ZW1lbnRzX2VudW1dCmBgYAoKYGBge3J9CmdncGxvdChtZXJnZWQsIGFlcyh4PWJvd2VsX21vdmVtZW50LCB5PXJhdGUsIGNvbG9yPXBoeWx1bSwgZ3JvdXA9Ym93ZWxfbW92ZW1lbnQpKSArIAogICAgZ2VvbV9qaXR0ZXIod2lkdGg9MC4yKSArIHNjYWxlX3lfbG9nMTAoKSArIAogICAgZ2VvbV9ib3hwbG90KGNvbG9yPSJibGFjayIsIHdpZHRoPTAuMiwgb3V0bGllci5jb2xvcj1OQSkKCmdsbShsb2cocmF0ZSkgfiBib3dlbF9tb3ZlbWVudCArIHNleCArIGFnZSwgZGF0YT11bmlxdWUobWVyZ2VkWywgLihyYXRlLCBib3dlbF9tb3ZlbWVudCwgc2V4LCBhZ2UpXSksIGZhbWlseT0iZ2F1c3NpYW4iKSAlPiUgc3VtbWFyeSgpCmBgYApgYGB7cn0KZ2xtKGZhY3RvcihzdWJzZXQpIH4gYm93ZWxfbW92ZW1lbnQgKyBzZXggKyBhZ2UsIGRhdGE9dW5pcXVlKG1lcmdlZFssIC4ocHVibGljX2NsaWVudF9pZCwgYm93ZWxfbW92ZW1lbnQsIHNleCwgYWdlLCBzdWJzZXQpXSksIGZhbWlseT0iYmlub21pYWwiKSAlPiUgc3VtbWFyeSgpCmBgYAoKIyBDb2hvcnQgY29tcG9zaXRpb24KCmBgYHtyfQpsaWJyYXJ5KG1idG9vbHMpCmdlbmVyYSA8LSBmcmVhZCgiZGF0YS9HX2NvdW50cy5jc3YiKQoKbWF0IDwtIGRjYXN0KGdlbmVyYSwgYHNhbXBsZWAgfiBnZW51cywgdmFsdWUudmFyPSJyZWFkcyIpCmlkcyA8LSBtYXRbLCBgc2FtcGxlYF0KbWF0IDwtIGFzLm1hdHJpeChtYXRbLCAiZ2VudXMiIDo9IE5VTExdKQpyb3duYW1lcyhtYXQpIDwtIGlkcwp0YXhhIDwtIG1hdHJpeChjb2xuYW1lcyhtYXQpLCBuY29sPTEpCmNvbG5hbWVzKHRheGEpIDwtICJnZW51cyIKcm93bmFtZXModGF4YSkgPC0gdGF4YVssMV0KcHMgPC0gcGh5bG9zZXEob3R1X3RhYmxlKG1hdCwgdGF4YV9hcmVfcm93cyA9IEYpLCB0YXhfdGFibGUodGF4YSkpCgpwbG90X3RheGEocHMsICJnZW51cyIpCmBgYA==