# Let's hunt some defense systems in marine SAGs
# read the genomes
s0 <- read_seqs(ex("gorg/gorg.fna.fai"))
s1 <- s0 %>%
# strip trailing number from contigs to get bins
dplyr::mutate(bin_id = stringr::str_remove(seq_id, "_\\d+$"))
# gene annotations from prokka
g0 <- read_feats(ex("gorg/gorg.gff.xz"))
# best hits to the PADS Arsenal database of prokaryotic defense-system genes
# $ mmseqs easy-search gorg.fna pads-arsenal-v1-prf gorg-pads-defense.o6 /tmp \
# --greedy-best-hits
f0 <- read_feats(ex("gorg/gorg-pads-defense.o6"))
f1 <- f0 %>%
# parser system/gene info
tidyr::separate(seq_id2, into = c("seq_id2", "system", "gene"), sep = ",") %>%
dplyr::filter(
evalue < 1e-10, # get rid of some spurious hits
# and let's focus just on a few systems for this example
system %in% c("CRISPR-CAS", "DISARM", "GABIJA", "LAMASSU", "THOERIS")
)
# plot the distribution of hits across full genomes
gggenomes(g0, s1, f1, wrap = 2e5) +
geom_seq() + geom_bin_label() +
scale_color_brewer(palette = "Dark2") +
geom_point(aes(x = x, y = y, color = system), data = feats())
# hilight the regions containing hits
gggenomes(g0, s1, f1, wrap = 2e5) %>%
locate(.track_id = feats) %>%
identity() +
geom_seq() + geom_bin_label() +
scale_color_brewer(palette = "Dark2") +
geom_feat(data = feats(loci), color = "plum3") +
geom_point(aes(x = x, y = y, color = system), data = feats())
# zoom in on loci
gggenomes(g0, s1, f1, wrap = 5e4) %>%
focus(.track_id = feats) +
geom_seq() + geom_bin_label() +
geom_gene() +
geom_feat(aes(color = system)) +
geom_feat_tag(aes(label = gene)) +
scale_color_brewer(palette = "Dark2")
Run the code above in your browser using DataLab