Tutorial 17: Viroid Sketch Round Trip (Mash + sourmash + Sylph)
This tutorial demonstrates a full, reproducible round-trip workflow: download a BLAST database, export it to FASTA, index with sketch tools, simulate reads from a known subset, and assert expected matches. Heavy steps are gated behind MYCELIA_RUN_EXTERNAL=true.
Setup
From the Mycelia base directory, convert this tutorial to a notebook:
julia --project=. -e 'import Literate; Literate.notebook("tutorials/17_viroid_sketch_round_trip.jl", "tutorials/notebooks", execute=false)'import Pkg
if isinteractive()
Pkg.activate("..")
end
import Mycelia
import DataFrames
import CSV
import StableRNGs
import StatsBase
import Test
run_external = get(ENV, "MYCELIA_RUN_EXTERNAL", "false") == "true"
if !run_external
println("Set MYCELIA_RUN_EXTERNAL=true to download references and run external tools.")
end
base_outdir = "results/viroid_sketch_round_trip"
reference_dir = joinpath(base_outdir, "reference")
mkpath(reference_dir)
reference_db = "ref_viroids_rep_genomes"
reference_fasta = joinpath(reference_dir, "$(reference_db).fna.gz")
reference_table_csv = joinpath(reference_dir, "viroids_reference_table.csv")
reference_genomes_dir = joinpath(reference_dir, "genomes")Step 1: Download the viroid BLAST database
reference_table = DataFrames.DataFrame()
if run_external
db_path = Mycelia.download_blast_db(db=reference_db, dbdir=Mycelia.DEFAULT_BLASTDB_PATH)
Test.@test !isempty(db_path)Step 2: Export to FASTA + build reference table
reference_table = Mycelia.prepare_blast_reference_table(
blastdb=db_path,
blastdbs_dir=Mycelia.DEFAULT_BLASTDB_PATH,
download_if_missing=false,
reference_fasta=reference_fasta,
taxonomy_map_out=joinpath(reference_dir, "$(reference_db).seqid2taxid.txt.gz"),
table_out=reference_table_csv,
force=false
)
Test.@test isfile(reference_fasta)
Test.@test !isempty(reference_table)
endStep 3: Build reference indices for mash + sourmash
reference_sourmash_sig = ""
reference_mash_db = ""
reference_id_map = Dict{String,String}()
reference_genomes = String[]
if run_external && !isempty(reference_table)
reference_id_map = Mycelia.split_fasta_by_record(
fasta_in=reference_fasta,
outdir=reference_genomes_dir,
gzip=false,
force=false
)
reference_genomes = Mycelia.find_fasta_files(reference_genomes_dir)
Test.@test !isempty(reference_genomes)
reference_sourmash_dir = joinpath(reference_dir, "sourmash")
reference_mash_dir = joinpath(reference_dir, "mash")
mkpath.(String[reference_sourmash_dir, reference_mash_dir])
sourmash_ref = Mycelia.run_sourmash_sketch(
input_files=[reference_fasta],
outdir=reference_sourmash_dir,
k_sizes=[31],
scaled=1000,
singleton=true
)
Test.@test !isempty(sourmash_ref.signatures)
reference_sourmash_sig = sourmash_ref.signatures[1]
mash_result = Mycelia.run_mash_sketch(
input_files=reference_genomes,
outdir=reference_mash_dir,
k=21,
s=10000,
r=false
)
Test.@test !isempty(mash_result.sketches)
reference_mash_db = Mycelia.run_mash_paste(
out_file=joinpath(reference_mash_dir, "$(reference_db).msh"),
in_files=mash_result.sketches
)
endStep 4: Simulate reads from a known subset
depth_target = 20
n_organisms = 1
readset = :illumina_pe150
replicate = 1
rng = StableRNGs.StableRNG(1234)
sim = nothing
selected_ids = String[]
scenario_dir = joinpath(base_outdir, "depth$(depth_target)_div$(n_organisms)")
if run_external && !isempty(reference_table)
available_ids = unique(String.(reference_table.sequence_id))
selected_ids = StatsBase.sample(rng, available_ids, n_organisms; replace=false)
weights = fill(1.0, n_organisms)
sim = Mycelia.simulate_metagenome_community(
reference_fasta=reference_fasta,
reference_table=reference_table,
n_organisms=n_organisms,
depth_target=depth_target,
abundance_profile=:custom,
readset=readset,
outdir=scenario_dir,
selected_ids=selected_ids,
weights=weights,
rng=rng,
replicate=replicate,
run_simulation=true,
emit_truth_reads=true
)
CSV.write(joinpath(scenario_dir, "truth_table.csv"), sim.truth_table)
endStep 5: Screen reads with mash, sourmash, and Sylph
if run_external && !isnothing(sim)
reads_forward = sim.reads.forward
reads_reverse = sim.reads.reverse
Test.@test !isnothing(reads_forward)
merged_reads = reads_forward
if !isnothing(reads_reverse)
merged_reads = joinpath(scenario_dir, "reads_merged.fq.gz")
Mycelia.concatenate_fastq_files(
fastq_files=[reads_forward, reads_reverse],
output_fastq=merged_reads,
gzip=true,
force=true
)
end
sourmash_outdir = joinpath(scenario_dir, "sourmash")
mash_outdir = joinpath(scenario_dir, "mash")
sylph_outdir = joinpath(scenario_dir, "sylph")
mkpath.(String[sourmash_outdir, mash_outdir, sylph_outdir])
query_sig = Mycelia.run_sourmash_sketch(
input_files=[merged_reads],
outdir=sourmash_outdir,
k_sizes=[31],
scaled=1000
).signatures[1]
sourmash_gather = Mycelia.run_sourmash_gather(
query_sig=query_sig,
database_sig=reference_sourmash_sig,
outdir=sourmash_outdir,
k_size=31
)
sourmash_df = CSV.read(sourmash_gather.results_csv, DataFrames.DataFrame)
mash_screen = Mycelia.run_mash_screen(
reference=reference_mash_db,
query=merged_reads,
outdir=mash_outdir,
winner_takes_all=true
)
mash_df = Mycelia.parse_mash_screen_output(mash_screen.results_tsv)
sylph_result = if isnothing(reads_reverse)
Mycelia.run_sylph_profile(
reference_fastas=reference_genomes,
sample_reads=[reads_forward],
outdir=sylph_outdir
)
else
Mycelia.run_sylph_profile(
reference_fastas=reference_genomes,
first_pairs=[reads_forward],
second_pairs=[reads_reverse],
outdir=sylph_outdir
)
endStep 6: Assert expected matches
We expect the selected reference(s) to appear in the top hits.
truth_ids = String.(selected_ids)
Test.@test !isempty(mash_df)
mash_preds = [Mycelia.normalize_reference_label(x, reference_id_map) for x in mash_df.reference]
Test.@test truth_ids[1] in mash_preds
Test.@test !isempty(sourmash_df)
name_col = if "name" in DataFrames.names(sourmash_df)
"name"
elseif "match_name" in DataFrames.names(sourmash_df)
"match_name"
else
""
end
Test.@test !isempty(name_col)
sourmash_preds = [Mycelia.normalize_reference_label(x, reference_id_map) for x in sourmash_df[!, name_col]]
Test.@test truth_ids[1] in sourmash_preds
Test.@test !isempty(sylph_result.table)
sylph_col = first(filter(col -> col in DataFrames.names(sylph_result.table), ["genome", "reference", "ref", "name"]), "")
Test.@test !isempty(sylph_col)
sylph_preds = [Mycelia.normalize_reference_label(x, reference_id_map) for x in sylph_result.table[!, sylph_col]]
Test.@test truth_ids[1] in sylph_preds
end