Round-Trip Benchmark Tutorial (Viroids)
This tutorial walks through a small, reproducible round-trip benchmark: download a viroid reference database, build taxonomy, simulate reads, and screen with sketch tools. Heavy steps are gated behind MYCELIA_RUN_EXTERNAL=true.
From the Mycelia base directory, convert this tutorial to a notebook:
julia --project=. -e 'import Literate; Literate.notebook("tutorials/15_round_trip_benchmarking.jl", "tutorials/notebooks", execute=false)'import Pkg
if isinteractive()
Pkg.activate("..")
end
import Mycelia
import DataFrames
import CSV
import StableRNGs
import StatsBase
run_external = get(ENV, "MYCELIA_RUN_EXTERNAL", "false") == "true"
if !run_external
println("Set MYCELIA_RUN_EXTERNAL=true to download references and run external tools.")
end
base_outdir = "results/round_trip_tutorial"
reference_dir = joinpath(base_outdir, "reference")
mkpath(reference_dir)
reference_db = "ref_viroids_rep_genomes"
taxonomy_db = "taxdb"
reference_fasta = joinpath(reference_dir, "$(reference_db).fna.gz")Step 1: Download reference databases
We use Mycelia's BLAST DB downloader for both the viroid database and taxdb.
reference_table = DataFrames.DataFrame()
if run_external
Mycelia.download_blast_db(db=taxonomy_db)
reference_table = Mycelia.prepare_blast_reference_table(
blastdb=reference_db,
blastdbs_dir=Mycelia.DEFAULT_BLASTDB_PATH,
download_if_missing=true,
reference_fasta=reference_fasta,
taxonomy_map_out=joinpath(reference_dir, "$(reference_db).seqid2taxid.txt.gz"),
table_out=joinpath(reference_dir, "viroids_reference_table.csv"),
force=false
)
endStep 3: Define one small scenario
depth_target = 10
n_organisms = 1
balance = :equal
readset = :illumina_pe150
replicate = 1
rng = StableRNGs.StableRNG(1234)
if run_external && !isempty(reference_table)
available_ids = unique(String.(reference_table.sequence_id))
selected_ids = StatsBase.sample(rng, available_ids, n_organisms; replace=false)
weights = Mycelia.sample_abundance_weights(n_organisms=n_organisms, balance=balance, rng=rng)
scenario_dir = joinpath(base_outdir, "depth$(depth_target)_div$(n_organisms)_$(balance)")
sim = Mycelia.simulate_metagenome_community(
reference_fasta=reference_fasta,
reference_table=reference_table,
n_organisms=n_organisms,
depth_target=depth_target,
abundance_profile=:custom,
readset=readset,
outdir=scenario_dir,
selected_ids=selected_ids,
weights=weights,
rng=rng,
replicate=replicate,
run_simulation=true,
emit_truth_reads=true
)
CSV.write(joinpath(scenario_dir, "truth_table.csv"), sim.truth_table)
endStep 4: Screen reads with sketch tools
Use Mash, sourmash, and Sylph to identify supported references.
if run_external
println("Run sketch screening in the benchmark script:")
println("julia --project=. benchmarking/15_round_trip_benchmark.jl")
end