Tutorial 14: Binning Workflow with VAMB, MetaBAT2, and dRep
This tutorial demonstrates a practical metagenomic binning workflow:
- Prepare contigs and a coverage/depth table
- Run
VAMBandMetaBAT2on the same assembly - Inspect the resulting contig-to-bin assignments and bin FASTAs
- Dereplicate candidate MAGs with
dRep
The tutorial is designed to be useful in two modes:
- Default mode: lightweight parser demos and exact setup guidance
- External mode (
MYCELIA_RUN_EXTERNAL=true): run the full workflow
If you do not already have contigs and a depth table, the external mode can generate reproducible synthetic inputs via Mycelia.get_binning_test_inputs().
Setup
From the Mycelia base directory, convert this tutorial to a notebook:
julia --project=. -e 'import Literate; Literate.notebook("tutorials/14_binning_workflow.jl", "tutorials/notebooks", execute=false)'import Pkg
if isinteractive()
Pkg.activate("..")
end
import Mycelia
import DataFrames
const RUN_EXTERNAL = get(ENV, "MYCELIA_RUN_EXTERNAL", "false") == "true"
const CONTIGS_FASTA_ENV = get(ENV, "MYCELIA_BINNING_CONTIGS", "")
const DEPTH_FILE_ENV = get(ENV, "MYCELIA_BINNING_DEPTH", "")
const GENOMES_ENV = filter(!isempty, strip.(split(get(ENV, "MYCELIA_BINNING_GENOMES", ""), ',')))
function _preview_lines(path::String; n::Int = 5)
isfile(path) || return String[]
lines = String[]
open(path, "r") do io
while !eof(io) && length(lines) < n
push!(lines, readline(io))
end
end
return lines
end
function _find_first_matching_file(dir::String, patterns::Vector{Regex})
isdir(dir) || return nothing
for (dirpath, _, filenames) in walkdir(dir)
for name in sort(filenames)
for pattern in patterns
if occursin(pattern, name)
return joinpath(dirpath, name)
end
end
end
end
return nothing
end
function _fasta_paths(dir::String)
isdir(dir) || return String[]
fasta_paths = String[]
for name in sort(readdir(dir))
path = joinpath(dir, name)
isfile(path) || continue
lower_name = lowercase(name)
if endswith(lower_name, ".fa") ||
endswith(lower_name, ".fna") ||
endswith(lower_name, ".fasta")
push!(fasta_paths, path)
end
end
return fasta_paths
end
function _summarize_fasta_files(paths::Vector{String})
rows = NamedTuple[]
for path in paths
record_count = 0
total_bases = 0
for record in Mycelia.open_fastx(path)
record_count += 1
total_bases += length(Mycelia.FASTX.sequence(record))
end
push!(rows, (
genome = basename(path),
records = record_count,
total_bases = total_bases
))
end
if isempty(rows)
return DataFrames.DataFrame(
genome = String[],
records = Int[],
total_bases = Int[]
)
end
return DataFrames.DataFrame(rows)
end
function _summarize_assignments(df::DataFrames.DataFrame)
if DataFrames.nrow(df) == 0
return DataFrames.DataFrame(bin = String[], n_contigs = Int[])
end
summary = DataFrames.combine(
DataFrames.groupby(df, :bin),
:contig => length => :n_contigs
)
DataFrames.sort!(summary, :n_contigs, rev = true)
return summary
end
function _print_section(title::String)
println()
println(title)
println(repeat("=", length(title)))
end
println("=== Binning Workflow Tutorial ===")
println("Workflow: contigs + depth -> VAMB / MetaBAT2 -> candidate MAGs -> dRep")
workdir = mktempdir()
println("Scratch workspace: ", workdir)Step 1: Learn the parser interfaces first
The workflow wrappers emit files that you typically inspect or pass into later stages. These parsers are cheap to run and clarify what the downstream tables should look like before invoking external tools.
_print_section("Step 1: Parser utilities")
toy_assignments = joinpath(workdir, "toy_assignments.tsv")
open(toy_assignments, "w") do io
write(io, "contig\tbin\tlength\n")
write(io, "contig_001\tbin_A\t120000\n")
write(io, "contig_002\tbin_A\t95000\n")
write(io, "contig_003\tbin_B\t87000\n")
end
assignments_df = Mycelia.parse_bin_assignments(toy_assignments)
println("Parsed contig-to-bin assignments:")
println(assignments_df)
println("Assignment summary:")
println(_summarize_assignments(assignments_df))
toy_drep_clusters = joinpath(workdir, "toy_drep_clusters.csv")
open(toy_drep_clusters, "w") do io
write(io, "genome,secondary_cluster,representative,ani\n")
write(io, "metabat_bin_01.fna,1,metabat_bin_01.fna,0.999\n")
write(io, "vamb_bin_02.fna,1,metabat_bin_01.fna,0.998\n")
write(io, "metabat_bin_03.fna,2,metabat_bin_03.fna,0.995\n")
end
drep_clusters_df = Mycelia.parse_drep_clusters(toy_drep_clusters)
println("Parsed dRep clustering table:")
println(drep_clusters_df)Step 2: Resolve workflow inputs
External mode can use either:
- user-provided data from environment variables, or
- reproducible synthetic inputs generated by Mycelia
User-provided inputs:
MYCELIA_BINNING_CONTIGSMYCELIA_BINNING_DEPTHMYCELIA_BINNING_GENOMES(optional comma-separated FASTAs for dRep)
_print_section("Step 2: Resolve workflow inputs")
inputs = nothing
inputs_source = "none"
if RUN_EXTERNAL
if isfile(CONTIGS_FASTA_ENV) && isfile(DEPTH_FILE_ENV)
inputs = (
contigs_fasta = CONTIGS_FASTA_ENV,
depth_file = DEPTH_FILE_ENV,
genomes = filter(isfile, copy(GENOMES_ENV))
)
inputs_source = "environment variables"
else
synthetic_inputs = Mycelia.get_binning_test_inputs()
inputs = (
contigs_fasta = synthetic_inputs.contigs_fasta,
depth_file = synthetic_inputs.depth_file,
genomes = synthetic_inputs.genomes
)
inputs_source = "Mycelia.get_binning_test_inputs()"
end
println("External execution enabled.")
println("Input source: ", inputs_source)
println("Contigs FASTA: ", inputs.contigs_fasta)
println("Depth table: ", inputs.depth_file)
println("Depth preview:")
for line in _preview_lines(inputs.depth_file; n = 4)
println(" ", line)
end
else
println("External execution disabled.")
println("Set MYCELIA_RUN_EXTERNAL=true to run VAMB, MetaBAT2, and dRep.")
println("If you already have data, also set:")
println(" MYCELIA_BINNING_CONTIGS=/path/to/contigs.fna")
println(" MYCELIA_BINNING_DEPTH=/path/to/jgi_depth.tsv")
println(" MYCELIA_BINNING_GENOMES=/path/to/bin1.fna,/path/to/bin2.fna")
println("Otherwise this tutorial will auto-generate synthetic inputs in external mode.")
endStep 3: Run VAMB and inspect its outputs
run_vamb accepts a standard contig FASTA plus a JGI depth table. Mycelia converts the JGI table to the abundance TSV format VAMB expects.
vamb_result = nothing
vamb_bin_fastas = String[]
if RUN_EXTERNAL && inputs !== nothing
_print_section("Step 3: Run VAMB")
vamb_outdir = joinpath(workdir, "vamb_out")
vamb_result = Mycelia.run_vamb(
contigs_fasta = inputs.contigs_fasta,
depth_file = inputs.depth_file,
outdir = vamb_outdir
)
println("VAMB output directory: ", vamb_result.outdir)
println("VAMB clusters table: ", vamb_result.clusters_tsv)
if vamb_result.clusters_tsv !== nothing
println("VAMB cluster preview:")
for line in _preview_lines(vamb_result.clusters_tsv; n = 5)
println(" ", line)
end
end
vamb_bin_fastas = _fasta_paths(joinpath(vamb_result.outdir, "bins"))
if isempty(vamb_bin_fastas)
println("VAMB did not emit bin FASTAs in this run; continuing with any clusters table and MetaBAT2 bins.")
else
println("VAMB bin FASTA summary:")
println(_summarize_fasta_files(vamb_bin_fastas))
end
endStep 4: Run MetaBAT2 and inspect produced bins
MetaBAT2 emits bin FASTAs using the prefix returned in bins_prefix. For post-binning work, those FASTAs are usually the direct handoff into QC or dereplication tools.
metabat_result = nothing
metabat_bin_fastas = String[]
if RUN_EXTERNAL && inputs !== nothing
_print_section("Step 4: Run MetaBAT2")
metabat_outdir = joinpath(workdir, "metabat2_out")
metabat_result = Mycelia.run_metabat2(
contigs_fasta = inputs.contigs_fasta,
depth_file = inputs.depth_file,
outdir = metabat_outdir
)
println("MetaBAT2 output directory: ", metabat_result.outdir)
println("Bin prefix: ", metabat_result.bins_prefix)
metabat_bin_fastas = _fasta_paths(metabat_result.outdir)
if isempty(metabat_bin_fastas)
println("MetaBAT2 did not emit bin FASTAs in this run.")
else
println("MetaBAT2 bin FASTA summary:")
println(_summarize_fasta_files(metabat_bin_fastas))
end
endStep 5: Assemble the dRep candidate set
A common pattern is to dereplicate bins collected from multiple binners. This removes near-duplicate MAGs and chooses representatives before downstream refinement, QC, or catalog building.
drep_inputs = String[]
if RUN_EXTERNAL && inputs !== nothing
_print_section("Step 5: Collect candidate MAGs for dRep")
append!(drep_inputs, vamb_bin_fastas)
append!(drep_inputs, metabat_bin_fastas)
unique!(drep_inputs)
if length(drep_inputs) < 2 && !isempty(inputs.genomes)
println("Using tutorial genome FASTAs as dRep inputs because binner FASTAs are limited.")
append!(drep_inputs, inputs.genomes)
unique!(drep_inputs)
end
println("Candidate genomes for dereplication: ", length(drep_inputs))
if isempty(drep_inputs)
println("No candidate MAG FASTAs available yet.")
else
println(_summarize_fasta_files(drep_inputs))
end
endStep 6: Dereplicate MAGs with dRep
The simulated tutorial genomes do not ship with completeness/contamination metadata, so the workflow uses --ignoreGenomeQuality for the synthetic case.
if RUN_EXTERNAL && !isempty(drep_inputs)
_print_section("Step 6: Run dRep")
drep_outdir = joinpath(workdir, "drep_out")
drep_extra_args = String[]
if inputs_source == "Mycelia.get_binning_test_inputs()"
drep_extra_args = ["--ignoreGenomeQuality", "-l", "1000"]
end
drep_result = Mycelia.run_drep_dereplicate(
genomes = drep_inputs,
outdir = drep_outdir,
extra_args = drep_extra_args
)
println("dRep output directory: ", drep_result.outdir)
println("Winning genomes table: ", drep_result.winning_genomes)
if drep_result.winning_genomes !== nothing
println("Winning genomes preview:")
for line in _preview_lines(drep_result.winning_genomes; n = 5)
println(" ", line)
end
end
cluster_file = _find_first_matching_file(
drep_result.outdir,
[r"Cdb\.csv$", r"clusters.*\.csv$"]
)
if cluster_file !== nothing
println("dRep cluster table: ", cluster_file)
parsed_clusters = Mycelia.parse_drep_clusters(cluster_file)
println(parsed_clusters)
end
endStep 7: Interpret the workflow
At this point you can branch in a few directions:
- compare VAMB and MetaBAT2 bins against QC metrics
- pass dereplicated MAGs into CheckM/BUSCO/GTDB-Tk style validation
- keep both raw binning outputs and dRep representatives for provenance
_print_section("Step 7: Next steps")
println("Suggested downstream steps:")
println(" 1. Validate bin quality with completeness/contamination tools.")
println(" 2. Track provenance: which representative came from which binner.")
println(" 3. Feed dereplicated MAGs into annotation, pangenome, or comparative workflows.")
println("Tutorial workspace: ", workdir)