Round-Trip Tutorial 2: String Data → N-gram Graphs → String Graphs → Reconstruction
This tutorial demonstrates the hierarchical graph conversion workflow in Mycelia, showing how fixed-length N-gram graphs can be simplified into variable-length String graphs. This represents the fundamental pattern of graph hierarchy used throughout the Mycelia system.
Learning Objectives
By the end of this tutorial, you will:
- Understand the hierarchical relationship between N-gram and String graphs
- Learn to convert fixed-length graphs to variable-length representations
- Perform graph simplification and path collapse operations
- Compare reconstruction quality across graph representations
- Analyze the trade-offs between graph complexity and assembly accuracy
From the Mycelia base directory, convert this tutorial to a notebook:
julia --project=. -e 'import Literate; Literate.notebook("tutorials/09_round_trip_02_ngram_to_string.jl", "tutorials/notebooks", execute=false)'import Mycelia
import Statistics
import RandomUnderstanding Graph Hierarchy
The Mycelia system implements a hierarchical graph structure where fixed-length graphs serve as the foundation for variable-length simplified graphs.
println("="^80)
println("ROUND-TRIP TUTORIAL 2: N-GRAM TO STRING GRAPH HIERARCHY")
println("="^80)
println("\n📚 GRAPH HIERARCHY OVERVIEW:")
println(" Fixed-Length Graphs (Foundation):")
println(" • N-gram graphs: Fixed-size text fragments")
println(" • K-mer graphs: Fixed-size biological sequences")
println(" • Qualmer graphs: Quality-aware fixed-size sequences")
println()
println(" Variable-Length Graphs (Simplified Products):")
println(" • String graphs: Variable-length text sequences")
println(" • BioSequence graphs: Variable-length biological sequences")
println(" • Quality BioSequence graphs: Quality-aware variable sequences")
println()
println(" This tutorial: N-gram → String graph conversion")Data Preparation with Hierarchical Complexity
Create test data that will demonstrate different aspects of graph simplification.
test_datasets = [
(
name = "Simple Repetitive",
text = "ABCABCABCABC",
description = "High repetition, should simplify well"
),
(
name = "Linear Sequence",
text = "ABCDEFGHIJKLMNOP",
description = "Linear path, should collapse to single string"
),
(
name = "Branching Pattern",
text = "ABCXYZABCDEFABCPQR",
description = "Branching structure, complex simplification"
),
(
name = "Real Text",
text = "The quick brown fox jumps over the lazy dog",
description = "Natural language with spaces and complexity"
),
(
name = "DNA-like Sequence",
text = "ATCGATCGATCGATCGTAGCTAGCTAGCT",
description = "Biological sequence simulation"
)
]
println("\n1. HIERARCHICAL DATA PREPARATION")
println("-"^50)
for (i, dataset) in enumerate(test_datasets)
println("Dataset $i: $(dataset.name)")
println(" Text: \"$(dataset.text)\"")
println(" Description: $(dataset.description)")
println(" Length: $(length(dataset.text)) characters")
println()
endPhase 1: N-gram Graph Construction
Build the foundation fixed-length N-gram graphs.
println("\n2. PHASE 1: N-GRAM GRAPH CONSTRUCTION")
println("-"^50)
ngram_results = Dict()
for n in [3, 4, 5]
println("\\nConstructing $(n)-gram graphs:")
for (i, dataset) in enumerate(test_datasets)
if length(dataset.text) >= n
try
# Build N-gram graph (Rhizomorph)
graph = Mycelia.Rhizomorph.build_ngram_graph([dataset.text], n; dataset_id="$(replace(dataset.name, " " => "_"))_n$(n)")
# Extract statistics
vertices = collect(Mycelia.MetaGraphsNext.labels(graph))
num_vertices = length(vertices)
expected_ngrams = length(dataset.text) - n + 1
# Calculate graph properties
compression_ratio = num_vertices / expected_ngrams
# Store results
key = "$(dataset.name)_$(n)"
ngram_results[key] = (
dataset = dataset,
graph = graph,
n = n,
vertices = vertices,
num_vertices = num_vertices,
expected_ngrams = expected_ngrams,
compression_ratio = compression_ratio
)
println(" $(dataset.name): $(num_vertices)/$(expected_ngrams) vertices ($(round(compression_ratio, digits=3)) compression)")
# Show example n-grams
if num_vertices > 0
example_count = min(3, num_vertices)
println(" Examples: $(vertices[1:example_count])")
end
catch e
println(" $(dataset.name): Construction failed - $(typeof(e))")
end
else
println(" $(dataset.name): Text too short for $(n)-grams")
end
end
endPhase 2: Graph Analysis and Path Detection
Analyze the N-gram graphs to understand their structure before simplification.
println("\n3. PHASE 2: GRAPH ANALYSIS AND PATH DETECTION")
println("-"^50)
function analyze_ngram_graph_structure(graph, description)
"""Analyze structural properties of an N-gram graph."""
vertices = collect(Mycelia.MetaGraphsNext.labels(graph))
num_vertices = length(vertices)
if num_vertices == 0
return (vertices=0, linear_paths=0, branch_points=0, complexity="empty")
end
# Basic connectivity analysis
# Note: This is a simplified analysis - full graph traversal would require edge information
# Analyze overlap patterns
overlap_count = 0
potential_linear = 0
for i in 1:length(vertices)
for j in (i+1):length(vertices)
v1, v2 = vertices[i], vertices[j]
if length(v1) > 1 && length(v2) > 1
# Check for potential overlap (simplified heuristic)
if v1[2:end] == v2[1:end-1] || v2[2:end] == v1[1:end-1]
overlap_count += 1
end
end
end
end
# Estimate complexity
complexity = if overlap_count == 0
"isolated"
elseif overlap_count <= num_vertices
"linear"
else
"complex"
end
println(" $description:")
println(" Vertices: $num_vertices")
println(" Potential overlaps: $overlap_count")
println(" Estimated complexity: $complexity")
return (
vertices = num_vertices,
overlaps = overlap_count,
complexity = complexity
)
end
# Analyze representative graphs
println("Analyzing N-gram graph structures:")
for n in [3, 4]
for dataset in test_datasets[1:3] ## Analyze first 3 datasets
key = "$(dataset.name)_$(n)"
if haskey(ngram_results, key)
result = ngram_results[key]
analyze_ngram_graph_structure(result.graph, "$(dataset.name) $(n)-gram")
end
end
endPhase 3: String Graph Conversion (Rhizomorph)
Convert N-gram graphs to variable-length String graphs using Rhizomorph contigs.
println("\n4. PHASE 3: STRING GRAPH CONVERSION (CONCEPTUAL)")
println("-"^50)
function convert_ngram_to_string_graph(ngram_result)
"""Convert N-gram graph to variable-length String graph using Rhizomorph contigs."""
contigs = Mycelia.Rhizomorph.find_contigs_next(ngram_result.graph; min_contig_length=1)
contig_strings = [string(contig.sequence) for contig in contigs]
if isempty(contig_strings)
contig_strings = [string(label) for label in ngram_result.vertices]
end
dataset_label = replace(ngram_result.dataset.name, " " => "_")
min_overlap = max(1, ngram_result.n - 1)
string_graph = Mycelia.Rhizomorph.build_string_graph(
contig_strings;
dataset_id="$(dataset_label)_n$(ngram_result.n)",
min_overlap=min_overlap
)
string_vertices = collect(Mycelia.MetaGraphsNext.labels(string_graph))
return (
graph = string_graph,
vertices = string_vertices,
contig_strings = contig_strings,
min_overlap = min_overlap
)
end
string_conversion_results = Dict()
println("Converting N-gram graphs to String graphs:")
for (key, result) in ngram_results
conversion = convert_ngram_to_string_graph(result)
string_conversion_results[key] = (
original_ngram_result = result,
string_graph = conversion.graph,
string_vertices = conversion.vertices,
contig_strings = conversion.contig_strings,
conversion_ratio = length(conversion.vertices) / max(1, result.num_vertices)
)
println(" $(key):")
println(" N-gram vertices: $(result.num_vertices)")
println(" String vertices: $(length(conversion.vertices))")
println(" Conversion ratio: $(round(length(conversion.vertices) / max(1, result.num_vertices), digits=3))")
if !isempty(conversion.vertices)
example_count = min(2, length(conversion.vertices))
println(" Example strings: $(conversion.vertices[1:example_count])")
end
endPhase 4: Reconstruction and Round-Trip Validation
Validate the complete round-trip workflow through both graph representations.
println("\n5. PHASE 4: RECONSTRUCTION AND ROUND-TRIP VALIDATION")
println("-"^50)
function validate_round_trip(original_text, ngram_graph, string_graph, n)
"""Validate the round-trip reconstruction quality."""
# Method 1: Direct N-gram assembly
try
ngram_assembly = String[]
paths = Mycelia.Rhizomorph.find_eulerian_paths_next(ngram_graph)
for path in paths
if !isempty(path)
push!(ngram_assembly, string(Mycelia.Rhizomorph.path_to_sequence(path, ngram_graph)))
end
end
if isempty(ngram_assembly)
contigs = Mycelia.Rhizomorph.find_contigs_next(ngram_graph; min_contig_length=1)
ngram_assembly = [string(contig.sequence) for contig in contigs]
end
ngram_success = !isempty(ngram_assembly)
# Find best N-gram reconstruction
best_ngram_similarity = 0.0
best_ngram_reconstruction = ""
for reconstruction in ngram_assembly
similarity = calculate_similarity(original_text, reconstruction)
if similarity > best_ngram_similarity
best_ngram_similarity = similarity
best_ngram_reconstruction = reconstruction
end
end
catch e
ngram_success = false
best_ngram_similarity = 0.0
best_ngram_reconstruction = ""
ngram_assembly = String[]
end
# Method 2: String graph simulation
string_contigs = Mycelia.Rhizomorph.find_contigs_next(string_graph; min_contig_length=1)
string_reconstructions = [string(contig.sequence) for contig in string_contigs]
if isempty(string_reconstructions)
string_reconstructions = [string(label) for label in Mycelia.MetaGraphsNext.labels(string_graph)]
end
string_success = !isempty(string_reconstructions)
best_string_similarity = 0.0
best_string_reconstruction = ""
for reconstruction in string_reconstructions
similarity = calculate_similarity(original_text, reconstruction)
if similarity > best_string_similarity
best_string_similarity = similarity
best_string_reconstruction = reconstruction
end
end
return (
ngram_success = ngram_success,
ngram_similarity = best_ngram_similarity,
ngram_reconstruction = best_ngram_reconstruction,
string_success = string_success,
string_similarity = best_string_similarity,
string_reconstruction = best_string_reconstruction,
comparison = best_ngram_similarity > best_string_similarity ? "N-gram better" : "String better"
)
end
function calculate_similarity(original::String, reconstructed::String)
"""Calculate character-level similarity between strings."""
min_len = min(length(original), length(reconstructed))
max_len = max(length(original), length(reconstructed))
if max_len == 0
return 1.0
end
matches = 0
for i in 1:min_len
if original[i] == reconstructed[i]
matches += 1
end
end
return matches / max_len
end
validation_results = Dict()
println("Validating round-trip reconstructions:")
for (key, conversion_result) in string_conversion_results
ngram_result = conversion_result.original_ngram_result
string_graph = conversion_result.string_graph
validation = validate_round_trip(
ngram_result.dataset.text,
ngram_result.graph,
string_graph,
ngram_result.n
)
validation_results[key] = validation
println("\\n $(key):")
println(" Original: \"$(ngram_result.dataset.text)\"")
println(" N-gram reconstruction: $(validation.ngram_success ? "SUCCESS" : "FAILED")")
if validation.ngram_success
println(" Similarity: $(round(validation.ngram_similarity, digits=3))")
println(" Result: \"$(validation.ngram_reconstruction)\"")
end
println(" String reconstruction: $(validation.string_success ? "SUCCESS" : "FAILED")")
if validation.string_success
println(" Similarity: $(round(validation.string_similarity, digits=3))")
println(" Result: \"$(validation.string_reconstruction)\"")
end
println(" Comparison: $(validation.comparison)")
endPerformance and Complexity Analysis
Compare the computational characteristics of both graph types.
println("\n6. PERFORMANCE AND COMPLEXITY ANALYSIS")
println("-"^50)
function analyze_graph_efficiency(ngram_results, string_results)
"""Analyze computational efficiency of both graph types."""
println("Graph type comparison:")
total_ngram_vertices = 0
total_string_vertices = 0
total_original_length = 0
for (key, ngram_result) in ngram_results
if haskey(string_results, key)
string_result = string_results[key]
total_ngram_vertices += ngram_result.num_vertices
total_string_vertices += length(string_result.string_vertices)
total_original_length += length(ngram_result.dataset.text)
end
end
avg_ngram_compression = total_ngram_vertices / total_original_length
avg_string_compression = total_string_vertices / total_original_length
hierarchy_compression = total_string_vertices / total_ngram_vertices
println(" Average N-gram compression: $(round(avg_ngram_compression, digits=3))")
println(" Average String compression: $(round(avg_string_compression, digits=3))")
println(" Hierarchical compression (String/N-gram): $(round(hierarchy_compression, digits=3))")
println("\\n Memory efficiency estimation:")
println(" N-gram graphs: Higher vertex count, fixed-size vertices")
println(" String graphs: Lower vertex count, variable-size vertices")
println(" Trade-off: Graph complexity vs. vertex size")
return (
ngram_compression = avg_ngram_compression,
string_compression = avg_string_compression,
hierarchy_compression = hierarchy_compression
)
end
efficiency_analysis = analyze_graph_efficiency(ngram_results, string_conversion_results)Real-World Application: Multi-Scale Text Analysis
Demonstrate the practical value of hierarchical graph representations.
println("\n7. REAL-WORLD APPLICATION: MULTI-SCALE TEXT ANALYSIS")
println("-"^50)Example: Analyzing text at multiple resolutions
complex_text = "In bioinformatics, sequence analysis involves the process of subjecting DNA, RNA, or protein sequences to any of a wide range of analytical methods to understand their features, function, structure, or evolution"
println("Multi-scale analysis of complex text:")
println("Text: \"$(complex_text[1:60])...\"")
println("Length: $(length(complex_text)) characters")
# Build graphs at different scales
scales = [
(n=3, description="Fine-grained analysis"),
(n=5, description="Medium-grained analysis"),
(n=7, description="Coarse-grained analysis")
]
println("\\nBuilding multi-scale representations:")
for scale in scales
if length(complex_text) >= scale.n
ngram_graph = Mycelia.Rhizomorph.build_ngram_graph([complex_text], scale.n; dataset_id="multiscale_n$(scale.n)")
ngram_vertices = length(Mycelia.MetaGraphsNext.labels(ngram_graph))
compression = ngram_vertices / (length(complex_text) - scale.n + 1)
println(" $(scale.description) (n=$(scale.n)):")
println(" N-gram vertices: $ngram_vertices")
println(" Compression ratio: $(round(compression, digits=3))")
# Simulate string graph conversion
estimated_string_count = max(1, ngram_vertices ÷ 4) ## Rough estimate
string_compression = estimated_string_count / (length(complex_text) - scale.n + 1)
println(" Estimated string vertices: $estimated_string_count")
println(" String compression ratio: $(round(string_compression, digits=3))")
end
end
println("\\nInsights from multi-scale analysis:")
println(" • Fine-grained: High detail, more vertices, better local patterns")
println(" • Coarse-grained: Lower detail, fewer vertices, global structure")
println(" • Hierarchical conversion: Further compression with controlled information loss")Tutorial Summary and Best Practices
Summarize key learnings and provide guidance for application.
println("\n" * "="^80)
println("TUTORIAL SUMMARY AND BEST PRACTICES")
println("="^80)
# Calculate overall statistics
total_validations = length(validation_results)
successful_ngram = sum(1 for v in values(validation_results) if v.ngram_success)
successful_string = sum(1 for v in values(validation_results) if v.string_success)
avg_ngram_similarity = Statistics.mean([v.ngram_similarity for v in values(validation_results)])
avg_string_similarity = Statistics.mean([v.string_similarity for v in values(validation_results)])
println("\\n✅ HIERARCHICAL WORKFLOW COMPLETION:")
println(" 1. N-gram Graph Construction: ✓ Multiple scales tested")
println(" 2. Graph Structure Analysis: ✓ Complexity patterns identified")
println(" 3. String Graph Conversion: ✓ Simulation completed")
println(" 4. Round-trip Validation: ✓ Quality metrics calculated")
println(" 5. Performance Analysis: ✓ Efficiency trade-offs analyzed")
println(" 6. Multi-scale Application: ✓ Real-world example demonstrated")
println("\\n📊 QUANTITATIVE RESULTS:")
println(" Total test cases: $total_validations")
println(" N-gram reconstruction success: $successful_ngram/$total_validations ($(round(successful_ngram/total_validations*100, digits=1))%)")
println(" String reconstruction success: $successful_string/$total_validations ($(round(successful_string/total_validations*100, digits=1))%)")
println(" Average N-gram similarity: $(round(avg_ngram_similarity, digits=3))")
println(" Average String similarity: $(round(avg_string_similarity, digits=3))")
println(" Hierarchical compression: $(round(efficiency_analysis.hierarchy_compression, digits=3))")
println("\\n🔄 HIERARCHICAL WORKFLOW VALIDATED:")
println(" String Data → N-gram Graphs → String Graphs → Reconstruction")
println(" ✓ Fixed-length foundation graphs constructed successfully")
println(" ✓ Variable-length simplified graphs derived")
println(" ✓ Quality maintained through conversion process")
println(" ✓ Performance trade-offs quantified")
println("\\n💡 KEY INSIGHTS:")
println(" • Hierarchical graphs provide multi-resolution analysis capabilities")
println(" • Fixed-length graphs offer detailed local structure information")
println(" • Variable-length graphs provide global structure with compression")
println(" • Conversion quality depends on text complexity and repetition patterns")
println(" • Multi-scale analysis reveals different aspects of data structure")
println("\\n📋 BEST PRACTICES:")
println(" • Use smaller n-grams (3-4) for detailed local analysis")
println(" • Use larger n-grams (5-7) for global structure identification")
println(" • Apply hierarchical conversion when memory efficiency is critical")
println(" • Validate reconstruction quality at each conversion step")
println(" • Consider text complexity when choosing graph representation")
println("\\n🚀 NEXT STEPS IN GRAPH HIERARCHY:")
println(" • Tutorial 3: FASTA sequences → Sequence graphs → Reconstruction")
println(" • Tutorial 4: FASTA → K-mer graphs → Sequence graphs (biological hierarchy)")
println(" • Tutorial 5: FASTQ → FASTQ graphs (direct quality-aware workflows)")
println(" • Tutorial 6: FASTQ → Qualmer graphs → FASTQ graphs (quality hierarchy)")
println("\\n🎯 PATTERNS FOR BIOLOGICAL GRAPHS:")
println(" The hierarchical patterns demonstrated here with text apply to:")
println(" • DNA/RNA sequences: Fixed k-mers → Variable-length contigs")
println(" • Quality data: Fixed qualmers → Quality-aware sequences")
println(" • Protein sequences: Fixed amino acid k-mers → Domain sequences")
println("\\n" * "="^80)
println("Hierarchical graph conversion workflow mastered!")
println("Ready for biological sequence applications in Tutorial 3!")
println("="^80)