Comparative Genomics
In [1]:
Copied!
# !pip install pycirclizely pygenomeviz
# !apt install ncbi-blast+ mummer mmseqs2
# !pip install pycirclizely pygenomeviz
# !apt install ncbi-blast+ mummer mmseqs2
In [2]:
Copied!
import plotly.io as pio
from IPython.display import HTML
import plotly.io as pio
from IPython.display import HTML
Advanced users can plot figures for comparative genomics flexibly with pyCirclize API. In this notebook, simple code recipes for comparative genomics Circos visualization utilizing pyGenomeViz align module are shown.
MUMmer¶
Plot MUMmer alignment links between query-reference genomes.
In [3]:
Copied!
from pygenomeviz.align import MUMmer # type: ignore[import-untyped]
from pygenomeviz.parser import Genbank # type: ignore[import-untyped]
from pygenomeviz.utils import ( # type: ignore[import-untyped]
load_example_genbank_dataset,
)
from pycirclizely import Circos
TICKS_INTERVAL = 1000000
# Load query & reference genbank files
gbk_files = load_example_genbank_dataset("escherichia_coli")
ref_gbk = Genbank(gbk_files[2])
query_gbk = Genbank(gbk_files[3])
# Initialize circos instance
circos = Circos(
sectors=dict(
**ref_gbk.get_seqid2size(),
**dict(reversed(list(query_gbk.get_seqid2size().items()))),
),
start=-358,
end=2,
space=4,
sector2clockwise={seqid: False for seqid in query_gbk.get_seqid2size().keys()},
)
circos.text(
f"{ref_gbk.name}\n({ref_gbk.full_genome_length:,} bp)",
r=130,
deg=35,
font=dict(size=13),
)
circos.text(
f"{query_gbk.name}\n({query_gbk.full_genome_length:,} bp)",
r=130,
deg=-35,
font=dict(size=13),
)
# Plot genomic sector axis & xticks
for sector in circos.sectors:
track = sector.add_track((99.8, 100))
track.axis(fillcolor="black")
if sector.size >= TICKS_INTERVAL:
track.xticks_by_interval(
TICKS_INTERVAL,
label_formatter=lambda v: f"{v/1000000:.1f} Mb",
label_orientation="vertical",
)
# MUMmer genome comparison & plot links
align_coords = MUMmer([query_gbk, ref_gbk]).run()
for ac in align_coords:
region1 = (ac.query_name, ac.query_start, ac.query_end)
region2 = (ac.ref_name, ac.ref_start, ac.ref_end)
color = "red" if ac.is_inverted else "grey"
circos.link(region1, region2, fillcolor=color, line=dict(color=color), r1=98, r2=98)
fig = circos.plotfig()
HTML(pio.to_html(fig, include_plotlyjs="cdn"))
from pygenomeviz.align import MUMmer # type: ignore[import-untyped]
from pygenomeviz.parser import Genbank # type: ignore[import-untyped]
from pygenomeviz.utils import ( # type: ignore[import-untyped]
load_example_genbank_dataset,
)
from pycirclizely import Circos
TICKS_INTERVAL = 1000000
# Load query & reference genbank files
gbk_files = load_example_genbank_dataset("escherichia_coli")
ref_gbk = Genbank(gbk_files[2])
query_gbk = Genbank(gbk_files[3])
# Initialize circos instance
circos = Circos(
sectors=dict(
**ref_gbk.get_seqid2size(),
**dict(reversed(list(query_gbk.get_seqid2size().items()))),
),
start=-358,
end=2,
space=4,
sector2clockwise={seqid: False for seqid in query_gbk.get_seqid2size().keys()},
)
circos.text(
f"{ref_gbk.name}\n({ref_gbk.full_genome_length:,} bp)",
r=130,
deg=35,
font=dict(size=13),
)
circos.text(
f"{query_gbk.name}\n({query_gbk.full_genome_length:,} bp)",
r=130,
deg=-35,
font=dict(size=13),
)
# Plot genomic sector axis & xticks
for sector in circos.sectors:
track = sector.add_track((99.8, 100))
track.axis(fillcolor="black")
if sector.size >= TICKS_INTERVAL:
track.xticks_by_interval(
TICKS_INTERVAL,
label_formatter=lambda v: f"{v/1000000:.1f} Mb",
label_orientation="vertical",
)
# MUMmer genome comparison & plot links
align_coords = MUMmer([query_gbk, ref_gbk]).run()
for ac in align_coords:
region1 = (ac.query_name, ac.query_start, ac.query_end)
region2 = (ac.ref_name, ac.ref_start, ac.ref_end)
color = "red" if ac.is_inverted else "grey"
circos.link(region1, region2, fillcolor=color, line=dict(color=color), r1=98, r2=98)
fig = circos.plotfig()
HTML(pio.to_html(fig, include_plotlyjs="cdn"))
Out[3]:
MMseqs¶
Search RBH(Reciprocal Best Hit) CDSs and plot hit links between query-reference genomes by MMseqs.
In [4]:
Copied!
import plotly.graph_objects as go
from pygenomeviz.align import MMseqs # type: ignore[import-untyped]
from pygenomeviz.parser import Genbank # type: ignore[import-untyped]
from pygenomeviz.utils import ( # type: ignore[import-untyped]
load_example_genbank_dataset,
)
from pycirclizely import Circos
QUERY_COLOR = "salmon"
REF_COLOR = "skyblue"
TICKS_INTERVAL = 100000
# Load query & reference genbank files
gbk_files = load_example_genbank_dataset("mycoplasma_mycoides")
query_gbk = Genbank(gbk_files[3])
ref_gbk = Genbank(gbk_files[2])
# Initialize circos instance
circos = Circos(
sectors=dict(
**ref_gbk.get_seqid2size(),
**dict(reversed(list(query_gbk.get_seqid2size().items()))),
),
start=-358,
end=2,
space=4,
sector2clockwise={seqid: False for seqid in query_gbk.get_seqid2size()},
)
# Plot genomic sector axis & xticks
for sector in circos.sectors:
track = sector.add_track((98, 100))
color = QUERY_COLOR if sector.name in query_gbk.get_seqid2size() else REF_COLOR
track.axis(fillcolor=color)
if sector.size >= TICKS_INTERVAL:
track.xticks_by_interval(
TICKS_INTERVAL,
label_formatter=lambda v: f"{v/1000000:.1f} Mb",
label_orientation="vertical",
)
# Search MMseqs RBH CDSs & plot hit links
align_coords = MMseqs([query_gbk, ref_gbk]).run()
for ac in align_coords:
region1 = (ac.query_name, ac.query_start, ac.query_end)
region2 = (ac.ref_name, ac.ref_start, ac.ref_end)
color = "red" if ac.is_inverted else "grey"
circos.link(region1, region2, fillcolor=color, line=dict(color=color))
fig = circos.plotfig()
# Add legend using dummy traces
legend_labels = [
f"{query_gbk.name} (Query)",
f"{ref_gbk.name} (Ref)",
]
legend_colors = [QUERY_COLOR, REF_COLOR]
for label, color in zip(legend_labels, legend_colors):
fig.add_trace(
go.Scatter(
x=[None],
y=None,
mode="markers",
marker=dict(color=color, symbol="square", size=12),
name=label,
showlegend=True,
)
)
fig.update_layout(
legend=dict(
x=0.5,
y=-0.1,
xanchor="center",
yanchor="top",
font=dict(size=12),
bgcolor="rgba(255,255,255,0.5)",
orientation="h",
)
)
HTML(pio.to_html(fig, include_plotlyjs="cdn"))
import plotly.graph_objects as go
from pygenomeviz.align import MMseqs # type: ignore[import-untyped]
from pygenomeviz.parser import Genbank # type: ignore[import-untyped]
from pygenomeviz.utils import ( # type: ignore[import-untyped]
load_example_genbank_dataset,
)
from pycirclizely import Circos
QUERY_COLOR = "salmon"
REF_COLOR = "skyblue"
TICKS_INTERVAL = 100000
# Load query & reference genbank files
gbk_files = load_example_genbank_dataset("mycoplasma_mycoides")
query_gbk = Genbank(gbk_files[3])
ref_gbk = Genbank(gbk_files[2])
# Initialize circos instance
circos = Circos(
sectors=dict(
**ref_gbk.get_seqid2size(),
**dict(reversed(list(query_gbk.get_seqid2size().items()))),
),
start=-358,
end=2,
space=4,
sector2clockwise={seqid: False for seqid in query_gbk.get_seqid2size()},
)
# Plot genomic sector axis & xticks
for sector in circos.sectors:
track = sector.add_track((98, 100))
color = QUERY_COLOR if sector.name in query_gbk.get_seqid2size() else REF_COLOR
track.axis(fillcolor=color)
if sector.size >= TICKS_INTERVAL:
track.xticks_by_interval(
TICKS_INTERVAL,
label_formatter=lambda v: f"{v/1000000:.1f} Mb",
label_orientation="vertical",
)
# Search MMseqs RBH CDSs & plot hit links
align_coords = MMseqs([query_gbk, ref_gbk]).run()
for ac in align_coords:
region1 = (ac.query_name, ac.query_start, ac.query_end)
region2 = (ac.ref_name, ac.ref_start, ac.ref_end)
color = "red" if ac.is_inverted else "grey"
circos.link(region1, region2, fillcolor=color, line=dict(color=color))
fig = circos.plotfig()
# Add legend using dummy traces
legend_labels = [
f"{query_gbk.name} (Query)",
f"{ref_gbk.name} (Ref)",
]
legend_colors = [QUERY_COLOR, REF_COLOR]
for label, color in zip(legend_labels, legend_colors):
fig.add_trace(
go.Scatter(
x=[None],
y=None,
mode="markers",
marker=dict(color=color, symbol="square", size=12),
name=label,
showlegend=True,
)
)
fig.update_layout(
legend=dict(
x=0.5,
y=-0.1,
xanchor="center",
yanchor="top",
font=dict(size=12),
bgcolor="rgba(255,255,255,0.5)",
orientation="h",
)
)
HTML(pio.to_html(fig, include_plotlyjs="cdn"))
Out[4]:
In [5]:
Copied!
import plotly.graph_objects as go
from pygenomeviz.align import AlignCoord, Blast # type: ignore[import-untyped]
from pygenomeviz.parser import Fasta # type: ignore[import-untyped]
from pygenomeviz.utils import ( # type: ignore[import-untyped]
ColorCycler,
interpolate_color,
load_example_fasta_dataset,
)
from pycirclizely import Circos
from pycirclizely.utils import parse_color
ColorCycler.set_cmap("Set1")
QUERY_TRACK_SIZE = 5
MIN_IDENTITY = 70
TICKS_INTERVAL = 100000
# Load target & comparison fasta files
fasta_files = load_example_fasta_dataset("mycoplasma_mycoides")
target_fasta = Fasta(fasta_files[0])
comp_fasta_list = list(map(Fasta, fasta_files[1:]))
# Initialize circos instance
circos = Circos(
sectors=target_fasta.get_seqid2size(),
space=0 if len(target_fasta.get_seqid2size()) == 1 else 2,
)
circos.text(
f"{target_fasta.name}<br>({target_fasta.full_genome_length:,} bp)",
font=dict(size=16),
)
# Blast genome comparison & plot match blocks
min_r_pos = 100
comp_name2color = {}
for idx, comp_fasta in enumerate(comp_fasta_list):
align_coords = Blast([target_fasta, comp_fasta]).run()
align_coords = AlignCoord.filter(align_coords, identity_thr=MIN_IDENTITY)
color = ColorCycler()
comp_name2color[comp_fasta.name] = color
min_r_pos -= QUERY_TRACK_SIZE
for sector in circos.sectors:
sector.add_track((min_r_pos, min_r_pos + QUERY_TRACK_SIZE), r_pad_ratio=0.1)
for ac in align_coords:
# Last added track in sector
track = circos.get_sector(ac.query_name).tracks[-1]
rect_color = interpolate_color(color, v=ac.identity, vmin=MIN_IDENTITY)
track.rect(
ac.query_start, ac.query_end, fillcolor=rect_color, line=dict(width=0)
)
# Plot genomic sector axis & xticks
for sector in circos.sectors:
track = sector.add_track((min_r_pos - 0.3, min_r_pos))
track.axis(fillcolor="black")
if sector.size >= TICKS_INTERVAL:
track.xticks_by_interval(
TICKS_INTERVAL,
outer=False,
label_formatter=lambda v: f"{v/1000000:.1f} Mb",
label_orientation="vertical",
)
fig = circos.plotfig()
# Add legend using dummy traces
for query_name, color in comp_name2color.items():
fig.add_trace(
go.Scatter(
x=[None],
y=None,
mode="markers",
marker=dict(color=parse_color(color), size=14, symbol="square"),
name=query_name,
showlegend=True,
)
)
fig.update_layout(
legend=dict(
x=0.5,
y=0.35,
xanchor="center",
yanchor="middle",
font=dict(size=12),
borderwidth=0,
),
)
HTML(pio.to_html(fig, include_plotlyjs="cdn"))
import plotly.graph_objects as go
from pygenomeviz.align import AlignCoord, Blast # type: ignore[import-untyped]
from pygenomeviz.parser import Fasta # type: ignore[import-untyped]
from pygenomeviz.utils import ( # type: ignore[import-untyped]
ColorCycler,
interpolate_color,
load_example_fasta_dataset,
)
from pycirclizely import Circos
from pycirclizely.utils import parse_color
ColorCycler.set_cmap("Set1")
QUERY_TRACK_SIZE = 5
MIN_IDENTITY = 70
TICKS_INTERVAL = 100000
# Load target & comparison fasta files
fasta_files = load_example_fasta_dataset("mycoplasma_mycoides")
target_fasta = Fasta(fasta_files[0])
comp_fasta_list = list(map(Fasta, fasta_files[1:]))
# Initialize circos instance
circos = Circos(
sectors=target_fasta.get_seqid2size(),
space=0 if len(target_fasta.get_seqid2size()) == 1 else 2,
)
circos.text(
f"{target_fasta.name}
({target_fasta.full_genome_length:,} bp)", font=dict(size=16), ) # Blast genome comparison & plot match blocks min_r_pos = 100 comp_name2color = {} for idx, comp_fasta in enumerate(comp_fasta_list): align_coords = Blast([target_fasta, comp_fasta]).run() align_coords = AlignCoord.filter(align_coords, identity_thr=MIN_IDENTITY) color = ColorCycler() comp_name2color[comp_fasta.name] = color min_r_pos -= QUERY_TRACK_SIZE for sector in circos.sectors: sector.add_track((min_r_pos, min_r_pos + QUERY_TRACK_SIZE), r_pad_ratio=0.1) for ac in align_coords: # Last added track in sector track = circos.get_sector(ac.query_name).tracks[-1] rect_color = interpolate_color(color, v=ac.identity, vmin=MIN_IDENTITY) track.rect( ac.query_start, ac.query_end, fillcolor=rect_color, line=dict(width=0) ) # Plot genomic sector axis & xticks for sector in circos.sectors: track = sector.add_track((min_r_pos - 0.3, min_r_pos)) track.axis(fillcolor="black") if sector.size >= TICKS_INTERVAL: track.xticks_by_interval( TICKS_INTERVAL, outer=False, label_formatter=lambda v: f"{v/1000000:.1f} Mb", label_orientation="vertical", ) fig = circos.plotfig() # Add legend using dummy traces for query_name, color in comp_name2color.items(): fig.add_trace( go.Scatter( x=[None], y=None, mode="markers", marker=dict(color=parse_color(color), size=14, symbol="square"), name=query_name, showlegend=True, ) ) fig.update_layout( legend=dict( x=0.5, y=0.35, xanchor="center", yanchor="middle", font=dict(size=12), borderwidth=0, ), ) HTML(pio.to_html(fig, include_plotlyjs="cdn"))
({target_fasta.full_genome_length:,} bp)", font=dict(size=16), ) # Blast genome comparison & plot match blocks min_r_pos = 100 comp_name2color = {} for idx, comp_fasta in enumerate(comp_fasta_list): align_coords = Blast([target_fasta, comp_fasta]).run() align_coords = AlignCoord.filter(align_coords, identity_thr=MIN_IDENTITY) color = ColorCycler() comp_name2color[comp_fasta.name] = color min_r_pos -= QUERY_TRACK_SIZE for sector in circos.sectors: sector.add_track((min_r_pos, min_r_pos + QUERY_TRACK_SIZE), r_pad_ratio=0.1) for ac in align_coords: # Last added track in sector track = circos.get_sector(ac.query_name).tracks[-1] rect_color = interpolate_color(color, v=ac.identity, vmin=MIN_IDENTITY) track.rect( ac.query_start, ac.query_end, fillcolor=rect_color, line=dict(width=0) ) # Plot genomic sector axis & xticks for sector in circos.sectors: track = sector.add_track((min_r_pos - 0.3, min_r_pos)) track.axis(fillcolor="black") if sector.size >= TICKS_INTERVAL: track.xticks_by_interval( TICKS_INTERVAL, outer=False, label_formatter=lambda v: f"{v/1000000:.1f} Mb", label_orientation="vertical", ) fig = circos.plotfig() # Add legend using dummy traces for query_name, color in comp_name2color.items(): fig.add_trace( go.Scatter( x=[None], y=None, mode="markers", marker=dict(color=parse_color(color), size=14, symbol="square"), name=query_name, showlegend=True, ) ) fig.update_layout( legend=dict( x=0.5, y=0.35, xanchor="center", yanchor="middle", font=dict(size=12), borderwidth=0, ), ) HTML(pio.to_html(fig, include_plotlyjs="cdn"))
Out[5]: