Skip to content

Commit 41a1984

Browse files
committed
Write compound (segmented) sequence locations
Given a SeqFeqture with a CompoundLocation we now correctly write out the CDS/gene using segmented coordinates. Auspice can now handle such coordinates (see <nextstrain/auspice#1684> and <#1281> for the corresponding schema updates). Note that the translations (via augur translate) of complex CDSs are not modified in this commit. Supersedes #1333
1 parent a9572d5 commit 41a1984

File tree

3 files changed

+59
-28
lines changed

3 files changed

+59
-28
lines changed

augur/ancestral.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@
2828
from Bio import SeqIO
2929
from Bio.Seq import Seq
3030
from Bio.SeqRecord import SeqRecord
31-
from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name
31+
from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name, \
32+
genome_features_to_auspice_annotation
3233
from .io.file import open_file
3334
from .io.vcf import is_vcf as is_filename_vcf
3435
from treetime.vcf_utils import read_vcf, write_vcf
@@ -455,14 +456,7 @@ def run(args):
455456
node["aa_sequences"][gene] = aa_result['tt'].sequence(T.root, as_string=True, reconstructed=True)
456457

457458
anc_seqs['reference'][gene] = aa_result['root_seq']
458-
# FIXME: Note that this is calculating the end of the CDS as 3*length of translation
459-
# this is equivalent to the annotation for single segment CDS, but not for cds
460-
# with splicing and slippage. But auspice can't handle the latter at the moment.
461-
anc_seqs['annotations'][gene] = {'seqid':args.annotation,
462-
'type':feat.type,
463-
'start': int(feat.location.start)+1,
464-
'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
465-
'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
459+
anc_seqs['annotations'].update(genome_features_to_auspice_annotation({gene: feat}, args.annotation))
466460

467461
# Save ancestral amino acid sequences to FASTA.
468462
if args.output_translations:

augur/translate.py

Lines changed: 3 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
import numpy as np
1818
from Bio import SeqIO, Seq, SeqRecord, Phylo
1919
from .io.vcf import write_VCF_translation, is_vcf as is_filename_vcf
20-
from .utils import parse_genes_argument, read_node_data, load_features, write_json, get_json_name
20+
from .utils import parse_genes_argument, read_node_data, load_features, \
21+
write_json, get_json_name, genome_features_to_auspice_annotation
2122
from treetime.vcf_utils import read_vcf
2223
from augur.errors import AugurError
2324
from textwrap import dedent
@@ -446,24 +447,7 @@ def run(args):
446447
continue
447448
reference_translations[fname] = safe_translate(str(feat.extract(reference)))
448449

449-
## glob the annotations for later auspice export
450-
#
451-
# Note that BioPython FeatureLocations use
452-
# "Pythonic" coordinates: [zero-origin, half-open)
453-
# Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
454-
annotations = {
455-
'nuc': {'start': features['nuc'].location.start+1,
456-
'end': features['nuc'].location.end,
457-
'strand': '+',
458-
'type': features['nuc'].type, # (unused by auspice)
459-
'seqid': args.reference_sequence} # (unused by auspice)
460-
}
461-
for fname, feat in features.items():
462-
annotations[fname] = {'seqid':args.reference_sequence,
463-
'type':feat.type,
464-
'start':int(feat.location.start)+1,
465-
'end':int(feat.location.end),
466-
'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
450+
annotations = genome_features_to_auspice_annotation(features, args.reference_sequence, assert_nuc=True)
467451

468452
## determine amino acid mutations for each node
469453
try:

augur/utils.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -832,3 +832,56 @@ def _get_genes_from_file(fname):
832832
print("Read in {} specified genes to translate.".format(len(unique_genes)))
833833

834834
return unique_genes
835+
836+
837+
838+
def genome_features_to_auspice_annotation(features, ref_seq_name=None, assert_nuc=False):
839+
"""
840+
Parameters
841+
----------
842+
features : dict, keys: feature names, values: Bio.SeqFeature.SeqFeature objects
843+
ref_seq_name : str (optional)
844+
Exported as the `seqid` for each feature. Note this is unused by Auspice
845+
assert_nuc : bool (optional)
846+
If true, one of the feature key names must be "nuc"
847+
848+
Returns
849+
-------
850+
dict : annotations
851+
See schema-annotations.json for the schema this conforms to
852+
853+
"""
854+
from Bio.SeqFeature import SimpleLocation, CompoundLocation
855+
856+
if assert_nuc and 'nuc' not in features:
857+
raise AugurError("Genome features must include a feature for 'nuc'")
858+
859+
def _parse(feat):
860+
a = {}
861+
# Note that BioPython locations use "Pythonic" coordinates: [zero-origin, half-open)
862+
# Starting with augur v6 we use GFF coordinates: [one-origin, inclusive]
863+
if type(feat.location)==SimpleLocation:
864+
a['start'] = int(feat.location.start)+1
865+
a['end'] = int(feat.location.end)
866+
elif type(feat.location)==CompoundLocation:
867+
a['segments'] = [
868+
{'start':int(segment.start)+1, 'end':int(segment.end)}
869+
for segment in feat.location.parts # segment: SimpleLocation
870+
]
871+
else:
872+
raise AugurError(f"Encountered a genome feature with an unknown location type {type(feat.location):q}")
873+
a['strand'] = {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]
874+
a['type'] = feat.type # (unused by auspice)
875+
if ref_seq_name:
876+
a['seqid'] = ref_seq_name # (unused by auspice)
877+
return a
878+
879+
annotations = {}
880+
for fname, feat in features.items():
881+
annotations[fname] = _parse(feat)
882+
if fname=='nuc':
883+
assert annotations['nuc']['strand'] == '+', "Nuc feature must be +ve strand"
884+
elif annotations[fname]['strand'] not in ['+', '-']:
885+
print("WARNING: Feature {fname:q} uses a strand which auspice cannot display")
886+
887+
return annotations

0 commit comments

Comments
 (0)