Now, we would like to see how the snps within coding regions cause synonymous or non-synonymous changes to codons.
The process of determining whether a snp causes a synonymous or non-synonymous change is, in principle, straight forward. Start at the beginning of the CDS and count off by threes. You can then map each three-mer to its amino acid. The snp will modify the three-mer and represent a possible synonymous substitution (no change in the amino acid) or a non-synonymous substitution (an change to a different amino acid). Each three-mer is a codon.
Problems arise, however, because there are occasionally mismatches between the annotated exon positions and the annotated protein sequence. Luckily, in relation to the total number of annotated genes these problems are few. However, we have to be aware of annotation problems being a source of error in the results. The biggest problem is a mismatch between the annotated exons and the annotated amino acid sequence.
Protein sequences, along with genomic sequences are available from the UCSC Genome Browser site, as well as other sites.
As before, we will extend the Gene and HitType classex to include some new features. See the previous Pie post for the basics. HitType will now include constants to keep track of synonymous/non-synonymous changes due to SNPs, as well as nonsense (changing a codon to stop in the coding region) and read-through (changing a stop codon to a non stop codon) changes. HitType is also extended to contain a dictionary containing the text versions of the constants for printing, and a list of the HitType constants for convenience when iterating.
class HitType(object):
"""
HitType - bitmap constants for SNP location and types
"""
no_annotation = 0x0000
UTR5 = 0x0001
UTR3 = 0x0002
CDS = 0x0004
intron = 0x0008
est = 0x0010
ncRNA = 0x0020
UTR3_1k = 0x0040 # within 1 kb of 3' UTR
synonymous = 0x0080
non_synonymous = 0x0100
nonsense = 0x0200
intron_exon_bdry = 0x0400
read_through = 0x0800
unknown = 0x1000
HitNames = dict([(no_annotation, 'no_annotation'),
(UTR5, 'UTR5'),
(UTR3, 'UTR3'),
(CDS, 'CDS'),
(intron, 'intron'),
(est, 'est'),
(ncRNA, 'ncRNA'),
(UTR3_1k, 'UTR3_1k'),
(synonymous, 'synonymous'),
(non_synonymous, 'non_synonymous'),
(nonsense, 'nonsense'),
(intron_exon_bdry, 'intron_exon_boundary'),
(read_through, 'read_through'),
(unknown, 'unknown')])
HitTypeList = [non_synonymous,
synonymous,
intron,
nonsense,
read_through,
CDS,
UTR5,
UTR3,
UTR3_1k,
ncRNA,
no_annotation,
est,
intron_exon_bdry,
unknown]
We model the codons as lists of tuples, (positions, nucleotide), an amino acid as a single character, and the position within the gene's amino acid sequence. We also provide a couple of accessor methods and a method for reversing and complementing the codon sequence.
class codon_rec(object):
"""
codon_rec - a class for holding codon information
codons are:
condon_list - a list of tuples (genomic position, nucleotide)
aa - an amino acid character from the protein sequence
aa_pos - the position of aa within the amino acid sequence
"""
def __init__(self, codon_list, aa, aa_pos):
self.codon_list = codon_list
self.aa = aa
self.aa_pos = aa_pos
def nucs(self):
"""
return the nucleotide triple as a list
"""
nuc_list = []
for n in self.codon_list:
nuc_list.append(n[1])
return nuc_list
def positions(self):
"""
return the lost of nucleotide positions
"""
pos_list = []
for n in self.codon_list:
pos_list.append(n[0])
return pos_list
def ComplemenCodonList(self):
"""
return a codon_rec with the nucleotides complemented
"""
complement = { "T": "A", "A": "T", "G": "C", "C": "G" }
new_codon_list = []
for i in xrange(3):
new_codon_list.append((self.codon_list[i][0], complement[self.codon_list[i][1]]))
new_codon = codon_rec(new_codon_list, self.aa, self.aa_pos)
return new_codon
Here's the new extended gene record. It has methods for creating a list of codon records and a debugging routine for cases where the exon annotation and the protein sequence don't match.
class GeneExtended(Gene):
"""
GeneExtended - an extension of the Gene class
We want to extent the sequence area that the gene covers to include
1 kb from the 3' UTR
"""
def __init__(self, gene_str, chroms, protein_seqs):
"""
We add a new start and end location and an extended region to the
3' UTR
chroms - a dictionary of chromosome sequences
protein_seqs - a dictionary of amino acid sequences
"""
Gene.__init__(self, gene_str)
if self.chrom in chroms:
self.seq = chroms[self.chrom]
else:
self.seq = None
if self.name in protein_seqs:
self.aa_seq = protein_seqs[self.name] + '*'
else:
self.aa_seq = None
if self.strand == '+':
self.utr3_extended = (self.txEnd + 1, self.txEnd + 1000)
self.extStart = self.txStart
self.extEnd = self.txEnd + 1000
else:
self.utr3_extended = (self.txStart - 1000, self.txStart + 1)
self.extStart = self.txStart - 1000
self.extEnd = self.txEnd
def CodonLookUp(self, codon):
"""
look up the amino acid character for a codon
"""
codon_table = Bio.Data.CodonTable.standard_dna_table.forward_table
stop_codons = Bio.Data.CodonTable.standard_dna_table.stop_codons
codon_str = ''.join([y[1] for y in codon])
if codon_str in stop_codons:
return '*'
else:
return codon_table[codon_str]
def _debug_print(self, pos, aa_pos, codon_list):
"""
dump codon list for debugging
"""
codon_table = Bio.Data.CodonTable.standard_dna_table.forward_table
stop_codons = Bio.Data.CodonTable.standard_dna_table.stop_codons
print self.name
print pos
print aa_pos
print self.chrom
print self.strand
for c in codon_list:
for pos, nuc in zip(c.positions(), c.nucs()):
print pos, nuc,
codon = ''.join(c.nucs())
if codon in stop_codons:
aa = '*'
else:
aa = codon_table[codon]
if c.aa_pos < len(self.aa_seq):
print codon, aa, c.aa, self.aa_seq[c.aa_pos], c.aa_pos
else:
print codon, aa, c.aa, ' ', c.aa_pos
def BuildCodonList(self):
"""
BuildCodonList - build a list of CDS genomic locations
we build the list every time to save memory. This is slow. We could probably get away with calculating
it once and saving it.
This routine travserses the exon tree and builds a list of exon positions. It iterates through each position
within the coding region and builds a list of codon_recs. If there is a mismatch between the predicted
codon and the codon in the protein sequence, we dump the list for debugging. This happens because the exon
annotation doesn't match the protein sequence.
returns:
codon_list - a list of codon_recs
"""
position_list = []
self.exons.traverse(lambda x: position_list.append((x.start, x.end)))
codon_list = []
codon = []
aa_pos = 0
if self.strand == '+':
for start, end in position_list:
for pos in xrange(start, end):
if pos < self.cdsStart or pos >= self.cdsEnd:
continue
codon.append((pos, self.seq[pos].upper()))
if len(codon) == 3:
if self.aa_seq is not None:
if aa_pos < len(self.aa_seq):
aa = self.aa_seq[aa_pos]
else:
print 'error: aa mismatch BuildCodonList'
self._debug_print(pos, aa_pos, codon_list)
aa = None
else:
aa = None
codon_list.append(codon_rec(codon, aa, aa_pos))
codon = []
aa_pos += 1
else:
for start, end in reversed(position_list):
for pos in xrange(end-1, start-1, -1):
if pos < self.cdsStart or pos >= self.cdsEnd:
continue
codon.append((pos, self.seq[pos].upper()))
if len(codon) == 3:
if self.aa_seq is not None:
if aa_pos < len(self.aa_seq):
aa = self.aa_seq[aa_pos]
else:
print 'error: aa mismatch BuildCodonList'
self._debug_print(pos, aa_pos, codon_list)
aa = None
else:
aa = None
codon_list.append(codon_rec(codon,aa, aa_pos).ComplementCodonList())
codon = []
aa_pos += 1
return codon_list
The predicted SNP class is similar to the previous version, with the addition of a method for determining the codon type. The idea is simple enough, we look up the codon and replace the appropriate character with the predicted SNP character and determine if the amino acid has changed. It's complicated by a need to done some debugging if there is an amino acid mismatch.
Here's the codon type routine.
def CodonType(self, gene):
complement = { "T": "A", "A": "T", "G": "C", "C": "G" }
codon_table = Bio.Data.CodonTable.standard_dna_table.forward_table
stop_codons = Bio.Data.CodonTable.standard_dna_table.stop_codons
codon_list = gene.BuildCodonList()
pos_in_codon, cd = self.SearchCodonList(codon_list, self.position-1)
# if we don't get a hit, we call it an intron
# these problems arise because of discrepancies between the
# gene file and codon file
if pos_in_codon == -1:
return HitType.intron
codon = ''.join(cd.nucs())
if gene.strand == '+':
if codon[pos_in_codon] != self.ref:
print 'error codon mismatch:', codon, codon[pos_in_codon], self.ref, pos_in_codon, self.position, cd.aa
self._debug_print(gene, codon_list)
if cd.aa != self.CodonLookup(codon):
print 'error aa mismatch:', codon, codon[pos_in_codon], self.ref, pos_in_codon, self.position, cd.aa
self._debug_print(gene, codon_list)
c_list = list(codon)
c_list[pos_in_codon] = self.poly
new_codon = ''.join(c_list)
else:
if codon[pos_in_codon] != complement[self.ref]:
print 'error codon mismatch:', codon, codon[pos_in_codon], self.ref, pos_in_codon, self.position
self._debug_print(gene, codon_list)
if cd.aa != self.CodonLookup(codon):
print 'error aa mismatch:', codon, codon[pos_in_codon], self.ref, pos_in_codon, self.position, cd.aa
self._debug_print(gene, codon_list)
c_list = list(codon)
c_list[pos_in_codon] = complement[self.poly]
new_codon = ''.join(c_list)
self.codons.append(codon)
self.new_codons.append(new_codon)
if codon in stop_codons:
if new_codon in stop_codons:
return HitType.synonymous
else:
return HitType.read_through
elif new_codon in stop_codons:
return HitType.nonsense
elif codon_table[new_codon] == codon_table[codon]:
return HitType.synonymous
else:
return HitType.non_synonymous
The whole program can be downloaded.
No comments:
Post a Comment