We want to see if our data supports edits or SNPs at the reported positions. We can use Pandas to read the spreadsheet table and PySam to iterate over the reads. As we iterate through the reads, we'll count coverage and look for possible mismatches to the reference genome.
We will store the sites from the spreadsheet in a `Site` object.
class Site(object):
    """
    Site - a ModEncode predicted SNP / RNA editing site.

    Constructed from a 7-element row of the supplemental spreadsheet plus a
    section label.  Also carries strand-specific dictionaries counting the
    bases observed at the site position.
    """
    def __init__(self, data_series, what):
        """
        data_series - 7-element list taken from a spreadsheet row
        what - section label for the part of the table the row came from
        """
        self.what = what
        self.name = data_series[0]
        self.chrom = 'chr' + str(data_series[1])
        # An 'A' reference base means the site was reported on the + strand.
        self.strand = '+' if data_series[2] == 'A' else '-'
        self.char = data_series[2]
        self.pos = data_series[3]
        # The Yes/No columns may hold other junk; keep only literal answers.
        self.g_rich = data_series[4] if data_series[4] in ('Yes', 'No') else ''
        self.tested = data_series[5] if data_series[5] in ('Yes', 'No') else ''
        self.validated = data_series[6] if data_series[6] in ('Yes', 'No') else ''
        # Per-base coverage counts for forward and reverse reads.
        self.forward_counts = dict.fromkeys('ACGT', 0)
        self.rev_counts = dict.fromkeys('ACGT', 0)

    def Print(self, out):
        """
        Print - write the site data and counts as one CSV line to `out`,
        together with Y/N calls for editing evidence and a possible SNP.
        """
        has_fwd_g = self.forward_counts['G'] > 0
        has_rev_c = self.rev_counts['C'] > 0
        # A G on a + strand site (or C on reverse reads of a - strand site)
        # is consistent with A-to-I editing.
        edit_evidence = 'N'
        if (self.strand == '+' and has_fwd_g) or \
           (self.strand == '-' and has_rev_c):
            edit_evidence = 'Y'
        # Mismatches on both strands look like a genomic SNP, not editing.
        possible_snp = 'N'
        if has_fwd_g and has_rev_c:
            possible_snp = 'Y'
            edit_evidence = 'N'
        fields = [self.what, self.name, self.chrom, self.strand,
                  self.char, str(self.pos),
                  self.g_rich, self.tested, self.validated]
        for counts in (self.forward_counts, self.rev_counts):
            fields.extend(str(counts[base]) for base in 'ACGT')
        fields.append(edit_evidence)
        fields.append(possible_snp)
        out.write(','.join(fields) + '\n')
The Site object's initialization routine is passed a list containing the elements of the spreadsheet row and the site type. The spreadsheet table is divided into three sections, labeled embryonic, low adult, and normal, indicating the conditions under which the site was found. The print routine checks for mismatches to the reference and prints the data to a CSV file.
Here's the routine for reading the spreadsheet. We store the sites in a dictionary by type and by chromosome within each type.
def ReadSpreadsheet(xlsx_file):
    """
    ReadSpreadsheet - read table 7 of the supplemental spreadsheet.

    xlsx_file - path of the spreadsheet file

    The sheet holds three side-by-side sections (embryonic, low adult,
    normal); each is handled separately.  Sites of each type are stored in
    a dictionary keyed by chromosome.

    returns
    edit_sites - a dict of dicts of Site objects: {type: {chrom: [Site, ...]}}
    """
    # (section label, 0-based column offset of the section's first column).
    # Each section occupies 7 columns starting at its offset.
    sections = (('embryonic', 1), ('low_adult', 9), ('normal', 17))
    with warnings.catch_warnings():  # ignore useless warnings on oscar
        warnings.simplefilter("ignore")
        xl = pd.ExcelFile(xlsx_file)
        data = xl.parse('Table 7', index_col=None, header=None)
    edit_sites = {name: dict() for name, _ in sections}
    rows = data.shape[0]
    # Data rows start after the 4-line section headers.
    # NOTE: the original used Python-2-only xrange/basestring and the
    # pandas .ix indexer (removed in pandas 1.0); with header=None the
    # frame is purely positional, so .iloc is the exact replacement.
    for row in range(4, rows):
        for name, col in sections:
            # A string in the section's first column marks a populated row;
            # empty cells parse as NaN (a float), so isinstance filters them.
            if isinstance(data.iloc[row, col], str):
                site = Site(list(data.iloc[row, col:col + 7]), name)
                edit_sites[name].setdefault(site.chrom, []).append(site)
    return edit_sites
We read the BAM file with pileup and iterate through the sites for the particular chromosome. This could have been made much more efficient, but we only have a small number of sites for each chromosome, so searching the site list each time gets the job done with a minimum of programming effort and still runs in a reasonable time.
def ReadBamFile(bam_file, edit_sites, chrom):
    """
    ReadBamFile - read a bam file and count coverage of site positions.

    bam_file - the name of the bam file
    edit_sites - a dictionary of dictionaries of Site objects created by
                 ReadSpreadsheet
    chrom - the chromosome name for this bam file

    Iterate through the reads using pileup.  If the pileup position matches
    a site position, add to that site's strand-specific base counts.
    """
    _BASE_QUAL_CUTOFF = 30

    def read_ok(read, bad_reads):
        """
        read_ok - reject reads containing any low quality (< 30) base call.
        read - a PySam AlignedRead object
        bad_reads - a dictionary of read names already flagged as bad
        returns: True if the read is ok
        """
        if read.qname in bad_reads:
            return False
        # Base qualities are ASCII characters with a Phred+33 offset.
        if any(ord(c) - 33 < _BASE_QUAL_CUTOFF for c in read.qual):
            bad_reads[read.qname] = 1
            return False
        return True

    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    bad_reads = dict()
    bam = pysam.Samfile(bam_file, "rb")
    for pileup_col in bam.pileup(chrom, max_depth=1000000):
        for sites_by_chrom in edit_sites.values():
            for site in sites_by_chrom.get(chrom, []):
                # Spreadsheet positions are 1-based; pileup positions 0-based.
                if site.pos - 1 != pileup_col.pos:
                    continue
                for pileup_read in pileup_col.pileups:
                    aln = pileup_read.alignment
                    # BUG FIX: the original had a bare `next` here, which is
                    # a no-op expression (not `continue`), so reads rejected
                    # by read_ok were still counted.
                    if not read_ok(aln, bad_reads):
                        continue
                    # Skip deletions and reads with an indel at this column.
                    if pileup_read.is_del == 0 and pileup_read.indel == 0:
                        base = aln.seq[pileup_read.qpos]
                        if aln.is_reverse:
                            # Reverse reads report the complement strand.
                            site.rev_counts[complement[base]] += 1
                        else:
                            site.forward_counts[base] += 1
You can download the entire program.
St Laurent, G., M. R. Tackett, S. Nechkin, D. Shtokalo, D. Antonets, Y. A. Savva, R. Maloney, P. Kapranov, C. E. Lawrence and R. A. Reenan (2013). "Genome-wide analysis of A-to-I RNA editing by single-molecule sequencing in Drosophila." Nat Struct Mol Biol 20(11): 1333-1339.
No comments:
Post a Comment