We want to see whether our data supports edits or SNPs at the reported positions. We can use Pandas to read the spreadsheet table and PySam to iterate over the reads, counting coverage and looking for possible mismatches to the reference genome at each site.
We will store the sites from the spreadsheet in a Site object:
class Site(object):
    """
    Site - ModEncode predicted SNP

    init is passed a 7 element list from the supplemental spreadsheet.
    It also contains dictionaries for counting the coverage at the
    site position.
    """
    def __init__(self, data_series, what):
        self.what = what
        self.name = data_series[0]
        self.chrom = 'chr' + str(data_series[1])
        if data_series[2] == 'A':
            self.strand = '+'
        else:
            self.strand = '-'
        self.char = data_series[2]
        self.pos = data_series[3]
        if data_series[4] == 'Yes' or data_series[4] == 'No':
            self.g_rich = data_series[4]
        else:
            self.g_rich = ''
        if data_series[5] == 'Yes' or data_series[5] == 'No':
            self.tested = data_series[5]
        else:
            self.tested = ''
        if data_series[6] == 'Yes' or data_series[6] == 'No':
            self.validated = data_series[6]
        else:
            self.validated = ''
        self.forward_counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
        self.rev_counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0}

    def Print(self, out):
        """
        Print - print the site data and counts. Test for possible
        SNPs and edits.
        """
        edit_evidence = 'N'
        if self.strand == '+' and self.forward_counts['G'] > 0:
            edit_evidence = 'Y'
        if self.strand == '-' and self.rev_counts['C'] > 0:
            edit_evidence = 'Y'

        possible_snp = 'N'
        if self.forward_counts['G'] > 0 and self.rev_counts['C'] > 0:
            possible_snp = 'Y'
            edit_evidence = 'N'

        out.write(','.join([self.what, self.name, self.chrom,
                            self.strand, self.char, str(self.pos),
                            self.g_rich, self.tested, self.validated,
                            str(self.forward_counts['A']),
                            str(self.forward_counts['C']),
                            str(self.forward_counts['G']),
                            str(self.forward_counts['T']),
                            str(self.rev_counts['A']),
                            str(self.rev_counts['C']),
                            str(self.rev_counts['G']),
                            str(self.rev_counts['T']),
                            edit_evidence, possible_snp]) + '\n')
The Site initializer is passed a list containing the elements of a spreadsheet row, along with the site type. The spreadsheet table is divided into three sections, labeled embryonic, low adult, and normal, indicating the conditions under which the site was found. The Print routine checks for mismatches to the reference and writes the data as a row of a CSV file.
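For example, a hypothetical spreadsheet row (the values below are made up for illustration, not taken from the paper) would be handled like this:

import sys

# A made-up row: name, chromosome, edited base, position,
# G-rich flag, tested flag, validated flag.
row = ['CG12345', '2L', 'A', 1234567, 'Yes', 'Yes', 'No']
site = Site(row, 'embryonic')

# With no reads counted yet, Print writes a CSV row with zero counts.
site.Print(sys.stdout)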
Here's the routine for reading the spreadsheet. We store the sites in a dictionary by type and by chromosome within each type.
import warnings

import pandas as pd

def ReadSpreadsheet(xlsx_file):
    """
    ReadSpreadsheet - read table 7 of the supplemental spreadsheet

    xlsx_file - the spreadsheet file

    The spreadsheet is divided into 3 types of sites. We handle the 3
    separately in a dictionary. Sites of each type are stored in a
    dictionary keyed by chromosome.

    returns edit_sites - a dictionary of dictionaries of Site objects
    """
    with warnings.catch_warnings():
        # ignore useless warnings on oscar
        warnings.simplefilter("ignore")
        xl = pd.ExcelFile(xlsx_file)

    edit_sites = {'embryonic': dict(), 'low_adult': dict(), 'normal': dict()}

    data = xl.parse('Table 7', index_col=None, header=None)
    (rows, cols) = data.shape
    for row in xrange(4, rows):
        if isinstance(data.ix[row][1], basestring):
            site = Site(list(data.ix[row][1:8]), 'embryonic')
            if site.chrom not in edit_sites['embryonic']:
                edit_sites['embryonic'][site.chrom] = []
            edit_sites['embryonic'][site.chrom].append(site)
        if isinstance(data.ix[row][9], basestring):
            site = Site(list(data.ix[row][9:16]), 'low_adult')
            if site.chrom not in edit_sites['low_adult']:
                edit_sites['low_adult'][site.chrom] = []
            edit_sites['low_adult'][site.chrom].append(site)
        if isinstance(data.ix[row][17], basestring):
            site = Site(list(data.ix[row][17:24]), 'normal')
            if site.chrom not in edit_sites['normal']:
                edit_sites['normal'][site.chrom] = []
            edit_sites['normal'][site.chrom].append(site)

    return edit_sites
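As a quick sanity check, we can load the table and count the sites of each type per chromosome (the spreadsheet file name here is just a placeholder):

edit_sites = ReadSpreadsheet('supplemental_table_7.xlsx')
for what in edit_sites:
    for chrom in edit_sites[what]:
        print what, chrom, len(edit_sites[what][chrom])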
We read the BAM file with pileup and, at each pileup column, iterate through the sites for that chromosome. This could be made much more efficient, but there are only a small number of sites per chromosome, so searching the site list at each column gets the job done with a minimum of programming effort and still runs in reasonable time.
import pysam

def ReadBamFile(bam_file, edit_sites, chrom):
    """
    ReadBamFile - read a bam file and count coverage of site positions

    bam_file - the name of the bam file
    edit_sites - a dictionary of dictionaries of Site objects
                 created by ReadSpreadsheet
    chrom - the chromosome name for this bam file

    Iterate through the reads using pileup. If the read position
    matches a site position, add to the counts.
    """
    def read_ok(read, bad_reads):
        """
        read_ok - reject reads with a low quality (lt 30) base call

        read - a PySam AlignedRead object
        bad_reads - a dictionary of already flagged reads

        returns: True if the read is ok
        """
        if read.qname in bad_reads:
            return False
        if any([ord(c) - 33 < _BASE_QUAL_CUTOFF for c in list(read.qual)]):
            bad_reads[read.qname] = 1
            return False
        else:
            return True

    _BASE_QUAL_CUTOFF = 30
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

    bad_reads = dict()
    bam = pysam.Samfile(bam_file, "rb")
    for pileup_col in bam.pileup(chrom, max_depth=1000000):
        for k in edit_sites:
            if chrom in edit_sites[k]:
                for site in edit_sites[k][chrom]:
                    # spreadsheet positions are 1-based; pileup positions are 0-based
                    if site.pos - 1 == pileup_col.pos:
                        for pileup_read in pileup_col.pileups:
                            aln = pileup_read.alignment
                            if not read_ok(aln, bad_reads):
                                continue
                            if pileup_read.is_del == 0 and pileup_read.indel == 0:
                                if aln.is_reverse:
                                    site.rev_counts[complement[aln.seq[pileup_read.qpos]]] += 1
                                else:
                                    site.forward_counts[aln.seq[pileup_read.qpos]] += 1
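Putting the pieces together, the top level looks roughly like this. This is a sketch, not the downloadable program itself: the spreadsheet name, the one-BAM-file-per-chromosome naming scheme, and the output file name are all assumptions.

def Main():
    # placeholder file names throughout
    edit_sites = ReadSpreadsheet('supplemental_table_7.xlsx')
    for chrom in ['chr2L', 'chr2R', 'chr3L', 'chr3R', 'chr4', 'chrX']:
        # assumes one BAM file per chromosome, e.g. chr2L.bam
        ReadBamFile(chrom + '.bam', edit_sites, chrom)

    out = open('site_counts.csv', 'w')
    for what in edit_sites:
        for chrom in edit_sites[what]:
            for site in edit_sites[what][chrom]:
                site.Print(out)
    out.close()

if __name__ == '__main__':
    Main()

If speed ever became an issue, the obvious fix would be to pileup only over each site's coordinates, e.g. bam.pileup(chrom, site.pos - 1, site.pos), instead of scanning the whole chromosome, but for a handful of sites the simple scan is fine.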
You can download the entire program.
St Laurent, G., M. R. Tackett, S. Nechkin, D. Shtokalo, D. Antonets, Y. A. Savva, R. Maloney, P. Kapranov, C. E. Lawrence and R. A. Reenan (2013). "Genome-wide analysis of A-to-I RNA editing by single-molecule sequencing in Drosophila." Nat Struct Mol Biol 20(11): 1333-1339.