class GeneWithCounts(Gene): """ GeneWithCounts - extend Gene class to include counts of reads overlapping introns and exons, and search and print routines """ def __init__(self, gene_str): Gene.__init__(self, gene_str) self.read_count = 0 self.reads_in_exons = 0 self.reads_in_introns = 0 def SearchExons(self, read): """ SearchExons - search exon tree for read overlap with exon """ self.read_count += 1 exons = self.exons out2 = [] # now search the exons for hits exons.intersect(tr.interval(self.name, read.positions[0], read.positions[-1]), lambda y: out2.append(y)) if out2 != []: self.reads_in_exons += 1 return 1 # in an exon else: self.reads_in_introns += 1 return 0 def Print(self, out): """ Print - print exon/intron counts out - file handle, opened for write """ exon_frac = 0 intron_frac = 0 if self.read_count > 0: exon_frac = float(self.reads_in_exons)/self.read_count intron_frac = float(self.reads_in_introns)/self.read_count out.write(','.join([self.name, self.chrom, str(self.reads_in_exons), str(exon_frac), str(self.reads_in_introns), str(intron_frac), str(self.read_count)]) + '\n')
To process reads, we read the.bam file, search the interval tree for genes that the read overlaps, and then search the exon tree for overlaps with in that gene's exons.
def ReadBAMFile(bam_file, chrom, gene_tree): """ ReadBAMfile: read a bam file and count reads overlapping introns and exons bam_file - the bam_file name chrom - chromosome name gene_tree - an IntervalTree. Gene search tree, the intervals are Tx ranges We look for overlaps with genes and increment the read count for the gene. """ exon_count = 0 intron_count = 0 total_reads = 0 f = pysam.Samfile(bam_file, 'rb') for read in f.fetch(chrom): total_reads += 1 out = [] in_exon = 0 gene_tree.intersect(tr.interval(chrom, read.positions[0], read.positions[-1]), lambda x: out.append(x)) for x in out: # out is a list of overlaps for gene in x.chain: # loop through each gene in the chain if gene.SearchExons(read) == 1: in_exon = 1 if in_exon: exon_count += 1 else: intron_count += 1 return exon_count, intron_count, total_readsFinally, we traverse the tree and print the counts.
def PrintExonCountsFromGenes(out, gene_tree): """ PrintExonCountsFromGenes - traverse the gene tree and print counts for all genes """ def PrintCounts(node): for gene in node.chain: gene.Print(out) gene_tree.traverse(PrintCounts)
The code is available for download.
No comments:
Post a Comment