class GeneWithCounts(Gene):
"""
GeneWithCounts - extend Gene class to include counts of reads
overlapping introns and exons, and search and print routines
"""
def __init__(self, gene_str):
Gene.__init__(self, gene_str)
self.read_count = 0
self.reads_in_exons = 0
self.reads_in_introns = 0
def SearchExons(self, read):
"""
SearchExons - search exon tree for read overlap with exon
"""
self.read_count += 1
exons = self.exons
out2 = []
# now search the exons for hits
exons.intersect(tr.interval(self.name, read.positions[0], read.positions[-1]),
lambda y: out2.append(y))
if out2 != []:
self.reads_in_exons += 1
return 1 # in an exon
else:
self.reads_in_introns += 1
return 0
def Print(self, out):
"""
Print - print exon/intron counts
out - file handle, opened for write
"""
exon_frac = 0
intron_frac = 0
if self.read_count > 0:
exon_frac = float(self.reads_in_exons)/self.read_count
intron_frac = float(self.reads_in_introns)/self.read_count
out.write(','.join([self.name, self.chrom,
str(self.reads_in_exons),
str(exon_frac),
str(self.reads_in_introns),
str(intron_frac),
str(self.read_count)]) + '\n')
To process reads, we read the.bam file, search the interval tree for genes that the read overlaps, and then search the exon tree for overlaps with in that gene's exons.
def ReadBAMFile(bam_file, chrom, gene_tree):
"""
ReadBAMfile: read a bam file and count reads overlapping introns and exons
bam_file - the bam_file name
chrom - chromosome name
gene_tree - an IntervalTree. Gene search tree, the intervals are Tx ranges
We look for overlaps with genes and increment the read count for
the gene.
"""
exon_count = 0
intron_count = 0
total_reads = 0
f = pysam.Samfile(bam_file, 'rb')
for read in f.fetch(chrom):
total_reads += 1
out = []
in_exon = 0
gene_tree.intersect(tr.interval(chrom, read.positions[0], read.positions[-1]),
lambda x: out.append(x))
for x in out: # out is a list of overlaps
for gene in x.chain: # loop through each gene in the chain
if gene.SearchExons(read) == 1:
in_exon = 1
if in_exon:
exon_count += 1
else:
intron_count += 1
return exon_count, intron_count, total_reads
Finally, we traverse the tree and print the counts.def PrintExonCountsFromGenes(out, gene_tree):
"""
PrintExonCountsFromGenes - traverse the gene tree and print counts for all genes
"""
def PrintCounts(node):
for gene in node.chain:
gene.Print(out)
gene_tree.traverse(PrintCounts)
The code is available for download.
No comments:
Post a Comment