We want to count the number of reads mapped to genes and exons in the sense and anti-sense directions. As we did, previously, we will use an interval tree to store the gene and exon locations. Given the location, we search the gene interval tree for an overlap. If found and the direction of the read mapping matches the gene direction, we count it as being in the sense direction. We perform a similar search for the exons. One complication is that because of alternate splicing, a read may be indicated as overlapping several genes. We will need to be careful to count each read only once.
def ReadBamFile(chrom, chrom_path, bam_file, gene_tree):
"""
ReadBamFile - read a bam file and count coverage
chrom - chromosome name e.g. chr2L
bam_file - input bam file
gene_tree - an interval tree of gene locations
Return:
counts - total good reads
sense_counts - reads mapping to genes in the sense direction
antisense_counts - reads mapping to genes in the anti-sense direction
sense_exon_counts, antisense_exon_counts - reads mapping to exons
"""
def read_ok(aln, bad_reads):
"""
read_ok - reject reads with a low quality (lt 30) base call
aln - a PySam AlignedRead object
bad_reads - a dictionary of read ids with low quality
returns: True if the read is ok
We maintain a dictionary of rejected reads, so once a read is flagged
we don't seach it's quality list again
"""
if aln.qname in bad_reads.keys():
return False
if any([ord(c)-33 < _BASE_QUAL_CUTOFF for c in list(aln.qual)]):
bad_reads[aln.qname] = 1
return False
else:
return True
_BASE_QUAL_CUTOFF = 30
counts = 0
sense_counts = 0
antisense_counts = 0
sense_exon_counts = 0
antisense_exon_counts = 0
bad_reads = dict()
in_file = pysam.Samfile(bam_file, 'rb')
for read in in_file.fetch():
if read_ok(read, bad_reads):
counts += 1
out = []
gene_tree.intersect(tr.interval(chrom, read.positions[0],
read.positions[-1]),
lambda x: out.append(x))
if out != []:
is_sense = []
is_sense_exon = []
for x in out: # out is a list of overlaps
for gene in x.chain: # loop through each gene in the chain
if gene.strand == '+':
if not read.is_reverse: # count as sense if read is same direction as gene
is_sense.append(True)
out2 = []
gene.exons.intersect(tr.interval(gene.name, read.positions[0],
read.positions[-1]),
lambda y: out2.append(y))
if out2 != []:
is_sense_exon.append(True)
else:
is_sense.append(False)
out2 = []
gene.exons.intersect(tr.interval(gene.name, read.positions[0],
read.positions[-1]),
lambda y: out2.append(y))
if out2 != []:
is_sense_exon.append(False)
else:
if read.is_reverse:
is_sense.append(True)
out2 = []
gene.exons.intersect(tr.interval(gene.name, read.positions[0],
read.positions[-1]),
lambda y: out2.append(y))
if out2 != []:
is_sense_exon.append(True)
else:
is_sense.append(False)
out2 = []
gene.exons.intersect(tr.interval(gene.name, read.positions[0],
read.positions[-1]),
lambda y: out2.append(y))
if out2 != []:
is_sense_exon.append(False)
if any(is_sense):
sense_counts += 1
else:
antisense_counts += 1
if is_sense_exon != []:
if any(is_sense_exon):
sense_exon_counts += 1
else:
antisense_exon_counts += 1
return counts, sense_counts, antisense_counts, sense_exon_counts, antisense_exon_counts
The program can be downloaded.
No comments:
Post a Comment