We used Pysam to read the BAM file. While reading we filtered read pairs and rejected any read that had a base call quality or mapping quality less than 30. We also rejected read pairs when one of the pair did not map. For the pairs that passed the filter, we simply count the number with each TLEN. We save the counts in a file for plotting with R.
Here's the counting code:
def ReadBamFile(bam_file):
"""
ReadBamFile - read a bam file and count nucleotides by context
bam_file - bam file containing reads
returns:
counts - a dictionary with counts for each template length
We count reads that have mapped pairs.
"""
def read_ok(aln):
"""
read_ok - is a reak suitable for further processing
aln - a PySam AlignedRead object
returns:
True if the read is ok
"""
if aln.is_unmapped:
return False
if aln.mapq < _MAP_QUAL_CUTOFF:
return False
# note qual is offset by 64
if any([c-64 < _BASE_QUAL_CUTOFF for c in aln.qual]):
return False
else:
return True
_MAP_QUAL_CUTOFF = 30
_BASE_QUAL_CUTOFF = 30
chroms = ['chr2L', 'chr2R', 'chr3L', 'chr3R', 'chr4', 'chrX'] # we only look at these chromosomes
counts = defaultdict(lambda: 0)
read_ids = defaultdict(lambda: 0)
read_count = 0
# open the file
f = pysam.Samfile(bam_file, 'rb')
for read in f.fetch():
read_count += 1
if f.getrname(read.tid) in chroms:
if read_ok(read): # only count if read has quality lt 30
if read.is_paired and not read.mate_is_unmapped: # both mates must be mapped
read_ids[read.qname] += 1
if read_ids[read.qname] == 2:
del read_ids[read.qname] # this is the second mate, delete it from the list to save space
counts[abs(read.tlen)] += 1
if abs(read.tlen) > 20000000: # debugging
print(read.qname, read.pos, read.pnext, read.tlen, f.getrname(read.tid))
print(read)
f.close()
return counts
The full code can be downloaded.
No comments:
Post a Comment