One of the most basic things you want to know when aligning sequence reads to a genome is how many reads cover a given position. Here we'll take an alignment from reads produced by an Illumina HiSeq 2500 sequence and aligned with BWA.
To count, we'll make use of PySam's pileup routine. The process is pretty simple. We have pileup iterate through each position in the genome that is covered by a read. Pileup returns a list of read overlapping the position. We through away any read that contains a base call quality below 30 and increment a count array for each read that overlaps a given position.
Here's the relevant code
for pileup_col in f.pileup(id, max_depth=12000):
for pileup_read in pileup_col.pileups:
if not pileup_read.indel or pileup_read.is_del:
aln = pileup_read.alignment
if read_ok(aln):
coverage[pileup_col.pos] += 1
coverage is a numpy array of integers. One element for each genomic position. read_ok checks to see if the read has any basecall qualities below 30. It returns false, if the read has a low quality base.
def read_ok(aln):
"""
read_ok - reject reads with a low quality
aln - a PySam AlignedRead object
returns: True if the read is ok
We maintain a dictionary of rejected reads, so once a read is flagged
we don't search it's quality list again
"""
if aln.qname in bad_reads:
return False
if any([ord(c)-33 < _BASE_QUAL_CUTOFF for c in list(aln.qual)]):
bad_reads[aln.qname] = 1
return False
else:
return True
bad_reads is a dictionary of flagged reads.
Here's the whole thing: download
import sys
import argparse
from Bio import SeqIO
import pysam
import resource
import numpy as np
def ReadFasta(file):
"""
ReadFasta - read a sequence from a fasta formatted file.
file - fasta file name
returns: a BioPython sequence record
"""
handle = open(file, 'rU')
iter = SeqIO.parse(handle, 'fasta')
record = iter.next()
handle.close()
return record
def ReadBamFile(bam_file, genome_file):
"""
ReadBamFile - read a bam file and count nucleotides by context
bam_file - bam file containing reads
genome_file - file containing the genome in fasta format
returns: a dictionary keyed by context. For each contex, there is a list of A,C,G, T counts
"""
def read_ok(aln):
"""
read_ok - reject reads with a low quality
aln - a PySam AlignedRead object
returns: True if the read is ok
We maintain a dictionary of rejected reads, so once a read is flagged
we don't seach it's quality list again
"""
if aln.qname in bad_reads:
return False
if any([ord(c)-33 < _BASE_QUAL_CUTOFF for c in list(aln.qual)]):
bad_reads[aln.qname] = 1
return False
else:
return True
_BASE_QUAL_CUTOFF = 30
_READ_LENGTH = 50
# read the genome sequence
seq = ReadFasta(genome_file)
seq_str = str(seq.seq)
coverage = np.zeros(len(seq_str), dtype=int)
read_count = 0
bad_reads = dict()
# open the file and have pileup call the callback routine for each position
f = pysam.Samfile(bam_file, 'rb')
for pileup_col in f.pileup('chr2L', max_depth=12000):
for pileup_read in pileup_col.pileups:
if not pileup_read.indel or pileup_read.is_del:
aln = pileup_read.alignment
if read_ok(aln):
coverage[pileup_col.pos] += 1
return coverage
def GetArgs():
"""
GetArgs - read the command line
returns - a bam file name, a genome file name, and an output file name
bam file is required
genome file is optional and has a default
output file is option, if not given, output is written to stdout
typing python phiX.count.py will show the options
"""
def ParseArgs(parser):
class Parser(argparse.ArgumentParser):
def error(self, message):
sys.stderr.write('error: %s\n' % message)
self.print_help()
sys.exit(2)
parser = Parser(description='Calculate Coverage from sam file.')
parser.add_argument('-b', '--bam_file',
type=str,
required=True,
help='Bam file of PhiX reads (required).')
parser.add_argument('-g', '--genome_file',
required=False,
type=str,
default='/users/lalpert/scratch/Illumina/Project_Robert_Reenan_lane5_130523/First_Pass_Masked_Genome/chromFaMasked/chr2L.fa',
help='File containing phiX genome.')
parser.add_argument('-o', '--output_file',
type=str,
help='output file context counts.')
return parser.parse_args()
parser = argparse.ArgumentParser()
args = ParseArgs(parser)
return args.bam_file, args.genome_file, args.output_file
def Main():
"""
main routine - get the input and output files and print the output
"""
bam_file, genome_file, output_file = GetArgs()
coverage = ReadBamFile(bam_file, genome_file)
if output_file is not None:
f = open(output_file, 'w')
else:
f = sys.stdout
# write the output in csv fomat
header = ','.join(['pos', 'coverage'])
f.write(header + '\n')
# calculate the frequency of each nucleotide in each context and print
for i in xrange(coverage.shape[0]):
out_str = ','.join([str(i+1), str(coverage[i])])
f.write(out_str + '\n')
f.close()
if __name__ == '__main__':
Main()
If you want to use this code, you'll have to change a couple of the constants used: chr2L and the default genome
No comments:
Post a Comment