Counting yet again...This time, coverage

One of the most basic things you want to know when aligning sequence reads to a genome is how many reads cover a given position. Here we'll take an alignment from reads produced by an Illumina HiSeq 2500 sequence and aligned with BWA.

 To count, we'll make use of PySam's pileup routine. The process is pretty simple. We have pileup iterate through each position in the genome that is covered by a read. Pileup returns a list of read overlapping the position. We through away any read that contains a base call quality below 30 and increment a count array for each read that overlaps a given position.

Here's the relevant code

   for pileup_col in f.pileup(id, max_depth=12000):
        for pileup_read in pileup_col.pileups:
            if not pileup_read.indel or  pileup_read.is_del:
                aln = pileup_read.alignment
                if read_ok(aln):
                    coverage[pileup_col.pos] += 1


coverage is a numpy array of integers. One element for each genomic position. read_ok checks to see if the read has any basecall qualities below 30. It returns false, if the read has a low quality base.


    def read_ok(aln):
        """
        read_ok - reject reads with a low quality 

        aln - a PySam AlignedRead object
        returns: True if the read is ok

        We maintain a dictionary of rejected reads, so once a read is flagged
        we don't search it's quality list again
        """
        if aln.qname in bad_reads:
            return False

        if any([ord(c)-33 < _BASE_QUAL_CUTOFF for c in list(aln.qual)]):
            bad_reads[aln.qname] = 1
            return False
        else:
            return True


bad_reads is a dictionary of flagged reads.




Here's the whole thing: download




import sys
import argparse
from Bio import SeqIO
import pysam
import resource
import numpy as np


def ReadFasta(file):
    """
    ReadFasta - read a sequence from a fasta formatted file.

    file - fasta file name
    returns: a BioPython sequence record
    """
    handle = open(file, 'rU')
    iter = SeqIO.parse(handle, 'fasta')
    record = iter.next()
    handle.close()

    return record


def ReadBamFile(bam_file, genome_file):
    """
    ReadBamFile - read a bam file and count nucleotides by context
    bam_file - bam file containing reads
    genome_file - file containing the genome in fasta format
    returns: a dictionary keyed by context. For each contex, there is a list of A,C,G, T counts
    """

    def read_ok(aln):
        """
        read_ok - reject reads with a low quality

        aln - a PySam AlignedRead object
        returns: True if the read is ok

        We maintain a dictionary of rejected reads, so once a read is flagged
        we don't seach it's quality list again
        """
        if aln.qname in bad_reads:
            return False

        if any([ord(c)-33 < _BASE_QUAL_CUTOFF for c in list(aln.qual)]):
            bad_reads[aln.qname] = 1
            return False
        else:
            return True

    _BASE_QUAL_CUTOFF = 30
    _READ_LENGTH = 50

    # read the genome sequence
    seq = ReadFasta(genome_file)
    seq_str = str(seq.seq)

    coverage = np.zeros(len(seq_str), dtype=int)
    read_count = 0

    bad_reads = dict()

    # open the file and have pileup call the callback routine for each position
    f = pysam.Samfile(bam_file, 'rb')

    for pileup_col in f.pileup('chr2L', max_depth=12000):
        for pileup_read in pileup_col.pileups:
            if not pileup_read.indel or  pileup_read.is_del:
                aln = pileup_read.alignment
                if read_ok(aln):
                    coverage[pileup_col.pos] += 1

    return coverage


def GetArgs():
    """
    GetArgs - read the command line
    returns - a bam file name, a genome file name, and an output file name
    bam file is required
    genome file is optional and has a default
    output file is option, if not given, output is written to stdout

    typing python phiX.count.py will show the options
    """

    def ParseArgs(parser):
        class Parser(argparse.ArgumentParser):
            def error(self, message):
                sys.stderr.write('error: %s\n' % message)
                self.print_help()
                sys.exit(2)

        parser = Parser(description='Calculate Coverage from sam file.')
        parser.add_argument('-b', '--bam_file',
                            type=str,
                            required=True,
                            help='Bam file of PhiX reads (required).')
        parser.add_argument('-g', '--genome_file',
                            required=False,
                            type=str,
                            default='/users/lalpert/scratch/Illumina/Project_Robert_Reenan_lane5_130523/First_Pass_Masked_Genome/chromFaMasked/chr2L.fa',
                            help='File containing phiX genome.')
        parser.add_argument('-o', '--output_file',
                            type=str,
                            help='output file context counts.')
        return parser.parse_args()

    parser = argparse.ArgumentParser()
    args = ParseArgs(parser)

    return args.bam_file, args.genome_file, args.output_file


def Main():
    """
    main routine - get the input and output files and print the output
    """

    bam_file, genome_file, output_file = GetArgs()

    coverage = ReadBamFile(bam_file, genome_file)

    if output_file is not None:
        f = open(output_file, 'w')
    else:
        f = sys.stdout

    # write the output in csv fomat
    header = ','.join(['pos', 'coverage'])
    f.write(header + '\n')
   
    # calculate the frequency of each nucleotide in each context and print
    for i in xrange(coverage.shape[0]):
        out_str = ','.join([str(i+1), str(coverage[i])])
        f.write(out_str + '\n')
    f.close()

if __name__ == '__main__':
    Main()


If you want to use this code, you'll have to change a couple of the constants used: chr2L and the default genome

No comments:

Post a Comment