The other day I showed how to filter a bam file to remove low quality reads. As a side effect of this, we also removed all reads that did not map to the particular chromosome that the bam file referenced. This greatly reduced the size of the bam file. In addition, it makes the next task much easier.
We want to know about the read coverage in each of the chromosomes to visualize how the coverage might differ among runs and sources of RNA and DNA. Coverage is the number of reads overlapping a particular genomic position. Counting coverage is pretty easy with the PySam module. Here's one way to do it:
def ReadBamFile(chrom, bam_file, genome_file):
"""
ReadBamFile - read a bam file and count nucleotides by context
chrom - the chromosome name as aa string, e.g. chr2L
bam_file - bam file containing reads
genome_file - file containing the genome in fasta format
returns: a numpy integer array of counts for each genomic position
"""
# read the genome sequence
seq = ReadFasta(genome_file)
seq_str = str(seq.seq)
coverage = np.zeros(len(seq_str), dtype=int)
# open the file and have pileup call the callback routine for each position
f = pysam.Samfile(bam_file, 'rb')
# pileupcol.n has the number of reads covering a position
for pileup_col in f.pileup(chrom, max_depth=12000):
coverage[pileup_col.pos] = pileup_col.n
return coverage
This routine is passed a chromosome name, a bam file name, and the name of a fasta file containing the genomes sequence. We read the genome file to get the length of the chromosome and create a numpy integer array to hold the coverage counts. We then use PySam's pileup routine to iterate through each position covered by reads. pileup() is an iterator that returns a sequence of pileup column objects. One of the object's properties is a count of the number of reads overlapping the column position. We store that number in an array and return the array. Then , the rest of the program just writes the positions and counts to a comma separated variable (csv) file.
This routines is not particularly efficient or elegant, but it's easy to write and does the job. It's also easy to run it in parallel for each chromosome and DNA/RNA type and generate a set of coverage files. Here's the whole program. There's not much to it: download
#!/gpfs/runtime/opt/python/2.7.3/bin/python
import sys
import argparse
from Bio import SeqIO
import pysam
import resource
import numpy as np
import re
def ReadFasta(file):
"""
ReadFasta - read a sequence from a fasta formatted file.
file - fasta file name
returns: a BioPython sequence record
"""
handle = open(file, 'rU')
iter = SeqIO.parse(handle, 'fasta')
record = iter.next()
handle.close()
return record
def ReadBamFile(chrom, bam_file, genome_file):
"""
ReadBamFile - read a bam file and count nucleotides by context
chrom - the chromosome name as aa string, e.g. chr2L
bam_file - bam file containing reads
genome_file - file containing the genome in fasta format
returns: a numpy integer array of counts for each genomic position
"""
# read the genome sequence
seq = ReadFasta(genome_file)
seq_str = str(seq.seq)
coverage = np.zeros(len(seq_str), dtype=int)
# open the file and have pileup call the callback routine for each position
f = pysam.Samfile(bam_file, 'rb')
# pileupcol.n has the number of reads covering a position
for pileup_col in f.pileup(chrom, max_depth=12000):
coverage[pileup_col.pos] = pileup_col.n
return coverage
def GetArgs():
"""
GetArgs - read the command line
returns - a bam file name, a genome file name, and an output file name
bam file is required
genome file is the fasta file containing teh chromosome sequence
output file is option, if not given, output is written to stdout
typing python phiX.count.py will show the options
"""
def ParseArgs(parser):
class Parser(argparse.ArgumentParser):
def error(self, message):
sys.stderr.write('error: %s\n' % message)
self.print_help()
sys.exit(2)
parser = Parser(description='Calculate coverage from bam file.')
parser.add_argument('-b', '--bam_file',
type=str,
required=True,
help='Bam file of reads (required).')
parser.add_argument('-g', '--genome_file',
required=False,
type=str,
default='/users/lalpert/scratch/Illumina/Project_Robert_Reenan_lane5_130523/First_Pass_Masked_Genome/chromFaMasked/chr2L.fa',
help='Fasta file containing chromosome genome.')
parser.add_argument('-o', '--output_file',
type=str,
help='output file for counts.')
return parser.parse_args()
parser = argparse.ArgumentParser()
args = ParseArgs(parser)
return args.bam_file, args.genome_file, args.output_file
def Main():
"""
main routine - get the input and output files and print the output as csv
"""
bam_file, genome_file, output_file = GetArgs()
m = re.search('(chr.+?)\.fa', bam_file)
chrom = m.group(1)
coverage = ReadBamFile(chrom, bam_file, genome_file)
if output_file is not None:
f = open(output_file, 'w')
else:
f = sys.stdout
# write the output in csv fomat
header = ','.join(['pos', 'coverage'])
f.write(header + '\n')
# calculate the frequency of each nucleotide in each context and print
for i in xrange(coverage.shape[0]):
out_str = ','.join([str(i+1), str(coverage[i])])
f.write(out_str + '\n')
f.close()
if __name__ == '__main__':
Main()
After running this program on filtered reads from fly chromosomes chr2L, 2R, 3L, 3R, 4, and X, we have a set of 6 csv files for each of our sequencing runs. The next task is to plot histograms of the coverage for each chromosome for each run. In the past, I probably would have fed the csv files into R or MATLAB to plot histograms. Here, instead, I'll use Matplotlib to make the plots.
Matplotlib is a is a python 2D plotting library. One of the advantages of the python scientific stack (numpy , SciPy, Matplotlib, and Pandas, is that you can do data munging, analysis, and plotting all within the the same language. I don't think the python collection is a replacement for R or MATLAB. R and MATLAB are mature and contain many routines and features missing from numpy/SciPy/Pandas. For high quality graphics, not much beats R's ggplot2. However, Matplotlib is good enough in most cases and python is much better at data munging and easier to learn than R. (Don't get me started on the horror of R's error messages.)
Since the coverage files are in the csv format, we can read them with numpy's loadtxt() routine. The chromosome files that the reads were originally mapped to were masked, i.e. repeats were replaced with N's, so there are large regions of each chromosome that have no coverage. We will eliminate them.
def ReadCoverageFile(csv_file):
"""
ReadCoverageFile - read a coverage csv file into a numpy integer array
return a numpy array of counts > 0
"""
cv = np.loadtxt(csv_file, delimiter=',', skiprows=1, dtype=int)
counts = cv[cv[:,1]>0, 1] # remove positions with 0 coverage
return counts
Once we have the counts, we can use Matplotlib's hist routine to plot a histogram. We format the histograms for chromosomes 2L, 2R, 3L, 3R, 4, and X from each run on a single page.
def Main():
coverage_path, output_file = GetArgs()
# get a list of the csv in the directory
csv_files = GetCSVFiles(coverage_path)
if not re.search('\/$', coverage_path):
coverage_path = coverage_path + '/'
# set up the plot and plot each chromosome's coverage histogram
plt.figure(1)
n = 1
counts = []
for f in sorted(csv_files):
c = ReadCoverageFile(coverage_path + f)
plt.subplot(3, 2, n)
plt.hist(c, 50)
m = re.search('^(chr.+?)_', f)
plt.title(m.group(1))
plt.ylabel('Counts')
n += 1
# save the plot to a pdf file
plt.tight_layout()
plt.savefig(output_file)
subplot() divides the page into a 3 x 2 grid, we plot each chromosome in one of the grid positions. The regular expression pulls the chromosome name from the file name. We use it as a title for each subplot. tight_layout() makes the page layout a little cleaner. The pgram is passed the name of directory containing the csv files and the name of the output file.
Here's the full deal: download
#!/gpfs/runtime/opt/python/2.7.3/bin/python
import sys
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import argparse
def GetCSVFiles(path):
"""
GetCSVFiles - return a list of files ending in .csv in path
"""
files = [f for f in os.listdir(path) if re.search('\.csv$', f)]
return files
def GetArgs():
"""
GetArgs - read the command line
returns - a path to teh coverage files and an output file name
coverage path and output file are required
typing python plot.coverage.py will show the options
"""
def ParseArgs(parser):
class Parser(argparse.ArgumentParser):
def error(self, message):
sys.stderr.write('error: %s\n' % message)
self.print_help()
sys.exit(2)
parser = Parser(description='Calculate coverage from bam file.')
parser.add_argument('-c', '--coverage_path',
type=str,
required=True,
help='Path to coverage files (required).')
parser.add_argument('-o', '--output_file',
type=str,
required=True,
help='output file (required).')
return parser.parse_args()
parser = argparse.ArgumentParser()
args = ParseArgs(parser)
return args.coverage_path, args.output_file
def ReadCoverageFile(csv_file):
"""
ReadCoverageFile - read a coverage csv file into a numpy inteher array
return a numpy array of counts > 0
"""
cv = np.loadtxt(csv_file, delimiter=',', skiprows=1, dtype=int)
counts = cv[cv[:,1]>0, 1]
return counts
def Main():
coverage_path, output_file = GetArgs()
# get a list of the csv in the directory
csv_files = GetCSVFiles(coverage_path)
if not re.search('\/$', coverage_path):
coverage_path = coverage_path + '/'
# set up the plot and plot each chromosome's coverage histogram
plt.figure(1)
n = 1
counts = []
for f in sorted(csv_files):
c = ReadCoverageFile(coverage_path + f)
plt.subplot(3, 2, n)
plt.hist(c, 50)
m = re.search('^(chr.+?)_', f)
plt.title(m.group(1))
plt.ylabel('Counts')
n += 1
# save the plot to a pdf file
plt.tight_layout()
plt.savefig(output_file)
if __name__ == '__main__':
Main()
No comments:
Post a Comment