More Counting - This Time Reads in Exons and Introns

One thing that is important to compare is the relative coverage of reads in exonic vs. intronic regions. We can count reads in introns and exons using the IntervalTree and Gene classes we used previously. We will derive a new class from the Gene class. The new class objects have properties for keeping track of read counts and methods for searching the exon tree and printing the counts.

class GeneWithCounts(Gene):
    """
    GeneWithCounts - extend Gene class to include counts of reads
                     overlapping introns and exons, and search and print routines
    """

    def __init__(self, gene_str):
        Gene.__init__(self, gene_str)
        self.read_count = 0
        self.reads_in_exons = 0
        self.reads_in_introns = 0

    def SearchExons(self, read):
        """
        SearchExons - search exon tree for read overlap with exon
        """
        self.read_count += 1
        exons = self.exons
        out2 = []
        # now search the exons for hits
        exons.intersect(tr.interval(self.name, read.positions[0], read.positions[-1]),
                        lambda y: out2.append(y))
        if out2 != []:
            self.reads_in_exons += 1
            return 1  # in an exon
        else:
            self.reads_in_introns += 1
            return 0

    def Print(self, out):
        """
        Print - print exon/intron counts
        out - file handle, opened for write
        """
        exon_frac = 0
        intron_frac = 0
        if self.read_count > 0:
            exon_frac = float(self.reads_in_exons)/self.read_count
            intron_frac = float(self.reads_in_introns)/self.read_count

        out.write(','.join([self.name, self.chrom,
                            str(self.reads_in_exons),
                            str(exon_frac),
                            str(self.reads_in_introns),
                            str(intron_frac),
                            str(self.read_count)]) + '\n')



To process reads, we read the.bam file, search the interval tree for genes that the read overlaps, and then search the exon tree for overlaps with in that gene's exons.


def ReadBAMFile(bam_file, chrom, gene_tree):
    """
    ReadBAMfile: read a bam file and count reads overlapping introns and exons
    bam_file - the bam_file name
    chrom - chromosome name
    gene_tree - an IntervalTree. Gene search tree, the intervals are Tx ranges
                We look for overlaps with genes and increment the read count for
                the gene.
    """
    exon_count = 0
    intron_count = 0
    total_reads = 0

    f = pysam.Samfile(bam_file, 'rb')
    for read in f.fetch(chrom):
        total_reads += 1
        out = []
        in_exon = 0
        gene_tree.intersect(tr.interval(chrom, read.positions[0], read.positions[-1]),
                            lambda x: out.append(x))
        for x in out:   # out is a list of overlaps
            for gene in x.chain:    # loop through each gene in the chain
                if gene.SearchExons(read) == 1:
                    in_exon = 1

        if in_exon:
            exon_count += 1
        else:
            intron_count += 1

    return exon_count, intron_count, total_reads
Finally, we traverse the tree and print the counts.

def PrintExonCountsFromGenes(out, gene_tree):
    """
    PrintExonCountsFromGenes - traverse the gene tree and print counts for all genes
    """

    def PrintCounts(node):
        for gene in node.chain:
            gene.Print(out)
            
    gene_tree.traverse(PrintCounts)

The code is available for download

No comments:

Post a Comment