For a recent project I needed sequence regions upstream (preceding then 5' end of the gene) of a set of orthologous genes. The orthologs for a gene of interest are obtained from https://www.ncbi.nlm.nih.gov/gene. For example, searching for JAK2 orthologs at that site yields a table of JAK2 genes for a large number of species. After selecting species, the ortholog table can be downloaded.
Fetching Genomes
Since I wanted to analyze a number of different genes, I decided to automate the process of getting the upstream regions. The first step was to fetch the GenBank records for the genomes of the selected species. The GenBank IDs for each species are included in the downloaded ortholog table.
Fetching genomes is straightforward, if a bit slow. It uses Pandas to read the ortholog Table from NCBI and BioPython.Entrez to download the complete GenBank record for the genome.
def main(): args = GetArgs() genome_path = args.genome_path ortholog_table_file = args.ortholog_table if genome_path[-1] != '/': genome_path += '/' # read the table and get a list of genomes ortholog_table = pd.read_csv(ortholog_table_file, delimiter = '\t') genome_list = ortholog_table['genomic_nucleotide_accession.version'].to_list()
# NCBI wants to know who is downloading Entrez.email = 'analyticgarden@gmail.com' Entrez.tool = 'BioPython'
# get the file from NCBI and write it in GenBank format for genome_id in genome_list: handle = Entrez.efetch(db = 'nuccore', id = genome_id, rettype = 'gbwithparts', retmode = 'text') record = SeqIO.read(handle, format = 'genbank') filename = genome_path + genome_id + '.gb' print('Saving', filename) SeqIO.write(record, filename, 'genbank')
Indexing the Genome
class GeneFeatures(object): def __init__(self, genome_file): self._genes = {} self._features = [] self._index_genes(genome_file) @property def genes(self): return self._genes @property def features(self): return self._features @property def organism(self): return self._organism @property def genome_sequence(self): return self._genome_sequence def _index_genes(self, genome_file): """ _index_genes - create an index of features for genes in this GenBank file The index consists of a dict, self._gene, with gene name as key and index of the gene feature in the feature list, self._features, Parameters ---------- genome_file : str path to GenBank file. """ gb = SeqIO.read(genome_file, 'genbank') self._genome_sequence = gb.seq gene_count = 0 for feature in gb.features: if feature.type == 'gene': if 'pseudo' not in feature.qualifiers: # we are skipping pseudo genes self._features.append(feature) self._genes[feature.qualifiers['gene'][0].upper()] = gene_count gene_count += 1 elif feature.type == 'source': self._organism = feature.qualifiers['organism'][0].replace(' ', '_')
def main(): args = GetArgs() genome_path = args.genome_path output_path = args.output_path if genome_path[-1] != '/': genome_path += '/' if output_path is None: output_path = genome_path elif output_path[-1] != '/': output_path += '/' for p in Path(genome_path).glob('*.gb'): features = GeneFeatures(genome_path + p.name) out_file = output_path + os.path.basename(os.path.splitext(p.name)[0]) + '.pkl' with open(out_file, 'wb') as f: pickle.dump(features, f)
Upstream Sequences
def upstream_sequence(gene, gene_features, length = 1000, min_length = 100): """" upstream_sequence - grab sequence data preceding the 5' end of a gene. Parameters ---------- gene : str a gene name gene_features : GeneFeatures A GeneFeatures object for a genome. length : int, optional number of nucleotides to grab, by default 1000 min_length : int, optional reject sequences shorter than this, by default 100 Returns ------- Bio.SeqRecord A SeqRecord containing the upstream sequence or None if region is too short. Requires -------- The gene must exist in gene_features. """ feature = gene_features.features[gene_features.genes[gene]] location = feature.location if location.strand == 1: end = location.start - 1 upstream_feature = gene_features.features[gene_features.genes[gene] - 1] start = max(upstream_feature.location.end + 1, location.start - length) else: start = location.end + 1 upstream_feature = gene_features.features[gene_features.genes[gene] + 1] end = min(upstream_feature.location.start - 1, location.end + length) # overlapping genes or too short intergenic if end - start + 1 < min_length: return None record = SeqRecord(id = gene + '_' + gene_features.organism, description = "intergenic {}:{} length {}". format(start, end, end - start + 1), seq = gene_features.genome_sequence[start:(end+1)]) return record
def main(): args = GetArgs() ortholog_table_file = args.ortholog_table index_path = args.index_path gene = args.gene.upper() output_file = args.output_file seq_length = args.seq_length if index_path[-1] != '/': index_path += '/' ortholog_table = pd.read_csv(ortholog_table_file, delimiter = '\t') genome_list = list(ortholog_table['genomic_nucleotide_accession.version']) upstream_seqs = [] for genome in genome_list: index_file = index_path + genome + '.pkl' with open(index_file, 'rb') as f: gene_features = pickle.load(f) rec = upstream_sequence(gene, gene_features, seq_length) if rec is not None: upstream_seqs.append(rec) SeqIO.write(upstream_seqs, output_file, 'fasta')
No comments:
Post a Comment