In a previous post, I looked at the countries posting H5N1 HA sequences to GISAID and the host species that the samples were drawn from. This time I want to look at the variation in the data to identify possible mutations.
It has been reported that a glutamine to leucine mutation at residue 226 of the HA amino acid sequence increased specificity of host recognition from avian to human.
In order to get a sense of the current mutation landscape as reported to GISAID, I aligned 3,432 sequences from the time period January 1 2024 to December 12, 2024 using MAFFT. The sequences were aligned to a reference sequence, PP755589 HA sequence from GenBank. See also this GitHub page.
time mafft --6merpair --maxambiguous 0.05 --preservecase --thread -1 --addfragments data/gisaid_epiflu_sequence_HA_2024_12_12.fasta data/HA_reference.fasta > data/gisaid_HA_2024_12_12_aln.fasta
def read_alignment_file(filename: str) -> tuple[dict[str, SeqRecord.SeqRecord], Align.MultipleSeqAlignment]: """ Read an MSA FASTA file Parameters ---------- filename : str MSA file Returns ------- tuple[dict[str, SeqRecord.SeqRecord], Align.MultipleSeqAlignment] A tuple: a dictionary with record ID as key and SeqRecords as value a BioAlign Multiple Sequence Alignment """ alignment = AlignIO.read(open(filename), "fasta") align_dict = dict() for record in alignment: align_dict[record.id] = record return (align_dict, alignment)
def ref_pos_to_alignment(align: dict[str, SeqRecord.SeqRecord], ref_id: str) -> dict[int, int]: """Map aligned positions to reference sequence Parameters ---------- align : Dictionary returned by read_alignment_file A BioPython MSA ref_id : str ID of reference seq Returns ------- dict[int, int] A dictionary key: alignment position value: position in reference sequence, or -1 id a gap position """ ref_seq = align[ref_id].seq pos_map = dict() ref_pos = 0 # 0 based for pos in range(len(ref_seq)): if ref_seq[pos] in ['A', 'C', 'G', 'T']: pos_map[pos] = ref_pos ref_pos += 1 else: pos_map[pos] = -1 return pos_map
def get_varying_columns(align: Align.MultipleSeqAlignment, consensus_cutoff: float = 1.0, start: int = 0, end: int = -1) -> dict[int, VaryingCol]: """Iterate across columns and find columns in alignment that vary more than consensus_cutoff Parameters ---------- align : Align.MultipleSeqAlignment A BioPython MSA consensus_cutoff : float, optional Cutoff for consensus. Only columns with consensus freq less that this value are accepted, by default 1.0 start : int, optional Start searching at this columns, by default 0 end : int, optional last column to search. If -1, end is last column. , by default -1 Returns ------- dict[int, VaryingCol] A dictionary key: column position value: a VaryingCol object """ if end == -1: end = align.get_alignment_length() variant_cols = dict() for col in range(start, end): c = Counter(align[:, col]) common = c.most_common(1) if common[0][0] in ['A', 'C', 'G', 'T']: denom = sum([c[k] for k in ['A', 'C', 'G', 'T', '-']]) freq = common[0][1] / denom if freq < consensus_cutoff: # variant_cols[col] = (freq, tuple(list(align[:, col]))) variant_cols[col] = VaryingCol(freq, tuple(list(align[:, col]))) return variant_cols return pos_map
@dataclass(frozen=True) class VaryingCol: """ A dataclass for holding info about an aligned column with variation """ pct: float data: tuple # tuple so that it is immutable
def count_mutations(variant_cols: dict[int, tuple[float, list[float, list[str]]]], pos_map: dict[int, int], align_length: int) -> dict[int, MutationCounts]: """Count the common nucleotides and alternates Parameters ---------- variant_cols : dict[int, tuple[float, list[float, list[str]]]] A dictionary returned by et_varying_column pos_map : dict[int, int] A dictionary returned by ref_pos_to_alignment align_length : int The number of rows in the alignment Returns ------- dict[str, dict[str, int | float | str]] A dictionary key: reference position value: a MutationCounts object """ mutations = dict() for pos, rec in variant_cols.items(): ref_pos = pos_map[pos] nucleotides = rec.data c = Counter(nucleotides) ref_nuc = c.most_common(2)[0][0] ref_freq = c.most_common(2)[0][1] / align_length alt_nuc = c.most_common(2)[1][0] alt_freq = c.most_common(2)[1][1] / align_length mutations[ref_pos] = MutationCounts(align_pos = pos, ref_pos = ref_pos, ref_nuc = ref_nuc, ref_freq = ref_freq, alt_nuc = alt_nuc, alt_freq = alt_freq) return mutations def get_codons(gb_ref: SeqRecord, mutations: dict[int, MutationCounts]) -> None: """get codons and alternate codons for mutated positions Parameters ---------- gb_ref : SeqRecord Genbank record of reference sequnec mutations : dict[int, MutationCounts] a dictionary of MutationCounts for mutatated columns returned by count_mutations key: reference seq column value: a MutationCounts object Note: mutations is altered by this function """ seq = list(gb_ref.seq) for pos in range(len(seq)): if pos in mutations.keys(): mutations[pos].codon_pos = int(pos / 3) * 3 mutations[pos].codon = "".join(seq[mutations[pos].codon_pos:mutations[pos].codon_pos+3]) mutations[pos].aa = str(Seq(mutations[pos].codon).translate()) mutations[pos].aa_name = seq3(mutations[pos].aa) pos_in_codon = pos % 3 s = list(mutations[pos].codon) s[pos_in_codon] = mutations[pos].alt_nuc mutations[pos].alt_codon = "".join(s) mutations[pos].alt_aa = str(Seq(mutations[pos].alt_codon).translate()) mutations[pos].alt_aa_name = seq3(mutations[pos].alt_aa)
@dataclass class MutationCounts: """ A data class for holding mutation data """ align_pos: int # positions are 0 indexed ref_pos: int ref_nuc: str ref_freq: float alt_nuc: str alt_freq: float codon_pos: int = -1 codon: str = "" aa: str = "" aa_name: str = "" alt_codon: str = "" alt_aa: str = "" alt_aa_name: str = "" def to_list(self) -> list: """convert self to a list Returns ------- list A list of elements from self """ return [self.ref_pos + 1, self.align_pos + 1, self.ref_freq, self.alt_freq, self.ref_nuc, self.codon_pos + 1, self.aa, self.aa_name, self.codon, self.alt_nuc, self.alt_aa, self.alt_aa_name, self.alt_codon, self.aa + str(self.codon_pos + 1) + self.alt_aa, 'syn' if self.aa == self.alt_aa else 'non_syn']
reference_position | alignment_pos | ref_freq | alt_freq | ref_nucleotide | codon_pos | ref_aa | aa_name | ref_codon | alt_nucleotide | alt_aa | alt_aa_name | alt_codon | mutation | synonomous |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | 37 | 0.8663677130044843 | 0.11689088191330343 | G | 4 | E | Glu | GAG | A | K | Lys | AAG | E4K | non_syn |
18 | 51 | 0.7739910313901345 | 0.22002989536621823 | A | 16 | L | Leu | CTA | T | L | Leu | CTT | L16L | syn |
93 | 126 | 0.7838565022421524 | 0.21614349775784752 | A | 91 | Q | Gln | CAA | G | Q | Gln | CAG | Q91Q | syn |
171 | 206 | 0.7461883408071749 | 0.22511210762331837 | A | 169 | L | Leu | CTA | C | L | Leu | CTC | L169L | syn |
174 | 210 | 0.7566517189835575 | 0.24275037369207772 | C | 172 | C | Cys | TGC | T | C | Cys | TGT | C172C | syn |
177 | 213 | 0.7491778774289986 | 0.2508221225710015 | C | 175 | D | Asp | GAC | T | D | Asp | GAT | D175D | syn |
178 | 214 | 0.8382660687593423 | 0.16053811659192826 | C | 178 | L | Leu | CTA | T | L | Leu | TTA | L178L | syn |
195 | 231 | 0.7515695067264574 | 0.2436472346786248 | A | 193 | P | Pro | CCA | T | P | Pro | CCT | P193P | syn |
210 | 246 | 0.7560538116591928 | 0.24394618834080717 | C | 208 | D | Asp | GAC | T | D | Asp | GAT | D208D | syn |
213 | 249 | 0.7431988041853512 | 0.25650224215246636 | C | 211 | C | Cys | TGC | T | C | Cys | TGT | C211C | syn |
The good news, at least so fa, is that the Gln to Leu in at position 226 in amino acid sequence isn't found with a significance in the data.
No comments:
Post a Comment