On November 22, 2022, I downloaded a metadata file containing 13,932,236 records from GISAID in order to view the growth of several emerging SARS-CoV-2 variants of interest.
First, the results. I plotted the growth of the variants BA.2.75, BQ.1, BQ.1.1, BQ.1.18, CL.1, and XBB in the USA from June 1 to the present.
BQ.1. and BQ.1.1 are growing in the data. It's unclear yet whether they will have a large impact. We'll see after the Thanksgiving holidays.
Memory Issues
The metadata file is large, 9.8 GB on the disk. Loading into R on Windows takes a little over two minutes and used about 8 GB for the dataframe. For the purposes of demonstration, I'm using Windows 10 22H2, RStudio 2022.07.2, and R 4.2.2.
> system.time(meta_data <- read_tsv('data/metadata.tsv', name_repair = 'universal')) user system elapsed 197.84 9.44 146.04 > object.size(meta_data) 8087686552 bytes >
PS C:\Users\bbth> Get-WmiObject WIN32_PROCESS | Sort-Object -Property ws -Descending | Select-Object -first 10 ProcessID,Name,WS ProcessID Name WS --------- ---- -- 11956 rsession-utf8.exe 9634185216
> library(data.table) data.table 1.14.6 using 8 threads (see ?getDTthreads). Latest news: r-datatable.com > system.time(meta_data <- fread(file = 'data/metadata.tsv', header = TRUE, sep = '\t', check.names = TRUE)) |--------------------------------------------------| |==================================================| |--------------------------------------------------| |==================================================| user system elapsed 111.33 4.64 94.03 >
> selected_variants <- c('BQ.1.18', 'BQ.1.1', 'BQ.1', 'XBB', 'BA.2.75', 'CL.1')
> system.time(df_emerging_variants_usa <- plot_selected_variants(meta_data, selected_variants = selected_variants, title = 'Emerging Variants 2022-11-22 USA', start_date = as.Date('2022-05-01'))) user system elapsed 320.61 12.34 333.17
PS C:\Users\bbth> Get-WmiObject WIN32_PROCESS | Sort-Object -Property ws -Descending | Select-Object -first 10 ProcessID,Name,WS ProcessID Name WS --------- ---- -- 10832 rsession-utf8.exe 20400672768 320.61 12.34 333.17
#' plot_selected_variants #' Create dataframe and plot of selected SARS-CoV-2 variants #' #' @param metadata - dataframe or tibble, GISAID metatable. #' @param selected_variants - a vector of Pango lineage strings. #' @param host - character, String indicating host species. #' @param country - character, country of interest. If NULL, all countries are included. #' @param title - character, optional plot title #' #' @return - a dataframe #' containing the columns #' Collection.date, Pango.lineage, Count #' plot_selected_variants <- function(metadata, selected_variants, host = 'Human', country = 'USA', title = NULL, start_date = as.Date("2020-01-01")) { require(tidyverse) # filter by host, lineage, and location df_BA <- metadata %>% filter(Host == {{ host }}) %>% select(Collection.date, Location, Pango.lineage) %>% separate(Location, c("Region", "Country", "State"), sep = "\\s*/\\s*", extra = "drop", fill = "right") %>% select(-c("Region", "State")) %>% mutate(Collection.date = as.Date(Collection.date, format = '%Y-%m-%d')) %>% filter(Collection.date >= start_date) if(! is.null(country)) { df_BA <- df_BA %>% filter(Country == {{ country }}) } # select lineages and count them df_BA <- df_BA %>% filter(Pango.lineage %in% {{ selected_variants }}) %>% group_by(Collection.date, Pango.lineage) %>% count() %>% rename(Count = n) %>% drop_na() p <- ggplot(df_BA) + geom_bar(aes(x = Collection.date, y = Count, color = Pango.lineage, fill = Pango.lineage), stat = 'identity') if(! is.null(title)) p <- p + ggtitle(title) print(p) return(df_BA) }
Chunking
#' get_selected_variants #' Create dataframe and plot of selected SARS-CoV-2 variants #' #' @param metafile - a meta data file from GISAID #' @param selected_variants - a vector of Pango lineage strings. #' @param host - character, String indicating host species. #' @param country - character, country of interest. If NULL, all countries are included. #' @param title - character, optional plot title #' #' @return - a dataframe #' containing the columns #' Collection.date, Pango.lineage, Count #' get_selected_variants <- function(metafile, selected_variants, host = 'Human', country = 'USA', title = NULL, start_date = as.Date("2020-01-01")) { require(chunked) require(tidyverse) # filter by host, lineage, date, and location df_BA <- read_table_chunkwise(metafile, sep = '\t') %>% filter(Host == {{ host }}) %>% select(Collection.date, Location, Pango.lineage) %>% mutate(Collection.date = as.Date(Collection.date, format = '%Y-%m-%d')) %>% filter(Collection.date >= {{ start_date }}) %>% filter(Pango.lineage %in% {{ selected_variants }}) %>% as.data.frame # extract country from location df_BA <- df_BA %>% separate(Location, c("Region", "Country", "State"), sep = "\\s*/\\s*", extra = "drop", fill = "right") # filter by country if(! is.null(country)) { df_BA <- df_BA %>% filter(Country == {{ country }}) } # accumulate counts df_BA <- df_BA %>% select(Collection.date, Pango.lineage) %>% group_by(Collection.date, Pango.lineage) %>% count() %>% rename(Count = n) %>% drop_na() %>% arrange(Collection.date) p <- ggplot(df_BA) + geom_bar(aes(x = Collection.date, y = Count, color = Pango.lineage, fill = Pango.lineage), stat = 'identity') if(! is.null(title)) p <- p + ggtitle(title) print(p) return(df_BA) }
>system.time(df_emerging_variants_usa2 <- get_selected_variants('data/metadata.tsv', selected_variants = selected_variants, start_date = as.Date('2022-05-01', format = '%Y-%m-%d'))) user system elapsed 167.50 10.28 179.73
PPS C:\Users\bbth> Get-WmiObject WIN32_PROCESS | Sort-Object -Property ws -Descending | Select-Object -first 10 ProcessID,Name,WS ProcessID Name WS --------- ---- -- 22832 rsession-utf8.exe 216952832
No comments:
Post a Comment