Import and process antimicrobial genotype data from common sources

This function imports AMR genotyping datasets in formats generated by common bioinformatics tools (AMRFinderPlus, ABRicate, Kleborate, CARD RGI), as well as processed AMRFinderPlus results downloadable from EBI. Drug/class annotations given for each genotype marker in the input file are parsed to standard antibiotic names and/or antibiotic groups recognised by the AMR pkg, to facilitate extracting relevant genotype markers for comparison to phenotype data for a specific antibiotic (e.g. using AMRgen functions get_binary_matrix(), amr_ppv(), amr_upset() and amr_logistic()).

Usage

import_geno(input, format = "amrfp", ...)

Arguments

input

A data frame, or a string giving the path to an input file, containing the genotype data in a supported format. These files may be downloaded from public sources such as EBI or NCBI, or the files may be generated using common bioinformatics software for AMR genotyping.

format

A string indicating the format of the data: "amrfp" (default), "ebi_web", "ebi_ftp", "kleborate", "abricate", "rgi". This determines which importer function the data is passed on to for processing (see below).

...

Format-specific arguments, passed on to the importer function selected by format. See the documentation of the corresponding importer:

"amrfp" : import_amrfp() AMRFinderPlus output
"ebi_ftp" : import_amrfp_ebi_ftp() EBI-processed AMRFinderPlus results (from FTP or download_ebi())
"ebi_web" : import_amrfp_ebi_web() EBI-processed AMRFinderPlus results (from EBI AMR portal)
"kleborate" : import_kleborate() Kleborate output
"abricate" : import_abricate() ABRicate output
"rgi" : import_rgi() CARD RGI output

Value

A data frame with the processed genotype data, with harmonised gene names, mapped drug agents, and drug classes, which can be used with other functions of the AMRgen package:

id: The sample identifier (character).
marker: The name of the genotype marker as it appears in the input (e.g. gyrA_S83F) (character).
gene: The gene identifier (character).
mutation: The mutation detected within the gene, converted to HGVS nomenclature syntax (e.g. Ser83Phe) (character).
node: (for AMRFinderPlus input only) The node in the NCBI reference gene hierarchy corresponding to the gene (character).
drug_class: Name of the antibiotic group associated with the genotype marker, compatible with AMR pkg (character).
drug: Name of the specific antibiotic associated with the genotype marker, compatible with AMR pkg (ab). Value NA is assigned when the markers are annotated with a class only and not a specific antibiotic.
variation type: (for AMRFinderPlus, ABRicate, or Kleborate results) Type of variation, e.g. Gene presence detected, Protein variant detected, Nucleotide variant detected, Inactivating mutation detected, Promoter variant detected.
...: Other fields specific to the input file.

Examples

# Import AMRFinderPlus data file
data(ecoli_geno_raw)
head(ecoli_geno_raw)
#> # A tibble: 6 × 28
#>   Name       `Protein identifier` `Contig id`  Start   Stop Strand `Gene symbol`
#>   <chr>      <lgl>                <chr>        <dbl>  <dbl> <chr>  <chr>        
#> 1 SAMN03177… NA                   SAMN031776…  74721  75851 -      blaEC        
#> 2 SAMN03177… NA                   SAMN031776… 166214 169315 +      acrF         
#> 3 SAMN03177… NA                   SAMN031776…  20678  22033 -      glpT_E448K   
#> 4 SAMN03177… NA                   SAMN031776…    758   1969 -      floR         
#> 5 SAMN03177… NA                   SAMN031776…   4440   5666 +      mdtM         
#> 6 SAMN03177… NA                   SAMN031776…   3941   4798 +      blaTEM-1     
#> # ℹ 21 more variables: `Sequence name` <chr>, Scope <chr>,
#> #   `Element type` <chr>, `Element subtype` <chr>, Class <chr>, Subclass <chr>,
#> #   Method <chr>, `Target length` <dbl>, `Reference sequence length` <dbl>,
#> #   `% Coverage of reference sequence` <dbl>,
#> #   `% Identity to reference sequence` <dbl>, `Alignment length` <dbl>,
#> #   `Accession of closest sequence` <chr>, `Name of closest sequence` <chr>,
#> #   `HMM id` <lgl>, `HMM description` <lgl>, `Hierarchy node` <chr>, …
geno <- import_geno(ecoli_geno_raw %>% head(n = 10), format = "amrfp")
head(geno)
#> # A tibble: 6 × 37
#>   id           marker     gene  mutation drug drug_class  `variation type` node 
#>   <chr>        <chr>      <chr> <chr>    <ab> <chr>       <chr>            <chr>
#> 1 SAMN03177615 blaEC      blaEC NA       NA   Beta-lacta… Gene presence d… blaEC
#> 2 SAMN03177615 acrF       acrF  NA       NA   Efflux      Gene presence d… acrF 
#> 3 SAMN03177615 glpT_E448K glpT  Glu448L… FOS  Phosphonics Protein variant… glpT 
#> 4 SAMN03177615 floR       floR  NA       CHL  Phenicols   Gene presence d… floR 
#> 5 SAMN03177615 floR       floR  NA       FLR  Phenicols   Gene presence d… floR 
#> 6 SAMN03177615 mdtM       mdtM  NA       NA   Efflux      Gene presence d… mdtM 
#> # ℹ 29 more variables: marker.label <chr>, `Protein identifier` <lgl>,
#> #   `Contig id` <chr>, Start <dbl>, Stop <dbl>, Strand <chr>,
#> #   `Gene symbol` <chr>, `Sequence name` <chr>, Scope <chr>,
#> #   `Element type` <chr>, `Element subtype` <chr>, Class <chr>, Subclass <chr>,
#> #   Method <chr>, `Target length` <dbl>, `Reference sequence length` <dbl>,
#> #   `% Coverage of reference sequence` <dbl>,
#> #   `% Identity to reference sequence` <dbl>, `Alignment length` <dbl>, …

if (FALSE) { # \dontrun{
# Import ABRicate results (that were run using the default db, resfinder)
abricate_resfinder <- import_geno("path/to/abricate_resfinder.tsv",
  format = "abricate"
)

# Import ABRicate results that were run using an alternative db (ncbi)
abricate_ncbi <- import_geno("path/to/abricate_ncbi.tsv",
  format = "abricate",
  db = "ncbi"
)

# Import Kleborate results
kleborate_geno <- import_geno(kleborate_raw %>% head(n = 10),
  format = "kleborate"
)

# Import Kleborate results run with an older version without HGVS syntax
kleborate_old <- import_geno(kleborate_raw_v313 %>% head(n = 10),
  format = "kleborate",
  hgvs = FALSE
)

# Import CARD RGI results with default parameters
rgi_geno <- import_geno(rgi_EuSCAPE_raw %>% head(n = 10),
  format = "rgi"
)

# Import CARD RGI results with additional options
rgi_geno <- import_geno(rgi_raw %>% head(n = 10),
  format = "rgi",
  sample_id_sep = "_genomic.fna.txt:", exclude_loose = FALSE
)

# Download quinolone-related genotype data for E. coli, from EBI
ebi_geno_raw <- download_ebi(
  data = "genotype", species = "Escherichia coli",
  geno_subclass = "QUINOLONE"
)
# import the downloaded data
ebi_geno <- import_geno(ebi_geno_raw,
  format = "ebi_ftp"
)

# Download data from EBI web portal manually, and import the file
ebi_geno_from_web <- import_geno("amr_records.csv",
  format = "ebi_web"
)

# Download carbapenem-related genotype data for K. pneumoniae, from NCBI
ncbi_geno_raw <- query_ncbi_bq_geno(
  taxgroup = "Klebsiella pneumoniae",
  geno_subclass = "CARBAPENEM"
)
# import the downloaded data
geno <- import_geno(ncbi_geno_raw,
  format = "amrfp",
  sample_col = "biosample_acc"
)
} # }