Skip to contents

Reads variant calls produced by the Mutect2 somatic variant caller

Usage

read_mutect_snvs(
  path,
  sample_ids = "drop_first",
  PASS_only = TRUE,
  patient_id_pattern = "(?<=\\/)[:alnum:]*(?=\\.)",
  chrom_convention = "UCSC",
  extract_VEP_fields = FALSE,
  verbose = TRUE
)

Arguments

path

Can be either:

  1. path to a single file

  2. named vector of file paths; the element names will be used as patient IDs

  3. directory containing multiple Mutect .vcf files; patient IDs will be guessed from the file names, which should follow the convention <patient_ID>.XXX.XXX.vcf

sample_ids

Either:

  1. "drop_first"

  2. "all"

  3. ID(s) of the selected tumor samples.

Default: "drop_first"

PASS_only

Keep only variants with FILTER == PASS?

patient_id_pattern

Used only if path is a directory: the pattern passed to str_extract() to extract the patient_id from the file names

chrom_convention

Chromosome naming convention; one of "UCSC", "NCBI" or "keep"

extract_VEP_fields

If the VCF file contains VEP annotations, the following fields will be extracted: Variant_Classification, impact, gene_symbol and entrez_id (experimental, not tested)

verbose

Print progress messages while reading?

Examples

library(readthis)

file1 <- system.file(
  "extdata", "Mutect", "S1.Mutect2.filter.pass.phased.annot.vcf",
  package = "readthis"
)
file2 <- system.file(
  "extdata", "Mutect", "S2.Mutect2.filter.pass.phased.annot.vcf",
  package = "readthis"
)

read_mutect_snvs(file1)
#> Scanning file to determine attributes.
#> File attributes:
#>   meta lines: 52
#>   header_line: 53
#>   variant count: 5
#>   column count: 12
#> 
Meta line 52 read in.
#> All meta lines processed.
#> gt matrix initialized.
#> Character matrix gt created.
#>   Character matrix gt rows: 5
#>   Character matrix gt cols: 12
#>   skip: 0
#>   nrows: 5
#>   row_num: 0
#> 
Processed variant: 5
#> All variants processed
#> Extracting gt element AD
#> Extracting gt element AF
#> Extracting gt element DP
#> Extracting gt element F1R2
#> Extracting gt element F2R1
#> Extracting gt element FAD
#> Extracting gt element GQ
#> Extracting gt element GT
#> Extracting gt element PGT
#> Extracting gt element PID
#> Extracting gt element PL
#> Extracting gt element PS
#> Extracting gt element SB
#> readthis>Guessing sample_ids: S1_L1, S1_P1
#> # A tibble: 8 × 12
#>   sample_id chrom   pos ref   alt   FILTER ref_reads alt_reads    VAF AF   
#>   <chr>     <chr> <int> <chr> <chr> <chr>      <int>     <int>  <dbl> <chr>
#> 1 S1_L1     chr1      2 G     A     PASS          22         2 0.0833 0.104
#> 2 S1_P1     chr1      2 G     A     PASS          43         2 0.0444 0.049
#> 3 S1_L1     chr2      3 G     A     PASS          13         2 0.133  0.154
#> 4 S1_P1     chr2      3 G     A     PASS         117         2 0.0168 0.022
#> 5 S1_L1     chrX      4 C     T     PASS          28         1 0.0345 0.100
#> 6 S1_P1     chrX      4 C     T     PASS         149         3 0.0197 0.025
#> 7 S1_L1     chrY      5 C     T     PASS           8         0 0      0.143
#> 8 S1_P1     chrY      5 C     T     PASS          18         2 0.1    0.141
#> # ℹ 2 more variables: DP <int>, CSQ <chr>

files <- c(S1 = file1, S2 = file2)
read_mutect_snvs(files, verbose = FALSE)
#> # A tibble: 16 × 13
#>    patient_id sample_id chrom   pos ref   alt   FILTER ref_reads alt_reads
#>    <chr>      <chr>     <chr> <int> <chr> <chr> <chr>      <int>     <int>
#>  1 S1         S1_L1     chr1      2 G     A     PASS          22         2
#>  2 S1         S1_P1     chr1      2 G     A     PASS          43         2
#>  3 S1         S1_L1     chr2      3 G     A     PASS          13         2
#>  4 S1         S1_P1     chr2      3 G     A     PASS         117         2
#>  5 S1         S1_L1     chrX      4 C     T     PASS          28         1
#>  6 S1         S1_P1     chrX      4 C     T     PASS         149         3
#>  7 S1         S1_L1     chrY      5 C     T     PASS           8         0
#>  8 S1         S1_P1     chrY      5 C     T     PASS          18         2
#>  9 S2         S2_L1     chr1      2 G     A     PASS          22         2
#> 10 S2         S2_P1     chr1      2 G     A     PASS          43         2
#> 11 S2         S2_L1     chr2      3 G     A     PASS          13         2
#> 12 S2         S2_P1     chr2      3 G     A     PASS         117         2
#> 13 S2         S2_L1     chrX      4 C     T     PASS          28         1
#> 14 S2         S2_P1     chrX      4 C     T     PASS         149         3
#> 15 S2         S2_L1     chrY      5 C     T     PASS           8         0
#> 16 S2         S2_P1     chrY      5 C     T     PASS          18         2
#> # ℹ 4 more variables: VAF <dbl>, AF <chr>, DP <int>, CSQ <chr>

mutect_dir <- system.file("extdata", "Mutect", package = "readthis")
read_mutect_snvs(mutect_dir)
#> Scanning file to determine attributes.
#> File attributes:
#>   meta lines: 52
#>   header_line: 53
#>   variant count: 5
#>   column count: 12
#> 
Meta line 52 read in.
#> All meta lines processed.
#> gt matrix initialized.
#> Character matrix gt created.
#>   Character matrix gt rows: 5
#>   Character matrix gt cols: 12
#>   skip: 0
#>   nrows: 5
#>   row_num: 0
#> 
Processed variant: 5
#> All variants processed
#> Extracting gt element AD
#> Extracting gt element AF
#> Extracting gt element DP
#> Extracting gt element F1R2
#> Extracting gt element F2R1
#> Extracting gt element FAD
#> Extracting gt element GQ
#> Extracting gt element GT
#> Extracting gt element PGT
#> Extracting gt element PID
#> Extracting gt element PL
#> Extracting gt element PS
#> Extracting gt element SB
#> readthis>Guessing sample_ids: S1_L1, S1_P1
#> Scanning file to determine attributes.
#> File attributes:
#>   meta lines: 52
#>   header_line: 53
#>   variant count: 5
#>   column count: 12
#> 
Meta line 52 read in.
#> All meta lines processed.
#> gt matrix initialized.
#> Character matrix gt created.
#>   Character matrix gt rows: 5
#>   Character matrix gt cols: 12
#>   skip: 0
#>   nrows: 5
#>   row_num: 0
#> 
Processed variant: 5
#> All variants processed
#> Extracting gt element AD
#> Extracting gt element AF
#> Extracting gt element DP
#> Extracting gt element F1R2
#> Extracting gt element F2R1
#> Extracting gt element FAD
#> Extracting gt element GQ
#> Extracting gt element GT
#> Extracting gt element PGT
#> Extracting gt element PID
#> Extracting gt element PL
#> Extracting gt element PS
#> Extracting gt element SB
#> readthis>Guessing sample_ids: S2_L1, S2_P1
#> # A tibble: 16 × 13
#>    patient_id sample_id chrom   pos ref   alt   FILTER ref_reads alt_reads
#>    <chr>      <chr>     <chr> <int> <chr> <chr> <chr>      <int>     <int>
#>  1 S1         S1_L1     chr1      2 G     A     PASS          22         2
#>  2 S1         S1_P1     chr1      2 G     A     PASS          43         2
#>  3 S1         S1_L1     chr2      3 G     A     PASS          13         2
#>  4 S1         S1_P1     chr2      3 G     A     PASS         117         2
#>  5 S1         S1_L1     chrX      4 C     T     PASS          28         1
#>  6 S1         S1_P1     chrX      4 C     T     PASS         149         3
#>  7 S1         S1_L1     chrY      5 C     T     PASS           8         0
#>  8 S1         S1_P1     chrY      5 C     T     PASS          18         2
#>  9 S2         S2_L1     chr1      2 G     A     PASS          22         2
#> 10 S2         S2_P1     chr1      2 G     A     PASS          43         2
#> 11 S2         S2_L1     chr2      3 G     A     PASS          13         2
#> 12 S2         S2_P1     chr2      3 G     A     PASS         117         2
#> 13 S2         S2_L1     chrX      4 C     T     PASS          28         1
#> 14 S2         S2_P1     chrX      4 C     T     PASS         149         3
#> 15 S2         S2_L1     chrY      5 C     T     PASS           8         0
#> 16 S2         S2_P1     chrY      5 C     T     PASS          18         2
#> # ℹ 4 more variables: VAF <dbl>, AF <chr>, DP <int>, CSQ <chr>