Get Manta SVs — get_manta_sv • GAMBLR.open

Retrieve Manta SVs for one or many samples

Usage

get_manta_sv(
  these_samples_metadata = NULL,
  projection = "grch37",
  region,
  min_vaf = 0.1,
  min_score = 40,
  pass_filters = TRUE,
  verbose = FALSE,
  chromosome,
  qstart,
  qend,
  pairing_status,
  these_sample_ids = NULL,
  ...
)

Arguments

these_samples_metadata: A metadata data frame to limit the result to sample_ids within it
projection: The projection genome build. Default is grch37.
region: Restrict the result to SVs anchored within a single region, specified in the format "chrom:start-end"
min_vaf: The minimum tumour VAF for a SV to be returned. Default is 0.1.
min_score: The lowest Manta somatic score for a SV to be returned. Default is 40.
pass_filters: If TRUE (default) only return SVs that are annotated with PASS in the FILTER column. Set to FALSE to keep all variants, regardless of whether they PASS the filters.
verbose: Set to FALSE to minimize the output to console. Default is FALSE. This parameter also dictates the verbosity of any helper functions called internally by the main function.
chromosome: DEPRECATED. Use region instead.
qstart: DEPRECATED. Use region instead.
qend: DEPRECATED. Use region instead.
pairing_status: DEPRECATED.
these_sample_ids: DEPRECATED. Subset your metadata and supply `these_samples_metadata` instead.
...: Any additional parameters.

Details

Retrieve Manta SVs with additional VCF information to allow for filtering of high-confidence variants. To get SV calls for multiple samples, supply a metadata table via these_samples_metadata that has been subset to only those samples. The results will be restricted to the sample_ids within that data frame. This function can also restrict the returned breakpoints to those within a genomic region specified via region (in chr:start-end format). Useful filtering parameters are also available: use min_vaf to set the minimum tumour VAF for a SV to be returned and min_score to set the lowest Manta somatic score for a SV to be returned. In addition, the user can choose to return all variants, even the ones not passing the filter criteria. To do so, set pass_filters = FALSE (defaults to TRUE).

Examples

# lazily get every SV in the table with default quality filters
all_sv <- get_manta_sv()
#> Using the bundled metadata in GAMBLR.data...
#> Using the bundled Manta SV (.bedpe) calls in GAMBLR.data...
head(all_sv)
#> genomic_data Object
#> Genome Build: grch37 
#> Showing first 10 rows:
#>   CHROM_A   START_A     END_A CHROM_B   START_B     END_B
#> 1       1 161658631 161658631       3  16509907  16509907
#> 2       1 161663959 161663959       9  37363320  37363320
#> 3       1 161663959 161663959       9  37363320  37363320
#> 4      11  65267283  65267283      14 106110907 106110907
#> 5      11  65267422  65267422      14 106110905 106110905
#> 6      13  91976545  91976545      14 106211857 106211857
#>                      manta_name SCORE STRAND_A STRAND_B tumour_sample_id
#> 1      MantaBND:21171:0:1:0:0:0   133        +        +         FL2002T1
#> 2     MantaBND:206628:0:1:0:0:0   122        +        +  09-15842_tumorA
#> 3     MantaBND:195941:0:1:0:0:0   151        +        +  09-15842_tumorB
#> 4   MantaBND:152220:0:1:0:0:0:0    88        +        -        15-38154T
#> 5   MantaBND:152220:0:1:0:0:0:0   135        -        +        15-38154T
#> 6 MantaBND:18:59794:59817:0:1:0    90        -        +        15-31924T
#>   normal_sample_id VAF_tumour  DP pair_status FILTER
#> 1          FL2002N      0.331 127     matched   PASS
#> 2  09-15842_normal      0.281 196     matched   PASS
#> 3  09-15842_normal      0.364 187     matched   PASS
#> 4        15-38154N      0.150 167     matched   PASS
#> 5        15-38154N      0.290 169     matched   PASS
#> 6        15-31924N      0.365  85     matched   PASS

# get all SVs for just one cohort
cohort_meta = suppressMessages(get_gambl_metadata()) %>% 
              dplyr::filter(cohort == "DLBCL_cell_lines")

some_sv <- get_manta_sv(these_samples_metadata = cohort_meta, verbose=FALSE)
#> Using the bundled Manta SV (.bedpe) calls in GAMBLR.data...
head(some_sv)
#> genomic_data Object
#> Genome Build: grch37 
#> Showing first 10 rows:
#>   CHROM_A   START_A     END_A CHROM_B  START_B    END_B
#> 1      14 106329462 106329462      18 60774579 60774579
#> 2      14 106329465 106329465      18 60793497 60793497
#> 3      14 106330466 106330466      18 60793914 60793914
#> 4      14 106349765 106349765      18 60793914 60793914
#> 5      14 106379091 106379091      18 60793492 60793492
#> 6      14 106380227 106380227      18 60774578 60774578
#>                  manta_name SCORE STRAND_A STRAND_B tumour_sample_id
#> 1 MantaBND:220769:1:2:0:0:0   134        +        -        SU-DHL-10
#> 2 MantaBND:194451:1:2:0:0:0   103        +        -           DOHH-2
#> 3 MantaBND:217561:1:2:0:0:0   182        +        -         SU-DHL-4
#> 4 MantaBND:217561:0:1:0:0:0   198        -        +         SU-DHL-4
#> 5 MantaBND:194451:0:1:0:0:0    91        -        +           DOHH-2
#> 6 MantaBND:220769:0:1:0:0:0   169        -        +        SU-DHL-10
#>   normal_sample_id VAF_tumour DP pair_status FILTER
#> 1        14-11247N      0.318 66   unmatched   PASS
#> 2        14-11247N      0.290 69   unmatched   PASS
#> 3        14-11247N      0.474 57   unmatched   PASS
#> 4        14-11247N      0.500 62   unmatched   PASS
#> 5        14-11247N      0.300 60   unmatched   PASS
#> 6        14-11247N      0.578 45   unmatched   PASS
nrow(some_sv)
#> [1] 13

# get the SVs in a region around MYC
# WARNING: This is not the best way to find MYC SVs.
# Use annotate_sv on the full SV set instead.
myc_region_hg38 = "chr8:127710883-127761821"
myc_region_grch37 = "8:128723128-128774067"

hg38_myc_locus_sv <- get_manta_sv(region = myc_region_hg38,
                                projection = "hg38",
                                verbose = FALSE)
#> Using the bundled metadata in GAMBLR.data...
#> Using the bundled Manta SV (.bedpe) calls in GAMBLR.data...
head(hg38_myc_locus_sv)
#> genomic_data Object
#> Genome Build: hg38 
#> Showing first 10 rows:
#>   CHROM_A   START_A     END_A CHROM_B   START_B     END_B
#> 1    chr2  88860304  88860306    chr8 127751936 127751938
#> 2    chr2  88860417  88860417    chr8 127751955 127751955
#> 3    chr2  88861500  88861500    chr8 127748752 127748752
#> 4    chr3 187811601 187811601    chr8 127745649 127745649
#> 5    chr8 127741233 127741234   chr12  25049104  25049105
#> 6    chr8 127713694 127713694   chr14 105857950 105857950
#>                     manta_name SCORE STRAND_A STRAND_B
#> 1  MantaBND:194837:0:1:0:0:0:0   102        +        +
#> 2  MantaBND:194837:0:1:0:0:0:0    73        -        -
#> 3   MantaBND:1102030:0:1:0:0:0    89        +        +
#> 4     MantaBND:48510:0:2:0:0:0   106        -        +
#> 5    MantaBND:174836:0:1:0:0:0   219        +        +
#> 6 MantaBND:1:10030:23823:0:0:0   109        -        +
#>            tumour_sample_id          normal_sample_id VAF_tumour  DP
#> 1 BLGSP-71-27-00414-01A-01E BLGSP-71-27-00414-10A-01D      0.171 280
#> 2 BLGSP-71-27-00414-01A-01E BLGSP-71-27-00414-10A-01D      0.117 230
#> 3 BLGSP-71-30-00647-01A-01E BLGSP-71-06-00286-99A-01D      0.283  46
#> 4                  FL1008T2                   FL1008N      0.171 245
#> 5                  FL1018T2                   FL1018N      0.323 288
#> 6 BLGSP-71-06-00280-01A-01D BLGSP-71-06-00280-99A-01D      0.272 235
#>   pair_status FILTER
#> 1     matched   PASS
#> 2     matched   PASS
#> 3   unmatched   PASS
#> 4     matched   PASS
#> 5     matched   PASS
#> 6     matched   PASS
nrow(hg38_myc_locus_sv)
#> [1] 260

incorrect_myc_locus_sv <- get_manta_sv(region = myc_region_grch37,
                                projection = "hg38",
                                verbose = FALSE)
#> Using the bundled metadata in GAMBLR.data...
#> Using the bundled Manta SV (.bedpe) calls in GAMBLR.data...
head(incorrect_myc_locus_sv)
#> genomic_data Object
#> Genome Build: hg38 
#> Showing first 10 rows:
#>  [1] CHROM_A          START_A          END_A            CHROM_B         
#>  [5] START_B          END_B            manta_name       SCORE           
#>  [9] STRAND_A         STRAND_B         tumour_sample_id normal_sample_id
#> [13] VAF_tumour       DP               pair_status      FILTER          
#> <0 rows> (or 0-length row.names)
nrow(incorrect_myc_locus_sv)
#> [1] 0
# The effect of specifying the wrong coordinates is evident: querying the
# hg38 projection with the grch37 MYC region returns no SVs

# Despite potentially being incomplete, we can nonetheless
# annotate these directly for more details
annotated_myc_hg38 = suppressMessages(
         annotate_sv(hg38_myc_locus_sv, genome_build = "hg38")
)
head(annotated_myc_hg38)
#>   chrom1    start1      end1 chrom2    start2      end2 name score strand1
#> 1      8 127741233 127741234     12  25049104  25049105    .   219       +
#> 2      8 127713694 127713694     14 105857950 105857950    .   109       -
#> 3      8 127716025 127716934     14 105862581 105863164    .   112       +
#> 4      8 127716523 127716523     14 105862757 105862757    .   173       -
#> 5      8 127718148 127718148     14 105860256 105860256    .   152       +
#> 6      8 127718150 127718150     14 105860564 105860564    .   163       -
#>   strand2          tumour_sample_id gene partner   fusion
#> 1       +                  FL1018T2  MYC    LRMP LRMP-MYC
#> 2       + BLGSP-71-06-00280-01A-01D  MYC     IGH  IGH-MYC
#> 3       - BLGSP-71-06-00084-01A-01D  MYC     IGH  IGH-MYC
#> 4       + BLGSP-71-06-00084-01A-01D  MYC     IGH  IGH-MYC
#> 5       - BLGSP-71-08-00036-01A-01D  MYC     IGH  IGH-MYC
#> 6       + BLGSP-71-08-00036-01A-01D  MYC     IGH  IGH-MYC
table(annotated_myc_hg38$partner)
#> 
#> BCL6  DMD  IGH  IGK  IGL LRMP PAX5 
#>    1    2  247    3    2    1    4 
# The usual MYC partners are seen here