Get Combined SV. — get_combined_sv

Retrieve Combined Manta and GRIDSS-derived SVs.

Usage

get_combined_sv(
  these_samples_metadata,
  projection = "grch37",
  oncogenes,
  region,
  min_vaf = 0,
  these_sample_ids
)

Arguments

these_samples_metadata

A GAMBL metadata data frame containing only the samples you want returned.

projection

The projection genome build. Default is "grch37".

oncogenes

A character vector of genes commonly involved in translocations.

region

Optional. A region formatted like chrX:1234-5678 (the chromosome may be given with or without the "chr" prefix), used instead of specifying chromosome, start and end separately.

min_vaf

The minimum tumour VAF for an SV to be returned. Recommended: 0 (default: 0).

these_sample_ids

DEPRECATED

For the oncogenes argument, you may want to include one or more genes, e.g. c("CCND1", "BCL2", "MYC"). NOTE: If you are looking for SVs affecting oncogenes, you will more likely want to pass the full output to GAMBLR.utils::annotate_sv.

Value

A data frame in a bedpe-like format with additional columns that allow filtering of high-confidence SVs.

Details

The bedpe files this function loads were pre-filtered for a minimum VAF of 0.05, and SVs affecting common translocation regions (BCL2, BCL6, MYC, CCND1) were whitelisted (i.e. no VAF filter was applied to them). If you wish to post-filter the SVs further, we therefore recommend doing so yourself as needed. Further, the input bedpe file is annotated with oncogenes and superenhancers from naive and germinal centre B-cells. You can subset to events affecting certain loci using the "oncogenes" argument. See also the similar function get_manta_sv.

Examples

# lazily get every SV in the table with default quality filters
all_sv <- get_combined_sv()
dplyr::select(all_sv,1:14)
#> genomic_data Object
#> Genome Build: grch37 
#> Showing first 10 rows:
#>    CHROM_A START_A END_A CHROM_B   START_B     END_B
#> 1        1    9894 10014      16     69889     69998
#> 2        1   10173 10984      12     94748     95533
#> 3        1   10176 10711      11 134946192 134946783
#> 4        1   10269 10794      16     59726     60283
#> 5        1   10285 10286       8 146301390 146301391
#> 6        1   10287 10688      15 102521012 102521550
#> 7        1   10308 10837      12     95037     95505
#> 8        1   10347 10631      15 102520227 102520677
#> 9        1   10437 10438       8 146301390 146301391
#> 10       1   10437 10438       8 146301390 146301391
#>                       manta_name SCORE STRAND_A STRAND_B tumour_sample_id
#> 1  MantaBND:10:1901:1907:0:0:0:1    NA        -        +  00-14595_tumorD
#> 2   MantaBND:1:5198:6695:0:0:0:1    NA        +        +         CLC03470
#> 3  MantaBND:23:3075:4071:0:0:0:1    NA        +        +        14-14094T
#> 4   MantaBND:3:3538:3557:0:0:0:1    NA        +        -        07-13339T
#> 5   MantaBND:5:1923:1927:0:0:0:1    NA        +        +        09-41114T
#> 6     MantaBND:13192:0:1:0:0:0:0    NA        +        +      835-02-03TD
#> 7   MantaBND:1:6049:6050:1:0:0:0    NA        +        +     4687-03-01BD
#> 8  MantaBND:11:3940:4135:0:0:0:1    NA        -        -        12-34927T
#> 9   MantaBND:2:7221:7224:0:1:0:1    NA        +        +      102-01-01TD
#> 10  MantaBND:2:1723:1728:0:0:0:0    NA        +        +    102-0202-1DVT
#>    normal_sample_id VAF_tumour  DP
#> 1   00-14595_normal      0.400  10
#> 2         14-11247N      0.136  44
#> 3         14-11247N      0.186  59
#> 4         14-11247N      0.175  40
#> 5         14-11247N      0.118 110
#> 6    14-11247Normal      0.312  32
#> 7    14-11247Normal      0.250  52
#> 8         14-11247N      0.135 104
#> 9    14-11247Normal      0.520  25
#> 10   14-11247Normal      0.630  27

# get all SVs for just one cohort
cohort_meta = suppressMessages(get_gambl_metadata()) %>% 
              dplyr::filter(cohort == "DLBCL_cell_lines")

some_sv <- get_combined_sv(these_samples_metadata = cohort_meta)
dplyr::select(some_sv,1:14)
#> genomic_data Object
#> Genome Build: grch37 
#> Showing first 10 rows:
#>    CHROM_A  START_A    END_A CHROM_B   START_B     END_B
#> 1        1  1346152  1346155       1 111802594 111802597
#> 2        1  1739478  1739484       1   1770671   1770677
#> 3        1  1826839  1826841       1   1833712   1833714
#> 4        1  1827338  1827339       1   8741628   8741629
#> 5        1  2317295  2317296       1  16648172  16648173
#> 6        1  4472986  4472989       1   6530057   6530060
#> 7        1  6290197  6290198       1  16693372  16693373
#> 8        1  6438168  6438173       1   6445895   6445900
#> 9        1  6438168  6438173       1   6445895   6445900
#> 10       1 10005435 10005437       4  28617083  28617085
#>                             manta_name SCORE STRAND_A STRAND_B tumour_sample_id
#> 1           MantaBND:14352:0:1:0:0:0:1    NA        +        +            HBL-1
#> 2  MantaDUP:TANDEM:14103:0:1:0:0:0_bp1    NA        -        +         OCI-Ly10
#> 3           MantaBND:14359:0:1:0:0:0:1    NA        +        +         SU-DHL-4
#> 4           MantaBND:14359:0:2:0:0:0:0    NA        -        -         SU-DHL-4
#> 5           MantaBND:12526:0:1:0:0:0:1    NA        +        +         SU-DHL-6
#> 6         MantaDEL:15336:0:1:0:1:0_bp1    NA        +        -         SU-DHL-5
#> 7           MantaBND:12875:0:1:0:0:0:0    NA        -        -         SU-DHL-6
#> 8         MantaDEL:16045:0:1:0:0:0_bp1    NA        +        -               HT
#> 9         MantaDEL:15520:0:1:0:0:0_bp1    NA        +        -         SU-DHL-5
#> 10          MantaBND:15925:0:1:0:0:0:0    NA        -        -          WSU-NHL
#>    normal_sample_id VAF_tumour  DP
#> 1         14-11247N      0.189  95
#> 2         14-11247N      0.193 119
#> 3         14-11247N      0.213  94
#> 4         14-11247N      0.300  80
#> 5         14-11247N      0.323  62
#> 6         14-11247N      0.345 113
#> 7         14-11247N      0.234  77
#> 8         14-11247N      0.247  81
#> 9         14-11247N      1.000  30
#> 10        14-11247N      0.176  91
nrow(some_sv)
#> [1] 3519

# get the SVs in a region around MYC
# WARNING: This is not the best way to find MYC SVs.
# Use annotate_sv on the full SV set instead.
myc_region_hg38 = "chr8:127710883-127761821"
myc_region_grch37 = "8:128723128-128774067"

hg38_myc_locus_sv <- get_combined_sv(region = myc_region_hg38,
                                projection = "hg38")
dplyr::select(hg38_myc_locus_sv,1:14)
#> genomic_data Object
#> Genome Build: hg38 
#> Showing first 10 rows:
#>    CHROM_A   START_A     END_A CHROM_B   START_B     END_B
#> 1     chr1   6803016   6803030    chr8 127731317 127731331
#> 2     chr1   8253139   8253144    chr8 127756470 127756475
#> 3     chr1  50392925  50392927    chr8 127747754 127747756
#> 4     chr1 100041646 100041649    chr8 127753200 127753203
#> 5     chr1 149968016 149968030    chr8 127720533 127720547
#> 6     chr1 180261380 180261390    chr8 127747224 127747234
#> 7     chr1 182119529 182119530    chr8 127747267 127747268
#> 8     chr1 202928006 202928020    chr8 127747224 127747238
#> 9     chr1 207726976 207727002    chr8 127724887 127724913
#> 10    chr1 209800781 209800785    chr8 127753804 127753808
#>                          manta_name  SCORE STRAND_A STRAND_B
#> 1                              <NA> 208.20        -        +
#> 2                              <NA> 308.00        -        +
#> 3                              <NA> 253.09        -        +
#> 4                              <NA> 211.34        -        +
#> 5                              <NA> 335.95        -        +
#> 6  MantaBND:2:133568:133570:0:2:0:0     NA        -        +
#> 7                              <NA> 226.75        +        -
#> 8                              <NA> 419.03        +        -
#> 9                              <NA> 204.27        -        +
#> 10 MantaBND:0:549563:738680:0:1:0:0     NA        +        -
#>             tumour_sample_id          normal_sample_id VAF_tumour  DP
#> 1            01-12047_tumorA           01-12047_normal 0.10396040 202
#> 2  BLGSP-71-30-00665-01A-01E BLGSP-71-30-00665-10A-01D 0.08870968 124
#> 3  BLGSP-71-30-00678-01A-01E BLGSP-71-06-00286-99A-01D 0.05128205 195
#> 4  BLGSP-71-30-00656-01A-01E BLGSP-71-06-00286-99A-01D 0.05241935 248
#> 5            13-38657_tumorB           13-38657_normal 0.10800000 250
#> 6            11-12873_tumorC           11-12873_normal 0.05900000 102
#> 7  BLGSP-71-30-00655-01A-01E BLGSP-71-06-00286-99A-01D 0.05555556 198
#> 8            14-11777_tumorB                 14-11777N 0.03921569 459
#> 9  BLGSP-71-30-00661-01A-01E BLGSP-71-06-00286-99A-01D 0.07253886 193
#> 10 BLGSP-71-30-00647-01A-01E BLGSP-71-06-00286-99A-01D 0.13200000  38
nrow(hg38_myc_locus_sv)
#> [1] 683

incorrect_myc_locus_sv <- get_combined_sv(region = myc_region_grch37,
                                projection = "hg38")
dplyr::select(incorrect_myc_locus_sv,1:14)
#> genomic_data Object
#> Genome Build: hg38 
#> Showing first 10 rows:
#>    CHROM_A   START_A     END_A CHROM_B   START_B     END_B
#> 1     chr1 182655283 182655290    chr8 128767396 128767403
#> 2     chr1 233914485 233914492    chr8 128773743 128773750
#> 3    chr11  73459284  73459290    chr8 128758908 128758914
#> 4    chr11  93629111  93629307    chr8 128726629 128726825
#> 5    chr11  93629111  93629647    chr8 128726343 128727380
#> 6    chr11  93629111  93629641    chr8 128726295 128727325
#> 7    chr11  93629111  93629613    chr8 128726323 128727499
#> 8    chr11  93629111  93629581    chr8 128726380 128727086
#> 9    chr11  93629111  93629568    chr8 128726370 128727275
#> 10   chr11  93629111  93629568    chr8 128726368 128727133
#>                          manta_name  SCORE STRAND_A STRAND_B  tumour_sample_id
#> 1       MantaBND:203817:0:1:0:0:0:0     NA        -        -   12-23835_tumorA
#> 2       MantaBND:306088:0:1:0:0:0:0     NA        +        -   05-16093_tumorA
#> 3                              <NA> 259.43        -        +   01-23117_tumorB
#> 4       MantaBND:202456:0:1:0:0:0:1     NA        +        - 00-12637_CLC02086
#> 5        MantaBND:28037:1:9:0:0:0:1     NA        +        -         01-20774T
#> 6  MantaBND:0:278162:278290:0:0:0:1     NA        +        -          CLC03336
#> 7       MantaBND:170141:0:1:0:0:0:1     NA        +        -          CLC03338
#> 8       MantaBND:204178:0:1:0:0:0:1     NA        +        -          CLC03454
#> 9       MantaBND:157518:0:3:0:0:0:0     NA        +        -          CLC03455
#> 10      MantaBND:191300:0:1:0:0:0:1     NA        +        -          CLC03456
#>    normal_sample_id VAF_tumour  DP
#> 1   12-23835_normal  0.0530000 114
#> 2   05-16093_normal  0.0780000  77
#> 3   01-23117_normal  0.1383929 224
#> 4           FL1011N  0.0980000  61
#> 5         14-11247N  0.2800000  25
#> 6         14-11247N  0.2570000 179
#> 7         14-11247N  0.1150000 139
#> 8         14-11247N  0.2710000 140
#> 9         14-11247N  0.2680000  97
#> 10        14-11247N  0.1850000 151
nrow(incorrect_myc_locus_sv)
#> [1] 51

# Despite potentially being incomplete, we can nonetheless
# annotate these directly for more details
annotated_myc_hg38 = suppressMessages(
         GAMBLR.utils::annotate_sv(hg38_myc_locus_sv, genome_build = "hg38")
)
head(annotated_myc_hg38)
#>   chrom1    start1      end1 chrom2    start2      end2 name   score strand1
#> 1     18  63123493  63123497      8 127728389 127728393    . 1742.35       +
#> 2     18  63195263  63195266      8 127744561 127744564    .      NA       -
#> 3      2  28983232  28983241      8 127711263 127711272    .      NA       -
#> 4      3  70834666  70834671      8 127750318 127750323    .  249.44       -
#> 5      3 101756651 101756656      8 127724888 127724893    .  261.80       +
#> 6      4   1746418   1746422      8 127723482 127723486    .      NA       -
#>   strand2          tumour_sample_id   gene partner    fusion
#> 1       -                 SU-DHL-10   BCL2    <NA>   NA-BCL2
#> 2       -                  SP194216   BCL2    <NA>   NA-BCL2
#> 3       -           02-14764_tumorB    ALK    <NA>    NA-ALK
#> 4       + BLGSP-71-30-00661-01A-01E  FOXP1    <NA>  NA-FOXP1
#> 5       - BLGSP-71-30-00676-01A-01E NFKBIZ    <NA> NA-NFKBIZ
#> 6       -                 09-41114T  WHSC1    <NA>  NA-WHSC1
table(annotated_myc_hg38$partner)
#> 
#>  BCL6 CCNL1   DMD   IGH   IGK   IGL  LRMP  PAX5 RFTN1  ZEB2 
#>     6     1     2   377     5     7     3    11     1     1 
# The usual MYC partners are seen here

annotated_myc_incorrect = suppressMessages(
         GAMBLR.utils::annotate_sv(incorrect_myc_locus_sv, genome_build = "hg38")
)
head(annotated_myc_incorrect)
#>   chrom1    start1      end1 chrom2    start2      end2 name score strand1
#> 1      8 127313080 127313571      8 128746007 128746587    .    NA       +
#> 2      8 128738977 128738983      8 128752583 128752589    .    NA       +
#> 3      8 128738977 128738983      8 128752583 128752589    .    NA       +
#> 4      8 128738979 128738981      8 128752583 128752585    .    NA       +
#> 5      8 128738979 128738981      8 128752583 128752585    .    NA       +
#> 6      8 128738979 128738981      8 128752583 128752585    .    NA       +
#>   strand2 tumour_sample_id gene partner fusion
#> 1       -         PD26401c  MYC    <NA> NA-MYC
#> 2       -  04-14093_tumorA  MYC    <NA> NA-MYC
#> 3       -  04-14093_tumorB  MYC    <NA> NA-MYC
#> 4       -        05-24065T  MYC    <NA> NA-MYC
#> 5       -        10-27119T  MYC    <NA> NA-MYC
#> 6       -         SU-DHL-6  MYC    <NA> NA-MYC
table(annotated_myc_incorrect$partner)
#> < table of extent 0 >
# The effect of specifying coordinates from the wrong genome build is evident