Assign CN to SSM. — assign_cn_to_ssm

Annotate mutations with their copy number information.

Usage

assign_cn_to_ssm(
  these_samples_metadata,
  maf_data,
  seg_data,
  projection,
  coding_only = FALSE,
  assume_diploid = FALSE,
  include_silent = FALSE,
  ...
)

Arguments

these_samples_metadata: Metadata table with one or more rows to specify the samples to process.
maf_data: A data frame of mutations in MAF format or maf_data object (e.g. from get_coding_ssm or get_ssm_by_sample).
seg_data: A data frame of segmented copy number data or seg_data object.
projection: Specified genome projection that returned data is relative to. This is only required when it cannot be inferred from maf_data or seg_data (or they are not provided).
coding_only: Optional. Set to TRUE to restrict to only variants in coding space. Default is to work with genome-wide variants.
assume_diploid: Optional. Set to TRUE to annotate every mutation as copy neutral. Default is FALSE.
include_silent: Logical parameter indicating whether to include silent mutations in coding space. Default is FALSE. This parameter only makes sense if coding_only is set to TRUE.
...: Any additional parameters.

Value

A list containing a data frame (MAF-like format) with three extra columns: log.ratio, the log ratio from the seg file (NA when no overlap); LOH; and CN, the rounded absolute copy number estimate of the region based on log.ratio (NA when no overlap was found).

Details

This function takes a metadata table and returns all mutations for the samples in that metadata. Each mutation is annotated with the local copy number state of the mutated site. The user can specify if only coding mutations are of interest. To do so, set coding_only = TRUE. When necessary, this function relies on get_ssm_by_samples and get_cn_segments to obtain the required data.

Examples

# The long-hand way
# 1. get some metadata for a collection of samples
some_meta = get_gambl_metadata() %>%
        dplyr::filter(study=="FL_Dreval",
        grepl("SP",sample_id))
#> Using the bundled metadata in GAMBLR.data...
# 2. Get the SSMs for these samples

ssm_genomes_grch37 = get_coding_ssm(projection = "grch37",
                                  these_samples_metadata = some_meta)
#> Using the bundled SSM calls (.maf) calls in GAMBLR.data...
#> after linking with metadata, we have mutations from 182 samples
# peek at the results
ssm_genomes_grch37 %>% dplyr::select(1:8)
#> genomic_data Object
#> Genome Build: grch37 
#> Showing first 10 rows:
#>    Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position
#> 1       HNRNPU              0      .     GRCh37          1      245027102
#> 2         PCLO              0      .     GRCh37          7       82784068
#> 3        MYOM2              0      .     GRCh37          8        2063762
#> 4       CREBBP              0      .     GRCh37         16        3786740
#> 5        IGLL5              0      .     GRCh37         22       23230399
#> 6    HIST1H2BC              0      .     GRCh37          6       26123774
#> 7        SETD2              0      .     GRCh37          3       47088059
#> 8         IRF8              0      .     GRCh37         16       85942737
#> 9       ARID5B              0      .     GRCh37         10       63661999
#> 10      ARID5B              0      .     GRCh37         10       63810677
#>    End_Position Strand
#> 1     245027102      +
#> 2      82784068      +
#> 3       2063762      +
#> 4       3786740      +
#> 5      23230400      +
#> 6      26123774      +
#> 7      47088059      +
#> 8      85942737      +
#> 9      63661999      +
#> 10     63810677      +

# 3. Lazily let this function obtain the corresponding seg_data
# for the right genome_build
cn_list = assign_cn_to_ssm(some_meta,ssm_genomes_grch37)
#> Using the bundled CN segments (.seg) calls in GAMBLR.data...
#> Running in default mode of any...

cn_list$maf %>% dplyr::select(1:8,log.ratio,CN)
#> genomic_data Object
#> Genome Build: grch37 
#> Showing first 10 rows:
#>    Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position
#> 1       HNRNPU              0      .     GRCh37          1      245027102
#> 2         PCLO              0      .     GRCh37          7       82784068
#> 3        MYOM2              0      .     GRCh37          8        2063762
#> 4       CREBBP              0      .     GRCh37         16        3786740
#> 5        IGLL5              0      .     GRCh37         22       23230399
#> 6    HIST1H2BC              0      .     GRCh37          6       26123774
#> 7        SETD2              0      .     GRCh37          3       47088059
#> 8         IRF8              0      .     GRCh37         16       85942737
#> 9       ARID5B              0      .     GRCh37         10       63661999
#> 10      ARID5B              0      .     GRCh37         10       63810677
#>    End_Position Strand log.ratio CN
#> 1     245027102      + 0.4306932  3
#> 2      82784068      + 0.0000000  2
#> 3       2063762      + 0.0000000  2
#> 4       3786740      + 0.0000000  2
#> 5      23230400      + 0.0000000  2
#> 6      26123774      + 0.0000000  2
#> 7      47088059      + 0.0000000  2
#> 8      85942737      + 1.3219281  5
#> 9      63661999      + 0.5849625  3
#> 10     63810677      + 0.5849625  3
if (FALSE) { # \dontrun{
# This wouldn't work because the hg38 seg_data is not bundled
ssm_genomes_hg38 = get_coding_ssm(projection = "hg38",
                                  these_samples_metadata = some_meta)
cn_list = assign_cn_to_ssm(some_meta,ssm_genomes_hg38)

# Easiest/laziest way:
cn_list = assign_cn_to_ssm(projection = "grch37")


cn_list$maf %>% dplyr::group_by(Tumor_Sample_Barcode,CN) %>%
  dplyr::count()
} # }