A step-by-step example workflow from start to finish

library(GAMBLR.open)
library(tidyverse)

Training data

For simplicity, we’ll skip feature selection and train our new model on a set of genes that were pre-selected based on a model described in the DLBCLone preprint. The code below shows how to recover the features used to train that model, starting from the vector of its feature (column) names (model_feats), and to separate them into SSM, hotspot and SV features.
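Here, model_feats is assumed to already be in your session as a character vector of feature names (for example, taken from a previously trained model object). A quick way to get a feel for its contents:

# model_feats mixes plain gene symbols with *_SV and *HOTSPOT features
head(model_feats)
length(model_feats)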

SSM features

# Full set of genes considered for SSM

model_genes <- grep("_SV|HOTS", model_feats, invert = TRUE, value = TRUE)

model_genes
 [1] "ACTB"     "ACTG1"    "BCL10"    "BCL2"     "BCL2L1"   "BCL6"
 [7] "BIRC3"    "BRAF"     "BTG1"     "BTG2"     "BTK"      "CD19"
[13] "CD70"     "CD79B"    "CD83"     "CDKN2A"   "CREBBP"   "DDX3X"
[19] "DTX1"     "DUSP2"    "EDRF1"    "EIF4A2"   "EP300"    "ETS1"
[25] "ETV6"     "EZH2"     "FAS"      "FCGR2B"   "FOXC1"    "FOXO1"
[31] "GNA13"    "GRHPR"    "HLA-A"    "HLA-B"    "HNRNPD"   "IRF4"
[37] "IRF8"     "ITPKB"    "JUNB"     "KLF2"     "KLHL14"   "KLHL6"
[43] "KMT2D"    "MEF2B"    "MPEG1"    "MYD88"    "NFKBIA"   "NFKBIE"
[49] "NFKBIZ"   "NOL9"     "NOTCH1"   "NOTCH2"   "OSBPL10"  "PIM1"
[55] "PIM2"     "PRDM1"    "PRDM15"   "PRKDC"    "PRRC2C"   "PTPN1"
[61] "RFTN1"    "S1PR2"    "SETD1B"   "SGK1"     "SOCS1"    "SPEN"
[67] "STAT3"    "STAT6"    "TBCC"     "TBL1XR1"  "TET2"     "TMEM30A"
[73] "TMSB4X"   "TNFAIP3"  "TNFRSF14" "TOX"      "TP53"     "TP73"
[79] "UBE2A"    "WEE1"     "XBP1"     "ZFP36L1" 

Hotspot features

# Genes with hotspots considered

hotspot_features <- grep("HOTS", model_feats, value = TRUE)
hotspot_features
[1] "MYD88HOTSPOT"

SV features

SV_genes <- grep("_SV", model_feats, value = TRUE)
SV_genes
[1] "BCL2_SV" "BCL6_SV"

Load and subset the training data

The GAMBLR.predict package is bundled with a base-3 mutation status matrix for the training data set. The code below shows how to load it into a data frame (setting sample_id as the row names) and subset it to only the features described above.

# load the bundled mutation status matrix, using sample_id as row names
best_opt_model_matrix <- readr::read_tsv(
  system.file("extdata/all_full_status.tsv",
              package = "GAMBLR.predict")) %>%
  tibble::column_to_rownames("sample_id")

# keep only the columns you want to use for your model
best_opt_model_matrix <- best_opt_model_matrix %>%
  select(all_of(model_genes), all_of(hotspot_features), all_of(SV_genes))
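
Because the statuses are encoded in base 3, each feature can take one of three values per sample. A quick check of the dimensions and the encoding (exactly which three values appear is determined by the bundled file):

# how many samples and features remain, and which status values are used
dim(best_opt_model_matrix)
table(unlist(best_opt_model_matrix))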

To train a model, we also need ground truth labels. These come from another bundled file, which is the sample metadata for the samples in the mutation matrix we just loaded.

dlbcl_meta <- readr::read_tsv(
  system.file("extdata/dlbcl_meta_with_dlbclass.tsv",
              package = "GAMBLR.predict"))

# remove rows that don't correspond to unambiguous classes (i.e. drop composites)
dlbcl_meta_clean <- filter(
  dlbcl_meta,
  lymphgen %in% c("MCD","EZB","BN2","N1","ST2","Other")
)
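
It can be worth confirming the class balance of the cleaned labels before training:

# number of training samples per class
table(dlbcl_meta_clean$lymphgen)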

make_and_annotate_umap

# embed the training samples with UMAP, annotating each class with its core features
mu_everything <- make_and_annotate_umap(
  best_opt_model_matrix,
  dlbcl_meta_clean,
  core_features = list(
    ST2=c("SGK1","DUSP2","TET2","SOCS1"),
    N1="NOTCH1",
    EZB=c("EZH2","BCL2_SV"),
    MCD=c("MYD88HOTSPOT","CD79B","PIM1"),
    BN2=c("BCL6_SV","NOTCH2","SPEN","CD70")
  )
)
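
The pieces of the return value used later in this workflow are the per-sample data frame (mu_everything$df, for plotting) and the feature matrix (mu_everything$features, for parameter optimization). Base R can show the overall structure:

# top-level structure of the UMAP output
str(mu_everything, max.level = 1)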

make_umap_scatterplot

# scatterplot of the UMAP embedding, keeping samples labeled Other
make_umap_scatterplot(
  mu_everything$df,
  drop_other = FALSE
)

DLBCLone_optimize_params

# search values of k (number of neighbors) from 5 to 13, optimizing the
# handling of the "Other" outgroup
best_opt_model <- DLBCLone_optimize_params(
  mu_everything$features,
  dlbcl_meta_clean,
  umap_out = mu_everything,
  truth_classes = c("MCD", "EZB", "BN2", "N1", "ST2", "Other"),
  optimize_for_other = TRUE,
  min_k = 5,
  max_k = 13
)
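
The internals of the optimized model object aren't needed for the rest of the workflow, but you can list its top-level components if you're curious:

# top-level components of the optimized model
str(best_opt_model, max.level = 1)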

make_alluvial

DLBCLone with outgroup optimization (“wo”: w for weighted, o for optimized for Other).

make_alluvial(best_opt_model, pred_column = "DLBCLone_wo", pred_name = "DLBCLone_wo")

DLBCLone_predict

Predicting one training sample

test_sample <- "00-14595_tumorC"

# classify a single row of the training matrix with the optimized model
pred_train <- DLBCLone_predict(
  mutation_status = best_opt_model_matrix[test_sample, ],
  optimized_model = best_opt_model
)
knitr::kable(head(pred_train$prediction))
The prediction is a single-row data frame with many diagnostic columns (neighbor votes, per-class scores, UMAP coordinates and so on). The key values for this sample:

sample_id        00-14595_tumorC
predicted_label  EZB
confidence       1
other_score      0
neighbor_id      DLBCL11584T, CLC03456, DLBCL11428T, 05-32762T, LY_RELY_116_tumorA, 99-13280T, 05-25439T, QC2-32T, SP193546, SP193976, 14-20962T, FL1018T2, 10-36955_tumorB
distance         0.203, 0.223, 0.231, 0.259, 0.265, 0.293, 0.297, 0.306, 0.321, 0.34, 0.345, 0.349, 0.352
EZB_NN_count     13 (all other *_NN_count columns are 0)
EZB_score        46.06 (all other *_score columns are 0)
DLBCLone_w       EZB
DLBCLone_wo      EZB

All 13 nearest neighbors are labeled EZB, so the call is unambiguous.
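
Individual columns can be pulled out of the prediction directly, for example:

# the final calls from the weighted ("w") and outgroup-optimized ("wo") schemes
pred_train$prediction[, c("sample_id", "DLBCLone_w", "DLBCLone_wo")]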

make_neighborhood_plot

# plot the sample in the UMAP embedding alongside its nearest neighbors
make_neighborhood_plot(
  single_sample_prediction_output = pred_train,
  this_sample_id = "00-14595_tumorC",
  prediction_in_title = TRUE,
  add_circle = TRUE,
  label_column = "DLBCLone_wo"
)

nearest_neighbor_heatmap

# heatmap comparing this sample to its nearest neighbors
nearest_neighbor_heatmap(
  this_sample_id = "00-14595_tumorC",
  DLBCLone_model = pred_train,
  font_size = 7
)