R/sample_annotation.R
sample_annotation.Rd
For statistical analysis and filtering, the measurements need to be annotated with Filename, Condition, BioReplicate, and Run. This function takes this information from a txt file containing this meta-data.
sample_annotation(
data,
sample_annotation,
data_type = "OpenSWATH",
column_file = "filename",
change_run_id = TRUE,
verbose = FALSE
)
A data frame containing SWATH data.
A data frame containing the columns: Filename, Condition, BioReplicate, Run. The values contained in the column filename have to be present in the filename of the SWATH data.
Option to specify the format of the table, i.e. whether the column names of an OpenSWATH output or of an MSstats table are used.
Option to specify the column name where the injection file is specified. Default is set to "filename".
Option to choose if the run_id column shall be reassigned to a unique value combining the values of Condition, BioReplicate and Run. (Option only possible if data is of format "OpenSWATH".)
Option to turn on reporting of which filename is currently being processed.
Returns a data frame with each row annotated for the study design.
Given dataframes of TRIC processed data and sample annotations, mash them together into something appropriate for downstream analyses.
This performs some quick sanity checks on the data and annotations and creates the 'Condition', 'BioReplicate', and 'Run' columns along with other columns expected by MSstats/OpenSWATH.
data("OpenSWATH_data", package="SWATH2stats")
data("Study_design", package="SWATH2stats")
data <- SWATH2stats::sample_annotation(OpenSWATH_data, Study_design, verbose=TRUE)
#> peterb_J131223_043
#> peterb_J131223_054
#> peterb_L150425_003b_SW
#> peterb_L150425_011_SW
#> peterb_L150514_001_SW
#> peterb_L150514_002_SW
summary(data)
#> ProteinName FullPeptideName
#> 1/Protein6 :1227 GGVGAVALLIGPNAPLIFETPR : 18
#> 1/Protein7 : 216 GLLSPHPLLQLSYTATDPIR : 18
#> 1/Protein1 : 186 LLEEDKPEEPTAHAFVSTLTFHR : 18
#> 1/Protein3 : 156 LSGIDANPNALFPPVEFPAPHLR : 18
#> 1/iRT_protein: 114 NVGINSFGFGGSNVHIILRPNTQPPPAPAPHATLPGGR : 18
#> 1/Protein2 : 108 PGETLLIHSGSGGVGQAAIAIALSLGC(UniMod:4)RVR: 18
#> (Other) : 362 (Other) :2261
#> Charge
#> Min. :1.000
#> 1st Qu.:2.000
#> Median :2.000
#> Mean :2.643
#> 3rd Qu.:3.000
#> Max. :6.000
#>
#> aggr_Fragment_Annotation
#> 1000693_TVVVAFLGR_2;1000689_TVVVAFLGR_2;1000695_TVVVAFLGR_2;1000697_TVVVAFLGR_2;1000686_TVVVAFLGR_2;1000688_TVVVAFLGR_2 : 6
#> 1002329_VQALQTLGK_2;1002327_VQALQTLGK_2;1002331_VQALQTLGK_2;1002323_VQALQTLGK_2;1002326_VQALQTLGK_2;1002322_VQALQTLGK_2 : 6
#> 1005715_GLSLMMGR_2;1005711_GLSLMMGR_2;1005714_GLSLMMGR_2;1005709_GLSLMMGR_2;1005707_GLSLMMGR_2;1005710_GLSLMMGR_2 : 6
#> 1009308_AGQSNYGFANSAMENGR_2;1009315_AGQSNYGFANSAMENGR_2;1009302_AGQSNYGFANSAMENGR_2;1009310_AGQSNYGFANSAMENGR_2;1009300_AGQSNYGFANSAMENGR_2;1009313_AGQSNYGFANSAMENGR_2: 6
#> 1033816_ETSTNPIASIFAWTQGR_2;1033814_ETSTNPIASIFAWTQGR_2;1033809_ETSTNPIASIFAWTQGR_2;1033806_ETSTNPIASIFAWTQGR_2;1033819_ETSTNPIASIFAWTQGR_2;1033804_ETSTNPIASIFAWTQGR_2: 6
#> 1033831_ETSTNPIASIFAWTQGR_3;1033838_ETSTNPIASIFAWTQGR_3;1033836_ETSTNPIASIFAWTQGR_3;1033828_ETSTNPIASIFAWTQGR_3;1033826_ETSTNPIASIFAWTQGR_3;1033833_ETSTNPIASIFAWTQGR_3: 6
#> (Other) :2333
#> aggr_Peak_Area Condition BioReplicate
#> 0;0;0;0;0;0 : 12 Hela_Control :1185 Min. :1.000
#> 0.0;0.0;0.0;0.0;0.0;0.0 : 2 Hela_Treatment:1184 1st Qu.:1.000
#> 0.0;0.0;0.0;0.0;0.0;21.0 : 2 Median :2.000
#> 0.0;21.0;0.0;0.0;0.0;0.0 : 2 Mean :2.018
#> 0.0;0.0;105.0;126.0;126.0;42.0: 1 3rd Qu.:3.000
#> 0.0;0.0;105.0;63.0;21.0;42.0 : 1 Max. :3.000
#> (Other) :2349
#> Run transition_group_id
#> Min. :1.000 101540_PLLHNFHSFLYQPDWVKR_3_run0 : 6
#> 1st Qu.:2.000 101541_PLLHNFHSFLYQPDWVKR_4_run0 : 6
#> Median :4.000 106133_DTYFDELR_2_run0 : 6
#> Mean :3.535 106134_DTYFDRDVEELK_2_run0 : 6
#> 3rd Qu.:5.000 106135_DTYFDRDVEELK_3_run0 : 6
#> Max. :6.000 107313_DHPESPTPNPTEPLFLAQAEVYFLK_3_run0: 6
#> (Other) :2333
#> peptide_group_label run_id
#> 101540_PLLHNFHSFLYQPDWVKR_3 : 6 Length:2369
#> 101541_PLLHNFHSFLYQPDWVKR_4 : 6 Class :character
#> 106133_DTYFDELR_2 : 6 Mode :character
#> 106134_DTYFDRDVEELK_2 : 6
#> 106135_DTYFDRDVEELK_3 : 6
#> 107313_DHPESPTPNPTEPLFLAQAEVYFLK_3: 6
#> (Other) :2333
#> filename
#> /scratch/91384900.tmpdir/peterb_L150425_003b_SW.mzXML.gz:400
#> /scratch/91385021.tmpdir/peterb_J131223_043_SW.mzXML.gz :382
#> /scratch/91489123.tmpdir/peterb_L150514_001_SW.mzXML.gz :403
#> /scratch/91489166.tmpdir/peterb_L150425_011_SW.mzXML.gz :399
#> /scratch/91489180.tmpdir/peterb_L150514_002_SW.mzXML.gz :403
#> /scratch/91489194.tmpdir/peterb_J131223_054_SW.mzXML.gz :382
#>
#> RT id
#> Min. : 283.1 000ffaba-1fd0-11e5-b8c4-b499baa9c76e: 1
#> 1st Qu.:2685.3 00a711f0-2004-11e5-90be-7798a41a3f28: 1
#> Median :3957.1 00e9357a-2030-11e5-9551-3184f733b568: 1
#> Mean :3997.4 017119c8-2018-11e5-974a-e839352e1150: 1
#> 3rd Qu.:5262.8 01bb376e-2023-11e5-87b3-e839352e1150: 1
#> Max. :8106.8 01d045da-2020-11e5-974a-e839352e1150: 1
#> (Other) :2363
#> Sequence m.z Intensity
#> GGVGAVALLIGPNAPLIFETPR : 18 Min. : 399.5 Min. : 0
#> GLLSPHPLLQLSYTATDPIR : 18 1st Qu.: 519.8 1st Qu.: 4058
#> HTMDPQLAQR : 18 Median : 632.4 Median : 11444
#> LLEEDKPEEPTAHAFVSTLTFHR: 18 Mean : 660.1 Mean : 32694
#> LSGIDANPNALFPPVEFPAPHLR: 18 3rd Qu.: 763.3 3rd Qu.: 28995
#> LVNPEGPTLMLSR : 18 Max. :1167.6 Max. :2499570
#> (Other) :2261
#> decoy assay_rt delta_rt leftWidth
#> Min. :0.00000 Min. : 257.4 Min. :-265.88 Min. : 266.1
#> 1st Qu.:0.00000 1st Qu.:2452.2 1st Qu.: -12.70 1st Qu.:2661.5
#> Median :0.00000 Median :3672.9 Median : 21.58 Median :3938.6
#> Mean :0.02617 Mean :3702.4 Mean : 19.98 Mean :3977.2
#> 3rd Qu.:0.00000 3rd Qu.:4857.1 3rd Qu.: 56.05 3rd Qu.:5243.5
#> Max. :1.00000 Max. :7690.1 Max. : 281.73 Max. :8085.3
#> NA's :990 NA's :990
#> main_var_xx_swath_prelim_score norm_RT nr_peaks peak_apices_sum
#> Min. :0.4395 Min. :-37.45 Min. :6 Min. : -6
#> 1st Qu.:2.9025 1st Qu.: 17.96 1st Qu.:6 1st Qu.: 1868
#> Median :3.9863 Median : 50.13 Median :6 Median : 4232
#> Mean :4.0030 Mean : 49.15 Mean :6 Mean : 11126
#> 3rd Qu.:5.0904 3rd Qu.: 78.46 3rd Qu.:6 3rd Qu.: 10425
#> Max. :7.1822 Max. :162.65 Max. :6 Max. :373786
#> NA's :990 NA's :990 NA's :990
#> potentialOutlier rightWidth rt_score
#> none :977 Min. : 300.1 Min. :0.0001
#> 364038_AEAIKADK_2 : 6 1st Qu.:2704.4 1st Qu.:0.4504
#> 1931905_TLQDYLK_2 : 5 Median :3974.1 Median :1.0463
#> 1002326_VQALQTLGK_2: 4 Mean :4018.6 Mean :1.2806
#> 1094770_PLYLSTWGK_2: 4 3rd Qu.:5283.5 3rd Qu.:1.8740
#> (Other) :383 Max. :8128.2 Max. :7.9886
#> NA's :990 NA's :990
#> sn_ratio total_xic var_bseries_score dotprod_score
#> Min. : 1.529 Min. : 1890 Min. : 0.000 Min. :0.3871
#> 1st Qu.: 11.162 1st Qu.: 53738 1st Qu.: 1.000 1st Qu.:0.7841
#> Median : 28.542 Median : 132168 Median : 2.000 Median :0.8217
#> Mean : 44.632 Mean : 167410 Mean : 2.632 Mean :0.8046
#> 3rd Qu.: 66.171 3rd Qu.: 236214 3rd Qu.: 4.000 3rd Qu.:0.8463
#> Max. :216.330 Max. :2664292 Max. :12.000 Max. :0.9138
#> NA's :990 NA's :990 NA's :990 NA's :990
#> var_intensity_score var_isotope_correlation_score var_isotope_overlap_score
#> Min. :0.0188 Min. :-0.0255 Min. :0.0000
#> 1st Qu.:0.1218 1st Qu.: 0.8505 1st Qu.:0.0000
#> Median :0.1984 Median : 0.9460 Median :0.0000
#> Mean :0.2646 Mean : 0.8951 Mean :0.0213
#> 3rd Qu.:0.3450 3rd Qu.: 0.9870 3rd Qu.:0.0000
#> Max. :0.9723 Max. : 0.9996 Max. :0.8841
#> NA's :990 NA's :990 NA's :990
#> var_library_corr library_dotprod library_manhattan var_library_rmsd
#> Min. :-0.8229 Min. :0.7606 Min. :0.0211 Min. :0.0045
#> 1st Qu.: 0.6495 1st Qu.:0.9735 1st Qu.:0.1021 1st Qu.:0.0295
#> Median : 0.8590 Median :0.9868 Median :0.1431 Median :0.0436
#> Mean : 0.7242 Mean :0.9783 Mean :0.1646 Mean :0.0495
#> 3rd Qu.: 0.9595 3rd Qu.:0.9933 3rd Qu.:0.2033 3rd Qu.:0.0612
#> Max. : 0.9998 Max. :0.9998 Max. :0.6721 Max. :0.1950
#> NA's :990 NA's :990 NA's :990 NA's :990
#> library_rootmeansquare library_sangle var_log_sn_score manhatt_score
#> Min. :0.0052 Min. :0.0168 Min. :0.4244 Min. :0.4609
#> 1st Qu.:0.0358 1st Qu.:0.1749 1st Qu.:2.4125 1st Qu.:0.5894
#> Median :0.0539 Median :0.2695 Median :3.3514 Median :0.6293
#> Mean :0.0615 Mean :0.3014 Mean :3.2692 Mean :0.6556
#> 3rd Qu.:0.0760 3rd Qu.:0.3853 3rd Qu.:4.1922 3rd Qu.:0.6890
#> Max. :0.2650 Max. :1.0627 Max. :5.3768 Max. :1.4546
#> NA's :990 NA's :990 NA's :990 NA's :990
#> var_massdev_score var_massdev_score_weighted var_norm_rt_score
#> Min. : 0.5502 Min. : 0.4669 Min. :0.0000
#> 1st Qu.: 2.4458 1st Qu.: 2.1110 1st Qu.:0.0045
#> Median : 3.6665 Median : 3.1560 Median :0.0105
#> Mean : 4.3298 Mean : 3.6466 Mean :0.0128
#> 3rd Qu.: 5.5251 3rd Qu.: 4.5974 3rd Qu.:0.0187
#> Max. :20.9588 Max. :17.1705 Max. :0.0799
#> NA's :990 NA's :990 NA's :990
#> var_xcorr_coelution var_xcorr_coelution_weighted var_xcorr_shape
#> Min. :0.0000 Min. :0.0000 Min. :0.4888
#> 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.7857
#> Median :0.5014 Median :0.0780 Median :0.8769
#> Mean :0.8716 Mean :0.2444 Mean :0.8626
#> 3rd Qu.:1.1946 3rd Qu.:0.3683 3rd Qu.:0.9533
#> Max. :7.9633 Max. :2.9939 Max. :0.9987
#> NA's :990 NA's :990 NA's :990
#> var_xcorr_shape_weighted var_yseries_score elution_model_fit_score
#> Min. :0.5937 Min. : 0.000 Mode:logical
#> 1st Qu.:0.8093 1st Qu.: 3.000 NA's:2369
#> Median :0.9014 Median : 5.000
#> Mean :0.8787 Mean : 5.128
#> 3rd Qu.:0.9631 3rd Qu.: 7.000
#> Max. :0.9989 Max. :14.000
#> NA's :990 NA's :990
#> xx_lda_prelim_score xx_swath_prelim_score aggr_Peak_Apex
#> Min. :3.356 Min. :0 NA;NA;NA;NA;NA;NA:1379
#> 1st Qu.:6.301 1st Qu.:0 NA's : 990
#> Median :7.303 Median :0
#> Mean :7.249 Mean :0
#> 3rd Qu.:8.358 3rd Qu.:0
#> Max. :9.712 Max. :0
#> NA's :990 NA's :990
#> d_score m_score peak_group_rank align_runid
#> Min. :-10.0000 Min. :0.0000000 Min. :-1.000 0_119:403
#> 1st Qu.:-10.0000 1st Qu.:0.0000000 1st Qu.:-1.000 0_177:399
#> Median : 3.8893 Median :0.0002959 Median : 1.000 0_181:403
#> Mean : -0.4702 Mean :0.8360548 Mean : 0.165 0_210:382
#> 3rd Qu.: 6.8619 3rd Qu.:2.0000000 3rd Qu.: 1.000 0_221:400
#> Max. : 14.6997 Max. :2.0000000 Max. : 2.000 0_29 :382
#>
#> align_origfilename
#> /cluster/scratch_xp/shareholder/imsb_ra/workflows//1506190856/DATASET_CODE_124/PyProphet/peterb_J131223_043_SW_with_dscore_filtered.csv:382
#> /cluster/scratch_xp/shareholder/imsb_ra/workflows//1506190856/DATASET_CODE_205/PyProphet/peterb_L150514_001_SW_with_dscore_filtered.csv:403
#> /cluster/scratch_xp/shareholder/imsb_ra/workflows//1506190856/DATASET_CODE_258/PyProphet/peterb_L150425_011_SW_with_dscore_filtered.csv:399
#> /cluster/scratch_xp/shareholder/imsb_ra/workflows//1506190856/DATASET_CODE_261/PyProphet/peterb_L150514_002_SW_with_dscore_filtered.csv:403
#> /cluster/scratch_xp/shareholder/imsb_ra/workflows//1506190856/DATASET_CODE_288/PyProphet/peterb_J131223_054_SW_with_dscore_filtered.csv:382
#> /cluster/scratch_xp/shareholder/imsb_ra/workflows//1506190856/DATASET_CODE_36/PyProphet/peterb_L150425_003b_SW_with_dscore_filtered.csv:400
#>
#> align_clusterid
#> Min. :1
#> 1st Qu.:1
#> Median :1
#> Mean :1
#> 3rd Qu.:1
#> Max. :1
#>