Source code for full_dia.scoring

import numpy as np
import pandas as pd
import torch
from numba import cuda, jit

from full_dia import cfg, deepmall, deepmap, fxic, tims, utils
from full_dia.log import Logger

try:
    # Probe for an already-defined ``profile`` name.
    # NOTE(review): presumably injected by an external line profiler
    # (e.g. kernprof) -- confirm.
    _ = profile
except NameError:

    def profile(func):
        """Fallback no-op decorator: return ``func`` unchanged."""
        return func


# Module-level logger obtained from the project's Logger helper.
logger = Logger.get_logger()
@profile
def score_locus(
    df_target: pd.DataFrame,
    ms: tims.Tims,
    model_center: torch.nn.Module,
    model_big: torch.nn.Module,
) -> pd.DataFrame:
    """
    Calculate function-based and learning-based scores for PSMs.

    The input is processed swath by swath and, inside a swath, in batches of
    ``cfg.batch_deep_big`` rows. The GPU maps copied for a swath are released
    in a ``finally`` clause, so they are freed even when scoring a batch
    raises.

    Parameters
    ----------
    df_target : pd.DataFrame
        Provide the PSM information.

    ms : tims.Tims
        MS data.

    model_center : torch.nn.Module
        DeepProfile-14

    model_big : torch.nn.Module
        DeepProfile-56

    Returns
    -------
    df : pd.DataFrame
        Scores have been appended to the DataFrame in columns prefixed with
        "score_".
    """
    df_good = []
    for swath_id in df_target["swath_id"].unique():
        df_swath = df_target[df_target["swath_id"] == swath_id]
        df_swath = df_swath.reset_index(drop=True)

        # map_gpu
        ms1_profile, ms2_profile = ms.copy_map_to_gpu(swath_id, centroid=False)
        ms1_centroid, ms2_centroid = ms.copy_map_to_gpu(swath_id, centroid=True)

        try:
            # Fixed-size batching: may split two locus that belong to a pr.
            batch_n = cfg.batch_deep_big
            for _, df_batch in df_swath.groupby(df_swath.index // batch_n):
                df_batch = df_batch.reset_index(drop=True)

                # deep scores and deep features
                scores_deep_v, features_deep_v = deepmap.extract_scoring_big(
                    model_center,
                    model_big,
                    df_batch,
                    ms1_profile,
                    ms2_profile,
                    cfg.map_cycle_dim,
                    cfg.map_im_gap,
                    cfg.map_im_dim,
                    cfg.tol_ppm,
                    cfg.tol_im_map,
                )

                # XICs at the full ppm tolerance (all elution groups) ...
                _, rts, ims_v, mzs_v, xics_v = fxic.extract_xics(
                    df_batch,
                    ms1_centroid,
                    ms2_centroid,
                    cfg.tol_ppm,
                    cfg.tol_im_xic,
                    cycle_num=13,
                    scope="big",
                )
                # ... and XIC-only extractions at 0.5x and 0.25x the tolerance.
                _, _, xics_ppm1 = fxic.extract_xics(
                    df_batch,
                    ms1_centroid,
                    ms2_centroid,
                    cfg.tol_ppm * 0.5,
                    cfg.tol_im_xic,
                    cycle_num=13,
                    only_xic=True,
                )
                _, _, xics_ppm2 = fxic.extract_xics(
                    df_batch,
                    ms1_centroid,
                    ms2_centroid,
                    cfg.tol_ppm * 0.25,
                    cfg.tol_im_xic,
                    cycle_num=13,
                    only_xic=True,
                )

                # sa scores
                df_batch = scoring_other_elution(df_batch, xics_v[0], x="left")
                df_batch, xics = scoring_main_elution(df_batch, xics_v[1], x="center")
                df_batch = scoring_other_elution(df_batch, xics_v[2], x="1H")
                df_batch = scoring_other_elution(df_batch, xics_v[3], x="2H")
                df_batch, _ = scoring_main_elution(df_batch, xics_ppm1, x="center_p1")
                df_batch, _ = scoring_main_elution(df_batch, xics_ppm2, x="center_p2")

                # intensity, similarity, height ratio, area, snr
                df_batch = scoring_center_snr(df_batch, xics)
                df_batch = scoring_xic_intensity(df_batch, xics, rts)

                # deep
                df_batch = scoring_by_deep_prob(df_batch, scores_deep_v, x="pre")
                df_batch = scoring_by_deep_layer(df_batch, features_deep_v, x="pre")

                # rt
                df_batch = scoring_rt(df_batch)

                # im
                df_batch = scoring_center_im(df_batch, ims_v[1])

                # mz
                df_batch = scoring_center_mz(df_batch, mzs_v[1])

                # cross scores
                df_batch = scoring_by_cross(df_batch)

                df_good.append(df_batch)
        finally:
            # Always hand the per-swath GPU maps back, even on failure.
            utils.release_gpu_scans(
                ms1_profile, ms2_profile, ms1_centroid, ms2_centroid
            )

    df = pd.concat(df_good, axis=0, ignore_index=True)
    df = scoring_putatives(df)  # competitive for two locus from a pr
    df = scoring_meta(df)  # meta scores

    return df
def scoring_by_deep_prob(
    df_batch: pd.DataFrame, scores_deep_v: list, x: str
) -> pd.DataFrame:
    """
    Append the inference scores of DeepProfile to df.

    Parameters
    ----------
    df_batch : pd.DataFrame
        The object. It is modified in place and returned.

    scores_deep_v : list
        The inference scores of DeepProfile, read at positions 0..4
        (left, center, 1H, 2H, big). A ``None`` entry is skipped.

    x : str
        Suffix of the created column names, e.g.
        "pre": scores are from the pretrain models.
        "refine": scores are from the refinement models.
        "refine_p1": scores are from the refinement models with 0.5 * ppm.
        "refine_p2": scores are from the refinement models with 0.25 * ppm.

    Returns
    -------
    df : pd.DataFrame
        The inference scores of DeepProfile have been appended as
        ``score_<region>_deep_<x>`` columns.
    """
    regions = ("left", "center", "1H", "2H", "big")
    for slot, region in enumerate(regions):
        region_scores = scores_deep_v[slot]
        if region_scores is None:
            continue
        df_batch[f"score_{region}_deep_{x}"] = region_scores
    return df_batch
@profile
def scoring_by_deep_layer(
    df_batch: pd.DataFrame, features_deep_v: list, x: str
) -> pd.DataFrame:
    """
    Append the feature layers scores of DeepProfile to df.

    Parameters
    ----------
    df_batch : pd.DataFrame
        The object. It is modified in place and returned.

    features_deep_v : list
        The feature layers scores of DeepProfile. Each item contributes
        ``item.shape[-1]`` columns; numbering continues across items.

    x : str
        Suffix used in the column names, e.g.
        "pre": scores are from the pretrain models.
        "refine_p1": scores are from the refinement models with 0.5 * ppm.
        "refine_p2": scores are from the refinement models with 0.25 * ppm.

    Returns
    -------
    df : pd.DataFrame
        The feature layers scores of DeepProfile have been appended as
        ``score_ft_deep_<x>_<i>`` columns.
    """
    offset = 0
    for layer in features_deep_v:
        width = layer.shape[-1]
        names = [f"score_ft_deep_{x}_{i}" for i in range(offset, offset + width)]
        df_batch[names] = layer
        offset += width
    return df_batch
@profile
def scoring_other_elution(
    df_batch: pd.DataFrame, xics: torch.Tensor, x: str
) -> pd.DataFrame:
    """
    Calculate the elution scores for the isotope type named by x
    (x: ['left', '1H', '2H']), all taken at the center cycle:

    1. ``score_{x}_elution_{i}``: per-ion value for every ion
    2. ``score_{x}_coelution``: the coelution value returned by
       ``fxic.cal_coelution_by_gaussion``
    3. ``score_{x}_coelution_top6``: sum over the first six fragment ions
    4. ``score_{x}_coelution_rest`` / ``_rest_norm``: sum over the remaining
       fragment ions, raw and divided by ``fg_num - 6``

    Returns df_batch untouched when xics is None; otherwise df_batch is
    modified in place and returned.
    """
    if xics is None:
        return df_batch

    n_fg = df_batch["fg_num"].values

    smoothed = fxic.gpu_simple_smooth(cuda.as_cuda_array(xics))
    coelution_t, elution_t = fxic.cal_coelution_by_gaussion(
        smoothed, cfg.window_points, 2 + n_fg
    )

    # Keep only the value at the middle cycle of each XIC.
    apex = int(smoothed.shape[-1] / 2)
    rows = np.arange(len(df_batch))
    coelution_apex = coelution_t[rows, apex].cpu().numpy()
    elution_apex = elution_t[rows, :, apex].cpu().numpy()

    # per-ion values
    ion_cols = [f"score_{x}_elution_{i}" for i in range(elution_apex.shape[-1])]
    df_batch[ion_cols] = elution_apex

    # overall coelution
    df_batch[f"score_{x}_coelution"] = coelution_apex

    # The first two ion slots are skipped; fragments start at index 2.
    fragment_sa = elution_apex[:, 2:].copy()
    df_batch[f"score_{x}_coelution_top6"] = fragment_sa[:, :6].copy().sum(axis=1)

    # Remaining fragments: raw sum and sum per extra fragment. The 1e-7
    # offset keeps the divisor non-zero when fg_num == 6; negative ratios
    # (fg_num < 6) are clamped to 0.
    rest_sum = fragment_sa[:, 6:].sum(axis=1)
    rest_per_ion = rest_sum / (n_fg - 6 - 1e-7)
    rest_per_ion[rest_per_ion < 0] = 0
    df_batch[f"score_{x}_coelution_rest"] = rest_sum.astype(np.float32)
    df_batch[f"score_{x}_coelution_rest_norm"] = rest_per_ion.astype(np.float32)

    return df_batch
@profile
def scoring_main_elution(
    df_batch: pd.DataFrame, xics: torch.Tensor, x: str
) -> "tuple[pd.DataFrame, torch.Tensor]":
    """
    Calculate the elution scores for the monoisotope type named by x
    (x: ['center', 'center_p1', 'center_p2']), all taken at the center cycle:

    1. ``score_{x}_elution_{i}``: per-ion value for every ion
    2. ``score_{x}_coelution``: the coelution value returned by
       ``fxic.cal_coelution_by_gaussion``
    3. ``score_{x}_coelution_top6``: sum over the first six fragment ions
    4. ``score_{x}_coelution_rest`` / ``_rest_norm``: sum over the remaining
       fragment ions, raw and divided by ``fg_num - 6``
    5. ``score_{x}_elution_b_top1/2/3``: sum of the 1/2/3 largest values
       among b-series fragments (only when x contains no "p")

    Returns
    -------
    df_batch : pd.DataFrame
        The input frame, modified in place.
    xics : torch.Tensor
        The smoothed XICs converted back to a tensor.
    """
    fg_num = df_batch["fg_num"].values

    xics = cuda.as_cuda_array(xics)
    xics = fxic.gpu_simple_smooth(xics)
    coelutions, elutions = fxic.cal_coelution_by_gaussion(
        xics, cfg.window_points, 2 + fg_num
    )

    # Keep only the value at the middle cycle of each XIC.
    center_idx = int(xics.shape[-1] / 2)
    idx_x = np.arange(len(df_batch))
    coelutions = coelutions[idx_x, center_idx].cpu().numpy()
    elutions = elutions[idx_x, :, center_idx].cpu().numpy()

    # overall coelution and per-ion values
    df_batch[f"score_{x}_coelution"] = coelutions.astype(np.float32)
    columns = [f"score_{x}_elution_{i}" for i in range(elutions.shape[-1])]
    df_batch[columns] = elutions

    # Fragments start at index 2; copy because the b-ion step below
    # overwrites entries.
    fg_elutions = elutions[:, 2:].copy()
    df_batch[f"score_{x}_coelution_top6"] = fg_elutions[:, :6].copy().sum(axis=1)

    # Remaining fragments: raw sum and sum per extra fragment. The 1e-7
    # offset keeps the divisor non-zero when fg_num == 6; negative ratios
    # (fg_num < 6) are clamped to 0.
    elution_rest = fg_elutions[:, 6:].sum(axis=1)
    elution_rest_norm = elution_rest / (fg_num - 6 - 1e-7)
    elution_rest_norm[elution_rest_norm < 0] = 0
    df_batch[f"score_{x}_coelution_rest"] = elution_rest.astype(np.float32)
    df_batch[f"score_{x}_coelution_rest_norm"] = elution_rest_norm.astype(
        np.float32
    )

    # sum of top1/2/3 b ions
    if "p" not in x:  # ppm-10/5 are not available
        cols_anno = ["fg_anno_" + str(i) for i in range(cfg.fg_num)]
        fg_anno = df_batch[cols_anno].values
        fg_type = fg_anno // 1000
        fg_elutions[fg_type != 1] = 0  # non-b series set to 0
        fg_elutions = np.sort(fg_elutions, axis=1)[:, ::-1]  # descending
        df_batch[f"score_{x}_elution_b_top1"] = fg_elutions[:, 0]
        df_batch[f"score_{x}_elution_b_top2"] = fg_elutions[:, :2].sum(axis=1)
        df_batch[f"score_{x}_elution_b_top3"] = fg_elutions[:, :3].sum(axis=1)

    return df_batch, utils.convert_numba_to_tensor(xics)
@profile
def scoring_xic_intensity(
    df_batch: pd.DataFrame, xics: torch.Tensor, rts: np.ndarray
) -> pd.DataFrame:
    """
    Calculate the intensity related scores.

    Only the first eight ion traces (two leading traces plus six fragments)
    are used. Intensities outside the elution boundary estimated by
    ``fxic.estimate_xic_boundary`` are zeroed first.

    apex intensities: ms1, unfrag, ms2_total, ms2_relative, ms1/ms2, similarity
    profile areas: ms1, unfrag, ms2_total, ms2_relative, ms1/ms2, similarity

    ``df_batch`` must already contain ``score_center_elution_0..13`` and
    ``fg_height_0..5``. It is modified in place and returned.
    """
    center_idx = int(xics.shape[-1] / 2)

    cols = ["score_center_elution_" + str(i) for i in range(14)]
    elutions = df_batch[cols].values + 1e-7

    # boundary
    sa_m = torch.from_numpy(elutions).to(cfg.gpu_id)
    locus_start_v, locus_end_v = fxic.estimate_xic_boundary(xics, sa_m)
    locus_start_v = locus_start_v.astype(np.int8)
    locus_end_v = locus_end_v.astype(np.int8)
    df_batch["score_elute_span_left"] = locus_start_v
    df_batch["score_elute_span_right"] = locus_end_v
    df_batch["score_elute_span"] = locus_end_v - locus_start_v

    # outside of boundary set to 0
    xics_np = xics[:, :8, :].cpu().numpy()
    cycle_idx = np.arange(xics_np.shape[2])
    inside = (cycle_idx >= locus_start_v[:, None, None]) & (
        cycle_idx <= locus_end_v[:, None, None]
    )
    xics_np = xics_np * inside

    # intensity: ms1 and ms2_total
    ms1_heights = xics_np[:, 0, center_idx]
    unfrag_heights = xics_np[:, 1, center_idx]
    ms2_heights = xics_np[:, 2:, center_idx]
    ms2_height_sum = ms2_heights.sum(axis=1)
    df_batch["score_intensity_ms1"] = np.log(ms1_heights + 1.0)
    df_batch["score_intensity_unfrag"] = np.log(unfrag_heights + 1.0)
    df_batch["score_intensity_ms2_total"] = np.log(ms2_height_sum + 1.0)

    # intensity: ms2_relative
    row_max = np.max(ms2_heights, axis=1, keepdims=True) + 1e-7
    ms2_heights_norm = ms2_heights / row_max
    m = ms2_heights_norm.shape[-1]
    columns = ["score_intensity_ms2_relative_" + str(i) for i in range(m)]
    df_batch[columns] = ms2_heights_norm

    # intensity: ms1/ms2
    ms1_ms2_ratio = ms1_heights / (ms2_height_sum + 1e-7)
    df_batch["score_intensity_ms1_ms2_ratio"] = np.log(ms1_ms2_ratio + 1e-7)

    # intensity: similarity against the library heights
    cols_height = ["fg_height_" + str(i) for i in range(6)]
    ms2_lib = df_batch[cols_height].values
    pcc = utils.cal_sa_by_np(ms2_lib, ms2_heights_norm)
    df_batch["score_intensity_similarity"] = pcc
    df_batch["score_intensity_similarity_cube"] = pcc**3

    # area: integrate every trace over its retention-time axis.
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # prefer the new name and fall back on older NumPy.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    rts_per_ion = np.repeat(rts[:, np.newaxis, :], xics_np.shape[1], axis=1)
    areas = trapezoid(xics_np, x=rts_per_ion, axis=2)

    # area: ms1, unfrag, ms2
    ms1_areas = areas[:, 0]
    unfrag_areas = areas[:, 1]
    ms2_areas = areas[:, 2:]
    ms2_area_sum = ms2_areas.sum(axis=1)
    df_batch["score_area_ms1"] = np.log(ms1_areas + 1.0)
    df_batch["score_area_unfrag"] = np.log(unfrag_areas + 1.0)
    df_batch["score_area_ms2_total"] = np.log(ms2_area_sum + 1.0)

    # area: ms2_relative
    row_max = np.max(ms2_areas, axis=1, keepdims=True) + 1e-7
    ms2_areas_norm = ms2_areas / row_max
    m = ms2_areas_norm.shape[-1]
    columns = ["score_area_relative_" + str(i) for i in range(m)]
    df_batch[columns] = ms2_areas_norm

    # area: ms1/ms2
    ms1_ms2_ratio = ms1_areas / (ms2_area_sum + 1e-7)
    df_batch["score_area_ms1_ms2_ratio"] = np.log(ms1_ms2_ratio + 1e-7)

    # area: similarity against the library heights
    pcc = utils.cal_sa_by_np(ms2_lib, ms2_areas_norm)
    df_batch["score_area_similarity"] = pcc
    df_batch["score_area_similarity_cube"] = pcc**3

    return df_batch
def scoring_rt(df_batch: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate RT related scores.

    Reads the ``measure_rt`` and ``pred_rt`` columns and appends both raw
    values, several transforms of their absolute difference (abs, square,
    square root, log1p) and the smaller/larger ratio. ``df_batch`` is
    modified in place and returned.
    """
    observed = df_batch["measure_rt"].values
    predicted = df_batch["pred_rt"].values
    abs_error = np.abs(predicted - observed)

    df_batch["score_measure_rt"] = observed
    df_batch["score_pred_rt"] = predicted

    # transforms of the absolute RT deviation
    df_batch["score_rt_abs"] = abs_error
    df_batch["score_rt_power"] = abs_error**2
    df_batch["score_rt_root"] = abs_error**0.5
    df_batch["score_rt_log"] = np.log(abs_error + 1.0)

    # ratio of the smaller to the larger RT
    lower = np.minimum(observed, predicted)
    upper = np.maximum(observed, predicted)
    df_batch["score_rt_ratio"] = lower / upper

    return df_batch
@profile
def scoring_center_snr(df_batch: pd.DataFrame, xics: torch.Tensor) -> pd.DataFrame:
    """
    Calculate SNR related scores with the center cycle MS/MS.

    Signal is the maximum over the three cycles around the center; noise is
    the median of the whole trace. SNR is ``(signal + 1) / (noise + 1)``.

    1. ``score_center_snr_{i}``: log SNR for every ion
    2. ``score_center_snr_average1``: log of the sum divided by 2 + fg_num
    3. ``score_center_snr_average2``: log of the mean weighted by sa
    4. ``score_center_snr_average3``: log of the sa-weighted mean over
       ion indices 2..7

    ``df_batch`` must already contain ``score_center_elution_0..13``. It is
    modified in place and returned.
    """
    apex = int(xics.shape[-1] / 2)
    peak = xics[:, :, apex - 1 : apex + 2].amax(dim=2)
    baseline = xics.median(dim=2).values
    ratio = ((peak + 1) / (baseline + 1)).cpu().numpy()

    n_fg = df_batch["fg_num"].values

    # 1. per-ion SNR
    ion_cols = ["score_center_snr_" + str(i) for i in range(ratio.shape[-1])]
    df_batch[ion_cols] = np.log(ratio)

    # 2. plain mean over the 2 + fg_num ions
    plain_mean = ratio.sum(axis=1) / (2 + n_fg)
    df_batch["score_center_snr_average1"] = np.log(plain_mean + 1e-7)

    # 3. mean weighted by sa (offset keeps the weights strictly positive)
    sa_cols = ["score_center_elution_" + str(i) for i in range(14)]
    sa_weights = df_batch[sa_cols].values + 1e-7
    weighted_all = np.average(ratio, weights=sa_weights, axis=1)
    df_batch["score_center_snr_average2"] = np.log(weighted_all + 1e-7)

    # 4. same, restricted to the first six fragment ions
    weighted_top6 = np.average(ratio[:, 2:8], weights=sa_weights[:, 2:8], axis=1)
    df_batch["score_center_snr_average3"] = np.log(weighted_top6 + 1e-7)

    return df_batch
@profile
def scoring_center_im(df_batch: pd.DataFrame, ims_input: np.ndarray) -> pd.DataFrame:
    """
    Calculate mobility related scores with the center cycle MS/MS.

    1. ``score_imbias_{i}``: |measured im - pred_im| for every ion, capped
       at ``cfg.tol_im_xic``
    2. ``score_imbias_average1``: mean over the fg_num fragment ions
    3. ``score_imbias_average2``: fragment mean weighted by sa
    4. ``score_imbias_average3``: mean of the first six fragments weighted
       by sa

    ``ims_input`` is not modified: the center-cycle slice is copied before
    missing values are replaced. ``df_batch`` must already contain
    ``score_center_elution_0..13``; it is modified in place and returned.
    """
    center_idx = int(ims_input.shape[-1] / 2)
    # Copy so that the clean-up below does not write through the view into
    # the caller's array.
    ims = ims_input[:, :, center_idx].copy()
    ims[ims < 0.0] = 0.0  # missing value -- 0
    fg_num = df_batch["fg_num"].values

    # im for precursor
    df_batch["score_pred_im"] = df_batch["pred_im"]
    df_batch["score_measure_im"] = df_batch["measure_im"]

    # imbias for ions, missing value -- tol
    bias = np.abs(ims - df_batch["pred_im"].values[:, None])
    bias[bias > cfg.tol_im_xic] = cfg.tol_im_xic

    # 1. imbias for every ion
    columns = ["score_imbias_" + str(i) for i in range(bias.shape[-1])]
    df_batch[columns] = bias

    # 2. mean: zero the slots beyond each row's fg_num, then average
    bias_ms2 = bias[:, 2:]
    bias_ms2[fg_num[:, None] <= np.arange(bias_ms2.shape[1])] = 0
    df_batch["score_imbias_average1"] = bias_ms2.sum(axis=1) / fg_num

    # 3. mean weighting by sa
    cols = ["score_center_elution_" + str(i) for i in range(14)]
    elutions = df_batch[cols].values + 1e-7
    df_batch["score_imbias_average2"] = np.average(
        bias_ms2, weights=elutions[:, 2:], axis=1
    )

    # 4. mean of top-6 weighting by sa
    df_batch["score_imbias_average3"] = np.average(
        bias_ms2[:, :6], weights=elutions[:, 2:8], axis=1
    )

    return df_batch
@profile
def scoring_center_mz(df_batch: pd.DataFrame, mzs_input: np.ndarray) -> pd.DataFrame:
    """
    Calculate ppm related scores with the center cycle MS/MS.

    1. ``score_ppm_{i}``: ppm error for every ion, capped at ``cfg.tol_ppm``
    2. ``score_ppm_average1``: mean over the fg_num fragment ions
    3. ``score_ppm_average2``: fragment mean weighted by sa
    4. ``score_ppm_average3``: mean of the first six fragments weighted by sa

    Also stores ``score_pr_mz`` (library precursor m/z) and
    ``score_pr_mz_measure`` (measured value of ion 0).

    ``mzs_input`` is not modified: the center-cycle slice is copied before
    missing values are replaced. ``df_batch`` must already contain
    ``score_center_elution_0..13``; it is modified in place and returned.
    """
    center_idx = int(mzs_input.shape[-1] / 2)
    # Copy so that the clean-up below does not write through the view into
    # the caller's array.
    mzs = mzs_input[:, :, center_idx].copy()
    mzs[mzs < 0.0] = 0.0  # missing value -- 0
    fg_num = df_batch["fg_num"].values

    # mz for precursor
    df_batch["score_pr_mz"] = df_batch["pr_mz"]
    df_batch["score_pr_mz_measure"] = mzs[:, 0]

    # ppm: expected m/z is the precursor m/z for the first two ion slots,
    # followed by the library fragment m/z values.
    mzs_pr = df_batch["pr_mz"].values.reshape(-1, 1)
    cols_center = ["fg_mz_" + str(i) for i in range(cfg.fg_num)]
    mzs_fg = df_batch[cols_center].values
    mzs_pred = np.concatenate([mzs_pr, mzs_pr, mzs_fg], axis=1)
    ppm = 1e6 * np.abs(mzs_pred - mzs) / (mzs_pred + 1e-7)
    ppm[ppm > cfg.tol_ppm] = cfg.tol_ppm

    # 1. ppm for every ion
    columns = ["score_ppm_" + str(i) for i in range(ppm.shape[-1])]
    df_batch[columns] = ppm

    # 2. mean: zero the slots beyond each row's fg_num, then average
    ppm_ms2 = ppm[:, 2:]
    ppm_ms2[fg_num[:, None] <= np.arange(ppm_ms2.shape[1])] = 0
    df_batch["score_ppm_average1"] = ppm_ms2.sum(axis=1) / fg_num

    # 3. mean weighting by sa
    cols = ["score_center_elution_" + str(i) for i in range(14)]
    elutions = df_batch[cols].values + 1e-7
    df_batch["score_ppm_average2"] = np.average(
        ppm_ms2, weights=elutions[:, 2:], axis=1
    )

    # 4. mean of top-6 weighting by sa
    df_batch["score_ppm_average3"] = np.average(
        ppm_ms2[:, :6], weights=elutions[:, 2:8], axis=1
    )

    return df_batch
@profile
def scoring_meta(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate peptide meta information related scores:

    1. ``score_pr_charge_{1..4}``: one-hot encoding of ``pr_charge``
    2. ``score_pr_len``: length of ``simple_seq``
    3. ``score_fg_num``: copy of ``fg_num``
    4. ``score_lib_height_{i}``: library fragment heights
       ``fg_height_1 .. fg_height_{cfg.fg_num - 1}``

    Returns a new DataFrame; the one-hot columns are aligned to ``df`` by
    position, whatever index ``df`` carries.
    """
    c = pd.Categorical(df["pr_charge"], categories=[1, 2, 3, 4])
    pr_charges = pd.get_dummies(c, prefix="score_pr_charge").astype(np.int8)
    # get_dummies on a Categorical yields a fresh 0..n-1 index, while concat
    # aligns on labels: adopt df's index so rows stay paired positionally.
    pr_charges.index = df.index
    df = pd.concat([df, pr_charges], axis=1)

    df["score_pr_len"] = df["simple_seq"].str.len().astype(np.int8)
    df["score_fg_num"] = df["fg_num"].astype(np.int8)

    # frag info: height (fg_height_0 is not included)
    cols_height = ["fg_height_" + str(i) for i in range(1, cfg.fg_num)]
    height = df[cols_height].values  # [k, m]
    columns = ["score_lib_height_" + str(i) for i in range(height.shape[-1])]
    df[columns] = height

    return df
@jit(nopython=True, nogil=True)
def numba_scoring_putatives(groups, sa_v, center_v, big_v):
    """
    Use Numba to accelerate the computation of the maximum and sum scores
    across different candidate peak groups for the same precursor.

    Rows belonging to one group must be adjacent in ``groups``: a new group
    starts wherever the value differs from the previous row. Every row
    receives the max and the sum of its own group.

    Returns
    -------
    tuple of six arrays, each of length ``len(groups)`` and dtype
    ``sa_v.dtype``: (sa_max, center_max, big_max, sa_sum, center_sum,
    big_sum). All six are empty when ``groups`` is empty.
    """
    n = len(groups)
    result_sa_max = np.empty(n, dtype=sa_v.dtype)
    result_sa_sum = np.empty(n, dtype=sa_v.dtype)
    result_center_max = np.empty(n, dtype=sa_v.dtype)
    result_center_sum = np.empty(n, dtype=sa_v.dtype)
    result_big_max = np.empty(n, dtype=sa_v.dtype)
    result_big_sum = np.empty(n, dtype=sa_v.dtype)

    # Nothing to aggregate; also avoids reading element 0 of empty arrays.
    if n == 0:
        return (
            result_sa_max,
            result_center_max,
            result_big_max,
            result_sa_sum,
            result_center_sum,
            result_big_sum,
        )

    current_group = groups[0]
    sa_max = sa_v[0]
    sa_sum = sa_v[0]
    center_max = center_v[0]
    center_sum = center_v[0]
    big_max = big_v[0]
    big_sum = big_v[0]
    start_idx = 0

    for i in range(1, n):
        if groups[i] != current_group:
            # Group finished: broadcast its aggregates to all of its rows.
            for j in range(start_idx, i):
                result_sa_max[j] = sa_max
                result_center_max[j] = center_max
                result_big_max[j] = big_max
                result_sa_sum[j] = sa_sum
                result_center_sum[j] = center_sum
                result_big_sum[j] = big_sum

            # Start accumulating the next group from row i.
            current_group = groups[i]
            sa_max = sa_v[i]
            sa_sum = sa_v[i]
            center_max = center_v[i]
            center_sum = center_v[i]
            big_max = big_v[i]
            big_sum = big_v[i]
            start_idx = i
        else:
            sa_max = max(sa_max, sa_v[i])
            center_max = max(center_max, center_v[i])
            big_max = max(big_max, big_v[i])
            sa_sum += sa_v[i]
            center_sum += center_v[i]
            big_sum += big_v[i]

    # Flush the last group.
    for j in range(start_idx, n):
        result_sa_max[j] = sa_max
        result_center_max[j] = center_max
        result_big_max[j] = big_max
        result_sa_sum[j] = sa_sum
        result_center_sum[j] = center_sum
        result_big_sum[j] = big_sum

    return (
        result_sa_max,
        result_center_max,
        result_big_max,
        result_sa_sum,
        result_center_sum,
        result_big_sum,
    )
@profile
def scoring_putatives(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate competition-related scores as a pr has multiple candidate
    elution groups. For each of ``score_center_coelution``,
    ``score_center_deep_pre`` and ``score_big_deep_pre``:

    1. ``<name>_putative1``: score-i - score-max of the group
    2. ``<name>_putative2``: np.log(score-i + a) / (score.sum + a)

    Groups for max/sum follow ``pr_index``; ``group_rank`` is computed by
    ``utils.cal_group_rank`` from group sizes taken over ``pr_id``.
    ``df`` is modified in place and returned.
    """
    eps = 1e-7

    group_keys = df["pr_index"].values
    coelution = df["score_center_coelution"].values
    deep_center = df["score_center_deep_pre"].values
    deep_big = df["score_big_deep_pre"].values

    (
        coelution_max,
        deep_center_max,
        deep_big_max,
        coelution_sum,
        deep_center_sum,
        deep_big_sum,
    ) = numba_scoring_putatives(group_keys, coelution, deep_center, deep_big)

    # NOTE(review): putative2 is log(score + eps) divided by the group sum,
    # not log(score / sum) -- confirm which one is intended.
    per_score = (
        ("score_center_coelution", coelution, coelution_max, coelution_sum),
        ("score_center_deep_pre", deep_center, deep_center_max, deep_center_sum),
        ("score_big_deep_pre", deep_big, deep_big_max, deep_big_sum),
    )
    for name, values, group_max, group_sum in per_score:
        df[f"{name}_putative1"] = values - group_max
        df[f"{name}_putative2"] = np.log(values + eps) / (group_sum + eps)

    # rank
    sizes = df.groupby("pr_id", sort=False).size()
    boundaries = np.concatenate([[0], np.cumsum(sizes)])
    df["group_rank"] = utils.cal_group_rank(
        df["score_big_deep_pre"].values, boundaries
    )

    return df
def scoring_by_cross(df_batch: pd.DataFrame, is_update: bool = False) -> pd.DataFrame:
    """
    Compute scores combinations as additional scores.

    Before refine phase (is_update: False), from the "pre" deep scores:
        1. sa_center - sa_left
        2. deep_center - deep_left
        3. sa_center * deep_center
        4. sa_center * deep_big

    After refine phase (is_update: True), from the "refine" deep scores
    (columns carry a "_refine" suffix):
        1. deep_center - deep_left
        2. sa_center * deep_center
        3. sa_center * deep_big

    ``df_batch`` is modified in place and returned.
    """
    # The center coelution is shared by both phases.
    coelution_center = df_batch["score_center_coelution"].values

    if is_update:
        # refine model + non-ppm
        refined_center = df_batch["score_center_deep_refine"].values
        refined_left = df_batch["score_left_deep_refine"].values
        refined_big = df_batch["score_big_deep_refine"].values

        df_batch["score_deep_center_sub_left_refine"] = refined_center - refined_left
        df_batch["score_coelution_x_center_refine"] = coelution_center * refined_center
        df_batch["score_coelution_x_big_refine"] = coelution_center * refined_big
        return df_batch

    # raw model + non-ppm
    coelution_left = df_batch["score_left_coelution"].values
    pre_center = df_batch["score_center_deep_pre"].values
    pre_left = df_batch["score_left_deep_pre"].values
    pre_big = df_batch["score_big_deep_pre"].values

    df_batch["score_coelution_center_sub_left"] = coelution_center - coelution_left
    df_batch["score_deep_center_sub_left"] = pre_center - pre_left
    df_batch["score_coelution_x_center"] = coelution_center * pre_center
    df_batch["score_coelution_x_big"] = coelution_center * pre_big
    return df_batch
def update_scores(
    df: pd.DataFrame,
    ms: tims.Tims,
    model_center: torch.nn.Module,
    model_big: torch.nn.Module,
    model_mall: torch.nn.Module,
) -> pd.DataFrame:
    """
    Calculate scores using the refined DeepProfile and the trained DeepMall.

    1. DeepProfile: refined deep prob scores
    2. DeepProfile: cross scores with refined deep prob scores
    3. DeepProfile: refined deep prob and layer scores with 0.5 * ppm
    4. DeepProfile: refined deep prob and layer scores with 0.25 * ppm
    5. DeepMall: deep prob and layer scores

    The input is processed swath by swath in batches of
    ``cfg.batch_deep_big`` rows. The GPU maps copied for a swath are released
    in a ``finally`` clause, so they are freed even when a batch raises.
    After concatenation ``utils.cal_acc_recall`` is called on the rows with
    ``decoy == 0``.

    Returns
    -------
    df : pd.DataFrame
        Concatenation of all scored batches with a fresh 0..n-1 index.
    """
    df_good = []
    for swath_id in df["swath_id"].unique():
        df_swath = df[df["swath_id"] == swath_id]
        df_swath = df_swath.reset_index(drop=True)

        # map_gpu
        ms1_profile, ms2_profile = ms.copy_map_to_gpu(swath_id, centroid=False)
        ms1_centroid, ms2_centroid = ms.copy_map_to_gpu(swath_id, centroid=True)

        try:
            batch_n = cfg.batch_deep_big
            for _, df_batch in df_swath.groupby(df_swath.index // batch_n):
                df_batch = df_batch.reset_index(drop=True)

                # deepmap-refined scores without feature
                scores_deep_v, _ = deepmap.extract_scoring_big(
                    model_center,
                    model_big,
                    df_batch,
                    ms1_profile,
                    ms2_profile,
                    cfg.map_cycle_dim,
                    cfg.map_im_gap,
                    cfg.map_im_dim,
                    cfg.tol_ppm,
                    cfg.tol_im_map,
                )
                df_batch = scoring_by_deep_prob(df_batch, scores_deep_v, x="refine")
                df_batch = scoring_by_cross(df_batch, is_update=True)

                # 0.5*ppm, then 0.25*ppm: prob scores plus feature layers
                for ppm_scale, tag in ((0.5, "refine_p1"), (0.25, "refine_p2")):
                    scores_deep_v, features_deep_v = deepmap.extract_scoring_big(
                        model_center,
                        model_big,
                        df_batch,
                        ms1_profile,
                        ms2_profile,
                        cfg.map_cycle_dim,
                        cfg.map_im_gap,
                        cfg.map_im_dim,
                        cfg.tol_ppm * ppm_scale,
                        cfg.tol_im_map,
                    )
                    df_batch = scoring_by_deep_prob(df_batch, scores_deep_v, x=tag)
                    df_batch = scoring_by_deep_layer(df_batch, features_deep_v, x=tag)

                # deepmall
                scores_mall, features_mall = deepmall.scoring_mall(
                    model_mall,
                    df_batch,
                    ms1_centroid,
                    ms2_centroid,
                    cfg.tol_im_xic,
                    cfg.tol_ppm,
                )
                df_batch["score_mall"] = scores_mall
                m = features_mall.shape[-1]
                columns = ["score_ft_mall_" + str(i) for i in range(m)]
                df_batch[columns] = features_mall

                df_good.append(df_batch)
        finally:
            # Always hand the per-swath GPU maps back, even on failure.
            utils.release_gpu_scans(
                ms1_profile, ms2_profile, ms1_centroid, ms2_centroid
            )

    df = pd.concat(df_good, axis=0, ignore_index=True)
    utils.cal_acc_recall(cfg.ws_single, df[df["decoy"] == 0], diann_q_pr=0.01)

    return df