Source code for full_dia.search

import warnings

import numpy as np
import pandas as pd
import torch
from numba.core.errors import NumbaPerformanceWarning

from full_dia import (
    calib,
    cfg,
    cross,
    deepmap,
    fdr,
    fxic,
    library,
    polish,
    quant,
    scoring,
    tims,
    utils,
)
from full_dia.decoy import cal_fg_mz_iso, make_decoys
from full_dia.log import Logger
from full_dia.refine import refine_models
from full_dia.tims import load_ms
from full_dia.utils import init_single_ws

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=NumbaPerformanceWarning)
np.set_printoptions(suppress=True)

try:
    _ = profile
except NameError:

[docs] def profile(func): return func
logger = Logger.get_logger()
[docs] @profile def seek_seed( df_target: pd.DataFrame, ms: tims.Tims, model_center: torch.nn.Module ) -> pd.DataFrame: """ Seek the best elution group for each pr using SA scoring methods. Then, model_center scores the elution group. Obviously, these elution groups may contain many false positives, but they can be used as seeds for calibration. Parameters ---------- df_target : pd.DataFrame The identification object from library. ms : tims.Tims MS data. model_center : torch.nn.Module DeepProfile will score the coelution consistency of the elution group. Returns ------- df : pd.DataFrame One precursor will have one elution group. """ df_good = [] swath_id_v = np.sort(df_target["swath_id"].unique()) for swath_id in swath_id_v: df_swath = df_target[df_target["swath_id"] == swath_id] df_swath = df_swath.reset_index(drop=True) if swath_id in swath_id_v[[0, int(len(swath_id_v) / 2), -1]]: info = "Seek-Seed, swath_id: {}, target num: {}".format( swath_id, len(df_swath) ) logger.info(info) # map_gpu ms1_profile, ms2_profile = ms.copy_map_to_gpu(swath_id, centroid=False) ms1_centroid, ms2_centroid = ms.copy_map_to_gpu(swath_id, centroid=True) batch_n = cfg.batch_xic_seed # by coelution scores using sa func for _, df_batch in df_swath.groupby(df_swath.index // batch_n): df_batch = df_batch.reset_index(drop=True) # [k, ions_num, n] locus, rts, xics = fxic.extract_xics( df_batch, ms1_centroid, ms2_centroid, cfg.tol_ppm, cfg.tol_im_xic, rt_tolerance=None, only_xic=True, ) xics = fxic.gpu_simple_smooth(xics) scores_sa, scores_sa_m = fxic.cal_coelution_by_gaussion( xics, cfg.window_points, df_batch.fg_num.values + 2 ) del xics scores_sa, idx = torch.topk(scores_sa, 1, dim=1, sorted=False) idx_x = np.arange(len(locus)) locus = locus[idx_x, idx.cpu().flatten()] df_batch["locus"] = locus df_batch["measure_rt"] = rts[idx_x, locus] locus_sas = scores_sa_m[idx_x, :, locus].cpu().numpy() # re-extract cycle_num = 3 _, _, ims, mzs, _ = fxic.extract_xics( df_batch, ms1_centroid, ms2_centroid, cfg.tol_ppm, cfg.tol_im_xic, cycle_num=cycle_num, only_xic=False, ) locus_ims = ims[:, :, int(cycle_num / 2)] df_batch["measure_pr_mz"] = mzs[:, 0, int(cycle_num / 2)] # measure_im for deep measure_im = fxic.cal_measure_im(locus_ims, locus_sas) df_batch["measure_im"] = measure_im scores_deep, features = deepmap.scoring_maps( model_center, df_batch, ms1_profile, ms2_profile, cfg.map_cycle_dim, cfg.map_im_gap, cfg.map_im_dim, cfg.tol_ppm, cfg.tol_im_map, neutron_num=0, return_feature=False, ) scores_sa = scores_sa.cpu().numpy().flatten() scores_deep = scores_deep.cpu().numpy().flatten() df_batch["score_deep"] = scores_sa * scores_deep df_good.append(df_batch) utils.release_gpu_scans(ms1_profile, ms2_profile, ms1_centroid, ms2_centroid) df = pd.concat(df_good, axis=0, ignore_index=True) return df
[docs] def cal_recall_seek_seed(df_lib, ms, model_center): """ For developing. """ if not cfg.is_compare_mode: return # seek_seed is different with seek_locus df_diann = pd.read_csv(cfg.ws_single / "diann" / "report.tsv", sep="\t") df_diann = df_diann[df_diann["Q.Value"] < 0.01] df_diann["pr_id"] = df_diann["Modified.Sequence"] + df_diann[ "Precursor.Charge" ].astype(str) df_diann = df_diann[["pr_id", "RT", "IM"]] if "diann_rt" not in df_lib.columns: df_diann["diann_rt"] = df_diann["RT"] * 60.0 df_diann = pd.merge(df_lib, df_diann, on="pr_id") df_diann = df_diann.reset_index(drop=True) del df_diann["RT"] rts = ms.get_scan_rts() locus = np.argmin(np.abs(df_diann["diann_rt"].values[:, None] - rts), axis=1) df_diann["locus_diann"] = locus # df_diann = df_diann[df_diann.pr_id == 'VSNSGITR2'].reset_index(drop=True) df_good = [] for swath_id in df_diann["swath_id"].unique(): df_swath = df_diann[df_diann["swath_id"] == swath_id] df_swath = df_swath.reset_index(drop=True) # map_gpu ms1_profile, ms2_profile = ms.copy_map_to_gpu(swath_id, centroid=False) ms1_centroid, ms2_centroid = ms.copy_map_to_gpu(swath_id, centroid=True) all_rts = ms1_profile["scan_rts"] batch_n = cfg.batch_xic_seed # by coelution scores using sa func for _, df_batch in df_swath.groupby(df_swath.index // batch_n): df_batch = df_batch.reset_index(drop=True) ions_num = df_batch.fg_num.values + 2 # [k, ions_num, n] locus, _, ims, _, xics = fxic.extract_xics( df_batch, ms1_centroid, ms2_centroid, cfg.tol_ppm, cfg.tol_im_xic, rt_tolerance=None, ) # sa xics = fxic.gpu_simple_smooth(xics) sa_gausion, sa_gausion_m = fxic.cal_coelution_by_gaussion( xics, cfg.window_points, ions_num ) xic_dp = xics.shape[-1] del xics # top-locus scores_sa, idx = torch.topk(sa_gausion, 1, dim=1, sorted=False) idx_x = np.arange(len(locus)) locus = locus[idx_x, idx.cpu().flatten()] rts = all_rts[locus] bias = np.abs(rts - df_batch.diann_rt.values) df_batch["bias_coelution"] = bias # locus -- measure_im -- deep locus_ims = ims[idx_x, :, locus] locus_sas = sa_gausion_m[idx_x, :, locus].cpu().numpy() measure_ims = fxic.cal_measure_im(locus_ims, locus_sas) df_batch["locus"] = locus df_batch["measure_im"] = measure_ims scores_deep, features = deepmap.scoring_maps( model_center, df_batch, ms1_profile, ms2_profile, cfg.map_cycle_dim, cfg.map_im_gap, cfg.map_im_dim, cfg.tol_ppm, cfg.tol_im_map, neutron_num=0, return_feature=False, ) scores_deep = scores_deep.cpu().numpy() df_batch["score_deep"] = scores_deep.max(axis=1) df_batch["measure_rt"] = 0 df_good.append(df_batch) utils.release_gpu_scans(ms1_profile, ms2_profile, ms1_centroid, ms2_centroid) df = pd.concat(df_good, axis=0, ignore_index=True) logger.info("data points num of xics: {}".format(xic_dp)) df1 = df[df["bias_coelution"] < cfg.locus_rt_thre] df1 = df1.reset_index(drop=True) logger.info("In seek_seed, Recall after top-1: ") utils.cal_acc_recall(cfg.ws_single, df1, diann_q_pr=0.01)
[docs] @profile def seek_locus( df_target: pd.DataFrame, ms: tims.Tims, model_center: torch.nn.Module, top_sa_q: float, top_deep_q: float, ) -> pd.DataFrame: """ Seek candidate elution groups (locus) by: 1) scree with sa 2) screen with deep. Parameters ---------- df_target : pd.DataFrame Provide the precursor information. ms : tims.Tims MS data. model_center : torch.nn.Module DeepProfile-14, used to score the elution consistency of monoisotope ions. top_sa_q : float First, candidate locus should have good SA scores compared to the best elution group. top_deep_q : float Second, candidate locus should have good deep scores compared to the best elution group. Returns ------- df : pd.DataFrame Each row is a candidate elution group. """ df_good = [] for swath_id in df_target["swath_id"].unique(): df_swath = df_target[df_target["swath_id"] == swath_id] df_swath = df_swath.reset_index(drop=True) if swath_id % 5 == 1: info = "Seek-Locus, swath_id: {}, target num: {}".format( swath_id, len(df_swath) ) # logger.info(info) # map_gpu ms1_profile, ms2_profile = ms.copy_map_to_gpu(swath_id, centroid=False) ms1_centroid, ms2_centroid = ms.copy_map_to_gpu(swath_id, centroid=True) all_rts = ms1_profile["scan_rts"] batch_n = cfg.batch_xic_locus # by coelution scores using sa func for _, df_batch in df_swath.groupby(df_swath.index // batch_n): df_batch = df_batch.reset_index(drop=True) # [k, ions_num, n] locus, rts, xics = fxic.extract_xics( df_batch, ms1_centroid, ms2_centroid, cfg.tol_ppm, cfg.tol_im_xic, cfg.tol_rt, only_xic=True, ) xics = fxic.gpu_simple_smooth(xics) scores_sa, scores_sa_m = fxic.cal_coelution_by_gaussion( xics, cfg.window_points, df_batch.fg_num.values + 2 ) scores_sa = fxic.screen_locus_by_sa(scores_sa, top_sa_q) if scores_sa.max() == 0: continue locus_v, locus_num, scores_sa_v, locus_sas = fxic.concat_nonzero_locus( locus, scores_sa, scores_sa_m ) # a row is a locus, locus_num with zeros will not be indexed df_batch = df_batch.loc[np.repeat(df_batch.index, locus_num)] df_batch = df_batch.reset_index(drop=True) df_batch["locus"] = locus_v df_batch["measure_rt"] = all_rts[locus_v] # re-extract cycle_num = 3 _, _, ims, mzs, _ = fxic.extract_xics( df_batch, ms1_centroid, ms2_centroid, cfg.tol_ppm, cfg.tol_im_xic, cycle_num=cycle_num, only_xic=False, ) locus_ims = ims[:, :, int(cycle_num / 2)] measure_ims = fxic.cal_measure_im(locus_ims, locus_sas) df_batch["measure_im"] = measure_ims # deep scores_deep, features = deepmap.scoring_maps( model_center, df_batch, ms1_profile, ms2_profile, cfg.map_cycle_dim, cfg.map_im_gap, cfg.map_im_dim, cfg.tol_ppm, cfg.tol_im_map, neutron_num=0, return_feature=False, ) scores_deep = scores_deep.cpu().numpy().flatten() df_batch["seek_score_deep"] = scores_deep df_batch["seek_score_sa"] = scores_sa_v df_batch["seek_score_sa_x_deep"] = scores_deep * scores_sa_v locus_num = locus_num[locus_num > 0] df_batch = fxic.screen_locus_by_deep(df_batch, locus_num, top_deep_q) df_good.append(df_batch) utils.release_gpu_scans(ms1_profile, ms2_profile, ms1_centroid, ms2_centroid) df = pd.concat(df_good, axis=0, ignore_index=True) info = "Seek locus: {} prs have {} candidate locus".format( df.pr_id.nunique(), len(df) ) logger.info(info) utils.cal_acc_recall(cfg.ws_single, df[df.decoy == 0], diann_q_pr=0.01) return df
[docs] def cal_recall_seek_locus(df_lib, ms, model, tol_rt, top_sa_cut, top_deep_cut): """ For developing. """ if not cfg.is_compare_mode: return df_diann = pd.read_csv(cfg.ws_single / "diann" / "report.tsv", sep="\t") df_diann = df_diann[df_diann["Q.Value"] < 0.01] df_diann["pr_id"] = df_diann["Modified.Sequence"] + df_diann[ "Precursor.Charge" ].astype(str) df_diann = df_diann[["pr_id", "RT"]] if "diann_rt" not in df_lib.columns: df_diann["diann_rt"] = df_diann["RT"] * 60.0 df_diann = pd.merge(df_lib, df_diann, on="pr_id") df_diann = df_diann.reset_index(drop=True) del df_diann["RT"] rts = ms.get_scan_rts() locus = np.argmin(np.abs(df_diann["diann_rt"].values[:, None] - rts), axis=1) df_diann["diann_locus"] = locus # df_diann = df_diann[df_diann.pr_id == 'ALNIETAIK3'].reset_index(drop=True) df_good, locus_num_v = [], [] for swath_id in df_diann["swath_id"].unique(): df_swath = df_diann[df_diann["swath_id"] == swath_id] df_swath = df_swath.reset_index(drop=True) # map_gpu ms1_profile, ms2_profile = ms.copy_map_to_gpu(swath_id, centroid=False) ms1_centroid, ms2_centroid = ms.copy_map_to_gpu(swath_id, centroid=True) all_rts = ms1_profile["scan_rts"] # by coelution scores using sa func batch_n = cfg.batch_xic_locus for _, df_batch in df_swath.groupby(df_swath.index // batch_n): df_batch = df_batch.reset_index(drop=True) # extract xics of 13 cycles df_batch["locus"] = df_batch["diann_locus"] locus, _, xics = fxic.extract_xics( df_batch, ms1_centroid, ms2_centroid, cfg.tol_ppm, cfg.tol_im_xic, cycle_num=13, only_xic=True, ) xics = fxic.gpu_simple_smooth(xics) _, sa_m = fxic.cal_coelution_by_gaussion( xics, cfg.window_points, df_batch.fg_num.values + 2 ) xics = utils.convert_numba_to_tensor(xics) locus_start_v, locus_end_v = fxic.estimate_xic_boundary(xics, sa_m[:, :, 6]) df_batch["locus_start"] = locus_start_v df_batch["locus_end"] = locus_end_v df_batch["locus_span"] = locus_end_v - locus_start_v # [k, ions_num, n] locus, _, ims, _, xics_rt = fxic.extract_xics( df_batch, ms1_centroid, ms2_centroid, cfg.tol_ppm, cfg.tol_im_xic, tol_rt, ) xics_rt = fxic.gpu_simple_smooth(xics_rt) scores_sa, scores_sa_m = fxic.cal_coelution_by_gaussion( xics_rt, cfg.window_points, df_batch.fg_num.values + 2 ) scores_sa = fxic.screen_locus_by_sa(scores_sa, top_sa_cut) locus_v, locus_num, locus_sa_v, locus_sas = fxic.concat_nonzero_locus( locus, scores_sa, scores_sa_m ) # a row is a locus, locus_num with zeros will not be indexed df_batch = df_batch.loc[np.repeat(df_batch.index, locus_num)] df_batch = df_batch.reset_index(drop=True) df_batch["locus"] = locus_v df_batch["score_sa"] = locus_sa_v df_batch["measure_rt"] = all_rts[locus_v] locus_num_v.append(locus_num) # re-extract cycle_num = 3 _, _, ims, mzs, _ = fxic.extract_xics( df_batch, ms1_centroid, ms2_centroid, cfg.tol_ppm, cfg.tol_im_xic, cycle_num=cycle_num, only_xic=False, ) locus_ims = ims[:, :, int(cycle_num / 2)] measure_ims = fxic.cal_measure_im(locus_ims, locus_sas) df_batch["measure_im"] = measure_ims scores_deep, features = deepmap.scoring_maps( model, df_batch, ms1_profile, ms2_profile, cfg.map_cycle_dim, cfg.map_im_gap, cfg.map_im_dim, cfg.tol_ppm, cfg.tol_im_map, neutron_num=0, return_feature=False, ) scores_deep = scores_deep.cpu().numpy().flatten() df_batch["seek_score_deep"] = scores_deep df_batch["seek_score_sa"] = locus_sa_v df_batch["seek_score_sa_x_deep"] = locus_sa_v * scores_deep df_good.append(df_batch) utils.release_gpu_scans(ms1_profile, ms2_profile, ms1_centroid, ms2_centroid) df = pd.concat(df_good, axis=0, ignore_index=True) logger.info("data points num of xics: {}".format(xics_rt.shape[2])) # recall after sa info = "Seek-Locus-by-sa-{}: ≈{:.2f}locus/pr, recall: ".format( top_sa_cut, len(df) / len(df_diann) ) logger.info(info) utils.cal_acc_recall(cfg.ws_single, df, diann_q_pr=0.01) # recall after sa + deep locus_num = np.concatenate(locus_num_v) locus_num = locus_num[locus_num > 0] assert np.all(df.groupby(by="pr_id", sort=False).size() == locus_num) df = fxic.screen_locus_by_deep(df, locus_num, top_deep_cut) info = "Seek-Locus-by-sa-{}-deep-{}: ≈{:.2f}locus/pr, recall: ".format( top_sa_cut, top_deep_cut, len(df) / len(df_diann) ) logger.info(info) utils.cal_acc_recall(cfg.ws_single, df, diann_q_pr=0.01) df = df.drop_duplicates(subset="pr_id", ignore_index=True) locus_span = df["locus_span"].median() info = "locus_span: {}".format(int(locus_span)) logger.info(info)
[docs] def update_tolerance( df_lib: pd.DataFrame, ms: tims.Tims, model_center: torch.nn.Module, model_big: torch.nn.Module, sample_ratio: float, ) -> None: """ Update the tolerance based on identifications of a subset of target peptides. Parameters ---------- df_lib : pd.DataFrame The raw library. ms : tims.Tims MS data. model_center : torch.nn.Module DeepProfile-14, used to score the elution consistency of monoisotopes. model_big : torch.nn.Module DeepProfile-56, used to score the elution consistency of monoisotopes + isotopes. sample_ratio : float Sample the subset of library to expedite the update. Returns ------- None. The global tolerance values will be updated: cfg.tol_rt, cfg.tol_im_xic, cfg.tol_ppm """ target_num = int(min(cfg.target_batch_max, sample_ratio * len(df_lib))) logger.info(f"Estimate the tolerance using {target_num} prs ...") df = df_lib.sample(n=target_num, random_state=42, replace=False) df = df.reset_index(drop=True) # seek for targets and decoys df1 = make_decoys(df, cfg.fg_num, method="mutate") df = pd.concat([df, df1]).reset_index(drop=True) df = seek_locus(df, ms, model_center, cfg.top_sa_cut, cfg.top_deep_cut) df = cal_fg_mz_iso(df) df = scoring.score_locus(df, ms, model_center, model_big) df, model_nn, scaler = fdr.cal_q_pr_batch(df, 50, 12) df = df[df["decoy"] == 0] for q_cut in [0.01, 0.02, 0.03, 0.04, 0.05]: ids = sum(df["q_pr_run"] < q_cut) if ids >= 50: break df = df[df["q_pr_run"] < q_cut].reset_index(drop=True) logger.info(f"Estimate the tolerance using {len(df)} prs at {q_cut:.2f} FDR:") if len(df) > 50: # estimate tol x = df["score_rt_abs"].values cut1 = x.mean() + 3.0 * x.std() cut2 = np.percentile(x, 95) tol_rt = max(cut1, cut2) cycle_time = ms.get_cycle_time() cycle_num = int(tol_rt * 2 / cycle_time) if cycle_num < 23: tol_rt = 23 * cycle_time / 2 x = df["score_imbias_0"].values x = x[(x > x.min()) & (x < x.max())] cut1 = x.mean() + 3.0 * x.std() cut2 = np.percentile(x, 95) tol_im = max(cut1, cut2) x = df["score_ppm_0"].values x = x[(x > x.min()) & (x < x.max())] cut1 = x.mean() + 3.0 * x.std() cut2 = np.percentile(x, 95) tol_ppm = max(cut1, cut2) cfg.tol_rt = tol_rt cfg.tol_im_xic = tol_im cfg.tol_ppm = tol_ppm else: cfg.tol_im_xic = cfg.tol_im_xic_after_calib info = "tol_rt: {:.2f}, tol_im: {:.3f}, tol_ppm: {:.2f}".format( cfg.tol_rt, cfg.tol_im_xic, cfg.tol_ppm ) logger.info(info)
[docs] def select_required_and_all_targets(df: pd.DataFrame) -> tuple: """ Select good target and decoy peps for FDR calculation. Select all target to save, which avoids the second extraction in global analysis. Parameters ---------- df : pd.DataFrame The identification result for one batch. Returns ------- tuple df_main : pd.DataFrame Good target and decoy peps. df_other : pd.DataFrame All target peps. """ df_main = df[df["q_pr_run"] < cfg.rubbish_q_cut].copy() cols_base = [ "pr_id", "pr_charge", "pr_index", "swath_id", "decoy", "locus", "measure_rt", "measure_im", ] cols_base += ["pr_mz", "score_elute_span_left", "score_elute_span_right"] cols_base += ["fg_mz_" + str(i) for i in range(cfg.fg_num)] df_other = df.loc[df["decoy"] == 0, cols_base].copy() return df_main, df_other
[docs] def search_core(lib: library.Library) -> None: """ Search on run level: 1. Seek seeds for calibration. 2. Seek candidate elution groups (locus) for each precursor. 3. Score the elution groups. 4. Calculate the FDR on run level. 5. Save all target precursor results and high-quality decoy precursor results. """ for ws_i, ws_single in enumerate(cfg.multi_ws): # ws_single init_single_ws(ws_i, cfg.file_num, ws_single) run_finished = (cfg.dir_out_global / (ws_single.name + ".parquet")).exists() if run_finished and not cfg.is_overwrite: continue ms = load_ms(ws_single) # utils.get_diann_info(ws_single) # set tol_rt cfg.tol_rt = ms.get_scan_rts()[-1] * cfg.tol_rt_ratio cycle_time = np.diff(ms.get_scan_rts()).mean() cfg.locus_rt_thre = cycle_time * cfg.locus_valid_num # polish lib df_lib = lib.polish_lib_by_swath( ms.get_dia_quadrupole(), # ws_diann= cfg.ws_single # for debug ) # load the pretrained models model_center, model_big = deepmap.load_models() # seek seed throughout the LC time cal_recall_seek_seed(df_lib, ms, model_center) df_seed = seek_seed(df_lib, ms, model_center) utils.save_as_pkl(df_seed, "df_seed.pkl") # df_seed = pd.read_pickle(cfg.dir_out_single / 'df_seed.pkl') # update tol df_seed, df_lib = calib.calib_rt(df_seed, df_lib) df_seed, df_lib = calib.calib_im(df_seed, df_lib) df_seed = calib.calib_mz(df_seed, ms) del df_seed update_tolerance(df_lib, ms, model_center, model_big, cfg.sample_ratio) # check params for seek_locus cal_recall_seek_locus( df_lib, ms, model_center, tol_rt=cfg.tol_rt, top_sa_cut=cfg.top_sa_cut, top_deep_cut=cfg.top_deep_cut, ) # batch division batch_num = int(np.ceil(len(df_lib) / cfg.target_batch_max)) rows_idx = np.array_split(df_lib.index.values, batch_num) df_main_v, df_other_v = [], [] for batch_idx in range(len(rows_idx)): if batch_idx % 3 == 0: logger.disabled = False info = "-------------Run-{}/{}-Batch-{}/{}-------------".format( ws_i + 1, cfg.file_num, batch_idx + 1, batch_num ) logger.info(info) else: logger.disabled = True df = df_lib.iloc[rows_idx[batch_idx]].reset_index(drop=True) # seek for targets and decoys df1 = make_decoys(df, cfg.fg_num, method="mutate") df = pd.concat([df, df1]).reset_index(drop=True) df = seek_locus(df, ms, model_center, cfg.top_sa_cut, cfg.top_deep_cut) # scoring by first round df = cal_fg_mz_iso(df) df = scoring.score_locus(df, ms, model_center, model_big) if batch_idx == 0: df, model_nn, scaler = fdr.cal_q_pr_batch(df, 50, 12) fdr.adjust_rubbish_q(df, batch_num) else: df, _, _ = fdr.cal_q_pr_batch(df, 50, 12, model_nn, scaler) df_main, df_other = select_required_and_all_targets(df) utils.print_ids(df_main, cfg.rubbish_q_cut, "pr", "run") df_main_v.append(df_main) df_other_v.append(df_other) df_main = pd.concat(df_main_v, axis=0, ignore_index=True) df_other = pd.concat(df_other_v, axis=0, ignore_index=True) logger.disabled = False logger.info("--------------------Concat batches:---------------------") df_main = cross.drop_batches_mismatch(df_main) df_main = fdr.cal_q_pr_combined(df_main, 50, 12) utils.print_ids(df_main, 1, "pr", "run") logger.info("--------------------------------------------------------") utils.save_as_pkl(df_main, "df_scores1.pkl") # df = pd.read_pickle(cfg.dir_out_single / 'df_scores1.pkl') # retrain models model_center, model_big, model_mall = refine_models( df_main, ms, model_center, model_big ) # scoring by second round df_main = scoring.update_scores( df_main, ms, model_center, model_big, model_mall ) utils.save_as_pkl(df_main, "df_scores2.pkl") # df_main = pd.read_pickle(cfg.dir_out_single / 'df_scores2.pkl') # scoring using DIA-NN sa and quant fg ions df_main = df_main[df_main["score_deep_center_sub_left_refine"] > 0].reset_index( drop=True ) logger.info( "Calculating coelution with the best-profile and quantifying ions for all prs..." ) df_main = quant.quant_center_ions(df_main, ms) df_other = quant.quant_center_ions(df_other, ms) utils.save_as_pkl(df_main, "df_main.pkl") utils.save_as_pkl(df_other, "df_other.pkl") # df_main = pd.read_pickle(cfg.dir_out_single / 'df_main.pkl') # df_other = pd.read_pickle(cfg.dir_out_single / 'df_other.pkl') # update FDR-Pr for second round df_main = fdr.cal_q_pr_combined(df_main, 50, 12) utils.print_ids(df_main, 1, "pr", "run") utils.save_as_pkl(df_main, "df_fdr1.pkl") # df_main = pd.read_pickle(cfg.dir_out_single / 'df_fdr1.pkl') # polish target prs and update FDR again df_main = polish.polish_prs(df_main) df_main = fdr.cal_q_pr_combined(df_main, 50, 12) df_main = polish.polish_prs(df_main) utils.print_ids(df_main, 1, "pr", "run") utils.save_as_pkl(df_main, "df_fdr2.pkl") # df_main = pd.read_pickle(cfg.dir_out_single / 'df_fdr2.pkl') # save result logger.info("Saving run-specific result as parquet...") utils.clean_and_save(df_main, df_other, ws_single) logger.info("Saving finished.") # release within loop del df_lib, df, ms cfg.tol_im_xic = cfg.tol_im_xic_before_calib cfg.tol_ppm = cfg.tol_ppm_before_calib