Source code for full_dia.search

import warnings

import numpy as np
import pandas as pd
import torch
from numba.core.errors import NumbaPerformanceWarning

from full_dia import (
    calib,
    cfg,
    cross,
    deepmap,
    fdr,
    fxic,
    library,
    polish,
    quant,
    scoring,
    tims,
    utils,
)
from full_dia.decoy import cal_fg_mz_iso, make_decoys
from full_dia.log import Logger
from full_dia.refine import refine_models
from full_dia.tims import load_ms
from full_dia.utils import init_single_ws

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=NumbaPerformanceWarning)
np.set_printoptions(suppress=True)

try:
    _ = profile
except NameError:


[docs]
    def profile(func):
        return func



logger = Logger.get_logger()



[docs]
@profile
def seek_seed(
    df_target: pd.DataFrame, ms: tims.Tims, model_center: torch.nn.Module
) -> pd.DataFrame:
    """
    Seek the best elution group for each pr using SA scoring methods.
    Then, model_center scores the elution group.
    Obviously, these elution groups may contain many false positives, but they can be used as seeds for calibration.

    Parameters
    ----------
    df_target : pd.DataFrame
        The identification object from library.

    ms : tims.Tims
        MS data.

    model_center : torch.nn.Module
        DeepProfile will score the coelution consistency of the elution group.

    Returns
    -------
    df : pd.DataFrame
        One precursor will have one elution group.
    """
    df_good = []
    swath_id_v = np.sort(df_target["swath_id"].unique())
    for swath_id in swath_id_v:
        df_swath = df_target[df_target["swath_id"] == swath_id]
        df_swath = df_swath.reset_index(drop=True)
        if swath_id in swath_id_v[[0, int(len(swath_id_v) / 2), -1]]:
            info = "Seek-Seed, swath_id: {}, target num: {}".format(
                swath_id, len(df_swath)
            )
            logger.info(info)

        # map_gpu
        ms1_profile, ms2_profile = ms.copy_map_to_gpu(swath_id, centroid=False)
        ms1_centroid, ms2_centroid = ms.copy_map_to_gpu(swath_id, centroid=True)
        batch_n = cfg.batch_xic_seed

        # by coelution scores using sa func
        for _, df_batch in df_swath.groupby(df_swath.index // batch_n):
            df_batch = df_batch.reset_index(drop=True)
            # [k, ions_num, n]
            locus, rts, xics = fxic.extract_xics(
                df_batch,
                ms1_centroid,
                ms2_centroid,
                cfg.tol_ppm,
                cfg.tol_im_xic,
                rt_tolerance=None,
                only_xic=True,
            )
            xics = fxic.gpu_simple_smooth(xics)
            scores_sa, scores_sa_m = fxic.cal_coelution_by_gaussion(
                xics, cfg.window_points, df_batch.fg_num.values + 2
            )
            del xics

            scores_sa, idx = torch.topk(scores_sa, 1, dim=1, sorted=False)
            idx_x = np.arange(len(locus))
            locus = locus[idx_x, idx.cpu().flatten()]
            df_batch["locus"] = locus
            df_batch["measure_rt"] = rts[idx_x, locus]
            locus_sas = scores_sa_m[idx_x, :, locus].cpu().numpy()

            # re-extract
            cycle_num = 3
            _, _, ims, mzs, _ = fxic.extract_xics(
                df_batch,
                ms1_centroid,
                ms2_centroid,
                cfg.tol_ppm,
                cfg.tol_im_xic,
                cycle_num=cycle_num,
                only_xic=False,
            )
            locus_ims = ims[:, :, int(cycle_num / 2)]
            df_batch["measure_pr_mz"] = mzs[:, 0, int(cycle_num / 2)]

            # measure_im for deep
            measure_im = fxic.cal_measure_im(locus_ims, locus_sas)
            df_batch["measure_im"] = measure_im
            scores_deep, features = deepmap.scoring_maps(
                model_center,
                df_batch,
                ms1_profile,
                ms2_profile,
                cfg.map_cycle_dim,
                cfg.map_im_gap,
                cfg.map_im_dim,
                cfg.tol_ppm,
                cfg.tol_im_map,
                neutron_num=0,
                return_feature=False,
            )
            scores_sa = scores_sa.cpu().numpy().flatten()
            scores_deep = scores_deep.cpu().numpy().flatten()
            df_batch["score_deep"] = scores_sa * scores_deep
            df_good.append(df_batch)

        utils.release_gpu_scans(ms1_profile, ms2_profile, ms1_centroid, ms2_centroid)

    df = pd.concat(df_good, axis=0, ignore_index=True)
    return df




[docs]
def cal_recall_seek_seed(df_lib, ms, model_center):
    """
    For developing.
    """
    if not cfg.is_compare_mode:
        return

    # seek_seed is different with seek_locus
    df_diann = pd.read_csv(cfg.ws_single / "diann" / "report.tsv", sep="\t")
    df_diann = df_diann[df_diann["Q.Value"] < 0.01]
    df_diann["pr_id"] = df_diann["Modified.Sequence"] + df_diann[
        "Precursor.Charge"
    ].astype(str)
    df_diann = df_diann[["pr_id", "RT", "IM"]]
    if "diann_rt" not in df_lib.columns:
        df_diann["diann_rt"] = df_diann["RT"] * 60.0

    df_diann = pd.merge(df_lib, df_diann, on="pr_id")
    df_diann = df_diann.reset_index(drop=True)
    del df_diann["RT"]

    rts = ms.get_scan_rts()
    locus = np.argmin(np.abs(df_diann["diann_rt"].values[:, None] - rts), axis=1)
    df_diann["locus_diann"] = locus
    # df_diann = df_diann[df_diann.pr_id == 'VSNSGITR2'].reset_index(drop=True)

    df_good = []
    for swath_id in df_diann["swath_id"].unique():
        df_swath = df_diann[df_diann["swath_id"] == swath_id]
        df_swath = df_swath.reset_index(drop=True)

        # map_gpu
        ms1_profile, ms2_profile = ms.copy_map_to_gpu(swath_id, centroid=False)
        ms1_centroid, ms2_centroid = ms.copy_map_to_gpu(swath_id, centroid=True)

        all_rts = ms1_profile["scan_rts"]
        batch_n = cfg.batch_xic_seed

        # by coelution scores using sa func
        for _, df_batch in df_swath.groupby(df_swath.index // batch_n):
            df_batch = df_batch.reset_index(drop=True)
            ions_num = df_batch.fg_num.values + 2
            # [k, ions_num, n]
            locus, _, ims, _, xics = fxic.extract_xics(
                df_batch,
                ms1_centroid,
                ms2_centroid,
                cfg.tol_ppm,
                cfg.tol_im_xic,
                rt_tolerance=None,
            )
            # sa
            xics = fxic.gpu_simple_smooth(xics)
            sa_gausion, sa_gausion_m = fxic.cal_coelution_by_gaussion(
                xics, cfg.window_points, ions_num
            )
            xic_dp = xics.shape[-1]
            del xics

            # top-locus
            scores_sa, idx = torch.topk(sa_gausion, 1, dim=1, sorted=False)
            idx_x = np.arange(len(locus))
            locus = locus[idx_x, idx.cpu().flatten()]
            rts = all_rts[locus]

            bias = np.abs(rts - df_batch.diann_rt.values)
            df_batch["bias_coelution"] = bias

            # locus -- measure_im -- deep
            locus_ims = ims[idx_x, :, locus]
            locus_sas = sa_gausion_m[idx_x, :, locus].cpu().numpy()
            measure_ims = fxic.cal_measure_im(locus_ims, locus_sas)
            df_batch["locus"] = locus
            df_batch["measure_im"] = measure_ims
            scores_deep, features = deepmap.scoring_maps(
                model_center,
                df_batch,
                ms1_profile,
                ms2_profile,
                cfg.map_cycle_dim,
                cfg.map_im_gap,
                cfg.map_im_dim,
                cfg.tol_ppm,
                cfg.tol_im_map,
                neutron_num=0,
                return_feature=False,
            )
            scores_deep = scores_deep.cpu().numpy()

            df_batch["score_deep"] = scores_deep.max(axis=1)
            df_batch["measure_rt"] = 0
            df_good.append(df_batch)

        utils.release_gpu_scans(ms1_profile, ms2_profile, ms1_centroid, ms2_centroid)

    df = pd.concat(df_good, axis=0, ignore_index=True)
    logger.info("data points num of xics: {}".format(xic_dp))

    df1 = df[df["bias_coelution"] < cfg.locus_rt_thre]
    df1 = df1.reset_index(drop=True)
    logger.info("In seek_seed, Recall after top-1: ")
    utils.cal_acc_recall(cfg.ws_single, df1, diann_q_pr=0.01)




[docs]
@profile
def seek_locus(
    df_target: pd.DataFrame,
    ms: tims.Tims,
    model_center: torch.nn.Module,
    top_sa_q: float,
    top_deep_q: float,
) -> pd.DataFrame:
    """
    Seek candidate elution groups (locus) by: 1) scree with sa 2) screen with deep.

    Parameters
    ----------
    df_target : pd.DataFrame
        Provide the precursor information.

    ms : tims.Tims
        MS data.

    model_center : torch.nn.Module
        DeepProfile-14, used to score the elution consistency of monoisotope ions.

    top_sa_q : float
        First, candidate locus should have good SA scores compared to the best elution group.

    top_deep_q : float
        Second, candidate locus should have good deep scores compared to the best elution group.

    Returns
    -------
    df : pd.DataFrame
        Each row is a candidate elution group.
    """
    df_good = []
    for swath_id in df_target["swath_id"].unique():
        df_swath = df_target[df_target["swath_id"] == swath_id]
        df_swath = df_swath.reset_index(drop=True)
        if swath_id % 5 == 1:
            info = "Seek-Locus, swath_id: {}, target num: {}".format(
                swath_id, len(df_swath)
            )
            # logger.info(info)

        # map_gpu
        ms1_profile, ms2_profile = ms.copy_map_to_gpu(swath_id, centroid=False)
        ms1_centroid, ms2_centroid = ms.copy_map_to_gpu(swath_id, centroid=True)
        all_rts = ms1_profile["scan_rts"]
        batch_n = cfg.batch_xic_locus

        # by coelution scores using sa func
        for _, df_batch in df_swath.groupby(df_swath.index // batch_n):
            df_batch = df_batch.reset_index(drop=True)
            # [k, ions_num, n]
            locus, rts, xics = fxic.extract_xics(
                df_batch,
                ms1_centroid,
                ms2_centroid,
                cfg.tol_ppm,
                cfg.tol_im_xic,
                cfg.tol_rt,
                only_xic=True,
            )
            xics = fxic.gpu_simple_smooth(xics)
            scores_sa, scores_sa_m = fxic.cal_coelution_by_gaussion(
                xics, cfg.window_points, df_batch.fg_num.values + 2
            )
            scores_sa = fxic.screen_locus_by_sa(scores_sa, top_sa_q)
            if scores_sa.max() == 0:
                continue
            locus_v, locus_num, scores_sa_v, locus_sas = fxic.concat_nonzero_locus(
                locus, scores_sa, scores_sa_m
            )

            # a row is a locus, locus_num with zeros will not be indexed
            df_batch = df_batch.loc[np.repeat(df_batch.index, locus_num)]
            df_batch = df_batch.reset_index(drop=True)
            df_batch["locus"] = locus_v
            df_batch["measure_rt"] = all_rts[locus_v]

            # re-extract
            cycle_num = 3
            _, _, ims, mzs, _ = fxic.extract_xics(
                df_batch,
                ms1_centroid,
                ms2_centroid,
                cfg.tol_ppm,
                cfg.tol_im_xic,
                cycle_num=cycle_num,
                only_xic=False,
            )
            locus_ims = ims[:, :, int(cycle_num / 2)]
            measure_ims = fxic.cal_measure_im(locus_ims, locus_sas)
            df_batch["measure_im"] = measure_ims

            # deep
            scores_deep, features = deepmap.scoring_maps(
                model_center,
                df_batch,
                ms1_profile,
                ms2_profile,
                cfg.map_cycle_dim,
                cfg.map_im_gap,
                cfg.map_im_dim,
                cfg.tol_ppm,
                cfg.tol_im_map,
                neutron_num=0,
                return_feature=False,
            )
            scores_deep = scores_deep.cpu().numpy().flatten()

            df_batch["seek_score_deep"] = scores_deep
            df_batch["seek_score_sa"] = scores_sa_v
            df_batch["seek_score_sa_x_deep"] = scores_deep * scores_sa_v

            locus_num = locus_num[locus_num > 0]
            df_batch = fxic.screen_locus_by_deep(df_batch, locus_num, top_deep_q)

            df_good.append(df_batch)

        utils.release_gpu_scans(ms1_profile, ms2_profile, ms1_centroid, ms2_centroid)

    df = pd.concat(df_good, axis=0, ignore_index=True)

    info = "Seek locus: {} prs have {} candidate locus".format(
        df.pr_id.nunique(), len(df)
    )
    logger.info(info)
    utils.cal_acc_recall(cfg.ws_single, df[df.decoy == 0], diann_q_pr=0.01)

    return df




[docs]
def cal_recall_seek_locus(df_lib, ms, model, tol_rt, top_sa_cut, top_deep_cut):
    """
    For developing.
    """
    if not cfg.is_compare_mode:
        return

    df_diann = pd.read_csv(cfg.ws_single / "diann" / "report.tsv", sep="\t")
    df_diann = df_diann[df_diann["Q.Value"] < 0.01]
    df_diann["pr_id"] = df_diann["Modified.Sequence"] + df_diann[
        "Precursor.Charge"
    ].astype(str)
    df_diann = df_diann[["pr_id", "RT"]]
    if "diann_rt" not in df_lib.columns:
        df_diann["diann_rt"] = df_diann["RT"] * 60.0

    df_diann = pd.merge(df_lib, df_diann, on="pr_id")
    df_diann = df_diann.reset_index(drop=True)
    del df_diann["RT"]

    rts = ms.get_scan_rts()
    locus = np.argmin(np.abs(df_diann["diann_rt"].values[:, None] - rts), axis=1)
    df_diann["diann_locus"] = locus

    # df_diann = df_diann[df_diann.pr_id == 'ALNIETAIK3'].reset_index(drop=True)

    df_good, locus_num_v = [], []
    for swath_id in df_diann["swath_id"].unique():
        df_swath = df_diann[df_diann["swath_id"] == swath_id]
        df_swath = df_swath.reset_index(drop=True)

        # map_gpu
        ms1_profile, ms2_profile = ms.copy_map_to_gpu(swath_id, centroid=False)
        ms1_centroid, ms2_centroid = ms.copy_map_to_gpu(swath_id, centroid=True)
        all_rts = ms1_profile["scan_rts"]

        # by coelution scores using sa func
        batch_n = cfg.batch_xic_locus
        for _, df_batch in df_swath.groupby(df_swath.index // batch_n):
            df_batch = df_batch.reset_index(drop=True)

            # extract xics of 13 cycles
            df_batch["locus"] = df_batch["diann_locus"]
            locus, _, xics = fxic.extract_xics(
                df_batch,
                ms1_centroid,
                ms2_centroid,
                cfg.tol_ppm,
                cfg.tol_im_xic,
                cycle_num=13,
                only_xic=True,
            )
            xics = fxic.gpu_simple_smooth(xics)
            _, sa_m = fxic.cal_coelution_by_gaussion(
                xics, cfg.window_points, df_batch.fg_num.values + 2
            )
            xics = utils.convert_numba_to_tensor(xics)
            locus_start_v, locus_end_v = fxic.estimate_xic_boundary(xics, sa_m[:, :, 6])
            df_batch["locus_start"] = locus_start_v
            df_batch["locus_end"] = locus_end_v
            df_batch["locus_span"] = locus_end_v - locus_start_v

            # [k, ions_num, n]
            locus, _, ims, _, xics_rt = fxic.extract_xics(
                df_batch,
                ms1_centroid,
                ms2_centroid,
                cfg.tol_ppm,
                cfg.tol_im_xic,
                tol_rt,
            )
            xics_rt = fxic.gpu_simple_smooth(xics_rt)
            scores_sa, scores_sa_m = fxic.cal_coelution_by_gaussion(
                xics_rt, cfg.window_points, df_batch.fg_num.values + 2
            )
            scores_sa = fxic.screen_locus_by_sa(scores_sa, top_sa_cut)
            locus_v, locus_num, locus_sa_v, locus_sas = fxic.concat_nonzero_locus(
                locus, scores_sa, scores_sa_m
            )

            # a row is a locus, locus_num with zeros will not be indexed
            df_batch = df_batch.loc[np.repeat(df_batch.index, locus_num)]
            df_batch = df_batch.reset_index(drop=True)
            df_batch["locus"] = locus_v
            df_batch["score_sa"] = locus_sa_v
            df_batch["measure_rt"] = all_rts[locus_v]
            locus_num_v.append(locus_num)

            # re-extract
            cycle_num = 3
            _, _, ims, mzs, _ = fxic.extract_xics(
                df_batch,
                ms1_centroid,
                ms2_centroid,
                cfg.tol_ppm,
                cfg.tol_im_xic,
                cycle_num=cycle_num,
                only_xic=False,
            )
            locus_ims = ims[:, :, int(cycle_num / 2)]
            measure_ims = fxic.cal_measure_im(locus_ims, locus_sas)
            df_batch["measure_im"] = measure_ims

            scores_deep, features = deepmap.scoring_maps(
                model,
                df_batch,
                ms1_profile,
                ms2_profile,
                cfg.map_cycle_dim,
                cfg.map_im_gap,
                cfg.map_im_dim,
                cfg.tol_ppm,
                cfg.tol_im_map,
                neutron_num=0,
                return_feature=False,
            )
            scores_deep = scores_deep.cpu().numpy().flatten()
            df_batch["seek_score_deep"] = scores_deep
            df_batch["seek_score_sa"] = locus_sa_v
            df_batch["seek_score_sa_x_deep"] = locus_sa_v * scores_deep

            df_good.append(df_batch)

        utils.release_gpu_scans(ms1_profile, ms2_profile, ms1_centroid, ms2_centroid)

    df = pd.concat(df_good, axis=0, ignore_index=True)
    logger.info("data points num of xics: {}".format(xics_rt.shape[2]))

    # recall after sa
    info = "Seek-Locus-by-sa-{}: ≈{:.2f}locus/pr, recall: ".format(
        top_sa_cut, len(df) / len(df_diann)
    )
    logger.info(info)
    utils.cal_acc_recall(cfg.ws_single, df, diann_q_pr=0.01)

    # recall after sa + deep
    locus_num = np.concatenate(locus_num_v)
    locus_num = locus_num[locus_num > 0]
    assert np.all(df.groupby(by="pr_id", sort=False).size() == locus_num)
    df = fxic.screen_locus_by_deep(df, locus_num, top_deep_cut)
    info = "Seek-Locus-by-sa-{}-deep-{}: ≈{:.2f}locus/pr, recall: ".format(
        top_sa_cut, top_deep_cut, len(df) / len(df_diann)
    )
    logger.info(info)
    utils.cal_acc_recall(cfg.ws_single, df, diann_q_pr=0.01)

    df = df.drop_duplicates(subset="pr_id", ignore_index=True)
    locus_span = df["locus_span"].median()
    info = "locus_span: {}".format(int(locus_span))
    logger.info(info)




[docs]
def update_tolerance(
    df_lib: pd.DataFrame,
    ms: tims.Tims,
    model_center: torch.nn.Module,
    model_big: torch.nn.Module,
    sample_ratio: float,
) -> None:
    """
    Update the tolerance based on identifications of a subset of target peptides.

    Parameters
    ----------
    df_lib : pd.DataFrame
        The raw library.

    ms : tims.Tims
        MS data.

    model_center : torch.nn.Module
        DeepProfile-14, used to score the elution consistency of monoisotopes.

    model_big : torch.nn.Module
        DeepProfile-56, used to score the elution consistency of monoisotopes + isotopes.

    sample_ratio : float
        Sample the subset of library to expedite the update.

    Returns
    -------
    None. The global tolerance values will be updated:
        cfg.tol_rt, cfg.tol_im_xic, cfg.tol_ppm
    """
    target_num = int(min(cfg.target_batch_max, sample_ratio * len(df_lib)))
    logger.info(f"Estimate the tolerance using {target_num} prs ...")

    df = df_lib.sample(n=target_num, random_state=42, replace=False)
    df = df.reset_index(drop=True)

    # seek for targets and decoys
    df1 = make_decoys(df, cfg.fg_num, method="mutate")
    df = pd.concat([df, df1]).reset_index(drop=True)
    df = seek_locus(df, ms, model_center, cfg.top_sa_cut, cfg.top_deep_cut)
    df = cal_fg_mz_iso(df)
    df = scoring.score_locus(df, ms, model_center, model_big)
    df, model_nn, scaler = fdr.cal_q_pr_batch(df, 50, 12)
    df = df[df["decoy"] == 0]
    for q_cut in [0.01, 0.02, 0.03, 0.04, 0.05]:
        ids = sum(df["q_pr_run"] < q_cut)
        if ids >= 50:
            break
    df = df[df["q_pr_run"] < q_cut].reset_index(drop=True)
    logger.info(f"Estimate the tolerance using {len(df)} prs at {q_cut:.2f} FDR:")

    if len(df) > 50:
        # estimate tol
        x = df["score_rt_abs"].values
        cut1 = x.mean() + 3.0 * x.std()
        cut2 = np.percentile(x, 95)
        tol_rt = max(cut1, cut2)

        cycle_time = ms.get_cycle_time()
        cycle_num = int(tol_rt * 2 / cycle_time)
        if cycle_num < 23:
            tol_rt = 23 * cycle_time / 2

        x = df["score_imbias_0"].values
        x = x[(x > x.min()) & (x < x.max())]
        cut1 = x.mean() + 3.0 * x.std()
        cut2 = np.percentile(x, 95)
        tol_im = max(cut1, cut2)

        x = df["score_ppm_0"].values
        x = x[(x > x.min()) & (x < x.max())]
        cut1 = x.mean() + 3.0 * x.std()
        cut2 = np.percentile(x, 95)
        tol_ppm = max(cut1, cut2)

        cfg.tol_rt = tol_rt
        cfg.tol_im_xic = tol_im
        cfg.tol_ppm = tol_ppm
    else:
        cfg.tol_im_xic = cfg.tol_im_xic_after_calib

    info = "tol_rt: {:.2f}, tol_im: {:.3f}, tol_ppm: {:.2f}".format(
        cfg.tol_rt, cfg.tol_im_xic, cfg.tol_ppm
    )
    logger.info(info)




[docs]
def select_required_and_all_targets(df: pd.DataFrame) -> tuple:
    """
    Select good target and decoy peps for FDR calculation.
    Select all target to save, which avoids the second extraction in global analysis.

    Parameters
    ----------
    df : pd.DataFrame
        The identification result for one batch.

    Returns
    -------
    tuple
        df_main : pd.DataFrame
            Good target and decoy peps.
        df_other : pd.DataFrame
            All target peps.
    """
    df_main = df[df["q_pr_run"] < cfg.rubbish_q_cut].copy()

    cols_base = [
        "pr_id",
        "pr_charge",
        "pr_index",
        "swath_id",
        "decoy",
        "locus",
        "measure_rt",
        "measure_im",
    ]
    cols_base += ["pr_mz", "score_elute_span_left", "score_elute_span_right"]
    cols_base += ["fg_mz_" + str(i) for i in range(cfg.fg_num)]
    df_other = df.loc[df["decoy"] == 0, cols_base].copy()

    return df_main, df_other




[docs]
def search_core(lib: library.Library) -> None:
    """
    Search on run level:
        1. Seek seeds for calibration.
        2. Seek candidate elution groups (locus) for each precursor.
        3. Score the elution groups.
        4. Calculate the FDR on run level.
        5. Save all target precursor results and high-quality decoy precursor results.
    """
    for ws_i, ws_single in enumerate(cfg.multi_ws):
        # ws_single
        init_single_ws(ws_i, cfg.file_num, ws_single)
        run_finished = (cfg.dir_out_global / (ws_single.name + ".parquet")).exists()
        if run_finished and not cfg.is_overwrite:
            continue

        ms = load_ms(ws_single)
        # utils.get_diann_info(ws_single)

        # set tol_rt
        cfg.tol_rt = ms.get_scan_rts()[-1] * cfg.tol_rt_ratio
        cycle_time = np.diff(ms.get_scan_rts()).mean()
        cfg.locus_rt_thre = cycle_time * cfg.locus_valid_num

        # polish lib
        df_lib = lib.polish_lib_by_swath(
            ms.get_dia_quadrupole(),
            #     ws_diann= cfg.ws_single  # for debug
        )

        # load the pretrained models
        model_center, model_big = deepmap.load_models()

        # seek seed throughout the LC time
        cal_recall_seek_seed(df_lib, ms, model_center)
        df_seed = seek_seed(df_lib, ms, model_center)
        utils.save_as_pkl(df_seed, "df_seed.pkl")
        # df_seed = pd.read_pickle(cfg.dir_out_single / 'df_seed.pkl')

        # update tol
        df_seed, df_lib = calib.calib_rt(df_seed, df_lib)
        df_seed, df_lib = calib.calib_im(df_seed, df_lib)
        df_seed = calib.calib_mz(df_seed, ms)
        del df_seed
        update_tolerance(df_lib, ms, model_center, model_big, cfg.sample_ratio)

        # check params for seek_locus
        cal_recall_seek_locus(
            df_lib,
            ms,
            model_center,
            tol_rt=cfg.tol_rt,
            top_sa_cut=cfg.top_sa_cut,
            top_deep_cut=cfg.top_deep_cut,
        )

        # batch division
        batch_num = int(np.ceil(len(df_lib) / cfg.target_batch_max))
        rows_idx = np.array_split(df_lib.index.values, batch_num)
        df_main_v, df_other_v = [], []
        for batch_idx in range(len(rows_idx)):
            if batch_idx % 3 == 0:
                logger.disabled = False
                info = "-------------Run-{}/{}-Batch-{}/{}-------------".format(
                    ws_i + 1, cfg.file_num, batch_idx + 1, batch_num
                )
                logger.info(info)
            else:
                logger.disabled = True

            df = df_lib.iloc[rows_idx[batch_idx]].reset_index(drop=True)

            # seek for targets and decoys
            df1 = make_decoys(df, cfg.fg_num, method="mutate")
            df = pd.concat([df, df1]).reset_index(drop=True)
            df = seek_locus(df, ms, model_center, cfg.top_sa_cut, cfg.top_deep_cut)

            # scoring by first round
            df = cal_fg_mz_iso(df)
            df = scoring.score_locus(df, ms, model_center, model_big)
            if batch_idx == 0:
                df, model_nn, scaler = fdr.cal_q_pr_batch(df, 50, 12)
                fdr.adjust_rubbish_q(df, batch_num)
            else:
                df, _, _ = fdr.cal_q_pr_batch(df, 50, 12, model_nn, scaler)
            df_main, df_other = select_required_and_all_targets(df)
            utils.print_ids(df_main, cfg.rubbish_q_cut, "pr", "run")
            df_main_v.append(df_main)
            df_other_v.append(df_other)
        df_main = pd.concat(df_main_v, axis=0, ignore_index=True)
        df_other = pd.concat(df_other_v, axis=0, ignore_index=True)
        logger.disabled = False

        logger.info("--------------------Concat batches:---------------------")
        df_main = cross.drop_batches_mismatch(df_main)
        df_main = fdr.cal_q_pr_combined(df_main, 50, 12)
        utils.print_ids(df_main, 1, "pr", "run")
        logger.info("--------------------------------------------------------")
        utils.save_as_pkl(df_main, "df_scores1.pkl")
        # df = pd.read_pickle(cfg.dir_out_single / 'df_scores1.pkl')

        # retrain models
        model_center, model_big, model_mall = refine_models(
            df_main, ms, model_center, model_big
        )

        # scoring by second round
        df_main = scoring.update_scores(
            df_main, ms, model_center, model_big, model_mall
        )
        utils.save_as_pkl(df_main, "df_scores2.pkl")
        # df_main = pd.read_pickle(cfg.dir_out_single / 'df_scores2.pkl')

        # scoring using DIA-NN sa and quant fg ions
        df_main = df_main[df_main["score_deep_center_sub_left_refine"] > 0].reset_index(
            drop=True
        )
        logger.info(
            "Calculating coelution with the best-profile and quantifying ions for all prs..."
        )
        df_main = quant.quant_center_ions(df_main, ms)
        df_other = quant.quant_center_ions(df_other, ms)
        utils.save_as_pkl(df_main, "df_main.pkl")
        utils.save_as_pkl(df_other, "df_other.pkl")
        # df_main = pd.read_pickle(cfg.dir_out_single / 'df_main.pkl')
        # df_other = pd.read_pickle(cfg.dir_out_single / 'df_other.pkl')

        # update FDR-Pr for second round
        df_main = fdr.cal_q_pr_combined(df_main, 50, 12)
        utils.print_ids(df_main, 1, "pr", "run")
        utils.save_as_pkl(df_main, "df_fdr1.pkl")
        # df_main = pd.read_pickle(cfg.dir_out_single / 'df_fdr1.pkl')

        # polish target prs and update FDR again
        df_main = polish.polish_prs(df_main)
        df_main = fdr.cal_q_pr_combined(df_main, 50, 12)
        df_main = polish.polish_prs(df_main)
        utils.print_ids(df_main, 1, "pr", "run")
        utils.save_as_pkl(df_main, "df_fdr2.pkl")
        # df_main = pd.read_pickle(cfg.dir_out_single / 'df_fdr2.pkl')

        # save result
        logger.info("Saving run-specific result as parquet...")
        utils.clean_and_save(df_main, df_other, ws_single)
        logger.info("Saving finished.")

        # release within loop
        del df_lib, df, ms
        cfg.tol_im_xic = cfg.tol_im_xic_before_calib
        cfg.tol_ppm = cfg.tol_ppm_before_calib