import copy
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from full_dia import assemble, cfg, fdr, library, polish, utils
from full_dia.log import Logger
from full_dia.models import DeepQuant
try:
_ = profile
except NameError:
[docs]
def profile(func):
return func
logger = Logger.get_logger()
[docs]
def drop_batches_mismatch(df: pd.DataFrame) -> pd.DataFrame:
"""
Delete the decoy peptides if they are same to target peptides.
Also delete the duplicated decoy peptides.
"""
# remove decoy duplicates
df_decoy = df[df["decoy"] == 1]
idx_max = df_decoy.groupby("pr_id")["cscore_pr_run"].idxmax()
df_decoy = df_decoy.loc[idx_max].reset_index(drop=True)
# remove decoy mismatch
df_target = df[df["decoy"] == 0]
bad_idx = df_decoy["pr_id"].isin(df_target["pr_id"])
df_decoy = df_decoy.loc[~bad_idx]
df = pd.concat([df_target, df_decoy], axis=0, ignore_index=True)
assert len(df) == df["pr_id"].nunique()
return df
[docs]
def drop_runs_mismatch(df: pd.DataFrame) -> pd.DataFrame:
"""
Delete the decoy peptides if they are same to target peptides.
Delete the duplicated decoy/target peptides.
"""
# remove decoy duplicates
df_decoy = df[df["decoy"] == 1]
idx_max = df_decoy.groupby("pr_id")["cscore_pr_global"].idxmax()
df_decoy = df_decoy.loc[idx_max].reset_index(drop=True)
# remove target duplicates
df_target = df[df["decoy"] == 0]
idx_max = df_target.groupby("pr_id")["cscore_pr_global"].idxmax()
df_target = df_target.loc[idx_max].reset_index(drop=True)
# remove decoy mismatch
bad_idx = df_decoy["pr_id"].isin(df_target["pr_id"])
df_decoy = df_decoy.loc[~bad_idx]
df = pd.concat([df_target, df_decoy], axis=0, ignore_index=True)
assert len(df) == df["pr_id"].nunique()
return df
[docs]
def group_by_species(species: pd.Series, ratio_cut: float = 0.05) -> list:
"""
Determine the species that over the ratio_cut ratio.
"""
df = species.value_counts(normalize=True).reset_index() # sorted
df = df.rename(columns={"protein_name": "species"})
mask = df["proportion"] < ratio_cut
to_merge = df[mask]
remaining_df = df[~mask]
if not to_merge.empty:
merged_a = to_merge["species"].tolist()
merged_b = to_merge["proportion"].sum()
while merged_b < ratio_cut and not remaining_df.empty:
min_b_row = remaining_df.nsmallest(1, "proportion")
merged_a += min_b_row["species"].tolist()
merged_b += min_b_row["proportion"].sum()
remaining_df = remaining_df.drop(min_b_row.index)
if merged_b > ratio_cut:
new_row = pd.DataFrame({"species": [merged_a], "proportion": [merged_b]})
df = pd.concat([remaining_df, new_row], ignore_index=True)
else:
df = pd.concat([df[~mask], to_merge], ignore_index=True)
df["species"] = df["species"].apply(lambda x: x if isinstance(x, list) else [x])
return df["species"].tolist()
[docs]
def quant_pr_autoencoder(df_global: pd.DataFrame, top_k_fg: int) -> pd.DataFrame:
"""
Quantify peptides by summing fragment-ion intensities smoothed by an autoencoder.
Parameters
----------
df_global : pd.DataFrame
Provide the quantification values and SA values of fragment ions.
top_k_fg : int
How many fragment ions to sum to a pep quantification value.
Returns
-------
df_global : pd.DataFrame
Add columns: quant_pr_raw, quant_pr_deep, quant_pr_mix
"""
import re
n_run = (
max(
int(m.group(1))
for col in df_global.columns
if (m := re.search(r"run_(\d+)", col))
)
+ 1
)
sa_m_v, area_m_v = [], []
ion_idx = range(2, 2 + cfg.fg_num) # only considering MS2 signal
for wi in range(n_run):
cols_sa = ["run_" + str(wi) + "_" + "score_ion_sa_" + str(i) for i in ion_idx]
sa_m = df_global.loc[:, cols_sa].values
sa_m[np.isnan(sa_m)] = 0.0
sa_m_v.append(sa_m)
cols_quant = [
"run_" + str(wi) + "_" + "score_ion_quant_" + str(i) for i in ion_idx
]
area_m = df_global.loc[:, cols_quant].values
area_m[np.isnan(area_m) | (area_m < 1.1)] = 1.1 # maybe transformed by log
area_m_v.append(area_m)
# prepare dataset: norm globally or row-wise for area_m_log
sa_m = np.hstack(sa_m_v) # [n_pep, n_run * n_ion]
n_ion = area_m_v[0].shape[-1]
area_m = np.hstack(area_m_v) # [n_pep, n_run * n_ion]
area_m_log = np.log2(area_m)
area_m_log_max1 = area_m_log.max()
area_m_norm1 = area_m_log / area_m_log_max1
# area_m_norm_min1 = area_m_norm1.min()
area_m_max2 = area_m_log.max(axis=1)
area_m_norm2 = area_m_log / area_m_max2[:, None]
# weight based on train_val data
train_val_idx = (df_global["q_pr_global"] < 0.01) & (df_global["decoy"] == 0)
cscores_pr = df_global["cscore_pr_global"].values
W_cscore = cscores_pr[train_val_idx]
# pytorch
X_area1 = torch.tensor(area_m_norm1).to(cfg.gpu_id)
X_area2 = torch.tensor(area_m_norm2).to(cfg.gpu_id)
X_sa = torch.tensor(sa_m).to(cfg.gpu_id)
W_cscore = torch.tensor(W_cscore).to(cfg.gpu_id)
X_area1_train_val = X_area1[train_val_idx]
X_area2_train_val = X_area2[train_val_idx]
X_sa_train_val = X_sa[train_val_idx]
dataset = TensorDataset(
X_area1_train_val, X_area2_train_val, X_sa_train_val, W_cscore
)
dataset_pred = TensorDataset(X_area1, X_area2, X_sa)
train_num = int(0.8 * len(dataset))
eval_num = len(dataset) - train_num
logger.info(f"DeepQuant train: {train_num} prs, eval: {eval_num} prs")
train_set, val_set = torch.utils.data.random_split(
dataset, [train_num, eval_num], generator=torch.Generator().manual_seed(42)
)
train_loader = DataLoader(
train_set,
batch_size=64,
drop_last=True,
shuffle=True,
generator=torch.Generator().manual_seed(42),
)
val_loader = DataLoader(val_set, batch_size=256)
pred_loader = DataLoader(dataset_pred, batch_size=1024, shuffle=False)
model = DeepQuant(n_run, n_ion).to(cfg.gpu_id)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss(reduction="none")
# train and valid
best_loss = float("inf")
epochs_no_improve = 0
for epoch in range(100):
# train
model.train()
epoch_train_loss = 0
for X_area1_batch, X_area2_batch, X_sa_batch, W_cscore in train_loader:
optimizer.zero_grad()
recon = model(X_area1_batch, X_area2_batch, X_sa_batch)
loss = criterion(recon, X_area1_batch)
loss = (loss * X_sa_batch).mean(dim=1)
loss = (loss * W_cscore).mean()
epoch_train_loss += loss.item()
loss.backward()
optimizer.step()
train_loss = epoch_train_loss / len(train_loader)
# val
model.eval()
epoch_val_loss = 0
with torch.no_grad():
for X_area1_val, X_area2_val, X_sa_val, W_cscore in val_loader:
recon_val = model(X_area1_val, X_area2_val, X_sa_val)
loss = criterion(recon_val, X_area1_val)
loss = (loss * X_sa_val).mean(dim=1)
loss = (loss * W_cscore).mean()
epoch_val_loss += loss.item()
val_loss = epoch_val_loss / len(val_loader)
if epoch == 0:
info = f"DeepQuant train epoch: {epoch}, train loss: {train_loss:.3f}, eval loss: {val_loss:.3f}"
logger.info(info)
# stop check
if val_loss < best_loss:
best_loss = val_loss
epochs_no_improve = 0
model_best = copy.deepcopy(model)
info = f"DeepQuant train epoch: {epoch}, train loss: {train_loss:.3f}, eval loss: {val_loss:.3f}"
else:
epochs_no_improve += 1
if epochs_no_improve >= 10:
break
logger.info(info)
# pred all
pred_v = []
model.eval()
with torch.no_grad():
for X_area1_batch, X_area2_batch, X_sa_batch in pred_loader:
X_pred = model_best(X_area1_batch, X_area2_batch, X_sa_batch)
X_pred = X_pred.cpu().numpy()
pred_v.append(X_pred)
pred_m = np.vstack(pred_v)
pred_m = pred_m * area_m_log_max1
pred_m = np.exp2(pred_m)
# quant
sa_sum = np.nansum(sa_m_v, axis=0)
top_n_idx = np.argsort(sa_sum, axis=1)[:, -top_k_fg:]
for run_idx, area_m in enumerate(area_m_v):
# quant by raw or deep
top_n_values = np.take_along_axis(area_m, top_n_idx, axis=1)
pr_quant_raw = top_n_values.sum(axis=1)
area_m_ae = pred_m[:, run_idx * n_ion : (run_idx + 1) * n_ion]
top_n_values = np.take_along_axis(area_m_ae, top_n_idx, axis=1)
pr_quant_deep = top_n_values.sum(axis=1)
df_global["quant_pr_raw_" + str(run_idx)] = pr_quant_raw
df_global["quant_pr_deep_" + str(run_idx)] = pr_quant_deep
quant_pr_mix = np.sqrt(pr_quant_raw * pr_quant_deep)
df_global["quant_pr_mix_" + str(run_idx)] = quant_pr_mix
return df_global
[docs]
def save_report_result(df_global: pd.DataFrame, multi_ws: list) -> None:
"""
Combine the df_run and df_global to save the final result in a report.parquet file.
"""
logger.info("Saving report.parquet ...")
if len(multi_ws) > 0:
oname = "report.parquet"
df_out_v = []
for ws_i in range(len(multi_ws)):
df = utils.read_from_pq(multi_ws[ws_i])
# merge global info: cscore_global, q_global, quant pr/pg
cols = df_global.columns[~df_global.columns.str.startswith("quant_")]
cols_quant_pr = [
f"{x}_{ws_i}" for x in ["quant_pr_raw", "quant_pr_deep", "quant_pr_mix"]
]
cols_quant_pg = [
f"{x}_{ws_i}" for x in ["quant_pg_raw", "quant_pg_deep", "quant_pg_mix"]
]
cols = cols.tolist() + cols_quant_pr + cols_quant_pg
df = pd.merge(df, df_global[cols], on=["pr_id", "decoy"], how="right")
cols_new = ["quant_pr_raw", "quant_pr_deep", "quant_pr_mix"]
df = df.rename(columns=dict(zip(cols_quant_pr, cols_new)))
cols_new = ["quant_pg_raw", "quant_pg_deep", "quant_pg_mix"]
df = df.rename(columns=dict(zip(cols_quant_pg, cols_new)))
# cal q_pg_run based on assigned protein_groups
df["cscore_pr_run"].fillna(0, inplace=True)
df = fdr.cal_q_pg(df, cfg.q_cut_infer, "run")
# dtype
df["pr_charge"] = df["pr_id"].str[-1].astype(np.int8)
df["proteotypic"] = df["proteotypic"].astype(np.int8)
cols_big = df.select_dtypes(include=[np.float64]).columns
df[cols_big] = df[cols_big].astype(np.float32)
# convert
df = utils.convert_cols_to_diann(df, cfg.multi_ws[ws_i])
df_out_v.append(df)
df = pd.concat(df_out_v, ignore_index=True)
df.to_parquet(cfg.dir_out_global / oname)