Source code for single_mol.single_mol

import pymzml
import pandas as pd
import msions.utils as msutils
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import msions.kronik as kro
import sqlite3
import msions.encyclopedia as encyclo
import msions.mzml as mzml
import msions.hardklor as hk
from matplotlib.ticker import FormatStrFormatter  # format decimal places
from typing import List


[docs]def fig2a(mzml_file: str, bin_rt_list: List[float], bin_mz_list: List[float], manu_mzml: bool = False): """ Creates the Figure 2a heatmap using an mzML file and binning lists. Parameters ---------- mzml_file : str The input mzML file. bin_rt_list : List[float] List of retention time bin edges. bin_mz_list : List[float] List of m/z bin edges. manu_mzml: bool Boolean whether the mzML was used in the manuscript to determine if a red line should be plotted like in the manuscript. Examples ------- >>> import msions.utils as ustils >>> from single_mol import fig2a >>> rt_bin_size = 0.25 >>> rt_bin_mult = 1 >>> rt_start = 0 >>> rt_end = 100 >>> bin_rt_list = msutils.bin_list(rt_start, rt_end, rt_bin_size, rt_bin_mult) >>> mz_bin_size = 4 >>> mz_bin_mult = 1.0005 >>> mz_start = 399 >>> mz_end = 1005 >>> bin_mz_list = msutils.bin_list(mz_start, mz_end, mz_bin_size, mz_bin_mult) >>> fig2a("test.mzML", bin_rt_list, bin_mz_list, manu_mzml=True) """ # create DataFrame of MS1 peaks peak_df = mzml.peak_df(mzml_file) # create DataFrame with binned m/z and retention time peak_binned = msutils.bin_data(peak_df.copy(), type="both", bin_mz_list=bin_mz_list, bin_rt_list=bin_rt_list) # create a max cut-off to help with dynamic range max_binned = peak_binned.copy() max_binned["ips_max"] = [x if x < 2e9 else 2e9 for x in max_binned["ips"]] # create a pivot table to make the heatmap max_pivot_table = pd.pivot_table(max_binned, values='ips_max', index=['bin_mz'], columns=['bin_rt'], aggfunc=np.sum) # sort the m/z to make 400 closer to the x-axis max_pivot_table = max_pivot_table.sort_values(by=['bin_mz'], ascending=False) # heatmap for data fig, ax = plt.subplots(figsize=(10,8)) # create heatmap, remove color bar # remove last 3 columns because there was no data for those bins sns.heatmap(max_pivot_table.iloc[:, :-3].astype('float'), cmap='Greys', cbar=False) if manu_mzml: # create a line to correspond to (b) plt.vlines(x=230+0.17/0.25, ymin=0, ymax=1000, color="red", linestyles='dashed', linewidth=2) # set x-axis parameters plt.xlabel('Retention Time (min)',fontsize=18) ax.set_xticks((range(0, len(max_pivot_table.columns), math.floor(len(max_pivot_table.columns)/5)))) ax.set_xticklabels((0,20,40,60,80,100), rotation=0, fontsize=14) ax.axhline(y=151, color='black', linewidth=2) # set y-axis parameters plt.ylabel('m/z',fontsize=18) ax.set_yticks((range(0, len(max_pivot_table.index), math.floor(len(max_pivot_table.index)/6)))) ax.set_yticklabels((1000,900,800,700,600,500,400), fontsize=14) ax.axvline(x=0, color='black', linewidth=1) # show plot plt.show()
[docs]def fig2a_insert(kro_file: str, elib_file: str): """ Creates the Figure 2a histogram insert using a Kronik file and EncyclopeDIA elib. Parameters ---------- kro_file : str The input Kronik file. elib_file : str The input EncyclopeDIA elib file. Examples ------- >>> from single_mol import fig2a_insert >>> fig2a_insert("test.kro", "test.elib") """ # read in the Kronik output kro_df = kro.simple_df(kro_file) # create connection elib_connection = sqlite3.connect(elib_file) # initiate empty lists id_list = [] peptide_list = [] # create sql query sql = """SELECT PeptideSeq FROM entries WHERE ((? BETWEEN RTInSecondsStart AND RTInSecondsStop) AND ((PrecursorMz - ?) BETWEEN -? AND ?));""" # find the peptide(s) associated with each m/z for idx, mz in enumerate(kro_df["mz"]): rt = kro_df.loc[idx, "best_rt_s"] sql_df = pd.read_sql_query(sql, elib_connection, params=(rt, mz, mz*5e-6, mz*5e-6)) id_list.append(len(sql_df)) peptide_list.append(sql_df["PeptideSeq"]) # add identification status and peptide list to DataFrame kro_df["id"] = id_list kro_df["peptides"] = peptide_list # close the database connection elib_connection.close() # create a bin list for intensities # removed last 2 bins that were empty bin_int_list = msutils.bin_list(4, 10, 0.2, 1)[:-2] # create an array of intensities for all persistent features all_int = np.log10(kro_df.best_int) # calculate counts of all persistent features in each bin all_counts = np.histogram(all_int, bins=bin_int_list)[0] # create an array of intensities for identified persistent features id_int = np.log10(kro_df[kro_df["id"] > 0].best_int) # calculate counts of identified persistent features in each bin id_counts = np.histogram(id_int, bins=bin_int_list)[0] # define figure size fig, ax = plt.subplots(figsize=(10,8)) # plot histograms # all persistent features ax.hist(all_int, bins=bin_int_list, histtype="bar", color="dimgray") # identified persistent features ax.hist(id_int, bins=bin_int_list, histtype="bar", color="#1f77b4", alpha=0.8) # set x-axis parameters plt.xlabel("log10(Ions/Second)", fontsize=18) plt.xticks(fontsize=14) # set y-axis parameters ax.set_ylabel("Counts",fontsize=18) plt.yticks(fontsize=14) # create a second axis ax2 = ax.twinx() # plot line of percent identifed # removed last edge to match array lengths ax2.plot(bin_int_list[:-1], (id_counts/all_counts)*100, color="red") # set second y-axis parameters ax2.set_ylabel("Percent Identified", fontsize=18, rotation=270, color="red", labelpad=12) ax2.set_ylim(0,101) plt.yticks(fontsize=14) # despine top ax.spines['top'].set_visible(False) ax2.spines['top'].set_visible(False) # show plot plt.show()
[docs]def fig2b(mzml_file, hk_file, elib_file, scan_num=170933, id_peaks=7): """ Creates the Figure 2b plot of identified and unidentified MS1 peaks. Also prints the highest intensity identified peaks. Parameters ---------- mzml_file : str The input mzML file. hk_file : str The input Hardklor file. elib_file : str The input EncyclopeDIA elib file. scan_num : int Scan number of plotted peaks. id_peaks : int Number of highest intensity identified peaks to print information about. Examples ------- >>> from single_mol import fig2b >>> fig2b("test.mzML", "test.hk", "test.elib") """ # create DataFrame from elib ev_encyclo_df = encyclo.dia_df(elib_file) # create DataFrame from Hardklor output ev_hk_df = hk.hk2df(hk_file) # find Hardklor retention time/charge/mass match in encyclopeDIA results ev_hk_df["in_encyclo"] = ev_hk_df.apply(encyclo.match_hk, axis=1, other_df=ev_encyclo_df) # create run object run = pymzml.run.Reader(mzml_file) # choose MS1 spectrum between 40-60 minutes ms1_spectrum = run[scan_num] # 57.6733 min = 3460.398 sec # define retention time spec_rt = round(ms1_spectrum.scan_time[0], 4) # create pandas DataFrame of MS1 peaks ms1_peaks = pd.DataFrame(ms1_spectrum.peaks("centroided")).rename(columns={0: "mz", 1: "ips"}) ms1_peaks["mz"] = ms1_peaks["mz"].round(4) # create narrow hk_df peak_hk_df = ev_hk_df[ev_hk_df["rt"] == spec_rt] # merge identification designation onto MS1 peaks ms1_peaks = pd.merge_asof(ms1_peaks, peak_hk_df[['mz', 'mass', 'charge', 'in_encyclo']], tolerance=0.001, on=['mz']) ms1_peaks = ms1_peaks.fillna(-1) # write algorithm to color peptide isotope distributions # initiate indices idx, idx2, idx3 = [0, 0, 0] # initiate empty lists for previous values prev_idxs = [] # initiate empty list to be added to DataFrame in_dist_list = [0]*len(ms1_peaks) # list to go forward through indices: fwd_idxs = [x for x in range(0, len(ms1_peaks))] # loop through whether MS1 peaks were found in Hardklor and encyclopeDIA for idx, val in enumerate(ms1_peaks["in_encyclo"]): # if peak was seen in Hardklor and encyclopeDIA, if val >= 1: # the peak is in a distribution in_dist_list[idx] = 1 # define the mass, mz, and charge of interest mono_mass = ms1_peaks.loc[idx, "mass"] check_mz = ms1_peaks.loc[idx, "mz"] charge_state = ms1_peaks.loc[idx, "charge"] # calculate the isotopic difference in mz iso_diff = 1/charge_state # go back through stored idxs for idx2 in reversed(prev_idxs): # if the stored idx's mz is within the isotopic difference of the mz of interest if np.isclose(ms1_peaks.loc[idx2, "mz"], check_mz-iso_diff, atol=0.01): in_dist_list[idx2] = 1 check_mz = ms1_peaks.loc[idx2, "mz"] elif (check_mz+iso_diff)-ms1_peaks.loc[idx3, "mz"] > 2*iso_diff: break # go forward through upcoming idxs for idx3 in fwd_idxs[idx+1:]: # if the stored idx's mz is within the isotopic difference of the mz of interest if np.isclose(ms1_peaks.loc[idx3, "mz"], check_mz+iso_diff, atol=0.01): in_dist_list[idx3] = 1 check_mz = ms1_peaks.loc[idx3, "mz"] elif ms1_peaks.loc[idx3, "mz"]-(check_mz+iso_diff) > 2*iso_diff: in_dist = False break # if peak was not seen in encyclopeDIA else: # keep track of seen indices that are not monoisotopic peaks prev_idxs.append(idx) # add the column to the ms1_peaks DataFrame ms1_peaks["in_dist"] = in_dist_list # create DataFrames of identified and non-identified peaks id_ms1_peaks = ms1_peaks[ms1_peaks["in_dist"] > 0].reset_index(drop=True) noid_ms1_peaks = ms1_peaks[ms1_peaks["in_dist"] == 0].reset_index(drop=True) # create connection elib_connection = sqlite3.connect(elib_file) # create empty dictionary and list for results mz2pep_dict = {} pep_list = [] # create list of masses mz_list = ms1_peaks[ms1_peaks["in_encyclo"] > 0].sort_values(by="ips", ascending=False).head(id_peaks).mz # create sql query sql = """SELECT PeptideSeq FROM entries WHERE ((3460.398 BETWEEN RTInSecondsStart AND RTInSecondsStop) AND ((PrecursorMz - ?) BETWEEN -0.01 AND 0.01));""" # print the protein(s) associated with each peptide sequence for mz in mz_list: sql_df = pd.read_sql_query(sql, elib_connection, params=[mz]) for pep in sql_df["PeptideSeq"]: if mz in mz2pep_dict.keys(): mz2pep_dict[mz].append(pep) else: mz2pep_dict[mz] = [pep] pep_list.append(pep) # close the database connection elib_connection.close() # create connection elib_connection = sqlite3.connect(elib_file) # initiate empty protein dictionary seq2pro_dict = {} # create sql query sql = """SELECT ProteinAccession FROM peptidetoprotein WHERE PeptideSeq = ?;""" # print the protein(s) associated with each peptide sequence for seq in pep_list: sql_df = pd.read_sql_query(sql, elib_connection, params=[seq]) for protein in sql_df["ProteinAccession"]: if seq in seq2pro_dict.keys(): seq2pro_dict[seq].append(protein) else: seq2pro_dict[seq] = [protein] # close the database connection elib_connection.close() # combine dictionary info for mz in mz_list: for pep in mz2pep_dict[mz]: for pro in seq2pro_dict[pep]: print(mz, "m/z:", pep, "("+pro+")") # define figure size plt.figure(figsize=(10,8)) # plot spectra plt.vlines(x= noid_ms1_peaks["mz"], ymin = 0, ymax = noid_ms1_peaks["ips"]/1e8, color="red") plt.vlines(x= id_ms1_peaks["mz"], ymin = 0, ymax = id_ms1_peaks["ips"]/1e8, color="#1f77b4") # set x-axis parameters plt.xlabel("m/z", fontsize=18) plt.xlim(395,1000) plt.xticks(fontsize=14) # set y-axis parameters plt.ylabel("Ions/Second ($10^{8}$)",fontsize=18) plt.ylim(0,14.5) plt.yticks(np.arange(2,15,2)) plt.yticks(fontsize=14) # despine top and right sns.despine() # show plot plt.show()
[docs]def fig2c(mzml_file, hk_file, elib_file): """ Creates the Figure 2c plot of Total Ion Current (TIC) and identified ion current. Also prints the values and percentage identified. Parameters ---------- mzml_file : str The input mzML file. hk_file : str The input Hardklor file. elib_file : str The input EncyclopeDIA elib file. Examples ------- >>> from single_mol import fig2c >>> fig2c("test.mzML", "test.hk", "test.elib") """ # create DataFrame from elib ev_encyclo_df = encyclo.dia_df(elib_file) # create DataFrame from mzML ev_ms1_df = mzml.tic_df(mzml_file) # create DataFrame from Hardklor output ev_hk_df = hk.hk2df(hk_file) #summarize Hardklor DataFrame summed_ev_hk = hk.summarize_df(ev_hk_df, full_ms1_df=ev_ms1_df) # find Hardklor retention time/charge/mass match in encyclopeDIA results ev_hk_df["in_encyclo"] = ev_hk_df.apply(encyclo.match_hk, axis=1, other_df=ev_encyclo_df) # create DataFrame of only identified features id_ev_hk_df = ev_hk_df[ev_hk_df["in_encyclo"] > 0].reset_index(drop=True) # summarize identified features DataFrame sum_id_ev_hk_df = hk.summarize_df(id_ev_hk_df, full_ms1_df=ev_ms1_df) # join DataFrames joined_EV = pd.merge(ev_ms1_df, sum_id_ev_hk_df, how="outer", on=["scan_num"], suffixes=("_ms1","_id")) # find total ion current across all print("Total Ion Current (TIC): %.0f" % sum(ev_ms1_df.TIC)) # find identified total ion current print("ID'd TIC: %.0f" % sum(sum_id_ev_hk_df.TIC)) # calculate ratio of identified ion current to total ion current print("%.1f%% of the signal" % float(sum(sum_id_ev_hk_df.TIC)/sum(ev_ms1_df.TIC)*100)) # create figure and axis objects with subplots() fig, ax = plt.subplots(figsize=(10,8)) # make a TIC plot ax.plot(joined_EV.rt_ms1, joined_EV.TIC_ms1/1e10, color="black") ax.plot(joined_EV.rt_ms1, joined_EV.TIC_id/1e10, color="#1f77b4") # set x-axis parameters ax.set_xlabel("Retention Time (min)", fontsize=18) ax.set_xlim(0,100) plt.xticks(fontsize=14) # set y-axis parameters ax.set_ylabel("Ions/Second ($10^{10}$)",fontsize=18) ax.set_ylim(0,4.5) ax.set_yticks(np.arange(1,5,1)) plt.yticks(fontsize=14) # despine top and right sns.despine() # show plot plt.show()
[docs]def fig2d(mzml_file, hk_file, elib_file): """ Creates the Figure 2d plot of total ions and identified ions. Also prints the values and percentage identified. Parameters ---------- mzml_file : str The input mzML file. hk_file : str The input Hardklor file. elib_file : str The input EncyclopeDIA elib file. Examples ------- >>> from single_mol import fig2d >>> fig2d("test.mzML", "test.hk", "test.elib") """ # create DataFrame from elib ev_encyclo_df = encyclo.dia_df(elib_file) # create DataFrame from mzML ev_ms1_df = mzml.tic_df(mzml_file) # create DataFrame from Hardklor output ev_hk_df = hk.hk2df(hk_file) #summarize Hardklor DataFrame summed_ev_hk = hk.summarize_df(ev_hk_df, full_ms1_df=ev_ms1_df) # find Hardklor retention time/charge/mass match in encyclopeDIA results ev_hk_df["in_encyclo"] = ev_hk_df.apply(encyclo.match_hk, axis=1, other_df=ev_encyclo_df) # create DataFrame of only identified features id_ev_hk_df = ev_hk_df[ev_hk_df["in_encyclo"] > 0].reset_index(drop=True) # summarize identified features DataFrame sum_id_ev_hk_df = hk.summarize_df(id_ev_hk_df, full_ms1_df=ev_ms1_df) # join DataFrames joined_EV = pd.merge(ev_ms1_df, sum_id_ev_hk_df, how="outer", on=["scan_num"], suffixes=("_ms1","_id")) # find total ions across all print("Total # of ions: %.0f" % sum(ev_ms1_df.ions)) # find identified ions print("Ions mapped to peptides: %.0f" % sum(sum_id_ev_hk_df.ions)) # calculate ratio of identified ions to total ions print("%.1f%% of the signal" % float(sum(sum_id_ev_hk_df.ions)/sum(ev_ms1_df.ions)*100)) # create figure and axis objects with subplots() fig, ax = plt.subplots(figsize=(10,8)) # make a plot for ion counts ax.plot(joined_EV.rt_ms1, joined_EV.ions_ms1/1e6, color="black") ax.plot(joined_EV.rt_ms1, joined_EV.ions_id/1e6, color="#1f77b4") # set x-axis parameters ax.set_xlabel("Retention Time (min)", fontsize=18) ax.set_xlim(0,100) plt.xticks(fontsize=14) # set y-axis parameters ax.set_ylabel("Ions ($10^{6}$)",fontsize=18) ax.set_ylim(0,6.1) ax.set_yticks(np.arange(1,7,1)) ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f')) plt.yticks(fontsize=14) # despine top and right sns.despine() # show plot plt.show()