import pymzml
import pandas as pd
import msions.utils as msutils
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import msions.kronik as kro
import sqlite3
import msions.encyclopedia as encyclo
import msions.mzml as mzml
import msions.hardklor as hk
from matplotlib.ticker import FormatStrFormatter # format decimal places
from typing import List
[docs]def fig2a(mzml_file: str, bin_rt_list: List[float], bin_mz_list: List[float], manu_mzml: bool = False):
"""
Creates the Figure 2a heatmap using an mzML file and binning lists.
Parameters
----------
mzml_file : str
The input mzML file.
bin_rt_list : List[float]
List of retention time bin edges.
bin_mz_list : List[float]
List of m/z bin edges.
manu_mzml: bool
Boolean whether the mzML was used in the manuscript
to determine if a red line should be plotted like
in the manuscript.
Examples
-------
>>> import msions.utils as ustils
>>> from single_mol import fig2a
>>> rt_bin_size = 0.25
>>> rt_bin_mult = 1
>>> rt_start = 0
>>> rt_end = 100
>>> bin_rt_list = msutils.bin_list(rt_start, rt_end, rt_bin_size, rt_bin_mult)
>>> mz_bin_size = 4
>>> mz_bin_mult = 1.0005
>>> mz_start = 399
>>> mz_end = 1005
>>> bin_mz_list = msutils.bin_list(mz_start, mz_end, mz_bin_size, mz_bin_mult)
>>> fig2a("test.mzML", bin_rt_list, bin_mz_list, manu_mzml=True)
"""
# create DataFrame of MS1 peaks
peak_df = mzml.peak_df(mzml_file)
# create DataFrame with binned m/z and retention time
peak_binned = msutils.bin_data(peak_df.copy(), type="both", bin_mz_list=bin_mz_list, bin_rt_list=bin_rt_list)
# create a max cut-off to help with dynamic range
max_binned = peak_binned.copy()
max_binned["ips_max"] = [x if x < 2e9 else 2e9 for x in max_binned["ips"]]
# create a pivot table to make the heatmap
max_pivot_table = pd.pivot_table(max_binned, values='ips_max', index=['bin_mz'],
columns=['bin_rt'], aggfunc=np.sum)
# sort the m/z to make 400 closer to the x-axis
max_pivot_table = max_pivot_table.sort_values(by=['bin_mz'], ascending=False)
# heatmap for data
fig, ax = plt.subplots(figsize=(10,8))
# create heatmap, remove color bar
# remove last 3 columns because there was no data for those bins
sns.heatmap(max_pivot_table.iloc[:, :-3].astype('float'), cmap='Greys', cbar=False)
if manu_mzml:
# create a line to correspond to (b)
plt.vlines(x=230+0.17/0.25, ymin=0, ymax=1000, color="red", linestyles='dashed', linewidth=2)
# set x-axis parameters
plt.xlabel('Retention Time (min)',fontsize=18)
ax.set_xticks((range(0, len(max_pivot_table.columns), math.floor(len(max_pivot_table.columns)/5))))
ax.set_xticklabels((0,20,40,60,80,100), rotation=0, fontsize=14)
ax.axhline(y=151, color='black', linewidth=2)
# set y-axis parameters
plt.ylabel('m/z',fontsize=18)
ax.set_yticks((range(0, len(max_pivot_table.index), math.floor(len(max_pivot_table.index)/6))))
ax.set_yticklabels((1000,900,800,700,600,500,400), fontsize=14)
ax.axvline(x=0, color='black', linewidth=1)
# show plot
plt.show()
[docs]def fig2a_insert(kro_file: str, elib_file: str):
"""
Creates the Figure 2a histogram insert using a Kronik file and EncyclopeDIA elib.
Parameters
----------
kro_file : str
The input Kronik file.
elib_file : str
The input EncyclopeDIA elib file.
Examples
-------
>>> from single_mol import fig2a_insert
>>> fig2a_insert("test.kro", "test.elib")
"""
# read in the Kronik output
kro_df = kro.simple_df(kro_file)
# create connection
elib_connection = sqlite3.connect(elib_file)
# initiate empty lists
id_list = []
peptide_list = []
# create sql query
sql = """SELECT PeptideSeq
FROM entries
WHERE ((? BETWEEN RTInSecondsStart AND RTInSecondsStop) AND ((PrecursorMz - ?) BETWEEN -? AND ?));"""
# find the peptide(s) associated with each m/z
for idx, mz in enumerate(kro_df["mz"]):
rt = kro_df.loc[idx, "best_rt_s"]
sql_df = pd.read_sql_query(sql, elib_connection, params=(rt, mz, mz*5e-6, mz*5e-6))
id_list.append(len(sql_df))
peptide_list.append(sql_df["PeptideSeq"])
# add identification status and peptide list to DataFrame
kro_df["id"] = id_list
kro_df["peptides"] = peptide_list
# close the database connection
elib_connection.close()
# create a bin list for intensities
# removed last 2 bins that were empty
bin_int_list = msutils.bin_list(4, 10, 0.2, 1)[:-2]
# create an array of intensities for all persistent features
all_int = np.log10(kro_df.best_int)
# calculate counts of all persistent features in each bin
all_counts = np.histogram(all_int, bins=bin_int_list)[0]
# create an array of intensities for identified persistent features
id_int = np.log10(kro_df[kro_df["id"] > 0].best_int)
# calculate counts of identified persistent features in each bin
id_counts = np.histogram(id_int, bins=bin_int_list)[0]
# define figure size
fig, ax = plt.subplots(figsize=(10,8))
# plot histograms
# all persistent features
ax.hist(all_int, bins=bin_int_list, histtype="bar", color="dimgray")
# identified persistent features
ax.hist(id_int, bins=bin_int_list, histtype="bar", color="#1f77b4", alpha=0.8)
# set x-axis parameters
plt.xlabel("log10(Ions/Second)", fontsize=18)
plt.xticks(fontsize=14)
# set y-axis parameters
ax.set_ylabel("Counts",fontsize=18)
plt.yticks(fontsize=14)
# create a second axis
ax2 = ax.twinx()
# plot line of percent identifed
# removed last edge to match array lengths
ax2.plot(bin_int_list[:-1], (id_counts/all_counts)*100, color="red")
# set second y-axis parameters
ax2.set_ylabel("Percent Identified", fontsize=18, rotation=270, color="red", labelpad=12)
ax2.set_ylim(0,101)
plt.yticks(fontsize=14)
# despine top
ax.spines['top'].set_visible(False)
ax2.spines['top'].set_visible(False)
# show plot
plt.show()
[docs]def fig2b(mzml_file, hk_file, elib_file, scan_num=170933, id_peaks=7):
"""
Creates the Figure 2b plot of identified and unidentified MS1 peaks.
Also prints the highest intensity identified peaks.
Parameters
----------
mzml_file : str
The input mzML file.
hk_file : str
The input Hardklor file.
elib_file : str
The input EncyclopeDIA elib file.
scan_num : int
Scan number of plotted peaks.
id_peaks : int
Number of highest intensity identified peaks to print information about.
Examples
-------
>>> from single_mol import fig2b
>>> fig2b("test.mzML", "test.hk", "test.elib")
"""
# create DataFrame from elib
ev_encyclo_df = encyclo.dia_df(elib_file)
# create DataFrame from Hardklor output
ev_hk_df = hk.hk2df(hk_file)
# find Hardklor retention time/charge/mass match in encyclopeDIA results
ev_hk_df["in_encyclo"] = ev_hk_df.apply(encyclo.match_hk, axis=1, other_df=ev_encyclo_df)
# create run object
run = pymzml.run.Reader(mzml_file)
# choose MS1 spectrum between 40-60 minutes
ms1_spectrum = run[scan_num] # 57.6733 min = 3460.398 sec
# define retention time
spec_rt = round(ms1_spectrum.scan_time[0], 4)
# create pandas DataFrame of MS1 peaks
ms1_peaks = pd.DataFrame(ms1_spectrum.peaks("centroided")).rename(columns={0: "mz", 1: "ips"})
ms1_peaks["mz"] = ms1_peaks["mz"].round(4)
# create narrow hk_df
peak_hk_df = ev_hk_df[ev_hk_df["rt"] == spec_rt]
# merge identification designation onto MS1 peaks
ms1_peaks = pd.merge_asof(ms1_peaks, peak_hk_df[['mz', 'mass', 'charge', 'in_encyclo']], tolerance=0.001, on=['mz'])
ms1_peaks = ms1_peaks.fillna(-1)
# write algorithm to color peptide isotope distributions
# initiate indices
idx, idx2, idx3 = [0, 0, 0]
# initiate empty lists for previous values
prev_idxs = []
# initiate empty list to be added to DataFrame
in_dist_list = [0]*len(ms1_peaks)
# list to go forward through indices:
fwd_idxs = [x for x in range(0, len(ms1_peaks))]
# loop through whether MS1 peaks were found in Hardklor and encyclopeDIA
for idx, val in enumerate(ms1_peaks["in_encyclo"]):
# if peak was seen in Hardklor and encyclopeDIA,
if val >= 1:
# the peak is in a distribution
in_dist_list[idx] = 1
# define the mass, mz, and charge of interest
mono_mass = ms1_peaks.loc[idx, "mass"]
check_mz = ms1_peaks.loc[idx, "mz"]
charge_state = ms1_peaks.loc[idx, "charge"]
# calculate the isotopic difference in mz
iso_diff = 1/charge_state
# go back through stored idxs
for idx2 in reversed(prev_idxs):
# if the stored idx's mz is within the isotopic difference of the mz of interest
if np.isclose(ms1_peaks.loc[idx2, "mz"], check_mz-iso_diff, atol=0.01):
in_dist_list[idx2] = 1
check_mz = ms1_peaks.loc[idx2, "mz"]
elif (check_mz+iso_diff)-ms1_peaks.loc[idx3, "mz"] > 2*iso_diff:
break
# go forward through upcoming idxs
for idx3 in fwd_idxs[idx+1:]:
# if the stored idx's mz is within the isotopic difference of the mz of interest
if np.isclose(ms1_peaks.loc[idx3, "mz"], check_mz+iso_diff, atol=0.01):
in_dist_list[idx3] = 1
check_mz = ms1_peaks.loc[idx3, "mz"]
elif ms1_peaks.loc[idx3, "mz"]-(check_mz+iso_diff) > 2*iso_diff:
in_dist = False
break
# if peak was not seen in encyclopeDIA
else:
# keep track of seen indices that are not monoisotopic peaks
prev_idxs.append(idx)
# add the column to the ms1_peaks DataFrame
ms1_peaks["in_dist"] = in_dist_list
# create DataFrames of identified and non-identified peaks
id_ms1_peaks = ms1_peaks[ms1_peaks["in_dist"] > 0].reset_index(drop=True)
noid_ms1_peaks = ms1_peaks[ms1_peaks["in_dist"] == 0].reset_index(drop=True)
# create connection
elib_connection = sqlite3.connect(elib_file)
# create empty dictionary and list for results
mz2pep_dict = {}
pep_list = []
# create list of masses
mz_list = ms1_peaks[ms1_peaks["in_encyclo"] > 0].sort_values(by="ips", ascending=False).head(id_peaks).mz
# create sql query
sql = """SELECT PeptideSeq
FROM entries
WHERE ((3460.398 BETWEEN RTInSecondsStart AND RTInSecondsStop) AND ((PrecursorMz - ?) BETWEEN -0.01 AND 0.01));"""
# print the protein(s) associated with each peptide sequence
for mz in mz_list:
sql_df = pd.read_sql_query(sql, elib_connection, params=[mz])
for pep in sql_df["PeptideSeq"]:
if mz in mz2pep_dict.keys():
mz2pep_dict[mz].append(pep)
else:
mz2pep_dict[mz] = [pep]
pep_list.append(pep)
# close the database connection
elib_connection.close()
# create connection
elib_connection = sqlite3.connect(elib_file)
# initiate empty protein dictionary
seq2pro_dict = {}
# create sql query
sql = """SELECT ProteinAccession
FROM peptidetoprotein
WHERE PeptideSeq = ?;"""
# print the protein(s) associated with each peptide sequence
for seq in pep_list:
sql_df = pd.read_sql_query(sql, elib_connection, params=[seq])
for protein in sql_df["ProteinAccession"]:
if seq in seq2pro_dict.keys():
seq2pro_dict[seq].append(protein)
else:
seq2pro_dict[seq] = [protein]
# close the database connection
elib_connection.close()
# combine dictionary info
for mz in mz_list:
for pep in mz2pep_dict[mz]:
for pro in seq2pro_dict[pep]:
print(mz, "m/z:", pep, "("+pro+")")
# define figure size
plt.figure(figsize=(10,8))
# plot spectra
plt.vlines(x= noid_ms1_peaks["mz"], ymin = 0, ymax = noid_ms1_peaks["ips"]/1e8, color="red")
plt.vlines(x= id_ms1_peaks["mz"], ymin = 0, ymax = id_ms1_peaks["ips"]/1e8, color="#1f77b4")
# set x-axis parameters
plt.xlabel("m/z", fontsize=18)
plt.xlim(395,1000)
plt.xticks(fontsize=14)
# set y-axis parameters
plt.ylabel("Ions/Second ($10^{8}$)",fontsize=18)
plt.ylim(0,14.5)
plt.yticks(np.arange(2,15,2))
plt.yticks(fontsize=14)
# despine top and right
sns.despine()
# show plot
plt.show()
[docs]def fig2c(mzml_file, hk_file, elib_file):
"""
Creates the Figure 2c plot of Total Ion Current (TIC)
and identified ion current.
Also prints the values and percentage identified.
Parameters
----------
mzml_file : str
The input mzML file.
hk_file : str
The input Hardklor file.
elib_file : str
The input EncyclopeDIA elib file.
Examples
-------
>>> from single_mol import fig2c
>>> fig2c("test.mzML", "test.hk", "test.elib")
"""
# create DataFrame from elib
ev_encyclo_df = encyclo.dia_df(elib_file)
# create DataFrame from mzML
ev_ms1_df = mzml.tic_df(mzml_file)
# create DataFrame from Hardklor output
ev_hk_df = hk.hk2df(hk_file)
#summarize Hardklor DataFrame
summed_ev_hk = hk.summarize_df(ev_hk_df, full_ms1_df=ev_ms1_df)
# find Hardklor retention time/charge/mass match in encyclopeDIA results
ev_hk_df["in_encyclo"] = ev_hk_df.apply(encyclo.match_hk, axis=1, other_df=ev_encyclo_df)
# create DataFrame of only identified features
id_ev_hk_df = ev_hk_df[ev_hk_df["in_encyclo"] > 0].reset_index(drop=True)
# summarize identified features DataFrame
sum_id_ev_hk_df = hk.summarize_df(id_ev_hk_df, full_ms1_df=ev_ms1_df)
# join DataFrames
joined_EV = pd.merge(ev_ms1_df, sum_id_ev_hk_df, how="outer", on=["scan_num"], suffixes=("_ms1","_id"))
# find total ion current across all
print("Total Ion Current (TIC): %.0f" % sum(ev_ms1_df.TIC))
# find identified total ion current
print("ID'd TIC: %.0f" % sum(sum_id_ev_hk_df.TIC))
# calculate ratio of identified ion current to total ion current
print("%.1f%% of the signal" % float(sum(sum_id_ev_hk_df.TIC)/sum(ev_ms1_df.TIC)*100))
# create figure and axis objects with subplots()
fig, ax = plt.subplots(figsize=(10,8))
# make a TIC plot
ax.plot(joined_EV.rt_ms1, joined_EV.TIC_ms1/1e10, color="black")
ax.plot(joined_EV.rt_ms1, joined_EV.TIC_id/1e10, color="#1f77b4")
# set x-axis parameters
ax.set_xlabel("Retention Time (min)", fontsize=18)
ax.set_xlim(0,100)
plt.xticks(fontsize=14)
# set y-axis parameters
ax.set_ylabel("Ions/Second ($10^{10}$)",fontsize=18)
ax.set_ylim(0,4.5)
ax.set_yticks(np.arange(1,5,1))
plt.yticks(fontsize=14)
# despine top and right
sns.despine()
# show plot
plt.show()
[docs]def fig2d(mzml_file, hk_file, elib_file):
"""
Creates the Figure 2d plot of total ions
and identified ions.
Also prints the values and percentage identified.
Parameters
----------
mzml_file : str
The input mzML file.
hk_file : str
The input Hardklor file.
elib_file : str
The input EncyclopeDIA elib file.
Examples
-------
>>> from single_mol import fig2d
>>> fig2d("test.mzML", "test.hk", "test.elib")
"""
# create DataFrame from elib
ev_encyclo_df = encyclo.dia_df(elib_file)
# create DataFrame from mzML
ev_ms1_df = mzml.tic_df(mzml_file)
# create DataFrame from Hardklor output
ev_hk_df = hk.hk2df(hk_file)
#summarize Hardklor DataFrame
summed_ev_hk = hk.summarize_df(ev_hk_df, full_ms1_df=ev_ms1_df)
# find Hardklor retention time/charge/mass match in encyclopeDIA results
ev_hk_df["in_encyclo"] = ev_hk_df.apply(encyclo.match_hk, axis=1, other_df=ev_encyclo_df)
# create DataFrame of only identified features
id_ev_hk_df = ev_hk_df[ev_hk_df["in_encyclo"] > 0].reset_index(drop=True)
# summarize identified features DataFrame
sum_id_ev_hk_df = hk.summarize_df(id_ev_hk_df, full_ms1_df=ev_ms1_df)
# join DataFrames
joined_EV = pd.merge(ev_ms1_df, sum_id_ev_hk_df, how="outer", on=["scan_num"], suffixes=("_ms1","_id"))
# find total ions across all
print("Total # of ions: %.0f" % sum(ev_ms1_df.ions))
# find identified ions
print("Ions mapped to peptides: %.0f" % sum(sum_id_ev_hk_df.ions))
# calculate ratio of identified ions to total ions
print("%.1f%% of the signal" % float(sum(sum_id_ev_hk_df.ions)/sum(ev_ms1_df.ions)*100))
# create figure and axis objects with subplots()
fig, ax = plt.subplots(figsize=(10,8))
# make a plot for ion counts
ax.plot(joined_EV.rt_ms1, joined_EV.ions_ms1/1e6, color="black")
ax.plot(joined_EV.rt_ms1, joined_EV.ions_id/1e6, color="#1f77b4")
# set x-axis parameters
ax.set_xlabel("Retention Time (min)", fontsize=18)
ax.set_xlim(0,100)
plt.xticks(fontsize=14)
# set y-axis parameters
ax.set_ylabel("Ions ($10^{6}$)",fontsize=18)
ax.set_ylim(0,6.1)
ax.set_yticks(np.arange(1,7,1))
ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))
plt.yticks(fontsize=14)
# despine top and right
sns.despine()
# show plot
plt.show()