@@ -30,7 +30,7 @@ class _SpectrumFileHandler:
3030 This class uses the pyteomics library for parsing MGF and mzML files.
3131 Parsed spectra are stored as `rustyms.RawSpectrum` objects.
3232 """
33-
33+
3434 def __init__ (self , spectrum_file : str ):
3535 self .spectrum_file = spectrum_file
3636 self .spectra = {} # Initialize an empty dictionary to hold the spectra
@@ -46,7 +46,6 @@ def __init__(self, spectrum_file: str):
4646 else :
4747 raise ValueError ("Unsupported file format. Only MGF and mzML are supported." )
4848
49-
5049 def _parse_mgf (self ):
5150 """
5251 Parse an MGF (Mascot Generic Format) file and store each spectrum as a RawSpectrum object.
@@ -61,31 +60,37 @@ def _parse_mgf(self):
6160 try :
6261 with mgf .MGF (self .spectrum_file ) as spectra :
6362 for spectrum in spectra :
64- spectrum_id = spectrum ['params' ].get ('title' , 'Unknown' ) # Extract spectrum ID from the MGF params
65- precursor_mass = spectrum ['params' ].get ('pepmass' , [None ])[0 ] # Extract precursor mass
66-
63+ spectrum_id = spectrum ["params" ].get (
64+ "title" , "Unknown"
65+ ) # Extract spectrum ID from the MGF params
66+ precursor_mass = spectrum ["params" ].get ("pepmass" , [None ])[
67+ 0
68+ ] # Extract precursor mass
69+
6770 # Extract retention time
6871 rt = 0.0
69- if ' rtinseconds' in spectrum [' params' ]:
70- rt = float (spectrum [' params' ][ ' rtinseconds' ])
71- elif ' retention time' in spectrum [' params' ]:
72- rt = float (spectrum [' params' ][ ' retention time' ])
72+ if " rtinseconds" in spectrum [" params" ]:
73+ rt = float (spectrum [" params" ][ " rtinseconds" ])
74+ elif " retention time" in spectrum [" params" ]:
75+ rt = float (spectrum [" params" ][ " retention time" ])
7376
7477 # Extract precursor charge
7578 precursor_charge = 0
76- if 'charge' in spectrum ['params' ]:
77- charge_str = spectrum ['params' ]['charge' ]
78- precursor_charge = int (charge_str .strip ('+' )) # Remove '+' and convert to int
79+ if "charge" in spectrum ["params" ]:
80+ charge_str = spectrum ["params" ]["charge" ]
81+ precursor_charge = int (
82+ charge_str .strip ("+" )
83+ ) # Remove '+' and convert to int
7984
8085 # Create a RawSpectrum object using required fields and additional attributes
8186 self .spectra [spectrum_id ] = RawSpectrum (
82- title = spectrum_id ,
83- num_scans = len (spectrum [' m/z array' ]),
87+ title = spectrum_id ,
88+ num_scans = len (spectrum [" m/z array" ]),
8489 rt = rt ,
8590 precursor_charge = precursor_charge ,
86- mz_array = np .array (spectrum [' m/z array' ]),
87- intensity_array = np .array (spectrum [' intensity array' ]),
88- precursor_mass = precursor_mass
91+ mz_array = np .array (spectrum [" m/z array" ]),
92+ intensity_array = np .array (spectrum [" intensity array" ]),
93+ precursor_mass = precursor_mass ,
8994 )
9095 logging .info (f"Parsed { len (self .spectra )} spectra from { self .spectrum_file } " )
9196 except Exception as e :
@@ -105,36 +110,40 @@ def _parse_mzml(self):
105110 try :
106111 with mzml .MzML (self .spectrum_file ) as spectra :
107112 for spectrum in spectra :
108- spectrum_id = spectrum .get ('id' , None ) # Get the spectrum ID from the mzML spectrum
113+ spectrum_id = spectrum .get (
114+ "id" , None
115+ ) # Get the spectrum ID from the mzML spectrum
109116 precursor_mass = 0.0
110117 precursor_charge = 0
111118 rt = 0.0
112119
113120 # Extract precursor mass and charge if available
114- if ' precursorList' in spectrum and spectrum [' precursorList' ]:
115- precursor = spectrum [' precursorList' ][ ' precursor' ][0 ]
116- if ' selectedIonList' in precursor :
117- selected_ion = precursor [' selectedIonList' ][ ' selectedIon' ][0 ]
118- precursor_mass = selected_ion .get (' selected ion m/z' , 0.0 )
119- precursor_charge = int (selected_ion .get (' charge state' , 0 ))
121+ if " precursorList" in spectrum and spectrum [" precursorList" ]:
122+ precursor = spectrum [" precursorList" ][ " precursor" ][0 ]
123+ if " selectedIonList" in precursor :
124+ selected_ion = precursor [" selectedIonList" ][ " selectedIon" ][0 ]
125+ precursor_mass = selected_ion .get (" selected ion m/z" , 0.0 )
126+ precursor_charge = int (selected_ion .get (" charge state" , 0 ))
120127
121128 # Extract retention time
122- if 'scanList' in spectrum and spectrum ['scanList' ]:
123- scan = spectrum ['scanList' ]['scan' ][0 ]
124- for cv_param in scan .get ('cvParam' , []):
125- if cv_param .get ('accession' ) == 'MS:1000016' : # accession for scan start time
126- rt = float (cv_param .get ('value' , 0.0 ))
129+ if "scanList" in spectrum and spectrum ["scanList" ]:
130+ scan = spectrum ["scanList" ]["scan" ][0 ]
131+ for cv_param in scan .get ("cvParam" , []):
132+ if (
133+ cv_param .get ("accession" ) == "MS:1000016"
134+ ): # accession for scan start time
135+ rt = float (cv_param .get ("value" , 0.0 ))
127136 break
128137
129138 # Create a RawSpectrum object using required fields and additional attributes
130139 self .spectra [spectrum_id ] = RawSpectrum (
131140 title = spectrum_id ,
132- num_scans = len (spectrum [' m/z array' ]),
141+ num_scans = len (spectrum [" m/z array" ]),
133142 rt = rt ,
134143 precursor_charge = precursor_charge ,
135- mz_array = np .array (spectrum [' m/z array' ]),
136- intensity_array = np .array (spectrum [' intensity array' ]),
137- precursor_mass = precursor_mass
144+ mz_array = np .array (spectrum [" m/z array" ]),
145+ intensity_array = np .array (spectrum [" intensity array" ]),
146+ precursor_mass = precursor_mass ,
138147 )
139148 logging .info (f"Parsed { len (self .spectra )} spectra from { self .spectrum_file } " )
140149 except Exception as e :
@@ -143,10 +152,10 @@ def _parse_mzml(self):
143152 def get_spectrum_from_psm (self , psm : PSM ):
144153 """
145154 Retrieve a RawSpectrum for a PSM by its ID.
146-
155+
147156 Args:
148157 psm (PSM): psm object
149-
158+
150159 Returns:
151160 RawSpectrum: The retrieved spectrum or None if not found.
152161 """
@@ -155,10 +164,10 @@ def get_spectrum_from_psm(self, psm: PSM):
155164 def get_spectra_from_psm_list (self , psmList : PSMList ):
156165 """
157166 Retrieve all spectra for a PSMList.
158-
167+
159168 Args:
160169 psmList (PSMList): A list of PSM objects.
161-
170+
162171 Returns:
163172 list: A list of RawSpectrum objects corresponding to the PSMs.
164173 None is included for any spectra not found.
@@ -168,7 +177,7 @@ def get_spectra_from_psm_list(self, psmList: PSMList):
168177 def get_all_spectra (self ):
169178 """
170179 Retrieve all parsed spectra.
171-
180+
172181 Returns:
173182 dict: A dictionary of all parsed spectra, where keys are spectrum IDs
174183 and values are RawSpectrum objects.
@@ -180,12 +189,12 @@ class _MetadataParser:
180189 """
181190 Class to parse metadata files (CSV/TSV) containing PSM information.
182191 """
183-
192+
184193 @staticmethod
185194 def parse_csv_file (file_name : str , delimiter : str = "\t " ) -> list :
186195 """
187196 Parse a CSV or TSV file containing PSM information and create PSM objects.
188-
197+
189198 Args:
190199 file_name (str): Path to the CSV or TSV file.
191200 delimiter (str, optional): Delimiter used in the file. Defaults to "\t ".
@@ -200,11 +209,11 @@ def parse_csv_file(file_name: str, delimiter: str = "\t") -> list:
200209 pd.errors.ParserError: If there's an error parsing the file.
201210
202211 Notes:
203- The file must contain at least the following columns:
212+ The file must contain at least the following columns:
204213 'peptidoform', 'spectrum_id', and 'precursor_mz'.
205214 If any of these columns are missing, an error is logged and an empty list is returned.
206215 """
207-
216+
208217 try :
209218 df = pd .read_csv (file_name , delimiter = delimiter )
210219 except FileNotFoundError as e :
@@ -228,7 +237,11 @@ def parse_csv_file(file_name: str, delimiter: str = "\t") -> list:
228237
229238 # Create a list of PSM objects from the DataFrame rows
230239 peptidoforms = [
231- PSM (peptidoform = row ["peptidoform" ], spectrum_id = row ["spectrum_id" ], precursor_mz = row ["precursor_mz" ])
240+ PSM (
241+ peptidoform = row ["peptidoform" ],
242+ spectrum_id = row ["spectrum_id" ],
243+ precursor_mz = row ["precursor_mz" ],
244+ )
232245 for _ , row in df .iterrows ()
233246 ]
234247
0 commit comments