private String sirToString(SpectrumIdentificationResult sir) { String sirString = ""; SpectraData spectraData = spectraDataIdHashMap.get(sir.getSpectraDataRef()); sirString += spectraData.getLocation() + sep + "\"" + sir.getSpectrumID() + "\""; Double rtInSeconds = -1.0; String spectrumTitle = ""; // <cvParam accession="MS:1001114" name="retention time(s)" cvRef="PSI-MS" value="3488.676" // unitAccession="UO:0000010" unitName="second" unitCvRef="UO" /> // <cvParam accession="MS:1000796" name="spectrum title" cvRef="PSI-MS" // value="mam_050108o_CPTAC_study6_6E004.6805.6805.1" /> // for (CvParam cvParam : sir.getCvParam()) { // Updated by FG: checking for old CV param 1114 or newer correct CV term 16. if (cvParam.getAccession().equals("MS:1001114") || cvParam.getAccession().equals("MS:1000016")) { if (cvParam.getUnitAccession().equals("UO:0000010")) { rtInSeconds = Double.parseDouble(cvParam.getValue()); } else if (cvParam.getUnitAccession().equals("UO:0000031")) { rtInSeconds = Double.parseDouble(cvParam.getValue()) / 60; // Convert minutes to seconds } else { System.out.println("Error parsing RT - unit not recognised"); } } if (cvParam.getAccession().equals("MS:1000796")) { spectrumTitle = cvParam.getValue(); } } sirString += sep + "\"" + spectrumTitle + "\"" + sep + rtInSeconds; return sirString; }
private void init(String outputFile, String exportOption) { Writer out = null; try { out = new BufferedWriter(new FileWriter(outputFile)); // Read all the objects we will need into hashes that are not automatically resolved by object // reference if (isVerbose) { System.out.print("About to iterate over PepEvid..."); } Iterator<PeptideEvidence> iterPeptideEvidence = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.PeptideEvidence); while (iterPeptideEvidence.hasNext()) { PeptideEvidence peptideEvidence = iterPeptideEvidence.next(); peptideEvidenceIdHashMap.put(peptideEvidence.getId(), peptideEvidence); } if (isVerbose) { System.out.println("...done"); System.out.print("About to iterate over Peptide"); } Iterator<Peptide> iterPeptide = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.Peptide); while (iterPeptide.hasNext()) { Peptide peptide = iterPeptide.next(); peptideIdHashMap.put(peptide.getId(), peptide); } if (isVerbose) { System.out.println("...done"); System.out.print("About to iterate over Spectra Data"); } Iterator<SpectraData> iterSpectraData = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.SpectraData); while (iterSpectraData.hasNext()) { SpectraData spectraData = iterSpectraData.next(); spectraDataIdHashMap.put(spectraData.getId(), spectraData); } if (isVerbose) { System.out.println("...done"); System.out.print("About to iterate over DBsequence"); } Iterator<DBSequence> iterDBSequence = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.DBSequence); while (iterDBSequence.hasNext()) { DBSequence dbSequence = iterDBSequence.next(); dbSequenceIdHashMap.put(dbSequence.getId(), dbSequence); } if (isVerbose) { System.out.println("...done"); System.out.print("About to iterate over PDH"); } Iterator<ProteinDetectionHypothesis> iterPDH = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.ProteinDetectionHypothesis); Integer pCounter = 0; while (iterPDH.hasNext()) { ProteinDetectionHypothesis pdh = iterPDH.next(); pdhIdHashMap.put(pdh.getId(), pdh); for (CvParam cvParam : pdh.getCvParam()) { if (cvParam.getAccession().equals("MS:1001591") || cvParam.getAccession().equals("MS:1001592") || cvParam.getAccession().equals("MS:1001593") || cvParam.getAccession().equals("MS:1001594") || cvParam.getAccession().equals("MS:1001595") || cvParam.getAccession().equals("MS:1001596") || cvParam.getAccession().equals("MS:1001597") || cvParam.getAccession().equals("MS:1001598") || cvParam .getAccession() .equals("MS:1001599")) { // do nothing - these are specifically handled // ToDO this code could be improved using an array of values... } else if (cvParam.getValue() != null) { if (!columnToProtScoreMap.containsValue(cvParam.getName())) { columnToProtScoreMap.put(pCounter, cvParam.getName()); pCounter++; } } } for (UserParam userParam : pdh.getUserParam()) { if (!columnToProtScoreMap.containsValue(userParam.getName())) { columnToProtScoreMap.put(pCounter, userParam.getName()); pCounter++; } } } for (int i = 0; i < pCounter; i++) { pScoreHeader += columnToProtScoreMap.get(i) + sep; } // Now let's see what scores we have in the file // TODO - I'm not sure this is the fastest way to parse the files; these are unmarshalled // again below - inefficient? // Iterator<SpectrumIdentificationItem> iterSII = // unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.SpectrumIdentificationItem); Integer counter = 0; if (isVerbose) { System.out.println("...done"); System.out.print("About to iterate over SIR"); } Iterator<SpectrumIdentificationResult> iterSIR = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.SpectrumIdentificationResult); List<SpectrumIdentificationResult> sirList = new ArrayList<>(); while (iterSIR.hasNext()) { SpectrumIdentificationResult sir = iterSIR.next(); sirList.add(sir); List<SpectrumIdentificationItem> listSII = sir.getSpectrumIdentificationItem(); for (SpectrumIdentificationItem sii : listSII) { siiIdHashMap.put(sii.getId(), sii); siiIdToSirHashMap.put(sii.getId(), sir); for (CvParam cvParam : sii.getCvParam()) { if (cvParam.getValue() != null) { if (!columnToScoreMap.containsValue(cvParam.getName())) { columnToScoreMap.put(counter, cvParam.getName()); counter++; } } } } } for (int i = 0; i < counter; i++) { scoreHeader += sep + columnToScoreMap.get(i); } if (isVerbose) { System.out.println("...done"); System.out.print("About to create output"); } if (exportOption.equals("exportPSMs")) { out.write(spectrumHeader + psmHeader + scoreHeader); out.write(endPsmHeader + "\n"); // Iterator<SpectrumIdentificationResult> iterSIR = // unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.SpectrumIdentificationResult); for (SpectrumIdentificationResult sir : sirList) { String sirLine = sirToString(sir); List<SpectrumIdentificationItem> listSII = sir.getSpectrumIdentificationItem(); for (SpectrumIdentificationItem sii : listSII) { out.write(sirLine + sep + siiToString(sii) + "\n"); } } } else if (exportOption.equals("exportProteinGroups")) { out.write(pagHeader); out.write(pScoreHeader); out.write(spectrumHeader + psmHeader + scoreHeader); out.write(endPsmHeader + "\n"); Iterator<ProteinAmbiguityGroup> iterPAG = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.ProteinAmbiguityGroup); while (iterPAG.hasNext()) { ProteinAmbiguityGroup pag = iterPAG.next(); String pagLine = pagToString(pag); // handle PDHs for (ProteinDetectionHypothesis pdh : pag.getProteinDetectionHypothesis()) { String pdhLine = pagLine; pdhLine += pdhToString(pdh); for (PeptideHypothesis pepH : pdh.getPeptideHypothesis()) { List<SpectrumIdentificationItemRef> siiRefList = pepH.getSpectrumIdentificationItemRef(); for (SpectrumIdentificationItemRef siiRef : siiRefList) { SpectrumIdentificationResult sir = siiIdToSirHashMap.get(siiRef.getSpectrumIdentificationItemRef()); SpectrumIdentificationItem sii = siiIdHashMap.get(siiRef.getSpectrumIdentificationItemRef()); out.write(pdhLine + sirToString(sir) + sep + siiToString(sii) + "\n"); } } } } } else if (exportOption.equals("exportRepProteinPerPAGOnly")) { out.write(pagHeader); out.write(pScoreHeader); out.write("\n"); Iterator<ProteinAmbiguityGroup> iterPAG = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.ProteinAmbiguityGroup); while (iterPAG.hasNext()) { ProteinAmbiguityGroup pag = iterPAG.next(); String pagLine = pagToString(pag); ProteinDetectionHypothesis repPdh = getRepresentativePDH(pag, representativeProteinAcc); String pdhLine = pagLine; if (repPdh != null) { pdhLine += pdhToString(repPdh); } out.write(pdhLine + "\n"); } } else if (exportOption.equals( "exportProteoAnnotator")) { // Added by Fawaz Ghali 13/05/2014 exportProteoAnnotator out.write(pagHeader); out.write(pScoreHeader); // Added by Fawaz Ghali 13/05/2014 exportProteoAnnotator out.write(exportProteoAnnotatorHeader); out.write("\n"); Iterator<ProteinAmbiguityGroup> iterPAG = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.ProteinAmbiguityGroup); while (iterPAG.hasNext()) { ProteinAmbiguityGroup pag = iterPAG.next(); String pagLine = pagToString(pag); ProteinDetectionHypothesis repPdh = getRepresentativePDH(pag, representativeProteinAcc); String pdhLine = pagLine; if (repPdh != null) { pdhLine += pdhToString(repPdh); } // Added by Fawaz Ghali 13/05/2014 exportProteoAnnotator String proteoAnnotatorLine = pdhLine; proteoAnnotatorLine = proteoAnnotatorLine + proteoAnnotatorLineToString(pag); out.write(proteoAnnotatorLine + "\n"); } } else if (exportOption.equals("exportProteinsOnly")) { out.write(pagHeader); out.write(pScoreHeader); out.write("\n"); Iterator<ProteinAmbiguityGroup> iterPAG = unmarshaller.unmarshalCollectionFromXpath(MzIdentMLElement.ProteinAmbiguityGroup); while (iterPAG.hasNext()) { ProteinAmbiguityGroup pag = iterPAG.next(); String pagLine = pagToString(pag); // handle PDHs for (ProteinDetectionHypothesis pdh : pag.getProteinDetectionHypothesis()) { String pdhLine = pagLine; pdhLine += pdhToString(pdh); out.write(pdhLine + "\n"); } } } else { System.out.println( "Error - correct usage MzIdentMLToCSV inputFile outputFile -exportType [exportProteinGroups|exportPSMs|exportProteinsOnly]"); } out.close(); System.out.println("Output written to " + outputFile); } catch (IOException ex) { String methodName = Thread.currentThread().getStackTrace()[1].getMethodName(); String className = this.getClass().getName(); String message = "The task \"" + methodName + "\" in the class \"" + className + "\" was not completed because of " + ex.getMessage() + "." + "\nPlease see the reference guide at 02 for more information on this error. https://code.google.com/p/mzidentml-lib/wiki/CommonErrors "; System.out.println(message); } finally { try { out.close(); } catch (IOException ex) { String methodName = Thread.currentThread().getStackTrace()[1].getMethodName(); String className = this.getClass().getName(); String message = "The task \"" + methodName + "\" in the class \"" + className + "\" was not completed because of " + ex.getMessage() + "." + "\nPlease see the reference guide at 02 for more information on this error. https://code.google.com/p/mzidentml-lib/wiki/CommonErrors "; System.out.println(message); } } }
/** * Parses the data from an mzIdentML file given by its name into the given {@link PIACompiler}. * * @param fileName name of the mzTab file */ public static boolean getDataFromMascotDatFile( String name, String fileName, PIACompiler compiler) { // need to parse through the file, as mascotdatfile (3.2.11) does not support // - the "index" variable of the queries // - the "fastafile" // - no good information for enzyme Map<String, String> queryIndexMap = new HashMap<>(); String fastaFile = null; String enzymeCleavage = null; String enzymeRestrict = null; try (BufferedReader rd = new BufferedReader(new FileReader(fileName))) { String line; boolean inQuery = false; boolean inEnzyme = false; String queryName = null; while ((line = rd.readLine()) != null) { if (!inQuery) { if (line.startsWith("Content-Type: application/x-Mascot; NAME=\"query")) { queryName = line.substring(42, line.length() - 1); inQuery = true; } else if ((fastaFile == null) && line.startsWith("fastafile")) { fastaFile = line.substring(10); } } else if (inQuery && line.startsWith("index=")) { queryIndexMap.put(queryName, line); inQuery = false; } if (!inEnzyme) { if (((enzymeCleavage == null) || (enzymeRestrict == null)) && line.startsWith("Content-Type: application/x-Mascot; NAME=\"enzyme\"")) { inEnzyme = true; } } else { if (line.startsWith("Cleavage:")) { enzymeCleavage = line.substring(9).trim(); } else if (line.startsWith("Restrict:")) { enzymeRestrict = line.substring(9).trim(); } else if (line.startsWith("Content-Type:")) { inEnzyme = false; } } } rd.close(); } catch (IOException e) { LOGGER.error("could not read '" + fileName + "' for index parsing.", e); return false; } MascotDatfileInf mascotFile = MascotDatfileFactory.create(fileName, MascotDatfileType.MEMORY); if (mascotFile == null) { LOGGER.error("could not read '" + fileName + "'."); return false; } PIAInputFile file = compiler.insertNewFile( name, fileName, InputFileParserFactory.InputFileTypes.MASCOT_DAT_INPUT.getFileSuffix()); // create the analysis software and add it to the compiler AnalysisSoftware mascot = new AnalysisSoftware(); mascot.setId("mascot"); mascot.setName("mascot"); mascot.setUri("http://www.matrixscience.com/"); mascot.setVersion(mascotFile.getHeaderSection().getVersion()); Param param = new Param(); param.setParam(MzIdentMLTools.createPSICvParam(OntologyConstants.MASCOT, null)); mascot.setSoftwareName(param); mascot = compiler.putIntoSoftwareMap(mascot); // create the searchDatabase and add it to the compiler SearchDatabase searchDatabase = new SearchDatabase(); // required searchDatabase.setId("mascotDB"); searchDatabase.setLocation(fastaFile); // optional searchDatabase.setName(mascotFile.getParametersSection().getDatabase()); searchDatabase.setNumDatabaseSequences(mascotFile.getHeaderSection().getSequences()); searchDatabase.setNumResidues(mascotFile.getHeaderSection().getResidues()); // fileformat FileFormat fileFormat = new FileFormat(); fileFormat.setCvParam(MzIdentMLTools.createPSICvParam(OntologyConstants.FASTA_FORMAT, null)); searchDatabase.setFileFormat(fileFormat); // databaseName param = new Param(); param.setParam( MzIdentMLTools.createUserParam(mascotFile.getHeaderSection().getRelease(), null, "string")); searchDatabase.setDatabaseName(param); // add searchDB to the compiler searchDatabase = compiler.putIntoSearchDatabasesMap(searchDatabase); // add the spectraData (input file) SpectraData spectraData = null; if ((mascotFile.getParametersSection().getFile() != null) && (mascotFile.getParametersSection().getFile().trim().length() > 0)) { spectraData = new SpectraData(); spectraData.setId("mascotInput"); spectraData.setLocation(mascotFile.getParametersSection().getFile()); if ((mascotFile.getParametersSection().getFormat() != null) && "Mascot generic".equals(mascotFile.getParametersSection().getFormat())) { fileFormat = new FileFormat(); fileFormat.setCvParam( MzIdentMLTools.createPSICvParam(OntologyConstants.MASCOT_MGF_FORMAT, null)); spectraData.setFileFormat(fileFormat); SpectrumIDFormat idFormat = new SpectrumIDFormat(); idFormat.setCvParam( MzIdentMLTools.createPSICvParam( OntologyConstants.MULTIPLE_PEAK_LIST_NATIVEID_FORMAT, null)); spectraData.setSpectrumIDFormat(idFormat); } spectraData = compiler.putIntoSpectraDataMap(spectraData); } else { LOGGER.warn("The source file (MGF) was not recorded in the file!"); } // define the spectrumIdentificationProtocol SpectrumIdentificationProtocol spectrumIDProtocol = new SpectrumIdentificationProtocol(); spectrumIDProtocol.setId("mascotAnalysis"); spectrumIDProtocol.setAnalysisSoftware(mascot); param = new Param(); if ("MIS".equals(mascotFile.getParametersSection().getSearch())) { param.setParam(MzIdentMLTools.createPSICvParam(OntologyConstants.MS_MS_SEARCH, null)); } // TODO: add error on PMF query (not usable for PIA) // TODO: and sequence query spectrumIDProtocol.setSearchType(param); ParamList paramList = new ParamList(); paramList .getCvParam() .add( MzIdentMLTools.createPSICvParam( OntologyConstants.MASCOT_INSTRUMENT, mascotFile.getParametersSection().getInstrument())); paramList .getUserParam() .add( MzIdentMLTools.createUserParam( "Mascot User Comment", mascotFile.getParametersSection().getCom(), "string")); if ("Monoisotopic".equalsIgnoreCase(mascotFile.getParametersSection().getMass())) { paramList .getCvParam() .add(MzIdentMLTools.createPSICvParam(OntologyConstants.FRAGMENT_MASS_TYPE_MONO, null)); paramList .getCvParam() .add(MzIdentMLTools.createPSICvParam(OntologyConstants.PARENT_MASS_TYPE_MONO, null)); } else { paramList .getCvParam() .add(MzIdentMLTools.createPSICvParam(OntologyConstants.FRAGMENT_MASS_TYPE_AVERAGE, null)); paramList .getCvParam() .add(MzIdentMLTools.createPSICvParam(OntologyConstants.PARENT_MASS_TYPE_AVERAGE, null)); } spectrumIDProtocol.setAdditionalSearchParams(paramList); ModificationParams modParams = new ModificationParams(); for (Object objMod : mascotFile.getModificationList().getVariableModifications()) { modParams .getSearchModification() .add(createPSIModification((VariableModification) objMod, compiler.getUnimodParser())); } for (Object objMod : mascotFile.getModificationList().getFixedModifications()) { modParams .getSearchModification() .add(createPSIModification((FixedModification) objMod, compiler.getUnimodParser())); } spectrumIDProtocol.setModificationParams(modParams); Enzymes enzymes = new Enzymes(); spectrumIDProtocol.setEnzymes(enzymes); if (enzymeCleavage != null) { Enzyme enzyme = new Enzyme(); enzyme.setId("enzyme"); enzyme.setMissedCleavages(Integer.parseInt(mascotFile.getParametersSection().getPFA())); StringBuilder regExp = new StringBuilder(); if (enzymeRestrict == null) { regExp.append("(?=["); regExp.append(enzymeCleavage); regExp.append("])"); } else { regExp.append("(?<=["); regExp.append(enzymeCleavage); regExp.append("])(?!"); regExp.append(enzymeRestrict); regExp.append(")"); } enzyme.setSiteRegexp(regExp.toString()); enzymes.getEnzyme().add(enzyme); } Tolerance tolerance = new Tolerance(); AbstractParam abstractParam = MzIdentMLTools.createPSICvParam( OntologyConstants.SEARCH_TOLERANCE_PLUS_VALUE, mascotFile.getParametersSection().getITOL()); MzIdentMLTools.setUnitParameterFromString( mascotFile.getParametersSection().getITOLU(), abstractParam); tolerance.getCvParam().add((CvParam) abstractParam); abstractParam = MzIdentMLTools.createPSICvParam( OntologyConstants.SEARCH_TOLERANCE_MINUS_VALUE, mascotFile.getParametersSection().getITOL()); MzIdentMLTools.setUnitParameterFromString( mascotFile.getParametersSection().getITOLU(), abstractParam); tolerance.getCvParam().add((CvParam) abstractParam); spectrumIDProtocol.setFragmentTolerance(tolerance); tolerance = new Tolerance(); abstractParam = MzIdentMLTools.createPSICvParam( OntologyConstants.SEARCH_TOLERANCE_PLUS_VALUE, mascotFile.getParametersSection().getTOL()); MzIdentMLTools.setUnitParameterFromString( mascotFile.getParametersSection().getTOLU(), abstractParam); tolerance.getCvParam().add((CvParam) abstractParam); abstractParam = MzIdentMLTools.createPSICvParam( OntologyConstants.SEARCH_TOLERANCE_MINUS_VALUE, mascotFile.getParametersSection().getTOL()); MzIdentMLTools.setUnitParameterFromString( mascotFile.getParametersSection().getTOLU(), abstractParam); tolerance.getCvParam().add((CvParam) abstractParam); spectrumIDProtocol.setParentTolerance(tolerance); // no threshold set, take all PSMs from the dat file paramList = new ParamList(); paramList .getCvParam() .add(MzIdentMLTools.createPSICvParam(OntologyConstants.NO_THRESHOLD, null)); spectrumIDProtocol.setThreshold(paramList); file.addSpectrumIdentificationProtocol(spectrumIDProtocol); // add the spectrum identification SpectrumIdentification spectrumID = new SpectrumIdentification(); spectrumID.setId("mascotIdentification"); spectrumID.setSpectrumIdentificationList(null); spectrumID.setSpectrumIdentificationProtocol(spectrumIDProtocol); if (spectraData != null) { InputSpectra inputSpectra = new InputSpectra(); inputSpectra.setSpectraData(spectraData); spectrumID.getInputSpectra().add(inputSpectra); } SearchDatabaseRef searchDBRef = new SearchDatabaseRef(); searchDBRef.setSearchDatabase(searchDatabase); spectrumID.getSearchDatabaseRef().add(searchDBRef); file.addSpectrumIdentification(spectrumID); // get the mappings QueryEnumerator queryEnumerator = mascotFile.getQueryEnumerator(); QueryToPeptideMapInf queryToPeptideMap = mascotFile.getQueryToPeptideMap(); QueryToPeptideMapInf decoyQueryToPeptideMap = mascotFile.getDecoyQueryToPeptideMap(false); ProteinMap proteinMap = mascotFile.getProteinMap(); ProteinMap decoyProteinMap = mascotFile.getDecoyProteinMap(); // one query is one spectrum, so go through the queries int nrQueries = mascotFile.getNumberOfQueries(); int nrQueriesDone = 0; LOGGER.debug("queries in file: " + nrQueries); while (queryEnumerator.hasMoreElements()) { Query currQuery = queryEnumerator.nextElement(); int charge; try { if (currQuery.getChargeString() == null) { charge = 0; } else if (currQuery.getChargeString().contains("-")) { charge = -Integer.parseInt(currQuery.getChargeString().replace("-", "")); } else { // we assume, it is positively charged charge = Integer.parseInt(currQuery.getChargeString().replace("+", "")); } } catch (NumberFormatException e) { charge = 0; LOGGER.warn( "could not parse charge '" + currQuery.getChargeString() + "' for '" + currQuery.getTitle() + "'"); } double precursorMZ = currQuery.getPrecursorMZ(); Double retentionTime; if (currQuery.getRetentionTimeInSeconds() != null) { retentionTime = Double.parseDouble(currQuery.getRetentionTimeInSeconds()); } else { retentionTime = null; } String spectrumTitle = currQuery.getTitle(); String index = queryIndexMap.get("query" + currQuery.getQueryNumber()); // add the target identifications if (queryToPeptideMap != null) { List<PeptideHit> peptideHits = queryToPeptideMap.getAllPeptideHits(currQuery.getQueryNumber()); insertPeptideHitsIntoCompiler( compiler, peptideHits, proteinMap, searchDatabase, charge, precursorMZ, retentionTime, index, spectrumTitle, file, spectrumID, false); } // add the decoy identifications if (decoyQueryToPeptideMap != null) { List<PeptideHit> peptideHits = decoyQueryToPeptideMap.getAllPeptideHits(currQuery.getQueryNumber()); insertPeptideHitsIntoCompiler( compiler, peptideHits, decoyProteinMap, searchDatabase, charge, precursorMZ, retentionTime, index, spectrumTitle, file, spectrumID, true); } nrQueriesDone++; if (nrQueriesDone % 10000 == 0) { LOGGER.debug( "done " + nrQueriesDone + " / " + nrQueries + String.format(" (%1$.4f%%)", 100.0 * nrQueriesDone / nrQueries)); } } mascotFile.finish(); return true; }