Exemplo n.º 1
0
  /**
   * Parses a Mascot DAT file and inserts its search settings and peptide hits into the given
   * {@link PIACompiler}.
   *
   * <p>The file is processed in two passes. It is first scanned line by line to collect the
   * per-query "index" entries, the "fastafile" entry and the enzyme's "Cleavage:" / "Restrict:"
   * entries. Afterwards it is opened through {@link MascotDatfileFactory} to obtain the header,
   * parameters, modifications and the queries with their target and decoy peptide hits.
   *
   * @param name name passed to {@code compiler.insertNewFile} together with the file path
   * @param fileName path of the Mascot DAT file
   * @param compiler the compiler which receives the parsed data
   * @return {@code true} if the file was processed completely, {@code false} if it could not be
   *     read
   */
  public static boolean getDataFromMascotDatFile(
      String name, String fileName, PIACompiler compiler) {

    // need to parse through the file, as mascotdatfile (3.2.11) does not support
    //   - the "index" variable of the queries
    //   - the "fastafile"
    //   - no good information for enzyme
    Map<String, String> queryIndexMap = new HashMap<>();
    String fastaFile = null;

    String enzymeCleavage = null;
    String enzymeRestrict = null;

    // the reader is closed by the try-with-resources statement, no explicit close() required
    try (BufferedReader rd = new BufferedReader(new FileReader(fileName))) {
      String line;

      boolean inQuery = false;
      boolean inEnzyme = false;
      String queryName = null;

      while ((line = rd.readLine()) != null) {
        if (!inQuery) {
          if (line.startsWith("Content-Type: application/x-Mascot; NAME=\"query")) {
            // 42 is the length of the prefix up to and including the opening quote, the
            // closing quote at the end of the line is cut off as well
            queryName = line.substring(42, line.length() - 1);
            inQuery = true;
          } else if ((fastaFile == null) && line.startsWith("fastafile=")) {
            // requiring the '=' guarantees that substring(10) is always in range
            fastaFile = line.substring(10);
          }
        } else if (line.startsWith("index=")) {
          // the complete "index=..." line is stored, not only the value
          queryIndexMap.put(queryName, line);
          inQuery = false;
        }

        if (!inEnzyme) {
          if (((enzymeCleavage == null) || (enzymeRestrict == null))
              && line.startsWith("Content-Type: application/x-Mascot; NAME=\"enzyme\"")) {
            inEnzyme = true;
          }
        } else {
          if (line.startsWith("Cleavage:")) {
            enzymeCleavage = line.substring(9).trim();
          } else if (line.startsWith("Restrict:")) {
            enzymeRestrict = line.substring(9).trim();
          } else if (line.startsWith("Content-Type:")) {
            // the next section starts, so the enzyme section is finished
            inEnzyme = false;
          }
        }
      }
    } catch (IOException e) {
      LOGGER.error("could not read '" + fileName + "' for index parsing.", e);
      return false;
    }

    MascotDatfileInf mascotFile = MascotDatfileFactory.create(fileName, MascotDatfileType.MEMORY);

    if (mascotFile == null) {
      LOGGER.error("could not read '" + fileName + "'.");
      return false;
    }

    PIAInputFile file =
        compiler.insertNewFile(
            name, fileName, InputFileParserFactory.InputFileTypes.MASCOT_DAT_INPUT.getFileSuffix());

    // create the analysis software and add it to the compiler
    AnalysisSoftware mascot = new AnalysisSoftware();

    mascot.setId("mascot");
    mascot.setName("mascot");
    mascot.setUri("http://www.matrixscience.com/");
    mascot.setVersion(mascotFile.getHeaderSection().getVersion());

    Param param = new Param();
    param.setParam(MzIdentMLTools.createPSICvParam(OntologyConstants.MASCOT, null));
    mascot.setSoftwareName(param);

    mascot = compiler.putIntoSoftwareMap(mascot);

    // create the searchDatabase and add it to the compiler
    SearchDatabase searchDatabase = new SearchDatabase();

    // required
    searchDatabase.setId("mascotDB");
    searchDatabase.setLocation(fastaFile);
    // optional
    searchDatabase.setName(mascotFile.getParametersSection().getDatabase());
    searchDatabase.setNumDatabaseSequences(mascotFile.getHeaderSection().getSequences());
    searchDatabase.setNumResidues(mascotFile.getHeaderSection().getResidues());

    // fileformat
    FileFormat fileFormat = new FileFormat();
    fileFormat.setCvParam(MzIdentMLTools.createPSICvParam(OntologyConstants.FASTA_FORMAT, null));
    searchDatabase.setFileFormat(fileFormat);
    // databaseName
    param = new Param();
    param.setParam(
        MzIdentMLTools.createUserParam(mascotFile.getHeaderSection().getRelease(), null, "string"));
    searchDatabase.setDatabaseName(param);

    // add searchDB to the compiler
    searchDatabase = compiler.putIntoSearchDatabasesMap(searchDatabase);

    // add the spectraData (input file)
    SpectraData spectraData = null;
    if ((mascotFile.getParametersSection().getFile() != null)
        && (mascotFile.getParametersSection().getFile().trim().length() > 0)) {
      spectraData = new SpectraData();

      spectraData.setId("mascotInput");
      spectraData.setLocation(mascotFile.getParametersSection().getFile());

      if ((mascotFile.getParametersSection().getFormat() != null)
          && "Mascot generic".equals(mascotFile.getParametersSection().getFormat())) {
        fileFormat = new FileFormat();

        fileFormat.setCvParam(
            MzIdentMLTools.createPSICvParam(OntologyConstants.MASCOT_MGF_FORMAT, null));
        spectraData.setFileFormat(fileFormat);

        SpectrumIDFormat idFormat = new SpectrumIDFormat();
        idFormat.setCvParam(
            MzIdentMLTools.createPSICvParam(
                OntologyConstants.MULTIPLE_PEAK_LIST_NATIVEID_FORMAT, null));
        spectraData.setSpectrumIDFormat(idFormat);
      }

      spectraData = compiler.putIntoSpectraDataMap(spectraData);
    } else {
      LOGGER.warn("The source file (MGF) was not recorded in the file!");
    }

    // define the spectrumIdentificationProtocol
    SpectrumIdentificationProtocol spectrumIDProtocol = new SpectrumIdentificationProtocol();

    spectrumIDProtocol.setId("mascotAnalysis");
    spectrumIDProtocol.setAnalysisSoftware(mascot);

    param = new Param();
    if ("MIS".equals(mascotFile.getParametersSection().getSearch())) {
      param.setParam(MzIdentMLTools.createPSICvParam(OntologyConstants.MS_MS_SEARCH, null));
    }
    // TODO: add error on PMF query (not usable for PIA)
    // TODO: and sequence query
    spectrumIDProtocol.setSearchType(param);

    ParamList paramList = new ParamList();
    paramList
        .getCvParam()
        .add(
            MzIdentMLTools.createPSICvParam(
                OntologyConstants.MASCOT_INSTRUMENT,
                mascotFile.getParametersSection().getInstrument()));

    paramList
        .getUserParam()
        .add(
            MzIdentMLTools.createUserParam(
                "Mascot User Comment", mascotFile.getParametersSection().getCom(), "string"));

    // everything which is not explicitly "Monoisotopic" is treated as average mass
    if ("Monoisotopic".equalsIgnoreCase(mascotFile.getParametersSection().getMass())) {
      paramList
          .getCvParam()
          .add(MzIdentMLTools.createPSICvParam(OntologyConstants.FRAGMENT_MASS_TYPE_MONO, null));
      paramList
          .getCvParam()
          .add(MzIdentMLTools.createPSICvParam(OntologyConstants.PARENT_MASS_TYPE_MONO, null));
    } else {
      paramList
          .getCvParam()
          .add(MzIdentMLTools.createPSICvParam(OntologyConstants.FRAGMENT_MASS_TYPE_AVERAGE, null));
      paramList
          .getCvParam()
          .add(MzIdentMLTools.createPSICvParam(OntologyConstants.PARENT_MASS_TYPE_AVERAGE, null));
    }

    spectrumIDProtocol.setAdditionalSearchParams(paramList);

    ModificationParams modParams = new ModificationParams();
    for (Object objMod : mascotFile.getModificationList().getVariableModifications()) {
      modParams
          .getSearchModification()
          .add(createPSIModification((VariableModification) objMod, compiler.getUnimodParser()));
    }
    for (Object objMod : mascotFile.getModificationList().getFixedModifications()) {
      modParams
          .getSearchModification()
          .add(createPSIModification((FixedModification) objMod, compiler.getUnimodParser()));
    }
    spectrumIDProtocol.setModificationParams(modParams);

    Enzymes enzymes = new Enzymes();
    spectrumIDProtocol.setEnzymes(enzymes);
    if (enzymeCleavage != null) {
      Enzyme enzyme = new Enzyme();

      enzyme.setId("enzyme");

      // a missing or non-numeric value must not abort the import, the setting stays unset then
      String missedCleavages = mascotFile.getParametersSection().getPFA();
      try {
        enzyme.setMissedCleavages(Integer.parseInt(missedCleavages));
      } catch (NumberFormatException e) {
        LOGGER.warn(
            "could not parse the missed cleavages '" + missedCleavages + "', leaving them unset");
      }

      // NOTE(review): without a restriction a lookahead on the cleavage residues is built, with a
      // restriction a lookbehind plus a negative lookahead - confirm that this asymmetry is
      // intended
      StringBuilder regExp = new StringBuilder();
      if (enzymeRestrict == null) {
        regExp.append("(?=[");
        regExp.append(enzymeCleavage);
        regExp.append("])");
      } else {
        regExp.append("(?<=[");
        regExp.append(enzymeCleavage);
        regExp.append("])(?!");
        regExp.append(enzymeRestrict);
        regExp.append(")");
      }
      enzyme.setSiteRegexp(regExp.toString());

      enzymes.getEnzyme().add(enzyme);
    }

    // fragment tolerance: the same ITOL value and ITOLU unit are used for plus and minus
    Tolerance tolerance = new Tolerance();

    AbstractParam abstractParam =
        MzIdentMLTools.createPSICvParam(
            OntologyConstants.SEARCH_TOLERANCE_PLUS_VALUE,
            mascotFile.getParametersSection().getITOL());
    MzIdentMLTools.setUnitParameterFromString(
        mascotFile.getParametersSection().getITOLU(), abstractParam);
    tolerance.getCvParam().add((CvParam) abstractParam);

    abstractParam =
        MzIdentMLTools.createPSICvParam(
            OntologyConstants.SEARCH_TOLERANCE_MINUS_VALUE,
            mascotFile.getParametersSection().getITOL());
    MzIdentMLTools.setUnitParameterFromString(
        mascotFile.getParametersSection().getITOLU(), abstractParam);
    tolerance.getCvParam().add((CvParam) abstractParam);

    spectrumIDProtocol.setFragmentTolerance(tolerance);

    // parent tolerance: the same TOL value and TOLU unit are used for plus and minus
    tolerance = new Tolerance();

    abstractParam =
        MzIdentMLTools.createPSICvParam(
            OntologyConstants.SEARCH_TOLERANCE_PLUS_VALUE,
            mascotFile.getParametersSection().getTOL());
    MzIdentMLTools.setUnitParameterFromString(
        mascotFile.getParametersSection().getTOLU(), abstractParam);
    tolerance.getCvParam().add((CvParam) abstractParam);

    abstractParam =
        MzIdentMLTools.createPSICvParam(
            OntologyConstants.SEARCH_TOLERANCE_MINUS_VALUE,
            mascotFile.getParametersSection().getTOL());
    MzIdentMLTools.setUnitParameterFromString(
        mascotFile.getParametersSection().getTOLU(), abstractParam);
    tolerance.getCvParam().add((CvParam) abstractParam);

    spectrumIDProtocol.setParentTolerance(tolerance);

    // no threshold set, take all PSMs from the dat file
    paramList = new ParamList();
    paramList
        .getCvParam()
        .add(MzIdentMLTools.createPSICvParam(OntologyConstants.NO_THRESHOLD, null));
    spectrumIDProtocol.setThreshold(paramList);

    file.addSpectrumIdentificationProtocol(spectrumIDProtocol);

    // add the spectrum identification
    SpectrumIdentification spectrumID = new SpectrumIdentification();
    spectrumID.setId("mascotIdentification");
    spectrumID.setSpectrumIdentificationList(null);
    spectrumID.setSpectrumIdentificationProtocol(spectrumIDProtocol);

    if (spectraData != null) {
      InputSpectra inputSpectra = new InputSpectra();
      inputSpectra.setSpectraData(spectraData);
      spectrumID.getInputSpectra().add(inputSpectra);
    }

    SearchDatabaseRef searchDBRef = new SearchDatabaseRef();
    searchDBRef.setSearchDatabase(searchDatabase);
    spectrumID.getSearchDatabaseRef().add(searchDBRef);

    file.addSpectrumIdentification(spectrumID);

    // get the mappings
    QueryEnumerator queryEnumerator = mascotFile.getQueryEnumerator();
    QueryToPeptideMapInf queryToPeptideMap = mascotFile.getQueryToPeptideMap();
    QueryToPeptideMapInf decoyQueryToPeptideMap = mascotFile.getDecoyQueryToPeptideMap(false);
    ProteinMap proteinMap = mascotFile.getProteinMap();
    ProteinMap decoyProteinMap = mascotFile.getDecoyProteinMap();

    // one query is one spectrum, so go through the queries
    int nrQueries = mascotFile.getNumberOfQueries();
    int nrQueriesDone = 0;
    LOGGER.debug("queries in file: " + nrQueries);
    while (queryEnumerator.hasMoreElements()) {
      Query currQuery = queryEnumerator.nextElement();

      int charge;
      try {
        if (currQuery.getChargeString() == null) {
          charge = 0;
        } else if (currQuery.getChargeString().contains("-")) {
          charge = -Integer.parseInt(currQuery.getChargeString().replace("-", ""));
        } else {
          // we assume, it is positively charged
          charge = Integer.parseInt(currQuery.getChargeString().replace("+", ""));
        }
      } catch (NumberFormatException e) {
        charge = 0;
        LOGGER.warn(
            "could not parse charge '"
                + currQuery.getChargeString()
                + "' for '"
                + currQuery.getTitle()
                + "'");
      }

      double precursorMZ = currQuery.getPrecursorMZ();

      // the retention time is text taken from the file; like for the charge, an unparsable
      // value is only reported and must not abort the whole import
      Double retentionTime = null;
      String retentionTimeString = currQuery.getRetentionTimeInSeconds();
      if (retentionTimeString != null) {
        try {
          retentionTime = Double.parseDouble(retentionTimeString);
        } catch (NumberFormatException e) {
          LOGGER.warn(
              "could not parse retention time '"
                  + retentionTimeString
                  + "' for '"
                  + currQuery.getTitle()
                  + "'");
        }
      }

      String spectrumTitle = currQuery.getTitle();
      // null, if no "index=" line was found for this query in the first pass
      String index = queryIndexMap.get("query" + currQuery.getQueryNumber());

      // add the target identifications
      if (queryToPeptideMap != null) {
        List<PeptideHit> peptideHits =
            queryToPeptideMap.getAllPeptideHits(currQuery.getQueryNumber());
        insertPeptideHitsIntoCompiler(
            compiler,
            peptideHits,
            proteinMap,
            searchDatabase,
            charge,
            precursorMZ,
            retentionTime,
            index,
            spectrumTitle,
            file,
            spectrumID,
            false);
      }

      // add the decoy identifications
      if (decoyQueryToPeptideMap != null) {
        List<PeptideHit> peptideHits =
            decoyQueryToPeptideMap.getAllPeptideHits(currQuery.getQueryNumber());
        insertPeptideHitsIntoCompiler(
            compiler,
            peptideHits,
            decoyProteinMap,
            searchDatabase,
            charge,
            precursorMZ,
            retentionTime,
            index,
            spectrumTitle,
            file,
            spectrumID,
            true);
      }

      // progress report every 10000 queries
      nrQueriesDone++;
      if (nrQueriesDone % 10000 == 0) {
        LOGGER.debug(
            "done "
                + nrQueriesDone
                + " / "
                + nrQueries
                + String.format(" (%1$.4f%%)", 100.0 * nrQueriesDone / nrQueries));
      }
    }

    mascotFile.finish();
    return true;
  }
Exemplo n.º 2
0
  /**
   * Builds a {@link SearchModification} from a modification of the Mascot DAT file.
   *
   * <p>The modification is marked as fixed unless it is a {@link VariableModification}. If the
   * location text contains "term" or "Term", a terminus specificity rule is derived from the start
   * of the text and the residues are taken from the text following "erm" ("." if there is none).
   * Otherwise each character of the location becomes one residue. If the Unimod parser finds an
   * entry for the modification's type, mass and residues, a matching CV param is attached.
   *
   * @param mod the modification from the Mascot file
   * @param uniModParser parser used to look up the Unimod entry
   * @return the created search modification
   */
  private static SearchModification createPSIModification(
      com.compomics.mascotdatfile.util.interfaces.Modification mod, UnimodParser uniModParser) {
    SearchModification result = new SearchModification();
    result.setFixedMod(!(mod instanceof VariableModification));

    String location = mod.getLocation();
    boolean isTerminal = location.contains("term") || location.contains("Term");

    if (!isTerminal) {
      // plain residue list: every character is one residue
      for (char aminoAcid : location.toCharArray()) {
        result.getResidues().add(String.valueOf(aminoAcid));
      }
    } else {
      // the protein termini have to be tested before the plain "N" / "C" prefixes
      OntologyConstants terminus;
      if (location.startsWith("Protein N")) {
        terminus = OntologyConstants.MODIFICATION_SPECIFICITY_PROTEIN_N_TERM;
      } else if (location.startsWith("Protein C")) {
        terminus = OntologyConstants.MODIFICATION_SPECIFICITY_PROTEIN_C_TERM;
      } else if (location.startsWith("N")) {
        terminus = OntologyConstants.MODIFICATION_SPECIFICITY_PEP_N_TERM;
      } else if (location.startsWith("C")) {
        terminus = OntologyConstants.MODIFICATION_SPECIFICITY_PEP_C_TERM;
      } else {
        terminus = null;
      }

      // an unrecognised terminus yields neither a specificity rule nor residues
      if (terminus != null) {
        CvParam terminusParam = MzIdentMLTools.createPSICvParam(terminus, null);

        SpecificityRules rules = new SpecificityRules();
        rules.getCvParam().add(terminusParam);
        result.getSpecificityRules().add(rules);

        String[] parts = location.split("erm");
        if (parts.length <= 1) {
          // nothing follows the terminus
          result.getResidues().add(".");
        } else {
          for (char aminoAcid : parts[1].trim().toCharArray()) {
            if (aminoAcid == ' ') {
              continue;
            }
            result.getResidues().add(String.valueOf(aminoAcid));
          }
        }
      }
    }

    result.setMassDelta((float) mod.getMass());

    ModT unimodEntry =
        uniModParser.getModificationByNameAndMass(
            mod.getType(), mod.getMass(), result.getResidues());
    if (unimodEntry == null) {
      return result;
    }

    CvParam unimodParam = new CvParam();
    unimodParam.setAccession("UNIMOD:" + unimodEntry.getRecordId());
    unimodParam.setCv(UnimodParser.getCv());
    unimodParam.setName(unimodEntry.getTitle());
    result.getCvParam().add(unimodParam);

    return result;
  }