/**
   * Builds the n-gram profile of a single text, writes it out, and records which n-grams occurred.
   *
   * @param text the text to extract n-grams from
   * @param ngramSize the size passed to the n-gram extractor
   * @param resultFolder the folder handed to {@code writeProfile}
   * @param sourceFile the source file name handed to {@code writeProfile}
   * @param author the author label handed to {@code writeProfile}
   */
  private void processText(
      String text, int ngramSize, String resultFolder, String sourceFile, String author) {
    // Pipeline: extract n-grams -> map them to their numeric form -> build the profile.
    HashMap<List<Integer>, Integer> ngramProfile =
        FeatureMapper.createProfile(
            FeatureMapper.numericalizeNGrams(extractor.extract_SN_Grams(text, ngramSize)));
    writeProfile(ngramProfile, resultFolder, sourceFile, author);

    // Remember every n-gram that occurred in this text.
    uniqueNgrams.addAll(ngramProfile.keySet());
  }
 /**
  * Computes the mass of a residue sequence using {@code SymbolPropertyTable.AVG_MASS}.
  *
  * <p>Any exception raised by the calculation is logged (including its stack trace) and
  * reported to the caller through the sentinel return value instead of being propagated.
  *
  * @param polypeptideMapper the feature the residues belong to; only used to name the feature
  *     in the error message
  * @param residuesSymbolList the residues whose mass is computed
  * @return the value returned by {@code MassCalc.getMass} (presumably in Daltons), or
  *     {@code -1.0} if the calculation threw
  */
 private static double calculateMass(
     FeatureMapper polypeptideMapper, SymbolList residuesSymbolList) {
   try {
     return MassCalc.getMass(residuesSymbolList, SymbolPropertyTable.AVG_MASS, true);
   } catch (Exception exp) {
     // Hand the exception itself to the logger so the stack trace is not lost; the message
     // alone is often null or too terse to diagnose the failure.
     logger.error(
         String.format(
             "Error computing protein mass in '%s' because '%s'",
             polypeptideMapper.getUniqueName(), exp.getMessage()),
         exp);
   }
   // Sentinel understood by the caller as "mass could not be computed".
   return -1.0;
 }
  /**
   * Processes every known and unknown text of a corpus and writes the resulting profiles.
   *
   * <p>Each text is passed to {@code processText}; unknown texts are processed with the author
   * label {@code "UNKNOWN"}. A text that cannot be read or processed is reported on standard
   * error and skipped. After all texts have been handled, the translation table and the set of
   * seen n-grams are written to {@code resultFolder}.
   *
   * <p>If the corpus manager cannot be created, the error is printed and the JVM is terminated
   * with exit status -1.
   *
   * @param corpusFolder the folder the {@code CorpusManager} is created from
   * @param resultFolder the folder all output is written to
   * @param ngramSize the n-gram size passed on to {@code processText}
   */
  public void processCorpus(String corpusFolder, String resultFolder, int ngramSize) {
    CorpusManager corpusManager = null;
    try {
      corpusManager = new CorpusManager(corpusFolder);
    } catch (IOException e) {
      System.err.println("Could not initialize corpus manager: " + e.getMessage());
      System.exit(-1);
    }

    // The counter lives in the loop's update clause so that it also advances for texts that
    // are skipped with `continue`; otherwise the "N of M" progress output would fall behind.
    int textCount = 1;
    for (TextInstance currentTextInstance = corpusManager.getNextText();
        currentTextInstance != null;
        currentTextInstance = corpusManager.getNextText(), ++textCount) {
      String currentText;
      try {
        currentText = currentTextInstance.getFullText();
      } catch (IOException e) {
        System.err.println(
            "Could not get text. Skipping "
                + currentTextInstance.getTextSource()
                + " Cause: "
                + e.getMessage());
        continue;
      }

      // No line break here: "Done." (or the error) completes the line.
      System.out.print(
          "Processing text "
              + textCount
              + " of "
              + corpusManager.getTextCount()
              + " ("
              + currentTextInstance.getTextSource().getAbsolutePath()
              + ") ... ");
      try {
        processText(
            currentText,
            ngramSize,
            resultFolder,
            currentTextInstance.getTextSource().getName(),
            currentTextInstance.getTrueAuthor());
        System.out.println("Done.");
      } catch (Exception e) {
        // Best effort: report the failure and carry on with the next text.
        System.err.println(" ERROR: " + e.getMessage());
      }
    }

    // Do the same with the unknown texts
    // TODO: Fix duplication!
    int unknownTextCount = 1;
    for (TextInstance currentUnknownText = corpusManager.getUnknownText();
        currentUnknownText != null;
        currentUnknownText = corpusManager.getUnknownText(), ++unknownTextCount) {
      System.out.println(
          "Processing unknown text "
              + unknownTextCount
              + " of "
              + corpusManager.getUnknownTextCount());
      String unknownText;
      try {
        unknownText = currentUnknownText.getFullText();
      } catch (IOException e) {
        System.err.println(
            "Could not get unknown text. Skipping "
                + currentUnknownText.getTextSource()
                + " Cause: "
                + e.getMessage());
        continue;
      }

      try {
        processText(
            unknownText,
            ngramSize,
            resultFolder,
            currentUnknownText.getTextSource().getName(),
            "UNKNOWN");
      } catch (Exception e) {
        // Best effort: report the failure and carry on with the next text.
        System.err.println(" ERROR: " + e.getMessage());
      }
    }

    ProfileWriter.writeTranslationTable(
        FeatureMapper.getTranslationTable(), new File(resultFolder));
    ProfileWriter.writeSeenUniqueNGrams(uniqueNgrams, new File(resultFolder));
  }
  /**
   * Predicts the properties of a polypeptide from its residues.
   *
   * <p>The residues are tokenized into a protein symbol list and a trailing termination symbol
   * is removed. Amino-acid count, isoelectric point, mass and charge are then filled in. A
   * failure while computing the isoelectric point or the mass is logged and leaves that
   * property unset.
   *
   * @param polypeptideMapper the feature supplying the residues and the name used in log output
   * @return {@code null} if the feature has no residues; otherwise a
   *     <code>PeptideProperties</code> object, which carries no computed values when the
   *     residues are empty, consist only of a termination symbol, or cannot be tokenized
   */
  public static PeptideProperties calculateStats(FeatureMapper polypeptideMapper) {
    if (polypeptideMapper.getResidues() == null) {
      logger.warn("No residues for '" + polypeptideMapper.getUniqueName() + "'");
      return null;
    }

    final String sequence = new String(polypeptideMapper.getResidues());
    final PeptideProperties properties = new PeptideProperties();

    SymbolList symbols;
    try {
      symbols =
          new SimpleSymbolList(ProteinTools.getTAlphabet().getTokenization("token"), sequence);

      final int residueCount = symbols.length();
      if (residueCount == 0) {
        logger.error(
            String.format(
                "Polypeptide feature '%s' has zero-length residues",
                polypeptideMapper.getUniqueName()));
        return properties;
      }

      try {
        // A trailing termination symbol (*) is not a residue and has to be dropped.
        // Symbol list positions start at 1, so position residueCount is the last symbol.
        final boolean endsWithTerminator = symbols.symbolAt(residueCount) == ProteinTools.ter();
        if (endsWithTerminator) {
          if (residueCount == 1) {
            logger.error(
                String.format(
                    "Polypeptide feature '%s' only has termination symbol",
                    polypeptideMapper.getUniqueName()));
            return properties;
          }
          symbols = symbols.subList(1, residueCount - 1);
        }
      } catch (IndexOutOfBoundsException exception) {
        throw new RuntimeException(exception);
      }
    } catch (BioException e) {
      logger.error("Can't translate into a protein sequence", e);
      return properties;
    }

    // Re-read the length: the list may have been shortened above.
    properties.setAminoAcids(symbols.length());

    try {
      properties.setIsoelectricPoint(new IsoelectricPointCalc().getPI(symbols, false, false));
    } catch (Exception e) {
      logger.error(
          String.format("Error computing protein isoelectric point for '%s'", symbols), e);
    }

    // calculateMass signals failure with -1; in that case the mass stays unset.
    final double mass = calculateMass(polypeptideMapper, symbols);
    if (mass != -1) {
      properties.setMass(mass);
    }

    properties.setCharge(calculateCharge(sequence));

    return properties;
  }