예제 #1
0
  /**
   * Processes every text of a corpus and then writes the collected lookup data.
   *
   * <p>The method creates a {@code CorpusManager} for {@code corpusFolder}, then:
   *
   * <ol>
   *   <li>iterates over the texts returned by {@code getNextText()} until it returns
   *       {@code null}, handing each text, together with its source file name and its true
   *       author, to {@code processText};
   *   <li>iterates the same way over the texts returned by {@code getUnknownText()}, using the
   *       fixed author label {@code "UNKNOWN"};
   *   <li>writes the translation table and the seen unique n-grams into {@code resultFolder} via
   *       {@code ProfileWriter}.
   * </ol>
   *
   * <p>A text whose content cannot be read, or whose processing throws, is reported on
   * {@code System.err} and skipped; the remaining texts are still processed. Skipped texts still
   * advance the progress counter, so the "k of N" numbers printed for later texts stay aligned
   * with their position in the iteration.
   *
   * <p>If the corpus manager cannot be created, an error is printed and the JVM is terminated
   * with exit status {@code -1}.
   *
   * @param corpusFolder folder passed to the {@code CorpusManager} constructor
   * @param resultFolder folder passed to {@code processText} and used as the target of the final
   *     {@code ProfileWriter} calls
   * @param ngramSize n-gram size passed through to {@code processText}
   */
  public void processCorpus(String corpusFolder, String resultFolder, int ngramSize) {
    final CorpusManager corpusManager;
    try {
      corpusManager = new CorpusManager(corpusFolder);
    } catch (IOException e) {
      System.err.println("Could not initialize corpus manager: " + e.getMessage());
      System.exit(-1);
      // Only reached if the exit is intercepted; never continue without a corpus manager.
      return;
    }

    // Advancing and counting live in the loop header so that `continue` on a skipped text
    // still moves to the next text AND bumps the progress counter.
    int textCount = 1;
    for (TextInstance currentTextInstance = corpusManager.getNextText();
        currentTextInstance != null;
        currentTextInstance = corpusManager.getNextText(), ++textCount) {
      String currentText;
      try {
        currentText = currentTextInstance.getFullText();
      } catch (IOException e) {
        System.err.println(
            "Could not get text. Skipping "
                + currentTextInstance.getTextSource()
                + " Cause: "
                + e.getMessage());
        continue;
      }

      // No line break here: "Done." or the error message completes this progress line.
      System.out.print(
          "Processing text "
              + textCount
              + " of "
              + corpusManager.getTextCount()
              + " ("
              + currentTextInstance.getTextSource().getAbsolutePath()
              + ") ... ");
      try {
        processText(
            currentText,
            ngramSize,
            resultFolder,
            currentTextInstance.getTextSource().getName(),
            currentTextInstance.getTrueAuthor());
        System.out.println("Done.");
      } catch (Exception e) {
        // Deliberately broad: one failing text must not abort the whole corpus run.
        System.err.println(" ERROR: " + e.getMessage());
      }
    }

    // Do the same with the unknown texts.
    // TODO: Fix duplication with the loop above.
    int unknownTextCount = 1;
    for (TextInstance currentUnknownText = corpusManager.getUnknownText();
        currentUnknownText != null;
        currentUnknownText = corpusManager.getUnknownText(), ++unknownTextCount) {
      System.out.println(
          "Processing unknown text "
              + unknownTextCount
              + " of "
              + corpusManager.getUnknownTextCount());
      String unknownText;
      try {
        unknownText = currentUnknownText.getFullText();
      } catch (IOException e) {
        System.err.println(
            "Could not get unknown text. Skipping "
                + currentUnknownText.getTextSource()
                + " Cause: "
                + e.getMessage());
        continue;
      }

      try {
        processText(
            unknownText,
            ngramSize,
            resultFolder,
            currentUnknownText.getTextSource().getName(),
            "UNKNOWN");
      } catch (Exception e) {
        // Deliberately broad: one failing text must not abort the whole corpus run.
        System.err.println(" ERROR: " + e.getMessage());
      }
    }

    ProfileWriter.writeTranslationTable(
        FeatureMapper.getTranslationTable(), new File(resultFolder));
    ProfileWriter.writeSeenUniqueNGrams(uniqueNgrams, new File(resultFolder));
  }