public void processCorpus(String corpusFolder, String resultFolder, int ngramSize) { CorpusManager corpusManager = null; try { corpusManager = new CorpusManager(corpusFolder); } catch (IOException e) { System.err.println("Could not initialize corpus manager: " + e.getMessage()); System.exit(-1); } TextInstance currentTextInstance = corpusManager.getNextText(); int textCount = 1; while (currentTextInstance != null) { String currentText = null; try { currentText = currentTextInstance.getFullText(); } catch (IOException e) { System.err.println( "Could not get text. Skipping " + currentTextInstance.getTextSource() + " Cause: " + e.getMessage()); currentTextInstance = corpusManager.getNextText(); continue; } System.out.print( "Processing text " + textCount + " of " + corpusManager.getTextCount() + " (" + currentTextInstance.getTextSource().getAbsolutePath().toString() + ") ... "); try { processText( currentText, ngramSize, resultFolder, currentTextInstance.getTextSource().getName(), currentTextInstance.getTrueAuthor()); System.out.println("Done."); } catch (Exception e) { System.err.println(" ERROR: " + e.getMessage()); // just continue to next text } currentTextInstance = corpusManager.getNextText(); ++textCount; } // Do the same with the unknown texts // TODO: Fix duplication! int unknownTextCount = 1; TextInstance currentUnknownText = corpusManager.getUnknownText(); while (currentUnknownText != null) { System.out.println( "Processing unknown text " + unknownTextCount + " of " + corpusManager.getUnknownTextCount()); String unknownText = null; try { unknownText = currentUnknownText.getFullText(); } catch (IOException e) { System.err.println( "Could not get unknown text. Skipping " + currentUnknownText.getTextSource() + " Cause: " + e.getMessage()); currentUnknownText = corpusManager.getUnknownText(); continue; } try { processText( unknownText, ngramSize, resultFolder, currentUnknownText.getTextSource().getName(), "UNKNOWN"); } catch (Exception e) { System.err.println(" ERROR: " + e.getMessage()); // just continue to next text } currentUnknownText = corpusManager.getUnknownText(); ++unknownTextCount; } ProfileWriter.writeTranslationTable( FeatureMapper.getTranslationTable(), new File(resultFolder)); ProfileWriter.writeSeenUniqueNGrams(uniqueNgrams, new File(resultFolder)); }