/**
 * Builds two plain-text corpora from a training file.
 *
 * <p>The training file is read as UTF-8 lines through a {@code DataSetLoader}. Two outputs are
 * then produced:
 * <ul>
 *   <li><b>Root corpus</b>: one line per sentence containing the root of every word's correct
 *       parse, separated by single spaces and wrapped in {@code <s>} ... {@code </s>}.
 *       A sentence without words still yields the line {@code <s> </s>}.</li>
 *   <li><b>IG corpus</b>: for every sentence that has at least one word, one line per
 *       (previous-previous word, previous word, IG of current word) combination, where the two
 *       context words are represented by their last IG. A closing line that ends with the
 *       sentence-end marker's last IG is written for each such sentence.</li>
 * </ul>
 *
 * <p>Each writer is closed in a {@code finally} block so the underlying file handle is released
 * even when parsing or writing fails part-way through.
 *
 * @param trainingFile annotated training data to read
 * @param rootCorpus   destination file for the root corpus
 * @param igCorpus     destination file for the IG corpus
 * @throws IOException if the training file cannot be read or a corpus file cannot be written
 */
public static void generateTrainingCorpus(File trainingFile, File rootCorpus, File igCorpus) throws IOException {
    DataSet trainingSet = Files.readLines(trainingFile, Charsets.UTF_8, new DataSetLoader());
    System.out.println("Amount of sentences in training set:" + trainingSet.sentences.size());
    System.out.println("Amount of tokens in training set:" + trainingSet.tokenCount());

    SimpleTextWriter rootWriter = SimpleTextWriter.keepOpenUTF8Writer(rootCorpus);
    try {
        System.out.println("Generating Lemma Corpus.");
        for (SentenceData sentenceData : trainingSet) {
            List<String> roots = Lists.newArrayList("<s>");
            for (Z3WordData word : sentenceData.words) {
                Z3WordParse parse = new Z3WordParse(word.correctParse);
                roots.add(parse.root);
            }
            roots.add("</s>");
            rootWriter.writeLine(Joiner.on(" ").join(roots));
        }
    } finally {
        // Always release the file handle, including when a parse or write fails mid-corpus.
        rootWriter.close();
    }

    SimpleTextWriter igWriter = SimpleTextWriter.keepOpenUTF8Writer(igCorpus);
    try {
        System.out.println("Generating IG Corpus.");
        Z3WordParse start = new Z3WordParse(SENTENCE_START_PARSE);
        Z3WordParse end = new Z3WordParse(SENTENCE_END_PARSE);
        for (SentenceData sentenceData : trainingSet) {
            // Unlike the root corpus, empty sentences contribute nothing to the IG corpus.
            if (sentenceData.words.size() == 0) {
                continue;
            }
            // Sliding window of the two preceding words; the window starts as (start marker, word 0).
            Z3WordParse first = start;
            Z3WordParse second = new Z3WordParse(sentenceData.words.get(0).correctParse);
            // NOTE(review): the loop starts at index 1, so the IGs of the first word are never
            // emitted in the third position - confirm this is intended.
            for (int i = 1; i < sentenceData.words.size(); i++) {
                Z3WordParse third = new Z3WordParse(sentenceData.words.get(i).correctParse);
                // One line per IG of the current word, sharing the same two-word context.
                for (int j = 0; j < third.igs.size(); j++) {
                    igWriter.writeLine(first.getLastIg() + " " + second.getLastIg() + " " + third.igs.get(j));
                }
                first = second;
                second = third;
            }
            // Close the sentence with the end marker as the third element.
            igWriter.writeLine(first.getLastIg() + " " + second.getLastIg() + " " + end.getLastIg());
        }
    } finally {
        // Always release the file handle, including when a parse or write fails mid-corpus.
        igWriter.close();
    }
}