Esempio n. 1
0
 /**
  * Reads an annotated training file and writes two plain-text corpora derived from it.
  *
  * <p>Root corpus: one line per sentence, consisting of {@code <s>}, the {@code root} of each
  * word's correct parse, and {@code </s>}, separated by single spaces. Sentences without words
  * still produce a {@code <s> </s>} line.
  *
  * <p>IG corpus: sentences without words are skipped. For every word after the first, one line
  * is written per entry in that word's {@code igs}: the last IG of each of the two preceding
  * parses (the sentence-start parse stands in before the first word) followed by the entry.
  * Each sentence is closed by a line ending with the last IG of the sentence-end parse.
  *
  * <p>Both writers are closed even when parsing or writing fails part-way through.
  *
  * @param trainingFile annotated training data, read as UTF-8 through {@code DataSetLoader}
  * @param rootCorpus destination file for the root corpus
  * @param igCorpus destination file for the IG corpus
  * @throws IOException if the training file cannot be read or a corpus cannot be written
  */
 public static void generateTrainingCorpus(File trainingFile, File rootCorpus, File igCorpus)
     throws IOException {
   DataSet trainingSet = Files.readLines(trainingFile, Charsets.UTF_8, new DataSetLoader());
   System.out.println("Amount of sentences in training set:" + trainingSet.sentences.size());
   System.out.println("Amount of tokens in training set:" + trainingSet.tokenCount());

   SimpleTextWriter rootWriter = SimpleTextWriter.keepOpenUTF8Writer(rootCorpus);
   try {
     System.out.println("Generating Lemma Corpus.");
     for (SentenceData sentenceData : trainingSet) {
       List<String> roots = Lists.newArrayList("<s>");
       for (Z3WordData word : sentenceData.words) {
         Z3WordParse parse = new Z3WordParse(word.correctParse);
         roots.add(parse.root);
       }
       roots.add("</s>");
       rootWriter.writeLine(Joiner.on(" ").join(roots));
     }
   } finally {
     // Release the file handle even if a parse or write above throws.
     rootWriter.close();
   }

   SimpleTextWriter igWriter = SimpleTextWriter.keepOpenUTF8Writer(igCorpus);
   try {
     System.out.println("Generating IG Corpus.");
     Z3WordParse start = new Z3WordParse(SENTENCE_START_PARSE);
     Z3WordParse end = new Z3WordParse(SENTENCE_END_PARSE);
     for (SentenceData sentenceData : trainingSet) {
       if (sentenceData.words.isEmpty()) {
         continue;
       }
       // Sliding window of three parses: (first, second) are the context, third is the target.
       // NOTE(review): the first word only ever serves as context, so its IGs are never
       // written as a target - confirm this is intended.
       Z3WordParse first = start;
       Z3WordParse second = new Z3WordParse(sentenceData.words.get(0).correctParse);
       for (int i = 1; i < sentenceData.words.size(); i++) {
         Z3WordParse third = new Z3WordParse(sentenceData.words.get(i).correctParse);
         String context = first.getLastIg() + " " + second.getLastIg() + " ";
         for (Object ig : third.igs) {
           igWriter.writeLine(context + ig);
         }
         first = second;
         second = third;
       }
       // Close the sentence with the end-of-sentence marker as the target.
       igWriter.writeLine(first.getLastIg() + " " + second.getLastIg() + " " + end.getLastIg());
     }
   } finally {
     // Release the file handle even if a parse or write above throws.
     igWriter.close();
   }
 }