private void doParseSentencesInCorpus(File ntvmsnbcCorpus) throws IOException { /* SentenceMorphParse parse = parser.parse("Turgut Özal'ın ölüm raporu ile ilgili flaş bir gelişme."); parse.dump(); System.out.println("After disambiguation:"); parser.disambiguate(parse); parse.dump(); for (SentenceMorphParse.Entry entry : parse) { System.out.println(entry.input + "=" + entry.parses.get(0)); } for (SentenceMorphParse.Entry entry : parse) { System.out.println(entry.input + " kök=" + entry.parses.get(0).stem); }*/ List<String> sentences = SimpleTextReader.trimmingUTF8Reader(ntvmsnbcCorpus).asStringList(); Stopwatch sw = new Stopwatch().start(); int wc = 0; for (String sentence : sentences) { SentenceMorphParse parse = parser.parse(sentence); wc += parse.size(); parser.disambiguate(parse); // System.out.println(sentence); // parse.dump(); } System.out.println(wc); System.out.println(sw.elapsed(TimeUnit.MILLISECONDS)); }
/**
 * Converts a morphologically parsed sentence into an array of {@code Ambiguous} words
 * built from language-model vocabulary indexes.
 *
 * <p>Layout of the returned array (length {@code sentence.size() + 3}):
 * <ul>
 *   <li>indexes 0 and 1: {@code startWord}</li>
 *   <li>one {@code Ambiguous} per sentence entry, in iteration order</li>
 *   <li>last index: {@code endWord}</li>
 * </ul>
 *
 * <p>For every candidate parse of an entry, the lemma of its dictionary item is looked up
 * in the {@code rootLm} vocabulary and each inflectional group (formatted with
 * {@code formatNoSurface()}) is looked up in the {@code igLm} vocabulary.
 * NOTE(review): how {@code indexOf} reports out-of-vocabulary items is not visible here.
 *
 * @param sentence parsed sentence whose entries carry the candidate parses
 * @return start-padded and end-terminated sequence of ambiguous words
 */
public Ambiguous[] getAmbiguousSequence(SentenceMorphParse sentence) {
    Ambiguous[] awords = new Ambiguous[sentence.size() + 3];
    // Two leading start markers.
    awords[0] = startWord;
    awords[1] = startWord;
    int i = 2;
    for (SentenceMorphParse.Entry entry : sentence) {
        int parseCount = entry.parses.size();
        int[] roots = new int[parseCount];
        int[][] igs = new int[parseCount][];
        int j = 0;
        for (MorphParse parse : entry.parses) {
            String rootPart = parse.dictionaryItem.lemma;
            roots[j] = rootLm.getVocabulary().indexOf(rootPart);
            int igCount = parse.inflectionalGroups.size();
            igs[j] = new int[igCount];
            // Bound the loop by k (the group index). The previous condition tested j,
            // which either overran igs[j] or skipped filling it entirely.
            for (int k = 0; k < igCount; k++) {
                igs[j][k] = igLm.getVocabulary()
                        .indexOf(parse.inflectionalGroups.get(k).formatNoSurface());
            }
            j++;
        }
        awords[i] = new Ambiguous(roots, igs);
        i++;
    }
    awords[i] = endWord;
    return awords;
}