public Example( String name, String[] input, Lexicon lexicon, Map<ElementaryStringTree, ArrayList<Fringe>> shadowTreesMap, SuperTagger superTagger, FreqCounter freqCounter, Options opts) { this.opts = opts; this.name = name; this.lexicon = lexicon; this.shadowTreesMap = shadowTreesMap; this.superTagger = superTagger; this.freqCounter = freqCounter; if (input[0].equals( "NOT PARSED")) // we could not extract the lexicon or missing entirely from gold standard // dataset { sentence = posTagged = parsed = goldStandardNoTraces = solution = ""; numOfWords = 0; notParsed = true; } else { if (opts.estimateProcDifficulty) { if (opts.inputType == Options.InputType.dundee) { readDundeeInput(input[0]); } else if (opts.inputType == Options.InputType.posTagged || (opts.inputType == Options.InputType.pltag && input[0].contains("\t"))) // tab delimited POS-word pairs { readPosTagged(input[0]); } else if (opts.inputType == Options.InputType.pltag) { readPennTreebank(input, true); solution = parsed; } else // input is plain text without POS tags { if (opts.goldPosTags) { readPosTagged(PosTagger.posTagLine(input[0])); } else { StringBuilder str = new StringBuilder(); // for(String word : removeQuotesPlain(input[0]).split(" ")) for (String word : Utils.tokenizeStanford(input[0]).split(" ")) { str.append(String.format("N/A %s\t", word)); } readPosTagged(str.toString().trim()); } } solution = ""; } else { readPennTreebank(input, true); solution = parsed; } this.numOfWords = posTagged.split("\t").length; } }
/** * Read input from Dundee corpus. The format is: RC_label|w_1 id_1 w_2 id_2 ... * * @param line */ private void readDundeeInput(String line) { int index = line.indexOf("|"); if (index > -1) sentenceRc = line.substring(0, index); line = line.substring(index + 1); StringBuilder sent = new StringBuilder(); boolean usePosTagger = opts.goldPosTags; StringBuilder posTagDummy = new StringBuilder(); if (line.charAt(0) == '(') // some examples may contain already parsed input in tree format { List<Word> words = Tree.valueOf(treeProcessDundeeIds(line)).yieldWords(); for (Word word : words) { sent.append(word).append(" "); if (!usePosTagger) posTagDummy.append("N/A ").append(word).append("\t"); } } else { // remove quotes line = replaceParenthesesDundee(removeQuotesDundee(line).trim()).trim(); String[] tokens = line.split(" "); wordIds = new String[tokens.length / 2]; for (int i = 0; i < tokens.length - 1; i += 2) { String word = !usePosTagger ? wordRemoveDigits(tokens[i]) : tokens[i]; sent.append(word).append(" "); if (!usePosTagger) posTagDummy.append("N/A ").append(word).append("\t"); wordIds[i / 2] = tokens[i + 1]; } sentence = sent.toString().trim(); if (usePosTagger) { Pair<String, String>[] posWords = PosTagger.posTagLineToArray(sentence); // One or more words has been expanded due to PTB-compliant splitting. // For each new constituent assign the word-id of the original word. if (posWords.length != wordIds.length) { adjustWordIdsDundee(posWords, sentence.split(" ")); } sentence = sentRemoveDigits(sentence); readPosTagged(wordRemoveDigits(PosTagger.tokensToLinePosTagged(posWords))); } else posTagged = posTagDummy.toString().trim(); } }