public void overwriteParse(Sentence s) {
    try {
        // skip ROOT (i == 0)
        for (int i = 1; i < s.size(); i++) {
            Word w = s.get(i);
            String line = br.readLine();
            // if the current line is blank (end of the previous sentence), read the next line
            if (line.equals(""))
                line = br.readLine();
            String[] parts = line.split("\t");
            // sanity check: the form in the file should match the form in the sentence
            if (!parts[1].toLowerCase().equals(w.getForm().toLowerCase())) {
                System.err.println(
                    "WARNING: different normalization applied? ("
                        + parts[1] + " vs. " + w.getForm() + ")");
                // fall back to a digit-normalized form as the lemma
                w.setLemma(w.getForm().replaceAll("[0-9]", "D"));
            }
            // CoNLL-X column layout
            w.setPOS(parts[3]);
            w.setHeadId(Integer.parseInt(parts[6]));
            w.setDeprel(parts[7]);
            // CoNLL-2009 column layout (uncomment to use instead):
            // w.setPOS(parts[4]);
            // w.setHeadId(Integer.parseInt(parts[8]));
            // w.setDeprel(parts[10]);
        }
        s.buildDependencyTree();
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(1);
    }
}
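// For reference, a minimal sketch of the input overwriteParse expects, assuming the
// standard CoNLL-X column order (the sample line is illustrative, not taken from the
// actual data):
//
//   0:ID  1:FORM  2:LEMMA  3:CPOSTAG  4:POSTAG  5:FEATS  6:HEAD  7:DEPREL  8:PHEAD  9:PDEPREL
//   1     dogs    dog      NNS        NNS       _        2       SBJ       _        _
//
// The method reads one such tab-separated line per word and expects a blank line
// between consecutive sentences; the commented-out CoNLL-2009 variant above shifts
// POS/HEAD/DEPREL to columns 4, 8, and 10.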
public void write(Sentence s) {
    try {
        for (Predicate p : s.getPredicates()) {
            if (p.getSense().equals("Action") || p.getSense().equals("OPERATION")) {
                out.write(id(p) + "\t" + "Action" + " " + p.getBegin() + " " + p.getEnd()
                    + "\t" + p.getForm() + "\n");
                for (Word w : p.getArgMap().keySet()) {
                    String label = p.getArgMap().get(w);
                    if (label.equals("Theme"))
                        label = "Object";
                    // write the argument entity unless it was already emitted
                    if (!word2id.containsKey(w))
                        out.write(id(w) + "\t" + label + " " + w.getBegin() + " " + w.getEnd()
                            + "\t" + w.getForm() + "\n");
                    // write the relation between the predicate and the argument
                    out.write("R" + (rnum++) + "\t"
                        + (label.equals("Actor")
                            ? ("IsActorOf Arg1:" + id(w) + " Arg2:" + id(p))
                            : (label.equals("Property")
                                ? ("HasProperty Arg1:" + id(p) + " Arg2:" + id(w))
                                : ("ActsOn Arg1:" + id(p) + " Arg2:" + id(w))))
                        + "\n");
                }
            }
            if (p.getSense().equals("Object") || p.getSense().equals("CONCEPT")
                || p.getSense().equals("Property")) {
                if (!word2id.containsKey(p))
                    out.write(id(p) + "\t" + p.getSense() + " " + p.getBegin() + " " + p.getEnd()
                        + "\t" + p.getForm() + "\n");
                for (Word w : p.getArgMap().keySet()) {
                    String label = p.getArgMap().get(w);
                    if (label.equals("Theme"))
                        label = "Object";
                    if (!word2id.containsKey(w))
                        out.write(id(w) + "\t" + label + " " + w.getBegin() + " " + w.getEnd()
                            + "\t" + w.getForm() + "\n");
                    out.write("R" + (rnum++) + "\t" + "HasProperty Arg1:" + id(p)
                        + " Arg2:" + id(w) + "\n");
                }
            }
        }
        // out.write(s.toString() + "\n\n");
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println("Failed to write sentence.");
        System.exit(1);
    }
}
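// The Tn/Rn line shapes suggest brat-style standoff annotation output. A sketch of
// what one predicate-argument pair might look like (ids and character offsets are
// made up here; the exact entity ids depend on what id() returns):
//
//   T1\tAction 10 15\topens
//   T2\tObject 16 24\tthe door
//   R1\tActsOn Arg1:T1 Arg2:T2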
private static int parseFullDocument(
        CompletePipelineCMDLineOptions options,
        CompletePipeline pipeline,
        BufferedReader in,
        SentenceWriter writer)
        throws Exception {
    /* initialize the Stanford pipeline and the external GloVe process */
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    props.put("dcoref.sievePasses",
        "MarkRole,"
            + "DiscourseMatch,"
            + "ExactStringMatch,"
            + "RelaxedExactStringMatch,"
            + "PreciseConstructs,"
            + "StrictHeadMatch1,"
            + "StrictHeadMatch2,"
            + "StrictHeadMatch3,"
            + "StrictHeadMatch4,"
            + "RelaxedHeadMatch");
    StanfordCoreNLP stanfordpipeline = new StanfordCoreNLP(props);
    ExternalProcesses glove = new ExternalProcesses(options.glovedir);

    /* read the full text */
    int senCount = 0;
    String str;
    StringBuffer text = new StringBuffer();
    while ((str = in.readLine()) != null) {
        text.append(str);
        text.append("\n");
    }

    /* document-level preprocessing */
    Annotation document = new Annotation(text.toString());
    stanfordpipeline.annotate(document);
    Map<String, Double[]> word2vecs = glove.createvecs(document);

    Corpus c = new Corpus("tmp");

    /* sentence-level preprocessing */
    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
        StringBuffer posOutput = new StringBuffer();
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            if (posOutput.length() > 0)
                posOutput.append(" ");
            posOutput.append(token.word());
            posOutput.append("_");
            posOutput.append(token.tag());
        }
        String parse = ExternalProcesses.runProcess(
            "nc " + options.mstserver.replaceAll(":", " "), posOutput.toString());
        parse = parse.replaceAll("-\t-", "_\t_\n@#")
            .replaceAll("@#\t", "")
            .replaceAll("@#", "");

        String[] lines = parse.split("\n");
        String[] words = new String[lines.length + 1];
        String[] lemmas = new String[lines.length + 1];
        String[] tags = new String[lines.length + 1];
        String[] morphs = new String[lines.length + 1];
        int[] heads = new int[lines.length];
        String[] deprels = new String[lines.length];

        for (int i = 1; i < words.length; i++) {
            String[] parts = lines[i - 1].split("\t");
            words[i] = sentence.get(TokensAnnotation.class).get(i - 1).word();
            tags[i] = sentence.get(TokensAnnotation.class).get(i - 1).tag();
            lemmas[i] = sentence.get(TokensAnnotation.class).get(i - 1).lemma();
            morphs[i] = "_";
            heads[i - 1] = Integer.parseInt(parts[6]);
            deprels[i - 1] = parts[7];
        }
        Sentence sen = new Sentence(words, lemmas, tags, morphs);
        sen.setHeadsAndDeprels(heads, deprels);

        /* add labeled predicates from SEMAFOR */
        String json = ExternalProcesses.runProcess(
            "nc " + options.semaforserver.replaceAll(":", " "), parse);
        Pattern pred_frame = Pattern.compile(
            "\\{\"target\":\\{\"name\":\"([A-Za-z_]*)\",\"spans\":\\[\\{\"start\":([0-9]*),\"");
        Matcher m = pred_frame.matcher(json);
        while (m.find()) {
            String frame = m.group(1);
            int index = Integer.parseInt(m.group(2));
            System.out.println(index + "\t" + frame);
            sen.makePredicate(index + 1);
            ((Predicate) sen.get(index + 1)).setSense(frame);
        }

        /* attach word embeddings where available */
        for (Word w : sen)
            if (word2vecs.containsKey(w.getForm().toLowerCase()))
                w.setRep(word2vecs.get(w.getForm().toLowerCase()));

        new CorpusSentence(sen, c);
    }

    /* add coref output to the corpus */
    Map<Integer, CorefChain> coref = document.get(CorefChainAnnotation.class);
    int num = 1;
    for (Map.Entry<Integer, CorefChain> entry : coref.entrySet()) {
        CorefChain cc = entry.getValue();
        // skip singleton mentions
        if (cc.getMentionsInTextualOrder().size() == 1)
            continue;
        for (CorefMention m : cc.getMentionsInTextualOrder()) {
            c.addMention(c.get(m.sentNum - 1), m.headIndex, num);
        }
        num++;
    }

    /* run semantic role labeling and write the output */
    for (Sentence sen : c) {
        pipeline.srl.parseSentence(sen);
        senCount++;
        if (senCount % 100 == 0)
            System.out.println("Processing sentence " + senCount);
        writer.write(sen);
    }
    return senCount;
}
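// A minimal sketch of the external-server exchange used above, assuming
// options.mstserver holds a "host:port" string (host and port here are hypothetical):
//
//   // "nc host port" pipes the POS-tagged sentence to the parsing server and
//   // returns its CoNLL output on stdout.
//   String tagged = "The_DT dog_NN barks_VBZ";
//   String conll = ExternalProcesses.runProcess("nc localhost 12345", tagged);
//
// The same pattern is reused for the SEMAFOR server, which instead returns a JSON
// frame analysis that the pred_frame regular expression scans for target frames
// and their start indices.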
/**
 * This is the code we used to deduce voice in Spanish (and Catalan) for CoNLL 2009; however, we
 * didn't actually use it in the final submission. I think that was because we never saw any real
 * improvement. I'm not sure it's correct either; my Spanish skills are rather non-existent. It is
 * kept here for future reference.
 *
 * @param pred the predicate
 * @return true if the predicate (verb) is in the passive voice, false otherwise
 */
private boolean isPassive(Predicate pred) {
    for (Word c : pred.getChildren())
        if ((c.getLemma().equals("estar") || c.getLemma().equals("ser"))
            && c.getFeats().contains("auxiliary"))
            return true;
    return false;
}
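// Illustration of the heuristic (my own example, with the same caveat as the Javadoc
// above): in "La carta fue escrita" ("The letter was written"), "fue" has lemma
// "ser", so if its morphological features mark it as an auxiliary, the predicate
// "escrita" would be flagged as passive.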
protected boolean doExtractFeatures(Word pred) {
    return pred.getPOS().startsWith(POSPrefix)
        || (usedForPredicateIdentification
            && !Learn.learnOptions.skipNonMatchingPredicates
            && pred instanceof Predicate);
}
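// Behavior sketch (values are hypothetical; POSPrefix is assumed to be something
// like "VB" for English verbs): a word tagged "VBZ" always gets features, while a
// word tagged "NN" only does if it is already a known Predicate, this extractor is
// used for predicate identification, and skipNonMatchingPredicates is off.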