예제 #1
0
  /**
   * Overwrites the dependency parse of a sentence with annotations read from the reader {@code br},
   * which is expected to hold CoNLL-X formatted output (one token per line, tab-separated columns,
   * blank line between sentences). Replaces each token's POS tag, head id, and dependency relation,
   * then rebuilds the dependency tree.
   *
   * @param s the sentence to overwrite; its token forms should line up with the CoNLL input
   */
  public void overwriteParse(Sentence s) {
    try {
      // Skip ROOT at index 0; the CoNLL lines cover tokens 1..size-1.
      for (int i = 1; i < s.size(); i++) {
        Word w = s.get(i);
        String line = br.readLine();
        // A blank line terminates the previous sentence; skip to the next line.
        if (line != null && line.isEmpty()) line = br.readLine();
        // Guard against premature EOF: the original code would NPE here and the
        // NullPointerException would escape the IOException catch below.
        if (line == null) {
          throw new IOException("Unexpected end of CoNLL input at token " + i);
        }

        String[] parts = line.split("\t");
        // Sanity check: the surface form in the file should match the token form
        // (case-insensitively). On mismatch, warn and re-derive the lemma with
        // digits normalized to 'D'.
        if (!parts[1].equalsIgnoreCase(w.getForm())) {
          System.err.println(
              "WARNING: different normalization applied? ("
                  + parts[1]
                  + " vs. "
                  + w.getForm()
                  + ")");
          w.setLemma(w.getForm().replaceAll("[0-9]", "D"));
        }

        // CoNLL-X column layout: 3=POS, 6=head id, 7=deprel.
        w.setPOS(parts[3]);
        w.setHeadId(Integer.parseInt(parts[6]));
        w.setDeprel(parts[7]);

        // CoNLL-2009 column layout, kept for reference:
        // w.setPOS(parts[4]); w.setHeadId(Integer.parseInt(parts[8])); w.setDeprel(parts[10]);
      }
      s.buildDependencyTree();

    } catch (IOException e) {
      e.printStackTrace();
      System.exit(1);
    }
  }
예제 #2
0
  /**
   * Writes a sentence's predicate-argument structure in brat standoff format: one "T" entity line
   * per predicate/argument span and one "R" relation line per predicate-argument pair.
   *
   * <p>Predicates with sense "Action"/"OPERATION" are emitted as Action entities whose arguments
   * yield IsActorOf / HasProperty / ActsOn relations depending on the argument label; predicates
   * with sense "Object"/"CONCEPT"/"Property" are emitted under their own sense with HasProperty
   * relations to each argument.
   *
   * @param s the sentence whose predicates and arguments are written
   */
  public void write(Sentence s) {
    try {
      for (Predicate p : s.getPredicates()) {
        if (p.getSense().equals("Action") || p.getSense().equals("OPERATION")) {
          // Action/OPERATION predicates are always written under the label "Action".
          writeEntity(p, "Action");

          for (Word w : p.getArgMap().keySet()) {
            String label = p.getArgMap().get(w);
            // "Theme" arguments are renamed to "Object" in the output scheme.
            if (label.equals("Theme")) label = "Object";

            // Only emit an entity line for words not yet assigned an id.
            if (!word2id.containsKey(w)) writeEntity(w, label);

            String relation =
                label.equals("Actor")
                    ? ("IsActorOf Arg1:" + id(w) + " Arg2:" + id(p))
                    : label.equals("Property")
                        ? ("HasProperty Arg1:" + id(p) + " Arg2:" + id(w))
                        : ("ActsOn Arg1:" + id(p) + " Arg2:" + id(w));
            out.write("R" + (rnum++) + "\t" + relation + "\n");
          }
        }

        if (p.getSense().equals("Object")
            || p.getSense().equals("CONCEPT")
            || p.getSense().equals("Property")) {
          if (!word2id.containsKey(p)) writeEntity(p, p.getSense());

          for (Word w : p.getArgMap().keySet()) {
            String label = p.getArgMap().get(w);
            if (label.equals("Theme")) label = "Object";

            if (!word2id.containsKey(w)) writeEntity(w, label);

            out.write(
                "R" + (rnum++) + "\t" + "HasProperty Arg1:" + id(p) + " Arg2:" + id(w) + "\n");
          }
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
      System.out.println("Failed to write sentence.");
      System.exit(1);
    }
  }

  /**
   * Writes a single brat "T" entity line: id, type label, character span, and surface form.
   *
   * @param w the word (or predicate) whose span is written
   * @param type the entity type label to emit
   * @throws IOException if the underlying writer fails
   */
  private void writeEntity(Word w, String type) throws IOException {
    out.write(
        id(w) + "\t" + type + " " + w.getBegin() + " " + w.getEnd() + "\t" + w.getForm() + "\n");
  }
예제 #3
0
  /**
   * Runs full-document preprocessing and SRL over raw text: tokenization, POS, lemma, NER, parse,
   * and deterministic coreference via Stanford CoreNLP; dependency parsing via an external MST
   * parser server; frame-semantic predicate labeling via a SEMAFOR server; GloVe word vectors via
   * an external process; and finally the pipeline's SRL component on each sentence.
   *
   * @param options command-line options (supplies glovedir, mstserver, semaforserver addresses)
   * @param pipeline the complete pipeline whose SRL component is applied per sentence
   * @param in reader over the raw input text; consumed fully
   * @param writer receives each fully processed sentence
   * @return the number of sentences processed
   * @throws IOException if reading the input fails
   * @throws Exception if any downstream component fails
   */
  private static int parseFullDocument(
      CompletePipelineCMDLineOptions options,
      CompletePipeline pipeline,
      BufferedReader in,
      SentenceWriter writer)
      throws IOException, Exception {

    // Initialize the CoreNLP pipeline with the full annotator stack, including
    // deterministic coreference (dcoref) with an explicit sieve list.
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    props.put(
        "dcoref.sievePasses",
        "MarkRole,"
            + "DiscourseMatch,"
            + "ExactStringMatch,"
            + "RelaxedExactStringMatch,"
            + "PreciseConstructs,"
            + "StrictHeadMatch1,"
            + "StrictHeadMatch2,"
            + "StrictHeadMatch3,"
            + "StrictHeadMatch4,"
            + "RelaxedHeadMatch");
    StanfordCoreNLP stanfordpipeline = new StanfordCoreNLP(props);
    ExternalProcesses glove = new ExternalProcesses(options.glovedir);

    // Read the entire input into one buffer so CoreNLP annotates the whole
    // document at once (sentence splitting and coref need document scope).
    int senCount = 0;
    String str;
    StringBuffer text = new StringBuffer();
    while ((str = in.readLine()) != null) {
      text.append(str);
      text.append("\n");
    }

    // Document-level preprocessing.
    Annotation document = new Annotation(text.toString());
    stanfordpipeline.annotate(document);

    // Word vectors for the document vocabulary; the lookup below keys on the
    // lowercased surface form — presumably createvecs does the same (TODO confirm).
    Map<String, Double[]> word2vecs = glove.createvecs(document);

    Corpus c = new Corpus("tmp");

    // Sentence-level preprocessing.
    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
      // Build "word_TAG word_TAG ..." input for the external MST parser.
      StringBuffer posOutput = new StringBuffer();

      for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
        if (posOutput.length() > 0) {
          posOutput.append(" ");
        }
        posOutput.append(token.word());
        posOutput.append("_");
        posOutput.append(token.tag());
      }

      // Ship the tagged sentence to the MST parser server via netcat, then
      // normalize its output into CoNLL-style tab-separated lines ("@#" is a
      // temporary marker used during the rewrite).
      String parse =
          ExternalProcesses.runProcess(
              "nc " + options.mstserver.replaceAll(":", " "), posOutput.toString());
      parse = parse.replaceAll("-\t-", "_\t_\n@#").replaceAll("@#\t", "").replaceAll("@#", "");

      // Token arrays are 1-based (index 0 is the implicit ROOT), hence length+1;
      // heads/deprels stay 0-based, one entry per parsed line.
      String[] lines = parse.split("\n");
      String[] words = new String[lines.length + 1];
      String[] lemmas = new String[lines.length + 1];
      String[] tags = new String[lines.length + 1];
      String[] morphs = new String[lines.length + 1];
      int[] heads = new int[lines.length];
      String[] deprels = new String[lines.length];

      for (int i = 1; i < words.length; i++) {
        // CoNLL-X columns from the MST output: 6=head id, 7=deprel; word/tag/lemma
        // come from the CoreNLP tokens (assumes tokenizations line up — TODO confirm).
        String[] parts = lines[i - 1].split("\t");
        words[i] = sentence.get(TokensAnnotation.class).get(i - 1).word();
        tags[i] = sentence.get(TokensAnnotation.class).get(i - 1).tag();
        lemmas[i] = sentence.get(TokensAnnotation.class).get(i - 1).lemma();
        morphs[i] = "_";
        heads[i - 1] = Integer.parseInt(parts[6]);
        deprels[i - 1] = parts[7];
      }
      Sentence sen = new Sentence(words, lemmas, tags, morphs);
      sen.setHeadsAndDeprels(heads, deprels);

      /* add labeled predicates from SEMAFOR */
      // The SEMAFOR server returns JSON; rather than a JSON parser, a regex pulls
      // out each target's frame name and 0-based token start index.
      String json =
          ExternalProcesses.runProcess("nc " + options.semaforserver.replaceAll(":", " "), parse);
      Pattern pred_frame =
          Pattern.compile(
              "\\{\"target\":\\{\"name\":\"([A-Za-z_]*)\",\"spans\":\\[\\{\"start\":([0-9]*),\"");
      Matcher m = pred_frame.matcher(json);
      while (m.find()) {
        String frame = m.group(1);
        int index = Integer.parseInt(m.group(2));
        System.out.println(index + "\t" + frame);

        // +1 converts SEMAFOR's 0-based token index to the sentence's 1-based
        // indexing (index 0 is ROOT).
        sen.makePredicate(index + 1);
        ((Predicate) sen.get(index + 1)).setSense(frame);
      }

      // Attach word vectors where available (lowercase-form lookup).
      for (Word w : sen)
        if (word2vecs.containsKey(w.getForm().toLowerCase()))
          w.setRep(word2vecs.get(w.getForm().toLowerCase()));

      // Registers the sentence with the corpus (constructor side effect).
      new CorpusSentence(sen, c);
    }

    /* add coref output to corpus */
    // Each non-singleton chain gets a fresh cluster number; mentions are attached
    // by sentence (sentNum is 1-based) and head token index.
    Map<Integer, CorefChain> coref = document.get(CorefChainAnnotation.class);
    int num = 1;
    for (Map.Entry<Integer, CorefChain> entry : coref.entrySet()) {
      CorefChain cc = entry.getValue();
      // skip singleton mentions
      if (cc.getMentionsInTextualOrder().size() == 1) continue;

      for (CorefMention m : cc.getMentionsInTextualOrder()) {
        c.addMention(c.get(m.sentNum - 1), m.headIndex, num);
      }
      num++;
    }

    // Run SRL over every collected sentence and write the results.
    for (Sentence sen : c) {
      pipeline.srl.parseSentence(sen);
      senCount++;
      if (senCount % 100 == 0) System.out.println("Processing sentence " + senCount);
      writer.write(sen);
    }
    return senCount;
  }
예제 #4
0
파일: Spanish.java 프로젝트: imace/mate
 /**
  * Heuristic passive-voice detection for Spanish (and Catalan) verbs, written for CoNLL 2009 but
  * never used in the final submission because it showed no measurable improvement. Its linguistic
  * correctness is unverified; kept here for future reference.
  *
  * @param pred the predicate
  * @return true if the predicate (verb) is in passive tense, false otherwise
  */
 private boolean isPassive(Predicate pred) {
   // Passive if some dependent is an auxiliary form of the copula "estar" or "ser".
   for (Word child : pred.getChildren()) {
     String lemma = child.getLemma();
     boolean isCopula = lemma.equals("estar") || lemma.equals("ser");
     if (isCopula && child.getFeats().contains("auxiliary")) {
       return true;
     }
   }
   return false;
 }
예제 #5
0
 /**
  * Decides whether features should be extracted for the given word: either its POS tag carries the
  * expected prefix, or it is a gold predicate and this extractor is used for predicate
  * identification without skipping non-matching predicates.
  *
  * @param pred the candidate word
  * @return true if features should be extracted for this word
  */
 protected boolean doExtractFeatures(Word pred) {
   // Primary criterion: the POS tag matches the configured prefix.
   if (pred.getPOS().startsWith(POSPrefix)) {
     return true;
   }
   // Fallback: during predicate identification, also accept gold predicates
   // unless the options say to skip non-matching ones.
   return usedForPredicateIdentification
       && !Learn.learnOptions.skipNonMatchingPredicates
       && pred instanceof Predicate;
 }