/**
 * Reads whitespace-separated, one-token-per-line input from {@code in}, parses each
 * blank-line-terminated sentence with {@code pipeline} and writes the result to {@code writer}.
 *
 * <p>The second column of every token line is used as the word form. When
 * {@code options.skipPI} is set, the thirteenth column is compared against {@code "Y"} to decide
 * whether the token is a predicate. A final sentence that is not followed by a blank line is
 * processed exactly like every other sentence (same desegment / oracle-PI handling).
 *
 * @param options command line options; {@code desegment} and {@code skipPI} are consulted
 * @param pipeline pipeline used to parse each sentence
 * @param in source of the token lines
 * @param writer sink for the parsed sentences
 * @return the number of sentences written
 * @throws IOException if reading from {@code in} fails
 * @throws Exception if parsing or writing fails
 */
private static int parseCoNLL09(
      CompletePipelineCMDLineOptions options,
      CompletePipeline pipeline,
      BufferedReader in,
      SentenceWriter writer)
      throws IOException, Exception {
    List<String> forms = new ArrayList<String>();
    forms.add("<root>");
    List<Boolean> isPred = new ArrayList<Boolean>();
    isPred.add(false); // Root is not a predicate
    String str;
    int senCount = 0;

    while ((str = in.readLine()) != null) {
      if (str.trim().equals("")) {
        // Blank line: the sentence collected so far is complete.
        Sentence s = parseCoNLL09Sentence(options, pipeline, forms, isPred);
        forms.clear();
        forms.add("<root>");
        isPred.clear();
        isPred.add(false); // Root is not a predicate
        writer.write(s);
        senCount++;
        if (senCount % 100 == 0) {
          // TODO fix output in general, don't print to System.out. Wrap a printstream in some
          // (static) class, and allow people to adjust this. While doing this, also add the
          // option to make the output file be -, ie so it prints to stdout. All kinds of errors
          // should goto stderr, and nothing should be printed to stdout by default
          System.out.println("Processing sentence " + senCount);
        }
      } else {
        String[] tokens = WHITESPACE_PATTERN.split(str);
        forms.add(tokens[1]);
        // NOTE(review): lines with fewer than 13 columns will fail here when skipPI is set.
        if (options.skipPI) isPred.add(tokens[12].equals("Y"));
      }
    }

    if (forms.size() > 1) { // We have the root token too, remember!
      // Input did not end with a blank line; handle the last sentence like all the others
      // instead of unconditionally calling pipeline.parse(forms).
      writer.write(parseCoNLL09Sentence(options, pipeline, forms, isPred));
      senCount++;
    }
    return senCount;
  }

  /** Parses one collected sentence, honouring the desegment and skipPI options. */
  private static Sentence parseCoNLL09Sentence(
      CompletePipelineCMDLineOptions options,
      CompletePipeline pipeline,
      List<String> forms,
      List<Boolean> isPred)
      throws Exception {
    if (options.desegment) {
      return pipeline.parse(ChineseDesegmenter.desegment(forms.toArray(new String[0])));
    }
    return options.skipPI ? pipeline.parseOraclePI(forms, isPred) : pipeline.parse(forms);
  }
Beispiel #2
0
  /**
   * Command line entry point: builds a semantic role labeler according to the parsed options,
   * runs it over every sentence of the input corpus and writes the labeled sentences out.
   *
   * <p>The model zip file, the sentence writer and the sentence reader are closed even when
   * loading or parsing fails.
   *
   * @param args command line arguments, handed to {@link ParseOptions}
   * @throws Exception if loading the model, parsing or writing fails
   */
  public static void main(String[] args) throws Exception {
    long startTime = System.currentTimeMillis();
    parseOptions = new ParseOptions(args);

    SemanticRoleLabeler srl;

    if (parseOptions.useReranker) {
      srl = new Reranker(parseOptions);
    } else {
      ZipFile zipFile = new ZipFile(parseOptions.modelFile);
      try {
        if (parseOptions.skipPD) {
          srl = Pipeline.fromZipFile(zipFile, new Step[] {Step.ai, Step.ac});
        } else if (parseOptions.skipPI) {
          // NOTE: Step.po and Step.ao were commented out at this point and stay disabled.
          srl = Pipeline.fromZipFile(zipFile, new Step[] {Step.pd, Step.ai, Step.ac});
        } else {
          srl = Pipeline.fromZipFile(zipFile);
        }
      } finally {
        // Release the model archive even if loading the pipeline throws.
        zipFile.close();
      }
    }

    SentenceWriter writer = null;
    if (parseOptions.printXML) writer = new FrameNetXMLWriter(parseOptions.output);
    else writer = new CoNLL09Writer(parseOptions.output);

    SentenceReader reader = null;
    try {
      reader =
          parseOptions.skipPI
              ? new SRLOnlyCoNLL09Reader(parseOptions.inputCorpus)
              : new DepsOnlyCoNLL09Reader(parseOptions.inputCorpus);
      int senCount = 0;
      for (Sentence s : reader) {
        senCount++;
        if (senCount % 100 == 0) System.out.println("Parsing sentence " + senCount);
        srl.parseSentence(s);
        if (parseOptions.writeCoref) writer.specialwrite(s);
        else writer.write(s);
      }
    } finally {
      // Same close order as before (writer first, then reader), but now also on failure.
      try {
        writer.close();
      } finally {
        if (reader != null) reader.close();
      }
    }
    long totalTime = System.currentTimeMillis() - startTime;
    System.out.println("Done.");
    System.out.println(srl.getStatus());
    System.out.println();
    System.out.println("Total execution time: " + Util.insertCommas(totalTime) + "ms");
  }
  /**
   * Command line entry point for the complete pipeline: verifies that the required model files
   * exist, builds the pipeline, processes the input file in the mode selected by the options and
   * prints timing statistics.
   *
   * <p>Mode selection: {@code parseFullDocument} when {@code options.glovedir} is set, otherwise
   * {@code parseNonSegmentedLineByLine} when {@code options.loadPreprocessorWithTokenizer} is
   * set, otherwise {@code parseCoNLL09}.
   *
   * @param args command line arguments
   * @throws Exception if building the pipeline, parsing or writing fails
   */
  public static void main(String[] args) throws Exception {
    CompletePipelineCMDLineOptions options = new CompletePipelineCMDLineOptions();
    options.parseCmdLineArgs(args);
    String error = FileExistenceVerifier.verifyCompletePipelineAllNecessaryModelFiles(options);
    if (error != null) {
      System.err.println(error);
      System.err.println();
      System.err.println("Aborting.");
      System.exit(1);
    }

    CompletePipeline pipeline = getCompletePipeline(options);

    BufferedReader in =
        new BufferedReader(
            new InputStreamReader(new FileInputStream(options.input), Charset.forName("UTF-8")));

    SentenceWriter writer = null;
    long start;
    int senCount;

    try {
      if (options.printANN) writer = new ANNWriter(options.output);
      else writer = new CoNLL09Writer(options.output);

      start = System.currentTimeMillis();

      if (options.glovedir != null) {
        senCount = parseFullDocument(options, pipeline, in, writer);
      } else if (options.loadPreprocessorWithTokenizer) {
        senCount = parseNonSegmentedLineByLine(options, pipeline, in, writer);
      } else {
        senCount = parseCoNLL09(options, pipeline, in, writer);
      }
    } finally {
      // Close input and output even when parsing fails; order matches the original.
      try {
        in.close();
      } finally {
        if (writer != null) writer.close();
      }
    }

    long time = System.currentTimeMillis() - start;
    System.out.println(pipeline.getStatusString());
    System.out.println();
    System.out.println("Total parsing time (ms):  " + Util.insertCommas(time));
    if (senCount > 0) {
      System.out.println("Overall speed (ms/sen):   " + Util.insertCommas(time / senCount));
    } else {
      // Empty input: avoid dividing by zero.
      System.out.println("Overall speed (ms/sen):   n/a (no sentences processed)");
    }
  }
  /**
   * Treats every line read from {@code in} as one unsegmented sentence, parses it with
   * {@code pipeline} and writes the result to {@code writer}.
   *
   * @param options command line options (not consulted by this method)
   * @param pipeline pipeline used to parse each line
   * @param in source of the input lines
   * @param writer sink for the parsed sentences
   * @return the number of lines processed
   * @throws IOException if reading from {@code in} fails
   * @throws Exception if parsing or writing fails
   */
  private static int parseNonSegmentedLineByLine(
      CompletePipelineCMDLineOptions options,
      CompletePipeline pipeline,
      BufferedReader in,
      SentenceWriter writer)
      throws IOException, Exception {
    int processed = 0;

    for (String line = in.readLine(); line != null; line = in.readLine()) {
      Sentence parsed = pipeline.parse(line);
      writer.write(parsed);
      processed++;
      // TODO: progress output should not be hard-wired to System.out.
      if (processed % 100 == 0) {
        System.out.println("Processing sentence " + processed);
      }
    }

    return processed;
  }
  /**
   * Processes the complete input as a single document.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>read all of {@code in} into one string and annotate it with a Stanford CoreNLP
   *       pipeline (tokenize, ssplit, pos, lemma, ner, parse, dcoref);
   *   <li>obtain word vectors for the document from {@code ExternalProcesses.createvecs};
   *   <li>for every sentence, send the {@code word_TAG} sequence to the process reached via
   *       {@code options.mstserver}, read heads and dependency labels from its tab-separated
   *       reply, and mark predicates/frames found in the reply of the process reached via
   *       {@code options.semaforserver};
   *   <li>attach non-singleton coreference chains to the corpus;
   *   <li>run {@code pipeline.srl} over every sentence of the corpus and write it out.
   * </ol>
   *
   * @param options command line options; {@code glovedir}, {@code mstserver} and
   *     {@code semaforserver} are used
   * @param pipeline pipeline whose {@code srl} component labels the sentences
   * @param in source of the document text
   * @param writer sink for the labeled sentences
   * @return the number of sentences written
   * @throws IOException if reading from {@code in} fails
   * @throws Exception if any of the processing steps fails
   */
  private static int parseFullDocument(
      CompletePipelineCMDLineOptions options,
      CompletePipeline pipeline,
      BufferedReader in,
      SentenceWriter writer)
      throws IOException, Exception {

    /* initialize */
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    props.put(
        "dcoref.sievePasses",
        "MarkRole,"
            + "DiscourseMatch,"
            + "ExactStringMatch,"
            + "RelaxedExactStringMatch,"
            + "PreciseConstructs,"
            + "StrictHeadMatch1,"
            + "StrictHeadMatch2,"
            + "StrictHeadMatch3,"
            + "StrictHeadMatch4,"
            + "RelaxedHeadMatch");
    StanfordCoreNLP stanfordpipeline = new StanfordCoreNLP(props);
    ExternalProcesses glove = new ExternalProcesses(options.glovedir);

    /* read full text */
    int senCount = 0;
    String str;
    StringBuilder text = new StringBuilder();
    while ((str = in.readLine()) != null) {
      text.append(str);
      text.append("\n");
    }

    /* document-level preprocessing */
    Annotation document = new Annotation(text.toString());
    stanfordpipeline.annotate(document);

    Map<String, Double[]> word2vecs = glove.createvecs(document);

    Corpus c = new Corpus("tmp");

    // Extracts (frame name, start token index) pairs from the SEMAFOR JSON reply.
    // Compiled once here instead of once per sentence.
    Pattern pred_frame =
        Pattern.compile(
            "\\{\"target\":\\{\"name\":\"([A-Za-z_]*)\",\"spans\":\\[\\{\"start\":([0-9]*),\"");

    /* sentence-level preprocessing */
    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
      List<CoreLabel> sentenceTokens = sentence.get(TokensAnnotation.class);

      // Build the "word_TAG word_TAG ..." line that is sent to the dependency parser.
      StringBuilder posOutput = new StringBuilder();
      for (CoreLabel token : sentenceTokens) {
        if (posOutput.length() > 0) {
          posOutput.append(" ");
        }
        posOutput.append(token.word());
        posOutput.append("_");
        posOutput.append(token.tag());
      }

      String parse =
          ExternalProcesses.runProcess(
              "nc " + options.mstserver.replaceAll(":", " "), posOutput.toString());
      // Re-split the reply into one token per line ("@#" is a temporary marker).
      parse = parse.replaceAll("-\t-", "_\t_\n@#").replaceAll("@#\t", "").replaceAll("@#", "");

      String[] lines = parse.split("\n");
      // Index 0 of the word-level arrays is left unset; heads/deprels have no such slot.
      String[] words = new String[lines.length + 1];
      String[] lemmas = new String[lines.length + 1];
      String[] tags = new String[lines.length + 1];
      String[] morphs = new String[lines.length + 1];
      int[] heads = new int[lines.length];
      String[] deprels = new String[lines.length];

      // NOTE(review): assumes the parser returns exactly one line per CoreNLP token.
      for (int i = 1; i < words.length; i++) {
        String[] parts = lines[i - 1].split("\t");
        CoreLabel token = sentenceTokens.get(i - 1);
        words[i] = token.word();
        tags[i] = token.tag();
        lemmas[i] = token.lemma();
        morphs[i] = "_";
        heads[i - 1] = Integer.parseInt(parts[6]);
        deprels[i - 1] = parts[7];
      }
      Sentence sen = new Sentence(words, lemmas, tags, morphs);
      sen.setHeadsAndDeprels(heads, deprels);

      /* add labeled predicates from SEMAFOR */
      String json =
          ExternalProcesses.runProcess("nc " + options.semaforserver.replaceAll(":", " "), parse);
      Matcher m = pred_frame.matcher(json);
      while (m.find()) {
        String frame = m.group(1);
        int index = Integer.parseInt(m.group(2));
        System.out.println(index + "\t" + frame);

        // The matched start index is shifted by one to address the word in the sentence.
        sen.makePredicate(index + 1);
        ((Predicate) sen.get(index + 1)).setSense(frame);
      }

      // Attach a word vector to every word whose lower-cased form has one.
      for (Word w : sen)
        if (word2vecs.containsKey(w.getForm().toLowerCase()))
          w.setRep(word2vecs.get(w.getForm().toLowerCase()));

      // Constructed for its side effect; presumably registers the sentence with the corpus,
      // since the corpus is iterated below — TODO confirm.
      new CorpusSentence(sen, c);
    }

    /* add coref output to corpus */
    Map<Integer, CorefChain> coref = document.get(CorefChainAnnotation.class);
    int num = 1;
    for (Map.Entry<Integer, CorefChain> entry : coref.entrySet()) {
      CorefChain cc = entry.getValue();
      // skip singleton mentions
      if (cc.getMentionsInTextualOrder().size() == 1) continue;

      for (CorefMention m : cc.getMentionsInTextualOrder()) {
        c.addMention(c.get(m.sentNum - 1), m.headIndex, num);
      }
      num++;
    }

    for (Sentence sen : c) {
      pipeline.srl.parseSentence(sen);
      senCount++;
      if (senCount % 100 == 0) System.out.println("Processing sentence " + senCount);
      writer.write(sen);
    }
    return senCount;
  }