Пример #1
0
  /**
   * Reads pre-tokenized, one-token-per-line input from {@code in}, parses every sentence with
   * {@code pipeline} and hands the result to {@code writer}.
   *
   * <p>Sentences are separated by blank (whitespace-only) lines. For every token line the second
   * whitespace-separated column is taken as the word form; when {@code options.skipPI} is set, the
   * thirteenth column is compared with {@code "Y"} to mark the token as a predicate (presumably
   * the CoNLL-09 FORM and FILLPRED columns; confirm against the format specification).
   *
   * <p>A sentence that is not followed by a blank line at end of input is parsed with the same
   * options as every other sentence. Blank lines that do not terminate any tokens (leading or
   * repeated blank lines) are ignored instead of producing a root-only sentence.
   *
   * @param options command line options; {@code desegment} and {@code skipPI} select the parse mode
   * @param pipeline the pipeline used to parse each sentence
   * @param in the reader supplying the input lines; not closed by this method
   * @param writer the sink for parsed sentences; not closed by this method
   * @return the number of sentences parsed and written
   * @throws IOException if reading from {@code in} fails
   * @throws Exception if the pipeline or the writer fails
   */
  private static int parseCoNLL09(
      CompletePipelineCMDLineOptions options,
      CompletePipeline pipeline,
      BufferedReader in,
      SentenceWriter writer)
      throws IOException, Exception {
    List<String> forms = new ArrayList<String>();
    forms.add("<root>");
    List<Boolean> isPred = new ArrayList<Boolean>();
    isPred.add(false); // Root is not a predicate
    String str;
    int senCount = 0;

    while ((str = in.readLine()) != null) {
      if (str.trim().equals("")) {
        if (forms.size() <= 1) {
          // Only the artificial root is buffered: a leading or repeated blank line, not a sentence.
          continue;
        }
        Sentence s = parseBufferedCoNLL09Sentence(options, pipeline, forms, isPred);
        forms.clear();
        forms.add("<root>");
        isPred.clear();
        isPred.add(false); // Root is not a predicate
        writer.write(s);
        senCount++;
        if (senCount % 100 == 0) {
          // TODO fix output in general, don't print to System.out. Wrap a printstream in some
          // (static) class, and allow people to adjust this. While doing this, also add the
          // option to make the output file be -, ie so it prints to stdout. All kinds of errors
          // should goto stderr, and nothing should be printed to stdout by default
          System.out.println("Processing sentence " + senCount);
        }
      } else {
        String[] tokens = WHITESPACE_PATTERN.split(str);
        forms.add(tokens[1]);
        if (options.skipPI) isPred.add(tokens[12].equals("Y"));
      }
    }

    if (forms.size() > 1) { // We have the root token too, remember!
      // Input ended without a terminating blank line; parse the pending sentence with the same
      // options as all the others.
      writer.write(parseBufferedCoNLL09Sentence(options, pipeline, forms, isPred));
      senCount++;
    }
    return senCount;
  }

  /**
   * Parses one buffered sentence (root token included) according to the command line options:
   * desegmentation takes precedence, then oracle predicate identification, then a plain parse.
   */
  private static Sentence parseBufferedCoNLL09Sentence(
      CompletePipelineCMDLineOptions options,
      CompletePipeline pipeline,
      List<String> forms,
      List<Boolean> isPred)
      throws Exception {
    if (options.desegment) {
      return pipeline.parse(ChineseDesegmenter.desegment(forms.toArray(new String[0])));
    }
    return options.skipPI ? pipeline.parseOraclePI(forms, isPred) : pipeline.parse(forms);
  }
Пример #2
0
  /**
   * Command line entry point: verifies the model files, builds the pipeline, parses the input file
   * and prints timing statistics.
   *
   * <p>The input is read as UTF-8 from {@code options.input}. The output goes to an {@code
   * ANNWriter} when {@code options.printANN} is set, otherwise to a {@code CoNLL09Writer}. The
   * parse mode is chosen from {@code options.glovedir} and {@code
   * options.loadPreprocessorWithTokenizer}. The reader and the writer are closed even when parsing
   * fails.
   *
   * @param args the command line arguments, handed to {@code options.parseCmdLineArgs}
   * @throws Exception if reading, parsing or writing fails
   */
  public static void main(String[] args) throws Exception {
    CompletePipelineCMDLineOptions options = new CompletePipelineCMDLineOptions();
    options.parseCmdLineArgs(args);
    String error = FileExistenceVerifier.verifyCompletePipelineAllNecessaryModelFiles(options);
    if (error != null) {
      System.err.println(error);
      System.err.println();
      System.err.println("Aborting.");
      System.exit(1);
    }

    CompletePipeline pipeline = getCompletePipeline(options);

    long start;
    int senCount;

    BufferedReader in =
        new BufferedReader(
            new InputStreamReader(new FileInputStream(options.input), Charset.forName("UTF-8")));
    try {
      SentenceWriter writer;
      if (options.printANN) {
        writer = new ANNWriter(options.output);
      } else {
        writer = new CoNLL09Writer(options.output);
      }

      try {
        start = System.currentTimeMillis();

        if (options.glovedir != null) {
          senCount = parseFullDocument(options, pipeline, in, writer);
        } else if (options.loadPreprocessorWithTokenizer) {
          senCount = parseNonSegmentedLineByLine(options, pipeline, in, writer);
        } else {
          senCount = parseCoNLL09(options, pipeline, in, writer);
        }
      } finally {
        // Close the writer even if parsing failed so already-written output is not left open.
        writer.close();
      }
    } finally {
      in.close();
    }

    long time = System.currentTimeMillis() - start;
    System.out.println(pipeline.getStatusString());
    System.out.println();
    System.out.println("Total parsing time (ms):  " + Util.insertCommas(time));
    if (senCount > 0) {
      System.out.println("Overall speed (ms/sen):   " + Util.insertCommas(time / senCount));
    } else {
      // Empty input: there is no per-sentence speed, and dividing by zero would throw.
      System.out.println("Overall speed (ms/sen):   n/a");
    }
  }
Пример #3
0
  /**
   * Parses raw input one line at a time: every line read from {@code in} is handed to {@code
   * pipeline.parse(String)} and the resulting sentence is written to {@code writer}.
   *
   * <p>A progress message is printed to standard output after every 100th sentence.
   *
   * @param options command line options; not consulted by this method
   * @param pipeline the pipeline used to parse each line
   * @param in the reader supplying one sentence per line; not closed by this method
   * @param writer the sink for parsed sentences; not closed by this method
   * @return the number of lines parsed and written
   * @throws IOException if reading from {@code in} fails
   * @throws Exception if the pipeline or the writer fails
   */
  private static int parseNonSegmentedLineByLine(
      CompletePipelineCMDLineOptions options,
      CompletePipeline pipeline,
      BufferedReader in,
      SentenceWriter writer)
      throws IOException, Exception {
    int processed = 0;

    for (String line = in.readLine(); line != null; line = in.readLine()) {
      Sentence parsed = pipeline.parse(line);
      writer.write(parsed);
      processed++;

      // TODO: progress output should not go straight to System.out; make the stream configurable.
      boolean atReportingInterval = processed % 100 == 0;
      if (atReportingInterval) {
        System.out.println("Processing sentence " + processed);
      }
    }

    return processed;
  }