private static int parseCoNLL09( CompletePipelineCMDLineOptions options, CompletePipeline pipeline, BufferedReader in, SentenceWriter writer) throws IOException, Exception { List<String> forms = new ArrayList<String>(); forms.add("<root>"); List<Boolean> isPred = new ArrayList<Boolean>(); isPred.add(false); String str; int senCount = 0; while ((str = in.readLine()) != null) { if (str.trim().equals("")) { Sentence s; if (options.desegment) { s = pipeline.parse(ChineseDesegmenter.desegment(forms.toArray(new String[0]))); } else { s = options.skipPI ? pipeline.parseOraclePI(forms, isPred) : pipeline.parse(forms); } forms.clear(); forms.add("<root>"); isPred.clear(); isPred.add(false); // Root is not a predicate writer.write(s); senCount++; if (senCount % 100 == 0) { // TODO fix output in general, don't // print to System.out. Wrap a // printstream in some (static) // class, and allow people to adjust // this. While doing this, also add // the option to make the output // file be -, ie so it prints to // stdout. All kinds of errors // should goto stderr, and nothing // should be printed to stdout by // default System.out.println("Processing sentence " + senCount); } } else { String[] tokens = WHITESPACE_PATTERN.split(str); forms.add(tokens[1]); if (options.skipPI) isPred.add(tokens[12].equals("Y")); } } if (forms.size() > 1) { // We have the root token too, remember! writer.write(pipeline.parse(forms)); senCount++; } return senCount; }
private static int parseNonSegmentedLineByLine( CompletePipelineCMDLineOptions options, CompletePipeline pipeline, BufferedReader in, SentenceWriter writer) throws IOException, Exception { int senCount = 0; String str; while ((str = in.readLine()) != null) { Sentence s = pipeline.parse(str); writer.write(s); senCount++; if (senCount % 100 == 0) System.out.println("Processing sentence " + senCount); // TODO, // same // as // below. } return senCount; }