private static int parseCoNLL09( CompletePipelineCMDLineOptions options, CompletePipeline pipeline, BufferedReader in, SentenceWriter writer) throws IOException, Exception { List<String> forms = new ArrayList<String>(); forms.add("<root>"); List<Boolean> isPred = new ArrayList<Boolean>(); isPred.add(false); String str; int senCount = 0; while ((str = in.readLine()) != null) { if (str.trim().equals("")) { Sentence s; if (options.desegment) { s = pipeline.parse(ChineseDesegmenter.desegment(forms.toArray(new String[0]))); } else { s = options.skipPI ? pipeline.parseOraclePI(forms, isPred) : pipeline.parse(forms); } forms.clear(); forms.add("<root>"); isPred.clear(); isPred.add(false); // Root is not a predicate writer.write(s); senCount++; if (senCount % 100 == 0) { // TODO fix output in general, don't // print to System.out. Wrap a // printstream in some (static) // class, and allow people to adjust // this. While doing this, also add // the option to make the output // file be -, ie so it prints to // stdout. All kinds of errors // should goto stderr, and nothing // should be printed to stdout by // default System.out.println("Processing sentence " + senCount); } } else { String[] tokens = WHITESPACE_PATTERN.split(str); forms.add(tokens[1]); if (options.skipPI) isPred.add(tokens[12].equals("Y")); } } if (forms.size() > 1) { // We have the root token too, remember! writer.write(pipeline.parse(forms)); senCount++; } return senCount; }
/**
 * Command line entry point: parses the options, checks that the required
 * model files exist, builds the pipeline, runs it over the input file and
 * prints the pipeline status and timing figures to standard output.
 *
 * <p>If the model file check reports an error, the error is printed to
 * standard error and the JVM exits with status 1. The input is read as
 * UTF-8. The parsing mode is chosen from the options: full document when
 * {@code glovedir} is set, line by line when
 * {@code loadPreprocessorWithTokenizer} is set, CoNLL09 otherwise.
 *
 * @param args command line arguments
 * @throws Exception if reading, parsing or writing fails
 */
public static void main(String[] args) throws Exception {
    CompletePipelineCMDLineOptions options = new CompletePipelineCMDLineOptions();
    options.parseCmdLineArgs(args);
    String error = FileExistenceVerifier.verifyCompletePipelineAllNecessaryModelFiles(options);
    if (error != null) {
        System.err.println(error);
        System.err.println();
        System.err.println("Aborting.");
        System.exit(1);
    }

    CompletePipeline pipeline = getCompletePipeline(options);

    BufferedReader in = new BufferedReader(
            new InputStreamReader(new FileInputStream(options.input), Charset.forName("UTF-8")));
    SentenceWriter writer = null;
    long start;
    int senCount;
    try {
        if (options.printANN) {
            writer = new ANNWriter(options.output);
        } else {
            writer = new CoNLL09Writer(options.output);
        }

        start = System.currentTimeMillis();
        if (options.glovedir != null) {
            senCount = parseFullDocument(options, pipeline, in, writer);
        } else if (options.loadPreprocessorWithTokenizer) {
            senCount = parseNonSegmentedLineByLine(options, pipeline, in, writer);
        } else {
            senCount = parseCoNLL09(options, pipeline, in, writer);
        }
    } finally {
        // Release both resources even when parsing fails part-way through.
        try {
            in.close();
        } finally {
            if (writer != null) {
                writer.close();
            }
        }
    }

    long time = System.currentTimeMillis() - start;
    System.out.println(pipeline.getStatusString());
    System.out.println();
    System.out.println("Total parsing time (ms):  " + Util.insertCommas(time));
    if (senCount > 0) {
        System.out.println("Overall speed (ms/sen):   " + Util.insertCommas(time / senCount));
    } else {
        // Empty input: avoid dividing by zero.
        System.out.println("Overall speed (ms/sen):   n/a (no sentences parsed)");
    }
}
private static int parseNonSegmentedLineByLine( CompletePipelineCMDLineOptions options, CompletePipeline pipeline, BufferedReader in, SentenceWriter writer) throws IOException, Exception { int senCount = 0; String str; while ((str = in.readLine()) != null) { Sentence s = pipeline.parse(str); writer.write(s); senCount++; if (senCount % 100 == 0) System.out.println("Processing sentence " + senCount); // TODO, // same // as // below. } return senCount; }