private static int parseCoNLL09(
        CompletePipelineCMDLineOptions options,
        CompletePipeline pipeline,
        BufferedReader in,
        SentenceWriter writer)
        throws IOException, Exception {
    List<String> forms = new ArrayList<String>();
    forms.add("<root>");
    List<Boolean> isPred = new ArrayList<Boolean>();
    isPred.add(false);
    String str;
    int senCount = 0;

    while ((str = in.readLine()) != null) {
        if (str.trim().equals("")) {
            // Blank line: end of sentence. Parse what we have collected so far.
            Sentence s;
            if (options.desegment) {
                s = pipeline.parse(ChineseDesegmenter.desegment(forms.toArray(new String[0])));
            } else {
                s = options.skipPI
                        ? pipeline.parseOraclePI(forms, isPred)
                        : pipeline.parse(forms);
            }
            forms.clear();
            forms.add("<root>");
            isPred.clear();
            isPred.add(false); // Root is not a predicate
            writer.write(s);
            senCount++;
            if (senCount % 100 == 0) {
                // TODO fix output in general, don't print to System.out. Wrap a
                // PrintStream in some (static) class, and allow people to adjust
                // this. While doing this, also add the option to make the output
                // file be -, i.e. so it prints to stdout. All kinds of errors
                // should go to stderr, and nothing should be printed to stdout
                // by default.
                System.out.println("Processing sentence " + senCount);
            }
        } else {
            String[] tokens = WHITESPACE_PATTERN.split(str);
            forms.add(tokens[1]); // FORM column
            if (options.skipPI)
                isPred.add(tokens[12].equals("Y")); // FILLPRED column
        }
    }

    // Handle a final sentence that is not terminated by a blank line,
    // mirroring the oracle-PI handling used inside the loop.
    if (forms.size() > 1) { // We have the root token too, remember!
        writer.write(options.skipPI
                ? pipeline.parseOraclePI(forms, isPred)
                : pipeline.parse(forms));
        senCount++;
    }
    return senCount;
}
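/*
 * For reference, parseCoNLL09 assumes the CoNLL-2009 column layout:
 *
 *   ID FORM LEMMA PLEMMA POS PPOS FEAT PFEAT HEAD PHEAD DEPREL PDEPREL FILLPRED PRED APRED...
 *
 * so tokens[1] is the surface form and tokens[12] is the FILLPRED flag
 * ("Y" when the token is a predicate, "_" otherwise). The helper below is a
 * hypothetical sketch that makes this column convention explicit; it is not
 * part of the original pipeline.
 */
private static boolean isFillPred(String[] tokens) {
    // A CoNLL-2009 token line has at least 13 columns; FILLPRED is index 12.
    return tokens.length > 12 && tokens[12].equals("Y");
}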
public static void main(String[] args) throws Exception {
    long startTime = System.currentTimeMillis();
    parseOptions = new ParseOptions(args);

    SemanticRoleLabeler srl;
    if (parseOptions.useReranker) {
        srl = new Reranker(parseOptions);
        // srl = Reranker.fromZipFile(zipFile, parseOptions.skipPI,
        //         parseOptions.global_alfa, parseOptions.global_aiBeam,
        //         parseOptions.global_acBeam);
    } else {
        ZipFile zipFile = new ZipFile(parseOptions.modelFile);
        srl = parseOptions.skipPD
                ? Pipeline.fromZipFile(zipFile, new Step[] {Step.ai, Step.ac})
                : parseOptions.skipPI
                        ? Pipeline.fromZipFile(zipFile,
                                new Step[] {Step.pd, Step.ai, Step.ac /* , Step.po, Step.ao */})
                        : Pipeline.fromZipFile(zipFile);
        zipFile.close();
    }

    SentenceWriter writer;
    if (parseOptions.printXML)
        writer = new FrameNetXMLWriter(parseOptions.output);
    else
        writer = new CoNLL09Writer(parseOptions.output);

    SentenceReader reader = parseOptions.skipPI
            ? new SRLOnlyCoNLL09Reader(parseOptions.inputCorpus)
            : new DepsOnlyCoNLL09Reader(parseOptions.inputCorpus);

    int senCount = 0;
    for (Sentence s : reader) {
        senCount++;
        if (senCount % 100 == 0)
            System.out.println("Parsing sentence " + senCount);
        srl.parseSentence(s);
        if (parseOptions.writeCoref)
            writer.specialwrite(s);
        else
            writer.write(s);
    }

    writer.close();
    reader.close();

    long totalTime = System.currentTimeMillis() - startTime;
    System.out.println("Done.");
    System.out.println(srl.getStatus());
    System.out.println();
    System.out.println("Total execution time: " + Util.insertCommas(totalTime) + "ms");
}
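/*
 * Note on error handling in main: if srl.parseSentence or writer.write
 * throws, the writer and reader are never closed. A more defensive variant
 * (a sketch, assuming SentenceReader and SentenceWriter keep the close()
 * methods used above) would guard the loop with try/finally:
 *
 *   try {
 *       for (Sentence s : reader) {
 *           ...
 *       }
 *   } finally {
 *       writer.close();
 *       reader.close();
 *   }
 */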
private static int parseNonSegmentedLineByLine(
        CompletePipelineCMDLineOptions options,
        CompletePipeline pipeline,
        BufferedReader in,
        SentenceWriter writer)
        throws IOException, Exception {
    int senCount = 0;
    String str;

    while ((str = in.readLine()) != null) {
        Sentence s = pipeline.parse(str);
        writer.write(s);
        senCount++;
        if (senCount % 100 == 0)
            System.out.println("Processing sentence " + senCount); // TODO: same as above.
    }
    return senCount;
}
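/*
 * parseNonSegmentedLineByLine expects one raw (untokenized) sentence per
 * input line, e.g.:
 *
 *   The quick brown fox jumps over the lazy dog.
 *   It was too slow to react.
 *
 * Each line is handed to pipeline.parse(String), which is assumed to do its
 * own tokenization; contrast with parseCoNLL09 above, which consumes
 * pre-tokenized CoNLL-2009 columns.
 */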
private static int parseFullDocument(
        CompletePipelineCMDLineOptions options,
        CompletePipeline pipeline,
        BufferedReader in,
        SentenceWriter writer)
        throws IOException, Exception {
    /* Initialize the Stanford CoreNLP pipeline and the GloVe wrapper. */
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    props.put("dcoref.sievePasses",
            "MarkRole,"
                    + "DiscourseMatch,"
                    + "ExactStringMatch,"
                    + "RelaxedExactStringMatch,"
                    + "PreciseConstructs,"
                    + "StrictHeadMatch1,"
                    + "StrictHeadMatch2,"
                    + "StrictHeadMatch3,"
                    + "StrictHeadMatch4,"
                    + "RelaxedHeadMatch");
    StanfordCoreNLP stanfordpipeline = new StanfordCoreNLP(props);
    ExternalProcesses glove = new ExternalProcesses(options.glovedir);

    /* Read the full input text. */
    int senCount = 0;
    String str;
    StringBuffer text = new StringBuffer();
    while ((str = in.readLine()) != null) {
        text.append(str);
        text.append("\n");
    }

    /* Document-level preprocessing. */
    Annotation document = new Annotation(text.toString());
    stanfordpipeline.annotate(document);
    Map<String, Double[]> word2vecs = glove.createvecs(document);

    Corpus c = new Corpus("tmp");

    /* Sentence-level preprocessing. */
    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
        // Build a word_TAG string as input for the MST parser.
        StringBuffer posOutput = new StringBuffer();
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            if (posOutput.length() > 0) {
                posOutput.append(" ");
            }
            posOutput.append(token.word());
            posOutput.append("_");
            posOutput.append(token.tag());
        }

        // Send the tagged sentence to the MST parser server via netcat.
        String parse = ExternalProcesses.runProcess(
                "nc " + options.mstserver.replaceAll(":", " "), posOutput.toString());
        parse = parse.replaceAll("-\t-", "_\t_\n@#")
                .replaceAll("@#\t", "")
                .replaceAll("@#", "");

        String[] lines = parse.split("\n");
        String[] words = new String[lines.length + 1];
        String[] lemmas = new String[lines.length + 1];
        String[] tags = new String[lines.length + 1];
        String[] morphs = new String[lines.length + 1];
        int[] heads = new int[lines.length];
        String[] deprels = new String[lines.length];

        // Token arrays are 1-based, with index 0 reserved for the root;
        // heads and deprels are 0-based, aligned with the parser output lines.
        for (int i = 1; i < words.length; i++) {
            String[] parts = lines[i - 1].split("\t");
            words[i] = sentence.get(TokensAnnotation.class).get(i - 1).word();
            tags[i] = sentence.get(TokensAnnotation.class).get(i - 1).tag();
            lemmas[i] = sentence.get(TokensAnnotation.class).get(i - 1).lemma();
            morphs[i] = "_";
            heads[i - 1] = Integer.parseInt(parts[6]);
            deprels[i - 1] = parts[7];
        }

        Sentence sen = new Sentence(words, lemmas, tags, morphs);
        sen.setHeadsAndDeprels(heads, deprels);

        /* Add labeled predicates from SEMAFOR. */
        String json = ExternalProcesses.runProcess(
                "nc " + options.semaforserver.replaceAll(":", " "), parse);
        Pattern pred_frame = Pattern.compile(
                "\\{\"target\":\\{\"name\":\"([A-Za-z_]*)\",\"spans\":\\[\\{\"start\":([0-9]*),\"");
        Matcher m = pred_frame.matcher(json);
        while (m.find()) {
            String frame = m.group(1);
            int index = Integer.parseInt(m.group(2));
            System.out.println(index + "\t" + frame);
            sen.makePredicate(index + 1);
            ((Predicate) sen.get(index + 1)).setSense(frame);
        }

        // Attach word vectors where available.
        for (Word w : sen)
            if (word2vecs.containsKey(w.getForm().toLowerCase()))
                w.setRep(word2vecs.get(w.getForm().toLowerCase()));

        new CorpusSentence(sen, c);
    }

    /* Add coref output to the corpus. */
    Map<Integer, CorefChain> coref = document.get(CorefChainAnnotation.class);
    int num = 1;
    for (Map.Entry<Integer, CorefChain> entry : coref.entrySet()) {
        CorefChain cc = entry.getValue();
        // Skip singleton mentions.
        if (cc.getMentionsInTextualOrder().size() == 1)
            continue;
        for (CorefMention m : cc.getMentionsInTextualOrder()) {
            c.addMention(c.get(m.sentNum - 1), m.headIndex, num);
        }
        num++;
    }

    /* Run the SRL pipeline over each preprocessed sentence. */
    for (Sentence sen : c) {
        pipeline.srl.parseSentence(sen);
        senCount++;
        if (senCount % 100 == 0)
            System.out.println("Processing sentence " + senCount);
        writer.write(sen);
    }
    return senCount;
}
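/*
 * The pred_frame pattern in parseFullDocument matches SEMAFOR-style JSON
 * fragments of roughly this shape (a minimal illustrative example derived
 * from the regex itself, not captured from real SEMAFOR output):
 *
 *   {"target":{"name":"Motion","spans":[{"start":3,"end":4}]}}
 *
 * Group 1 captures the frame name ("Motion") and group 2 the 0-based token
 * offset (3), which the loop shifts by one because sentence positions are
 * 1-based, with the root at index 0.
 */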