public void saveSerialized(String path) {
  try {
    IOUtils.writeObjectToFile(this, path);
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
}
public static void main(String[] args) throws IOException {
  PrintWriter out;
  if (args.length > 1) {
    out = new PrintWriter(args[1]);
  } else {
    out = new PrintWriter(System.out);
  }
  PrintWriter xmlOut = null;
  if (args.length > 2) {
    xmlOut = new PrintWriter(args[2]);
  }

  Properties props = new Properties();
  props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  Annotation annotation;
  if (args.length > 0) {
    annotation = new Annotation(IOUtils.slurpFileNoExceptions(args[0]));
  } else {
    annotation = new Annotation(
        "Kosgi Santosh sent an email to Stanford University. He didn't get a reply.");
  }
  pipeline.annotate(annotation);
  pipeline.prettyPrint(annotation, out);
  // Previously xmlOut was opened but never written to; emit XML output
  // when a third argument is supplied.
  if (xmlOut != null) {
    pipeline.xmlPrint(annotation, xmlOut);
    xmlOut.flush();
  }
  out.flush();
}
private Set<String> readDict(String filename) {
  Set<String> a = Generics.newHashSet();
  // System.err.println("XM:::readDict(filename: " + filename + ")");
  System.err.println("Loading affix dictionary from " + filename);
  try {
    /*
    if (filename.endsWith("in.as") || filename.endsWith("in.city")) {
      aDetectorReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "Big5_HKSCS"));
    } else {
      aDetectorReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "GB18030"));
    }
    */
    InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename);
    BufferedReader aDetectorReader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
    String aDetectorLine;
    // System.err.println("DEBUG: in affDict readDict");
    while ((aDetectorLine = aDetectorReader.readLine()) != null) {
      // System.err.println("DEBUG: affDict: " + filename + " " + aDetectorLine);
      a.add(aDetectorLine);
    }
    is.close();
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
  return a;
}
/**
 * Creates a combined list of Entries using the provided mapping files.
 *
 * @param mappings List of mapping files
 * @return list of Entries
 */
private static List<Entry> readEntries(
    String annotatorName,
    Set<String> noDefaultOverwriteLabels,
    boolean ignoreCase,
    boolean verbose,
    String... mappings) {
  // Unlike RegexNERClassifier, we don't bother sorting the entries.
  // We leave it to TokensRegex NER to sort out the priorities and matches
  // (typically after all the matches have been made, since for some TokensRegex expressions
  // we don't know how many tokens are matched until after the matching is done).
  List<Entry> entries = new ArrayList<>();
  TrieMap<String, Entry> seenRegexes = new TrieMap<>();
  Arrays.sort(mappings);
  for (String mapping : mappings) {
    BufferedReader rd = null;
    try {
      rd = IOUtils.readerFromString(mapping);
      readEntries(annotatorName, entries, seenRegexes, mapping, rd,
          noDefaultOverwriteLabels, ignoreCase, verbose);
    } catch (IOException e) {
      throw new RuntimeIOException("Couldn't read TokensRegexNER from " + mapping, e);
    } finally {
      IOUtils.closeIgnoringExceptions(rd);
    }
  }

  if (mappings.length != 1) {
    logger.log("TokensRegexNERAnnotator " + annotatorName + ": Read " + entries.size()
        + " unique entries from " + mappings.length + " files");
  }
  return entries;
}
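For context, a hedged sketch of how such mappings are supplied: each mapping-file line is tab-separated, and the annotator can be built directly from a mapping path. The file name and entry below are illustrative assumptions, not taken from the source.

// Each mapping line is tab-separated: a token-sequence pattern, the NER type,
// optionally the types it may overwrite, and optionally a priority, e.g.
//   Stanford University\tSCHOOL\tORGANIZATION\t1.0   (illustrative entry)
TokensRegexNERAnnotator annotator =
    new TokensRegexNERAnnotator("my_ner_mappings.tsv", /* ignoreCase */ true);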
public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics)
    throws Exception {
  super(dict, semantics);
  String fileName = props.getProperty(Constants.MUC_PROP);
  fileContents = IOUtils.slurpFile(fileName);
  currentOffset = 0;
  tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "");
  stanfordProcessor = loadStanfordProcessor(props);
}
public static SentimentModel loadSerialized(String path) {
  try {
    return IOUtils.readObjectFromURLOrClasspathOrFileSystem(path);
  } catch (IOException | ClassNotFoundException e) {
    throw new RuntimeIOException(e);
  }
}
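A minimal usage sketch pairing this loader with the saveSerialized method shown earlier, assuming both live on SentimentModel; the output path is illustrative.

// Load the standard sentiment model from the classpath, then write a copy to disk.
SentimentModel model = SentimentModel.loadSerialized(
    "edu/stanford/nlp/models/sentiment/sentiment.ser.gz");
model.saveSerialized("/tmp/sentiment-model-copy.ser.gz");  // hypothetical output path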
protected void read(String filename) {
  try {
    DataInputStream in = IOUtils.getDataInputStream(filename);
    read(in);
    in.close();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
protected void save(String filename, Map<String, Set<String>> tagTokens) {
  try {
    DataOutputStream out = IOUtils.getDataOutputStream(filename);
    save(out, tagTokens);
    out.close();
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
}
/**
 * Saves the singleton predictor model to the given filename.
 * If there is an error, a RuntimeIOException is thrown.
 */
public void saveToSerialized(LogisticClassifier<String, String> predictor, String filename) {
  try {
    log.info("Writing singleton predictor in serialized format to file " + filename + ' ');
    ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
    out.writeObject(predictor);
    out.close();
    log.info("done.");
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe);
  }
}
public static void main(String[] args) throws IOException {
  String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";

  if (args.length > 0) {
    serializedClassifier = args[0];
  }

  AbstractSequenceClassifier<CoreLabel> classifier =
      CRFClassifier.getClassifierNoExceptions(serializedClassifier);

  /* For either a file to annotate or for the hardcoded text example, this
     demo file shows two ways to process the output, for teaching purposes.
     For the file, it shows both how to run NER on a String and how to
     run it on a whole file.  For the hard-coded String, it shows how to
     run it on a single sentence, and how to do this and produce an
     inline XML output format.
  */
  if (args.length > 1) {
    String fileContents = IOUtils.slurpFile(args[1]);
    List<List<CoreLabel>> out = classifier.classify(fileContents);
    for (List<CoreLabel> sentence : out) {
      for (CoreLabel word : sentence) {
        System.out.print(word.word() + '/' + word.get(AnswerAnnotation.class) + ' ');
      }
      System.out.println();
    }
    out = classifier.classifyFile(args[1]);
    for (List<CoreLabel> sentence : out) {
      for (CoreLabel word : sentence) {
        System.out.print(word.word() + '/' + word.get(AnswerAnnotation.class) + ' ');
      }
      System.out.println();
    }
  } else {
    String s1 = "Good afternoon Rajat Raina, how are you today?";
    String s2 = "I go to school at Stanford University, which is located in California.";
    System.out.println(classifier.classifyToString(s1));
    System.out.println(classifier.classifyWithInlineXML(s2));
    System.out.println(classifier.classifyToString(s2, "xml", true));
    int i = 0;
    for (List<CoreLabel> lcl : classifier.classify(s2)) {
      for (CoreLabel cl : lcl) {
        System.out.println(i++ + ":");
        System.out.println(cl);
      }
    }
  }
}
public static LogisticClassifier<String, String> getSingletonPredictorFromSerializedFile(
    String serializedFile) {
  try {
    ObjectInputStream ois = IOUtils.readStreamFromString(serializedFile);
    Object o = ois.readObject();
    if (o instanceof LogisticClassifier<?, ?>) {
      return (LogisticClassifier<String, String>) o;
    }
    throw new ClassCastException("Wanted SingletonPredictor, got " + o.getClass());
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  } catch (ClassNotFoundException e) {
    throw new RuntimeException(e);
  }
}
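A round-trip sketch for the two singleton-predictor methods above; the file name is illustrative.

// Deserialize a previously saved predictor (saveToSerialized above is the inverse).
LogisticClassifier<String, String> predictor =
    getSingletonPredictorFromSerializedFile("/tmp/singleton.predictor.ser");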
/** Reads an annotation from the given filename using the requested input. */
public static List<Annotation> getAnnotations(
    StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) {
  switch (inputFormat) {
    case TEXT: {
      String text = IOUtils.slurpFileNoExceptions(filename);
      Annotation annotation = new Annotation(text);
      tokenizer.annotate(annotation);
      List<Annotation> annotations = Generics.newArrayList();
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        Annotation nextAnnotation =
            new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));
        nextAnnotation.set(
            CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
        annotations.add(nextAnnotation);
      }
      return annotations;
    }
    case TREES: {
      List<Tree> trees;
      if (filterUnknown) {
        trees = SentimentUtils.readTreesWithGoldLabels(filename);
        trees = SentimentUtils.filterUnknownRoots(trees);
      } else {
        trees = Generics.newArrayList();
        MemoryTreebank treebank = new MemoryTreebank("utf-8");
        treebank.loadPath(filename, null);
        for (Tree tree : treebank) {
          trees.add(tree);
        }
      }
      List<Annotation> annotations = Generics.newArrayList();
      for (Tree tree : trees) {
        CoreMap sentence = new Annotation(Sentence.listToString(tree.yield()));
        sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
        List<CoreMap> sentences = Collections.singletonList(sentence);
        Annotation annotation = new Annotation("");
        annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
        annotations.add(annotation);
      }
      return annotations;
    }
    default:
      throw new IllegalArgumentException("Unknown format " + inputFormat);
  }
}
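A hedged example of driving getAnnotations for plain text; the tokenizer mirrors the tokenize/ssplit pipeline built in main() below, and the file name is illustrative.

Properties tokenizerProps = new Properties();
tokenizerProps.setProperty("annotators", "tokenize, ssplit");
StanfordCoreNLP tokenizer = new StanfordCoreNLP(tokenizerProps);
// One Annotation per sentence of review.txt, ready for the sentiment pipeline.
List<Annotation> annotations = getAnnotations(tokenizer, Input.TEXT, "review.txt", false);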
private static Set<String> readDict(String filename, boolean normalize) {
  Set<String> word = Generics.newHashSet();

  logger.info("Loading " + (normalize ? "normalized" : "unnormalized")
      + " dictionary from " + filename);

  try {
    InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename);
    BufferedReader wordDetectorReader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
    int i = 0;
    for (String wordDetectorLine; (wordDetectorLine = wordDetectorReader.readLine()) != null; ) {
      i++;
      // String[] fields = wordDetectorLine.split(" ");
      // logger.debug("DEBUG: " + filename + " " + wordDetectorLine);
      int origLeng = wordDetectorLine.length();
      wordDetectorLine = wordDetectorLine.trim();
      int newLeng = wordDetectorLine.length();
      if (newLeng != origLeng) {
        EncodingPrintWriter.err.println(
            "Line " + i + " of " + filename + " has leading/trailing whitespace: |"
                + wordDetectorLine + "|",
            "UTF-8");
      }
      if (newLeng == 0) {
        EncodingPrintWriter.err.println("Line " + i + " of " + filename + " is empty", "UTF-8");
      } else {
        if (normalize) {
          wordDetectorLine = ChineseUtils.normalize(
              wordDetectorLine, ChineseUtils.ASCII, ChineseUtils.ASCII, ChineseUtils.NORMALIZE);
        }
        word.add(wordDetectorLine);
      }
    }
    is.close();
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
  return word;
}
private void uploadAlignmentLexicon() {
  for (String line : IOUtils.readLines(lexiconFile)) {
    LexiconValue lv = Json.readValueHard(line, LexiconValue.class);
    double newCount = MapUtils.getDouble(lv.features, "Intersection_size_typed", 0.0);
    if (newCount > opts.alignmentLexiconThreshold) {
      if (formulaToLexemsMap.containsKey(lv.formula)) {
        double currCount = formulaToLexemsMap.get(lv.formula).getSecond();
        if (newCount > currCount) {
          formulaToLexemsMap.put(lv.formula, Pair.newPair(lv.lexeme, newCount));
        }
      } else {
        formulaToLexemsMap.put(lv.formula, Pair.newPair(lv.lexeme, newCount));
      }
    }
  }
}
/**
 * Reads the POST contents of the request and parses it into an Annotation object, ready to be
 * annotated. This method can also read a serialized document, if the input format is set to be
 * serialized.
 *
 * @param props The properties we are annotating with. This is where the input format is
 *     retrieved from.
 * @param httpExchange The exchange we are reading POST data from.
 * @return An Annotation representing the read document.
 * @throws IOException Thrown if we cannot read the POST data.
 * @throws ClassNotFoundException Thrown if we cannot load the serializer.
 */
private Annotation getDocument(Properties props, HttpExchange httpExchange)
    throws IOException, ClassNotFoundException {
  String inputFormat = props.getProperty("inputFormat", "text");
  switch (inputFormat) {
    case "text":
      return new Annotation(
          IOUtils.slurpReader(new InputStreamReader(httpExchange.getRequestBody())));
    case "serialized":
      String inputSerializerName =
          props.getProperty("inputSerializer", ProtobufAnnotationSerializer.class.getName());
      AnnotationSerializer serializer = MetaClass.create(inputSerializerName).createInstance();
      Pair<Annotation, InputStream> pair = serializer.read(httpExchange.getRequestBody());
      return pair.first;
    default:
      throw new IOException("Could not parse input format: " + inputFormat);
  }
}
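To make the two branches concrete, a sketch of the properties this method consults, using the property names and defaults it reads; the choice of serialized input is illustrative.

Properties props = new Properties();
props.setProperty("inputFormat", "serialized");  // "text" is the default
props.setProperty("inputSerializer", ProtobufAnnotationSerializer.class.getName());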
/**
 * Given the path to a file representing the text based serialization of a Linear Classifier,
 * reconstitutes and returns that LinearClassifier.
 *
 * <p>TODO: Leverage Index
 */
public static LinearClassifier<String, String> loadFromFilename(String file) {
  try {
    BufferedReader in = IOUtils.readerFromString(file);

    // Format: read indices first, weights, then thresholds
    Index<String> labelIndex = HashIndex.loadFromReader(in);
    Index<String> featureIndex = HashIndex.loadFromReader(in);
    double[][] weights = new double[featureIndex.size()][labelIndex.size()];
    int currLine = 1;
    String line = in.readLine();
    while (line != null && line.length() > 0) {
      String[] tuples = line.split(LinearClassifier.TEXT_SERIALIZATION_DELIMITER);
      if (tuples.length != 3) {
        throw new Exception(
            "Error: incorrect number of tokens in weight specifier, line="
                + currLine + " in file " + file);
      }
      currLine++;
      int feature = Integer.valueOf(tuples[0]);
      int label = Integer.valueOf(tuples[1]);
      double value = Double.valueOf(tuples[2]);
      weights[feature][label] = value;
      line = in.readLine();
    }

    // First line in thresholds is the number of thresholds
    int numThresholds = Integer.valueOf(in.readLine());
    double[] thresholds = new double[numThresholds];
    int curr = 0;
    while ((line = in.readLine()) != null) {
      double tval = Double.valueOf(line.trim());
      thresholds[curr++] = tval;
    }
    in.close();
    LinearClassifier<String, String> classifier =
        new LinearClassifier<String, String>(weights, featureIndex, labelIndex);
    return classifier;
  } catch (Exception e) {
    System.err.println("Error in LinearClassifierFactory, loading from file=" + file);
    e.printStackTrace();
    return null;
  }
}
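Reconstructed from the read logic above, the expected file layout is: the label index, then the feature index (both in HashIndex text form), then one delimiter-separated feature/label/weight triple per line, a blank line, a line giving the number of thresholds, and one threshold per line. A call sketch follows; the enclosing class name is inferred from the error message above, and the path is illustrative.

// Returns null (after printing a stack trace) if the file cannot be parsed.
LinearClassifier<String, String> classifier =
    LinearClassifierFactory.loadFromFilename("my-classifier.txt");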
protected void read(String filename) {
  try {
    DataInputStream rf = IOUtils.getDataInputStream(filename);
    read(rf, filename);
    int len1 = rf.readInt();
    for (int i = 0; i < len1; i++) {
      int iO = rf.readInt();
      CountWrapper tC = new CountWrapper();
      tC.read(rf);
      this.partTakingVerbs.put(iO, tC);
    }
    rf.close();
  } catch (IOException e) {
    e.printStackTrace();
  }
}
public StanfordCoreNLPServer(int port) throws IOException {
  serverPort = port;

  defaultProps = new Properties();
  defaultProps.setProperty(
      "annotators", "tokenize, ssplit, pos, lemma, ner, parse, depparse, natlog, openie, dcoref");
  defaultProps.setProperty("inputFormat", "text");
  defaultProps.setProperty("outputFormat", "json");

  // Generate and write a shutdown key
  String tmpDir = System.getProperty("java.io.tmpdir");
  File tmpFile = new File(tmpDir + File.separator + "corenlp.shutdown");
  tmpFile.deleteOnExit();
  if (tmpFile.exists()) {
    if (!tmpFile.delete()) {
      throw new IllegalStateException("Could not delete shutdown key file");
    }
  }
  this.shutdownKey = new BigInteger(130, new Random()).toString(32);
  IOUtils.writeStringToFile(shutdownKey, tmpFile.getPath(), "utf-8");

  // Set the static page handler
  this.staticPageHandle = new FileHandler("edu/stanford/nlp/pipeline/demo/corenlp-brat.html");
}
public static void main(String[] args) throws IOException {
  String parserModel = null;
  String sentimentModel = null;

  String filename = null;
  String fileList = null;
  boolean stdin = false;

  boolean filterUnknown = false;

  List<Output> outputFormats = Collections.singletonList(Output.ROOT);
  Input inputFormat = Input.TEXT;

  String tlppClass = DEFAULT_TLPP_CLASS;

  for (int argIndex = 0; argIndex < args.length; ) {
    if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
      sentimentModel = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
      parserModel = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-file")) {
      filename = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-fileList")) {
      fileList = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-stdin")) {
      stdin = true;
      argIndex++;
    } else if (args[argIndex].equalsIgnoreCase("-input")) {
      inputFormat = Input.valueOf(args[argIndex + 1].toUpperCase());
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-output")) {
      String[] formats = args[argIndex + 1].split(",");
      outputFormats = new ArrayList<>();
      for (String format : formats) {
        outputFormats.add(Output.valueOf(format.toUpperCase()));
      }
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-filterUnknown")) {
      filterUnknown = true;
      argIndex++;
    } else if (args[argIndex].equalsIgnoreCase("-tlppClass")) {
      tlppClass = args[argIndex + 1];
      argIndex += 2;
    } else if (args[argIndex].equalsIgnoreCase("-help")) {
      help();
      System.exit(0);
    } else {
      // Report the offending argument itself (previously this indexed
      // args[argIndex + 1], which is wrong and can run past the array).
      System.err.println("Unknown argument " + args[argIndex]);
      help();
      throw new IllegalArgumentException("Unknown argument " + args[argIndex]);
    }
  }

  // We construct two pipelines.  One handles tokenization, if
  // necessary.  The other takes tokenized sentences and converts
  // them to sentiment trees.
  Properties pipelineProps = new Properties();
  Properties tokenizerProps = null;
  if (sentimentModel != null) {
    pipelineProps.setProperty("sentiment.model", sentimentModel);
  }
  if (parserModel != null) {
    pipelineProps.setProperty("parse.model", parserModel);
  }
  if (stdin) {
    pipelineProps.setProperty("ssplit.eolonly", "true");
  }
  if (inputFormat == Input.TREES) {
    pipelineProps.setProperty("annotators", "binarizer, sentiment");
    pipelineProps.setProperty(
        "customAnnotatorClass.binarizer", "edu.stanford.nlp.pipeline.BinarizerAnnotator");
    pipelineProps.setProperty("binarizer.tlppClass", tlppClass);
    pipelineProps.setProperty("enforceRequirements", "false");
  } else {
    pipelineProps.setProperty("annotators", "parse, sentiment");
    pipelineProps.setProperty("enforceRequirements", "false");
    tokenizerProps = new Properties();
    tokenizerProps.setProperty("annotators", "tokenize, ssplit");
  }

  int count = 0;
  if (filename != null) count++;
  if (fileList != null) count++;
  if (stdin) count++;
  if (count > 1) {
    throw new IllegalArgumentException("Please only specify one of -file, -fileList or -stdin");
  }
  if (count == 0) {
    throw new IllegalArgumentException("Please specify either -file, -fileList or -stdin");
  }

  StanfordCoreNLP tokenizer =
      (tokenizerProps == null) ? null : new StanfordCoreNLP(tokenizerProps);
  StanfordCoreNLP pipeline = new StanfordCoreNLP(pipelineProps);

  if (filename != null) {
    // Process a file.  The pipeline will do tokenization, which
    // means it will split it into sentences as best as possible
    // with the tokenizer.
    List<Annotation> annotations =
        getAnnotations(tokenizer, inputFormat, filename, filterUnknown);
    for (Annotation annotation : annotations) {
      pipeline.annotate(annotation);
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        System.out.println(sentence);
        outputTree(System.out, sentence, outputFormats);
      }
    }
  } else if (fileList != null) {
    // Process multiple files.  The pipeline will do tokenization,
    // which means it will split it into sentences as best as
    // possible with the tokenizer.  Output will go to filename.out
    // for each file.
    for (String file : fileList.split(",")) {
      List<Annotation> annotations =
          getAnnotations(tokenizer, inputFormat, file, filterUnknown);
      FileOutputStream fout = new FileOutputStream(file + ".out");
      PrintStream pout = new PrintStream(fout);
      for (Annotation annotation : annotations) {
        pipeline.annotate(annotation);
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
          pout.println(sentence);
          outputTree(pout, sentence, outputFormats);
        }
      }
      pout.flush();
      fout.close();
    }
  } else {
    // Process stdin.  Each line will be treated as a single sentence.
    System.err.println("Reading in text from stdin.");
    System.err.println("Please enter one sentence per line.");
    System.err.println("Processing will end when EOF is reached.");
    BufferedReader reader = IOUtils.readerFromStdin("utf-8");

    for (String line; (line = reader.readLine()) != null; ) {
      line = line.trim();
      if (line.length() > 0) {
        Annotation annotation = tokenizer.process(line);
        pipeline.annotate(annotation);
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
          outputTree(System.out, sentence, outputFormats);
        }
      } else {
        // Output blank lines for blank lines so the tool can be
        // used for line-by-line text processing
        System.out.println();
      }
    }
  }
}
public void annotate(CoreMap document) throws IOException {
  // write input file in GUTime format
  Element inputXML = toInputXML(document);
  File inputFile = File.createTempFile("gutime", ".input");

  // Document doc = new Document(inputXML);
  PrintWriter inputWriter = new PrintWriter(inputFile);
  inputWriter.println(inputXML.toXML());
  // new XMLOutputter().output(inputXML, inputWriter);
  inputWriter.close();

  boolean useFirstDate =
      (!document.has(CoreAnnotations.CalendarAnnotation.class)
          && !document.has(CoreAnnotations.DocDateAnnotation.class));

  ArrayList<String> args = new ArrayList<String>();
  args.add("perl");
  args.add("-I" + this.gutimePath.getPath());
  args.add(new File(this.gutimePath, "TimeTag.pl").getPath());
  if (useFirstDate) args.add("-FDNW");
  args.add(inputFile.getPath());

  // run GUTime on the input file
  ProcessBuilder process = new ProcessBuilder(args);
  StringWriter outputWriter = new StringWriter();
  SystemUtils.run(process, outputWriter, null);
  String output = outputWriter.getBuffer().toString();
  Pattern docClose = Pattern.compile("</DOC>.*", Pattern.DOTALL);
  output = docClose.matcher(output).replaceAll("</DOC>");

  // parse the GUTime output
  Element outputXML;
  try {
    Document newNodeDocument = new Builder().build(output, "");
    outputXML = newNodeDocument.getRootElement();
  } catch (ParsingException ex) {
    throw new RuntimeException(String.format(
        "error:\n%s\ninput:\n%s\noutput:\n%s", ex, IOUtils.slurpFile(inputFile), output));
  }
  /*
  try {
    outputXML = new SAXBuilder().build(new StringReader(output)).getRootElement();
  } catch (JDOMException e) {
    throw new RuntimeException(String.format(
        "error:\n%s\ninput:\n%s\noutput:\n%s", e, IOUtils.slurpFile(inputFile), output));
  }
  */
  inputFile.delete();

  // get Timex annotations
  List<CoreMap> timexAnns = toTimexCoreMaps(outputXML, document);
  document.set(TimexAnnotations.class, timexAnns);
  if (outputResults) {
    System.out.println(timexAnns);
  }

  // align Timex annotations to sentences
  int timexIndex = 0;
  for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
    int sentBegin = beginOffset(sentence);
    int sentEnd = endOffset(sentence);

    // skip times before the sentence
    while (timexIndex < timexAnns.size()
        && beginOffset(timexAnns.get(timexIndex)) < sentBegin) {
      ++timexIndex;
    }

    // determine times within the sentence
    int sublistBegin = timexIndex;
    int sublistEnd = timexIndex;
    while (timexIndex < timexAnns.size()
        && sentBegin <= beginOffset(timexAnns.get(timexIndex))
        && endOffset(timexAnns.get(timexIndex)) <= sentEnd) {
      ++sublistEnd;
      ++timexIndex;
    }

    // set the sentence timexes
    sentence.set(TimexAnnotations.class, timexAnns.subList(sublistBegin, sublistEnd));
  }
}
public double score(String filename, String delimiter, String boundary) throws IOException {
  return score(IOUtils.getBufferedFileReader(filename), delimiter, boundary);
}
/** {@inheritDoc} */
@Override
public void print(Annotation annotation, OutputStream stream, Options options)
    throws IOException {
  PrintWriter os = new PrintWriter(IOUtils.encodedOutputStreamWriter(stream, options.encoding));
  print(annotation, os, options);
}
/**
 * For testing -- CURRENTLY BROKEN!!!
 *
 * @param args treebankPath trainNums testNums
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
  if (args.length != 3) {
    throw new RuntimeException("args: treebankPath trainNums testNums");
  }

  ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
  ctpp.charTags = true;
  // TODO: these options are getting clobbered by reading in the
  // parser object (unless it's a text file parser?)
  Options op = new Options(ctpp);
  op.doDep = false;
  op.testOptions.maxLength = 90;

  LexicalizedParser lp;
  try {
    FileFilter trainFilt = new NumberRangesFileFilter(args[1], false);
    lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op);
    try {
      String filename = "chineseCharTagPCFG.ser.gz";
      System.err.println("Writing parser in serialized format to file " + filename + ' ');
      System.err.flush();
      ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
      out.writeObject(lp);
      out.close();
      System.err.println("done.");
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
  } catch (IllegalArgumentException e) {
    lp = LexicalizedParser.loadModel(args[1], op);
  }

  FileFilter testFilt = new NumberRangesFileFilter(args[2], false);
  MemoryTreebank testTreebank = ctpp.memoryTreebank();
  testTreebank.loadPath(new File(args[0]), testFilt);
  PrintWriter pw =
      new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true);
  WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
  WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
  EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);
  // System.out.println("Preterminals:" + preterminals);
  System.out.println("Testing...");
  for (Tree gold : testTreebank) {
    Tree tree;
    try {
      tree = lp.parseTree(gold.yieldHasWord());
      if (tree == null) {
        System.out.println("Failed to parse " + gold.yieldHasWord());
        continue;
      }
    } catch (Exception e) {
      e.printStackTrace();
      continue;
    }
    gold = gold.firstChild();
    pw.println(Sentence.listToString(gold.preTerminalYield()));
    pw.println(Sentence.listToString(gold.yield()));
    gold.pennPrint(pw);

    pw.println(tree.preTerminalYield());
    pw.println(tree.yield());
    tree.pennPrint(pw);
    // Collection allBrackets = WordCatConstituent.allBrackets(tree);
    // Collection goldBrackets = WordCatConstituent.allBrackets(gold);
    // eval.eval(allBrackets, goldBrackets);
    eval.displayLast();
  }
  System.out.println();
  System.out.println();
  eval.display();
}
/**
 * Prints out all matches of a semgrex pattern on a file of dependencies. <br>
 * Usage:<br>
 * java edu.stanford.nlp.semgraph.semgrex.SemgrexPattern [args] <br>
 * See the help() function for a list of possible arguments to provide.
 */
public static void main(String[] args) throws IOException {
  Map<String, Integer> flagMap = Generics.newHashMap();

  flagMap.put(PATTERN, 1);
  flagMap.put(TREE_FILE, 1);
  flagMap.put(MODE, 1);
  flagMap.put(EXTRAS, 1);
  flagMap.put(CONLLU_FILE, 1);
  flagMap.put(OUTPUT_FORMAT_OPTION, 1);

  Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap);
  args = argsMap.get(null);

  // TODO: allow patterns to be extracted from a file
  if (!(argsMap.containsKey(PATTERN)) || argsMap.get(PATTERN).length == 0) {
    help();
    System.exit(2);
  }
  SemgrexPattern semgrex = SemgrexPattern.compile(argsMap.get(PATTERN)[0]);

  String modeString = DEFAULT_MODE;
  if (argsMap.containsKey(MODE) && argsMap.get(MODE).length > 0) {
    modeString = argsMap.get(MODE)[0].toUpperCase();
  }
  SemanticGraphFactory.Mode mode = SemanticGraphFactory.Mode.valueOf(modeString);

  String outputFormatString = DEFAULT_OUTPUT_FORMAT;
  if (argsMap.containsKey(OUTPUT_FORMAT_OPTION)
      && argsMap.get(OUTPUT_FORMAT_OPTION).length > 0) {
    outputFormatString = argsMap.get(OUTPUT_FORMAT_OPTION)[0].toUpperCase();
  }
  OutputFormat outputFormat = OutputFormat.valueOf(outputFormatString);

  boolean useExtras = true;
  if (argsMap.containsKey(EXTRAS) && argsMap.get(EXTRAS).length > 0) {
    useExtras = Boolean.valueOf(argsMap.get(EXTRAS)[0]);
  }

  List<SemanticGraph> graphs = Generics.newArrayList();
  // TODO: allow other sources of graphs, such as dependency files
  if (argsMap.containsKey(TREE_FILE) && argsMap.get(TREE_FILE).length > 0) {
    for (String treeFile : argsMap.get(TREE_FILE)) {
      System.err.println("Loading file " + treeFile);
      MemoryTreebank treebank = new MemoryTreebank(new TreeNormalizer());
      treebank.loadPath(treeFile);
      for (Tree tree : treebank) {
        // TODO: allow other languages... this defaults to English
        SemanticGraph graph = SemanticGraphFactory.makeFromTree(
            tree,
            mode,
            useExtras ? GrammaticalStructure.Extras.MAXIMAL : GrammaticalStructure.Extras.NONE,
            true);
        graphs.add(graph);
      }
    }
  }

  if (argsMap.containsKey(CONLLU_FILE) && argsMap.get(CONLLU_FILE).length > 0) {
    CoNLLUDocumentReader reader = new CoNLLUDocumentReader();
    for (String conlluFile : argsMap.get(CONLLU_FILE)) {
      System.err.println("Loading file " + conlluFile);
      Iterator<SemanticGraph> it = reader.getIterator(IOUtils.readerFromString(conlluFile));
      while (it.hasNext()) {
        SemanticGraph graph = it.next();
        graphs.add(graph);
      }
    }
  }

  for (SemanticGraph graph : graphs) {
    SemgrexMatcher matcher = semgrex.matcher(graph);
    if (!(matcher.find())) {
      continue;
    }

    if (outputFormat == OutputFormat.LIST) {
      System.err.println("Matched graph:");
      System.err.println(graph.toString(SemanticGraph.OutputFormat.LIST));
      boolean found = true;
      while (found) {
        System.err.println(
            "Matches at: " + matcher.getMatch().value() + "-" + matcher.getMatch().index());
        List<String> nodeNames = Generics.newArrayList();
        nodeNames.addAll(matcher.getNodeNames());
        Collections.sort(nodeNames);
        for (String name : nodeNames) {
          System.err.println(
              "  " + name + ": " + matcher.getNode(name).value() + "-"
                  + matcher.getNode(name).index());
        }
        System.err.println();
        found = matcher.find();
      }
    } else if (outputFormat == OutputFormat.OFFSET) {
      if (graph.vertexListSorted().isEmpty()) {
        continue;
      }
      System.out.printf(
          "+%d %s%n",
          graph.vertexListSorted().get(0).get(CoreAnnotations.LineNumberAnnotation.class),
          argsMap.get(CONLLU_FILE)[0]);
    }
  }
}
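For comparison with the command-line driver above, a minimal programmatic sketch of the same matching loop; the pattern and graph are illustrative.

// graph: a SemanticGraph, obtained e.g. via SemanticGraphFactory.makeFromTree as above
SemgrexPattern pattern = SemgrexPattern.compile("{pos:NN}=noun");
SemgrexMatcher matcher = pattern.matcher(graph);
while (matcher.find()) {
  // print the word matched under the "noun" handle
  System.out.println(matcher.getNode("noun").value());
}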
public static void main(String[] args) throws Exception {

  // String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";
  String serializedClassifier = "classifiers/english.muc.7class.distsim.crf.ser.gz";

  if (args.length > 0) {
    serializedClassifier = args[0];
  }

  AbstractSequenceClassifier<CoreLabel> classifier =
      CRFClassifier.getClassifier(serializedClassifier);

  /* For either a file to annotate or for the hardcoded text example, this
     demo file shows several ways to process the input, for teaching purposes.
  */

  if (args.length > 1) {

    /* For the file, it shows (1) how to run NER on a String, (2) how to
       get the entities in the String with character offsets, and
       (3) how to run NER on a whole file (without loading it into a String).
    */

    String fileContents = IOUtils.slurpFile(args[1]);
    List<List<CoreLabel>> out = classifier.classify(fileContents);
    for (List<CoreLabel> sentence : out) {
      for (CoreLabel word : sentence) {
        System.out.print(
            word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
      }
      System.out.println();
    }

    System.out.println("---");
    out = classifier.classifyFile(args[1]);
    for (List<CoreLabel> sentence : out) {
      for (CoreLabel word : sentence) {
        System.out.print(
            word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
      }
      System.out.println();
    }

    System.out.println("---");
    List<Triple<String, Integer, Integer>> list =
        classifier.classifyToCharacterOffsets(fileContents);
    for (Triple<String, Integer, Integer> item : list) {
      // print entity/or non-entity - their nearby tokens
      System.out.println(
          item.first() + ": " + fileContents.substring(item.second(), item.third()));
    }
    System.out.println("---");

    System.out.println("Ten best entity labelings");
    DocumentReaderAndWriter<CoreLabel> readerAndWriter =
        classifier.makePlainTextReaderAndWriter();
    classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter);
    System.out.println("---");

    System.out.println("Per-token marginalized probabilities");
    classifier.printProbs(args[1], readerAndWriter);

    // -- This code prints out the first order (token pair) clique probabilities.
    // -- But that output is a bit overwhelming, so we leave it commented out by default.
    // System.out.println("---");
    // System.out.println("First Order Clique Probabilities");
    // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter);

  } else {

    /* For the hard-coded String, it shows how to run it on a single
       sentence, and how to do this and produce several formats, including
       slash tags and an inline XML output format. It also shows the full
       contents of the {@code CoreLabel}s that are constructed by the
       classifier. And it shows getting out the probabilities of different
       assignments and an n-best list of classifications with probabilities.
    */

    String[] example = {
      "Good afternoon Rajat Raina, how are you today? I go to Washington DC on September 19. And Tomorrow.",
      "I go to school at Stanford University, which is located in California."
    };
    for (String str : example) {
      System.out.println(classifier.classifyToString(str));
    }
    System.out.println("---");

    // ***sentence-by-sentence
    for (String str : example) {
      // This one puts in spaces and newlines between tokens, so just print not println.
      System.out.print(classifier.classifyToString(str, "slashTags", false));
    }
    System.out.println("---");

    // ***print: entities + Classes + remaining text in the text
    for (String str : example) {
      // This one is best for dealing with the output as a TSV (tab-separated column) file.
      // The first column gives entities, the second their classes, and the third the
      // remaining text in a document.
      System.out.print(classifier.classifyToString(str, "tabbedEntities", false));
    }
    System.out.println("---");

    for (String str : example) {
      System.out.println(classifier.classifyWithInlineXML(str));
    }
    System.out.println("---");

    for (String str : example) {
      System.out.println(classifier.classifyToString(str, "xml", true));
    }
    System.out.println("---");

    for (String str : example) {
      System.out.print(classifier.classifyToString(str, "tsv", false));
    }
    System.out.println("---");

    // This gets out entities with character offsets
    System.out.print("character offsets");
    int j = 0;
    for (String str : example) {
      j++;
      List<Triple<String, Integer, Integer>> triples =
          classifier.classifyToCharacterOffsets(str);
      for (Triple<String, Integer, Integer> trip : triples) {
        System.out.printf(
            "%s over character offsets [%d, %d) in sentence %d.%n",
            trip.first(), trip.second(), trip.third(), j);
      }
    }
    System.out.println("---");

    // This prints out all the details of what is stored for each token
    int i = 0;
    for (String str : example) {
      for (List<CoreLabel> lcl : classifier.classify(str)) {
        for (CoreLabel cl : lcl) {
          System.out.print(i++ + ": ");
          System.out.println(cl.toShorterString());
        }
      }
    }

    System.out.println("---");
  }
}
/**
 * Prints out all matches of a tree pattern on each tree in the path. Usage: <br>
 * <br>
 * <code>
 * java edu.stanford.nlp.trees.tregex.TregexPattern [[-TCwfosnu] [-filter] [-h &lt;node-name&gt;]]* pattern filepath
 * </code>
 *
 * <p>Arguments:<br>
 *
 * <ul>
 * <li><code>pattern</code>: the tree pattern which optionally names some set of nodes (i.e.,
 *     gives it the "handle") <code>=name</code> (for some arbitrary string "name")
 * <li><code>filepath</code>: the path to files with trees. If this is a directory, there will
 *     be recursive descent and the pattern will be run on all files beneath the specified
 *     directory.
 * </ul>
 *
 * <p>Options:<br>
 *
 * <ul>
 * <li><code>-C</code> suppresses printing of matches, so only the number of matches is printed.
 * <li><code>-w</code> causes the whole of a tree that matches to be printed.
 * <li><code>-f</code> causes the filename to be printed.
 * <li><code>-i &lt;filename&gt;</code> causes the pattern to be matched to be read from
 *     <code>&lt;filename&gt;</code> rather than the command line. Don't specify a pattern when
 *     this option is used.
 * <li><code>-o</code> specifies that each tree node can be reported only once as the root of a
 *     match (by default a node will be printed once for every <em>way</em> the pattern
 *     matches).
 * <li><code>-s</code> causes trees to be printed all on one line (by default they are pretty
 *     printed).
 * <li><code>-n</code> causes the number of the tree in which the match was found to be printed
 *     before every match.
 * <li><code>-u</code> causes only the label of each matching node to be printed, not complete
 *     subtrees.
 * <li><code>-t</code> causes only the yield (terminal words) of the selected node to be
 *     printed (or the yield of the whole tree, if the <code>-w</code> option is used).
 * <li><code>-encoding &lt;charset_encoding&gt;</code> allows specification of the character
 *     encoding of trees.
 * <li><code>-h &lt;node-handle&gt;</code> If a <code>-h</code> option is given, the root tree
 *     node will not be printed. Instead, for each <code>node-handle</code> specified, the node
 *     matched and given that handle will be printed. Multiple nodes can be printed by using
 *     the <code>-h</code> option multiple times on a single command line.
 * <li><code>-hf &lt;headfinder-class-name&gt;</code> use the specified {@link HeadFinder}
 *     class to determine headship relations.
 * <li><code>-hfArg &lt;string&gt;</code> pass a string argument in to the {@link HeadFinder}
 *     class's constructor. <code>-hfArg</code> can be used multiple times to pass in multiple
 *     arguments.
 * <li><code>-trf &lt;TreeReaderFactory-class-name&gt;</code> use the specified {@link
 *     TreeReaderFactory} class to read trees from files.
 * <li><code>-v</code> print every tree that contains no matches of the specified pattern, but
 *     print no matches to the pattern.
 * <li><code>-x</code> Instead of the matched subtree, print the matched subtree's identifying
 *     number as defined in <tt>tgrep2</tt>: a unique identifier for the subtree, in the form
 *     s:n, where s is an integer specifying the sentence number in the corpus (starting with
 *     1), and n is an integer giving the order in which the node is encountered in a
 *     depth-first search starting with 1 at the top node in the sentence tree.
 * <li><code>-extract &lt;code&gt; &lt;tree-file&gt;</code> extracts the subtree s:n specified
 *     by <tt>code</tt> from the specified <tt>tree-file</tt>. Overrides all other behavior of
 *     tregex. Can't specify multiple encodings etc. yet.
 * <li><code>-extractFile &lt;code-file&gt; &lt;tree-file&gt;</code> extracts every subtree
 *     specified by the subtree codes in <tt>code-file</tt>, which must appear exactly one per
 *     line, from the specified <tt>tree-file</tt>. Overrides all other behavior of tregex.
 *     Can't specify multiple encodings etc. yet.
 * <li><code>-filter</code> causes this to act as a filter, reading tree input from stdin.
 * <li><code>-T</code> causes all trees to be printed as processed (for debugging purposes).
 *     Otherwise only matching nodes are printed.
 * <li><code>-macros &lt;filename&gt;</code> filename with macro substitutions to use; a file
 *     with tab-separated lines original-tab-replacement.
 * </ul>
 */
public static void main(String[] args) throws IOException {
  Timing.startTime();

  StringBuilder treePrintFormats = new StringBuilder();
  String printNonMatchingTreesOption = "-v";
  String subtreeCodeOption = "-x";
  String extractSubtreesOption = "-extract";
  String extractSubtreesFileOption = "-extractFile";
  String inputFileOption = "-i";
  String headFinderOption = "-hf";
  String headFinderArgOption = "-hfArg";
  String trfOption = "-trf";
  String headFinderClassName = null;
  String[] headFinderArgs = StringUtils.EMPTY_STRING_ARRAY;
  String treeReaderFactoryClassName = null;
  String printHandleOption = "-h";
  String markHandleOption = "-k";
  String encodingOption = "-encoding";
  String encoding = "UTF-8";
  String macroOption = "-macros";
  String macroFilename = "";
  String yieldOnly = "-t";
  String printAllTrees = "-T";
  String quietMode = "-C";
  String wholeTreeMode = "-w";
  String filenameOption = "-f";
  String oneMatchPerRootNodeMode = "-o";
  String reportTreeNumbers = "-n";
  String rootLabelOnly = "-u";
  String oneLine = "-s";

  Map<String, Integer> flagMap = Generics.newHashMap();
  flagMap.put(extractSubtreesOption, 2);
  flagMap.put(extractSubtreesFileOption, 2);
  flagMap.put(subtreeCodeOption, 0);
  flagMap.put(printNonMatchingTreesOption, 0);
  flagMap.put(encodingOption, 1);
  flagMap.put(inputFileOption, 1);
  flagMap.put(printHandleOption, 1);
  flagMap.put(markHandleOption, 2);
  flagMap.put(headFinderOption, 1);
  flagMap.put(headFinderArgOption, 1);
  flagMap.put(trfOption, 1);
  flagMap.put(macroOption, 1);
  flagMap.put(yieldOnly, 0);
  flagMap.put(quietMode, 0);
  flagMap.put(wholeTreeMode, 0);
  flagMap.put(printAllTrees, 0);
  flagMap.put(filenameOption, 0);
  flagMap.put(oneMatchPerRootNodeMode, 0);
  flagMap.put(reportTreeNumbers, 0);
  flagMap.put(rootLabelOnly, 0);
  flagMap.put(oneLine, 0);
  Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap);
  args = argsMap.get(null);

  if (argsMap.containsKey(encodingOption)) {
    encoding = argsMap.get(encodingOption)[0];
    System.err.println("Encoding set to " + encoding);
  }
  PrintWriter errPW = new PrintWriter(new OutputStreamWriter(System.err, encoding), true);

  if (argsMap.containsKey(extractSubtreesOption)) {
    List<String> subTreeStrings =
        Collections.singletonList(argsMap.get(extractSubtreesOption)[0]);
    extractSubtrees(subTreeStrings, argsMap.get(extractSubtreesOption)[1]);
    return;
  }
  if (argsMap.containsKey(extractSubtreesFileOption)) {
    List<String> subTreeStrings = Arrays.asList(
        IOUtils.slurpFile(argsMap.get(extractSubtreesFileOption)[0]).split("\n|\r|\n\r"));
    extractSubtrees(subTreeStrings, argsMap.get(extractSubtreesFileOption)[0]);
    return;
  }

  if (args.length < 1) {
    errPW.println(
        "Usage: java edu.stanford.nlp.trees.tregex.TregexPattern [-T] [-C] [-w] [-f] [-o] [-n] [-s] [-filter] [-hf class] [-trf class] [-h handle]* pattern [filepath]");
    return;
  }
  String matchString = args[0];

  if (argsMap.containsKey(macroOption)) {
    macroFilename = argsMap.get(macroOption)[0];
  }
  if (argsMap.containsKey(headFinderOption)) {
    headFinderClassName = argsMap.get(headFinderOption)[0];
    errPW.println("Using head finder " + headFinderClassName + "...");
  }
  if (argsMap.containsKey(headFinderArgOption)) {
    headFinderArgs = argsMap.get(headFinderArgOption);
  }
  if (argsMap.containsKey(trfOption)) {
    treeReaderFactoryClassName = argsMap.get(trfOption)[0];
    errPW.println("Using tree reader factory " + treeReaderFactoryClassName + "...");
  }
  if (argsMap.containsKey(printAllTrees)) {
    TRegexTreeVisitor.printTree = true;
  }
  if (argsMap.containsKey(inputFileOption)) {
    String inputFile = argsMap.get(inputFileOption)[0];
    matchString = IOUtils.slurpFile(inputFile, encoding);
    // Shift args right by one so the downstream filepath indexing (which
    // expects the pattern to occupy args[0]) still works.
    String[] newArgs = new String[args.length + 1];
    System.arraycopy(args, 0, newArgs, 1, args.length);
    args = newArgs;
  }
  if (argsMap.containsKey(quietMode)) {
    TRegexTreeVisitor.printMatches = false;
    TRegexTreeVisitor.printNumMatchesToStdOut = true;
  }
  if (argsMap.containsKey(printNonMatchingTreesOption)) {
    TRegexTreeVisitor.printNonMatchingTrees = true;
  }
  if (argsMap.containsKey(subtreeCodeOption)) {
    TRegexTreeVisitor.printSubtreeCode = true;
    TRegexTreeVisitor.printMatches = false;
  }
  if (argsMap.containsKey(wholeTreeMode)) {
    TRegexTreeVisitor.printWholeTree = true;
  }
  if (argsMap.containsKey(filenameOption)) {
    TRegexTreeVisitor.printFilename = true;
  }
  if (argsMap.containsKey(oneMatchPerRootNodeMode)) TRegexTreeVisitor.oneMatchPerRootNode = true;
  if (argsMap.containsKey(reportTreeNumbers)) TRegexTreeVisitor.reportTreeNumbers = true;
  if (argsMap.containsKey(rootLabelOnly)) {
    treePrintFormats.append(TreePrint.rootLabelOnlyFormat).append(',');
  } else if (argsMap.containsKey(oneLine)) { // display short form
    treePrintFormats.append("oneline,");
  } else if (argsMap.containsKey(yieldOnly)) {
    treePrintFormats.append("words,");
  } else {
    treePrintFormats.append("penn,");
  }

  HeadFinder hf = new CollinsHeadFinder();
  if (headFinderClassName != null) {
    Class[] hfArgClasses = new Class[headFinderArgs.length];
    for (int i = 0; i < hfArgClasses.length; i++) {
      hfArgClasses[i] = String.class;
    }
    try {
      hf = (HeadFinder) Class.forName(headFinderClassName)
          .getConstructor(hfArgClasses)
          // cast to Object[] necessary to avoid varargs-related warning
          .newInstance((Object[]) headFinderArgs);
} catch (Exception e) { throw new RuntimeException("Error occurred while constructing HeadFinder: " + e); } } TRegexTreeVisitor.tp = new TreePrint(treePrintFormats.toString(), new PennTreebankLanguagePack()); try { // TreePattern p = TreePattern.compile("/^S/ > S=dt $++ '' $-- ``"); TregexPatternCompiler tpc = new TregexPatternCompiler(hf); Macros.addAllMacros(tpc, macroFilename, encoding); TregexPattern p = tpc.compile(matchString); errPW.println("Pattern string:\n" + p.pattern()); errPW.println("Parsed representation:"); p.prettyPrint(errPW); String[] handles = argsMap.get(printHandleOption); if (argsMap.containsKey("-filter")) { TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName); treebank = new MemoryTreebank( trf, encoding); // has to be in memory since we're not storing it on disk // read from stdin Reader reader = new BufferedReader(new InputStreamReader(System.in, encoding)); ((MemoryTreebank) treebank).load(reader); reader.close(); } else if (args.length == 1) { errPW.println("using default tree"); TreeReader r = new PennTreeReader( new StringReader( "(VP (VP (VBZ Try) (NP (NP (DT this) (NN wine)) (CC and) (NP (DT these) (NNS snails)))) (PUNCT .))"), new LabeledScoredTreeFactory(new StringLabelFactory())); Tree t = r.readTree(); treebank = new MemoryTreebank(); treebank.add(t); } else { int last = args.length - 1; errPW.println("Reading trees from file(s) " + args[last]); TreeReaderFactory trf = getTreeReaderFactory(treeReaderFactoryClassName); treebank = new DiskTreebank(trf, encoding); treebank.loadPath(args[last], null, true); } TRegexTreeVisitor vis = new TRegexTreeVisitor(p, handles, encoding); treebank.apply(vis); Timing.endTime(); if (TRegexTreeVisitor.printMatches) { errPW.println("There were " + vis.numMatches() + " matches in total."); } if (TRegexTreeVisitor.printNumMatchesToStdOut) { System.out.println(vis.numMatches()); } } catch (IOException e) { e.printStackTrace(); } catch (TregexParseException e) { errPW.println("Error parsing expression: " + args[0]); errPW.println("Parse exception: " + e.toString()); } }
public FileHandler(String fileOrClasspath) throws IOException {
  this.content =
      IOUtils.slurpReader(IOUtils.getBufferedReaderFromClasspathOrFileSystem(fileOrClasspath));
}
public static void setClass2KeyMapping(File file) throws ClassNotFoundException {
  for (String line : IOUtils.readLines(file)) {
    String[] toks = line.split("###");
    class2KeyMapping.put(Class.forName(toks[0]), toks[1]);
  }
}
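A hedged sketch of the mapping-file format this method expects, reconstructed from the split on "###"; the entry and file name below are illustrative.

// Each line: fully qualified class name, "###", then the key, e.g.
//   edu.stanford.nlp.ling.CoreAnnotations$TextAnnotation###text
setClass2KeyMapping(new File("class2key.map"));  // hypothetical file name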