/** * Parses a sentence and returns the parse tree. * * @param sentence a sentence * @return Tree character offsets in keys BEGIN_KEY and END_KEY */ @SuppressWarnings("unchecked") public static Tree parseTree(String sentence) { if (tlp == null || parser == null) throw new RuntimeException("Parser has not been initialized"); // parse the sentence to produce stanford Tree log.debug("Parsing sentence"); Tree tree = null; synchronized (parser) { Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence)); List<Word> words = tokenizer.tokenize(); log.debug("Tokenization: " + words); parser.parse(new Sentence(words)); tree = parser.getBestParse(); } // label tree with character extents // log.debug("Setting character extents"); // updateTreeLabels(tree, tree, new MutableInteger(), new MutableInteger(-1)); // log.debug("Creating offset mapping"); // List<RangeMap> mapping = createMapping(sentence); // log.debug(mapping.toString()); // log.debug("Applying offset mapping"); // mapOffsets(tree, mapping); return tree; }
// todo: give options for document splitting. A line or the whole file or // sentence splitting as now public Iterator<List<IN>> getIterator(Reader r) { Tokenizer<IN> tokenizer = tokenizerFactory.getTokenizer(r); // PTBTokenizer.newPTBTokenizer(r, false, true); List<IN> words = new ArrayList<IN>(); IN previous = tokenFactory.makeToken(); StringBuilder prepend = new StringBuilder(); /* * This changes SGML tags into whitespace -- it should maybe be moved * elsewhere */ while (tokenizer.hasNext()) { IN w = tokenizer.next(); String word = w.get(CoreAnnotations.TextAnnotation.class); Matcher m = sgml.matcher(word); if (m.matches()) { String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class)); String after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class)); prepend.append(before).append(word); String previousTokenAfter = StringUtils.getNotNullString(previous.get(CoreAnnotations.AfterAnnotation.class)); previous.set(AfterAnnotation.class, previousTokenAfter + word + after); // previous.appendAfter(w.word() + w.after()); } else { String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class)); if (prepend.length() > 0) { w.set(BeforeAnnotation.class, prepend.toString() + before); // w.prependBefore(prepend.toString()); prepend = new StringBuilder(); } words.add(w); previous = w; } } List<List<IN>> sentences = wts.process(words); String after = ""; IN last = null; for (List<IN> sentence : sentences) { int pos = 0; for (IN w : sentence) { w.set(PositionAnnotation.class, Integer.toString(pos)); after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class)); w.remove(AfterAnnotation.class); last = w; } } if (last != null) { last.set(AfterAnnotation.class, after); } return sentences.iterator(); }
/**
 * Tokenizes the (trimmed, uncased) input string and returns the tokens as an
 * IString sequence.
 *
 * @param input raw input text
 * @return sequence of tokenized strings
 */
@Override
public Sequence<IString> process(String input) {
    final String uncased = toUncased(input.trim());
    final Tokenizer<CoreLabel> tok = tf.getTokenizer(new StringReader(uncased));
    final List<String> tokenTexts = new LinkedList<>();
    while (tok.hasNext()) {
        tokenTexts.add(tok.next().get(TextAnnotation.class));
    }
    return IStrings.toIStringSequence(tokenTexts);
}
/** * Parses a sentence and returns the PCFG score as a confidence measure. * * @param sentence a sentence * @return PCFG score */ @SuppressWarnings("unchecked") public static double getPCFGScore(String sentence) { if (tlp == null || parser == null) throw new RuntimeException("Parser has not been initialized"); // parse the sentence to produce PCFG score log.debug("Parsing sentence"); double score; synchronized (parser) { Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence)); List<Word> words = tokenizer.tokenize(); log.debug("Tokenization: " + words); parser.parse(new Sentence(words)); score = parser.getPCFGScore(); } return score; }
/**
 * Tokenizes the input and builds a word alignment between the raw
 * whitespace-split input tokens and the tokenizer's output tokens, by greedily
 * matching concatenated output pieces back to each input token.
 *
 * @param input raw input text
 * @return symmetric alignment between input tokens and tokenized output
 */
@Override
public SymmetricalWordAlignment processAndAlign(String input) {
    input = input.trim();
    // Run through the tokenizer and convert to sequence
    String tokenizerInput = toUncased(input);
    String[] uncasedInputTokens = tokenizerInput.split("\\s+");
    Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new StringReader(tokenizerInput));
    List<CoreLabel> outputTokens = tokenizer.tokenize();
    IString[] outputSequence = new IString[outputTokens.size()];
    for (int i = 0; i < outputSequence.length; ++i) {
        String outputToken = outputTokens.get(i).get(TextAnnotation.class);
        outputSequence[i] = new IString(outputToken);
    }
    // Whitespace tokenization of input, create alignment
    Sequence<IString> inputSequence = IStrings.tokenize(input);
    assert inputSequence.size() == uncasedInputTokens.length;
    SymmetricalWordAlignment alignment = new SymmetricalWordAlignment(
        inputSequence, new SimpleSequence<IString>(true, outputSequence));
    // Generate the alignments: i walks input tokens, j walks output tokens;
    // inputToken accumulates output pieces until they spell the current input token.
    StringBuilder inputToken = new StringBuilder();
    for (int i = 0, j = 0, limit = outputTokens.size(); j < limit; ++j) {
        CoreLabel tokenizedToken = outputTokens.get(j);
        String inputTokenPart = toUncased(tokenizedToken.get(OriginalTextAnnotation.class));
        alignment.addAlign(i, j);
        inputToken.append(inputTokenPart);
        if (i >= uncasedInputTokens.length) {
            // Output pieces could not be matched back onto the input tokens.
            System.err.println("WARNING: Non-invertible input: " + input);
            break;
        }
        if (uncasedInputTokens[i].equals(inputToken.toString())) {
            // Current input token fully covered; advance and reset accumulator.
            ++i;
            inputToken = new StringBuilder();
        }
    }
    return alignment;
}
private static List<TypedDependency> getDependencies(String sentence) { if (pipeline == null) { loadModels(); } TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sentence)); List<CoreLabel> rawWords2 = tok.tokenize(); Tree parse = lp.apply(rawWords2); // parse.pennPrint(); // // System.out.println(parse.toString()); TreebankLanguagePack tlp = lp.treebankLanguagePack(); // PennTreebankLanguagePack for English GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); return tdl; }
/**
 * parse sentence and generate .trees file
 *
 * @param en      path of the English input file (read line by line; one sentence per line)
 * @param align   path of the alignment file; a name starting with "no_align" disables alignments
 * @param out     path of the .trees output file
 * @param verbose if true, additionally print each tree's full form to stderr
 */
public static void parse(String en, String align, String out, boolean verbose) {
    // use alignments?
    boolean use_alignments = true;
    if (align.startsWith("no_align")) {
        use_alignments = false;
        System.err.println("Not using alignments.");
    } else {
        System.err.println("Using alignments from " + align);
    }
    // setup stanfordparser
    String grammar = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String[] options = {"-outputFormat", "wordsAndTags, typedDependencies"};
    LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
    TreebankLanguagePack tlp = lp.getOp().langpack();
    // filter accepts everything, i.e. punctuation is kept
    java.util.function.Predicate<java.lang.String> punctuationFilter = x -> true;
    GrammaticalStructureFactory gsf =
        new edu.stanford.nlp.trees.EnglishGrammaticalStructureFactory(punctuationFilter);
    // read document: tokenize each line into one sentence
    Iterable<List<? extends HasWord>> sentences;
    Reader r = new Reader(en);
    String line = null;
    List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>();
    while ((line = r.getNext()) != null) {
        Tokenizer<? extends HasWord> token = tlp.getTokenizerFactory().getTokenizer(new StringReader(line));
        List<? extends HasWord> sentence = token.tokenize();
        tmp.add(sentence);
    }
    sentences = tmp;
    // set up alignment file reader
    Reader alignment = new Reader();
    if (use_alignments) {
        alignment = new Reader(align);
    }
    // set up tree file writer
    Writer treeWriter = new Writer(out);
    // parse
    long start = System.currentTimeMillis();
    // System.err.print("Parsing sentences ");
    int sentID = 0;
    for (List<? extends HasWord> sentence : sentences) {
        Tree t = new Tree();
        // t.setSentID(++sentID);
        System.err.println("parse Sentence :" + sentence + "...");
        // System.err.print(".");
        System.err.println("-----------------------------------------------------------------------");
        edu.stanford.nlp.trees.Tree parse = lp.parse(sentence);
        // parse.pennPrint();
        // List for root node and lexical nodes
        List<Node> loneNodes = new LinkedList<Node>();
        List<Node> governingNodes = new LinkedList<Node>();
        // ROOT node
        Node root = new Node(true, true);
        root.setTag("ROOT");
        t.setRoot(root);
        loneNodes.add(root);
        governingNodes.add(root);
        // tagging: build one lexical node and one "_gov" governing node per token
        int counter = 0;
        String surface = "";
        String tag = "";
        for (TaggedWord tw : parse.taggedYield()) {
            Node n = new Node();
            Node governingNode = new Node();
            n.setNodeID(++counter);
            surface = tw.value();
            tag = tw.tag();
            // un-escape PTB bracket tokens back to their literal characters
            if (surface.startsWith("-LRB-")) {
                surface = "(";
            } else if (surface.startsWith("-RRB-")) {
                surface = ")";
            // } else if (surface.startsWith("-LSB-")){
            //     surface = "[";
            // } else if (surface.startsWith("-RSB-")){
            //     surface = "]";
            // } else if (surface.startsWith("-LCB-")){
            //     surface = "{";
            // } else if (surface.startsWith("-RCB-")){
            //     surface = "}";
            } else if (surface.startsWith("''")) {
                surface = "\"";
            }
            // escape characters that are special in the .trees output format
            tag = tag.replaceAll("#", "-NUM-");
            surface = surface.replaceAll("&", "-AMP-");
            surface = surface.replaceAll("#", "-NUM-");
            surface = surface.replaceAll(">", "-GRE-");
            surface = surface.replaceAll("=", "-EQU-");
            n.setInitialLexicalIndex(counter);
            governingNode.setInitialLexicalIndex(counter);
            n.setSurface(surface);
            // System.out.print("("+tw.value()+" : ");
            n.setTag(tag);
            governingNode.setTag("_" + tag);
            governingNode.setLabel("_gov");
            // System.out.print(tw.tag()+")");
            loneNodes.add(n);
            governingNodes.add(governingNode);
            governingNode.setChild(n);
        }
        // System.out.println("");
        // t.setSentLength(t.getNodes().size() - 1);
        // List<Node> loneNodes = new LinkedList<Node>();
        Node[] nodes = new Node[2000];
        // labeling: wire up dependency relations between nodes
        int depIndex;
        int govIndex;
        String[] depInfo;
        String[] govInfo;
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        List<TypedDependency> tdl = gs.typedDependencies(false);
        // List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
        for (TypedDependency td : tdl) {
            // indices are 1-based; index 0 is the ROOT node added above
            depIndex = td.dep().index();
            govIndex = td.gov().index();
            // System.out.println("Index1:"+depIndex);
            // System.out.println("Index2:"+govIndex);
            // if (nodes[depIndex] == null){
            //     System.out.println("Making node!");
            //     nodes[depIndex] = new Node();
            // }
            // if (nodes[govIndex] == null){
            //     System.out.println("Making node!");
            //     nodes[govIndex] = new Node();
            // }
            Node dep = loneNodes.get((depIndex));
            Node gov = governingNodes.get((govIndex));
            Node depcopy = governingNodes.get((depIndex));
            Node govcopy = loneNodes.get((govIndex));
            dep.setLabel(td.reln().toString());
            depcopy.setLabel(td.reln().toString());
            govcopy.setLabel("head");
            // System.out.println(td.toString());
            govInfo = td.gov().toString().split("/");
            depInfo = td.dep().toString().split("/");
            // System.out.println(td.gov().toString());
            // System.out.println(td.dep().toString());
            // dep.setSurface(depInfo[0]);
            // dep.setTag(depInfo[1]);
            gov.setChild(governingNodes.get(depIndex));
            governingNodes.get(depIndex).setParent(gov);
            // gov.setChild(dep);
            dep.setParent(governingNodes.get(depIndex));
        }
        // t.setRoot(nodes[0]);
        // Collapse tree to remove unneeded governing nodes:
        Node gov;
        Node dep;
        Node parent;
        List<Node> children;
        for (int i = 1; i < governingNodes.size(); i++) { // start with index 1 to skip root
            gov = governingNodes.get(i);
            dep = loneNodes.get(i);
            if (gov.getChildren().size() <= 1) {
                // governing node has no dependents of its own -> splice it out,
                // replacing it in its parent's child list by the lexical node
                int k = 0;
                parent = gov.getParent();
                children = parent.getChildren();
                for (Node n : children) {
                    if (n == gov) {
                        gov.getParent().replaceChild(k, dep);
                        dep.setParent(gov.getParent());
                    }
                    k++;
                }
            }
        }
        // Mark head nodes with appropriate label:
        int k = 0;
        for (Node n : loneNodes) {
            if (k != 0) {
                // NOTE(review): == compares String references, not contents —
                // this only works if both labels are the same interned/shared
                // instance; confirm whether .equals() was intended.
                if (n.getLabel() == n.getParent().getLabel()) {
                    n.setLabel("head");
                }
            } else {
                // index 0 is the ROOT placeholder
                n.setLabel("null");
            }
            k++;
        }
        // Sort lexical children of each governing node in lexical order
        for (Node n : governingNodes) {
            n.sortChildrenByInitialIndex();
        }
        // combine with alignment
        if (use_alignments) {
            t.initialize(alignment.readNextAlign());
        } else {
            t.initializeUnaligned();
        }
        // write tree to file
        treeWriter.write(t);
        // print tree to console
        System.out.println(t.toSentence());
        if (verbose) {
            System.err.println(t.toString());
            // t.recursivePrint();
        }
        System.err.println("#######################################################################");
    }
    long stop = System.currentTimeMillis();
    System.err.println("...done! [" + (stop - start) / 1000 + " sec].");
    treeWriter.close();
}
/**
 * Tokenizes the sentence and runs the POS parser over the tokens.
 *
 * @param sentence a sentence
 * @return parse tree produced by the parser
 */
private Tree getPosTree(String sentence) {
    final List<CoreLabel> tokens =
        tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    return parser.apply(tokens);
}
/**
 * Tokenizes the given string using the shared tokenizer factory.
 *
 * @param str text to tokenize
 * @return list of tokens
 */
private static List<CoreLabel> tokenize(String str) {
    return tokenizerFactory.getTokenizer(new StringReader(str)).tokenize();
}
public HashMap<String,ArrayList<TreeData>> parseAllDocs() throws IOException{ String grammar = "./jsan_resources/englishPCFG.ser.gz"; String[] options = { "-maxLength", "120", "-retainTmpSubcategories" }; // LexicalizedParser lp = new LexicalizedParser(grammar, options); LexicalizedParser lp = new LexicalizedParser() TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); Iterable<List<? extends HasWord>> sentences; ArrayList<HashMap<String,ArrayList<String>>> everything = new ArrayList<HashMap<String,ArrayList<String>>>(3); everything.add(0,otherSampleStrings); everything.add(1,authorSampleStrings); everything.add(2,toModifyStrings); Iterator<HashMap<String,ArrayList<String>>> everythingIter = everything.iterator(); int docTypeNumber = -1; // 0 for otherSampleStrings, 1 for authorSampleStrings, 2 for toModifyStrings int numLoaded = 0; while(everythingIter.hasNext()){ docTypeNumber++; HashMap<String,ArrayList<String>> currentSampleStrings = docPathFinder(); Set<String> currentDocStrings = currentSampleStrings.keySet(); Iterator<String> docStrIter = currentDocStrings.iterator(); String docID; ArrayList<String> sentenceTokens; allTreeProcessors[docTypeNumber] = new TreeProcessor(); allTreeProcessors[docTypeNumber].clearLoadedTreeDataMaps(); numLoaded=0; while(docStrIter.hasNext()){ docID = docStrIter.next(); sentenceTokens = currentSampleStrings.get(docID); if(sentenceTokens == null){ allTreeProcessors[docTypeNumber].loadTreeDataMap(docID, GRAMMAR_DIR, false); numLoaded++; continue; } //System.out.println(sentenceTokens.size()+", strIter.hasNext? -> "+strIter.hasNext()); numSentences = sentenceTokens.size(); //initialize(numSentences); Iterator<String> sentIter = sentenceTokens.iterator(); List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>(); String tempSent; while(sentIter.hasNext()){ tempSent = sentIter.next(); Tokenizer<? 
extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(tempSent)); List<? extends HasWord> sentenceTokenized = toke.tokenize(); tmp.add(sentenceTokenized); } sentences = tmp; //int numDone = 0; TreeProcessor.singleDocMap.clear(); boolean willSaveResults = true; for (List<? extends HasWord> sentence : sentences) { Tree parse = lp.apply(sentence); //parse.pennPrint(); //System.out.println(parse.treeSkeletonCopy().toString()); //System.out.println(parse.taggedYield()); //System.out.println(); //printSubTrees(parse); //TreeContainer.recurseTree(parse,"breadth"); allTreeProcessors[docTypeNumber].processTree(parse, 0, willSaveResults); //System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\(")); //numDone++; //System.out.println("sent "+numDone+" of "+numSentences+" done "); //System.out.println(tc.processedTrees.toString()); //in.nextLine(); //TreeContainer.recurseTree(parse, "depth"); //in.nextLine(); //addTree(parse); //GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);//TODO: LOOK AT THIS //Collection tdl = gs.typedDependenciesCCprocessed(true); //System.out.println(tdl); //System.out.println(); } if(willSaveResults == true) ObjectIO.writeObject(TreeProcessor.singleDocMap,docID, GRAMMAR_DIR); //System.out.println("After all sents: "); //System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\(")); //String sent3 = "This is one last test!"; //Tree parse3 = lp.apply(sent3); //parse3.pennPrint(); //System.out.println("After sorting and writing:"); //System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\(")); //Scanner in = new Scanner(System.in); //System.out.println("First one done."); //in.nextLine(); //viewTrees(); } //TreeProcessor.writeTreeDataToCSV(sortedTD,docID); allTreeProcessors[docTypeNumber].unmergedMaps = new ArrayList<HashMap<String,TreeData>>(numLoaded+1); } int i= 0; allParsedAndOrdered.clear(); String[] docTypes = new 
String[]{"otherSample","authorSample","toModify"}; for(i=0; i < 3; i++){ allTreeProcessors[i].unmergedMaps.add(allTreeProcessors[i].processedTrees); allTreeProcessors[i].unmergedMaps.addAll(allTreeProcessors[i].loadedTreeDataMaps); allTreeProcessors[i].mergeTreeDataLists(allTreeProcessors[i].unmergedMaps); allParsedAndOrdered.put(docTypes[i],allTreeProcessors[i].sortTreeData(allTreeProcessors[i].mergedMap)); } //ArrayList<TreeData> sortedTD = TreeContainer.sortTreeData(TreeContainer.allProcessedTrees); //TreeContainer.writeTreeDataToCSV(sortedTD,"ALL_AUTHORS"); return allParsedAndOrdered; }
/**
 * Tokenize a sentence in the argument, and print out the tokens to the console.
 *
 * @param args Set the first argument as the sentence to
 * <p>be tokenized.
 */
public static void main(String[] args) {
    TokenizerFactory<Word> factory = PTBTokenizerFactory.newTokenizerFactory();
    List<Word> tokens = factory.getTokenizer(new StringReader(args[0])).tokenize();
    System.out.println(tokens);
}
/**
 * Runs the Arabic Treebank tokenizer over the file named by the first
 * command-line argument and prints each token to stdout.
 *
 * @param args args[0] is the path of the file to tokenize
 * @throws IOException if the input file cannot be read
 */
public static void main(String[] args) throws IOException {
    Tokenizer<String> tokenizer = new ArabicTreebankTokenizer(new FileReader(args[0]));
    while (tokenizer.hasNext()) {
        System.out.print(tokenizer.next());
    }
}