/**
 * Asserts that {@code result} matches {@code expected}, comparing the two trees by their string
 * renderings.
 *
 * <p>When {@code expected} is {@code null} the result must be {@code null} as well. When
 * {@code expected} is non-null and {@code result} is {@code null}, the check fails as an ordinary
 * assertion failure rather than with a {@link NullPointerException}.
 *
 * @param expected the tree the test expects, or {@code null} if no tree is expected
 * @param result the tree actually produced, possibly {@code null}
 */
private void verifyTree(Tree expected, Tree result) {
  if (expected == null) {
    assertEquals(expected, result);
    return;
  }
  // Do not dereference a missing result: hand null to the assertion so the failure message shows
  // the expected rendering instead of the test dying on a NullPointerException.
  String actual = (result == null) ? null : result.toString();
  assertEquals(expected.toString(), actual);
}
private boolean LexicalAnalyzer(ArrayList<Word> words, int index, String newWord) { String[] sent = toSentence(words); /// lexical analyzer List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent); Tree parse = lp.apply(rawWords); // PrintStream outa = new PrintStream(new FileOutputStream("output1.txt")); // System.setOut(outa); // System.out.println("KKKKKKK"); // parse.pennPrint(); String oldTree = parse.toString(); // String oldTree=baos.toString(); // System.setOut(new PrintStream(new FileOutputStream(FileDescriptor.out))); // System.out.println(oldTree); words.get(index).setNewValue(newWord); sent = toSentence(words); rawWords = Sentence.toCoreLabelList(sent); parse = lp.apply(rawWords); // PrintStream outb = new PrintStream(new FileOutputStream("output2.txt")); // System.setOut(outb); // parse.pennPrint(); String newTree = parse.toString(); oldTree = oldTree.replaceAll(words.get(index).getOrigValue() + "[)]", newWord + ")"); // System.setOut(new PrintStream(new FileOutputStream(FileDescriptor.out))); System.out.println(oldTree + "\n" + newTree); // System.out.println(oldTree.equals(newTree)); if (oldTree.equals(newTree)) { if (index == 0) { String str = words.get(index).getNewValue(); String cap = str.substring(0, 1).toUpperCase() + str.substring(1); words.get(index).setNewValue(cap); } return true; } else { words.get(index).setNewValue(null); return false; } /* catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); return false; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return false; }*/ // return true; }
/** * Parses a sentence and returns a string representation of the parse tree. * * @param sentence a sentence * @return Tree whose Label is a MapLabel containing correct begin and end character offsets in * keys BEGIN_KEY and END_KEY */ @SuppressWarnings("unchecked") public static String parse(String sentence) { if (tlp == null || parser == null) throw new RuntimeException("Parser has not been initialized"); // parse the sentence to produce stanford Tree log.debug("Parsing sentence"); Tree tree = null; synchronized (parser) { Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence)); List<Word> words = tokenizer.tokenize(); log.debug("Tokenization: " + words); parser.parse(new Sentence(words)); tree = parser.getBestParse(); } // label tree with character extents // log.debug("Setting character extents"); // updateTreeLabels(tree, tree, new MutableInteger(), new MutableInteger(-1)); // log.debug("Creating offset mapping"); // List<RangeMap> mapping = createMapping(sentence); // log.debug(mapping.toString()); // log.debug("Applying offset mapping"); // mapOffsets(tree, mapping); return tree.toString().replaceAll(" \\[[\\S]+\\]", ""); }
public static String getCleanedUpYield(Tree inputTree) { Tree copyTree = inputTree.deepCopy(); if (DEBUG) System.err.println(copyTree.toString()); String res = copyTree.yield().toString(); if (res.length() > 1) { res = res.substring(0, 1).toUpperCase() + res.substring(1); } // (ROOT (S (NP (NNP Jaguar) (NNS shares)) (VP (VBD skyrocketed) (NP (NN yesterday)) (PP (IN // after) (NP (NP (NNP Mr.) (NNP Ridley) (POS 's)) (NN announcement)))) (. .))) res = res.replaceAll("\\s([\\.,!\\?\\-;:])", "$1"); res = res.replaceAll("(\\$)\\s", "$1"); res = res.replaceAll("can not", "cannot"); res = res.replaceAll("\\s*-LRB-\\s*", " ("); res = res.replaceAll("\\s*-RRB-\\s*", ") "); res = res.replaceAll("\\s*([\\.,?!])\\s*", "$1 "); res = res.replaceAll("\\s+''", "''"); // res = res.replaceAll("\"", ""); res = res.replaceAll("``\\s+", "``"); res = res.replaceAll("\\-[LR]CB\\-", ""); // brackets, e.g., [sic] // remove extra spaces res = res.replaceAll("\\s\\s+", " "); res = res.trim(); return res; }
/**
 * Reads trees from a UTF-8 encoded file, passes each through an {@link ATBCorrector}, and prints
 * the transformed trees to standard output. The number of trees processed is reported on
 * standard error. I/O failures are reported with a stack trace.
 *
 * @param args exactly one element: the path of the tree file to read
 */
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.println("Usage: java " + ATBCorrector.class.getName() + " filename\n");
    System.exit(-1);
  }

  TreeTransformer tt = new ATBCorrector();
  File f = new File(args[0]);

  BufferedReader br = null;
  try {
    br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
    TreeReaderFactory trf = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory();
    TreeReader tr = trf.newTreeReader(br);

    int nTrees = 0;
    for (Tree t; (t = tr.readTree()) != null; nTrees++) {
      Tree fixedT = tt.transformTree(t);
      System.out.println(fixedT.toString());
    }

    tr.close();

    System.err.printf("Wrote %d trees%n", nTrees);

  } catch (UnsupportedEncodingException e) {
    e.printStackTrace();
  } catch (IOException e) {
    e.printStackTrace();
  } finally {
    // Release the file even when reading or transforming a tree throws. Closing an already
    // closed reader has no effect, so this is harmless after a successful tr.close().
    if (br != null) {
      try {
        br.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}
/**
 * Renders the leaves of {@code tree} as one space-separated string, with leading and trailing
 * whitespace removed.
 *
 * @param tree A (partial) syntax tree
 * @return the string forms of the tree's leaves, in order, joined by single spaces
 */
public static String printTree(Tree tree) {
  final StringBuilder text = new StringBuilder();
  for (final Tree leaf : tree.getLeaves()) {
    // Put the separator before every leaf except the first piece of output; the final trim()
    // makes this equivalent to appending a space after each leaf.
    if (text.length() > 0) {
      text.append(' ');
    }
    text.append(leaf.toString());
  }
  final String joined = text.toString();
  return joined.trim();
}
/**
 * Renders a tree either in its standard string form or as plain leaf values.
 *
 * @param tree the tree to render
 * @param plainPrint if {@code true}, return the value of each leaf label followed by a single
 *     space (so the result ends with a space when there is at least one leaf); if {@code false},
 *     return {@code tree.toString()}
 * @return the rendered text
 */
private static String toString(Tree tree, boolean plainPrint) {
  if (plainPrint) {
    StringBuilder words = new StringBuilder();
    for (Tree leaf : tree.getLeaves()) {
      // Leaf labels are cast to CoreLabel; any other label type fails with a ClassCastException.
      CoreLabel label = (CoreLabel) leaf.label();
      words.append(label.value());
      words.append(' ');
    }
    return words.toString();
  }
  return tree.toString();
}
/**
 * Scores one guess/gold tree pair and adds the result to the running totals kept in this
 * object's fields.
 *
 * <p>Lineages are built for both trees with {@code makeLineages}. If either tree is
 * {@code null}, or the two lineage lists differ in length, a diagnostic is written to standard
 * error and no totals are changed. Otherwise each aligned lineage pair is scored as
 * {@code 1 - distance / (guessLength + goldLength)}, where {@code distance} is the value returned
 * by {@code editDistance}. The per-pair score is passed to {@code updateCatAverages}, and the
 * corpus and sentence accumulators are updated. A sentence whose mean score is exactly
 * {@code 1.0} counts as an exact match.
 *
 * @param guess the tree being evaluated
 * @param gold the reference tree
 * @param pw not used by this method
 */
public void evaluate(Tree guess, Tree gold, PrintWriter pw) {
  if (gold == null || guess == null) {
    System.err.printf(
        "%s: Cannot compare against a null gold or guess tree!%n", this.getClass().getName());
    return;
  }

  final List<List<CoreLabel>> guessLineages = makeLineages(guess);
  final List<List<CoreLabel>> goldLineages = makeLineages(gold);

  // The lineages are compared position by position, so both lists must have the same length.
  if (guessLineages.size() != goldLineages.size()) {
    System.err.printf(
        "%s: Number of guess (%d) gold (%d) don't match!%n",
        this.getClass().getName(), guessLineages.size(), goldLineages.size());
    System.err.println("Cannot evaluate!");
    System.err.printf("GUESS tree:%n%s%n", guess.toString());
    System.err.printf("GOLD tree:%n%s%n", gold.toString());
    return;
  }

  double sentenceTotal = 0.0;
  for (int idx = 0; idx < guessLineages.size(); idx++) {
    final List<CoreLabel> guessPath = guessLineages.get(idx);
    final List<CoreLabel> goldPath = goldLineages.get(idx);

    final double distance = editDistance(guessPath, goldPath);
    final double combinedLength = (double) (guessPath.size() + goldPath.size());
    final double similarity = 1.0 - (distance / combinedLength);

    sentenceTotal += similarity;
    updateCatAverages(goldPath, similarity);
  }

  // Corpus-level totals: sum of scores and number of lineages seen.
  corpusAvg += sentenceTotal;
  corpusNum += goldLineages.size();

  // Sentence-level totals: mean score of this sentence and number of sentences seen.
  final double sentenceMean = sentenceTotal / goldLineages.size();
  if (sentenceMean == 1.0) {
    sentExact++;
  }
  sentAvg += sentenceMean;
  sentNum++;
}
public static void main(String[] args) { if (args.length < minArgs) { System.out.println(usage()); System.exit(-1); } Properties options = StringUtils.argsToProperties(args, argDefs()); Language language = PropertiesUtils.get(options, "l", Language.English, Language.class); TreebankLangParserParams tlpp = language.params; DiskTreebank tb = null; String encoding = options.getProperty("l", "UTF-8"); boolean removeBracket = PropertiesUtils.getBool(options, "b", false); tlpp.setInputEncoding(encoding); tlpp.setOutputEncoding(encoding); tb = tlpp.diskTreebank(); String[] files = options.getProperty("", "").split("\\s+"); if (files.length != 0) { for (String filename : files) { tb.loadPath(filename); } } else { log.info(usage()); System.exit(-1); } PrintWriter pwo = tlpp.pw(); String startSymbol = tlpp.treebankLanguagePack().startSymbol(); TreeFactory tf = new LabeledScoredTreeFactory(); int nTrees = 0; for (Tree t : tb) { if (removeBracket) { if (t.value().equals(startSymbol)) { t = t.firstChild(); } } else if (!t.value().equals(startSymbol)) { // Add a bracket if it isn't already there t = tf.newTreeNode(startSymbol, Collections.singletonList(t)); } pwo.println(t.toString()); nTrees++; } pwo.close(); System.err.printf("Processed %d trees.%n", nTrees); }
public static String createParseTree(String sentence) { Tree tree = parse(sentence); // System.out.println(tree.toString()); return (tree.toString() + "\n"); }
@Override public void process(JCas jCas) throws AnalysisEngineProcessException { Annotation document = this.processor.process(jCas.getDocumentText()); String lastNETag = "O"; int lastNEBegin = -1; int lastNEEnd = -1; for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) { // create the token annotation int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class); int end = tokenAnn.get(CharacterOffsetEndAnnotation.class); String pos = tokenAnn.get(PartOfSpeechAnnotation.class); String lemma = tokenAnn.get(LemmaAnnotation.class); Token token = new Token(jCas, begin, end); token.setPos(pos); token.setLemma(lemma); token.addToIndexes(); // hackery to convert token-level named entity tag into phrase-level tag String neTag = tokenAnn.get(NamedEntityTagAnnotation.class); if (neTag.equals("O") && !lastNETag.equals("O")) { NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd); ne.setMentionType(lastNETag); ne.addToIndexes(); } else { if (lastNETag.equals("O")) { lastNEBegin = begin; } else if (lastNETag.equals(neTag)) { // do nothing - begin was already set } else { NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd); ne.setMentionType(lastNETag); ne.addToIndexes(); lastNEBegin = begin; } lastNEEnd = end; } lastNETag = neTag; } if (!lastNETag.equals("O")) { NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd); ne.setMentionType(lastNETag); ne.addToIndexes(); } // add sentences and trees for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) { // add the sentence annotation int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class); int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class); Sentence sentence = new Sentence(jCas, sentBegin, sentEnd); sentence.addToIndexes(); // add the syntactic tree annotation List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class); Tree tree = sentenceAnn.get(TreeAnnotation.class); if (tree.children().length != 1) { 
throw new RuntimeException("Expected single root node, found " + tree); } tree = tree.firstChild(); tree.indexSpans(0); TopTreebankNode root = new TopTreebankNode(jCas); root.setTreebankParse(tree.toString()); // TODO: root.setTerminals(v) this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns); // get the dependencies SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class); // convert Stanford nodes to UIMA annotations List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence); Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>(); for (IndexedWord stanfordNode : dependencies.vertexSet()) { int indexBegin = stanfordNode.get(BeginIndexAnnotation.class); int indexEnd = stanfordNode.get(EndIndexAnnotation.class); int tokenBegin = tokens.get(indexBegin).getBegin(); int tokenEnd = tokens.get(indexEnd - 1).getEnd(); DependencyNode node; if (dependencies.getRoots().contains(stanfordNode)) { node = new TopDependencyNode(jCas, tokenBegin, tokenEnd); } else { node = new DependencyNode(jCas, tokenBegin, tokenEnd); } stanfordToUima.put(stanfordNode, node); } // create relation annotations for each Stanford dependency ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create(); ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create(); for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) { DependencyRelation relation = new DependencyRelation(jCas); DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor()); DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent()); String relationType = stanfordEdge.getRelation().toString(); if (head == null || child == null || relationType == null) { throw new RuntimeException( String.format( "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation, child, head)); } relation.setHead(head); 
relation.setChild(child); relation.setRelation(relationType); relation.addToIndexes(); headRelations.put(child, relation); childRelations.put(head, relation); } // set the relations for each node annotation for (DependencyNode node : stanfordToUima.values()) { List<DependencyRelation> heads = headRelations.get(node); node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size())); if (heads != null) { FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads); } List<DependencyRelation> children = childRelations.get(node); node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size())); if (children != null) { FSCollectionFactory.fillArrayFS(node.getChildRelations(), children); } node.addToIndexes(); } } // map from spans to named entity mentions Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>(); for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) { spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention); } // add mentions for all entities identified by the coreference system List<NamedEntity> entities = new ArrayList<NamedEntity>(); List<List<Token>> sentenceTokens = new ArrayList<List<Token>>(); for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) { sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence)); } Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class); for (CorefChain chain : corefChains.values()) { List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>(); for (CorefMention corefMention : chain.getMentionsInTextualOrder()) { // figure out the character span of the token List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1); int begin = tokens.get(corefMention.startIndex - 1).getBegin(); int end = tokens.get(corefMention.endIndex - 2).getEnd(); // use an existing named entity mention when possible; otherwise create a new one NamedEntityMention mention = 
spanMentionMap.get(new Span(begin, end)); if (mention == null) { mention = new NamedEntityMention(jCas, begin, end); mention.addToIndexes(); } mentions.add(mention); } // create an entity for the mentions Collections.sort( mentions, new Comparator<NamedEntityMention>() { @Override public int compare(NamedEntityMention m1, NamedEntityMention m2) { return m1.getBegin() - m2.getBegin(); } }); // create mentions and add them to entity NamedEntity entity = new NamedEntity(jCas); entity.setMentions(new FSArray(jCas, mentions.size())); int index = 0; for (NamedEntityMention mention : mentions) { mention.setMentionedEntity(entity); entity.setMentions(index, mention); index += 1; } entities.add(entity); } // add singleton entities for any named entities not picked up by coreference system for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) { if (mention.getMentionedEntity() == null) { NamedEntity entity = new NamedEntity(jCas); entity.setMentions(new FSArray(jCas, 1)); entity.setMentions(0, mention); mention.setMentionedEntity(entity); entity.getMentions(); entities.add(entity); } } // sort entities by document order Collections.sort( entities, new Comparator<NamedEntity>() { @Override public int compare(NamedEntity o1, NamedEntity o2) { return getFirstBegin(o1) - getFirstBegin(o2); } private int getFirstBegin(NamedEntity entity) { int min = Integer.MAX_VALUE; for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) { if (mention.getBegin() < min) { min = mention.getBegin(); } } return min; } }); // add entities to document for (NamedEntity entity : entities) { entity.addToIndexes(); } }