/**
 * Asserts that two trees are either both {@code null} or have equal {@code toString()}
 * renderings.
 *
 * @param expected the expected tree; may be {@code null}
 * @param result the tree under test; may be {@code null}
 */
private void verifyTree(Tree expected, Tree result) {
   if (expected == null) {
     assertEquals(expected, result);
     return;
   }
   // A null result must fail the assertion rather than raise a NullPointerException.
   String actualText = (result == null) ? null : result.toString();
   assertEquals(expected.toString(), actualText);
 }
Exemple #2
0
  /**
   * Checks whether substituting {@code newWord} for the word at {@code index} leaves the parse
   * of the sentence unchanged.
   *
   * <p>The sentence is parsed once as it stands and once after {@code setNewValue(newWord)} has
   * been applied to the word at {@code index}. In the first parse string, every occurrence of
   * the word's original value immediately followed by {@code ")"} is replaced with
   * {@code newWord}; the substitution is accepted when the two strings are then equal. Both
   * strings are printed to {@code System.out}.
   *
   * <p>On acceptance at {@code index == 0} the stored new value is re-set with its first
   * character upper-cased. On rejection the new value is reset to {@code null}.
   *
   * @param words the words of the sentence; the element at {@code index} is modified
   * @param index position of the word to substitute
   * @param newWord the candidate replacement
   * @return {@code true} if the substitution was accepted, {@code false} otherwise
   */
  private boolean LexicalAnalyzer(ArrayList<Word> words, int index, String newWord) {
    // Parse the sentence as it currently stands.
    String[] sent = toSentence(words);
    List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
    Tree parse = lp.apply(rawWords);
    String oldTree = parse.toString();

    // Apply the candidate word and parse again.
    words.get(index).setNewValue(newWord);
    sent = toSentence(words);
    rawWords = Sentence.toCoreLabelList(sent);
    parse = lp.apply(rawWords);
    String newTree = parse.toString();

    // Literal substitution: String.replace does not interpret regex metacharacters in the
    // original word (e.g. ".") or "$"/"\" in the new word, unlike replaceAll.
    oldTree = oldTree.replace(words.get(index).getOrigValue() + ")", newWord + ")");
    System.out.println(oldTree + "\n" + newTree);

    if (!oldTree.equals(newTree)) {
      words.get(index).setNewValue(null);
      return false;
    }

    if (index == 0) {
      String str = words.get(index).getNewValue();
      // Guard against null/empty values, for which substring(0, 1) would throw.
      if (str != null && !str.isEmpty()) {
        String cap = str.substring(0, 1).toUpperCase() + str.substring(1);
        words.get(index).setNewValue(cap);
      }
    }
    return true;
  }
  /**
   * Tokenizes and parses a sentence and returns the best parse rendered as a string.
   *
   * <p>Tokenization and parsing run while holding the monitor of {@code parser}.
   *
   * @param sentence the sentence text to parse
   * @return the {@code toString()} form of the best parse, with every run of a space followed
   *     by a square-bracketed group of non-whitespace characters removed
   * @throws RuntimeException if {@code tlp} or {@code parser} is {@code null}
   */
  @SuppressWarnings("unchecked")
  public static String parse(String sentence) {
    if (parser == null || tlp == null) {
      throw new RuntimeException("Parser has not been initialized");
    }

    log.debug("Parsing sentence");
    final Tree bestParse;
    synchronized (parser) {
      // Raw Tokenizer type: this is what the unchecked suppression above covers.
      Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
      List<Word> tokens = tokenizer.tokenize();
      log.debug("Tokenization: " + tokens);
      parser.parse(new Sentence(tokens));
      bestParse = parser.getBestParse();
    }

    // Strip " [...]" groups from the rendered tree before handing it back.
    final String rendered = bestParse.toString();
    return rendered.replaceAll(" \\[[\\S]+\\]", "");
  }
  /**
   * Produces a tidied, sentence-like string from the yield of a tree.
   *
   * <p>The yield's string form has its first character upper-cased (only when it is longer than
   * one character), after which a fixed sequence of regex clean-ups is applied, in order:
   * whitespace before {@code . , ! ? - ; :} is dropped, whitespace after {@code $} is dropped,
   * "can not" becomes "cannot", {@code -LRB-}/{@code -RRB-} become parentheses, {@code . , ? !}
   * are followed by exactly one space, whitespace next to {@code ''} and {@code ``} quotes is
   * removed, {@code -LCB-}/{@code -RCB-} are deleted, and whitespace runs are collapsed.
   *
   * @param inputTree the tree whose yield is cleaned up; a deep copy is taken before reading it
   * @return the cleaned-up, trimmed text
   */
  public static String getCleanedUpYield(Tree inputTree) {
    Tree treeCopy = inputTree.deepCopy();
    if (DEBUG) {
      System.err.println(treeCopy.toString());
    }

    String text = treeCopy.yield().toString();
    // NOTE(review): a one-character yield is left uncapitalized by this check - confirm intent.
    if (text.length() > 1) {
      text = text.substring(0, 1).toUpperCase() + text.substring(1);
    }

    // The order of these replacements matters; later patterns rely on earlier normalisation.
    return text
        .replaceAll("\\s([\\.,!\\?\\-;:])", "$1")
        .replaceAll("(\\$)\\s", "$1")
        .replaceAll("can not", "cannot")
        .replaceAll("\\s*-LRB-\\s*", " (")
        .replaceAll("\\s*-RRB-\\s*", ") ")
        .replaceAll("\\s*([\\.,?!])\\s*", "$1 ")
        .replaceAll("\\s+''", "''")
        .replaceAll("``\\s+", "``")
        .replaceAll("\\-[LR]CB\\-", "") // brackets, e.g., [sic]
        .replaceAll("\\s\\s+", " ") // collapse runs of whitespace
        .trim();
  }
Exemple #5
0
  /**
   * Reads trees from a UTF-8 file, applies an {@code ATBCorrector} to each one, and prints the
   * transformed trees to standard output, one per line. The number of trees written is reported
   * on standard error.
   *
   * <p>Exits with status -1 after printing a usage message unless exactly one argument is given.
   * I/O failures are reported by printing the stack trace.
   *
   * @param args a single element: the name of the file to read
   */
  public static void main(String[] args) {
    if (args.length != 1) {
      System.err.println("Usage: java " + ATBCorrector.class.getName() + " filename\n");
      System.exit(-1);
    }

    TreeTransformer tt = new ATBCorrector();

    File f = new File(args[0]);
    try {
      BufferedReader br =
          new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
      try {
        TreeReaderFactory trf = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory();
        TreeReader tr = trf.newTreeReader(br);

        int nTrees = 0;
        for (Tree t; (t = tr.readTree()) != null; nTrees++) {
          Tree fixedT = tt.transformTree(t);
          System.out.println(fixedT.toString());
        }

        tr.close();

        System.err.printf("Wrote %d trees%n", nTrees);
      } finally {
        // Release the file even when reading or transforming fails part-way through.
        // Closing an already-closed BufferedReader has no effect.
        br.close();
      }
    } catch (IOException e) {
      // Also covers UnsupportedEncodingException, which is an IOException.
      e.printStackTrace();
    }
  }
  /**
   * Rebuilds the text covered by {@code tree} from its leaves.
   *
   * <p>Each leaf's {@code toString()} is emitted in order, followed by a single space, and the
   * combined string is trimmed before it is returned.
   *
   * @param tree a (partial) syntax tree
   * @return the string forms of the leaves, space-separated and trimmed
   */
  public static String printTree(Tree tree) {
    final StringBuilder text = new StringBuilder();
    for (final Tree leaf : tree.getLeaves()) {
      text.append(leaf.toString());
      text.append(' ');
    }

    final String joined = text.toString();
    return joined.trim();
  }
  /**
   * Renders a tree either in its own {@code toString()} form or as plain leaf values.
   *
   * @param tree the tree to render
   * @param plainPrint when {@code true}, output the value of each leaf's {@code CoreLabel}
   *     followed by a space (so the result ends with a trailing space); when {@code false},
   *     return {@code tree.toString()}
   * @return the rendered string
   */
  private static String toString(Tree tree, boolean plainPrint) {
    if (plainPrint) {
      StringBuilder words = new StringBuilder();
      for (Tree leaf : tree.getLeaves()) {
        CoreLabel label = (CoreLabel) leaf.label();
        words.append(label.value()).append(' ');
      }
      return words.toString();
    }

    return tree.toString();
  }
  /**
   * Scores a guess tree against a gold tree lineage by lineage and folds the result into the
   * running corpus-level and sentence-level totals.
   *
   * <p>Each pair of lineages (as produced by {@code makeLineages}) is scored as
   * {@code 1 - editDistance / (guessLength + goldLength)}. The per-lineage scores are passed to
   * {@code updateCatAverages} and summed into {@code corpusAvg}; their mean is added to
   * {@code sentAvg}, and {@code sentExact} is incremented when that mean is exactly 1.0.
   *
   * <p>If either tree is {@code null}, or the two trees yield a different number of lineages, a
   * message is written to standard error and no totals are changed.
   *
   * @param guess the tree to evaluate
   * @param gold the reference tree
   * @param pw not used by this method
   */
  public void evaluate(Tree guess, Tree gold, PrintWriter pw) {
    if (guess == null || gold == null) {
      System.err.printf(
          "%s: Cannot compare against a null gold or guess tree!%n", this.getClass().getName());
      return;
    }

    final List<List<CoreLabel>> guessLineages = makeLineages(guess);
    final List<List<CoreLabel>> goldLineages = makeLineages(gold);

    // Lineages are compared position by position, so the counts must agree.
    if (guessLineages.size() != goldLineages.size()) {
      System.err.printf(
          "%s: Number of guess (%d) gold (%d) don't match!%n",
          this.getClass().getName(), guessLineages.size(), goldLineages.size());
      System.err.println("Cannot evaluate!");
      System.err.printf("GUESS tree:%n%s%n", guess.toString());
      System.err.printf("GOLD tree:%n%s%n", gold.toString());
      return;
    }

    double sentenceTotal = 0.0;
    for (int pos = 0; pos < guessLineages.size(); pos++) {
      final List<CoreLabel> guessLin = guessLineages.get(pos);
      final List<CoreLabel> goldLin = goldLineages.get(pos);

      final double levDist = editDistance(guessLin, goldLin);
      final double combinedLength = guessLin.size() + goldLin.size();
      final double la = 1.0 - (levDist / combinedLength);

      sentenceTotal += la;
      updateCatAverages(goldLin, la);
    }

    corpusAvg += sentenceTotal;
    corpusNum += goldLineages.size();

    final double sentenceMean = sentenceTotal / goldLineages.size();
    if (sentenceMean == 1.0) {
      sentExact++;
    }
    sentAvg += sentenceMean;
    sentNum++;
  }
  /**
   * Loads the treebank files named on the command line and prints every tree, either removing
   * or adding the top-level start-symbol bracket.
   *
   * <p>Options read here: {@code l} (language, default English), {@code e} (encoding, default
   * UTF-8) and {@code b} (boolean, default false: when set, a start-symbol root is stripped
   * from each tree; otherwise one is added where missing). The remaining arguments are the
   * paths to load. The usage text is shown and the program exits with status -1 when too few
   * arguments or no paths are given.
   *
   * @param args command-line options followed by one or more treebank paths
   */
  public static void main(String[] args) {
    if (args.length < minArgs) {
      System.out.println(usage());
      System.exit(-1);
    }

    Properties options = StringUtils.argsToProperties(args, argDefs());
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    TreebankLangParserParams tlpp = language.params;
    // "l" is the language option read above, so it cannot also carry the encoding.
    // NOTE(review): "e" is assumed to be the encoding flag - confirm against argDefs()/usage().
    String encoding = options.getProperty("e", "UTF-8");
    boolean removeBracket = PropertiesUtils.getBool(options, "b", false);

    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);
    DiskTreebank tb = tlpp.diskTreebank();

    // "".split("\\s+") yields a single empty string, so emptiness has to be tested on the
    // unsplit value; otherwise the usage branch can never be reached.
    String fileArgs = options.getProperty("", "").trim();
    if (fileArgs.isEmpty()) {
      log.info(usage());
      System.exit(-1);
    }
    for (String filename : fileArgs.split("\\s+")) {
      tb.loadPath(filename);
    }

    PrintWriter pwo = tlpp.pw();
    String startSymbol = tlpp.treebankLanguagePack().startSymbol();
    TreeFactory tf = new LabeledScoredTreeFactory();
    int nTrees = 0;
    for (Tree t : tb) {
      if (removeBracket) {
        if (t.value().equals(startSymbol)) {
          t = t.firstChild();
        }

      } else if (!t.value().equals(startSymbol)) { // Add a bracket if it isn't already there
        t = tf.newTreeNode(startSymbol, Collections.singletonList(t));
      }
      pwo.println(t.toString());
      nTrees++;
    }
    pwo.close();
    System.err.printf("Processed %d trees.%n", nTrees);
  }
 /**
  * Parses a sentence and returns the string form of the resulting tree followed by a newline.
  *
  * @param sentence the sentence to parse
  * @return {@code parse(sentence).toString()} with {@code "\n"} appended
  */
 public static String createParseTree(String sentence) {
   final String rendered = parse(sentence).toString();
   return rendered + "\n";
 }
  /**
   * Runs {@code this.processor} over the document text of {@code jCas} and converts the result
   * into CAS annotations.
   *
   * <p>In order, this method creates: a {@code Token} (with POS and lemma) per token; a {@code
   * NamedEntityMention} per run of consecutive tokens sharing a non-"O" entity tag; a {@code
   * Sentence} per sentence; a {@code TopTreebankNode} holding the parse string of each sentence;
   * {@code DependencyNode}s and {@code DependencyRelation}s for each sentence's dependency graph;
   * and a {@code NamedEntity} per coreference chain plus one per mention not in any chain.
   *
   * @param jCas the CAS whose document text is processed and to which annotations are added
   * @throws AnalysisEngineProcessException declared by the overridden method
   * @throws RuntimeException if a sentence's tree does not have exactly one child, or if a
   *     dependency edge refers to a node that was not mapped
   */
  @Override
  public void process(JCas jCas) throws AnalysisEngineProcessException {
    Annotation document = this.processor.process(jCas.getDocumentText());

    // State of the named entity span currently being accumulated; "O" means "not in an entity".
    String lastNETag = "O";
    int lastNEBegin = -1;
    int lastNEEnd = -1;
    for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {

      // create the token annotation
      int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
      int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
      String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
      String lemma = tokenAnn.get(LemmaAnnotation.class);
      Token token = new Token(jCas, begin, end);
      token.setPos(pos);
      token.setLemma(lemma);
      token.addToIndexes();

      // hackery to convert token-level named entity tag into phrase-level tag
      String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
      if (neTag.equals("O") && !lastNETag.equals("O")) {
        // an entity just ended: emit the span accumulated so far
        NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
        ne.setMentionType(lastNETag);
        ne.addToIndexes();
      } else {
        if (lastNETag.equals("O")) {
          // previous token was outside any entity: a span (if any) starts at this token.
          // This also runs when the current tag is "O"; the stored offsets are only read once
          // a non-"O" tag has been seen, by which point they have been overwritten.
          lastNEBegin = begin;
        } else if (lastNETag.equals(neTag)) {
          // do nothing - begin was already set
        } else {
          // tag changed from one entity type to another: close the old span, start a new one
          NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
          ne.setMentionType(lastNETag);
          ne.addToIndexes();
          lastNEBegin = begin;
        }
        lastNEEnd = end;
      }
      lastNETag = neTag;
    }
    // emit the final span if the document ended while still inside an entity
    if (!lastNETag.equals("O")) {
      NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
      ne.setMentionType(lastNETag);
      ne.addToIndexes();
    }

    // add sentences and trees
    for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {

      // add the sentence annotation
      int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
      int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
      Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
      sentence.addToIndexes();

      // add the syntactic tree annotation
      List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
      Tree tree = sentenceAnn.get(TreeAnnotation.class);
      if (tree.children().length != 1) {
        throw new RuntimeException("Expected single root node, found " + tree);
      }
      // drop the outermost node and work with its single child
      tree = tree.firstChild();
      tree.indexSpans(0);
      TopTreebankNode root = new TopTreebankNode(jCas);
      root.setTreebankParse(tree.toString());
      // TODO: root.setTerminals(v)
      this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);

      // get the dependencies
      SemanticGraph dependencies =
          sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);

      // convert Stanford nodes to UIMA annotations
      List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
      Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
      for (IndexedWord stanfordNode : dependencies.vertexSet()) {
        // the node's begin/end indexes are used as positions into this sentence's tokens,
        // with the end index treated as exclusive (hence indexEnd - 1)
        int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
        int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
        int tokenBegin = tokens.get(indexBegin).getBegin();
        int tokenEnd = tokens.get(indexEnd - 1).getEnd();
        DependencyNode node;
        if (dependencies.getRoots().contains(stanfordNode)) {
          node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
        } else {
          node = new DependencyNode(jCas, tokenBegin, tokenEnd);
        }
        stanfordToUima.put(stanfordNode, node);
      }

      // create relation annotations for each Stanford dependency
      // headRelations: keyed by the child node; childRelations: keyed by the head node
      ArrayListMultimap<DependencyNode, DependencyRelation> headRelations =
          ArrayListMultimap.create();
      ArrayListMultimap<DependencyNode, DependencyRelation> childRelations =
          ArrayListMultimap.create();
      for (SemanticGraphEdge stanfordEdge : dependencies.edgeIterable()) {
        DependencyRelation relation = new DependencyRelation(jCas);
        DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
        DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
        String relationType = stanfordEdge.getRelation().toString();
        if (head == null || child == null || relationType == null) {
          throw new RuntimeException(
              String.format(
                  "null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n",
                  relation, child, head));
        }
        relation.setHead(head);
        relation.setChild(child);
        relation.setRelation(relationType);
        relation.addToIndexes();
        headRelations.put(child, relation);
        childRelations.put(head, relation);
      }

      // set the relations for each node annotation
      // NOTE(review): the null checks below are defensive; whether the multimap's get() can
      // return null at all should be confirmed against its documentation.
      for (DependencyNode node : stanfordToUima.values()) {
        List<DependencyRelation> heads = headRelations.get(node);
        node.setHeadRelations(new FSArray(jCas, heads == null ? 0 : heads.size()));
        if (heads != null) {
          FSCollectionFactory.fillArrayFS(node.getHeadRelations(), heads);
        }
        List<DependencyRelation> children = childRelations.get(node);
        node.setChildRelations(new FSArray(jCas, children == null ? 0 : children.size()));
        if (children != null) {
          FSCollectionFactory.fillArrayFS(node.getChildRelations(), children);
        }
        node.addToIndexes();
      }
    }

    // map from spans to named entity mentions
    Map<Span, NamedEntityMention> spanMentionMap = new HashMap<Span, NamedEntityMention>();
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
      spanMentionMap.put(new Span(mention.getBegin(), mention.getEnd()), mention);
    }

    // add mentions for all entities identified by the coreference system
    List<NamedEntity> entities = new ArrayList<NamedEntity>();
    // tokens of each sentence, in the order the sentences are selected from the CAS
    List<List<Token>> sentenceTokens = new ArrayList<List<Token>>();
    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
      sentenceTokens.add(JCasUtil.selectCovered(jCas, Token.class, sentence));
    }
    Map<Integer, CorefChain> corefChains = document.get(CorefChainAnnotation.class);
    for (CorefChain chain : corefChains.values()) {
      List<NamedEntityMention> mentions = new ArrayList<NamedEntityMention>();
      for (CorefMention corefMention : chain.getMentionsInTextualOrder()) {

        // figure out the character span of the token
        // sentNum and startIndex are shifted by 1 and endIndex by 2, i.e. they are treated as
        // 1-based with an exclusive end - presumably CorefMention's convention; confirm.
        List<Token> tokens = sentenceTokens.get(corefMention.sentNum - 1);
        int begin = tokens.get(corefMention.startIndex - 1).getBegin();
        int end = tokens.get(corefMention.endIndex - 2).getEnd();

        // use an existing named entity mention when possible; otherwise create a new one
        NamedEntityMention mention = spanMentionMap.get(new Span(begin, end));
        if (mention == null) {
          // newly created mentions get no mention type here
          mention = new NamedEntityMention(jCas, begin, end);
          mention.addToIndexes();
        }
        mentions.add(mention);
      }

      // order the chain's mentions by their begin offset
      Collections.sort(
          mentions,
          new Comparator<NamedEntityMention>() {
            @Override
            public int compare(NamedEntityMention m1, NamedEntityMention m2) {
              return m1.getBegin() - m2.getBegin();
            }
          });

      // create the entity and link it with each of its mentions, in both directions
      NamedEntity entity = new NamedEntity(jCas);
      entity.setMentions(new FSArray(jCas, mentions.size()));
      int index = 0;
      for (NamedEntityMention mention : mentions) {
        mention.setMentionedEntity(entity);
        entity.setMentions(index, mention);
        index += 1;
      }
      entities.add(entity);
    }

    // add singleton entities for any named entities not picked up by coreference system
    for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
      if (mention.getMentionedEntity() == null) {
        NamedEntity entity = new NamedEntity(jCas);
        entity.setMentions(new FSArray(jCas, 1));
        entity.setMentions(0, mention);
        mention.setMentionedEntity(entity);
        // NOTE(review): the return value is discarded; this call appears to have no effect.
        entity.getMentions();
        entities.add(entity);
      }
    }

    // sort entities by document order
    Collections.sort(
        entities,
        new Comparator<NamedEntity>() {
          @Override
          public int compare(NamedEntity o1, NamedEntity o2) {
            return getFirstBegin(o1) - getFirstBegin(o2);
          }

          // smallest begin offset among the entity's mentions (Integer.MAX_VALUE if it has none)
          private int getFirstBegin(NamedEntity entity) {
            int min = Integer.MAX_VALUE;
            for (NamedEntityMention mention :
                JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) {
              if (mention.getBegin() < min) {
                min = mention.getBegin();
              }
            }
            return min;
          }
        });

    // add entities to document
    for (NamedEntity entity : entities) {
      entity.addToIndexes();
    }
  }