/** * Create a searcher manually, supplying a dependency tree, an optional classifier for when to * split clauses, and a featurizer for that classifier. You almost certainly want to use {@link * ClauseSplitter#load(String)} instead of this constructor. * * @param tree The dependency tree to search over. * @param assumedTruth The assumed truth of the tree (relevant for natural logic inference). If in * doubt, pass in true. * @param isClauseClassifier The classifier for whether a given dependency arc should be a new * clause. If this is not given, all arcs are treated as clause separators. * @param featurizer The featurizer for the classifier. If no featurizer is given, one should be * given in {@link ClauseSplitterSearchProblem#search(java.util.function.Predicate, * Classifier, Map, java.util.function.Function, int)}, or else the classifier will be * useless. * @see ClauseSplitter#load(String) */ protected ClauseSplitterSearchProblem( SemanticGraph tree, boolean assumedTruth, Optional<Classifier<ClauseSplitter.ClauseClassifierLabel, String>> isClauseClassifier, Optional< Function< Triple< ClauseSplitterSearchProblem.State, ClauseSplitterSearchProblem.Action, ClauseSplitterSearchProblem.State>, Counter<String>>> featurizer) { this.tree = new SemanticGraph(tree); this.assumedTruth = assumedTruth; this.isClauseClassifier = isClauseClassifier; this.featurizer = featurizer; // Index edges this.tree.edgeIterable().forEach(edgeToIndex::addToIndex); // Get length List<IndexedWord> sortedVertices = tree.vertexListSorted(); sentenceLength = sortedVertices.get(sortedVertices.size() - 1).index(); // Register extra edges for (IndexedWord vertex : sortedVertices) { extraEdgesByGovernor.put(vertex, new ArrayList<>()); extraEdgesByDependent.put(vertex, new ArrayList<>()); } List<SemanticGraphEdge> extraEdges = Util.cleanTree(this.tree); assert Util.isTree(this.tree); for (SemanticGraphEdge edge : extraEdges) { extraEdgesByGovernor.get(edge.getGovernor()).add(edge); extraEdgesByDependent.get(edge.getDependent()).add(edge); } }
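// Usage note (hedged sketch): as the Javadoc above says, most callers should obtain a searcher
// through ClauseSplitter.load(String) rather than this constructor. The model path below and the
// BiFunction-style apply(...) call reflect the usual CoreNLP setup, but treat both as assumptions
// to verify against your CoreNLP version.
import edu.stanford.nlp.naturalli.ClauseSplitter;
import edu.stanford.nlp.naturalli.ClauseSplitterSearchProblem;
import edu.stanford.nlp.naturalli.SentenceFragment;
import edu.stanford.nlp.semgraph.SemanticGraph;
import java.io.IOException;
import java.util.List;

class ClauseSplitterSketch {
  static List<SentenceFragment> clausesOf(SemanticGraph dependencyTree) throws IOException {
    // Assumed model path; substitute the serialized model you actually ship.
    ClauseSplitter splitter =
        ClauseSplitter.load("edu/stanford/nlp/models/naturalli/clauseSearcherModel.ser.gz");
    // ClauseSplitter applies as (tree, assumedTruth) -> ClauseSplitterSearchProblem.
    ClauseSplitterSearchProblem problem = splitter.apply(dependencyTree, true);
    return problem.topClauses(0.5); // keep clauses whose probability is at least 0.5
  }
}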
@Override public Set<Requirement> requires() { if (annotators.isEmpty()) { return Collections.emptySet(); } return annotators.get(0).requires(); }
/** * The basic method for splitting off a clause of a tree. This modifies the tree in place. * * @param tree The tree to split a clause from. * @param toKeep The edge representing the clause to keep. */ static void splitToChildOfEdge(SemanticGraph tree, SemanticGraphEdge toKeep) { Queue<IndexedWord> fringe = new LinkedList<>(); List<IndexedWord> nodesToRemove = new ArrayList<>(); // Find nodes to remove // (from the root) for (IndexedWord root : tree.getRoots()) { nodesToRemove.add(root); for (SemanticGraphEdge out : tree.outgoingEdgeIterable(root)) { if (!out.equals(toKeep)) { fringe.add(out.getDependent()); } } } // (recursively) while (!fringe.isEmpty()) { IndexedWord node = fringe.poll(); nodesToRemove.add(node); for (SemanticGraphEdge out : tree.outgoingEdgeIterable(node)) { if (!out.equals(toKeep)) { fringe.add(out.getDependent()); } } } // Remove nodes nodesToRemove.forEach(tree::removeVertex); // Set new root tree.setRoot(toKeep.getDependent()); }
// TODO: roll check into tokens regex pattern? // That allows for better matching because unmatched sequences will be eliminated at match time private boolean checkPosTags(List<CoreLabel> tokens, int start, int end) { if (validPosPattern != null) { // Need to check POS tag too... switch (posMatchType) { case MATCH_ONE_TOKEN_PHRASE_ONLY: if (tokens.size() > 1) return true; // fall through case MATCH_AT_LEAST_ONE_TOKEN: for (int i = start; i < end; i++) { CoreLabel token = tokens.get(i); String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class); if (pos != null && validPosPattern.matcher(pos).matches()) { return true; } } return false; case MATCH_ALL_TOKENS: // Checked elsewhere return true; default: // Don't know this match type... return true; } } return true; }
private List<Tree> helper(List<Tree> treeList, int start) { List<Tree> newTreeList = new ArrayList<Tree>(treeList.size()); for (Tree tree : treeList) { int end = start + tree.yield().size(); newTreeList.add(prune(tree, start)); start = end; } return newTreeList; }
private static List<TaggedWord> cleanTags(List twList, TreebankLanguagePack tlp) { int sz = twList.size(); List<TaggedWord> l = new ArrayList<TaggedWord>(sz); for (int i = 0; i < sz; i++) { TaggedWord tw = (TaggedWord) twList.get(i); TaggedWord tw2 = new TaggedWord(tw.word(), tlp.basicCategory(tw.tag())); l.add(tw2); } return l; }
public AnnotationPipeline(List<Annotator> annotators) { this.annotators = annotators; if (TIME) { int num = annotators.size(); accumulatedTime = new ArrayList<MutableLong>(num); for (int i = 0; i < num; i++) { accumulatedTime.add(new MutableLong()); } } }
List<Tree> prune(List<Tree> treeList, Label label, int start, int end) { // get reference tree if (treeList.size() == 1) { return treeList; } Tree testTree = treeList.get(0).treeFactory().newTreeNode(label, treeList); int goal = Numberer.getGlobalNumberer("states").number(label.value()); Tree tempTree = parser.extractBestParse(goal, start, end); // parser.restoreUnaries(tempTree); Tree pcfgTree = debinarizer.transformTree(tempTree); Set<Constituent> pcfgConstituents = pcfgTree.constituents(new LabeledScoredConstituentFactory()); // delete child labels that are not in reference but do not cross reference List<Tree> prunedChildren = new ArrayList<Tree>(); int childStart = 0; for (int c = 0, numCh = testTree.numChildren(); c < numCh; c++) { Tree child = testTree.getChild(c); boolean isExtra = true; int childEnd = childStart + child.yield().size(); Constituent childConstituent = new LabeledScoredConstituent(childStart, childEnd, child.label(), 0); if (pcfgConstituents.contains(childConstituent)) { isExtra = false; } if (childConstituent.crosses(pcfgConstituents)) { isExtra = false; } if (child.isLeaf() || child.isPreTerminal()) { isExtra = false; } if (pcfgTree.yield().size() != testTree.yield().size()) { isExtra = false; } if (!label.value().startsWith("NP^NP")) { isExtra = false; } if (isExtra) { System.err.println( "Pruning: " + child.label() + " from " + (childStart + start) + " to " + (childEnd + start)); System.err.println("Was: " + testTree + " vs " + pcfgTree); prunedChildren.addAll(child.getChildrenAsList()); } else { prunedChildren.add(child); } childStart = childEnd; } return prunedChildren; }
protected String historyToString(List history) { String str = (String) historyToString.get(history); if (str == null) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < history.size(); i++) { sb.append('^'); sb.append(history.get(i)); } str = sb.toString(); historyToString.put(history, str); } return str; }
private List<CoreMap> toCoreMaps( CoreMap annotation, List<TimeExpression> timeExpressions, SUTime.TimeIndex timeIndex) { if (timeExpressions == null) return null; List<CoreMap> coreMaps = new ArrayList<CoreMap>(timeExpressions.size()); for (TimeExpression te : timeExpressions) { CoreMap cm = te.getAnnotation(); SUTime.Temporal temporal = te.getTemporal(); if (temporal != null) { String origText = annotation.get(CoreAnnotations.TextAnnotation.class); String text = cm.get(CoreAnnotations.TextAnnotation.class); if (origText != null) { // Make sure the text is from original (and not from concatenated tokens) ChunkAnnotationUtils.annotateChunkText(cm, annotation); text = cm.get(CoreAnnotations.TextAnnotation.class); } Map<String, String> timexAttributes; try { timexAttributes = temporal.getTimexAttributes(timeIndex); if (options.includeRange) { SUTime.Temporal rangeTemporal = temporal.getRange(); if (rangeTemporal != null) { timexAttributes.put("range", rangeTemporal.toString()); } } } catch (Exception e) { logger.log( Level.WARNING, "Failed to get attributes from " + text + ", timeIndex " + timeIndex, e); continue; } Timex timex; try { timex = Timex.fromMap(text, timexAttributes); } catch (Exception e) { logger.log( Level.WARNING, "Failed to process " + text + " with attributes " + timexAttributes, e); continue; } cm.set(TimexAnnotation.class, timex); if (timex != null) { coreMaps.add(cm); } else { logger.warning("No timex expression for: " + text); } } } return coreMaps; }
/** * Strips aux and mark edges when we are splitting into a clause. * * @param toModify The tree we are stripping the edges from. */ private void stripAuxMark(SemanticGraph toModify) { List<SemanticGraphEdge> toClean = new ArrayList<>(); for (SemanticGraphEdge edge : toModify.outgoingEdgeIterable(toModify.getFirstRoot())) { String rel = edge.getRelation().toString(); if (("aux".equals(rel) || "mark".equals(rel)) && !toModify.outgoingEdgeIterator(edge.getDependent()).hasNext()) { toClean.add(edge); } } for (SemanticGraphEdge edge : toClean) { toModify.removeEdge(edge); toModify.removeVertex(edge.getDependent()); } }
public Object formResult() { Set brs = new HashSet(); Set urs = new HashSet(); // scan each rule / history pair int ruleCount = 0; for (Iterator pairI = rulePairs.keySet().iterator(); pairI.hasNext(); ) { if (ruleCount % 100 == 0) { System.err.println("Rules multiplied: " + ruleCount); } ruleCount++; Pair rulePair = (Pair) pairI.next(); Rule baseRule = (Rule) rulePair.first; String baseLabel = (String) ruleToLabel.get(baseRule); List history = (List) rulePair.second; double totalProb = 0; for (int depth = 1; depth <= HISTORY_DEPTH() && depth <= history.size(); depth++) { List subHistory = history.subList(0, depth); double c_label = labelPairs.getCount(new Pair(baseLabel, subHistory)); double c_rule = rulePairs.getCount(new Pair(baseRule, subHistory)); // System.out.println("Multiplying out "+baseRule+" with history "+subHistory); // System.out.println("Count of "+baseLabel+" with "+subHistory+" is "+c_label); // System.out.println("Count of "+baseRule+" with "+subHistory+" is "+c_rule ); double prob = (1.0 / HISTORY_DEPTH()) * (c_rule) / (c_label); totalProb += prob; for (int childDepth = 0; childDepth <= Math.min(HISTORY_DEPTH() - 1, depth); childDepth++) { Rule rule = specifyRule(baseRule, subHistory, childDepth); rule.score = (float) Math.log(totalProb); // System.out.println("Created "+rule+" with score "+rule.score); if (rule instanceof UnaryRule) { urs.add(rule); } else { brs.add(rule); } } } } System.out.println("Total states: " + stateNumberer.total()); BinaryGrammar bg = new BinaryGrammar(stateNumberer.total()); UnaryGrammar ug = new UnaryGrammar(stateNumberer.total()); for (Iterator brI = brs.iterator(); brI.hasNext(); ) { BinaryRule br = (BinaryRule) brI.next(); bg.addRule(br); } for (Iterator urI = urs.iterator(); urI.hasNext(); ) { UnaryRule ur = (UnaryRule) urI.next(); ug.addRule(ur); } return new Pair(ug, bg); }
public static final String doCorefResolution(Annotation annotation) { Map<Integer, CorefChain> corefs = annotation.get(CorefChainAnnotation.class); List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); List<String> resolved = new ArrayList<String>(); for (CoreMap sentence : sentences) { List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); for (CoreLabel token : tokens) { Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class); CorefChain chain = corefs.get(corefClustId); if (chain == null) resolved.add(token.word()); else { int sentINdx = chain.getRepresentativeMention().sentNum - 1; CoreMap corefSentence = sentences.get(sentINdx); List<CoreLabel> corefSentenceTokens = corefSentence.get(TokensAnnotation.class); CorefMention reprMent = chain.getRepresentativeMention(); if (token.index() < reprMent.startIndex || token.index() > reprMent.endIndex) { for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) { CoreLabel matchedLabel = corefSentenceTokens.get(i - 1); resolved.add(matchedLabel.word()); } } else resolved.add(token.word()); } } } String resolvedStr = ""; System.out.println(); for (String str : resolved) { resolvedStr += str + " "; } System.out.println(resolvedStr); return resolvedStr; }
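// Usage note (hedged sketch): doCorefResolution above assumes the Annotation already carries
// coreference chains, so it must first be run through a pipeline that includes the coref stack.
// The annotator list below is the standard dcoref configuration; the demo class name and the
// sample text are made up for illustration.
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.util.Properties;

class CorefPipelineDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation annotation = new Annotation("John drove to Judy's house. He made her dinner.");
    pipeline.annotate(annotation);
    // The annotation now carries CorefChainAnnotation and per-token CorefClusterIdAnnotation,
    // which is exactly the state doCorefResolution(annotation) above expects.
  }
}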
protected void tallyInternalNode(Tree lt, List parents) { // form base rule String label = lt.label().value(); Rule baseR = ltToRule(lt); ruleToLabel.put(baseR, label); // act on each history depth for (int depth = 0, maxDepth = Math.min(HISTORY_DEPTH(), parents.size()); depth <= maxDepth; depth++) { List history = new ArrayList(parents.subList(0, depth)); // tally each history level / rewrite pair rulePairs.incrementCount(new Pair(baseR, history), 1); labelPairs.incrementCount(new Pair(label, history), 1); } }
private List<TimeExpression> filterInvalidTimeExpressions(List<TimeExpression> timeExprs) { int nfiltered = 0; List<TimeExpression> filtered = new ArrayList<TimeExpression>(timeExprs.size()); // Approximate size for (TimeExpression timeExpr : timeExprs) { if (timexPatterns.checkTimeExpression(timeExpr)) { filtered.add(timeExpr); } else { nfiltered++; } } if (nfiltered > 0) { logger.finest("Filtered " + nfiltered); } return filtered; }
private void annotateMatched(List<CoreLabel> tokens) { List<SequenceMatchResult<CoreMap>> matched = multiPatternMatcher.findNonOverlapping(tokens); for (SequenceMatchResult<CoreMap> m : matched) { Entry entry = patternToEntry.get(m.pattern()); // Check if we will overwrite the existing annotation with this annotation int g = entry.annotateGroup; int start = m.start(g); int end = m.end(g); boolean overwriteOriginalNer = checkPosTags(tokens, start, end); if (overwriteOriginalNer) { overwriteOriginalNer = checkOrigNerTags(entry, tokens, start, end); } if (overwriteOriginalNer) { for (int i = start; i < end; i++) { tokens.get(i).set(CoreAnnotations.NamedEntityTagAnnotation.class, entry.type); } } else { if (verbose) { System.err.println( "Not annotating '" + m.group(g) + "': " + StringUtils.joinFields( m.groupNodes(g), CoreAnnotations.NamedEntityTagAnnotation.class) + " with " + entry.type + ", sentence is '" + StringUtils.joinWords(tokens, " ") + '\''); } } } }
public SUTime.Temporal apply(MatchResult in) { if (in instanceof SequenceMatchResult) { SequenceMatchResult<CoreMap> mr = (SequenceMatchResult<CoreMap>) (in); if (group >= 0) { List<? extends CoreMap> matched = mr.groupNodes(group); if (matched != null) { int i = (nodeIndex >= 0) ? 0 : (matched.size() + nodeIndex); TimeExpression te = getTimeExpression(matched, i); if (te != null) { return te.getTemporal(); } } } } return null; }
/** Re-order the action space based on the specified order of names. */ private Collection<Action> orderActions(Collection<Action> actionSpace, List<String> order) { List<Action> tmp = new ArrayList<>(actionSpace); List<Action> out = new ArrayList<>(); for (String key : order) { Iterator<Action> iter = tmp.iterator(); while (iter.hasNext()) { Action a = iter.next(); if (a.signature().equals(key)) { out.add(a); iter.remove(); } } } out.addAll(tmp); return out; }
/** * Creates a combined list of Entries using the provided mapping files. * * @param mappings List of mapping files * @return list of Entries */ private static List<Entry> readEntries( String annotatorName, Set<String> noDefaultOverwriteLabels, boolean ignoreCase, boolean verbose, String... mappings) { // Unlike RegexNERClassifier, we don't bother sorting the entries // We leave it to TokensRegex NER to sort out the priorities and matches // (typically after all the matches have been made, since for some TokensRegex expressions // we don't know how many tokens are matched until after the matching is done) List<Entry> entries = new ArrayList<>(); TrieMap<String, Entry> seenRegexes = new TrieMap<>(); Arrays.sort(mappings); for (String mapping : mappings) { BufferedReader rd = null; try { rd = IOUtils.readerFromString(mapping); readEntries( annotatorName, entries, seenRegexes, mapping, rd, noDefaultOverwriteLabels, ignoreCase, verbose); } catch (IOException e) { throw new RuntimeIOException("Couldn't read TokensRegexNER from " + mapping, e); } finally { IOUtils.closeIgnoringExceptions(rd); } } if (mappings.length != 1) { logger.log( "TokensRegexNERAnnotator " + annotatorName + ": Read " + entries.size() + " unique entries from " + mappings.length + " files"); } return entries; }
/** * Get the top few clauses from this searcher, cutting off at the given minimum probability. * * @param thresholdProbability The threshold under which to stop returning clauses. This should be * between 0 and 1. * @return The resulting {@link edu.stanford.nlp.naturalli.SentenceFragment} objects, representing * the top clauses of the sentence. */ public List<SentenceFragment> topClauses(double thresholdProbability) { List<SentenceFragment> results = new ArrayList<>(); search( triple -> { assert triple.first <= 0.0; double prob = Math.exp(triple.first); assert prob <= 1.0; assert prob >= 0.0; assert !Double.isNaN(prob); if (prob >= thresholdProbability) { SentenceFragment fragment = triple.third.get(); fragment.score = prob; results.add(fragment); return true; } else { return false; } }); return results; }
protected Rule specifyRule(Rule rule, List history, int childDepth) { Rule r; String topHistoryStr = historyToString(history.subList(1, history.size())); String bottomHistoryStr = historyToString(history.subList(0, childDepth)); if (rule instanceof UnaryRule) { UnaryRule ur = new UnaryRule(); UnaryRule urule = (UnaryRule) rule; ur.parent = stateNumberer.number(stateNumberer.object(urule.parent) + topHistoryStr); if (isSynthetic(urule.child)) { ur.child = stateNumberer.number(stateNumberer.object(urule.child) + topHistoryStr); } else if (isTag(urule.child)) { ur.child = urule.child; } else { ur.child = stateNumberer.number(stateNumberer.object(urule.child) + bottomHistoryStr); } r = ur; } else { BinaryRule br = new BinaryRule(); BinaryRule brule = (BinaryRule) rule; br.parent = stateNumberer.number(stateNumberer.object(brule.parent) + topHistoryStr); if (isSynthetic(brule.leftChild)) { br.leftChild = stateNumberer.number(stateNumberer.object(brule.leftChild) + topHistoryStr); } else if (isTag(brule.leftChild)) { br.leftChild = brule.leftChild; } else { br.leftChild = stateNumberer.number(stateNumberer.object(brule.leftChild) + bottomHistoryStr); } if (isSynthetic(brule.rightChild)) { br.rightChild = stateNumberer.number(stateNumberer.object(brule.rightChild) + topHistoryStr); } else if (isTag(brule.rightChild)) { br.rightChild = brule.rightChild; } else { br.rightChild = stateNumberer.number(stateNumberer.object(brule.rightChild) + bottomHistoryStr); } r = br; } return r; }
public List<TimeExpression> extractTimeExpressions(CoreMap annotation, String docDateStr) { List<CoreMap> mergedNumbers = NumberNormalizer.findAndMergeNumbers(annotation); annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, mergedNumbers); // TODO: docDate may not have century.... SUTime.Time docDate = timexPatterns.parseDateTime(docDateStr); List<? extends MatchedExpression> matchedExpressions = expressionExtractor.extractExpressions(annotation); List<TimeExpression> timeExpressions = new ArrayList<TimeExpression>(matchedExpressions.size()); for (MatchedExpression expr : matchedExpressions) { if (expr instanceof TimeExpression) { timeExpressions.add((TimeExpression) expr); } else { timeExpressions.add(new TimeExpression(expr)); } } // Add back nested time expressions for ranges.... // For now only one level of nesting... if (options.includeNested) { List<TimeExpression> nestedTimeExpressions = new ArrayList<TimeExpression>(); for (TimeExpression te : timeExpressions) { if (te.isIncludeNested()) { List<? extends CoreMap> children = te.getAnnotation().get(TimeExpression.ChildrenAnnotation.class); if (children != null) { for (CoreMap child : children) { TimeExpression childTe = child.get(TimeExpression.Annotation.class); if (childTe != null) { nestedTimeExpressions.add(childTe); } } } } } timeExpressions.addAll(nestedTimeExpressions); } Collections.sort(timeExpressions, MatchedExpression.EXPR_TOKEN_OFFSETS_NESTED_FIRST_COMPARATOR); timeExpressions = filterInvalidTimeExpressions(timeExpressions); // Some resolving is done even if docDate null... if ( /*docDate != null && */ timeExpressions != null) { resolveTimeExpressions(annotation, timeExpressions, docDate); } // Annotate timex return timeExpressions; }
private MultiPatternMatcher<CoreMap> createPatternMatcher( Map<SequencePattern<CoreMap>, Entry> patternToEntry) { // Convert to tokensregex pattern int patternFlags = ignoreCase ? Pattern.CASE_INSENSITIVE : 0; int stringMatchFlags = ignoreCase ? NodePattern.CASE_INSENSITIVE : 0; Env env = TokenSequencePattern.getNewEnv(); env.setDefaultStringPatternFlags(patternFlags); env.setDefaultStringMatchFlags(stringMatchFlags); NodePattern<String> posTagPattern = (validPosPattern != null && PosMatchType.MATCH_ALL_TOKENS.equals(posMatchType)) ? new CoreMapNodePattern.StringAnnotationRegexPattern(validPosPattern) : null; List<TokenSequencePattern> patterns = new ArrayList<>(entries.size()); for (Entry entry : entries) { TokenSequencePattern pattern; if (entry.tokensRegex != null) { // TODO: posTagPatterns... pattern = TokenSequencePattern.compile(env, entry.tokensRegex); } else { List<SequencePattern.PatternExpr> nodePatterns = new ArrayList<>(); for (String p : entry.regex) { CoreMapNodePattern c = CoreMapNodePattern.valueOf(p, patternFlags); if (posTagPattern != null) { c.add(CoreAnnotations.PartOfSpeechAnnotation.class, posTagPattern); } nodePatterns.add(new SequencePattern.NodePatternExpr(c)); } pattern = TokenSequencePattern.compile(new SequencePattern.SequencePatternExpr(nodePatterns)); } if (entry.annotateGroup < 0 || entry.annotateGroup > pattern.getTotalGroups()) { throw new RuntimeException("Invalid match group for entry " + entry); } pattern.setPriority(entry.priority); patterns.add(pattern); patternToEntry.put(pattern, entry); } return TokenSequencePattern.getMultiPatternMatcher(patterns); }
/** * Return a String that gives detailed human-readable information about how much time was spent by * each annotator and by the entire annotation pipeline. This String includes newline characters * but does not end with one, and so it is suitable to be printed out with a {@code println()}. * * @return Human readable information on time spent in processing. */ public String timingInformation() { StringBuilder sb = new StringBuilder(); if (TIME) { sb.append("Annotation pipeline timing information:\n"); Iterator<MutableLong> it = accumulatedTime.iterator(); long total = 0; for (Annotator annotator : annotators) { MutableLong m = it.next(); sb.append(StringUtils.getShortClassName(annotator)).append(": "); sb.append(Timing.toSecondsString(m.longValue())).append(" sec.\n"); total += m.longValue(); } sb.append("TOTAL: ").append(Timing.toSecondsString(total)).append(" sec."); } return sb.toString(); }
/** * Run the pipeline on an input annotation. The annotation is modified in place. * * @param annotation The input annotation, usually a raw document */ @Override public void annotate(Annotation annotation) { Iterator<MutableLong> it = accumulatedTime.iterator(); Timing t = new Timing(); for (Annotator annotator : annotators) { if (TIME) { t.start(); } annotator.annotate(annotation); if (TIME) { long elapsed = t.stop(); MutableLong m = it.next(); m.incValue(elapsed); } } }
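// Usage note (hedged sketch): the simplest way to exercise annotate() and the TIME bookkeeping is
// through StanfordCoreNLP, which extends AnnotationPipeline; the annotator names below are the
// standard ones and the sample text is arbitrary.
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.util.Properties;

class PipelineTimingDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("The quick brown fox jumps over the lazy dog.");
    pipeline.annotate(document);                       // runs each annotator in order, in place
    System.out.println(pipeline.timingInformation());  // per-annotator timing from the pipeline
  }
}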
public void addAnnotator(Annotator annotator) { annotators.add(annotator); if (TIME) { accumulatedTime.add(new MutableLong()); } }
/** * Reads a list of Entries from a mapping file and updates the given entries. Line numbers start * from 1. * * @return the updated list of Entries */ private static List<Entry> readEntries( String annotatorName, List<Entry> entries, TrieMap<String, Entry> seenRegexes, String mappingFilename, BufferedReader mapping, Set<String> noDefaultOverwriteLabels, boolean ignoreCase, boolean verbose) throws IOException { int origEntriesSize = entries.size(); int isTokensRegex = 0; int lineCount = 0; for (String line; (line = mapping.readLine()) != null; ) { lineCount++; String[] split = line.split("\t"); if (split.length < 2 || split.length > 5) { throw new IllegalArgumentException( "Provided mapping file is in wrong format. This line is bad: " + line); } String regex = split[0].trim(); String tokensRegex = null; String[] regexes = null; if (regex.startsWith("( ") && regex.endsWith(" )")) { // Tokens regex (remove start and end parenthesis) tokensRegex = regex.substring(1, regex.length() - 1).trim(); } else { regexes = regex.split("\\s+"); } String[] key = (regexes != null) ? regexes : new String[] {tokensRegex}; if (ignoreCase) { String[] norm = new String[key.length]; for (int i = 0; i < key.length; i++) { norm[i] = key[i].toLowerCase(); } key = norm; } String type = split[1].trim(); Set<String> overwritableTypes = Generics.newHashSet(); double priority = 0.0; if (split.length >= 3) { overwritableTypes.addAll(Arrays.asList(split[2].trim().split("\\s*,\\s*"))); } if (split.length >= 4) { try { priority = Double.parseDouble(split[3].trim()); } catch (NumberFormatException e) { throw new IllegalArgumentException( "ERROR: Invalid priority in line " + lineCount + " in regexner file " + mappingFilename + ": \"" + line + "\"!", e); } } int annotateGroup = 0; // Get annotate group from input.... if (split.length >= 5) { // Which group to take (allow for context) String context = split[4].trim(); try { annotateGroup = Integer.parseInt(context); } catch (NumberFormatException e) { throw new IllegalArgumentException( "ERROR: Invalid group in line " + lineCount + " in regexner file " + mappingFilename + ": \"" + line + "\"!", e); } } // Print some warning about the type int commaPos = type.indexOf(','); if (commaPos > 0) { // Strip the "," and just take first type String newType = type.substring(0, commaPos).trim(); logger.warn( "TokensRegexNERAnnotator " + annotatorName + ": Entry has multiple types: " + line + ". Taking type to be " + newType); type = newType; } Entry entry = new Entry(tokensRegex, regexes, type, overwritableTypes, priority, annotateGroup); if (seenRegexes.containsKey(key)) { Entry oldEntry = seenRegexes.get(key); if (priority > oldEntry.priority) { logger.warn( "TokensRegexNERAnnotator " + annotatorName + ": Replace duplicate entry (higher priority): old=" + oldEntry + ", new=" + entry); } else { if (!oldEntry.type.equals(type)) { if (verbose) { logger.warn( "TokensRegexNERAnnotator " + annotatorName + ": Ignoring duplicate entry: " + split[0] + ", old type = " + oldEntry.type + ", new type = " + type); } // } else { // if (verbose) { // logger.warn("TokensRegexNERAnnotator " + annotatorName + // ": Duplicate entry [ignored]: " + split[0] + ", old type = " + // oldEntry.type + ", new type = " + type); // } } continue; } } // Print some warning if label belongs to noDefaultOverwriteLabels but there is no // overwritable types if (entry.overwritableTypes.isEmpty() && noDefaultOverwriteLabels.contains(entry.type)) { logger.warn( "TokensRegexNERAnnotator " + annotatorName + ": Entry doesn't have overwriteable types " + entry + ", but entry type is in noDefaultOverwriteLabels"); } entries.add(entry); seenRegexes.put(key, entry); if (entry.tokensRegex != null) isTokensRegex++; } logger.log( "TokensRegexNERAnnotator " + annotatorName + ": Read " + (entries.size() - origEntriesSize) + " unique entries out of " + lineCount + " from " + mappingFilename + ", " + isTokensRegex + " TokensRegex patterns."); return entries; }
private static TimeExpression getTimeExpression(List<? extends CoreMap> list, int index) { return list.get(index).get(TimeExpression.Annotation.class); }
public static void main(String[] args) { Options op = new Options(new EnglishTreebankParserParams()); // op.tlpParams may be changed to something else later, so don't use it till // after options are parsed. System.out.println("Currently " + new Date()); System.out.print("Invoked with arguments:"); for (String arg : args) { System.out.print(" " + arg); } System.out.println(); String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj"; int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219; String serializeFile = null; int i = 0; while (i < args.length && args[i].startsWith("-")) { if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) { path = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) { trainLow = Integer.parseInt(args[i + 1]); trainHigh = Integer.parseInt(args[i + 2]); i += 3; } else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) { testLow = Integer.parseInt(args[i + 1]); testHigh = Integer.parseInt(args[i + 2]); i += 3; } else if (args[i].equalsIgnoreCase("-serialize") && (i + 1 < args.length)) { serializeFile = args[i + 1]; i += 2; } else if (args[i].equalsIgnoreCase("-tLPP") && (i + 1 < args.length)) { try { op.tlpParams = (TreebankLangParserParams) Class.forName(args[i + 1]).newInstance(); } catch (ClassNotFoundException e) { System.err.println("Class not found: " + args[i + 1]); } catch (InstantiationException e) { System.err.println("Couldn't instantiate: " + args[i + 1] + ": " + e.toString()); } catch (IllegalAccessException e) { System.err.println("illegal access" + e); } i += 2; } else if (args[i].equals("-encoding")) { // sets encoding for TreebankLangParserParams op.tlpParams.setInputEncoding(args[i + 1]); op.tlpParams.setOutputEncoding(args[i + 1]); i += 2; } else { i = op.setOptionOrWarn(args, i); } } // System.out.println(tlpParams.getClass()); TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack(); Train.sisterSplitters = new HashSet(Arrays.asList(op.tlpParams.sisterSplitters())); // BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams); PrintWriter pw = op.tlpParams.pw(); Test.display(); Train.display(); op.display(); op.tlpParams.display(); // setup tree transforms Treebank trainTreebank = op.tlpParams.memoryTreebank(); MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank(); // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank(); // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/"; // blippTreebank.loadPath(blippPath, "", true); Timing.startTime(); System.err.print("Reading trees..."); testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true)); if (Test.increasingLength) { Collections.sort(testTreebank, new TreeLengthComparator()); } trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true)); Timing.tick("done."); System.err.print("Binarizing trees..."); TreeAnnotatorAndBinarizer binarizer = null; if (!Train.leftToRight) { binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !Train.outsideFactor(), true); } else { binarizer = new TreeAnnotatorAndBinarizer( op.tlpParams.headFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, !Train.outsideFactor(), true); } CollinsPuncTransformer collinsPuncTransformer = null; if (Train.collinsPunc) { collinsPuncTransformer = new CollinsPuncTransformer(tlp); } TreeTransformer debinarizer = new Debinarizer(op.forceCNF); List<Tree> binaryTrainTrees = new ArrayList<Tree>(); if (Train.selectiveSplit) { 
Train.splitters = ParentAnnotationStats.getSplitCategories( trainTreebank, Train.tagSelectiveSplit, 0, Train.selectiveSplitCutOff, Train.tagSelectiveSplitCutOff, op.tlpParams.treebankLanguagePack()); if (Train.deleteSplitters != null) { List<String> deleted = new ArrayList<String>(); for (String del : Train.deleteSplitters) { String baseDel = tlp.basicCategory(del); boolean checkBasic = del.equals(baseDel); for (Iterator<String> it = Train.splitters.iterator(); it.hasNext(); ) { String elem = it.next(); String baseElem = tlp.basicCategory(elem); boolean delStr = checkBasic && baseElem.equals(baseDel) || elem.equals(del); if (delStr) { it.remove(); deleted.add(elem); } } } System.err.println("Removed from vertical splitters: " + deleted); } } if (Train.selectivePostSplit) { TreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams); Treebank annotatedTB = trainTreebank.transform(myTransformer); Train.postSplitters = ParentAnnotationStats.getSplitCategories( annotatedTB, true, 0, Train.selectivePostSplitCutOff, Train.tagSelectivePostSplitCutOff, op.tlpParams.treebankLanguagePack()); } if (Train.hSelSplit) { binarizer.setDoSelectiveSplit(false); for (Tree tree : trainTreebank) { if (Train.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } // tree.pennPrint(tlpParams.pw()); tree = binarizer.transformTree(tree); // binaryTrainTrees.add(tree); } binarizer.setDoSelectiveSplit(true); } for (Tree tree : trainTreebank) { if (Train.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } tree = binarizer.transformTree(tree); binaryTrainTrees.add(tree); } if (Test.verbose) { binarizer.dumpStats(); } List<Tree> binaryTestTrees = new ArrayList<Tree>(); for (Tree tree : testTreebank) { if (Train.collinsPunc) { tree = collinsPuncTransformer.transformTree(tree); } tree = binarizer.transformTree(tree); binaryTestTrees.add(tree); } Timing.tick("done."); // binarization BinaryGrammar bg = null; UnaryGrammar ug = null; DependencyGrammar dg = null; // DependencyGrammar dgBLIPP = null; Lexicon lex = null; // extract grammars Extractor bgExtractor = new BinaryGrammarExtractor(); // Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor(); // Extractor lexExtractor = new LexiconExtractor(); // Extractor dgExtractor = new DependencyMemGrammarExtractor(); Extractor dgExtractor = new MLEDependencyGrammarExtractor(op); if (op.doPCFG) { System.err.print("Extracting PCFG..."); Pair bgug = null; if (Train.cheatPCFG) { List allTrees = new ArrayList(binaryTrainTrees); allTrees.addAll(binaryTestTrees); bgug = (Pair) bgExtractor.extract(allTrees); } else { bgug = (Pair) bgExtractor.extract(binaryTrainTrees); } bg = (BinaryGrammar) bgug.second; bg.splitRules(); ug = (UnaryGrammar) bgug.first; ug.purgeRules(); Timing.tick("done."); } System.err.print("Extracting Lexicon..."); lex = op.tlpParams.lex(op.lexOptions); lex.train(binaryTrainTrees); Timing.tick("done."); if (op.doDep) { System.err.print("Extracting Dependencies..."); binaryTrainTrees.clear(); // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new // TransformTreeDependency(tlpParams,true)); DependencyGrammar dg1 = (DependencyGrammar) dgExtractor.extract( trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true)); // dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new // TransformTreeDependency(tlpParams)); // dg = (DependencyGrammar) 
dgExtractor.extract(new // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new // TransformTreeDependency(tlpParams)); // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2); // dg = (DependencyGrammar) dgExtractor.extract(binaryTrainTrees); //uses information whether // the words are known or not, discards unknown words Timing.tick("done."); // System.out.print("Extracting Unknown Word Model..."); // UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees); // Timing.tick("done."); System.out.print("Tuning Dependency Model..."); dg.tune(binaryTestTrees); // System.out.println("TUNE DEPS: "+tuneDeps); Timing.tick("done."); } BinaryGrammar boundBG = bg; UnaryGrammar boundUG = ug; GrammarProjection gp = new NullGrammarProjection(bg, ug); // serialization if (serializeFile != null) { System.err.print("Serializing parser..."); LexicalizedParser.saveParserDataToSerialized( new ParserData(lex, bg, ug, dg, Numberer.getNumberers(), op), serializeFile); Timing.tick("done."); } // test: pcfg-parse and output ExhaustivePCFGParser parser = null; if (op.doPCFG) { parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op); } ExhaustiveDependencyParser dparser = ((op.doDep && !Test.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op) : null); Scorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp), dparser) : null); // Scorer scorer = parser; BiLexPCFGParser bparser = null; if (op.doPCFG && op.doDep) { bparser = (Test.useN5) ? new BiLexPCFGParser.N5BiLexPCFGParser( scorer, parser, dparser, bg, ug, dg, lex, op, gp) : new BiLexPCFGParser(scorer, parser, dparser, bg, ug, dg, lex, op, gp); } LabeledConstituentEval pcfgPE = new LabeledConstituentEval("pcfg PE", true, tlp); LabeledConstituentEval comboPE = new LabeledConstituentEval("combo PE", true, tlp); AbstractEval pcfgCB = new LabeledConstituentEval.CBEval("pcfg CB", true, tlp); AbstractEval pcfgTE = new AbstractEval.TaggingEval("pcfg TE"); AbstractEval comboTE = new AbstractEval.TaggingEval("combo TE"); AbstractEval pcfgTEnoPunct = new AbstractEval.TaggingEval("pcfg nopunct TE"); AbstractEval comboTEnoPunct = new AbstractEval.TaggingEval("combo nopunct TE"); AbstractEval depTE = new AbstractEval.TaggingEval("depnd TE"); AbstractEval depDE = new AbstractEval.DependencyEval("depnd DE", true, tlp.punctuationWordAcceptFilter()); AbstractEval comboDE = new AbstractEval.DependencyEval("combo DE", true, tlp.punctuationWordAcceptFilter()); if (Test.evalb) { EvalB.initEVALBfiles(op.tlpParams); } // int[] countByLength = new int[Test.maxLength+1]; // use a reflection ruse, so one can run this without needing the tagger // edu.stanford.nlp.process.SentenceTagger tagger = (Test.preTag ? 
new // edu.stanford.nlp.process.SentenceTagger("/u/nlp/data/tagger.params/wsj0-21.holder") : null); SentenceProcessor tagger = null; if (Test.preTag) { try { Class[] argsClass = new Class[] {String.class}; Object[] arguments = new Object[] {"/u/nlp/data/pos-tagger/wsj3t0-18-bidirectional/train-wsj-0-18.holder"}; tagger = (SentenceProcessor) Class.forName("edu.stanford.nlp.tagger.maxent.MaxentTagger") .getConstructor(argsClass) .newInstance(arguments); } catch (Exception e) { System.err.println(e); System.err.println("Warning: No pretagging of sentences will be done."); } } for (int tNum = 0, ttSize = testTreebank.size(); tNum < ttSize; tNum++) { Tree tree = testTreebank.get(tNum); int testTreeLen = tree.yield().size(); if (testTreeLen > Test.maxLength) { continue; } Tree binaryTree = binaryTestTrees.get(tNum); // countByLength[testTreeLen]++; System.out.println("-------------------------------------"); System.out.println("Number: " + (tNum + 1)); System.out.println("Length: " + testTreeLen); // tree.pennPrint(pw); // System.out.println("XXXX The binary tree is"); // binaryTree.pennPrint(pw); // System.out.println("Here are the tags in the lexicon:"); // System.out.println(lex.showTags()); // System.out.println("Here's the tagnumberer:"); // System.out.println(Numberer.getGlobalNumberer("tags").toString()); long timeMil1 = System.currentTimeMillis(); Timing.tick("Starting parse."); if (op.doPCFG) { // System.err.println(Test.forceTags); if (Test.forceTags) { if (tagger != null) { // System.out.println("Using a tagger to set tags"); // System.out.println("Tagged sentence as: " + // tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false)); parser.parse(addLast(tagger.processSentence(cutLast(wordify(binaryTree.yield()))))); } else { // System.out.println("Forcing tags to match input."); parser.parse(cleanTags(binaryTree.taggedYield(), tlp)); } } else { // System.out.println("XXXX Parsing " + binaryTree.yield()); parser.parse(binaryTree.yield()); } // Timing.tick("Done with pcfg phase."); } if (op.doDep) { dparser.parse(binaryTree.yield()); // Timing.tick("Done with dependency phase."); } boolean bothPassed = false; if (op.doPCFG && op.doDep) { bothPassed = bparser.parse(binaryTree.yield()); // Timing.tick("Done with combination phase."); } long timeMil2 = System.currentTimeMillis(); long elapsed = timeMil2 - timeMil1; System.err.println("Time: " + ((int) (elapsed / 100)) / 10.00 + " sec."); // System.out.println("PCFG Best Parse:"); Tree tree2b = null; Tree tree2 = null; // System.out.println("Got full best parse..."); if (op.doPCFG) { tree2b = parser.getBestParse(); tree2 = debinarizer.transformTree(tree2b); } // System.out.println("Debinarized parse..."); // tree2.pennPrint(); // System.out.println("DepG Best Parse:"); Tree tree3 = null; Tree tree3db = null; if (op.doDep) { tree3 = dparser.getBestParse(); // was: but wrong Tree tree3db = debinarizer.transformTree(tree2); tree3db = debinarizer.transformTree(tree3); tree3.pennPrint(pw); } // tree.pennPrint(); // ((Tree)binaryTrainTrees.get(tNum)).pennPrint(); // System.out.println("Combo Best Parse:"); Tree tree4 = null; if (op.doPCFG && op.doDep) { try { tree4 = bparser.getBestParse(); if (tree4 == null) { tree4 = tree2b; } } catch (NullPointerException e) { System.err.println("Blocked, using PCFG parse!"); tree4 = tree2b; } } if (op.doPCFG && !bothPassed) { tree4 = tree2b; } // tree4.pennPrint(); if (op.doDep) { depDE.evaluate(tree3, binaryTree, pw); depTE.evaluate(tree3db, tree, pw); } TreeTransformer tc = 
op.tlpParams.collinizer(); TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb(); Tree tree4b = null; if (op.doPCFG) { // System.out.println("XXXX Best PCFG was: "); // tree2.pennPrint(); // System.out.println("XXXX Transformed best PCFG is: "); // tc.transformTree(tree2).pennPrint(); // System.out.println("True Best Parse:"); // tree.pennPrint(); // tc.transformTree(tree).pennPrint(); pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); if (op.doDep) { comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw); tree4b = tree4; tree4 = debinarizer.transformTree(tree4); if (op.nodePrune) { NodePruner np = new NodePruner(parser, debinarizer); tree4 = np.prune(tree4); } // tree4.pennPrint(); comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw); } // pcfgTE.evaluate(tree2, tree); pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw); pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw); if (op.doDep) { comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw); comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw); } System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0)); // tc.transformTree(tree2).pennPrint(); tree2.pennPrint(pw); if (op.doDep) { System.out.println("Combo: " + parser.scoreBinarizedTree(tree4b, 0)); // tc.transformTree(tree4).pennPrint(pw); tree4.pennPrint(pw); } System.out.println("Correct:" + parser.scoreBinarizedTree(binaryTree, 0)); /* if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) { System.out.println("SCORE INVERSION"); parser.validateBinarizedTree(binaryTree,0); } */ tree.pennPrint(pw); } // end if doPCFG if (Test.evalb) { if (op.doPCFG && op.doDep) { EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4)); } else if (op.doPCFG) { EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2)); } else if (op.doDep) { EvalB.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db)); } } } // end for each tree in test treebank if (Test.evalb) { EvalB.closeEVALBfiles(); } // Test.display(); if (op.doPCFG) { pcfgPE.display(false, pw); System.out.println("Grammar size: " + Numberer.getGlobalNumberer("states").total()); pcfgCB.display(false, pw); if (op.doDep) { comboPE.display(false, pw); } pcfgTE.display(false, pw); pcfgTEnoPunct.display(false, pw); if (op.doDep) { comboTE.display(false, pw); comboTEnoPunct.display(false, pw); } } if (op.doDep) { depTE.display(false, pw); depDE.display(false, pw); } if (op.doPCFG && op.doDep) { comboDE.display(false, pw); } // pcfgPE.printGoodBad(); }
@SuppressWarnings("unused") private static String getText(List<? extends CoreMap> list, int index) { return list.get(index).get(CoreAnnotations.TextAnnotation.class); }