Example #1
 private void commitVariableGroups(Matcher m) {
   committedVariables = true; // commit all my variable groups.
   for (Pair<Integer, String> varGroup : myNode.variableGroups) {
     String thisVarString = m.group(varGroup.first());
     variableStrings.setVar(varGroup.second(), thisVarString);
   }
 }
Example #2
  private String findNextParagraphSpeaker(
      List<CoreMap> paragraph, int paragraphOffset, Dictionaries dict) {
    CoreMap lastSent = paragraph.get(paragraph.size() - 1);
    String speaker = "";
    for (CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) {
      if (w.get(CoreAnnotations.LemmaAnnotation.class).equals("report")
          || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) {
        String word = w.get(CoreAnnotations.TextAnnotation.class);
        SemanticGraph dependency =
            lastSent.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        IndexedWord t = dependency.getNodeByWordPattern(word);

        for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(t)) {
          if (child.first().getShortName().equals("nsubj")) {
            int subjectIndex = child.second().index(); // start from 1
            IntTuple headPosition = new IntTuple(2);
            headPosition.set(0, paragraph.size() - 1 + paragraphOffset);
            headPosition.set(1, subjectIndex - 1);
            if (mentionheadPositions.containsKey(headPosition)
                && mentionheadPositions.get(headPosition).nerString.startsWith("PER")) {
              speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
            }
          }
        }
      }
    }
    return speaker;
  }
Example #3
 private void incrementMonth(ISODateInstance referenceDate, Pair<DateField, Integer> relation) {
   String origDateString = referenceDate.getStartDate();
   String monthString = origDateString.substring(4, 6);
   if (monthString.contains("*")) {
     isoDate = origDateString;
     return;
   }
   // Month is not a variable
   Integer monthNum = Integer.parseInt(monthString);
   // Check if we're an edge case
   if (((monthNum + relation.second()) > 12) || ((monthNum + relation.second()) < 1)) {
     boolean decreasing = ((monthNum + relation.second()) < 1);
     int newMonthNum = (monthNum + relation.second()) % 12;
     if (newMonthNum < 0) {
       newMonthNum *= -1;
     }
     // Set the month appropriately
     isoDate = makeStringMonthChange(origDateString, newMonthNum);
     // Increment the year if possible
     String yearString = origDateString.substring(0, 4);
     if (!yearString.contains("*")) {
       // How much we increment depends on above mod
       int numYearsToIncrement = (int) Math.ceil(relation.second() / 12.0);
       if (decreasing) {
         isoDate =
             makeStringYearChange(isoDate, Integer.parseInt(yearString) - numYearsToIncrement);
       } else {
         isoDate =
             makeStringYearChange(isoDate, Integer.parseInt(yearString) + numYearsToIncrement);
       }
     }
   } else {
     isoDate = makeStringMonthChange(origDateString, (monthNum + relation.second()));
   }
 }
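As a worked trace of the method above: with a reference date beginning "200511" and a relation of (MONTH, +3), monthNum is 11 and 11 + 3 = 14 exceeds 12, so newMonthNum becomes 14 % 12 = 2, numYearsToIncrement is ceil(3 / 12.0) = 1, and the result is the original string with the month set to 02 and the year raised to 2006.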
Example #4
  /** Check whether one mention is the speaker of the other mention. */
  public static boolean isSpeaker(Mention m, Mention ant, Dictionaries dict) {

    if (!dict.firstPersonPronouns.contains(ant.spanToString().toLowerCase())
        || ant.number == Number.PLURAL
        || ant.sentNum != m.sentNum) return false;

    int countQuotationMark = 0;
    for (int i = Math.min(m.headIndex, ant.headIndex) + 1;
        i < Math.max(m.headIndex, ant.headIndex);
        i++) {
      String word = m.sentenceWords.get(i).get(CoreAnnotations.TextAnnotation.class);
      if (word.equals("``") || word.equals("''")) countQuotationMark++;
    }
    if (countQuotationMark != 1) return false;

    IndexedWord w =
        m.dependency.getNodeByWordPattern(
            m.sentenceWords.get(m.headIndex).get(CoreAnnotations.TextAnnotation.class));
    if (w == null) return false;

    for (Pair<GrammaticalRelation, IndexedWord> parent : m.dependency.parentPairs(w)) {
      if (parent.first().getShortName().equals("nsubj")
          && dict.reportVerb.contains(parent.second().get(CoreAnnotations.LemmaAnnotation.class))) {
        return true;
      }
    }
    return false;
  }
Example #5
 private void decommitVariableGroups() {
   if (committedVariables) {
     for (Pair<Integer, String> varGroup : myNode.variableGroups) {
       variableStrings.unsetVar(varGroup.second());
     }
   }
   committedVariables = false;
 }
Example #6
 /**
  * Compares this <code>Pair</code> to another object. If the object is a <code>Pair</code>, this
  * function will work providing the elements of the <code>Pair</code> are themselves comparable.
  * It will then return a value based on the pair of objects, where <code>
  * p &gt; q iff p.first() &gt; q.first() ||
  * (p.first().equals(q.first()) && p.second() &gt; q.second())</code>. If the other object is not
  * a <code>Pair</code>, it throws a <code>ClassCastException</code>.
  *
  * @param o the <code>Object</code> to be compared.
  * @return the value <code>0</code> if the argument is a <code>Pair</code> equal to this <code>
  *     Pair</code>; a value less than <code>0</code> if the argument is a <code>Pair</code>
  *     greater than this <code>Pair</code>; and a value greater than <code>0</code> if the
  *     argument is a <code>Pair</code> less than this <code>Pair</code>.
  * @throws ClassCastException if the argument is not a <code>Pair</code>.
  * @see java.lang.Comparable
  */
 public int compareTo(Object o) {
   Pair another = (Pair) o;
   int comp = ((Comparable) first()).compareTo(another.first());
   if (comp != 0) {
     return comp;
   } else {
     return ((Comparable) second()).compareTo(another.second());
   }
 }
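A minimal usage sketch of the ordering documented above, assuming edu.stanford.nlp.util.Pair with Comparable elements (the values here are made up):

 Pair<Integer, String> p = new Pair<>(1, "apple");
 Pair<Integer, String> q = new Pair<>(1, "banana");
 Pair<Integer, String> r = new Pair<>(2, "apple");
 System.out.println(p.compareTo(q) < 0); // true: the first elements are equal, so "apple" vs. "banana" decides
 System.out.println(p.compareTo(r) < 0); // true: 1 < 2, so the second elements are never consulted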
Example #7
 public String expandStringRegex(String regex) {
   // Replace all variables in regex
   String expanded = regex;
   for (String v : stringRegexVariables.keySet()) {
     Pair<Pattern, String> p = stringRegexVariables.get(v);
     expanded = p.first().matcher(expanded).replaceAll(p.second());
   }
   return expanded;
 }
Example #8
 /**
  * Read a string representation of a Pair from a DataInputStream. This might not work correctly
  * unless both elements of the pair are of type <code>String</code>.
  */
 public static Pair<String, String> readStringPair(DataInputStream in) {
   Pair<String, String> p = new Pair<String, String>();
   try {
     p.first = in.readUTF();
     p.second = in.readUTF();
   } catch (Exception e) {
     e.printStackTrace();
   }
   return p;
 }
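A round-trip sketch for the reader above, assuming the two strings were written with DataOutputStream.writeUTF in the same order (java.io imports assumed):

 static Pair<String, String> roundTripExample() throws IOException {
   ByteArrayOutputStream buffer = new ByteArrayOutputStream();
   DataOutputStream out = new DataOutputStream(buffer);
   out.writeUTF("key");
   out.writeUTF("value");
   // readStringPair reads the two UTF strings back in the order they were written
   return Pair.readStringPair(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
 }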
Example #9
 protected void updateKeepBids(Set<Integer> bids) {
   // TODO: Is there a point when we don't need to keep these bids anymore?
   for (int i = 0; i < reachableChildBids.length; i++) {
     Set<Pair<Integer, Integer>> v = reachableChildBids[i];
     if (v != null) {
       for (Pair<Integer, Integer> p : v) {
         bids.add(p.first());
       }
     }
   }
 }
Example #10
 /**
  * Construct a new ISODate based on its relation to a referenceDate. relativeDate should be
  * something like "today" or "tomorrow" or "last year" and the resulting ISODate will be the same
  * as the referenceDate, a day later, or a year earlier, respectively.
  */
 public ISODateInstance(ISODateInstance referenceDate, String relativeDate) {
   Pair<DateField, Integer> relation = relativeDateMap.get(relativeDate.toLowerCase());
   if (relation != null) {
     switch (relation.first()) {
       case DAY:
         incrementDay(referenceDate, relation);
         break;
       case MONTH:
         incrementMonth(referenceDate, relation);
         break;
       case YEAR:
         incrementYear(referenceDate, relation);
         break;
     }
   }
 }
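A hypothetical usage sketch; the literal key "tomorrow" and the yyyymmdd start-date form are assumptions about relativeDateMap and getStartDate(), which are not shown here:

  ISODateInstance reference = new ISODateInstance("20050214");          // assumed yyyymmdd form
  ISODateInstance nextDay = new ISODateInstance(reference, "tomorrow"); // relation (DAY, +1), if so mapped
  System.out.println(nextDay.getDateString());                          // expected to be one day after the reference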
Example #11
  /**
   * Uses regexp matching to match month, day, and year fields. TODO: Find a way to mark what's
   * already been handled in the string.
   */
  public boolean extractFields(String inputDate) {

    if (tokens.size() < 2) {
      tokenizeDate(inputDate);
    }
    if (DEBUG) {
      System.err.println("Extracting date: " + inputDate);
    }
    // first we see if it's a hyphen and two parseable dates - if not, we treat it as one date
    Pair<String, String> dateEndpoints = getRangeDates(inputDate);
    if (dateEndpoints != null) {
      ISODateInstance date1 = new ISODateInstance(dateEndpoints.first());
      if (dateEndpoints.first().contains(" ") && !dateEndpoints.second().contains(" ")) {
        // consider whether it's a leading modifier; e.g., "June 8-10" will be split into June 8,
        // and 10 when really we'd like June 8 and June 10
        String date =
            dateEndpoints.first().substring(0, dateEndpoints.first().indexOf(' '))
                + ' '
                + dateEndpoints.second();
        ISODateInstance date2 = new ISODateInstance(date);
        if (!date1.isUnparseable() && !date2.isUnparseable()) {
          isoDate = (new ISODateInstance(date1, date2)).getDateString();
          return true;
        }
      }

      ISODateInstance date2 = new ISODateInstance(dateEndpoints.second());
      if (!date1.isUnparseable() && !date2.isUnparseable()) {
        isoDate = (new ISODateInstance(date1, date2)).getDateString();
        return true;
      }
    }

    if (extractYYYYMMDD(inputDate)) {
      return true;
    }
    if (extractMMDDYY(inputDate)) {
      return true;
    }
    boolean passed = false;
    passed = extractYear(inputDate) || passed;
    passed = extractMonth(inputDate) || passed;
    passed = extractDay(inputDate) || passed;

    // slightly hacky, but check for some common modifiers that get grouped into the date
    passed = addExtraRanges(inputDate) || passed;

    if (!passed) { // couldn't parse
      // try one more trick
      unparseable = true;
      boolean weekday = extractWeekday(inputDate);
      if (!weekday) {
        isoDate = inputDate;
      }
    }
    return passed;
  }
Example #12
 private static void extractSubtrees(List<String> codeStrings, String treeFile) {
   List<Pair<Integer, Integer>> codes = new ArrayList<Pair<Integer, Integer>>();
   for (String s : codeStrings) {
     Matcher m = codePattern.matcher(s);
     if (m.matches())
       codes.add(
           new Pair<Integer, Integer>(Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2))));
     else throw new RuntimeException("Error: illegal node code " + s);
   }
   TreeReaderFactory trf = new TRegexTreeReaderFactory();
   MemoryTreebank treebank = new MemoryTreebank(trf);
   treebank.loadPath(treeFile, null, true);
   for (Pair<Integer, Integer> code : codes) {
     Tree t = treebank.get(code.first() - 1);
     t.getNodeNumber(code.second()).pennPrint();
   }
 }
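A hypothetical invocation from within the defining class; the accepted syntax is governed by codePattern (not shown), assumed here to take the form treeNumber:nodeNumber:

 List<String> codes = Arrays.asList("2:5", "7:3"); // assumed "tree:node" codes; tree numbers are 1-based per the loop above
 extractSubtrees(codes, "/path/to/trees.mrg");     // hypothetical treebank file; prints each selected subtree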
Example #13
 public static Tree processPatternsOnTree(List<Pair<TregexPattern, TsurgeonPattern>> ops, Tree t) {
   matchedOnTree = false;
   for (Pair<TregexPattern, TsurgeonPattern> op : ops) {
     try {
       if (DEBUG) {
         System.err.println("Running pattern " + op.first());
       }
       TregexMatcher m = op.first().matcher(t);
       while (m.find()) {
         matchedOnTree = true;
         t = op.second().evaluate(t, m);
         if (t == null) {
           return null;
         }
         m = op.first().matcher(t);
       }
     } catch (NullPointerException npe) {
       throw new RuntimeException(
           "Tsurgeon.processPatternsOnTree failed to match label for pattern: "
               + op.first()
               + ", "
               + op.second(),
           npe);
     }
   }
   return t;
 }
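A usage sketch mirroring the Tregex/Tsurgeon javadoc quoted later on this page (the input Tree is assumed to already exist):

 TregexPattern tregex = TregexPattern.compile("S=node << NP !<< foo");
 TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) >-1 node");
 List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
 ops.add(new Pair<TregexPattern, TsurgeonPattern>(tregex, tsurgeon));
 Tree transformed = Tsurgeon.processPatternsOnTree(ops, tree); // tree: an existing edu.stanford.nlp.trees.Tree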
Example #14
  private boolean findSpeaker(
      int utterNum,
      int sentNum,
      List<CoreMap> sentences,
      int startIndex,
      int endIndex,
      Dictionaries dict) {
    List<CoreLabel> sent = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class);
    for (int i = startIndex; i < endIndex; i++) {
      if (sent.get(i).get(CoreAnnotations.UtteranceAnnotation.class) != 0) continue;
      String lemma = sent.get(i).get(CoreAnnotations.LemmaAnnotation.class);
      String word = sent.get(i).get(CoreAnnotations.TextAnnotation.class);
      if (dict.reportVerb.contains(lemma)) {
        // find subject
        SemanticGraph dependency =
            sentences
                .get(sentNum)
                .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        IndexedWord w = dependency.getNodeByWordPattern(word);

        if (w != null) {
          for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(w)) {
            if (child.first().getShortName().equals("nsubj")) {
              String subjectString = child.second().word();
              int subjectIndex = child.second().index(); // start from 1
              IntTuple headPosition = new IntTuple(2);
              headPosition.set(0, sentNum);
              headPosition.set(1, subjectIndex - 1);
              String speaker;
              if (mentionheadPositions.containsKey(headPosition)) {
                speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
              } else {
                speaker = subjectString;
              }
              speakers.put(utterNum, speaker);
              return true;
            }
          }
        } else {
          SieveCoreferenceSystem.logger.warning("Cannot find node in dependency for word " + word);
        }
      }
    }
    return false;
  }
Example #15
 private void incrementYear(ISODateInstance referenceDate, Pair<DateField, Integer> relation) {
   String origDateString = referenceDate.getStartDate();
   String yearString = origDateString.substring(0, 4);
   if (yearString.contains("*")) {
     isoDate = origDateString;
     return;
   }
   isoDate =
       makeStringYearChange(origDateString, Integer.parseInt(yearString) + relation.second());
 }
Example #16
 /**
  * Returns true if there is a feasible combination of child branch ids that causes all child
  * expressions to be satisfied with respect to the specified child expression (assuming
  * satisfaction with the specified branch and node index). For other child expressions to have a
  * compatible satisfiable branch, that branch must also terminate with the same node index as
  * this one.
  *
  * @param index - Index of the child expression
  * @param bid - Branch id that causes the indexed child to be satisfied
  * @param pos - Node index that causes the indexed child to be satisfied
  * @return whether there is a feasible combination that causes all children to be satisfied with
  *     respect to the specified child.
  */
 private boolean isAllChildMatched(int index, int bid, int pos) {
   for (int i = 0; i < reachableChildBids.length; i++) {
     Set<Pair<Integer, Integer>> v = reachableChildBids[i];
     if (v == null || v.isEmpty()) return false;
     if (i != index) {
       boolean ok = false;
       for (Pair<Integer, Integer> p : v) {
         if (p.second() == pos) {
           ok = true;
           break;
         }
       }
       if (!ok) {
         return false;
       }
     }
   }
   return true;
 }
Example #17
    @Override
    public Pair<DeepTree, DeepTree> process(Tree tree) {
      // For each tree, move in the direction of the gold tree, and
      // move away from the direction of the best scoring hypothesis

      IdentityHashMap<Tree, SimpleMatrix> goldVectors = new IdentityHashMap<>();
      double scoreGold = score(tree, goldVectors);
      DeepTree bestTree = getHighestScoringTree(tree, TRAIN_LAMBDA);
      DeepTree goldTree = new DeepTree(tree, goldVectors, scoreGold);
      return Pair.makePair(goldTree, bestTree);
    }
Example #18
 // Update incompatibles for two clusters that are about to be merged
 public void mergeIncompatibles(CorefCluster to, CorefCluster from) {
   List<Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>> replacements =
       new ArrayList<Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>>();
   for (Pair<Integer, Integer> p : incompatibleClusters) {
     Integer other = null;
     if (p.first == from.clusterID) {
       other = p.second;
     } else if (p.second == from.clusterID) {
       other = p.first;
     }
     if (other != null && other != to.clusterID) {
       int cid1 = Math.min(other, to.clusterID);
       int cid2 = Math.max(other, to.clusterID);
       replacements.add(Pair.makePair(p, Pair.makePair(cid1, cid2)));
     }
   }
   for (Pair<Pair<Integer, Integer>, Pair<Integer, Integer>> r : replacements) {
     incompatibleClusters.remove(r.first.first(), r.first.second());
     incompatibleClusters.add(r.second.first(), r.second.second());
   }
 }
Example #19
  private void findSpeakersInArticle(Dictionaries dict) {
    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    Pair<Integer, Integer> beginQuotation = new Pair<Integer, Integer>();
    Pair<Integer, Integer> endQuotation = new Pair<Integer, Integer>();
    boolean insideQuotation = false;
    int utterNum = -1;

    for (int i = 0; i < sentences.size(); i++) {
      List<CoreLabel> sent = sentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
      for (int j = 0; j < sent.size(); j++) {
        int utterIndex = sent.get(j).get(CoreAnnotations.UtteranceAnnotation.class);

        if (utterIndex != 0 && !insideQuotation) {
          utterNum = utterIndex;
          insideQuotation = true;
          beginQuotation.setFirst(i);
          beginQuotation.setSecond(j);
        } else if (utterIndex == 0 && insideQuotation) {
          insideQuotation = false;
          endQuotation.setFirst(i);
          endQuotation.setSecond(j);
          findQuotationSpeaker(utterNum, sentences, beginQuotation, endQuotation, dict);
        }
      }
    }
  }
Example #20
 /**
  * Returns array of child branch ids that causes all child expressions to be satisfied with
  * respect to the specified child expression (assuming satisfaction with the specified branch
  * and node index). For other child expressions to have a compatible satisfiable branch, that
  * branch must also terminate with the same node index as this one.
  *
  * @param index - Index of the child expression
  * @param bid - Branch id that causes the indexed child to be satisfied
  * @param pos - Node index that causes the indexed child to be satisfied
  * @return array of child branch ids if there is a valid combination, null otherwise
  */
 private int[] getAllChildMatchedBids(int index, int bid, int pos) {
   int[] matchedBids = new int[reachableChildBids.length];
   for (int i = 0; i < reachableChildBids.length; i++) {
     Set<Pair<Integer, Integer>> v = reachableChildBids[i];
     if (v == null || v.isEmpty()) return null;
     if (i != index) {
       boolean ok = false;
       for (Pair<Integer, Integer> p : v) {
         if (p.second() == pos) {
           ok = true;
           matchedBids[i] = p.first();
           break;
         }
       }
       if (!ok) {
         return null;
       }
     } else {
       matchedBids[i] = bid;
     }
   }
   return matchedBids;
 }
Example #21
 protected <T> boolean match(
     int bid, SequenceMatcher.MatchedStates<T> matchedStates, boolean consume) {
   // Try to match previous node/nodes exactly
   if (consume) {
     // First element is group that is matched, second is number of nodes matched so far
     Pair<SequenceMatcher.MatchedGroup, Integer> backRefState =
         (Pair<SequenceMatcher.MatchedGroup, Integer>)
             matchedStates.getBranchStates().getMatchStateInfo(bid, this);
     if (backRefState == null) {
       // Haven't tried to match this node before, try now
       // Get element and return if it matched or not
       SequenceMatcher.MatchedGroup matchedGroup =
           matchedStates.getBranchStates().getMatchedGroup(bid, captureGroupId);
       if (matchedGroup != null) {
         // See if the first node matches
         if (matchedGroup.matchEnd > matchedGroup.matchBegin) {
           boolean matched = match(bid, matchedStates, matchedGroup, 0);
           return matched;
         } else {
           // TODO: Check handling of previous nodes that are zero elements?
           return super.match(bid, matchedStates, consume);
         }
       }
       return false;
     } else {
       SequenceMatcher.MatchedGroup matchedGroup = backRefState.first();
       int matchedNodes = backRefState.second();
       boolean matched = match(bid, matchedStates, matchedGroup, matchedNodes);
       return matched;
     }
   } else {
     // Not consuming, just add this state back to list of states to be processed
     matchedStates.addState(bid, this);
     return false;
   }
 }
Example #22
  /**
   * Parse a CoNLL formatted tree into a SemanticGraph object (along with a list of tokens).
   *
   * @param conll The CoNLL formatted tree.
   * @return A pair of a SemanticGraph and a token list, corresponding to the parse of the sentence
   *     and to tokens in the sentence.
   */
  protected Pair<SemanticGraph, List<CoreLabel>> mkTree(String conll) {
    List<CoreLabel> sentence = new ArrayList<>();
    SemanticGraph tree = new SemanticGraph();
    for (String line : conll.split("\n")) {
      if (line.trim().equals("")) {
        continue;
      }
      String[] fields = line.trim().split("\\s+");
      int index = Integer.parseInt(fields[0]);
      String word = fields[1];
      CoreLabel label = IETestUtils.mkWord(word, index);
      sentence.add(label);
      if (fields[2].equals("0")) {
        tree.addRoot(new IndexedWord(label));
      } else {
        tree.addVertex(new IndexedWord(label));
      }
      if (fields.length > 4) {
        label.setTag(fields[4]);
      }
      if (fields.length > 5) {
        label.setNER(fields[5]);
      }
      if (fields.length > 6) {
        label.setLemma(fields[6]);
      }
    }
    int i = 0;
    for (String line : conll.split("\n")) {
      if (line.trim().equals("")) {
        continue;
      }
      String[] fields = line.trim().split("\\s+");
      int parent = Integer.parseInt(fields[2]);
      String reln = fields[3];
      if (parent > 0) {
        tree.addEdge(
            new IndexedWord(sentence.get(parent - 1)),
            new IndexedWord(sentence.get(i)),
            new GrammaticalRelation(Language.UniversalEnglish, reln, null, null),
            1.0,
            false);
      }
      i += 1;
    }

    return Pair.makePair(tree, sentence);
  }
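A sketch of the CoNLL-style input the method expects, based on the fields read above (index, word, governor index, relation, then optional tag/NER/lemma columns); it would be called from within the defining test class:

  String conll =
        "1\tcats\t2\tnsubj\tNNS\n"
      + "2\tplay\t0\troot\tVBP\n"
      + "3\tchess\t2\tdobj\tNN\n";
  Pair<SemanticGraph, List<CoreLabel>> parsed = mkTree(conll);
  SemanticGraph graph = parsed.first();      // dependency graph rooted at "play"
  List<CoreLabel> tokens = parsed.second();  // the three CoreLabel tokens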
Example #23
  private void findQuotationSpeaker(
      int utterNum,
      List<CoreMap> sentences,
      Pair<Integer, Integer> beginQuotation,
      Pair<Integer, Integer> endQuotation,
      Dictionaries dict) {

    if (findSpeaker(utterNum, beginQuotation.first(), sentences, 0, beginQuotation.second(), dict))
      return;

    if (findSpeaker(
        utterNum,
        endQuotation.first(),
        sentences,
        endQuotation.second(),
        sentences.get(endQuotation.first()).get(CoreAnnotations.TokensAnnotation.class).size(),
        dict)) return;

    if (beginQuotation.second() <= 1 && beginQuotation.first() > 0) {
      if (findSpeaker(
          utterNum,
          beginQuotation.first() - 1,
          sentences,
          0,
          sentences
              .get(beginQuotation.first() - 1)
              .get(CoreAnnotations.TokensAnnotation.class)
              .size(),
          dict)) return;
    }

    if (endQuotation.second() == sentences.get(endQuotation.first()).size() - 1
        && sentences.size() > endQuotation.first() + 1) {
      if (findSpeaker(
          utterNum,
          endQuotation.first() + 1,
          sentences,
          0,
          sentences
              .get(endQuotation.first() + 1)
              .get(CoreAnnotations.TokensAnnotation.class)
              .size(),
          dict)) return;
    }
  }
Example #24
  /**
   * The core implementation of the search.
   *
   * @param root The root word to search from. Traditionally, this is the root of the sentence.
   * @param candidateFragments The callback for the resulting sentence fragments. This is a
   *     predicate of a triple of values. The return value of the predicate determines whether we
   *     should continue searching. The triple is a triple of
   *     <ol>
   *       <li>The log probability of the sentence fragment, according to the featurizer and the
   *           weights
   *       <li>The features along the path to this fragment. The last element of this is the
   *           features from the most recent step.
   *       <li>The sentence fragment. Because it is relatively expensive to compute the resulting
   *           tree, this is returned as a lazy {@link Supplier}.
   *     </ol>
   *
   * @param classifier The classifier for whether an arc should be on the path to a clause split, a
   *     clause split itself, or neither.
   * @param featurizer The featurizer to use. Make sure this matches the weights!
   * @param actionSpace The action space we are allowed to take. Each action defines a means of
   *     splitting a clause on a dependency boundary.
   */
  protected void search(
      // The root to search from
      IndexedWord root,
      // The output specs
      final Predicate<Triple<Double, List<Counter<String>>, Supplier<SentenceFragment>>>
          candidateFragments,
      // The learning specs
      final Classifier<ClauseSplitter.ClauseClassifierLabel, String> classifier,
      Map<String, ? extends List<String>> hardCodedSplits,
      final Function<Triple<State, Action, State>, Counter<String>> featurizer,
      final Collection<Action> actionSpace,
      final int maxTicks) {
    // (the fringe)
    PriorityQueue<Pair<State, List<Counter<String>>>> fringe = new FixedPrioritiesPriorityQueue<>();
    // (avoid duplicate work)
    Set<IndexedWord> seenWords = new HashSet<>();

    State firstState =
        new State(null, null, -9000, null, x -> {}, true); // First state is implicitly "done"
    fringe.add(Pair.makePair(firstState, new ArrayList<>(0)), -0.0);
    int ticks = 0;

    while (!fringe.isEmpty()) {
      if (++ticks > maxTicks) {
        //        System.err.println("WARNING! Timed out on search with " + ticks + " ticks");
        return;
      }
      // Useful variables
      double logProbSoFar = fringe.getPriority();
      assert logProbSoFar <= 0.0;
      Pair<State, List<Counter<String>>> lastStatePair = fringe.removeFirst();
      State lastState = lastStatePair.first;
      List<Counter<String>> featuresSoFar = lastStatePair.second;
      IndexedWord rootWord = lastState.edge == null ? root : lastState.edge.getDependent();

      // Register thunk
      if (lastState.isDone) {
        if (!candidateFragments.test(
            Triple.makeTriple(
                logProbSoFar,
                featuresSoFar,
                () -> {
                  SemanticGraph copy = new SemanticGraph(tree);
                  lastState
                      .thunk
                      .andThen(
                          x -> {
                            // Add the extra edges back in, if they don't break the tree-ness of the
                            // extraction
                            for (IndexedWord newTreeRoot : x.getRoots()) {
                              if (newTreeRoot != null) { // what a strange thing to have happen...
                                for (SemanticGraphEdge extraEdge :
                                    extraEdgesByGovernor.get(newTreeRoot)) {
                                  assert Util.isTree(x);
                                  //noinspection unchecked
                                  addSubtree(
                                      x,
                                      newTreeRoot,
                                      extraEdge.getRelation().toString(),
                                      tree,
                                      extraEdge.getDependent(),
                                      tree.getIncomingEdgesSorted(newTreeRoot));
                                  assert Util.isTree(x);
                                }
                              }
                            }
                          })
                      .accept(copy);
                  return new SentenceFragment(copy, assumedTruth, false);
                }))) {
          break;
        }
      }

      // Find relevant auxiliary terms
      SemanticGraphEdge subjOrNull = null;
      SemanticGraphEdge objOrNull = null;
      for (SemanticGraphEdge auxEdge : tree.outgoingEdgeIterable(rootWord)) {
        String relString = auxEdge.getRelation().toString();
        if (relString.contains("obj")) {
          objOrNull = auxEdge;
        } else if (relString.contains("subj")) {
          subjOrNull = auxEdge;
        }
      }

      // Iterate over children
      // For each outgoing edge...
      for (SemanticGraphEdge outgoingEdge : tree.outgoingEdgeIterable(rootWord)) {
        // Prohibit indirect speech verbs from splitting off clauses
        // (e.g., 'said', 'think')
        // This fires if the governor is an indirect speech verb, and the outgoing edge is a ccomp
        if (outgoingEdge.getRelation().toString().equals("ccomp")
            && ((outgoingEdge.getGovernor().lemma() != null
                    && INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().lemma()))
                || INDIRECT_SPEECH_LEMMAS.contains(outgoingEdge.getGovernor().word()))) {
          continue;
        }
        // Get some variables
        String outgoingEdgeRelation = outgoingEdge.getRelation().toString();
        List<String> forcedArcOrder = hardCodedSplits.get(outgoingEdgeRelation);
        if (forcedArcOrder == null && outgoingEdgeRelation.contains(":")) {
          forcedArcOrder =
              hardCodedSplits.get(
                  outgoingEdgeRelation.substring(0, outgoingEdgeRelation.indexOf(":")) + ":*");
        }
        boolean doneForcedArc = false;
        // For each action...
        for (Action action :
            (forcedArcOrder == null ? actionSpace : orderActions(actionSpace, forcedArcOrder))) {
          // Check the prerequisite
          if (!action.prerequisitesMet(tree, outgoingEdge)) {
            continue;
          }
          if (forcedArcOrder != null && doneForcedArc) {
            break;
          }
          // 1. Compute the child state
          Optional<State> candidate =
              action.applyTo(tree, lastState, outgoingEdge, subjOrNull, objOrNull);
          if (candidate.isPresent()) {
            double logProbability;
            ClauseClassifierLabel bestLabel;
            Counter<String> features =
                featurizer.apply(Triple.makeTriple(lastState, action, candidate.get()));
            if (forcedArcOrder != null && !doneForcedArc) {
              logProbability = 0.0;
              bestLabel = ClauseClassifierLabel.CLAUSE_SPLIT;
              doneForcedArc = true;
            } else if (features.containsKey("__undocumented_junit_no_classifier")) {
              logProbability = Double.NEGATIVE_INFINITY;
              bestLabel = ClauseClassifierLabel.CLAUSE_INTERM;
            } else {
              Counter<ClauseClassifierLabel> scores = classifier.scoresOf(new RVFDatum<>(features));
              if (scores.size() > 0) {
                Counters.logNormalizeInPlace(scores);
              }
              String rel = outgoingEdge.getRelation().toString();
              if ("nsubj".equals(rel) || "dobj".equals(rel)) {
                scores.remove(
                    ClauseClassifierLabel.NOT_A_CLAUSE); // Always at least yield on nsubj and dobj
              }
              logProbability = Counters.max(scores, Double.NEGATIVE_INFINITY);
              bestLabel = Counters.argmax(scores, (x, y) -> 0, ClauseClassifierLabel.CLAUSE_SPLIT);
            }

            if (bestLabel != ClauseClassifierLabel.NOT_A_CLAUSE) {
              Pair<State, List<Counter<String>>> childState =
                  Pair.makePair(
                      candidate.get().withIsDone(bestLabel),
                      new ArrayList<Counter<String>>(featuresSoFar) {
                        {
                          add(features);
                        }
                      });
              // 2. Register the child state
              if (!seenWords.contains(childState.first.edge.getDependent())) {
                //            System.err.println("  pushing " + action.signature() + " with " +
                // argmax.first.edge);
                fringe.add(childState, logProbability);
              }
            }
          }
        }
      }

      seenWords.add(rootWord);
    }
    //    System.err.println("Search finished in " + ticks + " ticks and " + classifierEvals + "
    // classifier evaluations.");
  }
Example #25
  /**
   * Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile file-with-trees [-po
   * matching-pattern operation] operation-file-1 operation-file-2 ... operation-file-n
   *
   * <h4>Arguments:</h4>
   *
   * Each argument should be the name of a transformation file that contains a list of pattern and
   * transformation operation list pairs. That is, it is a sequence of pairs of a {@link
   * TregexPattern} pattern on one or more lines, then a blank line (empty or whitespace), then a
   * list of transformation operations one per line (as specified by <b>Legal operation syntax</b>
   * below) to apply when the pattern is matched, and then another blank line (empty or whitespace).
   * Note the need for blank lines: The code crashes if they are not present as separators (although
   * the blank line at the end of the file can be omitted). The script file can include comment
   * lines, either whole comment lines or trailing comments introduced by %, which extend to the end
   * of line. A needed percent mark can be escaped by a preceding backslash.
   *
   * <p>For example, if you want to excise an SBARQ node whenever it is the parent of an SQ node,
   * and relabel the SQ node to S, your transformation file would look like this:
   *
   * <blockquote>
   *
   * <code>
   *    SBARQ=n1 &lt; SQ=n2<br>
   *    <br>
   *    excise n1 n1<br>
   *    relabel n2 S
   * </code>
   *
   * </blockquote>
   *
   * <p>
   *
   * <h4>Options:</h4>
   *
   * <ul>
   *   <li><code>-treeFile &#60;filename&#62;</code> specify the name of the file that has the trees
   *       you want to transform.
   *   <li><code>-po &#60;matchPattern&#62; &#60;operation&#62;</code> Apply a single operation to
   *       every tree using the specified match pattern and the specified operation. Use this option
   *       when you want to quickly try the effect of one pattern/surgery combination, and are too
   *       lazy to write a transformation file.
   *   <li><code>-s</code> Print each output tree on one line (default is pretty-printing).
   *   <li><code>-m</code> For every tree that had a matching pattern, print "before" (prepended as
   *       "Operated on:") and "after" (prepended as "Result:"). Unoperated trees just pass through
   *       the transducer as usual.
   *   <li><code>-encoding X</code> Uses character set X for input and output of trees.
   *   <li><code>-macros &#60;filename&#62;</code> A file of macros to use on the tregex pattern.
   *       Macros should be one per line, with original and replacement separated by tabs.
   *   <li><code>-hf &lt;headfinder-class-name&gt;</code> use the specified {@link HeadFinder} class
   *       to determine headship relations.
   *   <li><code>-hfArg &lt;string&gt;</code> pass a string argument in to the {@link HeadFinder}
   *       class's constructor. <code>-hfArg</code> can be used multiple times to pass in multiple
   *       arguments.
   *   <li><code>-trf &lt;TreeReaderFactory-class-name&gt;</code> use the specified {@link
   *       TreeReaderFactory} class to read trees from files.
   * </ul>
   *
   * <h4>Legal operation syntax:</h4>
   *
   * <ul>
   *   <li><code>delete &#60;name&#62;</code> deletes the node and everything below it.
   *   <li><code>prune &#60;name&#62;</code> Like delete, but if, after the pruning, the parent has
   *       no children anymore, the parent is pruned too. Pruning continues to affect all ancestors
   *       until one is found with remaining children. This may result in a null tree.
   *   <li><code>excise &#60;name1&#62; &#60;name2&#62;</code> The name1 node should either dominate
   *       or be the same as the name2 node. This excises out everything from name1 to name2. All
   *       the children of name2 go into the parent of name1, where name1 was.
   *   <li><code>relabel &#60;name&#62; &#60;new-label&#62;</code> Relabels the node to have the new
   *       label. <br>
   *       There are three possible forms: <br>
   *       <code>relabel nodeX VP</code> - for changing a node label to an alphanumeric string <br>
   *       <code>relabel nodeX /''/</code> - for relabeling a node to something that isn't a valid
   *       identifier without quoting <br>
   *       <code>relabel nodeX /^VB(.*)$/verb\\/$1/</code> - for regular expression based
   *       relabeling. In this case, all matches of the regular expression against the node label
   *       are replaced with the replacement String. This has the semantics of Java/Perl's
   *       replaceAll: you may use capturing groups and put them in replacements with $n. For
   *       example, if the pattern is /foo/bar/ and the node matched is "foo", the replaceAll
   *       semantics result in "barbar". If the pattern is /^foo(.*)$/bar$1/ and node matched is
   *       "foofoo", relabel will result in "barfoo". <br>
   *       When using the regex replacement method, you can also use the sequences ={node} and
   *       %{var} in the replacement string to use captured nodes or variable strings in the
   *       replacement string. For example, if the Tregex pattern was "duck=bar" and the relabel is
   *       /foo/={bar}/, "foofoo" will be replaced with "duckduck". <br>
   *       To concatenate two nodes named in the tregex pattern, for example, you can use the
   *       pattern /^.*$/={foo}={bar}/. Note that the ^.*$ is necessary to make sure the regex
   *       pattern only matches and replaces once on the entire node name. <br>
   *       To get an "=" or a "%" in the replacement, use \ escaping. Also, as in the example, you
   *       can escape a slash in the middle of the second and third forms with \\/ and \\\\. <br>
   *   <li><code>insert &#60;name&#62; &#60;position&#62;</code> or <code>
   *       insert &lt;tree&gt; &#60;position&#62;</code> inserts the named node or tree into the
   *       position specified.
   *   <li><code>move &#60;name&#62; &#60;position&#62;</code> moves the named node into the
   *       specified position.
   *       <p>Right now the only ways to specify position are:
   *       <p><code>$+ &#60;name&#62;</code> the left sister of the named node<br>
   *       <code>$- &#60;name&#62;</code> the right sister of the named node<br>
   *       <code>&gt;i &#60;name&#62;</code> the i_th daughter of the named node<br>
   *       <code>&gt;-i &#60;name&#62;</code> the i_th daughter, counting from the right, of the
   *       named node.
   *   <li><code>replace &#60;name1&#62; &#60;name2&#62;</code> deletes name1 and inserts a copy of
   *       name2 in its place.
   *   <li><code>replace &#60;name&#62; &#60;tree&#62; &#60;tree2&#62;...</code> deletes name and
   *       inserts the new tree(s) in its place. If more than one replacement tree is given, each of
   *       the new subtrees will be added in order where the old tree was. Multiple subtrees at the
   *       root is an illegal operation and will throw an exception.
   *   <li>{@code createSubtree <new-label> <name1> [<name2>]} Create a subtree out of all the nodes
   *       from {@code <name1>} through {@code <name2>} and puts the new subtree where that span
   *       used to be. To limit the operation to just one node, elide {@code <name2>}.
   *   <li><code>adjoin &#60;auxiliary_tree&#62; &lt;name&gt;</code> Adjoins the specified auxiliary
   *       tree into the named node. The daughters of the target node will become the daughters of
   *       the foot of the auxiliary tree.
   *   <li><code>adjoinH &#60;auxiliary_tree&#62; &lt;name&gt;</code> Similar to adjoin, but
   *       preserves the target node and makes it the root of &lt;tree&gt;. (It is still accessible
   *       as <code>name</code>. The root of the auxiliary tree is ignored.)
   *   <li><code>adjoinF &#60;auxiliary_tree&#62; &lt;name&gt;</code> Similar to adjoin, but
   *       preserves the target node and makes it the foot of &lt;tree&gt;. (It is still accessible
   *       as <code>name</code>, and retains its status as parent of its children. The root of the
   *       auxiliary tree is ignored.)
   *   <li><code>coindex &#60;name1&#62; &#60;name2&#62; ... &#60;nameM&#62; </code> Puts a (Penn
   *       Treebank style) coindexation suffix of the form "-N" on each of nodes name_1 through
   *       name_m. The value of N will be automatically generated in reference to the existing
   *       coindexations in the tree, so that there is never an accidental clash of indices across
   *       things that are not meant to be coindexed.
   * </ul>
   *
   * <p>In the context of <code>adjoin</code>, <code>adjoinH</code>, and <code>adjoinF</code>, an
   * auxiliary tree is a tree in Penn Treebank format with <code>@</code> on exactly one of the
   * leaves denoting the foot of the tree. The operations which use the foot use the labeled node.
   * For example: <br>
   * Tsurgeon: <code>adjoin (FOO (BAR@)) foo</code> <br>
   * Tregex: <code>B=foo</code> <br>
   * Input: <code>(A (B 1 2))</code> Output: <code>(A (FOO (BAR 1 2)))</code>
   *
   * <p>Tsurgeon applies the same operation to the same tree for as long as the given tregex
   * operation matches. This means that infinite loops are very easy to cause. One common situation
   * where this comes up is an insert operation that repeats infinitely many times unless you
   * add an expression to the tregex that matches against the inserted pattern. For example, this
   * pattern will infinite loop:
   *
   * <blockquote>
   *
   * <code>
   *   TregexPattern tregex = TregexPattern.compile("S=node &lt;&lt; NP"); <br>
   *   TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) &gt;-1 node");
   * </code>
   *
   * </blockquote>
   *
   * This pattern, though, will terminate:
   *
   * <blockquote>
   *
   * <code>
   *   TregexPattern tregex = TregexPattern.compile("S=node &lt;&lt; NP !&lt;&lt; foo"); <br>
   *   TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) &gt;-1 node");
   * </code>
   *
   * </blockquote>
   *
   * <p>Tsurgeon has (very) limited support for conditional statements. If a pattern is prefaced
   * with <code>if exists &lt;name&gt;</code>, the rest of the pattern will only execute if the
   * named node was found in the corresponding TregexMatcher.
   *
   * @param args a list of names of files each of which contains a single tregex matching pattern
   *     plus a list, one per line, of transformation operations to apply to the matched pattern.
   * @throws Exception If an I/O or pattern syntax error occurs
   */
  public static void main(String[] args) throws Exception {
    String headFinderClassName = null;
    String headFinderOption = "-hf";
    String[] headFinderArgs = null;
    String headFinderArgOption = "-hfArg";
    String encoding = "UTF-8";
    String encodingOption = "-encoding";
    if (args.length == 0) {
      System.err.println(
          "Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile <file-with-trees> [-po <matching-pattern> <operation>] <operation-file-1> <operation-file-2> ... <operation-file-n>");
      System.exit(0);
    }
    String treePrintFormats;
    String singleLineOption = "-s";
    String verboseOption = "-v";
    String matchedOption =
        "-m"; // if set, then print original form of trees that are matched & thus operated on
    String patternOperationOption = "-po";
    String treeFileOption = "-treeFile";
    String trfOption = "-trf";
    String macroOption = "-macros";
    String macroFilename = "";
    Map<String, Integer> flagMap = Generics.newHashMap();
    flagMap.put(patternOperationOption, 2);
    flagMap.put(treeFileOption, 1);
    flagMap.put(trfOption, 1);
    flagMap.put(singleLineOption, 0);
    flagMap.put(encodingOption, 1);
    flagMap.put(headFinderOption, 1);
    flagMap.put(macroOption, 1);
    Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap);
    args = argsMap.get(null);

    if (argsMap.containsKey(headFinderOption))
      headFinderClassName = argsMap.get(headFinderOption)[0];
    if (argsMap.containsKey(headFinderArgOption)) headFinderArgs = argsMap.get(headFinderArgOption);
    if (argsMap.containsKey(verboseOption)) verbose = true;
    if (argsMap.containsKey(singleLineOption)) treePrintFormats = "oneline,";
    else treePrintFormats = "penn,";
    if (argsMap.containsKey(encodingOption)) encoding = argsMap.get(encodingOption)[0];
    if (argsMap.containsKey(macroOption)) macroFilename = argsMap.get(macroOption)[0];

    TreePrint tp = new TreePrint(treePrintFormats, new PennTreebankLanguagePack());
    PrintWriter pwOut = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);

    TreeReaderFactory trf;
    if (argsMap.containsKey(trfOption)) {
      String trfClass = argsMap.get(trfOption)[0];
      trf = ReflectionLoading.loadByReflection(trfClass);
    } else {
      trf = new TregexPattern.TRegexTreeReaderFactory();
    }

    Treebank trees = new DiskTreebank(trf, encoding);
    if (argsMap.containsKey(treeFileOption)) {
      trees.loadPath(argsMap.get(treeFileOption)[0]);
    }
    List<Pair<TregexPattern, TsurgeonPattern>> ops =
        new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();

    TregexPatternCompiler compiler;
    if (headFinderClassName == null) {
      compiler = new TregexPatternCompiler();
    } else {
      HeadFinder hf;
      if (headFinderArgs == null) {
        hf = ReflectionLoading.loadByReflection(headFinderClassName);
      } else {
        hf = ReflectionLoading.loadByReflection(headFinderClassName, (Object[]) headFinderArgs);
      }
      compiler = new TregexPatternCompiler(hf);
    }
    Macros.addAllMacros(compiler, macroFilename, encoding);
    if (argsMap.containsKey(patternOperationOption)) {
      TregexPattern matchPattern = compiler.compile(argsMap.get(patternOperationOption)[0]);
      TsurgeonPattern p = parseOperation(argsMap.get(patternOperationOption)[1]);
      ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p));
    } else {
      for (String arg : args) {
        List<Pair<TregexPattern, TsurgeonPattern>> pairs =
            getOperationsFromFile(arg, encoding, compiler);
        for (Pair<TregexPattern, TsurgeonPattern> pair : pairs) {
          if (verbose) {
            System.err.println(pair.second());
          }
          ops.add(pair);
        }
      }
    }

    for (Tree t : trees) {
      Tree original = t.deepCopy();
      Tree result = processPatternsOnTree(ops, t);
      if (argsMap.containsKey(matchedOption) && matchedOnTree) {
        pwOut.println("Operated on: ");
        displayTree(original, tp, pwOut);
        pwOut.println("Result: ");
      }
      displayTree(result, tp, pwOut);
    }
  }
Example #26
  public List<Pair<String, Double>> selectWeightedKeysWithSampling(
      ActiveLearningSelectionCriterion criterion, int numSamples, int seed) {
    List<Pair<String, Double>> result = new ArrayList<>();
    forceTrack("Sampling Keys");
    log("" + numSamples + " to collect");

    // Get uncertainty
    forceTrack("Computing Uncertainties");
    Counter<String> weightCounter = uncertainty(criterion);
    assert weightCounter.equals(uncertainty(criterion));
    endTrack("Computing Uncertainties");
    // Compute some statistics
    startTrack("Uncertainty Histogram");
    //    log(new Histogram(weightCounter, 50).toString());  // removed to make the release easier
    // (Histogram isn't in CoreNLP)
    endTrack("Uncertainty Histogram");
    double totalCount = weightCounter.totalCount();
    Random random = new Random(seed);

    // Flatten counter
    List<String> keys = new LinkedList<>();
    List<Double> weights = new LinkedList<>();
    List<String> zeroUncertaintyKeys = new LinkedList<>();
    for (Pair<String, Double> elem :
        Counters.toSortedListWithCounts(
            weightCounter,
            (o1, o2) -> {
              int value = o1.compareTo(o2);
              if (value == 0) {
                return o1.first.compareTo(o2.first);
              } else {
                return value;
              }
            })) {
      if (elem.second != 0.0
          || weightCounter.totalCount() == 0.0
          || weightCounter.size() <= numSamples) { // ignore 0 probability weights
        keys.add(elem.first);
        weights.add(elem.second);
      } else {
        zeroUncertaintyKeys.add(elem.first);
      }
    }

    // Error check
    if (Utils.assertionsEnabled()) {
      for (Double elem : weights) {
        if (!(elem >= 0 && !Double.isInfinite(elem) && !Double.isNaN(elem))) {
          throw new IllegalArgumentException("Invalid weight: " + elem);
        }
      }
    }

    // Sample
    SAMPLE_ITER:
    for (int i = 1; i <= numSamples; ++i) { // For each sample
      if (i % 1000 == 0) {
        // Debug log
        log("sampled " + (i / 1000) + "k keys");
        // Recompute total count to mitigate floating point errors
        totalCount = 0.0;
        for (double val : weights) {
          totalCount += val;
        }
      }
      if (weights.size() == 0) {
        continue;
      }
      assert totalCount >= 0.0;
      assert weights.size() == keys.size();
      double target = random.nextDouble() * totalCount;
      Iterator<String> keyIter = keys.iterator();
      Iterator<Double> weightIter = weights.iterator();
      double runningTotal = 0.0;
      while (keyIter.hasNext()) { // For each candidate
        String key = keyIter.next();
        double weight = weightIter.next();
        runningTotal += weight;
        if (target <= runningTotal) { // Select that sample
          result.add(Pair.makePair(key, weight));
          keyIter.remove();
          weightIter.remove();
          totalCount -= weight;
          continue SAMPLE_ITER; // continue sampling
        }
      }
      // We should get here only if the keys list is empty
      warn(
          "No more uncertain samples left to draw from! (target="
              + target
              + " totalCount="
              + totalCount
              + " size="
              + keys.size());
      assert keys.size() == 0;
      if (zeroUncertaintyKeys.size() > 0) {
        result.add(Pair.makePair(zeroUncertaintyKeys.remove(0), 0.0));
      } else {
        break;
      }
    }

    endTrack("Sampling Keys");
    return result;
  }
Example #27
  private void incrementDay(ISODateInstance referenceDate, Pair<DateField, Integer> relation) {
    String origDateString = referenceDate.getStartDate();
    String dayString =
        origDateString.substring(origDateString.length() - 2, origDateString.length());
    if (dayString.contains("*")) {
      isoDate = origDateString;
      return;
    }
    // Date is not a variable
    Integer dayNum = Integer.parseInt(dayString);
    String monthString =
        origDateString.substring(origDateString.length() - 4, origDateString.length() - 2);
    int numDaysInMonth = 30; // default - assume this if month is a variable
    int monthNum =
        -1; // ie, we don't know the month yet - this remains -1 if the month is a variable
    if (!monthString.contains("*")) {
      // Set appropriate numDaysInMonth and monthNum
      monthNum = Integer.parseInt(monthString);
      numDaysInMonth = daysPerMonth.get(monthNum);
    }

    // Now, find out if we're an edge case (potential to increment month)
    if (dayNum + relation.second() <= numDaysInMonth && dayNum + relation.second() >= 1) {
      // Not an edge case - just increment the day, create a new string, and return
      dayNum += relation.second();
      isoDate = makeStringDayChange(origDateString, dayNum);
      return;
    }

    // Since we're an edge case, the month can't be a variable - if it is a variable, just set this
    // to the reference string
    if (monthNum == -1) {
      isoDate = origDateString;
      return;
    }
    // At this point, neither our day nor our month is a variable
    isoDate = origDateString;
    boolean decreasing = (dayNum + relation.second() < 1);
    // Need to increment the month, set the date appropriately - we need the new month num to set
    // the day appropriately, so do month first
    int newMonthNum;
    // Now, check if we're an edge case for month
    if ((monthNum + 1 > 12 && !decreasing) || (monthNum - 1 < 1 && decreasing)) {
      // First, change the month
      if (decreasing) {
        newMonthNum = 12;
      } else {
        newMonthNum = 1;
      }
      // If we can, increment the year
      // TODO: fix this to work more nicely with variables and thus handle more cases
      String yearString = origDateString.substring(0, 4);
      if (!yearString.contains("*")) {
        if (decreasing) {
          isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) - 1);
        } else {
          isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) + 1);
        }
      }
    } else {
      // We're not an edge case for month - just increment
      if (decreasing) {
        newMonthNum = monthNum - 1;
      } else {
        newMonthNum = monthNum + 1;
      }
    }
    // do the increment
    isoDate = makeStringMonthChange(isoDate, newMonthNum);
    int newDateNum;
    if (decreasing) {
      newDateNum = -relation.second() + daysPerMonth.get(newMonthNum) - dayNum;
    } else {
      newDateNum = relation.second() - dayNum + daysPerMonth.get(monthNum);
    }
    // Now, change the day in our original string to be appropriate
    isoDate = makeStringDayChange(isoDate, newDateNum);
  }
Example #28
  /** @param args */
  public static void main(String[] args) {
    if (args.length != 3) {
      System.err.printf(
          "Usage: java %s language filename features%n",
          TreebankFactoredLexiconStats.class.getName());
      System.exit(-1);
    }

    Language language = Language.valueOf(args[0]);
    TreebankLangParserParams tlpp = language.params;
    if (language.equals(Language.Arabic)) {
      String[] options = {"-arabicFactored"};
      tlpp.setOptionFlag(options, 0);
    } else {
      String[] options = {"-frenchFactored"};
      tlpp.setOptionFlag(options, 0);
    }
    Treebank tb = tlpp.diskTreebank();
    tb.loadPath(args[1]);

    MorphoFeatureSpecification morphoSpec =
        language.equals(Language.Arabic)
            ? new ArabicMorphoFeatureSpecification()
            : new FrenchMorphoFeatureSpecification();

    String[] features = args[2].trim().split(",");
    for (String feature : features) {
      morphoSpec.activate(MorphoFeatureType.valueOf(feature));
    }

    // Counters
    Counter<String> wordTagCounter = new ClassicCounter<>(30000);
    Counter<String> morphTagCounter = new ClassicCounter<>(500);
    //    Counter<String> signatureTagCounter = new ClassicCounter<String>();
    Counter<String> morphCounter = new ClassicCounter<>(500);
    Counter<String> wordCounter = new ClassicCounter<>(30000);
    Counter<String> tagCounter = new ClassicCounter<>(300);

    Counter<String> lemmaCounter = new ClassicCounter<>(25000);
    Counter<String> lemmaTagCounter = new ClassicCounter<>(25000);

    Counter<String> richTagCounter = new ClassicCounter<>(1000);

    Counter<String> reducedTagCounter = new ClassicCounter<>(500);

    Counter<String> reducedTagLemmaCounter = new ClassicCounter<>(500);

    Map<String, Set<String>> wordLemmaMap = Generics.newHashMap();

    TwoDimensionalIntCounter<String, String> lemmaReducedTagCounter =
        new TwoDimensionalIntCounter<>(30000);
    TwoDimensionalIntCounter<String, String> reducedTagTagCounter =
        new TwoDimensionalIntCounter<>(500);
    TwoDimensionalIntCounter<String, String> tagReducedTagCounter =
        new TwoDimensionalIntCounter<>(300);

    int numTrees = 0;
    for (Tree tree : tb) {
      for (Tree subTree : tree) {
        if (!subTree.isLeaf()) {
          tlpp.transformTree(subTree, tree);
        }
      }
      List<Label> pretermList = tree.preTerminalYield();
      List<Label> yield = tree.yield();
      assert yield.size() == pretermList.size();

      int yieldLen = yield.size();
      for (int i = 0; i < yieldLen; ++i) {
        String tag = pretermList.get(i).value();

        String word = yield.get(i).value();
        String morph = ((CoreLabel) yield.get(i)).originalText();

        // Note: if there is no lemma, then we use the surface form.
        Pair<String, String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph);
        String lemma = lemmaTag.first();
        String richTag = lemmaTag.second();

        // WSGDEBUG
        if (tag.contains("MW")) lemma += "-MWE";

        lemmaCounter.incrementCount(lemma);
        lemmaTagCounter.incrementCount(lemma + tag);

        richTagCounter.incrementCount(richTag);

        String reducedTag = morphoSpec.strToFeatures(richTag).toString();
        reducedTagCounter.incrementCount(reducedTag);

        reducedTagLemmaCounter.incrementCount(reducedTag + lemma);

        wordTagCounter.incrementCount(word + tag);
        morphTagCounter.incrementCount(morph + tag);
        morphCounter.incrementCount(morph);
        wordCounter.incrementCount(word);
        tagCounter.incrementCount(tag);

        reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
        if (wordLemmaMap.containsKey(word)) {
          wordLemmaMap.get(word).add(lemma);
        } else {
          Set<String> lemmas = Generics.newHashSet(1);
          lemmas.add(lemma); // without this, a word seen only once would map to an empty lemma set
          wordLemmaMap.put(word, lemmas);
        }
        lemmaReducedTagCounter.incrementCount(lemma, reducedTag);
        reducedTagTagCounter.incrementCount(lemma + reducedTag, tag);
        tagReducedTagCounter.incrementCount(tag, reducedTag);
      }
      ++numTrees;
    }

    // Print the summary statistics
    System.out.println("Language: " + language.toString());
    System.out.printf("#trees:\t%d%n", numTrees);
    System.out.printf("#tokens:\t%d%n", (int) wordCounter.totalCount());
    System.out.printf("#words:\t%d%n", wordCounter.keySet().size());
    System.out.printf("#tags:\t%d%n", tagCounter.keySet().size());
    System.out.printf("#wordTagPairs:\t%d%n", wordTagCounter.keySet().size());
    System.out.printf("#lemmas:\t%d%n", lemmaCounter.keySet().size());
    System.out.printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.keySet().size());
    System.out.printf("#feattags:\t%d%n", reducedTagCounter.keySet().size());
    System.out.printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.keySet().size());
    System.out.printf("#richtags:\t%d%n", richTagCounter.keySet().size());
    System.out.printf("#richtag+lemma:\t%d%n", morphCounter.keySet().size());
    System.out.printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.keySet().size());

    // Extra
    System.out.println("==================");
    StringBuilder sbNoLemma = new StringBuilder();
    StringBuilder sbMultLemmas = new StringBuilder();
    for (Map.Entry<String, Set<String>> wordLemmas : wordLemmaMap.entrySet()) {
      String word = wordLemmas.getKey();
      Set<String> lemmas = wordLemmas.getValue();
      if (lemmas.size() == 0) {
        sbNoLemma.append("NO LEMMAS FOR WORD: " + word + "\n");
        continue;
      }
      if (lemmas.size() > 1) {
        sbMultLemmas.append("MULTIPLE LEMMAS: " + word + " " + setToString(lemmas) + "\n");
        continue;
      }
      String lemma = lemmas.iterator().next();
      Set<String> reducedTags = lemmaReducedTagCounter.getCounter(lemma).keySet();
      if (reducedTags.size() > 1) {
        System.out.printf("%s --> %s%n", word, lemma);
        for (String reducedTag : reducedTags) {
          int count = lemmaReducedTagCounter.getCount(lemma, reducedTag);
          String posTags =
              setToString(reducedTagTagCounter.getCounter(lemma + reducedTag).keySet());
          System.out.printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
        }
        System.out.println();
      }
    }
    System.out.println("==================");
    System.out.println(sbNoLemma.toString());
    System.out.println(sbMultLemmas.toString());
    System.out.println("==================");
    List<String> tags = new ArrayList<>(tagReducedTagCounter.firstKeySet());
    Collections.sort(tags);
    for (String tag : tags) {
      System.out.println(tag);
      Set<String> reducedTags = tagReducedTagCounter.getCounter(tag).keySet();
      for (String reducedTag : reducedTags) {
        int count = tagReducedTagCounter.getCount(tag, reducedTag);
        //        reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
        System.out.printf("\t%s\t%d%n", reducedTag, count);
      }
      System.out.println();
    }
    System.out.println("==================");
  }
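The program writes its statistics to standard output. A hypothetical invocation, following the usage message above (the treebank path is a placeholder, and GEN,NUM stand in for whichever MorphoFeatureType constants are wanted):

  java TreebankFactoredLexiconStats French /path/to/french-treebank GEN,NUM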
Exemplo n.º 29
0
    // when finished = false; break; is called, it means I successfully matched.
    @SuppressWarnings("null")
    private void goToNextNodeMatch() {
      decommitVariableGroups(); // make sure variable groups are free.
      decommitNamedNodes();
      decommitNamedRelations();
      finished = true;
      Matcher m = null;
      while (nodeMatchCandidateIterator.hasNext()) {
        if (myNode.reln.getName() != null) {
          String foundReln = namesToRelations.get(myNode.reln.getName());
          nextMatchReln = ((GraphRelation.SearchNodeIterator) nodeMatchCandidateIterator).getReln();
          if ((foundReln != null) && (!nextMatchReln.equals(foundReln))) {
            nextMatch = nodeMatchCandidateIterator.next();
            continue;
          }
        }

        nextMatch = nodeMatchCandidateIterator.next();
        // System.err.println("going to next match: " + nextMatch.word() + " " +
        // myNode.descString + " " + myNode.isLink);
        if (myNode.descString.equals("{}") && myNode.isLink) {
          IndexedWord otherNode = namesToNodes.get(myNode.name);
          if (otherNode != null) {
            if (otherNode.equals(nextMatch)) {
              if (!myNode.negDesc) {
                finished = false;
                break;
              }
            } else {
              if (myNode.negDesc) {
                finished = false;
                break;
              }
            }
          } else {
            boolean found = myNode.nodeAttrMatch(nextMatch, hyp ? sg : sg_aligned, ignoreCase);
            if (found) {
              for (Pair<Integer, String> varGroup : myNode.variableGroups) {
                // if variables have been captured from a regex, they
                // must match any previous matchings
                String thisVariable = varGroup.second();
                String thisVarString = variableStrings.getString(thisVariable);
                if (thisVarString != null && !thisVarString.equals(m.group(varGroup.first()))) {
                  // failed to match a variable
                  found = false;
                  break;
                }
              }

              // nodeAttrMatch already checks negDesc, so no need to
              // check for that here
              finished = false;
              break;
            }
          }
        } else { // try to match the description pattern.
          boolean found = myNode.nodeAttrMatch(nextMatch, hyp ? sg : sg_aligned, ignoreCase);
          if (found) {
            for (Pair<Integer, String> varGroup : myNode.variableGroups) {
              // if variables have been captured from a regex, they
              // must match any previous matchings
              String thisVariable = varGroup.second();
              String thisVarString = variableStrings.getString(thisVariable);
              if (thisVarString != null && !thisVarString.equals(m.group(varGroup.first()))) {
                // failed to match a variable
                found = false;
                break;
              }
            }

            // nodeAttrMatch already checks negDesc, so no need to
            // check for that here
            finished = false;
            break;
          }
        }
      } // end while

      if (!finished) { // I successfully matched.
        resetChild();
        if (myNode.name != null) {
          // note: have to fill in the map as we go for backreferencing
          if (!namesToNodes.containsKey(myNode.name)) {
            // System.err.println("making namedFirst");
            namedFirst = true;
          }
          // System.err.println("adding named node: " + myNode.name + "=" +
          // nextMatch.word());
          namesToNodes.put(myNode.name, nextMatch);
        }
        if (myNode.reln.getName() != null) {
          if (!namesToRelations.containsKey(myNode.reln.getName())) relnNamedFirst = true;
          namesToRelations.put(myNode.reln.getName(), nextMatchReln);
        }
        commitVariableGroups(m); // commit my variable groups.
      }
      // finished is false exiting this if and only if nextChild exists
      // and has a label or backreference that matches
      // (also it will just have been reset)
    }
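For context, goToNextNodeMatch() is internal machinery of the Semgrex matcher; from calling code the pattern and matcher are normally driven as in the sketch below (the pattern string, method name, and variable names are illustrative only):

import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;

public class SemgrexSketch {
  // Prints every "say" verb in the graph together with its nsubj dependent.
  public static void printSubjectsOfSay(SemanticGraph graph) {
    SemgrexPattern pattern = SemgrexPattern.compile("{lemma:say}=verb >nsubj {}=subject");
    SemgrexMatcher matcher = pattern.matcher(graph);
    while (matcher.find()) {
      IndexedWord verb = matcher.getNode("verb");
      IndexedWord subject = matcher.getNode("subject");
      System.out.printf("%s -> %s%n", verb.word(), subject.word());
    }
  }
}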