Exemplo n.º 1
0
 private void incrementMonth(ISODateInstance referenceDate, Pair<DateField, Integer> relation) {
   String origDateString = referenceDate.getStartDate();
   String monthString = origDateString.substring(4, 6);
   if (monthString.contains("*")) {
     isoDate = origDateString;
     return;
   }
   // Month is not a variable
   Integer monthNum = Integer.parseInt(monthString);
   // Check if we're an edge case
   if (((monthNum + relation.second()) > 12) || ((monthNum + relation.second) < 1)) {
     boolean decreasing = ((monthNum + relation.second) < 1);
     int newMonthNum = (monthNum + relation.second()) % 12;
     if (newMonthNum < 0) {
       newMonthNum *= -1;
     }
     // Set the month appropriately
     isoDate = makeStringMonthChange(origDateString, newMonthNum);
     // Increment the year if possible
     String yearString = origDateString.substring(0, 4);
     if (!yearString.contains("*")) {
       // How much we increment depends on above mod
       int numYearsToIncrement = (int) Math.ceil(relation.second() / 12.0);
       if (decreasing) {
         isoDate =
             makeStringYearChange(isoDate, Integer.parseInt(yearString) - numYearsToIncrement);
       } else {
         isoDate =
             makeStringYearChange(isoDate, Integer.parseInt(yearString) + numYearsToIncrement);
       }
     }
   } else {
     isoDate = makeStringMonthChange(origDateString, (monthNum + relation.second()));
   }
 }
Exemplo n.º 2
0
 public static Tree processPatternsOnTree(List<Pair<TregexPattern, TsurgeonPattern>> ops, Tree t) {
   matchedOnTree = false;
   for (Pair<TregexPattern, TsurgeonPattern> op : ops) {
     try {
       if (DEBUG) {
         System.err.println("Running pattern " + op.first());
       }
       TregexMatcher m = op.first().matcher(t);
       while (m.find()) {
         matchedOnTree = true;
         t = op.second().evaluate(t, m);
         if (t == null) {
           return null;
         }
         m = op.first().matcher(t);
       }
     } catch (NullPointerException npe) {
       throw new RuntimeException(
           "Tsurgeon.processPatternsOnTree failed to match label for pattern: "
               + op.first()
               + ", "
               + op.second(),
           npe);
     }
   }
   return t;
 }
Exemplo n.º 3
0
  /**
   * Uses regexp matching to match month, day, and year fields TODO: Find a way to mark what;s
   * already been handled in the string
   */
  public boolean extractFields(String inputDate) {

    if (tokens.size() < 2) {
      tokenizeDate(inputDate);
    }
    if (DEBUG) {
      System.err.println("Extracting date: " + inputDate);
    }
    // first we see if it's a hyphen and two parseable dates - if not, we treat it as one date
    Pair<String, String> dateEndpoints = getRangeDates(inputDate);
    if (dateEndpoints != null) {
      ISODateInstance date1 = new ISODateInstance(dateEndpoints.first());
      if (dateEndpoints.first().contains(" ") && !dateEndpoints.second().contains(" ")) {
        // consider whether it's a leading modifier; e.g., "June 8-10" will be split into June 8,
        // and 10 when really we'd like June 8 and June 10
        String date =
            dateEndpoints.first().substring(0, dateEndpoints.first().indexOf(' '))
                + ' '
                + dateEndpoints.second();
        ISODateInstance date2 = new ISODateInstance(date);
        if (!date1.isUnparseable() && !date2.isUnparseable()) {
          isoDate = (new ISODateInstance(date1, date2)).getDateString();
          return true;
        }
      }

      ISODateInstance date2 = new ISODateInstance(dateEndpoints.second());
      if (!date1.isUnparseable() && !date2.isUnparseable()) {
        isoDate = (new ISODateInstance(date1, date2)).getDateString();
        return true;
      }
    }

    if (extractYYYYMMDD(inputDate)) {
      return true;
    }
    if (extractMMDDYY(inputDate)) {
      return true;
    }
    boolean passed = false;
    passed = extractYear(inputDate) || passed;
    passed = extractMonth(inputDate) || passed;
    passed = extractDay(inputDate) || passed;

    // slightly hacky, but check for some common modifiers that get grouped into the date
    passed = addExtraRanges(inputDate) || passed;

    if (!passed) { // couldn't parse
      // try one more trick
      unparseable = true;
      boolean weekday = extractWeekday(inputDate);
      if (!weekday) {
        isoDate = inputDate;
      }
    }
    return passed;
  }
Exemplo n.º 4
0
 private void commitVariableGroups(Matcher m) {
   committedVariables = true; // commit all my variable groups.
   for (Pair<Integer, String> varGroup : myNode.variableGroups) {
     String thisVarString = m.group(varGroup.first());
     variableStrings.setVar(varGroup.second(), thisVarString);
   }
 }
Exemplo n.º 5
0
  /** Check one mention is the speaker of the other mention */
  public static boolean isSpeaker(Mention m, Mention ant, Dictionaries dict) {

    if (!dict.firstPersonPronouns.contains(ant.spanToString().toLowerCase())
        || ant.number == Number.PLURAL
        || ant.sentNum != m.sentNum) return false;

    int countQuotationMark = 0;
    for (int i = Math.min(m.headIndex, ant.headIndex) + 1;
        i < Math.max(m.headIndex, ant.headIndex);
        i++) {
      String word = m.sentenceWords.get(i).get(CoreAnnotations.TextAnnotation.class);
      if (word.equals("``") || word.equals("''")) countQuotationMark++;
    }
    if (countQuotationMark != 1) return false;

    IndexedWord w =
        m.dependency.getNodeByWordPattern(
            m.sentenceWords.get(m.headIndex).get(CoreAnnotations.TextAnnotation.class));
    if (w == null) return false;

    for (Pair<GrammaticalRelation, IndexedWord> parent : m.dependency.parentPairs(w)) {
      if (parent.first().getShortName().equals("nsubj")
          && dict.reportVerb.contains(parent.second().get(CoreAnnotations.LemmaAnnotation.class))) {
        return true;
      }
    }
    return false;
  }
Exemplo n.º 6
0
  private String findNextParagraphSpeaker(
      List<CoreMap> paragraph, int paragraphOffset, Dictionaries dict) {
    CoreMap lastSent = paragraph.get(paragraph.size() - 1);
    String speaker = "";
    for (CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) {
      if (w.get(CoreAnnotations.LemmaAnnotation.class).equals("report")
          || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) {
        String word = w.get(CoreAnnotations.TextAnnotation.class);
        SemanticGraph dependency =
            lastSent.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        IndexedWord t = dependency.getNodeByWordPattern(word);

        for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(t)) {
          if (child.first().getShortName().equals("nsubj")) {
            int subjectIndex = child.second().index(); // start from 1
            IntTuple headPosition = new IntTuple(2);
            headPosition.set(0, paragraph.size() - 1 + paragraphOffset);
            headPosition.set(1, subjectIndex - 1);
            if (mentionheadPositions.containsKey(headPosition)
                && mentionheadPositions.get(headPosition).nerString.startsWith("PER")) {
              speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
            }
          }
        }
      }
    }
    return speaker;
  }
Exemplo n.º 7
0
 private void decommitVariableGroups() {
   if (committedVariables) {
     for (Pair<Integer, String> varGroup : myNode.variableGroups) {
       variableStrings.unsetVar(varGroup.second());
     }
   }
   committedVariables = false;
 }
Exemplo n.º 8
0
  private boolean findSpeaker(
      int utterNum,
      int sentNum,
      List<CoreMap> sentences,
      int startIndex,
      int endIndex,
      Dictionaries dict) {
    List<CoreLabel> sent = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class);
    for (int i = startIndex; i < endIndex; i++) {
      if (sent.get(i).get(CoreAnnotations.UtteranceAnnotation.class) != 0) continue;
      String lemma = sent.get(i).get(CoreAnnotations.LemmaAnnotation.class);
      String word = sent.get(i).get(CoreAnnotations.TextAnnotation.class);
      if (dict.reportVerb.contains(lemma)) {
        // find subject
        SemanticGraph dependency =
            sentences
                .get(sentNum)
                .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        IndexedWord w = dependency.getNodeByWordPattern(word);

        if (w != null) {
          for (Pair<GrammaticalRelation, IndexedWord> child : dependency.childPairs(w)) {
            if (child.first().getShortName().equals("nsubj")) {
              String subjectString = child.second().word();
              int subjectIndex = child.second().index(); // start from 1
              IntTuple headPosition = new IntTuple(2);
              headPosition.set(0, sentNum);
              headPosition.set(1, subjectIndex - 1);
              String speaker;
              if (mentionheadPositions.containsKey(headPosition)) {
                speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
              } else {
                speaker = subjectString;
              }
              speakers.put(utterNum, speaker);
              return true;
            }
          }
        } else {
          SieveCoreferenceSystem.logger.warning("Cannot find node in dependency for word " + word);
        }
      }
    }
    return false;
  }
Exemplo n.º 9
0
  private void findQuotationSpeaker(
      int utterNum,
      List<CoreMap> sentences,
      Pair<Integer, Integer> beginQuotation,
      Pair<Integer, Integer> endQuotation,
      Dictionaries dict) {

    if (findSpeaker(utterNum, beginQuotation.first(), sentences, 0, beginQuotation.second(), dict))
      return;

    if (findSpeaker(
        utterNum,
        endQuotation.first(),
        sentences,
        endQuotation.second(),
        sentences.get(endQuotation.first()).get(CoreAnnotations.TokensAnnotation.class).size(),
        dict)) return;

    if (beginQuotation.second() <= 1 && beginQuotation.first() > 0) {
      if (findSpeaker(
          utterNum,
          beginQuotation.first() - 1,
          sentences,
          0,
          sentences
              .get(beginQuotation.first() - 1)
              .get(CoreAnnotations.TokensAnnotation.class)
              .size(),
          dict)) return;
    }

    if (endQuotation.second() == sentences.get(endQuotation.first()).size() - 1
        && sentences.size() > endQuotation.first() + 1) {
      if (findSpeaker(
          utterNum,
          endQuotation.first() + 1,
          sentences,
          0,
          sentences
              .get(endQuotation.first() + 1)
              .get(CoreAnnotations.TokensAnnotation.class)
              .size(),
          dict)) return;
    }
  }
Exemplo n.º 10
0
 /**
  * Compares this <code>Pair</code> to another object. If the object is a <code>Pair</code>, this
  * function will work providing the elements of the <code>Pair</code> are themselves comparable.
  * It will then return a value based on the pair of objects, where <code>
  * p &gt; q iff p.first() &gt; q.first() ||
  * (p.first().equals(q.first()) && p.second() &gt; q.second())</code>. If the other object is not
  * a <code>Pair</code>, it throws a <code>ClassCastException</code>.
  *
  * @param o the <code>Object</code> to be compared.
  * @return the value <code>0</code> if the argument is a <code>Pair</code> equal to this <code>
  *     Pair</code>; a value less than <code>0</code> if the argument is a <code>Pair</code>
  *     greater than this <code>Pair</code>; and a value greater than <code>0</code> if the
  *     argument is a <code>Pair</code> less than this <code>Pair</code>.
  * @throws ClassCastException if the argument is not a <code>Pair</code>.
  * @see java.lang.Comparable
  */
 public int compareTo(Object o) {
   Pair another = (Pair) o;
   int comp = ((Comparable) first()).compareTo(another.first());
   if (comp != 0) {
     return comp;
   } else {
     return ((Comparable) second()).compareTo(another.second());
   }
 }
 public String expandStringRegex(String regex) {
   // Replace all variables in regex
   String expanded = regex;
   for (String v : stringRegexVariables.keySet()) {
     Pair<Pattern, String> p = stringRegexVariables.get(v);
     expanded = p.first().matcher(expanded).replaceAll(p.second());
   }
   return expanded;
 }
Exemplo n.º 12
0
 /**
  * Read a string representation of a Pair from a DataStream. This might not work correctly unless
  * the pair of objects are of type <code>String</code>.
  */
 public static Pair<String, String> readStringPair(DataInputStream in) {
   Pair<String, String> p = new Pair<String, String>();
   try {
     p.first = in.readUTF();
     p.second = in.readUTF();
   } catch (Exception e) {
     e.printStackTrace();
   }
   return p;
 }
Exemplo n.º 13
0
 private void incrementYear(ISODateInstance referenceDate, Pair<DateField, Integer> relation) {
   String origDateString = referenceDate.getStartDate();
   String yearString = origDateString.substring(0, 4);
   if (yearString.contains("*")) {
     isoDate = origDateString;
     return;
   }
   isoDate =
       makeStringYearChange(origDateString, Integer.parseInt(yearString) + relation.second());
 }
Exemplo n.º 14
0
 private static void extractSubtrees(List<String> codeStrings, String treeFile) {
   List<Pair<Integer, Integer>> codes = new ArrayList<Pair<Integer, Integer>>();
   for (String s : codeStrings) {
     Matcher m = codePattern.matcher(s);
     if (m.matches())
       codes.add(
           new Pair<Integer, Integer>(Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2))));
     else throw new RuntimeException("Error: illegal node code " + s);
   }
   TreeReaderFactory trf = new TRegexTreeReaderFactory();
   MemoryTreebank treebank = new MemoryTreebank(trf);
   treebank.loadPath(treeFile, null, true);
   for (Pair<Integer, Integer> code : codes) {
     Tree t = treebank.get(code.first() - 1);
     t.getNodeNumber(code.second()).pennPrint();
   }
 }
 /**
  * Returns true if there is a feasible combination of child branch ids that causes all child
  * expressions to be satisfied with respect to the specified child expression (assuming
  * satisfiction with the specified branch and node index) For other child expressions to have a
  * compatible satisfiable branch, that branch must also terminate with the same node index as
  * this one.
  *
  * @param index - Index of the child expression
  * @param bid - Branch id that causes the indexed child to be satisfied
  * @param pos - Node index that causes the indexed child to be satisfied
  * @return whether there is a feasible combination that causes all children to be satisfied with
  *     respect to specfied child.
  */
 private boolean isAllChildMatched(int index, int bid, int pos) {
   for (int i = 0; i < reachableChildBids.length; i++) {
     Set<Pair<Integer, Integer>> v = reachableChildBids[i];
     if (v == null || v.isEmpty()) return false;
     if (i != index) {
       boolean ok = false;
       for (Pair<Integer, Integer> p : v) {
         if (p.second() == pos) {
           ok = true;
           break;
         }
       }
       if (!ok) {
         return false;
       }
     }
   }
   return true;
 }
 /**
  * Returns array of child branch ids that causes all child expressions to be satisfied with
  * respect to the specified child expression (assuming satisfiction with the specified branch
  * and node index) For other child expressions to have a compatible satisfiable branch, that
  * branch must also terminate with the same node index as this one.
  *
  * @param index - Index of the child expression
  * @param bid - Branch id that causes the indexed child to be satisfied
  * @param pos - Node index that causes the indexed child to be satisfied
  * @return array of child branch ids if there is a valid combination null otherwise
  */
 private int[] getAllChildMatchedBids(int index, int bid, int pos) {
   int[] matchedBids = new int[reachableChildBids.length];
   for (int i = 0; i < reachableChildBids.length; i++) {
     Set<Pair<Integer, Integer>> v = reachableChildBids[i];
     if (v == null || v.isEmpty()) return null;
     if (i != index) {
       boolean ok = false;
       for (Pair<Integer, Integer> p : v) {
         if (p.second() == pos) {
           ok = true;
           matchedBids[i] = p.first();
           break;
         }
       }
       if (!ok) {
         return null;
       }
     } else {
       matchedBids[i] = bid;
     }
   }
   return matchedBids;
 }
 protected <T> boolean match(
     int bid, SequenceMatcher.MatchedStates<T> matchedStates, boolean consume) {
   // Try to match previous node/nodes exactly
   if (consume) {
     // First element is group that is matched, second is number of nodes matched so far
     Pair<SequenceMatcher.MatchedGroup, Integer> backRefState =
         (Pair<SequenceMatcher.MatchedGroup, Integer>)
             matchedStates.getBranchStates().getMatchStateInfo(bid, this);
     if (backRefState == null) {
       // Haven't tried to match this node before, try now
       // Get element and return if it matched or not
       SequenceMatcher.MatchedGroup matchedGroup =
           matchedStates.getBranchStates().getMatchedGroup(bid, captureGroupId);
       if (matchedGroup != null) {
         // See if the first node matches
         if (matchedGroup.matchEnd > matchedGroup.matchBegin) {
           boolean matched = match(bid, matchedStates, matchedGroup, 0);
           return matched;
         } else {
           // TODO: Check handling of previous nodes that are zero elements?
           return super.match(bid, matchedStates, consume);
         }
       }
       return false;
     } else {
       SequenceMatcher.MatchedGroup matchedGroup = backRefState.first();
       int matchedNodes = backRefState.second();
       boolean matched = match(bid, matchedStates, matchedGroup, matchedNodes);
       return matched;
     }
   } else {
     // Not consuming, just add this state back to list of states to be processed
     matchedStates.addState(bid, this);
     return false;
   }
 }
  /** @param args */
  public static void main(String[] args) {
    if (args.length != 3) {
      System.err.printf(
          "Usage: java %s language filename features%n",
          TreebankFactoredLexiconStats.class.getName());
      System.exit(-1);
    }

    Language language = Language.valueOf(args[0]);
    TreebankLangParserParams tlpp = language.params;
    if (language.equals(Language.Arabic)) {
      String[] options = {"-arabicFactored"};
      tlpp.setOptionFlag(options, 0);
    } else {
      String[] options = {"-frenchFactored"};
      tlpp.setOptionFlag(options, 0);
    }
    Treebank tb = tlpp.diskTreebank();
    tb.loadPath(args[1]);

    MorphoFeatureSpecification morphoSpec =
        language.equals(Language.Arabic)
            ? new ArabicMorphoFeatureSpecification()
            : new FrenchMorphoFeatureSpecification();

    String[] features = args[2].trim().split(",");
    for (String feature : features) {
      morphoSpec.activate(MorphoFeatureType.valueOf(feature));
    }

    // Counters
    Counter<String> wordTagCounter = new ClassicCounter<>(30000);
    Counter<String> morphTagCounter = new ClassicCounter<>(500);
    //    Counter<String> signatureTagCounter = new ClassicCounter<String>();
    Counter<String> morphCounter = new ClassicCounter<>(500);
    Counter<String> wordCounter = new ClassicCounter<>(30000);
    Counter<String> tagCounter = new ClassicCounter<>(300);

    Counter<String> lemmaCounter = new ClassicCounter<>(25000);
    Counter<String> lemmaTagCounter = new ClassicCounter<>(25000);

    Counter<String> richTagCounter = new ClassicCounter<>(1000);

    Counter<String> reducedTagCounter = new ClassicCounter<>(500);

    Counter<String> reducedTagLemmaCounter = new ClassicCounter<>(500);

    Map<String, Set<String>> wordLemmaMap = Generics.newHashMap();

    TwoDimensionalIntCounter<String, String> lemmaReducedTagCounter =
        new TwoDimensionalIntCounter<>(30000);
    TwoDimensionalIntCounter<String, String> reducedTagTagCounter =
        new TwoDimensionalIntCounter<>(500);
    TwoDimensionalIntCounter<String, String> tagReducedTagCounter =
        new TwoDimensionalIntCounter<>(300);

    int numTrees = 0;
    for (Tree tree : tb) {
      for (Tree subTree : tree) {
        if (!subTree.isLeaf()) {
          tlpp.transformTree(subTree, tree);
        }
      }
      List<Label> pretermList = tree.preTerminalYield();
      List<Label> yield = tree.yield();
      assert yield.size() == pretermList.size();

      int yieldLen = yield.size();
      for (int i = 0; i < yieldLen; ++i) {
        String tag = pretermList.get(i).value();

        String word = yield.get(i).value();
        String morph = ((CoreLabel) yield.get(i)).originalText();

        // Note: if there is no lemma, then we use the surface form.
        Pair<String, String> lemmaTag = MorphoFeatureSpecification.splitMorphString(word, morph);
        String lemma = lemmaTag.first();
        String richTag = lemmaTag.second();

        // WSGDEBUG
        if (tag.contains("MW")) lemma += "-MWE";

        lemmaCounter.incrementCount(lemma);
        lemmaTagCounter.incrementCount(lemma + tag);

        richTagCounter.incrementCount(richTag);

        String reducedTag = morphoSpec.strToFeatures(richTag).toString();
        reducedTagCounter.incrementCount(reducedTag);

        reducedTagLemmaCounter.incrementCount(reducedTag + lemma);

        wordTagCounter.incrementCount(word + tag);
        morphTagCounter.incrementCount(morph + tag);
        morphCounter.incrementCount(morph);
        wordCounter.incrementCount(word);
        tagCounter.incrementCount(tag);

        reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
        if (wordLemmaMap.containsKey(word)) {
          wordLemmaMap.get(word).add(lemma);
        } else {
          Set<String> lemmas = Generics.newHashSet(1);
          wordLemmaMap.put(word, lemmas);
        }
        lemmaReducedTagCounter.incrementCount(lemma, reducedTag);
        reducedTagTagCounter.incrementCount(lemma + reducedTag, tag);
        tagReducedTagCounter.incrementCount(tag, reducedTag);
      }
      ++numTrees;
    }

    // Barf...
    System.out.println("Language: " + language.toString());
    System.out.printf("#trees:\t%d%n", numTrees);
    System.out.printf("#tokens:\t%d%n", (int) wordCounter.totalCount());
    System.out.printf("#words:\t%d%n", wordCounter.keySet().size());
    System.out.printf("#tags:\t%d%n", tagCounter.keySet().size());
    System.out.printf("#wordTagPairs:\t%d%n", wordTagCounter.keySet().size());
    System.out.printf("#lemmas:\t%d%n", lemmaCounter.keySet().size());
    System.out.printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.keySet().size());
    System.out.printf("#feattags:\t%d%n", reducedTagCounter.keySet().size());
    System.out.printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.keySet().size());
    System.out.printf("#richtags:\t%d%n", richTagCounter.keySet().size());
    System.out.printf("#richtag+lemma:\t%d%n", morphCounter.keySet().size());
    System.out.printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.keySet().size());

    // Extra
    System.out.println("==================");
    StringBuilder sbNoLemma = new StringBuilder();
    StringBuilder sbMultLemmas = new StringBuilder();
    for (Map.Entry<String, Set<String>> wordLemmas : wordLemmaMap.entrySet()) {
      String word = wordLemmas.getKey();
      Set<String> lemmas = wordLemmas.getValue();
      if (lemmas.size() == 0) {
        sbNoLemma.append("NO LEMMAS FOR WORD: " + word + "\n");
        continue;
      }
      if (lemmas.size() > 1) {
        sbMultLemmas.append("MULTIPLE LEMMAS: " + word + " " + setToString(lemmas) + "\n");
        continue;
      }
      String lemma = lemmas.iterator().next();
      Set<String> reducedTags = lemmaReducedTagCounter.getCounter(lemma).keySet();
      if (reducedTags.size() > 1) {
        System.out.printf("%s --> %s%n", word, lemma);
        for (String reducedTag : reducedTags) {
          int count = lemmaReducedTagCounter.getCount(lemma, reducedTag);
          String posTags =
              setToString(reducedTagTagCounter.getCounter(lemma + reducedTag).keySet());
          System.out.printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
        }
        System.out.println();
      }
    }
    System.out.println("==================");
    System.out.println(sbNoLemma.toString());
    System.out.println(sbMultLemmas.toString());
    System.out.println("==================");
    List<String> tags = new ArrayList<>(tagReducedTagCounter.firstKeySet());
    Collections.sort(tags);
    for (String tag : tags) {
      System.out.println(tag);
      Set<String> reducedTags = tagReducedTagCounter.getCounter(tag).keySet();
      for (String reducedTag : reducedTags) {
        int count = tagReducedTagCounter.getCount(tag, reducedTag);
        //        reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
        System.out.printf("\t%s\t%d%n", reducedTag, count);
      }
      System.out.println();
    }
    System.out.println("==================");
  }
Exemplo n.º 19
0
    // when finished = false; break; is called, it means I successfully matched.
    @SuppressWarnings("null")
    private void goToNextNodeMatch() {
      decommitVariableGroups(); // make sure variable groups are free.
      decommitNamedNodes();
      decommitNamedRelations();
      finished = true;
      Matcher m = null;
      while (nodeMatchCandidateIterator.hasNext()) {
        if (myNode.reln.getName() != null) {
          String foundReln = namesToRelations.get(myNode.reln.getName());
          nextMatchReln = ((GraphRelation.SearchNodeIterator) nodeMatchCandidateIterator).getReln();
          if ((foundReln != null) && (!nextMatchReln.equals(foundReln))) {
            nextMatch = nodeMatchCandidateIterator.next();
            continue;
          }
        }

        nextMatch = nodeMatchCandidateIterator.next();
        // System.err.println("going to next match: " + nextMatch.word() + " " +
        // myNode.descString + " " + myNode.isLink);
        if (myNode.descString.equals("{}") && myNode.isLink) {
          IndexedWord otherNode = namesToNodes.get(myNode.name);
          if (otherNode != null) {
            if (otherNode.equals(nextMatch)) {
              if (!myNode.negDesc) {
                finished = false;
                break;
              }
            } else {
              if (myNode.negDesc) {
                finished = false;
                break;
              }
            }
          } else {
            boolean found = myNode.nodeAttrMatch(nextMatch, hyp ? sg : sg_aligned, ignoreCase);
            if (found) {
              for (Pair<Integer, String> varGroup : myNode.variableGroups) {
                // if variables have been captured from a regex, they
                // must match any previous matchings
                String thisVariable = varGroup.second();
                String thisVarString = variableStrings.getString(thisVariable);
                if (thisVarString != null && !thisVarString.equals(m.group(varGroup.first()))) {
                  // failed to match a variable
                  found = false;
                  break;
                }
              }

              // nodeAttrMatch already checks negDesc, so no need to
              // check for that here
              finished = false;
              break;
            }
          }
        } else { // try to match the description pattern.
          boolean found = myNode.nodeAttrMatch(nextMatch, hyp ? sg : sg_aligned, ignoreCase);
          if (found) {
            for (Pair<Integer, String> varGroup : myNode.variableGroups) {
              // if variables have been captured from a regex, they
              // must match any previous matchings
              String thisVariable = varGroup.second();
              String thisVarString = variableStrings.getString(thisVariable);
              if (thisVarString != null && !thisVarString.equals(m.group(varGroup.first()))) {
                // failed to match a variable
                found = false;
                break;
              }
            }

            // nodeAttrMatch already checks negDesc, so no need to
            // check for that here
            finished = false;
            break;
          }
        }
      } // end while

      if (!finished) { // I successfully matched.
        resetChild();
        if (myNode.name != null) {
          // note: have to fill in the map as we go for backreferencing
          if (!namesToNodes.containsKey(myNode.name)) {
            // System.err.println("making namedFirst");
            namedFirst = true;
          }
          // System.err.println("adding named node: " + myNode.name + "=" +
          // nextMatch.word());
          namesToNodes.put(myNode.name, nextMatch);
        }
        if (myNode.reln.getName() != null) {
          if (!namesToRelations.containsKey(myNode.reln.getName())) relnNamedFirst = true;
          namesToRelations.put(myNode.reln.getName(), nextMatchReln);
        }
        commitVariableGroups(m); // commit my variable groups.
      }
      // finished is false exiting this if and only if nextChild exists
      // and has a label or backreference that matches
      // (also it will just have been reset)
    }
Exemplo n.º 20
0
  /**
   * Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile file-with-trees [-po
   * matching-pattern operation] operation-file-1 operation-file-2 ... operation-file-n
   *
   * <h4>Arguments:</h4>
   *
   * Each argument should be the name of a transformation file that contains a list of pattern and
   * transformation operation list pairs. That is, it is a sequence of pairs of a {@link
   * TregexPattern} pattern on one or more lines, then a blank line (empty or whitespace), then a
   * list of transformation operations one per line (as specified by <b>Legal operation syntax</b>
   * below) to apply when the pattern is matched, and then another blank line (empty or whitespace).
   * Note the need for blank lines: The code crashes if they are not present as separators (although
   * the blank line at the end of the file can be omitted). The script file can include comment
   * lines, either whole comment lines or trailing comments introduced by %, which extend to the end
   * of line. A needed percent mark can be escaped by a preceding backslash.
   *
   * <p>For example, if you want to excise an SBARQ node whenever it is the parent of an SQ node,
   * and relabel the SQ node to S, your transformation file would look like this:
   *
   * <blockquote>
   *
   * <code>
   *    SBARQ=n1 &lt; SQ=n2<br>
   *    <br>
   *    excise n1 n1<br>
   *    relabel n2 S
   * </code>
   *
   * </blockquote>
   *
   * <p>
   *
   * <h4>Options:</h4>
   *
   * <ul>
   *   <li><code>-treeFile &#60;filename&#62;</code> specify the name of the file that has the trees
   *       you want to transform.
   *   <li><code>-po &#60;matchPattern&#62; &#60;operation&#62;</code> Apply a single operation to
   *       every tree using the specified match pattern and the specified operation. Use this option
   *       when you want to quickly try the effect of one pattern/surgery combination, and are too
   *       lazy to write a transformation file.
   *   <li><code>-s</code> Print each output tree on one line (default is pretty-printing).
   *   <li><code>-m</code> For every tree that had a matching pattern, print "before" (prepended as
   *       "Operated on:") and "after" (prepended as "Result:"). Unoperated trees just pass through
   *       the transducer as usual.
   *   <li><code>-encoding X</code> Uses character set X for input and output of trees.
   *   <li><code>-macros &#60;filename&#62;</code> A file of macros to use on the tregex pattern.
   *       Macros should be one per line, with original and replacement separated by tabs.
   *   <li><code>-hf &lt;headfinder-class-name&gt;</code> use the specified {@link HeadFinder} class
   *       to determine headship relations.
   *   <li><code>-hfArg &lt;string&gt;</code> pass a string argument in to the {@link HeadFinder}
   *       class's constructor. <code>-hfArg</code> can be used multiple times to pass in multiple
   *       arguments.
   *   <li><code>-trf &lt;TreeReaderFactory-class-name&gt;</code> use the specified {@link
   *       TreeReaderFactory} class to read trees from files.
   * </ul>
   *
   * <h4>Legal operation syntax:</h4>
   *
   * <ul>
   *   <li><code>delete &#60;name&#62;</code> deletes the node and everything below it.
   *   <li><code>prune &#60;name&#62;</code> Like delete, but if, after the pruning, the parent has
   *       no children anymore, the parent is pruned too. Pruning continues to affect all ancestors
   *       until one is found with remaining children. This may result in a null tree.
   *   <li><code>excise &#60;name1&#62; &#60;name2&#62;</code> The name1 node should either dominate
   *       or be the same as the name2 node. This excises out everything from name1 to name2. All
   *       the children of name2 go into the parent of name1, where name1 was.
   *   <li><code>relabel &#60;name&#62; &#60;new-label&#62;</code> Relabels the node to have the new
   *       label. <br>
   *       There are three possible forms: <br>
   *       <code>relabel nodeX VP</code> - for changing a node label to an alphanumeric string <br>
   *       <code>relabel nodeX /''/</code> - for relabeling a node to something that isn't a valid
   *       identifier without quoting <br>
   *       <code>relabel nodeX /^VB(.*)$/verb\\/$1/</code> - for regular expression based
   *       relabeling. In this case, all matches of the regular expression against the node label
   *       are replaced with the replacement String. This has the semantics of Java/Perl's
   *       replaceAll: you may use capturing groups and put them in replacements with $n. For
   *       example, if the pattern is /foo/bar/ and the node matched is "foo", the replaceAll
   *       semantics result in "barbar". If the pattern is /^foo(.*)$/bar$1/ and node matched is
   *       "foofoo", relabel will result in "barfoo". <br>
   *       When using the regex replacement method, you can also use the sequences ={node} and
   *       %{var} in the replacement string to use captured nodes or variable strings in the
   *       replacement string. For example, if the Tregex pattern was "duck=bar" and the relabel is
   *       /foo/={bar}/, "foofoo" will be replaced with "duckduck". <br>
   *       To concatenate two nodes named in the tregex pattern, for example, you can use the
   *       pattern /^.*$/={foo}={bar}/. Note that the ^.*$ is necessary to make sure the regex
   *       pattern only matches and replaces once on the entire node name. <br>
   *       To get an "=" or a "%" in the replacement, using \ escaping. Also, as in the example you
   *       can escape a slash in the middle of the second and third forms with \\/ and \\\\. <br>
   *   <li><code>insert &#60;name&#62; &#60;position&#62;</code> or <code>
   *       insert &lt;tree&gt; &#60;position&#62;</code> inserts the named node or tree into the
   *       position specified.
   *   <li><code>move &#60;name&#62; &#60;position&#62;</code> moves the named node into the
   *       specified position.
   *       <p>Right now the only ways to specify position are:
   *       <p><code>$+ &#60;name&#62;</code> the left sister of the named node<br>
   *       <code>$- &#60;name&#62;</code> the right sister of the named node<br>
   *       <code>&gt;i &#60;name&#62;</code> the i_th daughter of the named node<br>
   *       <code>&gt;-i &#60;name&#62;</code> the i_th daughter, counting from the right, of the
   *       named node.
   *   <li><code>replace &#60;name1&#62; &#60;name2&#62;</code> deletes name1 and inserts a copy of
   *       name2 in its place.
   *   <li><code>replace &#60;name&#62; &#60;tree&#62; &#60;tree2&#62;...</code> deletes name and
   *       inserts the new tree(s) in its place. If more than one replacement tree is given, each of
   *       the new subtrees will be added in order where the old tree was. Multiple subtrees at the
   *       root is an illegal operation and will throw an exception.
   *   <li>{@code createSubtree <new-label> <name1> [<name2>]} Create a subtree out of all the nodes
   *       from {@code <name1>} through {@code <name2>} and puts the new subtree where that span
   *       used to be. To limit the operation to just one node, elide {@code <name2>}.
   *   <li><code>adjoin &#60;auxiliary_tree&#62; &lt;name&gt;</code> Adjoins the specified auxiliary
   *       tree into the named node. The daughters of the target node will become the daughters of
   *       the foot of the auxiliary tree.
   *   <li><code>adjoinH &#60;auxiliary_tree&#62; &lt;name&gt;</code> Similar to adjoin, but
   *       preserves the target node and makes it the root of &lt;tree&gt;. (It is still accessible
   *       as <code>name</code>. The root of the auxiliary tree is ignored.)
   *   <li><code>adjoinF &#60;auxiliary_tree&#62; &lt;name&gt;</code> Similar to adjoin, but
   *       preserves the target node and makes it the foot of &lt;tree&gt;. (It is still accessible
   *       as <code>name</code>, and retains its status as parent of its children. The root of the
   *       auxiliary tree is ignored.)
   *   <li>
   *   <dt><code>coindex &#60;name1&#62; &#60;name2&#62; ... &#60;nameM&#62; </code> Puts a (Penn
   *       Treebank style) coindexation suffix of the form "-N" on each of nodes name_1 through
   *       name_m. The value of N will be automatically generated in reference to the existing
   *       coindexations in the tree, so that there is never an accidental clash of indices across
   *       things that are not meant to be coindexed.
   * </ul>
   *
   * <p>In the context of <code>adjoin</code>, <code>adjoinH</code>, and <code>adjoinF</code>, an
   * auxiliary tree is a tree in Penn Treebank format with <code>@</code> on exactly one of the
   * leaves denoting the foot of the tree. The operations which use the foot use the labeled node.
   * For example: <br>
   * Tsurgeon: <code>adjoin (FOO (BAR@)) foo</code> <br>
   * Tregex: <code>B=foo</code> <br>
   * Input: <code>(A (B 1 2))</code> Output: <code>(A (FOO (BAR 1 2)))</code>
   *
   * <p>Tsurgeon applies the same operation to the same tree for as long as the given tregex
   * operation matches. This means that infinite loops are very easy to cause. One common situation
   * where this comes up is with an insert operation will repeats infinitely many times unless you
   * add an expression to the tregex that matches against the inserted pattern. For example, this
   * pattern will infinite loop:
   *
   * <blockquote>
   *
   * <code>
   *   TregexPattern tregex = TregexPattern.compile("S=node &lt;&lt; NP"); <br>
   *   TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) &gt;-1 node");
   * </code>
   *
   * </blockquote>
   *
   * This pattern, though, will terminate:
   *
   * <blockquote>
   *
   * <code>
   *   TregexPattern tregex = TregexPattern.compile("S=node &lt;&lt; NP !&lt;&lt; foo"); <br>
   *   TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) &gt;-1 node");
   * </code>
   *
   * </blockquote>
   *
   * <p>Tsurgeon has (very) limited support for conditional statements. If a pattern is prefaced
   * with <code>if exists &lt;name&gt;</code>, the rest of the pattern will only execute if the
   * named node was found in the corresponding TregexMatcher.
   *
   * @param args a list of names of files each of which contains a single tregex matching pattern
   *     plus a list, one per line, of transformation operations to apply to the matched pattern.
   * @throws Exception If an I/O or pattern syntax error
   */
  public static void main(String[] args) throws Exception {
    String headFinderClassName = null;
    String headFinderOption = "-hf";
    String[] headFinderArgs = null;
    String headFinderArgOption = "-hfArg";
    String encoding = "UTF-8";
    String encodingOption = "-encoding";
    if (args.length == 0) {
      System.err.println(
          "Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile <file-with-trees> [-po <matching-pattern> <operation>] <operation-file-1> <operation-file-2> ... <operation-file-n>");
      System.exit(0);
    }
    String treePrintFormats;
    String singleLineOption = "-s";
    String verboseOption = "-v";
    String matchedOption =
        "-m"; // if set, then print original form of trees that are matched & thus operated on
    String patternOperationOption = "-po";
    String treeFileOption = "-treeFile";
    String trfOption = "-trf";
    String macroOption = "-macros";
    String macroFilename = "";
    Map<String, Integer> flagMap = Generics.newHashMap();
    flagMap.put(patternOperationOption, 2);
    flagMap.put(treeFileOption, 1);
    flagMap.put(trfOption, 1);
    flagMap.put(singleLineOption, 0);
    flagMap.put(encodingOption, 1);
    flagMap.put(headFinderOption, 1);
    flagMap.put(macroOption, 1);
    Map<String, String[]> argsMap = StringUtils.argsToMap(args, flagMap);
    args = argsMap.get(null);

    if (argsMap.containsKey(headFinderOption))
      headFinderClassName = argsMap.get(headFinderOption)[0];
    if (argsMap.containsKey(headFinderArgOption)) headFinderArgs = argsMap.get(headFinderArgOption);
    if (argsMap.containsKey(verboseOption)) verbose = true;
    if (argsMap.containsKey(singleLineOption)) treePrintFormats = "oneline,";
    else treePrintFormats = "penn,";
    if (argsMap.containsKey(encodingOption)) encoding = argsMap.get(encodingOption)[0];
    if (argsMap.containsKey(macroOption)) macroFilename = argsMap.get(macroOption)[0];

    TreePrint tp = new TreePrint(treePrintFormats, new PennTreebankLanguagePack());
    PrintWriter pwOut = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);

    TreeReaderFactory trf;
    if (argsMap.containsKey(trfOption)) {
      String trfClass = argsMap.get(trfOption)[0];
      trf = ReflectionLoading.loadByReflection(trfClass);
    } else {
      trf = new TregexPattern.TRegexTreeReaderFactory();
    }

    Treebank trees = new DiskTreebank(trf, encoding);
    if (argsMap.containsKey(treeFileOption)) {
      trees.loadPath(argsMap.get(treeFileOption)[0]);
    }
    List<Pair<TregexPattern, TsurgeonPattern>> ops =
        new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();

    TregexPatternCompiler compiler;
    if (headFinderClassName == null) {
      compiler = new TregexPatternCompiler();
    } else {
      HeadFinder hf;
      if (headFinderArgs == null) {
        hf = ReflectionLoading.loadByReflection(headFinderClassName);
      } else {
        hf = ReflectionLoading.loadByReflection(headFinderClassName, (Object[]) headFinderArgs);
      }
      compiler = new TregexPatternCompiler(hf);
    }
    Macros.addAllMacros(compiler, macroFilename, encoding);
    if (argsMap.containsKey(patternOperationOption)) {
      TregexPattern matchPattern = compiler.compile(argsMap.get(patternOperationOption)[0]);
      TsurgeonPattern p = parseOperation(argsMap.get(patternOperationOption)[1]);
      ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p));
    } else {
      for (String arg : args) {
        List<Pair<TregexPattern, TsurgeonPattern>> pairs =
            getOperationsFromFile(arg, encoding, compiler);
        for (Pair<TregexPattern, TsurgeonPattern> pair : pairs) {
          if (verbose) {
            System.err.println(pair.second());
          }
          ops.add(pair);
        }
      }
    }

    for (Tree t : trees) {
      Tree original = t.deepCopy();
      Tree result = processPatternsOnTree(ops, t);
      if (argsMap.containsKey(matchedOption) && matchedOnTree) {
        pwOut.println("Operated on: ");
        displayTree(original, tp, pwOut);
        pwOut.println("Result: ");
      }
      displayTree(result, tp, pwOut);
    }
  }
Exemplo n.º 21
0
  private void incrementDay(ISODateInstance referenceDate, Pair<DateField, Integer> relation) {
    String origDateString = referenceDate.getStartDate();
    String dayString =
        origDateString.substring(origDateString.length() - 2, origDateString.length());
    if (dayString.contains("*")) {
      isoDate = origDateString;
      return;
    }
    // Date is not a variable
    Integer dayNum = Integer.parseInt(dayString);
    String monthString =
        origDateString.substring(origDateString.length() - 4, origDateString.length() - 2);
    int numDaysInMonth = 30; // default - assume this if month is a variable
    int monthNum =
        -1; // ie, we don't know the month yet - this remains -1 if the month is a variable
    if (!monthString.contains("*")) {
      // Set appropriate numDaysInMonth and monthNum
      monthNum = Integer.parseInt(monthString);
      numDaysInMonth = daysPerMonth.get(monthNum);
    }

    // Now, find out if we're an edge case (potential to increment month)
    if (dayNum + relation.second() <= numDaysInMonth && dayNum + relation.second() >= 1) {
      // Not an edge case - just increment the day, create a new string, and return
      dayNum += relation.second();
      isoDate = makeStringDayChange(origDateString, dayNum);
      return;
    }

    // Since we're an edge case, the month can't be a variable - if it is a variable, just set this
    // to the reference string
    if (monthNum == -1) {
      isoDate = origDateString;
      return;
    }
    // At this point, neither our day nor our month is a variable
    isoDate = origDateString;
    boolean decreasing = (dayNum + relation.second() < 1);
    // Need to increment the month, set the date appropriately - we need the new month num to set
    // the day appropriately, so do month first
    int newMonthNum;
    // Now, check if we're an edge case for month
    if ((monthNum + 1 > 12 && !decreasing) || (monthNum - 1 < 1 && decreasing)) {
      // First, change the month
      if (decreasing) {
        newMonthNum = 12;
      } else {
        newMonthNum = 1;
      }
      // If we can, increment the year
      // TODO: fix this to work more nicely with variables and thus handle more cases
      String yearString = origDateString.substring(0, 4);
      if (!yearString.contains("*")) {
        if (decreasing) {
          isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) - 1);
        } else {
          isoDate = makeStringYearChange(isoDate, Integer.parseInt(yearString) + 1);
        }
      }
    } else {
      // We're not an edge case for month - just increment
      if (decreasing) {
        newMonthNum = monthNum - 1;
      } else {
        newMonthNum = monthNum + 1;
      }
    }
    // do the increment
    isoDate = makeStringMonthChange(isoDate, newMonthNum);
    int newDateNum;
    if (decreasing) {
      newDateNum = -relation.second() + daysPerMonth.get(newMonthNum) - dayNum;
    } else {
      newDateNum = relation.second() - dayNum + daysPerMonth.get(monthNum);
    }
    // Now, change the day in our original string to be appropriate
    isoDate = makeStringDayChange(isoDate, newDateNum);
  }