Beispiel #1
0
 /**
  * Resolves the variable bound to {@code name} into a {@link NodePattern}.
  *
  * <p>The bound value may be a {@code SequencePattern} whose expression is a single node pattern
  * expression, a {@code SequencePattern.NodePatternExpr}, a {@code NodePattern} itself, or a
  * {@code String} that is handed to the parser and parsed as a node pattern.
  *
  * @param name the variable name to look up
  * @return the node pattern for the variable, or {@code null} if nothing is bound to {@code name}
  * @throws Error if the bound value (or the expression of a bound sequence pattern) has a type
  *     that cannot be interpreted as a node pattern
  * @throws RuntimeException if a bound string cannot be parsed into a node pattern
  */
 public NodePattern getNodePattern(String name) {
   Object bound = variables.get(name);
   if (bound == null) {
     return null;
   }

   if (bound instanceof SequencePattern) {
     // Only a sequence pattern that wraps exactly one node pattern expression is acceptable.
     SequencePattern.PatternExpr expr = ((SequencePattern) bound).getPatternExpr();
     if (!(expr instanceof SequencePattern.NodePatternExpr)) {
       throw new Error(
           "Invalid node pattern class: " + expr.getClass() + " for variable " + name);
     }
     return ((SequencePattern.NodePatternExpr) expr).nodePattern;
   }

   if (bound instanceof SequencePattern.NodePatternExpr) {
     return ((SequencePattern.NodePatternExpr) bound).nodePattern;
   }

   if (bound instanceof NodePattern) {
     return (NodePattern) bound;
   }

   if (bound instanceof String) {
     String text = (String) bound;
     try {
       // The cast and the field access stay inside the try so that any failure while turning
       // the text into a node pattern is reported uniformly.
       Object parsed = parser.parseNode(this, text);
       return ((SequencePattern.NodePatternExpr) parsed).nodePattern;
     } catch (Exception pex) {
       throw new RuntimeException("Error parsing " + text + " to node pattern", pex);
     }
   }

   throw new Error(
       "Invalid node pattern variable class: " + bound.getClass() + " for variable " + name);
 }
Beispiel #2
0
 /**
  * Resolves the variable bound to {@code name} into a sequence pattern expression.
  *
  * <p>The bound value may be a {@code SequencePattern} (its expression is used), a
  * {@code SequencePattern.PatternExpr}, a {@code NodePattern} (wrapped in a new
  * {@code SequencePattern.NodePatternExpr}), or a {@code String} that is handed to the parser and
  * parsed as a sequence.
  *
  * @param name the variable name to look up
  * @param copy if {@code true}, an already-built expression (bound directly or through a
  *     {@code SequencePattern}) is copied before being returned so the caller does not share it
  *     with the bound variable; expressions freshly created from a {@code NodePattern} or a
  *     {@code String} are returned as created
  * @return the pattern expression for the variable, or {@code null} if nothing is bound to
  *     {@code name}
  * @throws Error if the bound value has a type that cannot be interpreted as a sequence pattern
  * @throws RuntimeException if a bound string cannot be parsed into a sequence pattern
  */
 public SequencePattern.PatternExpr getSequencePatternExpr(String name, boolean copy) {
   Object obj = variables.get(name);
   if (obj == null) {
     return null;
   }
   if (obj instanceof SequencePattern) {
     // Honour the copy flag here too: previously the expression owned by the bound
     // SequencePattern was handed out directly even when the caller asked for a copy.
     SequencePattern.PatternExpr pe = ((SequencePattern) obj).getPatternExpr();
     return (copy && pe != null) ? pe.copy() : pe;
   } else if (obj instanceof SequencePattern.PatternExpr) {
     SequencePattern.PatternExpr pe = (SequencePattern.PatternExpr) obj;
     return (copy) ? pe.copy() : pe;
   } else if (obj instanceof NodePattern) {
     // A new wrapper is created on every call, so no copy is needed.
     return new SequencePattern.NodePatternExpr((NodePattern) obj);
   } else if (obj instanceof String) {
     try {
       return parser.parseSequence(this, (String) obj);
     } catch (Exception pex) {
       throw new RuntimeException("Error parsing " + obj + " to sequence pattern", pex);
     }
   } else {
     // Include the variable name, matching the message style used by getNodePattern.
     throw new Error(
         "Invalid sequence pattern variable class: " + obj.getClass() + " for variable " + name);
   }
 }
  /**
   * Returns a List of Lists where each element is built from a run of Words in the input Document.
   * Specifically, reads through each word in the input document and breaks off a sentence after
   * finding a valid sentence boundary token or end of file. Note that for this to work, the words
   * in the input document must have been tokenized with a tokenizer that makes sentence boundary
   * tokens their own tokens (e.g., {@link PTBTokenizer}).
   *
   * <p>Processing outline, as implemented below:
   *
   * <ul>
   *   <li>If {@code sentenceBoundaryMultiTokenPattern} is set, a first pass marks the last token
   *       of every match as a sentence boundary.
   *   <li>If {@code sentenceRegionBeginPattern} is set, tokens outside a region (including the
   *       region-begin token itself) are skipped.
   *   <li>Tokens matching {@code sentenceBoundaryFollowersPattern} that directly follow a
   *       completed sentence are appended to that sentence.
   *   <li>Tokens flagged by {@code matchesTokenPatternsToDiscard} are not added to any sentence,
   *       but may still end one.
   *   <li>A completed sentence is emitted only if it is non-empty or {@code allowEmptySentences}
   *       is set; leftover tokens at the end of input form a final sentence.
   * </ul>
   *
   * @param words A list of already tokenized words (must implement HasWord or be a String).
   * @return A list of sentences.
   * @see #WordToSentenceProcessor(String, String, Set, Set, String, NewlineIsSentenceBreak,
   *     SequencePattern, Set, boolean, boolean)
   */
  public List<List<IN>> wordsToSentences(List<? extends IN> words) {
    // Keyed by object identity, so two distinct tokens with equal text are tracked separately.
    IdentityHashMap<Object, Boolean> isSentenceBoundary =
        null; // is null unless used by sentenceBoundaryMultiTokenPattern

    if (sentenceBoundaryMultiTokenPattern != null) {
      // Do initial pass using tokensregex to identify multi token patterns that need to be matched
      // and add the last token to our table of sentence boundary tokens
      isSentenceBoundary = new IdentityHashMap<>();
      SequenceMatcher<? super IN> matcher = sentenceBoundaryMultiTokenPattern.getMatcher(words);
      while (matcher.find()) {
        // NOTE(review): raw List type; only the last element of each match is used.
        List nodes = matcher.groupNodes();
        if (nodes != null && !nodes.isEmpty()) {
          isSentenceBoundary.put(nodes.get(nodes.size() - 1), true);
        }
      }
    }

    // Split tokens into sentences!!!
    List<List<IN>> sentences = Generics.newArrayList();
    List<IN> currentSentence = new ArrayList<>();
    // Most recently emitted sentence; boundary-follower tokens are appended to it.
    List<IN> lastSentence = null;
    // True between a region-begin match and the next region-end match.
    boolean insideRegion = false;
    // True after a token annotated "forced sentence until end"; suppresses breaks until a
    // forced-end token is handled.
    boolean inWaitForForcedEnd = false;
    // True if the previous token was in sentenceBoundaryToDiscard (used for TWO_CONSECUTIVE).
    boolean lastTokenWasNewline = false;

    for (IN o : words) {
      String word = getString(o);
      boolean forcedEnd = isForcedEndToken(o);

      // Per-token flags: reset on every iteration.
      boolean inMultiTokenExpr = false;
      boolean discardToken = false;
      if (o instanceof CoreMap) {
        // Hacky stuff to ensure sentence breaks do not happen in certain cases
        CoreMap cm = (CoreMap) o;
        Boolean forcedUntilEndValue =
            cm.get(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class);
        if (!forcedEnd) {
          if (forcedUntilEndValue != null && forcedUntilEndValue) inWaitForForcedEnd = true;
          else {
            MultiTokenTag mt = cm.get(CoreAnnotations.MentionTokenAnnotation.class);
            if (mt != null && !mt.isEnd()) {
              // In the middle of a multi token mention, make sure sentence is not ended here
              inMultiTokenExpr = true;
            }
          }
        }
      }
      if (tokenPatternsToDiscard != null) {
        discardToken = matchesTokenPatternsToDiscard(word);
      }

      // Region handling: while outside a region every token is dropped. The token that opens
      // the region is dropped as well (note the unconditional continue).
      if (sentenceRegionBeginPattern != null && !insideRegion) {
        if (DEBUG) {
          log.info("Word is " + word + "; outside region; deleted");
        }
        if (sentenceRegionBeginPattern.matcher(word).matches()) {
          insideRegion = true;
          if (DEBUG) {
            log.info("  entering region");
          }
        }
        lastTokenWasNewline = false;
        continue;
      }

      // A boundary follower arriving right after a sentence was closed (nothing accumulated
      // yet) belongs to the previous sentence rather than starting a new one.
      if (lastSentence != null
          && currentSentence.isEmpty()
          && sentenceBoundaryFollowersPattern.matcher(word).matches()) {
        if (!discardToken) {
          lastSentence.add(o);
        }
        if (DEBUG) {
          // NOTE(review): in the discard case the message has no separator between the word and
          // "discarded".
          log.info("Word is " + word + (discardToken ? "discarded" : "  added to last sentence"));
        }
        lastTokenWasNewline = false;
        continue;
      }

      // Set to true by any branch below that decides the current sentence ends at this token.
      boolean newSent = false;
      String debugText = (discardToken) ? "discarded" : "added to current";
      if (inWaitForForcedEnd && !forcedEnd) {
        // Breaks are suppressed until a forced-end token; just accumulate.
        if (!discardToken) currentSentence.add(o);
        if (DEBUG) {
          log.info("Word is " + word + "; is in wait for forced end; " + debugText);
        }
      } else if (inMultiTokenExpr && !forcedEnd) {
        // Inside a multi-token mention; just accumulate.
        if (!discardToken) currentSentence.add(o);
        if (DEBUG) {
          log.info("Word is " + word + "; is in multi token expr; " + debugText);
        }
      } else if (sentenceBoundaryToDiscard.contains(word)) {
        // These tokens are never added to a sentence. Whether they end the sentence depends on
        // the newlineIsSentenceBreak mode: always, or only on the second consecutive one.
        if (newlineIsSentenceBreak == NewlineIsSentenceBreak.ALWAYS) {
          newSent = true;
        } else if (newlineIsSentenceBreak == NewlineIsSentenceBreak.TWO_CONSECUTIVE) {
          if (lastTokenWasNewline) {
            newSent = true;
          }
        }
        lastTokenWasNewline = true;
        if (DEBUG) {
          log.info("Word is " + word + "  discarded sentence boundary");
        }
      } else {
        lastTokenWasNewline = false;
        Boolean isb;
        if (xmlBreakElementsToDiscard != null && matchesXmlBreakElementToDiscard(word)) {
          // XML break element: ends the sentence and is itself not added.
          newSent = true;
          if (DEBUG) {
            log.info("Word is " + word + "; is XML break element; discarded");
          }
        } else if (sentenceRegionEndPattern != null
            && sentenceRegionEndPattern.matcher(word).matches()) {
          // Region end marker: leaves the region, ends the sentence, and is itself not added.
          insideRegion = false;
          newSent = true;
          // Marked sentence boundaries
        } else if ((isSentenceBoundary != null)
            && ((isb = isSentenceBoundary.get(o)) != null)
            && isb) {
          // Last token of a multi-token boundary match found in the initial pass.
          if (!discardToken) currentSentence.add(o);
          if (DEBUG) {
            log.info(
                "Word is "
                    + word
                    + "; is sentence boundary (matched multi-token pattern); "
                    + debugText);
          }
          newSent = true;
        } else if (sentenceBoundaryTokenPattern.matcher(word).matches()) {
          // Ordinary single-token sentence boundary; the token stays in the sentence it ends.
          if (!discardToken) currentSentence.add(o);
          if (DEBUG) {
            log.info("Word is " + word + "; is sentence boundary; " + debugText);
          }
          newSent = true;
        } else if (forcedEnd) {
          // Token reported by isForcedEndToken: ends the sentence and stops the forced-wait
          // state.
          if (!discardToken) currentSentence.add(o);
          inWaitForForcedEnd = false;
          newSent = true;
          if (DEBUG) {
            log.info("Word is " + word + "; annotated to be the end of a sentence; " + debugText);
          }
        } else {
          // Plain token: accumulate.
          if (!discardToken) currentSentence.add(o);
          if (DEBUG) {
            log.info("Word is " + word + "; " + debugText);
          }
        }
      }

      // Emit the sentence; empty ones are emitted only when allowEmptySentences is set.
      if (newSent && (!currentSentence.isEmpty() || allowEmptySentences)) {
        if (DEBUG) {
          log.info("  beginning new sentence");
        }
        sentences.add(currentSentence);
        // adds this sentence now that it's complete
        lastSentence = currentSentence;
        currentSentence = new ArrayList<>(); // clears the current sentence
      }
    }

    // add any words at the end, even if there isn't a sentence
    // terminator at the end of file
    if (!currentSentence.isEmpty()) {
      sentences.add(currentSentence); // adds last sentence
    }

    return sentences;
  }
Beispiel #4
0
 /**
  * Binds {@code name} to the pattern expression held by {@code pattern}.
  *
  * <p>The expression obtained from {@code pattern.getPatternExpr()} is what gets bound, not the
  * {@code SequencePattern} object itself.
  *
  * @param name the variable name to bind
  * @param pattern the sequence pattern whose expression is bound to {@code name}
  */
 public void bind(String name, SequencePattern pattern) {
   SequencePattern.PatternExpr expr = pattern.getPatternExpr();
   bind(name, expr);
 }