/**
 * Looks up the variable {@code name} and returns it as a {@link NodePattern}, unwrapping or
 * parsing the bound value as needed. Returns {@code null} if the variable is unbound.
 */
public NodePattern getNodePattern(String name) {
  Object obj = variables.get(name);
  if (obj != null) {
    if (obj instanceof SequencePattern) {
      // A sequence pattern only yields a node pattern if it wraps a single NodePatternExpr.
      SequencePattern seqPattern = (SequencePattern) obj;
      if (seqPattern.getPatternExpr() instanceof SequencePattern.NodePatternExpr) {
        return ((SequencePattern.NodePatternExpr) seqPattern.getPatternExpr()).nodePattern;
      } else {
        throw new Error("Invalid node pattern class: " + seqPattern.getPatternExpr().getClass()
            + " for variable " + name);
      }
    } else if (obj instanceof SequencePattern.NodePatternExpr) {
      SequencePattern.NodePatternExpr pe = (SequencePattern.NodePatternExpr) obj;
      return pe.nodePattern;
    } else if (obj instanceof NodePattern) {
      return (NodePattern) obj;
    } else if (obj instanceof String) {
      // Strings are parsed lazily into a node pattern expression.
      try {
        SequencePattern.NodePatternExpr pe =
            (SequencePattern.NodePatternExpr) parser.parseNode(this, (String) obj);
        return pe.nodePattern;
      } catch (Exception pex) {
        throw new RuntimeException("Error parsing " + obj + " to node pattern", pex);
      }
    } else {
      throw new Error("Invalid node pattern variable class: " + obj.getClass()
          + " for variable " + name);
    }
  }
  return null;
}
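// Hypothetical usage sketch (not part of the original source): resolving a bound variable
// back to a NodePattern. Assumes Env.bind(String, Object) accepts a raw pattern String,
// which getNodePattern() above then parses via parser.parseNode(...); the variable name
// and node syntax here are illustrative only.
//
//   Env env = TokenSequencePattern.getNewEnv();
//   env.bind("$NUM", "[ { word:/\\d+/ } ]");              // bound as a String
//   NodePattern numPattern = env.getNodePattern("$NUM");  // parsed on first lookup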
public SequencePattern.PatternExpr getSequencePatternExpr(String name, boolean copy) {
  Object obj = variables.get(name);
  if (obj != null) {
    if (obj instanceof SequencePattern) {
      SequencePattern seqPattern = (SequencePattern) obj;
      return seqPattern.getPatternExpr();
    } else if (obj instanceof SequencePattern.PatternExpr) {
      SequencePattern.PatternExpr pe = (SequencePattern.PatternExpr) obj;
      return copy ? pe.copy() : pe;
    } else if (obj instanceof NodePattern) {
      return new SequencePattern.NodePatternExpr((NodePattern) obj);
    } else if (obj instanceof String) {
      try {
        return parser.parseSequence(this, (String) obj);
      } catch (Exception pex) {
        throw new RuntimeException("Error parsing " + obj + " to sequence pattern", pex);
      }
    } else {
      throw new Error("Invalid sequence pattern variable class: " + obj.getClass());
    }
  }
  return null;
}
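// Note on copy semantics, as a hedged sketch: the defensive copy is only made when the
// bound value is already a PatternExpr; a bound SequencePattern returns its underlying
// expression uncopied even when copy == true, so callers that mutate the result should
// copy it themselves. The variable name below is illustrative.
//
//   SequencePattern.PatternExpr shared = env.getSequencePatternExpr("$X", false);
//   SequencePattern.PatternExpr fresh  = env.getSequencePatternExpr("$X", true); // copy if PatternExpr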
/**
 * Returns a List of Lists where each element is built from a run of Words in the input Document.
 * Specifically, reads through each word in the input document and breaks off a sentence after
 * finding a valid sentence boundary token or end of file. Note that for this to work, the words
 * in the input document must have been tokenized with a tokenizer that makes sentence boundary
 * tokens their own tokens (e.g., {@link PTBTokenizer}).
 *
 * @param words A list of already tokenized words (must implement HasWord or be a String).
 * @return A list of sentences.
 * @see #WordToSentenceProcessor(String, String, Set, Set, String, NewlineIsSentenceBreak,
 *      SequencePattern, Set, boolean, boolean)
 */
public List<List<IN>> wordsToSentences(List<? extends IN> words) {
  // Stays null unless a sentenceBoundaryMultiTokenPattern is configured.
  IdentityHashMap<Object, Boolean> isSentenceBoundary = null;

  if (sentenceBoundaryMultiTokenPattern != null) {
    // Do an initial pass using tokensregex to identify multi-token patterns that need to be
    // matched, and add the last token of each match to our table of sentence boundary tokens.
    isSentenceBoundary = new IdentityHashMap<>();
    SequenceMatcher<? super IN> matcher = sentenceBoundaryMultiTokenPattern.getMatcher(words);
    while (matcher.find()) {
      List<?> nodes = matcher.groupNodes();
      if (nodes != null && !nodes.isEmpty()) {
        isSentenceBoundary.put(nodes.get(nodes.size() - 1), true);
      }
    }
  }

  // Split tokens into sentences!!!
  List<List<IN>> sentences = Generics.newArrayList();
  List<IN> currentSentence = new ArrayList<>();
  List<IN> lastSentence = null;
  boolean insideRegion = false;
  boolean inWaitForForcedEnd = false;
  boolean lastTokenWasNewline = false;

  for (IN o : words) {
    String word = getString(o);
    boolean forcedEnd = isForcedEndToken(o);

    boolean inMultiTokenExpr = false;
    boolean discardToken = false;
    if (o instanceof CoreMap) {
      // Hacky stuff to ensure sentence breaks do not happen in certain cases.
      CoreMap cm = (CoreMap) o;
      Boolean forcedUntilEndValue = cm.get(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class);
      if (!forcedEnd) {
        if (forcedUntilEndValue != null && forcedUntilEndValue) {
          inWaitForForcedEnd = true;
        } else {
          MultiTokenTag mt = cm.get(CoreAnnotations.MentionTokenAnnotation.class);
          if (mt != null && !mt.isEnd()) {
            // In the middle of a multi-token mention; make sure the sentence is not ended here.
            inMultiTokenExpr = true;
          }
        }
      }
    }

    if (tokenPatternsToDiscard != null) {
      discardToken = matchesTokenPatternsToDiscard(word);
    }

    if (sentenceRegionBeginPattern != null && !insideRegion) {
      if (DEBUG) {
        log.info("Word is " + word + "; outside region; deleted");
      }
      if (sentenceRegionBeginPattern.matcher(word).matches()) {
        insideRegion = true;
        if (DEBUG) {
          log.info(" entering region");
        }
      }
      lastTokenWasNewline = false;
      continue;
    }

    if (lastSentence != null && currentSentence.isEmpty()
        && sentenceBoundaryFollowersPattern.matcher(word).matches()) {
      // Tokens that may follow a sentence boundary (e.g., closing quotes) attach to the
      // previous sentence rather than starting a new one.
      if (!discardToken) {
        lastSentence.add(o);
      }
      if (DEBUG) {
        log.info("Word is " + word + (discardToken ? "; discarded" : "; added to last sentence"));
      }
      lastTokenWasNewline = false;
      continue;
    }

    boolean newSent = false;
    String debugText = discardToken ? "discarded" : "added to current";
    if (inWaitForForcedEnd && !forcedEnd) {
      if (!discardToken) currentSentence.add(o);
      if (DEBUG) {
        log.info("Word is " + word + "; is in wait for forced end; " + debugText);
      }
    } else if (inMultiTokenExpr && !forcedEnd) {
      if (!discardToken) currentSentence.add(o);
      if (DEBUG) {
        log.info("Word is " + word + "; is in multi token expr; " + debugText);
      }
    } else if (sentenceBoundaryToDiscard.contains(word)) {
      if (newlineIsSentenceBreak == NewlineIsSentenceBreak.ALWAYS) {
        newSent = true;
      } else if (newlineIsSentenceBreak == NewlineIsSentenceBreak.TWO_CONSECUTIVE
          && lastTokenWasNewline) {
        newSent = true;
      }
      lastTokenWasNewline = true;
      if (DEBUG) {
        log.info("Word is " + word + "; discarded sentence boundary");
      }
    } else {
      lastTokenWasNewline = false;
      Boolean isb;
      if (xmlBreakElementsToDiscard != null && matchesXmlBreakElementToDiscard(word)) {
        newSent = true;
        if (DEBUG) {
          log.info("Word is " + word + "; is XML break element; discarded");
        }
      } else if (sentenceRegionEndPattern != null
          && sentenceRegionEndPattern.matcher(word).matches()) {
        insideRegion = false;
        newSent = true;
        // Marked sentence boundaries
      } else if (isSentenceBoundary != null
          && (isb = isSentenceBoundary.get(o)) != null && isb) {
        if (!discardToken) currentSentence.add(o);
        if (DEBUG) {
          log.info("Word is " + word
              + "; is sentence boundary (matched multi-token pattern); " + debugText);
        }
        newSent = true;
      } else if (sentenceBoundaryTokenPattern.matcher(word).matches()) {
        if (!discardToken) currentSentence.add(o);
        if (DEBUG) {
          log.info("Word is " + word + "; is sentence boundary; " + debugText);
        }
        newSent = true;
      } else if (forcedEnd) {
        if (!discardToken) currentSentence.add(o);
        inWaitForForcedEnd = false;
        newSent = true;
        if (DEBUG) {
          log.info("Word is " + word + "; annotated to be the end of a sentence; " + debugText);
        }
      } else {
        if (!discardToken) currentSentence.add(o);
        if (DEBUG) {
          log.info("Word is " + word + "; " + debugText);
        }
      }
    }

    if (newSent && (!currentSentence.isEmpty() || allowEmptySentences)) {
      if (DEBUG) {
        log.info(" beginning new sentence");
      }
      sentences.add(currentSentence); // adds this sentence now that it's complete
      lastSentence = currentSentence;
      currentSentence = new ArrayList<>(); // starts a fresh current sentence
    }
  }

  // Add any words at the end, even if there isn't a sentence terminator at the end of file.
  if (!currentSentence.isEmpty()) {
    sentences.add(currentSentence); // adds last sentence
  }

  return sentences;
}
/** Binds {@code name} to the pattern expression underlying the given sequence pattern. */
public void bind(String name, SequencePattern pattern) {
  bind(name, pattern.getPatternExpr());
}
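// Hypothetical usage sketch: binding a compiled pattern under a name so that later
// pattern strings can reference it. Assumes the tokensregex TokenSequencePattern API
// (TokenSequencePattern extends SequencePattern); the variable name and patterns are
// illustrative only.
//
//   Env env = TokenSequencePattern.getNewEnv();
//   env.bind("$GREETING", TokenSequencePattern.compile(env, "/[Hh]ello|[Hh]i/"));
//   TokenSequencePattern p = TokenSequencePattern.compile(env, "$GREETING /,/ /world/");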