/**
 * Walks {@code bi} over {@code in} forwards and then backwards, reporting each segment
 * between consecutive boundaries to {@code harness} alongside the corresponding entry
 * of {@code out}.
 *
 * @param name    label passed to {@code harness.checkPoint}; the backward pass uses
 *                {@code "backwards " + name}
 * @param in      text handed to the iterator
 * @param out     expected segments, in forward order
 * @param bi      iterator under test; its text is replaced by {@code in}
 * @param harness receiver of all check calls
 */
public void check(String name, String in, String[] out, BreakIterator bi, TestHarness harness) {
    // ---- forward pass: start at the current position and step with next() ----
    harness.checkPoint(name);
    bi.setText(in);

    int segmentStart = bi.current();
    harness.check(segmentStart, 0);

    int seen = 0;
    for (int boundary = bi.next(); boundary != BreakIterator.DONE; boundary = bi.next()) {
        harness.check(in.substring(segmentStart, boundary), out[seen]);
        seen++;
        segmentStart = boundary;
    }
    // Every expected segment must have been visited exactly once.
    harness.check(seen, out.length);

    // ---- backward pass: start at the last boundary and step with previous() ----
    harness.checkPoint("backwards " + name);
    bi.last();

    int remaining = out.length - 1;
    int segmentEnd = bi.current();
    harness.check(segmentEnd, in.length());

    for (int boundary = bi.previous(); boundary != BreakIterator.DONE; boundary = bi.previous()) {
        harness.check(in.substring(boundary, segmentEnd), out[remaining]);
        remaining--;
        segmentEnd = boundary;
    }
    // Counting down from the last index ends at -1 when all segments were consumed.
    harness.check(remaining, -1);
}
@Override public Object evaluate(DeferredObject[] arguments) throws HiveException { assert (arguments.length >= 1 && arguments.length <= 3); if (arguments[0].get() == null) { return null; } // if there is more than 1 argument specified, a different natural language // locale is being specified Locale locale = null; if (arguments.length > 1 && arguments[1].get() != null) { Text language = (Text) converters[1].convert(arguments[1].get()); Text country = null; if (arguments.length > 2 && arguments[2].get() != null) { country = (Text) converters[2].convert(arguments[2].get()); } if (country != null) { locale = new Locale(language.toString().toLowerCase(), country.toString().toUpperCase()); } else { locale = new Locale(language.toString().toLowerCase()); } } else { locale = Locale.getDefault(); } // get the input and prepare the output Text chunk = (Text) converters[0].convert(arguments[0].get()); String text = chunk.toString(); ArrayList<ArrayList<Text>> result = new ArrayList<ArrayList<Text>>(); // Parse out sentences using Java's text-handling API BreakIterator bi = BreakIterator.getSentenceInstance(locale); bi.setText(text); int idx = 0; while (bi.next() != BreakIterator.DONE) { String sentence = text.substring(idx, bi.current()); idx = bi.current(); result.add(new ArrayList<Text>()); // Parse out words in the sentence BreakIterator wi = BreakIterator.getWordInstance(locale); wi.setText(sentence); int widx = 0; ArrayList<Text> sent_array = result.get(result.size() - 1); while (wi.next() != BreakIterator.DONE) { String word = sentence.substring(widx, wi.current()); widx = wi.current(); if (Character.isLetterOrDigit(word.charAt(0))) { sent_array.add(new Text(word)); } } } return result; }
/** * Called to summarize a document when no hits were found. By default this just returns the first * {@code maxPassages} sentences; subclasses can override to customize. */ protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { // BreakIterator should be un-next'd: List<Passage> passages = new ArrayList<>(); int pos = bi.current(); assert pos == 0; while (passages.size() < maxPassages) { int next = bi.next(); if (next == BreakIterator.DONE) { break; } Passage passage = new Passage(); passage.score = Float.NaN; passage.startOffset = pos; passage.endOffset = next; passages.add(passage); pos = next; } return passages.toArray(new Passage[passages.size()]); }
/** Returns the next word in the text */ public String nextWord() { if (!first) { currentWordPos = nextWordPos; currentWordEnd = getNextWordEnd(text, currentWordPos); nextWordPos = getNextWordStart(text, currentWordEnd + 1); int current = sentanceIterator.current(); if (current == currentWordPos) startsSentance = true; else { startsSentance = false; if (currentWordEnd > current) sentanceIterator.next(); } } // The nextWordPos has already been populated String word = null; try { word = document.getText(currentWordPos, currentWordEnd - currentWordPos); } catch (BadLocationException ex) { moreTokens = false; } wordCount++; first = false; if (nextWordPos == -1) moreTokens = false; return word; }
/**
 * Compares a word {@link BreakIterator} against a {@code MirroredBreakIterator} built
 * over the same text, exercising {@code following}, {@code preceding} and
 * {@code isBoundary} for in-range, negative and past-the-end offsets.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    BreakIterator bi = BreakIterator.getWordInstance();
    bi.setText(text);
    MirroredBreakIterator mirror = new MirroredBreakIterator(bi);

    final int firstBoundary = bi.first();
    if (firstBoundary != 0) {
        throw new RuntimeException("first != 0: " + firstBoundary);
    }
    // Return value is not used; the call itself is kept.
    bi.last();

    // Start over with a fresh reference iterator on the same text.
    bi = BreakIterator.getWordInstance();
    bi.setText(text);
    final int length = text.length();

    /*
     * following(int)
     */
    for (int offset = 0; offset < length; offset++) {
        check(bi.following(offset), mirror.following(offset));
        check(bi.current(), mirror.current());
    }
    // At the very end of the text there is nothing left to follow.
    check(bi.following(length), DONE);
    check(bi.following(length), mirror.following(length));
    check(bi.current(), mirror.current());

    for (int negative = -length; negative < 0; negative++) {
        checkFollowingException(bi, negative);
        checkFollowingException(mirror, negative);
        check(bi.current(), mirror.current());
    }
    for (int delta = 1; delta < length; delta++) {
        final int beyond = length + delta;
        checkFollowingException(bi, beyond);
        checkFollowingException(mirror, beyond);
        check(bi.current(), mirror.current());
    }

    /*
     * preceding(int)
     */
    for (int offset = length; offset > 0; offset--) {
        check(bi.preceding(offset), mirror.preceding(offset));
        check(bi.current(), mirror.current());
    }
    // At the very start of the text there is nothing that precedes.
    check(bi.preceding(0), DONE);
    check(bi.preceding(0), mirror.preceding(0));
    check(bi.current(), mirror.current());

    for (int negative = -length; negative < 0; negative++) {
        checkPrecedingException(bi, negative);
        checkPrecedingException(mirror, negative);
        check(bi.current(), mirror.current());
    }
    for (int delta = 1; delta < length; delta++) {
        final int beyond = length + delta;
        checkPrecedingException(bi, beyond);
        checkPrecedingException(mirror, beyond);
        check(bi.current(), mirror.current());
    }

    /*
     * isBoundary(int)
     */
    for (int offset = 0; offset <= length; offset++) {
        check(bi.isBoundary(offset), mirror.isBoundary(offset));
        check(bi.current(), mirror.current());
    }
    for (int negative = -length; negative < 0; negative++) {
        checkIsBoundaryException(bi, negative);
        checkIsBoundaryException(mirror, negative);
    }
    for (int delta = 1; delta < length; delta++) {
        final int beyond = length + delta;
        checkIsBoundaryException(bi, beyond);
        checkIsBoundaryException(mirror, beyond);
    }
}