private static void checkPrecedingException(BreakIterator bi, int offset) { try { bi.preceding(offset); } catch (IllegalArgumentException e) { return; // OK } throw new RuntimeException(bi + ": preceding() doesn't throw an IAE with offset " + offset); }
/**
 * Returns the index of the first character of the word containing {@code offset}, or
 * {@link BreakIterator#DONE} when {@code offset} is not within a word.
 *
 * <p>An offset counts as being within a word when it is the index of one of the word's
 * characters, <i>or</i> the index just past the word's last character. If offset is the index
 * of a low surrogate character, BreakIterator.DONE will be returned.
 *
 * <p>Valid range for offset is [0..textLength] (note the inclusive upper bound). The returned
 * value is within [0..offset] or BreakIterator.DONE.
 *
 * @param offset the position to inspect, in the caller's (unshifted) coordinates
 * @return the start of the enclosing word, or BreakIterator.DONE
 * @throws IllegalArgumentException if offset is not valid.
 */
public int getBeginning(int offset) {
    final int shiftedOffset = offset - mOffsetShift;
    checkOffsetIsValid(shiftedOffset);

    final boolean onWordChar = isOnLetterOrDigit(shiftedOffset);

    // Sitting exactly on the first character of a word: that position is the answer.
    if (onWordChar && mIterator.isBoundary(shiftedOffset)) {
        return shiftedOffset + mOffsetShift;
    }

    // Either inside a word, or immediately after one: the word starts at the previous boundary.
    if (onWordChar || isAfterLetterOrDigit(shiftedOffset)) {
        return mIterator.preceding(shiftedOffset) + mOffsetShift;
    }

    return BreakIterator.DONE;
}
/**
 * Returns the boundary that precedes the given offset, or {@code DONE} when the underlying
 * iterator reports no such boundary (for example when the offset is the starting position).
 *
 * @param offset the given start position to search from.
 * @return the position of the last boundary preceding the given offset, or
 *     {@link BreakIterator#DONE}.
 */
public int prevBoundary(int offset) {
    final int previous = mIterator.preceding(offset - mOffsetShift);
    // DONE is a sentinel and must be passed through without applying the shift.
    return (previous == BreakIterator.DONE) ? BreakIterator.DONE : previous + mOffsetShift;
}
/** {@inheritDoc} */
public int preceding(int offset) {
    // Walk backwards over boundaries until one sits on a letter or digit.
    int candidate = mIterator.preceding(offset - mOffsetShift);
    while (candidate != BreakIterator.DONE) {
        if (isOnLetterOrDigit(candidate)) {
            return candidate + mOffsetShift;
        }
        candidate = mIterator.preceding(candidate);
    }
    return BreakIterator.DONE;
}
/**
 * Re-flows {@code input} so that over-long lines are broken, preferring the last line-break
 * opportunity that fits within {@code width} characters.
 *
 * <p>When the character at the width position is whitespace it is replaced by a newline, so
 * that one whitespace character is lost. Otherwise a newline is inserted at the last break
 * opportunity before the width position, or at the width position itself when the line
 * contains no break opportunity.
 *
 * <p>Existing behaviour that is preserved: once an explicit {@code '\n'} has been seen in the
 * input, the next over-long line is broken exactly at the width position without consulting
 * the break iterator.
 *
 * @param input the text to wrap; {@code null} yields the empty string
 * @param width the maximum line length; if it is less than 5, or not smaller than the input
 *     length, the input is returned unchanged
 * @param locale the locale used to find line-break opportunities; {@code null} selects
 *     {@link Locale#getDefault()}
 * @return the wrapped text
 */
public static String wordWrap(String input, int width, Locale locale) {
    if (input == null) {
        return "";
    }
    if (width < 5 || width >= input.length()) {
        return input;
    }

    // A null locale used to surface as a NullPointerException from getLineInstance().
    final Locale effectiveLocale = (locale != null) ? locale : Locale.getDefault();
    // One iterator is enough: setText() fully re-targets it for every segment.
    final BreakIterator breaks = BreakIterator.getLineInstance(effectiveLocale);

    final StringBuilder buf = new StringBuilder(input);
    boolean endOfLine = false;
    int lineStart = 0;

    for (int i = 0; i < buf.length(); i++) {
        if (buf.charAt(i) == '\n') {
            lineStart = i + 1;
            endOfLine = true;
        }

        // Still within the allowed width for the current line.
        if (i <= lineStart + width - 1) {
            continue;
        }

        if (endOfLine) {
            buf.insert(i, '\n');
            lineStart = i + 1;
            endOfLine = false;
            continue;
        }

        final int lineLength = i - lineStart;
        breaks.setText(buf.substring(lineStart, i));
        int end = breaks.last();

        // If the character at the width position is not whitespace we must not split there;
        // look for an earlier break opportunity instead.
        if (end == lineLength && !Character.isWhitespace(buf.charAt(lineStart + end))) {
            end = breaks.preceding(end - 1);
        }

        if (end != BreakIterator.DONE && end == lineLength) {
            // The character at the width position is whitespace: turn it into the newline.
            buf.setCharAt(lineStart + end, '\n');
            // The next line begins AFTER the newline. Pointing at the newline itself made the
            // next segment start with '\n', which could produce a spurious empty line.
            lineStart = lineStart + end + 1;
        } else if (end != BreakIterator.DONE && end != 0) {
            buf.insert(lineStart + end, '\n');
            lineStart = lineStart + end + 1;
        } else {
            // No usable break opportunity: hard-break at the width position.
            buf.insert(i, '\n');
            lineStart = i + 1;
        }
    }
    return buf.toString();
}
private void makeLayoutWindow(int localStart) { int compStart = localStart; int compLimit = fChars.length; // If we've already gone past the layout window, format to end of paragraph if (layoutCount > 0 && !haveLayoutWindow) { float avgLineLength = Math.max(layoutCharCount / layoutCount, 1); compLimit = Math.min(localStart + (int) (avgLineLength * EST_LINES), fChars.length); } if (localStart > 0 || compLimit < fChars.length) { if (charIter == null) { charIter = new CharArrayIterator(fChars); } else { charIter.reset(fChars); } if (fLineBreak == null) { fLineBreak = BreakIterator.getLineInstance(); } fLineBreak.setText(charIter); if (localStart > 0) { if (!fLineBreak.isBoundary(localStart)) { compStart = fLineBreak.preceding(localStart); } } if (compLimit < fChars.length) { if (!fLineBreak.isBoundary(compLimit)) { compLimit = fLineBreak.following(compLimit); } } } ensureComponents(compStart, compLimit); haveLayoutWindow = true; }
/**
 * Compares a word {@link BreakIterator} against a {@code MirroredBreakIterator} built from the
 * same text: {@code following}, {@code preceding} and {@code isBoundary} must agree for every
 * valid offset, and all of them must reject out-of-range offsets.
 *
 * @param args unused
 */
public static void main(String[] args) {
    BreakIterator bi = BreakIterator.getWordInstance();
    bi.setText(text);
    MirroredBreakIterator mirror = new MirroredBreakIterator(bi);

    final int first = bi.first();
    if (first != 0) {
        throw new RuntimeException("first != 0: " + first);
    }
    // The value is not used below; the call is kept because it moves the iterator.
    final int last = bi.last();

    // Continue with a fresh iterator over the same text.
    bi = BreakIterator.getWordInstance();
    bi.setText(text);

    final int length = text.length();

    /*
     * following(int)
     */
    for (int pos = 0; pos < length; pos++) {
        check(bi.following(pos), mirror.following(pos));
        check(bi.current(), mirror.current());
    }
    // At the very end of the text there is nothing left to follow.
    check(bi.following(length), DONE);
    check(bi.following(length), mirror.following(length));
    check(bi.current(), mirror.current());

    for (int back = length; back > 0; back--) {
        final int negative = -back;
        checkFollowingException(bi, negative);
        checkFollowingException(mirror, negative);
        check(bi.current(), mirror.current());
    }
    for (int extra = 1; extra < length; extra++) {
        final int beyond = length + extra;
        checkFollowingException(bi, beyond);
        checkFollowingException(mirror, beyond);
        check(bi.current(), mirror.current());
    }

    /*
     * preceding(int)
     */
    for (int pos = length; pos > 0; pos--) {
        check(bi.preceding(pos), mirror.preceding(pos));
        check(bi.current(), mirror.current());
    }
    // At the very start of the text there is nothing that precedes.
    check(bi.preceding(0), DONE);
    check(bi.preceding(0), mirror.preceding(0));
    check(bi.current(), mirror.current());

    for (int back = length; back > 0; back--) {
        final int negative = -back;
        checkPrecedingException(bi, negative);
        checkPrecedingException(mirror, negative);
        check(bi.current(), mirror.current());
    }
    for (int extra = 1; extra < length; extra++) {
        final int beyond = length + extra;
        checkPrecedingException(bi, beyond);
        checkPrecedingException(mirror, beyond);
        check(bi.current(), mirror.current());
    }

    /*
     * isBoundary(int)
     */
    for (int pos = 0; pos <= length; pos++) {
        check(bi.isBoundary(pos), mirror.isBoundary(pos));
        check(bi.current(), mirror.current());
    }
    for (int back = length; back > 0; back--) {
        final int negative = -back;
        checkIsBoundaryException(bi, negative);
        checkIsBoundaryException(mirror, negative);
    }
    for (int extra = 1; extra < length; extra++) {
        final int beyond = length + extra;
        checkIsBoundaryException(bi, beyond);
        checkIsBoundaryException(mirror, beyond);
    }
}
// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset),
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
/**
 * Collects and scores passages of a single document for the given terms.
 *
 * <p>Term occurrences are pulled from {@code postings} in the order delivered by a priority
 * queue of {@code OffsetsEnum}; each occurrence is assigned to the passage delimited by
 * {@code bi} around its start offset. At most {@code n} passages are retained, and the result
 * is sorted by ascending start offset.
 *
 * <p>Side effect: {@code null} entries of {@code postings} are filled in, either with the
 * enum obtained from {@code termsEnum} or with {@code EMPTY} when the term is not found.
 *
 * @param field name of the field, used to look up the scorer and in error messages
 * @param terms the terms to match; a {@code null} entry means the term is read from the
 *     payload of the matching position
 * @param contentLength offsets at or beyond this value terminate processing
 * @param bi break iterator delimiting passages (presumably already set to the document's
 *     text by the caller; verify against caller)
 * @param doc document id to advance the postings to
 * @param termsEnum used to seek terms whose postings have not been initialised yet
 * @param postings per-term postings cache, parallel to {@code terms}; updated in place
 * @param n maximum number of passages to keep
 * @return the retained passages ordered by start offset
 * @throws NullPointerException if no scorer is available for {@code field}
 * @throws IllegalArgumentException if the field was indexed without offsets
 * @throws IOException propagated from the postings/terms enumerations
 */
private Passage[] highlightDoc(
    String field,
    BytesRef terms[],
    int contentLength,
    BreakIterator bi,
    int doc,
    TermsEnum termsEnum,
    DocsAndPositionsEnum[] postings,
    int n)
    throws IOException {
  PassageScorer scorer = getScorer(field);
  if (scorer == null) {
    throw new NullPointerException("PassageScorer cannot be null");
  }
  PriorityQueue<OffsetsEnum> pq = new PriorityQueue<>();
  float weights[] = new float[terms.length];
  // initialize postings
  for (int i = 0; i < terms.length; i++) {
    DocsAndPositionsEnum de = postings[i];
    int pDoc;
    if (de == EMPTY) {
      // term was already looked up and is known to be absent
      continue;
    } else if (de == null) {
      postings[i] = EMPTY; // initially
      if (!termsEnum.seekExact(terms[i])) {
        continue; // term not found
      }
      de = postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
      if (de == null) {
        // no positions available
        throw new IllegalArgumentException(
            "field '" + field + "' was indexed without offsets, cannot highlight");
      }
      pDoc = de.advance(doc);
    } else {
      // cached enum: only advance when it is still behind the requested document
      pDoc = de.docID();
      if (pDoc < doc) {
        pDoc = de.advance(doc);
      }
    }
    if (doc == pDoc) {
      // the term occurs in this document: record its weight and queue its first position
      weights[i] = scorer.weight(contentLength, de.freq());
      de.nextPosition();
      pq.add(new OffsetsEnum(de, i));
    }
  }
  pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination
  // Orders passages by score, lowest first, so the head is the one to evict;
  // ties are broken by start offset.
  PriorityQueue<Passage> passageQueue =
      new PriorityQueue<>(
          n,
          new Comparator<Passage>() {
            @Override
            public int compare(Passage left, Passage right) {
              if (left.score < right.score) {
                return -1;
              } else if (left.score > right.score) {
                return 1;
              } else {
                return left.startOffset - right.startOffset;
              }
            }
          });
  Passage current = new Passage();
  OffsetsEnum off;
  while ((off = pq.poll()) != null) {
    final DocsAndPositionsEnum dp = off.dp;
    int start = dp.startOffset();
    if (start == -1) {
      throw new IllegalArgumentException(
          "field '" + field + "' was indexed without offsets, cannot highlight");
    }
    int end = dp.endOffset();
    // LUCENE-5166: this hit would span the content limit... however more valid
    // hits may exist (they are sorted by start). so we pretend like we never
    // saw this term, it won't cause a passage to be added to passageQueue or anything.
    assert EMPTY.startOffset() == Integer.MAX_VALUE;
    if (start < contentLength && end > contentLength) {
      continue;
    }
    if (start >= current.endOffset) {
      // this occurrence lies beyond the passage being built
      if (current.startOffset >= 0) {
        // finalize current
        current.score *= scorer.norm(current.startOffset);
        // new sentence: first add 'current' to queue
        if (passageQueue.size() == n && current.score < passageQueue.peek().score) {
          current.reset(); // can't compete, just reset it
        } else {
          passageQueue.offer(current);
          if (passageQueue.size() > n) {
            // over capacity: evict the head and recycle that instance
            current = passageQueue.poll();
            current.reset();
          } else {
            current = new Passage();
          }
        }
      }
      // if we exceed limit, we are done
      // (the sentinel's start offset is Integer.MAX_VALUE, see the assert above)
      if (start >= contentLength) {
        Passage passages[] = new Passage[passageQueue.size()];
        passageQueue.toArray(passages);
        for (Passage p : passages) {
          p.sort();
        }
        // sort in ascending order
        Arrays.sort(
            passages,
            new Comparator<Passage>() {
              @Override
              public int compare(Passage left, Passage right) {
                return left.startOffset - right.startOffset;
              }
            });
        return passages;
      }
      // advance breakiterator
      // (DONE is negative, so Math.max(..., 0) maps it to offset 0)
      assert BreakIterator.DONE < 0;
      current.startOffset = Math.max(bi.preceding(start + 1), 0);
      current.endOffset = Math.min(bi.next(), contentLength);
    }
    // consume every occurrence of this term that falls inside the current passage
    int tf = 0;
    while (true) {
      tf++;
      BytesRef term = terms[off.id];
      if (term == null) {
        // multitermquery match, pull from payload
        term = off.dp.getPayload();
        assert term != null;
      }
      current.addMatch(start, end, term);
      if (off.pos == dp.freq()) {
        break; // removed from pq
      } else {
        off.pos++;
        dp.nextPosition();
        start = dp.startOffset();
        end = dp.endOffset();
      }
      if (start >= current.endOffset || end > contentLength) {
        // next occurrence belongs to a later passage (or crosses the limit): re-queue it
        pq.offer(off);
        break;
      }
    }
    current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
  }

  // Dead code but compiler disagrees:
  assert false;
  return null;
}
/** * Reformats a string where lines that are longer than <tt>width</tt> are split apart at the * earliest wordbreak or at maxLength, whichever is sooner. If the width specified is less than 5 * or greater than the input Strings length the string will be returned as is. * * <p>Please note that this method can be lossy - trailing spaces on wrapped lines may be trimmed. * * @param input the String to reformat. * @param width the maximum length of any one line. * @return a new String with reformatted as needed. */ public static String wordWrap(String input, int width, Locale locale) { // protect ourselves if (input == null) { return ""; } else if (width < 5) { return input; } else if (width >= input.length()) { return input; } // default locale if (locale == null) { locale = JiveGlobals.getLocale(); } StringBuilder buf = new StringBuilder(input); boolean endOfLine = false; int lineStart = 0; for (int i = 0; i < buf.length(); i++) { if (buf.charAt(i) == '\n') { lineStart = i + 1; endOfLine = true; } // handle splitting at width character if (i > lineStart + width - 1) { if (!endOfLine) { int limit = i - lineStart - 1; BreakIterator breaks = BreakIterator.getLineInstance(locale); breaks.setText(buf.substring(lineStart, i)); int end = breaks.last(); // if the last character in the search string isn't a space, // we can't split on it (looks bad). 
Search for a previous // break character if (end == limit + 1) { if (!Character.isWhitespace(buf.charAt(lineStart + end))) { end = breaks.preceding(end - 1); } } // if the last character is a space, replace it with a \n if (end != BreakIterator.DONE && end == limit + 1) { buf.replace(lineStart + end, lineStart + end + 1, "\n"); lineStart = lineStart + end; } // otherwise, just insert a \n else if (end != BreakIterator.DONE && end != 0) { buf.insert(lineStart + end, '\n'); lineStart = lineStart + end + 1; } else { buf.insert(i, '\n'); lineStart = i + 1; } } else { buf.insert(i, '\n'); lineStart = i + 1; endOfLine = false; } } } return buf.toString(); }