Example #1
0
 /**
  * Verifies that {@code bi.preceding(offset)} rejects the given offset.
  *
  * @param bi the iterator under test
  * @param offset an offset that is expected to be refused
  * @throws RuntimeException if the call completes without an {@link IllegalArgumentException}
  */
 private static void checkPrecedingException(BreakIterator bi, int offset) {
   boolean rejected = false;
   try {
     bi.preceding(offset);
   } catch (IllegalArgumentException expected) {
     // This is the outcome the caller is hoping for.
     rejected = true;
   }
   if (!rejected) {
     throw new RuntimeException(bi + ": preceding() doesn't throw an IAE with offset " + offset);
   }
 }
Example #2
0
  /**
   * Finds the start of the word that {@code offset} belongs to.
   *
   * <p>An offset belongs to a word when it is the index of one of the word's characters, or the
   * index just past the word's last character. When the offset does not belong to a word,
   * {@code BreakIterator.DONE} is returned. If offset is the index of a low surrogate character,
   * BreakIterator.DONE will be returned.
   *
   * <p>Valid range for offset is [0..textLength] (note the inclusive upper bound). The returned
   * value is within [0..offset] or BreakIterator.DONE.
   *
   * @param offset position in the caller's coordinates (before the offset shift is removed)
   * @return the index of the first character of the word, or {@code BreakIterator.DONE}
   * @throws IllegalArgumentException if offset is not valid.
   */
  public int getBeginning(int offset) {
    final int local = offset - mOffsetShift;
    checkOffsetIsValid(local);

    final boolean insideWord = isOnLetterOrDigit(local);

    // Already sitting on the first character of a word.
    if (insideWord && mIterator.isBoundary(local)) {
      return local + mOffsetShift;
    }

    // Either in the middle of a word, or just past its last character:
    // the word starts at the previous boundary.
    if (insideWord || isAfterLetterOrDigit(local)) {
      return mIterator.preceding(local) + mOffsetShift;
    }

    return BreakIterator.DONE;
  }
 /**
  * Looks up the boundary that comes before {@code offset}.
  *
  * @param offset the position to search backwards from, in the caller's coordinates.
  * @return the position of the closest boundary before {@code offset}, or {@code DONE} when the
  *     underlying iterator reports that there is none.
  */
 public int prevBoundary(int offset) {
   final int boundary = mIterator.preceding(offset - mOffsetShift);
   // DONE is a marker, not a position, so it must not be shifted.
   return (boundary == BreakIterator.DONE) ? BreakIterator.DONE : boundary + mOffsetShift;
 }
Example #4
0
 /**
  * {@inheritDoc}
  *
  * <p>Boundaries for which {@code isOnLetterOrDigit} is false are skipped; the search keeps
  * moving backwards until an accepted boundary is found or the iterator is exhausted.
  */
 public int preceding(int offset) {
   int candidate = mIterator.preceding(offset - mOffsetShift);
   while (candidate != BreakIterator.DONE && !isOnLetterOrDigit(candidate)) {
     candidate = mIterator.preceding(candidate);
   }
   // DONE is returned unshifted; real positions go back to the caller's coordinates.
   return (candidate == BreakIterator.DONE) ? BreakIterator.DONE : candidate + mOffsetShift;
 }
Example #5
0
  /**
   * Inserts line breaks into {@code input} so that lines longer than {@code width} characters are
   * split, preferably at a line-break opportunity reported by a line {@link BreakIterator}.
   *
   * <p>When the character at the wrap position is whitespace it is replaced by the newline;
   * otherwise a newline is inserted at the last break opportunity before it, or at the wrap
   * position itself when the line contains no break opportunity.
   *
   * @param input the text to wrap; {@code null} yields an empty string
   * @param width the maximum line length; values below 5, or at least {@code input.length()},
   *     return {@code input} unchanged
   * @param locale the locale used to find break opportunities; {@code null} selects
   *     {@link Locale#getDefault()}
   * @return the wrapped text
   */
  public static String wordWrap(String input, int width, Locale locale) {
    if (input == null) {
      return "";
    }
    if (width < 5 || width >= input.length()) {
      return input;
    }

    // BreakIterator.getLineInstance(null) throws a NullPointerException, so fall back to the
    // platform default instead of failing half-way through the text.
    final Locale effectiveLocale = (locale != null) ? locale : Locale.getDefault();
    // One iterator is enough: setText() re-targets it for every over-long line.
    final BreakIterator breaks = BreakIterator.getLineInstance(effectiveLocale);

    final StringBuilder buf = new StringBuilder(input);
    boolean endOfLine = false;
    int lineStart = 0;
    for (int i = 0; i < buf.length(); i++) {
      if (buf.charAt(i) == '\n') {
        lineStart = i + 1;
        endOfLine = true;
      }
      if (i <= (lineStart + width) - 1) {
        continue; // current line still fits
      }

      if (endOfLine) {
        // NOTE(review): the first over-long line after a newline found in the input is broken
        // exactly at the width without consulting the break iterator. This looks unintentional
        // but is kept as-is because callers may rely on it.
        buf.insert(i, '\n');
        lineStart = i + 1;
        endOfLine = false;
        continue;
      }

      final int lineLength = i - lineStart;
      breaks.setText(buf.substring(lineStart, i));
      int end = breaks.last();
      // If the character at the wrap position is not whitespace we cannot break right there;
      // look for an earlier break opportunity instead.
      if (end == lineLength && !Character.isWhitespace(buf.charAt(lineStart + end))) {
        end = breaks.preceding(end - 1);
      }

      if (end != BreakIterator.DONE && end == lineLength) {
        // The character at the wrap position is whitespace: turn it into the line break.
        buf.replace(lineStart + end, lineStart + end + 1, "\n");
        // The next line starts AFTER the newline. Pointing at the newline itself made the next
        // line's text begin with '\n', which produced a spurious empty line for lines that are
        // exactly 'width' characters long.
        lineStart = lineStart + end + 1;
      } else if (end != BreakIterator.DONE && end != 0) {
        buf.insert(lineStart + end, '\n');
        lineStart = lineStart + end + 1;
      } else {
        // No usable break opportunity: hard break at the wrap position.
        buf.insert(i, '\n');
        lineStart = i + 1;
      }
    }

    return buf.toString();
  }
Example #6
0
  /**
   * Chooses the character range to build components for, starting from {@code localStart}, and
   * hands it to {@code ensureComponents}.
   *
   * <p>The range normally runs to the end of {@code fChars}. When lines have already been laid
   * out ({@code layoutCount > 0}) and no layout window is currently held, the limit is instead
   * estimated from the average line length times {@code EST_LINES}. Whenever the range does not
   * cover the whole text, both ends are widened to the nearest line-break boundaries.
   *
   * @param localStart index into {@code fChars} where the window should begin
   */
  private void makeLayoutWindow(int localStart) {
    final int textLength = fChars.length;
    int windowStart = localStart;
    int windowLimit = textLength;

    if (layoutCount > 0 && !haveLayoutWindow) {
      // Estimate how far a few more lines will reach, based on the lines seen so far.
      float avgLineLength = Math.max(layoutCharCount / layoutCount, 1);
      windowLimit = Math.min(localStart + (int) (avgLineLength * EST_LINES), textLength);
    }

    final boolean startsInsideText = localStart > 0;
    if (startsInsideText || windowLimit < textLength) {
      // (Re)point the shared character iterator and line breaker at the current text.
      if (charIter != null) {
        charIter.reset(fChars);
      } else {
        charIter = new CharArrayIterator(fChars);
      }
      if (fLineBreak == null) {
        fLineBreak = BreakIterator.getLineInstance();
      }
      fLineBreak.setText(charIter);

      // Snap the start backwards and the limit forwards onto line-break boundaries.
      if (startsInsideText && !fLineBreak.isBoundary(localStart)) {
        windowStart = fLineBreak.preceding(localStart);
      }
      if (windowLimit < textLength && !fLineBreak.isBoundary(windowLimit)) {
        windowLimit = fLineBreak.following(windowLimit);
      }
    }

    ensureComponents(windowStart, windowLimit);
    haveLayoutWindow = true;
  }
Example #7
0
  /**
   * Drives a word {@link BreakIterator} and a {@code MirroredBreakIterator} built from it through
   * the same sequence of {@code following}, {@code preceding} and {@code isBoundary} calls,
   * checking that both agree for every in-range offset and that the out-of-range offsets tried
   * below are rejected with an exception.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    BreakIterator bi = BreakIterator.getWordInstance();
    bi.setText(text);
    MirroredBreakIterator mirror = new MirroredBreakIterator(bi);
    final int first = bi.first();
    if (first != 0) {
      throw new RuntimeException("first != 0: " + first);
    }
    bi.last();

    // Start over with a fresh iterator for the comparisons below.
    bi = BreakIterator.getWordInstance();
    bi.setText(text);
    int length = text.length();

    // ---- following(int) ----
    for (int offset = 0; offset <= length; offset++) {
      if (offset == length) {
        check(bi.following(offset), DONE);
      }
      check(bi.following(offset), mirror.following(offset));
      check(bi.current(), mirror.current());
    }
    for (int negative = -length; negative < 0; negative++) {
      checkFollowingException(bi, negative);
      checkFollowingException(mirror, negative);
      check(bi.current(), mirror.current());
    }
    for (int excess = 1; excess < length; excess++) {
      final int beyondEnd = length + excess;
      checkFollowingException(bi, beyondEnd);
      checkFollowingException(mirror, beyondEnd);
      check(bi.current(), mirror.current());
    }

    // ---- preceding(int) ----
    for (int offset = length; offset >= 0; offset--) {
      if (offset == 0) {
        check(bi.preceding(offset), DONE);
      }
      check(bi.preceding(offset), mirror.preceding(offset));
      check(bi.current(), mirror.current());
    }
    for (int negative = -length; negative < 0; negative++) {
      checkPrecedingException(bi, negative);
      checkPrecedingException(mirror, negative);
      check(bi.current(), mirror.current());
    }
    for (int excess = 1; excess < length; excess++) {
      final int beyondEnd = length + excess;
      checkPrecedingException(bi, beyondEnd);
      checkPrecedingException(mirror, beyondEnd);
      check(bi.current(), mirror.current());
    }

    // ---- isBoundary(int) ----
    for (int offset = 0; offset <= length; offset++) {
      check(bi.isBoundary(offset), mirror.isBoundary(offset));
      check(bi.current(), mirror.current());
    }
    for (int negative = -length; negative < 0; negative++) {
      checkIsBoundaryException(bi, negative);
      checkIsBoundaryException(mirror, negative);
    }
    for (int excess = 1; excess < length; excess++) {
      final int beyondEnd = length + excess;
      checkIsBoundaryException(bi, beyondEnd);
      checkIsBoundaryException(mirror, beyondEnd);
    }
  }
  // Collects up to n scored passages for a single document.
  //
  // algorithm: treat sentence snippets as miniature documents
  // we can intersect these with the postings lists via BreakIterator.preceding(offset),
  // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
  //
  // Parameters, as they are used below:
  //   field         - used to look up the scorer and in error messages
  //   terms         - one entry per query term; a null entry is resolved from the posting's payload
  //   contentLength - a match starting at or beyond this offset ends the scan
  //   bi            - break iterator that delimits the passage around a match
  //                   (presumably already positioned on this document's text - confirm with caller)
  //   doc           - document id every postings enum is advanced to
  //   termsEnum     - used to seek to terms[i] when postings[i] is still null
  //   postings      - per-term enums; null slots are filled in here (EMPTY when the term is absent)
  //   n             - maximum number of passages kept in the result
  //
  // Returns the kept passages ordered by ascending startOffset.
  // Throws IllegalArgumentException when offsets are not available for the field, and
  // NullPointerException when getScorer(field) returns null.
  private Passage[] highlightDoc(
      String field,
      BytesRef terms[],
      int contentLength,
      BreakIterator bi,
      int doc,
      TermsEnum termsEnum,
      DocsAndPositionsEnum[] postings,
      int n)
      throws IOException {
    PassageScorer scorer = getScorer(field);
    if (scorer == null) {
      throw new NullPointerException("PassageScorer cannot be null");
    }
    // Pending matches, one entry per term that occurs in this document.
    PriorityQueue<OffsetsEnum> pq = new PriorityQueue<>();
    float weights[] = new float[terms.length];
    // initialize postings
    for (int i = 0; i < terms.length; i++) {
      DocsAndPositionsEnum de = postings[i];
      int pDoc;
      if (de == EMPTY) {
        // term was already found to be missing
        continue;
      } else if (de == null) {
        // first time this term is seen: look it up and remember the result in postings[i]
        postings[i] = EMPTY; // initially
        if (!termsEnum.seekExact(terms[i])) {
          continue; // term not found
        }
        de =
            postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
        if (de == null) {
          // no positions available
          throw new IllegalArgumentException(
              "field '" + field + "' was indexed without offsets, cannot highlight");
        }
        pDoc = de.advance(doc);
      } else {
        // enum already exists: only advance it when it is still behind the target document
        pDoc = de.docID();
        if (pDoc < doc) {
          pDoc = de.advance(doc);
        }
      }

      // only terms that actually occur in this document take part
      if (doc == pDoc) {
        weights[i] = scorer.weight(contentLength, de.freq());
        de.nextPosition();
        pq.add(new OffsetsEnum(de, i));
      }
    }

    pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination

    // Ordered by ascending score, ties broken by ascending startOffset, so peek()/poll()
    // yield the weakest passage kept so far.
    PriorityQueue<Passage> passageQueue =
        new PriorityQueue<>(
            n,
            new Comparator<Passage>() {
              @Override
              public int compare(Passage left, Passage right) {
                if (left.score < right.score) {
                  return -1;
                } else if (left.score > right.score) {
                  return 1;
                } else {
                  return left.startOffset - right.startOffset;
                }
              }
            });
    Passage current = new Passage();

    OffsetsEnum off;
    while ((off = pq.poll()) != null) {
      final DocsAndPositionsEnum dp = off.dp;
      int start = dp.startOffset();
      if (start == -1) {
        throw new IllegalArgumentException(
            "field '" + field + "' was indexed without offsets, cannot highlight");
      }
      int end = dp.endOffset();
      // LUCENE-5166: this hit would span the content limit... however more valid
      // hits may exist (they are sorted by start). so we pretend like we never
      // saw this term, it won't cause a passage to be added to passageQueue or anything.
      assert EMPTY.startOffset() == Integer.MAX_VALUE;
      if (start < contentLength && end > contentLength) {
        continue;
      }
      // the match begins at or after the end of the current passage: that passage is complete
      if (start >= current.endOffset) {
        if (current.startOffset >= 0) {
          // finalize current
          current.score *= scorer.norm(current.startOffset);
          // new sentence: first add 'current' to queue
          if (passageQueue.size() == n && current.score < passageQueue.peek().score) {
            current.reset(); // can't compete, just reset it
          } else {
            passageQueue.offer(current);
            if (passageQueue.size() > n) {
              // over capacity: evict the weakest passage and recycle the object
              current = passageQueue.poll();
              current.reset();
            } else {
              current = new Passage();
            }
          }
        }
        // if we exceed limit, we are done
        if (start >= contentLength) {
          Passage passages[] = new Passage[passageQueue.size()];
          passageQueue.toArray(passages);
          for (Passage p : passages) {
            p.sort();
          }
          // sort in ascending order
          Arrays.sort(
              passages,
              new Comparator<Passage>() {
                @Override
                public int compare(Passage left, Passage right) {
                  return left.startOffset - right.startOffset;
                }
              });
          return passages;
        }
        // advance breakiterator
        // DONE is negative, so Math.max(..., 0) below turns "no boundary" into offset 0
        assert BreakIterator.DONE < 0;
        current.startOffset = Math.max(bi.preceding(start + 1), 0);
        current.endOffset = Math.min(bi.next(), contentLength);
      }
      // consume every position of this term that falls inside the current passage
      int tf = 0;
      while (true) {
        tf++;
        BytesRef term = terms[off.id];
        if (term == null) {
          // multitermquery match, pull from payload
          term = off.dp.getPayload();
          assert term != null;
        }
        current.addMatch(start, end, term);
        if (off.pos == dp.freq()) {
          break; // removed from pq
        } else {
          off.pos++;
          dp.nextPosition();
          start = dp.startOffset();
          end = dp.endOffset();
        }
        // next position is outside the passage (or past the content): queue it for later
        if (start >= current.endOffset || end > contentLength) {
          pq.offer(off);
          break;
        }
      }
      current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
    }

    // Dead code but compiler disagrees:
    assert false;
    return null;
  }
Example #9
0
  /**
   * Reformats a string so that lines longer than {@code width} are split, preferably at a
   * line-break opportunity reported by a line {@link BreakIterator} and otherwise at the wrap
   * position itself. If the width specified is less than 5, or is at least the length of the
   * input, the string is returned as is.
   *
   * <p>Please note that this method can be lossy: when the character at the wrap position is
   * whitespace, it is replaced by the inserted newline.
   *
   * @param input the String to reformat; {@code null} yields an empty string.
   * @param width the maximum length of any one line.
   * @param locale the locale used to find break opportunities; when {@code null} the locale
   *     returned by {@code JiveGlobals.getLocale()} is used.
   * @return a new String reformatted as needed.
   */
  public static String wordWrap(String input, int width, Locale locale) {
    // protect ourselves
    if (input == null) {
      return "";
    }
    if (width < 5 || width >= input.length()) {
      return input;
    }

    // default locale
    if (locale == null) {
      locale = JiveGlobals.getLocale();
    }
    // One iterator is enough: setText() re-targets it for every over-long line.
    final BreakIterator breaks = BreakIterator.getLineInstance(locale);

    StringBuilder buf = new StringBuilder(input);
    boolean endOfLine = false;
    int lineStart = 0;

    for (int i = 0; i < buf.length(); i++) {
      if (buf.charAt(i) == '\n') {
        lineStart = i + 1;
        endOfLine = true;
      }

      // nothing to do while the current line still fits
      if (i <= lineStart + width - 1) {
        continue;
      }

      if (endOfLine) {
        // NOTE(review): the first over-long line after a newline found in the input is broken
        // exactly at the width without consulting the break iterator. This looks unintentional
        // but is kept as-is because callers may rely on it.
        buf.insert(i, '\n');
        lineStart = i + 1;
        endOfLine = false;
        continue;
      }

      final int lineLength = i - lineStart;
      breaks.setText(buf.substring(lineStart, i));
      int end = breaks.last();

      // if the character at the wrap position isn't whitespace, we can't split on it
      // (looks bad). Search for a previous break opportunity
      if (end == lineLength && !Character.isWhitespace(buf.charAt(lineStart + end))) {
        end = breaks.preceding(end - 1);
      }

      if (end != BreakIterator.DONE && end == lineLength) {
        // the character at the wrap position is whitespace: replace it with a \n
        buf.replace(lineStart + end, lineStart + end + 1, "\n");
        // The next line starts AFTER the newline. Pointing at the newline itself made the next
        // line's text begin with '\n', which produced a spurious empty line for lines that are
        // exactly 'width' characters long.
        lineStart = lineStart + end + 1;
      } else if (end != BreakIterator.DONE && end != 0) {
        // otherwise, just insert a \n at the break opportunity
        buf.insert(lineStart + end, '\n');
        lineStart = lineStart + end + 1;
      } else {
        // no usable break opportunity: hard break at the wrap position
        buf.insert(i, '\n');
        lineStart = i + 1;
      }
    }

    return buf.toString();
  }