  /**
   * Walks the word boundaries of {@code val} and accumulates the segments in a buffer that is
   * replaced by a fresh one whenever appending the next segment would push it past 180 characters.
   *
   * <p>NOTE(review): this listing appears truncated. Several closing braces are absent, no visible
   * statement ever adds to {@code brokenStrings}, the body of the final loop is not shown, and
   * {@code boundary.setText(...)} is never called, so the iterator has no text to scan. Confirm
   * against the original source before relying on this method.
   */
  public void splitWordAndLength() {

    StringBuilder sb = new StringBuilder();
    List<String> brokenStrings = new ArrayList<String>();

    BreakIterator boundary = BreakIterator.getWordInstance();

    int start = boundary.first();
    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {

      // Length of the segment between the previous boundary and the current one.
      int lengthOfNext = end - start;

      // Appending this segment would exceed the 180-character limit: start a new buffer.
      // NOTE(review): the old buffer's content is discarded here as written — presumably it was
      // meant to be stored in brokenStrings first; verify.
      if ((sb.length() + lengthOfNext) > 180) {
        sb = new StringBuilder(); // or set to 0

      sb.append(val.substring(start, end));

      // if last element
      if (end == val.length()) {

    for (String x : brokenStrings) {
  // Example #2
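  /**
   * Formats a string; if it exceeds the given length it is wrapped onto a new line automatically.
   *
   * @param text the string to format
   * @param maxLength the length of a line
   * @param locale the country/region locale
   * @param prefix1 prefix for the first line
   * @param prefix prefix for the second and all following lines
   * @return the formatted string
   */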
  private String formatLines(
      String text, int maxLength, Locale locale, String prefix1, String prefix) {
    // NOTE(review): this listing appears truncated. No visible statement appends a segment or
    // `prefix` to `result`, the loop/if bodies are never closed, and boundary.setText(text) is
    // never called. As shown, only prefix1 would ever reach the returned string.
    BreakIterator boundary = BreakIterator.getLineInstance(locale);
    // The output starts with the first-line prefix.
    StringBuffer result = new StringBuffer(prefix1);


    int start = boundary.first();
    int end = boundary.next();
    // Running character count of the output line currently being filled.
    int lineLength = 0;

    while (end != BreakIterator.DONE) {
      // Text between two consecutive line-break opportunities.
      String word = text.substring(start, end);

      lineLength = lineLength + word.length();

      // Limit reached: the count restarts with the current segment's length.
      if (lineLength >= maxLength) {
        lineLength = word.length();

      start = end;
      end = boundary.next();

    return result.toString();
  // Example #3
   * Converts a line of text into an array of lower case words using a BreakIterator.wordInstance().
   * <p>This method is under the Jive Open Source Software License and was written by Mark Imbriaco.
   * @param text a String of text to convert into an array of words
   * @return text broken up into an array of words.
  public static String[] toLowerCaseWordArray(String text) {
    if (text == null || text.length() == 0) {
      return new String[0];

    List<String> wordList = new ArrayList<String>();
    BreakIterator boundary = BreakIterator.getWordInstance();
    int start = 0;

    for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
      String tmp = text.substring(start, end).trim();
      // Remove characters that are not needed.
      tmp = replace(tmp, "+", "");
      tmp = replace(tmp, "/", "");
      tmp = replace(tmp, "\\", "");
      tmp = replace(tmp, "#", "");
      tmp = replace(tmp, "*", "");
      tmp = replace(tmp, ")", "");
      tmp = replace(tmp, "(", "");
      tmp = replace(tmp, "&", "");
      if (tmp.length() > 0) {
    return wordList.toArray(new String[wordList.size()]);
  // Example #4
  * LIU: Finds the longest substring that fits a given width composed of subunits returned by a
  * BreakIterator. If the smallest subunit is too long, returns 0.
  * @param fMtr metrics to use
  * @param line the string to be fix into width
  * @param width line.substring(0, result) must be <= width
  * @param breaker the BreakIterator that will be used to find subunits
  * @return maximum characters, at boundaries returned by breaker, that fit into width, or zero on
  *     failure
 private int findFittingBreak(FontMetrics fMtr, String line, int width, BreakIterator breaker) {
   int last = breaker.first();
   int end = breaker.next();
   while (end != BreakIterator.DONE && visibleWidth(fMtr, line.substring(0, end)) <= width) {
     last = end;
     end = breaker.next();
   return last;
  /**
   * Stores the text and font size, breaks the text into lines at word boundaries so that a line
   * is cut once it becomes wider than the available area, and derives the area and box dimensions
   * from those lines.
   *
   * <p>NOTE(review): this listing appears truncated. Closing braces are absent, {@code
   * boundary.setText(...)} is never called, the trailing {@code line} computed after the loop is
   * not added to {@code lineList}, and no visible statement copies {@code lineList} into {@code
   * lines} before the widths are measured. Confirm against the original source.
   *
   * @param txtInit the text to lay out; must not be null
   * @param fontSizeInit the font size to store
   * @param maxWidth the maximum width; the available area is this value minus {@code pad}
   * @throws NullPointerException if {@code txtInit} is null
   */
  private void prepTxt(String txtInit, float fontSizeInit, float maxWidth) {
    if (txtInit == null) throw new NullPointerException();


    txt = txtInit;
    fontSize = fontSizeInit;
    areaWidth = maxWidth - pad;
    lineHeight = getTextAsc() + getTextDesc();

    // Reuse the line list across calls instead of reallocating it.
    if (lineList == null) lineList = new ArrayList<String>();
    else lineList.clear();

    BreakIterator boundary = BreakIterator.getWordInstance();

    // start = beginning of the current line, end = candidate end, prevEnd = last accepted end.
    int start = boundary.first();
    int end = boundary.next();
    int prevEnd = start;
    while (end != BreakIterator.DONE) {
      // Current line including the candidate segment, and the line without it.
      String line = txt.substring(start, end);
      String prevLine = txt.substring(start, prevEnd);

      float lineWidth = getTextWidth(line);

      if (lineWidth > areaWidth) {
        // If the first word is longer than lineWidth
        // prevLine is empty and should be ignored
        if (prevLine.length() > 0) lineList.add(prevLine);

        // The next line begins where the accepted part ended.
        start = prevEnd;

      prevEnd = end;
      end = boundary.next();
    // Remainder of the text after the last cut.
    String line = txt.substring(start, prevEnd);

    // Reallocate the result arrays only when the number of lines changed.
    if (lines == null || lines.length != lineList.size()) lines = new String[lineList.size()];
    if (lineWidths == null || lineWidths.length != lineList.size())
      lineWidths = new float[lineList.size()];

    // Measure every line and remember the widest one.
    maxLineWidth = 0;
    for (int i = 0; i < lines.length; i++) {
      lineWidths[i] = getTextWidth(lines[i]);
      if (maxLineWidth < lineWidths[i]) maxLineWidth = lineWidths[i];
    // The text area shrinks to the widest line; the box adds padding on both sides.
    areaWidth = maxLineWidth;
    areaHeight = lineHeight * lines.length;

    width = areaWidth + pad * 2;
    height = areaHeight + pad * 2;
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    assert (arguments.length >= 1 && arguments.length <= 3);
    if (arguments[0].get() == null) {
      return null;

    // if there is more than 1 argument specified, a different natural language
    // locale is being specified
    Locale locale = null;
    if (arguments.length > 1 && arguments[1].get() != null) {
      Text language = (Text) converters[1].convert(arguments[1].get());
      Text country = null;
      if (arguments.length > 2 && arguments[2].get() != null) {
        country = (Text) converters[2].convert(arguments[2].get());
      if (country != null) {
        locale = new Locale(language.toString().toLowerCase(), country.toString().toUpperCase());
      } else {
        locale = new Locale(language.toString().toLowerCase());
    } else {
      locale = Locale.getDefault();

    // get the input and prepare the output
    Text chunk = (Text) converters[0].convert(arguments[0].get());
    String text = chunk.toString();
    ArrayList<ArrayList<Text>> result = new ArrayList<ArrayList<Text>>();

    // Parse out sentences using Java's text-handling API
    BreakIterator bi = BreakIterator.getSentenceInstance(locale);
    int idx = 0;
    while (bi.next() != BreakIterator.DONE) {
      String sentence = text.substring(idx, bi.current());
      idx = bi.current();
      result.add(new ArrayList<Text>());

      // Parse out words in the sentence
      BreakIterator wi = BreakIterator.getWordInstance(locale);
      int widx = 0;
      ArrayList<Text> sent_array = result.get(result.size() - 1);
      while (wi.next() != BreakIterator.DONE) {
        String word = sentence.substring(widx, wi.current());
        widx = wi.current();
        if (Character.isLetterOrDigit(word.charAt(0))) {
          sent_array.add(new Text(word));

    return result;
  static List<String> extractTokens(String document) {
    BreakIterator iterator = BreakIterator.getWordInstance();
    ArrayList<String> result = new ArrayList<String>();
    int start = iterator.first();
    for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
      String s = document.substring(start, end).toLowerCase().replaceAll("[^a-z]", "");
      if (!ENGLISH_STOP_WORDS.contains(s)) {

    return result;
  * Extract out sentences from the reviews. to take into account the negative lists Later Check
  * Stanford Document tokenizer
 private void BreakInLines() {
   // this.Lines = review.split(". ");
   BreakIterator border = BreakIterator.getSentenceInstance(Locale.US);
   //	System.out.println(review);
   int start = border.first();
   // iterate, creating sentences out of all the Strings between the given boundaries
   for (int end = border.next(); end != BreakIterator.DONE; start = end, end = border.next()) {
     // System.out.println(review.substring(start,end));
     Lines.add(review.substring(start, end));
   // System.out.println(NumOfSentences);
  /**
   * Stores the text and font size, breaks the text into lines at word boundaries so that a line
   * is cut once it becomes wider than the available area, and derives the area and box dimensions
   * from the collected lines.
   *
   * <p>NOTE(review): this listing appears truncated. Closing braces are absent, {@code
   * boundary.setText(...)} is never called, and the trailing {@code line} computed after the
   * loop is not added to {@code lineList}. Confirm against the original source.
   *
   * @param txtInit the text to lay out; must not be null
   * @param fontSizeInit the font size to store
   * @param maxWidth the maximum width; the available area is this value minus {@code pad}
   * @throws NullPointerException if {@code txtInit} is null
   */
  private void calcBoxDimensions(CharSequence txtInit, float fontSizeInit, float maxWidth) {
    if (txtInit == null) throw new NullPointerException();


    txt = txtInit;
    fontSize = fontSizeInit;
    areaWidth = maxWidth - pad;
    linesHeight = getTextAsc() + getTextDesc();

    // Reuse the line list across calls instead of reallocating it.
    if (lineList == null) lineList = new ArrayList<CharSequence>();
    else lineList.clear();

    BreakIterator boundary = BreakIterator.getWordInstance();

    // start = beginning of the current line, end = candidate end, prevEnd = last accepted end.
    int start = boundary.first();
    int end = boundary.next();
    int prevEnd = start;
    while (end != BreakIterator.DONE) {
      // Current line including the candidate segment, and the line without it.
      CharSequence line = txt.subSequence(start, end);
      CharSequence prevLine = txt.subSequence(start, prevEnd);
      float lineWidth = getTextWidth(line, 0, line.length());

      if (lineWidth > areaWidth) {
        // If the first word is longer than lineWidth
        // prevLine is empty and should be ignored
        if (prevLine.length() > 0) lineList.add(prevLine);

        // The next line begins where the accepted part ended.
        start = prevEnd;

      prevEnd = end;
      end = boundary.next();
    // Remainder of the text after the last cut.
    CharSequence line = txt.subSequence(start, prevEnd);

    // Measure every collected line and remember the widest one.
    maxLinesWidth = 0;
    for (CharSequence seq : lineList) {
      float lineWidth = getTextWidth(seq, 0, seq.length());
      if (maxLinesWidth < lineWidth) maxLinesWidth = lineWidth;
    // The text area shrinks to the widest line; the box adds padding on both sides.
    areaWidth = maxLinesWidth;
    areaHeight = linesHeight * lineList.size();

    width = areaWidth + pad * 2;
    height = areaHeight + pad * 2;
  // Example #10
  public void check(String name, String in, String[] out, BreakIterator bi, TestHarness harness) {

    int index = 0;
    int from = bi.current();
    harness.check(from, 0);

    while (true) {
      int to = bi.next();
      if (to == BreakIterator.DONE) break;
      harness.check(in.substring(from, to), out[index]);
      from = to;

    harness.check(index, out.length);

    harness.checkPoint("backwards " + name);
    index = out.length - 1;
    from = bi.current();
    harness.check(from, in.length());

    while (true) {
      int to = bi.previous();
      if (to == BreakIterator.DONE) break;
      harness.check(in.substring(to, from), out[index]);
      from = to;

    harness.check(index, -1);
  // Example #11
  /**
   * Sets or updates the text of the display labels. These are the week column headers above the
   * days on the calendar part of the <code>CDateTime</code>.
   */
 private void updateDaysOfWeek() {
   // NOTE(review): this listing appears truncated. The first operand of the `ltr` expression is
   // missing, the non-compact `else` branch has no visible body, closing braces are absent, and
   // the character iterator is never bound to `str` with setText(...). Confirm against the
   // original source.
   if (dayPanel != null) {
     // Start from the locale's first day of the week and advance one day per label.
     Calendar tmpcal = cdt.getCalendarInstance();
     tmpcal.set(Calendar.DAY_OF_WEEK, tmpcal.getFirstDayOfWeek());
     Locale locale = cdt.getLocale();
     // The visible part of the condition excludes the language code "zh".
     boolean ltr =
             && !locale.getLanguage().equals("zh")); // $NON-NLS-1$
     BreakIterator iterator = BreakIterator.getCharacterInstance(locale);
     for (int x = 0; x < dayLabels.length; x++) {
       String str = getFormattedDate("E", tmpcal.getTime()); // $NON-NLS-1$
       if (dayLabels[x].getData(CDT.Key.Compact, Boolean.class)) {
         // Compact labels show a single character of the day name: the first one when `ltr` is
         // true, otherwise the last one.
         int start, end;
         if (ltr) {
           start = iterator.first();
           end = iterator.next();
         } else {
           end = iterator.last();
           start = iterator.previous();
         dayLabels[x].setText(str.substring(start, end));
       } else {
       tmpcal.add(Calendar.DAY_OF_WEEK, 1);
  /**
   * Builds a map from each lower-cased word of {@code target} to a {@code WordTuple}, using the
   * sentence end offsets in {@code sentences} to work out which sentence a word belongs to.
   *
   * <p>NOTE(review): this listing appears truncated. Closing braces are absent and no visible
   * statement stores {@code sentenceNo} or {@code sentenceList} in the tuple, so how the tuple
   * records sentence numbers cannot be told from here. The iterator is not bound to {@code
   * target} in this method — presumably the caller does that; verify.
   *
   * @param target the text the word iterator is scanning
   * @param wordIterator word-boundary iterator
   * @param sentences sentence boundary offsets, compared against each word's end offset
   * @return map keyed by lower-cased word
   */
  static HashMap<String, WordTuple> findWordsInSentences(
      String target, BreakIterator wordIterator, ArrayList<Integer> sentences) {

    HashMap<String, WordTuple> wordMap = new HashMap<String, WordTuple>();

    int start = wordIterator.first();
    int end = wordIterator.next();

    while (end != BreakIterator.DONE) {
      String word = target.substring(start, end);
      // Skip tokens that start with whitespace or punctuation.
      if (Character.isLetterOrDigit(word.charAt(0))) {
        // System.out.println(word);
        // System.out.println(start + "-" + end);
        // check which sentence the word is in by comparing end with values in sentences
        int sentenceNo = 0;
        for (int i = 0; i < sentences.size(); i++) {
          if (end <= sentences.get(i)) {
            sentenceNo = i;
        // lowercase the word
        String wordLc = word.toLowerCase();
        // check if word exists in hashmap
        if (wordMap.containsKey(wordLc)) {
          // if exists, add sentence number to word's list in hashmap
          WordTuple wordTuple = wordMap.get(wordLc);
          ArrayList<Integer> sentenceList = wordTuple.getSentenceList();
          wordMap.put(wordLc, wordTuple);
        } else {
          // if it does not exist, create list, add sentence number to list, and add list to hashmap
          // with word as key
          ArrayList<Integer> sentenceList = new ArrayList<Integer>();
          WordTuple wordTuple = new WordTuple();
          wordMap.put(wordLc, wordTuple);
      start = end;
      end = wordIterator.next();
    return wordMap;
  // Example #13
  private static Tokens splitText(final String text) {
    final List<Token> l = new LinkedList<>();

    // use a BreakIterator to iterate our way through the words of the text
    final BreakIterator wordIterator = BreakIterator.getWordInstance(new Locale("en", "US"));

    // simply iterate through the text, keeping track of a start and end index of the current word
    int startIdx = wordIterator.first();
    for (int endIdx = wordIterator.next();
        endIdx != DONE;
        startIdx = endIdx, endIdx = wordIterator.next()) {
      final String word = text.substring(startIdx, endIdx);
      l.add(new Token(startIdx, word));

    return new Tokens(l);
  // Example #14
   * 返回一段文字中的单词的数组
   * @param text
   * @return
  public static final String[] toLowerCaseWordArray(String text) {
    if (text == null || text.length() == 0) return new String[0];
    ArrayList wordList = new ArrayList();
    BreakIterator boundary = BreakIterator.getWordInstance();
    int start = 0;
    for (int end = boundary.next(); end != -1; end = boundary.next()) {
      String tmp = text.substring(start, end).trim();
      tmp = replace(tmp, "+", "");
      tmp = replace(tmp, "/", "");
      tmp = replace(tmp, "\\", "");
      tmp = replace(tmp, "#", "");
      tmp = replace(tmp, "*", "");
      tmp = replace(tmp, ")", "");
      tmp = replace(tmp, "(", "");
      tmp = replace(tmp, "&", "");
      if (tmp.length() > 0) wordList.add(tmp);
      start = end;

    return (String[]) wordList.toArray(new String[wordList.size()]);
  // Example #15
  public static String convertStringToTitleCase(String toConvert) {
    BreakIterator wordBreaker = BreakIterator.getWordInstance();
    int end;

    String word = "";
    for (int start = wordBreaker.first();
        (end = wordBreaker.next()) != BreakIterator.DONE;
        start = end) {

      word += StringProcessing.wordToTitleCase(toConvert.substring(start, end));

    return word;
  static ArrayList<Integer> findSentenceBoundaries(String target, BreakIterator iterator) {

    ArrayList<Integer> sentenceBoundaryList = new ArrayList<Integer>();
    int boundary = iterator.first();

    while (boundary != BreakIterator.DONE) {
      boundary = iterator.next();
      if (boundary != -1) {

    return sentenceBoundaryList;
  // Example #17
 public static void javaBreakIterator() {
   BreakIterator wordIterator = BreakIterator.getWordInstance();
   String text = "Let's pause, and then reflect.";
   int boundary = wordIterator.first();
   while (boundary != BreakIterator.DONE) {
     int begin = boundary;
     System.out.print(boundary + "-");
     boundary = wordIterator.next();
     int end = boundary;
     if (end == BreakIterator.DONE) break;
     System.out.print(boundary + " [" + text.substring(begin, end) + "];");
  // Example #18
 static String extractShortDescription(String description) {
   if (description == null) {
     return null;
   int dot = description.indexOf(".");
   if (dot != -1) {
     BreakIterator breakIterator = BreakIterator.getSentenceInstance(Locale.US);
     String text = description.substring(breakIterator.first(), breakIterator.next()).trim();
     return removeSpaceBetweenLine(text);
   } else {
     String[] lines = description.split(NEW_LINE);
     return lines[0].trim();
  // Example #19
  /**
   * Iterates over the sentences of {@code TEST_STRING} with a US-English sentence iterator and
   * prints each one.
   *
   * <p>NOTE(review): this listing appears truncated. {@code sentenceIterator.setText(...)} is
   * never called, nothing visible adds to {@code sentenceList}, the loop is never closed and no
   * assertions are shown. Confirm against the original source.
   */
  public void testSentenceDetection() {
    BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.US);


    int start = sentenceIterator.first();
    int end = -1;

    List<String> sentenceList = new ArrayList<String>();

    // Each iteration covers the text between the previous boundary and the next one.
    while ((end = sentenceIterator.next()) != BreakIterator.DONE) {
      String sentence = TEST_STRING.substring(start, end);
      start = end;

      System.out.println("Sentence: " + sentence);
   * Called to summarize a document when no hits were found. By default this just returns the first
   * {@code maxPassages} sentences; subclasses can override to customize.
  protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
    // BreakIterator should be un-next'd:
    List<Passage> passages = new ArrayList<>();
    int pos = bi.current();
    assert pos == 0;
    while (passages.size() < maxPassages) {
      int next = bi.next();
      if (next == BreakIterator.DONE) {
      Passage passage = new Passage();
      passage.score = Float.NaN;
      passage.startOffset = pos;
      passage.endOffset = next;
      pos = next;

    return passages.toArray(new Passage[passages.size()]);
 /** Returns the next word in the text */
 public String nextWord() {
   if (!first) {
     currentWordPos = nextWordPos;
     currentWordEnd = getNextWordEnd(text, currentWordPos);
     nextWordPos = getNextWordStart(text, currentWordEnd + 1);
     int current = sentanceIterator.current();
     if (current == currentWordPos) startsSentance = true;
     else {
       startsSentance = false;
       if (currentWordEnd > current) sentanceIterator.next();
   // The nextWordPos has already been populated
   String word = null;
   try {
     word = document.getText(currentWordPos, currentWordEnd - currentWordPos);
   } catch (BadLocationException ex) {
     moreTokens = false;
   first = false;
   if (nextWordPos == -1) moreTokens = false;
   return word;
  // Example #22
   * Initializes fields comment, inlineTags of the object
   * @param commentText the processed comment text
  private void procComment(String commentText) {
    // initialize inlineTags
    ArrayList<Tag> result = new ArrayList<Tag>();
    String noInlineTags = replaceAtSigns(commentText);

     * Pattern p = Pattern.compile("\\{\\s*@[^}]*\\}"); // matches inline
     * tags // Pattern p =
     * Pattern.compile("\\{\\s*@([^\\s\\}]+)\\s*([^\\}]*)\\s*}"); // matches
     * inline tags Matcher m = p.matcher(commentText); int start = 0, end =
     * 0; // create an array of tag objects of kind "Text" and "@link"; as
     * explained in the // doclet API, for a comment // This is a {@link Doc
     * commentlabel} example. // create an array of Tag objects: // *
     * tags[0] is a Tag with name "Text" and text consisting of "This is a "
     * // * tags[1] is a SeeTag with name "@link", and label "commentlabel"
     * // * tags[2] is a Tag with name "Text" and text consisting of
     * " example." while (m.find()) { end = m.start(); String linkText =
     * m.group(); // System.out.print("String = \"" +
     * commentText.substring(start, end)); //
     * System.out.println("\"; linkText = \"" + linkText + "\""); //
     * result.add(new X10Tag("Text", commentText.substring(start, end),
     * this)); result.add(X10Tag.processInlineTag(linkText, this)); //int
     * index = commentText.indexOf(linkText); //commentText =
     * commentText.substring(0, index) + commentText.substring(index +
     * linkText.length()); // result.add(new X10SeeTag(true, linkText,
     * this)); // "true" signifies an @link tag, as opposed to an @see tag
     * start = m.end(); }
    if (!commentText.startsWith("@")) { // make sure that there is a
      // beginning paragraph
      // initialize comment
      int blockTagStart = noInlineTags.indexOf("@"); // start of block
      // tags within
      // comment
      blockTagStart = (blockTagStart == -1) ? commentText.length() : blockTagStart;
      this.comment = commentText.substring(0, blockTagStart).trim();
      if (!comment.equals("")) {
        result.addAll(createInlineTags(comment, this));

      // }
      // add constraints, if any
      // String decl = declString();
      // if (decl != null) {
      // result.add(new X10Tag(decl, this));
      // }

      // initialize firstSentenceTags
      BreakIterator b = BreakIterator.getSentenceInstance();
      int start = 0;
      int end = 0;
      start = b.first();
      end = b.next();
      String firstSentence = ((start <= end) ? comment.substring(start, end).trim() : "");
      // System.out.println("X10Doc.initializeFields(): firstSentence = \""
      // + firstSentence + "\"");
      firstSentenceTags = createInlineTags(firstSentence, this).toArray(new X10Tag[0]);

    } else {
      firstSentenceTags = new X10Tag[0];

    inlineTags = result.toArray(new X10Tag[0]);

    // TODO: creating Tag objects for block tags and storing them in a field
    // of this object
    Pattern blockTagPattern = Pattern.compile("\\s*@[^@]*");
    Matcher blockTagMatcher = blockTagPattern.matcher(noInlineTags);
    while (blockTagMatcher.find()) {
      String tagText = blockTagMatcher.group();
      int start = blockTagMatcher.start();
      processBlockTag(commentText.substring(start, start + tagText.length()));
  // algorithm: treat sentence snippets as miniature documents
  // we can intersect these with the postings lists via BreakIterator.preceding(offset),s
  // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
  //
  // NOTE(review): this listing appears truncated. Most closing braces are absent and several
  // branches and loops have no visible body or exit (for example the `de == EMPTY` branch, the
  // content-limit check, the sort call around the anonymous comparator, and the updates of `tf`
  // and `off.pos`). The comments below describe only what the visible statements do; confirm
  // anything else against the original source.
  private Passage[] highlightDoc(
      String field,
      BytesRef terms[],
      int contentLength,
      BreakIterator bi,
      int doc,
      TermsEnum termsEnum,
      DocsAndPositionsEnum[] postings,
      int n)
      throws IOException {
    PassageScorer scorer = getScorer(field);
    if (scorer == null) {
      throw new NullPointerException("PassageScorer cannot be null");
    // Queue of per-term offset enumerators that are positioned on this document.
    PriorityQueue<OffsetsEnum> pq = new PriorityQueue<>();
    // One weight per term, filled in only for terms that occur in `doc`.
    float weights[] = new float[terms.length];
    // initialize postings
    for (int i = 0; i < terms.length; i++) {
      DocsAndPositionsEnum de = postings[i];
      int pDoc;
      // Three cases: the slot holds the EMPTY marker, has not been opened yet (null), or holds
      // an enumerator opened earlier.
      if (de == EMPTY) {
      } else if (de == null) {
        postings[i] = EMPTY; // initially
        if (!termsEnum.seekExact(terms[i])) {
          continue; // term not found
        de =
            postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
        if (de == null) {
          // no positions available
          throw new IllegalArgumentException(
              "field '" + field + "' was indexed without offsets, cannot highlight");
        pDoc = de.advance(doc);
      } else {
        // Already opened: advance only if it is still behind the requested document.
        pDoc = de.docID();
        if (pDoc < doc) {
          pDoc = de.advance(doc);

      // The term occurs in this document: record its weight and queue its offsets.
      if (doc == pDoc) {
        weights[i] = scorer.weight(contentLength, de.freq());
        pq.add(new OffsetsEnum(de, i));

    pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination

    // Candidate passages ordered by ascending score, ties broken by start offset, so the head
    // of the queue is the weakest candidate.
    PriorityQueue<Passage> passageQueue =
        new PriorityQueue<>(
            new Comparator<Passage>() {
              public int compare(Passage left, Passage right) {
                if (left.score < right.score) {
                  return -1;
                } else if (left.score > right.score) {
                  return 1;
                } else {
                  return left.startOffset - right.startOffset;
    // The passage currently being accumulated.
    Passage current = new Passage();

    OffsetsEnum off;
    while ((off = pq.poll()) != null) {
      final DocsAndPositionsEnum dp = off.dp;
      int start = dp.startOffset();
      if (start == -1) {
        throw new IllegalArgumentException(
            "field '" + field + "' was indexed without offsets, cannot highlight");
      int end = dp.endOffset();
      // LUCENE-5166: this hit would span the content limit... however more valid
      // hits may exist (they are sorted by start). so we pretend like we never
      // saw this term, it won't cause a passage to be added to passageQueue or anything.
      assert EMPTY.startOffset() == Integer.MAX_VALUE;
      if (start < contentLength && end > contentLength) {
      // The hit starts at or beyond the end of the current passage: close it and open a new one.
      if (start >= current.endOffset) {
        if (current.startOffset >= 0) {
          // finalize current
          current.score *= scorer.norm(current.startOffset);
          // new sentence: first add 'current' to queue
          if (passageQueue.size() == n && current.score < passageQueue.peek().score) {
            current.reset(); // can't compete, just reset it
          } else {
            // Reuse the polled passage object when the queue is over capacity, otherwise
            // allocate a fresh one.
            if (passageQueue.size() > n) {
              current = passageQueue.poll();
            } else {
              current = new Passage();
        // if we exceed limit, we are done
        if (start >= contentLength) {
          Passage passages[] = new Passage[passageQueue.size()];
          for (Passage p : passages) {
          // sort in ascending order
              new Comparator<Passage>() {
                public int compare(Passage left, Passage right) {
                  return left.startOffset - right.startOffset;
          return passages;
        // advance breakiterator
        assert BreakIterator.DONE < 0;
        // Snap the new passage to the boundaries around the hit, clamped to [0, contentLength].
        current.startOffset = Math.max(bi.preceding(start + 1), 0);
        current.endOffset = Math.min(bi.next(), contentLength);
      int tf = 0;
      while (true) {
        BytesRef term = terms[off.id];
        if (term == null) {
          // multitermquery match, pull from payload
          term = off.dp.getPayload();
          assert term != null;
        current.addMatch(start, end, term);
        if (off.pos == dp.freq()) {
          break; // removed from pq
        } else {
          start = dp.startOffset();
          end = dp.endOffset();
        if (start >= current.endOffset || end > contentLength) {
      // Add this term's contribution, scaled by the passage length.
      current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);

    // Dead code but compiler disagrees:
    assert false;
    return null;