/** * Converts a line of text into an array of lower case words using a BreakIterator.wordInstance(). * * <p>This method is under the Jive Open Source Software License and was written by Mark Imbriaco. * * @param text a String of text to convert into an array of words * @return text broken up into an array of words. */ public static String[] toLowerCaseWordArray(String text) { if (text == null || text.length() == 0) { return new String[0]; } List<String> wordList = new ArrayList<String>(); BreakIterator boundary = BreakIterator.getWordInstance(); boundary.setText(text); int start = 0; for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { String tmp = text.substring(start, end).trim(); // Remove characters that are not needed. tmp = replace(tmp, "+", ""); tmp = replace(tmp, "/", ""); tmp = replace(tmp, "\\", ""); tmp = replace(tmp, "#", ""); tmp = replace(tmp, "*", ""); tmp = replace(tmp, ")", ""); tmp = replace(tmp, "(", ""); tmp = replace(tmp, "&", ""); if (tmp.length() > 0) { wordList.add(tmp); } } return wordList.toArray(new String[wordList.size()]); }
/** Replaces the current word token */ public void replaceWord(String newWord) { if (currentWordPos != -1) { try { /* ORIGINAL document.remove(currentWordPos, currentWordEnd - currentWordPos); document.insertString(currentWordPos, newWord, null); */ // Howard's Version for Ekit Element element = ((javax.swing.text.html.HTMLDocument) document).getCharacterElement(currentWordPos); AttributeSet attribs = element.getAttributes(); document.remove(currentWordPos, currentWordEnd - currentWordPos); document.insertString(currentWordPos, newWord, attribs); // End Howard's Version // Need to reset the segment document.getText(0, document.getLength(), text); } catch (BadLocationException ex) { throw new RuntimeException(ex.getMessage()); } // Position after the newly replaced word(s) // Position after the newly replaced word(s) first = true; currentWordPos = getNextWordStart(text, currentWordPos + newWord.length()); if (currentWordPos != -1) { currentWordEnd = getNextWordEnd(text, currentWordPos); nextWordPos = getNextWordStart(text, currentWordEnd); sentanceIterator.setText(text); sentanceIterator.following(currentWordPos); } else moreTokens = false; } }
/**
 * Takes the given filter text and breaks it down into words using a {@link BreakIterator}.
 *
 * <p>Only segments whose first code point is a letter or digit are returned; whitespace and
 * punctuation segments are skipped.
 *
 * @param text the text to split; must not be {@code null}
 * @return an array of words, empty if the text contains none
 */
private String[] getWords(String text) {
  List<String> words = new ArrayList<String>();
  /*
   * Break the text up into words, separating based on whitespace and common
   * punctuation. Previously used String.split(..., "\\W"), where "\W" is a
   * regular expression (see the Javadoc for class Pattern). Need to avoid
   * both String.split and regular expressions, in order to compile against
   * JCL Foundation (bug 80053). Also need to do this in an NL-sensitive way.
   * The use of BreakIterator was suggested in bug 90579.
   */
  BreakIterator iter = BreakIterator.getWordInstance();
  iter.setText(text);
  int i = iter.first();
  while (i != java.text.BreakIterator.DONE && i < text.length()) {
    int j = iter.following(i);
    if (j == java.text.BreakIterator.DONE) {
      j = text.length();
    }
    /*
     * Match the word. Inspect the whole code point rather than a single char so that a
     * word starting with a supplementary character (a surrogate pair) is not discarded.
     */
    if (Character.isLetterOrDigit(text.codePointAt(i))) {
      words.add(text.substring(i, j));
    }
    i = j;
  }
  return words.toArray(new String[words.size()]);
}
// offsets on any line will go from start,true to end,false // excluding start,false and end,true public Selection point2Offset(Point p, Selection o) { if (p.y < yInset) { o.caret = 0; o.clickAfter = true; return o; } int line = (p.y - yInset) / lineHeight; if (line >= lineCount) { o.caret = contents.length(); o.clickAfter = false; return o; } int target = p.x - xInset; if (target <= 0) { o.caret = lineStarts[line]; o.clickAfter = true; return o; } int lowGuess = lineStarts[line]; int lowWidth = 0; int highGuess = lineStarts[line + 1]; int highWidth = fm.stringWidth(contents.substring(lineStarts[line], highGuess)); if (target >= highWidth) { o.caret = lineStarts[line + 1]; o.clickAfter = false; return o; } while (lowGuess < highGuess - 1) { int guess = (lowGuess + highGuess) / 2; int width = fm.stringWidth(contents.substring(lineStarts[line], guess)); if (width <= target) { lowGuess = guess; lowWidth = width; if (width == target) break; } else { highGuess = guess; highWidth = width; } } // at end, either lowWidth < target < width(low+1), or lowWidth = target int highBound = charBreaker.following(lowGuess); int lowBound = charBreaker.previous(); // we are now at character boundaries if (lowBound != lowGuess) lowWidth = fm.stringWidth(contents.substring(lineStarts[line], lowBound)); if (highBound != highGuess) highWidth = fm.stringWidth(contents.substring(lineStarts[line], highBound)); // we now have the right widths if (target - lowWidth < highWidth - target) { o.caret = lowBound; o.clickAfter = true; } else { o.caret = highBound; o.clickAfter = false; } // we now have the closest! return o; }
/**
 * Creates a copy of this iterator. The wrapped iterator is cloned as well, so all status,
 * including the current position, is kept.
 *
 * @return a copy of this iterator
 * @throws InternalError if cloning is unexpectedly not supported; the original
 *     {@link CloneNotSupportedException} is attached as the cause
 */
@Override
public Object clone() {
  try {
    BreakIterator cloned = (BreakIterator) super.clone();
    cloned.wrapped = (com.ibm.icu4jni.text.BreakIterator) wrapped.clone();
    return cloned;
  } catch (CloneNotSupportedException e) {
    // initCause (rather than the two-argument constructor) keeps this compatible with
    // class libraries that predate InternalError(String, Throwable).
    InternalError error = new InternalError(e.getMessage());
    error.initCause(e);
    throw error;
  }
}
/**
 * Replaces the displayed contents with {@code text}.
 *
 * <p>Hands the new text to the character, word and line breakers, flags the line layout for
 * recomputation, notifies the text listener (if one is registered) and requests a repaint.
 *
 * @param text the new contents
 */
public void setText2(String text) {
  contents = text;

  // Every breaker has to see the new text before layout is redone.
  charBreaker.setText(text);
  wordBreaker.setText(text);
  lineBreaker.setText(text);
  redoLines = true;

  if (textListener != null) {
    TextEvent changed = new TextEvent(this, TextEvent.TEXT_VALUE_CHANGED);
    textListener.textValueChanged(changed);
  }

  repaint(16);
}
/** * Creates a copy of this iterator, all status information including the current position are kept * the same. * * @return a copy of this iterator. */ @Override public Object clone() { try { BreakIterator cloned = (BreakIterator) super.clone(); cloned.wrapped = (NativeBreakIterator) wrapped.clone(); return cloned; } catch (CloneNotSupportedException e) { throw new AssertionError(e); // android-changed } }
/**
 * LIU: Finds the longest prefix of {@code line} that fits within {@code width}, cutting only at
 * boundaries returned by the given BreakIterator. If even the smallest subunit is too wide,
 * the iterator's first boundary is returned (0 for a String).
 *
 * @param fMtr metrics used to measure the text
 * @param line the string to be fitted into {@code width}
 * @param width upper bound for the visible width of {@code line.substring(0, result)}
 * @param breaker the BreakIterator that supplies the candidate cut positions; its text is
 *     replaced by {@code line}
 * @return the largest boundary whose prefix fits into {@code width}, or zero on failure
 */
private int findFittingBreak(FontMetrics fMtr, String line, int width, BreakIterator breaker) {
  breaker.setText(line);
  int fitting = breaker.first();
  for (int candidate = breaker.next();
      candidate != BreakIterator.DONE;
      candidate = breaker.next()) {
    if (visibleWidth(fMtr, line.substring(0, candidate)) <= width) {
      fitting = candidate;
    } else {
      // First prefix that is too wide: every longer one is not tried.
      break;
    }
  }
  return fitting;
}
/**
 * Splits {@code inputText} into sentences and words using en-US break rules, then prints the
 * processed data.
 *
 * @param inputText the text to analyse
 */
static void parseWordDataQ1(String inputText) {
  Locale usEnglish = new Locale("en", "US");

  // Sentence end offsets first; the word pass needs them to assign a sentence to each word.
  ArrayList<Integer> sentenceEnds =
      findSentenceBoundaries(inputText, BreakIterator.getSentenceInstance(usEnglish));
  HashMap<String, WordTuple> wordIndex =
      findWordsInSentences(inputText, BreakIterator.getWordInstance(usEnglish), sentenceEnds);

  printProcessedData(sentenceEnds, wordIndex);
}
/**
 * Converts a string to title case by passing every segment found by a word
 * {@link BreakIterator} (default locale) through {@code StringProcessing.wordToTitleCase} and
 * concatenating the results in order.
 *
 * @param toConvert the text to convert; must not be {@code null}
 * @return the concatenation of the converted segments
 */
public static String convertStringToTitleCase(String toConvert) {
  BreakIterator wordBreaker = BreakIterator.getWordInstance();
  wordBreaker.setText(toConvert);

  // Accumulate in a builder: repeated String concatenation copies the whole result each time.
  StringBuilder converted = new StringBuilder(toConvert.length());
  int start = wordBreaker.first();
  for (int end = wordBreaker.next();
      end != BreakIterator.DONE;
      start = end, end = wordBreaker.next()) {
    converted.append(StringProcessing.wordToTitleCase(toConvert.substring(start, end)));
  }
  return converted.toString();
}
/**
 * Demonstrates word iteration with {@link BreakIterator}: prints every boundary of a fixed
 * sample sentence together with the segment between consecutive boundaries, all on one line
 * of standard output.
 */
public static void javaBreakIterator() {
  final String sample = "Let's pause, and then reflect.";
  BreakIterator words = BreakIterator.getWordInstance();
  words.setText(sample);

  int from = words.first();
  while (from != BreakIterator.DONE) {
    System.out.print(from + "-");
    int to = words.next();
    if (to == BreakIterator.DONE) {
      // The last boundary has no segment after it.
      break;
    }
    System.out.print(to + " [" + sample.substring(from, to) + "];");
    from = to;
  }
  System.out.println();
}
/**
 * Extracts a short description from a longer one.
 *
 * <p>If the description contains a period, the first sentence (as determined by a US-locale
 * sentence {@link BreakIterator}) is trimmed and passed through {@code removeSpaceBetweenLine}.
 * Otherwise the trimmed first element of the description split on {@code NEW_LINE} is returned.
 *
 * @param description the full description, may be {@code null}
 * @return the short description, or {@code null} if {@code description} is {@code null}
 */
static String extractShortDescription(String description) {
  if (description == null) {
    return null;
  }

  if (description.indexOf(".") == -1) {
    // No sentence terminator: fall back to the first line.
    String[] lines = description.split(NEW_LINE);
    return lines[0].trim();
  }

  BreakIterator sentences = BreakIterator.getSentenceInstance(Locale.US);
  sentences.setText(description);
  int from = sentences.first();
  int to = sentences.next();
  String firstSentence = description.substring(from, to).trim();
  return removeSpaceBetweenLine(firstSentence);
}
/**
 * Collects every boundary that {@code iterator} reports for {@code target}, except the first
 * one.
 *
 * @param target the text to scan
 * @param iterator the iterator to use; its text is replaced by {@code target}
 * @return the boundaries after the first, in ascending order
 */
static ArrayList<Integer> findSentenceBoundaries(String target, BreakIterator iterator) {
  ArrayList<Integer> boundaries = new ArrayList<Integer>();
  iterator.setText(target);

  if (iterator.first() == BreakIterator.DONE) {
    return boundaries;
  }
  for (int position = iterator.next();
      position != BreakIterator.DONE;
      position = iterator.next()) {
    boundaries.add(position);
  }
  return boundaries;
}
/**
 * Constructs a {@code LineBreakMeasurer} for the given text, delegating to the three-argument
 * constructor with the line-break iterator returned by {@link BreakIterator#getLineInstance()}.
 *
 * @param paramAttributedCharacterIterator the text to be measured and broken into lines
 * @param paramFontRenderContext the font render context passed on to the delegate constructor
 */
public LineBreakMeasurer(
    AttributedCharacterIterator paramAttributedCharacterIterator,
    FontRenderContext paramFontRenderContext) {
  this(
      paramAttributedCharacterIterator, BreakIterator.getLineInstance(), paramFontRenderContext);
}
private static void assertObsolete(String newCode, String oldCode, String displayName) { // Either code should get you the same locale. Locale newLocale = new Locale(newCode); Locale oldLocale = new Locale(oldCode); assertEquals(newLocale, oldLocale); // No matter what code you used to create the locale, you should get the old code back. assertEquals(oldCode, newLocale.getLanguage()); assertEquals(oldCode, oldLocale.getLanguage()); // Check we get the right display name. assertEquals(displayName, newLocale.getDisplayLanguage(newLocale)); assertEquals(displayName, oldLocale.getDisplayLanguage(newLocale)); assertEquals(displayName, newLocale.getDisplayLanguage(oldLocale)); assertEquals(displayName, oldLocale.getDisplayLanguage(oldLocale)); // Check that none of the 'getAvailableLocales' methods are accidentally returning two // equal locales (because to ICU they're different, but we mangle one into the other). assertOnce(newLocale, BreakIterator.getAvailableLocales()); assertOnce(newLocale, Calendar.getAvailableLocales()); assertOnce(newLocale, Collator.getAvailableLocales()); assertOnce(newLocale, DateFormat.getAvailableLocales()); assertOnce(newLocale, DateFormatSymbols.getAvailableLocales()); assertOnce(newLocale, NumberFormat.getAvailableLocales()); assertOnce(newLocale, Locale.getAvailableLocales()); }
/**
 * Sets the text this iterator works on to a window of {@code charSequence}.
 *
 * <p>The window spans from {@code start - WINDOW_WIDTH} (clamped to 0) to
 * {@code end + WINDOW_WIDTH} (clamped to the sequence length). The start of the window is
 * stored in {@code mOffsetShift}.
 *
 * @param charSequence the full text
 * @param start start of the range of interest within {@code charSequence}
 * @param end end of the range of interest within {@code charSequence}
 */
public void setCharSequence(CharSequence charSequence, int start, int end) {
  mOffsetShift = Math.max(0, start - WINDOW_WIDTH);
  final int windowEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);

  // Cut the window out first and only then convert it to a String; calling toString() on the
  // whole sequence would copy all of the text just to keep a small slice of it.
  mString = charSequence.subSequence(mOffsetShift, windowEnd).toString();
  mIterator.setText(mString);
}
static HashMap<String, WordTuple> findWordsInSentences( String target, BreakIterator wordIterator, ArrayList<Integer> sentences) { HashMap<String, WordTuple> wordMap = new HashMap<String, WordTuple>(); wordIterator.setText(target); int start = wordIterator.first(); int end = wordIterator.next(); while (end != BreakIterator.DONE) { String word = target.substring(start, end); if (Character.isLetterOrDigit(word.charAt(0))) { // System.out.println(word); // System.out.println(start + "-" + end); // check which sentence the word is in by comparing end with values in sentences int sentenceNo = 0; for (int i = 0; i < sentences.size(); i++) { if (end <= sentences.get(i)) { sentenceNo = i; break; } } // lowercase the word String wordLc = word.toLowerCase(); // check if word exists in hashmap if (wordMap.containsKey(wordLc)) { // if exists, add sentence number to word's list in hashmap WordTuple wordTuple = wordMap.get(wordLc); ArrayList<Integer> sentenceList = wordTuple.getSentenceList(); sentenceList.add(sentenceNo); wordMap.put(wordLc, wordTuple); } else { // if it does not exist, create list, add sentence number to list, and add list to hashmap // with word as key ArrayList<Integer> sentenceList = new ArrayList<Integer>(); sentenceList.add(sentenceNo); WordTuple wordTuple = new WordTuple(); wordTuple.setSentenceList(sentenceList); wordMap.put(wordLc, wordTuple); } } start = end; end = wordIterator.next(); } return wordMap; }
/**
 * Returns the position of the next boundary after the given offset, or {@code DONE} if there
 * is no boundary after it.
 *
 * @param offset the given start position to search from.
 * @return the position of the first boundary following the given offset, or {@code DONE}.
 */
public int nextBoundary(int offset) {
  // The iterator works on the window text, so translate into and out of window coordinates.
  final int next = mIterator.following(offset - mOffsetShift);
  return next == BreakIterator.DONE ? BreakIterator.DONE : next + mOffsetShift;
}
/**
 * Returns the position of the boundary preceding the given offset, or {@code DONE} if the
 * wrapped iterator reports none (for instance when the offset is the starting position).
 *
 * @param offset the given start position to search from.
 * @return the position of the last boundary preceding the given offset, or {@code DONE}.
 */
public int prevBoundary(int offset) {
  // The iterator works on the window text, so translate into and out of window coordinates.
  final int previous = mIterator.preceding(offset - mOffsetShift);
  if (previous != BreakIterator.DONE) {
    return previous + mOffsetShift;
  }
  return BreakIterator.DONE;
}
private static void checkIsBoundaryException(BreakIterator bi, int offset) { try { bi.isBoundary(offset); } catch (IllegalArgumentException e) { return; // OK } throw new RuntimeException(bi + ": isBoundary() doesn't throw an IAE with offset " + offset); }
/**
 * If <code>offset</code> is within a word, returns the index of the last character of that word
 * plus one, otherwise returns BreakIterator.DONE.
 *
 * <p>An offset counts as part of a word when it is the index of one of the word's characters,
 * <i>or</i> the index of its last character plus one. If offset is the index of a low
 * surrogate character, BreakIterator.DONE will be returned.
 *
 * <p>Valid range for offset is [0..textLength] (note the inclusive upper bound). The returned
 * value is within [offset..textLength] or BreakIterator.DONE.
 *
 * @throws IllegalArgumentException if offset is not valid.
 */
public int getEnd(int offset) {
  final int windowOffset = offset - mOffsetShift;
  checkOffsetIsValid(windowOffset);

  if (isAfterLetterOrDigit(windowOffset)) {
    // Directly behind a letter or digit: either this is already the word's end, or the end
    // is the next boundary.
    final int wordEnd =
        mIterator.isBoundary(windowOffset) ? windowOffset : mIterator.following(windowOffset);
    return wordEnd + mOffsetShift;
  }

  if (isOnLetterOrDigit(windowOffset)) {
    // At the first character of a word.
    return mIterator.following(windowOffset) + mOffsetShift;
  }

  return BreakIterator.DONE;
}
private static void checkPrecedingException(BreakIterator bi, int offset) { try { bi.preceding(offset); } catch (IllegalArgumentException e) { return; // OK } throw new RuntimeException(bi + ": preceding() doesn't throw an IAE with offset " + offset); }
/** * set / update the text of the displayLabels. these are the Week column headers above the days on * the Calendar part of the <code>CDateTime</code>. */ private void updateDaysOfWeek() { if (dayPanel != null) { Calendar tmpcal = cdt.getCalendarInstance(); tmpcal.set(Calendar.DAY_OF_WEEK, tmpcal.getFirstDayOfWeek()); Locale locale = cdt.getLocale(); boolean ltr = (ComponentOrientation.getOrientation(locale).isLeftToRight() && !locale.getLanguage().equals("zh")); // $NON-NLS-1$ BreakIterator iterator = BreakIterator.getCharacterInstance(locale); for (int x = 0; x < dayLabels.length; x++) { String str = getFormattedDate("E", tmpcal.getTime()); // $NON-NLS-1$ if (dayLabels[x].getData(CDT.Key.Compact, Boolean.class)) { iterator.setText(str); int start, end; if (ltr) { start = iterator.first(); end = iterator.next(); } else { end = iterator.last(); start = iterator.previous(); } dayLabels[x].setText(str.substring(start, end)); } else { dayLabels[x].setText(str); } tmpcal.add(Calendar.DAY_OF_WEEK, 1); } } }
public void test(TestHarness harness) { // Just to be explicit: we're only testing the US locale here. Locale loc = Locale.US; Locale.setDefault(loc); BreakIterator bi = BreakIterator.getLineInstance(loc); String[] r1 = {"How ", "much ", "time ", "is ", "left? ", "We ", "don't ", "know."}; check("How much", "How much time is left? We don't know.", r1, bi, harness); }
/**
 * Splits {@code TEST_STRING} into sentences with a US-locale sentence iterator, collecting and
 * printing each sentence. The collected sentences are not asserted on.
 */
@Test
public void testSentenceDetection() {
  BreakIterator sentences = BreakIterator.getSentenceInstance(Locale.US);
  sentences.setText(TEST_STRING);

  List<String> found = new ArrayList<String>();
  int from = sentences.first();
  for (int to = sentences.next(); to != BreakIterator.DONE; from = to, to = sentences.next()) {
    String sentence = TEST_STRING.substring(from, to);
    found.add(sentence);
    System.out.println("Sentence: " + sentence);
  }
}
/**
 * Points the iterator at a window of {@code charSequence}.
 *
 * <p>The window spans from {@code start - WINDOW_WIDTH} (clamped to 0) to
 * {@code end + WINDOW_WIDTH} (clamped to the sequence length). The start of the window is
 * stored in {@code mOffsetShift}.
 *
 * @param charSequence the full text
 * @param start start of the range of interest within {@code charSequence}
 * @param end end of the range of interest within {@code charSequence}
 */
public void setCharSequence(CharSequence charSequence, int start, int end) {
  mOffsetShift = Math.max(0, start - WINDOW_WIDTH);
  final int windowEnd = Math.min(charSequence.length(), end + WINDOW_WIDTH);

  // SpannableStringBuilder gets its own path via substring(); presumably this avoids creating
  // an intermediate spannable copy in subSequence() -- confirm against that class.
  mString =
      (charSequence instanceof SpannableStringBuilder)
          ? ((SpannableStringBuilder) charSequence).substring(mOffsetShift, windowEnd)
          : charSequence.subSequence(mOffsetShift, windowEnd).toString();
  mIterator.setText(mString);
}
/** * Called to summarize a document when no hits were found. By default this just returns the first * {@code maxPassages} sentences; subclasses can override to customize. */ protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { // BreakIterator should be un-next'd: List<Passage> passages = new ArrayList<>(); int pos = bi.current(); assert pos == 0; while (passages.size() < maxPassages) { int next = bi.next(); if (next == BreakIterator.DONE) { break; } Passage passage = new Passage(); passage.score = Float.NaN; passage.startOffset = pos; passage.endOffset = next; passages.add(passage); pos = next; } return passages.toArray(new Passage[passages.size()]); }
/**
 * Creates a tokenizer over the whole text of {@code document} and positions it on the first
 * word.
 *
 * <p>If the document contains no word, or its text cannot be read, the tokenizer is marked as
 * having no more tokens.
 *
 * @param document the document to tokenize
 */
public DocumentWordTokenizer(Document document) {
  this.document = document;
  // Create a text segment over the entire document
  text = new Segment();
  sentanceIterator = BreakIterator.getSentenceInstance();
  try {
    document.getText(0, document.getLength(), text);
    sentanceIterator.setText(text);
    currentWordPos = getNextWordStart(text, 0);
    if (currentWordPos == -1) {
      // A word position of -1 means the string was all white space.
      moreTokens = false;
    } else {
      currentWordEnd = getNextWordEnd(text, currentWordPos);
      nextWordPos = getNextWordStart(text, currentWordEnd);
    }
  } catch (BadLocationException unreadable) {
    // Treated the same as a document without words.
    moreTokens = false;
  }
}
/**
 * Reformats a string so that lines longer than {@code width} characters are broken with
 * {@code '\n'}, preferring line-break opportunities reported by a {@link BreakIterator} for
 * the given locale.
 *
 * <p>Existing newlines in the input start a new line for the purpose of measuring. When no
 * suitable break opportunity exists, the line is cut at the width limit.
 *
 * @param input the text to wrap, may be {@code null}
 * @param width the maximum line length; values below 5 disable wrapping
 * @param locale the locale whose line-break rules are used
 * @return the wrapped text; {@code ""} if {@code input} is {@code null}; {@code input} itself
 *     if {@code width < 5} or the input is not longer than {@code width}
 */
public static String wordWrap(String input, int width, Locale locale) {
  if (input == null) {
    return "";
  }
  if (width < 5 || width >= input.length()) {
    return input;
  }

  // Purely local buffer, so the unsynchronized builder is sufficient.
  StringBuilder buf = new StringBuilder(input);
  // Created lazily on the first wrap point and reused afterwards; setText() resets it.
  BreakIterator breaks = null;
  boolean endOfLine = false;
  int lineStart = 0;

  for (int i = 0; i < buf.length(); i++) {
    if (buf.charAt(i) == '\n') {
      lineStart = i + 1;
      endOfLine = true;
    }

    // Still within the allowed width of the current line.
    if (i <= (lineStart + width) - 1) {
      continue;
    }

    if (!endOfLine) {
      int limit = i - lineStart - 1;
      if (breaks == null) {
        breaks = BreakIterator.getLineInstance(locale);
      }
      breaks.setText(buf.substring(lineStart, i));
      int end = breaks.last();

      // If the character right after the limit is not whitespace we would cut a word in
      // half, so fall back to the previous break opportunity.
      if (end == limit + 1 && !Character.isWhitespace(buf.charAt(lineStart + end))) {
        end = breaks.preceding(end - 1);
      }

      if (end != BreakIterator.DONE && end == limit + 1) {
        // The limit falls exactly on whitespace: turn that character into the newline.
        buf.replace(lineStart + end, lineStart + end + 1, "\n");
        lineStart += end;
        continue;
      }
      if (end != BreakIterator.DONE && end != 0) {
        buf.insert(lineStart + end, '\n');
        lineStart = lineStart + end + 1;
      } else {
        // No usable break opportunity: hard break at the width limit.
        buf.insert(i, '\n');
        lineStart = i + 1;
      }
    } else {
      buf.insert(i, '\n');
      lineStart = i + 1;
      endOfLine = false;
    }
  }
  return buf.toString();
}
/** {@inheritDoc} */
public int following(int offset) {
  // Walk forward boundary by boundary until one is found that sits right after a letter or
  // digit, working in window coordinates throughout.
  int candidate = mIterator.following(offset - mOffsetShift);
  while (candidate != BreakIterator.DONE) {
    if (isAfterLetterOrDigit(candidate)) {
      return candidate + mOffsetShift;
    }
    candidate = mIterator.following(candidate);
  }
  return BreakIterator.DONE;
}