@Test public void splitWordAndLength() { StringBuilder sb = new StringBuilder(); List<String> brokenStrings = new ArrayList<String>(); BreakIterator boundary = BreakIterator.getWordInstance(); boundary.setText(val); int start = boundary.first(); for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { int lengthOfNext = end - start; if ((sb.length() + lengthOfNext) > 180) { brokenStrings.add(sb.toString()); sb = new StringBuilder(); // or set to 0 } sb.append(val.substring(start, end)); // if last element if (end == val.length()) { brokenStrings.add(sb.toString()); } } for (String x : brokenStrings) { System.out.println(x); } }
/**
 * Formats a string, wrapping it automatically once a line exceeds the given length.
 *
 * @param text the text to format
 * @param maxLength maximum length of a line
 * @param locale locale used to locate line-break opportunities
 * @param prefix1 prefix for the first line
 * @param prefix2 prefix for the second and subsequent lines
 * @return the formatted, wrapped string
 */
private String formatLines(
    String text, int maxLength, Locale locale, String prefix1, String prefix2) {
  BreakIterator boundary = BreakIterator.getLineInstance(locale);
  // StringBuilder instead of StringBuffer: no synchronization needed on a local buffer.
  StringBuilder result = new StringBuilder(prefix1);
  boundary.setText(text);
  int start = boundary.first();
  int end = boundary.next();
  int lineLength = 0;
  while (end != BreakIterator.DONE) {
    String word = text.substring(start, end);
    lineLength = lineLength + word.length();
    if (lineLength >= maxLength) {
      // Break the line; the word that overflowed becomes the first word of the new line.
      result.append("\n").append(prefix2);
      lineLength = word.length();
    }
    result.append(word);
    start = end;
    end = boundary.next();
  }
  return result.toString();
}
/** * Converts a line of text into an array of lower case words using a BreakIterator.wordInstance(). * * <p>This method is under the Jive Open Source Software License and was written by Mark Imbriaco. * * @param text a String of text to convert into an array of words * @return text broken up into an array of words. */ public static String[] toLowerCaseWordArray(String text) { if (text == null || text.length() == 0) { return new String[0]; } List<String> wordList = new ArrayList<String>(); BreakIterator boundary = BreakIterator.getWordInstance(); boundary.setText(text); int start = 0; for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { String tmp = text.substring(start, end).trim(); // Remove characters that are not needed. tmp = replace(tmp, "+", ""); tmp = replace(tmp, "/", ""); tmp = replace(tmp, "\\", ""); tmp = replace(tmp, "#", ""); tmp = replace(tmp, "*", ""); tmp = replace(tmp, ")", ""); tmp = replace(tmp, "(", ""); tmp = replace(tmp, "&", ""); if (tmp.length() > 0) { wordList.add(tmp); } } return wordList.toArray(new String[wordList.size()]); }
/**
 * LIU: Finds the longest prefix of {@code line}, ending on a boundary reported by
 * {@code breaker}, whose visible width fits inside {@code width}. Returns 0 when even
 * the smallest subunit is too long.
 *
 * @param fMtr metrics to use
 * @param line the string to be fit into width
 * @param width line.substring(0, result) must be &lt;= width
 * @param breaker the BreakIterator that will be used to find subunits
 * @return maximum characters, at boundaries returned by breaker, that fit into width,
 *     or zero on failure
 */
private int findFittingBreak(FontMetrics fMtr, String line, int width, BreakIterator breaker) {
  breaker.setText(line);
  int fitting = breaker.first();
  // Keep extending to the next boundary until the prefix no longer fits.
  for (int candidate = breaker.next();
      candidate != BreakIterator.DONE;
      candidate = breaker.next()) {
    if (visibleWidth(fMtr, line.substring(0, candidate)) > width) {
      break;
    }
    fitting = candidate;
  }
  return fitting;
}
/**
 * Prepares the text for rendering: measures it, greedily wraps it into lines that fit
 * within {@code maxWidth}, and caches the lines, per-line widths, and overall box size
 * in the instance fields.
 *
 * @param txtInit the text to lay out; must not be null
 * @param fontSizeInit font size used for measuring
 * @param maxWidth maximum box width, including padding
 * @throws NullPointerException if {@code txtInit} is null
 */
private void prepTxt(String txtInit, float fontSizeInit, float maxWidth) {
  if (txtInit == null) throw new NullPointerException();
  setFontSize(fontSizeInit);
  txt = txtInit;
  fontSize = fontSizeInit;
  // Usable line width once padding is subtracted.
  areaWidth = maxWidth - pad;
  lineHeight = getTextAsc() + getTextDesc();
  if (lineList == null) lineList = new ArrayList<String>();
  else lineList.clear();
  // Greedy word wrap: grow the candidate line word by word until it overflows.
  BreakIterator boundary = BreakIterator.getWordInstance();
  boundary.setText(txt);
  int start = boundary.first();
  int end = boundary.next();
  int prevEnd = start;
  while (end != BreakIterator.DONE) {
    String line = txt.substring(start, end); // candidate including the next word
    String prevLine = txt.substring(start, prevEnd); // candidate without it
    float lineWidth = getTextWidth(line);
    if (lineWidth > areaWidth) {
      // If the first word is longer than lineWidth
      // prevLine is empty and should be ignored
      if (prevLine.length() > 0) lineList.add(prevLine);
      start = prevEnd;
    }
    prevEnd = end;
    end = boundary.next();
  }
  // Flush the last (possibly partial) line.
  String line = txt.substring(start, prevEnd);
  lineList.add(line);
  // Reuse the cached arrays when sizes already match to avoid reallocation.
  if (lines == null || lines.length != lineList.size()) lines = new String[lineList.size()];
  if (lineWidths == null || lineWidths.length != lineList.size())
    lineWidths = new float[lineList.size()];
  lineList.toArray(lines);
  maxLineWidth = 0;
  for (int i = 0; i < lines.length; i++) {
    lineWidths[i] = getTextWidth(lines[i]);
    if (maxLineWidth < lineWidths[i]) maxLineWidth = lineWidths[i];
  }
  // Final box dimensions: tightest width plus padding on both sides.
  areaWidth = maxLineWidth;
  areaHeight = lineHeight * lines.length;
  width = areaWidth + pad * 2;
  height = areaHeight + pad * 2;
}
/**
 * Hive UDF entry point: splits the input text into sentences, and each sentence into
 * words, returning a nested list (one inner list of words per sentence). Tokens that do
 * not start with a letter or digit (punctuation, whitespace runs) are dropped.
 *
 * <p>Optional second/third arguments select the language and country of the locale used
 * for sentence and word breaking; otherwise the JVM default locale is used.
 */
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
  assert (arguments.length >= 1 && arguments.length <= 3);
  if (arguments[0].get() == null) {
    return null;
  }
  // if there is more than 1 argument specified, a different natural language
  // locale is being specified
  Locale locale = null;
  if (arguments.length > 1 && arguments[1].get() != null) {
    Text language = (Text) converters[1].convert(arguments[1].get());
    Text country = null;
    if (arguments.length > 2 && arguments[2].get() != null) {
      country = (Text) converters[2].convert(arguments[2].get());
    }
    if (country != null) {
      // Locale convention: lower-case language code, upper-case country code.
      locale = new Locale(language.toString().toLowerCase(), country.toString().toUpperCase());
    } else {
      locale = new Locale(language.toString().toLowerCase());
    }
  } else {
    locale = Locale.getDefault();
  }
  // get the input and prepare the output
  Text chunk = (Text) converters[0].convert(arguments[0].get());
  String text = chunk.toString();
  ArrayList<ArrayList<Text>> result = new ArrayList<ArrayList<Text>>();
  // Parse out sentences using Java's text-handling API
  BreakIterator bi = BreakIterator.getSentenceInstance(locale);
  bi.setText(text);
  int idx = 0;
  while (bi.next() != BreakIterator.DONE) {
    // bi.current() is the boundary just reached by next(); idx trails one boundary behind.
    String sentence = text.substring(idx, bi.current());
    idx = bi.current();
    result.add(new ArrayList<Text>());
    // Parse out words in the sentence
    BreakIterator wi = BreakIterator.getWordInstance(locale);
    wi.setText(sentence);
    int widx = 0;
    ArrayList<Text> sent_array = result.get(result.size() - 1);
    while (wi.next() != BreakIterator.DONE) {
      String word = sentence.substring(widx, wi.current());
      widx = wi.current();
      // Keep only tokens that begin with a letter or digit (skip punctuation/whitespace).
      if (Character.isLetterOrDigit(word.charAt(0))) {
        sent_array.add(new Text(word));
      }
    }
  }
  return result;
}
/**
 * Splits a document into lower-case alphabetic tokens, dropping English stop words.
 *
 * @param document the text to tokenize
 * @return the retained tokens, in order of appearance
 */
static List<String> extractTokens(String document) {
  BreakIterator iterator = BreakIterator.getWordInstance();
  iterator.setText(document);
  ArrayList<String> result = new ArrayList<String>();
  int start = iterator.first();
  for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
    // Normalize: lower-case and strip everything that is not a-z.
    String s = document.substring(start, end).toLowerCase().replaceAll("[^a-z]", "");
    // Fix: punctuation/whitespace tokens reduce to "" and were previously added to the
    // result unless "" happened to be in the stop-word set; skip them explicitly.
    if (!s.isEmpty() && !ENGLISH_STOP_WORDS.contains(s)) {
      result.add(s);
    }
  }
  return result;
}
/** * Extract out sentences from the reviews. to take into account the negative lists Later Check * Stanford Document tokenizer */ private void BreakInLines() { // this.Lines = review.split(". "); BreakIterator border = BreakIterator.getSentenceInstance(Locale.US); border.setText(review); // System.out.println(review); int start = border.first(); // iterate, creating sentences out of all the Strings between the given boundaries for (int end = border.next(); end != BreakIterator.DONE; start = end, end = border.next()) { // System.out.println(review.substring(start,end)); Lines.add(review.substring(start, end)); NumOfSentences++; } // System.out.println(NumOfSentences); }
/**
 * Measures the text and greedily wraps it into lines that fit within {@code maxWidth},
 * then derives the overall box dimensions (width/height including padding) into the
 * instance fields.
 *
 * @param txtInit the text to lay out; must not be null
 * @param fontSizeInit font size used for measuring
 * @param maxWidth maximum box width, including padding
 * @throws NullPointerException if {@code txtInit} is null
 */
private void calcBoxDimensions(CharSequence txtInit, float fontSizeInit, float maxWidth) {
  if (txtInit == null) throw new NullPointerException();
  setFontSize(fontSizeInit);
  txt = txtInit;
  fontSize = fontSizeInit;
  // Usable line width once padding is subtracted.
  areaWidth = maxWidth - pad;
  linesHeight = getTextAsc() + getTextDesc();
  if (lineList == null) lineList = new ArrayList<CharSequence>();
  else lineList.clear();
  // Greedy word wrap: grow the candidate line word by word until it overflows.
  BreakIterator boundary = BreakIterator.getWordInstance();
  boundary.setText(txt.toString());
  int start = boundary.first();
  int end = boundary.next();
  int prevEnd = start;
  while (end != BreakIterator.DONE) {
    CharSequence line = txt.subSequence(start, end); // candidate including the next word
    CharSequence prevLine = txt.subSequence(start, prevEnd); // candidate without it
    float lineWidth = getTextWidth(line, 0, line.length());
    if (lineWidth > areaWidth) {
      // If the first word is longer than lineWidth
      // prevLine is empty and should be ignored
      if (prevLine.length() > 0) lineList.add(prevLine);
      start = prevEnd;
    }
    prevEnd = end;
    end = boundary.next();
  }
  // Flush the last (possibly partial) line.
  CharSequence line = txt.subSequence(start, prevEnd);
  lineList.add(line);
  maxLinesWidth = 0;
  for (CharSequence seq : lineList) {
    float lineWidth = getTextWidth(seq, 0, seq.length());
    if (maxLinesWidth < lineWidth) maxLinesWidth = lineWidth;
  }
  // Final box dimensions: tightest width plus padding on both sides.
  areaWidth = maxLinesWidth;
  areaHeight = linesHeight * lineList.size();
  width = areaWidth + pad * 2;
  height = areaHeight + pad * 2;
}
/**
 * Runs {@code bi} over {@code in} forwards and then backwards, checking that the pieces
 * between successive boundaries equal the expected segments {@code out} (and, in the
 * backward pass, the same segments in reverse order).
 *
 * @param name checkpoint label for the forward pass
 * @param in the text to iterate over
 * @param out the expected segments, in forward order
 * @param bi the break iterator under test
 * @param harness test harness collecting the results
 */
public void check(String name, String in, String[] out, BreakIterator bi, TestHarness harness) {
  harness.checkPoint(name);
  bi.setText(in);
  int index = 0;
  int from = bi.current();
  harness.check(from, 0); // a freshly set iterator must start at offset 0
  while (true) {
    int to = bi.next();
    if (to == BreakIterator.DONE) break;
    harness.check(in.substring(from, to), out[index]);
    ++index;
    from = to;
  }
  harness.check(index, out.length); // every expected segment must have been consumed
  harness.checkPoint("backwards " + name);
  bi.last();
  index = out.length - 1;
  from = bi.current();
  harness.check(from, in.length()); // last() must position the iterator at the end
  while (true) {
    int to = bi.previous();
    if (to == BreakIterator.DONE) break;
    // Walking backwards: the segment lies between the new (smaller) and old boundary.
    harness.check(in.substring(to, from), out[index]);
    --index;
    from = to;
  }
  harness.check(index, -1); // every segment must have been seen again in reverse
}
/**
 * set / update the text of the displayLabels. these are the Week column headers above the days on
 * the Calendar part of the <code>CDateTime</code>.
 *
 * <p>Labels flagged as compact show only the leading character (grapheme) of the localized
 * day name; the others show the full short day name.
 */
private void updateDaysOfWeek() {
  if (dayPanel != null) {
    Calendar tmpcal = cdt.getCalendarInstance();
    // Start from the locale's first day of week (e.g. Sunday vs Monday).
    tmpcal.set(Calendar.DAY_OF_WEEK, tmpcal.getFirstDayOfWeek());
    Locale locale = cdt.getLocale();
    // NOTE(review): Chinese ("zh") is deliberately excluded from the left-to-right
    // path when picking the leading character — confirm this is intentional.
    boolean ltr =
        (ComponentOrientation.getOrientation(locale).isLeftToRight()
            && !locale.getLanguage().equals("zh")); // $NON-NLS-1$
    // Character instance respects grapheme boundaries, unlike charAt(0).
    BreakIterator iterator = BreakIterator.getCharacterInstance(locale);
    for (int x = 0; x < dayLabels.length; x++) {
      String str = getFormattedDate("E", tmpcal.getTime()); // $NON-NLS-1$
      if (dayLabels[x].getData(CDT.Key.Compact, Boolean.class)) {
        // Compact mode: show only the first (LTR) or last (RTL) character of the name.
        iterator.setText(str);
        int start, end;
        if (ltr) {
          start = iterator.first();
          end = iterator.next();
        } else {
          end = iterator.last();
          start = iterator.previous();
        }
        dayLabels[x].setText(str.substring(start, end));
      } else {
        dayLabels[x].setText(str);
      }
      tmpcal.add(Calendar.DAY_OF_WEEK, 1);
    }
  }
}
static HashMap<String, WordTuple> findWordsInSentences( String target, BreakIterator wordIterator, ArrayList<Integer> sentences) { HashMap<String, WordTuple> wordMap = new HashMap<String, WordTuple>(); wordIterator.setText(target); int start = wordIterator.first(); int end = wordIterator.next(); while (end != BreakIterator.DONE) { String word = target.substring(start, end); if (Character.isLetterOrDigit(word.charAt(0))) { // System.out.println(word); // System.out.println(start + "-" + end); // check which sentence the word is in by comparing end with values in sentences int sentenceNo = 0; for (int i = 0; i < sentences.size(); i++) { if (end <= sentences.get(i)) { sentenceNo = i; break; } } // lowercase the word String wordLc = word.toLowerCase(); // check if word exists in hashmap if (wordMap.containsKey(wordLc)) { // if exists, add sentence number to word's list in hashmap WordTuple wordTuple = wordMap.get(wordLc); ArrayList<Integer> sentenceList = wordTuple.getSentenceList(); sentenceList.add(sentenceNo); wordMap.put(wordLc, wordTuple); } else { // if it does not exist, create list, add sentence number to list, and add list to hashmap // with word as key ArrayList<Integer> sentenceList = new ArrayList<Integer>(); sentenceList.add(sentenceNo); WordTuple wordTuple = new WordTuple(); wordTuple.setSentenceList(sentenceList); wordMap.put(wordLc, wordTuple); } } start = end; end = wordIterator.next(); } return wordMap; }
private static Tokens splitText(final String text) { final List<Token> l = new LinkedList<>(); // use a BreakIterator to iterate our way through the words of the text final BreakIterator wordIterator = BreakIterator.getWordInstance(new Locale("en", "US")); wordIterator.setText(text); // simply iterate through the text, keeping track of a start and end index of the current word int startIdx = wordIterator.first(); for (int endIdx = wordIterator.next(); endIdx != DONE; startIdx = endIdx, endIdx = wordIterator.next()) { final String word = text.substring(startIdx, endIdx); l.add(new Token(startIdx, word)); } return new Tokens(l); }
/**
 * Returns the array of lower-case words contained in a piece of text.
 *
 * @param text the text to split; may be {@code null}
 * @return the words of {@code text}, lower-cased and stripped of special characters
 */
public static final String[] toLowerCaseWordArray(String text) {
  if (text == null || text.length() == 0) return new String[0];
  // Typed list instead of a raw ArrayList.
  ArrayList<String> wordList = new ArrayList<String>();
  BreakIterator boundary = BreakIterator.getWordInstance();
  boundary.setText(text);
  int start = 0;
  // Use the named constant instead of the magic value -1.
  for (int end = boundary.next(); end != BreakIterator.DONE; end = boundary.next()) {
    // Fix: lower-case each token, as the method name promises — the original never did.
    String tmp = text.substring(start, end).trim().toLowerCase();
    // Strip characters that carry no meaning for word matching.
    tmp = replace(tmp, "+", "");
    tmp = replace(tmp, "/", "");
    tmp = replace(tmp, "\\", "");
    tmp = replace(tmp, "#", "");
    tmp = replace(tmp, "*", "");
    tmp = replace(tmp, ")", "");
    tmp = replace(tmp, "(", "");
    tmp = replace(tmp, "&", "");
    if (tmp.length() > 0) wordList.add(tmp);
    start = end;
  }
  return (String[]) wordList.toArray(new String[wordList.size()]);
}
/**
 * Converts each word of the given string to title case.
 *
 * @param toConvert the string to convert
 * @return the string with every word title-cased
 */
public static String convertStringToTitleCase(String toConvert) {
  BreakIterator wordBreaker = BreakIterator.getWordInstance();
  wordBreaker.setText(toConvert);
  // Build with StringBuilder: O(n) instead of O(n^2) repeated String concatenation.
  StringBuilder result = new StringBuilder();
  int end;
  for (int start = wordBreaker.first();
      (end = wordBreaker.next()) != BreakIterator.DONE;
      start = end) {
    result.append(StringProcessing.wordToTitleCase(toConvert.substring(start, end)));
  }
  return result.toString();
}
/**
 * Collects the boundary offsets of {@code target} reported by the given break iterator.
 *
 * <p>The returned list contains every boundary after the start of the text, including the
 * final boundary at {@code target.length()}; it is empty for empty input.
 *
 * @param target the text to scan
 * @param iterator the (sentence) break iterator to use
 * @return the boundary offsets, in ascending order
 */
static ArrayList<Integer> findSentenceBoundaries(String target, BreakIterator iterator) {
  ArrayList<Integer> sentenceBoundaryList = new ArrayList<Integer>();
  iterator.setText(target);
  int boundary = iterator.first();
  while (boundary != BreakIterator.DONE) {
    boundary = iterator.next();
    // Use the named constant instead of the magic value -1.
    if (boundary != BreakIterator.DONE) {
      sentenceBoundaryList.add(boundary);
    }
  }
  return sentenceBoundaryList;
}
/**
 * Demonstrates {@link BreakIterator} word iteration: prints each boundary offset pair
 * and the word between them, e.g. {@code 0-5 [Let's];5-6 [ ];...}.
 */
public static void javaBreakIterator() {
  String text = "Let's pause, and then reflect.";
  BreakIterator wordIterator = BreakIterator.getWordInstance();
  wordIterator.setText(text);
  for (int begin = wordIterator.first(); begin != BreakIterator.DONE; ) {
    System.out.print(begin + "-");
    int end = wordIterator.next();
    // The final boundary has no following word: stop after printing its offset.
    if (end == BreakIterator.DONE) {
      break;
    }
    System.out.print(end + " [" + text.substring(begin, end) + "];");
    begin = end;
  }
  System.out.println();
}
/**
 * Extracts a short, single-line description from the given text: the first sentence when
 * the text contains a period, otherwise its first line.
 *
 * @param description the full description; may be null
 * @return the short description, or null when {@code description} is null
 */
static String extractShortDescription(String description) {
  if (description == null) {
    return null;
  }
  // Without a period, sentence breaking is pointless: fall back to the first line.
  if (description.indexOf(".") == -1) {
    return description.split(NEW_LINE)[0].trim();
  }
  BreakIterator sentences = BreakIterator.getSentenceInstance(Locale.US);
  sentences.setText(description);
  int from = sentences.first();
  int to = sentences.next();
  String firstSentence = description.substring(from, to).trim();
  return removeSpaceBetweenLine(firstSentence);
}
/**
 * Splits {@code TEST_STRING} into sentences with a US-locale sentence iterator,
 * collecting and printing each sentence.
 */
@Test
public void testSentenceDetection() {
  BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.US);
  sentenceIterator.setText(TEST_STRING);
  List<String> sentenceList = new ArrayList<String>();
  int begin = sentenceIterator.first();
  for (int boundary = sentenceIterator.next();
      boundary != BreakIterator.DONE;
      boundary = sentenceIterator.next()) {
    String sentence = TEST_STRING.substring(begin, boundary);
    begin = boundary;
    sentenceList.add(sentence);
    System.out.println("Sentence: " + sentence);
  }
}
/** * Called to summarize a document when no hits were found. By default this just returns the first * {@code maxPassages} sentences; subclasses can override to customize. */ protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { // BreakIterator should be un-next'd: List<Passage> passages = new ArrayList<>(); int pos = bi.current(); assert pos == 0; while (passages.size() < maxPassages) { int next = bi.next(); if (next == BreakIterator.DONE) { break; } Passage passage = new Passage(); passage.score = Float.NaN; passage.startOffset = pos; passage.endOffset = next; passages.add(passage); pos = next; } return passages.toArray(new Passage[passages.size()]); }
/**
 * Returns the next word in the text.
 *
 * <p>Advances the word window (start/end/next-start fields), keeps the sentence iterator
 * in step so {@code startsSentance} reflects whether this word opens a sentence, and
 * clears {@code moreTokens} when the document runs out.
 */
public String nextWord() {
  if (!first) {
    // Advance the word window: the previously computed next start becomes current.
    currentWordPos = nextWordPos;
    currentWordEnd = getNextWordEnd(text, currentWordPos);
    nextWordPos = getNextWordStart(text, currentWordEnd + 1);
    int current = sentanceIterator.current();
    if (current == currentWordPos) startsSentance = true;
    else {
      startsSentance = false;
      // Keep the sentence iterator caught up with the word position.
      if (currentWordEnd > current) sentanceIterator.next();
    }
  }
  // The nextWordPos has already been populated
  String word = null;
  try {
    word = document.getText(currentWordPos, currentWordEnd - currentWordPos);
  } catch (BadLocationException ex) {
    // Document changed underneath us: stop iterating rather than fail.
    moreTokens = false;
  }
  wordCount++;
  first = false;
  if (nextWordPos == -1) moreTokens = false;
  return word;
}
/**
 * Initializes fields comment, inlineTags of the object
 *
 * @param commentText the processed comment text
 */
private void procComment(String commentText) {
  // initialize inlineTags
  ArrayList<Tag> result = new ArrayList<Tag>();
  String noInlineTags = replaceAtSigns(commentText);
  // NOTE(review): a large commented-out, regex-based implementation of inline-tag
  // parsing (Pattern "\\{\\s*@[^}]*\\}" splitting the comment into "Text" and
  // "@link" tags) previously lived here; createInlineTags now produces that array.
  if (!commentText.startsWith("@")) {
    // make sure that there is a beginning paragraph
    // initialize comment
    int blockTagStart = noInlineTags.indexOf("@"); // start of block tags within comment
    blockTagStart = (blockTagStart == -1) ? commentText.length() : blockTagStart;
    this.comment = commentText.substring(0, blockTagStart).trim();
    if (!comment.equals("")) {
      result.addAll(createInlineTags(comment, this));
    }
    // initialize firstSentenceTags: the first sentence of the comment, located
    // with a sentence BreakIterator in the default locale.
    BreakIterator b = BreakIterator.getSentenceInstance();
    b.setText(comment);
    int start = 0;
    int end = 0;
    start = b.first();
    end = b.next();
    // next() may return DONE (-1) for an empty comment, hence the start <= end guard.
    String firstSentence = ((start <= end) ? comment.substring(start, end).trim() : "");
    firstSentenceTags = createInlineTags(firstSentence, this).toArray(new X10Tag[0]);
  } else {
    // The comment consists solely of block tags: there is no first sentence.
    firstSentenceTags = new X10Tag[0];
  }
  inlineTags = result.toArray(new X10Tag[0]);
  // TODO: creating Tag objects for block tags and storing them in a field
  // of this object
  Pattern blockTagPattern = Pattern.compile("\\s*@[^@]*");
  Matcher blockTagMatcher = blockTagPattern.matcher(noInlineTags);
  while (blockTagMatcher.find()) {
    String tagText = blockTagMatcher.group();
    int start = blockTagMatcher.start();
    // Offsets found in noInlineTags are applied to commentText.
    // NOTE(review): this assumes replaceAtSigns is length-preserving — confirm.
    processBlockTag(commentText.substring(start, start + tagText.length()));
  }
}
// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset),s
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
/**
 * Builds up to {@code n} scored highlight passages for one document.
 *
 * <p>Term occurrences are merged in offset order via a priority queue of postings; each
 * sentence (as delimited by {@code bi}) that contains matches becomes a candidate
 * {@code Passage}, and the top-{@code n} passages by score are returned in ascending
 * start-offset order.
 *
 * @param field the field being highlighted
 * @param terms query terms (a null entry means a multi-term match carried in payloads)
 * @param contentLength highlighting stops at this offset
 * @param bi sentence break iterator, positioned at the start of the content
 * @param doc target document id
 * @param termsEnum terms enum used to seek terms not yet positioned
 * @param postings per-term postings; filled in lazily, EMPTY marks absent terms
 * @param n maximum number of passages to return
 * @return the best passages, sorted by start offset
 * @throws IOException on index access failure
 * @throws IllegalArgumentException if the field was indexed without offsets
 */
private Passage[] highlightDoc(
    String field,
    BytesRef terms[],
    int contentLength,
    BreakIterator bi,
    int doc,
    TermsEnum termsEnum,
    DocsAndPositionsEnum[] postings,
    int n)
    throws IOException {
  PassageScorer scorer = getScorer(field);
  if (scorer == null) {
    throw new NullPointerException("PassageScorer cannot be null");
  }
  PriorityQueue<OffsetsEnum> pq = new PriorityQueue<>();
  float weights[] = new float[terms.length];
  // initialize postings
  for (int i = 0; i < terms.length; i++) {
    DocsAndPositionsEnum de = postings[i];
    int pDoc;
    if (de == EMPTY) {
      continue;
    } else if (de == null) {
      postings[i] = EMPTY; // initially
      if (!termsEnum.seekExact(terms[i])) {
        continue; // term not found
      }
      de = postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
      if (de == null) {
        // no positions available
        throw new IllegalArgumentException(
            "field '" + field + "' was indexed without offsets, cannot highlight");
      }
      pDoc = de.advance(doc);
    } else {
      pDoc = de.docID();
      if (pDoc < doc) {
        pDoc = de.advance(doc);
      }
    }
    if (doc == pDoc) {
      // Term occurs in this doc: weight it and queue its first position.
      weights[i] = scorer.weight(contentLength, de.freq());
      de.nextPosition();
      pq.add(new OffsetsEnum(de, i));
    }
  }
  pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination
  // Min-heap on passage score so the weakest passage is evicted first.
  PriorityQueue<Passage> passageQueue =
      new PriorityQueue<>(
          n,
          new Comparator<Passage>() {
            @Override
            public int compare(Passage left, Passage right) {
              if (left.score < right.score) {
                return -1;
              } else if (left.score > right.score) {
                return 1;
              } else {
                return left.startOffset - right.startOffset;
              }
            }
          });
  Passage current = new Passage();
  OffsetsEnum off;
  while ((off = pq.poll()) != null) {
    final DocsAndPositionsEnum dp = off.dp;
    int start = dp.startOffset();
    if (start == -1) {
      throw new IllegalArgumentException(
          "field '" + field + "' was indexed without offsets, cannot highlight");
    }
    int end = dp.endOffset();
    // LUCENE-5166: this hit would span the content limit... however more valid
    // hits may exist (they are sorted by start). so we pretend like we never
    // saw this term, it won't cause a passage to be added to passageQueue or anything.
    assert EMPTY.startOffset() == Integer.MAX_VALUE;
    if (start < contentLength && end > contentLength) {
      continue;
    }
    if (start >= current.endOffset) {
      if (current.startOffset >= 0) {
        // finalize current
        current.score *= scorer.norm(current.startOffset);
        // new sentence: first add 'current' to queue
        if (passageQueue.size() == n && current.score < passageQueue.peek().score) {
          current.reset(); // can't compete, just reset it
        } else {
          passageQueue.offer(current);
          if (passageQueue.size() > n) {
            // Heap full: recycle the evicted weakest passage as the new 'current'.
            current = passageQueue.poll();
            current.reset();
          } else {
            current = new Passage();
          }
        }
      }
      // if we exceed limit, we are done
      if (start >= contentLength) {
        Passage passages[] = new Passage[passageQueue.size()];
        passageQueue.toArray(passages);
        for (Passage p : passages) {
          p.sort();
        }
        // sort in ascending order
        Arrays.sort(
            passages,
            new Comparator<Passage>() {
              @Override
              public int compare(Passage left, Passage right) {
                return left.startOffset - right.startOffset;
              }
            });
        return passages;
      }
      // advance breakiterator
      assert BreakIterator.DONE < 0;
      // Snap the new passage to the enclosing sentence, clamped to the content bounds.
      current.startOffset = Math.max(bi.preceding(start + 1), 0);
      current.endOffset = Math.min(bi.next(), contentLength);
    }
    int tf = 0;
    // Consume all positions of this term that fall inside the current passage.
    while (true) {
      tf++;
      BytesRef term = terms[off.id];
      if (term == null) {
        // multitermquery match, pull from payload
        term = off.dp.getPayload();
        assert term != null;
      }
      current.addMatch(start, end, term);
      if (off.pos == dp.freq()) {
        break; // removed from pq
      } else {
        off.pos++;
        dp.nextPosition();
        start = dp.startOffset();
        end = dp.endOffset();
      }
      if (start >= current.endOffset || end > contentLength) {
        // Position belongs to a later passage: requeue and move on.
        pq.offer(off);
        break;
      }
    }
    current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
  }
  // Dead code but compiler disagrees:
  assert false;
  return null;
}