/**
 * Checks that {@link NGramTokenFilter} cuts its grams on code point boundaries.
 *
 * <p>A random Unicode string is wrapped in a {@link KeywordTokenizer} and filtered with random
 * {@code minGram}/{@code maxGram} sizes. For every starting code point, and for every gram
 * length from {@code minGram} up to {@code maxGram} (capped at the end of the string), the next
 * token must equal the substring delimited by those code point positions, and its offsets must
 * be {@code 0} and {@code s.length()}. No token may remain once all expected grams were seen.
 *
 * @throws IOException if the token stream fails while being consumed.
 */
public void testSupplementaryCharacters() throws IOException {
    // The three random draws stay in this order so a given seed yields the same scenario.
    final String input = _TestUtil.randomUnicodeString(random(), 10);
    final int totalCodePoints = input.codePointCount(0, input.length());
    final int minGram = _TestUtil.nextInt(random(), 1, 3);
    final int maxGram = _TestUtil.nextInt(random(), minGram, 10);

    final TokenStream keyword = new KeywordTokenizer(new StringReader(input));
    final TokenStream grams = new NGramTokenFilter(TEST_VERSION_CURRENT, keyword, minGram, maxGram);
    final CharTermAttribute term = grams.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsets = grams.addAttribute(OffsetAttribute.class);
    grams.reset();

    int first = 0;
    while (first < totalCodePoints) {
        // Longest gram starting here, expressed as an exclusive end position in code points.
        final int furthestEnd = Math.min(totalCodePoints, first + maxGram);
        for (int last = first + minGram; last <= furthestEnd; last++) {
            assertTrue(grams.incrementToken());

            // Every gram is expected to report the offsets of the whole input.
            assertEquals(0, offsets.startOffset());
            assertEquals(input.length(), offsets.endOffset());

            // Translate code point positions into char indices before slicing the input.
            final int fromChar = Character.offsetByCodePoints(input, 0, first);
            final int toChar = Character.offsetByCodePoints(input, 0, last);
            assertEquals(input.substring(fromChar, toChar), term.toString());
        }
        first++;
    }

    assertFalse(grams.incrementToken());
}
public void setBatchInputWord(final String word) { reset(); mIsBatchMode = true; final int length = word.length(); for (int i = 0; i < length; i = Character.offsetByCodePoints(word, i, 1)) { final int codePoint = Character.codePointAt(word, i); // We don't want to override the batch input points that are held in mInputPointers // (See {@link #add(int,int,int)}). final Event processedEvent = processEvent(Event.createEventForCodePointFromUnknownSource(codePoint)); applyProcessedEvent(processedEvent); } }
/** * Approximates whether the text before the cursor looks like a URL. * * <p>This is not foolproof, but it should work well in the practice. Essentially it walks * backward from the cursor until it finds something that's not a letter, digit, or common URL * symbol like underscore. If it hasn't found a period yet, then it does not look like a URL. If * the text: - starts with www and contains a period - starts with a slash preceded by either a * slash, whitespace, or start-of-string Then it looks like a URL and we return true. Otherwise, * we return false. * * <p>Note: this method is called quite often, and should be fast. * * <p>TODO: This will return that "abc./def" and ".abc/def" look like URLs to keep down the code * complexity, but ideally it should not. It's acceptable for now. */ public static boolean lastPartLooksLikeURL(final CharSequence text) { int i = text.length(); if (0 == i) return false; int wCount = 0; int slashCount = 0; boolean hasSlash = false; boolean hasPeriod = false; int codePoint = 0; while (i > 0) { codePoint = Character.codePointBefore(text, i); if (codePoint < Constants.CODE_PERIOD || codePoint > 'z') { // Handwavy heuristic to see if that's a URL character. Anything between period // and z. This includes all lower- and upper-case ascii letters, period, // underscore, arrobase, question mark, equal sign. It excludes spaces, exclamation // marks, double quotes... // Anything that's not a URL-like character causes us to break from here and // evaluate normally. break; } if (Constants.CODE_PERIOD == codePoint) { hasPeriod = true; } if (Constants.CODE_SLASH == codePoint) { hasSlash = true; if (2 == ++slashCount) { return true; } } else { slashCount = 0; } if ('w' == codePoint) { ++wCount; } else { wCount = 0; } i = Character.offsetByCodePoints(text, i, -1); } // End of the text run. // If it starts with www and includes a period, then it looks like a URL. 
if (wCount >= 3 && hasPeriod) return true; // If it starts with a slash, and the code point before is whitespace, it looks like an URL. if (1 == slashCount && (0 == i || Character.isWhitespace(codePoint))) return true; // If it has both a period and a slash, it looks like an URL. if (hasPeriod && hasSlash) return true; // Otherwise, it doesn't look like an URL. return false; }
/** * Copies the codepoints in a CharSequence to an int array. * * <p>This method assumes there is enough space in the array to store the code points. The size * can be measured with Character#codePointCount(CharSequence, int, int) before passing to this * method. If the int array is too small, an ArrayIndexOutOfBoundsException will be thrown. Also, * this method makes no effort to be thread-safe. Do not modify the CharSequence while this method * is running, or the behavior is undefined. This method can optionally downcase code points * before copying them, but it pays no attention to locale while doing so. * * @param destination the int array. * @param charSequence the CharSequence. * @param startIndex the start index inside the string in java chars, inclusive. * @param endIndex the end index inside the string in java chars, exclusive. * @param downCase if this is true, code points will be downcased before being copied. * @return the number of copied code points. */ public static int copyCodePointsAndReturnCodePointCount( final int[] destination, final CharSequence charSequence, final int startIndex, final int endIndex, final boolean downCase) { int destIndex = 0; for (int index = startIndex; index < endIndex; index = Character.offsetByCodePoints(charSequence, index, 1)) { final int codePoint = Character.codePointAt(charSequence, index); // TODO: stop using this, as it's not aware of the locale and does not always do // the right thing. destination[destIndex] = downCase ? Character.toLowerCase(codePoint) : codePoint; destIndex++; } return destIndex; }