예제 #1
0
 /**
  * Feeds a random Unicode string through a KeywordTokenizer followed by an NGramTokenFilter and
  * checks that the emitted grams are cut on code point boundaries rather than on char boundaries.
  *
  * <p>Grams are expected ordered by start code point, then by increasing length. Every token is
  * expected to report offsets spanning the whole input (0 to the input length in chars).
  *
  * @throws IOException if the token stream fails while being consumed.
  */
 public void testSupplementaryCharacters() throws IOException {
   final String input = _TestUtil.randomUnicodeString(random(), 10);
   final int totalCodePoints = input.codePointCount(0, input.length());
   final int minGram = _TestUtil.nextInt(random(), 1, 3);
   final int maxGram = _TestUtil.nextInt(random(), minGram, 10);

   final TokenStream stream =
       new NGramTokenFilter(
           TEST_VERSION_CURRENT, new KeywordTokenizer(new StringReader(input)), minGram, maxGram);
   final CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
   final OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
   stream.reset();

   for (int first = 0; first < totalCodePoints; first++) {
     // Both values depend only on the gram's first code point, so compute them once per row.
     final int gramStartIndex = input.offsetByCodePoints(0, first);
     final int lastAllowed = Math.min(totalCodePoints, first + maxGram);
     for (int last = first + minGram; last <= lastAllowed; last++) {
       assertTrue(stream.incrementToken());
       assertEquals(0, offsetAtt.startOffset());
       assertEquals(input.length(), offsetAtt.endOffset());
       // Convert the code point position to a char index before slicing the expected gram.
       final String expectedGram =
           input.substring(gramStartIndex, input.offsetByCodePoints(0, last));
       assertEquals(expectedGram, termAtt.toString());
     }
   }
   // Once every expected gram has been seen, the stream must be exhausted.
   assertFalse(stream.incrementToken());
 }
예제 #2
0
 /**
  * Calls {@code reset()}, turns batch mode on, and then replays {@code word} one code point at a
  * time: each code point is wrapped in an event, processed, and the processed event is applied.
  *
  * @param word the word to replay; iterated by code point, so surrogate pairs are kept together.
  */
 public void setBatchInputWord(final String word) {
   reset();
   mIsBatchMode = true;
   final int charLength = word.length();
   int offset = 0;
   while (offset < charLength) {
     final int codePoint = Character.codePointAt(word, offset);
     // Go through the event pipeline rather than {@link #add(int,int,int)} so that the batch
     // input points held in mInputPointers are not overridden.
     final Event rawEvent = Event.createEventForCodePointFromUnknownSource(codePoint);
     applyProcessedEvent(processEvent(rawEvent));
     // Step over one whole code point: two chars for a surrogate pair, one otherwise.
     offset += Character.charCount(codePoint);
   }
 }
예제 #3
0
 /**
  * Approximates whether the text right before the cursor (the end of {@code text}) looks like a
  * URL.
  *
  * <p>This is a heuristic, not a parser. It scans backward from the end of the text over a run of
  * "URL-like" code points (anything from the period up to {@code 'z'}) and stops at the first
  * code point outside that range or at the start of the text. The result is {@code true} when any
  * of the following holds:
  *
  * <ul>
  *   <li>two consecutive slashes are met during the scan (returns immediately);
  *   <li>the run starts with at least three {@code 'w'} and contains a period;
  *   <li>the run starts with a single slash that is at the start of the text or preceded by
  *       whitespace;
  *   <li>the run contains both a period and a slash.
  * </ul>
  *
  * <p>Otherwise, including for empty text, the result is {@code false}.
  *
  * <p>Note: this method is called quite often, and should be fast.
  *
  * <p>TODO: This reports that "abc./def" and ".abc/def" look like URLs to keep the code simple,
  * but ideally it should not. It's acceptable for now.
  *
  * @param text the text to examine, ending at the cursor.
  * @return whether the tail of {@code text} looks like a URL.
  */
 public static boolean lastPartLooksLikeURL(final CharSequence text) {
   int pos = text.length();
   if (pos == 0) {
     return false;
   }
   int leadingWs = 0; // Length of the run of consecutive 'w' ending at the scan position.
   int leadingSlashes = 0; // Length of the run of consecutive '/' ending at the scan position.
   boolean sawSlash = false;
   boolean sawPeriod = false;
   int current = 0;
   while (pos > 0) {
     current = Character.codePointBefore(text, pos);
     // Handwavy test for a URL character: anything between period and 'z'. That range holds
     // all lower- and upper-case ASCII letters, period, underscore, arrobase, question mark and
     // equal sign, and leaves out spaces, exclamation marks, double quotes...
     final boolean urlLike = current >= Constants.CODE_PERIOD && current <= 'z';
     if (!urlLike) {
       // The run ends here; fall through to the evaluation below.
       break;
     }
     if (current == Constants.CODE_PERIOD) {
       sawPeriod = true;
     }
     if (current == Constants.CODE_SLASH) {
       sawSlash = true;
       leadingSlashes++;
       if (leadingSlashes == 2) {
         // "//" is enough evidence on its own.
         return true;
       }
     } else {
       leadingSlashes = 0;
     }
     leadingWs = (current == 'w') ? leadingWs + 1 : 0;
     // Every code point accepted above is at most 'z', so this steps back exactly one char.
     pos -= Character.charCount(current);
   }
   // End of the text run: the counters now describe how the run starts.
   final boolean startsWithWww = leadingWs >= 3 && sawPeriod;
   // A lone slash opening the run counts when nothing, or whitespace, comes before it.
   final boolean startsWithLoneSlash =
       leadingSlashes == 1 && (pos == 0 || Character.isWhitespace(current));
   final boolean hasPeriodAndSlash = sawPeriod && sawSlash;
   return startsWithWww || startsWithLoneSlash || hasPeriodAndSlash;
 }
예제 #4
0
 /**
  * Copies the code points in a range of a CharSequence to an int array.
  *
  * <p>This method assumes there is enough space in the array to store the code points. The size
  * can be measured with {@link Character#codePointCount(CharSequence, int, int)} before passing
  * to this method. If the int array is too small, an ArrayIndexOutOfBoundsException will be
  * thrown.
  *
  * <p>Chars at or after {@code endIndex} are never read. If a surrogate pair straddles
  * {@code endIndex}, only its high surrogate is copied, as a lone surrogate; this is the same
  * way {@code Character#codePointCount} counts that range.
  *
  * <p>This method makes no effort to be thread-safe. Do not modify the CharSequence while this
  * method is running, or the behavior is undefined. This method can optionally downcase code
  * points before copying them, but it pays no attention to locale while doing so.
  *
  * @param destination the int array.
  * @param charSequence the CharSequence.
  * @param startIndex the start index inside the string in java chars, inclusive.
  * @param endIndex the end index inside the string in java chars, exclusive.
  * @param downCase if this is true, code points will be downcased before being copied.
  * @return the number of copied code points.
  * @throws ArrayIndexOutOfBoundsException if {@code destination} is too small.
  * @throws IndexOutOfBoundsException if the range reaches outside {@code charSequence}.
  */
 public static int copyCodePointsAndReturnCodePointCount(
     final int[] destination,
     final CharSequence charSequence,
     final int startIndex,
     final int endIndex,
     final boolean downCase) {
   int destIndex = 0;
   int index = startIndex;
   while (index < endIndex) {
     int codePoint = Character.codePointAt(charSequence, index);
     if (index + Character.charCount(codePoint) > endIndex) {
       // The pair's low surrogate sits at endIndex, which is exclusive: keep only the high
       // surrogate so nothing outside the requested range leaks into the result.
       codePoint = charSequence.charAt(index);
     }
     // TODO: stop using Character.toLowerCase, as it's not aware of the locale and does not
     // always do the right thing.
     destination[destIndex] = downCase ? Character.toLowerCase(codePoint) : codePoint;
     destIndex++;
     index += Character.charCount(codePoint);
   }
   return destIndex;
 }