예제 #1
1
 /**
  * Splits a string into tokens around a separator character. The separator can be escaped with the
  * '\' character, and the '\' character itself can be escaped by another '\'.
  *
  * <p>The string is walked code point by code point, so supplementary characters (surrogate
  * pairs) are copied into the tokens intact. A trailing empty token is not returned, and a
  * dangling escape character at the very end of the string is dropped.
  *
  * @param s the string to tokenize.
  * @param separator the separator char.
  * @param maxTokens the maximum number of tokens returned. If the max is reached, the remaining
  *     part of s (separators included) is appended to the end of the last token.
  * @return an array of tokens.
  */
 public static String[] tokenize(String s, char separator, int maxTokens) {
   List<String> tokens = new ArrayList<String>();
   StringBuilder token = new StringBuilder();
   boolean prevIsEscapeChar = false;
   int currentChar;
   // Advance by the width of the code point just read (not by the width of the index value).
   for (int i = 0; i < s.length(); i += Character.charCount(currentChar)) {
     currentChar = s.codePointAt(i);
     if (prevIsEscapeChar) {
       // Case 1:  escaped character, always taken literally
       token.appendCodePoint(currentChar);
       prevIsEscapeChar = false;
     } else if (currentChar == separator && tokens.size() < maxTokens - 1) {
       // Case 2:  separator, only honoured while there is room for another token
       tokens.add(token.toString());
       token = new StringBuilder();
     } else if (currentChar == '\\') {
       // Case 3:  escape character
       prevIsEscapeChar = true;
     } else {
       // Case 4:  regular character
       token.appendCodePoint(currentChar);
     }
   }
   if (token.length() > 0) {
     tokens.add(token.toString());
   }
   return tokens.toArray(new String[0]);
 }
예제 #2
0
 /**
  * Tracks a user-initiated cursor move. As long as the cursor ends up inside the word currently
  * being composed, the composition is kept and only the stored cursor position changes.
  *
  * @param expectedMoveAmount number of java chars the cursor moved: negative means backward,
  *     positive (or zero) means forward.
  * @return true if the cursor is still inside the composing word (the position was updated),
  *     false if the move crossed the start or end of the word (nothing was changed).
  */
 public boolean moveCursorByAndReturnIfInsideComposingWord(final int expectedMoveAmount) {
   int position = mCursorPositionWithinWord;
   // TODO: Don't make that copy. We can do this directly from mTypedWordCache.
   final int[] wordCodePoints = StringUtils.toCodePointArray(mTypedWordCache);
   int movedChars = 0;
   if (expectedMoveAmount < 0) {
     // Walk backward, one code point at a time, until enough chars are covered or the start
     // of the word is hit.
     while (position > 0 && movedChars > expectedMoveAmount) {
       position--;
       movedChars -= Character.charCount(wordCodePoints[position]);
     }
   } else {
     // Walk forward until enough chars are covered or the end of the word is hit.
     for (;
         position < wordCodePoints.length && movedChars < expectedMoveAmount;
         position++) {
       movedChars += Character.charCount(wordCodePoints[position]);
     }
   }
   // A mismatch means the walk stopped early at a word boundary (or landed in the middle of a
   // surrogate pair), so the requested position is not inside the composing word.
   if (movedChars == expectedMoveAmount) {
     mCursorPositionWithinWord = position;
     mCombinerChain.applyProcessedEvent(
         mCombinerChain.processEvent(mEvents, Event.createCursorMovedEvent(position)));
     return true;
   }
   return false;
 }
예제 #3
0
  /**
   * Finds the first code point of {@code str} that occurs exactly once in it.
   *
   * @param str the string to scan; iterated by code point, so surrogate pairs count as one
   *     character.
   * @return a string holding that single code point, or {@code null} if every code point repeats
   *     (which includes the empty string).
   */
  public static String firstNonRepeatedMoreEffect(String str) {
    // Maps each code point to "has it been seen more than once?".
    final HashMap<Integer, Boolean> repeated = new HashMap<Integer, Boolean>();

    int index = 0;
    while (index < str.length()) {
      final int codePoint = str.codePointAt(index);
      // First sighting stores false; every later sighting stores true.
      repeated.put(codePoint, repeated.containsKey(codePoint));
      index += Character.charCount(codePoint);
    }

    // Second pass in string order so the *first* unique code point wins.
    index = 0;
    while (index < str.length()) {
      final int codePoint = str.codePointAt(index);
      if (!repeated.get(codePoint)) {
        return new String(Character.toChars(codePoint));
      }
      index += Character.charCount(codePoint);
    }

    return null;
  }
예제 #4
0
 /**
  * Adds to {@code acc} one {@link Mark} for each run of characters in {@code text} that the
  * editor font reports it cannot display but for which a fallback font is available.
  *
  * @param acc list receiving the created marks.
  * @param part entry part the marks belong to.
  * @param text text to scan.
  * @param firstMissing char index at which scanning starts.
  */
 private void createMarks(List<Mark> acc, Mark.ENTRY_PART part, String text, int firstMissing) {
   final char[] buffer = text.toCharArray();
   int cursor = editorFont.canDisplayUpTo(buffer, firstMissing, buffer.length);
   while (cursor != -1) {
     final int runStart = cursor;
     final int firstCp = Character.codePointAt(buffer, cursor);
     cursor += Character.charCount(firstCp);
     final Font fallback = FontFallbackManager.getCapableFont(firstCp);
     if (fallback != null) {
       // Extend the run while the following code points are also missing from the editor
       // font and covered by the same fallback font.
       while (cursor < buffer.length) {
         final int nextCp = Character.codePointAt(buffer, cursor);
         if (editorFont.canDisplay(nextCp) || !fallback.canDisplay(nextCp)) {
           break;
         }
         cursor += Character.charCount(nextCp);
       }
       Mark mark = new Mark(part, runStart, cursor);
       mark.attributes = getAttributes(fallback);
       acc.add(mark);
     }
     // No fallback font: the code point is simply skipped. Either way, look for the next
     // undisplayable character after the current position.
     cursor = editorFont.canDisplayUpTo(buffer, cursor, buffer.length);
   }
 }
 /**
  * Looks for the key-value separator ('=', ':' or whitespace) in the string.
  *
  * <p>A candidate separator directly preceded by a backslash is ignored. When the first
  * separator found is a space or tab, the following run of spaces/tabs is inspected: if it leads
  * to '=' or ':', the position of that character is returned, otherwise the position of the
  * first whitespace is returned.
  *
  * <p>See also bugreport <a href="http://sourceforge.net/support/tracker.php?aid=1606595"
  * >#1606595</a>.
  *
  * @return The char index of the key-value separator in the string. Note that if the string does
  *     not contain any separator it is considered to be a key with an empty string value, and
  *     this method returns <code>-1</code> to indicate there's no equals.
  */
 private int searchEquals(String str) {
   boolean afterBackslash = false;
   int pos = 0;
   while (pos < str.length()) {
     final int current = str.codePointAt(pos);
     if (!afterBackslash) {
       if (current == '=' || current == ':') {
         return pos;
       }
       if (current == ' ' || current == '\t') {
         // Skip the rest of the whitespace run and see whether it ends in '=' or ':'.
         int next = str.offsetByCodePoints(pos, 1);
         while (next < str.length()) {
           final int following = str.codePointAt(next);
           if (following == ':' || following == '=') {
             return next;
           }
           if (following != ' ' && following != '\t') {
             break;
           }
           next += Character.charCount(following);
         }
         return pos;
       }
     }
     afterBackslash = current == '\\';
     pos += Character.charCount(current);
   }
   return -1;
 }
예제 #6
0
 // Reads one line from the given configuration file and, if it holds a name that has not
 // been seen yet, appends that name to the names list. Text after '#' is treated as a
 // comment and surrounding whitespace is trimmed. Syntax problems are reported via fail().
 //
 // Returns -1 at end of input, otherwise lc + 1.
 //
 private int parseLine(Class<?> service, URL u, BufferedReader r, int lc, List<String> names)
     throws IOException, ServiceConfigurationError {
   String line = r.readLine();
   if (line == null) {
     return -1;
   }
   final int commentStart = line.indexOf('#');
   if (commentStart >= 0) {
     line = line.substring(0, commentStart);
   }
   line = line.trim();
   final int length = line.length();
   if (length == 0) {
     // Blank or comment-only line: nothing to record.
     return lc + 1;
   }
   if (line.indexOf(' ') >= 0 || line.indexOf('\t') >= 0) {
     fail(service, u, lc, "Illegal configuration-file syntax");
   }
   int codePoint = line.codePointAt(0);
   if (!Character.isJavaIdentifierStart(codePoint)) {
     fail(service, u, lc, "Illegal provider-class name: " + line);
   }
   int pos = Character.charCount(codePoint);
   while (pos < length) {
     codePoint = line.codePointAt(pos);
     if (codePoint != '.' && !Character.isJavaIdentifierPart(codePoint)) {
       fail(service, u, lc, "Illegal provider-class name: " + line);
     }
     pos += Character.charCount(codePoint);
   }
   final boolean alreadyKnown = providers.containsKey(line) || names.contains(line);
   if (!alreadyKnown) {
     names.add(line);
   }
   return lc + 1;
 }
예제 #7
0
  /**
   * Guesses the full-name style from the letters in {@code name}.
   *
   * <p>Letters are examined in order. The first letter outside the Latin blocks decides the
   * result when it belongs to a CJK block (delegated to {@code guessCJKNameStyle} for the rest of
   * the name), a Japanese phonetic block or a Korean block. Any other letter marks the name as
   * western and scanning continues.
   *
   * @param name the name to inspect, may be null.
   * @return a {@code FullNameStyle} constant; {@code UNDEFINED} for null or letter-free names.
   */
  public int guessFullNameStyle(String name) {
    if (name == null) {
      return FullNameStyle.UNDEFINED;
    }

    int style = FullNameStyle.UNDEFINED;
    for (int pos = 0, end = name.length(); pos < end; ) {
      final int codePoint = Character.codePointAt(name, pos);
      pos += Character.charCount(codePoint);
      if (!Character.isLetter(codePoint)) {
        continue;
      }

      final UnicodeBlock block = UnicodeBlock.of(codePoint);
      if (!isLatinUnicodeBlock(block)) {
        if (isCJKUnicodeBlock(block)) {
          // Could be Chinese, Japanese or Korean: let the remaining characters decide.
          return guessCJKNameStyle(name, pos);
        }
        if (isJapanesePhoneticUnicodeBlock(block)) {
          return FullNameStyle.JAPANESE;
        }
        if (isKoreanUnicodeBlock(block)) {
          return FullNameStyle.KOREAN;
        }
      }
      style = FullNameStyle.WESTERN;
    }
    return style;
  }
        /**
         * Compares the digit runs that start at {@code pos[0]} in {@code str0} and at {@code pos[1]}
         * in {@code str1}.
         *
         * <p>Leading zeroes are skipped (and counted) first. The run with more remaining digits
         * wins; for runs of equal length the first differing pair of digits decides; if the digits
         * are all equal, the difference in the number of leading zeroes is returned.
         *
         * <p>NOTE(review): {@code isZero}, {@code isDigit} and {@code valueOf} are defined outside
         * this block; they are presumed to classify a code point as a zero digit / any digit and to
         * return its numeric value. Confirm against their definitions.
         *
         * @param str0 first string.
         * @param str1 second string.
         * @param pos two-element array of char offsets into {@code str0} and {@code str1}; it is
         *     updated in place as digits are consumed.
         * @return a negative value, zero or a positive value as the first number is smaller than,
         *     equal to or larger than the second.
         */
        private int compareNumbers(String str0, String str1, int[] pos) {
          int delta = 0;
          int zeroes0 = 0, zeroes1 = 0;
          int ch0 = -1, ch1 = -1;

          // Skip leading zeroes, but keep a count of them.
          // After each loop chN holds the last code point examined (or -1 if none was read):
          // either the first non-zero code point, which has NOT been consumed yet, or the final
          // zero when the string ended.
          while (pos[0] < str0.length() && isZero(ch0 = str0.codePointAt(pos[0]))) {
            zeroes0++;
            pos[0] += Character.charCount(ch0);
          }
          while (pos[1] < str1.length() && isZero(ch1 = str1.codePointAt(pos[1]))) {
            zeroes1++;
            pos[1] += Character.charCount(ch1);
          }

          // If one sequence contains more significant digits than the
          // other, it's a larger number. In case they turn out to have
          // equal lengths, we compare digits at each position; the first
          // unequal pair determines which is the bigger number.
          while (true) {
            boolean noMoreDigits0 = (ch0 < 0) || !isDigit(ch0);
            boolean noMoreDigits1 = (ch1 < 0) || !isDigit(ch1);

            if (noMoreDigits0 && noMoreDigits1) {
              // Same number of digits: first difference decides, zero count breaks ties.
              return delta != 0 ? delta : zeroes0 - zeroes1;
            } else if (noMoreDigits0) {
              return -1;
            } else if (noMoreDigits1) {
              return 1;
            } else if (delta == 0 && ch0 != ch1) {
              // Remember only the first differing pair.
              delta = valueOf(ch0) - valueOf(ch1);
            }

            // Fetch the next code point of each string; a digit is consumed, anything else
            // (or end of string) ends that run and is left in place.
            if (pos[0] < str0.length()) {
              ch0 = str0.codePointAt(pos[0]);
              if (isDigit(ch0)) {
                pos[0] += Character.charCount(ch0);
              } else {
                ch0 = -1;
              }
            } else {
              ch0 = -1;
            }

            if (pos[1] < str1.length()) {
              ch1 = str1.codePointAt(pos[1]);
              if (isDigit(ch1)) {
                pos[1] += Character.charCount(ch1);
              } else {
                ch1 = -1;
              }
            } else {
              ch1 = -1;
            }
          }
        }
  /**
   * Reads the next line from the input and:
   *
   * <ul>
   *   <li>Converts ascii-encoded \\uxxxx chars to normal characters (unless {@code
   *       dontUnescapeULiterals} is set, in which case the sequence is kept as written).
   *   <li>Converts \r, \n and \t to CR, line feed and tab.
   *   <li>But! Keeps the backslash in '\ ', '\=', '\:' etc (non-trimmable space or
   *       non-key-value-breaking :-) equals).
   * </ul>
   *
   * <p>Change from BufferedReader to LinebreakPreservingReader was part of fix for bug 1462566.
   *
   * @param reader the reader to take the next line from.
   * @return the line with the escapes above processed, or {@code null} if the reader returned
   *     {@code null}.
   * @throws IOException if reading the line fails.
   * @throws TranslationException if a backslash-u sequence is not followed by four hex digits
   *     forming a valid code point.
   */
  protected String getNextLine(LinebreakPreservingReader reader)
      throws IOException, TranslationException {
    String ascii = reader.readLine();
    if (ascii == null) {
      return null;
    }

    StringBuilder result = new StringBuilder();
    for (int cp, len = ascii.length(), i = 0; i < len; i += Character.charCount(cp)) {
      cp = ascii.codePointAt(i);
      // A backslash that is the very last code point of the line is copied through unchanged.
      if (cp == '\\' && ascii.codePointCount(i, len) > 1) {
        // Step over the backslash and look at the escaped character.
        i += Character.charCount(cp);
        cp = ascii.codePointAt(i);
        if (cp != 'u') {
          if (cp == 'n') {
            cp = '\n';
          } else if (cp == 'r') {
            cp = '\r';
          } else if (cp == 't') {
            cp = '\t';
          } else {
            // Unknown escape: keep the backslash, the character itself is appended below.
            result.append('\\');
          }
        } else if (dontUnescapeULiterals) {
          // Put back the \ we swallowed
          result.append('\\');
        } else {
          // checking if the string is long enough
          if (ascii.codePointCount(i, len) < 1 + 4) {
            throw new TranslationException(OStrings.getString("RBFH_ERROR_ILLEGAL_U_SEQUENCE"));
          }
          int uStart = ascii.offsetByCodePoints(i, 1);
          int uEnd = ascii.offsetByCodePoints(uStart, 4);
          String uStr = ascii.substring(uStart, uEnd);
          try {
            cp = Integer.parseInt(uStr, 16);
            if (!Character.isValidCodePoint(cp)) {
              throw new TranslationException(OStrings.getString("RBFH_ERROR_ILLEGAL_U_SEQUENCE"));
            }
            // The loop increment adds charCount(cp) back, leaving i exactly at uEnd.
            i = uEnd - Character.charCount(cp);
          } catch (NumberFormatException ex) {
            throw new TranslationException(OStrings.getString("RBFH_ERROR_ILLEGAL_U_SEQUENCE"), ex);
          }
        }
      }
      result.appendCodePoint(cp);
    }

    return result.toString();
  }
예제 #10
0
파일: Utils.java 프로젝트: mkolod/pdfxtk
 /**
  * Returns a copy of the input that contains only characters allowed by the XML 1.0 standard
  * (the {@code Char} production): tab, line feed, carriage return, U+0020..U+D7FF,
  * U+E000..U+FFFD and U+10000..U+10FFFF. Everything else, including unpaired surrogates, is
  * dropped.
  *
  * <p>Original author: Donoiu Cristian, GPL.
  *
  * @param s the string whose non-valid characters should be removed; may be null.
  * @return the string stripped of non-valid characters; an empty string if the input is null or
  *     empty.
  */
 public static String removeInvalidXMLCharacters(String s) {
   if (s == null || s.isEmpty()) {
     return "";
   }
   StringBuilder out = new StringBuilder(s.length());
   int i = 0;
   while (i < s.length()) {
     // Read a whole code point: one Unicode character may span two java chars.
     int codePoint = s.codePointAt(i);
     if ((codePoint == 0x9)
         || (codePoint == 0xA)
         || (codePoint == 0xD)
         || ((codePoint >= 0x20) && (codePoint <= 0xD7FF))
         || ((codePoint >= 0xE000) && (codePoint <= 0xFFFD))
         || ((codePoint >= 0x10000) && (codePoint <= 0x10FFFF))) {
       out.appendCodePoint(codePoint);
     }
     // Advance by the number of java chars used by this code point.
     i += Character.charCount(codePoint);
   }
   return out.toString();
 }
예제 #11
0
 /**
  * Wraps {@code value} in {@code delimiter} characters and escapes its content C-style.
  *
  * <p>Code points above 0xFF are copied unchanged. The delimiter and the backslash are prefixed
  * with a backslash; newline, tab, carriage return and form feed use their usual letter escapes;
  * printable ASCII is copied; every other code point up to 0xFF becomes a backslash followed by
  * three octal digits.
  *
  * @param value the string to quote.
  * @param delimiter the quote character placed at both ends.
  * @return the quoted and escaped string.
  */
 public static String enquoteCString(String value, char delimiter) {
   final int end = value.length();
   final StringBuilder quoted = new StringBuilder();
   quoted.append(delimiter);
   int pos = 0;
   while (pos < end) {
     final int cp = value.codePointAt(pos);
     pos += Character.charCount(cp);
     if (cp > 0xFF) {
       quoted.appendCodePoint(cp);
       continue;
     }
     // The delimiter test must come before the fixed escapes in case it is one of them.
     if (cp == delimiter) {
       quoted.append('\\').append(delimiter);
       continue;
     }
     switch (cp) {
       case '\\':
         quoted.append("\\\\");
         break;
       case '\n':
         quoted.append("\\n");
         break;
       case '\t':
         quoted.append("\\t");
         break;
       case '\r':
         quoted.append("\\r");
         break;
       case '\f':
         quoted.append("\\f");
         break;
       default:
         if (cp >= 32 && cp < 127) {
           quoted.append((char) cp);
         } else {
           quoted.append("\\");
           quoted.append(String.format("%03o", cp));
         }
         break;
     }
   }
   quoted.append(delimiter);
   return quoted.toString();
 }
예제 #12
0
 /**
  * Adds a double quote at the beginning and end of the string and escapes special characters
  * with a backslash: double quote, backslash, newline, tab, carriage return and form feed.
  * Printable ASCII is copied as is. Other code points up to 255 are written as a backslash, 'x'
  * and two lowercase hex digits; code points up to 65535 as a backslash, 'u' and four hex
  * digits; anything above as a backslash, capital 'U' and eight hex digits.
  *
  * @param value any string
  * @return Python like textual representation of the string
  */
 public static String enquoteKString(String value) {
   final int end = value.length();
   final StringBuilder quoted = new StringBuilder();
   quoted.append("\"");
   int pos = 0;
   while (pos < end) {
     final int cp = value.codePointAt(pos);
     pos += Character.charCount(cp);
     switch (cp) {
       case '"':
         quoted.append("\\\"");
         break;
       case '\\':
         quoted.append("\\\\");
         break;
       case '\n':
         quoted.append("\\n");
         break;
       case '\t':
         quoted.append("\\t");
         break;
       case '\r':
         quoted.append("\\r");
         break;
       case '\f':
         quoted.append("\\f");
         break;
       default:
         if (cp >= 32 && cp < 127) {
           quoted.append((char) cp);
         } else if (cp <= 0xff) {
           quoted.append("\\x").append(String.format("%02x", cp));
         } else if (cp <= 0xffff) {
           quoted.append("\\u").append(String.format("%04x", cp));
         } else {
           quoted.append("\\U").append(String.format("%08x", cp));
         }
         break;
     }
   }
   quoted.append("\"");
   return quoted.toString();
 }
예제 #13
0
  /**
   * Writes {@code str} to {@code _buf} as a UTF-8 byte sequence followed by a terminating zero
   * byte.
   *
   * @param str the string to write.
   * @return the number of bytes written, including the terminating zero byte.
   */
  protected int _put(String str) {
    final int end = str.length();
    int written = 0;
    int index = 0;

    while (index < end) {
      final int cp = Character.codePointAt(str, index);
      index += Character.charCount(cp);

      // Pick the lead byte and the number of continuation bytes from the code point's size.
      final int lead;
      final int continuationBytes;
      if (cp < 0x80) {
        lead = cp;
        continuationBytes = 0;
      } else if (cp < 0x800) {
        lead = 0xc0 + (cp >> 6);
        continuationBytes = 1;
      } else if (cp < 0x10000) {
        lead = 0xe0 + (cp >> 12);
        continuationBytes = 2;
      } else {
        lead = 0xf0 + (cp >> 18);
        continuationBytes = 3;
      }

      _buf.write((byte) lead);
      // Each continuation byte carries six payload bits, most significant group first.
      for (int shift = 6 * (continuationBytes - 1); shift >= 0; shift -= 6) {
        _buf.write((byte) (0x80 + ((cp >> shift) & 0x3f)));
      }
      written += continuationBytes + 1;
    }

    _buf.write((byte) 0);
    return written + 1;
  }
예제 #14
0
  /**
   * Guesses the phonetic-name style from the first letter of {@code name} that falls into a
   * recognised Unicode block: Japanese phonetic, Korean, or Latin (reported as pinyin).
   *
   * @param name the phonetic name to inspect, may be null.
   * @return a {@code PhoneticNameStyle} constant; {@code UNDEFINED} when the name is null or no
   *     letter belongs to one of the recognised blocks.
   */
  public int guessPhoneticNameStyle(String name) {
    if (name == null) {
      return PhoneticNameStyle.UNDEFINED;
    }

    for (int pos = 0, end = name.length(); pos < end; ) {
      final int codePoint = Character.codePointAt(name, pos);
      pos += Character.charCount(codePoint);
      if (!Character.isLetter(codePoint)) {
        continue;
      }
      final UnicodeBlock block = UnicodeBlock.of(codePoint);
      if (isJapanesePhoneticUnicodeBlock(block)) {
        return PhoneticNameStyle.JAPANESE;
      }
      if (isKoreanUnicodeBlock(block)) {
        return PhoneticNameStyle.KOREAN;
      }
      if (isLatinUnicodeBlock(block)) {
        return PhoneticNameStyle.PINYIN;
      }
    }

    return PhoneticNameStyle.UNDEFINED;
  }
예제 #15
0
  /**
   * Looks up {@code input} in the FST, treating it as a sequence of UTF-32 code points, and
   * returns the accumulated output, or null if the input is not accepted. FST must be
   * INPUT_TYPE.BYTE4.
   *
   * @param fst the FST to search.
   * @param input the key to look up.
   * @return the output for {@code input}, or null if the FST does not accept it.
   * @throws IOException declared for the underlying FST calls.
   */
  public static <T> T get(FST<T> fst, CharSequence input) throws IOException {
    assert fst.inputType == FST.INPUT_TYPE.BYTE4;

    // TODO: would be nice not to alloc this on every lookup
    final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());

    final int end = input.length();
    final T noOutput = fst.outputs.getNoOutput();

    // Sum of the outputs of all arcs followed so far.
    T accumulated = noOutput;

    for (int pos = 0; pos < end; ) {
      final int codePoint = Character.codePointAt(input, pos);
      pos += Character.charCount(codePoint);

      if (fst.findTargetArc(codePoint, arc, arc) == null) {
        return null;
      }
      if (arc.output != noOutput) {
        accumulated = fst.outputs.add(accumulated, arc.output);
      }
    }

    // The input is only accepted if a final arc leaves the state that was reached.
    if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
      return null;
    }
    return arc.output != noOutput ? fst.outputs.add(accumulated, arc.output) : accumulated;
  }
예제 #16
0
 /**
  * Removes invalid XML characters from the string. If any were removed, a warning listing their
  * code point values is logged.
  *
  * @param text The string containing xml message.
  * @return the text without the characters that fall outside the accepted ranges.
  */
 private static String removeNonXMLCharacters(String text) {
   final StringBuilder kept = new StringBuilder();
   StringBuilder rejected = null;
   for (int pos = 0; pos < text.length(); ) {
     final int cp = text.codePointAt(pos);
     pos += Character.charCount(cp);
     final boolean allowed =
         cp == 0x9
             || cp == 0xA
             || cp == 0xD
             || (cp >= 0x20 && cp <= 0xD7FF)
             || (cp >= 0xE000 && cp <= 0xFFFD)
             || (cp >= 0x10000 && cp <= 0x10FFFF);
     if (allowed) {
       kept.append(Character.toChars(cp));
       continue;
     }
     // Collect the offending code points as a comma separated list for the log message.
     if (rejected == null) {
       rejected =
           new StringBuilder("The message from xdebug contains invalid XML characters: "); // NOI18N
     } else {
       rejected.append(", "); // NOI18N
     }
     rejected.append(cp);
   }
   if (rejected != null) {
     rejected.append(
         "\nPlease mentioned it in http://netbeans.org/bugzilla/show_bug.cgi?id=179309."); // NOI18N
     LOGGER.warning(rejected.toString());
   }
   return kept.toString();
 }
예제 #17
0
  /**
   * Checks that a processor option name follows the grammar defined by <code>
   * javax.annotation.processing.Processor.getSupportedOptions()</code>: one or more Java
   * identifiers separated by single dots.
   *
   * @param optionName the name to check, may be null.
   * @return <code>true</code> if the name conforms to the grammar, <code>false</code> if not
   *     (including null, empty names, and names that start or end with a dot).
   */
  public static boolean isValidOptionName(String optionName) {
    if (optionName == null) {
      return false;
    }

    // True at the very beginning and right after a dot, i.e. whenever an identifier must start.
    boolean expectingStart = true;
    int pos = 0;
    while (pos < optionName.length()) {
      final int cp = optionName.codePointAt(pos);
      pos += Character.charCount(cp);

      if (expectingStart) {
        if (!Character.isJavaIdentifierStart(cp)) {
          return false;
        }
        expectingStart = false;
      } else if (cp == '.') {
        expectingStart = true;
      } else if (!Character.isJavaIdentifierPart(cp)) {
        return false;
      }
    }

    // Still waiting for an identifier means the name was empty or ended with a dot.
    return !expectingStart;
  }
예제 #18
0
 /**
  * Escapes a character sequence so that it is valid XML: the characters {@code <}, {@code >},
  * double quote, {@code &} and apostrophe are replaced by the LT, GT, QUOT, AMP and APOS
  * constants; everything else is copied unchanged.
  *
  * @param s The character sequence.
  * @return The escaped version of the character sequence.
  */
 public static String escapeXML(CharSequence s) {
   final StringBuilder escaped = new StringBuilder(s.length() * 2);
   int pos = 0;
   while (pos < s.length()) {
     final int cp = Character.codePointAt(s, pos);
     switch (cp) {
       case '<':
         escaped.append(LT);
         break;
       case '>':
         escaped.append(GT);
         break;
       case '\"':
         escaped.append(QUOT);
         break;
       case '&':
         escaped.append(AMP);
         break;
       case '\'':
         escaped.append(APOS);
         break;
       default:
         escaped.appendCodePoint(cp);
         break;
     }
     pos += Character.charCount(cp);
   }
   return escaped.toString();
 }
예제 #19
0
 // Builds a one-character string from a code point: a single java char when the code point
 // fits in one, a surrogate pair otherwise.
 // See http://java.sun.com/developer/technicalArticles/Intl/Supplementary/
 private String buildString(int codePoint) {
   final boolean fitsInOneChar = Character.charCount(codePoint) == 1;
   return fitsInOneChar
       ? String.valueOf((char) codePoint)
       : new String(Character.toChars(codePoint));
 }
예제 #20
0
 /**
  * Tells whether the end of {@code text} is inside a double-quoted passage, or directly after a
  * digit.
  *
  * <p>The result drives automatic spacing around a double quote character: inside a quotation
  * the next quote closes it (space after, none before); outside, it opens one (space before,
  * none after). After a digit no space is wanted at all, because the "inch" / "minutes" reading
  * dominates there, so that case also answers true.
  *
  * <p>The text is scanned backward from its end. A double quote followed by whitespace is taken
  * as a closing quote (not inside); a double quote preceded by whitespace is taken as an opening
  * quote (inside). If the start of the text is reached without a verdict, the text counts as
  * inside a quotation only when its first character is a double quote.
  *
  * @param text the text to examine.
  * @return whether we're inside a double quote (or right after a digit).
  */
 public static boolean isInsideDoubleQuoteOrAfterDigit(final CharSequence text) {
   final int length = text.length();
   if (length == 0) {
     return false;
   }
   if (Character.isDigit(Character.codePointBefore(text, length))) {
     return true;
   }
   int current = 0;
   // Code point that follows `current` in the text, i.e. the one visited just before it.
   int following = 0;
   int pos = length;
   while (pos > 0) {
     current = Character.codePointBefore(text, pos);
     if (current == Constants.CODE_DOUBLE_QUOTE && Character.isWhitespace(following)) {
       // Quote followed by whitespace: a closing quote.
       return false;
     }
     if (following == Constants.CODE_DOUBLE_QUOTE && Character.isWhitespace(current)) {
       // Quote preceded by whitespace: an opening quote. No need to look further back.
       return true;
     }
     pos -= Character.charCount(current);
     following = current;
   }
   // Reached the start of the text: `current` is now its first code point.
   return current == Constants.CODE_DOUBLE_QUOTE;
 }
예제 #21
0
 /**
  * Determines if a character sequence is a QName. A QName is either an NCName (LocalName), or an
  * NCName followed by a colon followed by another NCName (where the first NCName is referred to as
  * the 'Prefix Name' and the second NCName is referred to as the 'Local Name' - i.e.
  * PrefixName:LocalName).
  *
  * <p>Both parts must be non-empty: a sequence that starts or ends with the colon is rejected.
  *
  * @param s The character sequence to be tested.
  * @return {@code true} if {@code s} is a QName, otherwise {@code false}.
  */
 public static boolean isQName(CharSequence s) {
   if (isNullOrEmpty(s)) {
     return false;
   }
   boolean foundColon = false;
   // True while at least one character of the current NCName has been read.
   boolean inNCName = false;
   for (int i = 0; i < s.length(); ) {
     int codePoint = Character.codePointAt(s, i);
     if (codePoint == ':') {
       // Only one colon is allowed, and it must follow a non-empty prefix.
       if (foundColon || !inNCName) {
         return false;
       }
       foundColon = true;
       inNCName = false;
     } else if (!inNCName) {
       if (!isXMLNameStartCharacter(codePoint)) {
         return false;
       }
       inNCName = true;
     } else if (!isXMLNameChar(codePoint)) {
       return false;
     }
     i += Character.charCount(codePoint);
   }
   // False only when the last character was the colon, i.e. the local name is missing.
   return inNCName;
 }
예제 #22
0
 // @VisibleForTesting
 /**
  * Checks that every code point of {@code uri} belongs to the accepted character set:
  * supplementary code points, the characters listed in HREF_DISCRETE_UCSCHAR, ASCII letters and
  * digits, and the ranges U+0000..U+001F, U+007F..U+D7FF and U+E000..U+FFFD.
  *
  * @param uri the URI text to check.
  * @return true if all code points are accepted (also for an empty string), false otherwise.
  */
 public static boolean isValidUriCharset(String uri) {
   final int end = uri.length();
   for (int pos = 0; pos < end; ) {
     final int cp = uri.codePointAt(pos);
     pos += Character.charCount(cp);

     final boolean accepted =
         Character.isSupplementaryCodePoint(cp)
             || HREF_DISCRETE_UCSCHAR.indexOf(cp) >= 0
             // iunreserved ranges
             || (cp >= 'a' && cp <= 'z')
             || (cp >= 'A' && cp <= 'Z')
             || (cp >= '0' && cp <= '9')
             // href-ucschar ranges
             || (cp >= 0 && cp <= 0x1F)
             || (cp >= 0x7F && cp <= 0xD7FF)
             || (cp >= 0xE000 && cp <= 0xFFFD);
     if (!accepted) {
       return false;
     }
   }
   return true;
 }
예제 #23
0
 /**
  * Returns the char index of the last code point in {@code str} that also occurs in {@code
  * search}.
  *
  * <p>The returned value is the index at which the matching code point starts, mirroring
  * {@code indexOfAny}.
  *
  * @param str the string to search in.
  * @param search the set of code points to look for.
  * @param offset currently not used by the search. NOTE(review): the intended meaning (start
  *     position?) cannot be determined from this method; confirm against callers.
  * @return the index of the last matching code point, or -1 if either string is empty or there
  *     is no match.
  */
 public static int lastIndexOfAny(String str, String search, int offset) {
   if (str.isEmpty() || search.isEmpty()) {
     return -1;
   }
   // `end` is the exclusive end index of the code point under inspection.
   for (int end = str.length(), strCodepoint; end > 0; end -= Character.charCount(strCodepoint)) {
     strCodepoint = str.codePointBefore(end);
     for (int j = search.length(), searchCodepoint;
         j > 0;
         j -= Character.charCount(searchCodepoint)) {
       searchCodepoint = search.codePointBefore(j);
       if (strCodepoint == searchCodepoint) {
         // Report where the code point starts, not where it ends.
         return end - Character.charCount(strCodepoint);
       }
     }
   }
   return -1;
 }
예제 #24
0
 /**
  * Determines if a character sequence is an NCName (Non-Colonised Name): an NCName start
  * character followed by zero or more NCName characters.
  *
  * @param s The character sequence to be tested.
  * @return {@code true} if {@code s} is an NCName, otherwise {@code false}.
  */
 public static boolean isNCName(CharSequence s) {
   if (isNullOrEmpty(s)) {
     return false;
   }
   int pos = 0;
   do {
     final int codePoint = Character.codePointAt(s, pos);
     // The first code point has to satisfy the stricter start-character rule.
     final boolean allowed = pos == 0 ? isNCNameStartChar(codePoint) : isNCNameChar(codePoint);
     if (!allowed) {
       return false;
     }
     pos += Character.charCount(codePoint);
   } while (pos < s.length());
   return true;
 }
예제 #25
0
 /**
  * Returns the char index of the first code point in {@code str} that also occurs in {@code
  * search}.
  *
  * @param str the string to search in.
  * @param search the set of code points to look for.
  * @param offset not used by this implementation.
  * @return the index of the first matching code point, or -1 if either string is empty or there
  *     is no match.
  */
 public static int indexOfAny(String str, String search, int offset) {
   if (str.isEmpty() || search.isEmpty()) {
     return -1;
   }
   int strPos = 0;
   while (strPos < str.length()) {
     final int candidate = str.codePointAt(strPos);
     int searchPos = 0;
     while (searchPos < search.length()) {
       final int wanted = search.codePointAt(searchPos);
       if (wanted == candidate) {
         return strPos;
       }
       searchPos += Character.charCount(wanted);
     }
     strPos += Character.charCount(candidate);
   }
   return -1;
 }
  /**
   * Feeds randomly generated, whitespace-free unicode terms with random override values through
   * a {@code StemmerOverrideFilter} (followed by a {@code PorterStemFilter}) and checks that the
   * overridden values come out unchanged.
   */
  public void testRandomRealisticWhiteSpace() throws IOException {
    Map<String, String> map = new HashMap<>();
    Set<String> seen = new HashSet<>();
    int numTerms = atLeast(50);
    boolean ignoreCase = random().nextBoolean();

    for (int i = 0; i < numTerms; i++) {
      String randomRealisticUnicodeString = TestUtil.randomRealisticUnicodeString(random());
      char[] charArray = randomRealisticUnicodeString.toCharArray();
      // Strip whitespace so the term survives the whitespace tokenizer as a single token.
      StringBuilder builder = new StringBuilder();
      for (int j = 0; j < charArray.length; ) {
        int cp = Character.codePointAt(charArray, j, charArray.length);
        if (!Character.isWhitespace(cp)) {
          builder.appendCodePoint(cp);
        }
        j += Character.charCount(cp);
      }
      if (builder.length() > 0) {
        String inputValue = builder.toString();

        // Make sure we don't try to add two inputs that vary only by case:
        String seenInputValue;
        if (ignoreCase) {
          // TODO: can we simply use inputValue.toLowerCase(Locale.ROOT)???
          char[] buffer = inputValue.toCharArray();
          CharacterUtils.toLowerCase(buffer, 0, buffer.length);
          // Build the string from the array's contents; char[].toString() would only yield
          // the array's identity string and defeat the duplicate check.
          seenInputValue = new String(buffer);
        } else {
          seenInputValue = inputValue;
        }

        if (seen.contains(seenInputValue) == false) {
          seen.add(seenInputValue);
          String value = TestUtil.randomSimpleString(random());
          map.put(inputValue, value.isEmpty() ? "a" : value);
        }
      }
    }
    if (map.isEmpty()) {
      map.put("booked", "books");
    }
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase);
    Set<Entry<String, String>> entrySet = map.entrySet();
    StringBuilder input = new StringBuilder();
    List<String> output = new ArrayList<>();
    for (Entry<String, String> entry : entrySet) {
      builder.add(entry.getKey(), entry.getValue());
      // Always emit at least one term so the token stream is never empty.
      if (random().nextBoolean() || output.isEmpty()) {
        input.append(entry.getKey()).append(" ");
        output.add(entry.getValue());
      }
    }
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(input.toString()));
    TokenStream stream =
        new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.build()));
    assertTokenStreamContents(stream, output.toArray(new String[0]));
  }
 /**
  * Returns true if the given string is accepted by the automaton. The string is consumed one
  * code point at a time.
  *
  * <p>Complexity: linear in the length of the string.
  *
  * <p><b>Note:</b> for full performance, use the {@link RunAutomaton} class.
  */
 public static boolean run(Automaton a, String s) {
   if (a.isSingleton()) {
     return s.equals(a.singleton);
   }
   if (a.deterministic) {
     // Deterministic case: follow the single possible transition for each code point.
     State current = a.initial;
     int pos = 0;
     while (pos < s.length()) {
       final int codePoint = s.codePointAt(pos);
       final State next = current.step(codePoint);
       if (next == null) {
         return false;
       }
       current = next;
       pos += Character.charCount(codePoint);
     }
     return current.accept;
   }

   // Non-deterministic case: track the set of states reachable after each code point.
   State[] states = a.getNumberedStates();
   LinkedList<State> frontier = new LinkedList<State>();
   LinkedList<State> nextFrontier = new LinkedList<State>();
   BitSet marked = new BitSet(states.length);
   BitSet nextMarked = new BitSet(states.length);
   frontier.add(a.initial);
   ArrayList<State> targets = new ArrayList<State>();
   boolean accept = a.initial.accept;
   int pos = 0;
   while (pos < s.length()) {
     final int codePoint = s.codePointAt(pos);
     accept = false;
     nextFrontier.clear();
     nextMarked.clear();
     for (State from : frontier) {
       targets.clear();
       from.step(codePoint, targets);
       for (State to : targets) {
         if (to.accept) {
           accept = true;
         }
         // The bit set keeps each state in the next frontier at most once.
         if (!nextMarked.get(to.number)) {
           nextMarked.set(to.number);
           nextFrontier.add(to);
         }
       }
     }
     // Swap the current and next frontier (and their bit sets) for the following round.
     LinkedList<State> swapList = frontier;
     frontier = nextFrontier;
     nextFrontier = swapList;
     BitSet swapBits = marked;
     marked = nextMarked;
     nextMarked = swapBits;
     pos += Character.charCount(codePoint);
   }
   return accept;
 }
예제 #28
0
  /**
   * Advances to the next token: a run of code points accepted by {@code isTokenChar}, each passed
   * through {@code normalize} before being written to the term buffer.
   *
   * <p>A token is ended by the first non-token code point that follows it, by the term reaching
   * {@code MAX_WORD_LEN} chars, or by the input running out.
   *
   * @return {@code true} if a token was produced (term length and offsets are set), {@code false}
   *     if the input ran out before any token character was seen.
   * @throws IOException declared by this method; presumably raised by the buffer refill.
   */
  @Override
  public final boolean incrementToken() throws IOException {
    this.clearAttributes();
    // Number of chars written to the term buffer so far.
    int length = 0;
    // Start offset of the token; stays -1 until the first token character is seen.
    int start = -1;
    // Offset just past the last token code point consumed.
    int end = -1;
    char[] buffer = this.termAtt.buffer();

    while (true) {
      if (this.bufferIndex >= this.dataLen) {
        // The current chunk is fully consumed: add it to the running offset, then refill.
        this.offset += this.dataLen;
        this.charUtils.fill(this.ioBuffer, this.input);
        if (this.ioBuffer.getLength() == 0) {
          // An empty refill is treated as end of input. Zero dataLen so that a later
          // "offset += dataLen" does not count the already-accounted chunk a second time.
          this.dataLen = 0;
          if (length <= 0) {
            // No token is pending: record the final offset and report exhaustion.
            this.finalOffset = this.correctOffset(this.offset);
            return false;
          }
          // A token is pending: leave the loop and emit it.
          break;
        }

        this.dataLen = this.ioBuffer.getLength();
        this.bufferIndex = 0;
      }

      // Read a whole code point (possibly a surrogate pair) and advance past it.
      int c =
          this.charUtils.codePointAt(
              this.ioBuffer.getBuffer(), this.bufferIndex, this.ioBuffer.getLength());
      int charCount = Character.charCount(c);
      this.bufferIndex += charCount;
      if (this.isTokenChar(c)) {
        if (length == 0) {
          assert start == -1;

          // First character of the token; bufferIndex was already advanced, so step back.
          start = this.offset + this.bufferIndex - charCount;
          end = start;
        } else if (length >= buffer.length - 1) {
          // Fewer than two free slots remain: grow so a two-char code point still fits.
          buffer = this.termAtt.resizeBuffer(2 + length);
        }

        end += charCount;
        // Character.toChars returns how many chars it wrote (1 or 2).
        length += Character.toChars(this.normalize(c), buffer, length);
        // ">=" rather than "==": a two-char write can step over the exact limit.
        if (length >= MAX_WORD_LEN) {
          break;
        }
      } else if (length > 0) {
        // A non-token character after at least one token character ends the token.
        break;
      }
    }

    this.termAtt.setLength(length);

    assert start != -1;

    // The corrected end offset is also remembered as the final offset.
    this.offsetAtt.setOffset(this.correctOffset(start), this.finalOffset = this.correctOffset(end));
    return true;
  }
예제 #29
0
 /**
  * Builds a string holding exactly the given code point.
  *
  * @param codePoint the code point to convert.
  * @return a one-char string when {@link Character#charCount(int)} reports a single char for
  *     {@code codePoint}, otherwise the string formed from {@link Character#toChars(int)}.
  */
 public static String newSingleCodePointString(int codePoint) {
   final boolean needsSurrogatePair = Character.charCount(codePoint) != 1;
   if (needsSurrogatePair) {
     return new String(Character.toChars(codePoint));
   }
   // Single-char case: skip the temporary char[] that Character.toChars would allocate.
   final char single = (char) codePoint;
   return String.valueOf(single);
 }
예제 #30
0
 /**
  * Installs the data of the selected Unicode version: stores the scalar settings, decodes the
  * packed intervals into {@link #propertyValueIntervals}, and registers the aliases.
  *
  * @param propertyValues The property values, in the same order as {@code intervals}.
  * @param intervals The packed character intervals, one string per entry of {@code
  *     propertyValues}; each string is read as consecutive (start, end) code point pairs.
  * @param propertyValueAliases Flattened alias/property value pairs: an alias at each even index,
  *     followed by the property value it refers to.
  * @param maximumCodePoint The maximum code point for the selected Unicode version.
  * @param caselessMatchPartitions The packed caseless match partition data; it is stored as-is
  *     and not unpacked by this method.
  * @param caselessMatchPartitionSize The partition data record length (the maximum number of
  *     elements in a caseless match partition) for the selected Unicode version.
  */
 private void bind(
     String[] propertyValues,
     String[] intervals,
     String[] propertyValueAliases,
     int maximumCodePoint,
     String caselessMatchPartitions,
     int caselessMatchPartitionSize) {
   this.maximumCodePoint = maximumCodePoint;
   // The caseless match sets are lazily initialized, so only the packed form is kept here.
   this.caselessMatchPartitionSize = caselessMatchPartitionSize;
   this.caselessMatchPartitions = caselessMatchPartitions;

   for (int i = 0; i < propertyValues.length; i++) {
     final String name = propertyValues[i];
     final String packed = intervals[i];

     // Decode consecutive (start, end) code point pairs into a character set.
     final IntCharSet charSet = new IntCharSet();
     int pos = 0;
     while (pos < packed.length()) {
       final int first = packed.codePointAt(pos);
       pos += Character.charCount(first);
       final int last = packed.codePointAt(pos);
       pos += Character.charCount(last);
       charSet.add(new Interval(first, last));
     }
     propertyValueIntervals.put(name, charSet);

     if (name.length() != 2) {
       continue;
     }
     // A two-letter value is also merged into the set registered under its first letter.
     final String firstLetter = name.substring(0, 1);
     IntCharSet firstLetterSet = propertyValueIntervals.get(firstLetter);
     if (firstLetterSet == null) {
       firstLetterSet = new IntCharSet();
       propertyValueIntervals.put(firstLetter, firstLetterSet);
     }
     firstLetterSet.add(charSet);
   }

   for (int i = 0; i < propertyValueAliases.length; i += 2) {
     final String alias = propertyValueAliases[i];
     final String target = propertyValueAliases[i + 1];
     final IntCharSet aliasedSet = propertyValueIntervals.get(target);
     // Aliases whose target has no registered set are skipped.
     if (aliasedSet != null) {
       propertyValueIntervals.put(alias, aliasedSet);
     }
   }

   bindInvariantIntervals();
 }