/** * Given a string, return an array of tokens. The separator can be escaped with the '\' character. * The '\' character may also be escaped by the '\' character. * * @param s the string to tokenize. * @param separator the separator char. * @param maxTokens the maxmimum number of tokens returned. If the max is reached, the remaining * part of s is appended to the end of the last token. * @return an array of tokens. */ public static String[] tokenize(String s, char separator, int maxTokens) { List tokens = new ArrayList(); StringBuilder token = new StringBuilder(); boolean prevIsEscapeChar = false; for (int i = 0; i < s.length(); i += Character.charCount(i)) { int currentChar = s.codePointAt(i); if (prevIsEscapeChar) { // Case 1: escaped character token.appendCodePoint(currentChar); prevIsEscapeChar = false; } else if (currentChar == separator && tokens.size() < maxTokens - 1) { // Case 2: separator tokens.add(token.toString()); token = new StringBuilder(); } else if (currentChar == '\\') { // Case 3: escape character prevIsEscapeChar = true; } else { // Case 4: regular character token.appendCodePoint(currentChar); } } if (token.length() > 0) { tokens.add(token.toString()); } return (String[]) tokens.toArray(new String[] {}); }
/** * When the cursor is moved by the user, we need to update its position. If it falls inside the * currently composing word, we don't reset the composition, and only update the cursor position. * * @param expectedMoveAmount How many java chars to move the cursor. Negative values move the * cursor backward, positive values move the cursor forward. * @return true if the cursor is still inside the composing word, false otherwise. */ public boolean moveCursorByAndReturnIfInsideComposingWord(final int expectedMoveAmount) { int actualMoveAmount = 0; int cursorPos = mCursorPositionWithinWord; // TODO: Don't make that copy. We can do this directly from mTypedWordCache. final int[] codePoints = StringUtils.toCodePointArray(mTypedWordCache); if (expectedMoveAmount >= 0) { // Moving the cursor forward for the expected amount or until the end of the word has // been reached, whichever comes first. while (actualMoveAmount < expectedMoveAmount && cursorPos < codePoints.length) { actualMoveAmount += Character.charCount(codePoints[cursorPos]); ++cursorPos; } } else { // Moving the cursor backward for the expected amount or until the start of the word // has been reached, whichever comes first. while (actualMoveAmount > expectedMoveAmount && cursorPos > 0) { --cursorPos; actualMoveAmount -= Character.charCount(codePoints[cursorPos]); } } // If the actual and expected amounts differ, we crossed the start or the end of the word // so the result would not be inside the composing word. if (actualMoveAmount != expectedMoveAmount) { return false; } mCursorPositionWithinWord = cursorPos; mCombinerChain.applyProcessedEvent( mCombinerChain.processEvent(mEvents, Event.createCursorMovedEvent(cursorPos))); return true; }
/**
 * Finds the first code point of {@code str} that occurs exactly once.
 *
 * @param str the text to scan; must not be {@code null}.
 * @return a string holding that single code point, or {@code null} when every code point is
 *     repeated (or the input is empty).
 */
public static String firstNonRepeatedMoreEffect(String str) {
  final int length = str.length();
  // TRUE while a code point has been met exactly once, FALSE as soon as it has been met again.
  HashMap<Integer, Boolean> seenOnlyOnce = new HashMap<Integer, Boolean>();

  int pos = 0;
  while (pos < length) {
    int codePoint = str.codePointAt(pos);
    seenOnlyOnce.put(codePoint, !seenOnlyOnce.containsKey(codePoint));
    pos += Character.charCount(codePoint);
  }

  // Second pass in text order so that the earliest unique code point wins.
  pos = 0;
  while (pos < length) {
    int codePoint = str.codePointAt(pos);
    if (seenOnlyOnce.get(codePoint)) {
      return new String(Character.toChars(codePoint));
    }
    pos += Character.charCount(codePoint);
  }
  return null;
}
private void createMarks(List<Mark> acc, Mark.ENTRY_PART part, String text, int firstMissing) { char[] chars = text.toCharArray(); int i = firstMissing; while ((i = editorFont.canDisplayUpTo(chars, i, chars.length)) != -1) { int cp = Character.codePointAt(chars, i); int start = i; i += Character.charCount(cp); Font font = FontFallbackManager.getCapableFont(cp); if (font == null) { continue; } // Look ahead to try to group as many characters as possible into this run. for (int cpn, ccn, j = i; j < chars.length; j += ccn) { cpn = Character.codePointAt(chars, j); ccn = Character.charCount(cpn); if (!editorFont.canDisplay(cpn) && font.canDisplay(cpn)) { i += ccn; } else { break; } } Mark m = new Mark(part, start, i); m.attributes = getAttributes(font); acc.add(m); } }
/**
 * Looks for the key-value separator (=,: or ' ') in the string.
 *
 * <p>A candidate separator directly preceded by a backslash is ignored. When the first
 * separator found is whitespace, the following whitespace is skipped and, if a ':' or '=' comes
 * next, that character's position is reported instead.
 *
 * <p>See also bugreport <a href="http://sourceforge.net/support/tracker.php?aid=1606595"
 * >#1606595</a>.
 *
 * @return The char number of key-value separator in a string. Note that if the string does not
 *     contain any separator this string is considered to be a key with empty string value, and
 *     this method returns <code>-1</code> to indicate there's no equals.
 */
private int searchEquals(String str) {
  final int length = str.length();
  int previous = 'a';
  int pos = 0;
  while (pos < length) {
    final int current = str.codePointAt(pos);
    if (previous != '\\') {
      if (current == '=' || current == ':') {
        return pos;
      }
      if (current == ' ' || current == '\t') {
        // Whitespace separates key and value unless it merely pads an explicit ':' or '='.
        int probe = str.offsetByCodePoints(pos, 1);
        while (probe < length) {
          final int next = str.codePointAt(probe);
          if (next == ':' || next == '=') {
            return probe;
          }
          if (next != ' ' && next != '\t') {
            return pos;
          }
          probe += Character.charCount(next);
        }
        return pos;
      }
    }
    previous = current;
    pos += Character.charCount(current);
  }
  return -1;
}
// Parse a single line from the given configuration file, adding the name // on the line to the names list. // private int parseLine(Class<?> service, URL u, BufferedReader r, int lc, List<String> names) throws IOException, ServiceConfigurationError { String ln = r.readLine(); if (ln == null) { return -1; } int ci = ln.indexOf('#'); if (ci >= 0) ln = ln.substring(0, ci); ln = ln.trim(); int n = ln.length(); if (n != 0) { if ((ln.indexOf(' ') >= 0) || (ln.indexOf('\t') >= 0)) fail(service, u, lc, "Illegal configuration-file syntax"); int cp = ln.codePointAt(0); if (!Character.isJavaIdentifierStart(cp)) fail(service, u, lc, "Illegal provider-class name: " + ln); for (int i = Character.charCount(cp); i < n; i += Character.charCount(cp)) { cp = ln.codePointAt(i); if (!Character.isJavaIdentifierPart(cp) && (cp != '.')) fail(service, u, lc, "Illegal provider-class name: " + ln); } if (!providers.containsKey(ln) && !names.contains(ln)) names.add(ln); } return lc + 1; }
public int guessFullNameStyle(String name) { if (name == null) { return FullNameStyle.UNDEFINED; } int nameStyle = FullNameStyle.UNDEFINED; int length = name.length(); int offset = 0; while (offset < length) { int codePoint = Character.codePointAt(name, offset); if (Character.isLetter(codePoint)) { UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint); if (!isLatinUnicodeBlock(unicodeBlock)) { if (isCJKUnicodeBlock(unicodeBlock)) { // We don't know if this is Chinese, Japanese or Korean - // trying to figure out by looking at other characters in the name return guessCJKNameStyle(name, offset + Character.charCount(codePoint)); } if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) { return FullNameStyle.JAPANESE; } if (isKoreanUnicodeBlock(unicodeBlock)) { return FullNameStyle.KOREAN; } } nameStyle = FullNameStyle.WESTERN; } offset += Character.charCount(codePoint); } return nameStyle; }
private int compareNumbers(String str0, String str1, int[] pos) { int delta = 0; int zeroes0 = 0, zeroes1 = 0; int ch0 = -1, ch1 = -1; // Skip leading zeroes, but keep a count of them. while (pos[0] < str0.length() && isZero(ch0 = str0.codePointAt(pos[0]))) { zeroes0++; pos[0] += Character.charCount(ch0); } while (pos[1] < str1.length() && isZero(ch1 = str1.codePointAt(pos[1]))) { zeroes1++; pos[1] += Character.charCount(ch1); } // If one sequence contains more significant digits than the // other, it's a larger number. In case they turn out to have // equal lengths, we compare digits at each position; the first // unequal pair determines which is the bigger number. while (true) { boolean noMoreDigits0 = (ch0 < 0) || !isDigit(ch0); boolean noMoreDigits1 = (ch1 < 0) || !isDigit(ch1); if (noMoreDigits0 && noMoreDigits1) { return delta != 0 ? delta : zeroes0 - zeroes1; } else if (noMoreDigits0) { return -1; } else if (noMoreDigits1) { return 1; } else if (delta == 0 && ch0 != ch1) { delta = valueOf(ch0) - valueOf(ch1); } if (pos[0] < str0.length()) { ch0 = str0.codePointAt(pos[0]); if (isDigit(ch0)) { pos[0] += Character.charCount(ch0); } else { ch0 = -1; } } else { ch0 = -1; } if (pos[1] < str1.length()) { ch1 = str1.codePointAt(pos[1]); if (isDigit(ch1)) { pos[1] += Character.charCount(ch1); } else { ch1 = -1; } } else { ch1 = -1; } } }
/** * Reads next line from the input and: * * <ul> * <li>Converts ascii-encoded \\uxxxx chars to normal characters. * <li>Converts \r, \n and \t to CR, line feed and tab. * <li>But! Keeps a backspace in '\ ', '\=', '\:' etc (non-trimmable space or * non-key-value-breaking :-) equals). * <ul> * Change from BufferedReader to LinebreakPreservingReader was part of fix for bug 1462566 */ protected String getNextLine(LinebreakPreservingReader reader) throws IOException, TranslationException { String ascii = reader.readLine(); if (ascii == null) { return null; } StringBuilder result = new StringBuilder(); for (int cp, len = ascii.length(), i = 0; i < len; i += Character.charCount(cp)) { cp = ascii.codePointAt(i); if (cp == '\\' && ascii.codePointCount(i, len) > 1) { i += Character.charCount(cp); cp = ascii.codePointAt(i); if (cp != 'u') { if (cp == 'n') { cp = '\n'; } else if (cp == 'r') { cp = '\r'; } else if (cp == 't') { cp = '\t'; } else { result.append('\\'); } } else if (dontUnescapeULiterals) { // Put back the \ we swallowed result.append('\\'); } else { // checking if the string is long enough if (ascii.codePointCount(i, len) < 1 + 4) { throw new TranslationException(OStrings.getString("RBFH_ERROR_ILLEGAL_U_SEQUENCE")); } int uStart = ascii.offsetByCodePoints(i, 1); int uEnd = ascii.offsetByCodePoints(uStart, 4); String uStr = ascii.substring(uStart, uEnd); try { cp = Integer.parseInt(uStr, 16); if (!Character.isValidCodePoint(cp)) { throw new TranslationException(OStrings.getString("RBFH_ERROR_ILLEGAL_U_SEQUENCE")); } i = uEnd - Character.charCount(cp); } catch (NumberFormatException ex) { throw new TranslationException(OStrings.getString("RBFH_ERROR_ILLEGAL_U_SEQUENCE"), ex); } } } result.appendCodePoint(cp); } return result.toString(); }
/** * This method ensures that the output String has only valid XML unicode characters as specified * by the XML 1.0 standard. For reference, please see the standard. This method will return an * empty String if the input is null or empty. * * @author Donoiu Cristian, GPL The String whose non-valid characters we want to remove. in * String, stripped of non-valid characters. */ public static String removeInvalidXMLCharacters(String s) { StringBuilder out = new StringBuilder(); // Used to hold the output. int codePoint; // Used to reference the current character. // String ss = "\ud801\udc00"; // This is actualy one unicode // character, represented by two code units!!!. // System.out.println(ss.codePointCount(0, ss.length()));// See: 1 int i = 0; while (i < s.length()) { // System.out.println("i=" + i); codePoint = s.codePointAt(i); // This is the unicode code of the character. if ((codePoint == 0x9) || // Consider testing larger ranges first to improve speed. (codePoint == 0xA) || (codePoint == 0xD) || ((codePoint >= 0x20) && (codePoint <= 0xD7FF)) || ((codePoint >= 0xE000) && (codePoint <= 0xFFFD)) || ((codePoint >= 0x10000) && (codePoint <= 0x10FFFF))) { out.append(Character.toChars(codePoint)); } i += Character.charCount( codePoint); // Increment with the number of code units(java chars) needed to represent // a Unicode char. } return out.toString(); }
/**
 * Wraps {@code value} in {@code delimiter} characters and escapes its content with backslashes.
 *
 * <p>Code points above 0xFF are copied unchanged. Below that, the delimiter and the backslash
 * are backslash-escaped, newline, tab, carriage return and form feed become \n, \t, \r and \f,
 * printable ASCII (32..126) is copied, and everything else is written as a three-digit octal
 * escape.
 *
 * @param value the string to quote.
 * @param delimiter the quote character placed at both ends and escaped inside.
 * @return the quoted representation of {@code value}.
 */
public static String enquoteCString(String value, char delimiter) {
  final int end = value.length();
  final StringBuilder quoted = new StringBuilder();
  quoted.append(delimiter);
  int pos = 0;
  while (pos < end) {
    final int cp = value.codePointAt(pos);
    pos += Character.charCount(cp);
    if (cp > 0xFF) {
      quoted.appendCodePoint(cp);
      continue;
    }
    // The delimiter is tested before the fixed escapes, so it wins if it is one of them.
    if (cp == delimiter) {
      quoted.append('\\').append(delimiter);
      continue;
    }
    switch (cp) {
      case '\\':
        quoted.append("\\\\");
        break;
      case '\n':
        quoted.append("\\n");
        break;
      case '\t':
        quoted.append("\\t");
        break;
      case '\r':
        quoted.append("\\r");
        break;
      case '\f':
        quoted.append("\\f");
        break;
      default:
        if (cp >= 32 && cp < 127) {
          quoted.append((char) cp);
        } else {
          quoted.append("\\");
          quoted.append(String.format("%03o", cp));
        }
        break;
    }
  }
  quoted.append(delimiter);
  return quoted.toString();
}
/**
 * Adds a double-quote at the beginning and end of the string and escapes special characters
 * with a backslash: double quote, backslash, newline, tab, carriage return and form feed.
 *
 * <p>Printable ASCII (32..126) is copied unchanged. Any other code point up to 255 is stored as
 * \xhh, code points from 256 to 65535 as \\uhhhh, and code points above 65535 as \Uhhhhhhhh
 * (lower-case hexadecimal digits in all three forms).
 *
 * @param value any string
 * @return Python like textual representation of the string
 */
public static String enquoteKString(String value) {
  final int end = value.length();
  final StringBuilder quoted = new StringBuilder();
  quoted.append("\"");
  int pos = 0;
  while (pos < end) {
    final int cp = value.codePointAt(pos);
    pos += Character.charCount(cp);
    switch (cp) {
      case '"':
        quoted.append("\\\"");
        break;
      case '\\':
        quoted.append("\\\\");
        break;
      case '\n':
        quoted.append("\\n");
        break;
      case '\t':
        quoted.append("\\t");
        break;
      case '\r':
        quoted.append("\\r");
        break;
      case '\f':
        quoted.append("\\f");
        break;
      default:
        if (cp >= 32 && cp < 127) {
          quoted.append((char) cp);
        } else if (cp <= 0xff) {
          quoted.append("\\x").append(String.format("%02x", cp));
        } else if (cp <= 0xffff) {
          quoted.append("\\u").append(String.format("%04x", cp));
        } else {
          quoted.append("\\U").append(String.format("%08x", cp));
        }
        break;
    }
  }
  quoted.append("\"");
  return quoted.toString();
}
/**
 * Writes {@code str} to {@code _buf} as UTF-8 bytes followed by a single zero byte.
 *
 * @param str the string to write.
 * @return the number of bytes written, including the trailing zero byte.
 */
protected int _put(String str) {
  final int chars = str.length();
  int written = 0;
  int pos = 0;
  while (pos < chars) {
    final int cp = Character.codePointAt(str, pos);
    pos += Character.charCount(cp);
    // Encoded size in bytes, chosen by code point magnitude.
    final int size = cp < 0x80 ? 1 : cp < 0x800 ? 2 : cp < 0x10000 ? 3 : 4;
    switch (size) {
      case 1:
        _buf.write((byte) cp);
        break;
      case 2:
        _buf.write((byte) (0xc0 + (cp >> 6)));
        _buf.write((byte) (0x80 + (cp & 0x3f)));
        break;
      case 3:
        _buf.write((byte) (0xe0 + (cp >> 12)));
        _buf.write((byte) (0x80 + ((cp >> 6) & 0x3f)));
        _buf.write((byte) (0x80 + (cp & 0x3f)));
        break;
      default:
        _buf.write((byte) (0xf0 + (cp >> 18)));
        _buf.write((byte) (0x80 + ((cp >> 12) & 0x3f)));
        _buf.write((byte) (0x80 + ((cp >> 6) & 0x3f)));
        _buf.write((byte) (0x80 + (cp & 0x3f)));
        break;
    }
    written += size;
  }
  // Terminating zero byte.
  _buf.write((byte) 0);
  return written + 1;
}
/**
 * Guesses the phonetic-name style from the first letter of {@code name} that belongs to a
 * Japanese phonetic, Korean or Latin Unicode block (checked in that order).
 *
 * @param name the phonetic name to classify; may be {@code null}.
 * @return a {@code PhoneticNameStyle} constant; {@code UNDEFINED} for {@code null} or when no
 *     letter from one of those blocks is found.
 */
public int guessPhoneticNameStyle(String name) {
  if (name == null) {
    return PhoneticNameStyle.UNDEFINED;
  }
  final int end = name.length();
  for (int pos = 0, cp; pos < end; pos += Character.charCount(cp)) {
    cp = Character.codePointAt(name, pos);
    if (!Character.isLetter(cp)) {
      continue;
    }
    final UnicodeBlock block = UnicodeBlock.of(cp);
    if (isJapanesePhoneticUnicodeBlock(block)) {
      return PhoneticNameStyle.JAPANESE;
    }
    if (isKoreanUnicodeBlock(block)) {
      return PhoneticNameStyle.KOREAN;
    }
    if (isLatinUnicodeBlock(block)) {
      return PhoneticNameStyle.PINYIN;
    }
  }
  return PhoneticNameStyle.UNDEFINED;
}
/** * Logically casts input to UTF32 ints then looks up the output or null if the input is not * accepted. FST must be INPUT_TYPE.BYTE4. */ public static <T> T get(FST<T> fst, CharSequence input) throws IOException { assert fst.inputType == FST.INPUT_TYPE.BYTE4; // TODO: would be nice not to alloc this on every lookup final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); int charIdx = 0; final int charLimit = input.length(); // Accumulate output as we go final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; while (charIdx < charLimit) { final int utf32 = Character.codePointAt(input, charIdx); charIdx += Character.charCount(utf32); if (fst.findTargetArc(utf32, arc, arc) == null) { return null; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) { return null; } else if (arc.output != NO_OUTPUT) { return fst.outputs.add(output, arc.output); } else { return output; } }
/** * Remove invalid XML characters from the string * * @param text The string containing xml message. * @return */ private static String removeNonXMLCharacters(String text) { StringBuilder out = new StringBuilder(); StringBuilder errorMessage = null; int codePoint; int index = 0; while (index < text.length()) { codePoint = text.codePointAt(index); if ((codePoint == 0x9) || (codePoint == 0xA) || (codePoint == 0xD) || ((codePoint >= 0x20) && (codePoint <= 0xD7FF)) || ((codePoint >= 0xE000) && (codePoint <= 0xFFFD)) || ((codePoint >= 0x10000) && (codePoint <= 0x10FFFF))) { out.append(Character.toChars(codePoint)); } else { if (errorMessage == null) { errorMessage = new StringBuilder(); errorMessage.append( "The message from xdebug contains invalid XML characters: "); // NOI18N } else { errorMessage.append(", "); // NOI18N } errorMessage.append(codePoint); } index += Character.charCount(codePoint); } if (errorMessage != null) { errorMessage.append( "\nPlease mentioned it in http://netbeans.org/bugzilla/show_bug.cgi?id=179309."); // NOI18N LOGGER.warning(errorMessage.toString()); } return out.toString(); }
/**
 * Validates that the name of a processor option conforms to the grammar defined by <code>
 * javax.annotation.processing.Processor.getSupportedOptions()</code>: one or more Java
 * identifiers separated by single dots.
 *
 * @param optionName the option name to check; may be {@code null}.
 * @return <code>true</code> if the name conforms to the grammar, <code>false</code> if not
 *     (including for {@code null}, the empty string, and names ending in a dot).
 */
public static boolean isValidOptionName(String optionName) {
  if (optionName == null) {
    return false;
  }
  // True at the very beginning and right after each dot, i.e. wherever an identifier must start.
  boolean atIdentifierStart = true;
  int pos = 0;
  while (pos < optionName.length()) {
    final int cp = optionName.codePointAt(pos);
    pos += Character.charCount(cp);
    if (atIdentifierStart) {
      if (!Character.isJavaIdentifierStart(cp)) {
        return false;
      }
      atIdentifierStart = false;
    } else if (cp == '.') {
      atIdentifierStart = true;
    } else if (!Character.isJavaIdentifierPart(cp)) {
      return false;
    }
  }
  return !atIdentifierStart;
}
/** * Escapes a character sequence so that it is valid XML. * * @param s The character sequence. * @return The escaped version of the character sequence. */ public static String escapeXML(CharSequence s) { // double quote -- quot // ampersand -- amp // less than -- lt // greater than -- gt // apostrophe -- apos StringBuilder sb = new StringBuilder(s.length() * 2); for (int i = 0; i < s.length(); ) { int codePoint = Character.codePointAt(s, i); if (codePoint == '<') { sb.append(LT); } else if (codePoint == '>') { sb.append(GT); } else if (codePoint == '\"') { sb.append(QUOT); } else if (codePoint == '&') { sb.append(AMP); } else if (codePoint == '\'') { sb.append(APOS); } else { sb.appendCodePoint(codePoint); } i += Character.charCount(codePoint); } return sb.toString(); }
// http://java.sun.com/developer/technicalArticles/Intl/Supplementary/ private String buildString(int codePoint) { if (Character.charCount(codePoint) == 1) { return String.valueOf((char) codePoint); } else { return new String(Character.toChars(codePoint)); } }
/** * Examines the string and returns whether we're inside a double quote. * * <p>This is used to decide whether we should put an automatic space before or after a double * quote character. If we're inside a quotation, then we want to close it, so we want a space * after and not before. Otherwise, we want to open the quotation, so we want a space before and * not after. Exception: after a digit, we never want a space because the "inch" or "minutes" use * cases is dominant after digits. In the practice, we determine whether we are in a quotation or * not by finding the previous double quote character, and looking at whether it's followed by * whitespace. If so, that was a closing quotation mark, so we're not inside a double quote. If * it's not followed by whitespace, then it was an opening quotation mark, and we're inside a * quotation. * * @param text the text to examine. * @return whether we're inside a double quote. */ public static boolean isInsideDoubleQuoteOrAfterDigit(final CharSequence text) { int i = text.length(); if (0 == i) return false; int codePoint = Character.codePointBefore(text, i); if (Character.isDigit(codePoint)) return true; int prevCodePoint = 0; while (i > 0) { codePoint = Character.codePointBefore(text, i); if (Constants.CODE_DOUBLE_QUOTE == codePoint) { // If we see a double quote followed by whitespace, then that // was a closing quote. if (Character.isWhitespace(prevCodePoint)) return false; } if (Character.isWhitespace(codePoint) && Constants.CODE_DOUBLE_QUOTE == prevCodePoint) { // If we see a double quote preceded by whitespace, then that // was an opening quote. No need to continue seeking. return true; } i -= Character.charCount(codePoint); prevCodePoint = codePoint; } // We reached the start of text. If the first char is a double quote, then we're inside // a double quote. Otherwise we're not. return Constants.CODE_DOUBLE_QUOTE == codePoint; }
/**
 * Determines if a character sequence is a QName. A QName is either an NCName (LocalName), or an
 * NCName followed by a colon followed by another NCName (where the first NCName is referred to
 * as the 'Prefix Name' and the second NCName is referred to as the 'Local Name' - i.e.
 * PrefixName:LocalName).
 *
 * <p>Consequently a sequence that starts with a colon, contains more than one colon, or ends
 * with a colon (empty local name) is not a QName.
 *
 * @param s The character sequence to be tested.
 * @return {@code true} if {@code s} is a QName, otherwise {@code false}.
 */
public static boolean isQName(CharSequence s) {
  if (isNullOrEmpty(s)) {
    return false;
  }
  boolean foundColon = false;
  boolean inNCName = false;
  for (int i = 0; i < s.length(); ) {
    int codePoint = Character.codePointAt(s, i);
    if (codePoint == ':') {
      // At most one colon is allowed, and it must terminate a non-empty prefix.
      if (foundColon || !inNCName) {
        return false;
      }
      foundColon = true;
      inNCName = false;
    } else if (!inNCName) {
      if (!isXMLNameStartCharacter(codePoint)) {
        return false;
      }
      inNCName = true;
    } else if (!isXMLNameChar(codePoint)) {
      return false;
    }
    i += Character.charCount(codePoint);
  }
  // inNCName is false here only when the input ended right after the colon, i.e. the local
  // name is missing; such input used to be accepted by an unconditional "return true".
  return inNCName;
}
// @VisibleForTesting public static boolean isValidUriCharset(String uri) { int len = uri.length(); int i = 0; while (i < len) { int codePoint = uri.codePointAt(i); i += Character.charCount(codePoint); if (Character.isSupplementaryCodePoint(codePoint)) { continue; } if (HREF_DISCRETE_UCSCHAR.indexOf(codePoint) >= 0) { continue; } // iunreserved ranges if (('a' <= codePoint && codePoint <= 'z') || ('A' <= codePoint && codePoint <= 'Z') || ('0' <= codePoint && codePoint <= '9')) { continue; } // href-ucschar ranges if ((0 <= codePoint && codePoint <= 0x1F) || (0x7F <= codePoint && codePoint <= 0xD7FF) || (0xE000 <= codePoint && codePoint <= 0xFFFD)) { continue; } return false; } return true; }
/**
 * Returns the index of the last code point of {@code str}, starting at or before {@code
 * offset}, that also occurs in {@code search}. Both strings are compared code point by code
 * point, so supplementary characters are matched as a whole.
 *
 * <p>The {@code offset} follows the convention of {@link String#lastIndexOf(int, int)}: a value
 * at or beyond the end of {@code str} searches the whole string, a negative value finds nothing.
 *
 * @param str the string to search in.
 * @param search the set of code points to look for.
 * @param offset the char index to start the backward search from.
 * @return the char index at which the matching code point starts, or {@code -1} if there is no
 *     match or either string is empty.
 */
public static int lastIndexOfAny(String str, String search, int offset) {
  if (str.isEmpty() || search.isEmpty() || offset < 0) {
    return -1;
  }
  // Exclusive end of the region to scan; written this way to avoid overflow of offset + 1.
  int end = offset >= str.length() - 1 ? str.length() : offset + 1;
  // If the code point starting at offset is a surrogate pair, include its second half so the
  // pair is seen as one code point.
  if (end < str.length()
      && Character.isHighSurrogate(str.charAt(end - 1))
      && Character.isLowSurrogate(str.charAt(end))) {
    end++;
  }
  while (end > 0) {
    final int strCodepoint = str.codePointBefore(end);
    final int start = end - Character.charCount(strCodepoint);
    for (int j = 0, searchCodepoint;
        j < search.length();
        j += Character.charCount(searchCodepoint)) {
      searchCodepoint = search.codePointAt(j);
      if (strCodepoint == searchCodepoint) {
        // Report where the match starts, not the position just past it.
        return start;
      }
    }
    end = start;
  }
  return -1;
}
/**
 * Determines if a character sequence is an NCName (Non-Colonised Name). An NCName is a string
 * which starts with an NCName start character and is followed by zero or more NCName
 * characters.
 *
 * @param s The character sequence to be tested.
 * @return {@code true} if {@code s} is an NCName, otherwise {@code false} (also when {@code s}
 *     is null or empty).
 */
public static boolean isNCName(CharSequence s) {
  if (isNullOrEmpty(s)) {
    return false;
  }
  final int first = Character.codePointAt(s, 0);
  if (!isNCNameStartChar(first)) {
    return false;
  }
  // Every remaining code point must be a (non-start) NCName character.
  int pos = Character.charCount(first);
  while (pos < s.length()) {
    final int cp = Character.codePointAt(s, pos);
    if (!isNCNameChar(cp)) {
      return false;
    }
    pos += Character.charCount(cp);
  }
  return true;
}
/**
 * Returns the index of the first code point of {@code str}, at or after {@code offset}, that
 * also occurs in {@code search}. Both strings are compared code point by code point, so
 * supplementary characters are matched as a whole.
 *
 * <p>The {@code offset} follows the convention of {@link String#indexOf(int, int)}: a negative
 * value is treated as 0, a value at or beyond the end of {@code str} finds nothing.
 *
 * @param str the string to search in.
 * @param search the set of code points to look for.
 * @param offset the char index to start the search from.
 * @return the char index of the first match, or {@code -1} if there is no match or either
 *     string is empty.
 */
public static int indexOfAny(String str, String search, int offset) {
  if (str.isEmpty() || search.isEmpty()) {
    return -1;
  }
  // Start at the requested offset; previously the parameter was ignored and the scan always
  // began at index 0.
  for (int i = Math.max(offset, 0), strCodepoint;
      i < str.length();
      i += Character.charCount(strCodepoint)) {
    strCodepoint = str.codePointAt(i);
    for (int j = 0, searchCodepoint;
        j < search.length();
        j += Character.charCount(searchCodepoint)) {
      searchCodepoint = search.codePointAt(j);
      if (strCodepoint == searchCodepoint) {
        return i;
      }
    }
  }
  return -1;
}
public void testRandomRealisticWhiteSpace() throws IOException { Map<String, String> map = new HashMap<>(); Set<String> seen = new HashSet<>(); int numTerms = atLeast(50); boolean ignoreCase = random().nextBoolean(); for (int i = 0; i < numTerms; i++) { String randomRealisticUnicodeString = TestUtil.randomRealisticUnicodeString(random()); char[] charArray = randomRealisticUnicodeString.toCharArray(); StringBuilder builder = new StringBuilder(); for (int j = 0; j < charArray.length; ) { int cp = Character.codePointAt(charArray, j, charArray.length); if (!Character.isWhitespace(cp)) { builder.appendCodePoint(cp); } j += Character.charCount(cp); } if (builder.length() > 0) { String inputValue = builder.toString(); // Make sure we don't try to add two inputs that vary only by case: String seenInputValue; if (ignoreCase) { // TODO: can we simply use inputValue.toLowerCase(Locale.ROOT)??? char[] buffer = inputValue.toCharArray(); CharacterUtils.toLowerCase(buffer, 0, buffer.length); seenInputValue = buffer.toString(); } else { seenInputValue = inputValue; } if (seen.contains(seenInputValue) == false) { seen.add(seenInputValue); String value = TestUtil.randomSimpleString(random()); map.put(inputValue, value.isEmpty() ? 
"a" : value); } } } if (map.isEmpty()) { map.put("booked", "books"); } StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase); Set<Entry<String, String>> entrySet = map.entrySet(); StringBuilder input = new StringBuilder(); List<String> output = new ArrayList<>(); for (Entry<String, String> entry : entrySet) { builder.add(entry.getKey(), entry.getValue()); if (random().nextBoolean() || output.isEmpty()) { input.append(entry.getKey()).append(" "); output.add(entry.getValue()); } } Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(input.toString())); TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.build())); assertTokenStreamContents(stream, output.toArray(new String[0])); }
/**
 * Returns true if the given string is accepted by the automaton.
 *
 * <p>Singleton automata are compared by string equality. Deterministic automata are walked one
 * state at a time; otherwise the set of currently reachable states is tracked per code point.
 *
 * <p>Complexity: linear in the length of the string.
 *
 * <p><b>Note:</b> for full performance, use the {@link RunAutomaton} class.
 */
public static boolean run(Automaton a, String s) {
  if (a.isSingleton()) {
    return s.equals(a.singleton);
  }

  if (a.deterministic) {
    State current = a.initial;
    int pos = 0;
    while (pos < s.length()) {
      final int cp = s.codePointAt(pos);
      final State next = current.step(cp);
      if (next == null) {
        return false;
      }
      current = next;
      pos += Character.charCount(cp);
    }
    return current.accept;
  }

  final State[] states = a.getNumberedStates();
  LinkedList<State> active = new LinkedList<State>();
  LinkedList<State> upcoming = new LinkedList<State>();
  // Marks the states already queued for the next step, indexed by state number.
  final BitSet queued = new BitSet(states.length);
  active.add(a.initial);
  final ArrayList<State> targets = new ArrayList<State>();
  boolean accept = a.initial.accept;
  int pos = 0;
  while (pos < s.length()) {
    final int cp = s.codePointAt(pos);
    accept = false;
    upcoming.clear();
    queued.clear();
    for (State from : active) {
      targets.clear();
      from.step(cp, targets);
      for (State to : targets) {
        if (to.accept) {
          accept = true;
        }
        if (!queued.get(to.number)) {
          queued.set(to.number);
          upcoming.add(to);
        }
      }
    }
    // Swap the two lists so the states just collected become the active set.
    final LinkedList<State> swap = active;
    active = upcoming;
    upcoming = swap;
    pos += Character.charCount(cp);
  }
  return accept;
}
/**
 * Reads code points from the input, refilling the I/O buffer as needed, and collects the next
 * run of token characters (normalized through {@code normalize}) into the term attribute. A
 * token ends at the first non-token character, at end of input, or once its length reaches
 * {@code MAX_WORD_LEN}.
 *
 * @return {@code true} if a token was produced, {@code false} at end of input.
 */
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  int termLength = 0;
  int tokenStart = -1;
  int tokenEnd = -1;
  char[] termBuffer = termAtt.buffer();
  while (true) {
    if (bufferIndex >= dataLen) {
      // Current buffer is used up: account for it in the running offset and refill.
      offset += dataLen;
      charUtils.fill(ioBuffer, input);
      if (ioBuffer.getLength() == 0) {
        dataLen = 0;
        if (termLength <= 0) {
          finalOffset = correctOffset(offset);
          return false;
        }
        break;
      }
      dataLen = ioBuffer.getLength();
      bufferIndex = 0;
    }

    final int codePoint =
        charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
    final int width = Character.charCount(codePoint);
    bufferIndex += width;

    if (!isTokenChar(codePoint)) {
      if (termLength > 0) {
        // A non-token character right after token characters ends the token.
        break;
      }
      continue;
    }

    if (termLength == 0) {
      // First character of the token.
      assert tokenStart == -1;
      tokenStart = offset + bufferIndex - width;
      tokenEnd = tokenStart;
    } else if (termLength >= termBuffer.length - 1) {
      // Make sure a supplementary character still fits.
      termBuffer = termAtt.resizeBuffer(2 + termLength);
    }
    tokenEnd += width;
    termLength += Character.toChars(normalize(codePoint), termBuffer, termLength);
    if (termLength >= MAX_WORD_LEN) {
      break;
    }
  }

  termAtt.setLength(termLength);
  assert tokenStart != -1;
  final int correctedStart = correctOffset(tokenStart);
  finalOffset = correctOffset(tokenEnd);
  offsetAtt.setOffset(correctedStart, finalOffset);
  return true;
}
public static String newSingleCodePointString(int codePoint) { if (Character.charCount(codePoint) == 1) { // Optimization: avoid creating a temporary array for characters that are // represented by a single char value return String.valueOf((char) codePoint); } // For surrogate pair return new String(Character.toChars(codePoint)); }
/** * Unpacks data for the selected Unicode version, populating {@link #propertyValueIntervals}. * * @param propertyValues The list of property values, in same order as the packed data * corresponding to them, in the given intervals, for the selected Unicode version. * @param intervals The packed character intervals corresponding to and in the same order as the * given propertyValues, for the selected Unicode version. * @param propertyValueAliases Key/value pairs mapping property value aliases to property values, * for the selected Unicode version. * @param maximumCodePoint The maximum code point for the selected Unicode version. * @param caselessMatchPartitions The packed caseless match partition data for the selected * Unicode version * @param caselessMatchPartitionSize The partition data record length (the maximum number of * elements in a caseless match partition) for the selected Unicode version. */ private void bind( String[] propertyValues, String[] intervals, String[] propertyValueAliases, int maximumCodePoint, String caselessMatchPartitions, int caselessMatchPartitionSize) { // IntCharSet caselessMatches[] is lazily initialized - don't unpack here this.caselessMatchPartitions = caselessMatchPartitions; this.caselessMatchPartitionSize = caselessMatchPartitionSize; this.maximumCodePoint = maximumCodePoint; for (int n = 0; n < propertyValues.length; ++n) { String propertyValue = propertyValues[n]; String propertyIntervals = intervals[n]; IntCharSet set = new IntCharSet(); for (int index = 0; index < propertyIntervals.length(); ) { int start = propertyIntervals.codePointAt(index); index += Character.charCount(start); int end = propertyIntervals.codePointAt(index); index += Character.charCount(end); set.add(new Interval(start, end)); } propertyValueIntervals.put(propertyValue, set); if (2 == propertyValue.length()) { String singleLetter = propertyValue.substring(0, 1); IntCharSet singleLetterPropValueSet = propertyValueIntervals.get(singleLetter); if (null == 
singleLetterPropValueSet) { singleLetterPropValueSet = new IntCharSet(); propertyValueIntervals.put(singleLetter, singleLetterPropValueSet); } singleLetterPropValueSet.add(set); } } for (int n = 0; n < propertyValueAliases.length; n += 2) { String alias = propertyValueAliases[n]; String propertyValue = propertyValueAliases[n + 1]; IntCharSet targetSet = propertyValueIntervals.get(propertyValue); if (null != targetSet) { propertyValueIntervals.put(alias, targetSet); } } bindInvariantIntervals(); }