public static void writeString(Writer out, String value) throws IOException { out.write('"'); char[] array = null; for (int i = 0; i < value.length(); i++) { char c = value.charAt(i); switch (c) { case '"': out.write("\\\""); break; case '\\': out.write("\\\\"); break; case '\n': out.write("\\n"); break; case '\t': out.write("\\t"); break; case '\r': out.write("\\r"); break; case '\0': out.write("\\0"); break; default: if (Character.isISOControl(c)) { // Encode as: "x" + two hex digits. if (array == null) { array = new char[4]; array[0] = '\\'; } array[1] = 'x'; array[3] = Hex.charAt(c & 0xf); c >>= 4; array[2] = Hex.charAt(c & 0xf); out.write(array, 0, 4); } else if (Character.isHighSurrogate(c)) { // Surrogate pair i++; if (i >= value.length()) { throw new IllegalArgumentException("high surrogate not followed by anything"); } char c2 = value.charAt(i); if (!Character.isLowSurrogate(c2)) { throw new IllegalArgumentException("high surrogate not followed by low surrogate"); } out.write(value, i - 1, 2); } else if (Character.isLowSurrogate(c)) { throw new IllegalArgumentException("low surrogate without preceding high surrogate"); } else { // Basic Multilingual Plane (16 bits) out.write(c); } } } out.write('"'); }
/**
 * Makes a single pass over {@code inputData}, collecting the index of every {@code '\n'} and
 * validating the characters encountered.
 *
 * <p>Unpaired surrogates are handed to {@code recordSurrogateError}. ISO control characters other
 * than CR, LF and TAB are registered with the session context as {@code TTEG02} input errors and
 * overwritten in place with a space.
 *
 * @param inputData the input text; may be modified in place
 * @return the newline indices, preceded by an initial {@code -1} entry
 * @throws SnuggleParseException propagated from the error-recording calls
 */
private int[] calculateNewlineIndicesAndCheckCodePoints(StringBuilder inputData)
    throws SnuggleParseException {
  final List<Integer> newlinePositions = new ArrayList<Integer>();
  newlinePositions.add(Integer.valueOf(-1));

  char previous = 0;
  final int total = inputData.length();
  for (int pos = 0; pos < total; pos++) {
    final char before = previous;
    final char current = inputData.charAt(pos);
    // Remember this char for the next iteration (and for the trailing check after the loop),
    // regardless of which branch below is taken.
    previous = current;

    if (current == '\n') {
      newlinePositions.add(Integer.valueOf(pos));
    }

    final int codePoint; // full Unicode code point
    if (Character.isHighSurrogate(before)) {
      if (!Character.isLowSurrogate(current)) {
        // The previous char was a high surrogate with no partner.
        recordSurrogateError(inputData, pos - 1, before);
        continue;
      }
      codePoint = Character.toCodePoint(before, current);
    } else if (Character.isLowSurrogate(current)) {
      // Low surrogate without a preceding high surrogate.
      recordSurrogateError(inputData, pos, current);
      continue;
    } else {
      codePoint = current;
    }

    // Only CR, LF and TAB are tolerated among the control characters.
    final boolean permittedControl =
        codePoint == '\r' || codePoint == '\n' || codePoint == '\t';
    if (Character.isISOControl(codePoint) && !permittedControl) {
      sessionContext.registerError(
          new InputError(
              CoreErrorCode.TTEG02,
              null,
              Integer.toHexString(codePoint),
              Integer.valueOf(pos)));
      inputData.setCharAt(pos, ' ');
    }
  }

  // A high surrogate as the very last char can never be completed.
  if (Character.isHighSurrogate(previous)) {
    recordSurrogateError(inputData, inputData.length() - 1, previous);
  }

  // Unbox the collected positions into the result array.
  final int[] result = new int[newlinePositions.size()];
  int slot = 0;
  for (Integer position : newlinePositions) {
    result[slot++] = position;
  }
  return result;
}
/** {@inheritDoc} */ @Override public int read() throws IOException { int ic = buffer.get(bufferPosition); // End of input if (ic == -1) { buffer.freeBefore(bufferPosition); return ic; } char c = (char) ic; // Skip surrogate pair characters if (Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) { iterationMarkSpanEndPosition = bufferPosition + 1; } // Free rolling buffer on full stop if (c == FULL_STOP_PUNCTUATION) { buffer.freeBefore(bufferPosition); iterationMarkSpanEndPosition = bufferPosition + 1; } // Normalize iteration mark if (isIterationMark(c)) { c = normalizeIterationMark(c); } bufferPosition++; return c; }
/**
 * Adjusts a prefix length so that cutting {@code input} at that length does not split a UTF-16
 * surrogate pair.
 *
 * <p>If the last char of the prefix ({@code input.charAt(len - 1)}) is a high surrogate and the
 * char that follows it is its low surrogate, the length is extended by one so the whole pair is
 * kept. In all other cases {@code len} is returned unchanged, including for a lone high
 * surrogate that has no low surrogate after it.
 *
 * @param input the string that will be truncated
 * @param len the requested prefix length, {@code 0 <= len <= input.length()}
 * @return {@code len}, or {@code len + 1} when that is needed to keep a surrogate pair together
 * @throws IndexOutOfBoundsException if {@code len} is negative or greater than
 *     {@code input.length()}
 */
public static int correctSubStringLen(String input, int len) {
  // An empty prefix has no last char to inspect (charAt(-1) would throw).
  if (len == 0) {
    return 0;
  }
  final boolean endsOnHighSurrogate = Character.isHighSurrogate(input.charAt(len - 1));
  // Only extend when a low surrogate really follows; never return a length past the end.
  if (endsOnHighSurrogate
      && len < input.length()
      && Character.isLowSurrogate(input.charAt(len))) {
    return len + 1;
  }
  return len;
}
/**
 * Tells whether {@code index} falls in the middle of a UTF-16 surrogate pair, i.e. the char just
 * before it is a high surrogate and the char at it is a low surrogate.
 *
 * @param str The String.
 * @param index The index
 * @return True if the index is between UTF-16 surrogate pair, false otherwise.
 */
@VisibleForTesting
static boolean isIndexBetweenUtf16SurrogatePair(CharSequence str, int index) {
  // Positions at either edge (or outside) cannot have a char on both sides.
  if (index <= 0 || index >= str.length()) {
    return false;
  }
  if (!Character.isHighSurrogate(str.charAt(index - 1))) {
    return false;
  }
  return Character.isLowSurrogate(str.charAt(index));
}
protected CoderResult encodeLoop(CharBuffer src, ByteBuffer dst) { int mark = src.position(); if (!doneBOM && src.hasRemaining()) { if (dst.remaining() < 4) return CoderResult.OVERFLOW; put(BOM_BIG, dst); doneBOM = true; } try { while (src.hasRemaining()) { char c = src.get(); if (!Character.isSurrogate(c)) { if (dst.remaining() < 4) return CoderResult.OVERFLOW; mark++; put(c, dst); } else if (Character.isHighSurrogate(c)) { if (!src.hasRemaining()) return CoderResult.UNDERFLOW; char low = src.get(); if (Character.isLowSurrogate(low)) { if (dst.remaining() < 4) return CoderResult.OVERFLOW; mark += 2; put(Character.toCodePoint(c, low), dst); } else { return CoderResult.malformedForLength(1); } } else { // assert Character.isLowSurrogate(c); return CoderResult.malformedForLength(1); } } return CoderResult.UNDERFLOW; } finally { src.position(mark); } }
/** * Returns the Unicode code point of the character at the given index. * * <p>Unlike {@link Character#codePointAt(CharSequence, int)} or {@link String#codePointAt(int)} * this method will never fail silently when encountering an invalid surrogate pair. * * <p>The behaviour of this method is as follows: * * <ol> * <li>If {@code index >= end}, {@link IndexOutOfBoundsException} is thrown. * <li><b>If the character at the specified index is not a surrogate, it is returned.</b> * <li>If the first character was a high surrogate value, then an attempt is made to read the * next character. * <ol> * <li><b>If the end of the sequence was reached, the negated value of the trailing high * surrogate is returned.</b> * <li><b>If the next character was a valid low surrogate, the code point value of the * high/low surrogate pair is returned.</b> * <li>If the next character was not a low surrogate value, then {@link * IllegalArgumentException} is thrown. * </ol> * <li>If the first character was a low surrogate value, {@link IllegalArgumentException} is * thrown. 
* </ol> * * @param seq the sequence of characters from which to decode the code point * @param index the index of the first character to decode * @param end the index beyond the last valid character to decode * @return the Unicode code point for the given index or the negated value of the trailing high * surrogate character at the end of the sequence */ protected static final int codePointAt(CharSequence seq, int index, int end) { if (index < end) { char c1 = seq.charAt(index++); if (c1 < Character.MIN_HIGH_SURROGATE || c1 > Character.MAX_LOW_SURROGATE) { // Fast path (first test is probably all we need to do) return c1; } else if (c1 <= Character.MAX_HIGH_SURROGATE) { // If the high surrogate was the last character, return its inverse if (index == end) { return -c1; } // Otherwise look for the low surrogate following it char c2 = seq.charAt(index); if (Character.isLowSurrogate(c2)) { return Character.toCodePoint(c1, c2); } throw new IllegalArgumentException( "Expected low surrogate but got char '" + c2 + "' with value " + (int) c2 + " at index " + index); } else { throw new IllegalArgumentException( "Unexpected low surrogate character '" + c1 + "' with value " + (int) c1 + " at index " + (index - 1)); } } throw new IndexOutOfBoundsException("Index exceeds specified range"); }
/**
 * Tries to turn the text accumulated in {@code buffer} into the character(s) it denotes and
 * commit them. Which layout is accepted depends on the buffer length and the current
 * {@code format}:
 *
 * <ul>
 *   <li>length 6, format other than {@code SPECIAL_ESCAPE}: one value parsed from positions 2..5,
 *       committed as a single char;
 *   <li>length 8, format {@code SPECIAL_ESCAPE}: one value parsed from positions 2..7, committed
 *       as a code point;
 *   <li>length 12, format {@code SURROGATE_PAIR}: two values parsed from positions 2..5 and
 *       8..11, committed only if they form a high/low surrogate pair.
 * </ul>
 *
 * <p>If nothing can be committed, {@code beep()} is called.
 */
private void finishComposition() {
  switch (buffer.length()) {
    case 6:
      if (format != SPECIAL_ESCAPE) {
        final char bmpChar = (char) getCodePoint(buffer, 2, 5);
        // 0xFFFF is rejected explicitly even though it is a valid code point value.
        if (Character.isValidCodePoint(bmpChar) && bmpChar != 0xFFFF) {
          buffer.setLength(0);
          buffer.append(bmpChar);
          sendCommittedText();
          return;
        }
      }
      break;
    case 8:
      if (format == SPECIAL_ESCAPE) {
        final int scalar = getCodePoint(buffer, 2, 7);
        if (Character.isValidCodePoint(scalar) && scalar != 0xFFFF) {
          buffer.setLength(0);
          buffer.appendCodePoint(scalar);
          sendCommittedText();
          return;
        }
      }
      break;
    case 12:
      if (format == SURROGATE_PAIR) {
        final char[] pair = {
          (char) getCodePoint(buffer, 2, 5), (char) getCodePoint(buffer, 8, 11)
        };
        if (Character.isHighSurrogate(pair[0]) && Character.isLowSurrogate(pair[1])) {
          buffer.setLength(0);
          buffer.append(pair);
          sendCommittedText();
          return;
        }
      }
      break;
    default:
      break;
  }
  beep();
}
/**
 * Returns the code point at the iterator's current position. When the current char is a high
 * surrogate followed by a low surrogate, the combined supplementary code point is returned;
 * otherwise the current char itself. The iterator index is left where it was.
 */
int getCurrent() {
  final char lead = text.current();
  if (!Character.isHighSurrogate(lead) || text.getIndex() >= text.getEndIndex()) {
    return lead;
  }
  // Peek one char ahead, then step back so the position is unchanged.
  final char trail = text.next();
  text.previous();
  return Character.isLowSurrogate(trail) ? Character.toCodePoint(lead, trail) : lead;
}
/**
 * Returns how many chars the code point at the current position occupies: 2 when the current
 * char is a high surrogate immediately followed by a low surrogate, 1 otherwise. The iterator
 * index is left where it was.
 */
private int getCurrentCodePointCount() {
  final char lead = text.current();
  if (!Character.isHighSurrogate(lead) || text.getIndex() >= text.getEndIndex()) {
    return 1;
  }
  // Peek one char ahead, then step back so the position is unchanged.
  final char trail = text.next();
  text.previous();
  return Character.isLowSurrogate(trail) ? 2 : 1;
}
/**
 * Steps the iterator back by one code point and returns it. If the char stepped onto is a low
 * surrogate preceded by a high surrogate, the iterator ends up on the high surrogate and the
 * combined code point is returned; otherwise the iterator moves back by a single char and that
 * char is returned.
 */
private int getPrevious() {
  final char trail = text.previous();
  if (!Character.isLowSurrogate(trail) || text.getIndex() <= text.getBeginIndex()) {
    return trail;
  }
  final char lead = text.previous();
  if (Character.isHighSurrogate(lead)) {
    return Character.toCodePoint(lead, trail);
  }
  // Not a pair after all: undo the extra step back.
  text.next();
  return trail;
}
public int previousCodePoint() { int ch1 = previous(); if (Character.isLowSurrogate((char) ch1)) { int ch2 = previous(); if (Character.isHighSurrogate((char) ch2)) { return Character.toCodePoint((char) ch2, (char) ch1); } else if (ch2 != DONE) { // unmatched trail surrogate so back out next(); } } return ch1; }
/**
 * Converts a string to a UTF8 byte array.
 *
 * <p>The string is walked char by char; a high surrogate directly followed by a low surrogate is
 * combined into one code point before being added to the builder, any other char is added on its
 * own.
 *
 * @param string string to be converted
 * @return byte array
 */
private static byte[] utf8(final String string) {
  final char[] chars = string.toCharArray();
  final int count = chars.length;
  final TokenBuilder builder = new TokenBuilder(count << 1);

  int pos = 0;
  while (pos < count) {
    final char ch = chars[pos++];
    int codePoint = ch;
    // pos now points at the char after ch; pair it up if it completes a surrogate pair.
    if (Character.isHighSurrogate(ch) && pos < count && Character.isLowSurrogate(chars[pos])) {
      codePoint = Character.toCodePoint(ch, chars[pos++]);
    }
    builder.add(codePoint);
  }
  return builder.finish();
}
/**
 * Parses a UCS-4 character, pulling the second half of a surrogate pair from the source buffer
 * when {@code c} is a high surrogate.
 *
 * <p>On success the result is stored in {@code character} and {@code error} is cleared. On
 * failure {@code error} is set to {@code UNDERFLOW} (high surrogate with no more input) or to a
 * malformed-input result of length 1 (unpaired surrogate).
 *
 * @param c The first character
 * @param in The source buffer, from which one more character will be consumed if c is a high
 *     surrogate
 * @return the parsed UCS-4 character, or -1 if parsing failed, in which case {@code error}
 *     holds the reason
 */
public int parse(char c, CharBuffer in) {
  if (Character.isLowSurrogate(c)) {
    error = CoderResult.malformedForLength(1);
    return -1;
  }

  if (!Character.isHighSurrogate(c)) {
    character = c;
    error = null;
    return character;
  }

  // c is a high surrogate: it needs a low surrogate right behind it.
  if (!in.hasRemaining()) {
    error = CoderResult.UNDERFLOW;
    return -1;
  }
  final char d = in.get();
  if (!Character.isLowSurrogate(d)) {
    error = CoderResult.malformedForLength(1);
    return -1;
  }
  character = Character.toCodePoint(c, d);
  error = null;
  return character;
}
/**
 * Adjusts entity indices for supplementary characters (Emoji being the most common example).
 * In UTF-16, characters outside of the U+0000 to U+FFFF range are represented as a pair of char
 * values, the first from the high-surrogates range and the second from the low-surrogates range.
 *
 * <p>The index of every high surrogate that is directly followed by a low surrogate is collected
 * and passed, together with the URL entities and then the media entities, to
 * {@code adjustEntitiesWithOffsets}.
 *
 * @param content The content of the tweet
 * @param formattedTweetText The formatted tweet text with entities that we need to adjust
 */
static void adjustIndicesForSupplementaryChars(
    StringBuilder content, FormattedTweetText formattedTweetText) {
  final List<Integer> pairStarts = new ArrayList<>();

  // Stop one short of the end: a pair needs a char after the high surrogate.
  final int lastIndex = content.length() - 1;
  int pos = 0;
  while (pos < lastIndex) {
    final boolean startsPair =
        Character.isHighSurrogate(content.charAt(pos))
            && Character.isLowSurrogate(content.charAt(pos + 1));
    if (startsPair) {
      pairStarts.add(pos);
    }
    pos++;
  }

  adjustEntitiesWithOffsets(formattedTweetText.urlEntities, pairStarts);
  adjustEntitiesWithOffsets(formattedTweetText.mediaEntities, pairStarts);
}
/**
 * Parses a UCS-4 character from a char array, looking at the following array element when
 * {@code c} is a high surrogate.
 *
 * <p>On success the result is stored in {@code character} and {@code error} is cleared. On
 * failure {@code error} is set to {@code UNDERFLOW} (fewer than two chars between {@code ip} and
 * {@code il}) or to a malformed-input result of length 1 (unpaired surrogate).
 *
 * @param c The first character; must equal {@code ia[ip]}
 * @param ia The input array, from which one more character will be read if c is a high
 *     surrogate
 * @param ip The input index
 * @param il The input limit
 * @return the parsed UCS-4 character, or -1 if parsing failed, in which case {@code error}
 *     holds the reason
 */
public int parse(char c, char[] ia, int ip, int il) {
  assert (ia[ip] == c);

  if (Character.isLowSurrogate(c)) {
    error = CoderResult.malformedForLength(1);
    return -1;
  }

  if (!Character.isHighSurrogate(c)) {
    character = c;
    error = null;
    return character;
  }

  // c is a high surrogate: there must be room for its partner before the limit.
  if (il - ip < 2) {
    error = CoderResult.UNDERFLOW;
    return -1;
  }
  final char d = ia[ip + 1];
  if (!Character.isLowSurrogate(d)) {
    error = CoderResult.malformedForLength(1);
    return -1;
  }
  character = Character.toCodePoint(c, d);
  error = null;
  return character;
}
/** * Gets the index of the longest NCName that is the suffix of a character sequence. * * @param s The character sequence. * @return The index of the longest suffix of the specified character sequence {@code s} that is * an NCName, or -1 if the character sequence {@code s} does not have a suffix that is an * NCName. */ public static int getNCNameSuffixIndex(CharSequence s) { // identify bnode labels and do not try to split them if (s.length() > 1 && s.charAt(0) == '_' && s.charAt(1) == ':') { return -1; } int index = -1; for (int i = s.length() - 1; i > -1; i--) { if (!Character.isLowSurrogate(s.charAt(i))) { int codePoint = Character.codePointAt(s, i); if (isNCNameStartChar(codePoint)) { index = i; } if (!isNCNameChar(codePoint)) { break; } } } return index; }
/**
 * Encodes up to {@code len} chars from {@code src}, starting at {@code sp}, into {@code dst},
 * one byte per encoded character. Chars that the single-char {@code encode} reports as
 * {@code UNMAPPABLE_ENCODING} are written as {@code repl}; a surrogate pair produces a single
 * {@code repl}.
 *
 * @param src source chars
 * @param sp index of the first char to encode
 * @param len number of chars to encode (at most {@code dst.length} chars are considered, plus
 *     the extra halves of surrogate pairs while input remains)
 * @param dst destination bytes, filled from index 0
 * @return the number of bytes written to {@code dst}
 */
public int encode(char[] src, int sp, int len, byte[] dst) {
  int written = 0;
  int limit = sp + Math.min(len, dst.length);

  while (sp < limit) {
    final char ch = src[sp++];
    final int mapped = encode(ch);

    if (mapped == UNMAPPABLE_ENCODING) {
      final boolean pair =
          Character.isHighSurrogate(ch) && sp < limit && Character.isLowSurrogate(src[sp]);
      if (pair) {
        // The pair uses two input chars but one output byte, so when the output was the
        // limiting factor one more input char can be admitted.
        if (len > dst.length) {
          limit++;
          len--;
        }
        sp++;
      }
      dst[written++] = repl;
    } else {
      dst[written++] = (byte) mapped;
    }
  }
  return written;
}
/**
 * Returns a view of the longest run of chars in {@code rawString}, beginning at {@code start},
 * for which {@code _validChars} has no bit set. The run ends at the first char whose bit is set,
 * or at the end of the string.
 *
 * <p>A high surrogate that is directly followed by a low surrogate is taken as a unit: both
 * chars are added to the run and the low surrogate is not examined on its own. (Previously the
 * low surrogate was counted once as part of the pair and once more on the next iteration, which
 * made the returned buffer one char too long per pair and could run past the end of the
 * string.)
 *
 * @param rawString the string being scanned
 * @param start index of the first char to examine
 * @return a buffer over {@code rawString} from {@code start} to the end of the run
 */
private static CharBuffer _getRawCharBuffer(String rawString, int start) {
  final int length = rawString.length();

  // Exclusive end of the run; everything in [start, end) belongs to it.
  int end = start;

  while (end < length) {
    char rawChar = rawString.charAt(end);

    if (_validChars.get(rawChar)) {
      break;
    }

    end++;

    // Keep a surrogate pair together and step over its second half.
    if (Character.isHighSurrogate(rawChar)
        && (end < length)
        && Character.isLowSurrogate(rawString.charAt(end))) {

      end++;
    }
  }

  return CharBuffer.wrap(rawString, start, end);
}
/**
 * Encodes chars from the source buffer into the destination buffer, writing four bytes per code
 * point through {@code put}. Before the first character a byte-order mark (U+FEFF) is written
 * once (tracked by {@code doneBOM}).
 *
 * <p>On every exit from the encoding loop the source position is reset to just after the last
 * fully encoded character.
 *
 * @param paramCharBuffer the chars to encode
 * @param paramByteBuffer the buffer receiving the encoded bytes
 * @return {@code UNDERFLOW} when the input is exhausted or ends on a high surrogate,
 *     {@code OVERFLOW} when fewer than four bytes remain in the destination, or a
 *     malformed-input result of length 1 for an unpaired surrogate
 */
protected CoderResult encodeLoop(CharBuffer paramCharBuffer, ByteBuffer paramByteBuffer) {
  int consumed = paramCharBuffer.position();

  if (!this.doneBOM && paramCharBuffer.hasRemaining()) {
    if (paramByteBuffer.remaining() < 4) {
      return CoderResult.OVERFLOW;
    }
    put(0xFEFF, paramByteBuffer); // byte-order mark
    this.doneBOM = true;
  }

  try {
    while (paramCharBuffer.hasRemaining()) {
      final char first = paramCharBuffer.get();

      if (Character.isSurrogate(first)) {
        // A surrogate that is not a high surrogate is a stray low surrogate.
        if (!Character.isHighSurrogate(first)) {
          return CoderResult.malformedForLength(1);
        }
        if (!paramCharBuffer.hasRemaining()) {
          return CoderResult.UNDERFLOW;
        }
        final char second = paramCharBuffer.get();
        if (!Character.isLowSurrogate(second)) {
          return CoderResult.malformedForLength(1);
        }
        if (paramByteBuffer.remaining() < 4) {
          return CoderResult.OVERFLOW;
        }
        consumed += 2;
        put(Character.toCodePoint(first, second), paramByteBuffer);
      } else {
        if (paramByteBuffer.remaining() < 4) {
          return CoderResult.OVERFLOW;
        }
        consumed++;
        put(first, paramByteBuffer);
      }
    }
    return CoderResult.UNDERFLOW;
  } finally {
    // Rewind anything read but not encoded.
    paramCharBuffer.position(consumed);
  }
}
/**
 * Returns a parser that produces a low-surrogate character.
 *
 * <p>The parser is built with {@code StreamParser.satisfy}, using
 * {@link Character#isLowSurrogate(char)} as the acceptance test.
 *
 * @param missing The error if there is no character on the stream to produce a low-surrogate
 *     character with.
 * @param sat The error if the produced character is not a low-surrogate character.
 * @return A parser that produces a low-surrogate character.
 * @see Character#isLowSurrogate(char)
 */
public static <E> Parser<Stream<Character>, Character, E> lowSurrogate(
    final F0<E> missing, final F<Character, E> sat) {
  return StreamParser.satisfy(missing, sat, Character::isLowSurrogate);
}
private void writePrimitive(PrintWriter out, Object val) throws SerializerException { if (val instanceof Tristate) { Tristate bool = (Tristate) val; if (bool == Tristate.TRUE) { out.print(BOOLEAN_TRUE); } else if (bool == Tristate.FALSE) { out.print(BOOLEAN_FALSE); } else if (bool == Tristate.UNDEFINED) { out.print(BOOLEAN_UNDEFINED); } } else if (val instanceof Double) { if (((Double) val).isInfinite() || (((Double) val).isNaN())) { LOGGER.info("Serializing infinite or NaN double as 0.0"); out.print("0.0"); } else { String string = val.toString(); if (string.endsWith(DOT_0)) { out.print(string.substring(0, string.length() - 1)); } else { out.print(string); } } } else if (val instanceof Boolean) { Boolean bool = (Boolean) val; if (bool) { out.print(BOOLEAN_TRUE); } else { out.print(BOOLEAN_FALSE); } } else if (val instanceof String) { out.print(SINGLE_QUOTE); String stringVal = (String) val; for (int i = 0; i < stringVal.length(); i++) { char c = stringVal.charAt(i); if (c == '\'') { out.print("\'\'"); } else if (c == '\\') { out.print("\\\\"); } else if (c >= 32 && c <= 126) { // ISO 8859-1 out.print(c); } else if (c < 255) { // ISO 10646 and ISO 8859-1 are the same < 255 , using ISO_8859_1 out.write( "\\X\\" + new String( Hex.encode( Charsets.ISO_8859_1 .encode(CharBuffer.wrap(new char[] {(char) c})) .array()), Charsets.UTF_8) .toUpperCase()); } else { if (useIso8859_1) { // ISO 8859-1 with -128 offset ByteBuffer encode = Charsets.ISO_8859_1.encode(new String(new char[] {(char) (c - 128)})); out.write("\\S\\" + (char) encode.get()); } else { // The following code has not been tested (2012-04-25) // Use UCS-2 or UCS-4 // TODO when multiple sequential characters should be encoded in UCS-2 or UCS-4, we // don't really need to add all those \X0\ \X2\ and \X4\ chars if (Character.isLowSurrogate(c)) { throw new SerializerException("Unexpected low surrogate range char"); } else if (Character.isHighSurrogate(c)) { // We need UCS-4, this is probably never happening if (i 
+ 1 < stringVal.length()) { char low = stringVal.charAt(i + 1); if (!Character.isLowSurrogate(low)) { throw new SerializerException( "High surrogate char should be followed by char in low surrogate range"); } try { out.write( "\\X4\\" + new String( Hex.encode( Charset.forName("UTF-32") .encode(new String(new char[] {c, low})) .array()), Charsets.UTF_8) .toUpperCase() + "\\X0\\"); } catch (UnsupportedCharsetException e) { throw new SerializerException(e); } i++; } else { throw new SerializerException( "High surrogate char should be followed by char in low surrogate range, but end of string reached"); } } else { // UCS-2 will do out.write( "\\X2\\" + new String( Hex.encode( Charsets.UTF_16BE .encode(CharBuffer.wrap(new char[] {c})) .array()), Charsets.UTF_8) .toUpperCase() + "\\X0\\"); } } } } out.print(SINGLE_QUOTE); } else if (val instanceof Enumerator) { out.print("." + val + "."); } else { out.print(val == null ? "$" : val.toString()); } }
// Backtrace from the provided position, back to the last
// time we back-traced, accumulating the resulting tokens to
// the pending list.  The pending list is then in-reverse
// (last token should be returned first).
//
// endPosData: the position entry at which the backtrace starts (its pos field
//             is the end of the span being traced).
// fromIDX:    index of the arriving arc at endPosData to start from; used to
//             index the per-arc arrays (costs, backPos, backType, ...).
//
// On return, lastBackTracePos has been advanced to endPosData.pos and both
// buffer and positions have been told (freeBefore) that everything before
// that position is no longer needed.
private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
  final int endPos = endPosData.pos;

  if (VERBOSE) {
    // NOTE: "pos" here is not the local declared further down (it is not in
    // scope yet), so it resolves to a member of the enclosing scope.
    System.out.println(
        "\n backtrace: endPos="
            + endPos
            + " pos="
            + pos
            + "; "
            + (pos - lastBackTracePos)
            + " characters; last="
            + lastBackTracePos
            + " cost="
            + endPosData.costs[fromIDX]);
  }

  // The characters of the span being traced; token offsets below are relative
  // to this array (hence the "- lastBackTracePos" adjustments).
  // NOTE(review): assumes buffer.get takes (start, length) - confirm.
  final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);

  if (dotOut != null) {
    dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
  }

  int pos = endPos;
  int bestIDX = fromIDX;

  // The compound token set aside while an alternate (2nd best) path is being
  // followed; null when no alternate path is active.
  Token altToken = null;

  // We trace backwards, so this will be the leftWordID of
  // the token after the one we are now on:
  int lastLeftWordID = -1;

  // Number of tokens added since the alternate path was started; becomes the
  // position length of altToken when the paths re-join.
  int backCount = 0;

  // TODO: sort of silly to make Token instances here; the
  // back trace has all info needed to generate the
  // token.  So, we could just directly set the attrs,
  // from the backtrace, in incrementToken w/o ever
  // creating Token; we'd have to defer calling freeBefore
  // until after the backtrace was fully "consumed" by
  // incrementToken.

  while (pos > lastBackTracePos) {
    final Position posData = positions.get(pos);
    assert bestIDX < posData.count;

    // Details of the arc arriving at pos that we are currently following:
    int backPos = posData.backPos[bestIDX];
    assert backPos >= lastBackTracePos
        : "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
    int length = pos - backPos;
    Type backType = posData.backType[bestIDX];
    int backID = posData.backID[bestIDX];
    int nextBestIDX = posData.backIndex[bestIDX];

    if (outputCompounds && searchMode && altToken == null && backType != Type.USER) {

      // In searchMode, if best path had picked a too-long
      // token, we use the "penalty" to compute the allowed
      // max cost of an alternate back-trace.  If we find an
      // alternate back trace with cost below that
      // threshold, we pursue it instead (but also output
      // the long token).

      final int penalty = computeSecondBestThreshold(backPos, pos - backPos);

      if (penalty > 0) {
        if (VERBOSE) {
          System.out.println(
              " compound="
                  + new String(buffer.get(backPos, pos - backPos))
                  + " backPos="
                  + backPos
                  + " pos="
                  + pos
                  + " penalty="
                  + penalty
                  + " cost="
                  + posData.costs[bestIDX]
                  + " bestIDX="
                  + bestIDX
                  + " lastLeftID="
                  + lastLeftWordID);
        }

        // Use the penalty to set maxCost on the 2nd best
        // segmentation:
        int maxCost = posData.costs[bestIDX] + penalty;
        if (lastLeftWordID != -1) {
          // Include the connection cost to the token that follows (already
          // emitted, since we walk backwards).
          maxCost += costs.get(getDict(backType).getRightId(backID), lastLeftWordID);
        }

        // Now, prune all too-long tokens from the graph:
        pruneAndRescore(backPos, pos, posData.backIndex[bestIDX]);

        // Finally, find 2nd best back-trace and resume
        // backtrace there:
        int leastCost = Integer.MAX_VALUE;
        int leastIDX = -1;
        for (int idx = 0; idx < posData.count; idx++) {
          int cost = posData.costs[idx];
          if (lastLeftWordID != -1) {
            cost +=
                costs.get(
                    getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
                    lastLeftWordID);
          }
          if (cost < leastCost) {
            leastCost = cost;
            leastIDX = idx;
          }
        }

        if (VERBOSE) {
          System.out.println(
              " afterPrune: "
                  + posData.count
                  + " arcs arriving; leastCost="
                  + leastCost
                  + " vs threshold="
                  + maxCost
                  + " lastLeftWordID="
                  + lastLeftWordID);
        }

        if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos) {
          // We should have pruned the altToken from the graph:
          assert posData.backPos[leastIDX] != backPos;

          // Save the current compound token, to output when
          // this alternate path joins back:
          altToken =
              new Token(
                  backID,
                  fragment,
                  backPos - lastBackTracePos,
                  length,
                  backType,
                  backPos,
                  getDict(backType));

          // Redirect our backtrace to 2nd best:
          bestIDX = leastIDX;
          nextBestIDX = posData.backIndex[bestIDX];

          backPos = posData.backPos[bestIDX];
          length = pos - backPos;
          backType = posData.backType[bestIDX];
          backID = posData.backID[bestIDX];
          backCount = 0;

        } else {
          // I think in theory it's possible there is no
          // 2nd best path, which is fine; in this case we
          // only output the compound token:
        }
      }
    }

    // Start of the current token, relative to fragment.
    final int offset = backPos - lastBackTracePos;
    assert offset >= 0;

    if (altToken != null && altToken.getPosition() >= backPos) {

      // We've backtraced to the position where the
      // compound token starts; add it now:

      // The pruning we did when we created the altToken
      // ensures that the back trace will align back with
      // the start of the altToken:
      assert altToken.getPosition() == backPos : altToken.getPosition() + " vs " + backPos;

      // NOTE: not quite right: the compound token may
      // have had all punctuation back traced so far, but
      // then the decompounded token at this position is
      // not punctuation.  In this case backCount is 0,
      // but we should maybe add the altToken anyway...?

      if (backCount > 0) {
        // The increment accounts for the token at this position, which is
        // only added further below.
        backCount++;
        altToken.setPositionLength(backCount);
        if (VERBOSE) {
          System.out.println(" add altToken=" + altToken);
        }
        pending.add(altToken);
      } else {
        // This means alt token was all punct tokens:
        if (VERBOSE) {
          System.out.println(" discard all-punctuation altToken=" + altToken);
        }
        assert discardPunctuation;
      }
      altToken = null;
    }

    final Dictionary dict = getDict(backType);

    if (backType == Type.USER) {

      // Expand the phraseID we recorded into the actual
      // segmentation:
      // (element 0 is the first word ID, the remaining elements are the
      // lengths of the individual segments)
      final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
      int wordID = wordIDAndLength[0];
      int current = 0;
      for (int j = 1; j < wordIDAndLength.length; j++) {
        final int len = wordIDAndLength[j];
        pending.add(
            new Token(
                wordID + j - 1, fragment, current + offset, len, Type.USER, current + backPos, dict));
        if (VERBOSE) {
          System.out.println(" add USER token=" + pending.get(pending.size() - 1));
        }
        current += len;
      }

      // Reverse the tokens we just added, because when we
      // serve them up from incrementToken we serve in
      // reverse:
      Collections.reverse(
          pending.subList(pending.size() - (wordIDAndLength.length - 1), pending.size()));

      backCount += wordIDAndLength.length - 1;
    } else {

      if (extendedMode && backType == Type.UNKNOWN) {
        // In EXTENDED mode we convert unknown word into
        // unigrams:
        int unigramTokenCount = 0;
        // Walk the word from its last char to its first, so the unigrams are
        // added in the same reverse order as every other token.
        for (int i = length - 1; i >= 0; i--) {
          int charLen = 1;
          if (i > 0 && Character.isLowSurrogate(fragment[offset + i])) {
            // Step back one more char so the unigram covers two chars. The
            // preceding char is not checked for being a high surrogate.
            i--;
            charLen = 2;
          }
          if (!discardPunctuation || !isPunctuation(fragment[offset + i])) {
            pending.add(
                new Token(
                    CharacterDefinition.NGRAM,
                    fragment,
                    offset + i,
                    charLen,
                    Type.UNKNOWN,
                    backPos + i,
                    unkDictionary));
            unigramTokenCount++;
          }
        }
        backCount += unigramTokenCount;

      } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
        // Punctuation is decided from the first char of the token only.
        pending.add(new Token(backID, fragment, offset, length, backType, backPos, dict));
        if (VERBOSE) {
          System.out.println(" add token=" + pending.get(pending.size() - 1));
        }
        backCount++;
      } else {
        if (VERBOSE) {
          System.out.println(
              " skip punctuation token=" + new String(fragment, offset, length));
        }
      }
    }

    // Move one token further back along the chosen path.
    lastLeftWordID = dict.getLeftId(backID);
    pos = backPos;
    bestIDX = nextBestIDX;
  }

  lastBackTracePos = endPos;

  if (VERBOSE) {
    System.out.println(" freeBefore pos=" + endPos);
  }
  // Notify the circular buffers that we are done with
  // these positions:
  buffer.freeBefore(endPos);
  positions.freeBefore(endPos);
}
// Find all concepts that match public Tuple<Integer, Map<Concept, Score>> processArticle(Reader r, PrintWriter pw) throws java.io.IOException { String currToken = null; ArrayList prevStates = new ArrayList(); int tokenPosn = 0; StringBuffer buf = new StringBuffer(); Map<Concept, Score> tokenMap = new HashMap<Concept, Score>(); // Use a 1-character buffer pushback reader PushbackReader pbr = new PushbackReader(r); try { boolean eof = skipHeader(pbr); while (!eof) { ArrayList newStates = new ArrayList(); char separator = ' '; // Read stream and build up a token boolean done = false; while (!done) { int i = pbr.read(); if (i == -1) { done = true; eof = true; } else { char c = (char) i; if (Character.isLetterOrDigit(c) || Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) { buf.append(c); } else { // FIXME: Normalize white-space separator = swallowWhiteSpace(c, pbr); done = true; } } } String token = buf.toString(); tokenPosn++; // Clear buffer buf.delete(0, buf.length()); // Match token from the root // System.out.println(tokenPosn + ". TOKEN: " + token + "; separator: <" + separator + ">"); Node match = matchString(null, token); if (match != null) { if (match._matchedConcepts != null) processMatchedConcepts( match._matchedConcepts, match._matchedString, tokenPosn, tokenMap, pw); if (!eof) { // Match the separator match = match._children.get(separator); if (match != null) newStates.add(match); } } // Match from each of the match states from previous tokens for (Object s : prevStates) { match = matchString(s, token); if (match != null) { // fixme if (match._matchedConcepts != null) processMatchedConcepts( match._matchedConcepts, match._matchedString, tokenPosn, tokenMap, pw); if (!eof) { // Match the separator match = match._children.get(separator); if (match != null) newStates.add(match); } } } // new previous states prevStates = newStates; } } finally { pbr.close(); } return new Tuple<Integer, Map<Concept, Score>>(tokenPosn, tokenMap); }