Esempio n. 1
0
 /**
  * Writes {@code value} to {@code out} as a double-quoted string literal with backslash escapes.
  *
  * <p>Quote, backslash, newline, tab, carriage return and NUL are written as two-character
  * escapes. Any other ISO control character is written as {@code \x} followed by two digits taken
  * from {@code Hex}. Well-formed surrogate pairs and every remaining character are copied
  * through unchanged.
  *
  * @param out destination writer
  * @param value text to quote
  * @throws IOException if writing to {@code out} fails
  * @throws IllegalArgumentException if {@code value} contains an unpaired surrogate; whatever was
  *     written before the offending character stays written
  */
 public static void writeString(Writer out, String value) throws IOException {
   out.write('"');
   // Reused scratch buffer for "\xHH" escapes; allocated on first use only.
   char[] hexEscape = null;
   int pos = 0;
   while (pos < value.length()) {
     final char ch = value.charAt(pos);
     if (ch == '"') {
       out.write("\\\"");
     } else if (ch == '\\') {
       out.write("\\\\");
     } else if (ch == '\n') {
       out.write("\\n");
     } else if (ch == '\t') {
       out.write("\\t");
     } else if (ch == '\r') {
       out.write("\\r");
     } else if (ch == '\0') {
       out.write("\\0");
     } else if (Character.isISOControl(ch)) {
       if (hexEscape == null) {
         hexEscape = new char[4];
         hexEscape[0] = '\\';
       }
       hexEscape[1] = 'x';
       hexEscape[3] = Hex.charAt(ch & 0xf);
       hexEscape[2] = Hex.charAt((ch >> 4) & 0xf);
       out.write(hexEscape, 0, 4);
     } else if (Character.isHighSurrogate(ch)) {
       // A high surrogate must be immediately followed by a low surrogate.
       final int trailIndex = pos + 1;
       if (trailIndex >= value.length()) {
         throw new IllegalArgumentException("high surrogate not followed by anything");
       }
       if (!Character.isLowSurrogate(value.charAt(trailIndex))) {
         throw new IllegalArgumentException("high surrogate not followed by low surrogate");
       }
       out.write(value, pos, 2);
       pos = trailIndex;
     } else if (Character.isLowSurrogate(ch)) {
       throw new IllegalArgumentException("low surrogate without preceding high surrogate");
     } else {
       // Ordinary BMP character: copy as-is.
       out.write(ch);
     }
     pos++;
   }
   out.write('"');
 }
Esempio n. 2
0
  /**
   * Scans {@code inputData} once, collecting newline positions and validating its characters.
   *
   * <p>Unpaired surrogates are reported through {@code recordSurrogateError}. ISO control
   * characters other than CR, LF and TAB are registered as {@code TTEG02} input errors and
   * overwritten with a space in {@code inputData}.
   *
   * @param inputData the text to check; may be modified in place
   * @return the indices of all {@code '\n'} characters, preceded by a leading {@code -1} entry
   * @throws SnuggleParseException declared by this method; NOTE(review): presumably raised by the
   *     error-recording helpers, confirm against their definitions
   */
  private int[] calculateNewlineIndicesAndCheckCodePoints(StringBuilder inputData)
      throws SnuggleParseException {
    List<Integer> newlineIndicesBuilder = new ArrayList<Integer>();
    newlineIndicesBuilder.add(Integer.valueOf(-1));
    char lastChar = 0;
    char thisChar; /* (16 bit char only) */
    int codePoint; /* (Full Unicode code point) */
    for (int i = 0, length = inputData.length(); i < length; i++, lastChar = thisChar) {
      thisChar = inputData.charAt(i);
      if (thisChar == '\n') {
        newlineIndicesBuilder.add(Integer.valueOf(i));
      }
      if (Character.isHighSurrogate(lastChar)) {
        if (Character.isLowSurrogate(thisChar)) {
          codePoint = Character.toCodePoint(lastChar, thisChar);
        } else {
          /* Error: last was bad surrogate character */
          recordSurrogateError(inputData, i - 1, lastChar);
          /*
           * The current character is not part of that broken pair, so it must still be
           * validated on its own; skipping it would let a forbidden control character through.
           * (It cannot be a low surrogate here; a high surrogate is handled on the next pass.)
           */
          codePoint = thisChar;
        }
      } else if (Character.isLowSurrogate(thisChar)) {
        /* Error: this is bad surrogate character */
        recordSurrogateError(inputData, i, thisChar);
        continue;
      } else {
        codePoint = thisChar;
      }
      /* Check that we allow this codepoint */
      if (Character.isISOControl(codePoint)
          && !(codePoint == '\r' || codePoint == '\n' || codePoint == '\t')) {
        sessionContext.registerError(
            new InputError(
                CoreErrorCode.TTEG02, null, Integer.toHexString(codePoint), Integer.valueOf(i)));
        inputData.setCharAt(i, ' ');
      }
    }
    /* Make sure last character wasn't surrogate pair starter */
    if (Character.isHighSurrogate(lastChar)) {
      recordSurrogateError(inputData, inputData.length() - 1, lastChar);
    }

    /* Finally store newline information */
    int[] calculatedNewlineIndices = new int[newlineIndicesBuilder.size()];
    for (int i = 0; i < calculatedNewlineIndices.length; i++) {
      calculatedNewlineIndices[i] = newlineIndicesBuilder.get(i);
    }
    return calculatedNewlineIndices;
  }
  /**
   * {@inheritDoc}
   *
   * <p>Returns the character at the current buffer position (with iteration marks normalized) and
   * advances the position by one, or returns -1 at end of input.
   */
  @Override
  public int read() throws IOException {
    final int raw = buffer.get(bufferPosition);

    if (raw == -1) {
      // Nothing left to read: release everything buffered before this position.
      buffer.freeBefore(bufferPosition);
      return raw;
    }

    final char current = (char) raw;

    // Surrogate halves move the iteration mark span end just past themselves.
    final boolean surrogateHalf =
        Character.isHighSurrogate(current) || Character.isLowSurrogate(current);
    if (surrogateHalf) {
      iterationMarkSpanEndPosition = bufferPosition + 1;
    }

    // A full stop frees the rolling buffer and also moves the span end past itself.
    if (current == FULL_STOP_PUNCTUATION) {
      buffer.freeBefore(bufferPosition);
      iterationMarkSpanEndPosition = bufferPosition + 1;
    }

    // Iteration marks are replaced by their normalized form; everything else passes through.
    final char result = isIterationMark(current) ? normalizeIterationMark(current) : current;

    bufferPosition++;
    return result;
  }
 /**
  * Adjusts a prefix length so that cutting {@code input} at it does not split a surrogate pair.
  *
  * <p>If the last char of the prefix is a high surrogate and the next char of {@code input} is
  * the matching low surrogate, the length is extended by one to keep the pair together. A length
  * of zero, and a high surrogate with no low surrogate after it, are returned unchanged.
  *
  * @param input the string being cut
  * @param len the intended prefix length
  * @return {@code len}, or {@code len + 1} when that is needed to include a complete pair
  * @throws IndexOutOfBoundsException if {@code len} is negative or greater than the input length
  */
 public static int correctSubStringLen(String input, int len) {
   if (len == 0) {
     // An empty prefix cannot split anything (and has no last char to inspect).
     return len;
   }
   final char lastInPrefix = input.charAt(len - 1);
   // Only extend when the low surrogate is really there; never return a length past the end.
   if (Character.isHighSurrogate(lastInPrefix)
       && len < input.length()
       && Character.isLowSurrogate(input.charAt(len))) {
     return len + 1;
   }
   return len;
 }
 /**
  * Tells whether {@code index} falls in the middle of a UTF-16 surrogate pair, i.e. the char
  * before it is a high surrogate and the char at it is a low surrogate.
  *
  * @param str the character sequence to inspect
  * @param index the position to test
  * @return {@code true} if the position splits a surrogate pair; {@code false} otherwise,
  *     including when {@code index} is at or outside either end of {@code str}
  */
 @VisibleForTesting
 static boolean isIndexBetweenUtf16SurrogatePair(CharSequence str, int index) {
   // Positions at the very start or at/after the end have no neighbour on one side.
   if (index <= 0 || index >= str.length()) {
     return false;
   }
   if (!Character.isHighSurrogate(str.charAt(index - 1))) {
     return false;
   }
   return Character.isLowSurrogate(str.charAt(index));
 }
Esempio n. 6
0
 /**
  * Encodes chars from {@code src} into {@code dst}, one {@code put} per code point.
  *
  * <p>On the first call with input available, {@code BOM_BIG} is emitted first. Every emit
  * requires at least four free bytes in {@code dst}. On return, the position of {@code src} is
  * set just past the last char that was actually encoded.
  *
  * @param src the chars to encode
  * @param dst the destination byte buffer
  * @return {@code OVERFLOW} if {@code dst} lacks four free bytes, {@code UNDERFLOW} when the
  *     input is exhausted (including a high surrogate at the very end), or a malformed-input
  *     result of length 1 for an unpaired surrogate
  */
 protected CoderResult encodeLoop(CharBuffer src, ByteBuffer dst) {
   // Position up to which input has been fully encoded; restored into src on every exit path.
   int consumed = src.position();
   if (!doneBOM && src.hasRemaining()) {
     if (dst.remaining() < 4) {
       return CoderResult.OVERFLOW;
     }
     put(BOM_BIG, dst);
     doneBOM = true;
   }
   try {
     while (src.hasRemaining()) {
       final char lead = src.get();
       if (Character.isSurrogate(lead)) {
         if (!Character.isHighSurrogate(lead)) {
           // Low surrogate with no preceding high surrogate.
           return CoderResult.malformedForLength(1);
         }
         if (!src.hasRemaining()) {
           // The second half of the pair may arrive with the next chunk of input.
           return CoderResult.UNDERFLOW;
         }
         final char trail = src.get();
         if (!Character.isLowSurrogate(trail)) {
           return CoderResult.malformedForLength(1);
         }
         if (dst.remaining() < 4) {
           return CoderResult.OVERFLOW;
         }
         consumed += 2;
         put(Character.toCodePoint(lead, trail), dst);
       } else {
         if (dst.remaining() < 4) {
           return CoderResult.OVERFLOW;
         }
         consumed++;
         put(lead, dst);
       }
     }
     return CoderResult.UNDERFLOW;
   } finally {
     src.position(consumed);
   }
 }
Esempio n. 7
0
 /**
  * Decodes the Unicode code point that starts at {@code index}, reading no further than
  * {@code end}.
  *
  * <p>In contrast to {@link Character#codePointAt(CharSequence, int)} and
  * {@link String#codePointAt(int)}, malformed surrogate usage is never passed through silently:
  *
  * <ul>
  *   <li>a non-surrogate char is returned as is;
  *   <li>a high surrogate followed by a low surrogate yields the combined code point;
  *   <li>a high surrogate that is the last char before {@code end} yields its <em>negated</em>
  *       value, so the caller can tell the input was cut off mid-pair;
  *   <li>a high surrogate followed by anything else, or a leading low surrogate, raises
  *       {@link IllegalArgumentException}.
  * </ul>
  *
  * @param seq the sequence of characters from which to decode the code point
  * @param index the index of the first character to decode
  * @param end the index beyond the last valid character to decode
  * @return the decoded code point, or the negated value of a trailing high surrogate
  * @throws IndexOutOfBoundsException if {@code index >= end}
  * @throws IllegalArgumentException if the surrogates at {@code index} are malformed
  */
 protected static final int codePointAt(CharSequence seq, int index, int end) {
   if (index >= end) {
     throw new IndexOutOfBoundsException("Index exceeds specified range");
   }

   final char first = seq.charAt(index);
   if (!Character.isHighSurrogate(first) && !Character.isLowSurrogate(first)) {
     // By far the most common case: a plain BMP character.
     return first;
   }

   if (Character.isLowSurrogate(first)) {
     throw new IllegalArgumentException(
         "Unexpected low surrogate character '"
             + first
             + "' with value "
             + (int) first
             + " at index "
             + index);
   }

   final int secondIndex = index + 1;
   if (secondIndex == end) {
     // Dangling high surrogate at the end of the range: signal it with the negated value.
     return -first;
   }

   final char second = seq.charAt(secondIndex);
   if (!Character.isLowSurrogate(second)) {
     throw new IllegalArgumentException(
         "Expected low surrogate but got char '"
             + second
             + "' with value "
             + (int) second
             + " at index "
             + secondIndex);
   }
   return Character.toCodePoint(first, second);
 }
  /**
   * Tries to turn the composed escape text in {@code buffer} into the character(s) it denotes.
   *
   * <p>Three shapes are accepted, selected by buffer length and the current {@code format}: a
   * 6-char form (any format except {@code SPECIAL_ESCAPE}) yielding one char, an 8-char
   * {@code SPECIAL_ESCAPE} form yielding one code point, and a 12-char {@code SURROGATE_PAIR}
   * form yielding a high/low surrogate pair. On success the buffer is replaced by the decoded
   * text and committed; in every other case {@code beep()} is called.
   */
  private void finishComposition() {
    switch (buffer.length()) {
      case 6:
        if (format != SPECIAL_ESCAPE) {
          final char decoded = (char) getCodePoint(buffer, 2, 5);
          if (Character.isValidCodePoint(decoded) && decoded != 0xFFFF) {
            buffer.setLength(0);
            buffer.append(decoded);
            sendCommittedText();
            return;
          }
        }
        break;

      case 8:
        if (format == SPECIAL_ESCAPE) {
          final int decoded = getCodePoint(buffer, 2, 7);
          if (Character.isValidCodePoint(decoded) && decoded != 0xFFFF) {
            buffer.setLength(0);
            buffer.appendCodePoint(decoded);
            sendCommittedText();
            return;
          }
        }
        break;

      case 12:
        if (format == SURROGATE_PAIR) {
          // Both halves are decoded up front, before either is validated.
          final char[] pair = {
            (char) getCodePoint(buffer, 2, 5), (char) getCodePoint(buffer, 8, 11)
          };
          if (Character.isHighSurrogate(pair[0]) && Character.isLowSurrogate(pair[1])) {
            buffer.setLength(0);
            buffer.append(pair);
            sendCommittedText();
            return;
          }
        }
        break;

      default:
        break;
    }

    // Nothing could be committed.
    beep();
  }
 /**
  * Returns the code point at the iterator's current position, combining a high surrogate with
  * the low surrogate that follows it. The iterator position is left where it was.
  */
 int getCurrent() {
   final char lead = text.current();
   if (!Character.isHighSurrogate(lead) || text.getIndex() >= text.getEndIndex()) {
     return (int) lead;
   }
   // Peek at the following char, then step back again.
   final char trail = text.next();
   text.previous();
   return Character.isLowSurrogate(trail) ? Character.toCodePoint(lead, trail) : (int) lead;
 }
Esempio n. 10
0
 /**
  * Returns how many chars the code point at the current position occupies: 2 for a high
  * surrogate followed by a low surrogate, otherwise 1. The iterator position is left unchanged.
  */
 private int getCurrentCodePointCount() {
   final char lead = text.current();
   if (!Character.isHighSurrogate(lead) || text.getIndex() >= text.getEndIndex()) {
     return 1;
   }
   // Peek at the following char, then step back again.
   final char trail = text.next();
   text.previous();
   return Character.isLowSurrogate(trail) ? 2 : 1;
 }
Esempio n. 11
0
 /**
  * Moves the iterator back by one code point and returns it. A low surrogate preceded by a high
  * surrogate is consumed as one pair; an unpaired low surrogate is returned on its own.
  */
 private int getPrevious() {
   final char trail = text.previous();
   if (!Character.isLowSurrogate(trail) || text.getIndex() <= text.getBeginIndex()) {
     return (int) trail;
   }
   final char lead = text.previous();
   if (Character.isHighSurrogate(lead)) {
     return Character.toCodePoint(lead, trail);
   }
   // Not a pair after all: undo the extra step back.
   text.next();
   return (int) trail;
 }
Esempio n. 12
0
 /**
  * Steps back by one code point and returns it.
  *
  * <p>If the unit obtained from {@code previous()} is a low surrogate and the unit before it is
  * a high surrogate, both are consumed and the combined code point is returned. Otherwise only
  * the first unit is consumed and returned as is.
  *
  * @return the code point stepped over, or whatever {@code previous()} returned
  */
 public int previousCodePoint() {
   final int trail = previous();
   if (!Character.isLowSurrogate((char) trail)) {
     return trail;
   }
   final int lead = previous();
   if (Character.isHighSurrogate((char) lead)) {
     return Character.toCodePoint((char) lead, (char) trail);
   }
   if (lead != DONE) {
     // Unmatched trail surrogate: undo the second step back.
     next();
   }
   return trail;
 }
Esempio n. 13
0
 /**
  * Converts a string to a UTF8 byte array.
  *
  * <p>The string is walked code point by code point: a high surrogate directly followed by a low
  * surrogate is added as one combined code point, and every other char is added by itself.
  * NOTE(review): the actual byte encoding is done by {@code TokenBuilder}; confirm there.
  *
  * @param string string to be converted
  * @return byte array
  */
 private static byte[] utf8(final String string) {
   final int length = string.length();
   final TokenBuilder tb = new TokenBuilder(length << 1);
   int pos = 0;
   while (pos < length) {
     final char lead = string.charAt(pos++);
     int codePoint = lead;
     if (Character.isHighSurrogate(lead) && pos < length) {
       final char trail = string.charAt(pos);
       if (Character.isLowSurrogate(trail)) {
         codePoint = Character.toCodePoint(lead, trail);
         pos++;
       }
     }
     tb.add(codePoint);
   }
   return tb.finish();
 }
Esempio n. 14
0
 /**
  * Parses one code point starting with {@code c}, pulling the second half of a surrogate pair
  * from {@code in} when needed.
  *
  * <p>On success the code point is stored in {@code character} and {@code error} is cleared. On
  * failure {@code error} is set to {@code UNDERFLOW} (high surrogate with no more input) or to a
  * malformed-input result of length 1 (unpaired surrogate).
  *
  * @param c the first character
  * @param in the source buffer, from which one more character is consumed if {@code c} is a
  *     high surrogate and input remains
  * @return the parsed code point, or -1 if {@code error} has been set
  */
 public int parse(char c, CharBuffer in) {
   if (!Character.isHighSurrogate(c)) {
     if (Character.isLowSurrogate(c)) {
       // Low surrogate without a leading high surrogate.
       error = CoderResult.malformedForLength(1);
       return -1;
     }
     character = c;
     error = null;
     return character;
   }

   if (!in.hasRemaining()) {
     error = CoderResult.UNDERFLOW;
     return -1;
   }

   final char trail = in.get();
   if (!Character.isLowSurrogate(trail)) {
     error = CoderResult.malformedForLength(1);
     return -1;
   }
   character = Character.toCodePoint(c, trail);
   error = null;
   return character;
 }
  /**
   * Adjusts entity indices for supplementary characters (Emoji being the most common example).
   * In UTF-16, characters outside the U+0000 to U+FFFF range are represented as a pair of char
   * values, the first from the high-surrogates range and the second from the low-surrogates
   * range.
   *
   * <p>The index of the high surrogate of every well-formed pair in {@code content} is collected
   * and handed to {@code adjustEntitiesWithOffsets}, first for the URL entities and then for the
   * media entities.
   *
   * @param content The content of the tweet
   * @param formattedTweetText The formatted tweet text with entities that we need to adjust
   */
  static void adjustIndicesForSupplementaryChars(
      StringBuilder content, FormattedTweetText formattedTweetText) {
    final List<Integer> highSurrogateIndices = new ArrayList<>();
    final int length = content.length();
    // Look at each adjacent (previous, current) pair of chars.
    for (int next = 1; next < length; next++) {
      final int current = next - 1;
      final boolean pairStartsHere =
          Character.isHighSurrogate(content.charAt(current))
              && Character.isLowSurrogate(content.charAt(next));
      if (pairStartsHere) {
        highSurrogateIndices.add(current);
      }
    }

    adjustEntitiesWithOffsets(formattedTweetText.urlEntities, highSurrogateIndices);
    adjustEntitiesWithOffsets(formattedTweetText.mediaEntities, highSurrogateIndices);
  }
Esempio n. 16
0
 /**
  * Parses one code point starting with {@code c} from a char array, looking at the following
  * array element when {@code c} is a high surrogate.
  *
  * <p>On success the code point is stored in {@code character} and {@code error} is cleared. On
  * failure {@code error} is set to {@code UNDERFLOW} (fewer than two chars left before the
  * limit) or to a malformed-input result of length 1 (unpaired surrogate).
  *
  * @param c the first character; must equal {@code ia[ip]}
  * @param ia the input array
  * @param ip the input index of {@code c}
  * @param il the input limit
  * @return the parsed code point, or -1 if {@code error} has been set
  */
 public int parse(char c, char[] ia, int ip, int il) {
   assert (ia[ip] == c);
   if (!Character.isHighSurrogate(c)) {
     if (Character.isLowSurrogate(c)) {
       // Low surrogate without a leading high surrogate.
       error = CoderResult.malformedForLength(1);
       return -1;
     }
     character = c;
     error = null;
     return character;
   }

   if (il - ip < 2) {
     // The pair's second half lies beyond the limit.
     error = CoderResult.UNDERFLOW;
     return -1;
   }

   final char trail = ia[ip + 1];
   if (!Character.isLowSurrogate(trail)) {
     error = CoderResult.malformedForLength(1);
     return -1;
   }
   character = Character.toCodePoint(c, trail);
   error = null;
   return character;
 }
Esempio n. 17
0
 /**
  * Finds where the longest NCName suffix of a character sequence begins.
  *
  * <p>The sequence is scanned backwards code point by code point. The scan stops at the first
  * code point that is not an NCName character; the result is the left-most position seen up to
  * and including that point whose code point is an NCName start character. Sequences starting
  * with {@code "_:"} (blank node labels) are never split.
  *
  * @param s The character sequence.
  * @return The index of the longest suffix of {@code s} that is an NCName, or -1 if {@code s}
  *     has no such suffix.
  */
 public static int getNCNameSuffixIndex(CharSequence s) {
   final int length = s.length();
   // Blank node labels are left intact.
   if (length > 1 && s.charAt(0) == '_' && s.charAt(1) == ':') {
     return -1;
   }
   int suffixStart = -1;
   int pos = length;
   while (--pos >= 0) {
     if (Character.isLowSurrogate(s.charAt(pos))) {
       // Second half of a pair; it is decoded together with the preceding char.
       continue;
     }
     final int codePoint = Character.codePointAt(s, pos);
     if (isNCNameStartChar(codePoint)) {
       suffixStart = pos;
     }
     if (!isNCNameChar(codePoint)) {
       break;
     }
   }
   return suffixStart;
 }
 /**
  * Encodes chars from {@code src} into single bytes in {@code dst}.
  *
  * <p>Each char is mapped through {@code encode(char)}. A char that cannot be mapped produces
  * the replacement byte {@code repl}; a high surrogate directly followed by a low surrogate
  * produces one replacement byte for the whole pair.
  *
  * @param src source chars
  * @param sp index of the first char to encode
  * @param len number of chars available for encoding
  * @param dst destination array; at most {@code dst.length} bytes are produced
  * @return the number of bytes written to {@code dst}
  */
 public int encode(char[] src, int sp, int len, byte[] dst) {
   int written = 0;
   int limit = sp + Math.min(len, dst.length);
   while (sp < limit) {
     final char ch = src[sp++];
     final int mapped = encode(ch);
     if (mapped == UNMAPPABLE_ENCODING) {
       final boolean startsPair =
           Character.isHighSurrogate(ch) && sp < limit && Character.isLowSurrogate(src[sp]);
       if (startsPair) {
         if (len > dst.length) {
           // The pair uses two input chars but one output byte, so when the output size was
           // the limiting factor one more input char may be read.
           limit++;
           len--;
         }
         sp++;
       }
       dst[written++] = repl;
     } else {
       dst[written++] = (byte) mapped;
     }
   }
   return written;
 }
Esempio n. 19
0
  /**
   * Returns a view of the run of consecutive chars in {@code rawString}, beginning at
   * {@code start}, that are not marked in {@code _validChars}.
   *
   * <p>A high surrogate immediately followed by a low surrogate is taken as one unit, so the run
   * never ends in the middle of a surrogate pair.
   *
   * @param rawString the string to scan
   * @param start the index at which the run begins
   * @return a buffer over {@code rawString} from {@code start} to the end of the run (empty if
   *     the char at {@code start} is valid or {@code start} is the end of the string)
   */
  private static CharBuffer _getRawCharBuffer(String rawString, int start) {
    final int length = rawString.length();
    int end = start;

    while (end < length) {
      char rawChar = rawString.charAt(end);

      if (_validChars.get(rawChar)) {
        break;
      }

      end++;

      // Step over the low surrogate together with its high surrogate. Without this the low
      // surrogate is examined again on the next pass and counted a second time, which makes
      // the run one char too long (or pushes it past the end of the string).
      if (Character.isHighSurrogate(rawChar)
          && (end < length)
          && Character.isLowSurrogate(rawString.charAt(end))) {

        end++;
      }
    }

    return CharBuffer.wrap(rawString, start, end);
  }
Esempio n. 20
0
 /**
  * Encodes chars from the char buffer into the byte buffer, one {@code put} per code point.
  *
  * <p>On the first call with input available, the value 0xFEFF is emitted first. Every emit
  * requires at least four free bytes in the byte buffer. On return, the position of the char
  * buffer is set just past the last char that was actually encoded.
  *
  * @return {@code OVERFLOW} if the byte buffer lacks four free bytes, {@code UNDERFLOW} when the
  *     input is exhausted (including a high surrogate at the very end), or a malformed-input
  *     result of length 1 for an unpaired surrogate
  */
 protected CoderResult encodeLoop(CharBuffer paramCharBuffer, ByteBuffer paramByteBuffer) {
   // Position up to which input has been fully encoded; restored on every exit path.
   int consumed = paramCharBuffer.position();
   if (!this.doneBOM && paramCharBuffer.hasRemaining()) {
     if (paramByteBuffer.remaining() < 4) {
       return CoderResult.OVERFLOW;
     }
     put(0xFEFF, paramByteBuffer);
     this.doneBOM = true;
   }
   try {
     while (paramCharBuffer.hasRemaining()) {
       final char lead = paramCharBuffer.get();
       if (Character.isSurrogate(lead)) {
         if (!Character.isHighSurrogate(lead)) {
           // Low surrogate with no preceding high surrogate.
           return CoderResult.malformedForLength(1);
         }
         if (!paramCharBuffer.hasRemaining()) {
           // The second half of the pair may arrive with the next chunk of input.
           return CoderResult.UNDERFLOW;
         }
         final char trail = paramCharBuffer.get();
         if (!Character.isLowSurrogate(trail)) {
           return CoderResult.malformedForLength(1);
         }
         if (paramByteBuffer.remaining() < 4) {
           return CoderResult.OVERFLOW;
         }
         consumed += 2;
         put(Character.toCodePoint(lead, trail), paramByteBuffer);
       } else {
         if (paramByteBuffer.remaining() < 4) {
           return CoderResult.OVERFLOW;
         }
         consumed++;
         put(lead, paramByteBuffer);
       }
     }
     return CoderResult.UNDERFLOW;
   } finally {
     paramCharBuffer.position(consumed);
   }
 }
Esempio n. 21
0
 /**
  * Builds a parser that accepts a single low-surrogate character from the stream.
  *
  * @param missing supplies the error used when the stream has no character left
  * @param sat supplies the error used when the next character is not a low surrogate
  * @return a parser that produces a low-surrogate character
  * @see Character#isLowSurrogate(char)
  */
 public static <E> Parser<Stream<Character>, Character, E> lowSurrogate(
     final F0<E> missing, final F<Character, E> sat) {
   return StreamParser.satisfy(missing, sat, Character::isLowSurrogate);
 }
Esempio n. 22
0
  /**
   * Prints a single primitive value to {@code out} in its serialized text form.
   *
   * <ul>
   *   <li>{@code Tristate} and {@code Boolean} values are printed as the corresponding
   *       {@code BOOLEAN_*} constants.
   *   <li>{@code Double} values are printed via {@code toString()}, with a result ending in
   *       {@code DOT_0} shortened by its last character; infinite and NaN values are logged and
   *       printed as {@code 0.0}.
   *   <li>{@code String} values are wrapped in {@code SINGLE_QUOTE}. Quotes and backslashes are
   *       doubled, chars 32..126 are printed as is, other chars below 255 become a {@code \X\}
   *       hex escape, and all remaining chars become either a {@code \S\} escape (when
   *       {@code useIso8859_1} is set) or a {@code \X2\}/{@code \X4\} hex escape terminated by
   *       {@code \X0\}.
   *   <li>{@code Enumerator} values are printed between dots.
   *   <li>{@code null} is printed as {@code $}; anything else via {@code toString()}.
   * </ul>
   *
   * @param out the writer to print to
   * @param val the value to print; may be {@code null}
   * @throws SerializerException if a string contains an unpaired surrogate, or the UTF-32 charset
   *     is unavailable
   */
  private void writePrimitive(PrintWriter out, Object val) throws SerializerException {
    if (val instanceof Tristate) {
      Tristate bool = (Tristate) val;
      if (bool == Tristate.TRUE) {
        out.print(BOOLEAN_TRUE);
      } else if (bool == Tristate.FALSE) {
        out.print(BOOLEAN_FALSE);
      } else if (bool == Tristate.UNDEFINED) {
        out.print(BOOLEAN_UNDEFINED);
      }
    } else if (val instanceof Double) {
      if (((Double) val).isInfinite() || (((Double) val).isNaN())) {
        LOGGER.info("Serializing infinite or NaN double as 0.0");
        out.print("0.0");
      } else {
        String string = val.toString();
        if (string.endsWith(DOT_0)) {
          // Drop the last character of the DOT_0 suffix.
          out.print(string.substring(0, string.length() - 1));
        } else {
          out.print(string);
        }
      }
    } else if (val instanceof Boolean) {
      Boolean bool = (Boolean) val;
      if (bool) {
        out.print(BOOLEAN_TRUE);
      } else {
        out.print(BOOLEAN_FALSE);
      }
    } else if (val instanceof String) {
      out.print(SINGLE_QUOTE);
      String stringVal = (String) val;
      for (int i = 0; i < stringVal.length(); i++) {
        char c = stringVal.charAt(i);
        if (c == '\'') {
          out.print("\'\'");
        } else if (c == '\\') {
          out.print("\\\\");
        } else if (c >= 32 && c <= 126) {
          // ISO 8859-1
          out.print(c);
        } else if (c < 255) {
          //  ISO 10646 and ISO 8859-1 are the same < 255 , using ISO_8859_1
          out.write(
              "\\X\\"
                  + new String(
                          Hex.encode(
                              Charsets.ISO_8859_1
                                  .encode(CharBuffer.wrap(new char[] {(char) c}))
                                  .array()),
                          Charsets.UTF_8)
                      .toUpperCase());
        } else {
          if (useIso8859_1) {
            // ISO 8859-1 with -128 offset
            ByteBuffer encode =
                Charsets.ISO_8859_1.encode(new String(new char[] {(char) (c - 128)}));
            out.write("\\S\\" + (char) encode.get());
          } else {
            // The following code has not been tested (2012-04-25)
            // Use UCS-2 or UCS-4

            // TODO when multiple sequential characters should be encoded in UCS-2 or UCS-4, we
            // don't really need to add all those \X0\ \X2\ and \X4\ chars
            if (Character.isLowSurrogate(c)) {
              throw new SerializerException("Unexpected low surrogate range char");
            } else if (Character.isHighSurrogate(c)) {
              // We need UCS-4, this is probably never happening
              if (i + 1 < stringVal.length()) {
                char low = stringVal.charAt(i + 1);
                if (!Character.isLowSurrogate(low)) {
                  throw new SerializerException(
                      "High surrogate char should be followed by char in low surrogate range");
                }
                try {
                  // The buffer returned by Charset.encode is sized from an estimate based on the
                  // number of input chars (two here), so its backing array can be longer than
                  // the bytes actually produced for the single code point. Copy only the encoded
                  // bytes; hex-encoding array() directly would append spurious zero bytes.
                  ByteBuffer ucs4 =
                      Charset.forName("UTF-32").encode(new String(new char[] {c, low}));
                  byte[] ucs4Bytes = new byte[ucs4.remaining()];
                  ucs4.get(ucs4Bytes);
                  out.write(
                      "\\X4\\"
                          + new String(Hex.encode(ucs4Bytes), Charsets.UTF_8).toUpperCase()
                          + "\\X0\\");
                } catch (UnsupportedCharsetException e) {
                  throw new SerializerException(e);
                }
                // The low surrogate has been consumed as part of the pair.
                i++;
              } else {
                throw new SerializerException(
                    "High surrogate char should be followed by char in low surrogate range, but end of string reached");
              }
            } else {
              // UCS-2 will do
              out.write(
                  "\\X2\\"
                      + new String(
                              Hex.encode(
                                  Charsets.UTF_16BE
                                      .encode(CharBuffer.wrap(new char[] {c}))
                                      .array()),
                              Charsets.UTF_8)
                          .toUpperCase()
                      + "\\X0\\");
            }
          }
        }
      }
      out.print(SINGLE_QUOTE);
    } else if (val instanceof Enumerator) {
      out.print("." + val + ".");
    } else {
      out.print(val == null ? "$" : val.toString());
    }
  }
Esempio n. 23
0
  // Backtrace from the provided position, back to the last
  // time we back-traced, accumulating the resulting tokens to
  // the pending list.  The pending list is then in-reverse
  // (last token should be returned first).
  private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
    final int endPos = endPosData.pos;

    if (VERBOSE) {
      System.out.println(
          "\n  backtrace: endPos="
              + endPos
              + " pos="
              + pos
              + "; "
              + (pos - lastBackTracePos)
              + " characters; last="
              + lastBackTracePos
              + " cost="
              + endPosData.costs[fromIDX]);
    }

    final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);

    if (dotOut != null) {
      dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
    }

    int pos = endPos;
    int bestIDX = fromIDX;
    Token altToken = null;

    // We trace backwards, so this will be the leftWordID of
    // the token after the one we are now on:
    int lastLeftWordID = -1;

    int backCount = 0;

    // TODO: sort of silly to make Token instances here; the
    // back trace has all info needed to generate the
    // token.  So, we could just directly set the attrs,
    // from the backtrace, in incrementToken w/o ever
    // creating Token; we'd have to defer calling freeBefore
    // until after the backtrace was fully "consumed" by
    // incrementToken.

    while (pos > lastBackTracePos) {
      // System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
      final Position posData = positions.get(pos);
      assert bestIDX < posData.count;

      int backPos = posData.backPos[bestIDX];
      assert backPos >= lastBackTracePos
          : "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
      int length = pos - backPos;
      Type backType = posData.backType[bestIDX];
      int backID = posData.backID[bestIDX];
      int nextBestIDX = posData.backIndex[bestIDX];

      if (outputCompounds && searchMode && altToken == null && backType != Type.USER) {

        // In searchMode, if best path had picked a too-long
        // token, we use the "penalty" to compute the allowed
        // max cost of an alternate back-trace.  If we find an
        // alternate back trace with cost below that
        // threshold, we pursue it instead (but also output
        // the long token).
        // System.out.println("    2nd best backPos=" + backPos + " pos=" + pos);

        final int penalty = computeSecondBestThreshold(backPos, pos - backPos);

        if (penalty > 0) {
          if (VERBOSE) {
            System.out.println(
                "  compound="
                    + new String(buffer.get(backPos, pos - backPos))
                    + " backPos="
                    + backPos
                    + " pos="
                    + pos
                    + " penalty="
                    + penalty
                    + " cost="
                    + posData.costs[bestIDX]
                    + " bestIDX="
                    + bestIDX
                    + " lastLeftID="
                    + lastLeftWordID);
          }

          // Use the penalty to set maxCost on the 2nd best
          // segmentation:
          int maxCost = posData.costs[bestIDX] + penalty;
          if (lastLeftWordID != -1) {
            maxCost += costs.get(getDict(backType).getRightId(backID), lastLeftWordID);
          }

          // Now, prune all too-long tokens from the graph:
          pruneAndRescore(backPos, pos, posData.backIndex[bestIDX]);

          // Finally, find 2nd best back-trace and resume
          // backtrace there:
          int leastCost = Integer.MAX_VALUE;
          int leastIDX = -1;
          for (int idx = 0; idx < posData.count; idx++) {
            int cost = posData.costs[idx];
            // System.out.println("    idx=" + idx + " prevCost=" + cost);

            if (lastLeftWordID != -1) {
              cost +=
                  costs.get(
                      getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
                      lastLeftWordID);
              // System.out.println("      += bgCost=" +
              // costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
              // lastLeftWordID) + " -> " + cost);
            }
            // System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
            // cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
            if (cost < leastCost) {
              // System.out.println("      ** ");
              leastCost = cost;
              leastIDX = idx;
            }
          }
          // System.out.println("  leastIDX=" + leastIDX);

          if (VERBOSE) {
            System.out.println(
                "  afterPrune: "
                    + posData.count
                    + " arcs arriving; leastCost="
                    + leastCost
                    + " vs threshold="
                    + maxCost
                    + " lastLeftWordID="
                    + lastLeftWordID);
          }

          if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos) {
            // We should have pruned the altToken from the graph:
            assert posData.backPos[leastIDX] != backPos;

            // Save the current compound token, to output when
            // this alternate path joins back:
            altToken =
                new Token(
                    backID,
                    fragment,
                    backPos - lastBackTracePos,
                    length,
                    backType,
                    backPos,
                    getDict(backType));

            // Redirect our backtrace to 2nd best:
            bestIDX = leastIDX;
            nextBestIDX = posData.backIndex[bestIDX];

            backPos = posData.backPos[bestIDX];
            length = pos - backPos;
            backType = posData.backType[bestIDX];
            backID = posData.backID[bestIDX];
            backCount = 0;
            // System.out.println("  do alt token!");

          } else {
            // I think in theory it's possible there is no
            // 2nd best path, which is fine; in this case we
            // only output the compound token:
            // System.out.println("  no alt token! bestIDX=" + bestIDX);
          }
        }
      }

      final int offset = backPos - lastBackTracePos;
      assert offset >= 0;

      if (altToken != null && altToken.getPosition() >= backPos) {

        // We've backtraced to the position where the
        // compound token starts; add it now:

        // The pruning we did when we created the altToken
        // ensures that the back trace will align back with
        // the start of the altToken:
        assert altToken.getPosition() == backPos : altToken.getPosition() + " vs " + backPos;

        // NOTE: not quite right: the compound token may
        // have had all punctuation back traced so far, but
        // then the decompounded token at this position is
        // not punctuation.  In this case backCount is 0,
        // but we should maybe add the altToken anyway...?

        if (backCount > 0) {
          backCount++;
          altToken.setPositionLength(backCount);
          if (VERBOSE) {
            System.out.println("    add altToken=" + altToken);
          }
          pending.add(altToken);
        } else {
          // This means alt token was all punct tokens:
          if (VERBOSE) {
            System.out.println("    discard all-punctuation altToken=" + altToken);
          }
          assert discardPunctuation;
        }
        altToken = null;
      }

      final Dictionary dict = getDict(backType);

      if (backType == Type.USER) {

        // Expand the phraseID we recorded into the actual
        // segmentation:
        final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
        int wordID = wordIDAndLength[0];
        int current = 0;
        for (int j = 1; j < wordIDAndLength.length; j++) {
          final int len = wordIDAndLength[j];
          // System.out.println("    add user: len=" + len);
          pending.add(
              new Token(
                  wordID + j - 1,
                  fragment,
                  current + offset,
                  len,
                  Type.USER,
                  current + backPos,
                  dict));
          if (VERBOSE) {
            System.out.println("    add USER token=" + pending.get(pending.size() - 1));
          }
          current += len;
        }

        // Reverse the tokens we just added, because when we
        // serve them up from incrementToken we serve in
        // reverse:
        Collections.reverse(
            pending.subList(pending.size() - (wordIDAndLength.length - 1), pending.size()));

        backCount += wordIDAndLength.length - 1;
      } else {

        if (extendedMode && backType == Type.UNKNOWN) {
          // In EXTENDED mode we convert unknown word into
          // unigrams:
          int unigramTokenCount = 0;
          for (int i = length - 1; i >= 0; i--) {
            int charLen = 1;
            if (i > 0 && Character.isLowSurrogate(fragment[offset + i])) {
              i--;
              charLen = 2;
            }
            // System.out.println("    extended tok offset="
            // + (offset + i));
            if (!discardPunctuation || !isPunctuation(fragment[offset + i])) {
              pending.add(
                  new Token(
                      CharacterDefinition.NGRAM,
                      fragment,
                      offset + i,
                      charLen,
                      Type.UNKNOWN,
                      backPos + i,
                      unkDictionary));
              unigramTokenCount++;
            }
          }
          backCount += unigramTokenCount;

        } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
          pending.add(new Token(backID, fragment, offset, length, backType, backPos, dict));
          if (VERBOSE) {
            System.out.println("    add token=" + pending.get(pending.size() - 1));
          }
          backCount++;
        } else {
          if (VERBOSE) {
            System.out.println(
                "    skip punctuation token=" + new String(fragment, offset, length));
          }
        }
      }

      lastLeftWordID = dict.getLeftId(backID);
      pos = backPos;
      bestIDX = nextBestIDX;
    }

    lastBackTracePos = endPos;

    if (VERBOSE) {
      System.out.println("  freeBefore pos=" + endPos);
    }
    // Notify the circular buffers that we are done with
    // these positions:
    buffer.freeBefore(endPos);
    positions.freeBefore(endPos);
  }
Esempio n. 24
0
  /**
   * Tokenizes an article and looks up every token sequence in the concept trie.
   *
   * <p>Characters are read one at a time. Letters, digits and surrogate code units are
   * accumulated into the current token; any other character ends the token and is handed to
   * {@code swallowWhiteSpace}, whose result is used as the separator following the token. Each
   * token is matched from the root (a {@code null} start state) and from every state carried
   * over from the previous token. A state survives to the next token only if it has a child
   * keyed by the separator.
   *
   * <p>The token counter is incremented once per loop iteration, including iterations that
   * produce an empty token (for example two consecutive separator characters, or input ending
   * right after a separator).
   *
   * @param r source of the article text; it is wrapped in a {@link PushbackReader} that is
   *     closed before this method returns, which also closes {@code r}
   * @param pw passed through unchanged to {@code processMatchedConcepts}
   * @return a tuple holding the number of tokens read and the concept-to-score map that was
   *     handed to {@code processMatchedConcepts} for every match (presumably populated there;
   *     confirm against that method)
   * @throws java.io.IOException if reading from or closing the reader fails
   */
  public Tuple<Integer, Map<Concept, Score>> processArticle(Reader r, PrintWriter pw)
      throws java.io.IOException {
    // Trie states still alive after the previous token and its separator.
    ArrayList<Node> prevStates = new ArrayList<Node>();
    int tokenPosn = 0;

    // Scratch buffer for the token being assembled; purely local, so no synchronization needed.
    StringBuilder buf = new StringBuilder();
    Map<Concept, Score> tokenMap = new HashMap<Concept, Score>();

    // Use a 1-character buffer pushback reader
    PushbackReader pbr = new PushbackReader(r);

    try {
      // skipHeader's boolean result is used as the initial end-of-input flag.
      boolean eof = skipHeader(pbr);
      while (!eof) {
        ArrayList<Node> newStates = new ArrayList<Node>();
        // Default separator, used when the token is terminated by end of input.
        char separator = ' ';

        // Read stream and build up a token
        boolean done = false;
        while (!done) {
          int i = pbr.read();
          if (i == -1) {
            done = true;
            eof = true;
          } else {
            char c = (char) i;
            // Surrogate halves are kept so supplementary characters stay inside the token.
            if (Character.isLetterOrDigit(c)
                || Character.isHighSurrogate(c)
                || Character.isLowSurrogate(c)) {
              buf.append(c);
            } else {
              // FIXME: Normalize white-space
              separator = swallowWhiteSpace(c, pbr);
              done = true;
            }
          }
        }
        String token = buf.toString();
        tokenPosn++;

        // Clear buffer for the next token
        buf.setLength(0);

        // Match token from the root
        recordMatchAndAdvance(
            matchString(null, token), separator, eof, tokenPosn, tokenMap, pw, newStates);

        // Match from each of the match states from previous tokens
        for (Node state : prevStates) {
          recordMatchAndAdvance(
              matchString(state, token), separator, eof, tokenPosn, tokenMap, pw, newStates);
        }

        // new previous states
        prevStates = newStates;
      }
    } finally {
      pbr.close();
    }

    return new Tuple<Integer, Map<Concept, Score>>(tokenPosn, tokenMap);
  }

  /**
   * Reports the concepts attached to {@code match}, if any, and, unless input is exhausted,
   * follows its {@code separator} child and queues that child in {@code newStates}.
   *
   * @param match state reached after matching the current token, or {@code null} for no match
   * @param separator separator character that followed the token
   * @param eof whether end of input was reached while reading the token
   * @param tokenPosn 1-based position of the current token
   * @param tokenMap map handed to {@code processMatchedConcepts}
   * @param pw writer handed to {@code processMatchedConcepts}
   * @param newStates receives the state to continue from on the next token, when one exists
   */
  private void recordMatchAndAdvance(
      Node match,
      char separator,
      boolean eof,
      int tokenPosn,
      Map<Concept, Score> tokenMap,
      PrintWriter pw,
      ArrayList<Node> newStates) {
    if (match == null) {
      return;
    }
    if (match._matchedConcepts != null) {
      processMatchedConcepts(
          match._matchedConcepts, match._matchedString, tokenPosn, tokenMap, pw);
    }
    if (!eof) {
      // Match the separator
      Node next = match._children.get(separator);
      if (next != null) {
        newStates.add(next);
      }
    }
  }