コード例 #1
0
  /**
   * Reads one randomly chosen line from the corpus buffer {@code mapBuf}.
   *
   * <p>A random byte offset is picked and the buffer is scanned forward until a {@code '\n'} byte
   * is found. The bytes following that newline (as returned by
   * {@code ByteBufferUtil.getBytesAbsoluteBeforeEOFChar}) are decoded and trimmed. A candidate
   * whose trimmed length is 0 or 1 is skipped and the scan continues from the position just past
   * it. If any buffer access inside the scan throws {@link IndexOutOfBoundsException}, the method
   * starts over by calling itself, which picks a fresh random offset.
   *
   * <p>NOTE(review): the very first {@code mapBuf.get} happens outside the {@code try}, so an
   * out-of-range value from {@code NumberUtil.getRandomInt(-1, fileSize)} would propagate to the
   * caller instead of triggering a retry. Presumably both bounds are exclusive; confirm.
   *
   * @param encoding supplies (via {@code getEncode()}) the charset used to decode the line bytes
   * @return the trimmed line; its length is always greater than 1
   * @throws UnsupportedEncodingException declared for the decoding step; presumably raised when
   *     the charset named by {@code encoding} is not supported
   */
  String readRandomLine(EncodingSet encoding) throws UnsupportedEncodingException {
    // Start from a random byte position inside the corpus buffer.
    int cursor = NumberUtil.getRandomInt(-1, fileSize);
    byte current = mapBuf.get(cursor);

    try {
      while (true) {
        if ((char) current != '\n') {
          // Not at a line boundary yet: advance one byte and look again.
          current = mapBuf.get(++cursor);
          continue;
        }

        // cursor sits on a newline; fetch the bytes between it and the following newline.
        byte[] rawLine = ByteBufferUtil.getBytesAbsoluteBeforeEOFChar(mapBuf, cursor + 1, '\n');
        String candidate = new String(rawLine, encoding.getEncode()).trim();
        if (candidate.length() > 1) {
          return candidate;
        }

        // Too short to be useful: jump past this line (its bytes plus one) and keep scanning.
        cursor += rawLine.length + 1;
        current = mapBuf.get(cursor);
      }
    } catch (IndexOutOfBoundsException e) {
      // Ran off the buffer before finding a usable line: retry from a new random offset.
      return readRandomLine(encoding);
    }
  }
コード例 #2
0
  /**
   * Reads random lines until one fits within a byte budget.
   *
   * <p>Each candidate comes from {@code readRandomLine(encoding)}; its size is measured as the
   * number of bytes it occupies when encoded with {@code encoding.getEncode()}. Candidates larger
   * than {@code maxLength} are discarded and another line is drawn. There is no retry limit, so
   * the loop only ends once a short enough line is drawn.
   *
   * @param maxLength maximum accepted length in bytes; longer lines are rejected and another line
   *     is read until one with at most {@code maxLength} bytes is found
   * @param encoding charset holder used both for reading the line and for measuring its byte size
   * @return the first drawn line whose encoded size is at most {@code maxLength} bytes
   * @throws UnsupportedEncodingException declared for the read and encode steps; presumably
   *     raised when the charset named by {@code encoding} is not supported
   */
  String readRandomLine(int maxLength, EncodingSet encoding) throws UnsupportedEncodingException {
    while (true) {
      String candidate = readRandomLine(encoding);
      int encodedSize = candidate.getBytes(encoding.getEncode()).length;
      if (encodedSize <= maxLength) {
        return candidate;
      }
    }
  }