/** * 从语料库中随机读取一行字符 * * @return * @throws UnsupportedEncodingException */ String readRandomLine(EncodingSet encoding) throws UnsupportedEncodingException { String line = null; // 从语料库文件中随机取一个字节 int startIndex = NumberUtil.getRandomInt(-1, fileSize); byte b = mapBuf.get(startIndex); while (true) { try { if ((char) b == '\n') { // 拿到第一个\n和第二个\n之间的串 byte[] lineBytes = ByteBufferUtil.getBytesAbsoluteBeforeEOFChar(mapBuf, startIndex + 1, '\n'); line = new String(lineBytes, encoding.getEncode()).trim(); if (line.length() <= 1) { startIndex += lineBytes.length + 1; b = mapBuf.get(startIndex); continue; } break; } else { b = mapBuf.get(++startIndex); } } catch (IndexOutOfBoundsException e) { line = readRandomLine(encoding); break; } } return line; }
/**
 * Reads random lines until one is found whose encoded size does not exceed {@code maxLength}.
 *
 * <p>The size of a candidate is measured as the number of bytes it occupies in the
 * charset named by {@code encoding}. Candidates that are too long are discarded and
 * another random line is read. There is no retry limit: if no line in the corpus
 * fits within {@code maxLength}, this method does not return.
 *
 * @param maxLength maximum allowed length in bytes; must not be negative
 * @param encoding supplies the charset name used both to read the line and to measure it
 * @return a random line whose encoded length is {@code <= maxLength}
 * @throws UnsupportedEncodingException if the charset name from {@code encoding} is not supported
 * @throws IllegalArgumentException if {@code maxLength} is negative
 */
String readRandomLine(int maxLength, EncodingSet encoding) throws UnsupportedEncodingException {
    // A byte length is never negative, so a negative limit could never be met
    // and the loop below would spin forever; fail fast instead.
    if (maxLength < 0) {
        throw new IllegalArgumentException("maxLength must be >= 0, got " + maxLength);
    }

    while (true) {
        String candidate = readRandomLine(encoding);
        int byteLength = candidate.getBytes(encoding.getEncode()).length;
        if (byteLength <= maxLength) {
            return candidate;
        }
    }
}