コード例 #1
0
 /**
  * Writes any values still buffered as a page, then writes the dictionary page if the data
  * column produced one and resets the dictionary afterwards.
  *
  * @throws ParquetEncodingException if the page writer fails while writing the dictionary page
  */
 public void flush() {
   final boolean hasBufferedValues = valueCount > 0;
   if (hasBufferedValues) {
     writePage();
   }

   final DictionaryPage dictionaryPage = dataColumn.toDictPageAndClose();
   if (dictionaryPage == null) {
     // the data column has no dictionary to emit
     return;
   }

   if (DEBUG) {
     LOG.debug("write dictionary");
   }
   try {
     pageWriter.writeDictionaryPage(dictionaryPage);
   } catch (IOException e) {
     throw new ParquetEncodingException("could not write dictionary page for " + path, e);
   }
   dataColumn.resetDictionary();
 }
コード例 #2
0
ファイル: BytesUtils.java プロジェクト: lw-lin/parquet-mr
 /**
  * Writes the four bytes of {@code v} to {@code out}, least significant byte first.
  *
  * @param out the stream to write to
  * @param v the int to write
  * @throws IOException if the underlying stream fails
  */
 public static void writeIntLittleEndian(OutputStream out, int v) throws IOException {
   // TODO: this is duplicated code in LittleEndianDataOutputStream
   final int byte0 = (v >>> 0) & 0xFF;
   final int byte1 = (v >>> 8) & 0xFF;
   final int byte2 = (v >>> 16) & 0xFF;
   final int byte3 = (v >>> 24) & 0xFF;

   out.write(byte0);
   out.write(byte1);
   out.write(byte2);
   out.write(byte3);

   if (Log.DEBUG) {
     LOG.debug("write le int: " + v + " => " + byte0 + " " + byte1 + " " + byte2 + " " + byte3);
   }
 }
コード例 #3
0
 /**
  * Hands the buffered repetition levels, definition levels and data to the page writer as one
  * page, then resets the three writers, the value count and the statistics.
  *
  * @throws ParquetEncodingException if the page writer fails
  */
 private void writePage() {
   if (DEBUG) {
     LOG.debug("write page");
   }

   try {
     pageWriter.writePage(
         concat(
             repetitionLevelColumn.getBytes(),
             definitionLevelColumn.getBytes(),
             dataColumn.getBytes()),
         valueCount,
         statistics,
         repetitionLevelColumn.getEncoding(),
         definitionLevelColumn.getEncoding(),
         dataColumn.getEncoding());
   } catch (IOException e) {
     throw new ParquetEncodingException("could not write page for " + path, e);
   }

   // start the next page from a clean state
   repetitionLevelColumn.reset();
   definitionLevelColumn.reset();
   dataColumn.reset();
   valueCount = 0;
   resetStatistics();
 }
コード例 #4
0
 /**
  * Logs one written triplet at debug level.
  *
  * @param value the value written ({@code null} for a null entry)
  * @param r the repetition level
  * @param d the definition level
  */
 private void log(Object value, int r, int d) {
   final StringBuilder message = new StringBuilder();
   message.append(path).append(" ").append(value);
   message.append(" r:").append(r);
   message.append(" d:").append(d);
   LOG.debug(message.toString());
 }
コード例 #5
0
/**
 * Writes (repetition level, definition level, value) triplets and deals with writing pages to the
 * underlying layer.
 *
 * <p>Values are buffered in three {@link ValuesWriter}s (repetition levels, definition levels,
 * data) and handed to the {@link PageWriter} once the buffered size passes the configured page
 * size threshold, or when {@link #flush()} is called.
 *
 * @author Julien Le Dem
 */
final class ColumnWriterV1 implements ColumnWriter {
  private static final Log LOG = Log.getLog(ColumnWriterV1.class);
  private static final boolean DEBUG = Log.DEBUG;

  private final ColumnDescriptor path;
  private final PageWriter pageWriter;
  private final ParquetProperties props;

  private ValuesWriter repetitionLevelColumn;
  private ValuesWriter definitionLevelColumn;
  private ValuesWriter dataColumn;

  /** Number of values buffered for the page currently being built. */
  private int valueCount;

  /** Value count above which the buffered size is measured again. */
  private int valueCountForNextSizeCheck;

  /** Statistics of the page currently being built; replaced after every page. */
  private Statistics statistics;

  /**
   * @param path descriptor of the column being written
   * @param pageWriter destination for completed pages
   * @param props source of the value writers and of the page size settings
   */
  public ColumnWriterV1(ColumnDescriptor path, PageWriter pageWriter, ParquetProperties props) {
    this.path = path;
    this.pageWriter = pageWriter;
    this.props = props;

    // initial check of memory usage. So that we have enough data to make an initial prediction
    this.valueCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck();

    resetStatistics();

    this.repetitionLevelColumn = props.newRepetitionLevelWriter(path);
    this.definitionLevelColumn = props.newDefinitionLevelWriter(path);
    this.dataColumn = props.newValuesWriter(path);
  }

  /** Logs one written triplet at debug level. */
  private void log(Object value, int r, int d) {
    final String message = path + " " + value + " r:" + r + " d:" + d;
    LOG.debug(message);
  }

  /** Replaces the current statistics with a fresh instance matching the column type. */
  private void resetStatistics() {
    this.statistics = Statistics.getStatsBasedOnType(this.path.getType());
  }

  /** Buffers the repetition level, then the definition level, of one triplet. */
  private void writeLevels(int repetitionLevel, int definitionLevel) {
    repetitionLevelColumn.writeInteger(repetitionLevel);
    definitionLevelColumn.writeInteger(definitionLevel);
  }

  /**
   * Counts how many values have been written and checks the memory usage to flush the page when we
   * reach the page threshold.
   *
   * <p>We measure the memory used when we reach the mid point toward our estimated count. We then
   * update the estimate and flush the page if we reached the threshold.
   *
   * <p>That way we check the memory size log2(n) times.
   */
  private void accountForValueWritten() {
    ++valueCount;
    if (valueCount <= valueCountForNextSizeCheck) {
      // not checking the memory used for every value
      return;
    }

    final long bufferedBytes =
        repetitionLevelColumn.getBufferedSize()
            + definitionLevelColumn.getBufferedSize()
            + dataColumn.getBufferedSize();

    if (bufferedBytes > props.getPageSizeThreshold()) {
      // we will write the current page and check again the size at the predicted middle of next
      // page
      if (props.estimateNextSizeCheck()) {
        valueCountForNextSizeCheck = valueCount / 2;
      } else {
        valueCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck();
      }
      writePage();
      return;
    }

    if (props.estimateNextSizeCheck()) {
      // not reached the threshold, will check again midway
      valueCountForNextSizeCheck =
          (int) (valueCount + ((float) valueCount * props.getPageSizeThreshold() / bufferedBytes))
                  / 2
              + 1;
    } else {
      valueCountForNextSizeCheck += props.getMinRowCountForPageSizeCheck();
    }
  }

  /**
   * Hands the buffered levels and data to the page writer as one page, then resets the three
   * writers, the value count and the statistics.
   *
   * @throws ParquetEncodingException if the page writer fails
   */
  private void writePage() {
    if (DEBUG) {
      LOG.debug("write page");
    }

    try {
      pageWriter.writePage(
          concat(
              repetitionLevelColumn.getBytes(),
              definitionLevelColumn.getBytes(),
              dataColumn.getBytes()),
          valueCount,
          statistics,
          repetitionLevelColumn.getEncoding(),
          definitionLevelColumn.getEncoding(),
          dataColumn.getEncoding());
    } catch (IOException e) {
      throw new ParquetEncodingException("could not write page for " + path, e);
    }

    // start the next page from a clean state
    repetitionLevelColumn.reset();
    definitionLevelColumn.reset();
    dataColumn.reset();
    valueCount = 0;
    resetStatistics();
  }

  @Override
  public void writeNull(int repetitionLevel, int definitionLevel) {
    if (DEBUG) {
      log(null, repetitionLevel, definitionLevel);
    }
    writeLevels(repetitionLevel, definitionLevel);
    // a null contributes levels only; nothing goes to the data column
    statistics.incrementNumNulls();
    accountForValueWritten();
  }

  @Override
  public void write(double value, int repetitionLevel, int definitionLevel) {
    if (DEBUG) {
      log(value, repetitionLevel, definitionLevel);
    }
    writeLevels(repetitionLevel, definitionLevel);
    dataColumn.writeDouble(value);
    statistics.updateStats(value);
    accountForValueWritten();
  }

  @Override
  public void write(float value, int repetitionLevel, int definitionLevel) {
    if (DEBUG) {
      log(value, repetitionLevel, definitionLevel);
    }
    writeLevels(repetitionLevel, definitionLevel);
    dataColumn.writeFloat(value);
    statistics.updateStats(value);
    accountForValueWritten();
  }

  @Override
  public void write(Binary value, int repetitionLevel, int definitionLevel) {
    if (DEBUG) {
      log(value, repetitionLevel, definitionLevel);
    }
    writeLevels(repetitionLevel, definitionLevel);
    dataColumn.writeBytes(value);
    statistics.updateStats(value);
    accountForValueWritten();
  }

  @Override
  public void write(boolean value, int repetitionLevel, int definitionLevel) {
    if (DEBUG) {
      log(value, repetitionLevel, definitionLevel);
    }
    writeLevels(repetitionLevel, definitionLevel);
    dataColumn.writeBoolean(value);
    statistics.updateStats(value);
    accountForValueWritten();
  }

  @Override
  public void write(int value, int repetitionLevel, int definitionLevel) {
    if (DEBUG) {
      log(value, repetitionLevel, definitionLevel);
    }
    writeLevels(repetitionLevel, definitionLevel);
    dataColumn.writeInteger(value);
    statistics.updateStats(value);
    accountForValueWritten();
  }

  @Override
  public void write(long value, int repetitionLevel, int definitionLevel) {
    if (DEBUG) {
      log(value, repetitionLevel, definitionLevel);
    }
    writeLevels(repetitionLevel, definitionLevel);
    dataColumn.writeLong(value);
    statistics.updateStats(value);
    accountForValueWritten();
  }

  /**
   * Writes any values still buffered as a page, then writes the dictionary page if the data
   * column produced one and resets the dictionary afterwards.
   *
   * @throws ParquetEncodingException if the page writer fails while writing the dictionary page
   */
  public void flush() {
    if (valueCount > 0) {
      writePage();
    }

    final DictionaryPage dictionaryPage = dataColumn.toDictPageAndClose();
    if (dictionaryPage == null) {
      // the data column has no dictionary to emit
      return;
    }

    if (DEBUG) {
      LOG.debug("write dictionary");
    }
    try {
      pageWriter.writeDictionaryPage(dictionaryPage);
    } catch (IOException e) {
      throw new ParquetEncodingException("could not write dictionary page for " + path, e);
    }
    dataColumn.resetDictionary();
  }

  /** Flushes pending data, then closes the three values writers. */
  @Override
  public void close() {
    flush();
    // Close the Values writers.
    repetitionLevelColumn.close();
    definitionLevelColumn.close();
    dataColumn.close();
  }

  /** Sum of the buffered sizes of the three values writers and of the page writer. */
  @Override
  public long getBufferedSizeInMemory() {
    return repetitionLevelColumn.getBufferedSize()
        + definitionLevelColumn.getBufferedSize()
        + dataColumn.getBufferedSize()
        + pageWriter.getMemSize();
  }

  /** Sum of the allocated sizes of the three values writers and of the page writer. */
  public long allocatedSize() {
    return repetitionLevelColumn.getAllocatedSize()
        + definitionLevelColumn.getAllocatedSize()
        + dataColumn.getAllocatedSize()
        + pageWriter.allocatedSize();
  }

  /**
   * Builds a multi-line report of the memory used by this column.
   *
   * @param indent prefix put in front of every line of the report
   * @return the report, one line per writer plus a buffered/allocated total
   */
  public String memUsageString(String indent) {
    final StringBuilder report = new StringBuilder(indent);
    report.append(path).append(" {\n");

    report.append(repetitionLevelColumn.memUsageString(indent + "  r:"));
    report.append("\n");
    report.append(definitionLevelColumn.memUsageString(indent + "  d:"));
    report.append("\n");
    report.append(dataColumn.memUsageString(indent + "  data:"));
    report.append("\n");
    report.append(pageWriter.memUsageString(indent + "  pages:"));
    report.append("\n");

    final String totals =
        String.format("  total: %,d/%,d", getBufferedSizeInMemory(), allocatedSize());
    report.append(indent).append(totals).append("\n");

    report.append(indent).append("}\n");
    return report.toString();
  }
}
コード例 #6
0
ファイル: BytesUtils.java プロジェクト: lw-lin/parquet-mr
/**
 * Utility methods to deal with bytes: little-endian int/long conversions, unsigned and zig-zag
 * variable-length integers, and bit-width helpers.
 *
 * @author Julien Le Dem
 */
public class BytesUtils {
  private static final Log LOG = Log.getLog(BytesUtils.class);

  public static final Charset UTF8 = Charset.forName("UTF-8");

  /**
   * Gives the number of bits needed to encode an int given the max value.
   *
   * @param bound max int that we want to encode
   * @return the number of bits required (0 for a bound of 0, 32 for a negative bound)
   */
  public static int getWidthFromMaxInt(int bound) {
    return 32 - Integer.numberOfLeadingZeros(bound);
  }

  /**
   * Reads an int in little endian at the given position, using absolute reads on the buffer.
   *
   * @param in the buffer to read from
   * @param offset index of the least significant byte
   * @return the int assembled from the 4 bytes starting at {@code offset}
   * @throws IOException never thrown by this implementation; kept for interface compatibility
   */
  public static int readIntLittleEndian(ByteBuffer in, int offset) throws IOException {
    int b0 = in.get(offset) & 0xff;
    int b1 = in.get(offset + 1) & 0xff;
    int b2 = in.get(offset + 2) & 0xff;
    int b3 = in.get(offset + 3) & 0xff;
    return ((b3 << 24) + (b2 << 16) + (b1 << 8) + (b0 << 0));
  }

  /**
   * Reads an int in little endian at the given position.
   *
   * @param in the array to read from
   * @param offset index of the least significant byte
   * @return the int assembled from the 4 bytes starting at {@code offset}
   * @throws IOException never thrown by this implementation; kept for interface compatibility
   */
  public static int readIntLittleEndian(byte[] in, int offset) throws IOException {
    int b0 = in[offset] & 0xff;
    int b1 = in[offset + 1] & 0xff;
    int b2 = in[offset + 2] & 0xff;
    int b3 = in[offset + 3] & 0xff;
    return ((b3 << 24) + (b2 << 16) + (b1 << 8) + (b0 << 0));
  }

  /**
   * Reads a 4-byte little endian int from the stream.
   *
   * @param in the stream to read from
   * @return the decoded int
   * @throws EOFException if the stream ends before 4 bytes were read
   * @throws IOException if the underlying stream fails
   */
  public static int readIntLittleEndian(InputStream in) throws IOException {
    // TODO: this is duplicated code in LittleEndianDataInputStream
    int ch1 = in.read();
    int ch2 = in.read();
    int ch3 = in.read();
    int ch4 = in.read();
    // read() returns -1 at end of stream, so a negative OR means at least one byte was missing
    if ((ch1 | ch2 | ch3 | ch4) < 0) {
      throw new EOFException();
    }
    return ((ch4 << 24) + (ch3 << 16) + (ch2 << 8) + (ch1 << 0));
  }

  /**
   * Reads a single byte from the stream as an unsigned value.
   *
   * @param in the stream to read from
   * @return the byte read, in the range 0-255
   * @throws EOFException if the stream is at its end
   * @throws IOException if the underlying stream fails
   */
  public static int readIntLittleEndianOnOneByte(InputStream in) throws IOException {
    int ch1 = in.read();
    if (ch1 < 0) {
      throw new EOFException();
    }
    return ch1;
  }

  /**
   * Reads a 2-byte little endian unsigned value from the stream.
   *
   * @param in the stream to read from
   * @return the decoded value
   * @throws EOFException if the stream ends before 2 bytes were read
   * @throws IOException if the underlying stream fails
   */
  public static int readIntLittleEndianOnTwoBytes(InputStream in) throws IOException {
    int ch1 = in.read();
    int ch2 = in.read();
    if ((ch1 | ch2) < 0) {
      throw new EOFException();
    }
    return ((ch2 << 8) + (ch1 << 0));
  }

  /**
   * Reads a 3-byte little endian unsigned value from the stream.
   *
   * @param in the stream to read from
   * @return the decoded value
   * @throws EOFException if the stream ends before 3 bytes were read
   * @throws IOException if the underlying stream fails
   */
  public static int readIntLittleEndianOnThreeBytes(InputStream in) throws IOException {
    int ch1 = in.read();
    int ch2 = in.read();
    int ch3 = in.read();
    if ((ch1 | ch2 | ch3) < 0) {
      throw new EOFException();
    }
    return ((ch3 << 16) + (ch2 << 8) + (ch1 << 0));
  }

  /**
   * Reads a little endian int stored on the number of bytes required by the given bit width.
   *
   * @param in the stream to read from
   * @param bitWidth the bit width the value was written with
   * @return the decoded value (0 when the bit width needs no byte at all)
   * @throws IOException if the bit width needs more than 4 bytes, or the stream fails or ends
   */
  public static int readIntLittleEndianPaddedOnBitWidth(InputStream in, int bitWidth)
      throws IOException {

    int bytesWidth = paddedByteCountFromBits(bitWidth);
    switch (bytesWidth) {
      case 0:
        return 0;
      case 1:
        return readIntLittleEndianOnOneByte(in);
      case 2:
        return readIntLittleEndianOnTwoBytes(in);
      case 3:
        return readIntLittleEndianOnThreeBytes(in);
      case 4:
        return readIntLittleEndian(in);
      default:
        throw new IOException(
            String.format("Encountered bitWidth (%d) that requires more than 4 bytes", bitWidth));
    }
  }

  /**
   * Writes the lowest byte of {@code v}.
   *
   * @param out the stream to write to
   * @param v the value whose low 8 bits are written
   * @throws IOException if the underlying stream fails
   */
  public static void writeIntLittleEndianOnOneByte(OutputStream out, int v) throws IOException {
    out.write((v >>> 0) & 0xFF);
  }

  /**
   * Writes the 2 lowest bytes of {@code v}, least significant first.
   *
   * @param out the stream to write to
   * @param v the value whose low 16 bits are written
   * @throws IOException if the underlying stream fails
   */
  public static void writeIntLittleEndianOnTwoBytes(OutputStream out, int v) throws IOException {
    out.write((v >>> 0) & 0xFF);
    out.write((v >>> 8) & 0xFF);
  }

  /**
   * Writes the 3 lowest bytes of {@code v}, least significant first.
   *
   * @param out the stream to write to
   * @param v the value whose low 24 bits are written
   * @throws IOException if the underlying stream fails
   */
  public static void writeIntLittleEndianOnThreeBytes(OutputStream out, int v) throws IOException {
    out.write((v >>> 0) & 0xFF);
    out.write((v >>> 8) & 0xFF);
    out.write((v >>> 16) & 0xFF);
  }

  /**
   * Writes the 4 bytes of {@code v}, least significant first.
   *
   * @param out the stream to write to
   * @param v the int to write
   * @throws IOException if the underlying stream fails
   */
  public static void writeIntLittleEndian(OutputStream out, int v) throws IOException {
    // TODO: this is duplicated code in LittleEndianDataOutputStream
    out.write((v >>> 0) & 0xFF);
    out.write((v >>> 8) & 0xFF);
    out.write((v >>> 16) & 0xFF);
    out.write((v >>> 24) & 0xFF);
    if (Log.DEBUG) {
      LOG.debug(
          "write le int: "
              + v
              + " => "
              + ((v >>> 0) & 0xFF)
              + " "
              + ((v >>> 8) & 0xFF)
              + " "
              + ((v >>> 16) & 0xFF)
              + " "
              + ((v >>> 24) & 0xFF));
    }
  }

  /**
   * Writes a little endian int to out, using the number of bytes required by bit width.
   *
   * @param out the stream to write to
   * @param v the value to write
   * @param bitWidth the bit width deciding how many bytes are written
   * @throws IOException if the bit width needs more than 4 bytes, or the stream fails
   */
  public static void writeIntLittleEndianPaddedOnBitWidth(OutputStream out, int v, int bitWidth)
      throws IOException {

    int bytesWidth = paddedByteCountFromBits(bitWidth);
    switch (bytesWidth) {
      case 0:
        break;
      case 1:
        writeIntLittleEndianOnOneByte(out, v);
        break;
      case 2:
        writeIntLittleEndianOnTwoBytes(out, v);
        break;
      case 3:
        writeIntLittleEndianOnThreeBytes(out, v);
        break;
      case 4:
        writeIntLittleEndian(out, v);
        break;
      default:
        // the bit width, not the value, is what cannot fit: report it like the read counterpart
        throw new IOException(
            String.format("Encountered bitWidth (%d) that requires more than 4 bytes", bitWidth));
    }
  }

  /**
   * Reads an unsigned variable-length int: 7 payload bits per byte, least significant group
   * first, high bit set on every byte except the last.
   *
   * @param in the stream to read from
   * @return the decoded value
   * @throws EOFException if the stream ends before the terminating byte was read
   * @throws IOException if the underlying stream fails
   */
  public static int readUnsignedVarInt(InputStream in) throws IOException {
    int value = 0;
    int shift = 0;
    while (true) {
      int b = in.read();
      if (b < 0) {
        // read() returns -1 at end of stream; without this check -1 looks like a continuation
        // byte and the loop would never terminate
        throw new EOFException();
      }
      if ((b & 0x80) == 0) {
        return value | (b << shift);
      }
      value |= (b & 0x7F) << shift;
      shift += 7;
    }
  }

  /**
   * Uses a trick mentioned in https://developers.google.com/protocol-buffers/docs/encoding to read
   * zigZag encoded data.
   *
   * @param in the stream to read from
   * @return the decoded signed value
   * @throws IOException if the underlying stream fails or ends prematurely
   */
  public static int readZigZagVarInt(InputStream in) throws IOException {
    int raw = readUnsignedVarInt(in);
    int temp = (((raw << 31) >> 31) ^ raw) >> 1;
    return temp ^ (raw & (1 << 31));
  }

  /**
   * Writes {@code value} as an unsigned variable-length int, one byte per 7-bit group.
   *
   * @param value the value to write, treated as unsigned
   * @param out the stream to write to
   * @throws IOException if the underlying stream fails
   */
  public static void writeUnsignedVarInt(int value, OutputStream out) throws IOException {
    while ((value & 0xFFFFFF80) != 0) {
      out.write((value & 0x7F) | 0x80);
      value >>>= 7;
    }
    out.write(value & 0x7F);
  }

  /**
   * Writes {@code value} as an unsigned variable-length int at the buffer's current position,
   * one byte per 7-bit group.
   *
   * @param value the value to write, treated as unsigned
   * @param dest the buffer to write to
   * @throws IOException never thrown by this implementation; kept for interface compatibility
   */
  public static void writeUnsignedVarInt(int value, ByteBuffer dest) throws IOException {
    while ((value & 0xFFFFFF80) != 0) {
      // put a single byte per group: putInt would emit 4 bytes and corrupt the encoding
      dest.put((byte) ((value & 0x7F) | 0x80));
      value >>>= 7;
    }
    dest.put((byte) (value & 0x7F));
  }

  /**
   * Writes {@code intValue} zig-zag encoded as a variable-length int.
   *
   * @param intValue the signed value to write
   * @param out the stream to write to
   * @throws IOException if the underlying stream fails
   */
  public static void writeZigZagVarInt(int intValue, OutputStream out) throws IOException {
    writeUnsignedVarInt((intValue << 1) ^ (intValue >> 31), out);
  }

  /**
   * Uses a trick mentioned in https://developers.google.com/protocol-buffers/docs/encoding to read
   * zigZag encoded data. TODO: the implementation is compatible with readZigZagVarInt. Is there a
   * need for different functions?
   *
   * @param in the stream to read from
   * @return the decoded signed value
   * @throws IOException if the underlying stream fails or ends prematurely
   */
  public static long readZigZagVarLong(InputStream in) throws IOException {
    long raw = readUnsignedVarLong(in);
    long temp = (((raw << 63) >> 63) ^ raw) >> 1;
    return temp ^ (raw & (1L << 63));
  }

  /**
   * Reads an unsigned variable-length long: 7 payload bits per byte, least significant group
   * first, high bit set on every byte except the last.
   *
   * @param in the stream to read from
   * @return the decoded value
   * @throws EOFException if the stream ends before the terminating byte was read
   * @throws IOException if the underlying stream fails
   */
  public static long readUnsignedVarLong(InputStream in) throws IOException {
    long value = 0;
    int shift = 0;
    while (true) {
      long b = in.read();
      if (b < 0) {
        // read() returns -1 at end of stream; without this check -1 looks like a continuation
        // byte and the loop would never terminate
        throw new EOFException();
      }
      if ((b & 0x80) == 0) {
        return value | (b << shift);
      }
      value |= (b & 0x7F) << shift;
      shift += 7;
    }
  }

  /**
   * Writes {@code value} as an unsigned variable-length long, one byte per 7-bit group.
   *
   * @param value the value to write, treated as unsigned
   * @param out the stream to write to
   * @throws IOException if the underlying stream fails
   */
  public static void writeUnsignedVarLong(long value, OutputStream out) throws IOException {
    while ((value & 0xFFFFFFFFFFFFFF80L) != 0L) {
      out.write((int) ((value & 0x7F) | 0x80));
      value >>>= 7;
    }
    out.write((int) (value & 0x7F));
  }

  /**
   * Writes {@code longValue} zig-zag encoded as a variable-length long.
   *
   * @param longValue the signed value to write
   * @param out the stream to write to
   * @throws IOException if the underlying stream fails
   */
  public static void writeZigZagVarLong(long longValue, OutputStream out) throws IOException {
    writeUnsignedVarLong((longValue << 1) ^ (longValue >> 63), out);
  }

  /**
   * @param bitLength a count of bits
   * @return the corresponding byte count padded to the next byte
   */
  public static int paddedByteCountFromBits(int bitLength) {
    return (bitLength + 7) / 8;
  }

  /**
   * @param value the int to convert
   * @return a new 4-byte array holding {@code value}, least significant byte at index 0
   */
  public static byte[] intToBytes(int value) {
    byte[] outBuffer = new byte[4];
    outBuffer[3] = (byte) (value >>> 24);
    outBuffer[2] = (byte) (value >>> 16);
    outBuffer[1] = (byte) (value >>> 8);
    outBuffer[0] = (byte) (value >>> 0);
    return outBuffer;
  }

  /**
   * @param bytes at least 4 bytes, least significant byte at index 0
   * @return the int assembled from the first 4 bytes
   */
  public static int bytesToInt(byte[] bytes) {
    return ((int) (bytes[3] & 255) << 24)
        + ((int) (bytes[2] & 255) << 16)
        + ((int) (bytes[1] & 255) << 8)
        + ((int) (bytes[0] & 255) << 0);
  }

  /**
   * @param value the long to convert
   * @return a new 8-byte array holding {@code value}, least significant byte at index 0
   */
  public static byte[] longToBytes(long value) {
    byte[] outBuffer = new byte[8];
    outBuffer[7] = (byte) (value >>> 56);
    outBuffer[6] = (byte) (value >>> 48);
    outBuffer[5] = (byte) (value >>> 40);
    outBuffer[4] = (byte) (value >>> 32);
    outBuffer[3] = (byte) (value >>> 24);
    outBuffer[2] = (byte) (value >>> 16);
    outBuffer[1] = (byte) (value >>> 8);
    outBuffer[0] = (byte) (value >>> 0);
    return outBuffer;
  }

  /**
   * @param bytes at least 8 bytes, least significant byte at index 0
   * @return the long assembled from the first 8 bytes
   */
  public static long bytesToLong(byte[] bytes) {
    // the top byte needs no mask: its sign-extension bits are shifted out by << 56
    return (((long) bytes[7] << 56)
        + ((long) (bytes[6] & 255) << 48)
        + ((long) (bytes[5] & 255) << 40)
        + ((long) (bytes[4] & 255) << 32)
        + ((long) (bytes[3] & 255) << 24)
        + ((long) (bytes[2] & 255) << 16)
        + ((long) (bytes[1] & 255) << 8)
        + ((long) (bytes[0] & 255) << 0));
  }

  /**
   * @param value the boolean to convert
   * @return a new 1-byte array holding 1 for {@code true} and 0 for {@code false}
   */
  public static byte[] booleanToBytes(boolean value) {
    byte[] outBuffer = new byte[1];
    outBuffer[0] = (byte) (value ? 1 : 0);
    return outBuffer;
  }

  /**
   * @param bytes at least 1 byte
   * @return {@code true} if the first byte is non-zero
   */
  public static boolean bytesToBool(byte[] bytes) {
    return ((int) (bytes[0] & 255) != 0);
  }
}