Beispiel #1
0
  private void writeTest(FileSystem fs, int count, Path file, byte[][] fieldsData)
      throws IOException, SerDeException {
    fs.delete(file, true);

    RCFileOutputFormat.setColumnNumber(conf, fieldsData.length);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, new DefaultCodec());

    BytesRefArrayWritable bytes = new BytesRefArrayWritable(fieldsData.length);
    for (int i = 0; i < fieldsData.length; i++) {
      BytesRefWritable cu = null;
      cu = new BytesRefWritable(fieldsData[i], 0, fieldsData[i].length);
      bytes.set(i, cu);
    }

    for (int i = 0; i < count; i++) {
      writer.append(bytes);
    }
    writer.close();
    long fileLen = fs.getFileStatus(file).getLen();
    System.out.println(
        "The file size of RCFile with "
            + bytes.size()
            + " number columns and "
            + count
            + " number rows is "
            + fileLen);
  }
  private void writeRCFileTest(
      FileSystem fs, int rowCount, Path file, int columnNum, CompressionCodec codec)
      throws IOException {
    fs.delete(file, true);

    resetRandomGenerators();

    RCFileOutputFormat.setColumnNumber(conf, columnNum);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, codec);

    byte[][] columnRandom;

    BytesRefArrayWritable bytes = new BytesRefArrayWritable(columnNum);
    columnRandom = new byte[columnNum][];
    for (int i = 0; i < columnNum; i++) {
      BytesRefWritable cu = new BytesRefWritable();
      bytes.set(i, cu);
    }

    for (int i = 0; i < rowCount; i++) {
      nextRandomRow(columnRandom, bytes);
      writer.append(bytes);
    }
    writer.close();
  }
Beispiel #3
0
  private void writeThenReadByRecordReader(
      int intervalRecordCount,
      int writeCount,
      int splitNumber,
      long minSplitSize,
      CompressionCodec codec)
      throws IOException {
    Path testDir =
        new Path(System.getProperty("test.data.dir", ".") + "/mapred/testsmallfirstsplit");
    Path testFile = new Path(testDir, "test_rcfile");
    fs.delete(testFile, true);
    Configuration cloneConf = new Configuration(conf);
    RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
    cloneConf.setInt(RCFile.RECORD_INTERVAL_CONF_STR, intervalRecordCount);

    RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);

    BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
    for (int i = 0; i < bytesArray.length; i++) {
      BytesRefWritable cu = null;
      cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
      bytes.set(i, cu);
    }
    for (int i = 0; i < writeCount; i++) {
      if (i == intervalRecordCount) {
        System.out.println("write position:" + writer.getLength());
      }
      writer.append(bytes);
    }
    writer.close();

    RCFileInputFormat inputFormat = new RCFileInputFormat();
    JobConf jonconf = new JobConf(cloneConf);
    jonconf.set("mapred.input.dir", testDir.toString());
    jonconf.setLong("mapred.min.split.size", minSplitSize);
    InputSplit[] splits = inputFormat.getSplits(jonconf, splitNumber);
    assertEquals("splits length should be " + splitNumber, splits.length, splitNumber);
    int readCount = 0;
    for (int i = 0; i < splits.length; i++) {
      int previousReadCount = readCount;
      RecordReader rr = inputFormat.getRecordReader(splits[i], jonconf, Reporter.NULL);
      Object key = rr.createKey();
      Object value = rr.createValue();
      while (rr.next(key, value)) {
        readCount++;
      }
      System.out.println("The " + i + "th split read " + (readCount - previousReadCount));
    }
    assertEquals("readCount should be equal to writeCount", readCount, writeCount);
  }
Beispiel #4
0
  public void testReadCorruptFile() throws IOException, SerDeException {
    fs.delete(file, true);

    byte[][] record = {null, null, null, null, null, null, null, null};

    RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, new DefaultCodec());
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(record.length);
    final int recCount = 100;
    Random rand = new Random();
    for (int recIdx = 0; recIdx < recCount; recIdx++) {
      for (int i = 0; i < record.length; i++) {
        record[i] = new Integer(rand.nextInt()).toString().getBytes("UTF-8");
      }
      for (int i = 0; i < record.length; i++) {
        BytesRefWritable cu = new BytesRefWritable(record[i], 0, record[i].length);
        bytes.set(i, cu);
      }
      writer.append(bytes);
      bytes.clear();
    }
    writer.close();

    // Insert junk in middle of file. Assumes file is on local disk.
    RandomAccessFile raf = new RandomAccessFile(file.toUri().getPath(), "rw");
    long corruptOffset = raf.length() / 2;
    LOG.info("corrupting " + raf + " at offset " + corruptOffset);
    raf.seek(corruptOffset);
    raf.writeBytes("junkjunkjunkjunkjunkjunkjunkjunk");
    raf.close();

    // Set the option for tolerating corruptions. The read should succeed.
    Configuration tmpConf = new Configuration(conf);
    tmpConf.setBoolean("hive.io.rcfile.tolerate.corruptions", true);
    RCFile.Reader reader = new RCFile.Reader(fs, file, tmpConf);

    LongWritable rowID = new LongWritable();

    while (true) {
      boolean more = reader.next(rowID);
      if (!more) {
        break;
      }
      BytesRefArrayWritable cols = new BytesRefArrayWritable();
      reader.getCurrentRow(cols);
      cols.resetValid(8);
    }

    reader.close();
  }
Beispiel #5
0
  public void testSimpleReadAndWrite() throws IOException, SerDeException {
    fs.delete(file, true);

    byte[][] record_1 = {
      "123".getBytes("UTF-8"),
      "456".getBytes("UTF-8"),
      "789".getBytes("UTF-8"),
      "1000".getBytes("UTF-8"),
      "5.3".getBytes("UTF-8"),
      "hive and hadoop".getBytes("UTF-8"),
      new byte[0],
      "NULL".getBytes("UTF-8")
    };
    byte[][] record_2 = {
      "100".getBytes("UTF-8"),
      "200".getBytes("UTF-8"),
      "123".getBytes("UTF-8"),
      "1000".getBytes("UTF-8"),
      "5.3".getBytes("UTF-8"),
      "hive and hadoop".getBytes("UTF-8"),
      new byte[0],
      "NULL".getBytes("UTF-8")
    };

    RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, new DefaultCodec());
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length);
    for (int i = 0; i < record_1.length; i++) {
      BytesRefWritable cu = new BytesRefWritable(record_1[i], 0, record_1[i].length);
      bytes.set(i, cu);
    }
    writer.append(bytes);
    bytes.clear();
    for (int i = 0; i < record_2.length; i++) {
      BytesRefWritable cu = new BytesRefWritable(record_2[i], 0, record_2[i].length);
      bytes.set(i, cu);
    }
    writer.append(bytes);
    writer.close();

    Object[] expectedRecord_1 = {
      new ByteWritable((byte) 123),
      new ShortWritable((short) 456),
      new IntWritable(789),
      new LongWritable(1000),
      new DoubleWritable(5.3),
      new Text("hive and hadoop"),
      null,
      null
    };

    Object[] expectedRecord_2 = {
      new ByteWritable((byte) 100),
      new ShortWritable((short) 200),
      new IntWritable(123),
      new LongWritable(1000),
      new DoubleWritable(5.3),
      new Text("hive and hadoop"),
      null,
      null
    };

    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);

    LongWritable rowID = new LongWritable();

    for (int i = 0; i < 2; i++) {
      reader.next(rowID);
      BytesRefArrayWritable cols = new BytesRefArrayWritable();
      reader.getCurrentRow(cols);
      cols.resetValid(8);
      Object row = serDe.deserialize(cols);

      StructObjectInspector oi = (StructObjectInspector) serDe.getObjectInspector();
      List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();
      assertEquals("Field size should be 8", 8, fieldRefs.size());
      for (int j = 0; j < fieldRefs.size(); j++) {
        Object fieldData = oi.getStructFieldData(row, fieldRefs.get(j));
        Object standardWritableData =
            ObjectInspectorUtils.copyToStandardObject(
                fieldData,
                fieldRefs.get(j).getFieldObjectInspector(),
                ObjectInspectorCopyOption.WRITABLE);
        if (i == 0) {
          assertEquals("Field " + i, standardWritableData, expectedRecord_1[j]);
        } else {
          assertEquals("Field " + i, standardWritableData, expectedRecord_2[j]);
        }
      }
    }

    reader.close();
  }