예제 #1
0
  /**
   * Dumps the contents of the file, one row at a time, as a {@link ByteBuffer}.
   *
   * <p>Every field of a row is written as a 4-byte length (via {@code putInt}) followed by that
   * many bytes of the field's {@code toString()} text; a null field is written as a zero length
   * with no payload. After each row the buffer object itself (not its contents) is printed to
   * standard output.
   *
   * <p>The row buffer starts at 4096 bytes and is enlarged when a row does not fit.
   *
   * @throws Exception propagated from the record reader or from row access
   */
  public void readFile_ByteBuffer() throws Exception {

    OrcStruct lv_row = null;
    while (m_rr.hasNext()) {
      ByteBuffer lv_row_buffer = ByteBuffer.wrap(new byte[4096]);
      lv_row = (OrcStruct) m_rr.next(lv_row);
      for (int i = 0; i < m_fields.size(); i++) {
        Object lv_field_val = lv_row.getFieldValue(i);

        // Encode before writing the prefix so the prefix is the number of bytes that follow;
        // the character count differs from the byte count for multi-byte text.
        // NOTE(review): getBytes() uses the platform default charset, as the original did --
        // confirm which encoding the consumer expects.
        byte[] lv_field_bytes =
            (lv_field_val == null) ? new byte[0] : lv_field_val.toString().getBytes();

        // 4 bytes for the length prefix plus the payload itself.
        int lv_needed = 4 + lv_field_bytes.length;
        if (lv_row_buffer.remaining() < lv_needed) {
          // Grow instead of failing with BufferOverflowException on wide rows.
          ByteBuffer lv_grown =
              ByteBuffer.allocate(
                  Math.max(lv_row_buffer.capacity() * 2, lv_row_buffer.position() + lv_needed));
          lv_row_buffer.flip();
          lv_grown.put(lv_row_buffer);
          lv_row_buffer = lv_grown;
        }

        lv_row_buffer.putInt(lv_field_bytes.length);
        lv_row_buffer.put(lv_field_bytes);
      }
      System.out.println(lv_row_buffer);
    }
  }
예제 #2
0
  /**
   * Reads the next row and serializes it into a byte array.
   *
   * <p>Every field is written as a 4-byte length (via {@code putInt}) followed by that many bytes
   * of the field's {@code toString()} text; a null field is written as a zero length with no
   * payload. The buffer object is also printed to standard output.
   *
   * <p>The returned array is the buffer's whole backing array: 4096 bytes when the row fits, or a
   * larger array when the row needed more room. Bytes after the last field are zero.
   *
   * @return the serialized row, or {@code null} when the record reader has no more rows
   * @throws Exception propagated from the record reader or from row access
   */
  public byte[] getNext() throws Exception {

    if (!m_rr.hasNext()) {
      return null;
    }

    OrcStruct lv_row = (OrcStruct) m_rr.next(null);
    ByteBuffer lv_row_buffer = ByteBuffer.wrap(new byte[4096]);

    for (int i = 0; i < m_fields.size(); i++) {
      Object lv_field_val = lv_row.getFieldValue(i);

      // Encode before writing the prefix so the prefix is the number of bytes that follow;
      // the character count differs from the byte count for multi-byte text.
      // NOTE(review): getBytes() uses the platform default charset, as the original did --
      // confirm which encoding the consumer expects.
      byte[] lv_field_bytes =
          (lv_field_val == null) ? new byte[0] : lv_field_val.toString().getBytes();

      // 4 bytes for the length prefix plus the payload itself.
      int lv_needed = 4 + lv_field_bytes.length;
      if (lv_row_buffer.remaining() < lv_needed) {
        // Grow instead of failing with BufferOverflowException on wide rows.
        ByteBuffer lv_grown =
            ByteBuffer.allocate(
                Math.max(lv_row_buffer.capacity() * 2, lv_row_buffer.position() + lv_needed));
        lv_row_buffer.flip();
        lv_grown.put(lv_row_buffer);
        lv_row_buffer = lv_grown;
      }

      lv_row_buffer.putInt(lv_field_bytes.length);
      lv_row_buffer.put(lv_field_bytes);
    }

    System.out.println(lv_row_buffer);
    return lv_row_buffer.array();
  }
예제 #3
0
  /**
   * Prints every remaining row to standard output, one line per row.
   *
   * <p>Each column value is followed by a {@code '|'} character; a null column contributes only
   * the separator.
   *
   * @throws Exception propagated from the record reader or from row access
   */
  public void readFile_String() throws Exception {

    final StringBuilder lv_line = new StringBuilder(1024);
    OrcStruct lv_current = null;

    while (m_rr.hasNext()) {
      // The previous row object is handed back to the reader on each call.
      lv_current = (OrcStruct) m_rr.next(lv_current);
      lv_line.setLength(0);

      int lv_col = 0;
      while (lv_col < m_fields.size()) {
        final Object lv_value = lv_current.getFieldValue(lv_col);
        if (lv_value != null) {
          lv_line.append(lv_value);
        }
        lv_line.append('|');
        lv_col++;
      }

      System.out.println(lv_line);
    }
  }
예제 #4
0
  /**
   * Positions the record reader at the given row when that row exists.
   *
   * @param pv_rowNumber row to seek to; valid values are {@code 0} up to, but not including, the
   *     reader's row count
   * @return {@code true} after seeking, {@code false} (without seeking) when the row number is
   *     out of range
   * @throws IOException propagated from the reader
   */
  public boolean seekToRow(long pv_rowNumber) throws IOException {
    final boolean lv_in_range =
        (pv_rowNumber >= 0) && (pv_rowNumber < m_reader.getNumberOfRows());

    if (lv_in_range) {
      m_rr.seekToRow(pv_rowNumber);
    }

    return lv_in_range;
  }
  /**
   * Reads every split produced for {@code workDir} and checks the records in each one.
   *
   * <p>Within a split, the n-th record (counting from zero) is expected to carry {@code v == n}
   * and {@code s} equal to {@code Integer.toString(n)}. The counter restarts for every split.
   *
   * @param job the job whose configuration and input paths are used
   * @throws Exception propagated from the input format or record reader
   */
  void checkFormat(Job job) throws Exception {
    TaskAttemptContext attemptContext =
        new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

    MyClassMessagePackBase64LineInputFormat format = new MyClassMessagePackBase64LineInputFormat();
    FileInputFormat.setInputPaths(job, workDir);

    List<InputSplit> splits = format.getSplits(job);
    for (InputSplit split : splits) {
      RecordReader<LongWritable, MyClassWritable> reader =
          format.createRecordReader(split, attemptContext);

      try {
        // Initialize inside the try so the reader is closed even when initialization fails.
        reader.initialize(split, attemptContext);

        int count = 0;
        while (reader.nextKeyValue()) {
          MyClass mc = reader.getCurrentValue().get();
          // Expected value first, so a failure message reports the two sides correctly.
          assertEquals(count, mc.v);
          assertEquals(Integer.toString(count), mc.s);
          count++;
        }
      } finally {
        reader.close();
      }
    }
  }
예제 #6
0
  /**
   * Configure the default record mapper.
   *
   * <p>The target record class name is taken from the {@code INPUT_RECORD_CLASS} property. When
   * that property is absent or empty, it is inferred from the first parameter of the
   * {@code processRecord} method declared by the registered record processor's class.
   *
   * <p>Header names are taken from the {@code INPUT_RECORD_HEADERS} property (comma separated).
   * When that property is absent, the header record is parsed with the configured record parser
   * and the content of each field is used as a header name.
   *
   * @return the default implementation of record mapper
   * @throws BatchConfigurationException if the record class name cannot be determined or the
   *     target type class is not found
   */
  private RecordMapper configureDefaultRecordMapper() throws BatchConfigurationException {

    String recordClassName = configurationProperties.getProperty(BatchConstants.INPUT_RECORD_CLASS);
    if (recordClassName == null || recordClassName.length() == 0) {
      // The processor instance already knows its class; no lookup by name is needed.
      for (Method declaredMethod : recordProcessor.getClass().getDeclaredMethods()) {
        // A class implementing a generic interface also carries a compiler-generated bridge
        // method processRecord(Object); picking it would infer java.lang.Object as record type.
        if (declaredMethod.isBridge() || declaredMethod.isSynthetic()) {
          continue;
        }
        Class<?>[] parameterTypes = declaredMethod.getParameterTypes();
        if (declaredMethod.getName().equals("processRecord") && parameterTypes.length > 0) {
          recordClassName = parameterTypes[0].getName();
          break;
        }
      }

      // Fail with a configuration error instead of handing a null class name to the mapper.
      if (recordClassName == null || recordClassName.length() == 0) {
        String error =
            "Configuration failed : unable to get record class name from registered record processor implementation.";
        logger.severe(error);
        throw new BatchConfigurationException(error);
      }
    }

    String[] headers;
    String headersProperty =
        configurationProperties.getProperty(BatchConstants.INPUT_RECORD_HEADERS);
    if (headersProperty == null) {
      // no headers specified, use field names declared in the header record; the record parser
      // is used so that the header record is split with the right delimiter
      String headerRecord = recordReader.getHeaderRecord();
      Record record = recordParser.parseRecord(headerRecord, 0);
      List<Field> fields = record.getFields();
      headers = new String[fields.size()];
      for (int i = 0; i < fields.size(); i++) {
        headers[i] = fields.get(i).getContent();
      }
    } else {
      // headers specified, split the comma separated list
      headers = headersProperty.split(",");
    }

    try {
      return new DefaultRecordMapperImpl(recordClassName, headers, typeConverters);
    } catch (ClassNotFoundException e) {
      String error = "Configuration failed : Class " + recordClassName + " not found.";
      logger.severe(error);
      throw new BatchConfigurationException(error, e);
    }
  }
  /**
   * Checks that {@link SequenceFileInputFormat} splits cover every record exactly once.
   *
   * <p>For a series of randomly increasing entry counts below {@code MAX_LENGTH}, a sequence file
   * is written with keys {@code 0..entries-1} and random byte values. The file is then split
   * three times with a random requested split count; each time every split is read and the test
   * asserts that no key is seen twice and that the number of distinct keys equals the entry count.
   *
   * @throws Exception propagated from the file system, writer or record readers
   */
  public void testFormat() throws Exception {
    JobConf job = new JobConf(conf);
    FileSystem localFs = FileSystem.getLocal(conf);
    Path baseDir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
    Path seqFile = new Path(baseDir, "test.seq");
    Reporter reporter = Reporter.NULL;

    // A fresh random seed is drawn first, then used to build the generator for the whole test.
    Random rng = new Random(new Random().nextInt());

    localFs.delete(baseDir, true);
    FileInputFormat.setInputPaths(job, baseDir);

    // for a variety of lengths
    for (int entries = 0; entries < MAX_LENGTH; entries += rng.nextInt(MAX_LENGTH / 10) + 1) {

      // create a file with the requested number of entries
      SequenceFile.Writer writer =
          SequenceFile.createWriter(localFs, conf, seqFile, IntWritable.class, BytesWritable.class);
      try {
        for (int n = 0; n < entries; n++) {
          byte[] payload = new byte[rng.nextInt(10)];
          rng.nextBytes(payload);
          writer.append(new IntWritable(n), new BytesWritable(payload));
        }
      } finally {
        writer.close();
      }

      // try splitting the file in a variety of sizes
      InputFormat<IntWritable, BytesWritable> format =
          new SequenceFileInputFormat<IntWritable, BytesWritable>();
      IntWritable key = new IntWritable();
      BytesWritable value = new BytesWritable();

      for (int attempt = 0; attempt < 3; attempt++) {
        int requestedSplits = rng.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
        InputSplit[] splits = format.getSplits(job, requestedSplits);

        // check each split, remembering which keys have been seen so far
        BitSet seenKeys = new BitSet(entries);
        for (InputSplit split : splits) {
          RecordReader<IntWritable, BytesWritable> reader =
              format.getRecordReader(split, job, reporter);
          try {
            while (reader.next(key, value)) {
              int k = key.get();
              assertFalse("Key in multiple partitions.", seenKeys.get(k));
              seenKeys.set(k);
            }
          } finally {
            reader.close();
          }
        }
        assertEquals("Some keys in no partition.", entries, seenKeys.cardinality());
      }
    }
  }
예제 #8
0
  /**
   * Experimental helper: opens the file at {@code m_file_path}, prints its metadata and then
   * prints every row to standard output.
   *
   * <p>Printed metadata: the reader, the row count, the kind of each type, the compression (and
   * its size when not {@code NONE}), the object inspector's category and type name, and for each
   * column its name, type category and type name. Rows are printed one per line with every column
   * value followed by {@code '|'}; null columns contribute only the separator.
   *
   * <p>Assigns {@code m_reader}, {@code m_types} and {@code m_fields} as a side effect. The record
   * reader used for the dump is local to this method and is closed before returning.
   *
   * @throws Exception propagated from the ORC reader or record reader
   */
  public void testRead() throws Exception {

    m_reader = OrcFile.createReader(m_file_path, OrcFile.readerOptions(m_conf));

    System.out.println("Reader: " + m_reader);

    System.out.println("# Rows: " + m_reader.getNumberOfRows());
    m_types = m_reader.getTypes();
    System.out.println("# Types in the file: " + m_types.size());

    for (int i = 0; i < m_types.size(); i++) {
      System.out.println("Type " + i + ": " + m_types.get(i).getKind());
    }

    System.out.println("Compression: " + m_reader.getCompression());
    if (m_reader.getCompression() != CompressionKind.NONE) {
      System.out.println("Compression size: " + m_reader.getCompressionSize());
    }

    // Locals carry the lv_ prefix so they cannot be mistaken for (or shadow) m_ members.
    StructObjectInspector lv_oi = (StructObjectInspector) m_reader.getObjectInspector();

    System.out.println("object inspector type category: " + lv_oi.getCategory());
    System.out.println("object inspector type name    : " + lv_oi.getTypeName());

    m_fields = lv_oi.getAllStructFieldRefs();
    System.out.println("Number of columns in the table: " + m_fields.size());

    RecordReader lv_rr = m_reader.rows();
    try {
      // Print the type info:
      for (int i = 0; i < m_fields.size(); i++) {
        System.out.println("Column " + i + " name: " + m_fields.get(i).getFieldName());
        ObjectInspector lv_foi = m_fields.get(i).getFieldObjectInspector();
        System.out.println("Column " + i + " type category: " + lv_foi.getCategory());
        System.out.println("Column " + i + " type name: " + lv_foi.getTypeName());
      }

      OrcStruct lv_row = null;
      StringBuilder lv_row_string = new StringBuilder(1024);
      while (lv_rr.hasNext()) {
        lv_row = (OrcStruct) lv_rr.next(lv_row);
        lv_row_string.setLength(0);
        for (int i = 0; i < m_fields.size(); i++) {
          Object lv_field_val = lv_row.getFieldValue(i);
          if (lv_field_val != null) {
            lv_row_string.append(lv_field_val);
          }
          lv_row_string.append('|');
        }
        System.out.println(lv_row_string);
      }
    } finally {
      // The reader was opened here, so release it here even when the dump fails part-way.
      lv_rr.close();
    }

    // Sketch of typecasting a field to its concrete type based on the 'kind':
    //   if (OrcProto.Type.Kind.INT == m_types.get(1).getKind()) {
    //     IntWritable lvf_1_val = (IntWritable) lv_row.getFieldValue(0);
    //     System.out.println("Column 1 value: " + lvf_1_val);
    //   }
  }
예제 #9
0
  /**
   * Configure CB4J record parser.
   *
   * <p>The record type is read from the {@code INPUT_RECORD_TYPE} property; a missing or
   * unrecognized value falls back to {@code DEFAULT_RECORD_TYPE}.
   *
   * <ul>
   *   <li>Fixed-length records: the comma separated {@code INPUT_FIELD_LENGTHS} property is
   *       mandatory and every entry must be an integer (surrounding whitespace is ignored).
   *   <li>Delimited records: delimiter, trim flag and qualifier character are read from their
   *       properties, each with a default. The record size comes from {@code INPUT_RECORD_SIZE}
   *       or, when that is absent, from the number of fields in the parsed header record.
   * </ul>
   *
   * @throws BatchConfigurationException thrown if record parser is not correctly configured
   */
  private void configureRecordParser() throws BatchConfigurationException {

    // read record type property and set default value if invalid input
    String recordTypeProperty =
        configurationProperties.getProperty(BatchConstants.INPUT_RECORD_TYPE);
    String recordType;
    if (recordTypeProperty == null || recordTypeProperty.length() == 0) {
      recordType = BatchConstants.DEFAULT_RECORD_TYPE;
      logger.info(
          "Record type property not specified, records will be considered as delimiter-separated values");
    } else if (!RecordType.DSV.toString().equalsIgnoreCase(recordTypeProperty)
        && !RecordType.FLR.toString().equalsIgnoreCase(recordTypeProperty)) {
      recordType = BatchConstants.DEFAULT_RECORD_TYPE;
      logger.warning(
          "Record type property '"
              + recordTypeProperty
              + "' is invalid, records will be considered as delimiter-separated values");
    } else {
      recordType = recordTypeProperty;
    }

    // fixed length record configuration
    if (RecordType.FLR.toString().equalsIgnoreCase(recordType)) {
      String fieldsLengthProperties =
          configurationProperties.getProperty(BatchConstants.INPUT_FIELD_LENGTHS);
      if (fieldsLengthProperties == null || fieldsLengthProperties.length() == 0) {
        String error =
            "Configuration failed : when using fixed length records, fields length values property '"
                + BatchConstants.INPUT_FIELD_LENGTHS
                + "' is mandatory but was not specified.";
        logger.severe(error);
        throw new BatchConfigurationException(error);
      }

      // parse fields length property and extract numeric values
      StringTokenizer stringTokenizer = new StringTokenizer(fieldsLengthProperties, ",");
      int[] fieldsLength = new int[stringTokenizer.countTokens()];
      for (int index = 0; stringTokenizer.hasMoreTokens(); index++) {
        String length = stringTokenizer.nextToken();
        try {
          // Trim so that a list written as "5, 10" is accepted.
          fieldsLength[index] = Integer.parseInt(length.trim());
        } catch (NumberFormatException e) {
          String error =
              "Configuration failed : field length '"
                  + length
                  + "' in property "
                  + BatchConstants.INPUT_FIELD_LENGTHS
                  + "="
                  + fieldsLengthProperties
                  + " is not numeric.";
          logger.severe(error);
          // Keep the original exception as the cause for diagnosis.
          throw new BatchConfigurationException(error, e);
        }
      }
      recordParser = new FlrRecordParserImpl(fieldsLength);
      return;
    }

    // delimited values configuration
    String recordSizeProperty =
        configurationProperties.getProperty(BatchConstants.INPUT_RECORD_SIZE);

    String fieldsDelimiter =
        configurationProperties.getProperty(BatchConstants.INPUT_FIELD_DELIMITER);
    if (fieldsDelimiter == null || fieldsDelimiter.length() == 0) {
      fieldsDelimiter = BatchConstants.DEFAULT_FIELD_DELIMITER;
      logger.info("No field delimiter specified, using default : '" + fieldsDelimiter + "'");
    }

    String trimWhitespacesProperty =
        configurationProperties.getProperty(BatchConstants.INPUT_FIELD_TRIM);
    boolean trimWhitespaces;
    if (trimWhitespacesProperty != null) {
      trimWhitespaces = Boolean.parseBoolean(trimWhitespacesProperty);
    } else {
      trimWhitespaces = BatchConstants.DEFAULT_FIELD_TRIM;
      logger.info("Trim whitespaces property not specified, default to " + trimWhitespaces);
    }

    String dataQualifierCharacterProperty =
        configurationProperties.getProperty(BatchConstants.INPUT_FIELD_QUALIFIER_CHAR);
    String dataQualifierCharacter = BatchConstants.DEFAULT_FIELD_QUALIFIER_CHAR;
    if (dataQualifierCharacterProperty != null && dataQualifierCharacterProperty.length() > 0) {
      dataQualifierCharacter = dataQualifierCharacterProperty;
    } else {
      logger.info("Data qualifier character not specified, default to " + dataQualifierCharacter);
    }

    // A parser without a record size is installed first: it is what parses the header record
    // below when the size has to be derived from the header.
    recordParser =
        new DsvRecordParserImpl(fieldsDelimiter, trimWhitespaces, dataQualifierCharacter);

    if (recordSizeProperty == null || recordSizeProperty.length() == 0) {
      logger.info(
          "Record size property not specified, it will be calculated from the header record");
      String headerRecord = recordReader.getHeaderRecord();
      // use the record parser to parse the header record using the right delimiter
      Record record = recordParser.parseRecord(headerRecord, 0);
      recordSizeProperty = String.valueOf(record.getFields().size());
    }

    int recordSize;
    try {
      recordSize = Integer.parseInt(recordSizeProperty.trim());
    } catch (NumberFormatException e) {
      String error = "Record size property is not recognized as a number : " + recordSizeProperty;
      logger.severe(error);
      // Keep the original exception as the cause for diagnosis.
      throw new BatchConfigurationException(error, e);
    }

    recordParser =
        new DsvRecordParserImpl(
            recordSize, fieldsDelimiter, trimWhitespaces, dataQualifierCharacter);

    logger.info("Record size : " + recordSize);
    logger.info("Fields delimiter : '" + fieldsDelimiter + "'");
    logger.info("Data qualifier character : '" + dataQualifierCharacter + "'");
  }