// Dumps the contents of the file as ByteBuffer.
public void readFile_ByteBuffer() throws Exception {
  OrcStruct lv_row = null;
  Object lv_field_val = null;
  ByteBuffer lv_row_buffer;

  while (m_rr.hasNext()) {
    // Fixed 4 KB buffer per row; a row larger than this will overflow.
    byte[] lv_row_ba = new byte[4096];
    lv_row_buffer = ByteBuffer.wrap(lv_row_ba);
    lv_row = (OrcStruct) m_rr.next(lv_row);
    for (int i = 0; i < m_fields.size(); i++) {
      lv_field_val = lv_row.getFieldValue(i);
      if (lv_field_val == null) {
        // Null field: write a zero length prefix and no payload.
        lv_row_buffer.putInt(0);
        continue;
      }
      // Write a 4-byte length prefix followed by the field bytes.
      String lv_field_val_str = lv_field_val.toString();
      lv_row_buffer.putInt(lv_field_val_str.length());
      lv_row_buffer.put(lv_field_val_str.getBytes());
    }
    System.out.println(lv_row_buffer);
    // System.out.println(new String(lv_row_buffer.array()));
  }
}
// Returns the next row serialized into a length-prefixed byte array,
// or null when there are no more rows.
public byte[] getNext() throws Exception {
  if (!m_rr.hasNext()) {
    return null;
  }

  OrcStruct lv_row = (OrcStruct) m_rr.next(null);
  Object lv_field_val = null;
  byte[] lv_row_ba = new byte[4096];
  ByteBuffer lv_row_buffer = ByteBuffer.wrap(lv_row_ba);
  for (int i = 0; i < m_fields.size(); i++) {
    lv_field_val = lv_row.getFieldValue(i);
    if (lv_field_val == null) {
      // Null field: write a zero length prefix and no payload.
      lv_row_buffer.putInt(0);
      continue;
    }
    // Write a 4-byte length prefix followed by the field bytes.
    String lv_field_val_str = lv_field_val.toString();
    lv_row_buffer.putInt(lv_field_val_str.length());
    lv_row_buffer.put(lv_field_val_str.getBytes());
  }
  System.out.println(lv_row_buffer);
  return lv_row_buffer.array();
}
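// A minimal decoding sketch for the buffer layout produced by getNext() and
// readFile_ByteBuffer() above: per field, a 4-byte length prefix (0 for a null
// field) followed by the field bytes. The helper name decodeRow and its
// parameters are illustrative, not part of the original class. Note that the
// writer records the string's character count, which equals the byte count
// only for single-byte data such as ASCII.
public static List<String> decodeRow(byte[] pv_row_ba, int pv_num_fields) {
  List<String> lv_fields = new ArrayList<String>(pv_num_fields);
  ByteBuffer lv_buffer = ByteBuffer.wrap(pv_row_ba);
  for (int i = 0; i < pv_num_fields; i++) {
    int lv_len = lv_buffer.getInt();
    if (lv_len == 0) {
      lv_fields.add(null); // a zero length prefix marks a null field
      continue;
    }
    byte[] lv_field_ba = new byte[lv_len];
    lv_buffer.get(lv_field_ba);
    lv_fields.add(new String(lv_field_ba));
  }
  return lv_fields;
}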
// Dumps the content of the file. The columns are '|' separated.
public void readFile_String() throws Exception {
  OrcStruct lv_row = null;
  Object lv_field_val = null;
  StringBuilder lv_row_string = new StringBuilder(1024);

  while (m_rr.hasNext()) {
    lv_row = (OrcStruct) m_rr.next(lv_row);
    lv_row_string.setLength(0);
    for (int i = 0; i < m_fields.size(); i++) {
      lv_field_val = lv_row.getFieldValue(i);
      if (lv_field_val != null) {
        lv_row_string.append(lv_field_val);
      }
      lv_row_string.append('|');
    }
    System.out.println(lv_row_string);
  }
}
// Positions the reader on the given row; returns false if the row number
// is out of range.
public boolean seekToRow(long pv_rowNumber) throws IOException {
  if ((pv_rowNumber < 0) || (pv_rowNumber >= m_reader.getNumberOfRows())) {
    return false;
  }
  m_rr.seekToRow(pv_rowNumber);
  return true;
}
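// A hedged usage sketch tying seekToRow() and getNext() together: position the
// reader on a row, then fetch it in the length-prefixed form shown above. The
// method name dumpOneRow is illustrative, not part of the original class.
public void dumpOneRow(long pv_rowNumber) throws Exception {
  if (seekToRow(pv_rowNumber)) {
    byte[] lv_row_ba = getNext();
    if (lv_row_ba != null) {
      System.out.println("Row " + pv_rowNumber + " buffer: " + lv_row_ba.length + " bytes");
    }
  }
}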
void checkFormat(Job job) throws Exception {
  TaskAttemptContext attemptContext =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

  MyClassMessagePackBase64LineInputFormat format = new MyClassMessagePackBase64LineInputFormat();
  FileInputFormat.setInputPaths(job, workDir);

  List<InputSplit> splits = format.getSplits(job);
  for (int j = 0; j < splits.size(); j++) {
    RecordReader<LongWritable, MyClassWritable> reader =
        format.createRecordReader(splits.get(j), attemptContext);
    reader.initialize(splits.get(j), attemptContext);

    int count = 0;
    try {
      while (reader.nextKeyValue()) {
        LongWritable key = reader.getCurrentKey();
        MyClassWritable val = reader.getCurrentValue();
        MyClass mc = val.get();
        assertEquals(mc.v, count);
        assertEquals(mc.s, Integer.toString(count));
        count++;
      }
    } finally {
      reader.close();
    }
  }
}
/**
 * Configure the default record mapper.
 *
 * @return the default implementation of record mapper
 * @throws BatchConfigurationException if the target type class is not found
 */
private RecordMapper configureDefaultRecordMapper() throws BatchConfigurationException {
  RecordMapper recordMapper;
  String recordClassName = configurationProperties.getProperty(BatchConstants.INPUT_RECORD_CLASS);
  if (recordClassName == null || recordClassName.length() == 0) {
    // No record class configured: infer it from the first parameter type of the
    // registered record processor's processRecord method.
    try {
      Class recordProcessorClass = Class.forName(recordProcessor.getClass().getName());
      Method[] declaredMethods = recordProcessorClass.getDeclaredMethods();
      for (Method declaredMethod : declaredMethods) {
        if (declaredMethod.getName().equals("processRecord")) {
          recordClassName = declaredMethod.getParameterTypes()[0].getName();
          break;
        }
      }
    } catch (ClassNotFoundException e) {
      String error =
          "Configuration failed : unable to get record class name from registered record processor implementation.";
      logger.severe(error);
      throw new BatchConfigurationException(error, e);
    }
  }

  String[] headers;
  String headersProperty = configurationProperties.getProperty(BatchConstants.INPUT_RECORD_HEADERS);
  if (headersProperty == null) {
    // If no headers are specified, use the field names declared in the header record.
    String headerRecord = recordReader.getHeaderRecord();
    // Use the record parser to parse the header record with the right delimiter.
    Record record = recordParser.parseRecord(headerRecord, 0);
    List<Field> fields = record.getFields();
    headers = new String[fields.size()];
    for (int i = 0; i < fields.size(); i++) {
      headers[i] = fields.get(i).getContent();
    }
  } else {
    // Headers specified: split the comma-separated list.
    headers = headersProperty.split(",");
  }

  try {
    recordMapper = new DefaultRecordMapperImpl(recordClassName, headers, typeConverters);
  } catch (ClassNotFoundException e) {
    String error = "Configuration failed : Class " + recordClassName + " not found.";
    logger.severe(error);
    throw new BatchConfigurationException(error, e);
  }

  return recordMapper;
}
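// A hypothetical processor illustrating the inference above: when
// INPUT_RECORD_CLASS is not set, the first parameter type of processRecord
// (the assumed Person class below) supplies the record class name via
// reflection. Both class names here are illustrative.
public class PersonProcessor {
  public void processRecord(Person person) {
    // configureDefaultRecordMapper() would pick up the fully qualified name of
    // Person as recordClassName from this method's first parameter type.
  }
}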
public void testFormat() throws Exception {
  JobConf job = new JobConf(conf);
  FileSystem fs = FileSystem.getLocal(conf);
  Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
  Path file = new Path(dir, "test.seq");

  Reporter reporter = Reporter.NULL;

  int seed = new Random().nextInt();
  // LOG.info("seed = " + seed);
  Random random = new Random(seed);

  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);

  // For a variety of lengths...
  for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
    // LOG.info("creating; entries = " + length);

    // Create a file with 'length' entries.
    SequenceFile.Writer writer =
        SequenceFile.createWriter(fs, conf, file, IntWritable.class, BytesWritable.class);
    try {
      for (int i = 0; i < length; i++) {
        IntWritable key = new IntWritable(i);
        byte[] data = new byte[random.nextInt(10)];
        random.nextBytes(data);
        BytesWritable value = new BytesWritable(data);
        writer.append(key, value);
      }
    } finally {
      writer.close();
    }

    // Try splitting the file in a variety of sizes.
    InputFormat<IntWritable, BytesWritable> format =
        new SequenceFileInputFormat<IntWritable, BytesWritable>();
    IntWritable key = new IntWritable();
    BytesWritable value = new BytesWritable();
    for (int i = 0; i < 3; i++) {
      int numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
      // LOG.info("splitting: requesting = " + numSplits);
      InputSplit[] splits = format.getSplits(job, numSplits);
      // LOG.info("splitting: got = " + splits.length);

      // Check each split.
      BitSet bits = new BitSet(length);
      for (int j = 0; j < splits.length; j++) {
        RecordReader<IntWritable, BytesWritable> reader =
            format.getRecordReader(splits[j], job, reporter);
        try {
          int count = 0;
          while (reader.next(key, value)) {
            // if (bits.get(key.get())) {
            //   LOG.info("splits[" + j + "]=" + splits[j] + " : " + key.get());
            //   LOG.info("@" + reader.getPos());
            // }
            assertFalse("Key in multiple partitions.", bits.get(key.get()));
            bits.set(key.get());
            count++;
          }
          // LOG.info("splits[" + j + "]=" + splits[j] + " count=" + count);
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
  }
}
// This method is just for experimentation.
public void testRead() throws Exception {
  m_reader = OrcFile.createReader(m_file_path, OrcFile.readerOptions(m_conf));
  System.out.println("Reader: " + m_reader);

  System.out.println("# Rows: " + m_reader.getNumberOfRows());
  m_types = m_reader.getTypes();
  System.out.println("# Types in the file: " + m_types.size());
  for (int i = 0; i < m_types.size(); i++) {
    System.out.println("Type " + i + ": " + m_types.get(i).getKind());
  }

  System.out.println("Compression: " + m_reader.getCompression());
  if (m_reader.getCompression() != CompressionKind.NONE) {
    System.out.println("Compression size: " + m_reader.getCompressionSize());
  }

  StructObjectInspector m_oi = (StructObjectInspector) m_reader.getObjectInspector();
  System.out.println("object inspector type category: " + m_oi.getCategory());
  System.out.println("object inspector type name    : " + m_oi.getTypeName());

  m_fields = m_oi.getAllStructFieldRefs();
  System.out.println("Number of columns in the table: " + m_fields.size());

  RecordReader m_rr = m_reader.rows();

  // Print the type info:
  for (int i = 0; i < m_fields.size(); i++) {
    System.out.println("Column " + i + " name: " + m_fields.get(i).getFieldName());
    ObjectInspector lv_foi = m_fields.get(i).getFieldObjectInspector();
    System.out.println("Column " + i + " type category: " + lv_foi.getCategory());
    System.out.println("Column " + i + " type name: " + lv_foi.getTypeName());
    // Object lv_column_val = m_oi.getStructFieldData(lv_row, m_fields.get(i));
    // System.out.print("Column " + i + " value: " + lv_row.getFieldValue(i));
  }

  OrcStruct lv_row = null;
  Object lv_field_val = null;
  StringBuilder lv_row_string = new StringBuilder(1024);
  while (m_rr.hasNext()) {
    lv_row = (OrcStruct) m_rr.next(lv_row);
    lv_row_string.setLength(0);
    for (int i = 0; i < m_fields.size(); i++) {
      lv_field_val = lv_row.getFieldValue(i);
      if (lv_field_val != null) {
        lv_row_string.append(lv_field_val);
      }
      lv_row_string.append('|');
    }
    System.out.println(lv_row_string);
  }

  /*
   * Typecasting to the appropriate type based on the 'kind':
   * if (OrcProto.Type.Kind.INT == m_types.get(1).getKind()) {
   *   IntWritable lvf_1_val = (IntWritable) lv_row.getFieldValue(0);
   *   System.out.println("Column 1 value: " + lvf_1_val);
   * }
   */
}
/**
 * Configure CB4J record parser.
 *
 * @throws BatchConfigurationException thrown if record parser is not correctly configured
 */
private void configureRecordParser() throws BatchConfigurationException {
  // Read the record type property and fall back to the default value on invalid input.
  String recordTypeProperty = configurationProperties.getProperty(BatchConstants.INPUT_RECORD_TYPE);
  String recordType;
  if (recordTypeProperty == null || recordTypeProperty.length() == 0) {
    recordType = BatchConstants.DEFAULT_RECORD_TYPE;
    logger.info(
        "Record type property not specified, records will be considered as delimiter-separated values");
  } else if (!RecordType.DSV.toString().equalsIgnoreCase(recordTypeProperty)
      && !RecordType.FLR.toString().equalsIgnoreCase(recordTypeProperty)) {
    recordType = BatchConstants.DEFAULT_RECORD_TYPE;
    logger.warning(
        "Record type property '"
            + recordTypeProperty
            + "' is invalid, records will be considered as delimiter-separated values");
  } else {
    recordType = recordTypeProperty;
  }

  // Fixed-length record configuration.
  if (RecordType.FLR.toString().equalsIgnoreCase(recordType)) {
    String fieldsLengthProperties =
        configurationProperties.getProperty(BatchConstants.INPUT_FIELD_LENGTHS);
    if (fieldsLengthProperties == null || fieldsLengthProperties.length() == 0) {
      String error =
          "Configuration failed : when using fixed length records, fields length values property '"
              + BatchConstants.INPUT_FIELD_LENGTHS
              + "' is mandatory but was not specified.";
      logger.severe(error);
      throw new BatchConfigurationException(error);
    } else {
      // Parse the fields length property and extract numeric values.
      StringTokenizer stringTokenizer = new StringTokenizer(fieldsLengthProperties, ",");
      int[] fieldsLength = new int[stringTokenizer.countTokens()];
      int index = 0;
      while (stringTokenizer.hasMoreTokens()) {
        String length = stringTokenizer.nextToken();
        try {
          fieldsLength[index] = Integer.parseInt(length);
          index++;
        } catch (NumberFormatException e) {
          String error =
              "Configuration failed : field length '"
                  + length
                  + "' in property "
                  + BatchConstants.INPUT_FIELD_LENGTHS
                  + "="
                  + fieldsLengthProperties
                  + " is not numeric.";
          logger.severe(error);
          throw new BatchConfigurationException(error);
        }
      }
      recordParser = new FlrRecordParserImpl(fieldsLength);
    }
  } else {
    // Delimiter-separated values configuration.
    String recordSizeProperty = configurationProperties.getProperty(BatchConstants.INPUT_RECORD_SIZE);
    try {
      String fieldsDelimiter = configurationProperties.getProperty(BatchConstants.INPUT_FIELD_DELIMITER);
      if (fieldsDelimiter == null || fieldsDelimiter.length() == 0) {
        fieldsDelimiter = BatchConstants.DEFAULT_FIELD_DELIMITER;
        logger.info("No field delimiter specified, using default : '" + fieldsDelimiter + "'");
      }

      String trimWhitespacesProperty = configurationProperties.getProperty(BatchConstants.INPUT_FIELD_TRIM);
      boolean trimWhitespaces;
      if (trimWhitespacesProperty != null) {
        trimWhitespaces = Boolean.valueOf(trimWhitespacesProperty);
      } else {
        trimWhitespaces = BatchConstants.DEFAULT_FIELD_TRIM;
        logger.info("Trim whitespaces property not specified, default to " + trimWhitespaces);
      }

      String dataQualifierCharacterProperty =
          configurationProperties.getProperty(BatchConstants.INPUT_FIELD_QUALIFIER_CHAR);
      String dataQualifierCharacter = BatchConstants.DEFAULT_FIELD_QUALIFIER_CHAR;
      if (dataQualifierCharacterProperty != null && dataQualifierCharacterProperty.length() > 0) {
        dataQualifierCharacter = dataQualifierCharacterProperty;
      } else {
        logger.info("Data qualifier character not specified, default to " + dataQualifierCharacter);
      }

      recordParser = new DsvRecordParserImpl(fieldsDelimiter, trimWhitespaces, dataQualifierCharacter);

      if (recordSizeProperty == null || recordSizeProperty.length() == 0) {
        logger.info("Record size property not specified, it will be calculated from the header record");
        String headerRecord = recordReader.getHeaderRecord();
        // Use the record parser to parse the header record with the right delimiter.
        Record record = recordParser.parseRecord(headerRecord, 0);
        recordSizeProperty = String.valueOf(record.getFields().size());
      }

      int recordSize = Integer.parseInt(recordSizeProperty);
      recordParser =
          new DsvRecordParserImpl(recordSize, fieldsDelimiter, trimWhitespaces, dataQualifierCharacter);

      logger.info("Record size : " + recordSize);
      logger.info("Fields delimiter : '" + fieldsDelimiter + "'");
      logger.info("Data qualifier character : '" + dataQualifierCharacter + "'");
    } catch (NumberFormatException e) {
      String error = "Record size property is not recognized as a number : " + recordSizeProperty;
      logger.severe(error);
      throw new BatchConfigurationException(error);
    }
  }
}
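// A sketch of the properties configureRecordParser() consumes, using the
// BatchConstants keys referenced above. The values are illustrative and assume
// the RecordType enum names serialize as "DSV" and "FLR".
Properties configurationProperties = new Properties();
// Delimiter-separated values:
configurationProperties.setProperty(BatchConstants.INPUT_RECORD_TYPE, "DSV");
configurationProperties.setProperty(BatchConstants.INPUT_FIELD_DELIMITER, ";");
configurationProperties.setProperty(BatchConstants.INPUT_FIELD_TRIM, "true");
// Or fixed-length records, where the field lengths property is mandatory:
// configurationProperties.setProperty(BatchConstants.INPUT_RECORD_TYPE, "FLR");
// configurationProperties.setProperty(BatchConstants.INPUT_FIELD_LENGTHS, "10,20,8");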