// Dumps the contents of the file as length-prefixed fields in a ByteBuffer:
// for each column, a 4-byte length followed by the field's bytes (0 for null).
public void readFile_ByteBuffer() throws Exception {
  OrcStruct lv_row = null;
  Object lv_field_val = null;
  ByteBuffer lv_row_buffer;

  while (m_rr.hasNext()) {
    byte[] lv_row_ba = new byte[4096]; // assumes a serialized row fits in 4 KB
    lv_row_buffer = ByteBuffer.wrap(lv_row_ba);
    lv_row = (OrcStruct) m_rr.next(lv_row);
    for (int i = 0; i < m_fields.size(); i++) {
      lv_field_val = lv_row.getFieldValue(i);
      if (lv_field_val == null) {
        lv_row_buffer.putInt(0);
        continue;
      }
      // Write the byte length (not the character count, which differs from the
      // byte count for multi-byte characters), then the bytes themselves.
      byte[] lv_field_ba = lv_field_val.toString().getBytes();
      lv_row_buffer.putInt(lv_field_ba.length);
      lv_row_buffer.put(lv_field_ba);
    }
    System.out.println(lv_row_buffer);
    // System.out.println(new String(lv_row_buffer.array()));
  }
}
// Returns the next row serialized as length-prefixed fields (same layout as
// readFile_ByteBuffer above), or null when the reader is exhausted.
public byte[] getNext() throws Exception {
  if (!m_rr.hasNext()) {
    return null;
  }

  OrcStruct lv_row = (OrcStruct) m_rr.next(null);
  Object lv_field_val = null;
  byte[] lv_row_ba = new byte[4096]; // assumes a serialized row fits in 4 KB
  ByteBuffer lv_row_buffer = ByteBuffer.wrap(lv_row_ba);

  for (int i = 0; i < m_fields.size(); i++) {
    lv_field_val = lv_row.getFieldValue(i);
    if (lv_field_val == null) {
      lv_row_buffer.putInt(0);
      continue;
    }
    byte[] lv_field_ba = lv_field_val.toString().getBytes();
    lv_row_buffer.putInt(lv_field_ba.length);
    lv_row_buffer.put(lv_field_ba);
  }
  System.out.println(lv_row_buffer);
  return lv_row_buffer.array();
}
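// A minimal decoding sketch, not part of the original class: it reverses the
// length-prefixed layout produced by getNext() above. Assumes java.util.List
// and java.util.ArrayList imports, and that the caller knows the field count
// (e.g., m_fields.size()). Note the format cannot distinguish a null field
// from an empty string, since both are written as length 0.
static List<String> decodeRow(byte[] pv_row_ba, int pv_num_fields) {
  ByteBuffer lv_buffer = ByteBuffer.wrap(pv_row_ba);
  List<String> lv_fields = new ArrayList<String>(pv_num_fields);
  for (int i = 0; i < pv_num_fields; i++) {
    int lv_len = lv_buffer.getInt();
    if (lv_len == 0) {
      lv_fields.add(null); // or "" -- the encoding is ambiguous here
      continue;
    }
    byte[] lv_field_ba = new byte[lv_len];
    lv_buffer.get(lv_field_ba);
    lv_fields.add(new String(lv_field_ba));
  }
  return lv_fields;
}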
@Override
@SuppressWarnings("unchecked")
public void run(final JobConf job, final TaskUmbilicalProtocol umbilical) throws IOException {
  final Reporter reporter = getReporter(umbilical);

  // start thread that will handle communication with parent
  startCommunicationThread(umbilical);

  int numReduceTasks = conf.getNumReduceTasks();
  LOG.info("numReduceTasks: " + numReduceTasks);
  MapOutputCollector collector = null;
  if (numReduceTasks > 0) {
    collector = new MapOutputBuffer(umbilical, job, reporter);
  } else {
    collector = new DirectMapOutputCollector(umbilical, job, reporter);
  }

  // reinstantiate the split
  try {
    instantiatedSplit = (InputSplit) ReflectionUtils.newInstance(job.getClassByName(splitClass), job);
  } catch (ClassNotFoundException exp) {
    IOException wrap = new IOException("Split class " + splitClass + " not found");
    wrap.initCause(exp);
    throw wrap;
  }
  DataInputBuffer splitBuffer = new DataInputBuffer();
  splitBuffer.reset(split.get(), 0, split.getSize());
  instantiatedSplit.readFields(splitBuffer);

  // if it is a file split, we can give more details
  if (instantiatedSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) instantiatedSplit;
    job.set("map.input.file", fileSplit.getPath().toString());
    job.setLong("map.input.start", fileSplit.getStart());
    job.setLong("map.input.length", fileSplit.getLength());
  }

  // open input
  RecordReader rawIn = job.getInputFormat().getRecordReader(instantiatedSplit, job, reporter);
  RecordReader in = new TrackedRecordReader(rawIn, getCounters());

  MapRunnable runner = (MapRunnable) ReflectionUtils.newInstance(job.getMapRunnerClass(), job);

  try {
    runner.run(in, collector, reporter);
    collector.flush();
  } finally {
    // close
    in.close(); // close input
    collector.close();
  }
  done(umbilical);
}
public void testInputFormat() {
  try {
    JobConf conf = new JobConf();
    String TMP_DIR = System.getProperty("test.build.data", "/tmp");
    Path filename = new Path("file:///" + TMP_DIR + "/tmpSeqFile");
    SequenceFile.Writer sfw = SequenceFile.createWriter(
        FileSystem.getLocal(conf), conf, filename,
        ChukwaArchiveKey.class, ChunkImpl.class,
        SequenceFile.CompressionType.NONE, Reporter.NULL);

    StringBuilder buf = new StringBuilder();
    int offsets[] = new int[lines.length];
    for (int i = 0; i < lines.length; ++i) {
      buf.append(lines[i]);
      buf.append("\n");
      offsets[i] = buf.length() - 1;
    }
    ChukwaArchiveKey key = new ChukwaArchiveKey(0, "datatype", "sname", 0);
    ChunkImpl val = new ChunkImpl("datatype", "sname", 0, buf.toString().getBytes(), null);
    val.setRecordOffsets(offsets);
    sfw.append(key, val);
    sfw.append(key, val); // write it twice
    sfw.close();

    long len = FileSystem.getLocal(conf).getFileStatus(filename).getLen();
    InputSplit split = new FileSplit(filename, 0, len, (String[]) null);
    ChukwaInputFormat in = new ChukwaInputFormat();
    RecordReader<LongWritable, Text> r = in.getRecordReader(split, conf, Reporter.NULL);

    LongWritable l = r.createKey();
    Text line = r.createValue();
    for (int i = 0; i < lines.length * 2; ++i) {
      boolean succeeded = r.next(l, line);
      assertTrue(succeeded);
      assertEquals(i, l.get());
      assertEquals(lines[i % lines.length], line.toString());
      System.out.println("read line: " + l.get() + " " + line);
    }
    boolean succeeded = r.next(l, line);
    assertFalse(succeeded);
  } catch (IOException e) {
    e.printStackTrace();
    fail("IO exception " + e);
  }
}
@Override
public String[] next() {
  if (!hasNext()) {
    throw new NoSuchElementException();
  }
  return reader.next();
}
private void handle(final Reader in, final Writer out, final RecordProcessor processor)
    throws Exception {
  final RecordReader reader = new RecordReader(in);
  final OutputStreamOutput output = new OutputStreamOutput(out);
  try {
    while (reader.hasNext()) {
      processor.processNext(reader, output);
    }
  } finally {
    try {
      output.close();
    } finally {
      reader.close();
    }
  }
}
/** Return progress based on the amount of data processed so far. */
public float getProgress() throws IOException, InterruptedException {
  long subprogress = 0; // bytes processed in current split
  if (null != curReader) {
    // idx is always one past the current subsplit's true index.
    subprogress = (long) (curReader.getProgress() * split.getLength(idx - 1));
  }
  return Math.min(1.0f, (progress + subprogress) / (float) (split.getLength()));
}
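// Worked example with illustrative numbers: for a CombineFileSplit holding two
// chunks of 100 and 300 bytes, finishing chunk 0 leaves progress == 100 (see
// initNextRecordReader below); if the current reader then reports being halfway
// through chunk 1, subprogress == 150, so
// getProgress() == min(1.0, (100 + 150) / 400f) == 0.625.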
public void run() {
  run = true;
  System.out.println("Running a map");
  while (run) {
    // Read the input key/value pairs; a null result means the read failed.
    Map<String, List<String>> input = reader.getKeyValuePairs();
    if (input == null) {
      map.setStatus(-1);
      Message msg = new Message();
      msg.setTask(map);
      msg.setType('f');
      coord.conn.sendMessage(msg);
      return;
    }

    // Run the user's map function over every key/value pair.
    OutputCollector<String, String> collect = new OutputCollector<String, String>();
    Set<String> keySet = input.keySet();
    for (String key : keySet) {
      List<String> values = input.get(key);
      for (String value : values) {
        map.getJob().map(key, value, collect);
      }
    }

    // Write the collected results to the map's output file and register the
    // file with the DFS; report failure to the coordinator on any error.
    List<Pair> results = collect.getResults();
    FileOutputStream out;
    try {
      System.out.println("Trying to Write to File");
      out = new FileOutputStream(new File(map.getOutput().get(0)));
      BufferedWriter dw = new BufferedWriter(new OutputStreamWriter(out));
      for (Pair p : results) {
        dw.append(p.toString());
        dw.newLine();
      }
      dw.newLine();
      dw.flush();
      dw.close();
      out.close();
      coord.dataNode.addFileToDFS(map.getOutput().get(0), coord.conn.port, false);
    } catch (Exception e) {
      e.printStackTrace();
      map.setStatus(-1);
      Message msg = new Message();
      msg.setTask(map);
      msg.setType('f');
      coord.conn.sendMessage(msg);
      return;
    }

    // Report success to the coordinator.
    map.setStatus(1);
    Message msg = new Message();
    msg.setTask(map);
    msg.setType('f');
    coord.conn.sendMessage(msg);
    break; // single pass: the loop exists only to allow the early exits above
  }
}
public boolean nextKeyValue() throws IOException, InterruptedException {
  while ((curReader == null) || !curReader.nextKeyValue()) {
    if (!initNextRecordReader()) {
      return false;
    }
  }
  return true;
}
// Dumps the content of the file. The columns are '|' separated.
public void readFile_String() throws Exception {
  OrcStruct lv_row = null;
  Object lv_field_val = null;
  StringBuilder lv_row_string = new StringBuilder(1024);
  while (m_rr.hasNext()) {
    lv_row = (OrcStruct) m_rr.next(lv_row);
    lv_row_string.setLength(0);
    for (int i = 0; i < m_fields.size(); i++) {
      lv_field_val = lv_row.getFieldValue(i);
      if (lv_field_val != null) {
        lv_row_string.append(lv_field_val);
      }
      lv_row_string.append('|');
    }
    System.out.println(lv_row_string);
  }
}
// Positions the reader at the given row; returns false if the row number is
// out of range.
public boolean seekToRow(long pv_rowNumber) throws IOException {
  if ((pv_rowNumber < 0) || (pv_rowNumber >= m_reader.getNumberOfRows())) {
    return false;
  }
  m_rr.seekToRow(pv_rowNumber);
  return true;
}
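// Hypothetical usage sketch combining the two members above (the row number is
// chosen arbitrarily): jump to row 1000 and dump everything from there on.
if (seekToRow(1000)) {
  readFile_String();
}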
// Delegates to the wrapped reader, updating the record and byte counters as a
// side effect of each successful read.
public synchronized boolean next(K key, V value) throws IOException {
  setProgress(getProgress());
  long beforePos = getPos();
  boolean ret = rawIn.next(key, value);
  if (ret) {
    inputRecordCounter.increment(1);
    inputByteCounter.increment(getPos() - beforePos);
  }
  return ret;
}
void next(OrcStruct next) throws IOException {
  if (recordReader.hasNext()) {
    nextRecord = (OrcStruct) recordReader.next(next);
    // set the key
    key.setValues(
        OrcRecordUpdater.getOriginalTransaction(nextRecord),
        OrcRecordUpdater.getBucket(nextRecord),
        OrcRecordUpdater.getRowId(nextRecord),
        OrcRecordUpdater.getCurrentTransaction(nextRecord));

    // if this record is larger than maxKey, we need to stop
    if (maxKey != null && key.compareRow(maxKey) > 0) {
      LOG.debug("key " + key + " > maxkey " + maxKey);
      nextRecord = null;
      recordReader.close();
    }
  } else {
    nextRecord = null;
    recordReader.close();
  }
}
void checkFormat(Job job) throws Exception {
  TaskAttemptContext attemptContext =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

  MyClassMessagePackBase64LineInputFormat format = new MyClassMessagePackBase64LineInputFormat();
  FileInputFormat.setInputPaths(job, workDir);

  List<InputSplit> splits = format.getSplits(job);
  for (int j = 0; j < splits.size(); j++) {
    RecordReader<LongWritable, MyClassWritable> reader =
        format.createRecordReader(splits.get(j), attemptContext);
    reader.initialize(splits.get(j), attemptContext);

    int count = 0;
    try {
      while (reader.nextKeyValue()) {
        LongWritable key = reader.getCurrentKey();
        MyClassWritable val = reader.getCurrentValue();
        MyClass mc = val.get();
        assertEquals(mc.v, count);
        assertEquals(mc.s, Integer.toString(count));
        count++;
      }
    } finally {
      reader.close();
    }
  }
}
/**
 * Configure the default record mapper.
 *
 * @return the default implementation of record mapper
 * @throws BatchConfigurationException if the target type class is not found
 */
private RecordMapper configureDefaultRecordMapper() throws BatchConfigurationException {
  RecordMapper recordMapper;

  // If no record class is configured, infer it from the parameter type of the
  // registered record processor's processRecord method.
  String recordClassName = configurationProperties.getProperty(BatchConstants.INPUT_RECORD_CLASS);
  if (recordClassName == null || recordClassName.length() == 0) {
    try {
      Class recordProcessorClass = Class.forName(recordProcessor.getClass().getName());
      Method[] declaredMethods = recordProcessorClass.getDeclaredMethods();
      for (Method declaredMethod : declaredMethods) {
        if (declaredMethod.getName().equals("processRecord")) {
          recordClassName = declaredMethod.getParameterTypes()[0].getName();
          break;
        }
      }
    } catch (ClassNotFoundException e) {
      String error = "Configuration failed : unable to get record class name from registered record processor implementation.";
      logger.severe(error);
      throw new BatchConfigurationException(error, e);
    }
  }

  String[] headers;
  String headersProperty = configurationProperties.getProperty(BatchConstants.INPUT_RECORD_HEADERS);
  if (headersProperty == null) {
    // if no headers are specified, use the field names declared in the header record;
    // the record parser parses the header record using the right delimiter
    String headerRecord = recordReader.getHeaderRecord();
    Record record = recordParser.parseRecord(headerRecord, 0);
    List<Field> fields = record.getFields();
    headers = new String[fields.size()];
    for (int i = 0; i < fields.size(); i++) {
      headers[i] = fields.get(i).getContent();
    }
  } else {
    // headers specified, split the comma-separated list
    headers = headersProperty.split(",");
  }

  try {
    recordMapper = new DefaultRecordMapperImpl(recordClassName, headers, typeConverters);
  } catch (ClassNotFoundException e) {
    String error = "Configuration failed : Class " + recordClassName + " not found.";
    logger.severe(error);
    throw new BatchConfigurationException(error, e);
  }
  return recordMapper;
}
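// Illustrative configuration sketch: the property keys come from the
// BatchConstants referenced above, but the class name and header values here
// are made up. Either let the mapper infer headers from the file's header
// record (by omitting the second property), or list them explicitly:
configurationProperties.setProperty(BatchConstants.INPUT_RECORD_CLASS, "com.example.Person");
configurationProperties.setProperty(BatchConstants.INPUT_RECORD_HEADERS, "firstName,lastName,age");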
/** Get the record reader for the next chunk in this CombineFileSplit. */
protected boolean initNextRecordReader() throws IOException {
  if (curReader != null) {
    curReader.close();
    curReader = null;
    if (idx > 0) {
      progress += split.getLength(idx - 1); // done processing so far
    }
  }

  // if all chunks have been processed, nothing more to do.
  if (idx == split.getNumPaths()) {
    return false;
  }

  // get a record reader for the idx-th chunk
  try {
    Configuration conf = context.getConfiguration();
    // setup some helper config variables.
    conf.set(MRJobConfig.MAP_INPUT_FILE, split.getPath(idx).toString());
    conf.setLong(MRJobConfig.MAP_INPUT_START, split.getOffset(idx));
    conf.setLong(MRJobConfig.MAP_INPUT_PATH, split.getLength(idx));

    curReader = rrConstructor.newInstance(new Object[] {split, context, Integer.valueOf(idx)});

    if (idx > 0) {
      // initialize() for the first RecordReader will be called by MapTask;
      // we're responsible for initializing subsequent RecordReaders.
      curReader.initialize(split, context);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  idx++;
  return true;
}
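// Hypothetical wiring sketch: an input format hands each combined split to the
// chaining reader above. ChunkRecordReader is a made-up per-chunk reader class;
// the only real contract (enforced through rrConstructor) is that it exposes a
// (CombineFileSplit, TaskAttemptContext, Integer) constructor.
public class SketchInputFormat extends CombineFileInputFormat<LongWritable, Text> {
  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException {
    return new CombineFileRecordReader<LongWritable, Text>(
        (CombineFileSplit) split, context, ChunkRecordReader.class);
  }
}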
private static List<Text> readSplit(TextInputFormat format, InputSplit split, JobConf job)
    throws IOException {
  List<Text> result = new ArrayList<Text>();
  RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, voidReporter);
  LongWritable key = reader.createKey();
  Text value = reader.createValue();
  while (reader.next(key, value)) {
    result.add(value);
    value = reader.createValue();
  }
  reader.close();
  return result;
}
@Test
public void testFormat() throws Exception {
  JobConf job = new JobConf(defaultConf);
  Path file = new Path(workDir, "test.txt");

  // A reporter that does nothing
  Reporter reporter = Reporter.NULL;

  int seed = new Random().nextInt();
  LOG.info("seed = " + seed);
  Random random = new Random(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
    LOG.debug("creating; entries = " + length);

    // create a file with length entries
    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
      for (int i = 0; i < length; i++) {
        writer.write(Integer.toString(i));
        writer.write("\n");
      }
    } finally {
      writer.close();
    }

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();
    for (int i = 0; i < 3; i++) {
      int numSplits = random.nextInt(MAX_LENGTH / 20) + 1;
      LOG.debug("splitting: requesting = " + numSplits);
      InputSplit[] splits = format.getSplits(job, numSplits);
      LOG.debug("splitting: got = " + splits.length);
      if (length == 0) {
        assertEquals(
            "Files of length 0 are not returned from FileInputFormat.getSplits().",
            1, splits.length);
        assertEquals("Empty file length == 0", 0, splits[0].getLength());
      }

      // check each split
      BitSet bits = new BitSet(length);
      for (int j = 0; j < splits.length; j++) {
        LOG.debug("split[" + j + "]= " + splits[j]);
        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter);
        try {
          int count = 0;
          while (reader.next(key, value)) {
            int v = Integer.parseInt(value.toString());
            LOG.debug("read " + v);
            if (bits.get(v)) {
              LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());
            }
            assertFalse("Key in multiple partitions.", bits.get(v));
            bits.set(v);
            count++;
          }
          LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
  }
}
@Test
public void testSplitableCodecs() throws IOException {
  JobConf conf = new JobConf(defaultConf);
  int seed = new Random().nextInt();

  // Create the codec
  CompressionCodec codec = null;
  try {
    codec = (CompressionCodec) ReflectionUtils.newInstance(
        conf.getClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
  } catch (ClassNotFoundException cnfe) {
    throw new IOException("Illegal codec!");
  }
  Path file = new Path(workDir, "test" + codec.getDefaultExtension());

  // A reporter that does nothing
  Reporter reporter = Reporter.NULL;
  LOG.info("seed = " + seed);
  Random random = new Random(seed);
  FileSystem localFs = FileSystem.getLocal(conf);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(conf, workDir);

  final int MAX_LENGTH = 500000;

  // for a variety of lengths
  for (int length = MAX_LENGTH / 2;
      length < MAX_LENGTH;
      length += random.nextInt(MAX_LENGTH / 4) + 1) {
    LOG.info("creating; entries = " + length);

    // create a file with length entries
    Writer writer = new OutputStreamWriter(codec.createOutputStream(localFs.create(file)));
    try {
      for (int i = 0; i < length; i++) {
        writer.write(Integer.toString(i));
        writer.write("\n");
      }
    } finally {
      writer.close();
    }

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(conf);
    LongWritable key = new LongWritable();
    Text value = new Text();
    for (int i = 0; i < 3; i++) {
      int numSplits = random.nextInt(MAX_LENGTH / 2000) + 1;
      LOG.info("splitting: requesting = " + numSplits);
      InputSplit[] splits = format.getSplits(conf, numSplits);
      LOG.info("splitting: got = " + splits.length);

      // check each split
      BitSet bits = new BitSet(length);
      for (int j = 0; j < splits.length; j++) {
        LOG.debug("split[" + j + "]= " + splits[j]);
        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], conf, reporter);
        try {
          int counter = 0;
          while (reader.next(key, value)) {
            int v = Integer.parseInt(value.toString());
            LOG.debug("read " + v);
            if (bits.get(v)) {
              LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());
            }
            assertFalse("Key in multiple partitions.", bits.get(v));
            bits.set(v);
            counter++;
          }
          if (counter > 0) {
            LOG.info("splits[" + j + "]=" + splits[j] + " count=" + counter);
          } else {
            LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + counter);
          }
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
  }
}
public void testFormat() throws Exception {
  JobConf job = new JobConf(conf);
  FileSystem fs = FileSystem.getLocal(conf);
  Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
  Path file = new Path(dir, "test.seq");

  Reporter reporter = Reporter.NULL;

  int seed = new Random().nextInt();
  // LOG.info("seed = " + seed);
  Random random = new Random(seed);

  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);

  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
    // LOG.info("creating; entries = " + length);

    // create a file with length entries
    SequenceFile.Writer writer =
        SequenceFile.createWriter(fs, conf, file, IntWritable.class, BytesWritable.class);
    try {
      for (int i = 0; i < length; i++) {
        IntWritable key = new IntWritable(i);
        byte[] data = new byte[random.nextInt(10)];
        random.nextBytes(data);
        BytesWritable value = new BytesWritable(data);
        writer.append(key, value);
      }
    } finally {
      writer.close();
    }

    // try splitting the file in a variety of sizes
    InputFormat<IntWritable, BytesWritable> format =
        new SequenceFileInputFormat<IntWritable, BytesWritable>();
    IntWritable key = new IntWritable();
    BytesWritable value = new BytesWritable();
    for (int i = 0; i < 3; i++) {
      int numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
      // LOG.info("splitting: requesting = " + numSplits);
      InputSplit[] splits = format.getSplits(job, numSplits);
      // LOG.info("splitting: got = " + splits.length);

      // check each split
      BitSet bits = new BitSet(length);
      for (int j = 0; j < splits.length; j++) {
        RecordReader<IntWritable, BytesWritable> reader =
            format.getRecordReader(splits[j], job, reporter);
        try {
          int count = 0;
          while (reader.next(key, value)) {
            // if (bits.get(key.get())) {
            //   LOG.info("splits[" + j + "]=" + splits[j] + " : " + key.get());
            //   LOG.info("@" + reader.getPos());
            // }
            assertFalse("Key in multiple partitions.", bits.get(key.get()));
            bits.set(key.get());
            count++;
          }
          // LOG.info("splits[" + j + "]=" + splits[j] + " count=" + count);
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
  }
}
public float getProgress() throws IOException {
  return rawIn.getProgress();
}

public void close() throws IOException {
  rawIn.close();
}

public long getPos() throws IOException {
  return rawIn.getPos();
}
// This method is just for experimentation.
public void testRead() throws Exception {
  m_reader = OrcFile.createReader(m_file_path, OrcFile.readerOptions(m_conf));
  System.out.println("Reader: " + m_reader);

  System.out.println("# Rows: " + m_reader.getNumberOfRows());
  m_types = m_reader.getTypes();
  System.out.println("# Types in the file: " + m_types.size());
  for (int i = 0; i < m_types.size(); i++) {
    System.out.println("Type " + i + ": " + m_types.get(i).getKind());
  }

  System.out.println("Compression: " + m_reader.getCompression());
  if (m_reader.getCompression() != CompressionKind.NONE) {
    System.out.println("Compression size: " + m_reader.getCompressionSize());
  }

  StructObjectInspector m_oi = (StructObjectInspector) m_reader.getObjectInspector();
  System.out.println("object inspector type category: " + m_oi.getCategory());
  System.out.println("object inspector type name: " + m_oi.getTypeName());

  m_fields = m_oi.getAllStructFieldRefs();
  System.out.println("Number of columns in the table: " + m_fields.size());

  RecordReader m_rr = m_reader.rows();

  // Print the type info:
  for (int i = 0; i < m_fields.size(); i++) {
    System.out.println("Column " + i + " name: " + m_fields.get(i).getFieldName());
    ObjectInspector lv_foi = m_fields.get(i).getFieldObjectInspector();
    System.out.println("Column " + i + " type category: " + lv_foi.getCategory());
    System.out.println("Column " + i + " type name: " + lv_foi.getTypeName());
    // Object lv_column_val = m_oi.getStructFieldData(lv_row, m_fields.get(i));
    // System.out.print("Column " + i + " value: " + lv_row.getFieldValue(i));
  }

  OrcStruct lv_row = null;
  Object lv_field_val = null;
  StringBuilder lv_row_string = new StringBuilder(1024);
  while (m_rr.hasNext()) {
    lv_row = (OrcStruct) m_rr.next(lv_row);
    lv_row_string.setLength(0);
    for (int i = 0; i < m_fields.size(); i++) {
      lv_field_val = lv_row.getFieldValue(i);
      if (lv_field_val != null) {
        lv_row_string.append(lv_field_val);
      }
      lv_row_string.append('|');
    }
    System.out.println(lv_row_string);
  }

  /* Typecasting to the appropriate type based on the 'kind':
  if (OrcProto.Type.Kind.INT == m_types.get(1).getKind()) {
    IntWritable lvf_1_val = (IntWritable) lv_row.getFieldValue(0);
    System.out.println("Column 1 value: " + lvf_1_val);
  }
  */
}
public V createValue() {
  return rawIn.createValue();
}

public K createKey() {
  return rawIn.createKey();
}

public K getCurrentKey() throws IOException, InterruptedException {
  return curReader.getCurrentKey();
}
@Override
public boolean hasNext() {
  // There is a next row for this group only while the underlying reader has
  // more rows and the next row's first column still matches the group key.
  return (reader.hasNext() && key.equals(reader.peek()[0]));
}
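// Sketch of how this peeking hasNext()/next() pair is typically consumed
// (groupIterator is a hypothetical instance of the wrapper these two methods
// belong to): the loop yields rows only while the first column still equals
// the current group key.
while (groupIterator.hasNext()) {
  String[] row = groupIterator.next();
  // ... handle one row of the current key's group
}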
public V getCurrentValue() throws IOException, InterruptedException {
  return curReader.getCurrentValue();
}

public void close() throws IOException {
  if (curReader != null) {
    curReader.close();
    curReader = null;
  }
}