void checkFormat(Job job) throws Exception {
  TaskAttemptContext attemptContext =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

  MyClassMessagePackBase64LineInputFormat format = new MyClassMessagePackBase64LineInputFormat();
  FileInputFormat.setInputPaths(job, workDir);

  List<InputSplit> splits = format.getSplits(job);
  for (int j = 0; j < splits.size(); j++) {
    RecordReader<LongWritable, MyClassWritable> reader =
        format.createRecordReader(splits.get(j), attemptContext);
    reader.initialize(splits.get(j), attemptContext);

    int count = 0;
    try {
      while (reader.nextKeyValue()) {
        LongWritable key = reader.getCurrentKey();
        MyClassWritable val = reader.getCurrentValue();
        MyClass mc = val.get();
        assertEquals(mc.v, count);
        assertEquals(mc.s, Integer.toString(count));
        count++;
      }
    } finally {
      reader.close();
    }
  }
}
private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
    throws IOException, InterruptedException {
  TextInputFormat textInputFormat = new TextInputFormat();
  long fileLength = fileStatus.getLen();
  // Hadoop does unsafe casting from long to int, so split length should not be greater than int
  // max value
  long splitLength = (fileLength < Integer.MAX_VALUE) ? fileLength : Integer.MAX_VALUE;
  InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, splitLength, null);
  TaskAttemptContext taskAttemptContext =
      new TaskAttemptContextImpl(
          hadoopConf, TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
  RecordReader<LongWritable, Text> recordReader =
      textInputFormat.createRecordReader(fileSplit, taskAttemptContext);
  recordReader.initialize(fileSplit, taskAttemptContext);
  boolean hasNext = recordReader.nextKeyValue();
  List<Map.Entry> batch = new ArrayList<>();
  while (hasNext && batch.size() < batchSize) {
    batch.add(
        new Pair(
            fileStatus.getPath().toUri().getPath() + "::" + recordReader.getCurrentKey(),
            String.valueOf(recordReader.getCurrentValue())));
    hasNext = recordReader.nextKeyValue(); // not like iterator.hasNext, actually advances
  }
  return batch;
}
/**
 * Generate random data, compress it, index and md5 hash the data. Then read it all back and md5
 * that too, to verify that it all went ok.
 *
 * @param testWithIndex Should we index or not?
 * @param charsToOutput How many characters of random data should we output.
 * @throws IOException
 * @throws NoSuchAlgorithmException
 * @throws InterruptedException
 */
private void runTest(boolean testWithIndex, int charsToOutput)
    throws IOException, NoSuchAlgorithmException, InterruptedException {

  Configuration conf = new Configuration();
  conf.setLong("fs.local.block.size", charsToOutput / 2);
  // reducing block size to force a split of the tiny file
  conf.set("io.compression.codecs", LzopCodec.class.getName());

  Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf));

  FileSystem.getLocal(conf).close(); // remove cached filesystem (if any)
  FileSystem localFs = FileSystem.getLocal(conf);
  localFs.delete(outputDir_, true);
  localFs.mkdirs(outputDir_);

  Job job = new Job(conf);
  TextOutputFormat.setCompressOutput(job, true);
  TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
  TextOutputFormat.setOutputPath(job, outputDir_);

  TaskAttemptContext attemptContext =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

  // create some input data
  byte[] expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput);

  if (testWithIndex) {
    Path lzoFile = new Path(outputDir_, lzoFileName_);
    LzoIndex.createIndex(localFs, lzoFile);
  }

  LzoTextInputFormat inputFormat = new LzoTextInputFormat();
  TextInputFormat.setInputPaths(job, outputDir_);

  List<InputSplit> is = inputFormat.getSplits(job);
  // verify we have the right number of lzo chunks
  if (testWithIndex && OUTPUT_BIG == charsToOutput) {
    assertEquals(3, is.size());
  } else {
    assertEquals(1, is.size());
  }

  // let's read it all and calculate the md5 hash
  for (InputSplit inputSplit : is) {
    RecordReader<LongWritable, Text> rr =
        inputFormat.createRecordReader(inputSplit, attemptContext);
    rr.initialize(inputSplit, attemptContext);

    while (rr.nextKeyValue()) {
      Text value = rr.getCurrentValue();
      md5_.update(value.getBytes(), 0, value.getLength());
    }

    rr.close();
  }

  localFs.close();
  assertTrue(Arrays.equals(expectedMd5, md5_.digest()));
}
@Override
public HCatRecord next() {
  try {
    return curRecReader.getCurrentValue();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } catch (InterruptedException e) {
    throw new RuntimeException(e);
  }
}
/**
 * Read all records from a RecordReader
 *
 * @param reader RecordReader
 * @throws IOException I/O errors
 * @throws InterruptedException thread errors
 */
private static long readFully(RecordReader<WritableComparable, HiveReadableRecord> reader)
    throws IOException, InterruptedException {
  long num = 0;
  while (reader.nextKeyValue()) {
    HiveReadableRecord record = reader.getCurrentValue();
    parseLongLongDouble(record);
    ++num;
    // if (num % 1000000 == 0) {
    //   System.out.println("Parsed " + num + " rows");
    // }
  }
  return num;
}
@Test
public void testVbRecordReader() throws IOException, InterruptedException {
  RecordReader reader =
      getRecordReader(
          testUtils.getCobolFileLocation(),
          testUtils.getTestVbFileLocation(),
          "0x01",
          net.sf.JRecord.Common.Constants.IO_VB);
  int counter = 0;
  while (reader.nextKeyValue()) {
    counter++;
    System.out.println(reader.getCurrentKey() + "::\t" + reader.getCurrentValue());
  }
  assertEquals(testUtils.getTestDataLength(), counter);
}
@Override
public Tuple getNext() throws IOException {
  Tuple tuple = null;
  try {
    if (reader.nextKeyValue()) {
      QuadWritable quad = reader.getCurrentValue();
      tuple = tupleFactory.newTuple(4);
      tuple.set(0, NodeEncoder.asString(quad.getQuad().getGraph()));
      tuple.set(1, NodeEncoder.asString(quad.getQuad().getSubject()));
      tuple.set(2, NodeEncoder.asString(quad.getQuad().getPredicate()));
      tuple.set(3, NodeEncoder.asString(quad.getQuad().getObject()));
    }
  } catch (InterruptedException e) {
    throw new IOException(String.format("Error while reading %s", location));
  }
  log.debug("getNext() --> {}", tuple);
  return tuple;
}
@Override
public Tuple getNext() throws IOException {
  try {
    if (!reader.nextKeyValue()) {
      return null;
    }
    final PhoenixRecord phoenixRecord = reader.getCurrentValue();
    if (phoenixRecord == null) {
      return null;
    }
    final Tuple tuple = TypeUtil.transformToTuple(phoenixRecord, schema.getFields());
    return tuple;
  } catch (InterruptedException e) {
    int errCode = 6018;
    final String errMsg = "Error while reading input";
    throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
  }
}
@Override
public Tuple getNext() throws IOException {
  try {
    List values = new ArrayList();
    if (!reader.nextKeyValue()) {
      return null;
    }
    Text value = (Text) reader.getCurrentValue();
    // TODO: parse record into component fields, add to values in order
    // check that the appropriate index of requiredFields is true before adding
    return tupleFactory.newTuple(values);
  } catch (InterruptedException e) {
    int errCode = 6018;
    String errMsg = "Error while reading input";
    throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
  }
}
@Override
public Tuple getNext() throws IOException {
  try {
    if (!reader.nextKeyValue()) {
      return null;
    }
    Text value = (Text) reader.getCurrentValue();
    String line = value.toString();
    Tuple tuple = tupleFactory.newTuple(ranges.size());
    for (int i = 0; i < ranges.size(); i++) {
      Range range = ranges.get(i);
      if (range.getEnd() > line.length()) {
        LOG.warn(
            String.format(
                "Range end (%s) is longer than line length (%s)",
                range.getEnd(), line.length()));
        continue;
      }
      tuple.set(i, new DataByteArray(range.getSubstring(line)));
    }
    return tuple;
  } catch (InterruptedException e) {
    throw new ExecException(e);
  }
}
/**
 * Read the next k,v pair into the head of this object; return true iff a pair was read,
 * i.e. false once the underlying RecordReader is exhausted.
 */
private boolean next() throws IOException, InterruptedException {
  empty = !rr.nextKeyValue();
  key = rr.getCurrentKey();
  value = rr.getCurrentValue();
  return !empty;
}
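The snippet above buffers the most recently read pair as the "head" of a wrapper around the RecordReader. A minimal self-contained sketch of that head-buffering pattern is shown below; the class and accessor names are illustrative assumptions, not the original source.

// Hypothetical sketch of the head-buffering pattern; names are assumptions, not the original class.
import java.io.IOException;
import org.apache.hadoop.mapreduce.RecordReader;

class HeadBufferedReader<K, V> {
  private final RecordReader<K, V> rr; // underlying reader being wrapped
  private boolean empty = true;        // true once rr has no more pairs
  private K key;                       // current head key
  private V value;                     // current head value

  HeadBufferedReader(RecordReader<K, V> rr) {
    this.rr = rr;
  }

  /** Advance the underlying reader; returns true iff a new head pair was read. */
  boolean next() throws IOException, InterruptedException {
    empty = !rr.nextKeyValue();
    if (!empty) {
      key = rr.getCurrentKey();
      value = rr.getCurrentValue();
    }
    return !empty;
  }

  K headKey() { return key; }
  V headValue() { return value; }
  boolean isEmpty() { return empty; }
}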
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = new Job(conf);

  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);

  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context =
      MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat =
      new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
      new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader = iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
        new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
            job.getConfiguration(),
            context.getTaskAttemptID(),
            reader,
            null,
            null,
            MapReduceTestUtil.createDummyReporter(),
            split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
            sourceInt,
            iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0);
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
public V getCurrentValue() throws IOException, InterruptedException {
  return curReader.getCurrentValue();
}
/* (non-Javadoc)
 * @see org.apache.pig.builtin.PigStorage#getNext()
 */
@Override
public Tuple getNext() throws IOException {
  // If SKIP_INPUT_HEADER and this is the first input split, skip header record
  // We store its value as a string though, so we can compare
  // further records to it. If they are the same (this would
  // happen if multiple small files each with a header were combined
  // into one split), we know to skip the duplicate header record as well.
  if (loadingFirstRecord
      && headerTreatment == Headers.SKIP_INPUT_HEADER
      && (splitIndex == 0 || splitIndex == -1)) {
    try {
      if (!in.nextKeyValue()) return null;
      header = ((Text) in.getCurrentValue()).toString();
    } catch (InterruptedException e) {
      int errCode = 6018;
      String errMsg = "Error while reading input";
      throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }
  }
  loadingFirstRecord = false;

  mProtoTuple = new ArrayList<Object>();

  getNextInQuotedField = false;
  boolean evenQuotesSeen = true;
  boolean sawEmbeddedRecordDelimiter = false;
  byte[] buf = null;

  if (!mRequiredColumnsInitialized) {
    if (udfContextSignature != null) {
      Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
      mRequiredColumns =
          (boolean[]) ObjectSerializer.deserialize(p.getProperty(udfContextSignature));
    }
    mRequiredColumnsInitialized = true;
  }
  // Note: we cannot factor out the check for nextKeyValue() being null,
  // because that call overwrites buf with the new line, which is
  // bad if we have a field with a newline.

  try {
    int recordLen = 0;
    getNextFieldID = 0;

    while (sawEmbeddedRecordDelimiter || getNextFieldID == 0) {
      Text value = null;
      if (sawEmbeddedRecordDelimiter) {

        // Deal with pulling more records from the input, because
        // a double quoted embedded newline was encountered in a field.
        // Save the length of the record so far, plus one byte for the
        // record delimiter (usually newline) that's embedded in the field
        // we were working on before falling into this branch:
        int prevLineLen = recordLen + 1;

        // Save previous line (the one with the field that has the newline) in a new array.
        // The last byte will be random; we'll fill in the embedded
        // record delimiter (usually newline) below:
        byte[] prevLineSaved = Arrays.copyOf(buf, prevLineLen);
        prevLineSaved[prevLineLen - 1] = RECORD_DEL;

        // Read the continuation of the record, unless EOF:
        if (!in.nextKeyValue()) {
          return null;
        }
        value = (Text) in.getCurrentValue();
        recordLen = value.getLength();
        // Grab the continuation's bytes:
        buf = value.getBytes();

        // Combine the previous line and the continuation into a new array.
        // The following copyOf() does half the job: it allocates all the
        // space, and also copies the previous line into that space:
        byte[] prevLineAndContinuation = Arrays.copyOf(prevLineSaved, prevLineLen + recordLen);

        // Now append the continuation. Parms: fromBuf, fromStartPos, toBuf, toStartPos,
        // lengthToCopy:
        System.arraycopy(buf, 0, prevLineAndContinuation, prevLineLen, recordLen);

        // We'll work with the combination now:
        buf = prevLineAndContinuation;

        // Do the whole record over from the start:
        mProtoTuple.clear();
        getNextInQuotedField = false;
        evenQuotesSeen = true;
        getNextFieldID = 0;
        recordLen = prevLineAndContinuation.length;

      } else {
        // Previous record finished cleanly: start with the next record,
        // unless EOF:
        if (!in.nextKeyValue()) {
          return null;
        }
        value = (Text) in.getCurrentValue();

        // if the line is a duplicate header and 'SKIP_INPUT_HEADER' is set, ignore it
        // (this might happen if multiple files each with a header are combined into a single
        // split)
        if (headerTreatment == Headers.SKIP_INPUT_HEADER && value.toString().equals(header)) {
          if (!in.nextKeyValue()) return null;
          value = (Text) in.getCurrentValue();
        }

        buf = value.getBytes();
        getNextFieldID = 0;
        recordLen = value.getLength();
      }

      nextTupleSkipChar = false;

      ByteBuffer fieldBuffer = ByteBuffer.allocate(recordLen);

      sawEmbeddedRecordDelimiter =
          processOneInRecord(evenQuotesSeen, buf, recordLen, fieldBuffer);

      // The last field is never delimited by a FIELD_DEL, but by
      // the end of the record. So we need to add that last field.
      // The '!sawEmbeddedRecordDelimiter' handles the case of
      // embedded newlines; we are amidst a field, not at
      // the final record:
      if (!sawEmbeddedRecordDelimiter) readField(fieldBuffer, getNextFieldID++);
    } // end while
  } catch (InterruptedException e) {
    int errCode = 6018;
    String errMsg = "Error while reading input";
    throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
  }

  Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
  return t;
}
@Override
public V getCurrentValue() throws IOException, InterruptedException {
  return originalRR.getCurrentValue();
}
/** Get current value */
public U getCurrentValue() throws IOException, InterruptedException {
  return rr.getCurrentValue();
}
@Override
public Void getCurrentValue() throws IOException, InterruptedException {
  return unfiltered.getCurrentValue();
}
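The last few snippets all forward getCurrentValue() to a wrapped reader. For context, a complete pass-through wrapper has to forward the whole RecordReader contract; the sketch below shows that shape (the class name is an illustrative assumption, not taken from any of the projects above).

// Sketch of a pass-through RecordReader wrapper; class name is an illustrative assumption.
import java.io.IOException;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

class DelegatingRecordReader<K, V> extends RecordReader<K, V> {
  private final RecordReader<K, V> delegate;

  DelegatingRecordReader(RecordReader<K, V> delegate) {
    this.delegate = delegate;
  }

  @Override
  public void initialize(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    delegate.initialize(split, context);
  }

  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {
    return delegate.nextKeyValue();
  }

  @Override
  public K getCurrentKey() throws IOException, InterruptedException {
    return delegate.getCurrentKey();
  }

  @Override
  public V getCurrentValue() throws IOException, InterruptedException {
    return delegate.getCurrentValue();
  }

  @Override
  public float getProgress() throws IOException, InterruptedException {
    return delegate.getProgress();
  }

  @Override
  public void close() throws IOException {
    delegate.close();
  }
}

A subclass of such a wrapper can override nextKeyValue() to filter or transform records while leaving the getCurrentKey()/getCurrentValue() delegation untouched, which is the pattern the delegating snippets above follow.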