/**
 * Generate random data, compress it, index and md5 hash the data. Then read it all back and md5
 * that too, to verify that it all went ok.
 *
 * @param testWithIndex Should we index or not?
 * @param charsToOutput How many characters of random data should we output.
 * @throws IOException
 * @throws NoSuchAlgorithmException
 * @throws InterruptedException
 */
private void runTest(boolean testWithIndex, int charsToOutput)
    throws IOException, NoSuchAlgorithmException, InterruptedException {
  Configuration conf = new Configuration();
  conf.setLong("fs.local.block.size", charsToOutput / 2);
  // reducing block size to force a split of the tiny file
  conf.set("io.compression.codecs", LzopCodec.class.getName());

  Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf));

  FileSystem.getLocal(conf).close(); // remove cached filesystem (if any)
  FileSystem localFs = FileSystem.getLocal(conf);
  localFs.delete(outputDir_, true);
  localFs.mkdirs(outputDir_);

  Job job = new Job(conf);
  TextOutputFormat.setCompressOutput(job, true);
  TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
  TextOutputFormat.setOutputPath(job, outputDir_);

  TaskAttemptContext attemptContext =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

  // create some input data
  byte[] expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput);

  if (testWithIndex) {
    Path lzoFile = new Path(outputDir_, lzoFileName_);
    LzoIndex.createIndex(localFs, lzoFile);
  }

  LzoTextInputFormat inputFormat = new LzoTextInputFormat();
  TextInputFormat.setInputPaths(job, outputDir_);

  List<InputSplit> is = inputFormat.getSplits(job);
  // verify we have the right number of lzo chunks
  if (testWithIndex && OUTPUT_BIG == charsToOutput) {
    assertEquals(3, is.size());
  } else {
    assertEquals(1, is.size());
  }

  // let's read it all and calculate the md5 hash
  for (InputSplit inputSplit : is) {
    RecordReader<LongWritable, Text> rr =
        inputFormat.createRecordReader(inputSplit, attemptContext);
    rr.initialize(inputSplit, attemptContext);

    while (rr.nextKeyValue()) {
      Text value = rr.getCurrentValue();
      md5_.update(value.getBytes(), 0, value.getLength());
    }

    rr.close();
  }

  localFs.close();
  assertTrue(Arrays.equals(expectedMd5, md5_.digest()));
}
@Test
public void readOutsidePig()
    throws ClassCastException, ParseException, ClassNotFoundException, InstantiationException,
        IllegalAccessException, IOException, InterruptedException {
  // simulate Pig front-end runtime
  final SequenceFileLoader<IntWritable, Text> storage =
      new SequenceFileLoader<IntWritable, Text>(
          "-c " + IntWritableConverter.class.getName(), "-c " + TextConverter.class.getName());
  Job job = new Job();
  storage.setUDFContextSignature("12345");
  storage.setLocation(tempFilename, job);

  // simulate Pig back-end runtime
  RecordReader<DataInputBuffer, DataInputBuffer> reader = new RawSequenceFileRecordReader();
  FileSplit fileSplit =
      new FileSplit(
          new Path(tempFilename), 0, new File(tempFilename).length(), new String[] {"localhost"});
  TaskAttemptContext context =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());
  reader.initialize(fileSplit, context);

  InputSplit[] wrappedSplits = new InputSplit[] {fileSplit};
  int inputIndex = 0;
  List<OperatorKey> targetOps = Arrays.asList(new OperatorKey("54321", 0));
  int splitIndex = 0;
  PigSplit split = new PigSplit(wrappedSplits, inputIndex, targetOps, splitIndex);
  split.setConf(job.getConfiguration());
  storage.prepareToRead(reader, split);

  // read tuples and validate
  validate(new LoadFuncTupleIterator(storage));
}
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext ctx)
    throws InterruptedException, IOException {
  final RecordReader<LongWritable, Text> rr = new SortRecordReader();
  rr.initialize(split, ctx);
  return rr;
}
private E computeNextKey() throws IOException, InterruptedException {
  while (unfiltered.nextKeyValue()) {
    E element = unfiltered.getCurrentKey();
    if (predicate.apply(element)) {
      return element;
    }
  }
  return null;
}
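// A minimal sketch (not from the source) of how a filtering helper like computeNextKey() above
// can be exposed as a standard Iterator via Guava's AbstractIterator. The class name
// FilteredKeyIterator and its fields are illustrative assumptions; only the RecordReader and
// Guava Predicate/AbstractIterator APIs used here are real.
import java.io.IOException;

import org.apache.hadoop.mapreduce.RecordReader;

import com.google.common.base.Predicate;
import com.google.common.collect.AbstractIterator;

public class FilteredKeyIterator<K, V> extends AbstractIterator<K> {
  private final RecordReader<K, V> unfiltered; // assumed to be initialized already
  private final Predicate<K> predicate;

  public FilteredKeyIterator(RecordReader<K, V> unfiltered, Predicate<K> predicate) {
    this.unfiltered = unfiltered;
    this.predicate = predicate;
  }

  @Override
  protected K computeNext() {
    try {
      // same loop shape as computeNextKey(): advance until a key matches the predicate
      while (unfiltered.nextKeyValue()) {
        K key = unfiltered.getCurrentKey();
        if (predicate.apply(key)) {
          return key;
        }
      }
      return endOfData(); // AbstractIterator's end-of-stream signal instead of null
    } catch (IOException | InterruptedException e) {
      throw new RuntimeException(e);
    }
  }
}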
/**
 * Read all records from a RecordReader.
 *
 * @param reader RecordReader
 * @return number of records read
 * @throws IOException I/O errors
 * @throws InterruptedException thread errors
 */
private static long readFully(RecordReader<WritableComparable, HiveReadableRecord> reader)
    throws IOException, InterruptedException {
  long num = 0;
  while (reader.nextKeyValue()) {
    HiveReadableRecord record = reader.getCurrentValue();
    parseLongLongDouble(record);
    ++num;
    // if (num % 1000000 == 0) {
    //   System.out.println("Parsed " + num + " rows");
    // }
  }
  return num;
}
@Test
public void testPreferredServerUnreachable() throws Exception {
  InfinispanInputSplit invalidSplit = createInfinispanSplit();

  Configuration configuration = miniHadoopCluster.getConfiguration();
  TaskAttemptContextImpl fakeTaskContext =
      new TaskAttemptContextImpl(configuration, new TaskAttemptID());
  InfinispanInputFormat<Integer, WebPage> inputFormat = new InfinispanInputFormat<>();
  RecordReader<Integer, WebPage> reader =
      inputFormat.createRecordReader(invalidSplit, fakeTaskContext);
  reader.initialize(invalidSplit, fakeTaskContext);

  reader.nextKeyValue();
  assertNotNull(reader.getCurrentKey());
}
@Test
public void testVbRecordReader() throws IOException, InterruptedException {
  RecordReader reader =
      getRecordReader(
          testUtils.getCobolFileLocation(),
          testUtils.getTestVbFileLocation(),
          "0x01",
          net.sf.JRecord.Common.Constants.IO_VB);
  int counter = 0;
  while (reader.nextKeyValue()) {
    counter++;
    System.out.println(reader.getCurrentKey() + "::\t" + reader.getCurrentValue());
  }
  assertEquals(testUtils.getTestDataLength(), counter);
}
@Override
public boolean hasNext() {
  try {
    boolean retVal = curRecReader.nextKeyValue();
    if (retVal) {
      return true;
    }
    // if it's false, we need to close the recordReader.
    curRecReader.close();
    return false;
  } catch (IOException e) {
    throw new RuntimeException(e);
  } catch (InterruptedException e) {
    throw new RuntimeException(e);
  }
}
/**
 * Randomize the split order, then take the specified number of keys from each split sampled,
 * where each key is selected with the specified probability and possibly replaced by a
 * subsequently selected key when the quota of keys from that split is satisfied.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  // shuffle splits
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit tmp = splits.get(i);
    int j = r.nextInt(splits.size());
    splits.set(i, splits.get(j));
    splits.set(j, tmp);
  }

  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0;
      i < splitsToSample || (i < splits.size() && samples.size() < numSamples);
      ++i) {
    RecordReader<K, V> reader =
        inf.createRecordReader(
            splits.get(i), new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()));
    while (reader.nextKeyValue()) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(reader.getCurrentKey());
        } else {
          // When exceeding the maximum number of samples, replace a
          // random element with this one, then adjust the frequency
          // to reflect the possibility of existing elements being
          // pushed out
          int ind = r.nextInt(numSamples);
          if (ind != numSamples) {
            samples.set(ind, reader.getCurrentKey());
          }
          freq *= (numSamples - 1) / (double) numSamples;
        }
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
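// Hedged usage sketch, not part of the source: samplers with this getSample() signature are
// normally plugged into InputSampler.writePartitionFile(...) so that TotalOrderPartitioner can
// build its partition boundaries. The sampling parameters and the partition-file path below are
// illustrative assumptions.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class SamplerUsageSketch {
  public static void configureTotalOrder(Job job) throws Exception {
    // sample roughly 1% of keys, keep at most 10000, from at most 10 splits
    InputSampler.Sampler<Text, Text> sampler =
        new InputSampler.RandomSampler<Text, Text>(0.01, 10000, 10);
    Path partitionFile = new Path("/tmp/_partitions"); // illustrative path
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
    InputSampler.writePartitionFile(job, sampler);
    job.setPartitionerClass(TotalOrderPartitioner.class);
  }
}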
/** Return progress based on the amount of data processed so far. */
public float getProgress() throws IOException, InterruptedException {
  long subprogress = 0; // bytes processed in current split
  if (null != curReader) {
    // idx is always one past the current subsplit's true index.
    subprogress = (long) (curReader.getProgress() * split.getLength(idx - 1));
  }
  return Math.min(1.0f, (progress + subprogress) / (float) (split.getLength()));
}
@Override
public Tuple getNext() throws IOException {
  try {
    if (!reader.nextKeyValue()) {
      return null;
    }
    final PhoenixRecord phoenixRecord = reader.getCurrentValue();
    if (phoenixRecord == null) {
      return null;
    }
    final Tuple tuple = TypeUtil.transformToTuple(phoenixRecord, schema.getFields());
    return tuple;
  } catch (InterruptedException e) {
    int errCode = 6018;
    final String errMsg = "Error while reading input";
    throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
  }
}
@Override
public Tuple getNext() throws IOException {
  Tuple tuple = null;
  try {
    if (reader.nextKeyValue()) {
      QuadWritable quad = reader.getCurrentValue();
      tuple = tupleFactory.newTuple(4);
      tuple.set(0, NodeEncoder.asString(quad.getQuad().getGraph()));
      tuple.set(1, NodeEncoder.asString(quad.getQuad().getSubject()));
      tuple.set(2, NodeEncoder.asString(quad.getQuad().getPredicate()));
      tuple.set(3, NodeEncoder.asString(quad.getQuad().getObject()));
    }
  } catch (InterruptedException e) {
    // preserve the interrupt as the cause rather than discarding it
    throw new IOException(String.format("Error while reading %s", location), e);
  }
  log.debug("getNext() --> {}", tuple);
  return tuple;
}
public boolean nextKeyValue() throws IOException, InterruptedException {
  while ((curReader == null) || !curReader.nextKeyValue()) {
    if (!initNextRecordReader()) {
      return false;
    }
  }
  return true;
}
@Override
public Tuple getNext() throws IOException {
  try {
    List values = new ArrayList();
    if (!reader.nextKeyValue()) {
      return null;
    }
    Text value = (Text) reader.getCurrentValue();
    // TODO: parse record into component fields, add to values in order
    // check that the appropriate index of requiredFields is true before adding
    return tupleFactory.newTuple(values);
  } catch (InterruptedException e) {
    int errCode = 6018;
    String errMsg = "Error while reading input";
    throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
  }
}
@Override
public HCatRecord next() {
  try {
    return curRecReader.getCurrentValue();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } catch (InterruptedException e) {
    throw new RuntimeException(e);
  }
}
@Override
public Iterator<HCatRecord> read() throws HCatException {
  HCatInputFormat inpFmt = new HCatInputFormat();
  RecordReader<WritableComparable, HCatRecord> rr;
  try {
    TaskAttemptContext cntxt =
        ShimLoader.getHadoopShims()
            .getHCatShim()
            .createTaskAttemptContext(conf, new TaskAttemptID());
    rr = inpFmt.createRecordReader(split, cntxt);
    rr.initialize(split, cntxt);
  } catch (IOException e) {
    throw new HCatException(ErrorType.ERROR_NOT_INITIALIZED, e);
  } catch (InterruptedException e) {
    throw new HCatException(ErrorType.ERROR_NOT_INITIALIZED, e);
  }
  return new HCatRecordItr(rr);
}
private void runNewMapper(
    final JobConf job, MRTaskReporter reporter, final MRInputLegacy in, KeyValueWriter out)
    throws IOException, InterruptedException {

  // Initialize input in-line since it sets parameters which may be used by the processor.
  // Done only for MRInput.
  // TODO use new method in MRInput to get required info
  // in.initialize(job, master);

  // make a task context so we can get the classes
  org.apache.hadoop.mapreduce.TaskAttemptContext taskContext = getTaskAttemptContext();

  // make a mapper
  org.apache.hadoop.mapreduce.Mapper mapper;
  try {
    mapper =
        (org.apache.hadoop.mapreduce.Mapper)
            ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
  } catch (ClassNotFoundException cnfe) {
    throw new IOException(cnfe);
  }

  org.apache.hadoop.mapreduce.RecordReader input = new NewRecordReader(in);

  org.apache.hadoop.mapreduce.RecordWriter output = new NewOutputCollector(out);

  org.apache.hadoop.mapreduce.InputSplit split = in.getNewInputSplit();

  org.apache.hadoop.mapreduce.MapContext mapContext =
      new MapContextImpl(
          job, taskAttemptId, input, output, getCommitter(), processorContext, split);

  org.apache.hadoop.mapreduce.Mapper.Context mapperContext =
      new WrappedMapper().getMapContext(mapContext);

  input.initialize(split, mapperContext);
  mapper.run(mapperContext);
  this.statusUpdate();
  input.close();
  output.close(mapperContext);
}
public static RecordReader getRecordReader(
    String cobolLocation, String datafileLocation, String delimiter, int fileFormat)
    throws IOException, InterruptedException {
  Configuration conf = new Configuration(false);
  conf.set("fs.default.name", "file:///");
  conf.set(Constants.COPYBOOK_INPUTFORMAT_CBL_HDFS_PATH_CONF, cobolLocation);
  conf.set(Constants.COPYBOOK_INPUTFORMAT_FIELD_DELIMITER, delimiter);
  conf.set(Constants.COPYBOOK_INPUTFORMAT_FILE_STRUCTURE, Integer.toString(fileFormat));
  File testFile = new File(datafileLocation);
  Path path = new Path(testFile.getAbsoluteFile().toURI());
  FileSplit split = new FileSplit(path, 0, testFile.length(), null);
  InputFormat inputFormat = ReflectionUtils.newInstance(CopybookInputFormat.class, conf);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  RecordReader reader = inputFormat.createRecordReader(split, context);
  reader.initialize(split, context);
  return reader;
}
private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
    throws IOException, InterruptedException {
  TextInputFormat textInputFormat = new TextInputFormat();
  long fileLength = fileStatus.getLen();
  // Hadoop does unsafe casting from long to int, so split length should not be greater than int
  // max value
  long splitLength = (fileLength < Integer.MAX_VALUE) ? fileLength : Integer.MAX_VALUE;
  InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, splitLength, null);
  TaskAttemptContext taskAttemptContext =
      new TaskAttemptContextImpl(
          hadoopConf, TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
  RecordReader<LongWritable, Text> recordReader =
      textInputFormat.createRecordReader(fileSplit, taskAttemptContext);
  recordReader.initialize(fileSplit, taskAttemptContext);
  boolean hasNext = recordReader.nextKeyValue();
  List<Map.Entry> batch = new ArrayList<>();
  while (hasNext && batch.size() < batchSize) {
    batch.add(
        new Pair(
            fileStatus.getPath().toUri().getPath() + "::" + recordReader.getCurrentKey(),
            String.valueOf(recordReader.getCurrentValue())));
    // unlike Iterator.hasNext(), nextKeyValue() actually advances the reader
    hasNext = recordReader.nextKeyValue();
  }
  return batch;
}
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  rr.initialize(split, context);
  conf = context.getConfiguration();
  nextKeyValue();
  if (!empty) {
    keyclass = key.getClass().asSubclass(WritableComparable.class);
    valueclass = value.getClass();
    if (cmp == null) {
      cmp = WritableComparator.get(keyclass, conf);
    }
  }
}
private static void run(BenchmarkArgs parsedArgs)
    throws TTransportException, IOException, InterruptedException {
  HiveInputDescription input = new HiveInputDescription();
  input.setDbName(parsedArgs.getDatabase());
  input.setTableName(parsedArgs.getTable());
  input.setPartitionFilter(parsedArgs.getPartitionFilter());

  HiveConf hiveConf = new HiveConf(InputBenchmark.class);
  ThriftHiveMetastore.Iface client =
      HiveMetastores.create(parsedArgs.getHiveHost(), parsedArgs.getHivePort());

  System.err.println("Initialize profile with input data");
  HiveApiInputFormat.setProfileInputDesc(hiveConf, input, HiveApiInputFormat.DEFAULT_PROFILE_ID);

  HiveApiInputFormat defaultInputFormat = new HiveApiInputFormat();
  if (parsedArgs.isTrackMetrics()) {
    defaultInputFormat.setObserver(
        new MetricsObserver("default", parsedArgs.getRecordPrintPeriod()));
  }

  List<InputSplit> splits = defaultInputFormat.getSplits(hiveConf, client);
  System.err.println("getSplits returned " + splits.size() + " splits");

  long numRows = 0;
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit split = splits.get(i);
    TaskAttemptID taskID = new TaskAttemptID();
    TaskAttemptContext taskContext = new TaskAttemptContext(hiveConf, taskID);
    if (i % parsedArgs.getSplitPrintPeriod() == 0) {
      System.err.println("Handling split " + i + " of " + splits.size());
    }
    RecordReader<WritableComparable, HiveReadableRecord> reader =
        defaultInputFormat.createRecordReader(split, taskContext);
    reader.initialize(split, taskContext);
    numRows += readFully(reader);
  }

  System.err.println("Parsed " + numRows + " rows");
}
@Override
public Tuple getNext() throws IOException {
  try {
    if (!reader.nextKeyValue()) {
      return null;
    }

    Text value = (Text) reader.getCurrentValue();
    String line = value.toString();
    Tuple tuple = tupleFactory.newTuple(ranges.size());
    for (int i = 0; i < ranges.size(); i++) {
      Range range = ranges.get(i);
      if (range.getEnd() > line.length()) {
        LOG.warn(
            String.format(
                "Range end (%s) is longer than line length (%s)", range.getEnd(), line.length()));
        continue;
      }
      tuple.set(i, new DataByteArray(range.getSubstring(line)));
    }
    return tuple;
  } catch (InterruptedException e) {
    throw new ExecException(e);
  }
}
/**
 * For each split sampled, emit when the ratio of the number of records retained to the total
 * record count is less than the specified frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  int splitStep = splits.size() / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K, V> reader =
        inf.createRecordReader(
            splits.get(i * splitStep),
            new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()));
    while (reader.nextKeyValue()) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(reader.getCurrentKey());
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
/** Get the record reader for the next chunk in this CombineFileSplit. */
protected boolean initNextRecordReader() throws IOException {
  if (curReader != null) {
    curReader.close();
    curReader = null;
    if (idx > 0) {
      progress += split.getLength(idx - 1); // done processing so far
    }
  }

  // if all chunks have been processed, nothing more to do.
  if (idx == split.getNumPaths()) {
    return false;
  }

  // get a record reader for the idx-th chunk
  try {
    Configuration conf = context.getConfiguration();
    // setup some helper config variables.
    conf.set(MRJobConfig.MAP_INPUT_FILE, split.getPath(idx).toString());
    conf.setLong(MRJobConfig.MAP_INPUT_START, split.getOffset(idx));
    conf.setLong(MRJobConfig.MAP_INPUT_PATH, split.getLength(idx));

    curReader = rrConstructor.newInstance(new Object[] {split, context, Integer.valueOf(idx)});

    if (idx > 0) {
      // initialize() for the first RecordReader will be called by MapTask;
      // we're responsible for initializing subsequent RecordReaders.
      curReader.initialize(split, context);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  idx++;
  return true;
}
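// A hedged sketch (mirroring Hadoop's CombineFileRecordReader, not necessarily this source) of
// how a constructor handle such as rrConstructor is typically obtained: the per-chunk reader
// class is expected to expose a (CombineFileSplit, TaskAttemptContext, Integer) constructor.
// The class and method names below are illustrative.
import java.lang.reflect.Constructor;

import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public final class ChunkReaderConstructors {
  private static final Class<?>[] CHUNK_READER_SIGNATURE =
      new Class<?>[] {CombineFileSplit.class, TaskAttemptContext.class, Integer.class};

  public static <K, V> Constructor<? extends RecordReader<K, V>> lookup(
      Class<? extends RecordReader<K, V>> rrClass) {
    try {
      Constructor<? extends RecordReader<K, V>> ctor =
          rrClass.getDeclaredConstructor(CHUNK_READER_SIGNATURE);
      ctor.setAccessible(true); // allow non-public per-chunk reader constructors
      return ctor;
    } catch (NoSuchMethodException e) {
      throw new RuntimeException(
          rrClass.getName()
              + " does not expose a (CombineFileSplit, TaskAttemptContext, Integer) constructor",
          e);
    }
  }
}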
public K getCurrentKey() throws IOException, InterruptedException {
  return curReader.getCurrentKey();
}
/** Forward close request to proxied RR. */
public void close() throws IOException {
  rr.close();
}
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = new Job(conf);

  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);

  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context =
      MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat =
      new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
      new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader = iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
        new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
            job.getConfiguration(),
            context.getTaskAttemptID(),
            reader,
            null,
            null,
            MapReduceTestUtil.createDummyReporter(),
            split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
            sourceInt,
            iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0);
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
/* (non-Javadoc)
 * @see org.apache.pig.builtin.PigStorage#getNext()
 */
@Override
public Tuple getNext() throws IOException {
  // If SKIP_INPUT_HEADER and this is the first input split, skip header record
  // We store its value as a string though, so we can compare
  // further records to it. If they are the same (this would
  // happen if multiple small files each with a header were combined
  // into one split), we know to skip the duplicate header record as well.
  if (loadingFirstRecord
      && headerTreatment == Headers.SKIP_INPUT_HEADER
      && (splitIndex == 0 || splitIndex == -1)) {
    try {
      if (!in.nextKeyValue()) return null;
      header = ((Text) in.getCurrentValue()).toString();
    } catch (InterruptedException e) {
      int errCode = 6018;
      String errMsg = "Error while reading input";
      throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }
  }
  loadingFirstRecord = false;

  mProtoTuple = new ArrayList<Object>();

  getNextInQuotedField = false;
  boolean evenQuotesSeen = true;
  boolean sawEmbeddedRecordDelimiter = false;
  byte[] buf = null;

  if (!mRequiredColumnsInitialized) {
    if (udfContextSignature != null) {
      Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
      mRequiredColumns =
          (boolean[]) ObjectSerializer.deserialize(p.getProperty(udfContextSignature));
    }
    mRequiredColumnsInitialized = true;
  }
  // Note: we cannot factor out the check for nextKeyValue() being null,
  // because that call overwrites buf with the new line, which is
  // bad if we have a field with a newline.

  try {
    int recordLen = 0;
    getNextFieldID = 0;

    while (sawEmbeddedRecordDelimiter || getNextFieldID == 0) {
      Text value = null;
      if (sawEmbeddedRecordDelimiter) {

        // Deal with pulling more records from the input, because
        // a double quoted embedded newline was encountered in a field.
        // Save the length of the record so far, plus one byte for the
        // record delimiter (usually newline) that's embedded in the field
        // we were working on before falling into this branch:
        int prevLineLen = recordLen + 1;

        // Save previous line (the one with the field that has the newline) in a new array.
        // The last byte will be random; we'll fill in the embedded
        // record delimiter (usually newline) below:
        byte[] prevLineSaved = Arrays.copyOf(buf, prevLineLen);
        prevLineSaved[prevLineLen - 1] = RECORD_DEL;

        // Read the continuation of the record, unless EOF:
        if (!in.nextKeyValue()) {
          return null;
        }
        value = (Text) in.getCurrentValue();
        recordLen = value.getLength();
        // Grab the continuation's bytes:
        buf = value.getBytes();

        // Combine the previous line and the continuation into a new array.
        // The following copyOf() does half the job: it allocates all the
        // space, and also copies the previous line into that space:
        byte[] prevLineAndContinuation = Arrays.copyOf(prevLineSaved, prevLineLen + recordLen);

        // Now append the continuation. Parms: fromBuf, fromStartPos, toBuf, toStartPos,
        // lengthToCopy:
        System.arraycopy(buf, 0, prevLineAndContinuation, prevLineLen, recordLen);

        // We'll work with the combination now:
        buf = prevLineAndContinuation;

        // Do the whole record over from the start:
        mProtoTuple.clear();
        getNextInQuotedField = false;
        evenQuotesSeen = true;
        getNextFieldID = 0;
        recordLen = prevLineAndContinuation.length;

      } else {
        // Previous record finished cleanly: start with the next record,
        // unless EOF:
        if (!in.nextKeyValue()) {
          return null;
        }
        value = (Text) in.getCurrentValue();

        // if the line is a duplicate header and 'SKIP_INPUT_HEADER' is set, ignore it
        // (this might happen if multiple files each with a header are combined into a single
        // split)
        if (headerTreatment == Headers.SKIP_INPUT_HEADER && value.toString().equals(header)) {
          if (!in.nextKeyValue()) return null;
          value = (Text) in.getCurrentValue();
        }

        buf = value.getBytes();
        getNextFieldID = 0;
        recordLen = value.getLength();
      }

      nextTupleSkipChar = false;

      ByteBuffer fieldBuffer = ByteBuffer.allocate(recordLen);

      sawEmbeddedRecordDelimiter =
          processOneInRecord(evenQuotesSeen, buf, recordLen, fieldBuffer);

      // The last field is never delimited by a FIELD_DEL, but by
      // the end of the record. So we need to add that last field.
      // The '!sawEmbeddedRecordDelimiter' handles the case of
      // embedded newlines; we are amidst a field, not at
      // the final record:
      if (!sawEmbeddedRecordDelimiter) readField(fieldBuffer, getNextFieldID++);
    } // end while

  } catch (InterruptedException e) {
    int errCode = 6018;
    String errMsg = "Error while reading input";
    throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
  }

  Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
  return t;
}
public void close() throws IOException {
  if (curReader != null) {
    curReader.close();
    curReader = null;
  }
}
public V getCurrentValue() throws IOException, InterruptedException {
  return curReader.getCurrentValue();
}