/**
 * Reads up to {@code batchSize} text records from the start of the given file for previewing.
 *
 * <p>Each returned entry's key is {@code <file path>::<record key>} and its value is the record
 * text. The record reader is always closed before returning, including when reading fails.
 *
 * @param fileStatus status of the file to preview
 * @param batchSize maximum number of records to return
 * @return the records read, in the order the reader produced them; empty if the file yields no
 *     records or {@code batchSize} is not positive
 * @throws IOException if the file cannot be read
 * @throws InterruptedException if the reader is interrupted
 */
private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
     throws IOException, InterruptedException {
   TextInputFormat textInputFormat = new TextInputFormat();
   long fileLength = fileStatus.getLen();
   // Hadoop does unsafe casting from long to int, so split length should not be greater than int
   // max value
   long splitLength = Math.min(fileLength, Integer.MAX_VALUE);
   InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, splitLength, null);
   TaskAttemptContext taskAttemptContext =
       new TaskAttemptContextImpl(
           hadoopConf, TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
   RecordReader<LongWritable, Text> recordReader =
       textInputFormat.createRecordReader(fileSplit, taskAttemptContext);
   List<Map.Entry> batch = new ArrayList<>();
   try {
     recordReader.initialize(fileSplit, taskAttemptContext);
     String keyPrefix = fileStatus.getPath().toUri().getPath() + "::";
     // nextKeyValue() is not like iterator.hasNext: it actually advances, so it is only called
     // while the batch still has room.
     while (batch.size() < batchSize && recordReader.nextKeyValue()) {
       batch.add(
           new Pair(
               keyPrefix + recordReader.getCurrentKey(),
               String.valueOf(recordReader.getCurrentValue())));
     }
   } finally {
     // Release the underlying file stream even if initialization or reading failed.
     recordReader.close();
   }
   return batch;
 }
  /**
   * Reads every split of {@code workDir} through {@code MyClassMessagePackBase64LineInputFormat}
   * and verifies that the n-th record of each split (counting from 0) carries {@code v == n} and
   * {@code s == Integer.toString(n)}.
   *
   * @param job job whose configuration and input paths are used to compute the splits
   * @throws Exception if the splits cannot be computed or read
   */
  void checkFormat(Job job) throws Exception {
    TaskAttemptContext attemptContext =
        new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

    MyClassMessagePackBase64LineInputFormat format = new MyClassMessagePackBase64LineInputFormat();
    FileInputFormat.setInputPaths(job, workDir);

    List<InputSplit> splits = format.getSplits(job);
    for (InputSplit split : splits) {
      RecordReader<LongWritable, MyClassWritable> reader =
          format.createRecordReader(split, attemptContext);
      // initialize() is inside the try so the reader is closed even when initialization fails.
      try {
        reader.initialize(split, attemptContext);

        int count = 0;
        while (reader.nextKeyValue()) {
          MyClass mc = reader.getCurrentValue().get();
          // Expected value first, so failure messages report expected/actual correctly.
          assertEquals(count, mc.v);
          assertEquals(Integer.toString(count), mc.s);
          count++;
        }
      } finally {
        reader.close();
      }
    }
  }
  /**
   * Generate random data, compress it, index and md5 hash the data. Then read it all back and md5
   * that too, to verify that it all went ok.
   *
   * <p>The test is skipped (via {@code Assume}) when {@code CoreTestUtil.okToRunLzoTests} reports
   * that LZO tests cannot run with the given configuration.
   *
   * @param testWithIndex Should we index or not?
   * @param charsToOutput How many characters of random data should we output.
   * @throws IOException if writing, indexing or reading the test data fails
   * @throws NoSuchAlgorithmException declared for the md5 computation of the generated input
   * @throws InterruptedException if reading the splits is interrupted
   */
  private void runTest(boolean testWithIndex, int charsToOutput)
      throws IOException, NoSuchAlgorithmException, InterruptedException {

    Configuration conf = new Configuration();
    conf.setLong("fs.local.block.size", charsToOutput / 2);
    // reducing block size to force a split of the tiny file
    conf.set("io.compression.codecs", LzopCodec.class.getName());

    Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf));

    FileSystem.getLocal(conf).close(); // remove cached filesystem (if any)
    FileSystem localFs = FileSystem.getLocal(conf);
    localFs.delete(outputDir_, true);
    localFs.mkdirs(outputDir_);

    Job job = new Job(conf);
    TextOutputFormat.setCompressOutput(job, true);
    TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
    TextOutputFormat.setOutputPath(job, outputDir_);

    TaskAttemptContext attemptContext =
        new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

    // create some input data
    byte[] expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput);

    if (testWithIndex) {
      Path lzoFile = new Path(outputDir_, lzoFileName_);
      LzoIndex.createIndex(localFs, lzoFile);
    }

    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
    TextInputFormat.setInputPaths(job, outputDir_);

    List<InputSplit> is = inputFormat.getSplits(job);
    // verify we have the right number of lzo chunks
    if (testWithIndex && OUTPUT_BIG == charsToOutput) {
      assertEquals(3, is.size());
    } else {
      assertEquals(1, is.size());
    }

    // let's read it all and calculate the md5 hash
    for (InputSplit inputSplit : is) {
      RecordReader<LongWritable, Text> rr =
          inputFormat.createRecordReader(inputSplit, attemptContext);
      // Close the reader even if initialization or reading fails, so a failing run does not
      // leak the underlying stream.
      try {
        rr.initialize(inputSplit, attemptContext);

        while (rr.nextKeyValue()) {
          Text value = rr.getCurrentValue();

          // Only the first getLength() bytes of the backing array belong to the record.
          md5_.update(value.getBytes(), 0, value.getLength());
        }
      } finally {
        rr.close();
      }
    }

    localFs.close();
    assertTrue(Arrays.equals(expectedMd5, md5_.digest()));
  }
 /**
  * Advances the unfiltered reader until it yields a key accepted by the predicate.
  *
  * @return the first accepted key, or {@code null} once the unfiltered reader is exhausted
  * @throws IOException if the underlying reader fails
  * @throws InterruptedException if the underlying reader is interrupted
  */
 private E computeNextKey() throws IOException, InterruptedException {
   for (;;) {
     if (!unfiltered.nextKeyValue()) {
       // Nothing left to inspect.
       return null;
     }
     E candidate = unfiltered.getCurrentKey();
     if (predicate.apply(candidate)) {
       return candidate;
     }
   }
 }
  /**
   * Advances to the next record, switching to the next underlying reader (through
   * {@code initNextRecordReader()}) whenever there is no current reader or the current one has no
   * more records.
   *
   * @return {@code true} if a record is available, {@code false} once no further reader can be
   *     initialized
   * @throws IOException if a reader fails
   * @throws InterruptedException if a reader is interrupted
   */
  public boolean nextKeyValue() throws IOException, InterruptedException {
    boolean advanced = curReader != null && curReader.nextKeyValue();
    while (!advanced) {
      if (!initNextRecordReader()) {
        return false;
      }
      advanced = curReader != null && curReader.nextKeyValue();
    }
    return true;
  }
 /**
  * Drains a RecordReader, passing every record value to {@code parseLongLongDouble}.
  *
  * @param reader RecordReader to read until it reports no more records
  * @return the number of records read
  * @throws IOException I/O errors
  * @throws InterruptedException thread errors
  */
 private static long readFully(RecordReader<WritableComparable, HiveReadableRecord> reader)
     throws IOException, InterruptedException {
   long rowsRead = 0;
   for (; reader.nextKeyValue(); ++rowsRead) {
     HiveReadableRecord current = reader.getCurrentValue();
     parseLongLongDouble(current);
   }
   return rowsRead;
 }
  /**
   * Verifies that a record reader created for the split returned by
   * {@code createInfinispanSplit()} can still be initialized and yields a non-null key.
   */
  @Test
  public void testPreferredServerUnreachable() throws Exception {
    InfinispanInputSplit invalidSplit = createInfinispanSplit();

    Configuration configuration = miniHadoopCluster.getConfiguration();
    TaskAttemptContextImpl fakeTaskContext =
        new TaskAttemptContextImpl(configuration, new TaskAttemptID());
    InfinispanInputFormat<Integer, WebPage> inputFormat = new InfinispanInputFormat<>();
    RecordReader<Integer, WebPage> reader =
        inputFormat.createRecordReader(invalidSplit, fakeTaskContext);
    // Always release the reader, even when initialization or the assertion fails.
    try {
      reader.initialize(invalidSplit, fakeTaskContext);

      reader.nextKeyValue();
      assertNotNull(reader.getCurrentKey());
    } finally {
      reader.close();
    }
  }
  /**
   * Reads the VB test file through the record reader, prints every key/value pair, and checks
   * that the number of records equals {@code testUtils.getTestDataLength()}.
   */
  @Test
  public void testVbRecordReader() throws IOException, InterruptedException {
    RecordReader reader =
        getRecordReader(
            testUtils.getCobolFileLocation(),
            testUtils.getTestVbFileLocation(),
            "0x01",
            net.sf.JRecord.Common.Constants.IO_VB);

    int counter = 0;
    // Close the reader even if reading fails part-way through.
    try {
      while (reader.nextKeyValue()) {
        counter++;
        System.out.println(reader.getCurrentKey() + "::\t" + reader.getCurrentValue());
      }
    } finally {
      reader.close();
    }
    assertEquals(testUtils.getTestDataLength(), counter);
  }
 /**
  * Advances the current record reader and reports whether it produced another record.
  *
  * <p>Unlike a conventional {@code hasNext()}, every call moves the reader forward by one record,
  * so it must be invoked exactly once per record. When the reader reports no more records it is
  * closed before {@code false} is returned.
  *
  * @return {@code true} if the reader advanced to a new record, {@code false} otherwise
  * @throws RuntimeException wrapping any {@link IOException} or {@link InterruptedException}
  *     raised by the reader; for an interruption the thread's interrupt flag is restored first
  */
 @Override
 public boolean hasNext() {
   try {
     if (curRecReader.nextKeyValue()) {
       return true;
     }
     // No more records: release the reader.
     curRecReader.close();
     return false;
   } catch (IOException e) {
     throw new RuntimeException(e);
   } catch (InterruptedException e) {
     // Restore the interrupt status so code further up the stack can still observe it.
     Thread.currentThread().interrupt();
     throw new RuntimeException(e);
   }
 }
    /**
     * Randomize the split order, then take the specified number of keys from each split sampled,
     * where each key is selected with the specified probability and possibly replaced by a
     * subsequently selected key when the quota of keys from that split is satisfied.
     *
     * <p>Each record reader is initialized with its split before it is read and is closed even if
     * reading fails. Note that {@code freq} is reduced in place every time a sample is replaced.
     *
     * @param inf input format used to compute the splits and create the record readers
     * @param job job supplying the configuration
     * @return the sampled keys
     * @throws IOException if the splits cannot be computed or read
     * @throws InterruptedException if reading is interrupted
     */
    @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
    public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
      List<InputSplit> splits = inf.getSplits(job);
      ArrayList<K> samples = new ArrayList<K>(numSamples);
      int splitsToSample = Math.min(maxSplitsSampled, splits.size());

      Random r = new Random();
      long seed = r.nextLong();
      r.setSeed(seed);
      // shuffle splits
      for (int i = 0; i < splits.size(); ++i) {
        InputSplit tmp = splits.get(i);
        int j = r.nextInt(splits.size());
        splits.set(i, splits.get(j));
        splits.set(j, tmp);
      }
      // our target rate is in terms of the maximum number of sample splits,
      // but we accept the possibility of sampling additional splits to hit
      // the target sample keyset
      for (int i = 0;
          i < splitsToSample || (i < splits.size() && samples.size() < numSamples);
          ++i) {
        InputSplit split = splits.get(i);
        TaskAttemptContext samplingContext =
            new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inf.createRecordReader(split, samplingContext);
        try {
          // A RecordReader must be initialized with its split before nextKeyValue() is called.
          reader.initialize(split, samplingContext);
          while (reader.nextKeyValue()) {
            if (r.nextDouble() <= freq) {
              // NOTE(review): if the reader reuses its key object between records, the stored
              // samples would alias one instance — confirm whether keys need to be copied.
              if (samples.size() < numSamples) {
                samples.add(reader.getCurrentKey());
              } else {
                // When exceeding the maximum number of samples, replace a
                // random element with this one, then adjust the frequency
                // to reflect the possibility of existing elements being
                // pushed out. nextInt(numSamples) is always < numSamples, so
                // the index is always valid.
                samples.set(r.nextInt(numSamples), reader.getCurrentKey());
                freq *= (numSamples - 1) / (double) numSamples;
              }
            }
          }
        } finally {
          reader.close();
        }
      }
      return (K[]) samples.toArray();
    }
 /**
  * Reads the next Phoenix record and converts it to a Pig tuple using the schema's fields.
  *
  * @return the converted tuple, or {@code null} when the reader has no more records or the
  *     current record is {@code null}
  * @throws IOException if reading or conversion fails; an interruption is reported as an
  *     {@code ExecException} with error code 6018
  */
 @Override
 public Tuple getNext() throws IOException {
   try {
     if (reader.nextKeyValue()) {
       final PhoenixRecord current = reader.getCurrentValue();
       if (current != null) {
         return TypeUtil.transformToTuple(current, schema.getFields());
       }
     }
     return null;
   } catch (InterruptedException e) {
     final String errMsg = "Error while reading input";
     final int errCode = 6018;
     throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
   }
 }
 /**
  * Reads the next quad and converts it to a four-field tuple: graph, subject, predicate and
  * object, each encoded with {@code NodeEncoder.asString}.
  *
  * @return the tuple for the next quad, or {@code null} when the reader has no more records
  * @throws IOException if reading fails or is interrupted; for an interruption the original
  *     {@link InterruptedException} is attached as the cause
  */
 @Override
 public Tuple getNext() throws IOException {
   Tuple tuple = null;
   try {
     if (reader.nextKeyValue()) {
       QuadWritable quad = reader.getCurrentValue();
       tuple = tupleFactory.newTuple(4);
       tuple.set(0, NodeEncoder.asString(quad.getQuad().getGraph()));
       tuple.set(1, NodeEncoder.asString(quad.getQuad().getSubject()));
       tuple.set(2, NodeEncoder.asString(quad.getQuad().getPredicate()));
       tuple.set(3, NodeEncoder.asString(quad.getQuad().getObject()));
     }
   } catch (InterruptedException e) {
     // Keep the interruption as the cause so the stack trace shows where reading was interrupted.
     throw new IOException(String.format("Error while reading %s", location), e);
   }
   log.debug("getNext() --> {}", tuple);
   return tuple;
 }
  /**
   * Template loader step: advances the reader and returns a tuple for the next record.
   *
   * <p>Field parsing is not implemented yet, so the returned tuple is built from an empty list.
   *
   * @return a tuple for the next record, or {@code null} when the reader has no more records
   * @throws IOException if reading fails; an interruption is reported as an
   *     {@code ExecException} with error code 6018
   */
  @Override
  public Tuple getNext() throws IOException {
    try {
      if (reader.nextKeyValue()) {
        List values = new ArrayList();
        Text record = (Text) reader.getCurrentValue();

        // TODO: parse record into component fields, add to values in order
        // check that the appropriate index of requiredFields is true before adding

        return tupleFactory.newTuple(values);
      }
      return null;
    } catch (InterruptedException e) {
      String errMsg = "Error while reading input";
      int errCode = 6018;
      throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }
  }
 /**
  * Reads the next line and slices it into one field per configured range.
  *
  * <p>A range whose end lies beyond the end of the line is logged as a warning and its field in
  * the tuple is left unset.
  *
  * @return a tuple with one position per range, or {@code null} when the reader has no more
  *     records
  * @throws IOException if reading fails; an interruption is reported as an {@code ExecException}
  */
 @Override
 public Tuple getNext() throws IOException {
   try {
     if (!reader.nextKeyValue()) {
       return null;
     }
     String line = ((Text) reader.getCurrentValue()).toString();
     int fieldCount = ranges.size();
     Tuple tuple = tupleFactory.newTuple(fieldCount);
     for (int pos = 0; pos < ranges.size(); pos++) {
       Range range = ranges.get(pos);
       if (range.getEnd() <= line.length()) {
         tuple.set(pos, new DataByteArray(range.getSubstring(line)));
       } else {
         LOG.warn(
             String.format(
                 "Range end (%s) is longer than line length (%s)", range.getEnd(), line.length()));
       }
     }
     return tuple;
   } catch (InterruptedException e) {
     throw new ExecException(e);
   }
 }
 /**
  * For each split sampled, emit when the ratio of the number of records retained to the total
  * record count is less than the specified frequency.
  *
  * <p>Splits are picked at a regular stride across the split list. Each record reader is
  * initialized with its split before it is read and is closed even if reading fails.
  *
  * @param inf input format used to compute the splits and create the record readers
  * @param job job supplying the configuration
  * @return the sampled keys; empty when there are no splits to sample
  * @throws IOException if the splits cannot be computed or read
  * @throws InterruptedException if reading is interrupted
  */
 @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
 public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
   List<InputSplit> splits = inf.getSplits(job);
   ArrayList<K> samples = new ArrayList<K>();
   int splitsToSample = Math.min(maxSplitsSampled, splits.size());
   if (splitsToSample <= 0) {
     // No splits (or a non-positive split limit): nothing to sample, and the stride
     // computation below would otherwise divide by zero.
     return (K[]) samples.toArray();
   }
   int splitStep = splits.size() / splitsToSample;
   long records = 0;
   long kept = 0;
   for (int i = 0; i < splitsToSample; ++i) {
     InputSplit split = splits.get(i * splitStep);
     TaskAttemptContext samplingContext =
         new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());
     RecordReader<K, V> reader = inf.createRecordReader(split, samplingContext);
     try {
       // A RecordReader must be initialized with its split before nextKeyValue() is called.
       reader.initialize(split, samplingContext);
       while (reader.nextKeyValue()) {
         ++records;
         // Keep this key only while the retained fraction is still below the target frequency.
         if ((double) kept / records < freq) {
           ++kept;
           samples.add(reader.getCurrentKey());
         }
       }
     } finally {
       reader.close();
     }
   }
   return (K[]) samples.toArray();
 }
 /**
  * Advance the wrapped reader and cache its current key and value as the head of this object.
  *
  * <p>{@code empty} is set when the reader reports no further record. The cached key and value
  * are refreshed from the reader in either case.
  *
  * @return true iff a record was read, i.e. this object is not empty afterwards
  */
 private boolean next() throws IOException, InterruptedException {
   final boolean advanced = rr.nextKeyValue();
   empty = !advanced;
   key = rr.getCurrentKey();
   value = rr.getCurrentValue();
   return advanced;
 }
  /**
   * Round-trip test: writes {@code RECORDS} random (int, double) pairs as raw bytes through
   * {@code SequenceFileAsBinaryOutputFormat} with block compression, then reads them back through
   * {@code SequenceFileInputFormat} and checks every key and value against the same random
   * sequence, regenerated from the saved seed.
   *
   * @throws IOException if writing or reading the sequence file fails
   * @throws InterruptedException if writing or reading is interrupted
   */
  public void testBinary() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = new Job(conf);

    Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
    // Remember the seed so the exact same random sequence can be replayed when reading back.
    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);

    FileOutputFormat.setOutputPath(job, outdir);

    // The writer receives raw bytes; these settings declare the key/value classes recorded for
    // the sequence file.
    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

    SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
    SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    BytesWritable bkey = new BytesWritable();
    BytesWritable bval = new BytesWritable();

    TaskAttemptContext context =
        MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    OutputFormat<BytesWritable, BytesWritable> outputFormat =
        new SequenceFileAsBinaryOutputFormat();
    OutputCommitter committer = outputFormat.getOutputCommitter(context);
    committer.setupJob(job);
    RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

    IntWritable iwritable = new IntWritable();
    DoubleWritable dwritable = new DoubleWritable();
    DataOutputBuffer outbuf = new DataOutputBuffer();
    LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
    try {
      for (int i = 0; i < RECORDS; ++i) {
        // Serialize a random int into the shared buffer and wrap its bytes as the key.
        iwritable = new IntWritable(r.nextInt());
        iwritable.write(outbuf);
        bkey.set(outbuf.getData(), 0, outbuf.getLength());
        outbuf.reset();
        // Same for a random double as the value; the buffer is reset after each use.
        dwritable = new DoubleWritable(r.nextDouble());
        dwritable.write(outbuf);
        bval.set(outbuf.getData(), 0, outbuf.getLength());
        outbuf.reset();
        writer.write(bkey, bval);
      }
    } finally {
      writer.close(context);
    }
    committer.commitTask(context);
    committer.commitJob(job);

    InputFormat<IntWritable, DoubleWritable> iformat =
        new SequenceFileInputFormat<IntWritable, DoubleWritable>();
    int count = 0;
    // Rewind the random sequence: the reads below must draw nextInt()/nextDouble() in the same
    // order as the writes above.
    r.setSeed(seed);
    SequenceFileInputFormat.setInputPaths(job, outdir);
    LOG.info("Reading data by SequenceFileInputFormat");
    for (InputSplit split : iformat.getSplits(job)) {
      RecordReader<IntWritable, DoubleWritable> reader = iformat.createRecordReader(split, context);
      MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
          new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
              job.getConfiguration(),
              context.getTaskAttemptID(),
              reader,
              null,
              null,
              MapReduceTestUtil.createDummyReporter(),
              split);
      reader.initialize(split, mcontext);
      try {
        int sourceInt;
        double sourceDouble;
        while (reader.nextKeyValue()) {
          // Regenerate the expected pair, then compare it with what was read back.
          sourceInt = r.nextInt();
          sourceDouble = r.nextDouble();
          iwritable = reader.getCurrentKey();
          dwritable = reader.getCurrentValue();
          assertEquals(
              "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
              sourceInt,
              iwritable.get());
          assertTrue(
              "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
              Double.compare(dwritable.get(), sourceDouble) == 0);
          ++count;
        }
      } finally {
        reader.close();
      }
    }
    // Every record written must have been read back across all splits.
    assertEquals("Some records not found", RECORDS, count);
  }
 /**
  * Delegates to the wrapped record reader.
  *
  * @return whatever the wrapped reader's {@code nextKeyValue()} returns
  */
 @Override
 public boolean nextKeyValue() throws IOException, InterruptedException {
   final boolean advanced = originalRR.nextKeyValue();
   return advanced;
 }
Example #19
0
  /*
   * Reads the next logical record from the input and returns it as a tuple. A logical record
   * may span several physical input lines when a field contains an embedded record delimiter;
   * in that case more lines are pulled in and the combined record is processed again from the
   * start.
   *
   * Returns null as soon as the input reports no more records, including when that happens
   * part-way through a multi-line record. An InterruptedException from the reader is rethrown
   * as an ExecException with error code 6018.
   *
   * (non-Javadoc)
   * @see org.apache.pig.builtin.PigStorage#getNext()
   */
  @Override
  public Tuple getNext() throws IOException {
    // If SKIP_INPUT_HEADER and this is the first input split, skip header record
    // We store its value as a string though, so we can compare
    // further records to it. If they are the same (this would
    // happen if multiple small files each with a header were combined
    // into one split), we know to skip the duplicate header record as well.
    if (loadingFirstRecord
        && headerTreatment == Headers.SKIP_INPUT_HEADER
        && (splitIndex == 0 || splitIndex == -1)) {
      try {
        if (!in.nextKeyValue()) return null;
        header = ((Text) in.getCurrentValue()).toString();
      } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
      }
    }
    loadingFirstRecord = false;

    // Start a fresh field list for this record.
    mProtoTuple = new ArrayList<Object>();

    getNextInQuotedField = false;
    boolean evenQuotesSeen = true;
    boolean sawEmbeddedRecordDelimiter = false;
    byte[] buf = null;

    // One-time initialization: load the required-columns mask from the UDF properties, when a
    // UDF context signature is available.
    if (!mRequiredColumnsInitialized) {
      if (udfContextSignature != null) {
        Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
        mRequiredColumns =
            (boolean[]) ObjectSerializer.deserialize(p.getProperty(udfContextSignature));
      }
      mRequiredColumnsInitialized = true;
    }
    // Note: we cannot factor out the check for nextKeyValue() returning false,
    // because that call overwrites buf with the new line, which is
    // bad if we have a field with a newline.

    try {
      int recordLen = 0;
      getNextFieldID = 0;

      // Loop until a complete record has been processed: the first pass always runs
      // (getNextFieldID == 0), later passes run only while the record is still open because of
      // an embedded record delimiter.
      while (sawEmbeddedRecordDelimiter || getNextFieldID == 0) {
        Text value = null;
        if (sawEmbeddedRecordDelimiter) {

          // Deal with pulling more records from the input, because
          // a double quoted embedded newline was encountered in a field.
          // Save the length of the record so far, plus one byte for the
          // record delimiter (usually newline) that's embedded in the field
          // we were working on before falling into this branch:
          int prevLineLen = recordLen + 1;

          // Save previous line (the one with the field that has the newline) in a new array.
          // The last byte is only a placeholder here; we'll fill in the embedded
          // record delimiter (usually newline) below:
          byte[] prevLineSaved = Arrays.copyOf(buf, prevLineLen);
          prevLineSaved[prevLineLen - 1] = RECORD_DEL;

          // Read the continuation of the record, unless EOF:
          if (!in.nextKeyValue()) {
            return null;
          }
          value = (Text) in.getCurrentValue();
          recordLen = value.getLength();
          // Grab the continuation's bytes:
          buf = value.getBytes();

          // Combine the previous line and the continuation into a new array.
          // The following copyOf() does half the job: it allocates all the
          // space, and also copies the previous line into that space:
          byte[] prevLineAndContinuation = Arrays.copyOf(prevLineSaved, prevLineLen + recordLen);

          // Now append the continuation. Parms: fromBuf, fromStartPos, toBuf, toStartPos,
          // lengthToCopy:
          System.arraycopy(buf, 0, prevLineAndContinuation, prevLineLen, recordLen);

          // We'll work with the combination now:
          buf = prevLineAndContinuation;

          // Do the whole record over from the start:
          mProtoTuple.clear();
          getNextInQuotedField = false;
          evenQuotesSeen = true;
          getNextFieldID = 0;
          recordLen = prevLineAndContinuation.length;

        } else {
          // Previous record finished cleanly: start with the next record,
          // unless EOF:
          if (!in.nextKeyValue()) {
            return null;
          }
          value = (Text) in.getCurrentValue();

          // if the line is a duplicate header and 'SKIP_INPUT_HEADER' is set, ignore it
          // (this might happen if multiple files each with a header are combined into a single
          // split)
          if (headerTreatment == Headers.SKIP_INPUT_HEADER && value.toString().equals(header)) {
            if (!in.nextKeyValue()) return null;
            value = (Text) in.getCurrentValue();
          }

          // Only the first getLength() bytes of the backing array belong to this record.
          buf = value.getBytes();
          getNextFieldID = 0;
          recordLen = value.getLength();
        }

        nextTupleSkipChar = false;

        ByteBuffer fieldBuffer = ByteBuffer.allocate(recordLen);

        // NOTE(review): processOneInRecord is defined elsewhere; presumably it splits the record
        // into fields and returns true when the record ends inside a quoted field — confirm.
        sawEmbeddedRecordDelimiter =
            processOneInRecord(evenQuotesSeen, buf, recordLen, fieldBuffer);

        // The last field is never delimited by a FIELD_DEL, but by
        // the end of the record. So we need to add that last field.
        // The '!sawEmbeddedRecordDelimiter' handles the case of
        // embedded newlines; we are amidst a field, not at
        // the final record:
        if (!sawEmbeddedRecordDelimiter) readField(fieldBuffer, getNextFieldID++);
      } // end while

    } catch (InterruptedException e) {
      int errCode = 6018;
      String errMsg = "Error while reading input";
      throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }

    // Wrap the collected field list in a tuple without copying it.
    Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
    return t;
  }