/**
 * Reads every split of {@code workDir} through {@code MyClassMessagePackBase64LineInputFormat}
 * and checks the decoded records.
 *
 * <p>Within each split the record counter restarts at zero; the n-th record read from a split is
 * expected to carry {@code v == n} and {@code s == Integer.toString(n)}.
 *
 * @param job the job whose configuration is used and on which the input path is set
 * @throws Exception if split computation or reading fails
 */
void checkFormat(Job job) throws Exception {
    TaskAttemptContext attemptContext =
        new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

    MyClassMessagePackBase64LineInputFormat format = new MyClassMessagePackBase64LineInputFormat();
    FileInputFormat.setInputPaths(job, workDir);

    for (InputSplit split : format.getSplits(job)) {
      RecordReader<LongWritable, MyClassWritable> reader =
          format.createRecordReader(split, attemptContext);
      reader.initialize(split, attemptContext);

      int count = 0;
      try {
        while (reader.nextKeyValue()) {
          MyClass mc = reader.getCurrentValue().get();
          // Expected value goes first so that failure messages read correctly.
          assertEquals(count, mc.v);
          assertEquals(Integer.toString(count), mc.s);
          count++;
        }
      } finally {
        reader.close();
      }
    }
  }
 /**
  * Reads up to {@code batchSize} lines from the start of a text file.
  *
  * <p>Each returned entry is a {@code Pair} whose first component is
  * {@code <file path>::<record key>} and whose second component is the string form of the record
  * value, both taken from a {@code TextInputFormat} record reader.
  *
  * @param fileStatus the file to preview
  * @param batchSize maximum number of records to return
  * @return the records read, in file order; empty if the file yields no records or
  *     {@code batchSize} is not positive
  * @throws IOException if the file cannot be read
  * @throws InterruptedException if reading is interrupted
  */
 private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
     throws IOException, InterruptedException {
   TextInputFormat textInputFormat = new TextInputFormat();
   // Hadoop does unsafe casting from long to int, so split length should not be greater than int
   // max value
   long splitLength = Math.min(fileStatus.getLen(), Integer.MAX_VALUE);
   InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, splitLength, null);
   TaskAttemptContext taskAttemptContext =
       new TaskAttemptContextImpl(
           hadoopConf, TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
   List<Map.Entry> batch = new ArrayList<>();
   // try-with-resources guarantees the reader is closed even when reading fails.
   try (RecordReader<LongWritable, Text> recordReader =
       textInputFormat.createRecordReader(fileSplit, taskAttemptContext)) {
     recordReader.initialize(fileSplit, taskAttemptContext);
     String keyPrefix = fileStatus.getPath().toUri().getPath() + "::";
     // nextKeyValue() is not like Iterator.hasNext(): it actually advances the reader, so the
     // size check comes first to avoid consuming a record that would not be returned.
     while (batch.size() < batchSize && recordReader.nextKeyValue()) {
       batch.add(
           new Pair(
               keyPrefix + recordReader.getCurrentKey(),
               String.valueOf(recordReader.getCurrentValue())));
     }
   }
   return batch;
 }
Example #3
0
    /**
     * Randomize the split order, then take the specified number of keys from each split sampled,
     * where each key is selected with the specified probability and possibly replaced by a
     * subsequently selected key when the quota of keys from that split is satisfied.
     *
     * <p>Note that {@code freq} is reduced in place each time an already-full sample set has an
     * element replaced.
     *
     * @param inf the input format used to compute splits and create record readers
     * @param job the job supplying the configuration
     * @return the sampled keys (at most {@code numSamples} of them)
     * @throws IOException if a split cannot be read
     * @throws InterruptedException if reading is interrupted
     */
    @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
    public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
      List<InputSplit> splits = inf.getSplits(job);
      ArrayList<K> samples = new ArrayList<K>(numSamples);
      int splitsToSample = Math.min(maxSplitsSampled, splits.size());

      Random r = new Random();
      long seed = r.nextLong();
      r.setSeed(seed);
      // shuffle splits
      for (int i = 0; i < splits.size(); ++i) {
        InputSplit tmp = splits.get(i);
        int j = r.nextInt(splits.size());
        splits.set(i, splits.get(j));
        splits.set(j, tmp);
      }
      // our target rate is in terms of the maximum number of sample splits,
      // but we accept the possibility of sampling additional splits to hit
      // the target sample keyset
      for (int i = 0;
          i < splitsToSample || (i < splits.size() && samples.size() < numSamples);
          ++i) {
        InputSplit split = splits.get(i);
        TaskAttemptContext samplingContext =
            new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());
        RecordReader<K, V> reader = inf.createRecordReader(split, samplingContext);
        try {
          // A RecordReader must be initialized before the first nextKeyValue() call.
          reader.initialize(split, samplingContext);
          while (reader.nextKeyValue()) {
            if (r.nextDouble() <= freq) {
              // NOTE(review): if the reader reuses its key instance between records, the stored
              // references would all alias the last key read — confirm whether a copy is needed.
              if (samples.size() < numSamples) {
                samples.add(reader.getCurrentKey());
              } else {
                // When exceeding the maximum number of samples, replace a
                // random element with this one, then adjust the frequency
                // to reflect the possibility of existing elements being
                // pushed out. nextInt(numSamples) is always < numSamples, so
                // the index is always valid.
                samples.set(r.nextInt(numSamples), reader.getCurrentKey());
                freq *= (numSamples - 1) / (double) numSamples;
              }
            }
          }
        } finally {
          // Release the reader even if reading the split fails.
          reader.close();
        }
      }
      return (K[]) samples.toArray();
    }
 /**
  * Advances the underlying reader until it yields a key accepted by {@code predicate}.
  *
  * @return the first accepted key, or {@code null} once the underlying reader has no more records
  * @throws IOException if the underlying reader fails
  * @throws InterruptedException if the underlying reader is interrupted
  */
 private E computeNextKey() throws IOException, InterruptedException {
   for (;;) {
     if (!unfiltered.nextKeyValue()) {
       // Underlying reader is exhausted without a further match.
       return null;
     }
     final E candidate = unfiltered.getCurrentKey();
     if (!predicate.apply(candidate)) {
       continue;
     }
     return candidate;
   }
 }
  /**
   * Creates a record reader over the split returned by {@code createInfinispanSplit()} and checks
   * that, after one {@code nextKeyValue()} call, a non-null current key is available.
   */
  @Test
  public void testPreferredServerUnreachable() throws Exception {
    InfinispanInputSplit invalidSplit = createInfinispanSplit();

    Configuration configuration = miniHadoopCluster.getConfiguration();
    TaskAttemptContextImpl fakeTaskContext =
        new TaskAttemptContextImpl(configuration, new TaskAttemptID());
    InfinispanInputFormat<Integer, WebPage> inputFormat = new InfinispanInputFormat<>();
    // Close the reader when the test finishes so it does not leak into later tests.
    try (RecordReader<Integer, WebPage> reader =
        inputFormat.createRecordReader(invalidSplit, fakeTaskContext)) {
      reader.initialize(invalidSplit, fakeTaskContext);

      reader.nextKeyValue();
      assertNotNull(reader.getCurrentKey());
    }
  }
  /**
   * Reads every record from the VB test file, printing each key/value pair, and checks that the
   * number of records read equals {@code testUtils.getTestDataLength()}.
   */
  @Test
  public void testVbRecordReader() throws IOException, InterruptedException {
    RecordReader<?, ?> reader =
        getRecordReader(
            testUtils.getCobolFileLocation(),
            testUtils.getTestVbFileLocation(),
            "0x01",
            net.sf.JRecord.Common.Constants.IO_VB);

    int counter = 0;
    try {
      while (reader.nextKeyValue()) {
        counter++;
        System.out.println(reader.getCurrentKey() + "::\t" + reader.getCurrentValue());
      }
    } finally {
      // Release the reader even if reading fails part-way through.
      reader.close();
    }
    assertEquals(testUtils.getTestDataLength(), counter);
  }
Example #7
0
 /**
  * For each split sampled, emit when the ratio of the number of records retained to the total
  * record count is less than the specified frequency.
  *
  * <p>Splits are sampled at a fixed stride of {@code splits.size() / splitsToSample}, starting
  * with the first split. The record and kept counters run across all sampled splits.
  *
  * @param inf the input format used to compute splits and create record readers
  * @param job the job supplying the configuration
  * @return the sampled keys; empty when there is nothing to sample
  * @throws IOException if a split cannot be read
  * @throws InterruptedException if reading is interrupted
  */
 @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
 public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
   List<InputSplit> splits = inf.getSplits(job);
   ArrayList<K> samples = new ArrayList<K>();
   int splitsToSample = Math.min(maxSplitsSampled, splits.size());
   if (splitsToSample <= 0) {
     // No splits (or no sampling budget): return early rather than divide by zero below.
     return (K[]) samples.toArray();
   }
   int splitStep = splits.size() / splitsToSample;
   long records = 0;
   long kept = 0;
   for (int i = 0; i < splitsToSample; ++i) {
     InputSplit split = splits.get(i * splitStep);
     TaskAttemptContext samplingContext =
         new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());
     RecordReader<K, V> reader = inf.createRecordReader(split, samplingContext);
     try {
       // A RecordReader must be initialized before the first nextKeyValue() call.
       reader.initialize(split, samplingContext);
       while (reader.nextKeyValue()) {
         ++records;
         if ((double) kept / records < freq) {
           ++kept;
           // NOTE(review): if the reader reuses its key instance between records, the stored
           // references would all alias the last key read — confirm whether a copy is needed.
           samples.add(reader.getCurrentKey());
         }
       }
     } finally {
       // Release the reader even if reading the split fails.
       reader.close();
     }
   }
   return (K[]) samples.toArray();
 }
  /**
   * Round-trip test: writes {@code RECORDS} random (int, double) pairs as raw bytes through
   * {@code SequenceFileAsBinaryOutputFormat}, then reads them back as typed
   * {@code IntWritable}/{@code DoubleWritable} records through {@code SequenceFileInputFormat}
   * and checks every key and value against the same pseudo-random sequence.
   */
  public void testBinary() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = new Job(conf);

    Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
    // Remember the seed so the identical random sequence can be replayed when verifying.
    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);

    FileOutputFormat.setOutputPath(job, outdir);

    // Declare the logical key/value classes that the binary payloads represent.
    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

    SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
    SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    // Reusable holders for the serialized key and value bytes.
    BytesWritable bkey = new BytesWritable();
    BytesWritable bval = new BytesWritable();

    TaskAttemptContext context =
        MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    OutputFormat<BytesWritable, BytesWritable> outputFormat =
        new SequenceFileAsBinaryOutputFormat();
    OutputCommitter committer = outputFormat.getOutputCommitter(context);
    committer.setupJob(job);
    RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

    IntWritable iwritable = new IntWritable();
    DoubleWritable dwritable = new DoubleWritable();
    DataOutputBuffer outbuf = new DataOutputBuffer();
    LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
    try {
      for (int i = 0; i < RECORDS; ++i) {
        // Serialize a random int into outbuf and copy its bytes into the key holder.
        iwritable = new IntWritable(r.nextInt());
        iwritable.write(outbuf);
        bkey.set(outbuf.getData(), 0, outbuf.getLength());
        outbuf.reset();
        // Same for a random double as the value; draw order (int, then double) matters because
        // the read phase replays the sequence in the same order.
        dwritable = new DoubleWritable(r.nextDouble());
        dwritable.write(outbuf);
        bval.set(outbuf.getData(), 0, outbuf.getLength());
        outbuf.reset();
        writer.write(bkey, bval);
      }
    } finally {
      writer.close(context);
    }
    committer.commitTask(context);
    committer.commitJob(job);

    InputFormat<IntWritable, DoubleWritable> iformat =
        new SequenceFileInputFormat<IntWritable, DoubleWritable>();
    int count = 0;
    // Rewind the generator to the start of the sequence used while writing.
    r.setSeed(seed);
    SequenceFileInputFormat.setInputPaths(job, outdir);
    LOG.info("Reading data by SequenceFileInputFormat");
    for (InputSplit split : iformat.getSplits(job)) {
      RecordReader<IntWritable, DoubleWritable> reader = iformat.createRecordReader(split, context);
      MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
          new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
              job.getConfiguration(),
              context.getTaskAttemptID(),
              reader,
              null,
              null,
              MapReduceTestUtil.createDummyReporter(),
              split);
      reader.initialize(split, mcontext);
      try {
        int sourceInt;
        double sourceDouble;
        while (reader.nextKeyValue()) {
          // Regenerate the expected pair in the same order it was drawn while writing.
          sourceInt = r.nextInt();
          sourceDouble = r.nextDouble();
          iwritable = reader.getCurrentKey();
          dwritable = reader.getCurrentValue();
          assertEquals(
              "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
              sourceInt,
              iwritable.get());
          assertTrue(
              "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
              Double.compare(dwritable.get(), sourceDouble) == 0);
          ++count;
        }
      } finally {
        reader.close();
      }
    }
    // Every record written must have been read back.
    assertEquals("Some records not found", RECORDS, count);
  }
 /**
  * Returns the current key of the reader held in {@code curReader}.
  *
  * @return whatever {@code curReader.getCurrentKey()} returns
  * @throws IOException if the delegate fails
  * @throws InterruptedException if the delegate is interrupted
  */
 public K getCurrentKey() throws IOException, InterruptedException {
   final K currentKey = curReader.getCurrentKey();
   return currentKey;
 }
 /**
  * Advances the wrapped reader {@code rr} by one record and caches its current key and value in
  * {@code key} and {@code value}. The {@code empty} flag is set to {@code true} when the reader
  * had no further record.
  *
  * @return {@code true} if a record was read, {@code false} if {@code rr} is exhausted
  * @throws IOException if the wrapped reader fails
  * @throws InterruptedException if the wrapped reader is interrupted
  */
 private boolean next() throws IOException, InterruptedException {
   final boolean advanced = rr.nextKeyValue();
   empty = !advanced;
   // The current key/value are fetched unconditionally, even when the reader is exhausted.
   key = rr.getCurrentKey();
   value = rr.getCurrentValue();
   return advanced;
 }
 /**
  * Returns the current key of the wrapped reader {@code originalRR}.
  *
  * @return whatever {@code originalRR.getCurrentKey()} returns
  * @throws IOException if the delegate fails
  * @throws InterruptedException if the delegate is interrupted
  */
 @Override
 public K getCurrentKey() throws IOException, InterruptedException {
   final K delegateKey = originalRR.getCurrentKey();
   return delegateKey;
 }