/**
 * Generate random data, compress it, index and md5 hash the data. Then read it all back and md5
 * that too, to verify that it all went ok.
 *
 * @param testWithIndex Should we index or not?
 * @param charsToOutput How many characters of random data should we output.
 * @throws IOException
 * @throws NoSuchAlgorithmException
 * @throws InterruptedException
 */
private void runTest(boolean testWithIndex, int charsToOutput)
    throws IOException, NoSuchAlgorithmException, InterruptedException {
  Configuration conf = new Configuration();
  conf.setLong("fs.local.block.size", charsToOutput / 2);
  // reducing block size to force a split of the tiny file
  conf.set("io.compression.codecs", LzopCodec.class.getName());

  Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf));

  FileSystem.getLocal(conf).close(); // remove cached filesystem (if any)
  FileSystem localFs = FileSystem.getLocal(conf);
  localFs.delete(outputDir_, true);
  localFs.mkdirs(outputDir_);

  Job job = new Job(conf);
  TextOutputFormat.setCompressOutput(job, true);
  TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
  TextOutputFormat.setOutputPath(job, outputDir_);

  TaskAttemptContext attemptContext =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

  // create some input data
  byte[] expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput);

  if (testWithIndex) {
    Path lzoFile = new Path(outputDir_, lzoFileName_);
    LzoIndex.createIndex(localFs, lzoFile);
  }

  LzoTextInputFormat inputFormat = new LzoTextInputFormat();
  TextInputFormat.setInputPaths(job, outputDir_);

  List<InputSplit> is = inputFormat.getSplits(job);
  // verify we have the right number of lzo chunks
  if (testWithIndex && OUTPUT_BIG == charsToOutput) {
    assertEquals(3, is.size());
  } else {
    assertEquals(1, is.size());
  }

  // let's read it all and calculate the md5 hash
  for (InputSplit inputSplit : is) {
    RecordReader<LongWritable, Text> rr =
        inputFormat.createRecordReader(inputSplit, attemptContext);
    rr.initialize(inputSplit, attemptContext);

    while (rr.nextKeyValue()) {
      Text value = rr.getCurrentValue();
      md5_.update(value.getBytes(), 0, value.getLength());
    }

    rr.close();
  }

  localFs.close();
  assertTrue(Arrays.equals(expectedMd5, md5_.digest()));
}
@Test
public void readOutsidePig()
    throws ClassCastException, ParseException, ClassNotFoundException, InstantiationException,
        IllegalAccessException, IOException, InterruptedException {
  // simulate Pig front-end runtime
  final SequenceFileLoader<IntWritable, Text> storage =
      new SequenceFileLoader<IntWritable, Text>(
          "-c " + IntWritableConverter.class.getName(), "-c " + TextConverter.class.getName());
  Job job = new Job();
  storage.setUDFContextSignature("12345");
  storage.setLocation(tempFilename, job);

  // simulate Pig back-end runtime
  RecordReader<DataInputBuffer, DataInputBuffer> reader = new RawSequenceFileRecordReader();
  FileSplit fileSplit =
      new FileSplit(
          new Path(tempFilename), 0, new File(tempFilename).length(), new String[] {"localhost"});
  TaskAttemptContext context =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());
  reader.initialize(fileSplit, context);

  InputSplit[] wrappedSplits = new InputSplit[] {fileSplit};
  int inputIndex = 0;
  List<OperatorKey> targetOps = Arrays.asList(new OperatorKey("54321", 0));
  int splitIndex = 0;
  PigSplit split = new PigSplit(wrappedSplits, inputIndex, targetOps, splitIndex);
  split.setConf(job.getConfiguration());
  storage.prepareToRead(reader, split);

  // read tuples and validate
  validate(new LoadFuncTupleIterator(storage));
}
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext ctx)
    throws InterruptedException, IOException {
  final RecordReader<LongWritable, Text> rr = new SortRecordReader();
  rr.initialize(split, ctx);
  return rr;
}
private E computeNextKey() throws IOException, InterruptedException {
  while (unfiltered.nextKeyValue()) {
    E element = unfiltered.getCurrentKey();
    if (predicate.apply(element)) {
      return element;
    }
  }
  return null;
}
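// A minimal sketch (not from the source) of how a filtering helper like computeNextKey() above
// can be exposed as a standard Iterator via Guava's AbstractIterator. The class name
// FilteredKeyIterator and its fields are illustrative assumptions; only the RecordReader and
// Guava Predicate/AbstractIterator APIs used here are real.
import java.io.IOException;

import org.apache.hadoop.mapreduce.RecordReader;

import com.google.common.base.Predicate;
import com.google.common.collect.AbstractIterator;

public class FilteredKeyIterator<K, V> extends AbstractIterator<K> {
  private final RecordReader<K, V> unfiltered; // assumed to be initialized already
  private final Predicate<K> predicate;

  public FilteredKeyIterator(RecordReader<K, V> unfiltered, Predicate<K> predicate) {
    this.unfiltered = unfiltered;
    this.predicate = predicate;
  }

  @Override
  protected K computeNext() {
    try {
      // same loop shape as computeNextKey(): advance until a key matches the predicate
      while (unfiltered.nextKeyValue()) {
        K key = unfiltered.getCurrentKey();
        if (predicate.apply(key)) {
          return key;
        }
      }
      return endOfData(); // AbstractIterator's end-of-stream signal instead of null
    } catch (IOException | InterruptedException e) {
      throw new RuntimeException(e);
    }
  }
}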
/**
 * Read all records from a RecordReader.
 *
 * @param reader RecordReader
 * @return number of records read
 * @throws IOException I/O errors
 * @throws InterruptedException thread errors
 */
private static long readFully(RecordReader<WritableComparable, HiveReadableRecord> reader)
    throws IOException, InterruptedException {
  long num = 0;
  while (reader.nextKeyValue()) {
    HiveReadableRecord record = reader.getCurrentValue();
    parseLongLongDouble(record);
    ++num;
    // if (num % 1000000 == 0) {
    //   System.out.println("Parsed " + num + " rows");
    // }
  }
  return num;
}
@Test
public void testPreferredServerUnreachable() throws Exception {
  InfinispanInputSplit invalidSplit = createInfinispanSplit();

  Configuration configuration = miniHadoopCluster.getConfiguration();
  TaskAttemptContextImpl fakeTaskContext =
      new TaskAttemptContextImpl(configuration, new TaskAttemptID());
  InfinispanInputFormat<Integer, WebPage> inputFormat = new InfinispanInputFormat<>();
  RecordReader<Integer, WebPage> reader =
      inputFormat.createRecordReader(invalidSplit, fakeTaskContext);
  reader.initialize(invalidSplit, fakeTaskContext);

  reader.nextKeyValue();
  assertNotNull(reader.getCurrentKey());
}
@Test
public void testVbRecordReader() throws IOException, InterruptedException {
  RecordReader reader =
      getRecordReader(
          testUtils.getCobolFileLocation(),
          testUtils.getTestVbFileLocation(),
          "0x01",
          net.sf.JRecord.Common.Constants.IO_VB);
  int counter = 0;
  while (reader.nextKeyValue()) {
    counter++;
    System.out.println(reader.getCurrentKey() + "::\t" + reader.getCurrentValue());
  }
  assertEquals(testUtils.getTestDataLength(), counter);
}
@Override
public boolean hasNext() {
  try {
    boolean retVal = curRecReader.nextKeyValue();
    if (retVal) {
      return true;
    }
    // if it's false, we need to close the recordReader.
    curRecReader.close();
    return false;
  } catch (IOException e) {
    throw new RuntimeException(e);
  } catch (InterruptedException e) {
    throw new RuntimeException(e);
  }
}
/**
 * Randomize the split order, then take the specified number of keys from each split sampled,
 * where each key is selected with the specified probability and possibly replaced by a
 * subsequently selected key when the quota of keys from that split is satisfied.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  // shuffle splits
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit tmp = splits.get(i);
    int j = r.nextInt(splits.size());
    splits.set(i, splits.get(j));
    splits.set(j, tmp);
  }

  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0;
      i < splitsToSample || (i < splits.size() && samples.size() < numSamples);
      ++i) {
    RecordReader<K, V> reader =
        inf.createRecordReader(
            splits.get(i), new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()));
    while (reader.nextKeyValue()) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(reader.getCurrentKey());
        } else {
          // When exceeding the maximum number of samples, replace a
          // random element with this one, then adjust the frequency
          // to reflect the possibility of existing elements being
          // pushed out
          int ind = r.nextInt(numSamples);
          if (ind != numSamples) {
            samples.set(ind, reader.getCurrentKey());
          }
          freq *= (numSamples - 1) / (double) numSamples;
        }
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
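// Hedged usage sketch, not part of the source: samplers with this getSample() signature are
// normally plugged into InputSampler.writePartitionFile(...) so that TotalOrderPartitioner can
// build its partition boundaries. The sampling parameters and the partition-file path below are
// illustrative assumptions.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class SamplerUsageSketch {
  public static void configureTotalOrder(Job job) throws Exception {
    // sample roughly 1% of keys, keep at most 10000, from at most 10 splits
    InputSampler.Sampler<Text, Text> sampler =
        new InputSampler.RandomSampler<Text, Text>(0.01, 10000, 10);
    Path partitionFile = new Path("/tmp/_partitions"); // illustrative path
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
    InputSampler.writePartitionFile(job, sampler);
    job.setPartitionerClass(TotalOrderPartitioner.class);
  }
}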
/** Return progress based on the amount of data processed so far. */
public float getProgress() throws IOException, InterruptedException {
  long subprogress = 0; // bytes processed in current split
  if (null != curReader) {
    // idx is always one past the current subsplit's true index.
    subprogress = (long) (curReader.getProgress() * split.getLength(idx - 1));
  }
  return Math.min(1.0f, (progress + subprogress) / (float) (split.getLength()));
}
@Override
public Tuple getNext() throws IOException {
  try {
    if (!reader.nextKeyValue()) {
      return null;
    }
    final PhoenixRecord phoenixRecord = reader.getCurrentValue();
    if (phoenixRecord == null) {
      return null;
    }
    final Tuple tuple = TypeUtil.transformToTuple(phoenixRecord, schema.getFields());
    return tuple;
  } catch (InterruptedException e) {
    int errCode = 6018;
    final String errMsg = "Error while reading input";
    throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
  }
}
@Override
public Tuple getNext() throws IOException {
  Tuple tuple = null;
  try {
    if (reader.nextKeyValue()) {
      QuadWritable quad = reader.getCurrentValue();
      tuple = tupleFactory.newTuple(4);
      tuple.set(0, NodeEncoder.asString(quad.getQuad().getGraph()));
      tuple.set(1, NodeEncoder.asString(quad.getQuad().getSubject()));
      tuple.set(2, NodeEncoder.asString(quad.getQuad().getPredicate()));
      tuple.set(3, NodeEncoder.asString(quad.getQuad().getObject()));
    }
  } catch (InterruptedException e) {
    // preserve the interrupt as the cause rather than discarding it
    throw new IOException(String.format("Error while reading %s", location), e);
  }
  log.debug("getNext() --> {}", tuple);
  return tuple;
}
public boolean nextKeyValue() throws IOException, InterruptedException {
  while ((curReader == null) || !curReader.nextKeyValue()) {
    if (!initNextRecordReader()) {
      return false;
    }
  }
  return true;
}
@Override
public Tuple getNext() throws IOException {
  try {
    List values = new ArrayList();
    if (!reader.nextKeyValue()) {
      return null;
    }
    Text value = (Text) reader.getCurrentValue();
    // TODO: parse record into component fields, add to values in order
    // check that the appropriate index of requiredFields is true before adding
    return tupleFactory.newTuple(values);
  } catch (InterruptedException e) {
    int errCode = 6018;
    String errMsg = "Error while reading input";
    throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
  }
}
@Override
public HCatRecord next() {
  try {
    return curRecReader.getCurrentValue();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } catch (InterruptedException e) {
    throw new RuntimeException(e);
  }
}
@Override
public Iterator<HCatRecord> read() throws HCatException {
  HCatInputFormat inpFmt = new HCatInputFormat();
  RecordReader<WritableComparable, HCatRecord> rr;
  try {
    TaskAttemptContext cntxt =
        ShimLoader.getHadoopShims()
            .getHCatShim()
            .createTaskAttemptContext(conf, new TaskAttemptID());
    rr = inpFmt.createRecordReader(split, cntxt);
    rr.initialize(split, cntxt);
  } catch (IOException e) {
    throw new HCatException(ErrorType.ERROR_NOT_INITIALIZED, e);
  } catch (InterruptedException e) {
    throw new HCatException(ErrorType.ERROR_NOT_INITIALIZED, e);
  }
  return new HCatRecordItr(rr);
}
private void runNewMapper(
    final JobConf job, MRTaskReporter reporter, final MRInputLegacy in, KeyValueWriter out)
    throws IOException, InterruptedException {

  // Initialize input in-line since it sets parameters which may be used by the processor.
  // Done only for MRInput.
  // TODO use new method in MRInput to get required info
  // in.initialize(job, master);

  // make a task context so we can get the classes
  org.apache.hadoop.mapreduce.TaskAttemptContext taskContext = getTaskAttemptContext();

  // make a mapper
  org.apache.hadoop.mapreduce.Mapper mapper;
  try {
    mapper =
        (org.apache.hadoop.mapreduce.Mapper)
            ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
  } catch (ClassNotFoundException cnfe) {
    throw new IOException(cnfe);
  }

  org.apache.hadoop.mapreduce.RecordReader input = new NewRecordReader(in);

  org.apache.hadoop.mapreduce.RecordWriter output = new NewOutputCollector(out);

  org.apache.hadoop.mapreduce.InputSplit split = in.getNewInputSplit();

  org.apache.hadoop.mapreduce.MapContext mapContext =
      new MapContextImpl(
          job, taskAttemptId, input, output, getCommitter(), processorContext, split);

  org.apache.hadoop.mapreduce.Mapper.Context mapperContext =
      new WrappedMapper().getMapContext(mapContext);

  input.initialize(split, mapperContext);
  mapper.run(mapperContext);
  this.statusUpdate();
  input.close();
  output.close(mapperContext);
}
public static RecordReader getRecordReader(
    String cobolLocation, String datafileLocation, String delimiter, int fileFormat)
    throws IOException, InterruptedException {
  Configuration conf = new Configuration(false);
  conf.set("fs.default.name", "file:///");
  conf.set(Constants.COPYBOOK_INPUTFORMAT_CBL_HDFS_PATH_CONF, cobolLocation);
  conf.set(Constants.COPYBOOK_INPUTFORMAT_FIELD_DELIMITER, delimiter);
  conf.set(Constants.COPYBOOK_INPUTFORMAT_FILE_STRUCTURE, Integer.toString(fileFormat));
  File testFile = new File(datafileLocation);
  Path path = new Path(testFile.getAbsoluteFile().toURI());
  FileSplit split = new FileSplit(path, 0, testFile.length(), null);
  InputFormat inputFormat = ReflectionUtils.newInstance(CopybookInputFormat.class, conf);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  RecordReader reader = inputFormat.createRecordReader(split, context);
  reader.initialize(split, context);
  return reader;
}
private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
    throws IOException, InterruptedException {
  TextInputFormat textInputFormat = new TextInputFormat();
  long fileLength = fileStatus.getLen();
  // Hadoop does unsafe casting from long to int, so split length should not be greater than int
  // max value
  long splitLength = (fileLength < Integer.MAX_VALUE) ? fileLength : Integer.MAX_VALUE;
  InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, splitLength, null);
  TaskAttemptContext taskAttemptContext =
      new TaskAttemptContextImpl(
          hadoopConf, TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
  RecordReader<LongWritable, Text> recordReader =
      textInputFormat.createRecordReader(fileSplit, taskAttemptContext);
  recordReader.initialize(fileSplit, taskAttemptContext);
  boolean hasNext = recordReader.nextKeyValue();
  List<Map.Entry> batch = new ArrayList<>();
  while (hasNext && batch.size() < batchSize) {
    batch.add(
        new Pair(
            fileStatus.getPath().toUri().getPath() + "::" + recordReader.getCurrentKey(),
            String.valueOf(recordReader.getCurrentValue())));
    // unlike Iterator.hasNext(), nextKeyValue() actually advances the reader
    hasNext = recordReader.nextKeyValue();
  }
  return batch;
}
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  rr.initialize(split, context);
  conf = context.getConfiguration();
  nextKeyValue();
  if (!empty) {
    keyclass = key.getClass().asSubclass(WritableComparable.class);
    valueclass = value.getClass();
    if (cmp == null) {
      cmp = WritableComparator.get(keyclass, conf);
    }
  }
}
private static void run(BenchmarkArgs parsedArgs)
    throws TTransportException, IOException, InterruptedException {
  HiveInputDescription input = new HiveInputDescription();
  input.setDbName(parsedArgs.getDatabase());
  input.setTableName(parsedArgs.getTable());
  input.setPartitionFilter(parsedArgs.getPartitionFilter());

  HiveConf hiveConf = new HiveConf(InputBenchmark.class);
  ThriftHiveMetastore.Iface client =
      HiveMetastores.create(parsedArgs.getHiveHost(), parsedArgs.getHivePort());

  System.err.println("Initialize profile with input data");
  HiveApiInputFormat.setProfileInputDesc(hiveConf, input, HiveApiInputFormat.DEFAULT_PROFILE_ID);

  HiveApiInputFormat defaultInputFormat = new HiveApiInputFormat();
  if (parsedArgs.isTrackMetrics()) {
    defaultInputFormat.setObserver(
        new MetricsObserver("default", parsedArgs.getRecordPrintPeriod()));
  }

  List<InputSplit> splits = defaultInputFormat.getSplits(hiveConf, client);
  System.err.println("getSplits returned " + splits.size() + " splits");

  long numRows = 0;
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit split = splits.get(i);
    TaskAttemptID taskID = new TaskAttemptID();
    TaskAttemptContext taskContext = new TaskAttemptContext(hiveConf, taskID);
    if (i % parsedArgs.getSplitPrintPeriod() == 0) {
      System.err.println("Handling split " + i + " of " + splits.size());
    }
    RecordReader<WritableComparable, HiveReadableRecord> reader =
        defaultInputFormat.createRecordReader(split, taskContext);
    reader.initialize(split, taskContext);
    numRows += readFully(reader);
  }

  System.err.println("Parsed " + numRows + " rows");
}
@Override
public Tuple getNext() throws IOException {
  try {
    if (!reader.nextKeyValue()) {
      return null;
    }

    Text value = (Text) reader.getCurrentValue();
    String line = value.toString();
    Tuple tuple = tupleFactory.newTuple(ranges.size());
    for (int i = 0; i < ranges.size(); i++) {
      Range range = ranges.get(i);
      if (range.getEnd() > line.length()) {
        LOG.warn(
            String.format(
                "Range end (%s) is longer than line length (%s)", range.getEnd(), line.length()));
        continue;
      }
      tuple.set(i, new DataByteArray(range.getSubstring(line)));
    }
    return tuple;
  } catch (InterruptedException e) {
    throw new ExecException(e);
  }
}
/**
 * For each split sampled, emit when the ratio of the number of records retained to the total
 * record count is less than the specified frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  int splitStep = splits.size() / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K, V> reader =
        inf.createRecordReader(
            splits.get(i * splitStep),
            new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()));
    while (reader.nextKeyValue()) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(reader.getCurrentKey());
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
/** Get the record reader for the next chunk in this CombineFileSplit. */
protected boolean initNextRecordReader() throws IOException {
  if (curReader != null) {
    curReader.close();
    curReader = null;
    if (idx > 0) {
      progress += split.getLength(idx - 1); // done processing so far
    }
  }

  // if all chunks have been processed, nothing more to do.
  if (idx == split.getNumPaths()) {
    return false;
  }

  // get a record reader for the idx-th chunk
  try {
    Configuration conf = context.getConfiguration();
    // setup some helper config variables.
    conf.set(MRJobConfig.MAP_INPUT_FILE, split.getPath(idx).toString());
    conf.setLong(MRJobConfig.MAP_INPUT_START, split.getOffset(idx));
    conf.setLong(MRJobConfig.MAP_INPUT_PATH, split.getLength(idx));

    curReader = rrConstructor.newInstance(new Object[] {split, context, Integer.valueOf(idx)});

    if (idx > 0) {
      // initialize() for the first RecordReader will be called by MapTask;
      // we're responsible for initializing subsequent RecordReaders.
      curReader.initialize(split, context);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  idx++;
  return true;
}
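// A hedged sketch (mirroring Hadoop's CombineFileRecordReader, not necessarily this source) of
// how a constructor handle such as rrConstructor is typically obtained: the per-chunk reader
// class is expected to expose a (CombineFileSplit, TaskAttemptContext, Integer) constructor.
// The class and method names below are illustrative.
import java.lang.reflect.Constructor;

import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public final class ChunkReaderConstructors {
  private static final Class<?>[] CHUNK_READER_SIGNATURE =
      new Class<?>[] {CombineFileSplit.class, TaskAttemptContext.class, Integer.class};

  public static <K, V> Constructor<? extends RecordReader<K, V>> lookup(
      Class<? extends RecordReader<K, V>> rrClass) {
    try {
      Constructor<? extends RecordReader<K, V>> ctor =
          rrClass.getDeclaredConstructor(CHUNK_READER_SIGNATURE);
      ctor.setAccessible(true); // allow non-public per-chunk reader constructors
      return ctor;
    } catch (NoSuchMethodException e) {
      throw new RuntimeException(
          rrClass.getName()
              + " does not expose a (CombineFileSplit, TaskAttemptContext, Integer) constructor",
          e);
    }
  }
}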
public K getCurrentKey() throws IOException, InterruptedException {
  return curReader.getCurrentKey();
}
/** Forward close request to proxied RR. */
public void close() throws IOException {
  rr.close();
}
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = new Job(conf);

  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);

  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context =
      MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat =
      new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
      new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader = iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
        new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
            job.getConfiguration(),
            context.getTaskAttemptID(),
            reader,
            null,
            null,
            MapReduceTestUtil.createDummyReporter(),
            split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
            sourceInt,
            iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0);
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
/* (non-Javadoc)
 * @see org.apache.pig.builtin.PigStorage#getNext()
 */
@Override
public Tuple getNext() throws IOException {
  // If SKIP_INPUT_HEADER and this is the first input split, skip header record
  // We store its value as a string though, so we can compare
  // further records to it. If they are the same (this would
  // happen if multiple small files each with a header were combined
  // into one split), we know to skip the duplicate header record as well.
  if (loadingFirstRecord
      && headerTreatment == Headers.SKIP_INPUT_HEADER
      && (splitIndex == 0 || splitIndex == -1)) {
    try {
      if (!in.nextKeyValue()) return null;
      header = ((Text) in.getCurrentValue()).toString();
    } catch (InterruptedException e) {
      int errCode = 6018;
      String errMsg = "Error while reading input";
      throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }
  }
  loadingFirstRecord = false;

  mProtoTuple = new ArrayList<Object>();

  getNextInQuotedField = false;
  boolean evenQuotesSeen = true;
  boolean sawEmbeddedRecordDelimiter = false;
  byte[] buf = null;

  if (!mRequiredColumnsInitialized) {
    if (udfContextSignature != null) {
      Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
      mRequiredColumns =
          (boolean[]) ObjectSerializer.deserialize(p.getProperty(udfContextSignature));
    }
    mRequiredColumnsInitialized = true;
  }
  // Note: we cannot factor out the check for nextKeyValue() being null,
  // because that call overwrites buf with the new line, which is
  // bad if we have a field with a newline.

  try {
    int recordLen = 0;
    getNextFieldID = 0;

    while (sawEmbeddedRecordDelimiter || getNextFieldID == 0) {
      Text value = null;
      if (sawEmbeddedRecordDelimiter) {

        // Deal with pulling more records from the input, because
        // a double quoted embedded newline was encountered in a field.
        // Save the length of the record so far, plus one byte for the
        // record delimiter (usually newline) that's embedded in the field
        // we were working on before falling into this branch:
        int prevLineLen = recordLen + 1;

        // Save previous line (the one with the field that has the newline) in a new array.
        // The last byte will be random; we'll fill in the embedded
        // record delimiter (usually newline) below:
        byte[] prevLineSaved = Arrays.copyOf(buf, prevLineLen);
        prevLineSaved[prevLineLen - 1] = RECORD_DEL;

        // Read the continuation of the record, unless EOF:
        if (!in.nextKeyValue()) {
          return null;
        }
        value = (Text) in.getCurrentValue();
        recordLen = value.getLength();
        // Grab the continuation's bytes:
        buf = value.getBytes();

        // Combine the previous line and the continuation into a new array.
        // The following copyOf() does half the job: it allocates all the
        // space, and also copies the previous line into that space:
        byte[] prevLineAndContinuation = Arrays.copyOf(prevLineSaved, prevLineLen + recordLen);

        // Now append the continuation. Parms: fromBuf, fromStartPos, toBuf, toStartPos,
        // lengthToCopy:
        System.arraycopy(buf, 0, prevLineAndContinuation, prevLineLen, recordLen);

        // We'll work with the combination now:
        buf = prevLineAndContinuation;

        // Do the whole record over from the start:
        mProtoTuple.clear();
        getNextInQuotedField = false;
        evenQuotesSeen = true;
        getNextFieldID = 0;
        recordLen = prevLineAndContinuation.length;

      } else {
        // Previous record finished cleanly: start with the next record,
        // unless EOF:
        if (!in.nextKeyValue()) {
          return null;
        }
        value = (Text) in.getCurrentValue();

        // if the line is a duplicate header and 'SKIP_INPUT_HEADER' is set, ignore it
        // (this might happen if multiple files each with a header are combined into a single
        // split)
        if (headerTreatment == Headers.SKIP_INPUT_HEADER && value.toString().equals(header)) {
          if (!in.nextKeyValue()) return null;
          value = (Text) in.getCurrentValue();
        }

        buf = value.getBytes();
        getNextFieldID = 0;
        recordLen = value.getLength();
      }

      nextTupleSkipChar = false;

      ByteBuffer fieldBuffer = ByteBuffer.allocate(recordLen);

      sawEmbeddedRecordDelimiter =
          processOneInRecord(evenQuotesSeen, buf, recordLen, fieldBuffer);

      // The last field is never delimited by a FIELD_DEL, but by
      // the end of the record. So we need to add that last field.
      // The '!sawEmbeddedRecordDelimiter' handles the case of
      // embedded newlines; we are amidst a field, not at
      // the final record:
      if (!sawEmbeddedRecordDelimiter) readField(fieldBuffer, getNextFieldID++);
    } // end while

  } catch (InterruptedException e) {
    int errCode = 6018;
    String errMsg = "Error while reading input";
    throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
  }

  Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
  return t;
}
public void close() throws IOException {
  if (curReader != null) {
    curReader.close();
    curReader = null;
  }
}
public V getCurrentValue() throws IOException, InterruptedException {
  return curReader.getCurrentValue();
}