Esempio n. 1
0
 /**
  * Writes the wrapped delegate to {@code out} using the serializer that a
  * {@code SerializationFactory} built from {@code configuration} provides for
  * {@code delegateClass}.
  *
  * <p>The serializer is opened on {@code out} and is not closed by this method.
  *
  * @param out stream the serializer is opened on
  * @param delegateClass class used both to look up the serializer and to cast the delegate
  * @throws IOException if opening the serializer or serializing the delegate fails
  */
 private <T> void serializeDelegate(OutputStream out, Class<T> delegateClass) throws IOException {
   // TODO: this should happen in context of the classloader and conf
   final Serializer<T> delegateSerializer =
       new SerializationFactory(configuration).getSerializer(delegateClass);
   delegateSerializer.open(out);

   // The cast happens only after the serializer has been opened.
   final T typedDelegate = delegateClass.cast(this.delegate);
   delegateSerializer.serialize(typedDelegate);
 }
  /**
   * Stores the supplied context and configuration and builds every collaborator this output
   * needs: the raw local file system, the sorter, the key comparator, the key/value serializers,
   * the task counters, the optional compression codec, the task output manager, the partitioner
   * and the combiner.
   *
   * @param outputContext context used to look up counters and to build the output manager and
   *     combiner
   * @param conf configuration all settings are read from; it is also updated with the expected
   *     number of partitions
   * @param numOutputs number of partitions this output produces
   * @throws IOException if one of the underlying lookups or instantiations fails with an I/O error
   */
  public void initialize(TezOutputContext outputContext, Configuration conf, int numOutputs)
      throws IOException {
    this.outputContext = outputContext;
    this.conf = conf;
    this.partitions = numOutputs;

    // Raw file system underneath the local file system.
    LocalFileSystem localFs = (LocalFileSystem) FileSystem.getLocal(this.conf);
    rfs = localFs.getRaw();

    // Sorter class is read from configuration; QuickSort is the default.
    Class<? extends IndexedSorter> sorterClass =
        this.conf.getClass(
            TezJobConfig.TEZ_RUNTIME_INTERNAL_SORTER_CLASS, QuickSort.class, IndexedSorter.class);
    sorter = ReflectionUtils.newInstance(sorterClass, this.conf);

    comparator = ConfigUtils.getIntermediateOutputKeyComparator(this.conf);

    // Key/value classes and their serializers.
    keyClass = ConfigUtils.getIntermediateOutputKeyClass(this.conf);
    valClass = ConfigUtils.getIntermediateOutputValueClass(this.conf);
    serializationFactory = new SerializationFactory(this.conf);
    keySerializer = serializationFactory.getSerializer(keyClass);
    valSerializer = serializationFactory.getSerializer(valClass);

    // Counters looked up from the output context.
    mapOutputByteCounter =
        outputContext.getCounters().findCounter(TaskCounter.MAP_OUTPUT_BYTES);
    mapOutputRecordCounter =
        outputContext.getCounters().findCounter(TaskCounter.MAP_OUTPUT_RECORDS);
    fileOutputByteCounter =
        outputContext.getCounters().findCounter(TaskCounter.MAP_OUTPUT_MATERIALIZED_BYTES);
    spilledRecordsCounter =
        outputContext.getCounters().findCounter(TaskCounter.SPILLED_RECORDS);

    // Compression codec: null unless intermediate output compression is enabled.
    if (!ConfigUtils.shouldCompressIntermediateOutput(this.conf)) {
      codec = null;
    } else {
      Class<? extends CompressionCodec> compressorClass =
          ConfigUtils.getIntermediateOutputCompressorClass(this.conf, DefaultCodec.class);
      codec = ReflectionUtils.newInstance(compressorClass, this.conf);
    }

    // Task output manager.
    mapOutputFile = TezRuntimeUtils.instantiateTaskOutputManager(conf, outputContext);

    String partitionerName = conf.get(TezJobConfig.TEZ_RUNTIME_PARTITIONER_CLASS);
    LOG.info("Instantiating Partitioner: [" + partitionerName + "]");

    // The expected partition count is written to the configuration before the partitioner is
    // instantiated from that same configuration.
    this.conf.setInt(TezJobConfig.TEZ_RUNTIME_NUM_EXPECTED_PARTITIONS, this.partitions);
    this.partitioner = TezRuntimeUtils.instantiatePartitioner(this.conf);
    this.combiner = TezRuntimeUtils.instantiateCombiner(this.conf, outputContext);
  }
Esempio n. 3
0
 /**
  * Writes this object to {@code os} in the following order: the two boolean flags, the three
  * int indexes, the target operators (via {@code writeObject}), the number of wrapped splits,
  * the class name of the first wrapped split, and finally every wrapped split serialized with
  * the serializer looked up for that first split's class.
  *
  * <p>NOTE(review): {@code os} is cast to {@link OutputStream}, so a {@code DataOutput} that is
  * not also an {@code OutputStream} fails with a {@code ClassCastException}. The first wrapped
  * split is read unconditionally, so an empty {@code wrappedSplits} array fails after the length
  * has already been written.
  *
  * @param os destination of the serialized form
  * @throws IOException if any write or serialization step fails
  */
 @SuppressWarnings("unchecked")
 public void write(DataOutput os) throws IOException {
   // Scalar bookkeeping fields, then the target operators.
   os.writeBoolean(disableCounter);
   os.writeBoolean(isMultiInputs);
   os.writeInt(totalSplits);
   os.writeInt(splitIndex);
   os.writeInt(inputIndex);
   writeObject(targetOps, os);

   // Split count, followed by the class name of the first wrapped split.
   os.writeInt(wrappedSplits.length);
   Class<?> splitClass = wrappedSplits[0].getClass();
   os.writeUTF(splitClass.getName());

   Serializer splitSerializer = new SerializationFactory(conf).getSerializer(splitClass);
   splitSerializer.open((OutputStream) os);
   // The correct call sequence for Serializer is: open, then serialize each element, and do
   // not close.
   for (Object wrapped : wrappedSplits) {
     splitSerializer.serialize(wrapped);
   }
 }
Esempio n. 4
0
 /**
  * Builds the map-side output buffer from the job configuration: validates the sort settings,
  * allocates the data and record-accounting arrays, and sets up the sorter, serializers,
  * counters, optional compression codec and optional combiner collector.
  *
  * @param umbilical not referenced anywhere in this constructor body
  * @param job job configuration every setting below is read from
  * @param reporter stored in the {@code reporter} field
  * @throws IOException if {@code io.sort.spill.percent}, {@code io.sort.record.percent} or
  *     {@code io.sort.mb} is outside its accepted range, or if a called helper fails
  */
 @SuppressWarnings("unchecked")
 public MapOutputBuffer(TaskUmbilicalProtocol umbilical, JobConf job, Reporter reporter)
     throws IOException {
   this.job = job;
   this.reporter = reporter;
   localFs = FileSystem.getLocal(job);
   partitions = job.getNumReduceTasks();
   partitioner = (Partitioner) ReflectionUtils.newInstance(job.getPartitionerClass(), job);
   // sanity checks
   final float spillper = job.getFloat("io.sort.spill.percent", (float) 0.8);
   final float recper = job.getFloat("io.sort.record.percent", (float) 0.05);
   final int sortmb = job.getInt("io.sort.mb", 100);
   // spillper must lie in [0.0, 1.0].
   if (spillper > (float) 1.0 || spillper < (float) 0.0) {
     throw new IOException("Invalid \"io.sort.spill.percent\": " + spillper);
   }
   // recper must lie in [0.01, 1.0].
   if (recper > (float) 1.0 || recper < (float) 0.01) {
     throw new IOException("Invalid \"io.sort.record.percent\": " + recper);
   }
   // Masking with 0x7FF keeps only the low 11 bits, so this rejects negative values and
   // anything above 2047; that bound keeps the "sortmb << 20" below within int range.
   if ((sortmb & 0x7FF) != sortmb) {
     throw new IOException("Invalid \"io.sort.mb\": " + sortmb);
   }
   // Sorter class comes from "map.sort.class"; QuickSort is the default.
   sorter =
       (IndexedSorter)
           ReflectionUtils.newInstance(job.getClass("map.sort.class", QuickSort.class), job);
   LOG.info("io.sort.mb = " + sortmb);
   // buffers and accounting
   // Shift by 20 converts megabytes to bytes.
   int maxMemUsage = sortmb << 20;
   // recordCapacity starts out as a byte count: the recper share of the total budget,
   // rounded down to a whole multiple of RECSIZE.
   int recordCapacity = (int) (maxMemUsage * recper);
   recordCapacity -= recordCapacity % RECSIZE;
   // The data buffer gets whatever is left of the byte budget.
   kvbuffer = new byte[maxMemUsage - recordCapacity];
   bufvoid = kvbuffer.length;
   // From here on recordCapacity is a record count rather than a byte count.
   recordCapacity /= RECSIZE;
   kvoffsets = new int[recordCapacity];
   // ACCTSIZE ints are reserved per record.
   kvindices = new int[recordCapacity * ACCTSIZE];
   // Soft limits are the spillper fraction of each array's length.
   // NOTE(review): presumably the fill levels at which a spill is triggered — confirm against
   // the code that reads softBufferLimit/softRecordLimit.
   softBufferLimit = (int) (kvbuffer.length * spillper);
   softRecordLimit = (int) (kvoffsets.length * spillper);
   LOG.info("data buffer = " + softBufferLimit + "/" + kvbuffer.length);
   LOG.info("record buffer = " + softRecordLimit + "/" + kvoffsets.length);
   // k/v serialization
   comparator = job.getOutputKeyComparator();
   keyClass = (Class<K>) job.getMapOutputKeyClass();
   valClass = (Class<V>) job.getMapOutputValueClass();
   serializationFactory = new SerializationFactory(job);
   // Both serializers are opened on the same "bb" stream, which is declared outside this
   // constructor.
   keySerializer = serializationFactory.getSerializer(keyClass);
   keySerializer.open(bb);
   valSerializer = serializationFactory.getSerializer(valClass);
   valSerializer.open(bb);
   // counters
   Counters counters = getCounters();
   mapOutputByteCounter = counters.findCounter(MAP_OUTPUT_BYTES);
   mapOutputRecordCounter = counters.findCounter(MAP_OUTPUT_RECORDS);
   combineInputCounter = counters.findCounter(COMBINE_INPUT_RECORDS);
   combineOutputCounter = counters.findCounter(COMBINE_OUTPUT_RECORDS);
   // compression
   // codec is only assigned when map output compression is enabled; there is no else branch.
   if (job.getCompressMapOutput()) {
     Class<? extends CompressionCodec> codecClass =
         job.getMapOutputCompressorClass(DefaultCodec.class);
     codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, job);
   }
   // combiner
   combinerClass = job.getCombinerClass();
   // combineCollector stays null when the job has no combiner class.
   combineCollector =
       (null != combinerClass) ? new CombineOutputCollector(combineOutputCounter) : null;
   minSpillsForCombine = job.getInt("min.num.spills.for.combine", 3);
 }
Esempio n. 5
0
  /**
   * Creates in-memory segments.
   *
   * <p>The number of segments is the larger of 2 and {@code rnd.nextInt(10)}. For each segment a
   * fresh data set is obtained from {@code createData()}, every key/value pair is serialized and
   * appended to an {@code InMemoryWriter}, and the pair is also recorded in
   * {@code originalData}. The segment is backed by an {@code InMemoryReader} over the writer's
   * whole underlying buffer.
   *
   * @return one segment per generated stream, in creation order
   * @throws IOException if serialization, writing, or reader/merge-manager setup fails
   */
  public List<TezMerger.Segment> createInMemStreams() throws IOException {
    final int streamCount = Math.max(2, rnd.nextInt(10));
    LOG.info("No of streams : " + streamCount);

    SerializationFactory factory = new SerializationFactory(conf);
    Serializer keySer = factory.getSerializer(keyClass);
    Serializer valSer = factory.getSerializer(valClass);

    LocalDirAllocator dirAllocator =
        new LocalDirAllocator(TezRuntimeFrameworkConfigs.LOCAL_DIRS);
    InputContext inputContext = createTezInputContext();
    MergeManager manager =
        new MergeManager(
            conf,
            fs,
            dirAllocator,
            inputContext,
            null,
            null,
            null,
            null,
            null,
            1024 * 1024 * 10,
            null,
            false,
            -1);

    // Scratch buffers shared by every record: serialize into the output buffers, then expose
    // the same bytes to the writer through the input buffers.
    DataOutputBuffer keyBytes = new DataOutputBuffer();
    DataOutputBuffer valBytes = new DataOutputBuffer();
    DataInputBuffer keyView = new DataInputBuffer();
    DataInputBuffer valView = new DataInputBuffer();
    keySer.open(keyBytes);
    valSer.open(valBytes);

    List<TezMerger.Segment> result = new LinkedList<TezMerger.Segment>();
    // Each pass adds exactly one segment, so this runs streamCount times.
    while (result.size() < streamCount) {
      BoundedByteArrayOutputStream sink = new BoundedByteArrayOutputStream(1024 * 1024);
      InMemoryWriter memWriter = new InMemoryWriter(sink);
      Map<Writable, Writable> records = createData();

      // write data
      for (Map.Entry<Writable, Writable> record : records.entrySet()) {
        Writable key = record.getKey();
        Writable value = record.getValue();

        keySer.serialize(key);
        valSer.serialize(value);
        keyView.reset(keyBytes.getData(), 0, keyBytes.getLength());
        valView.reset(valBytes.getData(), 0, valBytes.getLength());
        memWriter.append(keyView, valView);
        originalData.put(key, value);

        // Clear the scratch buffers for the next record.
        keyBytes.reset();
        valBytes.reset();
        keyView.reset();
        valView.reset();
      }

      // The reader is created over the full backing array before the writer is closed.
      byte[] backing = sink.getBuffer();
      IFile.Reader memReader = new InMemoryReader(manager, null, backing, 0, backing.length);
      result.add(new TezMerger.Segment(memReader, true));

      records.clear();
      memWriter.close();
    }
    return result;
  }
Esempio n. 6
0
  /**
   * Extracts {@code numBlocks} consecutive blocks, starting at id {@code blockId}, from the given
   * rubix files and writes them into a single new file at {@code output}.
   *
   * <p>For every requested block id the files are scanned in order and the first key-data entry
   * with that id wins. Ids that cannot be located are reported on {@code System.err} and skipped.
   *
   * <p>Layout of the output file, in write order:
   *
   * <ol>
   *   <li>the raw bytes of every located block, concatenated;
   *   <li>the metadata JSON of the last file a block was taken from, with
   *       {@code numberOfBlocks} replaced by the number of located blocks (written with
   *       {@code writeUTF});
   *   <li>the key section length as an int, followed by the key section itself: per block the
   *       serialized key, then three longs (position within this output, block id, record
   *       count);
   *   <li>the total length of the block data as a long (the trailer start offset).
   * </ol>
   *
   * @param rfiles files searched for the requested blocks
   * @param blockId id of the first block to extract
   * @param numBlocks number of consecutive block ids to extract
   * @param output path of the local file to (re)create
   * @throws IOException if reading or writing fails, or if none of the requested blocks could be
   *     located (in which case there is no metadata to write)
   * @throws ClassNotFoundException declared for the file-inspection calls made on {@code rfiles}
   * @throws InstantiationException declared for the file-inspection calls made on {@code rfiles}
   * @throws IllegalAccessException declared for the file-inspection calls made on {@code rfiles}
   */
  private static void extract(
      List<RubixFile<Tuple, Object>> rfiles, long blockId, int numBlocks, String output)
      throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
    final int BUF_SIZE = 32 * 1024;

    Configuration conf = new JobConf();
    File outFile = new File(output);
    if (outFile.exists()) {
      outFile.delete();
    }
    outFile.createNewFile();

    BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(outFile));
    try {
      ByteArrayOutputStream keySectionStream = new ByteArrayOutputStream();
      DataOutput keySectionOut = new DataOutputStream(keySectionStream);
      SerializationFactory serializationFactory = new SerializationFactory(conf);
      RubixFile<Tuple, Object> lastrFile = null;
      long totalLength = 0;
      int foundBlocks = 0;
      // One copy buffer is reused for every block instead of reallocating it per block.
      byte[] data = new byte[BUF_SIZE];

      for (int i = 0; i < numBlocks; i++) {
        long wantedBlockId = blockId + i;
        boolean found = false;
        for (RubixFile<Tuple, Object> rfile : rfiles) {
          print.f("Checking %s", rfile.path.toString());
          List<KeyData<Tuple>> keyDataList = rfile.getKeyData();
          for (KeyData<Tuple> keyData : keyDataList) {
            if (keyData.getBlockId() != wantedBlockId) {
              continue;
            }
            long offset = keyData.getOffset();
            long length = keyData.getLength();
            Tuple key = keyData.getKey();
            print.f(
                "Extracting block %d (off=%d len=%d) from %s",
                keyData.getBlockId(), offset, length, rfile.path.toString());

            // copy the data
            if (length > 0) {
              FileSystem fs = FileSystem.get(conf);
              FSDataInputStream in = fs.open(rfile.path);
              try {
                in.seek(offset);
                long toRead = length;
                while (toRead > 0) {
                  int thisRead = toRead > BUF_SIZE ? BUF_SIZE : (int) toRead;
                  in.readFully(data, 0, thisRead);
                  bos.write(data, 0, thisRead);
                  toRead -= thisRead;
                  System.out.print(".");
                }
              } finally {
                // Release the input stream even when the copy fails part-way.
                in.close();
              }
              System.out.println();
            }

            // copy the key section
            Serializer<Tuple> keySerializer =
                serializationFactory.getSerializer(rfile.getKeyClass());
            keySerializer.open(keySectionStream);

            keySerializer.serialize(key);
            keySectionOut.writeLong(totalLength); // position
            keySectionOut.writeLong(keyData.getBlockId());
            keySectionOut.writeLong(keyData.getNumRecords());
            foundBlocks++;
            totalLength += length;
            lastrFile = rfile;

            found = true;
            break;
          }
          if (found) {
            break;
          }
        }
        if (!found) {
          System.err.println("Cannot locate block with id " + wantedBlockId);
        }
      }

      // Without at least one located block there is no source file to take the metadata from.
      if (lastrFile == null) {
        throw new IOException(
            "None of the "
                + numBlocks
                + " requested block(s) starting at id "
                + blockId
                + " could be located; cannot write "
                + output);
      }

      byte[] trailerBytes = keySectionStream.toByteArray();

      JsonNode json = JsonUtils.cloneNode(lastrFile.metadataJson);
      ((ObjectNode) json).put("numberOfBlocks", foundBlocks);

      DataOutput out = new DataOutputStream(bos);
      out.writeUTF(json.toString());
      out.writeInt(trailerBytes.length);
      out.write(trailerBytes);
      out.writeLong(totalLength); // trailer start offset
    } finally {
      // Flushes and closes the output file on success and on failure alike.
      bos.close();
    }
  }