private <T> void serializeDelegate(OutputStream out, Class<T> delegateClass) throws IOException {
  // TODO: this should happen in context of the classloader and conf
  SerializationFactory factory = new SerializationFactory(configuration);
  Serializer<T> serializer = factory.getSerializer(delegateClass);
  serializer.open(out);
  serializer.serialize(delegateClass.cast(this.delegate));
}
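The snippet above only shows the write path. A minimal sketch of the matching read path, assuming the same configuration field, might look like the following; the method name deserializeDelegate is hypothetical and not from the original class, and the stream is deliberately left open to mirror the write side.

private <T> T deserializeDelegate(InputStream in, Class<T> delegateClass) throws IOException {
  SerializationFactory factory = new SerializationFactory(configuration);
  Deserializer<T> deserializer = factory.getDeserializer(delegateClass);
  deserializer.open(in);
  // Passing null asks the serialization framework to allocate a fresh instance.
  return deserializer.deserialize(null);
}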
@SuppressWarnings("unchecked") public void readFields(DataInput is) throws IOException { disableCounter = is.readBoolean(); isMultiInputs = is.readBoolean(); totalSplits = is.readInt(); splitIndex = is.readInt(); inputIndex = is.readInt(); targetOps = (ArrayList<OperatorKey>) readObject(is); int splitLen = is.readInt(); String splitClassName = is.readUTF(); try { Class splitClass = conf.getClassByName(splitClassName); SerializationFactory sf = new SerializationFactory(conf); // The correct call sequence for Deserializer is, we shall open, then deserialize, but we // shall not close Deserializer d = sf.getDeserializer(splitClass); d.open((InputStream) is); wrappedSplits = new InputSplit[splitLen]; for (int i = 0; i < splitLen; i++) { wrappedSplits[i] = (InputSplit) ReflectionUtils.newInstance(splitClass, conf); d.deserialize(wrappedSplits[i]); } } catch (ClassNotFoundException e) { throw new IOException(e); } }
@SuppressWarnings("unchecked") public void write(DataOutput os) throws IOException { os.writeBoolean(disableCounter); os.writeBoolean(isMultiInputs); os.writeInt(totalSplits); os.writeInt(splitIndex); os.writeInt(inputIndex); writeObject(targetOps, os); os.writeInt(wrappedSplits.length); os.writeUTF(wrappedSplits[0].getClass().getName()); SerializationFactory sf = new SerializationFactory(conf); Serializer s = sf.getSerializer(wrappedSplits[0].getClass()); s.open((OutputStream) os); for (int i = 0; i < wrappedSplits.length; i++) { // The correct call sequence for Serializer is, we shall open, then serialize, but we shall // not close s.serialize(wrappedSplits[i]); } }
public void initialize(TezOutputContext outputContext, Configuration conf, int numOutputs)
    throws IOException {
  this.outputContext = outputContext;
  this.conf = conf;
  this.partitions = numOutputs;

  rfs = ((LocalFileSystem) FileSystem.getLocal(this.conf)).getRaw();

  // sorter
  sorter = ReflectionUtils.newInstance(
      this.conf.getClass(
          TezJobConfig.TEZ_RUNTIME_INTERNAL_SORTER_CLASS, QuickSort.class, IndexedSorter.class),
      this.conf);

  comparator = ConfigUtils.getIntermediateOutputKeyComparator(this.conf);

  // k/v serialization
  keyClass = ConfigUtils.getIntermediateOutputKeyClass(this.conf);
  valClass = ConfigUtils.getIntermediateOutputValueClass(this.conf);
  serializationFactory = new SerializationFactory(this.conf);
  keySerializer = serializationFactory.getSerializer(keyClass);
  valSerializer = serializationFactory.getSerializer(valClass);

  // counters
  mapOutputByteCounter = outputContext.getCounters().findCounter(TaskCounter.MAP_OUTPUT_BYTES);
  mapOutputRecordCounter =
      outputContext.getCounters().findCounter(TaskCounter.MAP_OUTPUT_RECORDS);
  fileOutputByteCounter =
      outputContext.getCounters().findCounter(TaskCounter.MAP_OUTPUT_MATERIALIZED_BYTES);
  spilledRecordsCounter = outputContext.getCounters().findCounter(TaskCounter.SPILLED_RECORDS);

  // compression
  if (ConfigUtils.shouldCompressIntermediateOutput(this.conf)) {
    Class<? extends CompressionCodec> codecClass =
        ConfigUtils.getIntermediateOutputCompressorClass(this.conf, DefaultCodec.class);
    codec = ReflectionUtils.newInstance(codecClass, this.conf);
  } else {
    codec = null;
  }

  // Task outputs
  mapOutputFile = TezRuntimeUtils.instantiateTaskOutputManager(conf, outputContext);

  LOG.info("Instantiating Partitioner: ["
      + conf.get(TezJobConfig.TEZ_RUNTIME_PARTITIONER_CLASS) + "]");
  this.conf.setInt(TezJobConfig.TEZ_RUNTIME_NUM_EXPECTED_PARTITIONS, this.partitions);
  this.partitioner = TezRuntimeUtils.instantiatePartitioner(this.conf);
  this.combiner = TezRuntimeUtils.instantiateCombiner(this.conf, outputContext);
}
@SuppressWarnings("unchecked") public MapOutputBuffer(TaskUmbilicalProtocol umbilical, JobConf job, Reporter reporter) throws IOException { this.job = job; this.reporter = reporter; localFs = FileSystem.getLocal(job); partitions = job.getNumReduceTasks(); partitioner = (Partitioner) ReflectionUtils.newInstance(job.getPartitionerClass(), job); // sanity checks final float spillper = job.getFloat("io.sort.spill.percent", (float) 0.8); final float recper = job.getFloat("io.sort.record.percent", (float) 0.05); final int sortmb = job.getInt("io.sort.mb", 100); if (spillper > (float) 1.0 || spillper < (float) 0.0) { throw new IOException("Invalid \"io.sort.spill.percent\": " + spillper); } if (recper > (float) 1.0 || recper < (float) 0.01) { throw new IOException("Invalid \"io.sort.record.percent\": " + recper); } if ((sortmb & 0x7FF) != sortmb) { throw new IOException("Invalid \"io.sort.mb\": " + sortmb); } sorter = (IndexedSorter) ReflectionUtils.newInstance(job.getClass("map.sort.class", QuickSort.class), job); LOG.info("io.sort.mb = " + sortmb); // buffers and accounting int maxMemUsage = sortmb << 20; int recordCapacity = (int) (maxMemUsage * recper); recordCapacity -= recordCapacity % RECSIZE; kvbuffer = new byte[maxMemUsage - recordCapacity]; bufvoid = kvbuffer.length; recordCapacity /= RECSIZE; kvoffsets = new int[recordCapacity]; kvindices = new int[recordCapacity * ACCTSIZE]; softBufferLimit = (int) (kvbuffer.length * spillper); softRecordLimit = (int) (kvoffsets.length * spillper); LOG.info("data buffer = " + softBufferLimit + "/" + kvbuffer.length); LOG.info("record buffer = " + softRecordLimit + "/" + kvoffsets.length); // k/v serialization comparator = job.getOutputKeyComparator(); keyClass = (Class<K>) job.getMapOutputKeyClass(); valClass = (Class<V>) job.getMapOutputValueClass(); serializationFactory = new SerializationFactory(job); keySerializer = serializationFactory.getSerializer(keyClass); keySerializer.open(bb); valSerializer = serializationFactory.getSerializer(valClass); valSerializer.open(bb); // counters Counters counters = getCounters(); mapOutputByteCounter = counters.findCounter(MAP_OUTPUT_BYTES); mapOutputRecordCounter = counters.findCounter(MAP_OUTPUT_RECORDS); combineInputCounter = counters.findCounter(COMBINE_INPUT_RECORDS); combineOutputCounter = counters.findCounter(COMBINE_OUTPUT_RECORDS); // compression if (job.getCompressMapOutput()) { Class<? extends CompressionCodec> codecClass = job.getMapOutputCompressorClass(DefaultCodec.class); codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, job); } // combiner combinerClass = job.getCombinerClass(); combineCollector = (null != combinerClass) ? new CombineOutputCollector(combineOutputCounter) : null; minSpillsForCombine = job.getInt("min.num.spills.for.combine", 3); }
/**
 * Creates in-memory segments.
 *
 * @return list of in-memory segments
 * @throws IOException
 */
public List<TezMerger.Segment> createInMemStreams() throws IOException {
  int numberOfStreams = Math.max(2, rnd.nextInt(10));
  LOG.info("No of streams : " + numberOfStreams);

  SerializationFactory serializationFactory = new SerializationFactory(conf);
  Serializer keySerializer = serializationFactory.getSerializer(keyClass);
  Serializer valueSerializer = serializationFactory.getSerializer(valClass);

  LocalDirAllocator localDirAllocator =
      new LocalDirAllocator(TezRuntimeFrameworkConfigs.LOCAL_DIRS);
  InputContext context = createTezInputContext();
  MergeManager mergeManager = new MergeManager(conf, fs, localDirAllocator, context, null, null,
      null, null, null, 1024 * 1024 * 10, null, false, -1);

  DataOutputBuffer keyBuf = new DataOutputBuffer();
  DataOutputBuffer valBuf = new DataOutputBuffer();
  DataInputBuffer keyIn = new DataInputBuffer();
  DataInputBuffer valIn = new DataInputBuffer();

  keySerializer.open(keyBuf);
  valueSerializer.open(valBuf);

  List<TezMerger.Segment> segments = new LinkedList<TezMerger.Segment>();
  for (int i = 0; i < numberOfStreams; i++) {
    BoundedByteArrayOutputStream bout = new BoundedByteArrayOutputStream(1024 * 1024);
    InMemoryWriter writer = new InMemoryWriter(bout);
    Map<Writable, Writable> data = createData();
    // write data
    for (Map.Entry<Writable, Writable> entry : data.entrySet()) {
      keySerializer.serialize(entry.getKey());
      valueSerializer.serialize(entry.getValue());
      keyIn.reset(keyBuf.getData(), 0, keyBuf.getLength());
      valIn.reset(valBuf.getData(), 0, valBuf.getLength());
      writer.append(keyIn, valIn);
      originalData.put(entry.getKey(), entry.getValue());
      keyBuf.reset();
      valBuf.reset();
      keyIn.reset();
      valIn.reset();
    }
    IFile.Reader reader =
        new InMemoryReader(mergeManager, null, bout.getBuffer(), 0, bout.getBuffer().length);
    segments.add(new TezMerger.Segment(reader, true));

    data.clear();
    writer.close();
  }
  return segments;
}
private static void extract(
    List<RubixFile<Tuple, Object>> rfiles, long blockId, int numBlocks, String output)
    throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
  Configuration conf = new JobConf();
  File outFile = new File(output);
  if (outFile.exists()) {
    outFile.delete();
  }
  outFile.createNewFile();

  BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(outFile));
  ByteArrayOutputStream keySectionStream = new ByteArrayOutputStream();
  DataOutput keySectionOut = new DataOutputStream(keySectionStream);
  SerializationFactory serializationFactory = new SerializationFactory(conf);
  RubixFile<Tuple, Object> lastrFile = null;
  JsonNode json;
  long totalLength = 0;

  final int BUF_SIZE = 32 * 1024;
  long[] blockIds = new long[numBlocks];
  int foundBlocks = 0;

  for (int i = 0; i < numBlocks; i++) {
    blockIds[i] = blockId + i;
  }

  for (int i = 0; i < numBlocks; i++) {
    boolean found = false;
    for (RubixFile<Tuple, Object> rfile : rfiles) {
      print.f("Checking %s", rfile.path.toString());
      List<KeyData<Tuple>> keyDataList = rfile.getKeyData();
      for (KeyData<Tuple> keyData : keyDataList) {
        if (keyData.getBlockId() == blockIds[i]) {
          long offset = keyData.getOffset();
          long length = keyData.getLength();
          Tuple key = keyData.getKey();
          print.f("Extracting block %d (off=%d len=%d) from %s",
              keyData.getBlockId(), offset, length, rfile.path.toString());

          // copy the data
          if (length > 0) {
            FileSystem fs = FileSystem.get(conf);
            FSDataInputStream in = fs.open(rfile.path);
            in.seek(offset);

            byte[] data = new byte[BUF_SIZE];
            long toRead = length;
            while (toRead > 0) {
              int thisRead = toRead > BUF_SIZE ? BUF_SIZE : (int) toRead;
              in.readFully(data, 0, thisRead);
              bos.write(data, 0, thisRead);
              toRead -= thisRead;
              System.out.print(".");
            }
            System.out.println();
          }

          // copy the key section
          Serializer<Tuple> keySerializer =
              serializationFactory.getSerializer(rfile.getKeyClass());
          keySerializer.open(keySectionStream);
          keySerializer.serialize(key);
          keySectionOut.writeLong(totalLength); // position
          keySectionOut.writeLong(keyData.getBlockId());
          keySectionOut.writeLong(keyData.getNumRecords());

          foundBlocks++;
          totalLength += length;
          lastrFile = rfile;

          found = true;
          break;
        }
      }
      if (found) {
        break;
      }
    }
    if (!found) {
      System.err.println("Cannot locate block with id " + blockIds[i]);
    }
  }

  byte[] trailerBytes = keySectionStream.toByteArray();

  json = JsonUtils.cloneNode(lastrFile.metadataJson);
  ((ObjectNode) json).put("numberOfBlocks", foundBlocks);

  DataOutput out = new DataOutputStream(bos);
  out.writeUTF(json.toString());
  out.writeInt(trailerBytes.length);
  out.write(trailerBytes);
  out.writeLong(totalLength); // trailer start offset
  bos.close();
}
@SuppressWarnings("unchecked") public List<KeyData<K>> getKeyData() throws IOException, ClassNotFoundException { if (keyData != null) return keyData; final FileSystem fs = FileSystem.get(conf); keyData = new ArrayList<KeyData<K>>(); final long filesize = fs.getFileStatus(path).getLen(); FSDataInputStream in = fs.open(path); /* The last long in the file is the start position of the trailer section */ in.seek(filesize - 8); long metaDataStartPos = in.readLong(); in.seek(metaDataStartPos); ObjectMapper mapper = new ObjectMapper(); metadataJson = mapper.readValue(in.readUTF(), JsonNode.class); int keySectionSize = in.readInt(); // load the key section byte[] keySection = new byte[keySectionSize]; in.seek(filesize - keySectionSize - 8); in.read(keySection, 0, keySectionSize); in.close(); ByteArrayInputStream bis = new ByteArrayInputStream(keySection); DataInput dataInput = new DataInputStream(bis); int numberOfBlocks = metadataJson.get("numberOfBlocks").getIntValue(); // load the key section keyClass = (Class<K>) ClassCache.forName(JsonUtils.getText(metadataJson, "keyClass")); valueClass = (Class<V>) ClassCache.forName(JsonUtils.getText(metadataJson, "valueClass")); SerializationFactory serializationFactory = new SerializationFactory(conf); Deserializer<K> deserializer = serializationFactory.getDeserializer(keyClass); deserializer.open(bis); while (bis.available() > 0 && numberOfBlocks > 0) { K key = deserializer.deserialize(null); long offset = dataInput.readLong(); long blockId = dataInput.readLong(); long numRecords = dataInput.readLong(); keyData.add(new KeyData<K>(key, offset, 0, numRecords, blockId)); numberOfBlocks--; } // Assign length to each keydata entry int numEntries = keyData.size(); for (int i = 1; i < numEntries; i++) { KeyData<K> prev = keyData.get(i - 1); KeyData<K> current = keyData.get(i); prev.setLength(current.getOffset() - prev.getOffset()); } if (numEntries > 0) { KeyData<K> last = keyData.get(numEntries - 1); last.setLength(metaDataStartPos - last.offset); } return keyData; }