public QseqRecordReader(Configuration conf, FileSplit split) throws IOException { setConf(conf); file = split.getPath(); start = split.getStart(); end = start + split.getLength(); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(file); CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(file); if (codec == null) // no codec. Uncompressed file. { positionAtFirstRecord(fileIn); inputStream = fileIn; } else { // compressed file if (start != 0) throw new RuntimeException( "Start position for compressed file is not 0! (found " + start + ")"); inputStream = codec.createInputStream(fileIn); end = Long.MAX_VALUE; // read until the end of the file } lineReader = new LineReader(inputStream); }
public MutilCharRecordReader(FileSplit inputSplit, Configuration job) throws IOException { maxLineLength = job.getInt("mapred.mutilCharRecordReader.maxlength", Integer.MAX_VALUE); start = inputSplit.getStart(); end = start + inputSplit.getLength(); final Path file = inputSplit.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file system FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(file); boolean skipFirstLine = false; if (codec != null) { lineReader = new LineReader(codec.createInputStream(fileIn), job); end = Long.MAX_VALUE; } else { if (start != 0) { skipFirstLine = true; --start; fileIn.seek(start); } lineReader = new LineReader(fileIn, job); } if (skipFirstLine) { start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); m_Sb.setLength(0); m_Start = split.getStart(); m_End = m_Start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the m_Start of the split FileSystem fs = file.getFileSystem(job); // getFileStatus fileStatus = fs.getFileStatus(split.getPath()); //noinspection deprecation @SuppressWarnings(value = "deprecated") long length = fs.getLength(file); FSDataInputStream fileIn = fs.open(split.getPath()); if (m_Start > 0) fileIn.seek(m_Start); if (codec != null) { CompressionInputStream inputStream = codec.createInputStream(fileIn); m_Input = new BufferedReader(new InputStreamReader(inputStream)); m_End = length; } else { m_Input = new BufferedReader(new InputStreamReader(fileIn)); } m_Current = m_Start; m_Key = split.getPath().getName(); }
@Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException { // Obtain path to input list of input images and open input stream FileSplit fileSplit = (FileSplit) split; Path path = fileSplit.getPath(); FileSystem fileSystem = path.getFileSystem(context.getConfiguration()); FSDataInputStream fileIn = fileSystem.open(path); // Note the start and length fields in the FileSplit object are being used to // convey a range of lines in the input list of image URLs startLine = fileSplit.getStart(); numLines = fileSplit.getLength(); linesRead = 0; // total lines read by this particular record reader instance linesPerRecord = 100; // can be modified to change key/value pair size (may improve efficiency) // If it exists, get the relevant compression codec for the FileSplit CompressionCodecFactory codecFactory = new CompressionCodecFactory(context.getConfiguration()); CompressionCodec codec = codecFactory.getCodec(path); // If the codec was found, use it to create a decompressed input stream. // Otherwise, assume input stream is already decompressed if (codec != null) { reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fileIn))); } else { reader = new BufferedReader(new InputStreamReader(fileIn)); } }
private void verifyCompressedFile(Path f, int expectedNumLines) throws IOException { Configuration conf = new Configuration(); conf.set("fs.default.name", "file:///"); FileSystem fs = FileSystem.get(conf); InputStream is = fs.open(f); CompressionCodecFactory ccf = new CompressionCodecFactory(conf); CompressionCodec codec = ccf.getCodec(f); LOG.info("gzip check codec is " + codec); Decompressor decompressor = CodecPool.getDecompressor(codec); if (null == decompressor) { LOG.info("Verifying gzip sanity with null decompressor"); } else { LOG.info("Verifying gzip sanity with decompressor: " + decompressor.toString()); } is = codec.createInputStream(is, decompressor); BufferedReader r = new BufferedReader(new InputStreamReader(is)); int numLines = 0; while (true) { String ln = r.readLine(); if (ln == null) { break; } numLines++; } r.close(); assertEquals("Did not read back correct number of lines", expectedNumLines, numLines); LOG.info("gzip sanity check returned " + numLines + " lines; ok."); }
/** * Create a data file that gets exported to the db. * * @param fileNum the number of the file (for multi-file export) * @param numRecords how many records to write to the file. * @param gzip is true if the file should be gzipped. */ private void createTextFile( int fileNum, int numRecords, boolean gzip, ColumnGenerator... extraCols) throws IOException { int startId = fileNum * numRecords; String ext = ".txt"; if (gzip) { ext = ext + ".gz"; } Path tablePath = getTablePath(); Path filePath = new Path(tablePath, "part" + fileNum + ext); Configuration conf = new Configuration(); conf.set("fs.default.name", "file:///"); FileSystem fs = FileSystem.get(conf); fs.mkdirs(tablePath); OutputStream os = fs.create(filePath); if (gzip) { CompressionCodecFactory ccf = new CompressionCodecFactory(conf); CompressionCodec codec = ccf.getCodec(filePath); os = codec.createOutputStream(os); } BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os)); for (int i = 0; i < numRecords; i++) { w.write(getRecordLine(startId + i, extraCols)); } w.close(); os.close(); if (gzip) { verifyCompressedFile(filePath, numRecords); } }
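// Hedged usage sketch (added for illustration, not part of the original source): a test might
// create two gzipped part files of 100 records each before exercising the export, leaving the
// optional column generators out. The file numbers and record count here are illustrative only.
createTextFile(0, 100, true);
createTextFile(1, 100, true);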
@Override public void initialize(InputSplit genericSplit, TaskAttemptContext taskAttemptContext) throws IOException { context = taskAttemptContext; FileSplit fileSplit = (FileSplit) genericSplit; lzoFile = fileSplit.getPath(); // The LzoSplitInputFormat is not splittable, so the split length is the whole file. totalFileSize = fileSplit.getLength(); // Jump through some hoops to create the lzo codec. Configuration conf = CompatibilityUtil.getConfiguration(context); CompressionCodecFactory factory = new CompressionCodecFactory(conf); CompressionCodec codec = factory.getCodec(lzoFile); ((Configurable) codec).setConf(conf); LzopDecompressor lzopDecompressor = (LzopDecompressor) codec.createDecompressor(); FileSystem fs = lzoFile.getFileSystem(conf); rawInputStream = fs.open(lzoFile); // Creating the LzopInputStream here just reads the lzo header for us, nothing more. // We do the rest of our input off of the raw stream ourselves. codec.createInputStream(rawInputStream, lzopDecompressor); // This must be called AFTER createInputStream is called, because createInputStream // is what reads the header, which has the checksum information. Otherwise getChecksumsCount // erroneously returns zero, and all block offsets will be wrong. numCompressedChecksums = lzopDecompressor.getCompressedChecksumsCount(); numDecompressedChecksums = lzopDecompressor.getDecompressedChecksumsCount(); }
public InputStream createDecompressionStream( InputStream downStream, Decompressor decompressor, int downStreamBufferSize) throws IOException { CompressionCodec codec = getCodec(conf); // Set the internal buffer size to read from down stream. if (downStreamBufferSize > 0) { ((Configurable) codec).getConf().setInt("io.file.buffer.size", downStreamBufferSize); } CompressionInputStream cis = codec.createInputStream(downStream, decompressor); BufferedInputStream bis2 = new BufferedInputStream(cis, DATA_IBUF_SIZE); return bis2; }
/** * Returns an {@link InputStream} to the specified file. * * <p>Note: It is the caller's responsibility to close the returned {@link InputStream}. * * @param path The path to the file to open. * @return An {@link InputStream} for the specified file. * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the * specified file. */ @Override public InputStream getFileStream(String path) throws FileBasedHelperException { try { Path p = new Path(path); InputStream in = this.getFileSystem().open(p); // Account for compressed files (e.g. gzip). // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf()); CompressionCodec codec = factory.getCodec(p); return (codec == null) ? in : codec.createInputStream(in); } catch (IOException e) { throw new FileBasedHelperException( "Cannot open file " + path + " due to " + e.getMessage(), e); } }
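// Hedged usage sketch (added for illustration, not part of the original source): the javadoc
// above makes closing the returned stream the caller's responsibility, so try-with-resources
// is one straightforward way to honor it. `helper` and `handleLine` are hypothetical
// placeholders, and the ".gz" path simply shows that the codec is resolved from the extension.
String path = "/data/raw/events.json.gz";
try (InputStream in = helper.getFileStream(path);
    BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
  String line;
  while ((line = reader.readLine()) != null) {
    handleLine(line); // decompression is transparent here; the codec wrapping happened in getFileStream
  }
} catch (Exception e) { // FileBasedHelperException from opening, or IOException from reading
  throw new RuntimeException("Cannot read " + path, e);
}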
public static void decompressFile( final FileSystem fs, final String inFile, final String outFile, boolean deletePrevious) throws IOException { final Path inPath = new Path(inFile); final Path outPath = new Path(outFile); final CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration()); final CompressionCodec codec = factory.getCodec(inPath); final OutputStream out = fs.create(outPath); final InputStream in = codec.createInputStream(fs.open(inPath)); IOUtils.copyBytes(in, out, 8192); IOUtils.closeStream(in); IOUtils.closeStream(out); if (deletePrevious) fs.delete(new Path(inFile), true); }
/** * Creates an output segment file and sets up the output streams to point at it. If the file * already exists, retries with a different filename. This is a bit nasty -- after all, {@link * FileOutputFormat}'s work directory concept is supposed to prevent filename clashes -- but it * looks like Amazon Elastic MapReduce prevents use of per-task work directories if the output of * a job is on S3. * * <p>TODO: Investigate this and find a better solution. */ private void createSegment() throws IOException { segmentsAttempted = 0; bytesWritten = 0; boolean success = false; while (!success) { Path path = workOutputPath.suffix(String.format(extensionFormat, segmentsCreated, segmentsAttempted)); FileSystem fs = path.getFileSystem(conf); try { // The o.a.h.mapred OutputFormats overwrite existing files, whereas // the o.a.h.mapreduce OutputFormats don't overwrite. Bizarre... // Here, overwrite if progress != null, i.e. if using mapred API. FSDataOutputStream fsStream = (progress == null) ? fs.create(path, false) : fs.create(path, progress); byteStream = new CountingOutputStream(new BufferedOutputStream(fsStream)); dataStream = new DataOutputStream(codec == null ? byteStream : codec.createOutputStream(byteStream)); segmentsCreated++; logger.info("Writing to output file: {}", path); success = true; } catch (IOException e) { if (e.getMessage().startsWith("File already exists")) { logger.warn("Tried to create file {} but it already exists; retrying.", path); segmentsAttempted++; // retry } else { throw e; } } } }
@Override public RecordWriter<IEtlKey, CamusWrapper> getDataRecordWriter( TaskAttemptContext context, String fileName, CamusWrapper data, FileOutputCommitter committer) throws IOException, InterruptedException { // If recordDelimiter hasn't been initialized, do so now if (recordDelimiter == null) { recordDelimiter = context.getConfiguration().get(ETL_OUTPUT_RECORD_DELIMITER, DEFAULT_RECORD_DELIMITER); } // Get the filename for this RecordWriter. Path path = new Path( committer.getWorkPath(), EtlMultiOutputFormat.getUniqueFile(context, fileName, getFilenameExtension())); // final FSDataOutputStream writer = // path.getFileSystem(context.getConfiguration()).create(path); FileSystem fs = path.getFileSystem(context.getConfiguration()); DataOutputStream writer; // = fs.create(path, false); if (isCompressed) { return new ByteRecordWriter( new DataOutputStream(codec.createOutputStream(fs.create(path, false))), recordDelimiter); } else { return new ByteRecordWriter(fs.create(path, false), recordDelimiter); } }
public PartitionFinalizer(SecorConfig config) throws Exception { mConfig = config; mKafkaClient = new KafkaClient(mConfig); mZookeeperConnector = new ZookeeperConnector(mConfig); mMessageParser = (TimestampedMessageParser) ReflectionUtil.createMessageParser(mConfig.getMessageParserClass(), mConfig); mQuboleClient = new QuboleClient(mConfig); if (mConfig.getCompressionCodec() != null && !mConfig.getCompressionCodec().isEmpty()) { CompressionCodec codec = CompressionUtil.createCompressionCodec(mConfig.getCompressionCodec()); mFileExtension = codec.getDefaultExtension(); } else { mFileExtension = ""; } }
private DataOutputStream createRawOutputStream(TaskAttemptContext ctx) throws IOException { boolean isCompressed = getCompressOutput(ctx); if (!isCompressed) { Path file = getDefaultWorkFile(ctx, ".nt"); FileSystem fs = file.getFileSystem(ctx.getConfiguration()); return fs.create(file, false); } else { Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(ctx, GzipCodec.class); CompressionCodec codec = ReflectionUtils.newInstance(codecClass, ctx.getConfiguration()); Path file = getDefaultWorkFile(ctx, ".nt" + codec.getDefaultExtension()); FileSystem fs = file.getFileSystem(ctx.getConfiguration()); FSDataOutputStream fileOut = fs.create(file, false); return new DataOutputStream(codec.createOutputStream(fileOut)); } }
@Override public RecordWriter<K, V> getRecordWriter( FileSystem ignored, JobConf job, String name, Progressable progress) throws IOException { boolean isCompressed = getCompressOutput(job); if (!isCompressed) { Path file = FileOutputFormat.getTaskOutputPath(job, name); FileSystem fs = file.getFileSystem(job); FSDataOutputStream fileOut = fs.create(file, progress); return new LineRecordWriter<K, V>(fileOut); } else { Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class); CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job); Path file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension()); FileSystem fs = file.getFileSystem(job); FSDataOutputStream fileOut = fs.create(file, progress); return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut))); } }
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE); start = split.getStart(); end = start + split.getLength(); final Path file = split.getPath(); // open the file and seek to the start of the split final FileSystem fs = file.getFileSystem(job); fileIn = fs.open(file); CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file); if (null != codec) { isCompressedInput = true; decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec) .createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes); start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; } else { in = new SplitLineReader( codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes); filePosition = fileIn; } } else { fileIn.seek(start); in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes); filePosition = fileIn; } // If this is not the first split, we always throw away first record // because we always (except the last split) read one extra line in // next() method. if (start != 0) { start += in.readLine(new Text(), 0, maxBytesToConsume(start)); } this.pos = start; }
public XMLRecordReader(FileSplit split, JobConf jobConf) throws IOException { if (jobConf.get(START_TAG_KEY) == null || jobConf.get(END_TAG_KEY) == null) throw new RuntimeException("Error! XML start and end tags unspecified!"); startTag = jobConf.get(START_TAG_KEY).getBytes("utf-8"); endTag = jobConf.get(END_TAG_KEY).getBytes("utf-8"); start = split.getStart(); Path file = split.getPath(); CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(jobConf); CompressionCodec codec = compressionCodecs.getCodec(file); FileSystem fs = file.getFileSystem(jobConf); if (codec != null) { sLogger.info("Reading compressed file..."); // InputStream tempStream = codec.createInputStream(fileIn); // fsin = new DataInputStream(tempStream); fsin = new DataInputStream(codec.createInputStream(fs.open(file))); end = Long.MAX_VALUE; } else { sLogger.info("Reading uncompressed file..."); FSDataInputStream fileIn = fs.open(file); fileIn.seek(start); fsin = fileIn; end = start + split.getLength(); } recordStartPos = start; // Because input streams of gzipped files are not seekable // (specifically, do not support getPos), we need to keep // track of bytes consumed ourselves. pos = start; }
private static void writeFile(FileSystem fs, Path name, CompressionCodec codec, String contents) throws IOException { OutputStream stm; if (codec == null) { stm = fs.create(name); } else { stm = codec.createOutputStream(fs.create(name)); } stm.write(contents.getBytes()); stm.close(); }
// copied from FaunusFileOutputFormat protected DataOutputStream getDataOuputStream(final TaskAttemptContext job) throws IOException, InterruptedException { final Configuration conf = job.getConfiguration(); boolean isCompressed = getCompressOutput(job); CompressionCodec codec = null; String extension = ""; if (isCompressed) { final Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, DefaultCodec.class); codec = ReflectionUtils.newInstance(codecClass, conf); extension = codec.getDefaultExtension(); } final Path file = super.getDefaultWorkFile(job, extension); final FileSystem fs = file.getFileSystem(conf); if (!isCompressed) { return new DataOutputStream(fs.create(file, false)); } else { return new DataOutputStream(codec.createOutputStream(fs.create(file, false))); } }
/** * Creates a WARC file, and opens it for writing. If a file with the same name already exists, it * is *overwritten*. Note that this is different behaviour from the other constructor. Yes, this * sucks. It will probably change in a future version. * * @param conf The Hadoop configuration. * @param codec If null, the file is uncompressed. If non-null, this compression codec will be * used. The codec's default file extension is appended to the filename. * @param workOutputPath The directory and filename prefix to which the data should be written. We * append a segment number and filename extensions to it. * @param progress An object used by the mapred API for tracking a task's progress. * @throws IOException */ public WARCFileWriter( Configuration conf, CompressionCodec codec, Path workOutputPath, Progressable progress) throws IOException { this.conf = conf; this.codec = codec; this.workOutputPath = workOutputPath; this.progress = progress; this.extensionFormat = ".seg-%05d.attempt-%05d.warc" + (codec == null ? "" : codec.getDefaultExtension()); this.maxSegmentSize = conf.getLong("warc.output.segment.size", DEFAULT_MAX_SEGMENT_SIZE); createSegment(); }
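// Hedged usage sketch (added for illustration, not part of the original source): one plausible
// way calling code might construct this writer with gzip compression, so each segment name
// ends in ".warc.gz". `context` is a hypothetical TaskAttemptContext and the output path is a
// placeholder; passing a null Progressable selects the non-overwriting (mapreduce API) create
// path described in createSegment() above.
Configuration conf = context.getConfiguration();
CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);
Path workOutputPath = new Path("warc-output/part-00000"); // hypothetical directory and filename prefix
WARCFileWriter warcWriter = new WARCFileWriter(conf, codec, workOutputPath, null);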
private void runTest(int numChunks, int chunkSize) throws Exception { CompressionCodec codec = ReflectionUtils.newInstance(LzopCodec.class, conf); final Random writerRand = new Random(12345); final Random readerRand = new Random(12345); File testFile = new File(System.getProperty("test.build.data"), "randdata"); String fileName = testFile.getAbsolutePath(); // Create the file OutputStream fos = new FileOutputStream(fileName); fos = codec.createOutputStream(fos); // Write file byte[] data = new byte[chunkSize]; System.out.println("Start to write to file..."); for (int i = 0; i < numChunks; i++) { writerRand.nextBytes(data); fos.write(data); } fos.close(); System.out.println("Closed file."); // Open file InputStream tis = new FileInputStream(fileName); tis = codec.createInputStream(tis); // Read file byte[] dataExpected = new byte[chunkSize]; byte[] dataRead = new byte[chunkSize]; for (int i = 0; i < numChunks; i++) { readerRand.nextBytes(dataExpected); readFully(tis, dataRead); assertArrayEquals(dataExpected, dataRead); } assertEquals(-1, tis.read()); tis.close(); }
@Override public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException { Configuration conf = job.getConfiguration(); boolean isCompressed = getCompressOutput(job); CompressionCodec codec = null; String extension = ""; if (isCompressed) { Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class); codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf); extension = codec.getDefaultExtension(); } Path file = getDefaultWorkFile(job, extension); FileSystem fs = file.getFileSystem(conf); if (!isCompressed) { FSDataOutputStream fileOut = fs.create(file, false); return new org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter<K, V>( fileOut, KEY_VALUE_SEPARATOR); } else { FSDataOutputStream fileOut = fs.create(file, false); return new org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter<K, V>( new DataOutputStream(codec.createOutputStream(fileOut)), KEY_VALUE_SEPARATOR); } }
@Override public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit split = (FileSplit) genericSplit; start_ = split.getStart(); end_ = start_ + split.getLength(); final Path file = split.getPath(); Configuration job = HadoopCompat.getConfiguration(context); errorTracker = new InputErrorTracker(job); LOG.info("input split: " + file + " " + start_ + ":" + end_); FileSystem fs = file.getFileSystem(job); CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); if (codec == null) { throw new IOException("No codec for file " + file + " found, cannot run"); } // Open the file and seek to the start of the split. fileIn_ = fs.open(split.getPath()); // Creates input stream and also reads the file header. createInputReader(codec.createInputStream(fileIn_), job); if (start_ != 0) { fileIn_.seek(start_); skipToNextSyncPoint(false); start_ = fileIn_.getPos(); LOG.info("Start is now " + start_); } else { skipToNextSyncPoint(true); } pos_ = start_; }
// ${mapred.out.dir}/_temporary/_${taskid}/${nameWithExtension} private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String baseName) throws IOException, InterruptedException { Configuration conf = job.getConfiguration(); boolean isCompressed = getCompressOutput(job); String keyValueSeparator = ","; RecordWriter<K, V> recordWriter = null; if (isCompressed) { Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class); CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf); Path file = new Path(workPath, baseName + codec.getDefaultExtension()); FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false); recordWriter = new LineRecordWriter<K, V>( new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator); } else { Path file = new Path(workPath, baseName); FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false); // file refers to the file name of the output file recordWriter = new LineRecordWriter<K, V>(fileOut, keyValueSeparator); // LineRecordWriter is invoked here } return recordWriter; }
private void shuffleToMemory( MapHost host, MapOutput mapOutput, InputStream input, int decompressedLength, int compressedLength) throws IOException { IFileInputStream checksumIn = new IFileInputStream(input, compressedLength, job); input = checksumIn; // Are map-outputs compressed? if (codec != null) { decompressor.reset(); input = codec.createInputStream(input, decompressor); } // Copy map-output into an in-memory buffer byte[] shuffleData = mapOutput.getMemory(); try { IOUtils.readFully(input, shuffleData, 0, shuffleData.length); metrics.inputBytes(shuffleData.length); LOG.info( "Read " + shuffleData.length + " bytes from map-output for " + mapOutput.getAttemptIdentifier()); } catch (IOException ioe) { // Close the streams IOUtils.cleanup(LOG, input); // Re-throw throw ioe; } }
public ShuffleManager( TezInputContext inputContext, Configuration conf, int numInputs, int bufferSize, boolean ifileReadAheadEnabled, int ifileReadAheadLength, CompressionCodec codec, FetchedInputAllocator inputAllocator) throws IOException { this.inputContext = inputContext; this.numInputs = numInputs; this.shuffledInputsCounter = inputContext.getCounters().findCounter(TaskCounter.NUM_SHUFFLED_INPUTS); this.failedShufflesCounter = inputContext.getCounters().findCounter(TaskCounter.NUM_FAILED_SHUFFLE_INPUTS); this.bytesShuffledCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES); this.decompressedDataSizeCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_DECOMPRESSED); this.bytesShuffledToDiskCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_TO_DISK); this.bytesShuffledToMemCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_TO_MEM); this.ifileBufferSize = bufferSize; this.ifileReadAhead = ifileReadAheadEnabled; this.ifileReadAheadLength = ifileReadAheadLength; this.codec = codec; this.inputManager = inputAllocator; this.srcNameTrimmed = TezUtils.cleanVertexName(inputContext.getSourceVertexName()); completedInputSet = Collections.newSetFromMap(new ConcurrentHashMap<InputIdentifier, Boolean>(numInputs)); completedInputs = new LinkedBlockingQueue<FetchedInput>(numInputs); knownSrcHosts = new ConcurrentHashMap<String, InputHost>(); pendingHosts = new LinkedBlockingQueue<InputHost>(); obsoletedInputs = Collections.newSetFromMap(new ConcurrentHashMap<InputAttemptIdentifier, Boolean>()); runningFetchers = Collections.newSetFromMap(new ConcurrentHashMap<Fetcher, Boolean>()); int maxConfiguredFetchers = conf.getInt( TezJobConfig.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES, TezJobConfig.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES_DEFAULT); this.numFetchers = Math.min(maxConfiguredFetchers, numInputs); ExecutorService fetcherRawExecutor = Executors.newFixedThreadPool( numFetchers, new ThreadFactoryBuilder() .setDaemon(true) .setNameFormat("Fetcher [" + srcNameTrimmed + "] #%d") .build()); this.fetcherExecutor = MoreExecutors.listeningDecorator(fetcherRawExecutor); ExecutorService schedulerRawExecutor = Executors.newFixedThreadPool( 1, new ThreadFactoryBuilder() .setDaemon(true) .setNameFormat("ShuffleRunner [" + srcNameTrimmed + "]") .build()); this.schedulerExecutor = MoreExecutors.listeningDecorator(schedulerRawExecutor); this.startTime = System.currentTimeMillis(); this.lastProgressTime = startTime; this.shuffleSecret = ShuffleUtils.getJobTokenSecretFromTokenBytes( inputContext.getServiceConsumerMetaData( TezConfiguration.TEZ_SHUFFLE_HANDLER_SERVICE_ID)); httpConnectionParams = ShuffleUtils.constructHttpShuffleConnectionParams(conf); LOG.info( this.getClass().getSimpleName() + " : numInputs=" + numInputs + ", compressionCodec=" + (codec == null ? "NoCompressionCodec" : codec.getClass().getName()) + ", numFetchers=" + numFetchers + ", ifileBufferSize=" + ifileBufferSize + ", ifileReadAheadEnabled=" + ifileReadAhead + ", ifileReadAheadLength=" + ifileReadAheadLength + ", " + httpConnectionParams.toString()); }
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Path inputPath = new Path(args[0]); Path outputDir = new Path(args[1]); // Create configuration Configuration conf = new Configuration(true); // Create job @SuppressWarnings("deprecation") Job job = new Job(conf, "CountryIncomeConf"); job.setJarByClass(CountryIncomeConf.class); // Decompressing .gz file Ex. foo.csv.gz to foo.csv String uri = args[0]; FileSystem fs = FileSystem.get(URI.create(uri), conf); CompressionCodecFactory factory = new CompressionCodecFactory(conf); CompressionCodec codec = factory.getCodec(inputPath); if (codec == null) { System.err.println("No codec found for " + uri); System.exit(1); } String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); InputStream in = null; OutputStream out = null; try { in = codec.createInputStream(fs.open(inputPath)); out = fs.create(new Path(outputUri)); IOUtils.copyBytes(in, out, conf); } finally { IOUtils.closeStream(in); IOUtils.closeStream(out); } // Setup MapReduce job.setMapperClass(CountryIncomeMapper.class); job.setReducerClass(CountryIncomeReducer.class); job.setNumReduceTasks(1); // Specify key / value job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); // Input // FileInputFormat.addInputPath(job, inputPath); FileInputFormat.addInputPaths(job, outputUri); job.setInputFormatClass(TextInputFormat.class); // Output FileOutputFormat.setOutputPath(job, outputDir); job.setOutputFormatClass(TextOutputFormat.class); // Delete output if exists FileSystem hdfs = FileSystem.get(conf); if (hdfs.exists(outputDir)) hdfs.delete(outputDir, true); // Execute job int code = job.waitForCompletion(true) ? 0 : 1; // Counter finding and displaying Counters counters = job.getCounters(); // Displaying counters System.out.printf( "Missing Fields: %d, Error Count: %d\n", counters.findCounter(COUNTERS.MISSING_FIELDS_RECORD_COUNT).getValue(), counters.findCounter(COUNTERS.NULL_OR_EMPTY).getValue()); System.exit(code); }
@Test public void testSplitableCodecs() throws IOException { JobConf conf = new JobConf(defaultConf); int seed = new Random().nextInt(); // Create the codec CompressionCodec codec = null; try { codec = (CompressionCodec) ReflectionUtils.newInstance( conf.getClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf); } catch (ClassNotFoundException cnfe) { throw new IOException("Illegal codec!"); } Path file = new Path(workDir, "test" + codec.getDefaultExtension()); // A reporter that does nothing Reporter reporter = Reporter.NULL; LOG.info("seed = " + seed); Random random = new Random(seed); FileSystem localFs = FileSystem.getLocal(conf); localFs.delete(workDir, true); FileInputFormat.setInputPaths(conf, workDir); final int MAX_LENGTH = 500000; // for a variety of lengths for (int length = MAX_LENGTH / 2; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 4) + 1) { LOG.info("creating; entries = " + length); // create a file with length entries Writer writer = new OutputStreamWriter(codec.createOutputStream(localFs.create(file))); try { for (int i = 0; i < length; i++) { writer.write(Integer.toString(i)); writer.write("\n"); } } finally { writer.close(); } // try splitting the file in a variety of sizes TextInputFormat format = new TextInputFormat(); format.configure(conf); LongWritable key = new LongWritable(); Text value = new Text(); for (int i = 0; i < 3; i++) { int numSplits = random.nextInt(MAX_LENGTH / 2000) + 1; LOG.info("splitting: requesting = " + numSplits); InputSplit[] splits = format.getSplits(conf, numSplits); LOG.info("splitting: got = " + splits.length); // check each split BitSet bits = new BitSet(length); for (int j = 0; j < splits.length; j++) { LOG.debug("split[" + j + "]= " + splits[j]); RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], conf, reporter); try { int counter = 0; while (reader.next(key, value)) { int v = Integer.parseInt(value.toString()); LOG.debug("read " + v); if (bits.get(v)) { LOG.warn( "conflict with " + v + " in split " + j + " at position " + reader.getPos()); } assertFalse("Key in multiple partitions.", bits.get(v)); bits.set(v); counter++; } if (counter > 0) { LOG.info("splits[" + j + "]=" + splits[j] + " count=" + counter); } else { LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + counter); } } finally { reader.close(); } } assertEquals("Some keys in no partition.", length, bits.cardinality()); } } }
/** Creates a compression stream without any additional wrapping into buffering streams. */ public CompressionOutputStream createPlainCompressionStream( OutputStream downStream, Compressor compressor) throws IOException { CompressionCodec codec = getCodec(conf); ((Configurable) codec).getConf().setInt("io.file.buffer.size", 32 * 1024); return codec.createOutputStream(downStream, compressor); }
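// Hedged usage sketch (added for illustration, not part of the original source): because this
// method deliberately skips the usual buffering wrappers, a caller that does want buffering
// has to add it explicitly. `obtainCompressor()` and `rawOut` are hypothetical placeholders;
// the compressor would typically be leased from the codec's pool by the surrounding code.
Compressor compressor = obtainCompressor(); // hypothetical helper supplying a pooled Compressor
CompressionOutputStream plain = createPlainCompressionStream(rawOut, compressor);
OutputStream buffered = new BufferedOutputStream(plain, 64 * 1024); // explicit buffering, if desired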
boolean createJobConfAndParseArgs(String... args) throws ParseException, IOException { job = new JobConf(getConf(), Crush.class); /* * Turn off speculative execution because that's just wasting network io. */ job.setMapSpeculativeExecution(false); job.setReduceSpeculativeExecution(false); /* * Turn off pre-emption because we don't want to kill a task after two hours of network io. */ job.set("mapred.fairscheduler.preemption", "false"); tmpDir = new Path("tmp/crush-" + UUID.randomUUID()); outDir = new Path(tmpDir, "out"); double threshold = 0.75; List<String> regexes = asList(".+"); List<String> replacements = asList("crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}"); List<String> inFormats = asList(SequenceFileInputFormat.class.getName()); List<String> outFormats = asList(SequenceFileOutputFormat.class.getName()); String crushTimestamp; Options options = buildOptions(); CommandLine cli = new GnuParser().parse(options, args); if (cli.hasOption("?")) { BufferedReader reader = new BufferedReader( new InputStreamReader(getClass().getClassLoader().getResourceAsStream("help.txt"))); try { String line; while (null != (line = reader.readLine())) { System.out.println(line); } } finally { reader.close(); } return false; } if (cli.hasOption("verbose")) { console = Verbosity.VERBOSE; } else if (cli.hasOption("info")) { console = Verbosity.INFO; } else { console = Verbosity.NONE; } if (cli.hasOption("ignore-regex")) { ignoredFiles = Pattern.compile(cli.getOptionValue("ignore-regex")).matcher(""); } excludeSingleFileDirs = !cli.hasOption("include-single-file-dirs"); String[] nonOptions = cli.getArgs(); if (2 == nonOptions.length) { /* * Stand alone mode accepts two arguments. */ mode = Mode.STAND_ALONE; srcDir = new Path(nonOptions[0]); dest = new Path(nonOptions[1]); if (cli.hasOption("input-format")) { inFormats = asList(cli.getOptionValue("input-format")); } if (cli.hasOption("output-format")) { outFormats = asList(cli.getOptionValue("output-format")); } replacements = asList(dest.getName()); crushTimestamp = Long.toString(currentTimeMillis()); } else { /* * The previous version expected three or four arguments. The third one specified the number of tasks to use, which is an * integral number, just like the third argument in the new version, which is a timestamp. We tell the two apart by looking * at the value of the argument. A timestamp is going to be a huge, 14-digit number while the number of tasks should be much * smaller. */ if ((args.length == 4 || args.length == 3) && args.length == nonOptions.length && args[2].length() != 14) { int maxTasks = Integer.parseInt(args[2]); if (maxTasks <= 0 || maxTasks > 4000) { throw new IllegalArgumentException("Tasks must be in the range [1, 4000]: " + maxTasks); } job.setInt("mapred.reduce.tasks", maxTasks); maxFileBlocks = Integer.MAX_VALUE; crushTimestamp = Long.toString(currentTimeMillis()); srcDir = new Path(args[0]); dest = new Path(args[1]); mode = Mode.CLONE; if (args.length == 4) { if (args[3].equals("TEXT")) { /* * These are the defaults except with text input and output formats. */ inFormats = asList(TextInputFormat.class.getName()); outFormats = asList(TextOutputFormat.class.getName()); } else if (!args[3].equals("SEQUENCE")) { throw new IllegalArgumentException("Type must be either TEXT or SEQUENCE: " + args[3]); } } } else { /* * V2 style arguments. 
*/ if (cli.hasOption("threshold")) { threshold = Double.parseDouble(cli.getOptionValue("threshold")); if (0 >= threshold || 1 < threshold || Double.isInfinite(threshold) || Double.isNaN(threshold)) { throw new IllegalArgumentException( "Block size threshold must be in (0, 1]: " + threshold); } } if (cli.hasOption("max-file-blocks")) { int maxFileBlocksOption = Integer.parseInt(cli.getOptionValue("max-file-blocks")); if (0 > maxFileBlocksOption) { throw new IllegalArgumentException( "Maximum file size in blocks must be positive: " + maxFileBlocksOption); } maxFileBlocks = maxFileBlocksOption; } else { maxFileBlocks = 8; } if (cli.hasOption("regex")) { regexes = asList(cli.getOptionValues("regex")); } if (cli.hasOption("replacement")) { replacements = asList(cli.getOptionValues("replacement")); } if (cli.hasOption("input-format")) { inFormats = asList(cli.getOptionValues("input-format")); } if (cli.hasOption("output-format")) { outFormats = asList(cli.getOptionValues("output-format")); } if (3 != nonOptions.length) { throw new IllegalArgumentException( "Could not find source directory, out directory, and job timestamp"); } srcDir = new Path(nonOptions[0]); dest = new Path(nonOptions[1]); crushTimestamp = nonOptions[2]; if (cli.hasOption("clone")) { mode = Mode.CLONE; } else { mode = Mode.MAP_REDUCE; } if (!crushTimestamp.matches("\\d{14}")) { throw new IllegalArgumentException( "Crush timestamp must be 14 digits yyyymmddhhMMss: " + crushTimestamp); } } dfsBlockSize = Long.parseLong(job.get("dfs.blocksize")); maxEligibleSize = (long) (dfsBlockSize * threshold); } /* * Add the crush specs and compression options to the configuration. */ job.set("crush.timestamp", crushTimestamp); if (ignoredFiles != null) { job.set("crush.ignore-regex", ignoredFiles.pattern().pattern()); } if (regexes.size() != replacements.size() || replacements.size() != inFormats.size() || inFormats.size() != outFormats.size()) { throw new IllegalArgumentException( "Must be an equal number of regex, replacement, in-format, and out-format options"); } job.setInt("crush.num.specs", regexes.size()); matchers = new ArrayList<Matcher>(regexes.size()); for (int i = 0; i < regexes.size(); i++) { job.set(format("crush.%d.regex", i), regexes.get(i)); matchers.add(Pattern.compile(regexes.get(i)).matcher("dummy")); job.set(format("crush.%d.regex.replacement", i), replacements.get(i)); String inFmt = inFormats.get(i); if ("sequence".equals(inFmt)) { inFmt = SequenceFileInputFormat.class.getName(); } else if ("text".equals(inFmt)) { inFmt = TextInputFormat.class.getName(); } else { try { if (!FileInputFormat.class.isAssignableFrom(Class.forName(inFmt))) { throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt); } } catch (ClassNotFoundException e) { throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt); } } job.set(format("crush.%d.input.format", i), inFmt); String outFmt = outFormats.get(i); if ("sequence".equals(outFmt)) { outFmt = SequenceFileOutputFormat.class.getName(); } else if ("text".equals(outFmt)) { outFmt = TextOutputFormat.class.getName(); } else { try { if (!FileOutputFormat.class.isAssignableFrom(Class.forName(outFmt))) { throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt); } } catch (ClassNotFoundException e) { throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt); } } job.set(format("crush.%d.output.format", i), outFmt); } String codec = cli.getOptionValue("compress"); if (null == codec) { codec = DefaultCodec.class.getName(); } else if 
("none".equals(codec)) { codec = null; } else if ("gzip".equals(codec)) { codec = GzipCodec.class.getName(); } else { try { if (!CompressionCodec.class.isAssignableFrom(Class.forName(codec))) { throw new IllegalArgumentException("Not a CompressionCodec: " + codec); } } catch (ClassNotFoundException e) { throw new IllegalArgumentException("Not a CompressionCodec: " + codec); } } if (null == codec) { job.setBoolean("mapred.output.compress", false); } else { job.setBoolean("mapred.output.compress", true); job.set("mapred.output.compression.type", "BLOCK"); job.set("mapred.output.compression.codec", codec); try { CompressionCodec instance = (CompressionCodec) Class.forName(codec).newInstance(); codecExtension = instance.getDefaultExtension(); } catch (Exception e) { throw new AssertionError(); } } return true; }