@Override public void initialize(InputSplit genericSplit, TaskAttemptContext taskAttemptContext) throws IOException { context = taskAttemptContext; FileSplit fileSplit = (FileSplit) genericSplit; lzoFile = fileSplit.getPath(); // The LzoSplitInputFormat is not splittable, so the split length is the whole file. totalFileSize = fileSplit.getLength(); // Jump through some hoops to create the lzo codec. Configuration conf = CompatibilityUtil.getConfiguration(context); CompressionCodecFactory factory = new CompressionCodecFactory(conf); CompressionCodec codec = factory.getCodec(lzoFile); ((Configurable) codec).setConf(conf); LzopDecompressor lzopDecompressor = (LzopDecompressor) codec.createDecompressor(); FileSystem fs = lzoFile.getFileSystem(conf); rawInputStream = fs.open(lzoFile); // Creating the LzopInputStream here just reads the lzo header for us, nothing more. // We do the rest of our input off of the raw stream is. codec.createInputStream(rawInputStream, lzopDecompressor); // This must be called AFTER createInputStream is called, because createInputStream // is what reads the header, which has the checksum information. Otherwise getChecksumsCount // erroneously returns zero, and all block offsets will be wrong. numCompressedChecksums = lzopDecompressor.getCompressedChecksumsCount(); numDecompressedChecksums = lzopDecompressor.getDecompressedChecksumsCount(); }
@Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException { // Obtain path to input list of input images and open input stream FileSplit fileSplit = (FileSplit) split; Path path = fileSplit.getPath(); FileSystem fileSystem = path.getFileSystem(context.getConfiguration()); FSDataInputStream fileIn = fileSystem.open(path); // Note the start and length fields in the FileSplit object are being used to // convey a range of lines in the input list of image URLs startLine = fileSplit.getStart(); numLines = fileSplit.getLength(); linesRead = 0; // total lines read by this particular record reader instance linesPerRecord = 100; // can be modified to change key/value pair size (may improve efficiency) // If it exists, get the relevant compression codec for the FileSplit CompressionCodecFactory codecFactory = new CompressionCodecFactory(context.getConfiguration()); CompressionCodec codec = codecFactory.getCodec(path); // If the codec was found, use it to create an decompressed input stream. // Otherwise, assume input stream is already decompressed if (codec != null) { reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fileIn))); } else { reader = new BufferedReader(new InputStreamReader(fileIn)); } }
/**
 * Reads a compressed file back through its codec and asserts that it contains the expected
 * number of lines.
 *
 * <p>The decompressor borrowed from {@link CodecPool} is returned to the pool and the input
 * stream is closed even when reading fails.
 *
 * @param f the compressed file to verify (read from the local file system)
 * @param expectedNumLines the number of lines the decompressed content must contain
 * @throws IOException if the file cannot be opened or read
 */
private void verifyCompressedFile(Path f, int expectedNumLines) throws IOException {
  Configuration conf = new Configuration();
  conf.set("fs.default.name", "file:///");
  FileSystem fs = FileSystem.get(conf);

  CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
  CompressionCodec codec = ccf.getCodec(f);
  LOG.info("gzip check codec is " + codec);

  Decompressor decompressor = CodecPool.getDecompressor(codec);
  if (null == decompressor) {
    LOG.info("Verifying gzip sanity with null decompressor");
  } else {
    LOG.info("Verifying gzip sanity with decompressor: " + decompressor.toString());
  }

  int numLines = 0;
  InputStream is = null;
  try {
    is = fs.open(f);
    // From here on 'is' is the outermost stream; closing it also closes the raw file stream.
    is = codec.createInputStream(is, decompressor);
    BufferedReader r = new BufferedReader(new InputStreamReader(is));
    while (r.readLine() != null) {
      numLines++;
    }
  } finally {
    try {
      if (is != null) {
        is.close();
      }
    } finally {
      // Hand the pooled decompressor back so it can be reused.
      if (decompressor != null) {
        CodecPool.returnDecompressor(decompressor);
      }
    }
  }

  assertEquals("Did not read back correct number of lines", expectedNumLines, numLines);
  LOG.info("gzip sanity check returned " + numLines + " lines; ok.");
}
/**
 * Create a data file that gets exported to the db.
 *
 * <p>The file is written to the table path on the local file system as {@code part<fileNum>.txt}
 * (with a {@code .gz} suffix when gzipped). Gzipped files are read back and checked with
 * {@code verifyCompressedFile}.
 *
 * @param fileNum the number of the file (for multi-file export)
 * @param numRecords how many records to write to the file.
 * @param gzip is true if the file should be gzipped.
 * @param extraCols extra column generators, passed through to {@code getRecordLine}
 * @throws IOException if the file cannot be created or written
 */
private void createTextFile(
    int fileNum, int numRecords, boolean gzip, ColumnGenerator... extraCols) throws IOException {
  int startId = fileNum * numRecords;

  String ext = ".txt";
  if (gzip) {
    ext = ext + ".gz";
  }
  Path tablePath = getTablePath();
  Path filePath = new Path(tablePath, "part" + fileNum + ext);

  Configuration conf = new Configuration();
  conf.set("fs.default.name", "file:///");
  FileSystem fs = FileSystem.get(conf);
  fs.mkdirs(tablePath);

  OutputStream os = fs.create(filePath);
  try {
    if (gzip) {
      CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
      CompressionCodec codec = ccf.getCodec(filePath);
      os = codec.createOutputStream(os);
    }
    BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
    for (int i = 0; i < numRecords; i++) {
      w.write(getRecordLine(startId + i, extraCols));
    }
    // Flushes the buffered text and closes the (possibly compressing) stream chain.
    w.close();
  } finally {
    // Same second close as before on success; on failure this releases the file handle.
    os.close();
  }

  if (gzip) {
    verifyCompressedFile(filePath, numRecords);
  }
}
/**
 * Creates a reader over the given split.
 *
 * <p>For uncompressed files the stream is positioned at the first record via
 * {@code positionAtFirstRecord}. For files with a matching compression codec the split must
 * start at offset 0 and the reader consumes the stream until its end.
 *
 * @param conf the configuration used to resolve the file system and codecs
 * @param split the split to read
 * @throws IOException if the file cannot be opened or positioned
 * @throws RuntimeException if the file is compressed and the split does not start at 0
 */
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException {
  setConf(conf);
  file = split.getPath();
  start = split.getStart();
  end = start + split.getLength();

  FileSystem fs = file.getFileSystem(conf);
  FSDataInputStream fileIn = fs.open(file);
  boolean initialized = false;
  try {
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) {
      // no codec. Uncompressed file.
      positionAtFirstRecord(fileIn);
      inputStream = fileIn;
    } else {
      // compressed file
      if (start != 0) {
        throw new RuntimeException(
            "Start position for compressed file is not 0! (found " + start + ")");
      }
      inputStream = codec.createInputStream(fileIn);
      end = Long.MAX_VALUE; // read until the end of the file
    }

    lineReader = new LineReader(inputStream);
    initialized = true;
  } finally {
    if (!initialized) {
      // Construction failed: nobody else can close the stream we opened.
      try {
        fileIn.close();
      } catch (IOException ignored) {
        // Keep the original failure; a close error here adds nothing useful.
      }
    }
  }
}
/**
 * Decompresses {@code inFile} into {@code outFile}, choosing the codec from the input file's
 * extension.
 *
 * @param fs the file system holding both files
 * @param inFile path of the compressed input file
 * @param outFile path of the decompressed output file to create
 * @param deletePrevious if true, the input file is deleted after a successful decompression
 * @throws IOException if no codec matches {@code inFile}, or if reading, writing or closing
 *     fails
 */
public static void decompressFile(
    final FileSystem fs, final String inFile, final String outFile, boolean deletePrevious)
    throws IOException {
  final Path inPath = new Path(inFile);
  final Path outPath = new Path(outFile);

  final CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
  final CompressionCodec codec = factory.getCodec(inPath);
  if (codec == null) {
    // Fail before creating the output file instead of hitting a NullPointerException later.
    throw new IOException("No compression codec found for " + inFile);
  }

  InputStream in = null;
  OutputStream out = null;
  try {
    in = fs.open(inPath);
    // 'in' now refers to the outermost stream; closing it closes the raw stream too.
    in = codec.createInputStream(in);
    out = fs.create(outPath);
    IOUtils.copyBytes(in, out, 8192);
    // Close explicitly so that a failed flush/close surfaces before the input is deleted.
    out.close();
  } finally {
    IOUtils.closeStream(in);
    IOUtils.closeStream(out);
  }

  if (deletePrevious) {
    fs.delete(inPath, true);
  }
}
/** * Returns an {@link InputStream} to the specified file. * * <p>Note: It is the caller's responsibility to close the returned {@link InputStream}. * * @param path The path to the file to open. * @return An {@link InputStream} for the specified file. * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the * specified file. */ @Override public InputStream getFileStream(String path) throws FileBasedHelperException { try { Path p = new Path(path); InputStream in = this.getFileSystem().open(p); // Account for compressed files (e.g. gzip). // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf()); CompressionCodec codec = factory.getCodec(p); return (codec == null) ? in : codec.createInputStream(in); } catch (IOException e) { throw new FileBasedHelperException( "Cannot open file " + path + " due to " + e.getMessage(), e); } }
/**
 * Creates a record reader over the given split.
 *
 * <p>Compressed files are read through their codec until the end of the stream. For
 * uncompressed files whose split does not start at offset 0, the reader backs up one byte and
 * discards the first (partial) line.
 *
 * @param inputSplit the split to read
 * @param job the job configuration
 * @throws IOException if the file cannot be opened, positioned or read
 */
public MutilCharRecordReader(FileSplit inputSplit, Configuration job) throws IOException {
  maxLineLength = job.getInt("mapred.mutilCharRecordReader.maxlength", Integer.MAX_VALUE);
  start = inputSplit.getStart();
  end = start + inputSplit.getLength();

  final Path splitFile = inputSplit.getPath();
  compressionCodecs = new CompressionCodecFactory(job);
  final CompressionCodec codec = compressionCodecs.getCodec(splitFile);

  // Open the file on its file system.
  final FSDataInputStream rawIn = splitFile.getFileSystem(job).open(splitFile);

  if (codec == null) {
    final boolean startsMidFile = start != 0;
    if (startsMidFile) {
      // Step back one byte so that the first readLine consumes the remainder of the
      // previous line.
      start -= 1;
      rawIn.seek(start);
    }
    lineReader = new LineReader(rawIn, job);
    if (startsMidFile) {
      // Throw away the first line and advance start by the number of bytes consumed.
      start +=
          lineReader.readLine(
              new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
  } else {
    lineReader = new LineReader(codec.createInputStream(rawIn), job);
    end = Long.MAX_VALUE;
  }

  this.pos = start;
}
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); m_Sb.setLength(0); m_Start = split.getStart(); m_End = m_Start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the m_Start of the split FileSystem fs = file.getFileSystem(job); // getFileStatus fileStatus = fs.getFileStatus(split.getPath()); //noinspection deprecation @SuppressWarnings(value = "deprecated") long length = fs.getLength(file); FSDataInputStream fileIn = fs.open(split.getPath()); if (m_Start > 0) fileIn.seek(m_Start); if (codec != null) { CompressionInputStream inputStream = codec.createInputStream(fileIn); m_Input = new BufferedReader(new InputStreamReader(inputStream)); m_End = length; } else { m_Input = new BufferedReader(new InputStreamReader(fileIn)); } m_Current = m_Start; m_Key = split.getPath().getName(); }
/**
 * A little test program.
 *
 * <p>Arguments are processed in order. {@code -in} switches to compression mode and {@code -out}
 * switches to decompression mode (the initial mode). Every other argument is a file name whose
 * codec is looked up by suffix: in compression mode the file with the codec's suffix removed is
 * compressed into the named file; in decompression mode the named file is decompressed to
 * standard output.
 *
 * @param args mode switches and file names
 * @throws Exception if a file cannot be read or written
 */
public static void main(String[] args) throws Exception {
  CompressionCodecFactory codecs = new CompressionCodecFactory(new Configuration());
  boolean compress = false;

  for (String arg : args) {
    if ("-in".equals(arg)) {
      compress = true;
      continue;
    }
    if ("-out".equals(arg)) {
      compress = false;
      continue;
    }

    CompressionCodec codec = codecs.getCodec(new Path(arg));
    if (codec == null) {
      System.out.println("Codec for " + arg + " not found.");
      continue;
    }

    byte[] chunk = new byte[100];
    int count;
    if (compress) {
      CompressionOutputStream sink = codec.createOutputStream(new java.io.FileOutputStream(arg));
      // The uncompressed source is the target name without the codec's extension.
      java.io.InputStream source =
          new java.io.FileInputStream(removeSuffix(arg, codec.getDefaultExtension()));
      while ((count = source.read(chunk)) > 0) {
        sink.write(chunk, 0, count);
      }
      source.close();
      sink.close();
    } else {
      CompressionInputStream source = codec.createInputStream(new java.io.FileInputStream(arg));
      while ((count = source.read(chunk)) > 0) {
        System.out.write(chunk, 0, count);
      }
      source.close();
    }
  }
}
/**
 * Creates a reader that scans the split for records delimited by the configured start and end
 * tags.
 *
 * <p>Compressed files are read through their codec until the end of the stream; uncompressed
 * files are read from the split start for the split length.
 *
 * @param split the split to read
 * @param jobConf the job configuration holding the start/end tag settings
 * @throws IOException if the file cannot be opened or positioned
 * @throws RuntimeException if either tag is missing from the configuration
 */
public XMLRecordReader(FileSplit split, JobConf jobConf) throws IOException {
  final String startTagText = jobConf.get(START_TAG_KEY);
  final String endTagText = jobConf.get(END_TAG_KEY);
  if (startTagText == null || endTagText == null) {
    throw new RuntimeException("Error! XML start and end tags unspecified!");
  }
  startTag = startTagText.getBytes("utf-8");
  endTag = endTagText.getBytes("utf-8");

  start = split.getStart();
  final Path xmlFile = split.getPath();

  final CompressionCodec codec = new CompressionCodecFactory(jobConf).getCodec(xmlFile);
  final FileSystem fileSystem = xmlFile.getFileSystem(jobConf);

  if (codec == null) {
    sLogger.info("Reading uncompressed file...");
    final FSDataInputStream plainIn = fileSystem.open(xmlFile);
    plainIn.seek(start);
    fsin = plainIn;
    end = start + split.getLength();
  } else {
    sLogger.info("Reading compressed file...");
    fsin = new DataInputStream(codec.createInputStream(fileSystem.open(xmlFile)));
    end = Long.MAX_VALUE;
  }

  recordStartPos = start;

  // Input streams of gzipped files are not seekable (specifically, they do not support
  // getPos), so the number of bytes consumed is tracked here instead.
  pos = start;
}
/**
 * Creates a {@code StreamXmlRecordReader} for the given split.
 *
 * <p>Compressed input is not supported: when a codec matches the split's file, the method fails
 * before the file is opened, so no stream is left open behind the exception.
 *
 * @param genericSplit the split to read; cast to {@link FileSplit}
 * @param job the job configuration
 * @param reporter progress reporter; its status is set to the split's string form
 * @return a record reader over the split
 * @throws IOException if the file cannot be opened
 * @throws RuntimeException if a compression codec matches the file
 */
public RecordReader<Text, Text> getRecordReader(
    InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
  reporter.setStatus(genericSplit.toString());

  FileSplit split = (FileSplit) genericSplit;
  final Path file = split.getPath();

  // Check for compression first: opening the file and then throwing would leak the stream.
  if (compressionCodecs != null && compressionCodecs.getCodec(file) != null) {
    throw new RuntimeException("Not handling compression!");
  }

  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(file);
  return new StreamXmlRecordReader(fileIn, split, reporter, job, FileSystem.get(job));
}
@Override public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit split = (FileSplit) genericSplit; start_ = split.getStart(); end_ = start_ + split.getLength(); final Path file = split.getPath(); Configuration job = HadoopCompat.getConfiguration(context); errorTracker = new InputErrorTracker(job); LOG.info("input split: " + file + " " + start_ + ":" + end_); FileSystem fs = file.getFileSystem(job); CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); if (codec == null) { throw new IOException("No codec for file " + file + " found, cannot run"); } // Open the file and seek to the start of the split. fileIn_ = fs.open(split.getPath()); // Creates input stream and also reads the file header. createInputReader(codec.createInputStream(fileIn_), job); if (start_ != 0) { fileIn_.seek(start_); skipToNextSyncPoint(false); start_ = fileIn_.getPos(); LOG.info("Start is now " + start_); } else { skipToNextSyncPoint(true); } pos_ = start_; }
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Path inputPath = new Path(args[0]); Path outputDir = new Path(args[1]); // Create configuration Configuration conf = new Configuration(true); // Create job @SuppressWarnings("deprecation") Job job = new Job(conf, "CountryIncomeConf"); job.setJarByClass(CountryIncomeConf.class); // Decompressing .gz file Ex. foo.csv.gz to foo.csv String uri = args[0]; FileSystem fs = FileSystem.get(URI.create(uri), conf); CompressionCodecFactory factory = new CompressionCodecFactory(conf); CompressionCodec codec = factory.getCodec(inputPath); if (codec == null) { System.err.println("No codec found for " + uri); System.exit(1); } String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension()); InputStream in = null; OutputStream out = null; try { in = codec.createInputStream(fs.open(inputPath)); out = fs.create(new Path(outputUri)); IOUtils.copyBytes(in, out, conf); } finally { IOUtils.closeStream(in); IOUtils.closeStream(out); } // Setup MapReduce job.setMapperClass(CountryIncomeMapper.class); job.setReducerClass(CountryIncomeReducer.class); job.setNumReduceTasks(1); // Specify key / value job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); // Input // FileInputFormat.addInputPath(job, inputPath); FileInputFormat.addInputPaths(job, outputUri); job.setInputFormatClass(TextInputFormat.class); // Output FileOutputFormat.setOutputPath(job, outputDir); job.setOutputFormatClass(TextOutputFormat.class); // Delete output if exists FileSystem hdfs = FileSystem.get(conf); if (hdfs.exists(outputDir)) hdfs.delete(outputDir, true); // Execute job int code = job.waitForCompletion(true) ? 
0 : 1; // Counter finding and displaying Counters counters = job.getCounters(); // Displaying counters System.out.printf( "Missing Fields: %d, Error Count: %d\n", counters.findCounter(COUNTERS.MISSING_FIELDS_RECORD_COUNT).getValue(), counters.findCounter(COUNTERS.NULL_OR_EMPTY).getValue()); System.exit(code); }
public void runBenchmark() throws IOException { System.out.println(); System.out.println(); System.out.println(); System.out.println(); /* * org.apache.hadoop.io.compress.BZip2Codec, * org.apache.hadoop.io.compress.DefaultCodec * org.apache.hadoop.io.compress.DeflateCodec * org.apache.hadoop.io.compress.GzipCodec * org.apache.hadoop.io.compress.Lz4Codec * org.apache.hadoop.io.compress.SnappyCodec */ Configuration conf = new Configuration(); List<Class<? extends CompressionCodec>> codecList = codecFactory.getCodecClasses(conf); Set<Class<? extends CompressionCodec>> codecSet = new HashSet<Class<? extends CompressionCodec>>(); codecSet.add(null); // no compression case codecSet.addAll(codecList); EnumSet<KV_TRAIT> traitSet = EnumSet.of(KV_TRAIT.KV, KV_TRAIT.MULTI_KV); for (KV_TRAIT trait : traitSet) { boolean rle = false; for (Class<? extends CompressionCodec> codec : codecSet) { try { String fileName = "result_" + trait + "_" + ((codec == null) ? "no_compression" : codec.getSimpleName()) + "_no_rle.out"; CompressionCodec compCodec = (codec == null) ? null : codecFactory.getCodecByClassName(codec.getName()); Path file = new Path(".", fileName); WriterOptions writeOptions = new WriterOptions(); conf.set("ifile.trait", trait.toString()); writeOptions.setConf(conf).setFilePath(fs, file).setCodec(compCodec).setRLE(false); createIFile(writeOptions, trait); Result rs = new Result(fileName, writeOptions, fs.getFileStatus(file).getLen()); System.out.println(rs); // with RLE fileName = "result_" + trait + "_" + ((codec == null) ? "no_compression" : codec.getSimpleName()) + "_rle.out"; writeOptions.setRLE(true); createIFile(writeOptions, trait); rs = new Result(fileName, writeOptions, fs.getFileStatus(file).getLen()); System.out.println(rs); } catch (Throwable t) { t.printStackTrace(); // proceed to the next benchmark. Quite possible that codec jars aren't available } } } }
public static void testFinding() { CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration()); CompressionCodec codec = factory.getCodec(new Path("/tmp/foo.bar")); assertEquals("default factory foo codec", null, codec); codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName()); assertEquals("default factory foo codec", null, codec); codec = factory.getCodec(new Path("/tmp/foo.gz")); checkCodec("default factory for .gz", GzipCodec.class, codec); codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName()); checkCodec("default factory for gzip codec", GzipCodec.class, codec); codec = factory.getCodecByName("gzip"); checkCodec("default factory for gzip codec", GzipCodec.class, codec); codec = factory.getCodecByName("GZIP"); checkCodec("default factory for gzip codec", GzipCodec.class, codec); codec = factory.getCodecByName("GZIPCodec"); checkCodec("default factory for gzip codec", GzipCodec.class, codec); codec = factory.getCodecByName("gzipcodec"); checkCodec("default factory for gzip codec", GzipCodec.class, codec); Class klass = factory.getCodecClassByName("gzipcodec"); assertEquals(GzipCodec.class, klass); codec = factory.getCodec(new Path("/tmp/foo.bz2")); checkCodec("default factory for .bz2", BZip2Codec.class, codec); codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName()); checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodecByName("bzip2"); checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodecByName("bzip2codec"); checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodecByName("BZIP2"); checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodecByName("BZIP2CODEC"); checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodecByClassName(DeflateCodec.class.getCanonicalName()); checkCodec("default 
factory for deflate codec", DeflateCodec.class, codec); codec = factory.getCodecByName("deflate"); checkCodec("default factory for deflate codec", DeflateCodec.class, codec); codec = factory.getCodecByName("deflatecodec"); checkCodec("default factory for deflate codec", DeflateCodec.class, codec); codec = factory.getCodecByName("DEFLATE"); checkCodec("default factory for deflate codec", DeflateCodec.class, codec); codec = factory.getCodecByName("DEFLATECODEC"); checkCodec("default factory for deflate codec", DeflateCodec.class, codec); factory = setClasses(new Class[0]); // gz, bz2, snappy, lz4 are picked up by service loader, but bar isn't codec = factory.getCodec(new Path("/tmp/foo.bar")); assertEquals("empty factory bar codec", null, codec); codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName()); assertEquals("empty factory bar codec", null, codec); codec = factory.getCodec(new Path("/tmp/foo.gz")); checkCodec("empty factory gz codec", GzipCodec.class, codec); codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName()); checkCodec("empty factory gz codec", GzipCodec.class, codec); codec = factory.getCodec(new Path("/tmp/foo.bz2")); checkCodec("empty factory for .bz2", BZip2Codec.class, codec); codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName()); checkCodec("empty factory for bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodec(new Path("/tmp/foo.snappy")); checkCodec("empty factory snappy codec", SnappyCodec.class, codec); codec = factory.getCodecByClassName(SnappyCodec.class.getCanonicalName()); checkCodec("empty factory snappy codec", SnappyCodec.class, codec); codec = factory.getCodec(new Path("/tmp/foo.lz4")); checkCodec("empty factory lz4 codec", Lz4Codec.class, codec); codec = factory.getCodecByClassName(Lz4Codec.class.getCanonicalName()); checkCodec("empty factory lz4 codec", Lz4Codec.class, codec); factory = setClasses(new Class[] {BarCodec.class, FooCodec.class, FooBarCodec.class}); codec = 
factory.getCodec(new Path("/tmp/.foo.bar.gz")); checkCodec("full factory gz codec", GzipCodec.class, codec); codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName()); checkCodec("full codec gz codec", GzipCodec.class, codec); codec = factory.getCodec(new Path("/tmp/foo.bz2")); checkCodec("full factory for .bz2", BZip2Codec.class, codec); codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName()); checkCodec("full codec bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodec(new Path("/tmp/foo.bar")); checkCodec("full factory bar codec", BarCodec.class, codec); codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName()); checkCodec("full factory bar codec", BarCodec.class, codec); codec = factory.getCodecByName("bar"); checkCodec("full factory bar codec", BarCodec.class, codec); codec = factory.getCodecByName("BAR"); checkCodec("full factory bar codec", BarCodec.class, codec); codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar")); checkCodec("full factory foo bar codec", FooBarCodec.class, codec); codec = factory.getCodecByClassName(FooBarCodec.class.getCanonicalName()); checkCodec("full factory foo bar codec", FooBarCodec.class, codec); codec = factory.getCodecByName("foobar"); checkCodec("full factory foo bar codec", FooBarCodec.class, codec); codec = factory.getCodecByName("FOOBAR"); checkCodec("full factory foo bar codec", FooBarCodec.class, codec); codec = factory.getCodec(new Path("/tmp/foo.foo")); checkCodec("full factory foo codec", FooCodec.class, codec); codec = factory.getCodecByClassName(FooCodec.class.getCanonicalName()); checkCodec("full factory foo codec", FooCodec.class, codec); codec = factory.getCodecByName("foo"); checkCodec("full factory foo codec", FooCodec.class, codec); codec = factory.getCodecByName("FOO"); checkCodec("full factory foo codec", FooCodec.class, codec); factory = setClasses(new Class[] {NewGzipCodec.class}); codec = factory.getCodec(new Path("/tmp/foo.gz")); 
checkCodec("overridden factory for .gz", NewGzipCodec.class, codec); codec = factory.getCodecByClassName(NewGzipCodec.class.getCanonicalName()); checkCodec("overridden factory for gzip codec", NewGzipCodec.class, codec); }
/**
 * Reports whether the given file may be split.
 *
 * @param fs the file system holding the file (not consulted here)
 * @param file the file to check
 * @return {@code true} when no codec factory is set or no codec matches the file;
 *     {@code false} when a compression codec matches
 */
protected boolean isSplitable(FileSystem fs, Path file) {
  return compressionCodecs == null || compressionCodecs.getCodec(file) == null;
}
/**
 * Builds a codec factory from a fresh configuration in which the given codec classes have been
 * registered.
 *
 * @param classes the codec classes to include
 * @return a new factory
 */
private static CompressionCodecFactory setClasses(Class[] classes) {
  final Configuration codecConf = new Configuration();
  CompressionCodecFactory.setCodecClasses(codecConf, Arrays.asList(classes));
  final CompressionCodecFactory configuredFactory = new CompressionCodecFactory(codecConf);
  return configuredFactory;
}