public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); m_Sb.setLength(0); m_Start = split.getStart(); m_End = m_Start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // open the file and seek to the m_Start of the split FileSystem fs = file.getFileSystem(job); // getFileStatus fileStatus = fs.getFileStatus(split.getPath()); //noinspection deprecation @SuppressWarnings(value = "deprecated") long length = fs.getLength(file); FSDataInputStream fileIn = fs.open(split.getPath()); if (m_Start > 0) fileIn.seek(m_Start); if (codec != null) { CompressionInputStream inputStream = codec.createInputStream(fileIn); m_Input = new BufferedReader(new InputStreamReader(inputStream)); m_End = length; } else { m_Input = new BufferedReader(new InputStreamReader(fileIn)); } m_Current = m_Start; m_Key = split.getPath().getName(); }
/**
 * A little command-line test program for codec lookup.
 *
 * <p>Arguments are processed in order. {@code -in} switches to compress mode and {@code -out}
 * switches to decompress mode, which is also the initial mode. Every other argument is treated
 * as a file name whose codec is looked up through the factory:
 *
 * <ul>
 *   <li>if no codec is found, a message is printed to standard output;
 *   <li>in compress mode, the file named by {@code removeSuffix(arg, defaultExtension)} is
 *       compressed into the file named by the argument;
 *   <li>in decompress mode, the file named by the argument is decompressed to standard output.
 * </ul>
 *
 * <p>All streams are closed even when reading or writing fails.
 *
 * @param args mode flags ({@code -in}, {@code -out}) and file names
 * @throws Exception if opening, reading or writing any file fails
 */
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  CompressionCodecFactory factory = new CompressionCodecFactory(conf);
  boolean encode = false;
  for (int i = 0; i < args.length; ++i) {
    if ("-in".equals(args[i])) {
      encode = true;
    } else if ("-out".equals(args[i])) {
      encode = false;
    } else {
      CompressionCodec codec = factory.getCodec(new Path(args[i]));
      if (codec == null) {
        System.out.println("Codec for " + args[i] + " not found.");
      } else if (encode) {
        String inFilename = removeSuffix(args[i], codec.getDefaultExtension());
        // Open the input first so a missing input file does not truncate the output file.
        java.io.InputStream in = new java.io.FileInputStream(inFilename);
        try {
          java.io.OutputStream out = new java.io.FileOutputStream(args[i]);
          try {
            // Re-point 'out' at the codec stream; whichever stream it refers to gets closed.
            out = codec.createOutputStream(out);
            byte[] buffer = new byte[100];
            int len = in.read(buffer);
            while (len > 0) {
              out.write(buffer, 0, len);
              len = in.read(buffer);
            }
          } finally {
            out.close();
          }
        } finally {
          in.close();
        }
      } else {
        java.io.InputStream in = new java.io.FileInputStream(args[i]);
        try {
          // Re-point 'in' at the codec stream; whichever stream it refers to gets closed.
          in = codec.createInputStream(in);
          byte[] buffer = new byte[100];
          int len = in.read(buffer);
          while (len > 0) {
            System.out.write(buffer, 0, len);
            len = in.read(buffer);
          }
          System.out.flush();
        } finally {
          in.close();
        }
      }
    }
  }
}
public static void testFinding() { CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration()); CompressionCodec codec = factory.getCodec(new Path("/tmp/foo.bar")); assertEquals("default factory foo codec", null, codec); codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName()); assertEquals("default factory foo codec", null, codec); codec = factory.getCodec(new Path("/tmp/foo.gz")); checkCodec("default factory for .gz", GzipCodec.class, codec); codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName()); checkCodec("default factory for gzip codec", GzipCodec.class, codec); codec = factory.getCodecByName("gzip"); checkCodec("default factory for gzip codec", GzipCodec.class, codec); codec = factory.getCodecByName("GZIP"); checkCodec("default factory for gzip codec", GzipCodec.class, codec); codec = factory.getCodecByName("GZIPCodec"); checkCodec("default factory for gzip codec", GzipCodec.class, codec); codec = factory.getCodecByName("gzipcodec"); checkCodec("default factory for gzip codec", GzipCodec.class, codec); Class klass = factory.getCodecClassByName("gzipcodec"); assertEquals(GzipCodec.class, klass); codec = factory.getCodec(new Path("/tmp/foo.bz2")); checkCodec("default factory for .bz2", BZip2Codec.class, codec); codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName()); checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodecByName("bzip2"); checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodecByName("bzip2codec"); checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodecByName("BZIP2"); checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodecByName("BZIP2CODEC"); checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodecByClassName(DeflateCodec.class.getCanonicalName()); checkCodec("default 
factory for deflate codec", DeflateCodec.class, codec); codec = factory.getCodecByName("deflate"); checkCodec("default factory for deflate codec", DeflateCodec.class, codec); codec = factory.getCodecByName("deflatecodec"); checkCodec("default factory for deflate codec", DeflateCodec.class, codec); codec = factory.getCodecByName("DEFLATE"); checkCodec("default factory for deflate codec", DeflateCodec.class, codec); codec = factory.getCodecByName("DEFLATECODEC"); checkCodec("default factory for deflate codec", DeflateCodec.class, codec); factory = setClasses(new Class[0]); // gz, bz2, snappy, lz4 are picked up by service loader, but bar isn't codec = factory.getCodec(new Path("/tmp/foo.bar")); assertEquals("empty factory bar codec", null, codec); codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName()); assertEquals("empty factory bar codec", null, codec); codec = factory.getCodec(new Path("/tmp/foo.gz")); checkCodec("empty factory gz codec", GzipCodec.class, codec); codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName()); checkCodec("empty factory gz codec", GzipCodec.class, codec); codec = factory.getCodec(new Path("/tmp/foo.bz2")); checkCodec("empty factory for .bz2", BZip2Codec.class, codec); codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName()); checkCodec("empty factory for bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodec(new Path("/tmp/foo.snappy")); checkCodec("empty factory snappy codec", SnappyCodec.class, codec); codec = factory.getCodecByClassName(SnappyCodec.class.getCanonicalName()); checkCodec("empty factory snappy codec", SnappyCodec.class, codec); codec = factory.getCodec(new Path("/tmp/foo.lz4")); checkCodec("empty factory lz4 codec", Lz4Codec.class, codec); codec = factory.getCodecByClassName(Lz4Codec.class.getCanonicalName()); checkCodec("empty factory lz4 codec", Lz4Codec.class, codec); factory = setClasses(new Class[] {BarCodec.class, FooCodec.class, FooBarCodec.class}); codec = 
factory.getCodec(new Path("/tmp/.foo.bar.gz")); checkCodec("full factory gz codec", GzipCodec.class, codec); codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName()); checkCodec("full codec gz codec", GzipCodec.class, codec); codec = factory.getCodec(new Path("/tmp/foo.bz2")); checkCodec("full factory for .bz2", BZip2Codec.class, codec); codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName()); checkCodec("full codec bzip2 codec", BZip2Codec.class, codec); codec = factory.getCodec(new Path("/tmp/foo.bar")); checkCodec("full factory bar codec", BarCodec.class, codec); codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName()); checkCodec("full factory bar codec", BarCodec.class, codec); codec = factory.getCodecByName("bar"); checkCodec("full factory bar codec", BarCodec.class, codec); codec = factory.getCodecByName("BAR"); checkCodec("full factory bar codec", BarCodec.class, codec); codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar")); checkCodec("full factory foo bar codec", FooBarCodec.class, codec); codec = factory.getCodecByClassName(FooBarCodec.class.getCanonicalName()); checkCodec("full factory foo bar codec", FooBarCodec.class, codec); codec = factory.getCodecByName("foobar"); checkCodec("full factory foo bar codec", FooBarCodec.class, codec); codec = factory.getCodecByName("FOOBAR"); checkCodec("full factory foo bar codec", FooBarCodec.class, codec); codec = factory.getCodec(new Path("/tmp/foo.foo")); checkCodec("full factory foo codec", FooCodec.class, codec); codec = factory.getCodecByClassName(FooCodec.class.getCanonicalName()); checkCodec("full factory foo codec", FooCodec.class, codec); codec = factory.getCodecByName("foo"); checkCodec("full factory foo codec", FooCodec.class, codec); codec = factory.getCodecByName("FOO"); checkCodec("full factory foo codec", FooCodec.class, codec); factory = setClasses(new Class[] {NewGzipCodec.class}); codec = factory.getCodec(new Path("/tmp/foo.gz")); 
checkCodec("overridden factory for .gz", NewGzipCodec.class, codec); codec = factory.getCodecByClassName(NewGzipCodec.class.getCanonicalName()); checkCodec("overridden factory for gzip codec", NewGzipCodec.class, codec); }
/**
 * Builds a codec factory from a fresh configuration on which the given codec classes have been
 * set via {@code CompressionCodecFactory.setCodecClasses}.
 *
 * @param classes the codec classes to include
 * @return a new factory
 */
private static CompressionCodecFactory setClasses(Class[] classes) {
  final Configuration codecConf = new Configuration();
  final java.util.List<Class> requested = Arrays.asList(classes);
  CompressionCodecFactory.setCodecClasses(codecConf, requested);
  return new CompressionCodecFactory(codecConf);
}