public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
      FileSplit split = (FileSplit) genericSplit;
      Configuration job = context.getConfiguration();
      m_Sb.setLength(0);
      m_Start = split.getStart();
      m_End = m_Start + split.getLength();
      final Path file = split.getPath();
      compressionCodecs = new CompressionCodecFactory(job);
      final CompressionCodec codec = compressionCodecs.getCodec(file);

      // open the file and seek to the m_Start of the split
      FileSystem fs = file.getFileSystem(job);
      //  getFileStatus fileStatus = fs.getFileStatus(split.getPath());
      //noinspection deprecation
      @SuppressWarnings(value = "deprecated")
      long length = fs.getLength(file);
      FSDataInputStream fileIn = fs.open(split.getPath());
      if (m_Start > 0) fileIn.seek(m_Start);
      if (codec != null) {
        CompressionInputStream inputStream = codec.createInputStream(fileIn);
        m_Input = new BufferedReader(new InputStreamReader(inputStream));
        m_End = length;
      } else {
        m_Input = new BufferedReader(new InputStreamReader(fileIn));
      }
      m_Current = m_Start;
      m_Key = split.getPath().getName();
    }
Exemple #2
0
 public CompressionCodec compressionCodec() {
   byte magicByte = magic();
   switch (magicByte) {
     case 0:
       return CompressionCodec.NoCompressionCodec;
     case 1:
       return CompressionCodec.valueOf(buffer.get(ATTRIBUTE_OFFSET) & CompressionCodeMask);
   }
   throw new RuntimeException("Invalid magic byte " + magicByte);
 }
 /**
  * A little test program.
  *
  * @param args
  */
 public static void main(String[] args) throws Exception {
   Configuration conf = new Configuration();
   CompressionCodecFactory factory = new CompressionCodecFactory(conf);
   boolean encode = false;
   for (int i = 0; i < args.length; ++i) {
     if ("-in".equals(args[i])) {
       encode = true;
     } else if ("-out".equals(args[i])) {
       encode = false;
     } else {
       CompressionCodec codec = factory.getCodec(new Path(args[i]));
       if (codec == null) {
         System.out.println("Codec for " + args[i] + " not found.");
       } else {
         if (encode) {
           CompressionOutputStream out =
               codec.createOutputStream(new java.io.FileOutputStream(args[i]));
           byte[] buffer = new byte[100];
           String inFilename = removeSuffix(args[i], codec.getDefaultExtension());
           java.io.InputStream in = new java.io.FileInputStream(inFilename);
           int len = in.read(buffer);
           while (len > 0) {
             out.write(buffer, 0, len);
             len = in.read(buffer);
           }
           in.close();
           out.close();
         } else {
           CompressionInputStream in =
               codec.createInputStream(new java.io.FileInputStream(args[i]));
           byte[] buffer = new byte[100];
           int len = in.read(buffer);
           while (len > 0) {
             System.out.write(buffer, 0, len);
             len = in.read(buffer);
           }
           in.close();
         }
       }
     }
   }
 }
 private static void writeFile(FileSystem fs, Path name, CompressionCodec codec, String contents)
     throws IOException {
   OutputStream stm;
   if (codec == null) {
     stm = fs.create(name);
   } else {
     stm = codec.createOutputStream(fs.create(name));
   }
   stm.write(contents.getBytes());
   stm.close();
 }
  @Test
  public void testSplitableCodecs() throws IOException {
    JobConf conf = new JobConf(defaultConf);
    int seed = new Random().nextInt();
    // Create the codec
    CompressionCodec codec = null;
    try {
      codec =
          (CompressionCodec)
              ReflectionUtils.newInstance(
                  conf.getClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
    } catch (ClassNotFoundException cnfe) {
      throw new IOException("Illegal codec!");
    }
    Path file = new Path(workDir, "test" + codec.getDefaultExtension());

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;
    LOG.info("seed = " + seed);
    Random random = new Random(seed);
    FileSystem localFs = FileSystem.getLocal(conf);

    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(conf, workDir);

    final int MAX_LENGTH = 500000;

    // for a variety of lengths
    for (int length = MAX_LENGTH / 2;
        length < MAX_LENGTH;
        length += random.nextInt(MAX_LENGTH / 4) + 1) {

      LOG.info("creating; entries = " + length);

      // create a file with length entries
      Writer writer = new OutputStreamWriter(codec.createOutputStream(localFs.create(file)));
      try {
        for (int i = 0; i < length; i++) {
          writer.write(Integer.toString(i));
          writer.write("\n");
        }
      } finally {
        writer.close();
      }

      // try splitting the file in a variety of sizes
      TextInputFormat format = new TextInputFormat();
      format.configure(conf);
      LongWritable key = new LongWritable();
      Text value = new Text();
      for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(MAX_LENGTH / 2000) + 1;
        LOG.info("splitting: requesting = " + numSplits);
        InputSplit[] splits = format.getSplits(conf, numSplits);
        LOG.info("splitting: got =        " + splits.length);

        // check each split
        BitSet bits = new BitSet(length);
        for (int j = 0; j < splits.length; j++) {
          LOG.debug("split[" + j + "]= " + splits[j]);
          RecordReader<LongWritable, Text> reader =
              format.getRecordReader(splits[j], conf, reporter);
          try {
            int counter = 0;
            while (reader.next(key, value)) {
              int v = Integer.parseInt(value.toString());
              LOG.debug("read " + v);

              if (bits.get(v)) {
                LOG.warn(
                    "conflict with " + v + " in split " + j + " at position " + reader.getPos());
              }
              assertFalse("Key in multiple partitions.", bits.get(v));
              bits.set(v);
              counter++;
            }
            if (counter > 0) {
              LOG.info("splits[" + j + "]=" + splits[j] + " count=" + counter);
            } else {
              LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + counter);
            }
          } finally {
            reader.close();
          }
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
      }
    }
  }
 private void addCodec(CompressionCodec codec) {
   String suffix = codec.getDefaultExtension();
   codecs.put(new StringBuffer(suffix).reverse().toString(), codec);
 }
 private static void checkCodec(String msg, Class expected, CompressionCodec actual) {
   assertEquals(msg + " unexpected codec found", expected.getName(), actual.getClass().getName());
 }