Ejemplo n.º 1
0
  private void verifyCompressedFile(Path f, int expectedNumLines) throws IOException {
    Configuration conf = new Configuration();
    conf.set("fs.default.name", "file:///");
    FileSystem fs = FileSystem.get(conf);
    InputStream is = fs.open(f);
    CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
    CompressionCodec codec = ccf.getCodec(f);
    LOG.info("gzip check codec is " + codec);
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    if (null == decompressor) {
      LOG.info("Verifying gzip sanity with null decompressor");
    } else {
      LOG.info("Verifying gzip sanity with decompressor: " + decompressor.toString());
    }
    is = codec.createInputStream(is, decompressor);
    BufferedReader r = new BufferedReader(new InputStreamReader(is));
    int numLines = 0;
    while (true) {
      String ln = r.readLine();
      if (ln == null) {
        break;
      }
      numLines++;
    }

    r.close();
    assertEquals("Did not read back correct number of lines", expectedNumLines, numLines);
    LOG.info("gzip sanity check returned " + numLines + " lines; ok.");
  }
Ejemplo n.º 2
0
    public Decompressor getDecompressor() {
      CompressionCodec codec = getCodec(conf);
      if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        if (LOG.isTraceEnabled())
          LOG.trace("Retrieved decompressor " + decompressor + " from pool.");
        if (decompressor != null) {
          if (decompressor.finished()) {
            // Somebody returns the decompressor to CodecPool but is still using it.
            LOG.warn("Deompressor obtained from CodecPool is already finished()");
          }
          decompressor.reset();
        }
        return decompressor;
      }

      return null;
    }
Ejemplo n.º 3
0
  public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
      isCompressedInput = true;
      decompressor = CodecPool.getDecompressor(codec);
      if (codec instanceof SplittableCompressionCodec) {
        final SplitCompressionInputStream cIn =
            ((SplittableCompressionCodec) codec)
                .createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
        in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
        start = cIn.getAdjustedStart();
        end = cIn.getAdjustedEnd();
        filePosition = cIn;
      } else {
        in =
            new SplitLineReader(
                codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes);
        filePosition = fileIn;
      }
    } else {
      fileIn.seek(start);
      in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
      filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
      start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
  }
Ejemplo n.º 4
0
    public Decompressor getDecompressor() {
      CompressionCodec codec = getCodec(conf);
      if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        if (decompressor != null) {
          if (decompressor.finished()) {
            // Somebody returns the decompressor to CodecPool but is still using
            // it.
            LOG.warn("Deompressor obtained from CodecPool is already finished()");
            // throw new AssertionError(
            // "Decompressor obtained from CodecPool is already finished()");
          }
          decompressor.reset();
        }
        return decompressor;
      }

      return null;
    }
  public Fetcher(
      Configuration job,
      ShuffleScheduler scheduler,
      MergeManager merger,
      ShuffleClientMetrics metrics,
      Shuffle shuffle,
      SecretKey jobTokenSecret,
      TezInputContext inputContext)
      throws IOException {
    this.job = job;
    this.scheduler = scheduler;
    this.merger = merger;
    this.metrics = metrics;
    this.shuffle = shuffle;
    this.id = ++nextId;
    this.jobTokenSecret = jobTokenSecret;
    ioErrs =
        inputContext
            .getCounters()
            .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.IO_ERROR.toString());
    wrongLengthErrs =
        inputContext
            .getCounters()
            .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_LENGTH.toString());
    badIdErrs =
        inputContext
            .getCounters()
            .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.BAD_ID.toString());
    wrongMapErrs =
        inputContext
            .getCounters()
            .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_MAP.toString());
    connectionErrs =
        inputContext
            .getCounters()
            .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.CONNECTION.toString());
    wrongReduceErrs =
        inputContext
            .getCounters()
            .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_REDUCE.toString());

    if (ConfigUtils.isIntermediateInputCompressed(job)) {
      Class<? extends CompressionCodec> codecClass =
          ConfigUtils.getIntermediateInputCompressorClass(job, DefaultCodec.class);
      codec = ReflectionUtils.newInstance(codecClass, job);
      decompressor = CodecPool.getDecompressor(codec);
    } else {
      codec = null;
      decompressor = null;
    }

    this.connectionTimeout =
        job.getInt(
            TezJobConfig.TEZ_RUNTIME_SHUFFLE_CONNECT_TIMEOUT,
            TezJobConfig.DEFAULT_TEZ_RUNTIME_SHUFFLE_STALLED_COPY_TIMEOUT);
    this.readTimeout =
        job.getInt(
            TezJobConfig.TEZ_RUNTIME_SHUFFLE_READ_TIMEOUT,
            TezJobConfig.DEFAULT_TEZ_RUNTIME_SHUFFLE_READ_TIMEOUT);

    setName("fetcher#" + id);
    setDaemon(true);

    synchronized (Fetcher.class) {
      sslShuffle =
          job.getBoolean(
              TezJobConfig.TEZ_RUNTIME_SHUFFLE_ENABLE_SSL,
              TezJobConfig.DEFAULT_TEZ_RUNTIME_SHUFFLE_ENABLE_SSL);
      if (sslShuffle && sslFactory == null) {
        sslFactory = new SSLFactory(SSLFactory.Mode.CLIENT, job);
        try {
          sslFactory.init();
        } catch (Exception ex) {
          sslFactory.destroy();
          throw new RuntimeException(ex);
        }
      }
    }
  }