public QseqRecordReader(Configuration conf, FileSplit split) throws IOException {
      setConf(conf);
      file = split.getPath();
      start = split.getStart();
      end = start + split.getLength();

      FileSystem fs = file.getFileSystem(conf);
      FSDataInputStream fileIn = fs.open(file);

      CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
      CompressionCodec codec = codecFactory.getCodec(file);

      if (codec == null) // no codec.  Uncompressed file.
      {
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
      } else { // compressed file
        if (start != 0)
          throw new RuntimeException(
              "Start position for compressed file is not 0! (found " + start + ")");

        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
      }

      lineReader = new LineReader(inputStream);
    }
Esempio n. 2
0
  public MutilCharRecordReader(FileSplit inputSplit, Configuration job) throws IOException {

    maxLineLength = job.getInt("mapred.mutilCharRecordReader.maxlength", Integer.MAX_VALUE);
    start = inputSplit.getStart();
    end = start + inputSplit.getLength();
    final Path file = inputSplit.getPath();

    compressionCodecs = new CompressionCodecFactory(job);

    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // 打开文件系统
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;

    if (codec != null) {
      lineReader = new LineReader(codec.createInputStream(fileIn), job);
      end = Long.MAX_VALUE;
    } else {
      if (start != 0) {
        skipFirstLine = true;
        --start;
        fileIn.seek(start);
      }
      lineReader = new LineReader(fileIn, job);
    }

    if (skipFirstLine) {
      start +=
          lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
  }
Esempio n. 3
0
    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
      FileSplit split = (FileSplit) genericSplit;
      Configuration job = context.getConfiguration();
      m_Sb.setLength(0);
      m_Start = split.getStart();
      m_End = m_Start + split.getLength();
      final Path file = split.getPath();
      compressionCodecs = new CompressionCodecFactory(job);
      final CompressionCodec codec = compressionCodecs.getCodec(file);

      // open the file and seek to the m_Start of the split
      FileSystem fs = file.getFileSystem(job);
      //  getFileStatus fileStatus = fs.getFileStatus(split.getPath());
      //noinspection deprecation
      @SuppressWarnings(value = "deprecated")
      long length = fs.getLength(file);
      FSDataInputStream fileIn = fs.open(split.getPath());
      if (m_Start > 0) fileIn.seek(m_Start);
      if (codec != null) {
        CompressionInputStream inputStream = codec.createInputStream(fileIn);
        m_Input = new BufferedReader(new InputStreamReader(inputStream));
        m_End = length;
      } else {
        m_Input = new BufferedReader(new InputStreamReader(fileIn));
      }
      m_Current = m_Start;
      m_Key = split.getPath().getName();
    }
  @Override
  public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {

    // Obtain path to input list of input images and open input stream
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();
    FileSystem fileSystem = path.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fileSystem.open(path);

    // Note the start and length fields in the FileSplit object are being used to
    // convey a range of lines in the input list of image URLs
    startLine = fileSplit.getStart();
    numLines = fileSplit.getLength();
    linesRead = 0; // total lines read by this particular record reader instance
    linesPerRecord = 100; // can be modified to change key/value pair size (may improve efficiency)

    // If it exists, get the relevant compression codec for the FileSplit
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(context.getConfiguration());
    CompressionCodec codec = codecFactory.getCodec(path);

    // If the codec was found, use it to create an decompressed input stream.
    // Otherwise, assume input stream is already decompressed
    if (codec != null) {
      reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fileIn)));
    } else {
      reader = new BufferedReader(new InputStreamReader(fileIn));
    }
  }
Esempio n. 5
0
  private void verifyCompressedFile(Path f, int expectedNumLines) throws IOException {
    Configuration conf = new Configuration();
    conf.set("fs.default.name", "file:///");
    FileSystem fs = FileSystem.get(conf);
    InputStream is = fs.open(f);
    CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
    CompressionCodec codec = ccf.getCodec(f);
    LOG.info("gzip check codec is " + codec);
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    if (null == decompressor) {
      LOG.info("Verifying gzip sanity with null decompressor");
    } else {
      LOG.info("Verifying gzip sanity with decompressor: " + decompressor.toString());
    }
    is = codec.createInputStream(is, decompressor);
    BufferedReader r = new BufferedReader(new InputStreamReader(is));
    int numLines = 0;
    while (true) {
      String ln = r.readLine();
      if (ln == null) {
        break;
      }
      numLines++;
    }

    r.close();
    assertEquals("Did not read back correct number of lines", expectedNumLines, numLines);
    LOG.info("gzip sanity check returned " + numLines + " lines; ok.");
  }
Esempio n. 6
0
  /**
   * Create a data file that gets exported to the db.
   *
   * @param fileNum the number of the file (for multi-file export)
   * @param numRecords how many records to write to the file.
   * @param gzip is true if the file should be gzipped.
   */
  private void createTextFile(
      int fileNum, int numRecords, boolean gzip, ColumnGenerator... extraCols) throws IOException {
    int startId = fileNum * numRecords;

    String ext = ".txt";
    if (gzip) {
      ext = ext + ".gz";
    }
    Path tablePath = getTablePath();
    Path filePath = new Path(tablePath, "part" + fileNum + ext);

    Configuration conf = new Configuration();
    conf.set("fs.default.name", "file:///");
    FileSystem fs = FileSystem.get(conf);
    fs.mkdirs(tablePath);
    OutputStream os = fs.create(filePath);
    if (gzip) {
      CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
      CompressionCodec codec = ccf.getCodec(filePath);
      os = codec.createOutputStream(os);
    }
    BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
    for (int i = 0; i < numRecords; i++) {
      w.write(getRecordLine(startId + i, extraCols));
    }
    w.close();
    os.close();

    if (gzip) {
      verifyCompressedFile(filePath, numRecords);
    }
  }
Esempio n. 7
0
  @Override
  public void initialize(InputSplit genericSplit, TaskAttemptContext taskAttemptContext)
      throws IOException {
    context = taskAttemptContext;
    FileSplit fileSplit = (FileSplit) genericSplit;
    lzoFile = fileSplit.getPath();
    // The LzoSplitInputFormat is not splittable, so the split length is the whole file.
    totalFileSize = fileSplit.getLength();

    // Jump through some hoops to create the lzo codec.
    Configuration conf = CompatibilityUtil.getConfiguration(context);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    ((Configurable) codec).setConf(conf);

    LzopDecompressor lzopDecompressor = (LzopDecompressor) codec.createDecompressor();
    FileSystem fs = lzoFile.getFileSystem(conf);
    rawInputStream = fs.open(lzoFile);

    // Creating the LzopInputStream here just reads the lzo header for us, nothing more.
    // We do the rest of our input off of the raw stream is.
    codec.createInputStream(rawInputStream, lzopDecompressor);

    // This must be called AFTER createInputStream is called, because createInputStream
    // is what reads the header, which has the checksum information.  Otherwise getChecksumsCount
    // erroneously returns zero, and all block offsets will be wrong.
    numCompressedChecksums = lzopDecompressor.getCompressedChecksumsCount();
    numDecompressedChecksums = lzopDecompressor.getDecompressedChecksumsCount();
  }
Esempio n. 8
0
 public InputStream createDecompressionStream(
     InputStream downStream, Decompressor decompressor, int downStreamBufferSize)
     throws IOException {
   CompressionCodec codec = getCodec(conf);
   // Set the internal buffer size to read from down stream.
   if (downStreamBufferSize > 0) {
     ((Configurable) codec).getConf().setInt("io.file.buffer.size", downStreamBufferSize);
   }
   CompressionInputStream cis = codec.createInputStream(downStream, decompressor);
   BufferedInputStream bis2 = new BufferedInputStream(cis, DATA_IBUF_SIZE);
   return bis2;
 }
Esempio n. 9
0
 /**
  * Returns an {@link InputStream} to the specified file.
  *
  * <p>Note: It is the caller's responsibility to close the returned {@link InputStream}.
  *
  * @param path The path to the file to open.
  * @return An {@link InputStream} for the specified file.
  * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the
  *     specified file.
  */
 @Override
 public InputStream getFileStream(String path) throws FileBasedHelperException {
   try {
     Path p = new Path(path);
     InputStream in = this.getFileSystem().open(p);
     // Account for compressed files (e.g. gzip).
     // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
     CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf());
     CompressionCodec codec = factory.getCodec(p);
     return (codec == null) ? in : codec.createInputStream(in);
   } catch (IOException e) {
     throw new FileBasedHelperException(
         "Cannot open file " + path + " due to " + e.getMessage(), e);
   }
 }
Esempio n. 10
0
  public static void decompressFile(
      final FileSystem fs, final String inFile, final String outFile, boolean deletePrevious)
      throws IOException {
    final Path inPath = new Path(inFile);
    final Path outPath = new Path(outFile);
    final CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
    final CompressionCodec codec = factory.getCodec(inPath);
    final OutputStream out = fs.create(outPath);
    final InputStream in = codec.createInputStream(fs.open(inPath));
    IOUtils.copyBytes(in, out, 8192);
    IOUtils.closeStream(in);
    IOUtils.closeStream(out);

    if (deletePrevious) fs.delete(new Path(inFile), true);
  }
Esempio n. 11
0
  /**
   * Creates an output segment file and sets up the output streams to point at it. If the file
   * already exists, retries with a different filename. This is a bit nasty -- after all, {@link
   * FileOutputFormat}'s work directory concept is supposed to prevent filename clashes -- but it
   * looks like Amazon Elastic MapReduce prevents use of per-task work directories if the output of
   * a job is on S3.
   *
   * <p>TODO: Investigate this and find a better solution.
   */
  private void createSegment() throws IOException {
    segmentsAttempted = 0;
    bytesWritten = 0;
    boolean success = false;

    while (!success) {
      Path path =
          workOutputPath.suffix(String.format(extensionFormat, segmentsCreated, segmentsAttempted));
      FileSystem fs = path.getFileSystem(conf);

      try {
        // The o.a.h.mapred OutputFormats overwrite existing files, whereas
        // the o.a.h.mapreduce OutputFormats don't overwrite. Bizarre...
        // Here, overwrite if progress != null, i.e. if using mapred API.
        FSDataOutputStream fsStream =
            (progress == null) ? fs.create(path, false) : fs.create(path, progress);
        byteStream = new CountingOutputStream(new BufferedOutputStream(fsStream));
        dataStream =
            new DataOutputStream(codec == null ? byteStream : codec.createOutputStream(byteStream));
        segmentsCreated++;
        logger.info("Writing to output file: {}", path);
        success = true;

      } catch (IOException e) {
        if (e.getMessage().startsWith("File already exists")) {
          logger.warn("Tried to create file {} but it already exists; retrying.", path);
          segmentsAttempted++; // retry
        } else {
          throw e;
        }
      }
    }
  }
  @Override
  public RecordWriter<IEtlKey, CamusWrapper> getDataRecordWriter(
      TaskAttemptContext context, String fileName, CamusWrapper data, FileOutputCommitter committer)
      throws IOException, InterruptedException {

    // If recordDelimiter hasn't been initialized, do so now
    if (recordDelimiter == null) {
      recordDelimiter =
          context.getConfiguration().get(ETL_OUTPUT_RECORD_DELIMITER, DEFAULT_RECORD_DELIMITER);
    }

    // Get the filename for this RecordWriter.
    Path path =
        new Path(
            committer.getWorkPath(),
            EtlMultiOutputFormat.getUniqueFile(context, fileName, getFilenameExtension()));

    //		final FSDataOutputStream writer =
    // path.getFileSystem(context.getConfiguration()).create(path);

    FileSystem fs = path.getFileSystem(context.getConfiguration());
    DataOutputStream writer; // = fs.create(path, false);
    if (isCompressed) {
      return new ByteRecordWriter(
          new DataOutputStream(codec.createOutputStream(fs.create(path, false))), recordDelimiter);
    } else {
      return new ByteRecordWriter(fs.create(path, false), recordDelimiter);
    }
  }
Esempio n. 13
0
 public PartitionFinalizer(SecorConfig config) throws Exception {
   mConfig = config;
   mKafkaClient = new KafkaClient(mConfig);
   mZookeeperConnector = new ZookeeperConnector(mConfig);
   mMessageParser =
       (TimestampedMessageParser)
           ReflectionUtil.createMessageParser(mConfig.getMessageParserClass(), mConfig);
   mQuboleClient = new QuboleClient(mConfig);
   if (mConfig.getCompressionCodec() != null && !mConfig.getCompressionCodec().isEmpty()) {
     CompressionCodec codec =
         CompressionUtil.createCompressionCodec(mConfig.getCompressionCodec());
     mFileExtension = codec.getDefaultExtension();
   } else {
     mFileExtension = "";
   }
 }
Esempio n. 14
0
  private DataOutputStream createRawOutputStream(TaskAttemptContext ctx) throws IOException {
    boolean isCompressed = getCompressOutput(ctx);

    if (!isCompressed) {
      Path file = getDefaultWorkFile(ctx, ".nt");
      FileSystem fs = file.getFileSystem(ctx.getConfiguration());
      return fs.create(file, false);
    } else {
      Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(ctx, GzipCodec.class);

      CompressionCodec codec = ReflectionUtils.newInstance(codecClass, ctx.getConfiguration());
      Path file = getDefaultWorkFile(ctx, ".nt" + codec.getDefaultExtension());
      FileSystem fs = file.getFileSystem(ctx.getConfiguration());
      FSDataOutputStream fileOut = fs.create(file, false);
      return new DataOutputStream(codec.createOutputStream(fileOut));
    }
  }
Esempio n. 15
0
 @Override
 public RecordWriter<K, V> getRecordWriter(
     FileSystem ignored, JobConf job, String name, Progressable progress) throws IOException {
   boolean isCompressed = getCompressOutput(job);
   if (!isCompressed) {
     Path file = FileOutputFormat.getTaskOutputPath(job, name);
     FileSystem fs = file.getFileSystem(job);
     FSDataOutputStream fileOut = fs.create(file, progress);
     return new LineRecordWriter<K, V>(fileOut);
   } else {
     Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
     CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
     Path file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension());
     FileSystem fs = file.getFileSystem(job);
     FSDataOutputStream fileOut = fs.create(file, progress);
     return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)));
   }
 }
Esempio n. 16
0
  public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
      isCompressedInput = true;
      decompressor = CodecPool.getDecompressor(codec);
      if (codec instanceof SplittableCompressionCodec) {
        final SplitCompressionInputStream cIn =
            ((SplittableCompressionCodec) codec)
                .createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
        in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
        start = cIn.getAdjustedStart();
        end = cIn.getAdjustedEnd();
        filePosition = cIn;
      } else {
        in =
            new SplitLineReader(
                codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes);
        filePosition = fileIn;
      }
    } else {
      fileIn.seek(start);
      in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
      filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
      start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
  }
Esempio n. 17
0
    public XMLRecordReader(FileSplit split, JobConf jobConf) throws IOException {
      if (jobConf.get(START_TAG_KEY) == null || jobConf.get(END_TAG_KEY) == null)
        throw new RuntimeException("Error! XML start and end tags unspecified!");

      startTag = jobConf.get(START_TAG_KEY).getBytes("utf-8");
      endTag = jobConf.get(END_TAG_KEY).getBytes("utf-8");

      start = split.getStart();
      Path file = split.getPath();

      CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(jobConf);
      CompressionCodec codec = compressionCodecs.getCodec(file);

      FileSystem fs = file.getFileSystem(jobConf);

      if (codec != null) {
        sLogger.info("Reading compressed file...");

        // InputStream tempStream = codec.createInputStream(fileIn);
        // fsin = new DataInputStream(tempStream);

        fsin = new DataInputStream(codec.createInputStream(fs.open(file)));

        end = Long.MAX_VALUE;
      } else {
        sLogger.info("Reading uncompressed file...");

        FSDataInputStream fileIn = fs.open(file);

        fileIn.seek(start);
        fsin = fileIn;

        end = start + split.getLength();
      }

      recordStartPos = start;

      // Because input streams of gzipped files are not seekable
      // (specifically, do not support getPos), we need to keep
      // track of bytes consumed ourselves.
      pos = start;
    }
 private static void writeFile(FileSystem fs, Path name, CompressionCodec codec, String contents)
     throws IOException {
   OutputStream stm;
   if (codec == null) {
     stm = fs.create(name);
   } else {
     stm = codec.createOutputStream(fs.create(name));
   }
   stm.write(contents.getBytes());
   stm.close();
 }
Esempio n. 19
0
 // copied from FaunusFileOutputFormat
 protected DataOutputStream getDataOuputStream(final TaskAttemptContext job)
     throws IOException, InterruptedException {
   final Configuration conf = job.getConfiguration();
   boolean isCompressed = getCompressOutput(job);
   CompressionCodec codec = null;
   String extension = "";
   if (isCompressed) {
     final Class<? extends CompressionCodec> codecClass =
         getOutputCompressorClass(job, DefaultCodec.class);
     codec = ReflectionUtils.newInstance(codecClass, conf);
     extension = codec.getDefaultExtension();
   }
   final Path file = super.getDefaultWorkFile(job, extension);
   final FileSystem fs = file.getFileSystem(conf);
   if (!isCompressed) {
     return new DataOutputStream(fs.create(file, false));
   } else {
     return new DataOutputStream(codec.createOutputStream(fs.create(file, false)));
   }
 }
Esempio n. 20
0
 /**
  * Creates a WARC file, and opens it for writing. If a file with the same name already exists, it
  * is *overwritten*. Note that this is different behaviour from the other constructor. Yes, this
  * sucks. It will probably change in a future version.
  *
  * @param conf The Hadoop configuration.
  * @param codec If null, the file is uncompressed. If non-null, this compression codec will be
  *     used. The codec's default file extension is appended to the filename.
  * @param workOutputPath The directory and filename prefix to which the data should be written. We
  *     append a segment number and filename extensions to it.
  * @param progress An object used by the mapred API for tracking a task's progress.
  * @throws IOException
  */
 public WARCFileWriter(
     Configuration conf, CompressionCodec codec, Path workOutputPath, Progressable progress)
     throws IOException {
   this.conf = conf;
   this.codec = codec;
   this.workOutputPath = workOutputPath;
   this.progress = progress;
   this.extensionFormat =
       ".seg-%05d.attempt-%05d.warc" + (codec == null ? "" : codec.getDefaultExtension());
   this.maxSegmentSize = conf.getLong("warc.output.segment.size", DEFAULT_MAX_SEGMENT_SIZE);
   createSegment();
 }
Esempio n. 21
0
  private void runTest(int numChunks, int chunkSize) throws Exception {
    CompressionCodec codec = ReflectionUtils.newInstance(LzopCodec.class, conf);

    final Random writerRand = new Random(12345);
    final Random readerRand = new Random(12345);

    File testFile = new File(System.getProperty("test.build.data"), "randdata");
    String fileName = testFile.getAbsolutePath();

    // Create the file
    OutputStream fos = new FileOutputStream(fileName);
    fos = codec.createOutputStream(fos);

    // Write file
    byte[] data = new byte[chunkSize];
    System.out.println("Start to write to file...");
    for (int i = 0; i < numChunks; i++) {
      writerRand.nextBytes(data);
      fos.write(data);
    }
    fos.close();
    System.out.println("Closed file.");

    // Open file
    InputStream tis = new FileInputStream(fileName);
    tis = codec.createInputStream(tis);

    // Read file
    byte[] dataExpected = new byte[chunkSize];
    byte[] dataRead = new byte[chunkSize];
    for (int i = 0; i < numChunks; i++) {
      readerRand.nextBytes(dataExpected);
      readFully(tis, dataRead);
      assertArrayEquals(dataExpected, dataRead);
    }

    assertEquals(-1, tis.read());
    tis.close();
  }
 @Override
 public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
     throws IOException, InterruptedException {
   Configuration conf = job.getConfiguration();
   boolean isCompressed = getCompressOutput(job);
   CompressionCodec codec = null;
   String extension = "";
   if (isCompressed) {
     Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
     codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
     extension = codec.getDefaultExtension();
   }
   Path file = getDefaultWorkFile(job, extension);
   FileSystem fs = file.getFileSystem(conf);
   if (!isCompressed) {
     FSDataOutputStream fileOut = fs.create(file, false);
     return new org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter<K, V>(
         fileOut, KEY_VALUE_SEPARATOR);
   } else {
     FSDataOutputStream fileOut = fs.create(file, false);
     return new org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.LineRecordWriter<K, V>(
         new DataOutputStream(codec.createOutputStream(fileOut)), KEY_VALUE_SEPARATOR);
   }
 }
  @Override
  public void initialize(InputSplit genericSplit, TaskAttemptContext context)
      throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start_ = split.getStart();
    end_ = start_ + split.getLength();
    final Path file = split.getPath();
    Configuration job = HadoopCompat.getConfiguration(context);

    errorTracker = new InputErrorTracker(job);

    LOG.info("input split: " + file + " " + start_ + ":" + end_);

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
      throw new IOException("No codec for file " + file + " found, cannot run");
    }

    // Open the file and seek to the start of the split.
    fileIn_ = fs.open(split.getPath());

    // Creates input stream and also reads the file header.
    createInputReader(codec.createInputStream(fileIn_), job);

    if (start_ != 0) {
      fileIn_.seek(start_);
      skipToNextSyncPoint(false);
      start_ = fileIn_.getPos();
      LOG.info("Start is now " + start_);
    } else {
      skipToNextSyncPoint(true);
    }
    pos_ = start_;
  }
 // ${mapred.out.dir}/_temporary/_${taskid}/${nameWithExtension}
 private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String baseName)
     throws IOException, InterruptedException {
   Configuration conf = job.getConfiguration();
   boolean isCompressed = getCompressOutput(job);
   String keyValueSeparator = ",";
   RecordWriter<K, V> recordWriter = null;
   if (isCompressed) {
     Class<? extends CompressionCodec> codecClass =
         getOutputCompressorClass(job, GzipCodec.class);
     CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
     Path file = new Path(workPath, baseName + codec.getDefaultExtension());
     FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
     recordWriter =
         new LineRecordWriter<K, V>(
             new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator);
   } else {
     Path file = new Path(workPath, baseName);
     FSDataOutputStream fileOut =
         file.getFileSystem(conf).create(file, false); // file 是指的file name of the output file
     recordWriter =
         new LineRecordWriter<K, V>(fileOut, keyValueSeparator); // 这里调用的LineRecordWriter
   }
   return recordWriter;
 }
  private void shuffleToMemory(
      MapHost host,
      MapOutput mapOutput,
      InputStream input,
      int decompressedLength,
      int compressedLength)
      throws IOException {
    IFileInputStream checksumIn = new IFileInputStream(input, compressedLength, job);

    input = checksumIn;

    // Are map-outputs compressed?
    if (codec != null) {
      decompressor.reset();
      input = codec.createInputStream(input, decompressor);
    }

    // Copy map-output into an in-memory buffer
    byte[] shuffleData = mapOutput.getMemory();

    try {
      IOUtils.readFully(input, shuffleData, 0, shuffleData.length);
      metrics.inputBytes(shuffleData.length);
      LOG.info(
          "Read "
              + shuffleData.length
              + " bytes from map-output for "
              + mapOutput.getAttemptIdentifier());
    } catch (IOException ioe) {
      // Close the streams
      IOUtils.cleanup(LOG, input);

      // Re-throw
      throw ioe;
    }
  }
Esempio n. 26
0
  public ShuffleManager(
      TezInputContext inputContext,
      Configuration conf,
      int numInputs,
      int bufferSize,
      boolean ifileReadAheadEnabled,
      int ifileReadAheadLength,
      CompressionCodec codec,
      FetchedInputAllocator inputAllocator)
      throws IOException {
    this.inputContext = inputContext;
    this.numInputs = numInputs;

    this.shuffledInputsCounter =
        inputContext.getCounters().findCounter(TaskCounter.NUM_SHUFFLED_INPUTS);
    this.failedShufflesCounter =
        inputContext.getCounters().findCounter(TaskCounter.NUM_FAILED_SHUFFLE_INPUTS);
    this.bytesShuffledCounter = inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES);
    this.decompressedDataSizeCounter =
        inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_DECOMPRESSED);
    this.bytesShuffledToDiskCounter =
        inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_TO_DISK);
    this.bytesShuffledToMemCounter =
        inputContext.getCounters().findCounter(TaskCounter.SHUFFLE_BYTES_TO_MEM);

    this.ifileBufferSize = bufferSize;
    this.ifileReadAhead = ifileReadAheadEnabled;
    this.ifileReadAheadLength = ifileReadAheadLength;
    this.codec = codec;
    this.inputManager = inputAllocator;

    this.srcNameTrimmed = TezUtils.cleanVertexName(inputContext.getSourceVertexName());

    completedInputSet =
        Collections.newSetFromMap(new ConcurrentHashMap<InputIdentifier, Boolean>(numInputs));
    completedInputs = new LinkedBlockingQueue<FetchedInput>(numInputs);
    knownSrcHosts = new ConcurrentHashMap<String, InputHost>();
    pendingHosts = new LinkedBlockingQueue<InputHost>();
    obsoletedInputs =
        Collections.newSetFromMap(new ConcurrentHashMap<InputAttemptIdentifier, Boolean>());
    runningFetchers = Collections.newSetFromMap(new ConcurrentHashMap<Fetcher, Boolean>());

    int maxConfiguredFetchers =
        conf.getInt(
            TezJobConfig.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES,
            TezJobConfig.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES_DEFAULT);

    this.numFetchers = Math.min(maxConfiguredFetchers, numInputs);

    ExecutorService fetcherRawExecutor =
        Executors.newFixedThreadPool(
            numFetchers,
            new ThreadFactoryBuilder()
                .setDaemon(true)
                .setNameFormat("Fetcher [" + srcNameTrimmed + "] #%d")
                .build());
    this.fetcherExecutor = MoreExecutors.listeningDecorator(fetcherRawExecutor);

    ExecutorService schedulerRawExecutor =
        Executors.newFixedThreadPool(
            1,
            new ThreadFactoryBuilder()
                .setDaemon(true)
                .setNameFormat("ShuffleRunner [" + srcNameTrimmed + "]")
                .build());
    this.schedulerExecutor = MoreExecutors.listeningDecorator(schedulerRawExecutor);

    this.startTime = System.currentTimeMillis();
    this.lastProgressTime = startTime;

    this.shuffleSecret =
        ShuffleUtils.getJobTokenSecretFromTokenBytes(
            inputContext.getServiceConsumerMetaData(
                TezConfiguration.TEZ_SHUFFLE_HANDLER_SERVICE_ID));
    httpConnectionParams = ShuffleUtils.constructHttpShuffleConnectionParams(conf);
    LOG.info(
        this.getClass().getSimpleName()
            + " : numInputs="
            + numInputs
            + ", compressionCodec="
            + (codec == null ? "NoCompressionCodec" : codec.getClass().getName())
            + ", numFetchers="
            + numFetchers
            + ", ifileBufferSize="
            + ifileBufferSize
            + ", ifileReadAheadEnabled="
            + ifileReadAhead
            + ", ifileReadAheadLength="
            + ifileReadAheadLength
            + ", "
            + httpConnectionParams.toString());
  }
  public static void main(String[] args)
      throws IOException, InterruptedException, ClassNotFoundException {

    Path inputPath = new Path(args[0]);
    Path outputDir = new Path(args[1]);

    // Create configuration
    Configuration conf = new Configuration(true);

    // Create job
    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "CountryIncomeConf");
    job.setJarByClass(CountryIncomeConf.class);

    // Decompressing .gz file Ex. foo.csv.gz to foo.csv

    String uri = args[0];
    FileSystem fs = FileSystem.get(URI.create(uri), conf);

    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(inputPath);
    if (codec == null) {
      System.err.println("No codec found for " + uri);
      System.exit(1);
    }
    String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
    InputStream in = null;
    OutputStream out = null;
    try {
      in = codec.createInputStream(fs.open(inputPath));
      out = fs.create(new Path(outputUri));
      IOUtils.copyBytes(in, out, conf);
    } finally {
      IOUtils.closeStream(in);
      IOUtils.closeStream(out);
    }

    // Setup MapReduce
    job.setMapperClass(CountryIncomeMapper.class);
    job.setReducerClass(CountryIncomeReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Input
    // FileInputFormat.addInputPath(job, inputPath);
    FileInputFormat.addInputPaths(job, outputUri);
    job.setInputFormatClass(TextInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    FileSystem hdfs = FileSystem.get(conf);
    if (hdfs.exists(outputDir)) hdfs.delete(outputDir, true);

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;

    // Counter finding and displaying
    Counters counters = job.getCounters();

    // Displaying counters
    System.out.printf(
        "Missing Fields: %d, Error Count: %d\n",
        counters.findCounter(COUNTERS.MISSING_FIELDS_RECORD_COUNT).getValue(),
        counters.findCounter(COUNTERS.NULL_OR_EMPTY).getValue());

    System.exit(code);
  }
  @Test
  public void testSplitableCodecs() throws IOException {
    JobConf conf = new JobConf(defaultConf);
    int seed = new Random().nextInt();
    // Create the codec
    CompressionCodec codec = null;
    try {
      codec =
          (CompressionCodec)
              ReflectionUtils.newInstance(
                  conf.getClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
    } catch (ClassNotFoundException cnfe) {
      throw new IOException("Illegal codec!");
    }
    Path file = new Path(workDir, "test" + codec.getDefaultExtension());

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;
    LOG.info("seed = " + seed);
    Random random = new Random(seed);
    FileSystem localFs = FileSystem.getLocal(conf);

    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(conf, workDir);

    final int MAX_LENGTH = 500000;

    // for a variety of lengths
    for (int length = MAX_LENGTH / 2;
        length < MAX_LENGTH;
        length += random.nextInt(MAX_LENGTH / 4) + 1) {

      LOG.info("creating; entries = " + length);

      // create a file with length entries
      Writer writer = new OutputStreamWriter(codec.createOutputStream(localFs.create(file)));
      try {
        for (int i = 0; i < length; i++) {
          writer.write(Integer.toString(i));
          writer.write("\n");
        }
      } finally {
        writer.close();
      }

      // try splitting the file in a variety of sizes
      TextInputFormat format = new TextInputFormat();
      format.configure(conf);
      LongWritable key = new LongWritable();
      Text value = new Text();
      for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(MAX_LENGTH / 2000) + 1;
        LOG.info("splitting: requesting = " + numSplits);
        InputSplit[] splits = format.getSplits(conf, numSplits);
        LOG.info("splitting: got =        " + splits.length);

        // check each split
        BitSet bits = new BitSet(length);
        for (int j = 0; j < splits.length; j++) {
          LOG.debug("split[" + j + "]= " + splits[j]);
          RecordReader<LongWritable, Text> reader =
              format.getRecordReader(splits[j], conf, reporter);
          try {
            int counter = 0;
            while (reader.next(key, value)) {
              int v = Integer.parseInt(value.toString());
              LOG.debug("read " + v);

              if (bits.get(v)) {
                LOG.warn(
                    "conflict with " + v + " in split " + j + " at position " + reader.getPos());
              }
              assertFalse("Key in multiple partitions.", bits.get(v));
              bits.set(v);
              counter++;
            }
            if (counter > 0) {
              LOG.info("splits[" + j + "]=" + splits[j] + " count=" + counter);
            } else {
              LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + counter);
            }
          } finally {
            reader.close();
          }
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
      }
    }
  }
Esempio n. 29
0
 /** Creates a compression stream without any additional wrapping into buffering streams. */
 public CompressionOutputStream createPlainCompressionStream(
     OutputStream downStream, Compressor compressor) throws IOException {
   CompressionCodec codec = getCodec(conf);
   ((Configurable) codec).getConf().setInt("io.file.buffer.size", 32 * 1024);
   return codec.createOutputStream(downStream, compressor);
 }
Esempio n. 30
0
  boolean createJobConfAndParseArgs(String... args) throws ParseException, IOException {

    job = new JobConf(getConf(), Crush.class);

    /*
     * Turn off speculative execution because that's just wasting network io.
     */
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    /*
     * Turn off pre-emption because we don't want to kill a task after two hours of network io.
     */
    job.set("mapred.fairscheduler.preemption", "false");

    tmpDir = new Path("tmp/crush-" + UUID.randomUUID());
    outDir = new Path(tmpDir, "out");

    double threshold = 0.75;

    List<String> regexes = asList(".+");
    List<String> replacements =
        asList("crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}");
    List<String> inFormats = asList(SequenceFileInputFormat.class.getName());
    List<String> outFormats = asList(SequenceFileOutputFormat.class.getName());

    String crushTimestamp;

    Options options = buildOptions();
    CommandLine cli = new GnuParser().parse(options, args);

    if (cli.hasOption("?")) {
      BufferedReader reader =
          new BufferedReader(
              new InputStreamReader(getClass().getClassLoader().getResourceAsStream("help.txt")));

      try {
        String line;

        while (null != (line = reader.readLine())) {
          System.out.println(line);
        }
      } finally {
        reader.close();
      }

      return false;
    }

    if (cli.hasOption("verbose")) {
      console = Verbosity.VERBOSE;
    } else if (cli.hasOption("info")) {
      console = Verbosity.INFO;
    } else {
      console = Verbosity.NONE;
    }

    if (cli.hasOption("ignore-regex")) {
      ignoredFiles = Pattern.compile(cli.getOptionValue("ignore-regex")).matcher("");
    }

    excludeSingleFileDirs = !cli.hasOption("include-single-file-dirs");

    String[] nonOptions = cli.getArgs();

    if (2 == nonOptions.length) {
      /*
       * Stand alone mode accepts two arguments.
       */
      mode = Mode.STAND_ALONE;

      srcDir = new Path(nonOptions[0]);

      dest = new Path(nonOptions[1]);

      if (cli.hasOption("input-format")) {
        inFormats = asList(cli.getOptionValue("input-format"));
      }

      if (cli.hasOption("output-format")) {
        outFormats = asList(cli.getOptionValue("output-format"));
      }

      replacements = asList(dest.getName());

      crushTimestamp = Long.toString(currentTimeMillis());

    } else {
      /*
       * The previous version expected three or four arguments. The third one specified the number of tasks to use, which is an
       * integral number, just like the third argument in the new version, which is a timestamp. We tell the two apart by looking
       * at the value of the argument. A timestamp is going to be a huge, 14-digit number while the number of tasks should be much
       * smaller.
       */

      if ((args.length == 4 || args.length == 3)
          && args.length == nonOptions.length
          && args[2].length() != 14) {

        int maxTasks = Integer.parseInt(args[2]);

        if (maxTasks <= 0 || maxTasks > 4000) {
          throw new IllegalArgumentException("Tasks must be in the range [1, 4000]: " + maxTasks);
        }

        job.setInt("mapred.reduce.tasks", maxTasks);

        maxFileBlocks = Integer.MAX_VALUE;

        crushTimestamp = Long.toString(currentTimeMillis());

        srcDir = new Path(args[0]);
        dest = new Path(args[1]);

        mode = Mode.CLONE;

        if (args.length == 4) {
          if (args[3].equals("TEXT")) {
            /*
             * These are the defaults except with text input and output formats.
             */
            inFormats = asList(TextInputFormat.class.getName());
            outFormats = asList(TextOutputFormat.class.getName());

          } else if (!args[3].equals("SEQUENCE")) {
            throw new IllegalArgumentException("Type must be either TEXT or SEQUENCE: " + args[3]);
          }
        }
      } else {
        /*
         * V2 style arguments.
         */
        if (cli.hasOption("threshold")) {
          threshold = Double.parseDouble(cli.getOptionValue("threshold"));

          if (0 >= threshold
              || 1 < threshold
              || Double.isInfinite(threshold)
              || Double.isNaN(threshold)) {
            throw new IllegalArgumentException(
                "Block size threshold must be in (0, 1]: " + threshold);
          }
        }

        if (cli.hasOption("max-file-blocks")) {
          int maxFileBlocksOption = Integer.parseInt(cli.getOptionValue("max-file-blocks"));

          if (0 > maxFileBlocksOption) {
            throw new IllegalArgumentException(
                "Maximum file size in blocks must be positive: " + maxFileBlocksOption);
          }

          maxFileBlocks = maxFileBlocksOption;
        } else {
          maxFileBlocks = 8;
        }

        if (cli.hasOption("regex")) {
          regexes = asList(cli.getOptionValues("regex"));
        }

        if (cli.hasOption("replacement")) {
          replacements = asList(cli.getOptionValues("replacement"));
        }

        if (cli.hasOption("input-format")) {
          inFormats = asList(cli.getOptionValues("input-format"));
        }

        if (cli.hasOption("output-format")) {
          outFormats = asList(cli.getOptionValues("output-format"));
        }

        if (3 != nonOptions.length) {
          throw new IllegalArgumentException(
              "Could not find source directory, out directory, and job timestamp");
        }

        srcDir = new Path(nonOptions[0]);
        dest = new Path(nonOptions[1]);

        crushTimestamp = nonOptions[2];

        if (cli.hasOption("clone")) {
          mode = Mode.CLONE;
        } else {
          mode = Mode.MAP_REDUCE;
        }

        if (!crushTimestamp.matches("\\d{14}")) {
          throw new IllegalArgumentException(
              "Crush timestamp must be 14 digits yyyymmddhhMMss: " + crushTimestamp);
        }
      }

      dfsBlockSize = Long.parseLong(job.get("dfs.blocksize"));
      maxEligibleSize = (long) (dfsBlockSize * threshold);
    }

    /*
     * Add the crush specs and compression options to the configuration.
     */
    job.set("crush.timestamp", crushTimestamp);

    if (ignoredFiles != null) {
      job.set("crush.ignore-regex", ignoredFiles.pattern().pattern());
    }

    if (regexes.size() != replacements.size()
        || replacements.size() != inFormats.size()
        || inFormats.size() != outFormats.size()) {
      throw new IllegalArgumentException(
          "Must be an equal number of regex, replacement, in-format, and out-format options");
    }

    job.setInt("crush.num.specs", regexes.size());

    matchers = new ArrayList<Matcher>(regexes.size());

    for (int i = 0; i < regexes.size(); i++) {
      job.set(format("crush.%d.regex", i), regexes.get(i));

      matchers.add(Pattern.compile(regexes.get(i)).matcher("dummy"));

      job.set(format("crush.%d.regex.replacement", i), replacements.get(i));

      String inFmt = inFormats.get(i);

      if ("sequence".equals(inFmt)) {
        inFmt = SequenceFileInputFormat.class.getName();
      } else if ("text".equals(inFmt)) {
        inFmt = TextInputFormat.class.getName();
      } else {
        try {
          if (!FileInputFormat.class.isAssignableFrom(Class.forName(inFmt))) {
            throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
          }
        } catch (ClassNotFoundException e) {
          throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
        }
      }

      job.set(format("crush.%d.input.format", i), inFmt);

      String outFmt = outFormats.get(i);

      if ("sequence".equals(outFmt)) {
        outFmt = SequenceFileOutputFormat.class.getName();
      } else if ("text".equals(outFmt)) {
        outFmt = TextOutputFormat.class.getName();
      } else {
        try {
          if (!FileOutputFormat.class.isAssignableFrom(Class.forName(outFmt))) {
            throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
          }
        } catch (ClassNotFoundException e) {
          throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
        }
      }

      job.set(format("crush.%d.output.format", i), outFmt);
    }

    String codec = cli.getOptionValue("compress");

    if (null == codec) {
      codec = DefaultCodec.class.getName();
    } else if ("none".equals(codec)) {
      codec = null;
    } else if ("gzip".equals(codec)) {
      codec = GzipCodec.class.getName();
    } else {
      try {
        if (!CompressionCodec.class.isAssignableFrom(Class.forName(codec))) {
          throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
        }
      } catch (ClassNotFoundException e) {
        throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
      }
    }

    if (null == codec) {
      job.setBoolean("mapred.output.compress", false);
    } else {
      job.setBoolean("mapred.output.compress", true);
      job.set("mapred.output.compression.type", "BLOCK");
      job.set("mapred.output.compression.codec", codec);

      try {
        CompressionCodec instance = (CompressionCodec) Class.forName(codec).newInstance();
        codecExtension = instance.getDefaultExtension();
      } catch (Exception e) {
        throw new AssertionError();
      }
    }

    return true;
  }