Пример #1
0
  public MutilCharRecordReader(FileSplit inputSplit, Configuration job) throws IOException {

    maxLineLength = job.getInt("mapred.mutilCharRecordReader.maxlength", Integer.MAX_VALUE);
    start = inputSplit.getStart();
    end = start + inputSplit.getLength();
    final Path file = inputSplit.getPath();

    compressionCodecs = new CompressionCodecFactory(job);

    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // 打开文件系统
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;

    if (codec != null) {
      lineReader = new LineReader(codec.createInputStream(fileIn), job);
      end = Long.MAX_VALUE;
    } else {
      if (start != 0) {
        skipFirstLine = true;
        --start;
        fileIn.seek(start);
      }
      lineReader = new LineReader(fileIn, job);
    }

    if (skipFirstLine) {
      start +=
          lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
  }
    public QseqRecordReader(Configuration conf, FileSplit split) throws IOException {
      setConf(conf);
      file = split.getPath();
      start = split.getStart();
      end = start + split.getLength();

      FileSystem fs = file.getFileSystem(conf);
      FSDataInputStream fileIn = fs.open(file);

      CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
      CompressionCodec codec = codecFactory.getCodec(file);

      if (codec == null) // no codec.  Uncompressed file.
      {
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
      } else { // compressed file
        if (start != 0)
          throw new RuntimeException(
              "Start position for compressed file is not 0! (found " + start + ")");

        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
      }

      lineReader = new LineReader(inputStream);
    }
Пример #3
0
  @Override
  public void initialize(InputSplit genericSplit, TaskAttemptContext taskAttemptContext)
      throws IOException {
    context = taskAttemptContext;
    FileSplit fileSplit = (FileSplit) genericSplit;
    lzoFile = fileSplit.getPath();
    // The LzoSplitInputFormat is not splittable, so the split length is the whole file.
    totalFileSize = fileSplit.getLength();

    // Jump through some hoops to create the lzo codec.
    Configuration conf = CompatibilityUtil.getConfiguration(context);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    ((Configurable) codec).setConf(conf);

    LzopDecompressor lzopDecompressor = (LzopDecompressor) codec.createDecompressor();
    FileSystem fs = lzoFile.getFileSystem(conf);
    rawInputStream = fs.open(lzoFile);

    // Creating the LzopInputStream here just reads the lzo header for us, nothing more.
    // We do the rest of our input off of the raw stream is.
    codec.createInputStream(rawInputStream, lzopDecompressor);

    // This must be called AFTER createInputStream is called, because createInputStream
    // is what reads the header, which has the checksum information.  Otherwise getChecksumsCount
    // erroneously returns zero, and all block offsets will be wrong.
    numCompressedChecksums = lzopDecompressor.getCompressedChecksumsCount();
    numDecompressedChecksums = lzopDecompressor.getDecompressedChecksumsCount();
  }
Пример #4
0
  @Override
  public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {

    // Obtain path to input list of input images and open input stream
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();
    FileSystem fileSystem = path.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fileSystem.open(path);

    // Note the start and length fields in the FileSplit object are being used to
    // convey a range of lines in the input list of image URLs
    startLine = fileSplit.getStart();
    numLines = fileSplit.getLength();
    linesRead = 0; // total lines read by this particular record reader instance
    linesPerRecord = 100; // can be modified to change key/value pair size (may improve efficiency)

    // If it exists, get the relevant compression codec for the FileSplit
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(context.getConfiguration());
    CompressionCodec codec = codecFactory.getCodec(path);

    // If the codec was found, use it to create an decompressed input stream.
    // Otherwise, assume input stream is already decompressed
    if (codec != null) {
      reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fileIn)));
    } else {
      reader = new BufferedReader(new InputStreamReader(fileIn));
    }
  }
Пример #5
0
    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
      FileSplit split = (FileSplit) genericSplit;
      Configuration job = context.getConfiguration();
      m_Sb.setLength(0);
      m_Start = split.getStart();
      m_End = m_Start + split.getLength();
      final Path file = split.getPath();
      compressionCodecs = new CompressionCodecFactory(job);
      final CompressionCodec codec = compressionCodecs.getCodec(file);

      // open the file and seek to the m_Start of the split
      FileSystem fs = file.getFileSystem(job);
      //  getFileStatus fileStatus = fs.getFileStatus(split.getPath());
      //noinspection deprecation
      @SuppressWarnings(value = "deprecated")
      long length = fs.getLength(file);
      FSDataInputStream fileIn = fs.open(split.getPath());
      if (m_Start > 0) fileIn.seek(m_Start);
      if (codec != null) {
        CompressionInputStream inputStream = codec.createInputStream(fileIn);
        m_Input = new BufferedReader(new InputStreamReader(inputStream));
        m_End = length;
      } else {
        m_Input = new BufferedReader(new InputStreamReader(fileIn));
      }
      m_Current = m_Start;
      m_Key = split.getPath().getName();
    }
Пример #6
0
  private void verifyCompressedFile(Path f, int expectedNumLines) throws IOException {
    Configuration conf = new Configuration();
    conf.set("fs.default.name", "file:///");
    FileSystem fs = FileSystem.get(conf);
    InputStream is = fs.open(f);
    CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
    CompressionCodec codec = ccf.getCodec(f);
    LOG.info("gzip check codec is " + codec);
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    if (null == decompressor) {
      LOG.info("Verifying gzip sanity with null decompressor");
    } else {
      LOG.info("Verifying gzip sanity with decompressor: " + decompressor.toString());
    }
    is = codec.createInputStream(is, decompressor);
    BufferedReader r = new BufferedReader(new InputStreamReader(is));
    int numLines = 0;
    while (true) {
      String ln = r.readLine();
      if (ln == null) {
        break;
      }
      numLines++;
    }

    r.close();
    assertEquals("Did not read back correct number of lines", expectedNumLines, numLines);
    LOG.info("gzip sanity check returned " + numLines + " lines; ok.");
  }
Пример #7
0
 public InputStream createDecompressionStream(
     InputStream downStream, Decompressor decompressor, int downStreamBufferSize)
     throws IOException {
   CompressionCodec codec = getCodec(conf);
   // Set the internal buffer size to read from down stream.
   if (downStreamBufferSize > 0) {
     ((Configurable) codec).getConf().setInt("io.file.buffer.size", downStreamBufferSize);
   }
   CompressionInputStream cis = codec.createInputStream(downStream, decompressor);
   BufferedInputStream bis2 = new BufferedInputStream(cis, DATA_IBUF_SIZE);
   return bis2;
 }
Пример #8
0
  public static void decompressFile(
      final FileSystem fs, final String inFile, final String outFile, boolean deletePrevious)
      throws IOException {
    final Path inPath = new Path(inFile);
    final Path outPath = new Path(outFile);
    final CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
    final CompressionCodec codec = factory.getCodec(inPath);
    final OutputStream out = fs.create(outPath);
    final InputStream in = codec.createInputStream(fs.open(inPath));
    IOUtils.copyBytes(in, out, 8192);
    IOUtils.closeStream(in);
    IOUtils.closeStream(out);

    if (deletePrevious) fs.delete(new Path(inFile), true);
  }
Пример #9
0
 /**
  * Returns an {@link InputStream} to the specified file.
  *
  * <p>Note: It is the caller's responsibility to close the returned {@link InputStream}.
  *
  * @param path The path to the file to open.
  * @return An {@link InputStream} for the specified file.
  * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the
  *     specified file.
  */
 @Override
 public InputStream getFileStream(String path) throws FileBasedHelperException {
   try {
     Path p = new Path(path);
     InputStream in = this.getFileSystem().open(p);
     // Account for compressed files (e.g. gzip).
     // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
     CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf());
     CompressionCodec codec = factory.getCodec(p);
     return (codec == null) ? in : codec.createInputStream(in);
   } catch (IOException e) {
     throw new FileBasedHelperException(
         "Cannot open file " + path + " due to " + e.getMessage(), e);
   }
 }
Пример #10
0
  public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
      isCompressedInput = true;
      decompressor = CodecPool.getDecompressor(codec);
      if (codec instanceof SplittableCompressionCodec) {
        final SplitCompressionInputStream cIn =
            ((SplittableCompressionCodec) codec)
                .createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
        in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
        start = cIn.getAdjustedStart();
        end = cIn.getAdjustedEnd();
        filePosition = cIn;
      } else {
        in =
            new SplitLineReader(
                codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes);
        filePosition = fileIn;
      }
    } else {
      fileIn.seek(start);
      in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
      filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
      start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
  }
Пример #11
0
    public XMLRecordReader(FileSplit split, JobConf jobConf) throws IOException {
      if (jobConf.get(START_TAG_KEY) == null || jobConf.get(END_TAG_KEY) == null)
        throw new RuntimeException("Error! XML start and end tags unspecified!");

      startTag = jobConf.get(START_TAG_KEY).getBytes("utf-8");
      endTag = jobConf.get(END_TAG_KEY).getBytes("utf-8");

      start = split.getStart();
      Path file = split.getPath();

      CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(jobConf);
      CompressionCodec codec = compressionCodecs.getCodec(file);

      FileSystem fs = file.getFileSystem(jobConf);

      if (codec != null) {
        sLogger.info("Reading compressed file...");

        // InputStream tempStream = codec.createInputStream(fileIn);
        // fsin = new DataInputStream(tempStream);

        fsin = new DataInputStream(codec.createInputStream(fs.open(file)));

        end = Long.MAX_VALUE;
      } else {
        sLogger.info("Reading uncompressed file...");

        FSDataInputStream fileIn = fs.open(file);

        fileIn.seek(start);
        fsin = fileIn;

        end = start + split.getLength();
      }

      recordStartPos = start;

      // Because input streams of gzipped files are not seekable
      // (specifically, do not support getPos), we need to keep
      // track of bytes consumed ourselves.
      pos = start;
    }
Пример #12
0
  private void runTest(int numChunks, int chunkSize) throws Exception {
    CompressionCodec codec = ReflectionUtils.newInstance(LzopCodec.class, conf);

    final Random writerRand = new Random(12345);
    final Random readerRand = new Random(12345);

    File testFile = new File(System.getProperty("test.build.data"), "randdata");
    String fileName = testFile.getAbsolutePath();

    // Create the file
    OutputStream fos = new FileOutputStream(fileName);
    fos = codec.createOutputStream(fos);

    // Write file
    byte[] data = new byte[chunkSize];
    System.out.println("Start to write to file...");
    for (int i = 0; i < numChunks; i++) {
      writerRand.nextBytes(data);
      fos.write(data);
    }
    fos.close();
    System.out.println("Closed file.");

    // Open file
    InputStream tis = new FileInputStream(fileName);
    tis = codec.createInputStream(tis);

    // Read file
    byte[] dataExpected = new byte[chunkSize];
    byte[] dataRead = new byte[chunkSize];
    for (int i = 0; i < numChunks; i++) {
      readerRand.nextBytes(dataExpected);
      readFully(tis, dataRead);
      assertArrayEquals(dataExpected, dataRead);
    }

    assertEquals(-1, tis.read());
    tis.close();
  }
  @Override
  public void initialize(InputSplit genericSplit, TaskAttemptContext context)
      throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start_ = split.getStart();
    end_ = start_ + split.getLength();
    final Path file = split.getPath();
    Configuration job = HadoopCompat.getConfiguration(context);

    errorTracker = new InputErrorTracker(job);

    LOG.info("input split: " + file + " " + start_ + ":" + end_);

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
      throw new IOException("No codec for file " + file + " found, cannot run");
    }

    // Open the file and seek to the start of the split.
    fileIn_ = fs.open(split.getPath());

    // Creates input stream and also reads the file header.
    createInputReader(codec.createInputStream(fileIn_), job);

    if (start_ != 0) {
      fileIn_.seek(start_);
      skipToNextSyncPoint(false);
      start_ = fileIn_.getPos();
      LOG.info("Start is now " + start_);
    } else {
      skipToNextSyncPoint(true);
    }
    pos_ = start_;
  }
  private void shuffleToMemory(
      MapHost host,
      MapOutput mapOutput,
      InputStream input,
      int decompressedLength,
      int compressedLength)
      throws IOException {
    IFileInputStream checksumIn = new IFileInputStream(input, compressedLength, job);

    input = checksumIn;

    // Are map-outputs compressed?
    if (codec != null) {
      decompressor.reset();
      input = codec.createInputStream(input, decompressor);
    }

    // Copy map-output into an in-memory buffer
    byte[] shuffleData = mapOutput.getMemory();

    try {
      IOUtils.readFully(input, shuffleData, 0, shuffleData.length);
      metrics.inputBytes(shuffleData.length);
      LOG.info(
          "Read "
              + shuffleData.length
              + " bytes from map-output for "
              + mapOutput.getAttemptIdentifier());
    } catch (IOException ioe) {
      // Close the streams
      IOUtils.cleanup(LOG, input);

      // Re-throw
      throw ioe;
    }
  }
  public static void main(String[] args)
      throws IOException, InterruptedException, ClassNotFoundException {

    Path inputPath = new Path(args[0]);
    Path outputDir = new Path(args[1]);

    // Create configuration
    Configuration conf = new Configuration(true);

    // Create job
    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "CountryIncomeConf");
    job.setJarByClass(CountryIncomeConf.class);

    // Decompressing .gz file Ex. foo.csv.gz to foo.csv

    String uri = args[0];
    FileSystem fs = FileSystem.get(URI.create(uri), conf);

    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(inputPath);
    if (codec == null) {
      System.err.println("No codec found for " + uri);
      System.exit(1);
    }
    String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
    InputStream in = null;
    OutputStream out = null;
    try {
      in = codec.createInputStream(fs.open(inputPath));
      out = fs.create(new Path(outputUri));
      IOUtils.copyBytes(in, out, conf);
    } finally {
      IOUtils.closeStream(in);
      IOUtils.closeStream(out);
    }

    // Setup MapReduce
    job.setMapperClass(CountryIncomeMapper.class);
    job.setReducerClass(CountryIncomeReducer.class);
    job.setNumReduceTasks(1);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Input
    // FileInputFormat.addInputPath(job, inputPath);
    FileInputFormat.addInputPaths(job, outputUri);
    job.setInputFormatClass(TextInputFormat.class);

    // Output
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Delete output if exists
    FileSystem hdfs = FileSystem.get(conf);
    if (hdfs.exists(outputDir)) hdfs.delete(outputDir, true);

    // Execute job
    int code = job.waitForCompletion(true) ? 0 : 1;

    // Counter finding and displaying
    Counters counters = job.getCounters();

    // Displaying counters
    System.out.printf(
        "Missing Fields: %d, Error Count: %d\n",
        counters.findCounter(COUNTERS.MISSING_FIELDS_RECORD_COUNT).getValue(),
        counters.findCounter(COUNTERS.NULL_OR_EMPTY).getValue());

    System.exit(code);
  }