/**
  * Reports how far the reader has advanced through the byte range it was assigned.
  *
  * @return the reader's offset past {@code start} divided by the width of the range, capped at
  *     1.0; 0.0 when {@code start} and {@code end} coincide
  */
 public float getProgress() throws IOException, InterruptedException {
   // An empty range has no width to divide by.
   if (end == start) {
     return 0.0f;
   }
   final double rangeWidth = (double) (end - start);
   final double fraction = (in.getPosition() - start) / rangeWidth;
   return Math.min(1.0f, (float) fraction);
 }
 /**
  * Advances to the next record of the underlying SequenceFile.
  *
  * <p>The reader position before the read is stored in {@code key} and in {@code info}. When a
  * record is available, {@code value} receives the raw key length, the raw value size, the number
  * of bytes the reader advanced, and whether a sync marker was seen.
  *
  * @return {@code false} once the reader is exhausted or has passed {@code end} and seen a sync
  *     marker; {@code true} otherwise
  */
 public synchronized boolean nextKeyValue() throws IOException, InterruptedException {
   if (done) {
     return false;
   }

   final long recordStart = in.getPosition();
   key.set(recordStart);
   info.setPosition(recordStart);

   final boolean exhausted = in.nextRawKey(buffer) == -1;
   if (!exhausted) {
     in.nextRawValue(vbytes);
     value.set(
         buffer.getLength(),
         vbytes.getSize(),
         (int) (in.getPosition() - recordStart),
         in.syncSeen());
   }
   buffer.reset();

   // Stop at end of file, or at the first sync marker seen at or beyond the end of the range.
   if (exhausted) {
     done = true;
   } else {
     done = recordStart >= end && in.syncSeen();
   }
   return !done;
 }
  private List<InputSplit> getSplits(
      Configuration configuration, int numSplits, long totalSizeBytes) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
    long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);

    CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
    Text srcRelPath = new Text();
    long currentSplitSize = 0;
    long lastSplitStart = 0;
    long lastPosition = 0;

    final Path listingFilePath = getListingFilePath(configuration);

    if (LOG.isDebugEnabled()) {
      LOG.debug(
          "Average bytes per map: "
              + nBytesPerSplit
              + ", Number of maps: "
              + numSplits
              + ", total size: "
              + totalSizeBytes);
    }
    SequenceFile.Reader reader = null;
    try {
      reader = getListingFileReader(configuration);
      while (reader.next(srcRelPath, srcFileStatus)) {
        // If adding the current file would cause the bytes per map to exceed
        // limit. Add the current file to new split
        if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) {
          FileSplit split =
              new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null);
          if (LOG.isDebugEnabled()) {
            LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize);
          }
          splits.add(split);
          lastSplitStart = lastPosition;
          currentSplitSize = 0;
        }
        currentSplitSize += srcFileStatus.getLen();
        lastPosition = reader.getPosition();
      }
      if (lastPosition > lastSplitStart) {
        FileSplit split =
            new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null);
        if (LOG.isDebugEnabled()) {
          LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize);
        }
        splits.add(split);
      }

    } finally {
      IOUtils.closeStream(reader);
    }

    return splits;
  }
    /**
     * Opens a SequenceFile reader on the split's file and prepares this reader's state.
     *
     * <p>If the split starts beyond the reader's initial position, the reader is synced to the
     * split start. {@code start} is then taken from the reader's actual position and {@code end}
     * from the split's start plus its length.
     *
     * @param split the split to read; cast to {@code FileSplit}
     * @param context supplies the configuration used to open the file
     */
    public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
      final FileSplit fileSplit = (FileSplit) split;
      final Path file = fileSplit.getPath();
      final Configuration conf = context.getConfiguration();
      final FileSystem fs = file.getFileSystem(conf);
      this.in = new SequenceFile.Reader(fs, file, conf);

      final long requestedStart = fileSplit.getStart();
      this.end = requestedStart + fileSplit.getLength();
      if (requestedStart > in.getPosition()) {
        in.sync(requestedStart); // sync to start
      }
      this.start = in.getPosition();
      vbytes = in.createValueBytes();
      // Nothing to read when the reader's position is already at or past the end of the range.
      done = start >= end;

      info = InputInfo.getInstance();
      info.setSplit(fileSplit);

      System.err.println("input split = " + split);
    }
Esempio n. 5
0
    /**
     * Cuts the source listing file into splits by accumulating the sizes stored in its keys.
     *
     * <p>The target size is the configured total size divided by {@code numSplits}. A split is
     * closed just before a record that would push a non-empty accumulation past that target. The
     * final split covers whatever part of the listing file's length has not yet been assigned.
     *
     * @param job The handle to the JobConf object
     * @param numSplits Number of splits requested
     * @return the splits over the listing file, in file order
     * @throws RuntimeException if the file count, total size or listing URI is missing from the job
     * @throws IOException if the listing file cannot be read
     */
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      final int fileCount = job.getInt(SRC_COUNT_LABEL, -1);
      final long totalBytes = job.getLong(TOTAL_SIZE_LABEL, -1);
      final String listUri = job.get(SRC_LIST_LABEL, "");
      final boolean metadataPresent = fileCount >= 0 && totalBytes >= 0 && !"".equals(listUri);
      if (!metadataPresent) {
        throw new RuntimeException(
            "Invalid metadata: #files("
                + fileCount
                + ") total_size("
                + totalBytes
                + ") listuri("
                + listUri
                + ")");
      }

      final Path listPath = new Path(listUri);
      final FileSystem fs = listPath.getFileSystem(job);
      final FileStatus listStatus = fs.getFileStatus(listPath);

      final ArrayList<FileSplit> result = new ArrayList<FileSplit>(numSplits);
      final LongWritable recordSize = new LongWritable();
      final FilePair pair = new FilePair();
      final long bytesPerSplit = totalBytes / numSplits;

      long splitStart = 0L;
      long recordEnd = 0L;
      long accumulated = 0L;
      long unassigned = listStatus.getLen();

      SequenceFile.Reader reader = null;
      try {
        reader = new SequenceFile.Reader(fs, listPath, job);
        while (reader.next(recordSize, pair)) {
          final long size = recordSize.get();
          // Close the current split before this record if it is non-empty and the record
          // would take it over the target.
          if (accumulated != 0 && accumulated + size > bytesPerSplit) {
            final long length = recordEnd - splitStart;
            result.add(new FileSplit(listPath, splitStart, length, (String[]) null));
            unassigned -= length;
            splitStart = recordEnd;
            accumulated = 0L;
          }
          accumulated += size;
          recordEnd = reader.getPosition();
        }
      } finally {
        checkAndClose(reader);
      }

      if (unassigned != 0) {
        result.add(new FileSplit(listPath, splitStart, unassigned, (String[]) null));
      }

      return result.toArray(new FileSplit[result.size()]);
    }
Esempio n. 6
0
    /**
     * Cuts the operation listing file into splits by counting its records.
     *
     * <p>A split is closed each time more than {@code srcCount / numSplits} records have been read
     * since the previous cut. The final split covers the part of the listing file that lies
     * beyond the last cut, if any.
     *
     * @param job The handle to the JobConf object
     * @param numSplits Number of splits requested
     * @return the splits over the listing file, in file order
     * @throws RuntimeException if the record count or listing URI is missing from the job
     * @throws IOException if the listing file cannot be opened or read
     */
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      final int srcCount = job.getInt(OP_COUNT_LABEL, -1);
      final int targetcount = srcCount / numSplits;
      String srclist = job.get(OP_LIST_LABEL, "");
      if (srcCount < 0 || "".equals(srclist)) {
        throw new RuntimeException(
            "Invalid metadata: #files(" + srcCount + ") listuri(" + srclist + ")");
      }
      Path srcs = new Path(srclist);
      FileSystem fs = srcs.getFileSystem(job);

      List<FileSplit> splits = new ArrayList<FileSplit>(numSplits);

      Text key = new Text();
      PolicyInfo value = new PolicyInfo();
      SequenceFile.Reader in = null;
      long prev = 0L; // offset at which the split currently being built starts
      int count = 0; // count src
      try {
        for (in = new SequenceFile.Reader(fs, srcs, job); in.next(key, value); ) {
          long curr = in.getPosition();
          long delta = curr - prev;
          if (++count > targetcount) {
            count = 0;
            splits.add(new FileSplit(srcs, prev, delta, (String[]) null));
            prev = curr;
          }
        }
      } finally {
        // The reader is still null when its constructor threw. Closing it unconditionally
        // would raise a NullPointerException that replaces the original IOException.
        if (in != null) {
          in.close();
        }
      }
      long remaining = fs.getFileStatus(srcs).getLen() - prev;
      if (remaining != 0) {
        splits.add(new FileSplit(srcs, prev, remaining, (String[]) null));
      }
      LOG.info(
          "jobname= " + jobName + " numSplits=" + numSplits + ", splits.size()=" + splits.size());
      return splits.toArray(new FileSplit[splits.size()]);
    }
    /**
     * Builds input splits by reading each job input file and cutting a split after every
     * {@code filesPerTask} records.
     *
     * <p>Every input path must be a directory. Within it, only non-directory entries named
     * {@code <job name> + IN_FILE_SUFFIX} are read. Records left over after the last full group
     * form one more split for that file.
     *
     * @param job supplies the configuration, the input paths and the job name
     * @return the splits for all matching input files
     * @throws IOException if an input path is not a directory or a file cannot be read
     */
    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
      final long filesPerTask = DistBlockFixer.filesPerTask(job.getConfiguration());
      final Path[] inputDirs = getInputPaths(job);
      final List<InputSplit> result = new ArrayList<InputSplit>();
      long matchedFiles = 0;

      for (Path dir : inputDirs) {
        final FileSystem fs = dir.getFileSystem(job.getConfiguration());
        if (!fs.getFileStatus(dir).isDir()) {
          throw new IOException(dir.toString() + " is not a directory");
        }

        for (FileStatus entry : fs.listStatus(dir)) {
          final Path candidate = entry.getPath();
          // Skip sub-directories and any file that is not this job's input file.
          if (entry.isDir() || !candidate.getName().equals(job.getJobName() + IN_FILE_SUFFIX)) {
            continue;
          }

          matchedFiles++;
          final SequenceFile.Reader reader =
              new SequenceFile.Reader(fs, candidate, job.getConfiguration());

          long splitStart = reader.getPosition();
          long recordsRead = 0;

          final LongWritable key = new LongWritable();
          final Text value = new Text();
          try {
            while (reader.next(key, value)) {
              // Cut a split once the last record of a filesPerTask-sized group has been read.
              if (recordsRead % filesPerTask == filesPerTask - 1L) {
                final long cutAt = reader.getPosition();
                result.add(new FileSplit(candidate, splitStart, cutAt - splitStart, null));
                splitStart = cutAt;
              }
              recordsRead++;
            }

            // Leftover records, including the case where the loop produced no split at all.
            final long finalPosition = reader.getPosition();
            if (splitStart != finalPosition) {
              result.add(new FileSplit(candidate, splitStart, finalPosition - splitStart, null));
            }
          } finally {
            reader.close();
          }
        }
      }

      LOG.info("created " + result.size() + " input splits from " + matchedFiles + " files");

      return result;
    }