Example #1
 public static boolean checkJobContextIfRunningFromBackend(JobContext j) {
   if (j.getConfiguration().get("mapred.task.id", "").equals("")
       && !("true".equals(j.getConfiguration().get("pig.illustrating")))) {
     return false;
   }
   return true;
 }
Example #2
 public static Map<FormatBundle, Map<Integer, List<Path>>> getFormatNodeMap(JobContext job) {
   Map<FormatBundle, Map<Integer, List<Path>>> formatNodeMap = Maps.newHashMap();
   Configuration conf = job.getConfiguration();
   String crunchInputs = conf.get(CRUNCH_INPUTS);
   if (crunchInputs == null || crunchInputs.isEmpty()) {
     return ImmutableMap.of();
   }
   for (String input : Splitter.on(RECORD_SEP).split(crunchInputs)) {
     List<String> fields = Lists.newArrayList(SPLITTER.split(input));
     FormatBundle<InputFormat> inputBundle =
         FormatBundle.fromSerialized(fields.get(0), job.getConfiguration());
     if (!formatNodeMap.containsKey(inputBundle)) {
       formatNodeMap.put(inputBundle, Maps.<Integer, List<Path>>newHashMap());
     }
     Integer nodeIndex = Integer.valueOf(fields.get(1));
     if (!formatNodeMap.get(inputBundle).containsKey(nodeIndex)) {
       formatNodeMap.get(inputBundle).put(nodeIndex, Lists.<Path>newLinkedList());
     }
     List<Path> formatNodePaths = formatNodeMap.get(inputBundle).get(nodeIndex);
     String paths = fields.get(2);
     for (String path : Splitter.on(PATH_SEP).split(paths)) {
       formatNodePaths.add(new Path(path));
     }
   }
   return formatNodeMap;
 }
Example #3
 public SqoopOutputFormatLoadExecutorSpark(JobContext jobctx) {
   context = jobctx;
   loaderName = context.getConfiguration().get(MRJobConstants.JOB_ETL_LOADER);
   writer = new SqoopRecordWriter();
    // jackh: This must be conditional - extract the schema using credentials in the MR case, and
    // simply extract it from the credentials object in the Spark case (due to a known Hadoop/Spark
    // issue where the credentials are never added for serialization).
   // matcher = MatcherFactory.getMatcher(
   // MRConfigurationUtils.getConnectorSchema(Direction.FROM, context.getConfiguration()),
   // MRConfigurationUtils.getConnectorSchema(Direction.TO, context.getConfiguration()));
   matcher =
       MatcherFactory.getMatcher(
           MRConfigurationUtils.getConnectorSchemaUnsafe(
               Direction.FROM, context.getConfiguration()),
           MRConfigurationUtils.getConnectorSchemaUnsafe(
               Direction.TO, context.getConfiguration()));
   toDataFormat =
       (IntermediateDataFormat<?>)
           ClassUtils.instantiate(
               context.getConfiguration().get(MRJobConstants.TO_INTERMEDIATE_DATA_FORMAT));
   // Using the TO schema since the SqoopDataWriter in the SqoopMapper encapsulates the
   // toDataFormat
   toDataFormat.setSchema(matcher.getToSchema());
 }
Example #4
 public static void checkOutputSpecs(JobContext jc) throws IOException, InterruptedException {
   Map<String, OutputConfig> outputs = getNamedOutputs(jc.getConfiguration());
   for (Map.Entry<String, OutputConfig> e : outputs.entrySet()) {
     String namedOutput = e.getKey();
     Job job = getJob(jc.getJobID(), e.getKey(), jc.getConfiguration());
     OutputFormat fmt = getOutputFormat(namedOutput, job, e.getValue());
     fmt.checkOutputSpecs(job);
   }
 }
Example #5
  @Test
  public void testDeleteMissing() {
    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
    JobContext jobContext =
        new JobContextImpl(
            taskAttemptContext.getConfiguration(),
            taskAttemptContext.getTaskAttemptID().getJobID());
    Configuration conf = jobContext.getConfiguration();

    String sourceBase;
    String targetBase;
    FileSystem fs = null;
    try {
      OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
      fs = FileSystem.get(conf);
      sourceBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
      targetBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
      String targetBaseAdd = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
      fs.rename(new Path(targetBaseAdd), new Path(targetBase));

      DistCpOptions options =
          new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
      options.setSyncFolder(true);
      options.setDeleteMissing(true);
      options.appendToConf(conf);

      CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
      Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
      listing.buildListing(listingFile, options);

      conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
      conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);

      committer.commitJob(jobContext);
      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
        Assert.fail("Source and target folders are not in sync");
      }
      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
        Assert.fail("Source and target folders are not in sync");
      }

      // Test for idempotent commit
      committer.commitJob(jobContext);
      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
        Assert.fail("Source and target folders are not in sync");
      }
      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
        Assert.fail("Source and target folders are not in sync");
      }
    } catch (Throwable e) {
      LOG.error("Exception encountered while testing for delete missing", e);
      Assert.fail("Delete missing failure");
    } finally {
      TestDistCpUtils.delete(fs, "/tmp1");
      conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
    }
  }
Example #6
 @Override
 public void abortJob(JobContext jobContext, JobStatus.State state) throws IOException {
   Configuration conf = jobContext.getConfiguration();
   for (Map.Entry<String, OutputCommitter> e : committers.entrySet()) {
     Job job = getJob(jobContext.getJobID(), e.getKey(), conf);
     configureJob(e.getKey(), job, outputs.get(e.getKey()));
     e.getValue().abortJob(job, state);
   }
 }
Example #7
  /**
   * Generate the list of files and make them into FileSplits. This method is copied (rather than
   * inherited) so that a filter on acceptable data can be inserted.
   */
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    long desiredMappers =
        job.getConfiguration().getLong("org.systemsbiology.jxtandem.DesiredXMLInputMappers", 0);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> fileStatuses = listStatus(job);
    boolean forceNumberMappers = fileStatuses.size() == 1;
    for (FileStatus file : fileStatuses) {
      Path path = file.getPath();
      if (!isPathAcceptable(path)) { // skip files that are not acceptable data
        continue;
      }
      FileSystem fs = path.getFileSystem(job.getConfiguration());
      long length = file.getLen();
      BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
      if ((length != 0) && isSplitable(job, path)) {
        long blockSize = file.getBlockSize();
        // use desired mappers to force more splits
        if (forceNumberMappers && desiredMappers > 0)
          maxSize = Math.min(maxSize, (length / desiredMappers));

        long splitSize = computeSplitSize(blockSize, minSize, maxSize);

        long bytesRemaining = length;
        while (withinSlop(splitSize, bytesRemaining)) {
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(
              new FileSplit(
                  path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()));
          bytesRemaining -= splitSize;
        }

        if (bytesRemaining != 0) {
          splits.add(
              new FileSplit(
                  path,
                  length - bytesRemaining,
                  bytesRemaining,
                  blkLocations[blkLocations.length - 1].getHosts()));
        }
      } else if (length != 0) {
        splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
      } else {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
      }
    }
    System.out.println("Total # of splits: " + splits.size());
    //     LOG.debug("Total # of splits: " + splits.size());
    return splits;
  }
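The isPathAcceptable filter called above is not shown in this example; a minimal sketch of such a predicate follows, assuming an extension-based filter (the accepted extensions are illustrative, not taken from the original code).

  // Hypothetical helper for the example above; the accepted extensions are assumptions.
  protected boolean isPathAcceptable(Path path) {
    if (path == null) {
      return false;
    }
    String name = path.getName().toLowerCase();
    // skip hidden/temporary files and accept only the expected XML inputs
    return !name.startsWith("_") && !name.startsWith(".")
        && (name.endsWith(".xml") || name.endsWith(".mzxml"));
  }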
Example #8
  /**
   * Validates that a valid FIXED_RECORD_LENGTH config property has been set and if so, returns the
   * splits. If the FIXED_RECORD_LENGTH property has not been set, this will throw an IOException.
   *
   * {@inheritDoc}
   */
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {

    // fetch configuration
    Configuration conf = job.getConfiguration();

    // ensure recordLength is properly setup
    try {
      if (this.recordLength == -1) {
        this.recordLength = getAndValidateRecordLength(job.getConfiguration());
      }
      LOG.info("FixedLengthInputFormat: my fixed record length is: " + recordLength);

    } catch (Exception e) {
      throw new IOException(
          "FixedLengthInputFormat requires the"
              + " Configuration property:"
              + FIXED_RECORD_LENGTH
              + " to"
              + " be set to something > 0. Currently the value is 0 (zero)");
    }

    // ensure recordKey start/end is setup properly if it was defined by the user
    if (this.recordKeyStartAt == -1) {

      this.recordKeyStartAt = FixedLengthInputFormat.getRecordKeyStartAt(conf);
      this.recordKeyEndAt = FixedLengthInputFormat.getRecordKeyEndAt(conf);

      // if one is set, they BOTH must be set, this is an error
      // if endAt < startAt, this is an error
      // if either is > record length, this is an error
      // if either are < -1 (default), this is an error
      if ((recordKeyStartAt >= 0 && recordKeyEndAt == -1)
          || (recordKeyStartAt == -1 && recordKeyEndAt >= 0)
          || (recordKeyEndAt < recordKeyStartAt)
          || (recordKeyEndAt > recordLength)
          || (recordKeyStartAt > recordLength)
          || (recordKeyStartAt < -1)
          || (recordKeyEndAt < -1)) {

        throw new IOException(
            "FixedLengthInputFormat requires the"
                + " optional configuration properties:"
                + FIXED_RECORD_KEY_START_AT
                + " and"
                + FIXED_RECORD_KEY_END_AT
                + " to A) be less than the "
                + " fixed record length. B) both must be set together C) neither "
                + " can be less than 0. D) end at must be > start at.");
      }
    }

    return super.getSplits(job);
  }
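For context, a driver has to populate these properties before job submission. A minimal sketch, assuming the FIXED_RECORD_* constants are public String keys on this custom FixedLengthInputFormat (the numeric values are illustrative only):

  // Hedged driver-side sketch; constant visibility and the concrete numbers are assumptions.
  private static void configureFixedLengthInput(Job job) {
    Configuration conf = job.getConfiguration();
    conf.setInt(FixedLengthInputFormat.FIXED_RECORD_LENGTH, 100);      // required, must be > 0
    conf.setInt(FixedLengthInputFormat.FIXED_RECORD_KEY_START_AT, 0);  // optional key range
    conf.setInt(FixedLengthInputFormat.FIXED_RECORD_KEY_END_AT, 9);    // end >= start, both <= record length
    job.setInputFormatClass(FixedLengthInputFormat.class);
  }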
Example #9
 private void setSplitSize(JobContext cx) {
   super.setMaxSplitSize(
       cx.getConfiguration()
           .getLong(
               COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE,
               DEFAULT_COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE));
   super.setMinSplitSizeNode(
       cx.getConfiguration()
           .getLong(
               COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE,
               DEFAULT_COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE));
 }
Example #10
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = job.getConfiguration();
    int numMapTasks = conf.getInt("admm.iteration.num.map.tasks", 0);
    if (0 == numMapTasks) {
      return super.getSplits(job);
    }

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);

    for (FileStatus file : files) {
      Path path = file.getPath();
      FileSystem fs = path.getFileSystem(job.getConfiguration());
      long length = file.getLen();
      BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
      if ((length != 0) && isSplitable(job, path)) {
        long blockSize = file.getBlockSize();
        long splitSize = Math.max(computeSplitSize(JAVA_OPTS, numMapTasks, length), blockSize);
        long splitLength = (long) (length / Math.ceil((double) length / splitSize));
        long bytesRemaining = length;

        while (((double) bytesRemaining) / splitLength > SPLIT_SLOP) {
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(
              new FileSplit(
                  path, length - bytesRemaining, splitLength, blkLocations[blkIndex].getHosts()));

          bytesRemaining -= splitLength;
        }

        if (bytesRemaining != 0) {
          splits.add(
              new FileSplit(
                  path,
                  length - bytesRemaining,
                  bytesRemaining,
                  blkLocations[blkLocations.length - 1].getHosts()));
        }
      } else if (length != 0) {
        splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
      } else {
        splits.add(new FileSplit(path, 0, length, new String[0]));
      }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    job.getConfiguration().setInt("admm.iteration.num.map.tasks", splits.size());
    return splits;
  }
Example #11
  /**
   * List input directories. Subclasses may override to, e.g., select only files matching a regular
   * expression.
   *
   * @param job the job to list input paths for
   * @return array of FileStatus objects
   * @throws IOException if no input paths are specified
   */
  protected List<FileStatus> listStatus(JobContext job) throws IOException {
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
      throw new IOException("No input paths specified in job");
    }

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    // Whether we need to recursive look into the directory structure
    boolean recursive = getInputDirRecursive(job);

    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
      filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    List<FileStatus> result = null;

    int numThreads =
        job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS);
    Stopwatch sw = new Stopwatch().start();
    if (numThreads == 1) {
      result = singleThreadedListStatus(job, dirs, inputFilter, recursive);
    } else {
      Iterable<FileStatus> locatedFiles = null;
      try {
        LocatedFileStatusFetcher locatedFileStatusFetcher =
            new LocatedFileStatusFetcher(
                job.getConfiguration(), dirs, recursive, inputFilter, true);
        locatedFiles = locatedFileStatusFetcher.getFileStatuses();
      } catch (InterruptedException e) {
        throw new IOException("Interrupted while getting file statuses");
      }
      result = Lists.newArrayList(locatedFiles);
    }

    sw.stop();
    if (LogGlobal.isDebugEnabled()) {
      /* LOG.debug("Time taken to get FileStatuses: "+sw.elapsedMillis()) */
      LOG.time_taken_get_filestatuses(String.valueOf(sw.elapsedMillis())).tag("methodCall").debug();
    }
    /* LOG.info("Total input paths to process : "+result.size()) */
    LOG.total_input_paths_process(String.valueOf(result.size())).tag("methodCall").info();
    return result;
  }
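As the javadoc notes, subclasses may narrow the listing; a minimal sketch of such an override, keeping only files whose names match a regular expression (the pattern is an illustrative assumption):

  // Hedged sketch of the override suggested by the javadoc above.
  @Override
  protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> all = super.listStatus(job);
    List<FileStatus> matching = new ArrayList<FileStatus>();
    for (FileStatus status : all) {
      if (status.getPath().getName().matches(".*\\.avro")) {  // illustrative pattern
        matching.add(status);
      }
    }
    return matching;
  }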
Example #12
    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
      List<InputSplit> splitList = new ArrayList<InputSplit>();
      int totalRows = job.getConfiguration().getInt("LoadSplit.TOTAL_ROWS", 0);
      int clients = job.getConfiguration().getInt("LoadSplit.CLIENTS", 1);
      int numRows = totalRows / clients;

      for (int ii = 0; ii < clients; ++ii) {
        int startRow = ii * numRows;
        LoadSplit split = new LoadSplit(startRow, numRows, ii);
        splitList.add(split);
      }
      return splitList;
    }
Example #13
    @Override
    public void checkOutputSpecs(JobContext job) throws IOException {
      String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF);

      if (hosts == null || hosts.isEmpty()) {
        throw new IOException(REDIS_HOSTS_CONF + " is not set in configuration.");
      }

      String hashKey = job.getConfiguration().get(REDIS_HASH_KEY_CONF);

      if (hashKey == null || hashKey.isEmpty()) {
        throw new IOException(REDIS_HASH_KEY_CONF + " is not set in configuration.");
      }
    }
Example #14
  @Test
  public void testPreserveStatus() {
    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
    JobContext jobContext =
        new JobContextImpl(
            taskAttemptContext.getConfiguration(),
            taskAttemptContext.getTaskAttemptID().getJobID());
    Configuration conf = jobContext.getConfiguration();

    String sourceBase;
    String targetBase;
    FileSystem fs = null;
    try {
      OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
      fs = FileSystem.get(conf);
      FsPermission sourcePerm = new FsPermission((short) 511);
      FsPermission initialPerm = new FsPermission((short) 448);
      sourceBase = TestDistCpUtils.createTestSetup(fs, sourcePerm);
      targetBase = TestDistCpUtils.createTestSetup(fs, initialPerm);

      DistCpOptions options =
          new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
      options.preserve(FileAttribute.PERMISSION);
      options.appendToConf(conf);

      CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
      Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
      listing.buildListing(listingFile, options);

      conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);

      committer.commitJob(jobContext);
      if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
        Assert.fail("Permission don't match");
      }

      // Test for idempotent commit
      committer.commitJob(jobContext);
      if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
        Assert.fail("Permission don't match");
      }

    } catch (IOException e) {
      LOG.error("Exception encountered while testing for preserve status", e);
      Assert.fail("Preserve status failure");
    } finally {
      TestDistCpUtils.delete(fs, "/tmp1");
    }
  }
Example #15
  /**
   * Creates splits with multiple indexes per split (if they are smaller than
   * maxCombinedIndexSizePerSplit). It is possible for a split to be larger than
   * maxCombinedIndexSizePerSplit, if it consists of a single index that is larger than
   * maxCombinedIndexSizePerSplit.
   *
   * <p>All inputPaths will be searched for indexes recursively
   *
   * <p>The bin-packing problem of combining splits is solved naively:
   *
   * <ol>
   *   <li>Sort all indexes by size
   *   <li>Begin packing indexes into splits until adding the next split would cause the split to
   *       exceed maxCombinedIndexSizePerSplit
   *   <li>Begin packing subsequent indexes into the next split, and so on
   * </ol>
   */
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {

    // load settings from job conf
    loadConfig(job.getConfiguration());

    // find all the index dirs and create a split for each
    PriorityQueue<LuceneIndexInputSplit> splits = findSplits(job.getConfiguration());

    // combine the splits based on maxCombineSplitSize
    List<InputSplit> combinedSplits =
        combineSplits(splits, maxCombinedIndexSizePerSplit, maxNumIndexesPerSplit);

    return combinedSplits;
  }
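The combineSplits implementation is not shown here; the following is a hedged illustration of the naive bin-packing the javadoc describes, written against plain index sizes rather than the real LuceneIndexInputSplit type (the helper name and signature are assumptions):

  // Illustration only: pack sorted index sizes into groups bounded by a maximum combined size
  // and a maximum number of indexes per group. A single oversized index still gets its own
  // group, mirroring the behaviour described in the javadoc.
  static List<List<Long>> packIndexSizes(List<Long> indexSizes, long maxCombinedSize, int maxPerSplit) {
    List<Long> sorted = new ArrayList<Long>(indexSizes);
    Collections.sort(sorted);                                   // 1. sort all indexes by size
    List<List<Long>> packed = new ArrayList<List<Long>>();
    List<Long> current = new ArrayList<Long>();
    long currentSize = 0;
    for (long size : sorted) {
      // 2. start a new group when adding this index would exceed either limit
      if (!current.isEmpty()
          && (currentSize + size > maxCombinedSize || current.size() >= maxPerSplit)) {
        packed.add(current);
        current = new ArrayList<Long>();
        currentSize = 0;
      }
      current.add(size);
      currentSize += size;
    }
    if (!current.isEmpty()) {
      packed.add(current);                                      // 3. flush the last group
    }
    return packed;
  }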
Example #16
  private String[] getActiveServersList(JobContext context) {

    String[] servers = null;
    try {
      JobClient jc = new JobClient(new JobConf(context.getConfiguration()));
      ClusterStatus status = jc.getClusterStatus(true);
      Collection<String> atc = status.getActiveTrackerNames();
      servers = new String[atc.size()];
      int s = 0;
      for (String serverInfo : atc) {
        // System.out.println("serverInfo:" + serverInfo);
        StringTokenizer st = new StringTokenizer(serverInfo, ":");
        String trackerName = st.nextToken();
        // System.out.println("trackerName:" + trackerName);
        StringTokenizer st1 = new StringTokenizer(trackerName, "_");
        st1.nextToken();
        servers[s++] = st1.nextToken();
      }

    } catch (IOException e) {
      e.printStackTrace();
    }

    return servers;
  }
Example #17
  private List<InputSplit> getSplits(JobContext cx, List<Path> dirs)
      throws FileNotFoundException, IOException {

    List<InputSplit> splits = Lists.newArrayList();

    List<Path> subdirs = Lists.newArrayList();
    long totalFileCount = 0;

    FileSystem fs = FileSystem.get(cx.getConfiguration());
    for (Path input : dirs) {
      long count = fs.getContentSummary(input).getFileCount();
      subdirs.add(input);
      if (totalFileCount + count < GET_SPLIT_NUM_FILES_TRHESHOLD) {
        totalFileCount += count;
      } else {
        addAvroFilesInSubdirsToSplits(splits, subdirs, fs, cx);
        subdirs.clear();
        totalFileCount = 0;
      }
    }

    if (totalFileCount > 0) {
      addAvroFilesInSubdirsToSplits(splits, subdirs, fs, cx);
    }
    return splits;
  }
Example #18
  /**
   * This function modifies the splits list in place to retain only a random fraction of the input
   * splits. The fraction is expected in "starfish.profiler.sampling.fraction" as a number between 0
   * and 1. The default value is 0.1.
   *
   * @param job The job context
   * @param splits The list of input splits to modify
   */
  public static void sampleInputSplits(JobContext job, List<InputSplit> splits) {

    // Get the sampling fraction
    Configuration conf = job.getConfiguration();
    double fraction = conf.getFloat(Profiler.PROFILER_SAMPLING_FRACTION, 0.1f);
    if (fraction < 0 || fraction > 1)
      throw new RuntimeException("ERROR: Invalid sampling fraction: " + fraction);

    // Handle corner cases
    if (fraction == 0 || splits.size() == 0) {
      splits.clear();
      return;
    }
    if (fraction == 1) return;

    // Calculate the number of samples
    int numSplits = splits.size();
    int sampleSize = (int) Math.round(numSplits * fraction);
    if (sampleSize == 0) sampleSize = 1;

    // Shuffle the splits
    Collections.shuffle(splits);

    // Retain only a sampleSize number of splits
    for (int i = splits.size() - 1; i >= sampleSize; --i) {
      splits.remove(i);
    }

    nf.setMaximumFractionDigits(2);
    LOG.info("Executing only " + nf.format(fraction * 100) + "% of the map tasks");
  }
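A hedged sketch of how an input format could wire this helper in, assuming sampleInputSplits is visible (for example statically imported) and that subclassing TextInputFormat is acceptable for the job at hand:

  // Illustrative only: compute the normal splits, then keep a random sample of them.
  public static class SampledTextInputFormat extends TextInputFormat {
    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
      List<InputSplit> splits = super.getSplits(job);
      sampleInputSplits(job, splits);  // retains roughly the configured fraction in place
      return splits;
    }
  }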
Example #19
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    final String overlordUrl = conf.get(CONF_DRUID_OVERLORD_HOSTPORT);
    final String storageDir = conf.get(CONF_DRUID_STORAGE_STORAGE_DIR);
    String dataSource = conf.get(CONF_DRUID_DATASOURCE);
    String intervalStr = conf.get(CONF_DRUID_INTERVAL);

    logger.info("druid overlord url = " + overlordUrl);
    logger.info("druid storage dir = " + storageDir);
    logger.info("druid datasource = " + dataSource);
    logger.info("druid datasource interval = " + intervalStr);

    // TODO: currently we are creating 1 split per segment which is not really
    // necessary, we can use some configuration to combine multiple segments into
    // one input split
    List<InputSplit> splits =
        Lists.transform(
            druid.getSegmentPathsToLoad(
                dataSource, new Interval(intervalStr), storageDir, overlordUrl),
            new Function<String, InputSplit>() {
              @Override
              public InputSplit apply(String input) {
                return new DruidInputSplit(input);
              }
            });

    logger.info("Number of splits = " + splits.size());
    return splits;
  }
Example #20
  /**
   * Splitter used by both Vertex and Edge Input Format.
   *
   * @param context The job context
   * @param estimation Number of estimated objects
   * @return splits to be generated to read the input
   */
  public static List<InputSplit> getSplits(JobContext context, long estimation)
      throws IOException, InterruptedException {

    int chunks = context.getConfiguration().getInt("mapred.map.tasks", 1);
    long chunkSize = estimation / chunks;
    List<InputSplit> splits = new ArrayList<InputSplit>();

    if (LOG.isDebugEnabled()) {
      LOG.debug(String.format("Estimated objects: %d", estimation));
      LOG.debug(String.format("Number of chunks: %d", chunks));
    }

    for (int i = 0; i < chunks; ++i) {
      long start = i * chunkSize;
      long end = ((i + 1) == chunks) ? Long.MAX_VALUE : (i * chunkSize) + chunkSize;
      RexsterInputSplit split = new RexsterInputSplit(start, end);
      splits.add(split);

      if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("Chunk: start %d; end %d;", start, end));
        LOG.debug(String.format("Chunk: size %d;", chunkSize));
        LOG.debug(split);
      }
    }

    return splits;
  }
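A quick worked example of the chunk arithmetic above, with made-up numbers:

  // Made-up numbers: estimation = 103 objects, mapred.map.tasks = 4.
  long estimation = 103;
  int chunks = 4;
  long chunkSize = estimation / chunks;  // 25 (integer division)
  // Resulting splits: [0, 25), [25, 50), [50, 75), [75, Long.MAX_VALUE).
  // The last chunk is open-ended, so the remainder (objects 100..102) is not dropped.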
Example #21
 /**
  * Get a PathFilter instance of the filter set for the input paths.
  *
  * @return the PathFilter instance set for the job, NULL if none has been set.
  */
 public static PathFilter getInputPathFilter(JobContext context) {
   Configuration conf = context.getConfiguration();
   Class<?> filterClass = conf.getClass(PATHFILTER_CLASS, null, PathFilter.class);
   return (filterClass != null)
       ? (PathFilter) ReflectionUtils.newInstance(filterClass, conf)
       : null;
 }
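The corresponding setter side is not shown; a minimal sketch, assuming PATHFILTER_CLASS is the same key that the standard FileInputFormat.setInputPathFilter writes (the filter class itself is illustrative):

  // Illustrative custom filter plus the standard registration call.
  public static class AvroOnlyFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
      return path.getName().endsWith(".avro");
    }
  }

  // In the driver, so that getInputPathFilter above returns an AvroOnlyFilter instance:
  // FileInputFormat.setInputPathFilter(job, AvroOnlyFilter.class);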
Example #22
  @Override
  public List<LuceneSegmentInputSplit> getSplits(JobContext context)
      throws IOException, InterruptedException {
    Configuration configuration = context.getConfiguration();

    LuceneStorageConfiguration lucene2SeqConfiguration =
        new LuceneStorageConfiguration(configuration);

    List<LuceneSegmentInputSplit> inputSplits = new ArrayList<>();

    List<Path> indexPaths = lucene2SeqConfiguration.getIndexPaths();
    for (Path indexPath : indexPaths) {
      ReadOnlyFileSystemDirectory directory =
          new ReadOnlyFileSystemDirectory(
              FileSystem.get(configuration), indexPath, false, configuration);
      SegmentInfos segmentInfos = new SegmentInfos();
      segmentInfos.read(directory);

      for (SegmentCommitInfo segmentInfo : segmentInfos) {
        LuceneSegmentInputSplit inputSplit =
            new LuceneSegmentInputSplit(
                indexPath, segmentInfo.info.name, segmentInfo.sizeInBytes());
        inputSplits.add(inputSplit);
        LOG.info(
            "Created {} byte input split for index '{}' segment {}",
            segmentInfo.sizeInBytes(),
            indexPath.toUri(),
            segmentInfo.info.name);
      }
    }

    return inputSplits;
  }
Example #23
 private void addAvroFilesInSubdirsToSplits(
     List<InputSplit> splits, List<Path> subdirs, FileSystem fs, JobContext cx)
     throws FileNotFoundException, IOException {
   List<Path> files = findAvroFilesInDirs(subdirs, fs);
   Job helperJob = Job.getInstance(cx.getConfiguration());
   setInputPaths(helperJob, files.toArray(new Path[files.size()]));
   splits.addAll(super.getSplits(helperJob));
 }
Example #24
  @Override
  protected boolean isSplitable(JobContext context, Path filename) {

    CompressionCodec codec =
        new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);

    return codec == null;
  }
Example #25
 @Override
 protected boolean isSplitable(JobContext context, Path file) {
   CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
   if (null == codec) {
     return true;
   }
   return codec instanceof SplittableCompressionCodec;
 }
Example #26
 public List<InputSplit> getSplits(JobContext jobContext) {
   List<InputSplit> ret = new ArrayList<InputSplit>();
   int numSplits = jobContext.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
   for (int i = 0; i < numSplits; ++i) {
     ret.add(new EmptySplit());
   }
   return ret;
 }
Example #27
 /** Create the desired number of splits, dividing the number of rows between the mappers. */
 @Override
 public List<InputSplit> getSplits(JobContext job) {
   long totalRows = job.getConfiguration().getLong(NUMROWS, 0);
   int numSplits = job.getConfiguration().getInt(NUMSPLITS, 1);
   long rowsPerSplit = totalRows / numSplits;
   System.out.println(
       "Generating " + totalRows + " using " + numSplits + " maps with step of " + rowsPerSplit);
   ArrayList<InputSplit> splits = new ArrayList<>(numSplits);
   long currentRow = 0;
   for (int split = 0; split < numSplits - 1; ++split) {
     splits.add(new RangeInputSplit(currentRow, rowsPerSplit));
     currentRow += rowsPerSplit;
   }
   splits.add(new RangeInputSplit(currentRow, totalRows - currentRow));
   System.out.println("Done Generating.");
   return splits;
 }
Example #28
 /**
  * Get the list of input {@link Path}s for the map-reduce job.
  *
  * @param context The job
  * @return the list of input {@link Path}s for the map-reduce job.
  */
 public static Path[] getInputPaths(JobContext context) {
   String dirs = context.getConfiguration().get(INPUT_DIR, "");
   String[] list = StringUtils.split(dirs);
   Path[] result = new Path[list.length];
   for (int i = 0; i < list.length; i++) {
     result[i] = new Path(StringUtils.unEscapeString(list[i]));
   }
   return result;
 }
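For completeness, a hedged sketch of how INPUT_DIR is typically populated on the driver side, assuming this class mirrors the standard FileInputFormat helpers (paths are illustrative):

  // Illustrative driver-side counterpart: the standard helpers escape each path and store the
  // comma-separated list under the key that getInputPaths above reads back.
  private static void configureInputs(Configuration conf) throws IOException {
    Job job = Job.getInstance(conf, "example");
    FileInputFormat.addInputPath(job, new Path("/data/input/part1"));
    FileInputFormat.addInputPath(job, new Path("/data/input/part2"));
    // or equivalently:
    // FileInputFormat.setInputPaths(job, "/data/input/part1,/data/input/part2");
  }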
Example #29
 /**
  * Sets the access mode for stage resources in the job.
  *
  * @param context the current job context
  * @param mode the access mode
  * @since 0.7.1
  */
 public static void setAccessMode(JobContext context, AccessMode mode) {
   if (context == null) {
     throw new IllegalArgumentException("context must not be null"); // $NON-NLS-1$
   }
   if (mode == null) {
     throw new IllegalArgumentException("mode must not be null"); // $NON-NLS-1$
   }
   context.getConfiguration().set(KEY_ACCESS_MODE, mode.encode());
 }
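A short hedged call-site sketch; AccessMode.DIRECT is an assumed constant name used purely for illustration:

  // Illustrative only: choose an access mode before submitting the job.
  Job job = Job.getInstance(conf, "stage-resources");
  setAccessMode(job, AccessMode.DIRECT);  // null arguments would trigger the checks above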
Example #30
  /** {@inheritDoc} */
  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    if (codec != null) {
      return false;
    }

    return true;
  }