/** Returns true when the job is running on the backend: a task id is present or Pig is illustrating. */
public static boolean checkJobContextIfRunningFromBackend(JobContext j) {
  if (j.getConfiguration().get("mapred.task.id", "").equals("")
      && !("true".equals(j.getConfiguration().get("pig.illustrating")))) {
    return false;
  }
  return true;
}
public static Map<FormatBundle, Map<Integer, List<Path>>> getFormatNodeMap(JobContext job) {
  Map<FormatBundle, Map<Integer, List<Path>>> formatNodeMap = Maps.newHashMap();
  Configuration conf = job.getConfiguration();
  String crunchInputs = conf.get(CRUNCH_INPUTS);
  if (crunchInputs == null || crunchInputs.isEmpty()) {
    return ImmutableMap.of();
  }
  for (String input : Splitter.on(RECORD_SEP).split(crunchInputs)) {
    List<String> fields = Lists.newArrayList(SPLITTER.split(input));
    FormatBundle<InputFormat> inputBundle =
        FormatBundle.fromSerialized(fields.get(0), job.getConfiguration());
    if (!formatNodeMap.containsKey(inputBundle)) {
      formatNodeMap.put(inputBundle, Maps.<Integer, List<Path>>newHashMap());
    }
    Integer nodeIndex = Integer.valueOf(fields.get(1));
    if (!formatNodeMap.get(inputBundle).containsKey(nodeIndex)) {
      formatNodeMap.get(inputBundle).put(nodeIndex, Lists.<Path>newLinkedList());
    }
    List<Path> formatNodePaths = formatNodeMap.get(inputBundle).get(nodeIndex);
    String paths = fields.get(2);
    for (String path : Splitter.on(PATH_SEP).split(paths)) {
      formatNodePaths.add(new Path(path));
    }
  }
  return formatNodeMap;
}
public SqoopOutputFormatLoadExecutorSpark(JobContext jobctx) {
  context = jobctx;
  loaderName = context.getConfiguration().get(MRJobConstants.JOB_ETL_LOADER);
  writer = new SqoopRecordWriter();
  // jackh: This must be conditional - Extract schema using credentials in case of MR and simply
  // extract from the credentials object in case of Spark (due to known issue with Hadoop/Spark
  // that the credentials are never added for serialization)
  // matcher = MatcherFactory.getMatcher(
  //     MRConfigurationUtils.getConnectorSchema(Direction.FROM, context.getConfiguration()),
  //     MRConfigurationUtils.getConnectorSchema(Direction.TO, context.getConfiguration()));
  matcher =
      MatcherFactory.getMatcher(
          MRConfigurationUtils.getConnectorSchemaUnsafe(Direction.FROM, context.getConfiguration()),
          MRConfigurationUtils.getConnectorSchemaUnsafe(Direction.TO, context.getConfiguration()));
  toDataFormat =
      (IntermediateDataFormat<?>)
          ClassUtils.instantiate(
              context.getConfiguration().get(MRJobConstants.TO_INTERMEDIATE_DATA_FORMAT));
  // Using the TO schema since the SqoopDataWriter in the SqoopMapper encapsulates the toDataFormat
  toDataFormat.setSchema(matcher.getToSchema());
}
public static void checkOutputSpecs(JobContext jc) throws IOException, InterruptedException {
  Map<String, OutputConfig> outputs = getNamedOutputs(jc.getConfiguration());
  for (Map.Entry<String, OutputConfig> e : outputs.entrySet()) {
    String namedOutput = e.getKey();
    Job job = getJob(jc.getJobID(), namedOutput, jc.getConfiguration());
    OutputFormat fmt = getOutputFormat(namedOutput, job, e.getValue());
    fmt.checkOutputSpecs(job);
  }
}
@Test
public void testDeleteMissing() {
  TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
  JobContext jobContext =
      new JobContextImpl(
          taskAttemptContext.getConfiguration(),
          taskAttemptContext.getTaskAttemptID().getJobID());
  Configuration conf = jobContext.getConfiguration();

  String sourceBase;
  String targetBase;
  FileSystem fs = null;
  try {
    OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
    fs = FileSystem.get(conf);
    sourceBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
    targetBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
    String targetBaseAdd = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
    fs.rename(new Path(targetBaseAdd), new Path(targetBase));

    DistCpOptions options =
        new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
    options.setSyncFolder(true);
    options.setDeleteMissing(true);
    options.appendToConf(conf);

    CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
    Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
    listing.buildListing(listingFile, options);

    conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
    conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);

    committer.commitJob(jobContext);
    if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
      Assert.fail("Source and target folders are not in sync");
    }
    if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
      Assert.fail("Source and target folders are not in sync");
    }

    // Test for idempotent commit
    committer.commitJob(jobContext);
    if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
      Assert.fail("Source and target folders are not in sync");
    }
    if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
      Assert.fail("Source and target folders are not in sync");
    }
  } catch (Throwable e) {
    LOG.error("Exception encountered while testing for delete missing", e);
    Assert.fail("Delete missing failure");
  } finally {
    TestDistCpUtils.delete(fs, "/tmp1");
    conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
  }
}
@Override
public void abortJob(JobContext jobContext, JobStatus.State state) throws IOException {
  Configuration conf = jobContext.getConfiguration();
  for (Map.Entry<String, OutputCommitter> e : committers.entrySet()) {
    Job job = getJob(jobContext.getJobID(), e.getKey(), conf);
    configureJob(e.getKey(), job, outputs.get(e.getKey()));
    e.getValue().abortJob(job, state);
  }
}
/**
 * Generate the list of files and make them into FileSplits. This needs to be copied to insert a
 * filter on acceptable data.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
  long maxSize = getMaxSplitSize(job);
  long desiredMappers =
      job.getConfiguration().getLong("org.systemsbiology.jxtandem.DesiredXMLInputMappers", 0);

  // generate splits
  List<InputSplit> splits = new ArrayList<InputSplit>();
  List<FileStatus> fileStatuses = listStatus(job);
  boolean forceNumberMappers = fileStatuses.size() == 1;
  for (FileStatus file : fileStatuses) {
    Path path = file.getPath();
    if (!isPathAcceptable(path)) { // filter acceptable data
      continue;
    }
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(job, path)) {
      long blockSize = file.getBlockSize();
      // use desired mappers to force more splits
      if (forceNumberMappers && desiredMappers > 0) {
        maxSize = Math.min(maxSize, (length / desiredMappers));
      }
      long splitSize = computeSplitSize(blockSize, minSize, maxSize);
      long bytesRemaining = length;
      while (withinSlop(splitSize, bytesRemaining)) {
        int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
        splits.add(
            new FileSplit(
                path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()));
        bytesRemaining -= splitSize;
      }
      if (bytesRemaining != 0) {
        splits.add(
            new FileSplit(
                path,
                length - bytesRemaining,
                bytesRemaining,
                blkLocations[blkLocations.length - 1].getHosts()));
      }
    } else if (length != 0) {
      splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
    } else {
      // Create empty hosts array for zero length files
      splits.add(new FileSplit(path, 0, length, new String[0]));
    }
  }
  System.out.println("Total # of splits: " + splits.size());
  // LOG.debug("Total # of splits: " + splits.size());
  return splits;
}
/**
 * Validates that a valid FIXED_RECORD_LENGTH config property has been set and, if so, returns the
 * splits. If the FIXED_RECORD_LENGTH property has not been set, this will throw an IOException.
 *
 * @inheritDoc
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  // fetch configuration
  Configuration conf = job.getConfiguration();

  // ensure recordLength is properly set up
  try {
    if (this.recordLength == -1) {
      this.recordLength = getAndValidateRecordLength(job.getConfiguration());
    }
    LOG.info("FixedLengthInputFormat: my fixed record length is: " + recordLength);
  } catch (Exception e) {
    throw new IOException(
        "FixedLengthInputFormat requires the"
            + " Configuration property:"
            + FIXED_RECORD_LENGTH
            + " to"
            + " be set to something > 0. Currently the value is 0 (zero)");
  }

  // ensure recordKey start/end is set up properly if it was defined by the user
  if (this.recordKeyStartAt == -1) {
    this.recordKeyStartAt = FixedLengthInputFormat.getRecordKeyStartAt(conf);
    this.recordKeyEndAt = FixedLengthInputFormat.getRecordKeyEndAt(conf);

    // if one is set, they BOTH must be set, this is an error
    // if endAt < startAt, this is an error
    // if either is > record length, this is an error
    // if either is < -1 (default), this is an error
    if ((recordKeyStartAt >= 0 && recordKeyEndAt == -1)
        || (recordKeyStartAt == -1 && recordKeyEndAt >= 0)
        || (recordKeyEndAt < recordKeyStartAt)
        || (recordKeyEndAt > recordLength)
        || (recordKeyStartAt > recordLength)
        || (recordKeyStartAt < -1)
        || (recordKeyEndAt < -1)) {
      throw new IOException(
          "FixedLengthInputFormat requires the"
              + " optional configuration properties:"
              + FIXED_RECORD_KEY_START_AT
              + " and"
              + FIXED_RECORD_KEY_END_AT
              + " to A) be less than the "
              + " fixed record length. B) both must be set together C) neither "
              + " can be less than 0. D) end at must be > start at.");
    }
  }

  return super.getSplits(job);
}
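// A minimal driver-side sketch of how the format above might be configured. It assumes
// FIXED_RECORD_LENGTH, FIXED_RECORD_KEY_START_AT, and FIXED_RECORD_KEY_END_AT are public String
// constants on FixedLengthInputFormat (they are referenced that way in getSplits above); the
// record length and key range values here are purely illustrative.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class FixedLengthDriverSketch {
  public static Job configure(Configuration conf) throws Exception {
    // Every record is exactly 100 bytes; bytes 0-9 serve as the record key.
    conf.setInt(FixedLengthInputFormat.FIXED_RECORD_LENGTH, 100);
    conf.setInt(FixedLengthInputFormat.FIXED_RECORD_KEY_START_AT, 0);
    conf.setInt(FixedLengthInputFormat.FIXED_RECORD_KEY_END_AT, 9);

    Job job = Job.getInstance(conf, "fixed-length-read");
    job.setInputFormatClass(FixedLengthInputFormat.class);
    return job;
  }
}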
private void setSplitSize(JobContext cx) {
  super.setMaxSplitSize(
      cx.getConfiguration()
          .getLong(
              COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE, DEFAULT_COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE));
  super.setMinSplitSizeNode(
      cx.getConfiguration()
          .getLong(
              COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE, DEFAULT_COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE));
}
public List<InputSplit> getSplits(JobContext job) throws IOException {
  Configuration conf = job.getConfiguration();
  int numMapTasks = conf.getInt("admm.iteration.num.map.tasks", 0);
  if (0 == numMapTasks) {
    return super.getSplits(job);
  }

  // generate splits
  List<InputSplit> splits = new ArrayList<InputSplit>();
  List<FileStatus> files = listStatus(job);
  for (FileStatus file : files) {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(job, path)) {
      long blockSize = file.getBlockSize();
      long splitSize = Math.max(computeSplitSize(JAVA_OPTS, numMapTasks, length), blockSize);
      long splitLength = (long) (length / Math.ceil((double) length / splitSize));
      long bytesRemaining = length;
      while (((double) bytesRemaining) / splitLength > SPLIT_SLOP) {
        int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
        splits.add(
            new FileSplit(
                path, length - bytesRemaining, splitLength, blkLocations[blkIndex].getHosts()));
        bytesRemaining -= splitLength;
      }
      if (bytesRemaining != 0) {
        splits.add(
            new FileSplit(
                path,
                length - bytesRemaining,
                bytesRemaining,
                blkLocations[blkLocations.length - 1].getHosts()));
      }
    } else if (length != 0) {
      splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
    } else {
      splits.add(new FileSplit(path, 0, length, new String[0]));
    }
  }

  // Save the number of input files in the job-conf
  job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
  job.getConfiguration().setInt("admm.iteration.num.map.tasks", splits.size());
  return splits;
}
/**
 * List input directories. Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * @param job the job to list input paths for
 * @return a list of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
  Path[] dirs = getInputPaths(job);
  if (dirs.length == 0) {
    throw new IOException("No input paths specified in job");
  }

  // get tokens for all the required FileSystems..
  TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

  // Whether we need to recursively look into the directory structure
  boolean recursive = getInputDirRecursive(job);

  // creates a MultiPathFilter with the hiddenFileFilter and the
  // user provided one (if any).
  List<PathFilter> filters = new ArrayList<PathFilter>();
  filters.add(hiddenFileFilter);
  PathFilter jobFilter = getInputPathFilter(job);
  if (jobFilter != null) {
    filters.add(jobFilter);
  }
  PathFilter inputFilter = new MultiPathFilter(filters);

  List<FileStatus> result = null;
  int numThreads =
      job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS);
  Stopwatch sw = new Stopwatch().start();
  if (numThreads == 1) {
    result = singleThreadedListStatus(job, dirs, inputFilter, recursive);
  } else {
    Iterable<FileStatus> locatedFiles = null;
    try {
      LocatedFileStatusFetcher locatedFileStatusFetcher =
          new LocatedFileStatusFetcher(job.getConfiguration(), dirs, recursive, inputFilter, true);
      locatedFiles = locatedFileStatusFetcher.getFileStatuses();
    } catch (InterruptedException e) {
      throw new IOException("Interrupted while getting file statuses");
    }
    result = Lists.newArrayList(locatedFiles);
  }
  sw.stop();
  if (LogGlobal.isDebugEnabled()) {
    /* LOG.debug("Time taken to get FileStatuses: "+sw.elapsedMillis()) */
    LOG.time_taken_get_filestatuses(String.valueOf(sw.elapsedMillis())).tag("methodCall").debug();
  }
  /* LOG.info("Total input paths to process : "+result.size()) */
  LOG.total_input_paths_process(String.valueOf(result.size())).tag("methodCall").info();
  return result;
}
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  List<InputSplit> splitList = new ArrayList<InputSplit>();
  int totalRows = job.getConfiguration().getInt("LoadSplit.TOTAL_ROWS", 0);
  int clients = job.getConfiguration().getInt("LoadSplit.CLIENTS", 1);
  int numRows = totalRows / clients;
  for (int ii = 0; ii < clients; ++ii) {
    int startRow = ii * numRows;
    LoadSplit split = new LoadSplit(startRow, numRows, ii);
    splitList.add(split);
  }
  return splitList;
}
@Override
public void checkOutputSpecs(JobContext job) throws IOException {
  String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF);
  if (hosts == null || hosts.isEmpty()) {
    throw new IOException(REDIS_HOSTS_CONF + " is not set in configuration.");
  }
  String hashKey = job.getConfiguration().get(REDIS_HASH_KEY_CONF);
  if (hashKey == null || hashKey.isEmpty()) {
    throw new IOException(REDIS_HASH_KEY_CONF + " is not set in configuration.");
  }
}
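// A hedged sketch of the driver-side setup the check above expects. The output format class name
// RedisHashOutputFormat, the host list, and the hash key are assumptions for illustration;
// REDIS_HOSTS_CONF and REDIS_HASH_KEY_CONF are assumed to be public String constants on it.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class RedisOutputDriverSketch {
  public static Job configure(Configuration conf) throws Exception {
    conf.set(RedisHashOutputFormat.REDIS_HOSTS_CONF, "redis-1:6379,redis-2:6379");
    conf.set(RedisHashOutputFormat.REDIS_HASH_KEY_CONF, "user:last_access");

    Job job = Job.getInstance(conf, "redis-hash-load");
    job.setOutputFormatClass(RedisHashOutputFormat.class);
    return job;
  }
}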
@Test
public void testPreserveStatus() {
  TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
  JobContext jobContext =
      new JobContextImpl(
          taskAttemptContext.getConfiguration(),
          taskAttemptContext.getTaskAttemptID().getJobID());
  Configuration conf = jobContext.getConfiguration();

  String sourceBase;
  String targetBase;
  FileSystem fs = null;
  try {
    OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
    fs = FileSystem.get(conf);
    FsPermission sourcePerm = new FsPermission((short) 511);
    FsPermission initialPerm = new FsPermission((short) 448);
    sourceBase = TestDistCpUtils.createTestSetup(fs, sourcePerm);
    targetBase = TestDistCpUtils.createTestSetup(fs, initialPerm);

    DistCpOptions options =
        new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
    options.preserve(FileAttribute.PERMISSION);
    options.appendToConf(conf);

    CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
    Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
    listing.buildListing(listingFile, options);

    conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);

    committer.commitJob(jobContext);
    if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
      Assert.fail("Permission don't match");
    }

    // Test for idempotent commit
    committer.commitJob(jobContext);
    if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
      Assert.fail("Permission don't match");
    }
  } catch (IOException e) {
    LOG.error("Exception encountered while testing for preserve status", e);
    Assert.fail("Preserve status failure");
  } finally {
    TestDistCpUtils.delete(fs, "/tmp1");
  }
}
/**
 * Creates splits with multiple indexes per split (if they are smaller than
 * maxCombinedIndexSizePerSplit). It is possible for a split to be larger than
 * maxCombinedIndexSizePerSplit, if it consists of a single index that is larger than
 * maxCombinedIndexSizePerSplit.
 *
 * <p>All inputPaths will be searched for indexes recursively
 *
 * <p>The bin-packing problem of combining splits is solved naively:
 *
 * <ol>
 *   <li>Sort all indexes by size
 *   <li>Begin packing indexes into a split until adding the next index would cause the split to
 *       exceed maxCombinedIndexSizePerSplit
 *   <li>Begin packing subsequent indexes into the next split, and so on
 * </ol>
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
  // load settings from job conf
  loadConfig(job.getConfiguration());

  // find all the index dirs and create a split for each
  PriorityQueue<LuceneIndexInputSplit> splits = findSplits(job.getConfiguration());

  // combine the splits based on maxCombineSplitSize
  List<InputSplit> combinedSplits =
      combineSplits(splits, maxCombinedIndexSizePerSplit, maxNumIndexesPerSplit);

  return combinedSplits;
}
private String[] getActiveServersList(JobContext context) {
  String[] servers = null;
  try {
    // Note: the cast assumes the context's backing Configuration is actually a JobConf,
    // which holds for JobContextImpl.
    JobClient jc = new JobClient((JobConf) context.getConfiguration());
    ClusterStatus status = jc.getClusterStatus(true);
    Collection<String> atc = status.getActiveTrackerNames();
    servers = new String[atc.size()];
    int s = 0;
    for (String serverInfo : atc) {
      // Active tracker names look like "tracker_<host>:<port>"; keep only the host part.
      // System.out.println("serverInfo:" + serverInfo);
      StringTokenizer st = new StringTokenizer(serverInfo, ":");
      String trackerName = st.nextToken();
      // System.out.println("trackerName:" + trackerName);
      StringTokenizer st1 = new StringTokenizer(trackerName, "_");
      st1.nextToken();
      servers[s++] = st1.nextToken();
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
  return servers;
}
private List<InputSplit> getSplits(JobContext cx, List<Path> dirs)
    throws FileNotFoundException, IOException {
  List<InputSplit> splits = Lists.newArrayList();
  List<Path> subdirs = Lists.newArrayList();
  long totalFileCount = 0;
  FileSystem fs = FileSystem.get(cx.getConfiguration());
  for (Path input : dirs) {
    long count = fs.getContentSummary(input).getFileCount();
    subdirs.add(input);
    if (totalFileCount + count < GET_SPLIT_NUM_FILES_TRHESHOLD) {
      totalFileCount += count;
    } else {
      addAvroFilesInSubdirsToSplits(splits, subdirs, fs, cx);
      subdirs.clear();
      totalFileCount = 0;
    }
  }
  if (totalFileCount > 0) {
    addAvroFilesInSubdirsToSplits(splits, subdirs, fs, cx);
  }
  return splits;
}
/**
 * This function modifies the splits list in place to retain only a random fraction of the input
 * splits. The fraction is expected in "starfish.profiler.sampling.fraction" as a number between 0
 * and 1. The default value is 0.1.
 *
 * @param job The job context
 * @param splits The list of input splits to modify
 */
public static void sampleInputSplits(JobContext job, List<InputSplit> splits) {
  // Get the sampling fraction
  Configuration conf = job.getConfiguration();
  double fraction = conf.getFloat(Profiler.PROFILER_SAMPLING_FRACTION, 0.1f);
  if (fraction < 0 || fraction > 1) {
    throw new RuntimeException("ERROR: Invalid sampling fraction: " + fraction);
  }

  // Handle corner cases
  if (fraction == 0 || splits.size() == 0) {
    splits.clear();
    return;
  }
  if (fraction == 1) {
    return;
  }

  // Calculate the number of samples
  int numSplits = splits.size();
  int sampleSize = (int) Math.round(numSplits * fraction);
  if (sampleSize == 0) {
    sampleSize = 1;
  }

  // Shuffle the splits
  Collections.shuffle(splits);

  // Retain only a sampleSize number of splits
  for (int i = splits.size() - 1; i >= sampleSize; --i) {
    splits.remove(i);
  }

  nf.setMaximumFractionDigits(2);
  LOG.info("Executing only " + nf.format(fraction * 100) + "% of the map tasks");
}
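// A hedged usage sketch: an InputFormat that computes its splits normally and then thins them with
// the sampling utility above. The wrapper class name and the SamplingUtils host class are
// hypothetical; only the configuration key "starfish.profiler.sampling.fraction" comes from the
// method above.
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SampledTextInputFormat extends TextInputFormat {
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    // Retain only the configured fraction of splits (default 0.1); e.g. set
    // "starfish.profiler.sampling.fraction" to 0.25 to profile a quarter of the map tasks.
    SamplingUtils.sampleInputSplits(job, splits);
    return splits;
  }
}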
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  final String overlordUrl = conf.get(CONF_DRUID_OVERLORD_HOSTPORT);
  final String storageDir = conf.get(CONF_DRUID_STORAGE_STORAGE_DIR);
  String dataSource = conf.get(CONF_DRUID_DATASOURCE);
  String intervalStr = conf.get(CONF_DRUID_INTERVAL);

  logger.info("druid overlord url = " + overlordUrl);
  logger.info("druid storage dir = " + storageDir);
  logger.info("druid datasource = " + dataSource);
  logger.info("druid datasource interval = " + intervalStr);

  // TODO: currently we are creating 1 split per segment which is not really
  // necessary, we can use some configuration to combine multiple segments into
  // one input split
  List<InputSplit> splits =
      Lists.transform(
          druid.getSegmentPathsToLoad(dataSource, new Interval(intervalStr), storageDir, overlordUrl),
          new Function<String, InputSplit>() {
            @Override
            public InputSplit apply(String input) {
              return new DruidInputSplit(input);
            }
          });

  logger.info("Number of splits = " + splits.size());
  return splits;
}
/**
 * Splitter used by both Vertex and Edge Input Format.
 *
 * @param context The job context
 * @param estimation Number of estimated objects
 * @return splits to be generated to read the input
 */
public static List<InputSplit> getSplits(JobContext context, long estimation)
    throws IOException, InterruptedException {
  int chunks = context.getConfiguration().getInt("mapred.map.tasks", 1);
  long chunkSize = estimation / chunks;
  List<InputSplit> splits = new ArrayList<InputSplit>();
  if (LOG.isDebugEnabled()) {
    LOG.debug(String.format("Estimated objects: %d", estimation));
    LOG.debug(String.format("Number of chunks: %d", chunks));
  }
  for (int i = 0; i < chunks; ++i) {
    long start = i * chunkSize;
    long end = ((i + 1) == chunks) ? Long.MAX_VALUE : (i * chunkSize) + chunkSize;
    RexsterInputSplit split = new RexsterInputSplit(start, end);
    splits.add(split);
    if (LOG.isDebugEnabled()) {
      LOG.debug(String.format("Chunk: start %d; end %d;", start, end));
      LOG.debug(String.format("Chunk: size %d;", chunkSize));
      LOG.debug(split);
    }
  }
  return splits;
}
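// A small worked example of the chunking arithmetic above (not part of the original code): with an
// estimation of 1000 objects and "mapred.map.tasks" = 4, chunkSize is 250 and the chunks are
// [0,250), [250,500), [500,750), and [750, Long.MAX_VALUE) -- the last chunk is left open-ended so
// that objects beyond the estimate are still read.
public class ChunkBoundaryDemo {
  public static void main(String[] args) {
    long estimation = 1000L;
    int chunks = 4;
    long chunkSize = estimation / chunks;
    for (int i = 0; i < chunks; ++i) {
      long start = i * chunkSize;
      long end = ((i + 1) == chunks) ? Long.MAX_VALUE : start + chunkSize;
      System.out.printf("chunk %d: [%d, %d)%n", i, start, end);
    }
  }
}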
/**
 * Get a PathFilter instance of the filter set for the input paths.
 *
 * @return the PathFilter instance set for the job, NULL if none has been set.
 */
public static PathFilter getInputPathFilter(JobContext context) {
  Configuration conf = context.getConfiguration();
  Class<?> filterClass = conf.getClass(PATHFILTER_CLASS, null, PathFilter.class);
  return (filterClass != null) ? (PathFilter) ReflectionUtils.newInstance(filterClass, conf) : null;
}
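// A hedged sketch of the other side of this API: defining a PathFilter and registering it under
// the PATHFILTER_CLASS property. It assumes the usual FileInputFormat.setInputPathFilter(...)
// counterpart is the setter for that key; the ".tmp" suffix rule is illustrative only.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class TmpExcludingFilterSketch {
  /** Skips work-in-progress files; must have a no-arg constructor for ReflectionUtils. */
  public static class NoTmpFilesFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
      return !path.getName().endsWith(".tmp");
    }
  }

  public static void register(Job job) {
    FileInputFormat.setInputPathFilter(job, NoTmpFilesFilter.class);
  }
}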
@Override
public List<LuceneSegmentInputSplit> getSplits(JobContext context)
    throws IOException, InterruptedException {
  Configuration configuration = context.getConfiguration();

  LuceneStorageConfiguration lucene2SeqConfiguration = new LuceneStorageConfiguration(configuration);

  List<LuceneSegmentInputSplit> inputSplits = new ArrayList<>();
  List<Path> indexPaths = lucene2SeqConfiguration.getIndexPaths();
  for (Path indexPath : indexPaths) {
    ReadOnlyFileSystemDirectory directory =
        new ReadOnlyFileSystemDirectory(FileSystem.get(configuration), indexPath, false, configuration);
    SegmentInfos segmentInfos = new SegmentInfos();
    segmentInfos.read(directory);

    for (SegmentCommitInfo segmentInfo : segmentInfos) {
      LuceneSegmentInputSplit inputSplit =
          new LuceneSegmentInputSplit(indexPath, segmentInfo.info.name, segmentInfo.sizeInBytes());
      inputSplits.add(inputSplit);
      LOG.info(
          "Created {} byte input split for index '{}' segment {}",
          segmentInfo.sizeInBytes(),
          indexPath.toUri(),
          segmentInfo.info.name);
    }
  }
  return inputSplits;
}
private void addAvroFilesInSubdirsToSplits(
    List<InputSplit> splits, List<Path> subdirs, FileSystem fs, JobContext cx)
    throws FileNotFoundException, IOException {
  List<Path> files = findAvroFilesInDirs(subdirs, fs);
  Job helperJob = Job.getInstance(cx.getConfiguration());
  setInputPaths(helperJob, files.toArray(new Path[files.size()]));
  splits.addAll(super.getSplits(helperJob));
}
@Override
protected boolean isSplitable(JobContext context, Path filename) {
  CompressionCodec codec =
      new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
  return codec == null;
}
@Override
protected boolean isSplitable(JobContext context, Path file) {
  CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
  if (null == codec) {
    return true;
  }
  return codec instanceof SplittableCompressionCodec;
}
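// A quick, self-contained illustration of what the check above returns for common cases, using
// codec classes that ship with Hadoop: plain text has no codec (splittable), gzip has no splittable
// support, and bzip2 implements SplittableCompressionCodec, so .bz2 files remain splittable.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;

public class SplittabilityDemo {
  public static void main(String[] args) {
    CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
    for (String name : new String[] {"logs.txt", "logs.txt.gz", "logs.txt.bz2"}) {
      CompressionCodec codec = factory.getCodec(new Path(name));
      boolean splitable = (codec == null) || (codec instanceof SplittableCompressionCodec);
      System.out.println(name + " -> splitable: " + splitable);
    }
  }
}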
public List<InputSplit> getSplits(JobContext jobContext) {
  List<InputSplit> ret = new ArrayList<InputSplit>();
  int numSplits = jobContext.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
  for (int i = 0; i < numSplits; ++i) {
    ret.add(new EmptySplit());
  }
  return ret;
}
/** Create the desired number of splits, dividing the number of rows between the mappers. */
@Override
public List<InputSplit> getSplits(JobContext job) {
  long totalRows = job.getConfiguration().getLong(NUMROWS, 0);
  int numSplits = job.getConfiguration().getInt(NUMSPLITS, 1);
  long rowsPerSplit = totalRows / numSplits;
  System.out.println(
      "Generating " + totalRows + " using " + numSplits + " maps with step of " + rowsPerSplit);
  ArrayList<InputSplit> splits = new ArrayList<>(numSplits);
  long currentRow = 0;
  for (int split = 0; split < numSplits - 1; ++split) {
    splits.add(new RangeInputSplit(currentRow, rowsPerSplit));
    currentRow += rowsPerSplit;
  }
  splits.add(new RangeInputSplit(currentRow, totalRows - currentRow));
  System.out.println("Done Generating.");
  return splits;
}
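// A worked example of the row division above (illustrative values only): with 10 total rows and
// 3 splits, rowsPerSplit is 3, so the splits cover 3, 3, and 4 rows -- the last split picks up the
// remainder because it is sized as totalRows - currentRow rather than rowsPerSplit.
public class RangeSplitMathDemo {
  public static void main(String[] args) {
    long totalRows = 10;
    int numSplits = 3;
    long rowsPerSplit = totalRows / numSplits;
    long currentRow = 0;
    for (int split = 0; split < numSplits - 1; ++split) {
      System.out.println("split " + split + ": start=" + currentRow + " rows=" + rowsPerSplit);
      currentRow += rowsPerSplit;
    }
    System.out.println(
        "split " + (numSplits - 1) + ": start=" + currentRow + " rows=" + (totalRows - currentRow));
  }
}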
/**
 * Get the list of input {@link Path}s for the map-reduce job.
 *
 * @param context The job
 * @return the list of input {@link Path}s for the map-reduce job.
 */
public static Path[] getInputPaths(JobContext context) {
  String dirs = context.getConfiguration().get(INPUT_DIR, "");
  String[] list = StringUtils.split(dirs);
  Path[] result = new Path[list.length];
  for (int i = 0; i < list.length; i++) {
    result[i] = new Path(StringUtils.unEscapeString(list[i]));
  }
  return result;
}
/**
 * Sets the access mode for stage resources in the job.
 *
 * @param context the current job context
 * @param mode the access mode
 * @since 0.7.1
 */
public static void setAccessMode(JobContext context, AccessMode mode) {
  if (context == null) {
    throw new IllegalArgumentException("context must not be null"); //$NON-NLS-1$
  }
  if (mode == null) {
    throw new IllegalArgumentException("mode must not be null"); //$NON-NLS-1$
  }
  context.getConfiguration().set(KEY_ACCESS_MODE, mode.encode());
}
/** @inheritDoc */
@Override
protected boolean isSplitable(JobContext context, Path file) {
  CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
  if (codec != null) {
    return false;
  }
  return true;
}