public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
    throws IOException {
  try {
    LOG.info(
        "Reading in " + vertexName
            + " taskid " + context.getTaskAttemptID().getTaskID().getId()
            + " key " + key.get());
    LOG.info(
        "Sleeping in FinalReduce"
            + ", vertexName=" + vertexName
            + ", taskAttemptId=" + context.getTaskAttemptID()
            + ", reduceSleepDuration=" + reduceSleepDuration
            + ", reduceSleepCount=" + reduceSleepCount
            + ", sleepLeft=" + (reduceSleepDuration * (reduceSleepCount - count)));
    context.setStatus(
        "Sleeping... (" + (reduceSleepDuration * (reduceSleepCount - count)) + ") ms left");
    if ((reduceSleepCount - count) > 0) {
      Thread.sleep(reduceSleepDuration);
    }
  } catch (InterruptedException ex) {
    throw (IOException) new IOException("Interrupted while sleeping").initCause(ex);
  }
  count++;
}
public void map(IntWritable key, IntWritable value, Context context)
    throws IOException, InterruptedException {
  // it is expected that every map processes mapSleepCount number of records.
  try {
    LOG.info(
        "Reading in " + vertexName
            + " taskid " + context.getTaskAttemptID().getTaskID().getId()
            + " key " + key.get());
    LOG.info(
        "Sleeping in InitialMap"
            + ", vertexName=" + vertexName
            + ", taskAttemptId=" + context.getTaskAttemptID()
            + ", mapSleepDuration=" + mapSleepDuration
            + ", mapSleepCount=" + mapSleepCount
            + ", sleepLeft=" + (mapSleepDuration * (mapSleepCount - count)));
    context.setStatus(
        "Sleeping... (" + (mapSleepDuration * (mapSleepCount - count)) + ") ms left");
    if ((mapSleepCount - count) > 0) {
      Thread.sleep(mapSleepDuration);
    }
    if (throwError || throwFatal) {
      throw new IOException("Throwing a simulated error from map");
    }
  } catch (InterruptedException ex) {
    throw (IOException) new IOException("Interrupted while sleeping").initCause(ex);
  }
  ++count;
  // output reduceSleepCount * numReduce number of random values, so that
  // each reducer will get reduceSleepCount number of keys.
  int k = key.get();
  for (int i = 0; i < value.get(); ++i) {
    LOG.info(
        "Writing in " + vertexName
            + " taskid " + context.getTaskAttemptID().getTaskID().getId()
            + " key " + (k + i) + " value 1");
    context.write(new IntWritable(k + i), new IntWritable(1));
  }
}
public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
    throws IOException, InterruptedException {
  try {
    LOG.info(
        "Reading in " + vertexName
            + " taskid " + context.getTaskAttemptID().getTaskID().getId()
            + " key " + key.get());
    LOG.info(
        "Sleeping in IntermediateReduce"
            + ", vertexName=" + vertexName
            + ", taskAttemptId=" + context.getTaskAttemptID()
            + ", iReduceSleepDuration=" + iReduceSleepDuration
            + ", iReduceSleepCount=" + iReduceSleepCount
            + ", sleepLeft=" + (iReduceSleepDuration * (iReduceSleepCount - count)));
    context.setStatus(
        "Sleeping... (" + (iReduceSleepDuration * (iReduceSleepCount - count)) + ") ms left");
    if ((iReduceSleepCount - count) > 0) {
      Thread.sleep(iReduceSleepDuration);
    }
  } catch (InterruptedException ex) {
    throw (IOException) new IOException("Interrupted while sleeping").initCause(ex);
  }
  ++count;
  // output reduceSleepCount * numReduce number of random values, so that
  // each reducer will get reduceSleepCount number of keys.
  int k = key.get();
  for (IntWritable value : values) {
    for (int i = 0; i < value.get(); ++i) {
      LOG.info(
          "Writing in " + vertexName
              + " taskid " + context.getTaskAttemptID().getTaskID().getId()
              + " key " + (k + i) + " value 1");
      context.write(new IntWritable(k + i), new IntWritable(1));
    }
  }
}
/** Configures the Reduce plan, the POPackage operator and the reporter thread. */
@SuppressWarnings("unchecked")
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  inIllustrator = inIllustrator(context);
  if (inIllustrator) {
    pack = getPack(context);
  }
  Configuration jConf = context.getConfiguration();
  SpillableMemoryManager.configure(ConfigurationUtil.toProperties(jConf));
  context
      .getConfiguration()
      .set(
          PigConstants.TASK_INDEX,
          Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
  sJobContext = context;
  sJobConfInternal.set(context.getConfiguration());
  sJobConf = context.getConfiguration();
  try {
    PigContext.setPackageImportList(
        (ArrayList<String>) ObjectSerializer.deserialize(jConf.get("udf.import.list")));
    pigContext = (PigContext) ObjectSerializer.deserialize(jConf.get("pig.pigContext"));

    // This attempts to fetch all of the generated code from the distributed
    // cache, and resolve it.
    SchemaTupleBackend.initialize(jConf, pigContext);

    if (rp == null) {
      rp = (PhysicalPlan) ObjectSerializer.deserialize(jConf.get("pig.reducePlan"));
    }
    stores = PlanHelper.getPhysicalOperators(rp, POStore.class);

    if (!inIllustrator) {
      pack = (POPackage) ObjectSerializer.deserialize(jConf.get("pig.reduce.package"));
    }
    // To be removed
    if (rp.isEmpty()) {
      log.debug("Reduce Plan empty!");
    } else {
      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      rp.explain(baos);
      log.debug(baos.toString());
    }
    pigReporter = new ProgressableReporter();
    if (!(rp.isEmpty())) {
      roots = rp.getRoots().toArray(new PhysicalOperator[1]);
      leaf = rp.getLeaves().get(0);
    }

    // Get the UDF-specific context.
    MapRedUtil.setupUDFContext(jConf);
  } catch (IOException ioe) {
    String msg = "Problem while configuring reduce plan.";
    throw new RuntimeException(msg, ioe);
  }

  log.info(
      "Aliases being processed per job phase (AliasName[line,offset]): "
          + jConf.get("pig.alias.location"));

  Utils.setDefaultTimeZone(PigMapReduce.sJobConfInternal.get());
}
@Override
public void setup(Context context) throws TableNotFoundException, IOException {
  taskId = context.getTaskAttemptID().getTaskID().getId();
  schemaFileLocation = context.getConfiguration().get(SCHEMA_FILE_LOCATION_CONF);
  FileSystem fs = FileSystem.get(context.getConfiguration());
  columns = generateColumnsFromSchemaFile(fs, schemaFileLocation);
  delimiter = context.getConfiguration().get(DELIMITER_CONF);
  rowKeyColumn = context.getConfiguration().get(ROW_KEY_COLUMN_CONF);
}
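// A minimal companion map() sketch for the setup above — a hedged illustration,
// not from the source. It assumes `columns` is a String[] of "family:qualifier"
// names parsed from the schema file, and it uses a hypothetical helper
// indexOfRowKeyColumn() to resolve rowKeyColumn to its position in the line.
// It emits one HBase Put per delimited input line, keyed by the row-key column.
@Override
protected void map(LongWritable offset, Text line, Context context)
    throws IOException, InterruptedException {
  // The -1 limit keeps trailing empty fields so column positions stay aligned.
  String[] fields = line.toString().split(Pattern.quote(delimiter), -1);
  byte[] rowKey = Bytes.toBytes(fields[indexOfRowKeyColumn()]); // hypothetical helper
  Put put = new Put(rowKey);
  for (int i = 0; i < columns.length; i++) {
    String[] famQual = columns[i].split(":", 2);
    put.addColumn(Bytes.toBytes(famQual[0]), Bytes.toBytes(famQual[1]), Bytes.toBytes(fields[i]));
  }
  context.write(new ImmutableBytesWritable(rowKey), put);
}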
@Override
protected void setup(Context mapperContext) throws IOException, InterruptedException {
  String mapperAppContextXML = mapperContext.getConfiguration().get(MAPPER_APP_CONTEXT_XML);

  // processor batch size
  this.processorBatchSize =
      mapperContext.getConfiguration().getInt(MAPPER_DATA_PROCESSOR_BATCH_SIZE, 10000);

  // our record length; required, so fail fast if it is missing
  this.myRecordLength = mapperContext.getConfiguration().getInt(MAPPER_RECORD_LENGTH, -1);
  if (myRecordLength == -1) {
    throw new IOException(
        "USPSDataFileMapper must have the config property " + MAPPER_RECORD_LENGTH + " set > 0");
  }

  // per-task records-processed counter
  this.myTotalProcessedCounter =
      ((MapContext) mapperContext)
          .getCounter(
              USPS_COUNTERS_GROUP_NAME,
              MAPPER_RECORDS_PROCESSED_COUNTER + mapperContext.getTaskAttemptID().toString());

  // overall job records-processed counter
  this.overallProcessedCounter =
      ((MapContext) mapperContext)
          .getCounter(USPS_COUNTERS_GROUP_NAME, OVERALL_RECORDS_PROCESSED_COUNTER);

  LOG.info(
      "USPSDataFileMapper configured: skipCopyrights=" + this.skipCopyrights
          + " processorBatchSize=" + processorBatchSize
          + " myRecordLength=" + this.myRecordLength
          + " mapperAppContextXML=" + mapperAppContextXML);

  // initialize the shared Spring beans once per JVM (double-checked locking)
  if (!initialized) {
    synchronized (synchLock) {
      if (!initialized) {
        ApplicationContext context = new ClassPathXmlApplicationContext(mapperAppContextXML);
        classFinder = (ClassFinder) context.getBean("classFinder");
        uspsUtils = (USPSUtils) context.getBean("uspsUtils");
        idGenerator = (USPSIdGenerator) context.getBean("uspsIdGenerator");
        lineParser = (USPSRecordParser) context.getBean("uspsLineParser");
        dataProcessor = (USPSDataProcessor) context.getBean("uspsDataProcessor");
        initialized = true;
      }
    }
  }
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  id = context.getTaskAttemptID().getTaskID().getId();
  increment = context.getConfiguration().getInt("mapred.map.tasks", 0);
  if (increment == 0) {
    throw new IllegalArgumentException("mapred.map.tasks is zero");
  }
}
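// A minimal usage sketch — not from the source — of the striding pattern that
// the id/increment fields above enable: each map task starts at its own task id
// and steps by the total number of map tasks, so the tasks jointly emit
// disjoint id sequences with no coordination. The map() signature here is
// hypothetical.
@Override
protected void map(NullWritable key, LongWritable numRecords, Context context)
    throws IOException, InterruptedException {
  long current = id; // first id owned by this task
  for (long i = 0; i < numRecords.get(); i++) {
    context.write(new LongWritable(current), NullWritable.get());
    current += increment; // skip over the ids owned by the other map tasks
  }
}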
protected void setup(Context context) throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  this.mapSleepCount = conf.getInt(MAP_SLEEP_COUNT, mapSleepCount);
  this.mapSleepDuration =
      mapSleepCount == 0 ? 0 : conf.getLong(MAP_SLEEP_TIME, 100) / mapSleepCount;
  vertexName = conf.get(org.apache.tez.mapreduce.hadoop.MRJobConfig.VERTEX_NAME);

  TaskAttemptID taId = context.getTaskAttemptID();

  ObjectRegistry objectRegistry = ObjectRegistryFactory.getObjectRegistry();
  String fooBarVal = (String) objectRegistry.get("FooBar");
  if (null == fooBarVal) {
    LOG.info("Adding FooBar key to Object cache");
    objectRegistry.add(
        ObjectLifeCycle.DAG, "FooBar", "BarFooFromTask" + taId.getTaskID().toString());
  } else {
    LOG.info(
        "Got FooBar val from Object cache"
            + ", currentTaskId=" + taId.getTaskID().toString()
            + ", val=" + fooBarVal);
  }

  String[] taskIds = conf.getStrings(MAP_ERROR_TASK_IDS);
  if (taId.getId() + 1 >= context.getMaxMapAttempts()) {
    finalAttempt = true;
  }
  boolean found = false;
  if (taskIds != null) {
    if (taskIds.length == 1 && taskIds[0].equals("*")) {
      found = true;
    }
    if (!found) {
      for (String taskId : taskIds) {
        if (Integer.valueOf(taskId).intValue() == taId.getTaskID().getId()) {
          found = true;
          break;
        }
      }
    }
  }
  if (found) {
    if (!finalAttempt) {
      throwError = conf.getBoolean(MAP_THROW_ERROR, false);
    }
    throwFatal = conf.getBoolean(MAP_FATAL_ERROR, false);
  }
}
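// A hedged job-setup sketch — not from the source — showing how the config keys
// read in setup() above could drive the simulated failures. The string values
// behind MAP_ERROR_TASK_IDS, MAP_THROW_ERROR, and MAP_FATAL_ERROR are defined
// elsewhere in the job class and are assumed visible here.
Configuration conf = new Configuration();
conf.setStrings(MAP_ERROR_TASK_IDS, "0"); // target task 0; "*" targets every map task
conf.setBoolean(MAP_THROW_ERROR, true);   // throw a simulated IOException on non-final attempts
conf.setBoolean(MAP_FATAL_ERROR, false);  // set true to simulate a fatal error instead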
protected void elPrepPreprocess(
    Context context, PreprocessingTools tools, SAMRecordIterator input, String output)
    throws InterruptedException, IOException, QualityException, URISyntaxException {
  String dictF = ref.substring(0, ref.lastIndexOf('.')) + ".dict";
  String rg = createReadGroupRecordString(RGID, RGLB, RGPL, RGPU, RGSM);
  String preSamOut = tmpFileBase + "-p1.sam";
  String samOut = tmpFileBase + "-p2.sam";
  String fCounts = tmpFileBase + "-features.count";
  outHeader = header.clone();
  outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);

  Logger.DEBUG("call elPrep");
  context.setStatus("call elPrep");
  int reads;
  if (keep) {
    reads =
        tools.callElPrep(
            preSamOut, samOut, inputIsBam ? null : rg, threads, input, outHeader, dictF);
  } else {
    reads =
        tools.streamElPrep(
            context, samOut, inputIsBam ? null : rg, threads, input, outHeader, dictF);
  }
  Logger.DEBUG(reads + " reads processed in elPrep");
  context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);

  if (gff != null) {
    Logger.DEBUG("featureCounts");
    context.setStatus("featureCounts");
    tools.runFeatureCounts(gff, samOut, fCounts, threads);
    HalvadeFileUtils.uploadFileToHDFS(
        context,
        FileSystem.get(new URI(outputdir), context.getConfiguration()),
        fCounts,
        outputdir + context.getTaskAttemptID().toString() + ".count");
  }

  context.setStatus("convert SAM to BAM");
  Logger.DEBUG("convert SAM to BAM");
  tools.callSAMToBAM(samOut, output, threads);

  context.setStatus("build bam index");
  Logger.DEBUG("build bam index");
  tools.runBuildBamIndex(output);

  // remove temporary files
  HalvadeFileUtils.removeLocalFile(keep, preSamOut, context, HalvadeCounters.FOUT_GATK_TMP);
  HalvadeFileUtils.removeLocalFile(keep, samOut, context, HalvadeCounters.FOUT_GATK_TMP);
  HalvadeFileUtils.removeLocalFile(keep, fCounts);
}
protected void reduce(LongWritable key, Iterable<Text> values, Context context)
    throws IOException, InterruptedException {
  for (Text value : values) {
    Pair<TaskEstimate, HiveObjectSpec> input =
        MetastoreReplicationJob.deseralizeJobResult(value.toString());
    TaskEstimate estimate = input.getLeft();
    HiveObjectSpec spec = input.getRight();
    String result = value.toString();
    String extra = "";

    try {
      if (estimate.getTaskType() == TaskEstimate.TaskType.CHECK_PARTITION) {
        // Re-run the analysis for partition checks so the emitted result
        // carries an up-to-date estimate for this partition.
        TaskEstimate newEstimate = estimator.analyze(spec);
        result = MetastoreReplicationJob.serializeJobResult(newEstimate, spec);
      }
    } catch (HiveMetastoreException e) {
      LOG.error(
          String.format(
              "Hit exception during db:%s, tbl:%s, part:%s",
              spec.getDbName(), spec.getTableName(), spec.getPartitionName()),
          e);
      extra =
          String.format(
              "exception in %s of mapper = %s",
              estimate.getTaskType().toString(), context.getTaskAttemptID().toString());
    }

    context.write(new Text(result), new Text(extra));

    ++this.count;
    if (this.count % 100 == 0) {
      LOG.info("Processed " + this.count + " entities");
    }
  }
}
protected void PicardPreprocess(
    Context context, PreprocessingTools tools, SAMRecordIterator input, String output)
    throws InterruptedException, QualityException, IOException, URISyntaxException {
  outHeader = header.clone();
  outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);

  // tmp files
  String tmpOut1 = tmpFileBase + "-p1.bam";
  String tmpOut2 = tmpFileBase + "-p2.bam";
  String tmpOut3 = tmpFileBase + "-p3.sam";
  String fCounts = tmpFileBase + "-features.count";
  String tmpMetrics = tmpFileBase + "-p3-metrics.txt";

  SAMFileWriterFactory factory = new SAMFileWriterFactory();
  if (!inputIsBam) {
    outHeader.addReadGroup(bamrg);
  }
  SAMFileWriter writer = factory.makeBAMWriter(outHeader, true, new File(tmpOut1));

  long startTime = System.currentTimeMillis();
  int count = 0;
  SAMRecord sam;
  while (input.hasNext()) {
    sam = input.next();
    writer.addAlignment(sam);
    count++;
  }
  int reads = input.getCount();
  writer.close();
  context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);
  long estimatedTime = System.currentTimeMillis() - startTime;
  context.getCounter(HalvadeCounters.TIME_HADOOP_SAMTOBAM).increment(estimatedTime);
  Logger.DEBUG("time writing " + count + " records to disk: " + estimatedTime / 1000);

  Logger.DEBUG("clean sam");
  context.setStatus("clean sam");
  tools.runCleanSam(tmpOut1, tmpOut2);

  Logger.DEBUG("mark duplicates");
  context.setStatus("mark duplicates");
  tools.runMarkDuplicates(tmpOut2, tmpOut3, tmpMetrics);

  if (gff != null) {
    // tmpOut3 is sam for htseq count!
    Logger.DEBUG("featureCounts");
    context.setStatus("featureCounts");
    tools.runFeatureCounts(gff, tmpOut3, fCounts, threads);
    HalvadeFileUtils.uploadFileToHDFS(
        context,
        FileSystem.get(new URI(outputdir), context.getConfiguration()),
        fCounts,
        outputdir + context.getTaskAttemptID().toString() + ".count");
  }

  if (!inputIsBam) {
    Logger.DEBUG("add read-group");
    context.setStatus("add read-group");
    tools.runAddOrReplaceReadGroups(tmpOut3, output, RGID, RGLB, RGPL, RGPU, RGSM);
  } else {
    context.setStatus("convert SAM to BAM");
    Logger.DEBUG("convert SAM to BAM");
    tools.callSAMToBAM(tmpOut3, output, threads);
  }

  Logger.DEBUG("build bam index");
  context.setStatus("build bam index");
  tools.runBuildBamIndex(output);
  estimatedTime = System.currentTimeMillis() - startTime;
  Logger.DEBUG("estimated time: " + estimatedTime / 1000);

  // remove all temporary files now!
  HalvadeFileUtils.removeLocalFile(keep, tmpMetrics, context, HalvadeCounters.FOUT_GATK_TMP);
  HalvadeFileUtils.removeLocalFile(keep, tmpOut1, context, HalvadeCounters.FOUT_GATK_TMP);
  HalvadeFileUtils.removeLocalFile(keep, tmpOut2, context, HalvadeCounters.FOUT_GATK_TMP);
  HalvadeFileUtils.removeLocalFile(keep, tmpOut3, context, HalvadeCounters.FOUT_GATK_TMP);
  HalvadeFileUtils.removeLocalFile(keep, fCounts);
}
@Override
protected void map(String key, String value, final Context context)
    throws IOException, InterruptedException {
  final InputSplit split = context.getInputSplit();
  if (!(split instanceof DatasourceInputSplit)) {
    throw new IAE(
        "Unexpected split type. Expected [%s] was [%s]",
        DatasourceInputSplit.class.getCanonicalName(), split.getClass().getCanonicalName());
  }
  final String tmpDirLoc = context.getConfiguration().get(TMP_FILE_LOC_KEY);
  final File tmpDir = Paths.get(tmpDirLoc).toFile();

  final DataSegment segment =
      Iterables.getOnlyElement(((DatasourceInputSplit) split).getSegments()).getSegment();

  final HadoopDruidConverterConfig config =
      converterConfigFromConfiguration(context.getConfiguration());

  context.setStatus("DOWNLOADING");
  context.progress();
  final Path inPath = new Path(JobHelper.getURIFromSegment(segment));
  final File inDir = new File(tmpDir, "in");

  if (inDir.exists() && !inDir.delete()) {
    log.warn("Could not delete [%s]", inDir);
  }

  // Warn only if the directory still does not exist or is not a directory,
  // mirroring the outDir check below.
  if (!inDir.mkdir() && (!inDir.exists() || !inDir.isDirectory())) {
    log.warn("Unable to make directory [%s]", inDir);
  }

  final long inSize = JobHelper.unzipNoGuava(inPath, context.getConfiguration(), inDir, context);
  log.debug("Loaded %d bytes into [%s] for converting", inSize, inDir.getAbsolutePath());
  context.getCounter(COUNTER_GROUP, COUNTER_LOADED).increment(inSize);

  context.setStatus("CONVERTING");
  context.progress();
  final File outDir = new File(tmpDir, "out");
  if (!outDir.mkdir() && (!outDir.exists() || !outDir.isDirectory())) {
    throw new IOException(String.format("Could not create output directory [%s]", outDir));
  }
  HadoopDruidConverterConfig.INDEX_MERGER.convert(
      inDir, outDir, config.getIndexSpec(), JobHelper.progressIndicatorForContext(context));
  if (config.isValidate()) {
    context.setStatus("Validating");
    HadoopDruidConverterConfig.INDEX_IO.validateTwoSegments(inDir, outDir);
  }
  context.progress();
  context.setStatus("Starting PUSH");
  final Path baseOutputPath = new Path(config.getSegmentOutputPath());
  final FileSystem outputFS = baseOutputPath.getFileSystem(context.getConfiguration());
  final DataSegment finalSegmentTemplate =
      segment.withVersion(segment.getVersion() + "_converted");
  final DataSegment finalSegment =
      JobHelper.serializeOutIndex(
          finalSegmentTemplate,
          context.getConfiguration(),
          context,
          context.getTaskAttemptID(),
          outDir,
          JobHelper.makeSegmentOutputPath(baseOutputPath, outputFS, finalSegmentTemplate));
  context.progress();
  context.setStatus("Finished PUSH");
  final String finalSegmentString =
      HadoopDruidConverterConfig.jsonMapper.writeValueAsString(finalSegment);
  context
      .getConfiguration()
      .set(ConvertingOutputFormat.PUBLISHED_SEGMENT_KEY, finalSegmentString);
  context.write(new Text("dataSegment"), new Text(finalSegmentString));

  context.getCounter(COUNTER_GROUP, COUNTER_WRITTEN).increment(finalSegment.getSize());
  context.progress();
  context.setStatus("Ready To Commit");
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
  context.write(
      new IntWritable(context.getTaskAttemptID().getTaskID().getId()),
      new VectorWritable(new DenseVector(mYtY.getData())));
}
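// A driver-side sketch — not from the source — of how the per-task YtY partials
// written by cleanup() above might be combined: read every (taskId, vector) pair
// from the job's SequenceFile output and sum the vectors. `outputDir` is a
// hypothetical parameter naming the job's output path.
static Vector sumPartialYtY(Configuration conf, Path outputDir) throws IOException {
  FileSystem fs = outputDir.getFileSystem(conf);
  Vector sum = null;
  for (FileStatus status : fs.listStatus(outputDir)) {
    if (!status.getPath().getName().startsWith("part-")) {
      continue; // skip _SUCCESS and other non-data files
    }
    try (SequenceFile.Reader reader =
        new SequenceFile.Reader(conf, SequenceFile.Reader.file(status.getPath()))) {
      IntWritable taskId = new IntWritable();
      VectorWritable partial = new VectorWritable();
      while (reader.next(taskId, partial)) {
        // clone on first use so later reads cannot alias the accumulator
        sum = (sum == null) ? partial.get().clone() : sum.plus(partial.get());
      }
    }
  }
  return sum; // flattened YtY; reshape into a k x k matrix as needed
}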