Code example #1
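 // Final reducer of a sleep job: logs the numeric task ID and the key, reports the remaining sleep time via setStatus(), and sleeps for one interval per call.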
 public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
     throws IOException {
   try {
     LOG.info(
         "Reading in "
             + vertexName
             + " taskid "
             + context.getTaskAttemptID().getTaskID().getId()
             + " key "
             + key.get());
     LOG.info(
         "Sleeping in FinalReduce"
             + ", vertexName="
             + vertexName
             + ", taskAttemptId="
             + context.getTaskAttemptID()
             + ", reduceSleepDuration="
             + reduceSleepDuration
             + ", reduceSleepCount="
             + reduceSleepCount
             + ", sleepLeft="
             + (reduceSleepDuration * (reduceSleepCount - count)));
     context.setStatus(
         "Sleeping... (" + (reduceSleepDuration * (reduceSleepCount - count)) + ") ms left");
     if ((reduceSleepCount - count) > 0) {
       Thread.sleep(reduceSleepDuration);
     }
   } catch (InterruptedException ex) {
     throw (IOException) new IOException("Interrupted while sleeping").initCause(ex);
   }
   count++;
 }
Code example #2
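 // Initial mapper of a sleep job: sleeps once per record, optionally throws a simulated error, then emits value.get() consecutive keys starting at key.get().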
 public void map(IntWritable key, IntWritable value, Context context)
     throws IOException, InterruptedException {
   // it is expected that every map processes mapSleepCount number of records.
   try {
     LOG.info(
         "Reading in "
             + vertexName
             + " taskid "
             + context.getTaskAttemptID().getTaskID().getId()
             + " key "
             + key.get());
     LOG.info(
         "Sleeping in InitialMap"
             + ", vertexName="
             + vertexName
             + ", taskAttemptId="
             + context.getTaskAttemptID()
             + ", mapSleepDuration="
             + mapSleepDuration
             + ", mapSleepCount="
             + mapSleepCount
             + ", sleepLeft="
             + (mapSleepDuration * (mapSleepCount - count)));
     context.setStatus(
         "Sleeping... (" + (mapSleepDuration * (mapSleepCount - count)) + ") ms left");
     if ((mapSleepCount - count) > 0) {
       Thread.sleep(mapSleepDuration);
     }
     if (throwError || throwFatal) {
       throw new IOException("Throwing a simulated error from map");
     }
   } catch (InterruptedException ex) {
     throw (IOException) new IOException("Interrupted while sleeping").initCause(ex);
   }
   ++count;
   // output reduceSleepCount * numReduce number of random values, so that
   // each reducer will get reduceSleepCount number of keys.
   int k = key.get();
   for (int i = 0; i < value.get(); ++i) {
     LOG.info(
         "Writing in "
             + vertexName
             + " taskid "
             + context.getTaskAttemptID().getTaskID().getId()
             + " key "
             + (k + i)
             + " value 1");
     context.write(new IntWritable(k + i), new IntWritable(1));
   }
 }
Code example #3
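    // Intermediate reducer of a sleep job: sleeps once per key, then re-emits (key + i, 1) pairs for every input value.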
    public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      try {
        LOG.info(
            "Reading in "
                + vertexName
                + " taskid "
                + context.getTaskAttemptID().getTaskID().getId()
                + " key "
                + key.get());

        LOG.info(
            "Sleeping in IntermediateReduce"
                + ", vertexName="
                + vertexName
                + ", taskAttemptId="
                + context.getTaskAttemptID()
                + ", iReduceSleepDuration="
                + iReduceSleepDuration
                + ", iReduceSleepCount="
                + iReduceSleepCount
                + ", sleepLeft="
                + (iReduceSleepDuration * (iReduceSleepCount - count)));
        context.setStatus(
            "Sleeping... (" + (iReduceSleepDuration * (iReduceSleepCount - count)) + ") ms left");
        if ((iReduceSleepCount - count) > 0) {
          Thread.sleep(iReduceSleepDuration);
        }
      } catch (InterruptedException ex) {
        throw (IOException) new IOException("Interrupted while sleeping").initCause(ex);
      }
      ++count;
      // output reduceSleepCount * numReduce number of random values, so that
      // each reducer will get reduceSleepCount number of keys.
      int k = key.get();
      for (IntWritable value : values) {
        for (int i = 0; i < value.get(); ++i) {
          LOG.info(
              "Writing in "
                  + vertexName
                  + " taskid "
                  + context.getTaskAttemptID().getTaskID().getId()
                  + " key "
                  + (k + i)
                  + " value 1");
          context.write(new IntWritable(k + i), new IntWritable(1));
        }
      }
    }
Code example #4
    /** Configures the Reduce plan, the POPackage operator and the reporter thread */
    @SuppressWarnings("unchecked")
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      super.setup(context);
      inIllustrator = inIllustrator(context);
      if (inIllustrator) pack = getPack(context);
      Configuration jConf = context.getConfiguration();
      SpillableMemoryManager.configure(ConfigurationUtil.toProperties(jConf));
      context
          .getConfiguration()
          .set(
              PigConstants.TASK_INDEX,
              Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
      sJobContext = context;
      sJobConfInternal.set(context.getConfiguration());
      sJobConf = context.getConfiguration();
      try {
        PigContext.setPackageImportList(
            (ArrayList<String>) ObjectSerializer.deserialize(jConf.get("udf.import.list")));
        pigContext = (PigContext) ObjectSerializer.deserialize(jConf.get("pig.pigContext"));

        // This attempts to fetch all of the generated code from the distributed cache, and resolve it
        SchemaTupleBackend.initialize(jConf, pigContext);

        if (rp == null)
          rp = (PhysicalPlan) ObjectSerializer.deserialize(jConf.get("pig.reducePlan"));
        stores = PlanHelper.getPhysicalOperators(rp, POStore.class);

        if (!inIllustrator)
          pack = (POPackage) ObjectSerializer.deserialize(jConf.get("pig.reduce.package"));
        // To be removed
        if (rp.isEmpty()) log.debug("Reduce Plan empty!");
        else {
          ByteArrayOutputStream baos = new ByteArrayOutputStream();
          rp.explain(baos);
          log.debug(baos.toString());
        }
        pigReporter = new ProgressableReporter();
        if (!(rp.isEmpty())) {
          roots = rp.getRoots().toArray(new PhysicalOperator[1]);
          leaf = rp.getLeaves().get(0);
        }

        // Get the UDF specific context
        MapRedUtil.setupUDFContext(jConf);

      } catch (IOException ioe) {
        String msg = "Problem while configuring reduce plan.";
        throw new RuntimeException(msg, ioe);
      }

      log.info(
          "Aliases being processed per job phase (AliasName[line,offset]): "
              + jConf.get("pig.alias.location"));

      Utils.setDefaultTimeZone(PigMapReduce.sJobConfInternal.get());
    }
Code example #5
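    // Reads the numeric task ID plus the schema file location, delimiter, and row-key column from the job configuration, then builds the column list from the schema file.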
    @Override
    public void setup(Context context) throws TableNotFoundException, IOException {
      taskId = context.getTaskAttemptID().getTaskID().getId();
      schemaFileLocation = context.getConfiguration().get(SCHEMA_FILE_LOCATION_CONF);
      FileSystem fs = FileSystem.get(context.getConfiguration());
      columns = generateColumnsFromSchemaFile(fs, schemaFileLocation);

      delimiter = context.getConfiguration().get(DELIMITER_CONF);
      rowKeyColumn = context.getConfiguration().get(ROW_KEY_COLUMN_CONF);
    }
Code example #6
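  // Reads mapper settings from the configuration, registers per-task and overall record counters, and lazily initializes the shared Spring beans under a double-checked lock.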
  @Override
  protected void setup(Context mapperContext) throws IOException, InterruptedException {

    String mapperAppContextXML = mapperContext.getConfiguration().get(MAPPER_APP_CONTEXT_XML);

    // processor batch size
    this.processorBatchSize =
        mapperContext.getConfiguration().getInt(MAPPER_DATA_PROCESSOR_BATCH_SIZE, 10000);

    // our record length?
    this.myRecordLength = mapperContext.getConfiguration().getInt(MAPPER_RECORD_LENGTH, -1);
    if (myRecordLength == -1) {
      throw new IOException(
          "USPSDataFileMapper must have the config property " + MAPPER_RECORD_LENGTH + " set > 0");
    }

    // my records stored counter
    this.myTotalProcessedCounter =
        ((MapContext) mapperContext)
            .getCounter(
                USPS_COUNTERS_GROUP_NAME,
                MAPPER_RECORDS_PROCESSED_COUNTER + mapperContext.getTaskAttemptID().toString());

    // overall job processed counter
    this.overallProcessedCounter =
        ((MapContext) mapperContext)
            .getCounter(USPS_COUNTERS_GROUP_NAME, OVERALL_RECORDS_PROCESSED_COUNTER);

    LOG.info(
        "USPSDataFileMapper configured: skipCopyrights="
            + this.skipCopyrights
            + " processorBatchSize="
            + processorBatchSize
            + " myRecordLength="
            + this.myRecordLength
            + " mapperAppContextXML="
            + mapperAppContextXML);

    // init static stuff
    if (!initialized) {
      synchronized (synchLock) {
        if (!initialized) {
          ApplicationContext context = new ClassPathXmlApplicationContext(mapperAppContextXML);
          classFinder = (ClassFinder) context.getBean("classFinder");
          uspsUtils = (USPSUtils) context.getBean("uspsUtils");
          idGenerator = (USPSIdGenerator) context.getBean("uspsIdGenerator");
          lineParser = (USPSRecordParser) context.getBean("uspsLineParser");
          dataProcessor = (USPSDataProcessor) context.getBean("uspsDataProcessor");

          initialized = true;
        }
      }
    }
  }
Code example #7
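    // Records the numeric task ID and the configured number of map tasks (used as an increment), failing fast if mapred.map.tasks is zero or unset.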
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {

      super.setup(context);

      id = context.getTaskAttemptID().getTaskID().getId();
      increment = context.getConfiguration().getInt("mapred.map.tasks", 0);
      if (increment == 0) {
        throw new IllegalArgumentException("mapred.map.tasks is zero");
      }
    }
Code example #8
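    // Derives the per-record sleep duration from the configuration, caches a DAG-scoped value in the Tez ObjectRegistry, and decides whether this attempt should throw a simulated or fatal error.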
    protected void setup(Context context) throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();
      this.mapSleepCount = conf.getInt(MAP_SLEEP_COUNT, mapSleepCount);
      this.mapSleepDuration =
          mapSleepCount == 0 ? 0 : conf.getLong(MAP_SLEEP_TIME, 100) / mapSleepCount;
      vertexName = conf.get(org.apache.tez.mapreduce.hadoop.MRJobConfig.VERTEX_NAME);

      TaskAttemptID taId = context.getTaskAttemptID();

      ObjectRegistry objectRegistry = ObjectRegistryFactory.getObjectRegistry();
      String fooBarVal = (String) objectRegistry.get("FooBar");
      if (null == fooBarVal) {
        LOG.info("Adding FooBar key to Object cache");
        objectRegistry.add(
            ObjectLifeCycle.DAG, "FooBar", "BarFooFromTask" + taId.getTaskID().toString());
      } else {
        LOG.info(
            "Got FooBar val from Object cache"
                + ", currentTaskId="
                + taId.getTaskID().toString()
                + ", val="
                + fooBarVal);
      }

      String[] taskIds = conf.getStrings(MAP_ERROR_TASK_IDS);
      if (taId.getId() + 1 >= context.getMaxMapAttempts()) {
        finalAttempt = true;
      }
      boolean found = false;
      if (taskIds != null) {
        if (taskIds.length == 1 && taskIds[0].equals("*")) {
          found = true;
        }
        if (!found) {
          for (String taskId : taskIds) {
            if (Integer.valueOf(taskId).intValue() == taId.getTaskID().getId()) {
              found = true;
              break;
            }
          }
        }
      }
      if (found) {
        if (!finalAttempt) {
          throwError = conf.getBoolean(MAP_THROW_ERROR, false);
        }
        throwFatal = conf.getBoolean(MAP_FATAL_ERROR, false);
      }
    }
Code example #9
File: GATKReducer.java  Project: svandenhoek/halvade
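  // Preprocesses reads with elPrep (streamed or via temporary files), optionally runs featureCounts and uploads the counts to HDFS, converts the SAM output to BAM, builds the index, and removes temporary files.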
  protected void elPrepPreprocess(
      Context context, PreprocessingTools tools, SAMRecordIterator input, String output)
      throws InterruptedException, IOException, QualityException, URISyntaxException {
    String dictF = ref.substring(0, ref.lastIndexOf('.')) + ".dict";
    String rg = createReadGroupRecordString(RGID, RGLB, RGPL, RGPU, RGSM);
    String preSamOut = tmpFileBase + "-p1.sam";
    String samOut = tmpFileBase + "-p2.sam";
    String fCounts = tmpFileBase + "-features.count";

    outHeader = header.clone();
    outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);

    Logger.DEBUG("call elPrep");
    context.setStatus("call elPrep");
    int reads;
    if (keep) {
      reads =
          tools.callElPrep(
              preSamOut, samOut, inputIsBam ? null : rg, threads, input, outHeader, dictF);
    } else {
      reads =
          tools.streamElPrep(
              context, samOut, inputIsBam ? null : rg, threads, input, outHeader, dictF);
    }

    Logger.DEBUG(reads + " reads processed in elPrep");
    context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);

    if (gff != null) {
      Logger.DEBUG("featureCounts");
      context.setStatus("featureCounts");
      tools.runFeatureCounts(gff, samOut, fCounts, threads);
      HalvadeFileUtils.uploadFileToHDFS(
          context,
          FileSystem.get(new URI(outputdir), context.getConfiguration()),
          fCounts,
          outputdir + context.getTaskAttemptID().toString() + ".count");
    }
    context.setStatus("convert SAM to BAM");
    Logger.DEBUG("convert SAM to BAM");
    tools.callSAMToBAM(samOut, output, threads);
    context.setStatus("build bam index");
    Logger.DEBUG("build bam index");
    tools.runBuildBamIndex(output);
    // remove temporary files
    HalvadeFileUtils.removeLocalFile(keep, preSamOut, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, samOut, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, fCounts);
  }
Code example #10
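  // Re-estimates CHECK_PARTITION tasks, recording any HiveMetastoreException together with this task attempt ID, and writes a (result, extra) pair for every input value.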
  protected void reduce(LongWritable key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {

    for (Text value : values) {
      Pair<TaskEstimate, HiveObjectSpec> input =
          MetastoreReplicationJob.deseralizeJobResult(value.toString());
      TaskEstimate estimate = input.getLeft();
      HiveObjectSpec spec = input.getRight();
      String result = value.toString();
      String extra = "";

      try {
        if (estimate.getTaskType() == TaskEstimate.TaskType.CHECK_PARTITION) {
          // Table exists in source, but not in dest. It should copy the table.
          TaskEstimate newEstimate = estimator.analyze(spec);

          result = MetastoreReplicationJob.serializeJobResult(newEstimate, spec);
        }
      } catch (HiveMetastoreException e) {
        LOG.error(
            String.format(
                "Hit exception during db:%s, tbl:%s, part:%s",
                spec.getDbName(), spec.getTableName(), spec.getPartitionName()),
            e);
        extra =
            String.format(
                "exception in %s of mapper = %s",
                estimate.getTaskType().toString(), context.getTaskAttemptID().toString());
      }

      context.write(new Text(result), new Text(extra));
      ++this.count;
      if (this.count % 100 == 0) {
        LOG.info("Processed " + this.count + " entities");
      }
    }
  }
Code example #11
File: GATKReducer.java  Project: svandenhoek/halvade
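  // Picard-based preprocessing: writes the input reads to a temporary BAM, runs CleanSam and MarkDuplicates, optionally runs featureCounts with HDFS upload, then adds read groups or converts to BAM, builds the index, and removes temporary files.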
  protected void PicardPreprocess(
      Context context, PreprocessingTools tools, SAMRecordIterator input, String output)
      throws InterruptedException, QualityException, IOException, URISyntaxException {
    outHeader = header.clone();
    outHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    // tmp files
    String tmpOut1 = tmpFileBase + "-p1.bam";
    String tmpOut2 = tmpFileBase + "-p2.bam";
    String tmpOut3 = tmpFileBase + "-p3.sam";
    String fCounts = tmpFileBase + "-features.count";
    String tmpMetrics = tmpFileBase + "-p3-metrics.txt";
    SAMFileWriterFactory factory = new SAMFileWriterFactory();
    if (!inputIsBam) {
      outHeader.addReadGroup(bamrg);
    }
    SAMFileWriter writer = factory.makeBAMWriter(outHeader, true, new File(tmpOut1));

    long startTime = System.currentTimeMillis();

    int count = 0;
    SAMRecord sam;
    while (input.hasNext()) {
      sam = input.next();
      writer.addAlignment(sam);
      count++;
    }
    int reads = input.getCount();
    writer.close();

    context.getCounter(HalvadeCounters.IN_PREP_READS).increment(reads);
    long estimatedTime = System.currentTimeMillis() - startTime;
    context.getCounter(HalvadeCounters.TIME_HADOOP_SAMTOBAM).increment(estimatedTime);
    Logger.DEBUG("time writing " + count + " records to disk: " + estimatedTime / 1000);

    Logger.DEBUG("clean sam");
    context.setStatus("clean sam");
    tools.runCleanSam(tmpOut1, tmpOut2);
    Logger.DEBUG("mark duplicates");
    context.setStatus("mark duplicates");
    tools.runMarkDuplicates(tmpOut2, tmpOut3, tmpMetrics);

    if (gff != null) {
      // tmpOut3 is sam for htseq count!
      Logger.DEBUG("featureCounts");
      context.setStatus("featureCounts");
      tools.runFeatureCounts(gff, tmpOut3, fCounts, threads);
      HalvadeFileUtils.uploadFileToHDFS(
          context,
          FileSystem.get(new URI(outputdir), context.getConfiguration()),
          fCounts,
          outputdir + context.getTaskAttemptID().toString() + ".count");
    }

    if (!inputIsBam) {
      Logger.DEBUG("add read-group");
      context.setStatus("add read-group");
      tools.runAddOrReplaceReadGroups(tmpOut3, output, RGID, RGLB, RGPL, RGPU, RGSM);
    } else {
      context.setStatus("convert SAM to BAM");
      Logger.DEBUG("convert SAM to BAM");
      tools.callSAMToBAM(tmpOut3, output, threads);
    }

    Logger.DEBUG("build bam index");
    context.setStatus("build bam index");
    tools.runBuildBamIndex(output);

    estimatedTime = System.currentTimeMillis() - startTime;
    Logger.DEBUG("estimated time: " + estimatedTime / 1000);

    // remove all temporary files now!
    HalvadeFileUtils.removeLocalFile(keep, tmpMetrics, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut1, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut2, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, tmpOut3, context, HalvadeCounters.FOUT_GATK_TMP);
    HalvadeFileUtils.removeLocalFile(keep, fCounts);
  }
Code example #12
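    // Downloads a segment into a temporary directory, converts it with the configured IndexSpec, optionally validates the converted index, pushes the result, and publishes the new segment descriptor through the configuration and the mapper output.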
    @Override
    protected void map(String key, String value, final Context context)
        throws IOException, InterruptedException {
      final InputSplit split = context.getInputSplit();
      if (!(split instanceof DatasourceInputSplit)) {
        throw new IAE(
            "Unexpected split type. Expected [%s] was [%s]",
            DatasourceInputSplit.class.getCanonicalName(), split.getClass().getCanonicalName());
      }

      final String tmpDirLoc = context.getConfiguration().get(TMP_FILE_LOC_KEY);
      final File tmpDir = Paths.get(tmpDirLoc).toFile();

      final DataSegment segment =
          Iterables.getOnlyElement(((DatasourceInputSplit) split).getSegments()).getSegment();

      final HadoopDruidConverterConfig config =
          converterConfigFromConfiguration(context.getConfiguration());

      context.setStatus("DOWNLOADING");
      context.progress();
      final Path inPath = new Path(JobHelper.getURIFromSegment(segment));
      final File inDir = new File(tmpDir, "in");

      if (inDir.exists() && !inDir.delete()) {
        log.warn("Could not delete [%s]", inDir);
      }

      if (!inDir.mkdir() && (!inDir.exists() || inDir.isDirectory())) {
        log.warn("Unable to make directory");
      }

      final long inSize =
          JobHelper.unzipNoGuava(inPath, context.getConfiguration(), inDir, context);
      log.debug("Loaded %d bytes into [%s] for converting", inSize, inDir.getAbsolutePath());
      context.getCounter(COUNTER_GROUP, COUNTER_LOADED).increment(inSize);

      context.setStatus("CONVERTING");
      context.progress();
      final File outDir = new File(tmpDir, "out");
      if (!outDir.mkdir() && (!outDir.exists() || !outDir.isDirectory())) {
        throw new IOException(String.format("Could not create output directory [%s]", outDir));
      }
      HadoopDruidConverterConfig.INDEX_MERGER.convert(
          inDir, outDir, config.getIndexSpec(), JobHelper.progressIndicatorForContext(context));
      if (config.isValidate()) {
        context.setStatus("Validating");
        HadoopDruidConverterConfig.INDEX_IO.validateTwoSegments(inDir, outDir);
      }
      context.progress();
      context.setStatus("Starting PUSH");
      final Path baseOutputPath = new Path(config.getSegmentOutputPath());
      final FileSystem outputFS = baseOutputPath.getFileSystem(context.getConfiguration());
      final DataSegment finalSegmentTemplate =
          segment.withVersion(segment.getVersion() + "_converted");
      final DataSegment finalSegment =
          JobHelper.serializeOutIndex(
              finalSegmentTemplate,
              context.getConfiguration(),
              context,
              context.getTaskAttemptID(),
              outDir,
              JobHelper.makeSegmentOutputPath(baseOutputPath, outputFS, finalSegmentTemplate));
      context.progress();
      context.setStatus("Finished PUSH");
      final String finalSegmentString =
          HadoopDruidConverterConfig.jsonMapper.writeValueAsString(finalSegment);
      context
          .getConfiguration()
          .set(ConvertingOutputFormat.PUBLISHED_SEGMENT_KEY, finalSegmentString);
      context.write(new Text("dataSegment"), new Text(finalSegmentString));

      context.getCounter(COUNTER_GROUP, COUNTER_WRITTEN).increment(finalSegment.getSize());
      context.progress();
      context.setStatus("Ready To Commit");
    }
Code example #13
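 // On cleanup, emits the locally accumulated mYtY matrix as a dense vector keyed by this task's numeric ID.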
 @Override
 protected void cleanup(Context context) throws IOException, InterruptedException {
   context.write(
       new IntWritable(context.getTaskAttemptID().getTaskID().getId()),
       new VectorWritable(new DenseVector(mYtY.getData())));
 }