Example #1
  /**
   * Implements the core execution: creates the file listing for the copy and launches the Hadoop
   * job that performs the copy.
   *
   * @return Job handle
   * @throws Exception on failure.
   */
  public Job execute() throws Exception {
    assert inputOptions != null;
    assert getConf() != null;

    Job job = null;
    try {
      metaFolder = createMetaFolderPath();
      jobFS = metaFolder.getFileSystem(getConf());

      job = createJob();
      createInputFileListing(job);

      job.submit();
      submitted = true;
    } finally {
      if (!submitted) {
        cleanup();
      }
    }

    String jobID = getJobID(job);
    job.getConfiguration().set(DistCpConstants.CONF_LABEL_DISTCP_JOB_ID, jobID);

    LOG.info("DistCp job-id: " + jobID);
    LOG.info("DistCp job may be tracked at: " + job.getTrackingURL());
    LOG.info("To cancel, run the following command:\thadoop job -kill " + jobID);

    long jobStartTime = System.nanoTime();
    if (inputOptions.shouldBlock() && !job.waitForCompletion(true)) {
      updateJobTimeInNanos(jobStartTime);
      throw new IOException("DistCp failure: Job " + jobID + " has failed. ");
    }
    updateJobTimeInNanos(jobStartTime);
    return job;
  }
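When inputOptions.shouldBlock() is false, execute() returns right after submission and the caller is responsible for tracking the job. A minimal caller-side sketch of that pattern, using only the standard org.apache.hadoop.mapreduce.Job and java.io.IOException classes (the helper name awaitDistCpJob is illustrative, not part of the original class):

  // Hypothetical caller that received the Job handle returned by execute().
  static void awaitDistCpJob(Job job) throws IOException, InterruptedException {
    while (!job.isComplete()) { // poll the cluster until the job finishes
      Thread.sleep(5000L);      // back off between status calls
    }
    if (!job.isSuccessful()) {
      throw new IOException("DistCp job " + job.getJobID() + " failed");
    }
  }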
Example #2
 @Override
 public int run(String[] args) throws Exception {
   if (args.length < 4) {
     writeUsage();
     return 1;
   }
   Path secretsPath = new Path(args[0]);
   Path saltFilePath = new Path(args[1]);
   Path inputPath = new Path(args[2]);
   Path outputPath = new Path(args[3]);
   // Make sure the salt file exists
   generateSaltIfNeeded(saltFilePath, secretsPath);
   // Configure the job
   Job job = configureJob(secretsPath, saltFilePath, inputPath, outputPath);
   // Run it
   long startTime = System.currentTimeMillis();
   job.submit();
   if (job.waitForCompletion(true)) {
     System.out.printf(
         "Done obfuscating - took %d seconds.\n", (System.currentTimeMillis() - startTime) / 1000);
   } else {
     System.err.printf("Job finished with errors: %s\n", job.getStatus().getFailureInfo());
     return 2;
   }
   return 0;
 }
Example #3
  /** Run a job. */
  static void runJob(String name, Job job, Machine machine, String startmessage, Util.Timer timer) {
    JOB_SEMAPHORE.acquireUninterruptibly();
    Long starttime = null;
    try {
      try {
        starttime = timer.tick("starting " + name + " ...\n  " + startmessage);

        // initialize and submit a job
        machine.init(job);
        job.submit();

        // Separate jobs
        final long sleeptime = 1000L * job.getConfiguration().getInt(JOB_SEPARATION_PROPERTY, 10);
        if (sleeptime > 0) {
          Util.out.println(name + "> sleep(" + Util.millis2String(sleeptime) + ")");
          Thread.sleep(sleeptime);
        }
      } finally {
        JOB_SEMAPHORE.release();
      }

      if (!job.waitForCompletion(false)) throw new RuntimeException(name + " failed.");
    } catch (Exception e) {
      throw e instanceof RuntimeException ? (RuntimeException) e : new RuntimeException(e);
    } finally {
      if (starttime != null)
        timer.tick(name + "> timetaken=" + Util.millis2String(timer.tick() - starttime));
    }
  }
Example #4
  /**
   * Entry point to start job.
   *
   * @param args Command line parameters.
   * @throws Exception If fails.
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.out.println("usage: [input] [output]");
      System.exit(-1);
    }

    Job job = getJob(args[0], args[1]);

    job.submit();
  }
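Because this main() returns right after submit(), the JVM exits without reporting whether the job succeeded. A blocking variant, sketched here with the same getJob helper used above, would end with waitForCompletion instead and surface the outcome as an exit code:

    Job job = getJob(args[0], args[1]);
    // Blocks until the job finishes, printing progress; the exit code reflects success.
    System.exit(job.waitForCompletion(true) ? 0 : 1);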
Example #5
 static void waitForJob(Job job) throws Exception {
   job.submit();
   while (!job.isComplete()) {
     LOG.debug("waiting for job " + job.getJobName());
     sleep(50);
   }
   LOG.debug(
       "status for job " + job.getJobName() + ": " + (job.isSuccessful() ? "SUCCESS" : "FAILURE"));
   if (!job.isSuccessful()) {
     throw new RuntimeException("job failed " + job.getJobName());
   }
 }
Example #6
 /**
  * Submit the job to the cluster and wait for it to finish.
  *
  * @param verbose print the progress to the user
  * @return true if the job succeeded
  * @throws IOException thrown if the communication with the <code>JobTracker</code> is lost
  */
 public boolean waitForCompletion(boolean verbose)
     throws IOException, InterruptedException, ClassNotFoundException {
   if (state == JobState.DEFINE) {
     submit();
   }
   if (verbose) {
     jobClient.monitorAndPrintJob(conf, info);
   } else {
     info.waitForCompletion();
   }
   return isSuccessful();
 }
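This is the method that most of the other examples reach, either directly or after an explicit submit(). For context, a minimal self-contained driver built only from stock Hadoop classes (the identity Mapper and Reducer) could look like the following sketch; the class name MinimalDriver is a placeholder, not taken from any example above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MinimalDriver {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "minimal-driver");
    job.setJarByClass(MinimalDriver.class);
    job.setMapperClass(Mapper.class);          // identity mapper
    job.setReducerClass(Reducer.class);        // identity reducer
    job.setOutputKeyClass(LongWritable.class); // key type produced by the default TextInputFormat
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    // waitForCompletion() submits the job (state is still DEFINE) and then blocks on it.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}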
Example #7
  @SuppressWarnings({"unchecked", "rawtypes"})
  public void start(
      Path outputDir, int numReducers, boolean concurrent, String accessKey, String secretKey)
      throws GoraException, IOException, Exception {
    LOG.info("Running Verify with outputDir=" + outputDir + ", numReducers=" + numReducers);

    // DataStore<Long,CINode> store = DataStoreFactory.getDataStore(Long.class, CINode.class, new
    // Configuration());
    auth = new BasicAWSCredentials(accessKey, secretKey);

    DataStore<Long, cidynamonode> store =
        WSDataStoreFactory.createDataStore(
            DynamoDBStore.class, DynamoDBKey.class, cidynamonode.class, auth);

    job = new Job(getConf());

    if (!job.getConfiguration()
        .get("io.serializations")
        .contains("org.apache.hadoop.io.serializer.JavaSerialization")) {
      job.getConfiguration()
          .set(
              "io.serializations",
              job.getConfiguration().get("io.serializations")
                  + ",org.apache.hadoop.io.serializer.JavaSerialization");
    }

    job.setJobName("Link Verifier");
    job.setNumReduceTasks(numReducers);
    job.setJarByClass(getClass());

    Query query = store.newQuery();
    // if (!concurrent) {
    // no concurrency filtering, only need prev field
    // query.setFields("prev");
    // } else {
    // readFlushed(job.getConfiguration());
    // }

    GoraMapper.initMapperJob(
        job, query, store, DynamoDBKey.class, VLongWritable.class, VerifyMapper.class, true);

    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);

    job.setReducerClass(VerifyReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outputDir);

    store.close();

    job.submit();
  }
Example #8
 @Override
 public RunningJob submitJob(org.pentaho.hadoop.shim.api.Configuration c) throws IOException {
   // Run the submission under the shim's classloader, restoring the caller's loader in the finally block below.
   ClassLoader cl = Thread.currentThread().getContextClassLoader();
   Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
   try {
     Job job = ((org.pentaho.hadoop.shim.cdh54.ConfigurationProxyV2) c).getJob();
     job.submit();
     return new RunningJobProxyV2(job);
   } catch (InterruptedException e) {
     throw new RuntimeException(e);
   } catch (ClassNotFoundException e) {
     throw new RuntimeException(e);
   } finally {
     Thread.currentThread().setContextClassLoader(cl);
   }
 }
Example #9
  public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    if (args.length != 2) {
      System.err.printf(
          "Usage: %s <comma separated paths> <output path>\n", this.getClass().getName());
      return -1;
    }

    Job job = Job.getInstance();
    job.setJobName("PasmJoin");
    job.setJarByClass(PsamXY.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(AvroValue.class);
    job.setOutputKeyClass(AvroKey.class);
    job.setOutputValueClass(NullWritable.class);

    job.setMapperClass(PsamXYMapper.class);
    job.setReducerClass(PsamXYReducer.class);

    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setOutputFormatClass(AvroKeyOutputFormat.class);

    FileInputFormat.setInputPaths(job, args[0]);
    Path output = new Path(args[1]);
    FileOutputFormat.setOutputPath(job, output);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(output, true);

    AvroJob.setOutputKeySchema(job, outputSchema);
    AvroJob.setMapOutputValueSchema(job, outputSchema);

    // DistributedCache.addCacheFile(new Path("BM_TERM_TYPE_DMT.avro").toUri(),
    // job.getConfiguration());

    job.setNumReduceTasks(1);
    job.submit();

    job.waitForCompletion(true);
    return 0;
  }
Example #10
  public static void run(
      Configuration conf, Path[] inputPaths, Path outputPath, int k, int p, long seed)
      throws ClassNotFoundException, InterruptedException, IOException {

    Job job = new Job(conf);
    job.setJobName("YtY-job");
    job.setJarByClass(YtYJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, outputPath);

    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(YtYMapper.class);

    job.getConfiguration().setLong(PROP_OMEGA_SEED, seed);
    job.getConfiguration().setInt(PROP_K, k);
    job.getConfiguration().setInt(PROP_P, p);

    /*
     * we must reduce to just one matrix which means we need only one reducer.
     * But it's ok since each mapper outputs only one vector (a packed
     * UpperTriangular) so even if there're thousands of mappers, one reducer
     * should cope just fine.
     */
    job.setNumReduceTasks(1);

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
      throw new IOException("YtY job unsuccessful.");
    }
  }
Example #11
  /** Runs a GridMix data-generation job. */
  private static void runDataGenJob(Configuration conf, Path tempDir)
      throws IOException, ClassNotFoundException, InterruptedException {
    JobClient client = new JobClient(conf);

    // get the local job runner
    conf.setInt(MRJobConfig.NUM_MAPS, 1);

    Job job = new Job(conf);

    CompressionEmulationUtil.configure(job);
    job.setInputFormatClass(CustomInputFormat.class);

    // set the output path
    FileOutputFormat.setOutputPath(job, tempDir);

    // submit and wait for completion
    job.submit();
    int ret = job.waitForCompletion(true) ? 0 : 1;

    assertEquals("Job Failed", 0, ret);
  }
Example #12
 public Job call() throws IOException, InterruptedException, ClassNotFoundException {
   job.setMapperClass(GridmixMapper.class);
   job.setReducerClass(GridmixReducer.class);
   job.setNumReduceTasks(jobdesc.getNumberReduces());
   job.setMapOutputKeyClass(GridmixKey.class);
   job.setMapOutputValueClass(GridmixRecord.class);
   job.setSortComparatorClass(GridmixKey.Comparator.class);
   job.setGroupingComparatorClass(SpecGroupingComparator.class);
   job.setInputFormatClass(GridmixInputFormat.class);
   job.setOutputFormatClass(RawBytesOutputFormat.class);
   job.setPartitionerClass(DraftPartitioner.class);
   job.setJarByClass(GridmixJob.class);
   job.getConfiguration().setInt("gridmix.job.seq", seq);
   job.getConfiguration()
       .set(ORIGNAME, null == jobdesc.getJobID() ? "<unknown>" : jobdesc.getJobID().toString());
   job.getConfiguration().setBoolean("mapred.used.genericoptionsparser", true);
   FileInputFormat.addInputPath(job, new Path("ignored"));
   FileOutputFormat.setOutputPath(job, outdir);
   job.submit();
   return job;
 }
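Since call() returns the submitted Job instead of waiting on it, a natural way to drive such a Callable is through a java.util.concurrent ExecutorService, monitoring the returned handle separately. A hedged sketch of that pattern (runGridmixJob and its Callable<Job> parameter are illustrative, not part of GridmixJob itself):

  static void runGridmixJob(Callable<Job> gridmixJob) throws Exception {
    ExecutorService pool = Executors.newFixedThreadPool(1);
    try {
      Future<Job> handle = pool.submit(gridmixJob); // runs call(), which ends with job.submit()
      Job running = handle.get();                   // the already-submitted Job handle
      if (!running.waitForCompletion(false)) {      // block without printing progress
        throw new RuntimeException(running.getJobName() + " failed");
      }
    } finally {
      pool.shutdown();
    }
  }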
Example #13
 /**
  * Refer to {@link ReconstructionErrJob} for explanation of the job
  *
  * @param conf the configuration
  * @param yPath the path to input matrix Y
  * @param y2xPath the path to in-memory matrix Y2X, where X = Y * Y2X
  * @param yCols the number of columns in Y
  * @param xCols the number of columns in X
  * @param cPath the path to in-memory matrix C, where ReconY = Xc * C'
  * @param zmPath the path to vector Zm, where Zm = Ym * Y2X * C' - Ym
  * @param ymPath the path to the mean vector Ym
  * @param outPath the output path
  * @throws IOException
  * @throws InterruptedException
  * @throws ClassNotFoundException
  */
 public void run(
     Configuration conf,
     Path yPath,
     Path y2xPath,
     int yCols,
     int xCols,
     Path cPath,
     String zmPath,
     String ymPath,
     Path outPath,
     final float ERR_SAMPLE_RATE)
     throws IOException, InterruptedException, ClassNotFoundException {
   conf.set(MATRIXY2X, y2xPath.toString());
   conf.set(RECONSTRUCTIONMATRIX, cPath.toString());
   conf.set(ZMPATH, zmPath);
   conf.set(YMPATH, ymPath);
   conf.setInt(YCOLS, yCols);
   conf.setInt(XCOLS, xCols);
   conf.set(ERRSAMPLERATE, "" + ERR_SAMPLE_RATE);
   FileSystem fs = FileSystem.get(yPath.toUri(), conf);
   yPath = fs.makeQualified(yPath);
   outPath = fs.makeQualified(outPath);
   Job job = new Job(conf);
   FileInputFormat.addInputPath(job, yPath);
   FileOutputFormat.setOutputPath(job, outPath);
   job.setJobName("ReconErrJob-" + yPath.getName());
   job.setJarByClass(ReconstructionErrJob.class);
   job.setInputFormatClass(SequenceFileInputFormat.class);
   job.setNumReduceTasks(1);
   job.setOutputFormatClass(SequenceFileOutputFormat.class);
   job.setMapperClass(MyMapper.class);
   job.setReducerClass(MyReducer.class);
   job.setNumReduceTasks(1);
   job.setMapOutputKeyClass(IntWritable.class);
   job.setMapOutputValueClass(VectorWritable.class);
   job.setOutputKeyClass(IntWritable.class);
   job.setOutputValueClass(DoubleWritable.class);
   job.submit();
   job.waitForCompletion(true);
 }
Example #14
  /*package*/ static Job sortOne(
      Configuration conf, Path inputFile, Path outputDir, String commandName, String samplingInfo)
      throws IOException, ClassNotFoundException, InterruptedException {
    conf.set(Utils.WORK_FILENAME_PROPERTY, inputFile.getName());
    Utils.configureSampling(outputDir, inputFile.getName(), conf);
    final Job job = new Job(conf);

    job.setJarByClass(Summarize.class);
    job.setMapperClass(Mapper.class);
    job.setReducerClass(SortReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(SortInputFormat.class);
    job.setOutputFormatClass(SortOutputFormat.class);

    FileInputFormat.setInputPaths(job, inputFile);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setPartitionerClass(TotalOrderPartitioner.class);

    final Timer t = new Timer();

    System.out.printf("%s :: Sampling%s...\n", commandName, samplingInfo);
    t.start();

    InputSampler.<LongWritable, Text>writePartitionFile(
        job,
        new InputSampler.SplitSampler<LongWritable, Text>(
            Math.max(1 << 16, conf.getInt("mapred.reduce.tasks", 1)), 10));

    System.out.printf("%s :: Sampling complete in %d.%03d s.\n", commandName, t.stopS(), t.fms());
    job.submit();
    return job;
  }
Example #15
  @Override
  public int run(String[] arg) throws Exception {
    Job extractor = new Job(getConf());
    extractor.setMapperClass(MapClass.class);
    // no reduce, just identity

    extractor.setJobName("x-trace indexer");
    extractor.setJarByClass(this.getClass());

    extractor.setMapOutputKeyClass(BytesWritable.class);
    extractor.setMapOutputValueClass(TextArrayWritable.class);

    extractor.setOutputKeyClass(BytesWritable.class);
    extractor.setOutputValueClass(TextArrayWritable.class);

    extractor.setInputFormatClass(SequenceFileInputFormat.class);
    extractor.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(extractor, new Path(arg[0]));
    FileOutputFormat.setOutputPath(extractor, new Path(arg[1]));
    System.out.println("looks OK.  Submitting.");
    extractor.submit();
    //    extractor.waitForCompletion(false);
    return 0;
  }
Example #16
  @Override
  protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
      System.err.println("fixmate :: WORKDIR not given.");
      return 3;
    }
    if (args.size() == 1) {
      System.err.println("fixmate :: INPATH not given.");
      return 3;
    }
    if (!cacheAndSetProperties(parser)) return 3;

    final SAMFileReader.ValidationStringency stringency =
        Utils.toStringency(
            parser.getOptionValue(
                stringencyOpt, SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY.toString()),
            "fixmate");
    if (stringency == null) return 3;

    Path wrkDir = new Path(args.get(0));

    final List<String> strInputs = args.subList(1, args.size());
    final List<Path> inputs = new ArrayList<Path>(strInputs.size());
    for (final String in : strInputs) inputs.add(new Path(in));

    final Configuration conf = getConf();

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    if (stringency != null)
      conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString());

    final boolean globalSort = parser.getBoolean(sortOpt);
    if (globalSort) Utils.setHeaderMergerSortOrder(conf, SAMFileHeader.SortOrder.queryname);

    conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0]));

    final Timer t = new Timer();
    try {
      // Required for path ".", for example.
      wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

      if (globalSort) Utils.configureSampling(wrkDir, intermediateOutName, conf);

      final Job job = new Job(conf);

      job.setJarByClass(FixMate.class);
      job.setMapperClass(FixMateMapper.class);
      job.setReducerClass(FixMateReducer.class);

      if (!parser.getBoolean(noCombinerOpt)) job.setCombinerClass(FixMateReducer.class);

      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(SAMRecordWritable.class);

      job.setInputFormatClass(AnySAMInputFormat.class);
      job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

      for (final Path in : inputs) FileInputFormat.addInputPath(job, in);

      FileOutputFormat.setOutputPath(job, wrkDir);

      if (globalSort) {
        job.setPartitionerClass(TotalOrderPartitioner.class);

        System.out.println("fixmate :: Sampling...");
        t.start();

        InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(
            job,
            new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(
                0.01, 10000, Math.max(100, reduceTasks)));

        System.out.printf("fixmate :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());
      }

      job.submit();

      System.out.println("fixmate :: Waiting for job completion...");
      t.start();

      if (!job.waitForCompletion(verbose)) {
        System.err.println("fixmate :: Job failed.");
        return 4;
      }

      System.out.printf("fixmate :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
      System.err.printf("fixmate :: Hadoop error: %s\n", e);
      return 4;
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    } catch (InterruptedException e) {
      throw new RuntimeException(e);
    }

    if (outPath != null)
      try {
        Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "fixmate");
      } catch (IOException e) {
        System.err.printf("fixmate :: Output merging failed: %s\n", e);
        return 5;
      }
    return 0;
  }
Example #17
  public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
      jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
      throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);

    jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

    setJobName(jobConf, segments);

    if (converterConfig.getJobPriority() != null) {
      jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }

    final Job job = Job.getInstance(jobConf);

    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);

    JobHelper.setupClasspath(
        JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
        JobHelper.distributedClassPath(
            getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
        job);

    Throwable throwable = null;
    try {
      job.submit();
      log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
      final boolean success = job.waitForCompletion(true);
      if (!success) {
        final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
        if (reports != null) {
          for (final TaskReport report : reports) {
            log.error(
                "Error in task [%s] : %s",
                report.getTaskId(), Arrays.toString(report.getDiagnostics()));
          }
        }
        return null;
      }
      try {
        loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
        writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
      } catch (IOException ex) {
        log.error(ex, "Could not fetch counters");
      }
      final JobID jobID = job.getJobID();

      final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
      final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
      final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
      final List<Path> goodPaths = new ArrayList<>();
      while (it.hasNext()) {
        final LocatedFileStatus locatedFileStatus = it.next();
        if (locatedFileStatus.isFile()) {
          final Path myPath = locatedFileStatus.getPath();
          if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
            goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
          }
        }
      }
      if (goodPaths.isEmpty()) {
        log.warn("No good data found at [%s]", jobDir);
        return null;
      }
      final List<DataSegment> returnList =
          ImmutableList.copyOf(
              Lists.transform(
                  goodPaths,
                  new Function<Path, DataSegment>() {
                    @Nullable
                    @Override
                    public DataSegment apply(final Path input) {
                      try {
                        if (!fs.exists(input)) {
                          throw new ISE(
                              "Somehow [%s] was found but [%s] is missing at [%s]",
                              ConvertingOutputFormat.DATA_SUCCESS_KEY,
                              ConvertingOutputFormat.DATA_FILE_KEY,
                              jobDir);
                        }
                      } catch (final IOException e) {
                        throw Throwables.propagate(e);
                      }
                      try (final InputStream stream = fs.open(input)) {
                        return HadoopDruidConverterConfig.jsonMapper.readValue(
                            stream, DataSegment.class);
                      } catch (final IOException e) {
                        throw Throwables.propagate(e);
                      }
                    }
                  }));
      if (returnList.size() == segments.size()) {
        return returnList;
      } else {
        throw new ISE(
            "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
            segments.size(), returnList.size(), jobDir);
      }
    } catch (InterruptedException | ClassNotFoundException e) {
      RuntimeException exception = Throwables.propagate(e);
      throwable = exception;
      throw exception;
    } catch (Throwable t) {
      throwable = t;
      throw t;
    } finally {
      try {
        cleanup(job);
      } catch (IOException e) {
        if (throwable != null) {
          throwable.addSuppressed(e);
        } else {
          log.error(e, "Could not clean up job [%s]", job.getJobID());
        }
      }
    }
  }
Example #18
  public boolean run() {
    try {
      /*
       * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
       * in the final segment.
       */

      if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
        throw new ISE(
            "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
            config.getPartitionsSpec());
      }

      if (!config.getPartitionsSpec().isAssumeGrouped()) {
        final Job groupByJob =
            Job.getInstance(
                new Configuration(),
                String.format(
                    "%s-determine_partitions_groupby-%s",
                    config.getDataSource(), config.getIntervals()));

        JobHelper.injectSystemProperties(groupByJob);
        config.addJobProperties(groupByJob);

        groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
        groupByJob.setMapOutputKeyClass(BytesWritable.class);
        groupByJob.setMapOutputValueClass(NullWritable.class);
        groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
        groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
        groupByJob.setOutputKeyClass(BytesWritable.class);
        groupByJob.setOutputValueClass(NullWritable.class);
        groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        JobHelper.setupClasspath(
            JobHelper.distributedClassPath(config.getWorkingPath()),
            JobHelper.distributedClassPath(config.makeIntermediatePath()),
            groupByJob);

        config.addInputPaths(groupByJob);
        config.intoConfiguration(groupByJob);
        FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

        groupByJob.submit();
        log.info(
            "Job %s submitted, status available at: %s",
            groupByJob.getJobName(), groupByJob.getTrackingURL());

        if (!groupByJob.waitForCompletion(true)) {
          log.error("Job failed: %s", groupByJob.getJobID());
          return false;
        }
      } else {
        log.info("Skipping group-by job.");
      }

      /*
       * Read grouped data and determine appropriate partitions.
       */
      final Job dimSelectionJob =
          Job.getInstance(
              new Configuration(),
              String.format(
                  "%s-determine_partitions_dimselection-%s",
                  config.getDataSource(), config.getIntervals()));

      dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

      JobHelper.injectSystemProperties(dimSelectionJob);
      config.addJobProperties(dimSelectionJob);

      if (!config.getPartitionsSpec().isAssumeGrouped()) {
        // Read grouped data from the groupByJob.
        dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
        dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
      } else {
        // Directly read the source data, since we assume it's already grouped.
        dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
        config.addInputPaths(dimSelectionJob);
      }

      SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
      dimSelectionJob.setMapOutputValueClass(Text.class);
      dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
      dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
      dimSelectionJob.setOutputKeyClass(BytesWritable.class);
      dimSelectionJob.setOutputValueClass(Text.class);
      dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
      dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
      dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
      JobHelper.setupClasspath(
          JobHelper.distributedClassPath(config.getWorkingPath()),
          JobHelper.distributedClassPath(config.makeIntermediatePath()),
          dimSelectionJob);

      config.intoConfiguration(dimSelectionJob);
      FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

      dimSelectionJob.submit();
      log.info(
          "Job %s submitted, status available at: %s",
          dimSelectionJob.getJobName(), dimSelectionJob.getTrackingURL());

      if (!dimSelectionJob.waitForCompletion(true)) {
        log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
        return false;
      }

      /*
       * Load partitions determined by the previous job.
       */

      log.info(
          "Job completed, loading up partitions for intervals[%s].",
          config.getSegmentGranularIntervals());
      FileSystem fileSystem = null;
      Map<DateTime, List<HadoopyShardSpec>> shardSpecs =
          Maps.newTreeMap(DateTimeComparator.getInstance());
      int shardCount = 0;
      for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
        final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
        if (fileSystem == null) {
          fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
        }
        if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
          List<ShardSpec> specs =
              config.JSON_MAPPER.readValue(
                  Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                  new TypeReference<List<ShardSpec>>() {});

          List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
          for (int i = 0; i < specs.size(); ++i) {
            actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
            log.info(
                "DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i, actualSpecs.get(i));
          }

          shardSpecs.put(segmentGranularity.getStart(), actualSpecs);
        } else {
          log.info("Path[%s] didn't exist!?", partitionInfoPath);
        }
      }
      config.setShardSpecs(shardSpecs);

      return true;
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }
  }
Example #19
  /**
   * Start a Vina Hadoop job.
   *
   * @param confLocalPath
   * @param receptorLocalPath
   * @param ligandPath
   * @param seed
   * @param topK
   * @param vinaJobID
   * @param numPerNode
   * @param verbose
   * @return a map describing the submission result ("flag", "hadoopID", "vinaJobID", "log")
   */
  public HashMap<String, String> startJob(
      String confLocalPath,
      String receptorLocalPath,
      ArrayList<String> ligandPath,
      String seed,
      int topK,
      String vinaJobID,
      int numPerNode,
      boolean verbose) {
    HashMap<String, String> hm = new HashMap<String, String>();
    if (confLocalPath == null
        || receptorLocalPath == null
        || ligandPath == null
        || seed == null
        || vinaJobID == null
        || ligandPath.size() == 0
        || topK < 0) {
      hm.put("flag", "false");
      hm.put("hadoopID", "null");
      hm.put("vinaJobID", vinaJobID);
      hm.put("log", "error arguments");
      return hm;
    }
    GeneratePath gp = new GeneratePath(jobPath, srcDataPath);
    String confName = confLocalPath.substring(confLocalPath.lastIndexOf("/"));
    String confHDFSPath = jobPath + vinaJobID + confName;
    String receptorName = receptorLocalPath.substring(receptorLocalPath.lastIndexOf("/"));
    String receptorHDFSPATH = jobPath + vinaJobID + receptorName;
    HadoopFile hf;
    final String input = jobPath + vinaJobID + "/metadata";
    final String output = jobPath + vinaJobID + "/order";
    Path path = new Path(output);
    Configuration conf;
    FileSystem fs;
    Job job;
    try {
      gp.createMeta(ligandPath, vinaJobID, numPerNode);
      hf = new HadoopFile();
      hf.mkdir(jobPath + "/" + vinaJobID + "/exception");
      hf.mkdir(jobPath + "/" + vinaJobID + "/exceptionBackup");
      hf.localToHadoop(confLocalPath, confHDFSPath);
      hf.localToHadoop(receptorLocalPath, receptorHDFSPATH);
      conf = (new HadoopConf()).getConf();
      fs = FileSystem.get(conf);
      // set heart beat time 45min
      long milliSeconds = 45 * 60 * 1000;
      conf.setLong("mapred.task.timeout", milliSeconds);
      conf.set("vinaJobID", vinaJobID);
      conf.setInt("k", topK);
      conf.set("conf2HDFS", confHDFSPath);
      conf.set("receptorHDFS", receptorHDFSPATH);
      conf.set("seed", seed);
      if (fs.exists(path)) {
        fs.delete(path, true);
      }
      job = new Job(conf, vinaJobID);
      job.setNumReduceTasks(1);
      job.setJarByClass(VinaHadoop.class);
      job.setMapperClass(VinaMapper.class);
      job.setReducerClass(VinaReducer.class);
      job.setMapOutputKeyClass(DoubleWritable.class);
      job.setMapOutputValueClass(DataPair.class);
      job.setOutputKeyClass(DoubleWritable.class);
      job.setOutputValueClass(Text.class);
      FileInputFormat.addInputPath(job, new Path(input));
      FileOutputFormat.setOutputPath(job, new Path(output));

    } catch (IOException e) {
      // TODO Auto-generated catch block
      hm.put("flag", "false");
      hm.put("hadoopID", "null");
      hm.put("vinaJobID", vinaJobID);
      hm.put("log", e.getMessage());
      return hm;
    }

    try {
      if (verbose) {
        // System.exit(job.waitForCompletion(true) ? 0 : 1);
        job.waitForCompletion(true);
      } else {
        job.submit();
      }

    } catch (ClassNotFoundException | IOException | InterruptedException e) {
      // TODO Auto-generated catch block
      hm.put("flag", "false");
      hm.put("hadoopID", "null");
      hm.put("vinaJobID", vinaJobID);
      hm.put("log", e.getMessage());
      return hm;
    }
    hm.put("flag", "true");
    hm.put("hadoopID", job.getJobID().toString());
    hm.put("vinaJobID", vinaJobID);
    hm.put("log", "null");
    return hm;
  }
Example #20
 // Can be overridden by tests.
 void submitJob(Job job, List<String> filesInJob, int priority)
     throws IOException, InterruptedException, ClassNotFoundException {
   job.submit();
   LOG.info("Job " + job.getID() + "(" + job.getJobName() + ") started");
   jobIndex.put(job, null);
 }
Example #21
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Configure intermediate reduces
    conf.setInt(MRJobConfig.MRR_INTERMEDIATE_STAGES, 1);

    // Set reducer class for intermediate reduce
    conf.setClass(
        MultiStageMRConfigUtil.getPropertyNameForIntermediateStage(1, "mapreduce.job.reduce.class"),
        MyGroupByReducer.class,
        Reducer.class);
    // Set reducer output key class
    conf.setClass(
        MultiStageMRConfigUtil.getPropertyNameForIntermediateStage(
            1, "mapreduce.map.output.key.class"),
        IntWritable.class,
        Object.class);
    // Set reducer output value class
    conf.setClass(
        MultiStageMRConfigUtil.getPropertyNameForIntermediateStage(
            1, "mapreduce.map.output.value.class"),
        Text.class,
        Object.class);
    conf.setInt(
        MultiStageMRConfigUtil.getPropertyNameForIntermediateStage(1, "mapreduce.job.reduces"), 2);

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: groupbyorderbymrrtest <in> <out>");
      System.exit(2);
    }

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "groupbyorderbymrrtest");

    job.setJarByClass(GroupByOrderByMRRTest.class);

    // Configure map
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Configure reduce
    job.setReducerClass(MyOrderByNoOpReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setNumReduceTasks(1);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    TezClient tezClient = new TezClient(new TezConfiguration(conf));

    job.submit();
    JobID jobId = job.getJobID();
    ApplicationId appId = TypeConverter.toYarn(jobId).getAppId();

    DAGClient dagClient = tezClient.getDAGClient(appId);
    DAGStatus dagStatus = null;
    while (true) {
      dagStatus = dagClient.getDAGStatus();
      if (dagStatus.getState() == DAGStatus.State.RUNNING
          || dagStatus.getState() == DAGStatus.State.SUCCEEDED
          || dagStatus.getState() == DAGStatus.State.FAILED
          || dagStatus.getState() == DAGStatus.State.KILLED
          || dagStatus.getState() == DAGStatus.State.ERROR) {
        break;
      }
      try {
        Thread.sleep(500);
      } catch (InterruptedException e) {
        // continue;
      }
    }

    while (dagStatus.getState() == DAGStatus.State.RUNNING) {
      try {
        ExampleDriver.printMRRDAGStatus(dagStatus);
        try {
          Thread.sleep(1000);
        } catch (InterruptedException e) {
          // continue;
        }
        dagStatus = dagClient.getDAGStatus();
      } catch (TezException e) {
        LOG.fatal("Failed to get application progress. Exiting");
        System.exit(-1);
      }
    }

    ExampleDriver.printMRRDAGStatus(dagStatus);
    LOG.info("Application completed. " + "FinalState=" + dagStatus.getState());
    System.exit(dagStatus.getState() == DAGStatus.State.SUCCEEDED ? 0 : 1);
  }