/**
   * Configures the partitioner for generating HFiles.
   *
   * <p>Each generated HFile should fit within a region of the target table. Additionally, it's
   * optimal to have only one HFile to load into each region, since a read from that region will
   * require reading every HFile the region manages (until compaction merges them all back into a
   * single HFile).
   *
   * <p>To achieve this, we configure a TotalOrderPartitioner that will partition the records output
   * from the Mapper based on their rank in a total ordering of the keys. The <code>startKeys</code>
   * argument should contain a list of the first key in each of those partitions.
   *
   * @param job The job to configure.
   * @param startKeys A list of keys that will mark the boundaries between the partitions for the
   *     sorted map output records.
   * @throws IOException If there is an error.
   */
  private static void configurePartitioner(Job job, List<HFileKeyValue> startKeys)
      throws IOException {
    job.setPartitionerClass(TotalOrderPartitioner.class);

    LOG.info("Configuring " + startKeys.size() + " reduce partitions.");
    job.setNumReduceTasks(startKeys.size());

    // Write the file that the TotalOrderPartitioner reads to determine where to partition records.
    Path partitionFilePath =
        new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionFilePath);

    final FileSystem fs = partitionFilePath.getFileSystem(job.getConfiguration());
    partitionFilePath = partitionFilePath.makeQualified(fs);
    writePartitionFile(job.getConfiguration(), partitionFilePath, startKeys);

    // Add it to the distributed cache.
    try {
      final URI cacheUri =
          new URI(partitionFilePath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
      DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
    } catch (URISyntaxException e) {
      throw new IOException(e);
    }
    DistributedCache.createSymlink(job.getConfiguration());
  }
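For reference, a minimal sketch of the writePartitionFile helper used above (the actual implementation in this codebase may differ): TotalOrderPartitioner reads a SequenceFile of split keys, one per partition boundary, in sorted order. The sketch assumes HFileKeyValue implements Writable and is the job's map output key class.

  private static void writePartitionFile(
      Configuration conf, Path partitionFilePath, List<HFileKeyValue> startKeys)
      throws IOException {
    final FileSystem fs = partitionFilePath.getFileSystem(conf);
    final SequenceFile.Writer writer = SequenceFile.createWriter(
        fs, conf, partitionFilePath, HFileKeyValue.class, NullWritable.class);
    try {
      // TotalOrderPartitioner needs numReduceTasks - 1 split points, so the first
      // start key (the lower bound of the first partition) is not written.
      for (HFileKeyValue startKey : startKeys.subList(1, startKeys.size())) {
        writer.append(startKey, NullWritable.get());
      }
    } finally {
      writer.close();
    }
  }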
Example #2
  // jobSubmitDir is the job's staging directory (typically under /tmp). Copy libraries, files,
  // and other resources into jobSubmitDir.
  private void copyAndConfigureFiles(Job job, Path jobSubmitDir) throws IOException {
    Configuration conf = job.getConfiguration();
    short replication = (short) conf.getInt(Job.SUBMIT_REPLICATION, 10);
    copyAndConfigureFiles(job, jobSubmitDir, replication);

    // Set the working directory
    if (job.getWorkingDirectory() == null) {
      job.setWorkingDirectory(jtFs.getWorkingDirectory());
    }
  }
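As context for the jobSubmitDir parameter, a hedged sketch of how such a staging path is typically derived with the standard Hadoop API (this helper is an assumption for illustration and is not part of the excerpt above):

  private Path deriveJobSubmitDir(Cluster cluster, Configuration conf, JobID jobId)
      throws IOException, InterruptedException {
    // Resolve the per-user staging area under the configured staging root.
    Path stagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
    // Each job gets its own subdirectory named after its job ID.
    return new Path(stagingArea, jobId.toString());
  }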
 public static void cleanup(Job job) throws IOException {
   final Path jobDir = getJobPath(job.getJobID(), job.getWorkingDirectory());
   final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
   RuntimeException e = null;
   try {
     JobHelper.deleteWithRetry(fs, jobDir, true);
   } catch (RuntimeException ex) {
     e = ex;
   }
   try {
     JobHelper.deleteWithRetry(
         fs, getJobClassPathDir(job.getJobName(), job.getWorkingDirectory()), true);
   } catch (RuntimeException ex) {
     if (e == null) {
       e = ex;
     } else {
       e.addSuppressed(ex);
     }
   }
   if (e != null) {
     throw e;
   }
 }
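The cleanup above leans on a deleteWithRetry helper. A minimal sketch of what such a helper could look like, assuming a simple bounded retry around FileSystem.delete (the real JobHelper implementation may retry differently):

  static void deleteWithRetry(final FileSystem fs, final Path path, final boolean recursive) {
    final int maxTries = 3;
    for (int attempt = 1; ; attempt++) {
      try {
        // delete() returns false if the path did not exist; that is fine for cleanup.
        fs.delete(path, recursive);
        return;
      } catch (IOException e) {
        if (attempt >= maxTries) {
          // Surface the failure as a RuntimeException, matching how cleanup() catches it.
          throw new RuntimeException(
              String.format("Failed to delete [%s] after %d attempts", path, attempt), e);
        }
      }
    }
  }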
Example #4
  /** Hadoop {@link Tool} implementation */
  @Override
  public int run(String[] args) throws Exception {

    Options options = new Options();

    configureOptions(options);

    CommandLineParser parser = new GnuParser();

    try {
      CommandLine commandLine = parser.parse(options, args);

      if (commandLine.hasOption(VERBOSE)) {
        Logger.getGlobal().setLevel(Level.FINEST);
      }

      if (commandLine.hasOption(QUIET)) {
        Logger.getGlobal().setLevel(Level.OFF);
      }

      String transformationLocation = commandLine.getOptionValue(TRANSFORMATION);
      String sourcemmLocation = commandLine.getOptionValue(SOURCE_PACKAGE);
      String targetmmLocation = commandLine.getOptionValue(TARGET_PACKAGE);
      String recordsLocation = commandLine.getOptionValue(RECORDS_FILE);
      String inputLocation = commandLine.getOptionValue(INPUT_MODEL);
      String outputLocation =
          commandLine.getOptionValue(
              OUTPUT_MODEL, new Path(inputLocation).suffix(".out.xmi").toString());

      int recommendedMappers = 1;
      if (commandLine.hasOption(RECOMMENDED_MAPPERS)) {
        recommendedMappers =
            ((Number) commandLine.getParsedOptionValue(RECOMMENDED_MAPPERS)).intValue();
      }

      Configuration conf = this.getConf();
      Job job = Job.getInstance(conf, JOB_NAME);

      // Configure classes
      job.setJarByClass(ATLMRMaster.class);
      job.setMapperClass(ATLMRMapper.class);
      job.setReducerClass(ATLMRReducer.class);
      job.setInputFormatClass(NLineInputFormat.class);
      job.setOutputFormatClass(SequenceFileOutputFormat.class);
      job.setMapOutputKeyClass(LongWritable.class);
      job.setMapOutputValueClass(Text.class);
      job.setNumReduceTasks(1);

      // Configure MapReduce input/outputs
      Path recordsPath = new Path(recordsLocation);
      FileInputFormat.setInputPaths(job, recordsPath);
      String timestamp = new SimpleDateFormat("yyyyMMddhhmm").format(new Date());
      String outDirName = "atlmr-out-" + timestamp + "-" + UUID.randomUUID();
      FileOutputFormat.setOutputPath(
          job, new Path(job.getWorkingDirectory().suffix(Path.SEPARATOR + outDirName).toUri()));

      // Configure records per map
      FileSystem fileSystem = FileSystem.get(recordsPath.toUri(), conf);
      long linesPerMap;
      try (InputStream inputStream = fileSystem.open(recordsPath)) {
        // Close the stream after counting the input lines to avoid leaking a file handle.
        linesPerMap =
            (long) Math.ceil((double) countLines(inputStream) / (double) recommendedMappers);
      }
      job.getConfiguration().setLong(NLineInputFormat.LINES_PER_MAP, linesPerMap);

      // Configure ATL related inputs/outputs
      job.getConfiguration().set(TRANSFORMATION, transformationLocation);
      job.getConfiguration().set(SOURCE_PACKAGE, sourcemmLocation);
      job.getConfiguration().set(TARGET_PACKAGE, targetmmLocation);
      job.getConfiguration().set(INPUT_MODEL, inputLocation);
      job.getConfiguration().set(OUTPUT_MODEL, outputLocation);

      Logger.getGlobal().log(Level.INFO, "Starting Job execution");
      long begin = System.currentTimeMillis();
      int returnValue = job.waitForCompletion(true) ? STATUS_OK : STATUS_ERROR;
      long end = System.currentTimeMillis();
      Logger.getGlobal()
          .log(
              Level.INFO,
              MessageFormat.format(
                  "Job execution ended in {0}s with status code {1}",
                  (end - begin) / 1000, returnValue));

      return returnValue;

    } catch (ParseException e) {
      System.err.println(e.getLocalizedMessage());
      HelpFormatter formatter = new HelpFormatter();
      formatter.setOptionComparator(new OptionComarator<>());
      try {
        formatter.setWidth(Math.max(Terminal.getTerminal().getTerminalWidth(), 80));
      } catch (Throwable t) {
        // Terminal width detection is best-effort; keep the formatter's default width.
      }
      formatter.printHelp("yarn jar <this-file.jar>", options, true);
      return STATUS_ERROR;
    }
  }
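A typical entry point for driving this Tool via ToolRunner (a hedged sketch; the project's actual main method may differ):

  public static void main(String[] args) throws Exception {
    // ToolRunner parses generic Hadoop options (-D, -conf, ...) before delegating to run().
    int exitCode = ToolRunner.run(new Configuration(), new ATLMRMaster(), args);
    System.exit(exitCode);
  }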
  public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
      jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
      throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);

    jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

    setJobName(jobConf, segments);

    if (converterConfig.getJobPriority() != null) {
      jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }

    final Job job = Job.getInstance(jobConf);

    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);

    JobHelper.setupClasspath(
        JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
        JobHelper.distributedClassPath(
            getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
        job);

    Throwable throwable = null;
    try {
      job.submit();
      log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
      final boolean success = job.waitForCompletion(true);
      if (!success) {
        final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
        if (reports != null) {
          for (final TaskReport report : reports) {
            log.error(
                "Error in task [%s] : %s",
                report.getTaskId(), Arrays.toString(report.getDiagnostics()));
          }
        }
        return null;
      }
      try {
        loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
        writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
      } catch (IOException ex) {
        log.error(ex, "Could not fetch counters");
      }
      final JobID jobID = job.getJobID();

      final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
      final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
      final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
      final List<Path> goodPaths = new ArrayList<>();
      while (it.hasNext()) {
        final LocatedFileStatus locatedFileStatus = it.next();
        if (locatedFileStatus.isFile()) {
          final Path myPath = locatedFileStatus.getPath();
          if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
            goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
          }
        }
      }
      if (goodPaths.isEmpty()) {
        log.warn("No good data found at [%s]", jobDir);
        return null;
      }
      final List<DataSegment> returnList =
          ImmutableList.copyOf(
              Lists.transform(
                  goodPaths,
                  new Function<Path, DataSegment>() {
                    @Nullable
                    @Override
                    public DataSegment apply(final Path input) {
                      try {
                        if (!fs.exists(input)) {
                          throw new ISE(
                              "Somehow [%s] was found but [%s] is missing at [%s]",
                              ConvertingOutputFormat.DATA_SUCCESS_KEY,
                              ConvertingOutputFormat.DATA_FILE_KEY,
                              jobDir);
                        }
                      } catch (final IOException e) {
                        throw Throwables.propagate(e);
                      }
                      try (final InputStream stream = fs.open(input)) {
                        return HadoopDruidConverterConfig.jsonMapper.readValue(
                            stream, DataSegment.class);
                      } catch (final IOException e) {
                        throw Throwables.propagate(e);
                      }
                    }
                  }));
      if (returnList.size() == segments.size()) {
        return returnList;
      } else {
        throw new ISE(
            "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
            segments.size(), returnList.size(), jobDir);
      }
    } catch (InterruptedException | ClassNotFoundException e) {
      RuntimeException exception = Throwables.propagate(e);
      throwable = exception;
      throw exception;
    } catch (Throwable t) {
      throwable = t;
      throw t;
    } finally {
      try {
        cleanup(job);
      } catch (IOException e) {
        if (throwable != null) {
          throwable.addSuppressed(e);
        } else {
          log.error(e, "Could not clean up job [%s]", job.getJobID());
        }
      }
    }
  }
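Both run() and cleanup() resolve paths through getJobPath and getJobClassPathDir. A minimal sketch of what those helpers could look like, assuming they simply nest per-job directories under the shared working directory (the actual implementations may differ):

  public static Path getJobPath(JobID jobID, Path workingDirectory) {
    // One directory per job, named after its job ID.
    return new Path(workingDirectory, jobID.toString());
  }

  public static Path getJobClassPathDir(String jobName, Path workingDirectory) {
    // Job names may contain characters that are awkward in paths; sanitize minimally.
    return new Path(workingDirectory, jobName.replace(':', '_'));
  }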