예제 #1
0
  /**
   * Generate the list of files and make them into FileSplits. This needs to be copied to insert a
   * filter on acceptable data
   */
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    long desiredMappers =
        job.getConfiguration().getLong("org.systemsbiology.jxtandem.DesiredXMLInputMappers", 0);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> fileStatuses = listStatus(job);
    boolean forceNumberMappers = fileStatuses.size() == 1;
    for (FileStatus file : fileStatuses) {
      Path path = file.getPath();
      if (!isPathAcceptable(path)) // filter acceptable data
      continue;
      FileSystem fs = path.getFileSystem(job.getConfiguration());
      long length = file.getLen();
      BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
      if ((length != 0) && isSplitable(job, path)) {
        long blockSize = file.getBlockSize();
        // use desired mappers to force more splits
        if (forceNumberMappers && desiredMappers > 0)
          maxSize = Math.min(maxSize, (length / desiredMappers));

        long splitSize = computeSplitSize(blockSize, minSize, maxSize);

        long bytesRemaining = length;
        while (withinSlop(splitSize, bytesRemaining)) {
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(
              new FileSplit(
                  path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()));
          bytesRemaining -= splitSize;
        }

        if (bytesRemaining != 0) {
          splits.add(
              new FileSplit(
                  path,
                  length - bytesRemaining,
                  bytesRemaining,
                  blkLocations[blkLocations.length - 1].getHosts()));
        }
      } else if (length != 0) {
        splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
      } else {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
      }
    }
    System.out.println("Total # of splits: " + splits.size());
    //     LOG.debug("Total # of splits: " + splits.size());
    return splits;
  }
예제 #2
0
  @Test
  public void testDeleteMissing() {
    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
    JobContext jobContext =
        new JobContextImpl(
            taskAttemptContext.getConfiguration(),
            taskAttemptContext.getTaskAttemptID().getJobID());
    Configuration conf = jobContext.getConfiguration();

    String sourceBase;
    String targetBase;
    FileSystem fs = null;
    try {
      OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
      fs = FileSystem.get(conf);
      sourceBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
      targetBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
      String targetBaseAdd = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
      fs.rename(new Path(targetBaseAdd), new Path(targetBase));

      DistCpOptions options =
          new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
      options.setSyncFolder(true);
      options.setDeleteMissing(true);
      options.appendToConf(conf);

      CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
      Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
      listing.buildListing(listingFile, options);

      conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
      conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);

      committer.commitJob(jobContext);
      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
        Assert.fail("Source and target folders are not in sync");
      }
      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
        Assert.fail("Source and target folders are not in sync");
      }

      // Test for idempotent commit
      committer.commitJob(jobContext);
      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
        Assert.fail("Source and target folders are not in sync");
      }
      if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
        Assert.fail("Source and target folders are not in sync");
      }
    } catch (Throwable e) {
      LOG.error("Exception encountered while testing for delete missing", e);
      Assert.fail("Delete missing failure");
    } finally {
      TestDistCpUtils.delete(fs, "/tmp1");
      conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
    }
  }
  /**
   * Implementation of InputFormat::getSplits(). Returns a list of InputSplits, such that the number
   * of bytes to be copied for all the splits are approximately equal.
   *
   * @param context JobContext for the job.
   * @return The list of uniformly-distributed input-splits.
   * @throws IOException
   * @throws InterruptedException
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Configuration configuration = context.getConfiguration();
    int numSplits = DistCpUtils.getInt(configuration, JobContext.NUM_MAPS);

    if (numSplits == 0) return new ArrayList<InputSplit>();

    return getSplits(
        configuration,
        numSplits,
        DistCpUtils.getLong(configuration, DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED));
  }
예제 #4
0
  @Test
  public void testPreserveStatus() {
    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
    JobContext jobContext =
        new JobContextImpl(
            taskAttemptContext.getConfiguration(),
            taskAttemptContext.getTaskAttemptID().getJobID());
    Configuration conf = jobContext.getConfiguration();

    String sourceBase;
    String targetBase;
    FileSystem fs = null;
    try {
      OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
      fs = FileSystem.get(conf);
      FsPermission sourcePerm = new FsPermission((short) 511);
      FsPermission initialPerm = new FsPermission((short) 448);
      sourceBase = TestDistCpUtils.createTestSetup(fs, sourcePerm);
      targetBase = TestDistCpUtils.createTestSetup(fs, initialPerm);

      DistCpOptions options =
          new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
      options.preserve(FileAttribute.PERMISSION);
      options.appendToConf(conf);

      CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
      Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
      listing.buildListing(listingFile, options);

      conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);

      committer.commitJob(jobContext);
      if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
        Assert.fail("Permission don't match");
      }

      // Test for idempotent commit
      committer.commitJob(jobContext);
      if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
        Assert.fail("Permission don't match");
      }

    } catch (IOException e) {
      LOG.error("Exception encountered while testing for preserve status", e);
      Assert.fail("Preserve status failure");
    } finally {
      TestDistCpUtils.delete(fs, "/tmp1");
    }
  }
예제 #5
0
  @SuppressWarnings("unchecked")
  private <T extends InputSplit> int writeNewSplits(JobContext job, Path jobSubmitDir)
      throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);

    List<InputSplit> splits = input.getSplits(job);
    T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);

    // sort the splits into order based on size, so that the biggest
    // go first
    Arrays.sort(array, new SplitComparator());
    JobSplitWriter.createSplitFiles(jobSubmitDir, conf, jobSubmitDir.getFileSystem(conf), array);
    //// num of split. the same as num of maps
    return array.length;
  }
예제 #6
0
  @Test
  public void testAtomicCommitExistingFinal() {
    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
    JobContext jobContext =
        new JobContextImpl(
            taskAttemptContext.getConfiguration(),
            taskAttemptContext.getTaskAttemptID().getJobID());
    Configuration conf = jobContext.getConfiguration();

    String workPath = "/tmp1/" + String.valueOf(rand.nextLong());
    String finalPath = "/tmp1/" + String.valueOf(rand.nextLong());
    FileSystem fs = null;
    try {
      OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
      fs = FileSystem.get(conf);
      fs.mkdirs(new Path(workPath));
      fs.mkdirs(new Path(finalPath));

      conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, workPath);
      conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, finalPath);
      conf.setBoolean(DistCpConstants.CONF_LABEL_ATOMIC_COPY, true);

      Assert.assertTrue(fs.exists(new Path(workPath)));
      Assert.assertTrue(fs.exists(new Path(finalPath)));
      try {
        committer.commitJob(jobContext);
        Assert.fail("Should not be able to atomic-commit to pre-existing path.");
      } catch (Exception exception) {
        Assert.assertTrue(fs.exists(new Path(workPath)));
        Assert.assertTrue(fs.exists(new Path(finalPath)));
        LOG.info("Atomic-commit Test pass.");
      }

    } catch (IOException e) {
      LOG.error("Exception encountered while testing for atomic commit.", e);
      Assert.fail("Atomic commit failure");
    } finally {
      TestDistCpUtils.delete(fs, workPath);
      TestDistCpUtils.delete(fs, finalPath);
    }
  }
  /**
   * Set all the initial parameters needed in this class for connectivity out to Accumulo.
   *
   * @param context
   */
  private void initialize(JobContext context) { // Configuration conf){

    Configuration conf = context.getConfiguration();
    try {
      // output zoom level
      log.info("Working from zoom level = " + zoomLevel);
      if (zoomLevel == -1) {
        zoomLevel = Integer.parseInt(conf.get(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOMLEVEL));
      }

      table = conf.get(MrGeoAccumuloConstants.MRGEO_ACC_KEY_OUTPUT_TABLE);
      username = conf.get(MrGeoAccumuloConstants.MRGEO_ACC_KEY_USER);
      instanceName = conf.get(MrGeoAccumuloConstants.MRGEO_ACC_KEY_INSTANCE);
      zooKeepers = conf.get(MrGeoAccumuloConstants.MRGEO_ACC_KEY_ZOOKEEPERS);

      String pl = conf.get(MrGeoConstants.MRGEO_PROTECTION_LEVEL);
      if (pl != null) {
        colViz = new ColumnVisibility(pl);
      } else if (colViz == null) {
        vizStr = conf.get(MrGeoAccumuloConstants.MRGEO_ACC_KEY_VIZ);

        if (vizStr == null) {
          colViz = new ColumnVisibility();
        } else {
          colViz = new ColumnVisibility(vizStr);
        }
      }

      password = conf.get(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PASSWORD);
      String isEnc = conf.get(MrGeoAccumuloConstants.MRGEO_ACC_KEY_PWENCODED64, "false");
      if (isEnc.equalsIgnoreCase("true")) {
        password = Base64Utils.decodeToString(password);
      }

      if (_innerFormat != null) {
        return;
      }

      _innerFormat = AccumuloOutputFormat.class.newInstance();
      AuthenticationToken token = new PasswordToken(password.getBytes());
      //      log.info("Setting output with: u = " + username);
      //      log.info("Setting output with: p = " + password);
      //      log.info("Setting output with: i = " + instanceName);
      //      log.info("Setting output with: z = " + zooKeepers);

      boolean connSet = ConfiguratorBase.isConnectorInfoSet(AccumuloOutputFormat.class, conf);
      if (!connSet) {
        // job not always available - do it how Accumulo does it
        OutputConfigurator.setConnectorInfo(AccumuloOutputFormat.class, conf, username, token);
        ClientConfiguration cc = ClientConfiguration.loadDefault().withInstance(instanceName);
        cc.setProperty(ClientProperty.INSTANCE_ZK_HOST, zooKeepers);

        OutputConfigurator.setZooKeeperInstance(AccumuloOutputFormat.class, conf, cc);
        OutputConfigurator.setDefaultTableName(AccumuloOutputFormat.class, conf, table);
        OutputConfigurator.setCreateTables(AccumuloOutputFormat.class, conf, true);

        outputInfoSet = true;
      }
    } catch (InstantiationException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } catch (IllegalAccessException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } catch (AccumuloSecurityException ase) {
      ase.printStackTrace();
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  } // end initialize
 /**
  * Check for validity of the output-specification for the job.
  *
  * @param context information about the job
  * @throws IOException when output should not be attempted
  */
 @Override
 public void checkOutputSpecs(JobContext context) {
   checkOutputSpecs(context.getConfiguration());
 }