Beispiel #1
0
  /**
   * Check whether the contents of src and dst are the same.
   *
   * <p>Return false if dstpath does not exist
   *
   * <p>If the files have different sizes, return false.
   *
   * <p>If the files have the same sizes, the file checksums will be compared.
   *
   * <p>When file checksum is not supported in any of file systems, two files are considered as the
   * same if they have the same size.
   */
  private static boolean sameFile(
      FileSystem srcfs, FileStatus srcstatus, FileSystem dstfs, Path dstpath) throws IOException {
    FileStatus dststatus;
    try {
      dststatus = dstfs.getFileStatus(dstpath);
    } catch (FileNotFoundException fnfe) {
      return false;
    }

    // same length?
    if (srcstatus.getLen() != dststatus.getLen()) {
      return false;
    }

    // compare checksums
    try {
      final FileChecksum srccs = srcfs.getFileChecksum(srcstatus.getPath());
      final FileChecksum dstcs = dstfs.getFileChecksum(dststatus.getPath());
      // return true if checksum is not supported
      // (i.e. some of the checksums is null)
      return srccs == null || dstcs == null || srccs.equals(dstcs);
    } catch (FileNotFoundException fnfe) {
      return false;
    }
  }
 /** Added for back compatibility. */
 public static Path getLocalCache(
     URI cache,
     Configuration conf,
     Path subdir,
     FileStatus fileStatus,
     boolean isArchive,
     long confFileStamp,
     Path currentWorkDir,
     boolean honorSymLinkConf,
     MRAsyncDiskService asyncDiskService,
     LocalDirAllocator lDirAllocator)
     throws IOException {
   return getLocalCache(
       cache,
       conf,
       subdir,
       fileStatus,
       isArchive,
       confFileStamp,
       fileStatus.getLen(),
       currentWorkDir,
       honorSymLinkConf,
       asyncDiskService,
       lDirAllocator);
 }
  /**
   * Generate the list of files and make them into FileSplits. This needs to be copied to insert a
   * filter on acceptable data
   */
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    long desiredMappers =
        job.getConfiguration().getLong("org.systemsbiology.jxtandem.DesiredXMLInputMappers", 0);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> fileStatuses = listStatus(job);
    boolean forceNumberMappers = fileStatuses.size() == 1;
    for (FileStatus file : fileStatuses) {
      Path path = file.getPath();
      if (!isPathAcceptable(path)) // filter acceptable data
      continue;
      FileSystem fs = path.getFileSystem(job.getConfiguration());
      long length = file.getLen();
      BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
      if ((length != 0) && isSplitable(job, path)) {
        long blockSize = file.getBlockSize();
        // use desired mappers to force more splits
        if (forceNumberMappers && desiredMappers > 0)
          maxSize = Math.min(maxSize, (length / desiredMappers));

        long splitSize = computeSplitSize(blockSize, minSize, maxSize);

        long bytesRemaining = length;
        while (withinSlop(splitSize, bytesRemaining)) {
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(
              new FileSplit(
                  path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()));
          bytesRemaining -= splitSize;
        }

        if (bytesRemaining != 0) {
          splits.add(
              new FileSplit(
                  path,
                  length - bytesRemaining,
                  bytesRemaining,
                  blkLocations[blkLocations.length - 1].getHosts()));
        }
      } else if (length != 0) {
        splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
      } else {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
      }
    }
    System.out.println("Total # of splits: " + splits.size());
    //     LOG.debug("Total # of splits: " + splits.size());
    return splits;
  }
Beispiel #4
0
    /**
     * Produce splits such that each is no greater than the quotient of the total size and the
     * number of splits requested.
     *
     * @param job The handle to the JobConf object
     * @param numSplits Number of splits requested
     */
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
      long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
      String srcfilelist = job.get(SRC_LIST_LABEL, "");
      if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) {
        throw new RuntimeException(
            "Invalid metadata: #files("
                + cnfiles
                + ") total_size("
                + cbsize
                + ") listuri("
                + srcfilelist
                + ")");
      }
      Path src = new Path(srcfilelist);
      FileSystem fs = src.getFileSystem(job);
      FileStatus srcst = fs.getFileStatus(src);

      ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
      LongWritable key = new LongWritable();
      FilePair value = new FilePair();
      final long targetsize = cbsize / numSplits;
      long pos = 0L;
      long last = 0L;
      long acc = 0L;
      long cbrem = srcst.getLen();
      SequenceFile.Reader sl = null;
      try {
        sl = new SequenceFile.Reader(fs, src, job);
        for (; sl.next(key, value); last = sl.getPosition()) {
          // if adding this split would put this split past the target size,
          // cut the last split and put this next file in the next split.
          if (acc + key.get() > targetsize && acc != 0) {
            long splitsize = last - pos;
            splits.add(new FileSplit(src, pos, splitsize, (String[]) null));
            cbrem -= splitsize;
            pos = last;
            acc = 0L;
          }
          acc += key.get();
        }
      } finally {
        checkAndClose(sl);
      }
      if (cbrem != 0) {
        splits.add(new FileSplit(src, pos, cbrem, (String[]) null));
      }

      return splits.toArray(new FileSplit[splits.size()]);
    }
 /**
  * Get the locally cached file or archive; it could either be previously cached (and valid) or
  * copy it from the {@link FileSystem} now.
  *
  * @param cache the cache to be localized, this should be specified as new
  *     URI(hdfs://hostname:port/absolute_path_to_file#LINKNAME). If no schema or hostname:port is
  *     provided the file is assumed to be in the filesystem being used in the Configuration
  * @param conf The Confguration file which contains the filesystem
  * @param baseDir The base cache Dir where you wnat to localize the files/archives
  * @param fileStatus The file status on the dfs.
  * @param isArchive if the cache is an archive or a file. In case it is an archive with a .zip or
  *     .jar or .tar or .tgz or .tar.gz extension it will be unzipped/unjarred/untarred
  *     automatically and the directory where the archive is unzipped/unjarred/untarred is returned
  *     as the Path. In case of a file, the path to the file is returned
  * @param confFileStamp this is the hdfs file modification timestamp to verify that the file to be
  *     cached hasn't changed since the job started
  * @param currentWorkDir this is the directory where you would want to create symlinks for the
  *     locally cached files/archives
  * @return the path to directory where the archives are unjarred in case of archives, the path to
  *     the file where the file is copied locally
  * @throws IOException
  */
 public static Path getLocalCache(
     URI cache,
     Configuration conf,
     Path baseDir,
     FileStatus fileStatus,
     boolean isArchive,
     long confFileStamp,
     Path currentWorkDir,
     MRAsyncDiskService asyncDiskService)
     throws IOException {
   return getLocalCache(
       cache,
       conf,
       baseDir,
       fileStatus,
       isArchive,
       confFileStamp,
       fileStatus.getLen(),
       currentWorkDir,
       true,
       asyncDiskService,
       new LocalDirAllocator("mapred.local.dir"));
 }
Beispiel #6
0
  /**
   * Launch application for the dag represented by this client.
   *
   * @throws YarnException
   * @throws IOException
   */
  public void startApplication() throws YarnException, IOException {
    Class<?>[] defaultClasses;

    if (applicationType.equals(YARN_APPLICATION_TYPE)) {
      // TODO restrict the security check to only check if security is enabled for webservices.
      if (UserGroupInformation.isSecurityEnabled()) {
        defaultClasses = DATATORRENT_SECURITY_CLASSES;
      } else {
        defaultClasses = DATATORRENT_CLASSES;
      }
    } else {
      throw new IllegalStateException(applicationType + " is not a valid application type.");
    }

    LinkedHashSet<String> localJarFiles = findJars(dag, defaultClasses);

    if (resources != null) {
      localJarFiles.addAll(resources);
    }

    YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics();
    LOG.info(
        "Got Cluster metric info from ASM"
            + ", numNodeManagers="
            + clusterMetrics.getNumNodeManagers());

    // GetClusterNodesRequest clusterNodesReq = Records.newRecord(GetClusterNodesRequest.class);
    // GetClusterNodesResponse clusterNodesResp =
    // rmClient.clientRM.getClusterNodes(clusterNodesReq);
    // LOG.info("Got Cluster node info from ASM");
    // for (NodeReport node : clusterNodesResp.getNodeReports()) {
    //  LOG.info("Got node report from ASM for"
    //           + ", nodeId=" + node.getNodeId()
    //           + ", nodeAddress" + node.getHttpAddress()
    //           + ", nodeRackName" + node.getRackName()
    //           + ", nodeNumContainers" + node.getNumContainers()
    //           + ", nodeHealthStatus" + node.getHealthReport());
    // }
    List<QueueUserACLInfo> listAclInfo = yarnClient.getQueueAclsInfo();
    for (QueueUserACLInfo aclInfo : listAclInfo) {
      for (QueueACL userAcl : aclInfo.getUserAcls()) {
        LOG.info(
            "User ACL Info for Queue"
                + ", queueName="
                + aclInfo.getQueueName()
                + ", userAcl="
                + userAcl.name());
      }
    }

    // Get a new application id
    YarnClientApplication newApp = yarnClient.createApplication();
    appId = newApp.getNewApplicationResponse().getApplicationId();

    // Dump out information about cluster capability as seen by the resource manager
    int maxMem = newApp.getNewApplicationResponse().getMaximumResourceCapability().getMemory();
    LOG.info("Max mem capabililty of resources in this cluster " + maxMem);
    int amMemory = dag.getMasterMemoryMB();
    if (amMemory > maxMem) {
      LOG.info(
          "AM memory specified above max threshold of cluster. Using max value."
              + ", specified="
              + amMemory
              + ", max="
              + maxMem);
      amMemory = maxMem;
    }

    if (dag.getAttributes().get(LogicalPlan.APPLICATION_ID) == null) {
      dag.setAttribute(LogicalPlan.APPLICATION_ID, appId.toString());
    }

    // Create launch context for app master
    LOG.info("Setting up application submission context for ASM");
    ApplicationSubmissionContext appContext = Records.newRecord(ApplicationSubmissionContext.class);

    // set the application id
    appContext.setApplicationId(appId);
    // set the application name
    appContext.setApplicationName(dag.getValue(LogicalPlan.APPLICATION_NAME));
    appContext.setApplicationType(this.applicationType);
    if (YARN_APPLICATION_TYPE.equals(this.applicationType)) {
      // appContext.setMaxAppAttempts(1); // no retries until Stram is HA
    }

    // Set up the container launch context for the application master
    ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class);

    // Setup security tokens
    // If security is enabled get ResourceManager and NameNode delegation tokens.
    // Set these tokens on the container so that they are sent as part of application submission.
    // This also sets them up for renewal by ResourceManager. The NameNode delegation rmToken
    // is also used by ResourceManager to fetch the jars from HDFS and set them up for the
    // application master launch.
    if (UserGroupInformation.isSecurityEnabled()) {
      Credentials credentials = new Credentials();
      String tokenRenewer = conf.get(YarnConfiguration.RM_PRINCIPAL);
      if (tokenRenewer == null || tokenRenewer.length() == 0) {
        throw new IOException("Can't get Master Kerberos principal for the RM to use as renewer");
      }

      // For now, only getting tokens for the default file-system.
      FileSystem fs = StramClientUtils.newFileSystemInstance(conf);
      try {
        final Token<?> tokens[] = fs.addDelegationTokens(tokenRenewer, credentials);
        if (tokens != null) {
          for (Token<?> token : tokens) {
            LOG.info("Got dt for " + fs.getUri() + "; " + token);
          }
        }
      } finally {
        fs.close();
      }

      addRMDelegationToken(tokenRenewer, credentials);

      DataOutputBuffer dob = new DataOutputBuffer();
      credentials.writeTokenStorageToStream(dob);
      ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
      amContainer.setTokens(fsTokens);
    }

    // set local resources for the application master
    // local files or archives as needed
    // In this scenario, the jar file for the application master is part of the local resources
    Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();

    // copy required jar files to dfs, to be localized for containers
    FileSystem fs = StramClientUtils.newFileSystemInstance(conf);
    try {
      Path appsBasePath =
          new Path(StramClientUtils.getDTDFSRootDir(fs, conf), StramClientUtils.SUBDIR_APPS);
      Path appPath = new Path(appsBasePath, appId.toString());

      String libJarsCsv = copyFromLocal(fs, appPath, localJarFiles.toArray(new String[] {}));

      LOG.info("libjars: {}", libJarsCsv);
      dag.getAttributes().put(LogicalPlan.LIBRARY_JARS, libJarsCsv);
      LaunchContainerRunnable.addFilesToLocalResources(
          LocalResourceType.FILE, libJarsCsv, localResources, fs);

      if (archives != null) {
        String[] localFiles = archives.split(",");
        String archivesCsv = copyFromLocal(fs, appPath, localFiles);
        LOG.info("archives: {}", archivesCsv);
        dag.getAttributes().put(LogicalPlan.ARCHIVES, archivesCsv);
        LaunchContainerRunnable.addFilesToLocalResources(
            LocalResourceType.ARCHIVE, archivesCsv, localResources, fs);
      }

      if (files != null) {
        String[] localFiles = files.split(",");
        String filesCsv = copyFromLocal(fs, appPath, localFiles);
        LOG.info("files: {}", filesCsv);
        dag.getAttributes().put(LogicalPlan.FILES, filesCsv);
        LaunchContainerRunnable.addFilesToLocalResources(
            LocalResourceType.FILE, filesCsv, localResources, fs);
      }

      dag.getAttributes().put(LogicalPlan.APPLICATION_PATH, appPath.toString());
      if (dag.getAttributes().get(OperatorContext.STORAGE_AGENT) == null) {
          /* which would be the most likely case */
        Path checkpointPath = new Path(appPath, LogicalPlan.SUBDIR_CHECKPOINTS);
        // use conf client side to pickup any proxy settings from dt-site.xml
        dag.setAttribute(
            OperatorContext.STORAGE_AGENT, new FSStorageAgent(checkpointPath.toString(), conf));
      }
      if (dag.getAttributes().get(LogicalPlan.CONTAINER_OPTS_CONFIGURATOR) == null) {
        dag.setAttribute(
            LogicalPlan.CONTAINER_OPTS_CONFIGURATOR, new BasicContainerOptConfigurator());
      }

      // Set the log4j properties if needed
      if (!log4jPropFile.isEmpty()) {
        Path log4jSrc = new Path(log4jPropFile);
        Path log4jDst = new Path(appPath, "log4j.props");
        fs.copyFromLocalFile(false, true, log4jSrc, log4jDst);
        FileStatus log4jFileStatus = fs.getFileStatus(log4jDst);
        LocalResource log4jRsrc = Records.newRecord(LocalResource.class);
        log4jRsrc.setType(LocalResourceType.FILE);
        log4jRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
        log4jRsrc.setResource(ConverterUtils.getYarnUrlFromURI(log4jDst.toUri()));
        log4jRsrc.setTimestamp(log4jFileStatus.getModificationTime());
        log4jRsrc.setSize(log4jFileStatus.getLen());
        localResources.put("log4j.properties", log4jRsrc);
      }

      if (originalAppId != null) {
        Path origAppPath = new Path(appsBasePath, this.originalAppId);
        LOG.info("Restart from {}", origAppPath);
        copyInitialState(origAppPath);
      }

      // push logical plan to DFS location
      Path cfgDst = new Path(appPath, LogicalPlan.SER_FILE_NAME);
      FSDataOutputStream outStream = fs.create(cfgDst, true);
      LogicalPlan.write(this.dag, outStream);
      outStream.close();

      Path launchConfigDst = new Path(appPath, LogicalPlan.LAUNCH_CONFIG_FILE_NAME);
      outStream = fs.create(launchConfigDst, true);
      conf.writeXml(outStream);
      outStream.close();

      FileStatus topologyFileStatus = fs.getFileStatus(cfgDst);
      LocalResource topologyRsrc = Records.newRecord(LocalResource.class);
      topologyRsrc.setType(LocalResourceType.FILE);
      topologyRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
      topologyRsrc.setResource(ConverterUtils.getYarnUrlFromURI(cfgDst.toUri()));
      topologyRsrc.setTimestamp(topologyFileStatus.getModificationTime());
      topologyRsrc.setSize(topologyFileStatus.getLen());
      localResources.put(LogicalPlan.SER_FILE_NAME, topologyRsrc);

      // Set local resource info into app master container launch context
      amContainer.setLocalResources(localResources);

      // Set the necessary security tokens as needed
      // amContainer.setContainerTokens(containerToken);
      // Set the env variables to be setup in the env where the application master will be run
      LOG.info("Set the environment for the application master");
      Map<String, String> env = new HashMap<String, String>();

      // Add application jar(s) location to classpath
      // At some point we should not be required to add
      // the hadoop specific classpaths to the env.
      // It should be provided out of the box.
      // For now setting all required classpaths including
      // the classpath to "." for the application jar(s)
      // including ${CLASSPATH} will duplicate the class path in app master, removing it for now
      // StringBuilder classPathEnv = new StringBuilder("${CLASSPATH}:./*");
      StringBuilder classPathEnv = new StringBuilder("./*");
      String classpath = conf.get(YarnConfiguration.YARN_APPLICATION_CLASSPATH);
      for (String c :
          StringUtils.isBlank(classpath)
              ? YarnConfiguration.DEFAULT_YARN_APPLICATION_CLASSPATH
              : classpath.split(",")) {
        if (c.equals("$HADOOP_CLIENT_CONF_DIR")) {
          // SPOI-2501
          continue;
        }
        classPathEnv.append(':');
        classPathEnv.append(c.trim());
      }
      env.put("CLASSPATH", classPathEnv.toString());
      // propagate to replace node managers user name (effective in non-secure mode)
      env.put("HADOOP_USER_NAME", UserGroupInformation.getLoginUser().getUserName());

      amContainer.setEnvironment(env);

      // Set the necessary command to execute the application master
      ArrayList<CharSequence> vargs = new ArrayList<CharSequence>(30);

      // Set java executable command
      LOG.info("Setting up app master command");
      vargs.add(javaCmd);
      if (dag.isDebug()) {
        vargs.add("-agentlib:jdwp=transport=dt_socket,server=y,suspend=n");
      }
      // Set Xmx based on am memory size
      // default heap size 75% of total memory
      vargs.add("-Xmx" + (amMemory * 3 / 4) + "m");
      vargs.add("-XX:+HeapDumpOnOutOfMemoryError");
      vargs.add("-XX:HeapDumpPath=/tmp/dt-heap-" + appId.getId() + ".bin");
      vargs.add("-Dhadoop.root.logger=" + (dag.isDebug() ? "DEBUG" : "INFO") + ",RFA");
      vargs.add("-Dhadoop.log.dir=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR);
      vargs.add(String.format("-D%s=%s", StreamingContainer.PROP_APP_PATH, dag.assertAppPath()));
      if (dag.isDebug()) {
        vargs.add("-Dlog4j.debug=true");
      }

      String loggersLevel = conf.get(DTLoggerFactory.DT_LOGGERS_LEVEL);
      if (loggersLevel != null) {
        vargs.add(String.format("-D%s=%s", DTLoggerFactory.DT_LOGGERS_LEVEL, loggersLevel));
      }
      vargs.add(StreamingAppMaster.class.getName());
      vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stdout");
      vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/AppMaster.stderr");

      // Get final command
      StringBuilder command = new StringBuilder(9 * vargs.size());
      for (CharSequence str : vargs) {
        command.append(str).append(" ");
      }

      LOG.info("Completed setting up app master command " + command.toString());
      List<String> commands = new ArrayList<String>();
      commands.add(command.toString());
      amContainer.setCommands(commands);

      // Set up resource type requirements
      // For now, only memory is supported so we set memory requirements
      Resource capability = Records.newRecord(Resource.class);
      capability.setMemory(amMemory);
      appContext.setResource(capability);

      // Service data is a binary blob that can be passed to the application
      // Not needed in this scenario
      // amContainer.setServiceData(serviceData);
      appContext.setAMContainerSpec(amContainer);

      // Set the priority for the application master
      Priority pri = Records.newRecord(Priority.class);
      pri.setPriority(amPriority);
      appContext.setPriority(pri);
      // Set the queue to which this application is to be submitted in the RM
      appContext.setQueue(queueName);

      // Submit the application to the applications manager
      // SubmitApplicationResponse submitResp = rmClient.submitApplication(appRequest);
      // Ignore the response as either a valid response object is returned on success
      // or an exception thrown to denote some form of a failure
      String specStr =
          Objects.toStringHelper("Submitting application: ")
              .add("name", appContext.getApplicationName())
              .add("queue", appContext.getQueue())
              .add("user", UserGroupInformation.getLoginUser())
              .add("resource", appContext.getResource())
              .toString();
      LOG.info(specStr);
      if (dag.isDebug()) {
        // LOG.info("Full submission context: " + appContext);
      }
      yarnClient.submitApplication(appContext);
    } finally {
      fs.close();
    }
  }
Beispiel #7
0
  /**
   * Initialize DFSCopyFileMapper specific job-configuration.
   *
   * @param conf : The dfs/mapred configuration.
   * @param jobConf : The handle to the jobConf object to be initialized.
   * @param args Arguments
   */
  private static void setup(Configuration conf, JobConf jobConf, final Arguments args)
      throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());

    // set boolean values
    final boolean update = args.flags.contains(Options.UPDATE);
    final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
    jobConf.setBoolean(Options.UPDATE.propertyname, update);
    jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
    jobConf.setBoolean(
        Options.IGNORE_READ_FAILURES.propertyname,
        args.flags.contains(Options.IGNORE_READ_FAILURES));
    jobConf.setBoolean(
        Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    FileSystem dstfs = args.dst.getFileSystem(conf);
    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
      dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
      String filename = "_distcp_logs_" + randomId;
      if (!dstExists || !dstIsDir) {
        Path parent = args.dst.getParent();
        if (!dstfs.exists(parent)) {
          dstfs.mkdirs(parent);
        }
        logPath = new Path(parent, filename);
      } else {
        logPath = new Path(args.dst, filename);
      }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer =
        SequenceFile.createWriter(
            jobfs,
            jobConf,
            srcfilelist,
            LongWritable.class,
            FilePair.class,
            SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
    SequenceFile.Writer dst_writer =
        SequenceFile.createWriter(
            jobfs, jobConf, dstfilelist, Text.class, Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer =
        SequenceFile.createWriter(
            jobfs,
            jobConf,
            dstdirlist,
            Text.class,
            FilePair.class,
            SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory OR we're updating/overwriting
    // the contents of the destination directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite;
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
      for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext(); ) {
        final Path src = srcItr.next();
        FileSystem srcfs = src.getFileSystem(conf);
        FileStatus srcfilestat = srcfs.getFileStatus(src);
        Path root = special && srcfilestat.isDir() ? src : src.getParent();
        if (srcfilestat.isDir()) {
          ++srcCount;
        }

        Stack<FileStatus> pathstack = new Stack<FileStatus>();
        for (pathstack.push(srcfilestat); !pathstack.empty(); ) {
          FileStatus cur = pathstack.pop();
          FileStatus[] children = srcfs.listStatus(cur.getPath());
          for (int i = 0; i < children.length; i++) {
            boolean skipfile = false;
            final FileStatus child = children[i];
            final String dst = makeRelative(root, child.getPath());
            ++srcCount;

            if (child.isDir()) {
              pathstack.push(child);
            } else {
              // skip file if the src and the dst files are the same.
              skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst));
              // skip file if it exceed file limit or size limit
              skipfile |=
                  fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit;

              if (!skipfile) {
                ++fileCount;
                byteCount += child.getLen();

                if (LOG.isTraceEnabled()) {
                  LOG.trace("adding file " + child.getPath());
                }

                ++cnsyncf;
                cbsyncs += child.getLen();
                if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) {
                  src_writer.sync();
                  dst_writer.sync();
                  cnsyncf = 0;
                  cbsyncs = 0L;
                }
              }
            }

            if (!skipfile) {
              src_writer.append(
                  new LongWritable(child.isDir() ? 0 : child.getLen()), new FilePair(child, dst));
            }

            dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
          }

          if (cur.isDir()) {
            String dst = makeRelative(root, cur.getPath());
            dir_writer.append(new Text(dst), new FilePair(cur, dst));
            if (++dirsyn > SYNC_FILE_MAX) {
              dirsyn = 0;
              dir_writer.sync();
            }
          }
        }
      }
    } finally {
      checkAndClose(src_writer);
      checkAndClose(dst_writer);
      checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
      dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
      LOG.info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
      if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
        throw new IOException("Failed to create" + args.dst);
      }
    }

    final Path sorted = new Path(jobDirectory, "_distcp_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    if (dststatus != null && args.flags.contains(Options.DELETE)) {
      deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf);
    }

    Path tmpDir =
        new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1)
                ? args.dst.getParent()
                : args.dst,
            "_distcp_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());
    LOG.info("srcCount=" + srcCount);
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(byteCount, jobConf);
  }
Beispiel #8
0
    /**
     * Copy a file to a destination.
     *
     * @param srcstat src path and metadata
     * @param dstpath dst path
     * @param reporter
     */
    private void copy(
        FileStatus srcstat,
        Path relativedst,
        OutputCollector<WritableComparable<?>, Text> outc,
        Reporter reporter)
        throws IOException {
      Path absdst = new Path(destPath, relativedst);
      int totfiles = job.getInt(SRC_COUNT_LABEL, -1);
      assert totfiles >= 0 : "Invalid file count " + totfiles;

      // if a directory, ensure created even if empty
      if (srcstat.isDir()) {
        if (destFileSys.exists(absdst)) {
          if (!destFileSys.getFileStatus(absdst).isDir()) {
            throw new IOException("Failed to mkdirs: " + absdst + " is a file.");
          }
        } else if (!destFileSys.mkdirs(absdst)) {
          throw new IOException("Failed to mkdirs " + absdst);
        }
        // TODO: when modification times can be set, directories should be
        // emitted to reducers so they might be preserved. Also, mkdirs does
        // not currently return an error when the directory already exists;
        // if this changes, all directory work might as well be done in reduce
        return;
      }

      if (destFileSys.exists(absdst) && !overwrite && !needsUpdate(srcstat, destFileSys, absdst)) {
        outc.collect(null, new Text("SKIP: " + srcstat.getPath()));
        ++skipcount;
        reporter.incrCounter(Counter.SKIP, 1);
        updateStatus(reporter);
        return;
      }

      Path tmpfile = new Path(job.get(TMP_DIR_LABEL), relativedst);
      long cbcopied = 0L;
      FSDataInputStream in = null;
      FSDataOutputStream out = null;
      try {
        // open src file
        try {
          in = srcstat.getPath().getFileSystem(job).open(srcstat.getPath());
        } catch (IOException e) {
          LOG.error("Failed to open src file " + srcstat.getPath() + ", ignore and return");
          in = null;
          return;
        }
        reporter.incrCounter(Counter.BYTESEXPECTED, srcstat.getLen());
        // open tmp file
        out = create(tmpfile, reporter, srcstat);
        // copy file
        for (int cbread; (cbread = in.read(buffer)) >= 0; ) {
          out.write(buffer, 0, cbread);
          cbcopied += cbread;
          reporter.setStatus(
              String.format("%.2f ", cbcopied * 100.0 / srcstat.getLen())
                  + absdst
                  + " [ "
                  + StringUtils.humanReadableInt(cbcopied)
                  + " / "
                  + StringUtils.humanReadableInt(srcstat.getLen())
                  + " ]");
        }
      } finally {
        checkAndClose(in);
        checkAndClose(out);
      }

      if (cbcopied != srcstat.getLen()) {
        if (srcstat.getLen() == 0 && cbcopied > 0) {
          LOG.info("most likely see a WAL file corruption: " + srcstat.getPath());
        } else {
          throw new IOException(
              "File size not matched: copied "
                  + bytesString(cbcopied)
                  + " to tmpfile (="
                  + tmpfile
                  + ") but expected "
                  + bytesString(srcstat.getLen())
                  + " from "
                  + srcstat.getPath());
        }
      } else {
        if (totfiles == 1) {
          // Copying a single file; use dst path provided by user as destination
          // rather than destination directory, if a file
          Path dstparent = absdst.getParent();
          if (!(destFileSys.exists(dstparent) && destFileSys.getFileStatus(dstparent).isDir())) {
            absdst = dstparent;
          }
        }
        if (destFileSys.exists(absdst) && destFileSys.getFileStatus(absdst).isDir()) {
          throw new IOException(absdst + " is a directory");
        }
        if (!destFileSys.mkdirs(absdst.getParent())) {
          throw new IOException("Failed to craete parent dir: " + absdst.getParent());
        }
        rename(tmpfile, absdst);

        FileStatus dststat = destFileSys.getFileStatus(absdst);
        if (dststat.getLen() != srcstat.getLen()) {
          destFileSys.delete(absdst, false);
          throw new IOException(
              "File size not matched: copied "
                  + bytesString(dststat.getLen())
                  + " to dst (="
                  + absdst
                  + ") but expected "
                  + bytesString(srcstat.getLen())
                  + " from "
                  + srcstat.getPath());
        }
        updatePermissions(srcstat, dststat);
      }

      // report at least once for each file
      ++copycount;
      reporter.incrCounter(Counter.BYTESCOPIED, cbcopied);
      reporter.incrCounter(Counter.COPY, 1);
      updateStatus(reporter);
    }
Beispiel #9
0
    private void parseMetaData() throws IOException {
      Text line = new Text();
      long read;
      FSDataInputStream in = null;
      LineReader lin = null;

      try {
        in = fs.open(masterIndexPath);
        FileStatus masterStat = fs.getFileStatus(masterIndexPath);
        masterIndexTimestamp = masterStat.getModificationTime();
        lin = new LineReader(in, getConf());
        read = lin.readLine(line);

        // the first line contains the version of the index file
        String versionLine = line.toString();
        String[] arr = versionLine.split(" ");
        version = Integer.parseInt(arr[0]);
        // make it always backwards-compatible
        if (this.version > HarFileSystem.VERSION) {
          throw new IOException(
              "Invalid version " + this.version + " expected " + HarFileSystem.VERSION);
        }

        // each line contains a hashcode range and the index file name
        String[] readStr;
        while (read < masterStat.getLen()) {
          int b = lin.readLine(line);
          read += b;
          readStr = line.toString().split(" ");
          int startHash = Integer.parseInt(readStr[0]);
          int endHash = Integer.parseInt(readStr[1]);
          stores.add(
              new Store(
                  Long.parseLong(readStr[2]), Long.parseLong(readStr[3]), startHash, endHash));
          line.clear();
        }
      } catch (IOException ioe) {
        LOG.warn("Encountered exception ", ioe);
        throw ioe;
      } finally {
        IOUtils.cleanup(LOG, lin, in);
      }

      FSDataInputStream aIn = fs.open(archiveIndexPath);
      try {
        FileStatus archiveStat = fs.getFileStatus(archiveIndexPath);
        archiveIndexTimestamp = archiveStat.getModificationTime();
        LineReader aLin;

        // now start reading the real index file
        for (Store s : stores) {
          read = 0;
          aIn.seek(s.begin);
          aLin = new LineReader(aIn, getConf());
          while (read + s.begin < s.end) {
            int tmp = aLin.readLine(line);
            read += tmp;
            String lineFeed = line.toString();
            String[] parsed = lineFeed.split(" ");
            parsed[0] = decodeFileName(parsed[0]);
            archive.put(new Path(parsed[0]), new HarStatus(lineFeed));
            line.clear();
          }
        }
      } finally {
        IOUtils.cleanup(LOG, aIn);
      }
    }
  /**
   * Compare the checksums of the hdfs file as well as the local copied file.
   *
   * @author [email protected]
   * @date Fri Jan 27 06:06:00 2012
   */
  boolean compareChecksums(FileSystem fs, Path p, String sFsPath) {
    try {
      // get hdfs file info
      FileStatus stat = fs.getFileStatus(p);

      // get HDFS checksum
      FileChecksum ck = fs.getFileChecksum(p);
      String sCk, sCkShort;
      if (ck == null) {
        sCk = sCkShort = "<null>";
      } else {
        sCk = ck.toString();
        sCkShort = sCk.replaceAll("^.*:", "");
      }

      // System.out.println(p.toUri().getPath() + " len=" + stat.getLen()
      // + " " + stat.getOwner() + "/" + stat.getGroup()
      // + " checksum=" + sCk);

      // find the local file
      File fLocal = new File(sFsPath);
      if (!fLocal.exists()) {
        System.out.println("CHECKSUM-ERROR: file does not exist: " + sFsPath);
        return false;
      }
      if (!fLocal.isFile()) {
        System.out.println("CHECKSUM-ERROR: path is not a file: " + sFsPath);
        return false;
      }
      if (stat.getLen() != fLocal.length()) {
        System.out.println(
            "CHECKSUM-ERROR: length mismatch: "
                + sFsPath
                + " hdfslen="
                + stat.getLen()
                + " fslen="
                + fLocal.length());
        return false;
      }

      // get local fs checksum
      FileChecksum ckLocal = getLocalFileChecksum(sFsPath);
      if (ckLocal == null) {
        System.out.println("ERROR Failed to get checksum for local file " + sFsPath);
        return false;
      }

      // compare checksums as a string, after stripping the
      // algorithm name from the beginning
      String sCkLocal = ckLocal.toString();
      String sCkLocalShort = sCkLocal.replaceAll("^.*:", "");

      if (false == sCkShort.equals(sCkLocalShort)) {
        System.out.println(
            "CHECKSUM-ERROR: checksum mismatch: "
                + sFsPath
                + "\nhdfs = "
                + sCk
                + "\nlocal= "
                + sCkLocal);
        return false;
      }

      return true;
    } catch (IOException e) {
      System.out.println("CHECKSUM-ERROR: " + sFsPath + " exception " + e.toString());
    }

    return false;
  }