Example #1
  /**
   * Sets up the input file which holds the list of input files.
   *
   * @return true if at least one file was added to the operation list
   * @throws IOException
   */
  private boolean setup() throws IOException {
    estimateSavings();

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobconf);
    Path jobdir = new Path(jClient.getSystemDir(), NAME + "_" + randomId);

    LOG.info(JOB_DIR_LABEL + "=" + jobdir);
    jobconf.set(JOB_DIR_LABEL, jobdir.toString());
    Path log = new Path(jobdir, "_logs");

    // The control file should have small size blocks. This helps
    // in spreading out the load from mappers that will be spawned.
    jobconf.setInt("dfs.blocks.size", OP_LIST_BLOCK_SIZE);

    FileOutputFormat.setOutputPath(jobconf, log);
    LOG.info("log=" + log);

    // create operation list
    FileSystem fs = jobdir.getFileSystem(jobconf);
    Path opList = new Path(jobdir, "_" + OP_LIST_LABEL);
    jobconf.set(OP_LIST_LABEL, opList.toString());
    int opCount = 0, synCount = 0;
    SequenceFile.Writer opWriter = null;

    try {
      opWriter =
          SequenceFile.createWriter(
              fs, jobconf, opList, Text.class, PolicyInfo.class, SequenceFile.CompressionType.NONE);
      for (RaidPolicyPathPair p : raidPolicyPathPairList) {
        // If a large set of files is raided for the first time, files in the
        // same directory, which tend to have the same size, would end up in the
        // same map. This shuffle mixes things up, allowing a better mix of
        // files across maps.
        java.util.Collections.shuffle(p.srcPaths);
        for (FileStatus st : p.srcPaths) {
          opWriter.append(new Text(st.getPath().toString()), p.policy);
          opCount++;
          if (++synCount > SYNC_FILE_MAX) {
            opWriter.sync();
            synCount = 0;
          }
        }
      }

    } finally {
      if (opWriter != null) {
        opWriter.close();
      }
      fs.setReplication(opList, OP_LIST_REPLICATION); // increase replication for control file
    }
    raidPolicyPathPairList.clear();

    jobconf.setInt(OP_COUNT_LABEL, opCount);
    LOG.info("Number of files=" + opCount);
    jobconf.setNumMapTasks(
        getMapCount(opCount, new JobClient(jobconf).getClusterStatus().getTaskTrackers()));
    LOG.info("jobName= " + jobName + " numMapTasks=" + jobconf.getNumMapTasks());
    return opCount != 0;
  }
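
The core pattern in setup() above is to open a SequenceFile.Writer for the operation list, append one record per source file, and call sync() every few records so the control file can later be split across mappers. Below is a minimal sketch of that pattern; the path, key/value types, record source, and sync interval are placeholders rather than the constants used above.

  Configuration conf = new Configuration();
  Path opList = new Path("/tmp/example/_op_list");          // placeholder path
  FileSystem fs = opList.getFileSystem(conf);
  SequenceFile.Writer writer = null;
  try {
    writer =
        SequenceFile.createWriter(
            fs, conf, opList, Text.class, Text.class, SequenceFile.CompressionType.NONE);
    int sinceLastSync = 0;
    for (int i = 0; i < 100; i++) {                         // stand-in for the real record source
      writer.append(new Text("key-" + i), new Text("value-" + i));
      if (++sinceLastSync > 10) {                           // placeholder interval (SYNC_FILE_MAX above)
        writer.sync();                                      // record boundary a split can start at
        sinceLastSync = 0;
      }
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }
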
  /** Creates the input file (containing the names of the files to be fixed). */
  private List<String> createInputFile(
      String jobName, Path inDir, Map<String, Integer> corruptFilePriority, int priority)
      throws IOException {
    Path file = new Path(inDir, jobName + IN_FILE_SUFFIX);
    FileSystem fs = file.getFileSystem(getConf());
    SequenceFile.Writer fileOut =
        SequenceFile.createWriter(fs, getConf(), file, LongWritable.class, Text.class);
    long index = 0L;

    List<String> filesAdded = new ArrayList<String>();
    int count = 0;
    final long max = filesPerTask * BLOCKFIX_TASKS_PER_JOB;
    for (Map.Entry<String, Integer> entry : corruptFilePriority.entrySet()) {
      if (entry.getValue() != priority) {
        continue;
      }
      if (count >= max) {
        break;
      }
      String corruptFileName = entry.getKey();
      fileOut.append(new LongWritable(index++), new Text(corruptFileName));
      filesAdded.add(corruptFileName);
      count++;

      if (index % filesPerTask == 0) {
        fileOut.sync(); // create sync point to make sure we can split here
      }
    }

    fileOut.close();
    return filesAdded;
  }
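
The sync() call every filesPerTask records above is what makes the input file splittable: a reader handed an arbitrary byte offset can seek forward to the next sync mark and start reading whole records from there. A sketch of that read side follows; the path and offset are placeholders, and in a real InputFormat the loop would also stop once reader.getPosition() passes the end of the split.

  Configuration conf = new Configuration();
  Path file = new Path("/tmp/example/in_file");             // placeholder path
  FileSystem fs = file.getFileSystem(conf);
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
  try {
    long splitStart = 4096L;                                // placeholder split offset
    reader.sync(splitStart);                                // jump to the next sync mark
    LongWritable key = new LongWritable();
    Text value = new Text();
    while (reader.next(key, value)) {
      // process one (index, fileName) record from this split onward
    }
  } finally {
    reader.close();
  }
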
  private void writeToFileListing(
      SequenceFile.Writer fileListWriter,
      FileStatus fileStatus,
      Path sourcePathRoot,
      boolean localFile)
      throws IOException {
    if (fileStatus.getPath().equals(sourcePathRoot) && fileStatus.isDir())
      return; // Skip the root-paths.

    if (LOG.isDebugEnabled()) {
      LOG.debug(
          "REL PATH: "
              + DistCpUtils.getRelativePath(sourcePathRoot, fileStatus.getPath())
              + ", FULL PATH: "
              + fileStatus.getPath());
    }

    FileStatus status = fileStatus;
    if (localFile) {
      status = getFileStatus(fileStatus);
    }

    fileListWriter.append(
        new Text(DistCpUtils.getRelativePath(sourcePathRoot, fileStatus.getPath())), status);
    fileListWriter.sync();

    if (!fileStatus.isDir()) {
      totalBytesToCopy += fileStatus.getLen();
    }
    totalPaths++;
  }
Example #4
  @Test
  public void testReadString() throws Exception {
    if (SKIP) {
      return;
    }

    //        final Path file = new Path("hdfs://localhost:9000/tmp/test/test-hdfs-file");
    final Path file =
        new Path(new File("../../../../target/test/test-camel-string").getAbsolutePath());
    org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
    // Now set the classes for the filesystems. This is normally done using
    // java.util.ServiceLoader, which doesn't work inside OSGi.
    conf.setClass("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class, FileSystem.class);
    conf.setClass(
        "fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class, FileSystem.class);
    SequenceFile.Writer writer =
        SequenceFile.createWriter(
            conf,
            SequenceFile.Writer.file(file),
            SequenceFile.Writer.keyClass(NullWritable.class),
            SequenceFile.Writer.valueClass(Text.class));
    NullWritable keyWritable = NullWritable.get();
    Text valueWritable = new Text();
    String value = "CIAO!";
    valueWritable.set(value);
    writer.append(keyWritable, valueWritable);
    writer.sync();
    writer.close();

    context.addRoutes(
        new RouteBuilder() {
          public void configure() {
            //
            // from("hdfs2://localhost:9000/tmp/test/test-hdfs-file?fileSystemType=HDFS&fileType=SEQUENCE_FILE&initialDelay=0").to("mock:result");
            from("hdfs2:///"
                    + file.toUri()
                    + "?fileSystemType=LOCAL&fileType=SEQUENCE_FILE&initialDelay=0")
                .to("mock:result");
          }
        });
    context.start();

    MockEndpoint resultEndpoint = context.getEndpoint("mock:result", MockEndpoint.class);
    resultEndpoint.expectedMessageCount(1);
    resultEndpoint.assertIsSatisfied();
  }
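
As a plain-Hadoop counterpart to the Camel assertion above, the single record written by the test could also be read back directly. This is a sketch only, reusing the conf and file variables from the test and the builder-style Reader options that mirror the Writer setup; it is not part of the original test.

    SequenceFile.Reader reader =
        new SequenceFile.Reader(conf, SequenceFile.Reader.file(file));
    try {
      NullWritable key = NullWritable.get();
      Text readValue = new Text();
      while (reader.next(key, readValue)) {
        System.out.println(readValue);                      // expected output: CIAO!
      }
    } finally {
      reader.close();
    }
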
Example #5
  /**
   * Initialize DFSCopyFileMapper specific job-configuration.
   *
   * @param conf : The dfs/mapred configuration.
   * @param jobConf : The handle to the jobConf object to be initialized.
   * @param args Arguments
   */
  private static void setup(Configuration conf, JobConf jobConf, final Arguments args)
      throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());

    // set boolean values
    final boolean update = args.flags.contains(Options.UPDATE);
    final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
    jobConf.setBoolean(Options.UPDATE.propertyname, update);
    jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
    jobConf.setBoolean(
        Options.IGNORE_READ_FAILURES.propertyname,
        args.flags.contains(Options.IGNORE_READ_FAILURES));
    jobConf.setBoolean(
        Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    FileSystem dstfs = args.dst.getFileSystem(conf);
    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
      dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
      String filename = "_distcp_logs_" + randomId;
      if (!dstExists || !dstIsDir) {
        Path parent = args.dst.getParent();
        if (!dstfs.exists(parent)) {
          dstfs.mkdirs(parent);
        }
        logPath = new Path(parent, filename);
      } else {
        logPath = new Path(args.dst, filename);
      }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer =
        SequenceFile.createWriter(
            jobfs,
            jobConf,
            srcfilelist,
            LongWritable.class,
            FilePair.class,
            SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
    SequenceFile.Writer dst_writer =
        SequenceFile.createWriter(
            jobfs, jobConf, dstfilelist, Text.class, Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer =
        SequenceFile.createWriter(
            jobfs,
            jobConf,
            dstdirlist,
            Text.class,
            FilePair.class,
            SequenceFile.CompressionType.NONE);

    // Handle the case where the destination directory doesn't exist
    // and we have only a single src directory, OR we're updating/overwriting
    // the contents of the destination directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite;
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
      for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext(); ) {
        final Path src = srcItr.next();
        FileSystem srcfs = src.getFileSystem(conf);
        FileStatus srcfilestat = srcfs.getFileStatus(src);
        Path root = special && srcfilestat.isDir() ? src : src.getParent();
        if (srcfilestat.isDir()) {
          ++srcCount;
        }

        Stack<FileStatus> pathstack = new Stack<FileStatus>();
        for (pathstack.push(srcfilestat); !pathstack.empty(); ) {
          FileStatus cur = pathstack.pop();
          FileStatus[] children = srcfs.listStatus(cur.getPath());
          for (int i = 0; i < children.length; i++) {
            boolean skipfile = false;
            final FileStatus child = children[i];
            final String dst = makeRelative(root, child.getPath());
            ++srcCount;

            if (child.isDir()) {
              pathstack.push(child);
            } else {
              // skip file if the src and the dst files are the same.
              skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst));
              // skip file if it exceeds the file limit or the size limit
              skipfile |=
                  fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit;

              if (!skipfile) {
                ++fileCount;
                byteCount += child.getLen();

                if (LOG.isTraceEnabled()) {
                  LOG.trace("adding file " + child.getPath());
                }

                ++cnsyncf;
                cbsyncs += child.getLen();
                if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) {
                  src_writer.sync();
                  dst_writer.sync();
                  cnsyncf = 0;
                  cbsyncs = 0L;
                }
              }
            }

            if (!skipfile) {
              src_writer.append(
                  new LongWritable(child.isDir() ? 0 : child.getLen()), new FilePair(child, dst));
            }

            dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
          }

          if (cur.isDir()) {
            String dst = makeRelative(root, cur.getPath());
            dir_writer.append(new Text(dst), new FilePair(cur, dst));
            if (++dirsyn > SYNC_FILE_MAX) {
              dirsyn = 0;
              dir_writer.sync();
            }
          }
        }
      }
    } finally {
      checkAndClose(src_writer);
      checkAndClose(dst_writer);
      checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
      dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
      LOG.info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
      if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
        throw new IOException("Failed to create" + args.dst);
      }
    }

    final Path sorted = new Path(jobDirectory, "_distcp_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    if (dststatus != null && args.flags.contains(Options.DELETE)) {
      deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf);
    }

    Path tmpDir =
        new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1)
                ? args.dst.getParent()
                : args.dst,
            "_distcp_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());
    LOG.info("srcCount=" + srcCount);
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(byteCount, jobConf);
  }
  public int run(String[] argv) throws Exception {
    JobConf job = new JobConf(getConf());
    job.setJarByClass(GenericMRLoadGenerator.class);
    job.setMapperClass(SampleMapper.class);
    job.setReducerClass(SampleReducer.class);
    if (!parseArgs(argv, job)) {
      return -1;
    }

    if (null == FileOutputFormat.getOutputPath(job)) {
      // No output dir? No writes
      job.setOutputFormat(NullOutputFormat.class);
    }

    if (0 == FileInputFormat.getInputPaths(job).length) {
      // No input dir? Generate random data
      System.err.println("No input path; ignoring InputFormat");
      confRandom(job);
    } else if (null
        != job.getClass(
            org.apache.hadoop.mapreduce.GenericMRLoadGenerator.INDIRECT_INPUT_FORMAT, null)) {
      // specified IndirectInputFormat? Build src list
      JobClient jClient = new JobClient(job);
      Path tmpDir = new Path(jClient.getFs().getHomeDirectory(), ".staging");
      Random r = new Random();
      Path indirInputFile =
          new Path(tmpDir, Integer.toString(r.nextInt(Integer.MAX_VALUE), 36) + "_files");
      job.set(
          org.apache.hadoop.mapreduce.GenericMRLoadGenerator.INDIRECT_INPUT_FILE,
          indirInputFile.toString());
      SequenceFile.Writer writer =
          SequenceFile.createWriter(
              tmpDir.getFileSystem(job),
              job,
              indirInputFile,
              LongWritable.class,
              Text.class,
              SequenceFile.CompressionType.NONE);
      try {
        for (Path p : FileInputFormat.getInputPaths(job)) {
          FileSystem fs = p.getFileSystem(job);
          Stack<Path> pathstack = new Stack<Path>();
          pathstack.push(p);
          while (!pathstack.empty()) {
            for (FileStatus stat : fs.listStatus(pathstack.pop())) {
              if (stat.isDirectory()) {
                if (!stat.getPath().getName().startsWith("_")) {
                  pathstack.push(stat.getPath());
                }
              } else {
                writer.sync();
                writer.append(
                    new LongWritable(stat.getLen()), new Text(stat.getPath().toUri().toString()));
              }
            }
          }
        }
      } finally {
        writer.close();
      }
    }

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(job);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println(
        "The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

    return 0;
  }