public JobBuilder compressType(CompressionType type) throws IOException {
   if (type == CompressionType.NONE) {
     _jobConf.setBoolean("mapred.output.compress", false);
   } else {
     _jobConf.setBoolean("mapred.output.compress", true);
   }
   _jobConf.set("mapred.output.compression.type", type.toString());
   return this;
 }
 private JobConf createJobConf() {
   JobConf job = new NutchJob(getConf());
   job.setBoolean("segment.reader.co", this.co);
   job.setBoolean("segment.reader.fe", this.fe);
   job.setBoolean("segment.reader.ge", this.ge);
   job.setBoolean("segment.reader.pa", this.pa);
   job.setBoolean("segment.reader.pd", this.pd);
   job.setBoolean("segment.reader.pt", this.pt);
   return job;
 }
 public JobBuilder compressor(CompressionType type, Class<? extends CompressionCodec> codec)
     throws IOException {
   _jobConf.setBoolean("mapred.output.compress", true);
   _jobConf.set("mapred.output.compression.type", type.toString());
   _jobConf.setClass("mapred.output.compression.codec", codec, CompressionCodec.class);
   return this;
 }
  /**
   * {@inheritDoc}
   *
   * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
   */
  @Override
  public int run(String[] args) throws Exception {
    JobConf configuration = new JobConf(getConf(), WordCountExtended.class);
    configuration.setJobName(JOB_NAME);

    configuration.setOutputKeyClass(Text.class);
    configuration.setOutputValueClass(IntWritable.class);

    configuration.setMapperClass(Map.class);
    configuration.setCombinerClass(Reduce.class);
    configuration.setReducerClass(Reduce.class);

    configuration.setInputFormat(TextInputFormat.class);
    configuration.setOutputFormat(TextOutputFormat.class);

    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      if (JOB_SKIP_ARGUMENT.equals(args[i])) {
        DistributedCache.addCacheFile(new Path(args[++i]).toUri(), configuration);
        configuration.setBoolean(JOB_PARAMETER_SKIP_PATTERNS, true);
      } else {
        otherArgs.add(args[i]);
      }
    }

    FileInputFormat.setInputPaths(configuration, new Path(otherArgs.get(0)));
    FileOutputFormat.setOutputPath(configuration, new Path(otherArgs.get(1)));

    JobClient.runJob(configuration);
    return 0;
  }
 @Test
 public void testEncryptedMerger() throws Throwable {
   jobConf.setBoolean(MRJobConfig.MR_ENCRYPTED_INTERMEDIATE_DATA, true);
   conf.setBoolean(MRJobConfig.MR_ENCRYPTED_INTERMEDIATE_DATA, true);
   Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
   TokenCache.setShuffleSecretKey(new byte[16], credentials);
   UserGroupInformation.getCurrentUser().addCredentials(credentials);
   testInMemoryAndOnDiskMerger();
 }
Example #6
0
  /**
   * Creates and initializes a JobConf object that can be used to execute the DAG. The configuration
   * object will contain configurations from mapred-site overlaid with key/value pairs from the
   * hiveConf object. Finally it will also contain some hive specific configurations that do not
   * change from DAG to DAG.
   *
   * @param hiveConf Current hiveConf for the execution
   * @return JobConf base configuration for job execution
   * @throws IOException
   */
  public JobConf createConfiguration(HiveConf hiveConf) throws IOException {
    hiveConf.setBoolean("mapred.mapper.new-api", false);

    JobConf conf = (JobConf) MRHelpers.getBaseMRConfiguration(hiveConf);

    conf.set("mapred.output.committer.class", NullOutputCommitter.class.getName());

    conf.setBoolean("mapred.committer.job.setup.cleanup.needed", false);
    conf.setBoolean("mapred.committer.job.task.cleanup.needed", false);

    conf.setClass("mapred.output.format.class", HiveOutputFormatImpl.class, OutputFormat.class);

    conf.set(MRJobConfig.OUTPUT_KEY_CLASS, HiveKey.class.getName());
    conf.set(MRJobConfig.OUTPUT_VALUE_CLASS, BytesWritable.class.getName());

    conf.set("mapred.partitioner.class", HiveConf.getVar(conf, HiveConf.ConfVars.HIVEPARTITIONER));
    conf.set("tez.runtime.partitioner.class", MRPartitioner.class.getName());

    return conf;
  }
  /**
   * Test if {@link CompressionEmulationUtil#configureCompressionEmulation(
   * org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapred.JobConf)} can extract compression
   * related configuration parameters.
   */
  @Test
  public void testExtractCompressionConfigs() {
    JobConf source = new JobConf();
    JobConf target = new JobConf();

    // set the default values
    source.setBoolean(FileOutputFormat.COMPRESS, false);
    source.set(FileOutputFormat.COMPRESS_CODEC, "MyDefaultCodec");
    source.set(FileOutputFormat.COMPRESS_TYPE, "MyDefaultType");
    source.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, false);
    source.set(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, "MyDefaultCodec2");

    CompressionEmulationUtil.configureCompressionEmulation(source, target);

    // check default values
    assertFalse(target.getBoolean(FileOutputFormat.COMPRESS, true));
    assertEquals("MyDefaultCodec", target.get(FileOutputFormat.COMPRESS_CODEC));
    assertEquals("MyDefaultType", target.get(FileOutputFormat.COMPRESS_TYPE));
    assertFalse(target.getBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true));
    assertEquals("MyDefaultCodec2", target.get(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC));
    assertFalse(CompressionEmulationUtil.isInputCompressionEmulationEnabled(target));

    // set new values
    source.setBoolean(FileOutputFormat.COMPRESS, true);
    source.set(FileOutputFormat.COMPRESS_CODEC, "MyCodec");
    source.set(FileOutputFormat.COMPRESS_TYPE, "MyType");
    source.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
    source.set(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, "MyCodec2");
    org.apache.hadoop.mapred.FileInputFormat.setInputPaths(source, "file.gz");

    target = new JobConf(); // reset
    CompressionEmulationUtil.configureCompressionEmulation(source, target);

    // check new values
    assertTrue(target.getBoolean(FileOutputFormat.COMPRESS, false));
    assertEquals("MyCodec", target.get(FileOutputFormat.COMPRESS_CODEC));
    assertEquals("MyType", target.get(FileOutputFormat.COMPRESS_TYPE));
    assertTrue(target.getBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, false));
    assertEquals("MyCodec2", target.get(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC));
    assertTrue(CompressionEmulationUtil.isInputCompressionEmulationEnabled(target));
  }
Example #8
0
  @Override
  public int run(String[] args) throws Exception {
    final int ret = parseArgs(args);
    if (ret < 0) {
      return ret;
    }

    JobConf config = new JobConf(getConf(), TfIdfNovelty.class);
    config.setJobName("Influence-TfIdfNovelty");

    config.set(Fields.BASIS.get(), basisPath);
    if (datesPath != null) {
      config.set(Fields.DOC_DATES.get(), datesPath);
    }
    config.setBoolean(Fields.IGNORE.get(), ignoreDocs);
    if (bands > 0) {
      config.setInt(Fields.BANDS.get(), bands);
    }
    if (rows > 0) {
      config.setInt(Fields.ROWS.get(), rows);
    }

    SetupHelper.getInstance()
        .setSequenceInput(config, inputPath)
        .setSequenceOutput(config, outputPath);

    config.setMapOutputKeyClass(HashBandWritable.class);
    config.setMapOutputValueClass(DocumentWithVectorWritable.class);
    config.setMapperClass(TfIdfNoveltyLshMapper.class);

    if (outputBuckets) {
      config.setOutputKeyClass(HashBandWritable.class);
      config.setOutputValueClass(IntArrayWritable.class);
      config.setReducerClass(TfIdfNoveltyIdentityReducer.class);
    } else {
      config.setOutputKeyClass(Text.class);
      config.setOutputValueClass(VectorWritable.class);
      config.setReducerClass(TfIdfNoveltyReducer.class);
    }

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    JobClient.runJob(config);

    return 0;
  }
  public static void seekTest(FileSystem fs, boolean fastCheck) throws Exception {

    fs.delete(READ_DIR, true);

    JobConf job = new JobConf(conf, TestFileSystem.class);
    job.setBoolean("fs.test.fastCheck", fastCheck);

    FileInputFormat.setInputPaths(job, CONTROL_DIR);
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(SeekMapper.class);
    job.setReducerClass(LongSumReducer.class);

    FileOutputFormat.setOutputPath(job, READ_DIR);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setNumReduceTasks(1);
    JobClient.runJob(job);
  }
 public void testTotalOrderBinarySearch() throws Exception {
   TotalOrderPartitioner<Text, NullWritable> partitioner =
       new TotalOrderPartitioner<Text, NullWritable>();
   JobConf job = new JobConf();
   Path p =
       TestTotalOrderPartitioner.<Text>writePartitionFile(
           "totalorderbinarysearch", job, splitStrings);
   job.setBoolean("total.order.partitioner.natural.order", false);
   job.setMapOutputKeyClass(Text.class);
   try {
     partitioner.configure(job);
     NullWritable nw = NullWritable.get();
     for (Check<Text> chk : testStrings) {
       assertEquals(
           chk.data.toString(),
           chk.part,
           partitioner.getPartition(chk.data, nw, splitStrings.length + 1));
     }
   } finally {
     p.getFileSystem(job).delete(p);
   }
 }
 public void testTotalOrderCustomComparator() throws Exception {
   TotalOrderPartitioner<Text, NullWritable> partitioner =
       new TotalOrderPartitioner<Text, NullWritable>();
   JobConf job = new JobConf();
   Text[] revSplitStrings = Arrays.copyOf(splitStrings, splitStrings.length);
   Arrays.sort(revSplitStrings, new ReverseStringComparator());
   Path p =
       TestTotalOrderPartitioner.<Text>writePartitionFile(
           "totalordercustomcomparator", job, revSplitStrings);
   job.setBoolean("total.order.partitioner.natural.order", false);
   job.setMapOutputKeyClass(Text.class);
   job.setOutputKeyComparatorClass(ReverseStringComparator.class);
   ArrayList<Check<Text>> revCheck = new ArrayList<Check<Text>>();
   revCheck.add(new Check<Text>(new Text("aaaaa"), 9));
   revCheck.add(new Check<Text>(new Text("aaabb"), 9));
   revCheck.add(new Check<Text>(new Text("aabbb"), 9));
   revCheck.add(new Check<Text>(new Text("aaaaa"), 9));
   revCheck.add(new Check<Text>(new Text("babbb"), 8));
   revCheck.add(new Check<Text>(new Text("baabb"), 8));
   revCheck.add(new Check<Text>(new Text("yai"), 1));
   revCheck.add(new Check<Text>(new Text("yak"), 1));
   revCheck.add(new Check<Text>(new Text("z"), 0));
   revCheck.add(new Check<Text>(new Text("ddngo"), 4));
   revCheck.add(new Check<Text>(new Text("hi"), 3));
   try {
     partitioner.configure(job);
     NullWritable nw = NullWritable.get();
     for (Check<Text> chk : revCheck) {
       assertEquals(
           chk.data.toString(),
           chk.part,
           partitioner.getPartition(chk.data, nw, splitStrings.length + 1));
     }
   } finally {
     p.getFileSystem(job).delete(p);
   }
 }
  /**
   * Test {@link CompressionEmulationUtil#getPossiblyDecompressedInputStream(Path, Configuration,
   * long)} and {@link CompressionEmulationUtil#getPossiblyCompressedOutputStream(Path,
   * Configuration)}.
   */
  @Test
  public void testPossiblyCompressedDecompressedStreams() throws IOException {
    JobConf conf = new JobConf();
    FileSystem lfs = FileSystem.getLocal(conf);
    String inputLine = "Hi Hello!";

    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
    CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);
    conf.setBoolean(FileOutputFormat.COMPRESS, true);
    conf.setClass(FileOutputFormat.COMPRESS_CODEC, GzipCodec.class, CompressionCodec.class);

    // define the test's root temp directory
    Path rootTempDir =
        new Path(System.getProperty("test.build.data", "/tmp"))
            .makeQualified(lfs.getUri(), lfs.getWorkingDirectory());

    Path tempDir = new Path(rootTempDir, "TestPossiblyCompressedDecompressedStreams");
    lfs.delete(tempDir, true);

    // create a compressed file
    Path compressedFile = new Path(tempDir, "test");
    OutputStream out =
        CompressionEmulationUtil.getPossiblyCompressedOutputStream(compressedFile, conf);
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));
    writer.write(inputLine);
    writer.close();

    // now read back the data from the compressed stream
    compressedFile = compressedFile.suffix(".gz");
    InputStream in =
        CompressionEmulationUtil.getPossiblyDecompressedInputStream(compressedFile, conf, 0);
    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
    String readLine = reader.readLine();
    assertEquals("Compression/Decompression error", inputLine, readLine);
    reader.close();
  }
Example #13
0
  /**
   * Initialize DFSCopyFileMapper specific job-configuration.
   *
   * @param conf : The dfs/mapred configuration.
   * @param jobConf : The handle to the jobConf object to be initialized.
   * @param args Arguments
   */
  private static void setup(Configuration conf, JobConf jobConf, final Arguments args)
      throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());

    // set boolean values
    final boolean update = args.flags.contains(Options.UPDATE);
    final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
    jobConf.setBoolean(Options.UPDATE.propertyname, update);
    jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
    jobConf.setBoolean(
        Options.IGNORE_READ_FAILURES.propertyname,
        args.flags.contains(Options.IGNORE_READ_FAILURES));
    jobConf.setBoolean(
        Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    FileSystem dstfs = args.dst.getFileSystem(conf);
    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
      dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
      String filename = "_distcp_logs_" + randomId;
      if (!dstExists || !dstIsDir) {
        Path parent = args.dst.getParent();
        if (!dstfs.exists(parent)) {
          dstfs.mkdirs(parent);
        }
        logPath = new Path(parent, filename);
      } else {
        logPath = new Path(args.dst, filename);
      }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer =
        SequenceFile.createWriter(
            jobfs,
            jobConf,
            srcfilelist,
            LongWritable.class,
            FilePair.class,
            SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
    SequenceFile.Writer dst_writer =
        SequenceFile.createWriter(
            jobfs, jobConf, dstfilelist, Text.class, Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer =
        SequenceFile.createWriter(
            jobfs,
            jobConf,
            dstdirlist,
            Text.class,
            FilePair.class,
            SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory OR we're updating/overwriting
    // the contents of the destination directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite;
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
      for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext(); ) {
        final Path src = srcItr.next();
        FileSystem srcfs = src.getFileSystem(conf);
        FileStatus srcfilestat = srcfs.getFileStatus(src);
        Path root = special && srcfilestat.isDir() ? src : src.getParent();
        if (srcfilestat.isDir()) {
          ++srcCount;
        }

        Stack<FileStatus> pathstack = new Stack<FileStatus>();
        for (pathstack.push(srcfilestat); !pathstack.empty(); ) {
          FileStatus cur = pathstack.pop();
          FileStatus[] children = srcfs.listStatus(cur.getPath());
          for (int i = 0; i < children.length; i++) {
            boolean skipfile = false;
            final FileStatus child = children[i];
            final String dst = makeRelative(root, child.getPath());
            ++srcCount;

            if (child.isDir()) {
              pathstack.push(child);
            } else {
              // skip file if the src and the dst files are the same.
              skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst));
              // skip file if it exceed file limit or size limit
              skipfile |=
                  fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit;

              if (!skipfile) {
                ++fileCount;
                byteCount += child.getLen();

                if (LOG.isTraceEnabled()) {
                  LOG.trace("adding file " + child.getPath());
                }

                ++cnsyncf;
                cbsyncs += child.getLen();
                if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) {
                  src_writer.sync();
                  dst_writer.sync();
                  cnsyncf = 0;
                  cbsyncs = 0L;
                }
              }
            }

            if (!skipfile) {
              src_writer.append(
                  new LongWritable(child.isDir() ? 0 : child.getLen()), new FilePair(child, dst));
            }

            dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
          }

          if (cur.isDir()) {
            String dst = makeRelative(root, cur.getPath());
            dir_writer.append(new Text(dst), new FilePair(cur, dst));
            if (++dirsyn > SYNC_FILE_MAX) {
              dirsyn = 0;
              dir_writer.sync();
            }
          }
        }
      }
    } finally {
      checkAndClose(src_writer);
      checkAndClose(dst_writer);
      checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
      dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
      LOG.info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
      if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
        throw new IOException("Failed to create" + args.dst);
      }
    }

    final Path sorted = new Path(jobDirectory, "_distcp_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    if (dststatus != null && args.flags.contains(Options.DELETE)) {
      deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf);
    }

    Path tmpDir =
        new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1)
                ? args.dst.getParent()
                : args.dst,
            "_distcp_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());
    LOG.info("srcCount=" + srcCount);
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(byteCount, jobConf);
  }
Example #14
0
 /**
  * Set if native hadoop libraries, if present, can be used for this job.
  *
  * @param jobConf job configuration
  * @param loadNativeLibraries can native hadoop libraries be loaded
  */
 public void setLoadNativeLibraries(JobConf jobConf, boolean loadNativeLibraries) {
   jobConf.setBoolean("hadoop.native.lib", loadNativeLibraries);
 }
Example #15
0
  boolean createJobConfAndParseArgs(String... args) throws ParseException, IOException {

    job = new JobConf(getConf(), Crush.class);

    /*
     * Turn off speculative execution because that's just wasting network io.
     */
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    /*
     * Turn off pre-emption because we don't want to kill a task after two hours of network io.
     */
    job.set("mapred.fairscheduler.preemption", "false");

    tmpDir = new Path("tmp/crush-" + UUID.randomUUID());
    outDir = new Path(tmpDir, "out");

    double threshold = 0.75;

    List<String> regexes = asList(".+");
    List<String> replacements =
        asList("crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}");
    List<String> inFormats = asList(SequenceFileInputFormat.class.getName());
    List<String> outFormats = asList(SequenceFileOutputFormat.class.getName());

    String crushTimestamp;

    Options options = buildOptions();
    CommandLine cli = new GnuParser().parse(options, args);

    if (cli.hasOption("?")) {
      BufferedReader reader =
          new BufferedReader(
              new InputStreamReader(getClass().getClassLoader().getResourceAsStream("help.txt")));

      try {
        String line;

        while (null != (line = reader.readLine())) {
          System.out.println(line);
        }
      } finally {
        reader.close();
      }

      return false;
    }

    if (cli.hasOption("verbose")) {
      console = Verbosity.VERBOSE;
    } else if (cli.hasOption("info")) {
      console = Verbosity.INFO;
    } else {
      console = Verbosity.NONE;
    }

    if (cli.hasOption("ignore-regex")) {
      ignoredFiles = Pattern.compile(cli.getOptionValue("ignore-regex")).matcher("");
    }

    excludeSingleFileDirs = !cli.hasOption("include-single-file-dirs");

    String[] nonOptions = cli.getArgs();

    if (2 == nonOptions.length) {
      /*
       * Stand alone mode accepts two arguments.
       */
      mode = Mode.STAND_ALONE;

      srcDir = new Path(nonOptions[0]);

      dest = new Path(nonOptions[1]);

      if (cli.hasOption("input-format")) {
        inFormats = asList(cli.getOptionValue("input-format"));
      }

      if (cli.hasOption("output-format")) {
        outFormats = asList(cli.getOptionValue("output-format"));
      }

      replacements = asList(dest.getName());

      crushTimestamp = Long.toString(currentTimeMillis());

    } else {
      /*
       * The previous version expected three or four arguments. The third one specified the number of tasks to use, which is an
       * integral number, just like the third argument in the new version, which is a timestamp. We tell the two apart by looking
       * at the value of the argument. A timestamp is going to be a huge, 14-digit number while the number of tasks should be much
       * smaller.
       */

      if ((args.length == 4 || args.length == 3)
          && args.length == nonOptions.length
          && args[2].length() != 14) {

        int maxTasks = Integer.parseInt(args[2]);

        if (maxTasks <= 0 || maxTasks > 4000) {
          throw new IllegalArgumentException("Tasks must be in the range [1, 4000]: " + maxTasks);
        }

        job.setInt("mapred.reduce.tasks", maxTasks);

        maxFileBlocks = Integer.MAX_VALUE;

        crushTimestamp = Long.toString(currentTimeMillis());

        srcDir = new Path(args[0]);
        dest = new Path(args[1]);

        mode = Mode.CLONE;

        if (args.length == 4) {
          if (args[3].equals("TEXT")) {
            /*
             * These are the defaults except with text input and output formats.
             */
            inFormats = asList(TextInputFormat.class.getName());
            outFormats = asList(TextOutputFormat.class.getName());

          } else if (!args[3].equals("SEQUENCE")) {
            throw new IllegalArgumentException("Type must be either TEXT or SEQUENCE: " + args[3]);
          }
        }
      } else {
        /*
         * V2 style arguments.
         */
        if (cli.hasOption("threshold")) {
          threshold = Double.parseDouble(cli.getOptionValue("threshold"));

          if (0 >= threshold
              || 1 < threshold
              || Double.isInfinite(threshold)
              || Double.isNaN(threshold)) {
            throw new IllegalArgumentException(
                "Block size threshold must be in (0, 1]: " + threshold);
          }
        }

        if (cli.hasOption("max-file-blocks")) {
          int maxFileBlocksOption = Integer.parseInt(cli.getOptionValue("max-file-blocks"));

          if (0 > maxFileBlocksOption) {
            throw new IllegalArgumentException(
                "Maximum file size in blocks must be positive: " + maxFileBlocksOption);
          }

          maxFileBlocks = maxFileBlocksOption;
        } else {
          maxFileBlocks = 8;
        }

        if (cli.hasOption("regex")) {
          regexes = asList(cli.getOptionValues("regex"));
        }

        if (cli.hasOption("replacement")) {
          replacements = asList(cli.getOptionValues("replacement"));
        }

        if (cli.hasOption("input-format")) {
          inFormats = asList(cli.getOptionValues("input-format"));
        }

        if (cli.hasOption("output-format")) {
          outFormats = asList(cli.getOptionValues("output-format"));
        }

        if (3 != nonOptions.length) {
          throw new IllegalArgumentException(
              "Could not find source directory, out directory, and job timestamp");
        }

        srcDir = new Path(nonOptions[0]);
        dest = new Path(nonOptions[1]);

        crushTimestamp = nonOptions[2];

        if (cli.hasOption("clone")) {
          mode = Mode.CLONE;
        } else {
          mode = Mode.MAP_REDUCE;
        }

        if (!crushTimestamp.matches("\\d{14}")) {
          throw new IllegalArgumentException(
              "Crush timestamp must be 14 digits yyyymmddhhMMss: " + crushTimestamp);
        }
      }

      dfsBlockSize = Long.parseLong(job.get("dfs.blocksize"));
      maxEligibleSize = (long) (dfsBlockSize * threshold);
    }

    /*
     * Add the crush specs and compression options to the configuration.
     */
    job.set("crush.timestamp", crushTimestamp);

    if (ignoredFiles != null) {
      job.set("crush.ignore-regex", ignoredFiles.pattern().pattern());
    }

    if (regexes.size() != replacements.size()
        || replacements.size() != inFormats.size()
        || inFormats.size() != outFormats.size()) {
      throw new IllegalArgumentException(
          "Must be an equal number of regex, replacement, in-format, and out-format options");
    }

    job.setInt("crush.num.specs", regexes.size());

    matchers = new ArrayList<Matcher>(regexes.size());

    for (int i = 0; i < regexes.size(); i++) {
      job.set(format("crush.%d.regex", i), regexes.get(i));

      matchers.add(Pattern.compile(regexes.get(i)).matcher("dummy"));

      job.set(format("crush.%d.regex.replacement", i), replacements.get(i));

      String inFmt = inFormats.get(i);

      if ("sequence".equals(inFmt)) {
        inFmt = SequenceFileInputFormat.class.getName();
      } else if ("text".equals(inFmt)) {
        inFmt = TextInputFormat.class.getName();
      } else {
        try {
          if (!FileInputFormat.class.isAssignableFrom(Class.forName(inFmt))) {
            throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
          }
        } catch (ClassNotFoundException e) {
          throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
        }
      }

      job.set(format("crush.%d.input.format", i), inFmt);

      String outFmt = outFormats.get(i);

      if ("sequence".equals(outFmt)) {
        outFmt = SequenceFileOutputFormat.class.getName();
      } else if ("text".equals(outFmt)) {
        outFmt = TextOutputFormat.class.getName();
      } else {
        try {
          if (!FileOutputFormat.class.isAssignableFrom(Class.forName(outFmt))) {
            throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
          }
        } catch (ClassNotFoundException e) {
          throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
        }
      }

      job.set(format("crush.%d.output.format", i), outFmt);
    }

    String codec = cli.getOptionValue("compress");

    if (null == codec) {
      codec = DefaultCodec.class.getName();
    } else if ("none".equals(codec)) {
      codec = null;
    } else if ("gzip".equals(codec)) {
      codec = GzipCodec.class.getName();
    } else {
      try {
        if (!CompressionCodec.class.isAssignableFrom(Class.forName(codec))) {
          throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
        }
      } catch (ClassNotFoundException e) {
        throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
      }
    }

    if (null == codec) {
      job.setBoolean("mapred.output.compress", false);
    } else {
      job.setBoolean("mapred.output.compress", true);
      job.set("mapred.output.compression.type", "BLOCK");
      job.set("mapred.output.compression.codec", codec);

      try {
        CompressionCodec instance = (CompressionCodec) Class.forName(codec).newInstance();
        codecExtension = instance.getDefaultExtension();
      } catch (Exception e) {
        throw new AssertionError();
      }
    }

    return true;
  }
  @Test
  public void testReduceProcessor() throws Exception {
    final String dagName = "mrdag0";
    String mapVertexName = MultiStageMRConfigUtil.getInitialMapVertexName();
    String reduceVertexName = MultiStageMRConfigUtil.getFinalReduceVertexName();
    JobConf jobConf = new JobConf(defaultConf);
    setUpJobConf(jobConf);

    MRHelpers.translateVertexConfToTez(jobConf);
    jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);

    jobConf.set(
        MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
        new Path(workDir, "localized-resources").toUri().toString());
    jobConf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);

    Path mapInput = new Path(workDir, "map0");
    MapUtils.generateInputSplit(localFs, workDir, jobConf, mapInput);

    InputSpec mapInputSpec =
        new InputSpec(
            "NullSrcVertex",
            new InputDescriptor(MRInputLegacy.class.getName())
                .setUserPayload(MRHelpers.createMRInputPayload(jobConf, null)),
            1);
    OutputSpec mapOutputSpec =
        new OutputSpec(
            "NullDestVertex",
            new OutputDescriptor(LocalOnFileSorterOutput.class.getName())
                .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
            1);
    // Run a map
    LogicalIOProcessorRuntimeTask mapTask =
        MapUtils.createLogicalTask(
            localFs,
            workDir,
            jobConf,
            0,
            mapInput,
            new TestUmbilical(),
            dagName,
            mapVertexName,
            Collections.singletonList(mapInputSpec),
            Collections.singletonList(mapOutputSpec));

    mapTask.initialize();
    mapTask.run();
    mapTask.close();

    LOG.info("Starting reduce...");

    Token<JobTokenIdentifier> shuffleToken = new Token<JobTokenIdentifier>();

    jobConf.setOutputFormat(SequenceFileOutputFormat.class);
    jobConf.set(
        MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
        new Path(workDir, "localized-resources").toUri().toString());
    FileOutputFormat.setOutputPath(jobConf, new Path(workDir, "output"));
    ProcessorDescriptor reduceProcessorDesc =
        new ProcessorDescriptor(ReduceProcessor.class.getName())
            .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf));

    InputSpec reduceInputSpec =
        new InputSpec(
            mapVertexName,
            new InputDescriptor(LocalMergedInput.class.getName())
                .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
            1);
    OutputSpec reduceOutputSpec =
        new OutputSpec(
            "NullDestinationVertex",
            new OutputDescriptor(MROutputLegacy.class.getName())
                .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
            1);

    // Now run a reduce
    TaskSpec taskSpec =
        new TaskSpec(
            TezTestUtils.getMockTaskAttemptId(0, 1, 0, 0),
            dagName,
            reduceVertexName,
            reduceProcessorDesc,
            Collections.singletonList(reduceInputSpec),
            Collections.singletonList(reduceOutputSpec),
            null);

    Map<String, ByteBuffer> serviceConsumerMetadata = new HashMap<String, ByteBuffer>();
    serviceConsumerMetadata.put(
        ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID, ShuffleUtils.convertJobTokenToBytes(shuffleToken));

    LogicalIOProcessorRuntimeTask task =
        new LogicalIOProcessorRuntimeTask(
            taskSpec,
            0,
            jobConf,
            new String[] {workDir.toString()},
            new TestUmbilical(),
            serviceConsumerMetadata,
            HashMultimap.<String, String>create());

    task.initialize();
    task.run();
    task.close();

    // MRTask mrTask = (MRTask)t.getProcessor();
    // TODO NEWTEZ Verify the partitioner has not been created
    // Likely not applicable anymore.
    // Assert.assertNull(mrTask.getPartitioner());

    // Only a task commit happens, hence the data is still in the temporary directory.
    Path reduceOutputDir =
        new Path(
            new Path(workDir, "output"),
            "_temporary/0/" + IDConverter.toMRTaskIdForOutput(TezTestUtils.getMockTaskId(0, 1, 0)));

    Path reduceOutputFile = new Path(reduceOutputDir, "part-v001-o000-00000");

    SequenceFile.Reader reader = new SequenceFile.Reader(localFs, reduceOutputFile, jobConf);

    LongWritable key = new LongWritable();
    Text value = new Text();
    long prev = Long.MIN_VALUE;
    while (reader.next(key, value)) {
      if (prev != Long.MIN_VALUE) {
        Assert.assertTrue(prev < key.get());
        prev = key.get();
      }
    }

    reader.close();
  }
Example #17
0
  /*
   * Helper function to create Vertex from MapWork.
   */
  private Vertex createVertex(
      JobConf conf,
      MapWork mapWork,
      LocalResource appJarLr,
      List<LocalResource> additionalLr,
      FileSystem fs,
      Path mrScratchDir,
      Context ctx,
      TezWork tezWork)
      throws Exception {

    Path tezDir = getTezDir(mrScratchDir);

    // set up the operator plan
    Utilities.setMapWork(conf, mapWork, mrScratchDir, false);

    // create the directories FileSinkOperators need
    Utilities.createTmpDirs(conf, mapWork);

    // Tez ask us to call this even if there's no preceding vertex
    MultiStageMRConfToTezTranslator.translateVertexConfToTez(conf, null);

    // finally create the vertex
    Vertex map = null;

    // use tez to combine splits
    boolean useTezGroupedSplits = false;

    int numTasks = -1;
    Class amSplitGeneratorClass = null;
    InputSplitInfo inputSplitInfo = null;
    Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);

    boolean vertexHasCustomInput = false;
    if (tezWork != null) {
      for (BaseWork baseWork : tezWork.getParents(mapWork)) {
        if (tezWork.getEdgeType(baseWork, mapWork) == EdgeType.CUSTOM_EDGE) {
          vertexHasCustomInput = true;
        }
      }
    }
    if (vertexHasCustomInput) {
      useTezGroupedSplits = false;
      // grouping happens in execution phase. Setting the class to TezGroupedSplitsInputFormat
      // here would cause pre-mature grouping which would be incorrect.
      inputFormatClass = HiveInputFormat.class;
      conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
      // mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
      // this plug-in to avoid getting a serialized event at run-time.
      conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
    } else {
      // we'll set up tez to combine spits for us iff the input format
      // is HiveInputFormat
      if (inputFormatClass == HiveInputFormat.class) {
        useTezGroupedSplits = true;
        conf.setClass(
            "mapred.input.format.class", TezGroupedSplitsInputFormat.class, InputFormat.class);
      }
    }

    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
      // if we're generating the splits in the AM, we just need to set
      // the correct plugin.
      amSplitGeneratorClass = MRInputAMSplitGenerator.class;
    } else {
      // client side split generation means we have to compute them now
      inputSplitInfo =
          MRHelpers.generateInputSplits(
              conf, new Path(tezDir, "split_" + mapWork.getName().replaceAll(" ", "_")));
      numTasks = inputSplitInfo.getNumTasks();
    }

    byte[] serializedConf = MRHelpers.createUserPayloadFromConf(conf);
    map =
        new Vertex(
            mapWork.getName(),
            new ProcessorDescriptor(MapTezProcessor.class.getName()).setUserPayload(serializedConf),
            numTasks,
            getContainerResource(conf));
    Map<String, String> environment = new HashMap<String, String>();
    MRHelpers.updateEnvironmentForMRTasks(conf, environment, true);
    map.setTaskEnvironment(environment);
    map.setJavaOpts(getContainerJavaOpts(conf));

    assert mapWork.getAliasToWork().keySet().size() == 1;

    String alias = mapWork.getAliasToWork().keySet().iterator().next();

    byte[] mrInput = null;
    if (useTezGroupedSplits) {
      mrInput =
          MRHelpers.createMRInputPayloadWithGrouping(
              serializedConf, HiveInputFormat.class.getName());
    } else {
      mrInput = MRHelpers.createMRInputPayload(serializedConf, null);
    }
    map.addInput(
        alias,
        new InputDescriptor(MRInputLegacy.class.getName()).setUserPayload(mrInput),
        amSplitGeneratorClass);

    Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
    localResources.put(getBaseName(appJarLr), appJarLr);
    for (LocalResource lr : additionalLr) {
      localResources.put(getBaseName(lr), lr);
    }

    if (inputSplitInfo != null) {
      // only relevant for client-side split generation
      map.setTaskLocationsHint(inputSplitInfo.getTaskLocationHints());
      MRHelpers.updateLocalResourcesForInputSplits(
          FileSystem.get(conf), inputSplitInfo, localResources);
    }

    map.setTaskLocalResources(localResources);
    return map;
  }
Example #18
0
  public static void main(String[] args) throws Exception {
    if (!validArgs(args)) {
      printUsage();
      return;
    }
    // These are the temp paths that are created on HDFS
    String dir1 = "/user/miyuru/csrconverter-output";
    String dir2 = "/user/miyuru/csrconverter-output-sorted";

    // We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    System.out.println("Deleting the dir : " + dir1);

    if (fs1.exists(new Path(dir1))) {
      fs1.delete(new Path(dir1), true);
    }

    System.out.println("Done deleting the dir : " + dir1);
    System.out.println("Deleting the dir : " + dir2);
    if (fs1.exists(new Path(dir2))) {
      fs1.delete(new Path(dir2), true);
    }

    Path notinPath = new Path("/user/miyuru/notinverts/notinverts");

    if (!fs1.exists(notinPath)) {
      fs1.create(notinPath);
    }

    System.out.println("Done deleting the dir : " + dir2);

    // Note on Aug 23 2014: Sometimes after this the mapReduce job hangs. need to see why.

    VertexCounterClient.setDefaultGraphID(args[3], args[2]);

    // First job creates the inverted index

    JobConf conf = new JobConf(CSRConverter.class);
    conf.set("org.acacia.partitioner.hbase.zookeeper.quorum", args[1]);
    conf.set("org.acacia.partitioner.hbase.table", args[2]);
    conf.set("org.acacia.partitioner.hbase.contacthost", args[3]);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    // conf.setMapperClass(InvertedMapper.class);
    conf.setReducerClass(InvertedReducer.class);
    // conf.setInputFormat(TextInputFormat.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    // FileInputFormat.setInputPaths(conf, new Path(args[0]));
    MultipleInputs.addInputPath(
        conf, new Path(args[0]), NLinesInputFormat.class, InvertedMapper.class);
    MultipleInputs.addInputPath(
        conf,
        new Path("/user/miyuru/notinverts/notinverts"),
        TextInputFormat.class,
        InvertedMapper.class);
    FileOutputFormat.setOutputPath(conf, new Path(dir1));

    // Also for the moment we turn-off the speculative execution
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setNumMapTasks(96);
    conf.setNumReduceTasks(96);
    conf.setPartitionerClass(VertexPartitioner.class);
    conf.set("vertex-count", args[4]);
    conf.set("zero-flag", args[5]);
    Job job = new Job(conf, "csr_inverter");
    job.setSortComparatorClass(SortComparator.class);
    job.waitForCompletion(true);
  }
 @Override
 public void localizeConfiguration(JobConf jobConf) throws IOException, InterruptedException {
   super.localizeConfiguration(jobConf);
   jobConf.setBoolean(JobContext.TASK_ISMAP, true);
 }
  public void merge(Path out, Path[] segs, boolean filter, long slice) throws Exception {
    String segmentName = Generator.generateSegmentName();
    if (LOG.isInfoEnabled()) {
      LOG.info("Merging " + segs.length + " segments to " + out + "/" + segmentName);
    }
    JobConf job = new NutchJob(getConf());
    job.setJobName("mergesegs " + out + "/" + segmentName);
    job.setBoolean("segment.merger.filter", filter);
    job.setLong("segment.merger.slice", slice);
    job.set("segment.merger.segmentName", segmentName);
    FileSystem fs = FileSystem.get(getConf());
    // prepare the minimal common set of input dirs
    boolean g = true;
    boolean f = true;
    boolean p = true;
    boolean c = true;
    boolean pd = true;
    boolean pt = true;
    for (int i = 0; i < segs.length; i++) {
      if (!fs.exists(segs[i])) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Input dir " + segs[i] + " doesn't exist, skipping.");
        }
        segs[i] = null;
        continue;
      }
      if (LOG.isInfoEnabled()) {
        LOG.info("SegmentMerger:   adding " + segs[i]);
      }
      Path cDir = new Path(segs[i], Content.DIR_NAME);
      Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
      Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
      Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
      Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
      Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
      c = c && fs.exists(cDir);
      g = g && fs.exists(gDir);
      f = f && fs.exists(fDir);
      p = p && fs.exists(pDir);
      pd = pd && fs.exists(pdDir);
      pt = pt && fs.exists(ptDir);
    }
    StringBuffer sb = new StringBuffer();
    if (c) sb.append(" " + Content.DIR_NAME);
    if (g) sb.append(" " + CrawlDatum.GENERATE_DIR_NAME);
    if (f) sb.append(" " + CrawlDatum.FETCH_DIR_NAME);
    if (p) sb.append(" " + CrawlDatum.PARSE_DIR_NAME);
    if (pd) sb.append(" " + ParseData.DIR_NAME);
    if (pt) sb.append(" " + ParseText.DIR_NAME);
    if (LOG.isInfoEnabled()) {
      LOG.info("SegmentMerger: using segment data from:" + sb.toString());
    }
    for (int i = 0; i < segs.length; i++) {
      if (segs[i] == null) continue;
      if (g) {
        Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
        job.addInputPath(gDir);
      }
      if (c) {
        Path cDir = new Path(segs[i], Content.DIR_NAME);
        job.addInputPath(cDir);
      }
      if (f) {
        Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
        job.addInputPath(fDir);
      }
      if (p) {
        Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
        job.addInputPath(pDir);
      }
      if (pd) {
        Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
        job.addInputPath(pdDir);
      }
      if (pt) {
        Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
        job.addInputPath(ptDir);
      }
    }
    job.setInputFormat(ObjectInputFormat.class);
    job.setMapperClass(SegmentMerger.class);
    job.setReducerClass(SegmentMerger.class);
    job.setOutputPath(out);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(MetaWrapper.class);
    job.setOutputFormat(SegmentOutputFormat.class);

    setConf(job);

    JobClient.runJob(job);
  }
Example #21
0
  /** Runs this tool. */
  @SuppressWarnings("deprecation")
  public int run(String[] args) throws Exception {
    JobConf job = new JobConf(getConf(), Docnos2Titles.class);

    // Read commandline arguments
    CommandLine cmdline = parseArgs(args);
    if (cmdline == null) {
      printUsage();
    }
    String eCollectionPath = cmdline.getOptionValue(ECOLLECTION_OPTION);
    String fCollectionPath = cmdline.getOptionValue(FCOLLECTION_OPTION);
    String pwsimOutputPath = cmdline.getOptionValue(PWSIM_OPTION);
    String titlePairsPath = cmdline.getOptionValue(OUTPUT_PATH_OPTION);
    String eLang = cmdline.getOptionValue(ELANG_OPTION);
    String fLang = cmdline.getOptionValue(FLANG_OPTION);
    String samplesFile = cmdline.getOptionValue(SAMPLEDOCNOS_OPTION);
    job.setJobName("Docnos2Titles_" + fLang + "-" + eLang);

    FileInputFormat.addInputPaths(job, eCollectionPath);
    FileInputFormat.addInputPaths(job, fCollectionPath);
    FileOutputFormat.setOutputPath(job, new Path(titlePairsPath));
    DistributedCache.addCacheFile(new URI(pwsimOutputPath), job);
    DistributedCache.addCacheFile(new URI(samplesFile), job);
    job.set("eLang", eLang);
    job.set("fLang", fLang);
    job.set("PwsimPairs", pwsimOutputPath);
    job.set("Ivory.SampleFile", samplesFile);

    job.setInt("mapred.task.timeout", 60000000);
    job.set("mapreduce.map.memory.mb", "3000");
    job.set("mapreduce.map.java.opts", "-Xmx3000m");
    job.setBoolean("mapred.map.tasks.speculative.execution", false);
    job.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setNumMapTasks(100);
    job.setNumReduceTasks(1);
    job.setInt("mapred.min.split.size", 2000000000);
    job.setFloat("mapred.reduce.slowstart.completed.maps", 0.9f);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(PairOfIntString.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    sLogger.info("Running job " + job.getJobName() + "...");
    sLogger.info("E-collection path: " + eCollectionPath);
    sLogger.info("F-collection path: " + fCollectionPath);
    sLogger.info("Pwsim output path: " + pwsimOutputPath);
    sLogger.info("Output path: " + titlePairsPath);
    sLogger.info("Sample file?: " + ((samplesFile != null) ? samplesFile : "none"));

    long startTime = System.currentTimeMillis();
    JobClient.runJob(job);
    System.out.println(
        "Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
  }
Example #22
0
  /**
   * Run a compactor job.
   *
   * @param conf Hive configuration file
   * @param jobName name to run this job with
   * @param t metastore table
   * @param sd metastore storage descriptor
   * @param txns list of valid transactions
   * @param isMajor is this a major compaction?
   * @throws java.io.IOException if the job fails
   */
  void run(
      HiveConf conf,
      String jobName,
      Table t,
      StorageDescriptor sd,
      ValidTxnList txns,
      boolean isMajor,
      Worker.StatsUpdater su)
      throws IOException {
    JobConf job = new JobConf(conf);
    job.setJobName(jobName);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);
    job.setJarByClass(CompactorMR.class);
    LOG.debug("User jar set to " + job.getJar());
    job.setMapperClass(CompactorMap.class);
    job.setNumReduceTasks(0);
    job.setInputFormat(CompactorInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setOutputCommitter(CompactorOutputCommitter.class);

    String queueName = conf.getVar(HiveConf.ConfVars.COMPACTOR_JOB_QUEUE);
    if (queueName != null && queueName.length() > 0) {
      job.setQueueName(queueName);
    }

    job.set(FINAL_LOCATION, sd.getLocation());
    job.set(TMP_LOCATION, sd.getLocation() + "/" + TMPDIR + "_" + UUID.randomUUID().toString());
    job.set(INPUT_FORMAT_CLASS_NAME, sd.getInputFormat());
    job.set(OUTPUT_FORMAT_CLASS_NAME, sd.getOutputFormat());
    job.setBoolean(IS_MAJOR, isMajor);
    job.setBoolean(IS_COMPRESSED, sd.isCompressed());
    job.set(TABLE_PROPS, new StringableMap(t.getParameters()).toString());
    job.setInt(NUM_BUCKETS, sd.getNumBuckets());
    job.set(ValidTxnList.VALID_TXNS_KEY, txns.toString());
    setColumnTypes(job, sd.getCols());

    // Figure out and encode what files we need to read.  We do this here (rather than in
    // getSplits below) because as part of this we discover our minimum and maximum transactions,
    // and discovering that in getSplits is too late as we then have no way to pass it to our
    // mapper.

    AcidUtils.Directory dir = AcidUtils.getAcidState(new Path(sd.getLocation()), conf, txns, false);
    StringableList dirsToSearch = new StringableList();
    Path baseDir = null;
    if (isMajor) {
      // There may not be a base dir if the partition was empty before inserts or if this
      // partition is just now being converted to ACID.
      baseDir = dir.getBaseDirectory();
      if (baseDir == null) {
        List<HdfsFileStatusWithId> originalFiles = dir.getOriginalFiles();
        if (!(originalFiles == null) && !(originalFiles.size() == 0)) {
          // There are original format files
          for (HdfsFileStatusWithId stat : originalFiles) {
            Path path = stat.getFileStatus().getPath();
            dirsToSearch.add(path);
            LOG.debug("Adding original file " + path + " to dirs to search");
          }
          // Set base to the location so that the input format reads the original files.
          baseDir = new Path(sd.getLocation());
        }
      } else {
        // add our base to the list of directories to search for files in.
        LOG.debug("Adding base directory " + baseDir + " to dirs to search");
        dirsToSearch.add(baseDir);
      }
    }

    List<AcidUtils.ParsedDelta> parsedDeltas = dir.getCurrentDirectories();

    if (parsedDeltas == null || parsedDeltas.size() == 0) {
      // Seriously, no deltas?  Can't compact that.
      LOG.error("No delta files found to compact in " + sd.getLocation());
      return;
    }

    StringableList deltaDirs = new StringableList();
    long minTxn = Long.MAX_VALUE;
    long maxTxn = Long.MIN_VALUE;
    for (AcidUtils.ParsedDelta delta : parsedDeltas) {
      LOG.debug("Adding delta " + delta.getPath() + " to directories to search");
      dirsToSearch.add(delta.getPath());
      deltaDirs.add(delta.getPath());
      minTxn = Math.min(minTxn, delta.getMinTransaction());
      maxTxn = Math.max(maxTxn, delta.getMaxTransaction());
    }

    if (baseDir != null) job.set(BASE_DIR, baseDir.toString());
    job.set(DELTA_DIRS, deltaDirs.toString());
    job.set(DIRS_TO_SEARCH, dirsToSearch.toString());
    job.setLong(MIN_TXN, minTxn);
    job.setLong(MAX_TXN, maxTxn);
    LOG.debug("Setting minimum transaction to " + minTxn);
    LOG.debug("Setting maximume transaction to " + maxTxn);

    RunningJob rj = JobClient.runJob(job);
    LOG.info(
        "Submitted "
            + (isMajor ? CompactionType.MAJOR : CompactionType.MINOR)
            + " compaction job '"
            + jobName
            + "' with jobID="
            + rj.getID()
            + " to "
            + job.getQueueName()
            + " queue.  "
            + "(current delta dirs count="
            + dir.getCurrentDirectories().size()
            + ", obsolete delta dirs count="
            + dir.getObsolete());
    rj.waitForCompletion();
    su.gatherStats();
  }
Example #23
0
  public void indexSolr(
      String solrUrl,
      Path crawlDb,
      Path linkDb,
      List<Path> segments,
      boolean noCommit,
      boolean deleteGone,
      String solrParams,
      boolean filter,
      boolean normalize)
      throws IOException {

    filter = false;

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
    LOG.info("SolrIndexer: URL filtering: " + filter);
    LOG.info("SolrIndexer: URL normalizing: " + normalize);

    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

    job.set(SolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
      job.set(SolrConstants.PARAMS, solrParams);
    }
    NutchIndexWriterFactory.addClassToConf(job, SolrWriter.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
      JobClient.runJob(job);
      // do the commits once and for all the reducers in one go
      SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

      if (!noCommit) {
        solr.commit();
      }
      long end = System.currentTimeMillis();
      LOG.info(
          "SolrIndexer: finished at "
              + sdf.format(end)
              + ", elapsed: "
              + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
      LOG.error(e.toString());
    } finally {
      FileSystem.get(job).delete(tmp, true);
    }
  }