Пример #1
0
  private static DBConfiguration setOutput(JobConf job, String tableName) {
    job.setOutputFormat(DBOutputFormat.class);
    job.setReduceSpeculativeExecution(false);

    DBConfiguration dbConf = new DBConfiguration(job);

    dbConf.setOutputTableName(tableName);
    return dbConf;
  }
Пример #2
0
  boolean createJobConfAndParseArgs(String... args) throws ParseException, IOException {

    job = new JobConf(getConf(), Crush.class);

    /*
     * Turn off speculative execution because that's just wasting network io.
     */
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    /*
     * Turn off pre-emption because we don't want to kill a task after two hours of network io.
     */
    job.set("mapred.fairscheduler.preemption", "false");

    tmpDir = new Path("tmp/crush-" + UUID.randomUUID());
    outDir = new Path(tmpDir, "out");

    double threshold = 0.75;

    List<String> regexes = asList(".+");
    List<String> replacements =
        asList("crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}");
    List<String> inFormats = asList(SequenceFileInputFormat.class.getName());
    List<String> outFormats = asList(SequenceFileOutputFormat.class.getName());

    String crushTimestamp;

    Options options = buildOptions();
    CommandLine cli = new GnuParser().parse(options, args);

    if (cli.hasOption("?")) {
      BufferedReader reader =
          new BufferedReader(
              new InputStreamReader(getClass().getClassLoader().getResourceAsStream("help.txt")));

      try {
        String line;

        while (null != (line = reader.readLine())) {
          System.out.println(line);
        }
      } finally {
        reader.close();
      }

      return false;
    }

    if (cli.hasOption("verbose")) {
      console = Verbosity.VERBOSE;
    } else if (cli.hasOption("info")) {
      console = Verbosity.INFO;
    } else {
      console = Verbosity.NONE;
    }

    if (cli.hasOption("ignore-regex")) {
      ignoredFiles = Pattern.compile(cli.getOptionValue("ignore-regex")).matcher("");
    }

    excludeSingleFileDirs = !cli.hasOption("include-single-file-dirs");

    String[] nonOptions = cli.getArgs();

    if (2 == nonOptions.length) {
      /*
       * Stand alone mode accepts two arguments.
       */
      mode = Mode.STAND_ALONE;

      srcDir = new Path(nonOptions[0]);

      dest = new Path(nonOptions[1]);

      if (cli.hasOption("input-format")) {
        inFormats = asList(cli.getOptionValue("input-format"));
      }

      if (cli.hasOption("output-format")) {
        outFormats = asList(cli.getOptionValue("output-format"));
      }

      replacements = asList(dest.getName());

      crushTimestamp = Long.toString(currentTimeMillis());

    } else {
      /*
       * The previous version expected three or four arguments. The third one specified the number of tasks to use, which is an
       * integral number, just like the third argument in the new version, which is a timestamp. We tell the two apart by looking
       * at the value of the argument. A timestamp is going to be a huge, 14-digit number while the number of tasks should be much
       * smaller.
       */

      if ((args.length == 4 || args.length == 3)
          && args.length == nonOptions.length
          && args[2].length() != 14) {

        int maxTasks = Integer.parseInt(args[2]);

        if (maxTasks <= 0 || maxTasks > 4000) {
          throw new IllegalArgumentException("Tasks must be in the range [1, 4000]: " + maxTasks);
        }

        job.setInt("mapred.reduce.tasks", maxTasks);

        maxFileBlocks = Integer.MAX_VALUE;

        crushTimestamp = Long.toString(currentTimeMillis());

        srcDir = new Path(args[0]);
        dest = new Path(args[1]);

        mode = Mode.CLONE;

        if (args.length == 4) {
          if (args[3].equals("TEXT")) {
            /*
             * These are the defaults except with text input and output formats.
             */
            inFormats = asList(TextInputFormat.class.getName());
            outFormats = asList(TextOutputFormat.class.getName());

          } else if (!args[3].equals("SEQUENCE")) {
            throw new IllegalArgumentException("Type must be either TEXT or SEQUENCE: " + args[3]);
          }
        }
      } else {
        /*
         * V2 style arguments.
         */
        if (cli.hasOption("threshold")) {
          threshold = Double.parseDouble(cli.getOptionValue("threshold"));

          if (0 >= threshold
              || 1 < threshold
              || Double.isInfinite(threshold)
              || Double.isNaN(threshold)) {
            throw new IllegalArgumentException(
                "Block size threshold must be in (0, 1]: " + threshold);
          }
        }

        if (cli.hasOption("max-file-blocks")) {
          int maxFileBlocksOption = Integer.parseInt(cli.getOptionValue("max-file-blocks"));

          if (0 > maxFileBlocksOption) {
            throw new IllegalArgumentException(
                "Maximum file size in blocks must be positive: " + maxFileBlocksOption);
          }

          maxFileBlocks = maxFileBlocksOption;
        } else {
          maxFileBlocks = 8;
        }

        if (cli.hasOption("regex")) {
          regexes = asList(cli.getOptionValues("regex"));
        }

        if (cli.hasOption("replacement")) {
          replacements = asList(cli.getOptionValues("replacement"));
        }

        if (cli.hasOption("input-format")) {
          inFormats = asList(cli.getOptionValues("input-format"));
        }

        if (cli.hasOption("output-format")) {
          outFormats = asList(cli.getOptionValues("output-format"));
        }

        if (3 != nonOptions.length) {
          throw new IllegalArgumentException(
              "Could not find source directory, out directory, and job timestamp");
        }

        srcDir = new Path(nonOptions[0]);
        dest = new Path(nonOptions[1]);

        crushTimestamp = nonOptions[2];

        if (cli.hasOption("clone")) {
          mode = Mode.CLONE;
        } else {
          mode = Mode.MAP_REDUCE;
        }

        if (!crushTimestamp.matches("\\d{14}")) {
          throw new IllegalArgumentException(
              "Crush timestamp must be 14 digits yyyymmddhhMMss: " + crushTimestamp);
        }
      }

      dfsBlockSize = Long.parseLong(job.get("dfs.blocksize"));
      maxEligibleSize = (long) (dfsBlockSize * threshold);
    }

    /*
     * Add the crush specs and compression options to the configuration.
     */
    job.set("crush.timestamp", crushTimestamp);

    if (ignoredFiles != null) {
      job.set("crush.ignore-regex", ignoredFiles.pattern().pattern());
    }

    if (regexes.size() != replacements.size()
        || replacements.size() != inFormats.size()
        || inFormats.size() != outFormats.size()) {
      throw new IllegalArgumentException(
          "Must be an equal number of regex, replacement, in-format, and out-format options");
    }

    job.setInt("crush.num.specs", regexes.size());

    matchers = new ArrayList<Matcher>(regexes.size());

    for (int i = 0; i < regexes.size(); i++) {
      job.set(format("crush.%d.regex", i), regexes.get(i));

      matchers.add(Pattern.compile(regexes.get(i)).matcher("dummy"));

      job.set(format("crush.%d.regex.replacement", i), replacements.get(i));

      String inFmt = inFormats.get(i);

      if ("sequence".equals(inFmt)) {
        inFmt = SequenceFileInputFormat.class.getName();
      } else if ("text".equals(inFmt)) {
        inFmt = TextInputFormat.class.getName();
      } else {
        try {
          if (!FileInputFormat.class.isAssignableFrom(Class.forName(inFmt))) {
            throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
          }
        } catch (ClassNotFoundException e) {
          throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
        }
      }

      job.set(format("crush.%d.input.format", i), inFmt);

      String outFmt = outFormats.get(i);

      if ("sequence".equals(outFmt)) {
        outFmt = SequenceFileOutputFormat.class.getName();
      } else if ("text".equals(outFmt)) {
        outFmt = TextOutputFormat.class.getName();
      } else {
        try {
          if (!FileOutputFormat.class.isAssignableFrom(Class.forName(outFmt))) {
            throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
          }
        } catch (ClassNotFoundException e) {
          throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
        }
      }

      job.set(format("crush.%d.output.format", i), outFmt);
    }

    String codec = cli.getOptionValue("compress");

    if (null == codec) {
      codec = DefaultCodec.class.getName();
    } else if ("none".equals(codec)) {
      codec = null;
    } else if ("gzip".equals(codec)) {
      codec = GzipCodec.class.getName();
    } else {
      try {
        if (!CompressionCodec.class.isAssignableFrom(Class.forName(codec))) {
          throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
        }
      } catch (ClassNotFoundException e) {
        throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
      }
    }

    if (null == codec) {
      job.setBoolean("mapred.output.compress", false);
    } else {
      job.setBoolean("mapred.output.compress", true);
      job.set("mapred.output.compression.type", "BLOCK");
      job.set("mapred.output.compression.codec", codec);

      try {
        CompressionCodec instance = (CompressionCodec) Class.forName(codec).newInstance();
        codecExtension = instance.getDefaultExtension();
      } catch (Exception e) {
        throw new AssertionError();
      }
    }

    return true;
  }
 public JobBuilder speculativeReducerExecution() throws IOException {
   _jobConf.setReduceSpeculativeExecution(true);
   return this;
 }
 public JobBuilder speculativeReducerExecution(boolean enable) throws IOException {
   _jobConf.setReduceSpeculativeExecution(enable);
   return this;
 }
Пример #5
0
  public void indexSolr(
      String solrUrl,
      Path crawlDb,
      Path linkDb,
      List<Path> segments,
      boolean noCommit,
      boolean deleteGone,
      String solrParams,
      boolean filter,
      boolean normalize)
      throws IOException {

    filter = false;

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
    LOG.info("SolrIndexer: URL filtering: " + filter);
    LOG.info("SolrIndexer: URL normalizing: " + normalize);

    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

    job.set(SolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
      job.set(SolrConstants.PARAMS, solrParams);
    }
    NutchIndexWriterFactory.addClassToConf(job, SolrWriter.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
      JobClient.runJob(job);
      // do the commits once and for all the reducers in one go
      SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

      if (!noCommit) {
        solr.commit();
      }
      long end = System.currentTimeMillis();
      LOG.info(
          "SolrIndexer: finished at "
              + sdf.format(end)
              + ", elapsed: "
              + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
      LOG.error(e.toString());
    } finally {
      FileSystem.get(job).delete(tmp, true);
    }
  }