private static DBConfiguration setOutput(JobConf job, String tableName) {
  job.setOutputFormat(DBOutputFormat.class);

  // Reduces write straight to the database, so duplicate (speculative) attempts
  // would produce duplicate rows; speculation must be off.
  job.setReduceSpeculativeExecution(false);

  DBConfiguration dbConf = new DBConfiguration(job);
  dbConf.setOutputTableName(tableName);

  return dbConf;
}
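A minimal usage sketch for the helper above. This is a hedged example, not taken from the original source: the "events" table and the column names are illustrative placeholders, and it assumes DBConfiguration's setOutputFieldNames, which stock Hadoop's DBOutputFormat.setOutput(JobConf, String, String...) uses for the same purpose.

// Hedged sketch: "events", "id", and "value" are placeholders, not names from the source.
private static void configureDbOutput(JobConf job) {
  DBConfiguration dbConf = setOutput(job, "events");  // disables speculative reduces, sets the table
  dbConf.setOutputFieldNames("id", "value");          // columns the reducer's records map to
}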
boolean createJobConfAndParseArgs(String... args) throws ParseException, IOException {

  job = new JobConf(getConf(), Crush.class);

  /*
   * Turn off speculative execution because it just wastes network I/O.
   */
  job.setMapSpeculativeExecution(false);
  job.setReduceSpeculativeExecution(false);

  /*
   * Turn off preemption because we don't want to kill a task after two hours of network I/O.
   */
  job.set("mapred.fairscheduler.preemption", "false");

  tmpDir = new Path("tmp/crush-" + UUID.randomUUID());
  outDir = new Path(tmpDir, "out");

  double threshold = 0.75;

  List<String> regexes = asList(".+");
  List<String> replacements = asList("crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}");
  List<String> inFormats = asList(SequenceFileInputFormat.class.getName());
  List<String> outFormats = asList(SequenceFileOutputFormat.class.getName());

  String crushTimestamp;

  Options options = buildOptions();
  CommandLine cli = new GnuParser().parse(options, args);

  if (cli.hasOption("?")) {
    BufferedReader reader = new BufferedReader(new InputStreamReader(getClass().getClassLoader()
        .getResourceAsStream("help.txt")));

    try {
      String line;

      while (null != (line = reader.readLine())) {
        System.out.println(line);
      }
    } finally {
      reader.close();
    }

    return false;
  }

  if (cli.hasOption("verbose")) {
    console = Verbosity.VERBOSE;
  } else if (cli.hasOption("info")) {
    console = Verbosity.INFO;
  } else {
    console = Verbosity.NONE;
  }

  if (cli.hasOption("ignore-regex")) {
    ignoredFiles = Pattern.compile(cli.getOptionValue("ignore-regex")).matcher("");
  }

  excludeSingleFileDirs = !cli.hasOption("include-single-file-dirs");

  String[] nonOptions = cli.getArgs();

  if (2 == nonOptions.length) {
    /*
     * Stand-alone mode accepts two arguments.
     */
    mode = Mode.STAND_ALONE;

    srcDir = new Path(nonOptions[0]);
    dest = new Path(nonOptions[1]);

    if (cli.hasOption("input-format")) {
      inFormats = asList(cli.getOptionValue("input-format"));
    }

    if (cli.hasOption("output-format")) {
      outFormats = asList(cli.getOptionValue("output-format"));
    }

    replacements = asList(dest.getName());

    crushTimestamp = Long.toString(currentTimeMillis());

  } else {
    /*
     * The previous version expected three or four arguments. The third one specified the number of tasks to use, which is
     * an integral number, just like the third argument in the new version, which is a timestamp. We tell the two apart by
     * looking at the value of the argument. A timestamp is going to be a huge, 14-digit number while the number of tasks
     * should be much smaller.
     */
    if ((args.length == 4 || args.length == 3) && args.length == nonOptions.length && args[2].length() != 14) {
      /*
       * Legacy-style arguments: src, dest, number of reduce tasks, and an optional TEXT/SEQUENCE type.
       */
      int maxTasks = Integer.parseInt(args[2]);

      if (maxTasks <= 0 || maxTasks > 4000) {
        throw new IllegalArgumentException("Tasks must be in the range [1, 4000]: " + maxTasks);
      }

      job.setInt("mapred.reduce.tasks", maxTasks);

      maxFileBlocks = Integer.MAX_VALUE;

      crushTimestamp = Long.toString(currentTimeMillis());

      srcDir = new Path(args[0]);
      dest = new Path(args[1]);

      mode = Mode.CLONE;

      if (args.length == 4) {
        if (args[3].equals("TEXT")) {
          /*
           * These are the defaults except with text input and output formats.
           */
          inFormats = asList(TextInputFormat.class.getName());
          outFormats = asList(TextOutputFormat.class.getName());

        } else if (!args[3].equals("SEQUENCE")) {
          throw new IllegalArgumentException("Type must be either TEXT or SEQUENCE: " + args[3]);
        }
      }
    } else {
      /*
       * V2-style arguments.
       */
      if (cli.hasOption("threshold")) {
        threshold = Double.parseDouble(cli.getOptionValue("threshold"));

        if (0 >= threshold || 1 < threshold || Double.isInfinite(threshold) || Double.isNaN(threshold)) {
          throw new IllegalArgumentException("Block size threshold must be in (0, 1]: " + threshold);
        }
      }

      if (cli.hasOption("max-file-blocks")) {
        int maxFileBlocksOption = Integer.parseInt(cli.getOptionValue("max-file-blocks"));

        if (0 > maxFileBlocksOption) {
          throw new IllegalArgumentException("Maximum file size in blocks must be positive: " + maxFileBlocksOption);
        }

        maxFileBlocks = maxFileBlocksOption;
      } else {
        maxFileBlocks = 8;
      }

      if (cli.hasOption("regex")) {
        regexes = asList(cli.getOptionValues("regex"));
      }

      if (cli.hasOption("replacement")) {
        replacements = asList(cli.getOptionValues("replacement"));
      }

      if (cli.hasOption("input-format")) {
        inFormats = asList(cli.getOptionValues("input-format"));
      }

      if (cli.hasOption("output-format")) {
        outFormats = asList(cli.getOptionValues("output-format"));
      }

      if (3 != nonOptions.length) {
        throw new IllegalArgumentException("Could not find source directory, out directory, and job timestamp");
      }

      srcDir = new Path(nonOptions[0]);
      dest = new Path(nonOptions[1]);

      crushTimestamp = nonOptions[2];

      if (cli.hasOption("clone")) {
        mode = Mode.CLONE;
      } else {
        mode = Mode.MAP_REDUCE;
      }

      if (!crushTimestamp.matches("\\d{14}")) {
        throw new IllegalArgumentException("Crush timestamp must be 14 digits yyyymmddhhMMss: " + crushTimestamp);
      }
    }

    dfsBlockSize = Long.parseLong(job.get("dfs.blocksize"));
    maxEligibleSize = (long) (dfsBlockSize * threshold);
  }

  /*
   * Add the crush specs and compression options to the configuration.
   */
  job.set("crush.timestamp", crushTimestamp);

  if (ignoredFiles != null) {
    job.set("crush.ignore-regex", ignoredFiles.pattern().pattern());
  }

  if (regexes.size() != replacements.size() || replacements.size() != inFormats.size()
      || inFormats.size() != outFormats.size()) {
    throw new IllegalArgumentException("Must be an equal number of regex, replacement, in-format, and out-format options");
  }

  job.setInt("crush.num.specs", regexes.size());

  matchers = new ArrayList<Matcher>(regexes.size());

  for (int i = 0; i < regexes.size(); i++) {
    job.set(format("crush.%d.regex", i), regexes.get(i));

    matchers.add(Pattern.compile(regexes.get(i)).matcher("dummy"));

    job.set(format("crush.%d.regex.replacement", i), replacements.get(i));

    String inFmt = inFormats.get(i);

    if ("sequence".equals(inFmt)) {
      inFmt = SequenceFileInputFormat.class.getName();
    } else if ("text".equals(inFmt)) {
      inFmt = TextInputFormat.class.getName();
    } else {
      try {
        if (!FileInputFormat.class.isAssignableFrom(Class.forName(inFmt))) {
          throw new IllegalArgumentException("Not a FileInputFormat: " + inFmt);
        }
      } catch (ClassNotFoundException e) {
        throw new IllegalArgumentException("Not a FileInputFormat: " + inFmt);
      }
    }

    job.set(format("crush.%d.input.format", i), inFmt);

    String outFmt = outFormats.get(i);

    if ("sequence".equals(outFmt)) {
      outFmt = SequenceFileOutputFormat.class.getName();
    } else if ("text".equals(outFmt)) {
      outFmt = TextOutputFormat.class.getName();
    } else {
      try {
        if (!FileOutputFormat.class.isAssignableFrom(Class.forName(outFmt))) {
          throw new IllegalArgumentException("Not a FileOutputFormat: " + outFmt);
        }
      } catch (ClassNotFoundException e) {
        throw new IllegalArgumentException("Not a FileOutputFormat: " + outFmt);
      }
    }

    job.set(format("crush.%d.output.format", i), outFmt);
  }

  String codec = cli.getOptionValue("compress");

  if (null == codec) {
    codec = DefaultCodec.class.getName();
  } else if ("none".equals(codec)) {
    codec = null;
  } else if ("gzip".equals(codec)) {
    codec = GzipCodec.class.getName();
  } else {
    try {
      if (!CompressionCodec.class.isAssignableFrom(Class.forName(codec))) {
        throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
      }
    } catch (ClassNotFoundException e) {
      throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
    }
  }

  if (null == codec) {
    job.setBoolean("mapred.output.compress", false);
  } else {
    job.setBoolean("mapred.output.compress", true);
    job.set("mapred.output.compression.type", "BLOCK");
    job.set("mapred.output.compression.codec", codec);

    try {
      CompressionCodec instance = (CompressionCodec) Class.forName(codec).newInstance();
      codecExtension = instance.getDefaultExtension();
    } catch (Exception e) {
      throw new AssertionError();
    }
  }

  return true;
}
public JobBuilder speculativeReducerExecution() throws IOException {
  _jobConf.setReduceSpeculativeExecution(true);
  return this;
}

public JobBuilder speculativeReducerExecution(boolean enable) throws IOException {
  _jobConf.setReduceSpeculativeExecution(enable);
  return this;
}
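A hedged usage sketch for the fluent builder above. Only the two overloads shown are taken from the source; how a JobBuilder instance is obtained is an assumption, so the sketch takes one as a parameter rather than inventing a constructor.

// Hedged sketch: "builder" stands for a JobBuilder obtained from the enclosing API.
JobBuilder configureSpeculation(JobBuilder builder, boolean reducersAreIdempotent) throws IOException {
  // Speculative reduces are only safe when a duplicate attempt causes no side effects.
  return builder.speculativeReducerExecution(reducersAreIdempotent);
}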
public void indexSolr(String solrUrl, Path crawlDb, Path linkDb, List<Path> segments,
    boolean noCommit, boolean deleteGone, String solrParams, boolean filter, boolean normalize)
    throws IOException {

  // Note: URL filtering is hard-disabled here; the filter argument is ignored.
  filter = false;

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("SolrIndexer: starting at " + sdf.format(start));

  final JobConf job = new NutchJob(getConf());
  job.setJobName("index-solr " + solrUrl);

  LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
  LOG.info("SolrIndexer: URL filtering: " + filter);
  LOG.info("SolrIndexer: URL normalizing: " + normalize);

  IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

  job.set(SolrConstants.SERVER_URL, solrUrl);
  job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
  job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
  job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);

  if (solrParams != null) {
    job.set(SolrConstants.PARAMS, solrParams);
  }

  NutchIndexWriterFactory.addClassToConf(job, SolrWriter.class);

  job.setReduceSpeculativeExecution(false);

  final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

  FileOutputFormat.setOutputPath(job, tmp);

  try {
    JobClient.runJob(job);

    // Do the commit once, for all the reducers, in one go.
    SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

    if (!noCommit) {
      solr.commit();
    }

    long end = System.currentTimeMillis();
    LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));
  } catch (Exception e) {
    LOG.error(e.toString());
  } finally {
    FileSystem.get(job).delete(tmp, true);
  }
}
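A hedged call-site sketch for the method above. The Solr URL and HDFS paths are placeholders, and it assumes the enclosing SolrIndexer can be constructed and given a configuration as Nutch tools typically are.

// Hedged sketch; URL, paths, and the segment name are illustrative placeholders.
SolrIndexer indexer = new SolrIndexer();
indexer.setConf(NutchConfiguration.create());

List<Path> segments = Arrays.asList(new Path("crawl/segments/20100221175612"));

// Flags mirror the parameter order: noCommit, deleteGone, solrParams, filter, normalize.
// The filter argument is moot, since the method overrides it to false internally.
indexer.indexSolr("http://localhost:8983/solr", new Path("crawl/crawldb"),
    new Path("crawl/linkdb"), segments, false, true, null, false, true);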