Exemple #1
0
  public static void getData(CloudataConf conf, Path keyPath) throws IOException {
    JobConf jobConf = new JobConf(TeraReadJob.class);
    jobConf.set("user.name", conf.getUserId());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    Path tempOutputPath = new Path("ManyTableJob_Get_" + System.currentTimeMillis());

    jobConf.setJobName("ManyTableJob_Get_" + "(" + new Date() + ")");

    TextOutputFormat.setOutputPath(jobConf, tempOutputPath);
    // <MAP>
    jobConf.setMapperClass(ManyTableGetMap.class);
    jobConf.setInputFormat(TextInputFormat.class);
    TextInputFormat.addInputPath(jobConf, keyPath);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    // </MAP>

    // <REDUCE>
    jobConf.setNumReduceTasks(0);
    // </REDUCE>

    try {
      // Run Job
      JobClient.runJob(jobConf);
    } finally {
      // delete temp output path
      FileSystem fs = FileSystem.get(jobConf);
      FileUtil.delete(fs, tempOutputPath, true);
      CloudataMapReduceUtil.clearMapReduce(libDir);
    }
  }
Exemple #2
0
  public static Path putData() throws IOException {
    CloudataConf nconf = new CloudataConf();

    JobConf jobConf = new JobConf(ManyTableJob.class);
    jobConf.set("user.name", nconf.getUserId());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    jobConf.setJobName("ManyTableJob_Put" + "(" + new Date() + ")");

    jobConf.setLong("mapred.task.timeout", 30 * 60 * 1000);

    Path outputPath = new Path("ManyTableJob_KEY_" + System.currentTimeMillis());

    FileOutputFormat.setOutputPath(jobConf, outputPath);

    // <MAP>
    jobConf.setMapperClass(ManyTablePutMap.class);
    jobConf.setInputFormat(SimpleInputFormat.class);
    jobConf.setNumMapTasks(numOfTables);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    // </MAP>

    // <REDUCE>
    jobConf.setNumReduceTasks(0);
    // </REDUCE>

    try {
      // Run Job
      JobClient.runJob(jobConf);
      return outputPath;
    } finally {
      // delete temp output path
      FileSystem fs = FileSystem.get(jobConf);
      CloudataMapReduceUtil.clearMapReduce(libDir);
    }
  }
  @Test
  public void mrRun() throws Exception {
    FileSystem fs = dfsCluster.getFileSystem();
    Path inDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/input"));
    fs.delete(inDir, true);
    String DATADIR = "/user/testing/testMapperReducer/data";
    Path dataDir = fs.makeQualified(new Path(DATADIR));
    fs.delete(dataDir, true);
    Path outDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/output"));
    fs.delete(outDir, true);

    assertTrue(fs.mkdirs(inDir));
    Path INPATH = new Path(inDir, "input.txt");
    OutputStream os = fs.create(INPATH);
    Writer wr = new OutputStreamWriter(os, StandardCharsets.UTF_8);
    wr.write(DATADIR + "/" + inputAvroFile);
    wr.close();

    assertTrue(fs.mkdirs(dataDir));
    fs.copyFromLocalFile(new Path(DOCUMENTS_DIR, inputAvroFile), dataDir);

    JobConf jobConf = getJobConf();
    jobConf.set("jobclient.output.filter", "ALL");
    if (ENABLE_LOCAL_JOB_RUNNER) { // enable Hadoop LocalJobRunner; this enables to run in debugger
      // and set breakpoints
      jobConf.set("mapred.job.tracker", "local");
    }
    jobConf.setMaxMapAttempts(1);
    jobConf.setMaxReduceAttempts(1);
    jobConf.setJar(SEARCH_ARCHIVES_JAR);

    int shards = 2;
    int maxReducers = Integer.MAX_VALUE;
    if (ENABLE_LOCAL_JOB_RUNNER) {
      // local job runner has a couple of limitations: only one reducer is supported and the
      // DistributedCache doesn't work.
      // see http://blog.cloudera.com/blog/2009/07/advice-on-qa-testing-your-mapreduce-jobs/
      maxReducers = 1;
      shards = 1;
    }

    String[] args =
        new String[] {
          "--morphline-file=" + tempDir + "/test-morphlines/solrCellDocumentTypes.conf",
          "--morphline-id=morphline1",
          "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
          "--output-dir=" + outDir.toString(),
          "--shards=" + shards,
          "--verbose",
          numRuns % 2 == 0 ? "--input-list=" + INPATH.toString() : dataDir.toString(),
          numRuns % 3 == 0
              ? "--reducers=" + shards
              : (numRuns % 3 == 1 ? "--reducers=-1" : "--reducers=" + Math.min(8, maxReducers))
        };
    if (numRuns % 3 == 2) {
      args = concat(args, new String[] {"--fanout=2"});
    }
    if (numRuns == 0) {
      // force (slow) MapReduce based randomization to get coverage for that as well
      args =
          concat(
              new String[] {"-D", MapReduceIndexerTool.MAIN_MEMORY_RANDOMIZATION_THRESHOLD + "=-1"},
              args);
    }
    MapReduceIndexerTool tool = createTool();
    int res = ToolRunner.run(jobConf, tool, args);
    assertEquals(0, res);
    Job job = tool.job;
    assertTrue(job.isComplete());
    assertTrue(job.isSuccessful());

    if (numRuns % 3 != 2) {
      // Only run this check if mtree merge is disabled.
      // With mtree merge enabled the BatchWriter counters aren't available anymore because
      // variable "job" now refers to the merge job rather than the indexing job
      assertEquals(
          "Invalid counter "
              + SolrRecordWriter.class.getName()
              + "."
              + SolrCounters.DOCUMENTS_WRITTEN,
          count,
          job.getCounters()
              .findCounter(SolrCounters.class.getName(), SolrCounters.DOCUMENTS_WRITTEN.toString())
              .getValue());
    }

    // Check the output is as expected
    outDir = new Path(outDir, MapReduceIndexerTool.RESULTS_DIR);
    Path[] outputFiles = FileUtil.stat2Paths(fs.listStatus(outDir));

    System.out.println("outputfiles:" + Arrays.toString(outputFiles));

    UtilsForTests.validateSolrServerDocumentCount(MINIMR_CONF_DIR, fs, outDir, count, shards);

    // run again with --dryrun mode:
    tool = createTool();
    args = concat(args, new String[] {"--dry-run"});
    res = ToolRunner.run(jobConf, tool, args);
    assertEquals(0, res);

    numRuns++;
  }
 public JobBuilder maxMapAttempts(int maxAttempts) throws IOException {
   _jobConf.setMaxMapAttempts(maxAttempts);
   return this;
 }