/**
   * Configures the job.
   *
   * @param conf the job to configure
   * @param rules the classification rules to evaluate
   * @param target the label value to evaluate the rules for
   * @param inpath the input path (the dataset)
   * @param outpath the output <code>Path</code>
   * @param split the DatasetSplit used to separate training and testing input
   */
  private static void configureJob(
      JobConf conf,
      List<? extends Rule> rules,
      int target,
      Path inpath,
      Path outpath,
      DatasetSplit split) {
    split.storeJobParameters(conf);

    FileInputFormat.setInputPaths(conf, inpath);
    FileOutputFormat.setOutputPath(conf, outpath);

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(CDFitness.class);

    conf.setMapperClass(CDMapper.class);
    conf.setCombinerClass(CDReducer.class);
    conf.setReducerClass(CDReducer.class);

    conf.setInputFormat(DatasetTextInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    // store the parameters
    conf.set(CDMapper.CLASSDISCOVERY_RULES, StringUtils.toString(rules));
    conf.set(CDMapper.CLASSDISCOVERY_DATASET, StringUtils.toString(DataSet.getDataSet()));
    conf.setInt(CDMapper.CLASSDISCOVERY_TARGET_LABEL, target);
  }
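A caller typically hands this helper a fresh JobConf and then submits the configured job. A minimal sketch of that pattern (the runEvaluation wrapper is an illustrative name, not part of the original class):

  // Hypothetical wrapper: configure the evaluation job and block until it completes.
  private static void runEvaluation(
      List<? extends Rule> rules,
      int target,
      Path inpath,
      Path outpath,
      DatasetSplit split) throws IOException {
    JobConf conf = new JobConf();
    configureJob(conf, rules, target, inpath, outpath, split);
    JobClient.runJob(conf); // the per-rule CDFitness values land in the SequenceFile output
  }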
Example #2
  /**
   * Sets up the control file which lists the input files to process.
   *
   * @return true if at least one operation was written to the control file
   * @throws IOException
   */
  private boolean setup() throws IOException {
    estimateSavings();

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobconf);
    Path jobdir = new Path(jClient.getSystemDir(), NAME + "_" + randomId);

    LOG.info(JOB_DIR_LABEL + "=" + jobdir);
    jobconf.set(JOB_DIR_LABEL, jobdir.toString());
    Path log = new Path(jobdir, "_logs");

    // The control file should have small size blocks. This helps
    // in spreading out the load from mappers that will be spawned.
    jobconf.setInt("dfs.blocks.size", OP_LIST_BLOCK_SIZE);

    FileOutputFormat.setOutputPath(jobconf, log);
    LOG.info("log=" + log);

    // create operation list
    FileSystem fs = jobdir.getFileSystem(jobconf);
    Path opList = new Path(jobdir, "_" + OP_LIST_LABEL);
    jobconf.set(OP_LIST_LABEL, opList.toString());
    int opCount = 0, synCount = 0;
    SequenceFile.Writer opWriter = null;

    try {
      opWriter =
          SequenceFile.createWriter(
              fs, jobconf, opList, Text.class, PolicyInfo.class, SequenceFile.CompressionType.NONE);
      for (RaidPolicyPathPair p : raidPolicyPathPairList) {
        // If a large set of files are Raided for the first time, files
        // in the same directory that tend to have the same size will end up
        // in the same map. This shuffle mixes things up, allowing a better
        // mix of files.
        java.util.Collections.shuffle(p.srcPaths);
        for (FileStatus st : p.srcPaths) {
          opWriter.append(new Text(st.getPath().toString()), p.policy);
          opCount++;
          if (++synCount > SYNC_FILE_MAX) {
            opWriter.sync();
            synCount = 0;
          }
        }
      }

    } finally {
      if (opWriter != null) {
        opWriter.close();
      }
      fs.setReplication(opList, OP_LIST_REPLICATION); // increase replication for control file
    }
    raidPolicyPathPairList.clear();

    jobconf.setInt(OP_COUNT_LABEL, opCount);
    LOG.info("Number of files=" + opCount);
    jobconf.setNumMapTasks(
        getMapCount(opCount, new JobClient(jobconf).getClusterStatus().getTaskTrackers()));
    LOG.info("jobName= " + jobName + " numMapTasks=" + jobconf.getNumMapTasks());
    return opCount != 0;
  }
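Since the operation list is a SequenceFile of <Text, PolicyInfo> records, the consuming side can iterate it with a SequenceFile.Reader. A sketch of the read-back, assuming PolicyInfo has the usual no-arg Writable constructor:

  // Sketch: iterate the operation list written by setup() above.
  Path opList = new Path(jobconf.get(OP_LIST_LABEL));
  FileSystem fs = opList.getFileSystem(jobconf);
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, opList, jobconf);
  try {
    Text srcFile = new Text();
    PolicyInfo policy = new PolicyInfo(); // assumed no-arg Writable constructor
    while (reader.next(srcFile, policy)) {
      // each record pairs one source file with the raid policy to apply to it
    }
  } finally {
    reader.close();
  }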
Example #3
  /**
   * Driver to copy srcPath to destPath depending on required protocol.
   *
   * @param args arguments
   */
  static void copy(final Configuration conf, final Arguments args) throws IOException {
    LOG.info("srcPaths=" + args.srcs);
    LOG.info("destPath=" + args.dst);
    checkSrcPath(conf, args.srcs);

    JobConf job = createJobConf(conf);
    if (args.preservedAttributes != null) {
      job.set(PRESERVE_STATUS_LABEL, args.preservedAttributes);
    }
    if (args.mapredSslConf != null) {
      job.set("dfs.https.client.keystore.resource", args.mapredSslConf);
    }

    // Initialize the mapper
    try {
      setup(conf, job, args);
      JobClient.runJob(job);
      finalize(conf, job, args.dst, args.preservedAttributes);
    } finally {
      // delete tmp
      fullyDelete(job.get(TMP_DIR_LABEL), job);
      // delete jobDirectory
      fullyDelete(job.get(JOB_DIR_LABEL), job);
    }
  }
Example #4
  /**
   * Run the job
   *
   * @param params The Job parameters containing the gramSize, input output folders, defaultCat,
   *     encoding
   */
  public static void runJob(Parameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesClassifierDriver.class);
    conf.setJobName("Bayes Classifier Driver running over input: " + params.get("testDirPath"));
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(params.get("testDirPath")));
    Path outPath = new Path(params.get("testDirPath") + "-output");
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setMapperClass(BayesClassifierMapper.class);
    conf.setCombinerClass(BayesClassifierReducer.class);
    conf.setReducerClass(BayesClassifierReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.set(
        "io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    HadoopUtil.overwriteOutput(outPath);
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);

    Path outputFiles = new Path(outPath, "part*");
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    ConfusionMatrix matrix = readResult(dfs, outputFiles, conf, params);
    log.info("{}", matrix.summarize());
  }
Example #5
  public void runParseTest(
      String fieldTerminator,
      String lineTerminator,
      String encloser,
      String escape,
      boolean encloseRequired)
      throws IOException {

    ClassLoader prevClassLoader = null;

    String[] argv =
        getArgv(true, fieldTerminator, lineTerminator, encloser, escape, encloseRequired);
    runImport(argv);
    try {
      String tableClassName = getTableName();

      argv = getArgv(false, fieldTerminator, lineTerminator, encloser, escape, encloseRequired);
      SqoopOptions opts = new ImportTool().parseArguments(argv, null, null, true);

      CompilationManager compileMgr = new CompilationManager(opts);
      String jarFileName = compileMgr.getJarFilename();

      // Make sure the user's class is loaded into our address space.
      prevClassLoader = ClassLoaderStack.addJarFile(jarFileName, tableClassName);

      JobConf job = new JobConf();
      job.setJar(jarFileName);

      // Tell the job what class we're testing.
      job.set(ReparseMapper.USER_TYPE_NAME_KEY, tableClassName);

      // use local mode in the same JVM.
      ConfigurationHelper.setJobtrackerAddr(job, "local");
      if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        job.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
      }
      String warehouseDir = getWarehouseDir();
      Path warehousePath = new Path(warehouseDir);
      Path inputPath = new Path(warehousePath, getTableName());
      Path outputPath = new Path(warehousePath, getTableName() + "-out");

      job.setMapperClass(ReparseMapper.class);
      job.setNumReduceTasks(0);
      FileInputFormat.addInputPath(job, inputPath);
      FileOutputFormat.setOutputPath(job, outputPath);

      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(NullWritable.class);

      JobClient.runJob(job);
    } catch (InvalidOptionsException ioe) {
      fail(ioe.toString());
    } catch (ParseException pe) {
      fail(pe.toString());
    } finally {
      if (null != prevClassLoader) {
        ClassLoaderStack.setCurrentClassLoader(prevClassLoader);
      }
    }
  }
Example #6
  public void runMR(String myMultiLocs, String sortKey)
      throws ParseException, IOException, Exception, org.apache.hadoop.zebra.parser.ParseException {

    JobConf jobConf = new JobConf(conf);
    jobConf.setJobName("TestMultipleOutputs4");
    jobConf.setJarByClass(TestMultipleOutputs4.class);
    jobConf.set("table.output.tfile.compression", "gz");
    jobConf.set("sortKey", sortKey);
    // input settings
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(TestMultipleOutputs4.MapClass.class);
    jobConf.setMapOutputKeyClass(BytesWritable.class);
    jobConf.setMapOutputValueClass(ZebraTuple.class);
    FileInputFormat.setInputPaths(jobConf, inputPath);

    jobConf.setNumMapTasks(1);

    // output settings

    jobConf.setOutputFormat(BasicTableOutputFormat.class);
    BasicTableOutputFormat.setMultipleOutputs(
        jobConf, myMultiLocs, TestMultipleOutputs4.OutputPartitionerClass.class);

    // set the logical schema with 2 columns
    BasicTableOutputFormat.setSchema(jobConf, "word:string, count:int");
    // for demo purposes, create 2 physical column groups
    BasicTableOutputFormat.setStorageHint(jobConf, "[word];[count]");
    BasicTableOutputFormat.setSortInfo(jobConf, sortKey);
    System.out.println("in runMR, sortkey: " + sortKey);
    // use a single reduce task
    jobConf.setNumReduceTasks(1);
    JobClient.runJob(jobConf);
    BasicTableOutputFormat.close(jobConf);
  }
Example #7
  /**
   * run a distributed job and verify that TokenCache is available
   *
   * @throws IOException
   */
  @Test
  public void testTokenCache() throws IOException {

    System.out.println("running dist job");

    // make sure JT starts
    jConf = mrCluster.createJobConf();

    // provide namenode names for the job to get the delegation tokens for
    String nnUri = dfsCluster.getURI(0).toString();
    jConf.set(MRJobConfig.JOB_NAMENODES, nnUri + "," + nnUri);
    // job tracker principal id
    jConf.set(JTConfig.JT_USER_NAME, "jt_id/foo@BAR");

    // using argument to pass the file name
    String[] args = {
      "-tokenCacheFile", tokenFileName.toString(), "-m", "1", "-r", "1", "-mt", "1", "-rt", "1"
    };

    int res = -1;
    try {
      res = ToolRunner.run(jConf, new MySleepJob(), args);
    } catch (Exception e) {
      System.out.println("Job failed with" + e.getLocalizedMessage());
      e.printStackTrace(System.out);
      fail("Job failed");
    }
    assertEquals("dist job res is not 0", res, 0);
  }
Example #8
  /**
   * The main driver for word count map/reduce program. Invoke this method to submit the map/reduce
   * job.
   *
   * @throws IOException When there is communication problems with the job tracker.
   */
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WordCountSeqOutput.class);
    conf.setJobName("wordcount_seqOF");

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);
    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts, emitted as Text here
    // conf.setOutputValueClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Combiner.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputFormat(SequenceFileOutputFormat.class);

    //      // compress Mapper output
    //      conf.setCompressMapOutput(true);
    //      conf.setMapOutputCompressorClass(org.apache.hadoop.io.compress.GzipCodec.class);

    // compress final output
    conf.set("mapred.output.compress", conf.get("mapred.output.compress", "true"));
    conf.set("mapred.output.compression.type", conf.get("mapred.output.compression.type", "BLOCK"));
    conf.set(
        "mapred.output.compression.codec",
        conf.get("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"));

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      try {
        if ("-m".equals(args[i])) {
          conf.setNumMapTasks(Integer.parseInt(args[++i]));
        } else if ("-r".equals(args[i])) {
          conf.setNumReduceTasks(Integer.parseInt(args[++i]));
        } else {
          other_args.add(args[i]);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
        return printUsage();
      }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
      System.out.println(
          "ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
      return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
  }
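Drivers that expose run(String[]) like this are normally launched through ToolRunner, which strips generic Hadoop options before delegating. A sketch of the matching main method, assuming WordCountSeqOutput implements Tool:

  public static void main(String[] args) throws Exception {
    // ToolRunner handles -D/-conf/-fs style generic options before calling run(args).
    int res = ToolRunner.run(new Configuration(), new WordCountSeqOutput(), args);
    System.exit(res);
  }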
Example #9
  private RunningJob _test(String... arg) throws Exception {
    Path actionDir = getFsTestCaseDir();

    File jar =
        IOUtils.createJar(
            new File(getTestCaseDir()),
            "launcher.jar",
            LauncherMapper.class,
            LauncherSecurityManager.class,
            LauncherException.class,
            LauncherMainTester.class);

    FileSystem fs = getFileSystem();

    Path launcherJar = new Path(actionDir, "launcher.jar");
    fs.copyFromLocalFile(new Path(jar.toString()), launcherJar);

    JobConf jobConf = new JobConf();
    jobConf.set("user.name", getTestUser());
    jobConf.set("group.name", getTestGroup());
    jobConf.setInt("mapred.map.tasks", 1);
    jobConf.setInt("mapred.map.max.attempts", 1);
    jobConf.setInt("mapred.reduce.max.attempts", 1);

    jobConf.set("mapred.job.tracker", getJobTrackerUri());
    jobConf.set("fs.default.name", getNameNodeUri());
    injectKerberosInfo(jobConf);

    LauncherMapper lm = new LauncherMapper();
    lm.setupMainClass(jobConf, LauncherMainTester.class.getName());
    lm.setupMainArguments(jobConf, arg);

    Configuration actionConf = new XConfiguration();
    lm.setupLauncherInfo(jobConf, "1", "1@a", actionDir, "1@a-0", actionConf);

    assertEquals("1", actionConf.get("oozie.job.id"));
    assertEquals("1@a", actionConf.get("oozie.action.id"));

    DistributedCache.addFileToClassPath(new Path(launcherJar.toUri().getPath()), jobConf);

    JobClient jobClient = createJobClient();

    final RunningJob runningJob = jobClient.submitJob(jobConf);

    System.out.println("Action Dir: " + actionDir);
    System.out.println("LauncherMapper ID: " + runningJob.getJobID().toString());

    waitFor(
        180 * 1000,
        new Predicate() {
          public boolean evaluate() throws Exception {
            return runningJob.isComplete();
          }
        });
    return runningJob;
  }
Example #10
 public void setUpJobConf(JobConf job) {
   job.set(TezRuntimeFrameworkConfigs.LOCAL_DIRS, workDir.toString());
   job.set(MRConfig.LOCAL_DIR, workDir.toString());
   job.setClass(
       Constants.TEZ_RUNTIME_TASK_OUTPUT_MANAGER,
       TezLocalTaskOutputFiles.class,
       TezTaskOutput.class);
   job.set(TezJobConfig.TEZ_RUNTIME_PARTITIONER_CLASS, MRPartitioner.class.getName());
   job.setNumReduceTasks(1);
 }
Example #11
    public MiniMrShim(Configuration conf, int numberOfTaskTrackers, String nameNode, int numDir)
        throws IOException {
      this.conf = conf;

      JobConf jConf = new JobConf(conf);
      jConf.set("yarn.scheduler.capacity.root.queues", "default");
      jConf.set("yarn.scheduler.capacity.root.default.capacity", "100");

      mr = new MiniMRCluster(numberOfTaskTrackers, nameNode, numDir, null, null, jConf);
    }
Example #12
 @Override
 protected void parseArgs(JobConf conf, String[] args) {
   if (args.length != 1) {
     throw new RuntimeException("Required arguments <inputFileName>");
   }
   String inFileName = args[0];
   LOGGER.info("inFileName: %s", inFileName);
   conf.set("in", inFileName);
   conf.set(ImportMRMapper.CONFIG_SOURCE_FILE_NAME, new File(inFileName).getName());
 }
Example #13
  @Override
  public int run(String[] args) throws Exception {

    System.out.println("\n\nConvolutionJob\n");
    JobConf conf = new JobConf(getConf(), ConvolutionJob.class);
    conf.setJobName("ConvolutionJob");

    this.cacheKernel(conf);
    this.CreateRats(conf);
    conf.setMapperClass(ConvolutionMapper.class);
    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      try {
        if ("-m".equals(args[i])) {
          conf.setNumMapTasks(Integer.parseInt(args[++i]));
        } else if ("-r".equals(args[i])) {
          conf.setNumReduceTasks(Integer.parseInt(args[++i]));
        } else {
          other_args.add(args[i]);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
        return printUsage();
      }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
      System.out.println(
          "ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
      return printUsage();
    }

    conf.setNumReduceTasks(0);
    conf.setInputFormat(NonSplittableTextInputFormat.class);
    conf.setOutputFormat(MultiFileOutput.class);
    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setCompressMapOutput(true);
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    conf.set("mapred.output.compression.type", "BLOCK");

    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));
    // FileOutputFormat.setCompressOutput(conf, true);

    JobClient.runJob(conf);

    return 0;
  }
Example #14
  protected void initConfig(Map<Object, Object> properties, JobConf parentConfig) {
    if (properties != null) parentConfig = createConfig(properties, parentConfig);

    if (parentConfig == null) { // this is ok, getJobConf will pass a default parent in
      return;
    }

    jobConf = HadoopUtil.copyJobConf(parentConfig); // prevent local values from being shared
    jobConf.set("fs.http.impl", HttpFileSystem.class.getName());
    jobConf.set("fs.https.impl", HttpFileSystem.class.getName());

    syncPaths = HadoopUtil.addToClassPath(jobConf, getClassPath());
  }
Example #15
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(this.getClass());
    conf.setJobName("Domain-MR2");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);
    //		        conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    //		conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputFormat(MultiFileOutput.class);

    FileSystem fs = FileSystem.get(conf);
    fs.delete(new Path(args[1]), true); // delete output dir

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    int reducers = 272;
    int mappers = 272;
    conf.setNumMapTasks(mappers);
    conf.setNumReduceTasks(reducers);

    // set parameters
    conf.set("k", "" + k);
    conf.set("r", "" + k);
    conf.set("parts", "" + parts); // number of partitions per dimension

    System.out.println(
        "running DOMAIN with k="
            + k
            + " r="
            + r
            + " parts="
            + parts
            + " "
            + "useCellBasedAlgo="
            + useCellBasedAlgo
            + " reducers="
            + reducers
            + " mappers="
            + mappers);
    JobClient.runJob(conf);
    return 0;
  }
Example #16
  public static void main(String[] args) throws Exception {

    JobConf conf = new JobConf(AccessProcessJob.class);
    conf.set(nameNode, hdfsURL);
    conf.setJobName("AccessProcessJob");

    new Path(outputPath).getFileSystem(conf).delete(new Path(outputPath), true);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(AccessProcessMap.class);
    conf.setReducerClass(AccessProcessReduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);

    JobClient.runJob(conf);
  }
Example #17
 public JobBuilder compressor(CompressionType type, Class<? extends CompressionCodec> codec)
     throws IOException {
   _jobConf.setBoolean("mapred.output.compress", true);
   _jobConf.set("mapred.output.compression.type", type.toString());
   _jobConf.setClass("mapred.output.compression.codec", codec, CompressionCodec.class);
   return this;
 }
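In fluent use the compressor call is chained with the builder's other setters. A hedged usage sketch (the JobBuilder constructor and getJobConf accessor are assumptions, not shown in the snippet above):

 // Hypothetical usage: block-compressed gzip output for the job under construction.
 JobConf conf = new JobBuilder()
     .compressor(SequenceFile.CompressionType.BLOCK, GzipCodec.class)
     .getJobConf(); // assumed accessor for the built JobConf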
Example #18
  @Override
  protected void setConfigProperty(JobConf config, Object key, Object value) {
    // don't let these objects pass, even though toString is called below.
    if (value instanceof Class || value instanceof JobConf) return;

    config.set(key.toString(), value.toString());
  }
Example #19
 public void run() throws Exception {
   long startTime = System.currentTimeMillis();
   JobConf conf = new JobConf(ItemCFJob.class);
   conf.setJobName("ItemCF" + System.currentTimeMillis());
   conf.setNumMapTasks(10);
   conf.set(
       "io.serializations",
       "org.apache.hadoop.io.serializer.JavaSerialization,"
           + "org.apache.hadoop.io.serializer.WritableSerialization");
   StringBuilder sb = new StringBuilder();
   sb.append("--input ").append(input);
   sb.append(" --output ").append(output);
   if (flag) {
     sb.append(" --booleanData true");
   } else {
     sb.append(" --booleanData false");
   }
   sb.append(" --similarityClassname " + Constants.mahout_similarityclassname);
   sb.append(" --tempDir ").append(tmp);
   String[] args = sb.toString().split(" ");
   RecommenderJob job = new RecommenderJob();
   job.setConf(conf);
   job.run(args);
   long endTime = System.currentTimeMillis();
   logger.info(
       "recommendation job ["
           + conf.getJobName()
           + "] finished; it took "
           + (endTime - startTime) / 1000
           + "s.");
 }
Example #20
  private void setPageRankLinksOptions(JobConf job) throws URISyntaxException {
    job.setLong("pages", options.getNumPages());
    job.setLong("slotpages", options.getNumSlotPages());
    job.set("delimiter", cdelim);

    Utils.shareLinkZipfCore(options, job);
  }
Example #21
  public static void getData(CloudataConf conf, Path keyPath) throws IOException {
    JobConf jobConf = new JobConf(TeraReadJob.class);
    jobConf.set("user.name", conf.getUserId());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    Path tempOutputPath = new Path("ManyTableJob_Get_" + System.currentTimeMillis());

    jobConf.setJobName("ManyTableJob_Get_" + "(" + new Date() + ")");

    TextOutputFormat.setOutputPath(jobConf, tempOutputPath);
    // <MAP>
    jobConf.setMapperClass(ManyTableGetMap.class);
    jobConf.setInputFormat(TextInputFormat.class);
    TextInputFormat.addInputPath(jobConf, keyPath);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    // </MAP>

    // <REDUCE>
    jobConf.setNumReduceTasks(0);
    // </REDUCE>

    try {
      // Run Job
      JobClient.runJob(jobConf);
    } finally {
      // delete temp output path
      FileSystem fs = FileSystem.get(jobConf);
      FileUtil.delete(fs, tempOutputPath, true);
      CloudataMapReduceUtil.clearMapReduce(libDir);
    }
  }
Example #22
  public RunningJob run(String inputPath, String outputPath) throws Exception {

    JobConf conf = new JobConf(BuildIndex.class);
    conf.setJobName("BuildIndex");

    FileInputFormat.addInputPath(conf, new Path(inputPath)); // additional input paths may be added the same way
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(LongWritable.class);
    conf.setMapOutputValueClass(LongWritable.class);
    conf.set("delim", delim);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setInt("keyFieldIndexTwo", keyFieldIndexTwo);
    conf.setMapperClass(BuildIndexMapper.class);
    conf.setNumReduceTasks(1);
    conf.setReducerClass(BuildIndexReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    // conf.setInputFormat(CustomInputFormat.class);
    // FileOutputFormat.setCompressOutput(conf,true);
    // delete the output directory if it exists already

    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
  }
Example #23
  /*
   * Creates the configuration object necessary to run a specific vertex from
   * map work. This includes input formats, input processor, etc.
   */
  private JobConf initializeVertexConf(JobConf baseConf, MapWork mapWork) {
    JobConf conf = new JobConf(baseConf);

    if (mapWork.getNumMapTasks() != null) {
      conf.setInt(MRJobConfig.NUM_MAPS, mapWork.getNumMapTasks().intValue());
    }

    if (mapWork.getMaxSplitSize() != null) {
      HiveConf.setLongVar(
          conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, mapWork.getMaxSplitSize().longValue());
    }

    if (mapWork.getMinSplitSize() != null) {
      HiveConf.setLongVar(
          conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, mapWork.getMinSplitSize().longValue());
    }

    if (mapWork.getMinSplitSizePerNode() != null) {
      HiveConf.setLongVar(
          conf,
          HiveConf.ConfVars.MAPREDMINSPLITSIZEPERNODE,
          mapWork.getMinSplitSizePerNode().longValue());
    }

    if (mapWork.getMinSplitSizePerRack() != null) {
      HiveConf.setLongVar(
          conf,
          HiveConf.ConfVars.MAPREDMINSPLITSIZEPERRACK,
          mapWork.getMinSplitSizePerRack().longValue());
    }

    Utilities.setInputAttributes(conf, mapWork);

    String inpFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVETEZINPUTFORMAT);
    if (StringUtils.isBlank(inpFormat)) {
      inpFormat = ShimLoader.getHadoopShims().getInputFormatClassName();
    }

    if (mapWork.isUseBucketizedHiveInputFormat()) {
      inpFormat = BucketizedHiveInputFormat.class.getName();
    }

    conf.set("mapred.mapper.class", ExecMapper.class.getName());
    conf.set("mapred.input.format.class", inpFormat);

    return conf;
  }
Example #24
  @Override
  public int run(String[] args) throws Exception {
    final int ret = parseArgs(args);
    if (ret < 0) {
      return ret;
    }

    JobConf config = new JobConf(getConf(), TfIdfNovelty.class);
    config.setJobName("Influence-TfIdfNovelty");

    config.set(Fields.BASIS.get(), basisPath);
    if (datesPath != null) {
      config.set(Fields.DOC_DATES.get(), datesPath);
    }
    config.setBoolean(Fields.IGNORE.get(), ignoreDocs);
    if (bands > 0) {
      config.setInt(Fields.BANDS.get(), bands);
    }
    if (rows > 0) {
      config.setInt(Fields.ROWS.get(), rows);
    }

    SetupHelper.getInstance()
        .setSequenceInput(config, inputPath)
        .setSequenceOutput(config, outputPath);

    config.setMapOutputKeyClass(HashBandWritable.class);
    config.setMapOutputValueClass(DocumentWithVectorWritable.class);
    config.setMapperClass(TfIdfNoveltyLshMapper.class);

    if (outputBuckets) {
      config.setOutputKeyClass(HashBandWritable.class);
      config.setOutputValueClass(IntArrayWritable.class);
      config.setReducerClass(TfIdfNoveltyIdentityReducer.class);
    } else {
      config.setOutputKeyClass(Text.class);
      config.setOutputValueClass(VectorWritable.class);
      config.setReducerClass(TfIdfNoveltyReducer.class);
    }

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    JobClient.runJob(config);

    return 0;
  }
Example #25
  @SuppressWarnings({"deprecation", "null"})
  public static void preprocessAndNumberizeFiles(Configuration c, String inputPaths, Path output)
      throws IOException {
    sLogger.setLevel(Level.INFO);

    JobConf conf = new JobConf(c);

    conf.setJobName("bitext.compile");

    boolean useVocabServer = false;

    Thread vst1 = null;
    Thread vst2 = null;
    VocabServer vocabServer1 = null;
    VocabServer vocabServer2 = null;
    try {
      // inputPaths = bi-text given as input in main method of HadoopAlign
      conf.setOutputKeyClass(Text.class);
      conf.setOutputValueClass(PhrasePair.class);
      conf.setMapperClass(BitextCompilerMapper.class);
      conf.setReducerClass(IdentityReducer.class);
      conf.setNumMapTasks(1);
      conf.setNumReduceTasks(1);
      FileInputFormat.setInputPaths(conf, inputPaths);
      conf.set("stream.recordreader.begin", "<pchunk");
      conf.set("stream.recordreader.end", "</pchunk>");
      conf.set("stream.recordreader.slowmatch", "false");
      conf.set("stream.recordreader.maxrec", "100000");
      conf.setInputFormat(XMLInput.class);
      FileOutputFormat.setOutputPath(conf, output);
      conf.setOutputFormat(SequenceFileOutputFormat.class);
      conf.setJar("/chomes/fture/jars/ivory.jar");
      conf.set("mapred.child.java.opts", "-Xmx2048m");
      System.out.println("Running job " + conf.getJobName());
      System.out.println("Input: " + inputPaths);
      System.out.println("Output: " + output);
      JobClient.runJob(conf);
    } finally {
      try {
        if (vst1 != null) vocabServer1.stopServer();
        if (vst2 != null) vocabServer2.stopServer();
        if (vst1 != null) vst1.join();
        if (vst2 != null) vst2.join();
      } catch (InterruptedException e) {
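        // ignored: best-effort shutdown of the vocab server threads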
      }
    }
  }
Example #26
  @Before
  public void createMockKeyValues() throws Exception {
    // Make a MockInstance here, by setting the instance name to be the same as this mock instance
    // we can "trick" the InputFormat into using a MockInstance
    mockInstance = new MockInstance(test.getMethodName());
    inputformat = new HiveAccumuloTableInputFormat();
    conf = new JobConf();
    conf.set(AccumuloSerDeParameters.TABLE_NAME, TEST_TABLE);
    conf.set(AccumuloSerDeParameters.USE_MOCK_INSTANCE, "true");
    conf.set(AccumuloSerDeParameters.INSTANCE_NAME, test.getMethodName());
    conf.set(AccumuloSerDeParameters.USER_NAME, USER);
    conf.set(AccumuloSerDeParameters.USER_PASS, PASS);
    conf.set(AccumuloSerDeParameters.ZOOKEEPERS, "localhost:2181"); // not used for mock, but
    // required by input format.

    columnNames = Arrays.asList("name", "sid", "dgrs", "mills");
    columnTypes =
        Arrays.<TypeInfo>asList(
            TypeInfoFactory.stringTypeInfo,
            TypeInfoFactory.intTypeInfo,
            TypeInfoFactory.doubleTypeInfo,
            TypeInfoFactory.longTypeInfo);
    conf.set(AccumuloSerDeParameters.COLUMN_MAPPINGS, "cf:name,cf:sid,cf:dgrs,cf:mills");
    conf.set(serdeConstants.LIST_COLUMNS, "name,sid,dgrs,mills");
    conf.set(serdeConstants.LIST_COLUMN_TYPES, "string,int,double,bigint");

    con = mockInstance.getConnector(USER, new PasswordToken(PASS.getBytes()));
    con.tableOperations().create(TEST_TABLE);
    con.securityOperations().changeUserAuthorizations(USER, new Authorizations("blah"));
    BatchWriterConfig writerConf = new BatchWriterConfig();
    BatchWriter writer = con.createBatchWriter(TEST_TABLE, writerConf);

    Mutation m1 = new Mutation(new Text("r1"));
    m1.put(COLUMN_FAMILY, NAME, new Value("brian".getBytes()));
    m1.put(COLUMN_FAMILY, SID, new Value(parseIntBytes("1")));
    m1.put(COLUMN_FAMILY, DEGREES, new Value(parseDoubleBytes("44.5")));
    m1.put(COLUMN_FAMILY, MILLIS, new Value(parseLongBytes("555")));

    Mutation m2 = new Mutation(new Text("r2"));
    m2.put(COLUMN_FAMILY, NAME, new Value("mark".getBytes()));
    m2.put(COLUMN_FAMILY, SID, new Value(parseIntBytes("2")));
    m2.put(COLUMN_FAMILY, DEGREES, new Value(parseDoubleBytes("55.5")));
    m2.put(COLUMN_FAMILY, MILLIS, new Value(parseLongBytes("666")));

    Mutation m3 = new Mutation(new Text("r3"));
    m3.put(COLUMN_FAMILY, NAME, new Value("dennis".getBytes()));
    m3.put(COLUMN_FAMILY, SID, new Value(parseIntBytes("3")));
    m3.put(COLUMN_FAMILY, DEGREES, new Value(parseDoubleBytes("65.5")));
    m3.put(COLUMN_FAMILY, MILLIS, new Value(parseLongBytes("777")));

    writer.addMutation(m1);
    writer.addMutation(m2);
    writer.addMutation(m3);

    writer.close();
  }
Example #27
 public static void setAggregatorDescriptors(
     JobConf job, Class<? extends ValueAggregatorDescriptor>[] descriptors) {
   job.setInt("aggregator.descriptor.num", descriptors.length);
   // specify the aggregator descriptors
   for (int i = 0; i < descriptors.length; i++) {
     job.set("aggregator.descriptor." + i, "UserDefined," + descriptors[i].getName());
   }
 }
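The read side simply walks the indexed keys back out of the conf. A sketch mirroring how the aggregate framework recovers the descriptor specs (the method name is illustrative):

 // Sketch: recover the "UserDefined,<className>" specs written by the setter above.
 public static String[] getAggregatorDescriptors(JobConf job) {
   int num = job.getInt("aggregator.descriptor.num", 0);
   String[] specs = new String[num];
   for (int i = 0; i < num; i++) {
     specs[i] = job.get("aggregator.descriptor." + i);
   }
   return specs;
 }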
Example #28
 @SuppressWarnings("rawtypes")
 @Override
 public void sinkConfInit(
     FlowProcess<JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
   DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf);
   jobConf.set(TupleWriteSupport.PARQUET_CASCADING_SCHEMA, parquetSchema);
   ParquetOutputFormat.setWriteSupportClass(jobConf, TupleWriteSupport.class);
 }
Example #29
 /**
  * Set the column names and types into the job conf for the input format to use.
  *
  * @param job the job to update
  * @param cols the columns of the table
  */
 private void setColumnTypes(JobConf job, List<FieldSchema> cols) {
   StringBuilder colNames = new StringBuilder();
   StringBuilder colTypes = new StringBuilder();
   boolean isFirst = true;
   for (FieldSchema col : cols) {
     if (isFirst) {
       isFirst = false;
     } else {
       colNames.append(',');
       colTypes.append(',');
     }
     colNames.append(col.getName());
     colTypes.append(col.getType());
   }
   job.set(serdeConstants.LIST_COLUMNS, colNames.toString());
   job.set(serdeConstants.LIST_COLUMN_TYPES, colTypes.toString());
 }
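For example, a two-column table yields simple comma-joined lists. A small worked sketch (assuming the standard three-argument FieldSchema constructor of name, type, and comment):

 // Worked example: columns (name string, sid int) produce comma-joined lists.
 List<FieldSchema> cols = Arrays.asList(
     new FieldSchema("name", "string", null),
     new FieldSchema("sid", "int", null));
 setColumnTypes(job, cols);
 // job now holds:
 //   serdeConstants.LIST_COLUMNS      -> "name,sid"
 //   serdeConstants.LIST_COLUMN_TYPES -> "string,int"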
Example #30
  private void createPageRankLinksDirectly() throws IOException, URISyntaxException {

    log.info("Creating PageRank links", null);

    JobConf job = new JobConf(PagerankData.class);
    String jobname = "Create pagerank links";

    Path fout = new Path(options.getResultPath(), EDGES_DIR_NAME);

    job.setJobName(jobname);
    setPageRankLinksOptions(job);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    //		job.setMapOutputKeyClass(LongWritable.class);
    //		job.setMapOutputValueClass(Text.class);

    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, dummy.getPath());
    job.setInputFormat(NLineInputFormat.class);

    job.setMapperClass(DummyToPageRankLinksMapper.class);

    if (options.isSequenceOut()) {
      job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
      job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.getCodecClass()) {
      job.set("mapred.output.compression.type", "BLOCK");
      job.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
      FileOutputFormat.setCompressOutput(job, true);
      FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass());
    }

    FileOutputFormat.setOutputPath(job, fout);

    log.info("Running Job: " + jobname);
    log.info("Dummy file " + dummy.getPath() + " as input");
    log.info("Edges file " + fout + " as output");
    JobClient.runJob(job);
    log.info("Finished Running Job: " + jobname);
  }