Java FileOutputFormat Examples, org.apache.hadoop.mapred.FileOutputFormat Java Examples

Example #1

0

Show file

File: AFormatterWG.java Project: ezubaric/Cloud9

  public int run(String[] args) throws Exception {

    if (args.length != 5) {
      printUsage();
      return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];

    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    String stoplistPath = args[4];

    sLogger.info("Tool: AFormatter");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(AFormatterWG.class);
    conf.setJobName("Authority Formatter -- Web Graph");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    // conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HITSNode.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setCompressMapOutput(true);
    conf.setSpeculativeExecution(false);
    // InputSampler.Sampler<IntWritable, Text> sampler = new
    // InputSampler.RandomSampler<IntWritable, Text>(0.1, 10, 10);
    // InputSampler.writePartitionFile(conf, sampler);
    // conf.setPartitionerClass(TotalOrderPartitioner.class);
    conf.setMapperClass(AFormatMapperIMC.class);
    conf.setCombinerClass(AFormatReducer.class);
    conf.setReducerClass(AFormatReducer.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    Path stopList = new Path(stoplistPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    sLogger.info("Starting job");
    DistributedCache.addCacheFile(stopList.toUri(), conf);
    JobClient.runJob(conf);
    sLogger.info(
        "Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
  }

Example #2

0

Show file

File: JobUtil.java Project: riteshbagrecha/jumbune

 /**
  * This method call when injected into a class will modify the output path, only if output is into
  * HDFS
  *
  * @param job Job whose output path need to be changed
  */
 public static void modifyOutputPath(JobConf job) {
   Path path = org.apache.hadoop.mapred.FileOutputFormat.getOutputPath(job);
   if (path == null) {
     throw new IllegalArgumentException("Job Output path is null, expecting not null path value");
   }
   StringBuilder out = new StringBuilder(path.toString());
   out.append(SEPARATOR_UNDERSCORE).append(System.currentTimeMillis());
   org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(job, new Path(out.toString()));
 }

Example #3

0

Show file

File: MultiFileWordCount.java Project: njoubert/UndergraduateProjects

  public int run(String[] args) throws Exception {

    if (args.length < 2) {
      printUsage();
      return 1;
    }

    JobConf job = new JobConf(getConf(), MultiFileWordCount.class);
    job.setJobName("MultiFileWordCount");

    // set the InputFormat of the job to our InputFormat
    job.setInputFormat(MyInputFormat.class);

    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    // use the defined mapper
    job.setMapperClass(MapClass.class);
    // use the WordCount Reducer
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);

    return 0;
  }

Example #4

0

Show file

File: HbaseMR.java Project: chinahuangxin/prjs

  public static void htableFile() throws Exception {
    Job job = new Job(conf, "ExampleSummaryToFile");
    job.setJarByClass(HbaseMR.class); // class that contains mapper

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for
    // MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    // set other scan attrs

    TableMapReduceUtil.initTableMapperJob(
        "sourceTable", // input table
        scan, // Scan instance to control CF and attribute selection
        MyMapper.class, // mapper class
        Text.class, // mapper output key
        IntWritable.class, // mapper output value
        job);
    job.setReducerClass(MyReducer4.class); // reducer class
    job.setNumReduceTasks(1); // at least one, adjust as required
    FileOutputFormat.setOutputPath(new JobConf(conf), new Path("/tmp/mr/mySummaryFile")); // adjust
    // directories
    // as
    // required

    boolean b = job.waitForCompletion(true);
    if (!b) {
      throw new IOException("error with job!");
    }
  }

Example #5

0

Show file

File: AvroMR.java Project: EricDoug/Hadoop-Beginner-s-Guide-Code

  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("UFO count");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: avro UFO counter <in> <out>");
      System.exit(2);
    }

    FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
    Path outputPath = new Path(otherArgs[1]);
    FileOutputFormat.setOutputPath(conf, outputPath);
    outputPath.getFileSystem(conf).delete(outputPath);
    Schema input_schema = Schema.parse(getClass().getResourceAsStream("ufo.avsc"));
    AvroJob.setInputSchema(conf, input_schema);
    AvroJob.setMapOutputSchema(
        conf,
        Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));

    AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);
    AvroJob.setMapperClass(conf, AvroRecordMapper.class);
    AvroJob.setReducerClass(conf, AvroRecordReducer.class);
    conf.setInputFormat(AvroInputFormat.class);
    JobClient.runJob(conf);

    return 0;
  }

Example #6

0

Show file

File: BayesClassifierDriver.java Project: guitao/tyful

  /**
   * Run the job
   *
   * @param params The Job parameters containing the gramSize, input output folders, defaultCat,
   *     encoding
   */
  public static void runJob(Parameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesClassifierDriver.class);
    conf.setJobName("Bayes Classifier Driver running over input: " + params.get("testDirPath"));
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(params.get("testDirPath")));
    Path outPath = new Path(params.get("testDirPath") + "-output");
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setMapperClass(BayesClassifierMapper.class);
    conf.setCombinerClass(BayesClassifierReducer.class);
    conf.setReducerClass(BayesClassifierReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.set(
        "io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    HadoopUtil.overwriteOutput(outPath);
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);

    Path outputFiles = new Path(outPath, "part*");
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    ConfusionMatrix matrix = readResult(dfs, outputFiles, conf, params);
    log.info("{}", matrix.summarize());
  }

Example #7

0

Show file

File: Compressible.java Project: nabsrock786/Hadoop_NGS_Lakshman

  public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: Compressible");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    // JobConf conf = new JobConf(Stats.class);
    JobConf conf = new JobConf(Compressible.class);
    conf.setJobName("Compressible " + inputPath);

    BrushConfig.initializeConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(CompressibleMapper.class);
    conf.setReducerClass(CompressibleReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
  }

Example #8

0

Show file

File: NotInFinder.java Project: KGayan/Acacia

  public static void main(String[] args) throws Exception {
    String dir1 = "/user/miyuru/wcout";
    String dir2 = "/user/miyuru/notinverts";
    // We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    if (fs1.exists(new Path(dir2))) {
      fs1.delete(new Path(dir2), true);
    }

    JobConf conf = new JobConf();
    conf.setNumMapTasks(96);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setMapperClass(TokenizerMapper.class);
    conf.setReducerClass(IntSumReducer.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(dir1));
    FileOutputFormat.setOutputPath(conf, new Path(dir2));
    Job job = new Job(conf, "NotInFinder");
    job.setJarByClass(WordCount.class);
    //   job.setMapperClass(TokenizerMapper.class);
    //   job.setCombinerClass(IntSumReducer.class);
    //   job.setReducerClass(IntSumReducer.class);
    //   job.setOutputKeyClass(LongWritable.class);
    //   job.setOutputValueClass(LongWritable.class);

    job.setSortComparatorClass(SortComparator.class);
    job.waitForCompletion(true);
  }

Example #9

0

Show file

File: WordCountExtended.java Project: st3ffwo3/hadoop_samples

  /**
   * {@inheritDoc}
   *
   * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
   */
  @Override
  public int run(String[] args) throws Exception {
    JobConf configuration = new JobConf(getConf(), WordCountExtended.class);
    configuration.setJobName(JOB_NAME);

    configuration.setOutputKeyClass(Text.class);
    configuration.setOutputValueClass(IntWritable.class);

    configuration.setMapperClass(Map.class);
    configuration.setCombinerClass(Reduce.class);
    configuration.setReducerClass(Reduce.class);

    configuration.setInputFormat(TextInputFormat.class);
    configuration.setOutputFormat(TextOutputFormat.class);

    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      if (JOB_SKIP_ARGUMENT.equals(args[i])) {
        DistributedCache.addCacheFile(new Path(args[++i]).toUri(), configuration);
        configuration.setBoolean(JOB_PARAMETER_SKIP_PATTERNS, true);
      } else {
        otherArgs.add(args[i]);
      }
    }

    FileInputFormat.setInputPaths(configuration, new Path(otherArgs.get(0)));
    FileOutputFormat.setOutputPath(configuration, new Path(otherArgs.get(1)));

    JobClient.runJob(configuration);
    return 0;
  }

Example #10

0

Show file

File: KPITime.java Project: caichangqi/HadoopDemo

  public static void main(String[] args) throws Exception {
    String input = "hdfs://centos:9000/access.log.10";
    String output = "hdfs://centos:9000/out_kpitime";

    JobConf conf = new JobConf(KPITime.class);
    conf.setJobName("KPITime");
    //        conf.addResource("classpath:/hadoop/core-site.xml");
    //        conf.addResource("classpath:/hadoop/hdfs-site.xml");
    //        conf.addResource("classpath:/hadoop/mapred-site.xml");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(KPITimeMapper.class);
    conf.setCombinerClass(KPITimeReducer.class);
    conf.setReducerClass(KPITimeReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    JobClient.runJob(conf);
    System.exit(0);
  }

Example #11

0

Show file

File: TestParseMethods.java Project: Noddy76/sqoop

  public void runParseTest(
      String fieldTerminator,
      String lineTerminator,
      String encloser,
      String escape,
      boolean encloseRequired)
      throws IOException {

    ClassLoader prevClassLoader = null;

    String[] argv =
        getArgv(true, fieldTerminator, lineTerminator, encloser, escape, encloseRequired);
    runImport(argv);
    try {
      String tableClassName = getTableName();

      argv = getArgv(false, fieldTerminator, lineTerminator, encloser, escape, encloseRequired);
      SqoopOptions opts = new ImportTool().parseArguments(argv, null, null, true);

      CompilationManager compileMgr = new CompilationManager(opts);
      String jarFileName = compileMgr.getJarFilename();

      // Make sure the user's class is loaded into our address space.
      prevClassLoader = ClassLoaderStack.addJarFile(jarFileName, tableClassName);

      JobConf job = new JobConf();
      job.setJar(jarFileName);

      // Tell the job what class we're testing.
      job.set(ReparseMapper.USER_TYPE_NAME_KEY, tableClassName);

      // use local mode in the same JVM.
      ConfigurationHelper.setJobtrackerAddr(job, "local");
      if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
        job.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
      }
      String warehouseDir = getWarehouseDir();
      Path warehousePath = new Path(warehouseDir);
      Path inputPath = new Path(warehousePath, getTableName());
      Path outputPath = new Path(warehousePath, getTableName() + "-out");

      job.setMapperClass(ReparseMapper.class);
      job.setNumReduceTasks(0);
      FileInputFormat.addInputPath(job, inputPath);
      FileOutputFormat.setOutputPath(job, outputPath);

      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(NullWritable.class);

      JobClient.runJob(job);
    } catch (InvalidOptionsException ioe) {
      fail(ioe.toString());
    } catch (ParseException pe) {
      fail(pe.toString());
    } finally {
      if (null != prevClassLoader) {
        ClassLoaderStack.setCurrentClassLoader(prevClassLoader);
      }
    }
  }

Example #12

0

Show file

File: Step2.java Project: hss123/maven_hadoop_template

  public static void run(Map<String, String> path) throws IOException {
    JobConf conf = Recommend.config();

    String input = path.get("Step2Input");
    String output = path.get("Step2Output");

    HdfsDAO hdfs = new HdfsDAO(Recommend.HDFS, conf);
    hdfs.rmr(output);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Step2_UserVectorToCooccurrenceMapper.class);
    //        conf.setCombinerClass(Step2_UserVectorToConoccurrenceReducer.class);
    //        conf.setReducerClass(Step2_UserVectorToConoccurrenceReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    RunningJob job = JobClient.runJob(conf);
    while (!job.isComplete()) {
      job.waitForCompletion();
    }
  }

Example #13

0

Show file

File: KPIPV.java Project: cyhrosefer/note

  public static void main(String[] args) throws Exception {
    String input = "hdfs://192.168.0.110:9000/input/access.log";
    String output = "hdfs://192.168.0.110:9000/user/hdfs/pv";

    JobConf conf = new JobConf(KPIPV.class);
    conf.setJobName("KPIPV");

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(KPIPVMapper.class);
    conf.setCombinerClass(KPIPVReducer.class);
    conf.setReducerClass(KPIPVReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    JobClient.runJob(conf);
    System.exit(0);
  }

Example #14

0

Show file

File: BuildGraph.java Project: avijitgupta/contrail-bio

  public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: BuildGraph");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(BuildGraph.class);
    conf.setJobName("BuildGraph " + inputPath + " " + ContrailConfig.K);

    ContrailConfig.initializeConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(BuildGraphMapper.class);
    conf.setReducerClass(BuildGraphReducer.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
  }

Example #15

0

Show file

File: CDMahoutEvaluator.java Project: maximzhao/Mahout-GSOC-LibLinear

  /**
   * Configure the job
   *
   * @param conf Job to configure
   * @param rules classification rules to evaluate
   * @param target label value to evaluate the rules for
   * @param inpath input path (the dataset)
   * @param outpath output <code>Path</code>
   * @param split DatasetSplit used to separate training and testing input
   */
  private static void configureJob(
      JobConf conf,
      List<? extends Rule> rules,
      int target,
      Path inpath,
      Path outpath,
      DatasetSplit split) {
    split.storeJobParameters(conf);

    FileInputFormat.setInputPaths(conf, inpath);
    FileOutputFormat.setOutputPath(conf, outpath);

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(CDFitness.class);

    conf.setMapperClass(CDMapper.class);
    conf.setCombinerClass(CDReducer.class);
    conf.setReducerClass(CDReducer.class);

    conf.setInputFormat(DatasetTextInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    // store the parameters
    conf.set(CDMapper.CLASSDISCOVERY_RULES, StringUtils.toString(rules));
    conf.set(CDMapper.CLASSDISCOVERY_DATASET, StringUtils.toString(DataSet.getDataSet()));
    conf.setInt(CDMapper.CLASSDISCOVERY_TARGET_LABEL, target);
  }

Example #16

0

Show file

File: DistRaid.java Project: fire9/hadoop-20

  /**
   * set up input file which has the list of input files.
   *
   * @return boolean
   * @throws IOException
   */
  private boolean setup() throws IOException {
    estimateSavings();

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobconf);
    Path jobdir = new Path(jClient.getSystemDir(), NAME + "_" + randomId);

    LOG.info(JOB_DIR_LABEL + "=" + jobdir);
    jobconf.set(JOB_DIR_LABEL, jobdir.toString());
    Path log = new Path(jobdir, "_logs");

    // The control file should have small size blocks. This helps
    // in spreading out the load from mappers that will be spawned.
    jobconf.setInt("dfs.blocks.size", OP_LIST_BLOCK_SIZE);

    FileOutputFormat.setOutputPath(jobconf, log);
    LOG.info("log=" + log);

    // create operation list
    FileSystem fs = jobdir.getFileSystem(jobconf);
    Path opList = new Path(jobdir, "_" + OP_LIST_LABEL);
    jobconf.set(OP_LIST_LABEL, opList.toString());
    int opCount = 0, synCount = 0;
    SequenceFile.Writer opWriter = null;

    try {
      opWriter =
          SequenceFile.createWriter(
              fs, jobconf, opList, Text.class, PolicyInfo.class, SequenceFile.CompressionType.NONE);
      for (RaidPolicyPathPair p : raidPolicyPathPairList) {
        // If a large set of files are Raided for the first time, files
        // in the same directory that tend to have the same size will end up
        // with the same map. This shuffle mixes things up, allowing a better
        // mix of files.
        java.util.Collections.shuffle(p.srcPaths);
        for (FileStatus st : p.srcPaths) {
          opWriter.append(new Text(st.getPath().toString()), p.policy);
          opCount++;
          if (++synCount > SYNC_FILE_MAX) {
            opWriter.sync();
            synCount = 0;
          }
        }
      }

    } finally {
      if (opWriter != null) {
        opWriter.close();
      }
      fs.setReplication(opList, OP_LIST_REPLICATION); // increase replication for control file
    }
    raidPolicyPathPairList.clear();

    jobconf.setInt(OP_COUNT_LABEL, opCount);
    LOG.info("Number of files=" + opCount);
    jobconf.setNumMapTasks(
        getMapCount(opCount, new JobClient(jobconf).getClusterStatus().getTaskTrackers()));
    LOG.info("jobName= " + jobName + " numMapTasks=" + jobconf.getNumMapTasks());
    return opCount != 0;
  }

Example #17

0

Show file

File: BuildIndex.java Project: sofiecone/MapRedJoinsLib

  public RunningJob run(String inputPath, String outputPath) throws Exception {

    JobConf conf = new JobConf(BuildIndex.class);
    conf.setJobName("BuildIndex");

    FileInputFormat.addInputPath(conf, new Path(inputPath)); // multiple path
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(LongWritable.class);
    conf.setMapOutputValueClass(LongWritable.class);
    conf.set("delim", delim);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setInt("keyFieldIndexTwo", keyFieldIndexTwo);
    conf.setMapperClass(BuildIndexMapper.class);
    conf.setNumReduceTasks(1);
    conf.setReducerClass(BuildIndexReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    // conf.setInputFormat(CustomInputFormat.class);
    // FileOutputFormat.setCompressOutput(conf,true);
    // delete the output directory if it exists already

    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
  }

Example #18

0

Show file

File: CassandraBulkLoader.java Project: devdattakulkarni/Cassandra-KVAC

  public static void runJob(String[] args) {
    JobConf conf = new JobConf(CassandraBulkLoader.class);

    if (args.length >= 4) {
      conf.setNumReduceTasks(new Integer(args[3]));
    }

    try {
      // We store the cassandra storage-conf.xml on the HDFS cluster
      DistributedCache.addCacheFile(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), conf);
    } catch (URISyntaxException e) {
      throw new RuntimeException(e);
    }
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setJobName("CassandraBulkLoader_v2");
    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[1]));
    FileOutputFormat.setOutputPath(conf, new Path(args[2]));
    try {
      JobClient.runJob(conf);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

Example #19

0

Show file

File: MaxTemperatureDriver.java Project: saintstack/hadoop-book

  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.printf(
          "Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }

    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Max temperature");

    FileInputFormat.addInputPath(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MaxTemperatureMapper.class);
    conf.setCombinerClass(MaxTemperatureReducer.class);
    conf.setReducerClass(MaxTemperatureReducer.class);

    // vv MaxTemperatureDriverV6
    conf.setProfileEnabled(true);
    conf.setProfileParams(
        "-agentlib:hprof=cpu=samples,heap=sites,depth=6," + "force=n,thread=y,verbose=n,file=%s");
    conf.setProfileTaskRange(true, "0-2");
    // ^^ MaxTemperatureDriverV6

    JobClient.runJob(conf);
    return 0;
  }

Example #20

0

Show file

File: SampleJob.java Project: hao707822882/hadoop-book-1

  public static void runSortJob(String... args) throws Exception {

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);

    JobConf job = new JobConf();

    job.setNumReduceTasks(2);

    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setJarByClass(SampleJob.class);

    output.getFileSystem(job).delete(output, true);

    JobClient jc = new JobClient(job);
    JobClient.setTaskOutputFilter(job, JobClient.TaskStatusFilter.ALL);
    RunningJob rj = jc.submitJob(job);
    try {
      if (!jc.monitorAndPrintJob(job, rj)) {
        System.out.println("Job Failed: " + rj.getFailureInfo());
        throw new IOException("Job failed!");
      }
    } catch (InterruptedException ie) {
      Thread.currentThread().interrupt();
    }
  }

Example #21

0

Show file

File: RandomWriter.java Project: heipacker/wordcount

 /**
  * Generate the requested number of file splits, with the filename set to the filename of the
  * output file.
  */
 public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
   /** 设置输入分片的个数* */
   JobClient client = new JobClient(job);
   ClusterStatus cluster = client.getClusterStatus();
   /** 如果属性不存在 则返回默认的值 * */
   int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
   long numBytesToWritePerMap =
       job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
   if (numBytesToWritePerMap == 0) {
     System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
   }
   long totalBytesToWrite =
       job.getLong(
           "test.randomwrite.total_bytes",
           numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
   int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
   if (numMaps == 0 && totalBytesToWrite > 0) {
     numMaps = 1;
   }
   System.out.println("numMaps-------" + numMaps);
   InputSplit[] result = new InputSplit[numMaps];
   Path outDir = FileOutputFormat.getOutputPath(job);
   for (int i = 0; i < result.length; ++i) {
     result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1, (String[]) null);
   }
   return result;
 }

Example #22

0

Show file

File: Injector.java Project: soolr/nutch-1.7

  public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: starting at " + sdf.format(start));
      LOG.info("Injector: crawlDb: " + crawlDb);
      LOG.info("Injector: urlDir: " + urlDir);
    }

    Path tempDir =
        new Path(
            getConf().get("mapred.temp.dir", ".")
                + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    FileInputFormat.addInputPath(sortJob, urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    RunningJob mapJob = JobClient.runJob(sortJob);

    long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
    long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
    LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
    LOG.info(
        "Injector: total number of urls injected after normalization and filtering: "
            + urlsInjected);

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(InjectReducer.class);
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info(
        "Injector: finished at "
            + sdf.format(end)
            + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
  }

Example #23

0

Show file

File: WordCountSeqOutput.java Project: tanping/MapRedToRc

  /**
   * The main driver for word count map/reduce program. Invoke this method to submit the map/reduce
   * job.
   *
   * @throws IOException When there is communication problems with the job tracker.
   */
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WordCountSeqOutput.class);
    conf.setJobName("wordcount_seqOF");

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);
    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    // conf.setOutputValueClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Combiner.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputFormat(SequenceFileOutputFormat.class);

    //      // compress Mapper output
    //      conf.setCompressMapOutput(true);
    //      conf.setMapOutputCompressorClass(org.apache.hadoop.io.compress.GzipCodec.class);

    // compress final output
    conf.set("mapred.output.compress", conf.get("mapred.output.compress", "true"));
    conf.set("mapred.output.compression.type", conf.get("mapred.output.compression.type", "BLOCK"));
    conf.set(
        "mapred.output.compression.codec",
        conf.get("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"));

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      try {
        if ("-m".equals(args[i])) {
          conf.setNumMapTasks(Integer.parseInt(args[++i]));
        } else if ("-r".equals(args[i])) {
          conf.setNumReduceTasks(Integer.parseInt(args[++i]));
        } else {
          other_args.add(args[i]);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
        return printUsage();
      }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
      System.out.println(
          "ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
      return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
  }

Example #24

0

Show file

File: RandomWriter.java Project: heipacker/wordcount

  /**
   * This is the main routine for launching a distributed random write job. It runs 10 maps/node and
   * each node writes 1 gig of data to a DFS file. The reduce doesn't do anything.
   *
   * @throws IOException
   */
  public int run(String[] args) throws Exception {
    if (args.length == 0) {
      System.out.println("Usage: writer <out-dir>");
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    Path outDir = new Path(args[0]);
    JobConf job = new JobConf(getConf());

    job.setJarByClass(RandomWriter.class);
    job.setJobName("random-writer");
    FileOutputFormat.setOutputPath(job, outDir);

    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BytesWritable.class);

    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(IdentityReducer.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient client = new JobClient(job);
    ClusterStatus cluster = client.getClusterStatus();
    /** 如果属性不存在 则返回默认的值 * */
    int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
    long numBytesToWritePerMap =
        job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
      System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
      return -2;
    }
    long totalBytesToWrite =
        job.getLong(
            "test.randomwrite.total_bytes",
            numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
      numMaps = 1;
      job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite);
    }

    job.setNumMapTasks(numMaps);
    /** 建议型的 * */
    System.out.println("Running " + numMaps + " maps.");

    // reducer NONE
    job.setNumReduceTasks(0);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(job);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println(
        "The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

    return 0;
  }

Example #25

0

Show file

File: InstrumentDataflow.java Project: MarvinLiu0810/HiTune

  /* (non-Javadoc)
   * @see org.apache.hadoop.chukwa.analysis.HiTune.AnalysisProcessor#run()
   */
  @Override
  public void run() {
    // TODO Auto-generated method stub

    long timestamp = System.currentTimeMillis();

    JobConf conf = new JobConf(this.conf, InstrumentDataflow.class);
    try {
      conf.setJobName(this.getClass().getSimpleName() + timestamp);
      conf.setInputFormat(MultiSequenceFileInputFormat.class);
      conf.setMapperClass(InstrumentDataflow.MapClass.class);
      conf.setReducerClass(InstrumentDataflow.ReduceClass.class);
      conf.setOutputKeyClass(Text.class);
      Class<? extends WritableComparable> outputKeyClass =
          Class.forName(conf.get(AnalysisProcessorConfiguration.mapoutputKeyClass))
              .asSubclass(WritableComparable.class);
      Class<? extends Writable> outputValueClass =
          Class.forName(conf.get(AnalysisProcessorConfiguration.mapoutputValueClass))
              .asSubclass(Writable.class);
      conf.setMapOutputKeyClass(outputKeyClass);
      conf.setMapOutputValueClass(outputValueClass);

      conf.setOutputValueClass(TextArrayWritable.class);
      conf.setOutputFormat(CSVFileOutputFormat.class);

      String outputPaths =
          conf.get(AnalysisProcessorConfiguration.reportfolder)
              + "/"
              + conf.get(AnalysisProcessorConfiguration.reportfile);
      String temp_outputPaths = getTempOutputDir(outputPaths);

      if (this.inputfiles != null) {
        log.debug("inputPaths:" + inputfiles);
        FileInputFormat.setInputPaths(conf, inputfiles);
        FileOutputFormat.setOutputPath(conf, new Path(temp_outputPaths));

        // FileInputFormat.setInputPathFilter(conf, evtFileFilter.class);
        // conf.setNumReduceTasks(1);

        try {
          JobClient.runJob(conf);
          moveResults(conf, outputPaths, temp_outputPaths);
        } catch (IOException e) {
          // TODO Auto-generated catch block
          log.warn("For " + getOutputFileName() + " :JOB fails!");
          log.warn(e);
          e.printStackTrace();
          this.MOVE_DONE = false;
        }

      } else {
        log.warn("For " + getOutputFileName() + " :No input path!");
      }
    } catch (Exception e) {
      log.warn("Job preparation failure!");
      log.warn(e);
      e.printStackTrace();
    }
  }

Example #26

0

Show file

File: MongoDBScheme.java Project: brush51/PredictionIO

  /**
   * @param process
   * @param tap
   * @param conf
   */
  @Override
  public void sinkConfInit(
      FlowProcess<JobConf> process, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {
    conf.setOutputFormat(MongoOutputFormat.class);
    MongoConfigUtil.setOutputURI(conf, this.mongoUri);

    FileOutputFormat.setOutputPath(conf, getPath());
  }

Example #27

0

Show file

File: TestCompressionEmulationUtils.java Project: pwendell/mr2-fairscheduler

  /** Test compressible {@link GridmixRecord}. */
  @Test
  public void testCompressibleGridmixRecord() throws IOException {
    JobConf conf = new JobConf();
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
    CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);

    FileSystem lfs = FileSystem.getLocal(conf);
    int dataSize = 1024 * 1024 * 10; // 10 MB
    float ratio = 0.357F;

    // define the test's root temp directory
    Path rootTempDir =
        new Path(System.getProperty("test.build.data", "/tmp"))
            .makeQualified(lfs.getUri(), lfs.getWorkingDirectory());

    Path tempDir = new Path(rootTempDir, "TestPossiblyCompressibleGridmixRecord");
    lfs.delete(tempDir, true);

    // define a compressible GridmixRecord
    GridmixRecord record = new GridmixRecord(dataSize, 0);
    record.setCompressibility(true, ratio); // enable compression

    conf.setClass(FileOutputFormat.COMPRESS_CODEC, GzipCodec.class, CompressionCodec.class);
    org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput(conf, true);

    // write the record to a file
    Path recordFile = new Path(tempDir, "record");
    OutputStream outStream =
        CompressionEmulationUtil.getPossiblyCompressedOutputStream(recordFile, conf);
    DataOutputStream out = new DataOutputStream(outStream);
    record.write(out);
    out.close();
    outStream.close();

    // open the compressed stream for reading
    Path actualRecordFile = recordFile.suffix(".gz");
    InputStream in =
        CompressionEmulationUtil.getPossiblyDecompressedInputStream(actualRecordFile, conf, 0);

    // get the compressed file size
    long compressedFileSize = lfs.listStatus(actualRecordFile)[0].getLen();

    GridmixRecord recordRead = new GridmixRecord();
    recordRead.readFields(new DataInputStream(in));

    assertEquals(
        "Record size mismatch in a compressible GridmixRecord", dataSize, recordRead.getSize());
    assertTrue(
        "Failed to generate a compressible GridmixRecord",
        recordRead.getSize() > compressedFileSize);

    // check if the record can generate data with the desired compression ratio
    float seenRatio = ((float) compressedFileSize) / dataSize;
    assertEquals(
        CompressionEmulationUtil.standardizeCompressionRatio(ratio),
        CompressionEmulationUtil.standardizeCompressionRatio(seenRatio),
        1.0D);
  }

Example #28

0

Show file

File: PagerankData.java Project: ConeyLiu/HiBench

  private void createPageRankLinksDirectly() throws IOException, URISyntaxException {

    log.info("Creating PageRank links", null);

    JobConf job = new JobConf(PagerankData.class);
    String jobname = "Create pagerank links";

    Path fout = new Path(options.getResultPath(), EDGES_DIR_NAME);

    job.setJobName(jobname);
    setPageRankLinksOptions(job);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    //		job.setMapOutputKeyClass(LongWritable.class);
    //		job.setMapOutputValueClass(Text.class);

    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, dummy.getPath());
    job.setInputFormat(NLineInputFormat.class);

    job.setMapperClass(DummyToPageRankLinksMapper.class);

    if (options.isSequenceOut()) {
      job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
      job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.getCodecClass()) {
      job.set("mapred.output.compression.type", "BLOCK");
      job.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
      FileOutputFormat.setCompressOutput(job, true);
      FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass());
    }

    FileOutputFormat.setOutputPath(job, fout);

    log.info("Running Job: " + jobname);
    log.info("Dummy file " + dummy.getPath() + " as input");
    log.info("Edges file " + fout + " as output");
    JobClient.runJob(job);
    log.info("Finished Running Job: " + jobname);
  }

Example #29

0

Show file

File: DemoCountTrecDocuments.java Project: zgap118/Cloud9

  /** Runs this tool. */
  public int run(String[] args) throws Exception {
    if (args.length != 3) {
      printUsage();
      return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String mappingFile = args[2];

    LOG.info("Tool: " + DemoCountTrecDocuments.class.getCanonicalName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output dir: " + outputPath);
    LOG.info(" - docno mapping file: " + mappingFile);

    JobConf conf = new JobConf(getConf(), DemoCountTrecDocuments.class);
    conf.setJobName(DemoCountTrecDocuments.class.getSimpleName());

    conf.setNumReduceTasks(0);

    // Pass in the class name as a String; this is makes the mapper general in being able to load
    // any collection of Indexable objects that has docid/docno mapping specified by a DocnoMapping
    // object.
    conf.set("DocnoMappingClass", TrecDocnoMapping.class.getCanonicalName());

    // Put the mapping file in the distributed cache so each map worker will have it.
    DistributedCache.addCacheFile(new URI(mappingFile), conf);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(TrecDocumentInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
  }

Example #30

0

Show file

File: Kmeans.java Project: LeoYao/ComparativeStudy_HPCC_Hadoop

  public static int main(String[] args) throws Exception {

    int i;
    String outPath;
    int numMaps = 0, numReds = 0;

    List<String> other_args = new ArrayList<String>();
    for (i = 0; i < args.length; ++i) {
      try {
        if ("-m".equals(args[i])) {
          numMaps = Integer.parseInt(args[++i]);
        } else if ("-r".equals(args[i])) {
          numReds = Integer.parseInt(args[++i]);
        } else {
          other_args.add(args[i]);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
        printUsage(); // exits
      }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
      System.out.println(
          "ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
      printUsage();
    }

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    Date startIteration;
    Date endIteration;
    JobConf conf = new JobConf(Kmeans.class);
    conf.setJobName("kmeans");
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(ClusterWritable.class);
    conf.setMapperClass(MapClass.class);
    conf.setReducerClass(Reduce.class);
    conf.setNumMapTasks(numMaps);
    conf.setNumReduceTasks(numReds);
    FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));
    outPath = new String(other_args.get(1));
    FileOutputFormat.setOutputPath(conf, new Path(outPath));
    startIteration = new Date();
    JobClient.runJob(conf);
    endIteration = new Date();
    System.out.println(
        "The iteration took "
            + (endIteration.getTime() - startIteration.getTime()) / 1000
            + " seconds.");
    return 0;
  }