Esempi in Java per JobConf.setNumMapTasks, esempi in Java per org.apache.hadoop.mapred.JobConf.setNumMapTasks

Esempio n. 1

0

Mostra file

File: TestMultipleOutputs4.java Progetto: kidaak/Hadoop-MapReduce-1

  /**
   * Configures and submits the "TestMultipleOutputs4" job: text input is mapped to
   * (BytesWritable, ZebraTuple) pairs and written through BasicTableOutputFormat to the
   * given output locations.
   *
   * @param myMultiLocs output locations passed to BasicTableOutputFormat.setMultipleOutputs
   * @param sortKey sort specification; stored under "sortKey" and passed to setSortInfo
   */
  public void runMR(String myMultiLocs, String sortKey)
      throws ParseException, IOException, Exception, org.apache.hadoop.zebra.parser.ParseException {

    JobConf job = new JobConf(conf);
    job.setJobName("TestMultipleOutputs4");
    job.setJarByClass(TestMultipleOutputs4.class);

    // Job-level properties.
    job.set("table.output.tfile.compression", "gz");
    job.set("sortKey", sortKey);

    // Map side: plain text in, (BytesWritable, ZebraTuple) out, one requested map task.
    job.setInputFormat(TextInputFormat.class);
    job.setMapperClass(TestMultipleOutputs4.MapClass.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(ZebraTuple.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setNumMapTasks(1);

    // Output side: table output split across the requested locations.
    job.setOutputFormat(BasicTableOutputFormat.class);
    BasicTableOutputFormat.setMultipleOutputs(
        job, myMultiLocs, TestMultipleOutputs4.OutputPartitionerClass.class);

    // Logical schema has two columns; for demo purposes each gets its own column group.
    BasicTableOutputFormat.setSchema(job, "word:string, count:int");
    BasicTableOutputFormat.setStorageHint(job, "[word];[count]");
    BasicTableOutputFormat.setSortInfo(job, sortKey);
    System.out.println("in runMR, sortkey: " + sortKey);

    // A single reduce task is requested (this is not a map-only job).
    job.setNumReduceTasks(1);
    JobClient.runJob(job);
    BasicTableOutputFormat.close(job);
  }

Esempio n. 2

0

Mostra file

File: AccessProcessJob.java Progetto: zezewoo/flume-hdfs-mapreduce

  /**
   * Entry point: configures and runs "AccessProcessJob" through the classic
   * ({@code org.apache.hadoop.mapred}) API with one requested map task and one reduce task.
   *
   * <p>Any existing output directory is deleted recursively before the job is submitted.
   *
   * @param args command-line arguments (not used; paths come from the class's own fields)
   * @throws Exception if file-system access or job execution fails
   */
  public static void main(String[] args) throws Exception {

    JobConf conf = new JobConf(AccessProcessJob.class);
    conf.set(nameNode, hdfsURL);
    conf.setJobName("AccessProcessJob");

    // Remove output left over from a previous run.
    new Path(outputPath).getFileSystem(conf).delete(new Path(outputPath), true);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(AccessProcessMap.class);
    conf.setReducerClass(AccessProcessReduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    // The paths must be recorded on the JobConf that is actually submitted below. They used
    // to be set on a separate mapreduce Job, which works on its own copy of the
    // configuration, so JobClient.runJob(conf) never saw them. Fully qualified names are
    // used so this does not depend on which FileInputFormat/FileOutputFormat is imported.
    org.apache.hadoop.mapred.FileInputFormat.setInputPaths(conf, new Path(inputPath));
    org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);

    JobClient.runJob(conf);
  }

Esempio n. 3

0

Mostra file

File: FileCombiner.java Progetto: hfausta/thesis-file-combiner

  /**
   * Entry point for the "File Combiner" job.
   *
   * <p>Records the number of entries in the input directory in {@code numFiles}, then runs
   * a job with one requested map task and five reduce tasks.
   *
   * @param args {@code args[0]} is the input directory, {@code args[1]} the output directory
   * @throws IOException if the file system or job submission fails
   * @throws InterruptedException if waiting for the job is interrupted
   * @throws ClassNotFoundException if a job class cannot be resolved
   */
  public static void main(String[] args)
      throws IOException, InterruptedException, ClassNotFoundException {
    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: FileCombiner <input-dir> <output-dir>");
      System.exit(2);
    }

    JobConf conf = new JobConf();
    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(5);

    Path inputDir = new Path(args[0]);
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] stats = fs.listStatus(inputDir);
    // Some Hadoop versions return null rather than throwing when the path does not exist.
    numFiles = (stats == null) ? 0 : stats.length;

    Job job = new Job(conf);
    job.setJarByClass(FileCombiner.class);
    job.setJobName("File Combiner");

    job.setMapperClass(FileCombinerMapper.class);
    job.setReducerClass(FileCombinerReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Surface a failed job to the calling shell; the result used to be discarded.
    if (!job.waitForCompletion(true)) {
      System.exit(1);
    }
  }

Esempio n. 4

0

Mostra file

File: DistRaid.java Progetto: fire9/hadoop-20

  /**
   * Prepares the job: writes the operation list (one record per source file, paired with
   * its policy) to a control file under a fresh job directory, then sizes the map phase.
   *
   * <p>Side effects visible in this method: sets JOB_DIR_LABEL, OP_LIST_LABEL,
   * OP_COUNT_LABEL, the output path and the number of map tasks on {@code jobconf}, and
   * clears {@code raidPolicyPathPairList}.
   *
   * @return {@code true} if at least one file was appended to the operation list
   * @throws IOException propagated from the file-system, SequenceFile and JobClient calls
   */
  private boolean setup() throws IOException {
    estimateSavings();

    // Each run gets its own directory under the job client's system directory.
    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobconf);
    Path jobdir = new Path(jClient.getSystemDir(), NAME + "_" + randomId);

    LOG.info(JOB_DIR_LABEL + "=" + jobdir);
    jobconf.set(JOB_DIR_LABEL, jobdir.toString());
    Path log = new Path(jobdir, "_logs");

    // The control file should have small size blocks. This helps
    // in spreading out the load from mappers that will be spawned.
    // NOTE(review): the key "dfs.blocks.size" looks like a typo for the HDFS block-size
    // property ("dfs.block.size") — confirm whether this setting has any effect.
    jobconf.setInt("dfs.blocks.size", OP_LIST_BLOCK_SIZE);

    // The job's output directory is the "_logs" directory inside the job directory.
    FileOutputFormat.setOutputPath(jobconf, log);
    LOG.info("log=" + log);

    // create operation list
    FileSystem fs = jobdir.getFileSystem(jobconf);
    Path opList = new Path(jobdir, "_" + OP_LIST_LABEL);
    jobconf.set(OP_LIST_LABEL, opList.toString());
    // opCount: total records written; synCount: records written since the last sync().
    int opCount = 0, synCount = 0;
    SequenceFile.Writer opWriter = null;

    try {
      opWriter =
          SequenceFile.createWriter(
              fs, jobconf, opList, Text.class, PolicyInfo.class, SequenceFile.CompressionType.NONE);
      for (RaidPolicyPathPair p : raidPolicyPathPairList) {
        // If a large set of files are Raided for the first time, files
        // in the same directory that tend to have the same size will end up
        // with the same map. This shuffle mixes things up, allowing a better
        // mix of files.
        java.util.Collections.shuffle(p.srcPaths);
        for (FileStatus st : p.srcPaths) {
          opWriter.append(new Text(st.getPath().toString()), p.policy);
          opCount++;
          // Emit a sync marker once more than SYNC_FILE_MAX records have been appended
          // since the previous one.
          if (++synCount > SYNC_FILE_MAX) {
            opWriter.sync();
            synCount = 0;
          }
        }
      }

    } finally {
      if (opWriter != null) {
        opWriter.close();
      }
      // Raise the replication of the control file.
      // NOTE(review): this also runs when createWriter() failed, in which case the file may
      // not exist and an exception thrown here would mask the original one — confirm.
      fs.setReplication(opList, OP_LIST_REPLICATION); // increase replication for control file
    }
    // The pairs have been written to the control file; drop the in-memory list.
    raidPolicyPathPairList.clear();

    jobconf.setInt(OP_COUNT_LABEL, opCount);
    LOG.info("Number of files=" + opCount);
    // The map count is derived from the record count and the cluster's task-tracker count.
    jobconf.setNumMapTasks(
        getMapCount(opCount, new JobClient(jobconf).getClusterStatus().getTaskTrackers()));
    LOG.info("jobName= " + jobName + " numMapTasks=" + jobconf.getNumMapTasks());
    return opCount != 0;
  }

Esempio n. 5

0

Mostra file

File: NotInFinder.java Progetto: KGayan/Acacia

  /**
   * Entry point for the "NotInFinder" job. Reads from a fixed HDFS input directory and
   * writes to a fixed output directory, which is deleted first if it already exists.
   *
   * @param args command-line arguments (not used)
   * @throws Exception if file-system access or job execution fails
   */
  public static void main(String[] args) throws Exception {
    final Path inputDir = new Path("/user/miyuru/wcout");
    final Path outputDir = new Path("/user/miyuru/notinverts");

    // Clear the output directory left over from a previous run, if there is one.
    FileSystem hdfs = FileSystem.get(new JobConf());
    if (hdfs.exists(outputDir)) {
      hdfs.delete(outputDir, true);
    }

    JobConf jobConf = new JobConf();
    jobConf.setNumMapTasks(96);

    // Key/value types of the final output.
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);

    // Processing classes; the reducer doubles as the combiner.
    jobConf.setMapperClass(TokenizerMapper.class);
    jobConf.setReducerClass(IntSumReducer.class);
    jobConf.setCombinerClass(IntSumReducer.class);

    // Input/output formats and locations.
    jobConf.setInputFormat(NLinesInputFormat.class);
    jobConf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(jobConf, inputDir);
    FileOutputFormat.setOutputPath(jobConf, outputDir);

    // The job itself is built from the prepared configuration.
    Job notInFinder = new Job(jobConf, "NotInFinder");
    notInFinder.setJarByClass(WordCount.class);
    notInFinder.setSortComparatorClass(SortComparator.class);
    notInFinder.waitForCompletion(true);
  }

Esempio n. 6

0

Mostra file

File: ItemCFJob.java Progetto: bytegriffin/recsys-offline

 /**
  * Runs a RecommenderJob over {@code input}, writing to {@code output} and using
  * {@code tmp} as its temporary directory, then logs the elapsed time in seconds.
  *
  * <p>The argument vector is assembled directly as an array. It used to be concatenated
  * into one string and split on single spaces, which broke any value containing
  * whitespace (for example a path with a space in it).
  *
  * @throws Exception if the recommender job fails
  */
 public void run() throws Exception {
   long startTime = System.currentTimeMillis();
   JobConf conf = new JobConf(ItemCFJob.class);
   conf.setJobName("ItemCF" + System.currentTimeMillis());
   conf.setNumMapTasks(10);
   // Register Java serialization in addition to Writable serialization.
   conf.set(
       "io.serializations",
       "org.apache.hadoop.io.serializer.JavaSerialization,"
           + "org.apache.hadoop.io.serializer.WritableSerialization");

   // One array element per option name or value, so values are never re-tokenized.
   String[] args = {
     "--input", String.valueOf(input),
     "--output", String.valueOf(output),
     "--booleanData", flag ? "true" : "false",
     "--similarityClassname", String.valueOf(Constants.mahout_similarityclassname),
     "--tempDir", String.valueOf(tmp)
   };

   RecommenderJob job = new RecommenderJob();
   job.setConf(conf);
   job.run(args);
   long endTime = System.currentTimeMillis();
   logger.info(
       "recommdation job ["
           + conf.getJobName()
           + "] run finish. it costs"
           + (endTime - startTime) / 1000
           + "s.");
 }

Esempio n. 7

0

Mostra file

File: RandomWriter.java Progetto: heipacker/wordcount

  /**
   * Launches the distributed "random-writer" job.
   *
   * <p>Defaults used when the properties are absent: 10 maps per host
   * ({@code test.randomwriter.maps_per_host}) and 1 GiB per map
   * ({@code test.randomwrite.bytes_per_map}). The total volume
   * ({@code test.randomwrite.total_bytes}) defaults to maps-per-host times bytes-per-map
   * times the number of task trackers. The job is run with zero reduce tasks.
   *
   * @param args {@code args[0]} is the output directory
   * @return 0 on completion, -1 when no argument is given, -2 when bytes-per-map is 0
   * @throws Exception if the job cannot be configured or run
   */
  public int run(String[] args) throws Exception {
    if (args.length == 0) {
      System.out.println("Usage: writer <out-dir>");
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    Path outputDir = new Path(args[0]);
    JobConf jobConf = new JobConf(getConf());

    jobConf.setJarByClass(RandomWriter.class);
    jobConf.setJobName("random-writer");
    FileOutputFormat.setOutputPath(jobConf, outputDir);

    jobConf.setOutputKeyClass(BytesWritable.class);
    jobConf.setOutputValueClass(BytesWritable.class);

    jobConf.setInputFormat(RandomInputFormat.class);
    jobConf.setMapperClass(Map.class);
    jobConf.setReducerClass(IdentityReducer.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient jobClient = new JobClient(jobConf);
    ClusterStatus clusterStatus = jobClient.getClusterStatus();

    // If a property is not set, the supplied default value is returned.
    int mapsPerHost = jobConf.getInt("test.randomwriter.maps_per_host", 10);
    long bytesPerMap = jobConf.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
    if (bytesPerMap == 0) {
      System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
      return -2;
    }

    long defaultTotalBytes = mapsPerHost * bytesPerMap * clusterStatus.getTaskTrackers();
    long totalBytes = jobConf.getLong("test.randomwrite.total_bytes", defaultTotalBytes);

    int mapCount = (int) (totalBytes / bytesPerMap);
    if (mapCount == 0 && totalBytes > 0) {
      // Less than one map's worth of data: use one map that writes all of it.
      mapCount = 1;
      jobConf.setLong("test.randomwrite.bytes_per_map", totalBytes);
    }

    // The number of map tasks is advisory (a hint to the framework).
    jobConf.setNumMapTasks(mapCount);
    System.out.println("Running " + mapCount + " maps.");

    // No reduce phase.
    jobConf.setNumReduceTasks(0);

    Date started = new Date();
    System.out.println("Job started: " + started);
    JobClient.runJob(jobConf);
    Date finished = new Date();
    System.out.println("Job ended: " + finished);
    long elapsedSeconds = (finished.getTime() - started.getTime()) / 1000;
    System.out.println("The job took " + elapsedSeconds + " seconds.");

    return 0;
  }

Esempio n. 8

0

Mostra file

File: AFormatterWG.java Progetto: ezubaric/Cloud9

  /**
   * Runs the "Authority Formatter -- Web Graph" job.
   *
   * @param args exactly five values: input path, output path, number of map tasks, number
   *     of reduce tasks, and the stop-list path (added to the distributed cache)
   * @return 0 on completion, -1 if the argument count is wrong
   * @throws Exception if the job cannot be configured or run
   */
  public int run(String[] args) throws Exception {
    if (args.length != 5) {
      printUsage();
      return -1;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];
    final int mapTasks = Integer.parseInt(args[2]);
    final int reduceTasks = Integer.parseInt(args[3]);
    final String stoplistPath = args[4];

    sLogger.info("Tool: AFormatter");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf job = new JobConf(AFormatterWG.class);
    job.setJobName("Authority Formatter -- Web Graph");

    job.setNumMapTasks(mapTasks);
    job.setNumReduceTasks(reduceTasks);

    // Locations; the job output itself is left uncompressed.
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    // Output types and format; map output is compressed and speculation is turned off.
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(HITSNode.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setCompressMapOutput(true);
    job.setSpeculativeExecution(false);

    // The reducer is also used as the combiner.
    job.setMapperClass(AFormatMapperIMC.class);
    job.setCombinerClass(AFormatReducer.class);
    job.setReducerClass(AFormatReducer.class);

    // Remove any previous output so the job can be re-run.
    Path previousOutput = new Path(outputPath);
    Path stopListFile = new Path(stoplistPath);
    FileSystem.get(job).delete(previousOutput, true);

    long startedAt = System.currentTimeMillis();
    sLogger.info("Starting job");
    DistributedCache.addCacheFile(stopListFile.toUri(), job);
    JobClient.runJob(job);
    sLogger.info(
        "Job Finished in " + (System.currentTimeMillis() - startedAt) / 1000.0 + " seconds");

    return 0;
  }

Esempio n. 9

0

Mostra file

File: WordCountSeqOutput.java Progetto: tanping/MapRedToRc

  /**
   * Driver for the word-count job that writes its result as a sequence file.
   *
   * <p>Recognised options: {@code -m <maps>} and {@code -r <reduces>}. Exactly two further
   * arguments must remain: the input path(s) and the output path.
   *
   * @param args command-line arguments
   * @return 0 after the job has run, otherwise the value of {@code printUsage()}
   * @throws Exception if the job cannot be configured or run
   */
  public int run(String[] args) throws Exception {
    JobConf job = new JobConf(getConf(), WordCountSeqOutput.class);
    job.setJobName("wordcount_seqOF");

    // Map output: word -> count.
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    // Final output: both key and value are Text.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MapClass.class);
    job.setCombinerClass(Combiner.class);
    job.setReducerClass(Reduce.class);

    job.setOutputFormat(SequenceFileOutputFormat.class);

    // Final-output compression: a value already present in the configuration wins,
    // otherwise the default listed here is written.
    String[][] compressionDefaults = {
      {"mapred.output.compress", "true"},
      {"mapred.output.compression.type", "BLOCK"},
      {"mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"},
    };
    for (String[] setting : compressionDefaults) {
      job.set(setting[0], job.get(setting[0], setting[1]));
    }

    List<String> positional = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      try {
        String current = args[i];
        if ("-m".equals(current)) {
          job.setNumMapTasks(Integer.parseInt(args[++i]));
        } else if ("-r".equals(current)) {
          job.setNumReduceTasks(Integer.parseInt(args[++i]));
        } else {
          positional.add(current);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
        return printUsage();
      }
    }

    // Exactly two positional parameters are expected: input and output.
    int positionalCount = positional.size();
    if (positionalCount != 2) {
      System.out.println(
          "ERROR: Wrong number of parameters: " + positionalCount + " instead of 2.");
      return printUsage();
    }
    FileInputFormat.setInputPaths(job, positional.get(0));
    FileOutputFormat.setOutputPath(job, new Path(positional.get(1)));

    JobClient.runJob(job);
    return 0;
  }

Esempio n. 10

0

Mostra file

File: Kmeans.java Progetto: LeoYao/ComparativeStudy_HPCC_Hadoop

  /**
   * Parses the command line and runs one "kmeans" job.
   *
   * <p>Recognised options: {@code -m <maps>} and {@code -r <reduces>} (both default to 0).
   * The two remaining arguments are the input path and the output path.
   *
   * @param args command-line arguments
   * @return 0 after the job has run
   * @throws Exception if the job cannot be configured or run
   */
  public static int main(String[] args) throws Exception {
    int requestedMaps = 0;
    int requestedReduces = 0;

    List<String> positional = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      try {
        String current = args[i];
        if ("-m".equals(current)) {
          requestedMaps = Integer.parseInt(args[++i]);
        } else if ("-r".equals(current)) {
          requestedReduces = Integer.parseInt(args[++i]);
        } else {
          positional.add(current);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
        printUsage();
      }
    }

    // Exactly two positional parameters are expected: input and output.
    int positionalCount = positional.size();
    if (positionalCount != 2) {
      System.out.println(
          "ERROR: Wrong number of parameters: " + positionalCount + " instead of 2.");
      printUsage();
    }

    Date jobStart = new Date();
    System.out.println("Job started: " + jobStart);

    JobConf job = new JobConf(Kmeans.class);
    job.setJobName("kmeans");
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(ClusterWritable.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);
    job.setNumMapTasks(requestedMaps);
    job.setNumReduceTasks(requestedReduces);

    FileInputFormat.setInputPaths(job, new Path(positional.get(0)));
    String outputLocation = positional.get(1);
    FileOutputFormat.setOutputPath(job, new Path(outputLocation));

    // Time the single iteration that is run.
    Date iterationStart = new Date();
    JobClient.runJob(job);
    Date iterationEnd = new Date();
    long iterationSeconds = (iterationEnd.getTime() - iterationStart.getTime()) / 1000;
    System.out.println("The iteration took " + iterationSeconds + " seconds.");
    return 0;
  }

Esempio n. 11

0

Mostra file

File: Step0JobTest.java Progetto: maximzhao/Mahout-GSOC-LibLinear

  /**
   * Feeds each input split through its own Step0Mapper and checks that the mapper reports,
   * for partition {@code p}, the key {@code p}, the first record key seen in the split, and
   * the number of records in the split.
   *
   * <p>Each split's RecordReader is closed in a {@code finally} block; it was previously
   * left open.
   *
   * @throws Exception if the test data cannot be written or read
   */
  public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);

    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    // work on a sorted copy so partition p lines up with sorted[p]
    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Step0OutputCollector collector = new Step0OutputCollector(numMaps);
    Reporter reporter = Reporter.NULL;

    for (int p = 0; p < numMaps; p++) {
      InputSplit split = sorted[p];
      RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

      Long firstKey = null;
      int size = 0;

      try {
        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        while (reader.next(key, value)) {
          // remember the key of the first record of this split
          if (firstKey == null) {
            firstKey = key.get();
          }

          mapper.map(key, value, collector, reporter);

          size++;
        }

        mapper.close();
      } finally {
        // release the reader's underlying stream even if mapping or reading failed
        reader.close();
      }

      // validate the mapper's output
      assertEquals(p, collector.keys[p]);
      assertEquals(firstKey.longValue(), collector.values[p].getFirstId());
      assertEquals(size, collector.values[p].getSize());
    }
  }

Esempio n. 12

0

Mostra file

File: DistCp.java Progetto: neutronsharc/hdfsbackup

 /**
  * Calculates how many maps to run and stores the result on the job.
  *
  * <p>The count is the total size of the copy divided by the bytes-per-map setting
  * (BYTES_PER_MAP_LABEL, default BYTES_PER_MAP), capped at the maximum map count
  * (MAX_MAPS_LABEL, default MAX_MAPS_PER_NODE * task trackers in the cluster), and never
  * less than 1.
  *
  * <p>The quotient is clamped while still a {@code long}; narrowing it to {@code int}
  * before clamping could wrap for very large ratios.
  *
  * @param totalBytes Count of total bytes for job
  * @param job The job to configure; its number of map tasks is set
  * @throws IOException if the cluster status cannot be obtained
  */
 private static void setMapCount(long totalBytes, JobConf job) throws IOException {
   long wantedMaps = totalBytes / job.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP);
   int maxMaps =
       job.getInt(
           MAX_MAPS_LABEL,
           MAX_MAPS_PER_NODE * new JobClient(job).getClusterStatus().getTaskTrackers());
   // The result lies in [1, max(maxMaps, 1)], so the narrowing cast below is lossless.
   long boundedMaps = Math.max(Math.min(wantedMaps, (long) maxMaps), 1L);
   job.setNumMapTasks((int) boundedMaps);
 }

Esempio n. 13

0

Mostra file

File: ConvolutionJob.java Progetto: holdonbear/NeuroHadoop

  /**
   * Runs the map-only "ConvolutionJob".
   *
   * <p>Recognised options: {@code -m <maps>} and {@code -r <reduces>}; the reduce count is
   * reset to 0 after parsing. Exactly two further arguments must remain: the input path(s)
   * and the output path.
   *
   * @param args command-line arguments
   * @return 0 after the job has run, otherwise the value of {@code printUsage()}
   * @throws Exception if the job cannot be configured or run
   */
  @Override
  public int run(String[] args) throws Exception {

    System.out.println("\n\nConvolutionJob\n");
    JobConf job = new JobConf(getConf(), ConvolutionJob.class);
    job.setJobName("ConvolutionJob");

    this.cacheKernel(job);
    this.CreateRats(job);
    job.setMapperClass(ConvolutionMapper.class);

    List<String> positional = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      try {
        String current = args[i];
        if ("-m".equals(current)) {
          job.setNumMapTasks(Integer.parseInt(args[++i]));
        } else if ("-r".equals(current)) {
          job.setNumReduceTasks(Integer.parseInt(args[++i]));
        } else {
          positional.add(current);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
        return printUsage();
      }
    }

    // Exactly two positional parameters are expected: input and output.
    int positionalCount = positional.size();
    if (positionalCount != 2) {
      System.out.println(
          "ERROR: Wrong number of parameters: " + positionalCount + " instead of 2.");
      return printUsage();
    }

    // Map-only: any -r value given above is overridden here.
    job.setNumReduceTasks(0);
    job.setInputFormat(NonSplittableTextInputFormat.class);
    job.setOutputFormat(MultiFileOutput.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setCompressMapOutput(true);
    job.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    job.set("mapred.output.compression.type", "BLOCK");

    FileInputFormat.setInputPaths(job, positional.get(0));
    FileOutputFormat.setOutputPath(job, new Path(positional.get(1)));

    JobClient.runJob(job);

    return 0;
  }

Esempio n. 14

0

Mostra file

File: KMeansMapCollective.java Progetto: jessezbj/harp-project

 /**
  * Builds (but does not submit) the map-only job for one k-means run.
  *
  * <p>The job's output directory {@code outDir/kmeans_out_<jobID>} is deleted first if it
  * already exists. The centroid file path, job id, iteration count, vector size, centroid
  * count, points per file, work directory and mapper count are stored in the job's
  * configuration under the KMeansConstants keys.
  *
  * @param numOfDataPoints total number of data points; divided by {@code numPointFiles}
  * @param numCentroids number of centroids
  * @param vectorSize vector size
  * @param numPointFiles number of point files
  * @param numMapTasks number of map tasks to request
  * @param configuration base configuration for the job
  * @param workDirPath working directory
  * @param dataDir input directory
  * @param cDir directory holding the centroid files
  * @param outDir parent of the job's output directory
  * @param jobID id used in the job name, output directory and centroid file name
  * @param iterationCount iteration count stored in the configuration
  * @return the configured job
  */
 private Job configureKMeansJob(
     int numOfDataPoints,
     int numCentroids,
     int vectorSize,
     int numPointFiles,
     int numMapTasks,
     Configuration configuration,
     Path workDirPath,
     Path dataDir,
     Path cDir,
     Path outDir,
     int jobID,
     int iterationCount)
     throws IOException, URISyntaxException {
   String jobName = "kmeans_job_" + jobID;
   Job kmeansJob = new Job(configuration, jobName);
   Configuration settings = kmeansJob.getConfiguration();

   // Start from a clean output directory.
   Path jobOutDir = new Path(outDir, "kmeans_out_" + jobID);
   FileSystem fileSystem = FileSystem.get(configuration);
   if (fileSystem.exists(jobOutDir)) {
     fileSystem.delete(jobOutDir, true);
   }
   FileInputFormat.setInputPaths(kmeansJob, dataDir);
   FileOutputFormat.setOutputPath(kmeansJob, jobOutDir);

   // The centroid file name is the shared prefix followed by the job id; it should match
   // the name used when the data was generated.
   Path centroidFile = new Path(cDir, KMeansConstants.CENTROID_FILE_PREFIX + jobID);
   String centroidFilePath = centroidFile.toString();
   System.out.println("Centroid File Path: " + centroidFilePath);
   settings.set(KMeansConstants.CFILE, centroidFile.toString());
   settings.setInt(KMeansConstants.JOB_ID, jobID);
   settings.setInt(KMeansConstants.ITERATION_COUNT, iterationCount);

   // File-based input format, this class's jar, and the collective mapper.
   kmeansJob.setInputFormatClass(MultiFileInputFormat.class);
   kmeansJob.setJarByClass(KMeansMapCollective.class);
   kmeansJob.setMapperClass(KMeansCollectiveMapper.class);

   // Settings that go through the JobConf view of the job's configuration.
   org.apache.hadoop.mapred.JobConf mapredConf = (JobConf) kmeansJob.getConfiguration();
   mapredConf.set("mapreduce.framework.name", "map-collective");
   mapredConf.setNumMapTasks(numMapTasks);
   mapredConf.setInt("mapreduce.job.max.split.locations", 10000);
   kmeansJob.setNumReduceTasks(0);

   // Algorithm parameters read back by the mappers.
   settings.setInt(KMeansConstants.VECTOR_SIZE, vectorSize);
   settings.setInt(KMeansConstants.NUM_CENTROIDS, numCentroids);
   int pointsPerFile = numOfDataPoints / numPointFiles;
   settings.setInt(KMeansConstants.POINTS_PER_FILE, pointsPerFile);
   settings.set(KMeansConstants.WORK_DIR, workDirPath.toString());
   settings.setInt(KMeansConstants.NUM_MAPPERS, numMapTasks);
   return kmeansJob;
 }

Esempio n. 15

0

Mostra file

File: Domain.java Progetto: caitlinkuhlman/Domain

  /**
   * Runs the "Domain-MR2" job with 272 requested map tasks and 272 reduce tasks.
   *
   * <p>The output directory is deleted before the job starts. The parameters {@code k},
   * {@code r} and {@code parts} are passed to the tasks through the job configuration.
   *
   * @param args {@code args[0]} is the input path, {@code args[1]} the output path
   * @return 0 after the job has run
   * @throws Exception if the job cannot be configured or run
   */
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(this.getClass());
    conf.setJobName("Domain-MR2");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(MultiFileOutput.class);

    FileSystem fs = FileSystem.get(conf);
    fs.delete(new Path(args[1]), true); // delete output dir

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    int reducers = 272;
    int mappers = 272;
    // Each count goes to its own setter (they used to be crossed over).
    conf.setNumMapTasks(mappers);
    conf.setNumReduceTasks(reducers);

    // set parameters
    conf.set("k", "" + k);
    // "r" carries r; it was previously populated with k by mistake.
    conf.set("r", "" + r);
    conf.set("parts", "" + parts); // number of partitions per dimension

    System.out.println(
        "running DOMAIN with k="
            + k
            + " r="
            + r
            + " parts="
            + parts
            + " "
            + "useCellBasedAlgo="
            + useCellBasedAlgo
            + " reducers="
            + reducers
            + " mappers="
            + mappers);
    JobClient.runJob(conf);
    return 0;
  }

Esempio n. 16

0

Mostra file

File: CorpusVocabNormalizerAndNumberizer.java Progetto: rahulbhawsar/Cloud9

  /**
   * Runs the "bitext.compile" job: reads {@code <pchunk ...>}-delimited records from the
   * input paths with XMLInput, maps them with BitextCompilerMapper, and writes
   * (Text, PhrasePair) pairs as a sequence file using one map task and one reduce task.
   *
   * <p>The vocab-server scaffolding that used to surround the job (thread and server
   * locals that were never assigned, an unused flag, and a {@code finally} block that
   * could therefore never do anything) has been removed; behavior is unchanged.
   *
   * @param c base configuration
   * @param inputPaths the bi-text input path(s)
   * @param output output directory for the sequence file
   * @throws IOException if the job fails
   */
  @SuppressWarnings("deprecation")
  public static void preprocessAndNumberizeFiles(Configuration c, String inputPaths, Path output)
      throws IOException {
    sLogger.setLevel(Level.INFO);

    JobConf conf = new JobConf(c);
    conf.setJobName("bitext.compile");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PhrasePair.class);
    conf.setMapperClass(BitextCompilerMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(conf, inputPaths);

    // Record boundaries and limits for the stream record reader.
    conf.set("stream.recordreader.begin", "<pchunk");
    conf.set("stream.recordreader.end", "</pchunk>");
    conf.set("stream.recordreader.slowmatch", "false");
    conf.set("stream.recordreader.maxrec", "100000");
    conf.setInputFormat(XMLInput.class);

    FileOutputFormat.setOutputPath(conf, output);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    // NOTE(review): the job jar is a hard-coded absolute path — confirm it exists wherever
    // this is run.
    conf.setJar("/chomes/fture/jars/ivory.jar");
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    System.out.println("Running job " + conf.getJobName());
    System.out.println("Input: " + inputPaths);
    System.out.println("Output: " + output);
    JobClient.runJob(conf);
  }

Esempio n. 17

0

Mostra file

File: BuildPageRankRecords.java Progetto: kensk8er/MapReduceAssignment

  /**
   * Runs the "PackageLinkGraph" job: a single requested map task and zero reduce tasks,
   * reading text and writing (IntWritable, PageRankNode) pairs as a sequence file.
   *
   * @param args exactly three values: input path, output path, and the node count
   * @return 0 after the job has run, -1 if the argument count is wrong
   * @throws Exception if the job cannot be configured or run
   */
  public int run(String[] args) throws Exception {
    if (args.length != 3) {
      printUsage();
      return -1;
    }

    final String inputDir = args[0];
    final String outputDir = args[1];
    final int nodeCount = Integer.parseInt(args[2]);

    sLogger.info("Tool name: BuildPageRankRecords");
    sLogger.info(" - inputDir: " + inputDir);
    sLogger.info(" - outputDir: " + outputDir);
    sLogger.info(" - numNodes: " + nodeCount);

    JobConf job = new JobConf(BuildPageRankRecords.class);
    job.setJobName("PackageLinkGraph");

    job.setNumMapTasks(1);
    job.setNumReduceTasks(0);

    // The node count is handed to the tasks; the minimum split size is set to 1 GiB.
    job.setInt("NodeCnt", nodeCount);
    job.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Path outputLocation = new Path(outputDir);
    TextInputFormat.addInputPath(job, new Path(inputDir));
    SequenceFileOutputFormat.setOutputPath(job, outputLocation);

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // Map output and final output share the same key/value types.
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(IdentityReducer.class);

    // Remove any previous output so the job can be re-run.
    FileSystem.get(job).delete(outputLocation, true);

    JobClient.runJob(job);

    return 0;
  }

Esempio n. 18

0

Mostra file

File: CleanupMR.java Progetto: apache/incubator-systemml

  /**
   * Runs the map-only "Cleanup-MR" job, requesting one map task per remote parallel node.
   *
   * <p>Any exception raised while configuring or running the job is logged and reported
   * as {@code false}; it is not rethrown.
   *
   * @param conf SystemML configuration providing the local tmp dir and the scratch space
   * @return {@code true} if the job ran and reported success, {@code false} otherwise
   * @throws Exception declared for callers; failures inside the job are caught here
   */
  public static boolean runJob(DMLConfig conf) throws Exception {
    boolean succeeded = false;

    try {
      JobConf cleanupJob = new JobConf(CleanupMR.class);
      cleanupJob.setJobName("Cleanup-MR");

      // SystemML local tmp dir
      String localTmpDir = conf.getTextValue(DMLConfig.LOCAL_TMP_DIR);
      MRJobConfiguration.setSystemMLLocalTmpDir(cleanupJob, localTmpDir);

      // map-only: one requested mapper per node, no reducers
      int nodeCount = InfrastructureAnalyzer.getRemoteParallelNodes();
      cleanupJob.setMapperClass(CleanupMapper.class);
      cleanupJob.setNumMapTasks(nodeCount);
      cleanupJob.setNumReduceTasks(0);

      // input/output formats; the task file lives in the scratch space
      String taskFileName = conf.getTextValue(DMLConfig.SCRATCH_SPACE) + "/cleanup_tasks";
      cleanupJob.setInputFormat(NLineInputFormat.class);
      cleanupJob.setOutputFormat(NullOutputFormat.class);

      Path taskFile = new Path(taskFileName);
      FileInputFormat.setInputPaths(cleanupJob, taskFile);
      writeCleanupTasksToFile(taskFile, nodeCount);

      // disable automatic task timeouts and speculative map execution
      cleanupJob.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
      cleanupJob.setMapSpeculativeExecution(false);

      // execute the MR job
      RunningJob running = JobClient.runJob(cleanupJob);

      succeeded = running.isSuccessful();
    } catch (Exception ex) {
      // do not propagate the failure; log it and report false instead
      LOG.error("Failed to run cleanup MR job. ", ex);
    }

    return succeeded;
  }

Esempio n. 19

0

Mostra file

File: CreateNetworkLinkFiles.java Progetto: saigoda/Recommender-System

 /**
  * Runs the "Create Files of Each Network Type" job with 50 requested map tasks and 30
  * reduce tasks.
  *
  * <p>{@code args[1]} is deleted recursively and handed to the tasks through the "output"
  * property. The framework-level output path is a fixed scratch directory, which is also
  * cleared first because an existing output directory would make a second run fail the
  * output check.
  *
  * @param args {@code args[0]} is the input path(s), {@code args[1]} the output location
  * @return 0 after the job has run, -1 if fewer than two arguments are given
  * @throws Exception if the job cannot be configured or run
  */
 @SuppressWarnings("deprecation")
 public int run(String[] args) throws Exception {
   if (args.length < 2) {
     System.err.println("Usage: <input-path> <output-path>");
     return -1;
   }

   JobConf job = new JobConf(super.getConf(), this.getClass());
   job.setJarByClass(this.getClass());
   job.setJobName("Create Files of Each Network Type");
   job.setJobPriority(JobPriority.VERY_HIGH);
   job.setMapperClass(CreateNetworkLinkFilesMap.class);
   job.setReducerClass(CreateNetworkLinkFilesReduce.class);
   job.set("output", args[1]);
   job.setNumMapTasks(50);
   job.setNumReduceTasks(30);
   job.setMapOutputKeyClass(Text.class);
   job.setMapOutputValueClass(Text.class);
   FileInputFormat.setInputPaths(job, args[0]);

   FileSystem fs = FileSystem.get(job);
   // Explicit recursive delete; the one-argument overload is deprecated.
   fs.delete(new Path(args[1]), true);

   // Clear the scratch output directory so repeated runs do not fail on it.
   Path scratchOutput = new Path("/tmp/DeleteThisDirectory1");
   fs.delete(scratchOutput, true);
   FileOutputFormat.setOutputPath(job, scratchOutput);

   JobClient.runJob(job);
   System.out.println("***********DONE********");
   return 0;
 }

Esempio n. 20

0

Mostra file

File: LogMean3.java Progetto: smcvb/Masters

  /**
   * Command-line entry point for the "LogMean - Example 3" job.
   *
   * <p>Expects exactly four arguments: the map-task estimate, the number of reduce tasks, the
   * input path and the output path. With any other argument count a usage message is printed
   * and the JVM exits with status 1.
   *
   * @param args the four command-line arguments described above
   * @throws Exception if a numeric argument cannot be parsed or the job fails
   */
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(LogMean3.class);
    job.setJobName("LogMean - Example 3");
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    if (args.length != 4) {
      // same two-line report for both the "too few" and the "too many" case
      String complaint = args.length < 4 ? "To few arguments given:\n" : "To many arguments given:\n";
      String usage =
          "How to\n"
              + "*\tEstimation of Map-task\n"
              + "*\tNumber of Reduce-tasks\n"
              + "*\tInput file path\n"
              + "*\tOutput file path";
      System.out.println(complaint);
      System.out.println(usage);
      System.exit(1);
    } else {
      int mapTaskEstimate = Integer.parseInt(args[0]);
      job.setNumMapTasks(mapTaskEstimate);
      int reduceTaskCount = Integer.parseInt(args[1]);
      job.setNumReduceTasks(reduceTaskCount);
      FileInputFormat.setInputPaths(job, new Path(args[2]));
      FileOutputFormat.setOutputPath(job, new Path(args[3]));
    }

    JobClient.runJob(job);
  }

Esempio n. 21

0

Mostra file

File: ManyTableJob.java Progetto: babokim/cloudata

  /**
   * Runs the map-only "ManyTableJob_Put" job and returns the directory it wrote to.
   *
   * <p>The job uses {@code numOfTables} map tasks, no reducers, no speculative map execution
   * and a 30-minute task timeout. The resources set up by
   * {@code CloudataMapReduceUtil.initMapReduce} are released again whether or not the job
   * succeeds.
   *
   * @return the job's output path; it is left in place for the caller
   * @throws IOException if the job or its setup fails
   */
  public static Path putData() throws IOException {
    CloudataConf nconf = new CloudataConf();

    JobConf jobConf = new JobConf(ManyTableJob.class);
    jobConf.set("user.name", nconf.getUserId());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    // Everything after initMapReduce is guarded so libDir is always released.
    try {
      jobConf.setJobName("ManyTableJob_Put" + "(" + new Date() + ")");

      jobConf.setLong("mapred.task.timeout", 30 * 60 * 1000);

      Path outputPath = new Path("ManyTableJob_KEY_" + System.currentTimeMillis());

      FileOutputFormat.setOutputPath(jobConf, outputPath);

      // <MAP>
      jobConf.setMapperClass(ManyTablePutMap.class);
      jobConf.setInputFormat(SimpleInputFormat.class);
      jobConf.setNumMapTasks(numOfTables);
      jobConf.setMapSpeculativeExecution(false);
      jobConf.setMaxMapAttempts(0);
      // </MAP>

      // <REDUCE>
      jobConf.setNumReduceTasks(0);
      // </REDUCE>

      // Run Job
      JobClient.runJob(jobConf);
      return outputPath;
    } finally {
      // The output path is returned to the caller, so it is not deleted here. No file system
      // lookup happens in this block: a failing lookup would skip the cleanup below and hide
      // the job's own exception.
      CloudataMapReduceUtil.clearMapReduce(libDir);
    }
  }

Esempio n. 22

0

Mostra file

File: MapreduceStringFinder.java Progetto: computingfacts/open-Technologies

  /**
   * Configures and runs the string-finder map/reduce job.
   *
   * @param args exactly three values: the input dataset path, the output path and the
   *     population size (an integer)
   * @return 0 after the job has run, -1 if the argument count is wrong
   * @throws Exception if the population size is not an integer or the job fails
   */
  public int run(String[] args) throws Exception {
    if (args.length != 3) {
      System.err.println("Usage: " + getClass().getName() + " <input> <output> <nPopulation>");
      return -1;
    }

    // Create a JobConf using the processed configuration
    final JobConf jobConf = new JobConf(getConf(), getClass());

    // Specify various job-specific parameters
    jobConf.setJobName(MapreduceStringFinder.class.getSimpleName());

    // setting the input format
    jobConf.setInputFormat(Individuals.class);

    // setting the output key and value class
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(BooleanWritable.class);

    // setting the mapper class and the requested number of map tasks
    jobConf.setMapperClass(CIMapper.class);
    jobConf.setNumMapTasks(3);

    // setting the reducer class
    jobConf.setReducerClass(CIReducer.class);

    // setup input/output directories
    final String dataset = args[0];

    FileInputFormat.setInputPaths(jobConf, new Path(dataset));
    FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));
    final int pop = Integer.parseInt(args[2]);

    // when only one map slot per task tracker is configured, run the mapper multithreaded
    if (jobConf.getInt("mapred.tasktracker.map.tasks.maximum", 2) == 1) {
      jobConf.setMapRunnerClass(MultithreadedMapRunner.class);
    }
    // the thread count is set unconditionally, so it is not repeated inside the branch above
    jobConf.setInt("mapred.map.multithreadedrunner.threads", 100);

    // for computation-intensive data, do not let the job fail just because a task does not
    // report progress before the timeout expires
    final int timeout = 9000000;
    jobConf.setInt("mapred.task.timeout", timeout);

    // make these parameters available to the mapper through the job configuration
    jobConf.setInt("popsize", pop);
    jobConf.setStrings("dataset", dataset);

    // start the map/reduce job
    System.out.println("Starting Job");

    // Submit the job, then poll for progress until the job is complete
    JobClient.runJob(jobConf);

    return 0;
  }

Esempio n. 23

0

Mostra file

File: ExecDriver.java Progetto: EasonYi/hive

  /**
   * Execute a query plan using Hadoop.
   *
   * <p>Configures the {@code job} field from the map and reduce work held by {@code work},
   * ships staged hash-table files through the distributed cache when running outside local
   * mode, submits the job, waits on it through {@code jobExecHelper.progress} and finally calls
   * {@code jobClose} on the map operators and the reducer.
   *
   * <p>NOTE(review): {@code job}, {@code work}, {@code rj}, {@code jobID}, {@code console} and
   * {@code jobExecHelper} are not declared in this method; presumably they are fields of the
   * enclosing task class.
   *
   * @param driverContext supplies the {@code Context}; when it holds none, a new one is created
   *     here and cleared again before returning
   * @return the value of {@code jobExecHelper.progress} (0 is treated as success); 5 if the
   *     scratch directory could not be prepared; 1 if submission or execution threw; 3 if the
   *     job succeeded but {@code jobClose} threw
   */
  @SuppressWarnings({"deprecation", "unchecked"})
  @Override
  public int execute(DriverContext driverContext) {

    IOPrepareCache ioPrepareCache = IOPrepareCache.get();
    ioPrepareCache.clear();

    boolean success = true;

    Context ctx = driverContext.getCtx();
    boolean ctxCreated = false;
    Path emptyScratchDir;

    MapWork mWork = work.getMapWork();
    ReduceWork rWork = work.getReduceWork();

    // Make sure a Context exists and that its MR scratch directory has been created.
    try {
      if (ctx == null) {
        ctx = new Context(job);
        ctxCreated = true;
      }

      emptyScratchDir = ctx.getMRTmpPath();
      FileSystem fs = emptyScratchDir.getFileSystem(job);
      fs.mkdirs(emptyScratchDir);
    } catch (IOException e) {
      e.printStackTrace();
      console.printError(
          "Error launching map-reduce job",
          "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
      return 5;
    }

    HiveFileFormatUtils.prepareJobOutput(job);
    // See the javadoc on HiveOutputFormatImpl and HadoopShims.prepareJobOutput()
    job.setOutputFormat(HiveOutputFormatImpl.class);

    job.setMapperClass(ExecMapper.class);

    job.setMapOutputKeyClass(HiveKey.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // The partitioner class name comes from configuration and is loaded reflectively.
    try {
      String partitioner = HiveConf.getVar(job, ConfVars.HIVEPARTITIONER);
      job.setPartitionerClass(JavaUtils.loadClass(partitioner));
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e.getMessage(), e);
    }

    // Copy the optional map-side settings from the plan; values left null are not applied.
    if (mWork.getNumMapTasks() != null) {
      job.setNumMapTasks(mWork.getNumMapTasks().intValue());
    }

    if (mWork.getMaxSplitSize() != null) {
      HiveConf.setLongVar(
          job, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, mWork.getMaxSplitSize().longValue());
    }

    if (mWork.getMinSplitSize() != null) {
      HiveConf.setLongVar(
          job, HiveConf.ConfVars.MAPREDMINSPLITSIZE, mWork.getMinSplitSize().longValue());
    }

    if (mWork.getMinSplitSizePerNode() != null) {
      HiveConf.setLongVar(
          job,
          HiveConf.ConfVars.MAPREDMINSPLITSIZEPERNODE,
          mWork.getMinSplitSizePerNode().longValue());
    }

    if (mWork.getMinSplitSizePerRack() != null) {
      HiveConf.setLongVar(
          job,
          HiveConf.ConfVars.MAPREDMINSPLITSIZEPERRACK,
          mWork.getMinSplitSizePerRack().longValue());
    }

    // Without reduce work the job runs with zero reducers.
    job.setNumReduceTasks(rWork != null ? rWork.getNumReduceTasks().intValue() : 0);
    job.setReducerClass(ExecReducer.class);

    // set input format information if necessary
    setInputAttributes(job);

    // Turn on speculative execution for reducers
    boolean useSpeculativeExecReducers =
        HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVESPECULATIVEEXECREDUCERS);
    HiveConf.setBoolVar(
        job, HiveConf.ConfVars.HADOOPSPECULATIVEEXECREDUCERS, useSpeculativeExecReducers);

    String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT);

    // The plan can override the configured input format with the bucketized one.
    if (mWork.isUseBucketizedHiveInputFormat()) {
      inpFormat = BucketizedHiveInputFormat.class.getName();
    }

    LOG.info("Using " + inpFormat);

    try {
      job.setInputFormat(JavaUtils.loadClass(inpFormat));
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e.getMessage(), e);
    }

    // No-Op - we don't really write anything here ..
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Transfer HIVEAUXJARS and HIVEADDEDJARS to "tmpjars" so hadoop understands
    // it
    String auxJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEAUXJARS);
    String addedJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDJARS);
    if (StringUtils.isNotBlank(auxJars) || StringUtils.isNotBlank(addedJars)) {
      // When both lists are present the added jars come first, then the aux jars.
      String allJars =
          StringUtils.isNotBlank(auxJars)
              ? (StringUtils.isNotBlank(addedJars) ? addedJars + "," + auxJars : auxJars)
              : addedJars;
      LOG.info("adding libjars: " + allJars);
      initializeFiles("tmpjars", allJars);
    }

    // Transfer HIVEADDEDFILES to "tmpfiles" so hadoop understands it
    String addedFiles = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDFILES);
    if (StringUtils.isNotBlank(addedFiles)) {
      initializeFiles("tmpfiles", addedFiles);
    }
    int returnVal = 0;
    boolean noName = StringUtils.isEmpty(HiveConf.getVar(job, HiveConf.ConfVars.HADOOPJOBNAME));

    if (noName) {
      // This is for a special case to ensure unit tests pass
      HiveConf.setVar(job, HiveConf.ConfVars.HADOOPJOBNAME, "JOB" + Utilities.randGen.nextInt());
    }
    String addedArchives = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDARCHIVES);
    // Transfer HIVEADDEDARCHIVES to "tmparchives" so hadoop understands it
    if (StringUtils.isNotBlank(addedArchives)) {
      initializeFiles("tmparchives", addedArchives);
    }

    try {
      // Outside local mode, staged hash-table files of the local work are tarred, uploaded to
      // the map work's HDFS tmp path and registered as a distributed-cache archive.
      MapredLocalWork localwork = mWork.getMapRedLocalWork();
      if (localwork != null && localwork.hasStagedAlias()) {
        if (!ShimLoader.getHadoopShims().isLocalMode(job)) {
          Path localPath = localwork.getTmpPath();
          Path hdfsPath = mWork.getTmpHDFSPath();

          FileSystem hdfs = hdfsPath.getFileSystem(job);
          FileSystem localFS = localPath.getFileSystem(job);
          FileStatus[] hashtableFiles = localFS.listStatus(localPath);
          int fileNumber = hashtableFiles.length;
          String[] fileNames = new String[fileNumber];

          for (int i = 0; i < fileNumber; i++) {
            fileNames[i] = hashtableFiles[i].getPath().getName();
          }

          // package and compress all the hashtable files to an archive file
          String stageId = this.getId();
          String archiveFileName = Utilities.generateTarFileName(stageId);
          localwork.setStageID(stageId);

          CompressionUtils.tar(localPath.toUri().getPath(), fileNames, archiveFileName);
          Path archivePath = Utilities.generateTarPath(localPath, stageId);
          LOG.info("Archive " + hashtableFiles.length + " hash table files to " + archivePath);

          // upload archive file to hdfs
          Path hdfsFilePath = Utilities.generateTarPath(hdfsPath, stageId);
          // replication factor for the uploaded archive; defaults to 10 when not configured
          short replication = (short) job.getInt("mapred.submit.replication", 10);
          hdfs.copyFromLocalFile(archivePath, hdfsFilePath);
          hdfs.setReplication(hdfsFilePath, replication);
          LOG.info("Upload 1 archive file  from" + archivePath + " to: " + hdfsFilePath);

          // add the archive file to distributed cache
          DistributedCache.createSymlink(job);
          DistributedCache.addCacheArchive(hdfsFilePath.toUri(), job);
          LOG.info(
              "Add 1 archive file to distributed cache. Archive file: " + hdfsFilePath.toUri());
        }
      }
      work.configureJobConf(job);
      List<Path> inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir, ctx, false);
      Utilities.setInputPaths(job, inputPaths);

      Utilities.setMapRedWork(job, work, ctx.getMRTmpPath());

      // Sampling is only attempted with more than one reducer; if it fails for any reason the
      // job falls back to a single reducer instead of aborting.
      if (mWork.getSamplingType() > 0 && rWork != null && job.getNumReduceTasks() > 1) {
        try {
          handleSampling(ctx, mWork, job);
          job.setPartitionerClass(HiveTotalOrderPartitioner.class);
        } catch (IllegalStateException e) {
          console.printInfo("Not enough sampling data.. Rolling back to single reducer task");
          rWork.setNumReduceTasks(1);
          job.setNumReduceTasks(1);
        } catch (Exception e) {
          LOG.error("Sampling error", e);
          console.printError(
              e.toString(), "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
          rWork.setNumReduceTasks(1);
          job.setNumReduceTasks(1);
        }
      }

      // remove the pwd from conf file so that job tracker doesn't show this
      // logs
      // NOTE(review): the original value is only restored after submitJob returns normally;
      // if anything between here and the restore throws, the conf keeps the placeholder.
      String pwd = HiveConf.getVar(job, HiveConf.ConfVars.METASTOREPWD);
      if (pwd != null) {
        HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE");
      }
      JobClient jc = new JobClient(job);
      // make this client wait if job tracker is not behaving well.
      Throttle.checkJobTracker(job, LOG);

      if (mWork.isGatheringStats() || (rWork != null && rWork.isGatheringStats())) {
        // initialize stats publishing table
        StatsPublisher statsPublisher;
        StatsFactory factory = StatsFactory.newFactory(job);
        if (factory != null) {
          statsPublisher = factory.getStatsPublisher();
          List<String> statsTmpDir = Utilities.getStatsTmpDirs(mWork, job);
          if (rWork != null) {
            statsTmpDir.addAll(Utilities.getStatsTmpDirs(rWork, job));
          }
          StatsCollectionContext sc = new StatsCollectionContext(job);
          sc.setStatsTmpDirs(statsTmpDir);
          if (!statsPublisher.init(sc)) { // creating stats table if not exists
            // a failed publisher init is fatal only when reliable stats are requested
            if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
              throw new HiveException(
                  ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
            }
          }
        }
      }

      Utilities.createTmpDirs(job, mWork);
      Utilities.createTmpDirs(job, rWork);

      // When the configured engine is "tez" and a session exists, its Tez session is closed
      // through the pool manager before this MR job is submitted.
      SessionState ss = SessionState.get();
      if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")
          && ss != null) {
        TezSessionState session = ss.getTezSession();
        TezSessionPoolManager.getInstance().close(session, true);
      }

      // Finally SUBMIT the JOB!
      rj = jc.submitJob(job);
      // replace it back
      if (pwd != null) {
        HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, pwd);
      }

      returnVal = jobExecHelper.progress(rj, jc, ctx.getHiveTxnManager());
      success = (returnVal == 0);
    } catch (Exception e) {
      e.printStackTrace();
      String mesg = " with exception '" + Utilities.getNameMessage(e) + "'";
      // rj is still null when the failure happened before or during submission
      if (rj != null) {
        mesg = "Ended Job = " + rj.getJobID() + mesg;
      } else {
        mesg = "Job Submission failed" + mesg;
      }

      // Has to use full name to make sure it does not conflict with
      // org.apache.commons.lang.StringUtils
      console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));

      success = false;
      returnVal = 1;
    } finally {
      Utilities.clearWork(job);
      try {
        // only a Context created by this method is cleared here
        if (ctxCreated) {
          ctx.clear();
        }

        if (rj != null) {
          // a submitted job that did not finish with return value 0 is killed
          if (returnVal != 0) {
            rj.killJob();
          }
          jobID = rj.getID().toString();
        }
      } catch (Exception e) {
        LOG.warn("Failed while cleaning up ", e);
      } finally {
        HadoopJobExecHelper.runningJobs.remove(rj);
      }
    }

    // get the list of Dynamic partition paths
    // jobClose is only invoked when a job was actually submitted (rj != null)
    try {
      if (rj != null) {
        if (mWork.getAliasToWork() != null) {
          for (Operator<? extends OperatorDesc> op : mWork.getAliasToWork().values()) {
            op.jobClose(job, success);
          }
        }
        if (rWork != null) {
          rWork.getReducer().jobClose(job, success);
        }
      }
    } catch (Exception e) {
      // jobClose needs to execute successfully otherwise fail task
      // (a failure here is only reported when the job itself had succeeded)
      if (success) {
        success = false;
        returnVal = 3;
        String mesg = "Job Commit failed with exception '" + Utilities.getNameMessage(e) + "'";
        console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
      }
    }

    return (returnVal);
  }

Esempio n. 24

0

Mostra file

File: MapReduce.java Progetto: uuee6543/Panda-information-retrieval-package

  /**
   * Runs this tool: a four-stage PageRank pipeline.
   *
   * <ol>
   *   <li>a setup job that turns the adjacency text file into a node sequence file,
   *   <li>the PageRank calculation job, repeated until the first five records of
   *       {@code part-00000} report convergence (checked every fifth iteration) or 75
   *       iterations have run,
   *   <li>a normalisation job,
   *   <li>a final sorting job that writes a single text output.
   * </ol>
   *
   * <p>Each job's output directory is deleted before the job is submitted, and each
   * intermediate input directory is deleted once the job that consumed it has finished.
   *
   * @param args exactly four values: the input adjacency text path, the output base path, the
   *     number of map tasks and the number of reduce tasks
   * @return 0 when the pipeline has completed, -1 if the argument count is wrong
   * @throws Exception if an argument cannot be parsed, or a job or file system call fails
   */
  public int run(String[] args) throws Exception {

    long startWholeProgram = System.currentTimeMillis();
    boolean hasConverged = false;
    int counter = 0;

    // Must have four arguments
    if (args.length != 4) {
      MapReduce.printUsage();
      return -1;
    }

    // Set input and output file paths
    String inputPathToAdjacencyTextFile = args[0];
    String outputPathToNodeSequenceFileFormat = args[1];

    // Configure Job 1:
    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);
    int reduceTasksSetup = 1;

    // Configure Job Setup
    JobConf conf1 = new JobConf(MapReduce.class);
    conf1.setInt("numberOfNodes", numberNodes);
    conf1.setJobName("Setup Job");
    conf1.setNumMapTasks(mapTasks);
    conf1.setNumReduceTasks(reduceTasksSetup);

    FileInputFormat.setInputPaths(conf1, new Path(inputPathToAdjacencyTextFile));
    FileOutputFormat.setOutputPath(conf1, new Path(outputPathToNodeSequenceFileFormat));
    FileOutputFormat.setCompressOutput(conf1, false);

    conf1.setOutputKeyClass(Text.class);
    conf1.setOutputValueClass(MapReduceNode.class);

    conf1.setOutputFormat(SequenceFileOutputFormat.class);
    conf1.setInputFormat(TextInputFormat.class);

    conf1.setMapperClass(ConfigureMapper.class);
    conf1.setReducerClass(ConfigureReducer.class);

    // Delete the output directory if it exists already
    Path tempDir = new Path(outputPathToNodeSequenceFileFormat);
    FileSystem.get(tempDir.toUri(), conf1).delete(tempDir, true);

    long startTime = System.currentTimeMillis();

    // Run Configure Job
    JobClient.runJob(conf1);

    sLogger.info(
        "Config Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    String inputPath = args[1];

    // Page Rank Calculation Job 2 (Iterative until convergence has been reached)

    while (!hasConverged) {
      System.out.println("*** ITERATION " + counter + ", number of nodes: " + numberNodes);
      counter++;

      sLogger.info("***** ITERATION " + counter);

      String outputPath = args[1] + counter;

      // Pure Page Rank Calculation Job Setup
      JobConf pageRankJob = new JobConf(getConf(), MapReduce.class);
      pageRankJob.setInt("numberOfNodes", numberNodes);

      FileInputFormat.setInputPaths(pageRankJob, new Path(inputPath));
      FileOutputFormat.setOutputPath(pageRankJob, new Path(outputPath));
      FileOutputFormat.setCompressOutput(pageRankJob, false);

      pageRankJob.setJobName("PP Iteration " + counter);
      pageRankJob.setNumMapTasks(mapTasks);
      pageRankJob.setNumReduceTasks(reduceTasks);

      pageRankJob.setOutputKeyClass(Text.class);
      pageRankJob.setOutputValueClass(MapReduceNode.class);

      pageRankJob.setOutputFormat(SequenceFileOutputFormat.class);
      pageRankJob.setInputFormat(SequenceFileInputFormat.class);

      pageRankJob.setMapperClass(MapReduce.PageRankCalcMapper.class);
      pageRankJob.setReducerClass(MapReduce.PageRankCalcReducer.class);

      // Delete the output directory if it exists already
      Path tempPageRankDir = new Path(outputPath);
      FileSystem.get(tempDir.toUri(), conf1).delete(tempPageRankDir, true);

      startTime = System.currentTimeMillis();

      // Run Pure Page Rank Calculation Job
      JobClient.runJob(pageRankJob);

      sLogger.info(
          "PP Job"
              + counter
              + "Finished in "
              + (System.currentTimeMillis() - startTime) / 1000.0
              + " seconds");

      // The input of this iteration has been consumed, so remove it
      Path tempInputPageRankDir = new Path(inputPath);
      FileSystem.get(tempDir.toUri(), conf1).delete(tempInputPageRankDir, true);

      // Set the output path of this iteration to be the inputpath for the next iteration
      inputPath = outputPath;

      // Check for convergence after every five iterations
      if (counter % 5 == 0) {

        Configuration conf = getConf();
        sLogger.info("Attempting to open file: " + outputPath + File.separator + "part-00000");
        System.out.println(
            "Attempting to open file: " + outputPath + File.separator + "part-00000");

        Path cFile = new Path(outputPath + File.separator + "part-00000");
        FileSystem fs = FileSystem.get(cFile.toUri(), conf);

        Reader sequenceFilereader = new Reader(fs, cFile, conf);
        try {
          // Converged only if each of the first five values read reports convergence
          for (int i = 0; i < 5; i++) {
            MapReduceNode readValue = new MapReduceNode();
            Text readKey = new Text();

            sequenceFilereader.next(readKey, readValue);
            if (!(readValue.hasConverged())) {
              break;
            }

            if (i == 4) {
              hasConverged = true;
            }
          }
        } finally {
          // closed exactly once, also when reading throws
          sequenceFilereader.close();
        }
      }
      if (counter == 75) {
        sLogger.info("****************** Exiting (purposefully) after 75th iteration");
        hasConverged = true;
      }
    }

    // Normalised Page Rank Calculation Job 3
    String outputPathForNormalisedPagerank = args[1] + "normalizedPageRank";

    // Normalised Page Rank Calculation Job Setup
    JobConf normalizationJob = new JobConf(getConf(), MapReduce.class);

    FileInputFormat.setInputPaths(normalizationJob, new Path(inputPath));
    FileOutputFormat.setOutputPath(normalizationJob, new Path(outputPathForNormalisedPagerank));
    FileOutputFormat.setCompressOutput(normalizationJob, false);

    normalizationJob.setJobName("Normalised Pagerank Output");
    normalizationJob.setNumMapTasks(mapTasks);
    normalizationJob.setNumReduceTasks(1);

    normalizationJob.setOutputKeyClass(Text.class);
    normalizationJob.setOutputValueClass(DoubleWritable.class);

    normalizationJob.setInputFormat(SequenceFileInputFormat.class);
    normalizationJob.setOutputFormat(SequenceFileOutputFormat.class);

    normalizationJob.setMapperClass(NormalisationMapper.class);
    normalizationJob.setReducerClass(NormalisationReducer.class);

    // Delete the output directory if it exists already
    Path tempUpdatedPageRankDir = new Path(outputPathForNormalisedPagerank);
    FileSystem.get(tempDir.toUri(), conf1).delete(tempUpdatedPageRankDir, true);

    startTime = System.currentTimeMillis();

    // Run Normalised Page Rank Calculation Job
    JobClient.runJob(normalizationJob);

    sLogger.info(
        "Normalisation Job Finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0
            + " seconds");

    // Sorting and Output Job 4

    // Delete the intermediary files created
    Path tempNormalizationInputPath = new Path(inputPath);
    FileSystem.get(tempDir.toUri(), conf1).delete(tempNormalizationInputPath, true);

    inputPath = outputPathForNormalisedPagerank;
    String outputPathforFinalSortedPagerank = args[1] + "FinalSortedPageRank";

    // Sorting and Output Job Setup
    JobConf outputJob = new JobConf(getConf(), MapReduce.class);

    FileInputFormat.setInputPaths(outputJob, new Path(inputPath));
    FileOutputFormat.setOutputPath(outputJob, new Path(outputPathforFinalSortedPagerank));
    FileOutputFormat.setCompressOutput(outputJob, false);

    outputJob.setJobName("Final Pagerank Output");
    sLogger.info("Starting final sotirng job -> this will output a single file");
    outputJob.setNumMapTasks(1);
    outputJob.setNumReduceTasks(1);

    outputJob.setOutputKeyClass(DoubleWritable.class);
    outputJob.setOutputValueClass(Text.class);

    outputJob.setInputFormat(SequenceFileInputFormat.class);
    outputJob.setOutputFormat(TextOutputFormat.class);

    outputJob.setMapperClass(OutputMapper.class);
    outputJob.setReducerClass(OutputReducer.class);

    outputJob.setOutputKeyComparatorClass(ReverseComparator.class);

    // Delete the output directory if it exists already, as is done for the other three jobs
    Path tempFinalSortedOutputPath = new Path(outputPathforFinalSortedPagerank);
    FileSystem.get(tempDir.toUri(), conf1).delete(tempFinalSortedOutputPath, true);

    startTime = System.currentTimeMillis();

    // Run Sorting and Output Job
    JobClient.runJob(outputJob);

    // Delete the intermediary files created
    Path tempFinalSortedInputPath = new Path(inputPath);
    FileSystem.get(tempDir.toUri(), conf1).delete(tempFinalSortedInputPath, true);

    sLogger.info(
        "Final Sorting Job Finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0
            + " seconds");

    sLogger.info(
        "The program lasted "
            + (System.currentTimeMillis() - startWholeProgram) / 1000.0
            + "s ("
            + (System.currentTimeMillis() - startWholeProgram) / 60000.0
            + " mins)");

    return 0;
  }

Esempio n. 25

0

Mostra file

File: Step0JobTest.java Progetto: maximzhao/Mahout-GSOC-LibLinear

  /**
   * Checks that {@code Step0Job.processOutput} yields, for every partition, the id of the
   * first instance found in the corresponding input split.
   *
   * <p>A random dataset is written to a test file with each instance labelled by its own index,
   * the file is split into {@code numMaps} splits, and each split is read to record its first
   * key, its record count and the label of its first record as the expected id.
   *
   * @throws Exception if writing the test data or reading a split fails
   */
  public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < numInstances; index++) {
      source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);
    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Reporter reporter = Reporter.NULL;

    int[] keys = new int[numMaps];
    Step0Output[] values = new Step0Output[numMaps];

    int[] expectedIds = new int[numMaps];

    for (int p = 0; p < numMaps; p++) {
      InputSplit split = sorted[p];
      RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

      Long firstKey = null;
      int size = 0;

      try {
        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        while (reader.next(key, value)) {
          // the first record of the split supplies both the first key and the expected id
          if (firstKey == null) {
            firstKey = key.get();
            expectedIds[p] = converter.convert(0, value.toString()).label;
          }

          size++;
        }
      } finally {
        // release the split's reader even if reading or converting fails
        reader.close();
      }

      keys[p] = p;
      values[p] = new Step0Output(firstKey, size);
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue(
        "Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
        Arrays.equals(expectedIds, actualIds));
  }

Esempio n. 26

0

Mostra file

File: DedupCLIRMHPairs.java Progetto: seweissman/wikiduper

  /**
   * Parses the command line, then configures and runs the sentence-pair deduplication job.
   *
   * <p>All three options (input path, output path, number of samples) are required. The output
   * directory is deleted before the job is submitted.
   *
   * @param args command-line arguments holding the three required options
   * @return 0 after the job has run, -1 if the command line cannot be parsed or an option is
   *     missing
   * @throws Exception if the sample count is not an integer or the job fails
   */
  @SuppressWarnings("static-access")
  @Override
  public int run(String[] args) throws Exception {
    final Options cliOptions = new Options();
    cliOptions.addOption(
        OptionBuilder.withArgName("path").hasArg().withDescription("bz2 input path").create(INPUT));
    cliOptions.addOption(
        OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    cliOptions.addOption(
        OptionBuilder.withArgName("integer")
            .hasArg()
            .withDescription("number of samples")
            .create(nSamplesOption));

    final CommandLine parsed;
    try {
      parsed = new GnuParser().parse(cliOptions, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      return -1;
    }

    // every option is mandatory; print the help text when one is missing
    final boolean allOptionsGiven =
        parsed.hasOption(INPUT) && parsed.hasOption(OUTPUT) && parsed.hasOption(nSamplesOption);
    if (!allOptionsGiven) {
      final HelpFormatter help = new HelpFormatter();
      help.setWidth(120);
      help.printHelp(this.getClass().getName(), cliOptions);
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    final String source = parsed.getOptionValue(INPUT);
    final String destination = parsed.getOptionValue(OUTPUT);
    final String sampleCountText = parsed.getOptionValue(nSamplesOption);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - bz2 file: " + source);
    LOG.info(" - output file: " + destination);

    final JobConf dedupJob = new JobConf(getConf(), DedupCLIRMHPairs.class);
    final String jobTitle =
        String.format("DedupSentencePairs[%s: %s, %s: %s]", INPUT, source, OUTPUT, destination);
    dedupJob.setJobName(jobTitle);

    // four map tasks requested, a single reducer
    dedupJob.setNumMapTasks(4);
    dedupJob.setNumReduceTasks(1);

    dedupJob.setInt("nSamples", Integer.parseInt(sampleCountText));

    final Path destinationDir = new Path(destination);
    FileInputFormat.setInputPaths(dedupJob, new Path(source));
    FileOutputFormat.setOutputPath(dedupJob, destinationDir);

    // Heap space, set through the old-API property names
    dedupJob.set("mapred.job.map.memory.mb", "2048");
    dedupJob.set("mapred.map.child.java.opts", "-Xmx2048m");
    dedupJob.set("mapred.job.reduce.memory.mb", "4096");
    dedupJob.set("mapred.reduce.child.java.opts", "-Xmx4096m");

    dedupJob.setMapperClass(DedupMapper.class);
    dedupJob.setReducerClass(DedupReducer.class);

    dedupJob.setInputFormat(SequenceFileInputFormat.class);
    dedupJob.setOutputFormat(MapFileOutputFormat.class);

    // map output and final output share the same key/value types
    dedupJob.setMapOutputKeyClass(PairOfInts.class);
    dedupJob.setMapOutputValueClass(IntWritable.class);
    dedupJob.setOutputKeyClass(PairOfInts.class);
    dedupJob.setOutputValueClass(IntWritable.class);

    // Remove any previous output before submitting.
    FileSystem.get(dedupJob).delete(destinationDir, true);

    JobClient.runJob(dedupJob);

    return 0;
  }

Esempio n. 27

0

Mostra file

File: ValueAggregatorJob.java Progetto: ukulililixl/core

  /**
   * Builds the {@link JobConf} for an Aggregate based map/reduce job from command-line arguments.
   *
   * <p>Generic hadoop options are consumed first; the remaining positional arguments are
   * {@code inputDirs outDir [numOfReducer [textinputformat|seq [specfile [jobName]]]]}. When fewer
   * than two positional arguments remain, the usage text is printed and {@code System.exit(1)} is
   * called.
   *
   * <p>The input format is {@code TextInputFormat} only when the fourth positional argument equals
   * {@code "textinputformat"} (case-insensitive); in every other case, including when the argument
   * is absent, {@code SequenceFileInputFormat} is used.
   *
   * @param args the arguments used for job creation. Generic hadoop arguments are accepted.
   * @return a JobConf object ready for submission.
   * @throws IOException declared by the method; propagated from argument parsing or job setup
   * @see GenericOptionsParser
   */
  public static JobConf createValueAggregatorJob(String args[]) throws IOException {
    Configuration baseConf = new Configuration();

    // Strip the generic hadoop options; only positional arguments remain afterwards.
    args = new GenericOptionsParser(baseConf, args).getRemainingArgs();

    if (args.length < 2) {
      System.out.println(
          "usage: inputDirs outDir " + "[numOfReducer [textinputformat|seq [specfile [jobName]]]]");
      GenericOptionsParser.printGenericCommandUsage(System.out);
      System.exit(1);
    }

    // Positional arguments, each optional one falling back to its default.
    String inputDir = args[0];
    String outputDir = args[1];
    int numOfReducers = args.length > 2 ? Integer.parseInt(args[2]) : 1;

    boolean textRequested =
        args.length > 3 && args[3].compareToIgnoreCase("textinputformat") == 0;
    Class<? extends InputFormat> theInputFormat;
    if (textRequested) {
      theInputFormat = TextInputFormat.class;
    } else {
      theInputFormat = SequenceFileInputFormat.class;
    }

    Path specFile = args.length > 4 ? new Path(args[4]) : null;
    String jobName = args.length > 5 ? args[5] : "";

    JobConf theJob = new JobConf(baseConf);
    if (specFile != null) {
      theJob.addResource(specFile);
    }

    // A jar named by "user.jar.file" takes precedence over the jar containing ValueAggregator.
    String userJarFile = theJob.get("user.jar.file");
    if (userJarFile != null) {
      theJob.setJar(userJarFile);
    } else {
      theJob.setJarByClass(ValueAggregator.class);
    }
    theJob.setJobName("ValueAggregatorJob: " + jobName);

    // Input side.
    FileInputFormat.addInputPaths(theJob, inputDir);
    theJob.setInputFormat(theInputFormat);

    // Output side.
    FileOutputFormat.setOutputPath(theJob, new Path(outputDir));
    theJob.setOutputFormat(TextOutputFormat.class);

    // Map, combine and reduce stages all exchange Text keys and values.
    theJob.setMapperClass(ValueAggregatorMapper.class);
    theJob.setCombinerClass(ValueAggregatorCombiner.class);
    theJob.setReducerClass(ValueAggregatorReducer.class);
    theJob.setMapOutputKeyClass(Text.class);
    theJob.setMapOutputValueClass(Text.class);
    theJob.setOutputKeyClass(Text.class);
    theJob.setOutputValueClass(Text.class);

    theJob.setNumMapTasks(1);
    theJob.setNumReduceTasks(numOfReducers);
    return theJob;
  }

Esempio n. 28

0

Mostra file

File: RemoteParForMR.java Progetto: zhanghua498/incubator-systemml

  /**
   * Configures and runs a map-only MR job for a remote ParFor, then collects the job's counters
   * and result variables.
   *
   * <p>The task file and the result file are deleted in the {@code finally} block, i.e. regardless
   * of whether the job succeeded.
   *
   * @param pfid id appended to the job name
   * @param program passed to {@code MRJobConfiguration.setProgramBlocks}; per the inline comment,
   *     the CP program blocks that will run in the mapper
   * @param taskFile input path of the job; deleted on exit
   * @param resultFile output path of the job; deleted before the run if it exists and again on
   *     exit
   * @param colocatedDPMatrixObj if non-null and {@code ParForProgramBlock.ALLOW_DATA_COLOCATION}
   *     is set, the colocated input format is used and partitioning metadata is taken from this
   *     object; otherwise {@code NLineInputFormat} is used
   * @param enableCPCaching passed to {@code MRJobConfiguration.setParforCachingConfig}
   * @param numMappers requested number of map tasks; also used as the flex scheduler map/reduce
   *     maximum when the flex configuration is enabled
   * @param replication value written to {@code dfs.replication}
   * @param max_retry currently unused: the only statement that reads it is commented out
   * @param minMem if positive and larger than the max memory currently extracted from {@code
   *     mapred.child.java.opts}, that option is overridden with this value (presumably bytes, as
   *     the log message divides by 1024*1024 to print megabytes; confirm against
   *     InfrastructureAnalyzer)
   * @param jvmReuse if true, the number of tasks per JVM is set to -1 (unlimited)
   * @return the job's success flag, task and iteration counters, and the result variables read
   *     from {@code resultFile}
   * @throws DMLRuntimeException wrapping any exception raised while configuring or running the
   *     job, reading results, or deleting the temporary files
   */
  public static RemoteParForJobReturn runJob(
      long pfid,
      String program,
      String taskFile,
      String resultFile,
      MatrixObject colocatedDPMatrixObj, // inputs
      boolean enableCPCaching,
      int numMappers,
      int replication,
      int max_retry,
      long minMem,
      boolean jvmReuse) // opt params
      throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-EMR";
    // start timestamp is only taken when statistics are enabled
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteParForMR.class);
    job.setJobName(jobname + pfid);

    // maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
      /////
      // configure the MR job

      // set arbitrary CP program blocks that will perform in the mapper
      MRJobConfiguration.setProgramBlocks(job, program);

      // enable/disable caching
      MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

      // set mappers, reducers, combiners
      job.setMapperClass(RemoteParWorkerMapper.class); // map-only

      // set input format (one split per row, NLineInputFormat default N=1)
      if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
        // colocated case: additionally pass the partitioning metadata of the matrix object
        job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
        MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
        MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
        MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
        MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
        MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
      } else // default case
      {
        job.setInputFormat(NLineInputFormat.class);
      }

      // set the input path and output path
      FileInputFormat.setInputPaths(job, new Path(taskFile));

      // set output format
      job.setOutputFormat(SequenceFileOutputFormat.class);

      // set output path (any existing result file is removed first)
      MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
      FileOutputFormat.setOutputPath(job, new Path(resultFile));

      // set the output key, value schema
      job.setMapOutputKeyClass(LongWritable.class);
      job.setMapOutputValueClass(Text.class);
      job.setOutputKeyClass(LongWritable.class);
      job.setOutputValueClass(Text.class);

      //////
      // set optimization parameters

      // set the number of mappers and reducers (zero reducers: map-only job)
      job.setNumMapTasks(numMappers); // numMappers
      job.setNumReduceTasks(0);
      // job.setInt("mapred.map.tasks.maximum", 1); //system property
      // job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
      // job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property

      // use FLEX scheduler configuration properties
      if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
        job.setInt("flex.priority", 0); // highest

        job.setInt("flex.map.min", 0);
        job.setInt("flex.map.max", numMappers);
        job.setInt("flex.reduce.min", 0);
        job.setInt("flex.reduce.max", numMappers);
      }

      // set jvm memory size (if required): only raises the limit, never lowers it
      String memKey = "mapred.child.java.opts";
      if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
        InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
        LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
      }

      // disable automatic tasks timeouts and speculative task exec
      job.setInt("mapred.task.timeout", 0);
      job.setMapSpeculativeExecution(false);

      // set up map/reduce memory configurations (if in AM context)
      DMLConfig config = ConfigurationManager.getConfig();
      DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

      // enables the reuse of JVMs (multiple tasks per MR task)
      if (jvmReuse) job.setNumTasksToExecutePerJvm(-1); // unlimited

      // set sort io buffer (reduce unnecessary large io buffer, guaranteed memory consumption)
      job.setInt(MRConfigurationNames.MR_TASK_IO_SORT_MB, 8); // 8MB

      // set the replication factor for the results
      job.setInt("dfs.replication", replication);

      // set the max number of retries per map task
      //  disabled job-level configuration to respect cluster configuration
      //  note: this refers to hadoop2, hence it never had effect on mr1
      // job.setInt("mapreduce.map.maxattempts", max_retry);

      // set unique working dir
      MRJobConfiguration.setUniqueWorkingDir(job);

      /////
      // execute the MR job
      RunningJob runjob = JobClient.runJob(job);

      // Process different counters
      Statistics.incrementNoOfExecutedMRJobs();
      Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
      // counter values are narrowed to int here
      int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
      int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
      // JIT/GC and cache statistics are only merged when statistics are enabled and the job did
      // not run in local mode
      if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
        Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
        Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
        Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
        Group cgroup =
            runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
        CacheStatistics.incrementMemHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
        CacheStatistics.incrementFSBuffHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
        CacheStatistics.incrementFSHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
        CacheStatistics.incrementHDFSHits(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
        CacheStatistics.incrementFSBuffWrites(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
        CacheStatistics.incrementFSWrites(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
        CacheStatistics.incrementHDFSWrites(
            (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
        CacheStatistics.incrementAcquireRTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
        CacheStatistics.incrementAcquireMTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
        CacheStatistics.incrementReleaseTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
        CacheStatistics.incrementExportTime(
            cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
      }

      // read all files of result variables and prepare for return
      LocalVariableMap[] results = readResultFile(job, resultFile);

      ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
      // every failure above is surfaced to the caller as a DMLRuntimeException
      throw new DMLRuntimeException(ex);
    } finally {
      // remove created files
      // NOTE(review): an exception thrown from this finally block replaces any exception already
      // propagating from the try/catch above, so a cleanup failure can hide the original cause.
      try {
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
        MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
      } catch (IOException ex) {
        throw new DMLRuntimeException(ex);
      }
    }

    if (DMLScript.STATISTICS) {
      long t1 = System.nanoTime();
      // recorded under the base job name, i.e. without the pfid suffix used for the MR job name
      Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
  }

Esempio n. 29

0

Mostra file

File: RangeQuery.java Progetto: easysg/spatedb

  /**
   * Runs a range query over a spatial file as a MapReduce job.
   *
   * @param fs file system used to inspect how the input file is indexed
   * @param inputFile the file to query
   * @param userOutputPath where results are written; when {@code null}, an unused path of the
   *     form {@code <input path>.rangequery_<random int>} is chosen and removed again after a
   *     foreground run
   * @param queryShape the query range; given to the range filter and stored in the job
   *     configuration
   * @param shape instance whose class is registered as the job's shape class and in which the
   *     field named by {@code query.field} is looked up reflectively
   * @param overwrite whether an existing user-supplied output path may be deleted
   * @param background when true the job is only submitted and the call returns immediately
   * @param query its {@code field} determines the map output key class (int, double or long
   *     fields map to the matching writable, anything else to {@code NullWritable}); its {@code
   *     type} selects the mapper, reducer and map output value class
   * @return the value of the MAP_OUTPUT_RECORDS counter for a foreground run, or -1 when the job
   *     was submitted in the background
   * @throws IOException if a file system or job operation fails
   * @throws RuntimeException if the user-supplied output path exists and {@code overwrite} is
   *     false
   */
  public static long rangeQueryMapReduce(
      FileSystem fs,
      Path inputFile,
      Path userOutputPath,
      Shape queryShape,
      Shape shape,
      boolean overwrite,
      boolean background,
      QueryInput query)
      throws IOException {
    JobConf job = new JobConf(FileMBR.class);

    // Resolve the output location on the file system that holds the input file.
    FileSystem outFs = inputFile.getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath != null) {
      if (outFs.exists(outputPath)) {
        if (!overwrite) {
          throw new RuntimeException("Output path already exists and -overwrite flag is not set");
        }
        outFs.delete(outputPath, true);
      }
    } else {
      // No path given: keep drawing random suffixes until an unused path is found.
      String prefix = inputFile.toUri().getPath() + ".rangequery_";
      do {
        outputPath = new Path(prefix + (int) (Math.random() * 1000000));
      } while (outFs.exists(outputPath));
    }

    job.setJobName("RangeQuery");
    job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class);
    // Tell the block filter which range is being queried.
    RangeFilter.setQueryRange(job, queryShape);

    int maxMapTasks = new JobClient(job).getClusterStatus().getMaxMapTasks();
    job.setNumMapTasks(maxMapTasks * 5);
    job.setNumReduceTasks(3);

    // The input format depends on whether the file carries a local RTree index.
    if (SpatialSite.isRTree(fs, inputFile)) {
      LOG.info("Searching an RTree indexed file");
      job.setInputFormat(RTreeInputFormat.class);
    } else {
      LOG.info("Searching a non local-indexed file");
      job.setInputFormat(ShapeInputFormat.class);
    }

    // The result is not used below; the lookup itself is retained from the original flow.
    GlobalIndex<Partition> gIndex = SpatialSite.getGlobalIndex(fs, inputFile);

    // Derive the map output key class from the declared type of the queried field.
    Class<?> mapOutputKey = NullWritable.class;
    try {
      Field queriedField = shape.getClass().getDeclaredField(query.field);
      queriedField.setAccessible(true);
      Class<?> fieldType = queriedField.getType();
      if (fieldType == Integer.TYPE) {
        mapOutputKey = IntWritable.class;
      } else if (fieldType == Double.TYPE) {
        mapOutputKey = DoubleWritable.class;
      } else if (fieldType == Long.TYPE) {
        mapOutputKey = LongWritable.class;
      }
    } catch (SecurityException e) {
      e.printStackTrace();
    } catch (NoSuchFieldException e) {
      e.printStackTrace();
    }
    job.setMapOutputKeyClass(mapOutputKey);

    // Mapper, reducer and value class follow the query type; other types configure none of them.
    switch (query.type) {
      case Distinct:
        job.setMapperClass(DistinctQueryMap.class);
        job.setReducerClass(DistinctQueryReduce.class);
        job.setMapOutputValueClass(NullWritable.class);
        break;
      case Distribution:
        job.setMapperClass(DistributionQueryMap.class);
        job.setReducerClass(DistributionQueryReduce.class);
        job.setMapOutputValueClass(IntWritable.class);
        break;
      default:
        break;
    }

    // Publish the query parameters to the map function.
    job.set(QUERY_SHAPE_CLASS, queryShape.getClass().getName());
    job.set(QUERY_SHAPE, queryShape.toText(new Text()).toString());
    job.set(QUERY_FIELD, query.field);

    // Shape class used by the spatial input format.
    SpatialSite.setShapeClass(job, shape.getClass());

    job.setOutputFormat(TextOutputFormat.class);

    ShapeInputFormat.setInputPaths(job, inputFile);
    TextOutputFormat.setOutputPath(job, outputPath);

    if (background) {
      // Fire and forget: remember the handle and report "no count available".
      JobClient client = new JobClient(job);
      lastRunningJob = client.submitJob(job);
      return -1;
    }

    // Foreground run: block until completion and report the number of map output records.
    RunningJob finishedJob = JobClient.runJob(job);
    final long resultCount =
        finishedJob.getCounters().findCounter(Task.Counter.MAP_OUTPUT_RECORDS).getValue();

    // An auto-generated output path is temporary and removed once the count is known.
    if (userOutputPath == null) {
      outFs.delete(outputPath, true);
    }

    return resultCount;
  }

Esempio n. 30

0

Mostra file

File: CSRConverter.java Progetto: KGayan/Acacia

  /**
   * Runs the first CSR conversion job, which creates the inverted index.
   *
   * <p>Before the job is configured, the two temporary HDFS output directories are deleted if
   * they exist and an empty placeholder input file is created if it is missing.
   *
   * <p>Arguments as read by this method: {@code args[0]} input path, {@code args[1]} HBase
   * zookeeper quorum, {@code args[2]} HBase table, {@code args[3]} contact host, {@code args[4]}
   * vertex count, {@code args[5]} zero flag. If {@code validArgs} rejects the arguments, usage is
   * printed and the method returns without doing anything.
   *
   * @param args command-line arguments as described above
   * @throws Exception if any file system or job operation fails
   */
  public static void main(String[] args) throws Exception {
    if (!validArgs(args)) {
      printUsage();
      return;
    }
    // These are the temp paths that are created on HDFS
    String dir1 = "/user/miyuru/csrconverter-output";
    String dir2 = "/user/miyuru/csrconverter-output-sorted";

    // We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    System.out.println("Deleting the dir : " + dir1);

    if (fs1.exists(new Path(dir1))) {
      fs1.delete(new Path(dir1), true);
    }

    System.out.println("Done deleting the dir : " + dir1);
    System.out.println("Deleting the dir : " + dir2);
    if (fs1.exists(new Path(dir2))) {
      fs1.delete(new Path(dir2), true);
    }
    System.out.println("Done deleting the dir : " + dir2);

    Path notinPath = new Path("/user/miyuru/notinverts/notinverts");

    if (!fs1.exists(notinPath)) {
      // create() hands back an open output stream. Close it right away so the empty placeholder
      // file is finalized before the job below reads it as an input; leaving it open leaks the
      // stream and keeps the file open for writing.
      fs1.create(notinPath).close();
    }

    // Note on Aug 23 2014: Sometimes after this the mapReduce job hangs. need to see why.

    VertexCounterClient.setDefaultGraphID(args[3], args[2]);

    // First job creates the inverted index

    JobConf conf = new JobConf(CSRConverter.class);
    conf.set("org.acacia.partitioner.hbase.zookeeper.quorum", args[1]);
    conf.set("org.acacia.partitioner.hbase.table", args[2]);
    conf.set("org.acacia.partitioner.hbase.contacthost", args[3]);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    // The mapper is registered per input path through MultipleInputs below.
    conf.setReducerClass(InvertedReducer.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    // Two inputs share the same mapper: the user-supplied file and the placeholder file.
    MultipleInputs.addInputPath(
        conf, new Path(args[0]), NLinesInputFormat.class, InvertedMapper.class);
    MultipleInputs.addInputPath(conf, notinPath, TextInputFormat.class, InvertedMapper.class);
    FileOutputFormat.setOutputPath(conf, new Path(dir1));

    // Also for the moment we turn-off the speculative execution
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setNumMapTasks(96);
    conf.setNumReduceTasks(96);
    conf.setPartitionerClass(VertexPartitioner.class);
    conf.set("vertex-count", args[4]);
    conf.set("zero-flag", args[5]);
    Job job = new Job(conf, "csr_inverter");
    job.setSortComparatorClass(SortComparator.class);
    // NOTE(review): the boolean result of waitForCompletion is ignored, so a failed job does not
    // change the process exit status.
    job.waitForCompletion(true);
  }