Пример #1
0
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(ImagePreProcessor.class);
    conf.setJobName("preprocessor");

    String baseOutputPath = args[1];
    FileSystem fs = FileSystem.get(conf);
    Path interPath = new Path(baseOutputPath + "_inter");
    fs.mkdirs(interPath);
    Path imageFilesPath = new Path(interPath + "\\imageFiles");
    fs.mkdirs(imageFilesPath);
    System.out.println("imageFilesPath--------------->" + imageFilesPath);
    conf.setStrings("working.directory", imageFilesPath.toString());

    copyFilesFromFolder(conf, args[0], imageFilesPath);
    //		conf.setJobName("FileSystemMove");
    Path fileNamesPath = new Path(interPath + "\\fileNames");
    System.out.println("fileNamesPath------------>" + fileNamesPath);
    writeImageNamesToFile(conf, imageFilesPath, fileNamesPath);
    //		FileSystem.get(conf).setWorkingDirectory(imageFilesPath);
    //		Path finalIntegerPath = new Path(imageFilesPath.toString()+"\\intImages");
    //		fs.mkdirs(finalIntegerPath);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    System.out.println("setting mapper");
    conf.setMapperClass(ImagePreprocessorMap.class);
    conf.setReducerClass(ImagePreprocessorReduce.class);

    System.out.println("mapper set");
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(outputClass.class);

    FileInputFormat.setInputPaths(conf, fileNamesPath);
    FileOutputFormat.setOutputPath(conf, new Path(interPath.toString() + "\\temp"));
    JobClient.runJob(conf);
    return 0;
  }
  public int run(String[] args) throws Exception {
    if (args.length != 3) {
      System.err.println("Usage: " + getClass().getName() + " <input> <output> <nPopulation>");
      // ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }

    // Create a JobConf using the processed <code>conf</code>
    final JobConf jobConf = new JobConf(getConf(), getClass());

    // Specify various job-specific parameters
    jobConf.setJobName(MapreduceStringFinder.class.getSimpleName());

    // setting the input format
    jobConf.setInputFormat(Individuals.class);

    // setting the output ke and value class
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(BooleanWritable.class);

    // setting the mapper class
    jobConf.setMapperClass(CIMapper.class);
    jobConf.setNumMapTasks(3); // setting number of maptasks

    // setting the reducer class
    jobConf.setReducerClass(CIReducer.class);

    // setup input/output directories
    final String dataset = args[0];

    FileInputFormat.setInputPaths(jobConf, new Path(dataset));
    FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));
    final int pop = Integer.parseInt(args[2]);

    // based on the configuration, make this job threadable
    if (jobConf.getInt("mapred.tasktracker.map.tasks.maximum", 2) == 1) {
      jobConf.setMapRunnerClass(MultithreadedMapRunner.class);
      jobConf.setInt("mapred.map.multithreadedrunner.threads", 100);
    }
    jobConf.setInt("mapred.map.multithreadedrunner.threads", 100);
    // for computation intensive data, do not allow the job to fail if the task tracker does not
    // respond
    // with a heatbeat message before the timeout value
    final int timeout = 9000000;
    jobConf.setInt("mapred.task.timeout", timeout);

    // set the parameters to be available before a call to the mapper
    jobConf.setInt("popsize", pop);
    jobConf.setStrings("dataset", dataset);

    // int map = jobConf.getNumMapTasks();
    // System.out.println("Number of Maps"+ map);

    // start the  map/reduce job
    System.out.println("Starting Job");

    // get the start time for this job
    final long startTime = System.currentTimeMillis();

    // Submit the job, then poll for progress until the job is complete
    JobClient.runJob(jobConf);

    // get the end time for this job
    final long endTime = System.currentTimeMillis();

    // get the duration of this job
    final double duration = (endTime - startTime) / 1000.0;
    // System.out.println("Job Finished in " + duration + " seconds");
    // getElapsedTime(startTime - endTime);

    return 0;
  }
  /** @param args */
  public static void main(String[] args) {

    File inputFile = new File(args[0]);
    File frameFile = new File(args[1]);
    File tempDir = new File(args[2]);
    String dbPath = args[3];

    try {
      JobControl jobControl = new JobControl("jsonld-entities");

      JobConf defaultConf = new JobConf();

      // Map the triples into JSON-LD fragments
      JobConf initialLoadConf = new JobConf(defaultConf);
      initialLoadConf.setInt("rank", 0);
      initialLoadConf.setStrings("frame-file", frameFile.toString());
      initialLoadConf.setMapperClass(TripleMapper.class);
      initialLoadConf.setReducerClass(EntityReducer.class);
      initialLoadConf.setInputFormat(TextInputFormat.class);
      initialLoadConf.setOutputFormat(TextOutputFormat.class);
      initialLoadConf.setMapOutputKeyClass(Text.class);
      initialLoadConf.setMapOutputValueClass(Text.class);
      initialLoadConf.setOutputKeyClass(Text.class);
      initialLoadConf.setOutputValueClass(Text.class);
      FileInputFormat.setInputPaths(initialLoadConf, new Path(inputFile.toString()));
      Path outputPath = new Path(tempDir.toString() + "/stage0");
      FileOutputFormat.setOutputPath(initialLoadConf, outputPath);
      Path prevOutput = outputPath;
      Job initialLoad = new Job(initialLoadConf);
      jobControl.addJob(initialLoad);

      // Aggregate JSON-LD fragments into nested structure
      EntityFrame entityFrame = new EntityFrame();
      entityFrame.parse(frameFile);
      Job prevJob = initialLoad;
      for (int rank = 1; rank <= entityFrame.getMaxRank(); rank++) {
        JobConf conf = new JobConf(defaultConf);
        conf.setInt("rank", rank);
        conf.setStrings("frame-file", frameFile.toString());
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(EntityReducer.class);
        conf.setInputFormat(KeyValueTextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(conf, prevOutput);
        outputPath = new Path(tempDir.toString() + "/stage" + rank);
        FileOutputFormat.setOutputPath(conf, outputPath);
        prevOutput = outputPath;
        Job buildEntityJob = new Job(conf);
        jobControl.addJob(buildEntityJob);
        buildEntityJob.addDependingJob(prevJob);
        prevJob = buildEntityJob;
      }

      // Frame nested data
      JobConf frameConf = new JobConf(defaultConf);
      frameConf.setStrings("frame-file", frameFile.toString());
      frameConf.setMapperClass(IdentityMapper.class);
      frameConf.setReducerClass(EntityFrameReducer.class);
      frameConf.setInputFormat(KeyValueTextInputFormat.class);
      frameConf.setOutputFormat(MongoOutputFormat.class);
      frameConf.set("mongo.output.uri", dbPath);
      frameConf.set(
          "stream.io.identifier.resolver.class", "com.mongodb.hadoop.mapred.MongoOutputFormat");
      frameConf.setMapOutputKeyClass(Text.class);
      frameConf.setMapOutputValueClass(Text.class);
      frameConf.setOutputKeyClass(NullWritable.class);
      frameConf.setOutputValueClass(MongoUpdateWritable.class);
      FileInputFormat.setInputPaths(frameConf, prevOutput);
      Job frameEntitiesJob = new Job(frameConf);
      jobControl.addJob(frameEntitiesJob);
      frameEntitiesJob.addDependingJob(prevJob);

      FileSystem fs = FileSystem.get(defaultConf);
      fs.delete(new Path(tempDir.toString()), true);

      // Run pipeline
      jobControl.run();

    } catch (IOException e) {
      // TODO(simister): Auto-generated catch block
      e.printStackTrace();
    }
  }