@Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    // scatter each edge to its lower-degree vertex and build open triads
    Job scatter =
        prepareJob(
            getInputPath(),
            getTempPath(TMP_OPEN_TRIADS),
            SequenceFileInputFormat.class,
            ScatterEdgesToLowerDegreeVertexMapper.class,
            Vertex.class,
            Vertex.class,
            BuildOpenTriadsReducer.class,
            JoinableUndirectedEdge.class,
            VertexOrMarker.class,
            SequenceFileOutputFormat.class);
    scatter.waitForCompletion(true);

    // necessary as long as we don't have access to an undeprecated MultipleInputs
    Job prepareInput =
        prepareJob(
            getInputPath(),
            getTempPath(TMP_CLOSING_EDGES),
            SequenceFileInputFormat.class,
            PrepareInputMapper.class,
            JoinableUndirectedEdge.class,
            VertexOrMarker.class,
            Reducer.class,
            JoinableUndirectedEdge.class,
            VertexOrMarker.class,
            SequenceFileOutputFormat.class);
    prepareInput.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);
    prepareInput.waitForCompletion(true);

    // join open triads and closing edges pairwise to find all triangles
    Job joinTriads =
        prepareJob(
            getCombinedTempPath(TMP_OPEN_TRIADS, TMP_CLOSING_EDGES),
            getOutputPath(),
            SequenceFileInputFormat.class,
            Mapper.class,
            JoinableUndirectedEdge.class,
            VertexOrMarker.class,
            JoinTrianglesReducer.class,
            Triangle.class,
            NullWritable.class,
            SequenceFileOutputFormat.class);
    joinTriads.setGroupingComparatorClass(JoinableUndirectedEdge.GroupingComparator.class);
    joinTriads.waitForCompletion(true);

    return 0;
  }
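
The pairwise join in the last step works because the grouping comparator makes the
reducer treat keys that differ only in a marker flag as equal. A minimal sketch of
that idea, assuming the composite key exposes a getEdge() accessor (the actual
JoinableUndirectedEdge.GroupingComparator is not shown here):

  import org.apache.hadoop.io.WritableComparable;
  import org.apache.hadoop.io.WritableComparator;

  // Group composite keys by their natural part only, so open triads and the
  // closing edges that complete them arrive in a single reduce() call.
  public static class EdgeGroupingComparator extends WritableComparator {
    protected EdgeGroupingComparator() {
      super(JoinableUndirectedEdge.class, true); // true = instantiate keys
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
      JoinableUndirectedEdge e1 = (JoinableUndirectedEdge) a;
      JoinableUndirectedEdge e2 = (JoinableUndirectedEdge) b;
      return e1.getEdge().compareTo(e2.getEdge()); // ignore the marker flag
    }
  }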
Example #2
  public int run(String[] args) throws Exception {
    Job job =
        Job.getInstance(
            getConf(),
            "Import vessel locations from files in "
                + args[0]
                + " into table cdb_vessel:vessel_location");

    FileInputFormat.addInputPath(job, new Path(args[0]));

    job.setJarByClass(ImportVTLocationFromFileWithReducer.class);
    job.setJobName("Vessel_location_injection");
    job.setInputFormatClass(VTVesselLocationFileInputFormat.class);
    job.setMapOutputKeyClass(Key_IMOAndRecordTime.class);
    job.setMapOutputValueClass(TextArrayWritable.class);

    job.setPartitionerClass(Partitioner_IMO.class);
    job.setGroupingComparatorClass(GroupComparator_IMO.class);

    job.setReducerClass(ImportReducer.class);
    job.setNumReduceTasks(Integer.parseInt(args[1]));

    job.setOutputFormatClass(NullOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
  }
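
The IMO-plus-record-time composite key above only yields a correct secondary sort if
partitioning uses the IMO alone. A hypothetical sketch of what Partitioner_IMO could
look like, assuming the key exposes getIMO():

  import org.apache.hadoop.mapreduce.Partitioner;

  // Route each record by the natural key (the vessel's IMO number) so all
  // positions of one vessel reach the same reducer, while the record time in
  // the composite key controls their sort order.
  public static class Partitioner_IMO
      extends Partitioner<Key_IMOAndRecordTime, TextArrayWritable> {
    @Override
    public int getPartition(Key_IMOAndRecordTime key, TextArrayWritable value,
        int numPartitions) {
      return (key.getIMO().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
  }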
Example #3
  @Override
  public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Rating predictor MR";
    job.setJobName(jobName);

    job.setJarByClass(UtilityPredictor.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(UtilityPredictor.PredictionMapper.class);
    job.setReducerClass(UtilityPredictor.PredictorReducer.class);

    job.setMapOutputKeyClass(TextInt.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(ItemIdGroupComprator.class);
    job.setPartitionerClass(ItemIdPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());
    int numReducer = job.getConfiguration().getInt("utp.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
  }
Example #4
  @Override
  public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Implicit rating estimator MR";
    job.setJobName(jobName);

    job.setJarByClass(ImplicitRatingEstimator.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(ImplicitRatingEstimator.RatingEstimatorMapper.class);
    job.setReducerClass(ImplicitRatingEstimator.RatingEstimatorReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());
    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));
    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
  }
Example #5
  @Override
  public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
      System.err.println("Usage: topreviews <in> [<in>...] <out>");
      return 2;
    }

    Job job = Job.getInstance(conf, "Top Five Reviews");
    job.setJarByClass(TopFive.class);
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setMapperClass(TopFiveMapper.class);
    job.setReducerClass(TopFiveReducer.class);

    job.setMapOutputKeyClass(TextPair.class);
    job.setMapOutputValueClass(TextPair.class);

    job.setOutputKeyClass(TextPair.class);
    job.setOutputValueClass(TextPair.class);

    for (int i = 0; i < otherArgs.length - 1; i++) {
      FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    return job.waitForCompletion(true) ? 0 : 1;
  }
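
The top-N pattern above hinges on the sort comparator ordering values within each
natural-key group. A sketch of such a comparator, assuming TextPair exposes
getFirst()/getSecond() (the real CompositeKeyComparator may differ):

  import org.apache.hadoop.io.WritableComparable;
  import org.apache.hadoop.io.WritableComparator;

  // Sort by the natural key ascending, then by the secondary field descending,
  // so a reducer sees the highest-rated entries first and can emit the top
  // five and stop.
  public static class CompositeKeyComparator extends WritableComparator {
    protected CompositeKeyComparator() {
      super(TextPair.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
      TextPair p1 = (TextPair) a;
      TextPair p2 = (TextPair) b;
      int cmp = p1.getFirst().compareTo(p2.getFirst());
      return cmp != 0 ? cmp : p2.getSecond().compareTo(p1.getSecond());
    }
  }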
Example #6
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    //		    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    //		    if (otherArgs.length != 3) {
    //		      System.err.println("Usage: <tradeTableDir> <payTableDir> <output>");
    //		      System.exit(2);
    //		    }
    //		    String tradeTableDir = args[0];
    //		    String payTableDir = args[1];
    //		    String joinTableDir = args[2];
    Job job = new Job(conf, "Join");
    job.setJarByClass(JoinMain.class);
    job.setMapperClass(PreMapper.class);

    job.setMapOutputKeyClass(TextPair.class);
    job.setPartitionerClass(KeyPartition.class);
    job.setGroupingComparatorClass(FirstComparator.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setReducerClass(CommonReduce.class);
    FileInputFormat.addInputPath(job, new Path("/user/hadoop/input/load3/action.txt"));
    FileInputFormat.addInputPath(job, new Path("/user/hadoop/input/load3/alipay.txt"));
    FileOutputFormat.setOutputPath(job, new Path("/user/hadoop/output3/"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
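
For this reduce-side join to work, PreMapper has to tag each record with its source,
so FirstComparator can group both tables on the join key while the tag keeps the two
sides apart inside each group. A hypothetical sketch, assuming a
TextPair(String, String) constructor:

  import java.io.IOException;
  import org.apache.hadoop.io.LongWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Mapper;
  import org.apache.hadoop.mapreduce.lib.input.FileSplit;

  // Emit (joinKey, sourceTag) as the composite key; the partitioner and
  // grouping comparator then bring matching rows of both files together.
  public static class PreMapper extends Mapper<LongWritable, Text, TextPair, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String joinKey = value.toString().split("\t")[0];
      String tag = ((FileSplit) context.getInputSplit()).getPath().getName();
      context.write(new TextPair(joinKey, tag), value);
    }
  }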
Example #7
  @Override
  public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Top n matches MR";
    job.setJobName(jobName);

    job.setJarByClass(TopMatches.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(TopMatches.TopMatchesMapper.class);
    job.setReducerClass(TopMatches.TopMatchesReducer.class);
    job.setCombinerClass(TopMatches.TopMatchesCombiner.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TupleTextPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());
    int numReducer = job.getConfiguration().getInt("tom.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
  }
Example #8
  public static void main(String[] args) throws Exception {

    Configuration conf = new Configuration();
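    // mapred.textoutputformat.separator sets the key/value delimiter written
    // by TextOutputFormat (newer Hadoop releases use
    // mapreduce.output.textoutputformat.separator)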
    conf.set("mapred.textoutputformat.separator", ",");
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = new Job(conf, "Max");

    job.setPartitionerClass(ActualKeyPartitioner.class);
    job.setGroupingComparatorClass(ActualKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setJarByClass(map_reduce.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.setNumReduceTasks(27);

    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
Example #9
  @Override
  public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Different type entity similarity MR";
    job.setJobName(jobName);

    job.setJarByClass(DiffTypeSimilarity.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(DiffTypeSimilarity.SimilarityMapper.class);
    job.setReducerClass(DiffTypeSimilarity.SimilarityReducer.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(IdPairGroupComprator.class);
    job.setPartitionerClass(IdPairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());

    int numReducer = job.getConfiguration().getInt("dts.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
  }
Example #10
  public static Job startJob(String[] args) throws IOException {

    // args[0] = hbase table name
    // args[1] = zookeeper

    Configuration hConf = HBaseConfiguration.create(new Configuration());
    hConf.set("hbase.zookeeper.quorum", args[1]);
    hConf.set("scan.table", args[0]);
    hConf.set("hbase.zookeeper.property.clientPort", "2181");

    Scan scan = new Scan();
    // scan.setFilter(rowColBloomFilter());

    Job job = new Job(hConf);
    job.setJobName("BSBM-Q11-RepartitionJoin");
    job.setJarByClass(RepartitionJoinQ11.class);
    // Change caching to speed up the scan
    scan.setCaching(500);
    scan.setMaxVersions(200);
    scan.setCacheBlocks(false);

    // Mapper settings
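    // initTableMapperJob also serializes the Scan into the job configuration
    // and sets TableInputFormat as the job's input format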
    TableMapReduceUtil.initTableMapperJob(
        args[0], // input HBase table name
        scan, // Scan instance to control CF and attribute selection
        RepartitionMapper.class, // mapper
        CompositeKeyWritable.class, // mapper output key
        KeyValueArrayWritable.class, // mapper output value
        job);

    // Repartition settings
    job.setPartitionerClass(CompositePartitioner.class);
    job.setSortComparatorClass(CompositeSortComparator.class);
    job.setGroupingComparatorClass(CompositeGroupingComparator.class);

    // Reducer settings
    job.setReducerClass(SharedServices.RepartitionJoin_Reducer.class); // reducer class
    job.setNumReduceTasks(1); // at least one, adjust as required

    FileOutputFormat.setOutputPath(job, new Path("output/BSBMQ11"));

    try {
      // don't call System.exit() here: callers expect the Job back
      job.waitForCompletion(true);
    } catch (ClassNotFoundException | InterruptedException e) {
      e.printStackTrace();
    }

    return job;
  }
Example #11
  public Job call() throws IOException, InterruptedException, ClassNotFoundException {
    job.setMapperClass(GridmixMapper.class);
    job.setReducerClass(GridmixReducer.class);
    job.setNumReduceTasks(jobdesc.getNumberReduces());
    job.setMapOutputKeyClass(GridmixKey.class);
    job.setMapOutputValueClass(GridmixRecord.class);
    job.setSortComparatorClass(GridmixKey.Comparator.class);
    job.setGroupingComparatorClass(SpecGroupingComparator.class);
    job.setInputFormatClass(GridmixInputFormat.class);
    job.setOutputFormatClass(RawBytesOutputFormat.class);
    job.setPartitionerClass(DraftPartitioner.class);
    job.setJarByClass(GridmixJob.class);
    job.getConfiguration().setInt("gridmix.job.seq", seq);
    job.getConfiguration()
        .set(ORIGNAME, null == jobdesc.getJobID() ? "<unknown>" : jobdesc.getJobID().toString());
    job.getConfiguration().setBoolean("mapred.used.genericoptionsparser", true);
    FileInputFormat.addInputPath(job, new Path("ignored"));
    FileOutputFormat.setOutputPath(job, outdir);
    job.submit();
    return job;
  }
Example #12
  @SuppressWarnings("rawtypes")
  public void afterPropertiesSet() throws Exception {
    final Configuration cfg = ConfigurationUtils.createFrom(configuration, properties);

    buildGenericOptions(cfg);

    if (StringUtils.hasText(user)) {
      UserGroupInformation ugi =
          UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser());
      ugi.doAs(
          new PrivilegedExceptionAction<Void>() {

            @Override
            public Void run() throws Exception {
              job = new Job(cfg);
              return null;
            }
          });
    } else {
      job = new Job(cfg);
    }

    ClassLoader loader =
        (beanClassLoader != null
            ? beanClassLoader
            : org.springframework.util.ClassUtils.getDefaultClassLoader());

    if (jar != null) {
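      // a parent-last classloader lets classes bundled in the job jar shadow
      // identically named classes already on the application classpath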
      JobConf conf = (JobConf) job.getConfiguration();
      conf.setJar(jar.getURI().toString());
      loader = ExecutionUtils.createParentLastClassLoader(jar, beanClassLoader, cfg);
      conf.setClassLoader(loader);
    }

    // set the mapper first so the key/value types can be auto-detected and
    // need not be specified explicitly
    if (mapper != null) {
      Class<? extends Mapper> mapperClass = resolveClass(mapper, loader, Mapper.class);
      job.setMapperClass(mapperClass);
      configureMapperTypesIfPossible(job, mapperClass);
    }

    if (reducer != null) {
      Class<? extends Reducer> reducerClass = resolveClass(reducer, loader, Reducer.class);
      job.setReducerClass(reducerClass);
      configureReducerTypesIfPossible(job, reducerClass);
    }

    if (StringUtils.hasText(name)) {
      job.setJobName(name);
    }
    if (combiner != null) {
      job.setCombinerClass(resolveClass(combiner, loader, Reducer.class));
    }
    if (groupingComparator != null) {
      job.setGroupingComparatorClass(resolveClass(groupingComparator, loader, RawComparator.class));
    }
    if (inputFormat != null) {
      job.setInputFormatClass(resolveClass(inputFormat, loader, InputFormat.class));
    }
    if (mapKey != null) {
      job.setMapOutputKeyClass(resolveClass(mapKey, loader, Object.class));
    }
    if (mapValue != null) {
      job.setMapOutputValueClass(resolveClass(mapValue, loader, Object.class));
    }
    if (numReduceTasks != null) {
      job.setNumReduceTasks(numReduceTasks);
    }
    if (key != null) {
      job.setOutputKeyClass(resolveClass(key, loader, Object.class));
    }
    if (value != null) {
      job.setOutputValueClass(resolveClass(value, loader, Object.class));
    }
    if (outputFormat != null) {
      job.setOutputFormatClass(resolveClass(outputFormat, loader, OutputFormat.class));
    }
    if (partitioner != null) {
      job.setPartitionerClass(resolveClass(partitioner, loader, Partitioner.class));
    }
    if (sortComparator != null) {
      job.setSortComparatorClass(resolveClass(sortComparator, loader, RawComparator.class));
    }
    if (StringUtils.hasText(workingDir)) {
      job.setWorkingDirectory(new Path(workingDir));
    }
    if (jarClass != null) {
      job.setJarByClass(jarClass);
    }

    if (!CollectionUtils.isEmpty(inputPaths)) {
      for (String path : inputPaths) {
        FileInputFormat.addInputPath(job, new Path(path));
      }
    }

    if (StringUtils.hasText(outputPath)) {
      FileOutputFormat.setOutputPath(job, new Path(outputPath));
    }

    if (compressOutput != null) {
      FileOutputFormat.setCompressOutput(job, compressOutput);
    }

    if (codecClass != null) {
      FileOutputFormat.setOutputCompressorClass(
          job, resolveClass(codecClass, loader, CompressionCodec.class));
    }

    processJob(job);
  }
Example #13
  /** The main entry point if this class is called as a {@link Tool}. */
  @Override
  public int run(String[] args) throws Exception {
    Path inputPath = null;
    Path outputPath = null;

    Configuration conf = getConf();

    // retrieve our paths from the configuration
    inputPath = new Path(conf.get(Util.CONF_LOGDATA_PATH));
    outputPath = new Path(conf.get(Util.CONF_CACHING_SIMULATOR_PATH));

    final int numCores = conf.getInt(Util.CONF_NUM_CORES, Util.DEFAULT_NUM_CORES);
    final int numNodes = conf.getInt(Util.CONF_NUM_NODES, Util.DEFAULT_NUM_NODES);

    NUM_OF_REDUCE_TASKS = numCores * numNodes;

    // set the jobname
    String jobName =
        Util.JOB_NAME
            + " ["
            + CachingTool.ACTION
            + "] {logdata="
            + inputPath.getName()
            + ", session="
            + conf.get(Util.CONF_SESSION_DURATION)
            + "}";

    Util.showStatus("Running " + jobName);

    conf.set("hadoop.job.ugi", Util.HADOOP_USER);
    conf.set("mapred.child.java.opts", "-Xmx1500M -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode");
    conf.set("mapred.task.timeout", "1800000");
    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    FileSystem fs = FileSystem.get(conf);

    Job job = new Job(conf, jobName);

    // set number of reduce tasks
    job.setNumReduceTasks(NUM_OF_REDUCE_TASKS);

    // set mapper, reducer, partitioner and grouping comparator
    job.setJarByClass(CachingTool.class);
    job.setMapperClass(CachingMapper.class);
    job.setReducerClass(CachingReducer.class);
    // grouping comparator used for secondary sort
    job.setGroupingComparatorClass(TextPair.FirstComparator.class);
    job.setPartitionerClass(TextPair.FirstPartitioner.class);
    job.setOutputKeyClass(TextPair.class);
    job.setOutputValueClass(Text.class);

    // set input and output format
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

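    // pinning the minimum and maximum to the same value forces every input
    // split to be exactly DATASET_MB_SPLIT * 25 bytes (except a file's last split)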
    FileInputFormat.setMaxInputSplitSize(job, Util.DATASET_MB_SPLIT * 25);
    FileInputFormat.setMinInputSplitSize(job, Util.DATASET_MB_SPLIT * 25);

    // add input path subdirectories if there are any
    ArrayList<Path> inputPaths = Util.getInputDirectories(fs, inputPath);
    int pathsAdded = 0;
    if (inputPaths.size() > 0) {
      for (Path p : inputPaths) {
        if (!p.getName().contains(".") && !p.getName().contains("_")) {
          Util.showStatus("Adding input paths " + p);
          FileInputFormat.addInputPath(job, p);
          pathsAdded++;
        }
      }
    }

    if (pathsAdded == 0) {
      Util.showStatus("Adding input path " + inputPath);
      FileInputFormat.addInputPath(job, inputPath);
    }

    // clear output dir
    fs.delete(outputPath.suffix("/" + conf.get(Util.CONF_SESSION_DURATION)), true);

    FileOutputFormat.setOutputPath(
        job, outputPath.suffix("/" + conf.get(Util.CONF_SESSION_DURATION)));

    // run the job and wait for it to be completed
    boolean b = job.waitForCompletion(true);

    // NOTE! The counters will be written HERE
    // retrieve the counters
    Counter numNewInCache = job.getCounters().findCounter(CachingReducer.CacheCounter.NEW_TO_CACHE);
    Counter numRenewCache = job.getCounters().findCounter(CachingReducer.CacheCounter.RENEW_CACHE);
    Counter numUsedFromCache =
        job.getCounters().findCounter(CachingReducer.CacheCounter.USED_FROM_CACHE);

    // write the counters to the metadata file
    Path headerPath = new Path(conf.get(Util.CONF_CACHING_SIMULATOR_PATH));
    FSDataOutputStream out =
        fs.create(headerPath.suffix("/" + DataSetHeader.SIMULATE_CACHING_METADATA_FILE));
    PrintWriter w = new PrintWriter(out);

    // the sum of all counters equals the sum of all queries in the log file
    w.println("hostnametypeAddedToCache=" + numNewInCache.getValue());
    w.println("queriesAddedAgainToCache=" + numRenewCache.getValue());
    w.println("queriesAnsweredFromCache=" + numUsedFromCache.getValue());

    w.close();
    out.close();

    // Delete all empty output files
    Util.deleteEmptyFiles(fs, outputPath.suffix("/" + conf.get(Util.CONF_SESSION_DURATION)));

    return b ? 0 : 1;
  }
Example #14
  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(
        "similarityClassname",
        "s",
        "Name of distributed similarity class to instantiate, alternatively use "
            + "one of the predefined similarities ("
            + SimilarityType.listEnumNames()
            + ')');
    addOption(
        "maxSimilaritiesPerItem",
        "m",
        "try to cap the number of similar items per item to this number "
            + "(default: "
            + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM
            + ')',
        String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption(
        "maxCooccurrencesPerItem",
        "mo",
        "try to cap the number of cooccurrences per item to this number "
            + "(default: "
            + DEFAULT_MAX_COOCCURRENCES_PER_ITEM
            + ')',
        String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    String similarityClassName = parsedArgs.get("--similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

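    // shouldRunNextPhase() increments currentPhase and checks it against the
    // optional --startPhase/--endPhase arguments, so individual phases of
    // this pipeline can be skipped or re-run in isolation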
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job itemIDIndex =
          prepareJob(
              inputPath,
              itemIDIndexPath,
              TextInputFormat.class,
              ItemIDIndexMapper.class,
              VarIntWritable.class,
              VarLongWritable.class,
              ItemIDIndexReducer.class,
              VarIntWritable.class,
              VarLongWritable.class,
              SequenceFileOutputFormat.class);
      itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
      itemIDIndex.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job countUsers =
          prepareJob(
              inputPath,
              countUsersPath,
              TextInputFormat.class,
              CountUsersMapper.class,
              CountUsersKeyWritable.class,
              VarLongWritable.class,
              CountUsersReducer.class,
              VarIntWritable.class,
              NullWritable.class,
              TextOutputFormat.class);
      countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
      countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
      countUsers.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job toUserVector =
          prepareJob(
              inputPath,
              userVectorPath,
              TextInputFormat.class,
              ToItemPrefsMapper.class,
              VarLongWritable.class,
              booleanData ? VarLongWritable.class : EntityPrefWritable.class,
              ToUserVectorReducer.class,
              VarLongWritable.class,
              VectorWritable.class,
              SequenceFileOutputFormat.class);
      toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
      toUserVector.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job maybePruneAndTranspose =
          prepareJob(
              userVectorPath,
              itemUserMatrixPath,
              SequenceFileInputFormat.class,
              MaybePruneRowsMapper.class,
              IntWritable.class,
              DistributedRowMatrix.MatrixEntryWritable.class,
              ToItemVectorsReducer.class,
              IntWritable.class,
              VectorWritable.class,
              SequenceFileOutputFormat.class);
      maybePruneAndTranspose
          .getConfiguration()
          .setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES, maxCooccurrencesPerItem);
      maybePruneAndTranspose.waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    /* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
     * new DistributedRowMatrix(...).rowSimilarity(...) */
    ToolRunner.run(
        getConf(),
        new RowSimilarityJob(),
        new String[] {
          "-Dmapred.input.dir=" + itemUserMatrixPath.toString(),
          "-Dmapred.output.dir=" + similarityMatrixPath.toString(),
          "--numberOfColumns",
          String.valueOf(numberOfUsers),
          "--similarityClassname",
          similarityClassName,
          "--maxSimilaritiesPerRow",
          String.valueOf(maxSimilarItemsPerItem + 1),
          "--tempDir",
          tempDirPath.toString()
        });

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job mostSimilarItems =
          prepareJob(
              similarityMatrixPath,
              outputPath,
              SequenceFileInputFormat.class,
              MostSimilarItemPairsMapper.class,
              EntityEntityWritable.class,
              DoubleWritable.class,
              MostSimilarItemPairsReducer.class,
              EntityEntityWritable.class,
              DoubleWritable.class,
              TextOutputFormat.class);
      Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
      mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
      mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
      mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
      mostSimilarItems.waitForCompletion(true);
    }

    return 0;
  }
Example #15
  private int setJobParameters(Job job, AminoJob aj) throws Exception {
    final Configuration conf = job.getConfiguration();
    final Class<? extends DataLoader> dataLoaderClass = aj.getDataLoaderClass();
    AminoInputFormat.setDataLoader(job.getConfiguration(), dataLoaderClass.newInstance());

    if (aj instanceof AminoEnrichmentJob) {
      String output = "";
      int returnType = JOB_TYPE_ENRICHMENT;

      if (aj instanceof AminoReuseEnrichmentJob) {
        System.out.println("Running REUSE Enrichment Join Job");

        AminoReuseEnrichmentJob reuseJob = (AminoReuseEnrichmentJob) aj;
        AminoInputFormat.setDataLoader(
            job.getConfiguration(), reuseJob.getFirstPhaseDataLoaderClass().newInstance());

        String root = conf.get(AminoDriverUtils.ENRICHMENT_ROOT_OUTPUT);
        if (!root.endsWith("/")) {
          root += "/";
        }
        String dir = reuseJob.getOutputSubDirectory(conf);
        output = root + dir;

        returnType = JOB_TYPE_REUSE_ENRICHMENT;
      } else {
        System.out.println("Running Enrichment Join Job");
      }

      int numReducers =
          conf.getInt(
              AMINO_NUM_REDUCERS_ENRICH_PHASE1,
              conf.getInt(AMINO_NUM_REDUCERS, DEFAULT_NUM_REDUCERS));
      job.setNumReduceTasks(numReducers);

      // Our Framework mapper and reducer
      job.setMapperClass(FrameworkEnrichmentJoinMapper.class);
      job.setCombinerClass(FrameworkEnrichmentJoinCombiner.class);
      job.setReducerClass(FrameworkEnrichmentJoinReducer.class);

      job.setMapOutputKeyClass(EnrichmentJoinKey.class); // Different
      job.setMapOutputValueClass(MapWritable.class);

      job.setOutputKeyClass(BucketStripped.class);
      job.setOutputValueClass(MapWritable.class); // Different

      job.setPartitionerClass(NaturalKeyPartitioner.class);
      job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
      job.setSortComparatorClass(CompositeKeyComparator.class);

      job.setInputFormatClass(AminoMultiInputFormat.class);

      AminoEnrichmentJob aej = (AminoEnrichmentJob) aj;
      // AminoMultiInputFormat.setJoinDataLoader(conf, aej.getEnrichmentDataLoader().newInstance());
      AminoMultiInputFormat.setJoinDataLoaders(conf, aej.getEnrichmentDataLoaders());
      AminoMultiInputFormat.setEnrichWorker(conf, aej.getEnrichWorker().newInstance());

      job.setOutputFormatClass(SequenceFileOutputFormat.class);

      // TODO If it already exists, and its age is less than job running frequency, just reuse it
      // instead of doing the above job...
      if (output.length() == 0) {
        output = getEnrichmentOutputPath(aej, conf);
      }
      System.out.println("Output will be written to: " + PathUtils.getJobDataPath(output));

      SequenceFileOutputFormat.setOutputPath(job, new Path(PathUtils.getJobDataPath(output)));
      JobUtilities.deleteDirectory(conf, output);

      CacheBuilder.buildCaches(AminoDataUtils.getDataLoader(conf), aj, output, conf);

      return returnType;

    } else {
      System.out.println("\n==================== Running Amino Job =================\n");

      // Our Framework mapper and reducer
      job.setMapperClass(FrameworkMapper.class);
      job.setReducerClass(FrameworkReducer.class);

      job.setMapOutputKeyClass(BucketStripped.class);
      job.setMapOutputValueClass(MapWritable.class);

      job.setOutputKeyClass(BucketStripped.class);
      job.setOutputValueClass(AminoWritable.class);

      job.setInputFormatClass(AminoInputFormat.class);

      job.setOutputFormatClass(AminoOutputFormat.class);
      job.setNumReduceTasks(conf.getInt(AMINO_NUM_REDUCERS, DEFAULT_NUM_REDUCERS));

      AminoOutputFormat.setAminoConfigPath(
          job, job.getConfiguration().get(AminoConfiguration.DEFAULT_CONFIGURATION_PATH_KEY));

      String output = conf.get("amino.output");
      System.out.println("Output will be written to: " + PathUtils.getJobDataPath(output));
      AminoOutputFormat.setOutputPath(job, new Path(PathUtils.getJobDataPath(output)));
      JobUtilities.deleteDirectory(conf, output);

      CacheBuilder.buildCaches(AminoDataUtils.getDataLoader(conf), aj, output, conf);
      return JOB_TYPE_NORMAL;
    }
  }