Exemple #1
0
  /**
   * Builds the in-memory bucket cache by reading the bucket-cache MapFile found under every cache
   * path returned by {@code PathUtils.getCachePaths(conf)}.
   *
   * <p>Each reader is closed as soon as its file has been consumed, including when reading fails
   * part-way through. Once all files are loaded, every cached bucket is printed to standard
   * output.
   *
   * @param conf configuration used to resolve the cache paths and to open each MapFile
   * @throws IOException if a MapFile cannot be opened, read, or closed
   */
  public BucketCache(Configuration conf) throws IOException {
    bucketCache = new HashMap<IntWritable, Bucket>();

    for (String cachePath : PathUtils.getCachePaths(conf)) {
      String bucketCachePath = cachePath + BUCKET_CACHE_FOLDER;
      MapFile.Reader reader = new MapFile.Reader(new Path(bucketCachePath), conf);
      try {
        // next() reads into the key/value instances handed to it, so the same two objects are
        // reused for every record and a fresh copy of each is what gets stored in the map.
        IntWritable key = new IntWritable();
        Bucket value = new Bucket();
        while (reader.next(key, value)) {
          bucketCache.put(new IntWritable(key.get()), new Bucket(value));
        }
      } finally {
        // Release the underlying file handles even when reading throws.
        reader.close();
      }
    }

    for (IntWritable i : bucketCache.keySet()) {
      System.out.println("Loaded bucket from cache:" + i.get() + ":" + bucketCache.get(i));
    }
  }
Exemple #2
0
  /**
   * Writes the cached buckets to a MapFile located at
   * {@code PathUtils.getCachePath(conf) + BUCKET_CACHE_FOLDER}, appending the entries in ascending
   * key order.
   *
   * <p>When {@code writeToDistributedCache} is set, every non-directory entry listed under that
   * location is afterwards registered with the {@code DistributedCache} of {@code conf}.
   *
   * @param conf configuration used to resolve the cache path, the file system and the writer
   * @param writeToDistributedCache whether the written files are also added as cache files
   * @throws IOException if the file system cannot be obtained or the MapFile cannot be written
   */
  @SuppressWarnings("unchecked")
  public void writeToDisk(Configuration conf, boolean writeToDistributedCache) throws IOException {
    final String targetLocation = PathUtils.getCachePath(conf) + BUCKET_CACHE_FOLDER;
    final FileSystem fileSystem = FileSystem.get(conf);

    // If the constructor throws there is nothing to close, so the writer is created ahead of the
    // try block.
    final MapFile.Writer mapWriter =
        new MapFile.Writer(
            conf,
            new Path(targetLocation),
            MapFile.Writer.keyClass(IntWritable.class),
            MapFile.Writer.valueClass(Bucket.class));
    try {
      // Entries are appended in sorted key order.
      final ArrayList<IntWritable> sortedKeys = new ArrayList<IntWritable>(bucketCache.keySet());
      Collections.sort(sortedKeys);

      for (IntWritable bucketKey : sortedKeys) {
        mapWriter.append(bucketKey, bucketCache.get(bucketKey));
      }
    } finally {
      if (mapWriter != null) {
        IOUtils.closeStream(mapWriter);
      }
    }

    if (!writeToDistributedCache) {
      return;
    }

    for (FileStatus entry : fileSystem.listStatus(new Path(targetLocation))) {
      if (entry.isDirectory()) {
        continue;
      }
      DistributedCache.addCacheFile(entry.getPath().toUri(), conf);
    }
  }
  /**
   * Configures and runs the second phase of an enrichment job and waits for it to finish.
   *
   * <p>The job is created from {@code conf} and named after {@code aej} with a {@code " phase 2"}
   * suffix. Depending on {@code jobType}, the {@code ENRICHMENT_OUTPUT} setting is either taken
   * from this driver's {@code enrichmentOutput} field or assembled as a comma-separated list of
   * the data paths of the directories reported by the reuse job. The output directory named by
   * {@code amino.output} is deleted before the caches are built and the job is submitted.
   *
   * @param aej the enrichment job to run; must be an {@code AminoReuseEnrichmentJob} when
   *     {@code jobType} is {@code JOB_TYPE_REUSE_ENRICHMENT}
   * @param conf configuration the Hadoop job is created from; for reuse jobs it must define
   *     {@code AminoDriverUtils.ENRICHMENT_ROOT_OUTPUT}
   * @param jobType {@code JOB_TYPE_ENRICHMENT} or {@code JOB_TYPE_REUSE_ENRICHMENT}; any other
   *     value leaves {@code ENRICHMENT_OUTPUT} untouched
   * @return the value returned by {@code job.waitForCompletion(true)}
   * @throws IllegalStateException if a reuse job is requested but the enrichment root output is
   *     not configured
   * @throws Exception if configuring or running the job fails
   */
  private boolean runSecondPhaseEnrichmentJob(
      AminoEnrichmentJob aej, Configuration conf, int jobType) throws Exception {
    System.out.println("Running Amino Job");

    final Job job = new Job(conf, aej.getJobName() + " phase 2");
    job.setJarByClass(aej.getClass());

    final Configuration jobConf = job.getConfiguration();
    AminoDriverUtils.setAminoJob(jobConf, aej.getClass());

    if (jobType == JOB_TYPE_ENRICHMENT) {
      jobConf.set(AminoDriverUtils.ENRICHMENT_OUTPUT, this.enrichmentOutput);
    } else if (jobType == JOB_TYPE_REUSE_ENRICHMENT) {
      String root = conf.get(AminoDriverUtils.ENRICHMENT_ROOT_OUTPUT);
      if (root == null) {
        // Fail with an actionable message instead of a bare NullPointerException below.
        throw new IllegalStateException(
            "Configuration property "
                + AminoDriverUtils.ENRICHMENT_ROOT_OUTPUT
                + " must be set to run a reuse enrichment job");
      }
      if (!root.endsWith("/")) {
        root += "/";
      }

      final Iterable<String> inputs =
          ((AminoReuseEnrichmentJob) aej).getSecondPhaseEnrichmentInputDirectories(jobConf);

      // Comma-separated list of the data paths of every reused enrichment directory.
      final StringBuilder inputPaths = new StringBuilder();
      System.out.println("Using enrichment input paths:");
      for (String input : inputs) {
        final String dataPath = PathUtils.getJobDataPath(root + input);
        if (inputPaths.length() > 0) {
          inputPaths.append(',');
        }
        inputPaths.append(dataPath);
        System.out.println(dataPath);
      }

      jobConf.set(AminoDriverUtils.ENRICHMENT_OUTPUT, inputPaths.toString());

      // Need to do this because the first phase data loader is sitting in this slot currently
      AminoInputFormat.setDataLoader(jobConf, aej.getDataLoaderClass().newInstance());
    }

    // The phase-2 specific reducer count wins; otherwise fall back to the general setting.
    int numReducers =
        jobConf.getInt(
            AMINO_NUM_REDUCERS_ENRICH_PHASE2,
            jobConf.getInt(AMINO_NUM_REDUCERS, DEFAULT_NUM_REDUCERS));
    job.setNumReduceTasks(numReducers);

    job.setMapperClass(FrameworkMapper.class);
    job.setReducerClass(FrameworkReducer.class);

    job.setMapOutputKeyClass(BucketStripped.class);
    job.setMapOutputValueClass(MapWritable.class);

    job.setOutputKeyClass(BucketStripped.class);
    job.setOutputValueClass(AminoWritable.class);

    job.setInputFormatClass(AminoMultiInputFormat.class);
    AminoMultiInputFormat.setDataLoader(jobConf, aej.getDataLoaderClass().newInstance());

    // Call job configuration for special properties
    jobConfiguration(job);

    // The only join source in this phase is the enrichment data loader.
    ArrayList<Class<? extends DataLoader>> joinSource =
        new ArrayList<Class<? extends DataLoader>>();
    joinSource.add(EnrichmentDataLoader.class);
    AminoMultiInputFormat.setJoinDataLoaders(jobConf, joinSource);

    job.setOutputFormatClass(AminoOutputFormat.class);
    AminoOutputFormat.setAminoConfigPath(
        job, jobConf.get(AminoConfiguration.DEFAULT_CONFIGURATION_PATH_KEY));

    String output = jobConf.get("amino.output");
    final String outputDataPath = PathUtils.getJobDataPath(output);
    System.out.println("Output will be written to: " + outputDataPath);
    AminoOutputFormat.setOutputPath(job, new Path(outputDataPath));
    JobUtilities.deleteDirectory(jobConf, output);
    CacheBuilder.buildCaches(AminoDataUtils.getDataLoader(jobConf), aej, output, jobConf);

    return job.waitForCompletion(true);
  }
  /**
   * Applies the mapper, reducer, key/value, input-format and output settings that {@code aj}
   * requires to {@code job}, clears the job's output directory and builds the caches.
   *
   * <p>Three kinds of job are distinguished:
   *
   * <ul>
   *   <li>an {@code AminoReuseEnrichmentJob}: enrichment join set-up using the job's first-phase
   *       data loader, with output written below the configured enrichment root output;
   *   <li>any other {@code AminoEnrichmentJob}: enrichment join set-up, with output written to
   *       the path returned by {@code getEnrichmentOutputPath};
   *   <li>every other job: the standard framework mapper/reducer set-up, with output written to
   *       the {@code amino.output} location.
   * </ul>
   *
   * @param job the Hadoop job to configure; its configuration is modified in place
   * @param aj the Amino job describing the data loaders and workers to use
   * @return {@code JOB_TYPE_REUSE_ENRICHMENT}, {@code JOB_TYPE_ENRICHMENT} or
   *     {@code JOB_TYPE_NORMAL}, matching the kind of job that was configured
   * @throws IllegalStateException if a reuse enrichment job is configured but
   *     {@code AminoDriverUtils.ENRICHMENT_ROOT_OUTPUT} is not set
   * @throws Exception if a loader or worker cannot be instantiated or set-up otherwise fails
   */
  private int setJobParameters(Job job, AminoJob aj) throws Exception {
    final Configuration conf = job.getConfiguration();
    final Class<? extends DataLoader> dataLoaderClass = aj.getDataLoaderClass();
    AminoInputFormat.setDataLoader(conf, dataLoaderClass.newInstance());

    if (aj instanceof AminoEnrichmentJob) {
      String output = "";
      int returnType = JOB_TYPE_ENRICHMENT;

      if (aj instanceof AminoReuseEnrichmentJob) {
        System.out.println("Running REUSE Enrichment Join Job");

        AminoReuseEnrichmentJob reuseJob = (AminoReuseEnrichmentJob) aj;
        // The first phase of a reuse job reads with its own loader, replacing the one set above.
        AminoInputFormat.setDataLoader(
            conf, reuseJob.getFirstPhaseDataLoaderClass().newInstance());

        String root = conf.get(AminoDriverUtils.ENRICHMENT_ROOT_OUTPUT);
        if (root == null) {
          // Fail with an actionable message instead of a bare NullPointerException below.
          throw new IllegalStateException(
              "Configuration property "
                  + AminoDriverUtils.ENRICHMENT_ROOT_OUTPUT
                  + " must be set to run a reuse enrichment job");
        }
        if (!root.endsWith("/")) {
          root += "/";
        }
        output = root + reuseJob.getOutputSubDirectory(conf);

        returnType = JOB_TYPE_REUSE_ENRICHMENT;
      } else {
        System.out.println("Running Enrichment Join Job");
      }

      // The phase-1 specific reducer count wins; otherwise fall back to the general setting.
      int numReducers =
          conf.getInt(
              AMINO_NUM_REDUCERS_ENRICH_PHASE1,
              conf.getInt(AMINO_NUM_REDUCERS, DEFAULT_NUM_REDUCERS));
      job.setNumReduceTasks(numReducers);

      // Our Framework mapper and reducer
      job.setMapperClass(FrameworkEnrichmentJoinMapper.class);
      job.setCombinerClass(FrameworkEnrichmentJoinCombiner.class);
      job.setReducerClass(FrameworkEnrichmentJoinReducer.class);

      job.setMapOutputKeyClass(EnrichmentJoinKey.class); // Different
      job.setMapOutputValueClass(MapWritable.class);

      job.setOutputKeyClass(BucketStripped.class);
      job.setOutputValueClass(MapWritable.class); // Different

      job.setPartitionerClass(NaturalKeyPartitioner.class);
      job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
      job.setSortComparatorClass(CompositeKeyComparator.class);

      job.setInputFormatClass(AminoMultiInputFormat.class);

      AminoEnrichmentJob aej = (AminoEnrichmentJob) aj;
      AminoMultiInputFormat.setJoinDataLoaders(conf, aej.getEnrichmentDataLoaders());
      AminoMultiInputFormat.setEnrichWorker(conf, aej.getEnrichWorker().newInstance());

      job.setOutputFormatClass(SequenceFileOutputFormat.class);

      // TODO If it already exists, and its age is less than job running frequency, just reuse it
      // instead of doing the above job...
      if (output.length() == 0) {
        // Not a reuse job: no sub-directory was chosen above, so use the default location.
        output = getEnrichmentOutputPath(aej, conf);
      }
      final String outputDataPath = PathUtils.getJobDataPath(output);
      System.out.println("Output will be written to: " + outputDataPath);

      SequenceFileOutputFormat.setOutputPath(job, new Path(outputDataPath));
      JobUtilities.deleteDirectory(conf, output);

      CacheBuilder.buildCaches(AminoDataUtils.getDataLoader(conf), aj, output, conf);

      return returnType;

    } else {
      System.out.println("\n==================== Running Amino Job =================\n");

      // Our Framework mapper and reducer
      job.setMapperClass(FrameworkMapper.class);
      job.setReducerClass(FrameworkReducer.class);

      job.setMapOutputKeyClass(BucketStripped.class);
      job.setMapOutputValueClass(MapWritable.class);

      job.setOutputKeyClass(BucketStripped.class);
      job.setOutputValueClass(AminoWritable.class);

      job.setInputFormatClass(AminoInputFormat.class);

      job.setOutputFormatClass(AminoOutputFormat.class);
      job.setNumReduceTasks(conf.getInt(AMINO_NUM_REDUCERS, DEFAULT_NUM_REDUCERS));

      AminoOutputFormat.setAminoConfigPath(
          job, conf.get(AminoConfiguration.DEFAULT_CONFIGURATION_PATH_KEY));

      String output = conf.get("amino.output");
      final String outputDataPath = PathUtils.getJobDataPath(output);
      System.out.println("Output will be written to: " + outputDataPath);
      AminoOutputFormat.setOutputPath(job, new Path(outputDataPath));
      JobUtilities.deleteDirectory(conf, output);

      CacheBuilder.buildCaches(AminoDataUtils.getDataLoader(conf), aj, output, conf);
      return JOB_TYPE_NORMAL;
    }
  }