Example #1
  /**
   * Constructs CounterGroups from job runtime statistics
   *
   * @param counterNameToValue map from "groupName::counterName" to counter value
   * @return a map from counter group name to the corresponding CounterGroup
   */
  public static Map<String, CounterGroup> counterGroupInfoMap(
      Map<String, Double> counterNameToValue) {

    Counters counters = new Counters();
    for (Map.Entry<String, ? extends Number> entry : counterNameToValue.entrySet()) {

      String[] cNames = entry.getKey().split("::");
      String groupName = cNames[0];
      String counterName = cNames[1];
      Counter counter = counters.findCounter(groupName, counterName);
      counter.setValue(entry.getValue().longValue());
    }
    return CounterGroup.counterGroupInfoMap(counters);
  }
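A minimal usage sketch for the method above. The enclosing class name CounterGroupUtils is hypothetical (the listing does not show where the method is declared), and CounterGroup is the surrounding project's own type; the only contract illustrated is the "groupName::counterName" key convention that the method splits on.

import java.util.HashMap;
import java.util.Map;

// Hypothetical caller: CounterGroupUtils stands in for the class that declares
// counterGroupInfoMap, and CounterGroup comes from the surrounding project.
public class CounterGroupInfoMapDemo {
  public static void main(String[] args) {
    // Keys must follow the "groupName::counterName" convention the method splits on.
    Map<String, Double> counterNameToValue = new HashMap<String, Double>();
    counterNameToValue.put("FileSystemCounters::HDFS_BYTES_READ", 1048576.0);
    counterNameToValue.put("FileSystemCounters::HDFS_BYTES_WRITTEN", 2048.0);

    Map<String, CounterGroup> groups =
        CounterGroupUtils.counterGroupInfoMap(counterNameToValue);
    System.out.println(groups.keySet()); // e.g. [FileSystemCounters]
  }
}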
Example #2
 public static Object countersToJson(Counters counters) {
   Map<String, Object> jsonObj = new HashMap<String, Object>();
   Collection<String> counterGroups = counters.getGroupNames();
   for (String groupName : counterGroups) {
     Map<String, String> counterStats = new HashMap<String, String>();
     Group group = counters.getGroup(groupName);
     Iterator<Counters.Counter> it = group.iterator();
     while (it.hasNext()) {
       Counter counter = it.next();
       counterStats.put(counter.getDisplayName(), String.valueOf(counter.getCounter()));
     }
     jsonObj.put(groupName, counterStats);
   }
   return jsonObj;
 }
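The method above returns a nested Map (group name to counter name/value pairs) rather than a JSON string. If an actual JSON document is needed, the result can be handed to any JSON serializer; the sketch below uses Jackson's ObjectMapper, which is an assumption about the classpath, and calls countersToJson unqualified, which assumes the sketch lives in (or statically imports from) the declaring class.

import org.apache.hadoop.mapred.Counters;

import com.fasterxml.jackson.databind.ObjectMapper;

// Sketch only: Jackson on the classpath and access to countersToJson are assumed.
public class CountersToJsonDemo {
  public static void main(String[] args) throws Exception {
    Counters counters = new Counters();
    counters.findCounter("FileSystemCounters", "HDFS_BYTES_READ").increment(1048576L);
    counters.findCounter("FileSystemCounters", "HDFS_BYTES_WRITTEN").increment(2048L);

    // countersToJson(...) builds Map<groupName, Map<counterDisplayName, value>>;
    // ObjectMapper then turns that nested map into a JSON string.
    Object grouped = countersToJson(counters);
    String json = new ObjectMapper().writeValueAsString(grouped);
    System.out.println(json);
  }
}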
Example #3
  /**
   * Helper method for counter group map retrieval
   *
   * @param stats the HadoopStepStats to read counter groups and counters from
   * @return a map of counter name ("groupName::counterName") to counter value
   */
  private Map<String, Long> counterGroupInfoMapHelper(HadoopStepStats stats) {
    Counters counters = new Counters();
    Map<String, Long> counterNameToValue = new HashMap<String, Long>();
    for (String groupName : stats.getCounterGroups()) { // retrieving groups
      for (String counterName :
          stats.getCountersFor(groupName)) { // retrieving counters in that group
        Long counterValue = stats.getCounterValue(groupName, counterName);
        counterNameToValue.put(groupName + "::" + counterName, counterValue);

        // creating counter
        Counter counter = counters.findCounter(groupName, counterName);
        counter.setValue(counterValue);
      }
    }
    setCounterGroupMap(CounterGroup.counterGroupInfoMap(counters));
    return counterNameToValue;
  }
Example #4
  /**
   * Maps a single input value: reads the operand arrays described by the split,
   * applies the binary operation, and writes the resulting chunks.
   *
   * @param key the key for the given value being passed in
   * @param value the TransBinaryMapInputValue carrying the input split to process
   * @param context the Context object for the currently executing job
   */
  public void map(Object key, TransBinaryMapInputValue value, Context context)
      throws IOException, InterruptedException {
    TransBinaryInputSplit split = value.getSplit();
    ZoneClient zclient = null;
    OptimusZone zone1 = null;
    OptimusZone zone2 = null;
    OptimusZone zone3 = null;
    cl = (Counter) context.getCounter("TRANS_READ", "MAPPER_LOCAL_READ");
    cr = (Counter) context.getCounter("TRANS_READ", "MAPPER_REMOTE_READ");
    co = (Counter) context.getCounter("TRANS_WRITE", "MAPPER_WRITE");

    OptimusCatalogProtocol ci = null;
    boolean earlybird = split.isEarlybird();
    PartitionReader reader = null;
    try {
      OptimusConfiguration conf = new OptimusConfiguration(split.getConfDir());
      zclient = new ZoneClient(conf);
      ci = zclient.getCi();
      zone1 = ci.openZone(new ZoneID(split.getZid1()));
      zone2 = ci.openZone(new ZoneID(split.getZid2()));
      zone3 = ci.openZone(new ZoneID(split.getZid3()));
      reader = new PartitionReader(conf);
    } catch (WrongArgumentException e) {
      throw new IOException("Create Client Failure", e);
    } catch (JDOMException e) {
      throw new IOException("Create Client Failure", e);
    }
    String[] host = split.getLocations();
    Host h = UTILS.RandomHost(host, split.getHosts().getHosts());
    Object[] data2 = null;
    Object[] data1 = null;
    DataChunk chunk = new DataChunk(zone3.getSize().getShape(), zone3.getPstep().getShape());

    try {
      if (split.getZid1() == split.getZid2()
          && Arrays.equals(split.getStart1(), split.getStart2())) {

        data1 = this.colocatedRead(split, h, zone1, zone2, zone3);
        cl.increment(data1.length);
      } else {
        OptimusDataProtocol dp = h.getDataProtocol();
        int[] start1 = split.getStart1();
        int[] cstart1 = split.getCstart();
        int[] rstart1 = new int[start1.length];
        for (int i = 0; i < start1.length; i++) {
          rstart1[i] = start1[i] - cstart1[i];
        }
        data1 =
            (Double[])
                dp.readData(
                        new ArrayID(split.getAid1()),
                        new PID(split.getPnum1()),
                        new OptimusShape(split.getPs1()),
                        new OptimusShape(rstart1),
                        new OptimusShape(split.getRoff()))
                    .getData();
        data2 =
            (Double[]) reader.readData(zone2, split.getAid2(), split.getStart2(), split.getOff());
        // data1 = reader.readData(zone1, split.getAid1(), split.getStart1(), split.getOff());
        cl.increment(data1.length);
        cr.increment(data2.length);
        data1 = split.getCal().calcArray(data1, data2);
      }

    } catch (WrongArgumentException e) {
      throw new IOException("Reading from the second array failed", e);
    } catch (Exception e) {
      throw new IOException("Reading from the second array failed", e);
    }

    int[] off = split.getRoff();
    int[] rstart = split.getRstart();

    TRANSDataIterator itr1 =
        new TRANSDataIterator(
            new TransDataType(data1[0].getClass()), data1, rstart, split.getRoff());
    ZoneID zid = zone3.getId();
    ArrayID aid = new ArrayID(split.getAid3());
    RID rid = new RID(zone3.getStrategy().getShapes().size() - 2);
    Set<DataChunk> chunks = chunk.getAdjacentChunks(split.getRstart(), split.getOff());

    for (DataChunk c : chunks) {
      int[] cstart = c.getStart();
      int[] coff = c.getChunkSize();
      int[] nstart = new int[cstart.length];
      int[] noff = new int[cstart.length];
      int len = 1;
      for (int i = 0; i < cstart.length; i++) {
        nstart[i] = Math.max(cstart[i], rstart[i]);
        noff[i] = Math.min(cstart[i] + coff[i], rstart[i] + off[i]);
        noff[i] -= nstart[i];
        len *= noff[i];
      }

      Object[] tmp = new Object[len];
      TRANSDataIterator itr =
          new TRANSDataIterator(new TransDataType(data1[0].getClass()), tmp, nstart, noff);

      if (!itr.init(itr1.getStart(), itr1.getShape())) {
        continue;
      }
      itr1.init(itr.getStart(), itr.getShape());
      while (itr.next() && itr1.next()) {
        itr.set(itr1.get());
      }
      if (earlybird) {
        Partition p = new Partition(zid, aid, new PID(c.getChunkNum()), rid);

        Host ho = null;
        try {
          ci.CreatePartition(p);
          ho = ci.getReplicateHost(p, rid);
        } catch (WrongArgumentException e) {
          throw new IOException("Get Replicate host failure", e);
        }
        OptimusDataProtocol dp = ho.getDataProtocol();
        dp.putPartitionData(p, itr);
        co.increment(itr.getSize());
      } else {
        context.write(new IntWritable(c.getChunkNum()), itr);
      }
    }
  }
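The custom counters incremented in this map() method ("TRANS_READ"/"TRANS_WRITE") become visible to the driver once the job completes. Below is a minimal retrieval sketch, assuming the job is driven through the org.apache.hadoop.mapreduce.Job API; the actual job configuration (input format, mapper class, paths) is project-specific and omitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

// Sketch only: job setup is elided; the counter group/name strings match the
// literals used inside map() above.
public class TransBinaryCounterDemo {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "trans-binary");
    // ... set mapper, input/output formats and paths here ...
    job.waitForCompletion(true);

    long localReads = job.getCounters().findCounter("TRANS_READ", "MAPPER_LOCAL_READ").getValue();
    long remoteReads = job.getCounters().findCounter("TRANS_READ", "MAPPER_REMOTE_READ").getValue();
    long writes = job.getCounters().findCounter("TRANS_WRITE", "MAPPER_WRITE").getValue();
    System.out.printf("local=%d remote=%d written=%d%n", localReads, remoteReads, writes);
  }
}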
Example #5
  @Override
  public int run(String[] args) throws IOException {
    OptionParser p = new OptionParser();
    OptionSpec<String> maxwiOpt =
        p.accepts(maxwiOptName, "location of maxWi map file (HDFS) REQUIRED")
            .withRequiredArg()
            .ofType(String.class);
    OptionSpec<Float> thresholdOpt =
        p.accepts(thresholdOptName, "similarity threshold")
            .withRequiredArg()
            .ofType(Float.class)
            .defaultsTo(DEFAULT_THRESHOLD);
    OptionSpec<Integer> stripesOpt =
        p.accepts(stripesOptName, "number of stripes to divide the similarity matrix")
            .withRequiredArg()
            .ofType(Integer.class)
            .defaultsTo(1);
    OptionSpec<Integer> spreadOpt =
        p.accepts(spreadOptName, "number of reducers per stripe")
            .withRequiredArg()
            .ofType(Integer.class)
            .defaultsTo(DEFAULT_SPREAD);
    OptionSpec<Integer> factorOpt =
        p.accepts(factorOptName, "number of mappers per reducer")
            .withRequiredArg()
            .ofType(Integer.class)
            .defaultsTo(DEFAULT_FACTOR);
    OptionSpec<Integer> maxVectorIDOpt =
        p.accepts(maxVectorIDOptName, "maximum vector ID").withRequiredArg().ofType(Integer.class);
    p.acceptsAll(Arrays.asList("h", "?"), "show help");

    OptionSet options = parseOptions(p, args);

    // to distinguish indexes built in successive runs
    DateFormat df = new SimpleDateFormat("yyyyMMdd-HHmmss");
    Date date = new Date();

    float threshold = options.valueOf(thresholdOpt); // threshold
    if (threshold < 0 || threshold >= 1) {
      System.err.println(thresholdOptName + " should be between 0 and 1");
      System.exit(1);
    }

    int numStripes = options.valueOf(stripesOpt); // number of stripes
    if (numStripes < 1) {
      System.err.println(stripesOptName + " should be > 0");
      System.exit(1);
    }

    // MapReduce parameters
    int spread = options.valueOf(spreadOpt); // how many reducers per stripe
    if (spread < 1) {
      System.err.println(spreadOptName + " should be > 0");
      System.exit(1);
    }

    int factor = options.valueOf(factorOpt); // how many mappers per reducer
    if (factor < 1) {
      System.err.println(factorOptName + " should be > 0");
      System.exit(1);
    }

    int maxKey = 0;
    if (options.has(maxVectorIDOpt)) {
      maxKey = options.valueOf(maxVectorIDOpt); // maximum value of the vector ID
      if (maxKey < 1) {
        System.err.println(maxVectorIDOptName + " should be > 0");
        System.exit(1);
      }
    }

    int numReducers = GenericKey.StripePartitioner.numReducers(numStripes, spread);
    int numMappers = numReducers * factor;
    int numBuckets = numMappers;

    // pick the file with max weights from command line
    String maxWiDir = options.valueOf(maxwiOpt);
    List<String> nonOptArgs = options.nonOptionArguments();

    LOG.info("Threshold set to " + threshold);
    LOG.info(
        String.format(
            "Buckets: %1$-10s Factor: %2$-10s Stripes: %3$-10s Spread: %4$-10s Reducers: %5$-10s",
            numBuckets, factor, numStripes, spread, numReducers));

    // start building the jobs
    JobConf conf1 = new JobConf(getConf(), Similarity.class);
    conf1.setFloat(PARAM_APS_THRESHOLD, threshold);
    conf1.setInt(PARAM_APS_STRIPES, numStripes);
    DistributedCache.addCacheFile(URI.create(maxWiDir), conf1);

    Path inputPath = new Path(nonOptArgs.get(0));
    Path indexPath =
        new Path(
            nonOptArgs.get(0) + "-index-" + threshold + "-s" + numStripes + "_" + df.format(date));
    // glob matching only the part files inside the index directory (skips the pruned output)
    Path indexOnlyPath = new Path(indexPath, "part*");
    Path outputPath = new Path(nonOptArgs.get(1) + "-" + threshold + "-s" + numStripes);
    FileInputFormat.setInputPaths(conf1, inputPath);
    FileOutputFormat.setOutputPath(conf1, indexPath);

    conf1.setInputFormat(SequenceFileInputFormat.class);
    conf1.setOutputFormat(SequenceFileOutputFormat.class);
    conf1.setMapOutputKeyClass(LongWritable.class);
    conf1.setMapOutputValueClass(IndexItem.class);
    conf1.setOutputKeyClass(LongWritable.class);
    conf1.setOutputValueClass(IndexItemArrayWritable.class);
    conf1.setMapperClass(IndexerMapper.class);
    conf1.setReducerClass(IndexerReducer.class);

    // assuming input is sorted according to the key (vectorID) so that the
    // part files are locally sorted
    MultipleOutputs.addNamedOutput(
        conf1,
        PRUNED,
        SequenceFileOutputFormat.class,
        IntWritable.class,
        VectorComponentArrayWritable.class);

    // job name: the index directory name without the 16-character timestamp suffix
    conf1.set(
        "mapred.job.name",
        "APS-" + indexPath.getName().substring(0, indexPath.getName().length() - 16));
    conf1.setNumTasksToExecutePerJvm(-1); // JVM reuse
    conf1.setSpeculativeExecution(false);
    conf1.setCompressMapOutput(true);
    // hash the posting lists in different buckets to distribute the load
    conf1.setNumReduceTasks(numBuckets);

    RunningJob job1 = JobClient.runJob(conf1);

    // part 2
    JobConf conf2 = new JobConf(getConf(), Similarity.class);

    if (numStripes > 0) FileUtils.mergeRestFile(conf2, indexPath, PRUNED, INDEX_INTERVAL);

    MultipleInputs.addInputPath(
        conf2, indexOnlyPath, SequenceFileInputFormat.class, SimilarityMapperIndex.class);
    MultipleInputs.addInputPath(
        conf2, inputPath, SequenceFileInputFormat.class, SimilarityMapperInput.class);
    FileOutputFormat.setOutputPath(conf2, outputPath);
    conf2.setCombinerClass(SimilarityCombiner.class);
    conf2.setReducerClass(SimilarityReducer.class);
    conf2.setPartitionerClass(GenericKey.StripePartitioner.class);
    conf2.setOutputKeyComparatorClass(GenericKey.Comparator.class);
    conf2.setOutputValueGroupingComparator(GenericKey.PrimaryComparator.class);
    conf2.setMapOutputKeyClass(GenericKey.class);
    conf2.setMapOutputValueClass(GenericValue.class);
    conf2.setOutputKeyClass(VectorPair.class);
    conf2.setOutputValueClass(NullWritable.class);

    Counter numDocs =
        job1.getCounters()
            .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS");
    maxKey = maxKey > 0 ? maxKey : (int) numDocs.getValue();
    LOG.info("Setting max key value in input to " + maxKey);
    conf2.setInt(PARAM_APS_MAXKEY, maxKey);

    conf2.setInt(PARAM_APS_STRIPES, numStripes);
    conf2.setFloat(PARAM_APS_THRESHOLD, threshold);
    conf2.setInt(PARAM_APS_REDUCER_PER_STRIPE, spread);
    conf2.set("mapred.job.name", "APS-" + outputPath.getName());

    conf2.setNumTasksToExecutePerJvm(-1); // JVM reuse
    conf2.setSpeculativeExecution(false);
    conf2.setCompressMapOutput(true);
    conf2.setNumReduceTasks(numReducers);

    JobClient.runJob(conf2);

    return 0;
  }
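The @Override on run(String[]) and the getConf() calls above suggest the standard Tool/ToolRunner pattern. A minimal driver sketch under that assumption (the class name Similarity is taken from the JobConf constructor calls in the method itself):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

// Sketch only: assumes Similarity implements org.apache.hadoop.util.Tool, which
// this listing does not show explicitly.
public class SimilarityDriver {
  public static void main(String[] args) throws Exception {
    // ToolRunner strips generic Hadoop options (-D, -files, ...) and passes the
    // remaining arguments to Similarity.run(String[]).
    int exitCode = ToolRunner.run(new Configuration(), new Similarity(), args);
    System.exit(exitCode);
  }
}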
Example #6
  /**
   * Performs a range query using MapReduce
   *
   * @param fs the file system containing the input file
   * @param inputFile the file to query
   * @param userOutputPath the output path, or null to write to an auto-generated temporary path
   * @param queryShape the shape defining the query range
   * @param shape an instance of the shape type stored in the input file
   * @param overwrite whether an existing output path may be overwritten
   * @param background whether to submit the job without waiting for completion
   * @param query the query to run; determines the output key field and the aggregation type
   * @return the number of records produced by the query, or -1 when the job runs in the background
   * @throws IOException
   */
  public static long rangeQueryMapReduce(
      FileSystem fs,
      Path inputFile,
      Path userOutputPath,
      Shape queryShape,
      Shape shape,
      boolean overwrite,
      boolean background,
      QueryInput query)
      throws IOException {
    JobConf job = new JobConf(FileMBR.class);

    FileSystem outFs = inputFile.getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
      do {
        outputPath =
            new Path(
                inputFile.toUri().getPath() + ".rangequery_" + (int) (Math.random() * 1000000));
      } while (outFs.exists(outputPath));
    } else {
      if (outFs.exists(outputPath)) {
        if (overwrite) {
          outFs.delete(outputPath, true);
        } else {
          throw new RuntimeException("Output path already exists and -overwrite flag is not set");
        }
      }
    }

    job.setJobName("RangeQuery");
    job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class);
    RangeFilter.setQueryRange(job, queryShape); // Set query range for the filter

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(3);

    // Decide which map function to use depending on how blocks are indexed
    // And also which input format to use
    if (SpatialSite.isRTree(fs, inputFile)) {
      // RTree indexed file
      LOG.info("Searching an RTree indexed file");
      job.setInputFormat(RTreeInputFormat.class);
    } else {
      // A file with no local index
      LOG.info("Searching a non local-indexed file");
      job.setInputFormat(ShapeInputFormat.class);
    }

    GlobalIndex<Partition> gIndex = SpatialSite.getGlobalIndex(fs, inputFile);
    // if (gIndex != null && gIndex.isReplicated()){
    // job.setMapperClass(RangeQueryMap.class);

    Class<?> outputKey = NullWritable.class;
    try {
      Class<?> c = shape.getClass();
      Field f = c.getDeclaredField(query.field);
      f.setAccessible(true);
      if (f.getType().equals(Integer.TYPE)) {
        outputKey = IntWritable.class;
      } else if (f.getType().equals(Double.TYPE)) {
        outputKey = DoubleWritable.class;
      } else if (f.getType().equals(Long.TYPE)) {
        outputKey = LongWritable.class;
      }
    } catch (SecurityException e) {
      e.printStackTrace();
    } catch (NoSuchFieldException e) {
      e.printStackTrace();
    }

    job.setMapOutputKeyClass(outputKey);
    switch (query.type) {
      case Distinct:
        job.setMapperClass(DistinctQueryMap.class);
        job.setReducerClass(DistinctQueryReduce.class);
        job.setMapOutputValueClass(NullWritable.class);
        break;
      case Distribution:
        job.setMapperClass(DistributionQueryMap.class);
        job.setReducerClass(DistributionQueryReduce.class);
        job.setMapOutputValueClass(IntWritable.class);
        break;
      default:
        break;
    }
    // }
    // else
    // job.setMapperClass(RangeQueryMapNoDupAvoidance.class);

    // Set query range for the map function
    job.set(QUERY_SHAPE_CLASS, queryShape.getClass().getName());
    job.set(QUERY_SHAPE, queryShape.toText(new Text()).toString());
    job.set(QUERY_FIELD, query.field);

    // Set shape class for the SpatialInputFormat
    SpatialSite.setShapeClass(job, shape.getClass());

    job.setOutputFormat(TextOutputFormat.class);

    ShapeInputFormat.setInputPaths(job, inputFile);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    if (!background) {
      RunningJob runningJob = JobClient.runJob(job);
      Counters counters = runningJob.getCounters();
      Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
      final long resultCount = outputRecordCounter.getValue();

      // If outputPath not set by user, automatically delete it
      if (userOutputPath == null) outFs.delete(outputPath, true);

      return resultCount;
    } else {
      JobClient jc = new JobClient(job);
      lastRunningJob = jc.submitJob(job);
      return -1;
    }
  }