@Test
  public void testGranularitySpec() {
    final HadoopDruidIndexerConfig cfg;

    try {
      cfg =
          jsonReadWriteRead(
              "{"
                  + " \"granularitySpec\":{"
                  + "   \"type\":\"uniform\","
                  + "   \"gran\":\"hour\","
                  + "   \"intervals\":[\"2012-01-01/P1D\"]"
                  + " }"
                  + "}",
              HadoopDruidIndexerConfig.class);
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }

    final UniformGranularitySpec granularitySpec =
        (UniformGranularitySpec) cfg.getGranularitySpec();

    Assert.assertEquals(
        "getIntervals",
        Lists.newArrayList(new Interval("2012-01-01/P1D")),
        granularitySpec.getIntervals());

    Assert.assertEquals("getGranularity", "HOUR", granularitySpec.getGranularity().toString());
  }

  @Test
  public void testGranularitySpecPostConstructorIntervals() {
    // Deprecated and replaced by granularitySpec, but still supported
    final HadoopDruidIndexerConfig cfg;

    try {
      cfg =
          jsonMapper.readValue(
              "{" + "\"segmentGranularity\":\"day\"" + "}", HadoopDruidIndexerConfig.class);
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }

    cfg.setIntervals(Lists.newArrayList(new Interval("2012-03-01/P1D")));

    final UniformGranularitySpec granularitySpec =
        (UniformGranularitySpec) cfg.getGranularitySpec();

    Assert.assertEquals(
        "getIntervals",
        Lists.newArrayList(new Interval("2012-03-01/P1D")),
        granularitySpec.getIntervals());

    Assert.assertEquals("getGranularity", "DAY", granularitySpec.getGranularity().toString());
  }
    public DeterminePartitionsDimSelectionMapperHelper(
        HadoopDruidIndexerConfig config, String partitionDimension) {
      this.config = config;
      this.partitionDimension = partitionDimension;

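      // Map each bucket interval's start instant to a dense index; group keys emitted by this helper
      // reference intervals by that index.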
      final ImmutableMap.Builder<DateTime, Integer> timeIndexBuilder = ImmutableMap.builder();
      int idx = 0;
      for (final Interval bucketInterval : config.getGranularitySpec().bucketIntervals().get()) {
        timeIndexBuilder.put(bucketInterval.getStart(), idx);
        idx++;
      }

      this.intervalIndexes = timeIndexBuilder.build();
    }

    public void emitDimValueCounts(
        TaskInputOutputContext<?, ?, BytesWritable, Text> context,
        DateTime timestamp,
        Map<String, Iterable<String>> dims)
        throws IOException, InterruptedException {
      final Optional<Interval> maybeInterval =
          config.getGranularitySpec().bucketInterval(timestamp);

      if (!maybeInterval.isPresent()) {
        throw new ISE("No bucket found for timestamp[%s]", timestamp);
      }

      final Interval interval = maybeInterval.get();
      final int intervalIndex = intervalIndexes.get(interval.getStart());

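      // Group key layout: a 4-byte interval index followed by the interval's 8-byte start timestamp in millis.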
      final ByteBuffer buf = ByteBuffer.allocate(4 + 8);
      buf.putInt(intervalIndex);
      buf.putLong(interval.getStartMillis());
      final byte[] groupKey = buf.array();

      // Emit row-counter value.
      write(context, groupKey, new DimValueCount("", "", 1));

      for (final Map.Entry<String, Iterable<String>> dimAndValues : dims.entrySet()) {
        final String dim = dimAndValues.getKey();

        if (partitionDimension == null || partitionDimension.equals(dim)) {
          final Iterable<String> dimValues = dimAndValues.getValue();

          if (Iterables.size(dimValues) == 1) {
            // Emit this value.
            write(
                context, groupKey, new DimValueCount(dim, Iterables.getOnlyElement(dimValues), 1));
          } else {
            // This dimension is unsuitable for partitioning. Poison it by emitting a negative
            // value.
            write(context, groupKey, new DimValueCount(dim, "", -1));
          }
        }
      }
    }
  @Test
  public void testGranularitySpecLegacy() {
    // Deprecated and replaced by granularitySpec, but still supported
    final HadoopDruidIndexerConfig cfg;

    try {
      cfg =
          jsonReadWriteRead(
              "{" + "\"segmentGranularity\":\"day\"," + "\"intervals\":[\"2012-02-01/P1D\"]" + "}",
              HadoopDruidIndexerConfig.class);
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }

    final UniformGranularitySpec granularitySpec =
        (UniformGranularitySpec) cfg.getGranularitySpec();

    Assert.assertEquals(
        "getIntervals",
        Lists.newArrayList(new Interval("2012-02-01/P1D")),
        granularitySpec.getIntervals());

    Assert.assertEquals("getGranularity", "DAY", granularitySpec.getGranularity().toString());
  }
  public boolean run() {
    try {
      /*
       * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
       * in the final segment.
       */

      if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
        throw new ISE(
            "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
            config.getPartitionsSpec());
      }

      if (!config.getPartitionsSpec().isAssumeGrouped()) {
        final Job groupByJob =
            Job.getInstance(
                new Configuration(),
                String.format(
                    "%s-determine_partitions_groupby-%s",
                    config.getDataSource(), config.getIntervals()));

        JobHelper.injectSystemProperties(groupByJob);
        config.addJobProperties(groupByJob);

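        // The group-by phase only deduplicates (timestamp, dimensions) rows, so map output values
        // are NullWritable and the reducer doubles as the combiner.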
        groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
        groupByJob.setMapOutputKeyClass(BytesWritable.class);
        groupByJob.setMapOutputValueClass(NullWritable.class);
        groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
        groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
        groupByJob.setOutputKeyClass(BytesWritable.class);
        groupByJob.setOutputValueClass(NullWritable.class);
        groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        JobHelper.setupClasspath(
            JobHelper.distributedClassPath(config.getWorkingPath()),
            JobHelper.distributedClassPath(config.makeIntermediatePath()),
            groupByJob);

        config.addInputPaths(groupByJob);
        config.intoConfiguration(groupByJob);
        FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

        groupByJob.submit();
        log.info(
            "Job %s submitted, status available at: %s",
            groupByJob.getJobName(), groupByJob.getTrackingURL());

        if (!groupByJob.waitForCompletion(true)) {
          log.error("Job failed: %s", groupByJob.getJobID());
          return false;
        }
      } else {
        log.info("Skipping group-by job.");
      }

      /*
       * Read grouped data and determine appropriate partitions.
       */
      final Job dimSelectionJob =
          Job.getInstance(
              new Configuration(),
              String.format(
                  "%s-determine_partitions_dimselection-%s",
                  config.getDataSource(), config.getIntervals()));

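      // io.sort.record.percent sets the share of the map-side sort buffer reserved for record accounting;
      // it is raised here, presumably because this job emits many small records.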
      dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

      JobHelper.injectSystemProperties(dimSelectionJob);
      config.addJobProperties(dimSelectionJob);

      if (!config.getPartitionsSpec().isAssumeGrouped()) {
        // Read grouped data from the groupByJob.
        dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
        dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
      } else {
        // Directly read the source data, since we assume it's already grouped.
        dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
        config.addInputPaths(dimSelectionJob);
      }

      SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
      dimSelectionJob.setMapOutputValueClass(Text.class);
      dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
      dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
      dimSelectionJob.setOutputKeyClass(BytesWritable.class);
      dimSelectionJob.setOutputValueClass(Text.class);
      dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
      dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
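      // One reducer per segment bucket, so each reducer determines the partitions for exactly one interval.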
      dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
      JobHelper.setupClasspath(
          JobHelper.distributedClassPath(config.getWorkingPath()),
          JobHelper.distributedClassPath(config.makeIntermediatePath()),
          dimSelectionJob);

      config.intoConfiguration(dimSelectionJob);
      FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

      dimSelectionJob.submit();
      log.info(
          "Job %s submitted, status available at: %s",
          dimSelectionJob.getJobName(), dimSelectionJob.getTrackingURL());

      if (!dimSelectionJob.waitForCompletion(true)) {
        log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
        return false;
      }

      /*
       * Load partitions determined by the previous job.
       */

      log.info(
          "Job completed, loading up partitions for intervals[%s].",
          config.getSegmentGranularIntervals());
      FileSystem fileSystem = null;
      Map<DateTime, List<HadoopyShardSpec>> shardSpecs =
          Maps.newTreeMap(DateTimeComparator.getInstance());
      int shardCount = 0;
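      // For each segment interval, read the partition descriptors written by its reducer and assign
      // globally increasing shard numbers across all intervals.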
      for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
        final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
        if (fileSystem == null) {
          fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
        }
        if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
          List<ShardSpec> specs =
              config.JSON_MAPPER.readValue(
                  Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                  new TypeReference<List<ShardSpec>>() {});

          List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
          for (int i = 0; i < specs.size(); ++i) {
            actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
            log.info(
                "DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i, actualSpecs.get(i));
          }

          shardSpecs.put(segmentGranularity.getStart(), actualSpecs);
        } else {
          log.info("Path[%s] does not exist; no partitions found for this interval.", partitionInfoPath);
        }
      }
      config.setShardSpecs(shardSpecs);

      return true;
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }
  }

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    // Rebuild the indexer config from the serialized job configuration, then cache the parser
    // and granularity spec for this task.
    config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
    parser = config.getParser();
    granularitySpec = config.getGranularitySpec();
  }