Example #1
  public static boolean runJobs(List<Jobby> jobs, HadoopDruidIndexerConfig config) {
    String failedMessage = null;
    for (Jobby job : jobs) {
      if (failedMessage == null) {
        if (!job.run()) {
          failedMessage = String.format("Job[%s] failed!", job.getClass());
        }
      }
    }

    if (!config.getSchema().getTuningConfig().isLeaveIntermediate()) {
      if (failedMessage == null || config.getSchema().getTuningConfig().isCleanupOnFailure()) {
        Path workingPath = config.makeIntermediatePath();
        log.info("Deleting path[%s]", workingPath);
        try {
          workingPath
              .getFileSystem(injectSystemProperties(new Configuration()))
              .delete(workingPath, true);
        } catch (IOException e) {
          log.error(e, "Failed to cleanup path[%s]", workingPath);
        }
      }
    }

    if (failedMessage != null) {
      throw new ISE(failedMessage);
    }

    return true;
  }
  @Test
  public void testPartitionsSpecMaxPartitionSize() {
    final HadoopDruidIndexerConfig cfg;

    try {
      cfg =
          jsonReadWriteRead(
              "{"
                  + "\"partitionsSpec\":{"
                  + "   \"targetPartitionSize\":100,"
                  + "   \"maxPartitionSize\":200,"
                  + "   \"partitionDimension\":\"foo\""
                  + " }"
                  + "}",
              HadoopDruidIndexerConfig.class);
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }

    final PartitionsSpec partitionsSpec = cfg.getPartitionsSpec();

    Assert.assertEquals("isDeterminingPartitions", partitionsSpec.isDeterminingPartitions(), true);

    Assert.assertEquals("getTargetPartitionSize", partitionsSpec.getTargetPartitionSize(), 100);

    Assert.assertEquals("getMaxPartitionSize", partitionsSpec.getMaxPartitionSize(), 200);

    Assert.assertEquals("getPartitionDimension", partitionsSpec.getPartitionDimension(), "foo");
  }
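
The config tests in this example rely on a jsonReadWriteRead helper that is not shown here. As a rough sketch of what such a helper can look like, the snippet below deserializes the JSON, re-serializes it, and deserializes it again with a plain Jackson ObjectMapper; using a plain ObjectMapper (rather than the indexer's configured mapper) is an assumption made for illustration.

import com.fasterxml.jackson.databind.ObjectMapper;

// Sketch of a jsonReadWriteRead-style helper: read, write, and read again so the test also
// exercises the type's JSON round trip. The plain ObjectMapper is an assumption; the real
// helper works against the indexer's configured mapper.
public class JsonRoundTripSketch {
  private static final ObjectMapper MAPPER = new ObjectMapper();

  static <T> T jsonReadWriteRead(String json, Class<T> clazz) {
    try {
      T first = MAPPER.readValue(json, clazz);
      return MAPPER.readValue(MAPPER.writeValueAsString(first), clazz);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  public static void main(String[] args) {
    java.util.Map<?, ?> parsed =
        jsonReadWriteRead("{\"targetPartitionSize\":100}", java.util.Map.class);
    System.out.println(parsed);
  }
}
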
 @Test
 public void
     testupdateSegmentListIfDatasourcePathSpecIsUsedWithJustDatasourcePathSpecAndPartialInterval()
         throws Exception {
   PathSpec pathSpec =
       new DatasourcePathSpec(
           jsonMapper,
           null,
           new DatasourceIngestionSpec(
               testDatasource,
               testDatasourceIntervalPartial,
               null,
               null,
               null,
               null,
               null,
               null,
               false),
           null);
   HadoopDruidIndexerConfig config =
       testRunUpdateSegmentListIfDatasourcePathSpecIsUsed(pathSpec, testDatasourceIntervalPartial);
   Assert.assertEquals(
       ImmutableList.of(new WindowedDataSegment(SEGMENT, testDatasourceIntervalPartial)),
       ((DatasourcePathSpec) config.getPathSpec()).getSegments());
 }
 @Test
 public void testupdateSegmentListIfDatasourcePathSpecIsUsedWithMultiplePathSpec()
     throws Exception {
   PathSpec pathSpec =
       new MultiplePathSpec(
           ImmutableList.of(
               new StaticPathSpec("/xyz", null),
               new DatasourcePathSpec(
                   jsonMapper,
                   null,
                   new DatasourceIngestionSpec(
                       testDatasource,
                       testDatasourceInterval,
                       null,
                       null,
                       null,
                       null,
                       null,
                       null,
                       false),
                   null)));
   HadoopDruidIndexerConfig config =
       testRunUpdateSegmentListIfDatasourcePathSpecIsUsed(pathSpec, testDatasourceInterval);
   Assert.assertEquals(
       ImmutableList.of(WindowedDataSegment.of(SEGMENT)),
       ((DatasourcePathSpec) ((MultiplePathSpec) config.getPathSpec()).getChildren().get(1))
           .getSegments());
 }
  @Test
  public void testGranularitySpec() {
    final HadoopDruidIndexerConfig cfg;

    try {
      cfg =
          jsonReadWriteRead(
              "{"
                  + " \"granularitySpec\":{"
                  + "   \"type\":\"uniform\","
                  + "   \"gran\":\"hour\","
                  + "   \"intervals\":[\"2012-01-01/P1D\"]"
                  + " }"
                  + "}",
              HadoopDruidIndexerConfig.class);
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }

    final UniformGranularitySpec granularitySpec =
        (UniformGranularitySpec) cfg.getGranularitySpec();

    Assert.assertEquals(
        "getIntervals",
        Lists.newArrayList(new Interval("2012-01-01/P1D")),
        granularitySpec.getIntervals());

    Assert.assertEquals("getGranularity", "HOUR", granularitySpec.getGranularity().toString());
  }
  @Test
  public void testGranularitySpecPostConstructorIntervals() {
    // Deprecated and replaced by granularitySpec, but still supported
    final HadoopDruidIndexerConfig cfg;

    try {
      cfg =
          jsonMapper.readValue(
              "{" + "\"segmentGranularity\":\"day\"" + "}", HadoopDruidIndexerConfig.class);
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }

    cfg.setIntervals(Lists.newArrayList(new Interval("2012-03-01/P1D")));

    final UniformGranularitySpec granularitySpec =
        (UniformGranularitySpec) cfg.getGranularitySpec();

    Assert.assertEquals(
        "getIntervals",
        Lists.newArrayList(new Interval("2012-03-01/P1D")),
        granularitySpec.getIntervals());

    Assert.assertEquals("getGranularity", "DAY", granularitySpec.getGranularity().toString());
  }
 @Test
 public void testupdateSegmentListIfDatasourcePathSpecIsUsedWithNoDatasourcePathSpec()
     throws Exception {
   PathSpec pathSpec = new StaticPathSpec("/xyz", null);
   HadoopDruidIndexerConfig config =
       testRunUpdateSegmentListIfDatasourcePathSpecIsUsed(pathSpec, null);
   Assert.assertTrue(config.getPathSpec() instanceof StaticPathSpec);
 }
Example #8
 public static void setInputFormat(Job job, HadoopDruidIndexerConfig indexerConfig) {
   if (indexerConfig.getInputFormatClass() != null) {
     job.setInputFormatClass(indexerConfig.getInputFormatClass());
   } else if (indexerConfig.isCombineText()) {
     job.setInputFormatClass(CombineTextInputFormat.class);
   } else {
     job.setInputFormatClass(TextInputFormat.class);
   }
 }
 @Override
 protected void setup(Context context) throws IOException, InterruptedException {
   final HadoopDruidIndexerConfig config =
       HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
   SingleDimensionPartitionsSpec spec =
       (SingleDimensionPartitionsSpec) config.getPartitionsSpec();
   helper =
       new DeterminePartitionsDimSelectionMapperHelper(config, spec.getPartitionDimension());
 }
  @Test
  public void testNoCleanupOnFailure() {
    final HadoopDruidIndexerConfig cfg;

    try {
      cfg = jsonReadWriteRead("{\"cleanupOnFailure\":false}", HadoopDruidIndexerConfig.class);
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }

    Assert.assertEquals("cleanupOnFailure", cfg.isCleanupOnFailure(), false);
  }
  private HadoopDruidIndexerConfig testRunUpdateSegmentListIfDatasourcePathSpecIsUsed(
      PathSpec datasourcePathSpec, Interval jobInterval) throws Exception {
    HadoopIngestionSpec spec =
        new HadoopIngestionSpec(
            new DataSchema(
                "foo",
                null,
                new AggregatorFactory[0],
                new UniformGranularitySpec(
                    Granularity.DAY, null, ImmutableList.of(new Interval("2010-01-01/P1D"))),
                jsonMapper),
            new HadoopIOConfig(jsonMapper.convertValue(datasourcePathSpec, Map.class), null, null),
            null);

    spec = jsonMapper.readValue(jsonMapper.writeValueAsString(spec), HadoopIngestionSpec.class);

    UsedSegmentLister segmentLister = EasyMock.createMock(UsedSegmentLister.class);
    EasyMock.expect(
            segmentLister.getUsedSegmentsForIntervals(
                testDatasource, Lists.newArrayList(jobInterval)))
        .andReturn(ImmutableList.of(SEGMENT));
    EasyMock.replay(segmentLister);

    spec =
        HadoopIngestionSpec.updateSegmentListIfDatasourcePathSpecIsUsed(
            spec, jsonMapper, segmentLister);
    return HadoopDruidIndexerConfig.fromString(jsonMapper.writeValueAsString(spec));
  }
Example #12
  @Override
  protected void map(Object key, Object value, Context context)
      throws IOException, InterruptedException {
    try {
      final InputRow inputRow;
      try {
        inputRow = parseInputRow(value, parser);
      } catch (Exception e) {
        if (config.isIgnoreInvalidRows()) {
          log.debug(e, "Ignoring invalid row [%s] due to parsing error", value.toString());
          context
              .getCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER)
              .increment(1);
          return; // we're ignoring this invalid row
        } else {
          throw e;
        }
      }

      if (!granularitySpec.bucketIntervals().isPresent()
          || granularitySpec
              .bucketInterval(new DateTime(inputRow.getTimestampFromEpoch()))
              .isPresent()) {
        innerMap(inputRow, value, context);
      }
    } catch (RuntimeException e) {
      throw new RE(e, "Failure on row[%s]", value);
    }
  }
  @Test
  public void testDefaultSettings() {
    final HadoopDruidIndexerConfig cfg;

    try {
      cfg = jsonReadWriteRead("{}", HadoopDruidIndexerConfig.class);
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }

    Assert.assertEquals("cleanupOnFailure", cfg.isCleanupOnFailure(), true);

    Assert.assertEquals("overwriteFiles", cfg.isOverwriteFiles(), false);

    Assert.assertEquals(
        "isDeterminingPartitions", cfg.getPartitionsSpec().isDeterminingPartitions(), false);
  }
Example #14
  public static void ensurePaths(HadoopDruidIndexerConfig config) {
    // config.addInputPaths() can have side-effects ( boo! :( ), so this stuff needs to be done
    // before anything else
    try {
      Job job =
          Job.getInstance(
              new Configuration(),
              String.format(
                  "%s-determine_partitions-%s", config.getDataSource(), config.getIntervals()));

      job.getConfiguration().set("io.sort.record.percent", "0.19");
      injectSystemProperties(job);

      config.addInputPaths(job);
    } catch (IOException e) {
      throw Throwables.propagate(e);
    }
  }
Example #15
 @Override
 protected void setup(Context context) throws IOException, InterruptedException {
   if (config == null) {
     synchronized (DeterminePartitionsDimSelectionBaseReducer.class) {
       if (config == null) {
         config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
       }
     }
   }
 }
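
The setup above lazily initializes the reducer's shared config at most once using double-checked locking. A stripped-down sketch of the same pattern follows; the ConfigHolder class is hypothetical, and the field is declared volatile here so the unsynchronized first check is safe under the Java memory model (whether the original field is volatile is not visible in this snippet).

// Sketch of the lazy, double-checked initialization used in setup() above.
// ConfigHolder is hypothetical; the Object field stands in for the indexer config.
public class ConfigHolder {
  private static volatile Object config;

  static Object getOrCreate() {
    if (config == null) {
      synchronized (ConfigHolder.class) {
        if (config == null) {
          config = new Object(); // stand-in for HadoopDruidIndexerConfig.fromConfiguration(...)
        }
      }
    }
    return config;
  }

  public static void main(String[] args) {
    System.out.println(getOrCreate() == getOrCreate()); // true: initialized exactly once
  }
}
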
Example #16
    public DeterminePartitionsDimSelectionMapperHelper(
        HadoopDruidIndexerConfig config, String partitionDimension) {
      this.config = config;
      this.partitionDimension = partitionDimension;

      final ImmutableMap.Builder<DateTime, Integer> timeIndexBuilder = ImmutableMap.builder();
      int idx = 0;
      for (final Interval bucketInterval : config.getGranularitySpec().bucketIntervals().get()) {
        timeIndexBuilder.put(bucketInterval.getStart(), idx);
        idx++;
      }

      this.intervalIndexes = timeIndexBuilder.build();
    }
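
The constructor above assigns each bucket interval a dense index keyed on its start time; emitDimValueCounts later packs that index into the group key. A small standalone sketch of the same indexing step, with hard-coded intervals standing in for config.getGranularitySpec().bucketIntervals():

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.joda.time.DateTime;
import org.joda.time.Interval;

// Sketch of the interval-start -> index map built in the constructor above,
// using hard-coded daily buckets instead of the config's bucket intervals.
public class IntervalIndexSketch {
  public static void main(String[] args) {
    ImmutableList<Interval> buckets =
        ImmutableList.of(new Interval("2012-01-01/P1D"), new Interval("2012-01-02/P1D"));

    ImmutableMap.Builder<DateTime, Integer> timeIndexBuilder = ImmutableMap.builder();
    int idx = 0;
    for (Interval bucketInterval : buckets) {
      timeIndexBuilder.put(bucketInterval.getStart(), idx);
      idx++;
    }

    ImmutableMap<DateTime, Integer> intervalIndexes = timeIndexBuilder.build();
    System.out.println(intervalIndexes);
  }
}
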
  @Test
  public void testGranularitySpecLegacy() {
    // Deprecated and replaced by granularitySpec, but still supported
    final HadoopDruidIndexerConfig cfg;

    try {
      cfg =
          jsonReadWriteRead(
              "{" + "\"segmentGranularity\":\"day\"," + "\"intervals\":[\"2012-02-01/P1D\"]" + "}",
              HadoopDruidIndexerConfig.class);
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }

    final UniformGranularitySpec granularitySpec =
        (UniformGranularitySpec) cfg.getGranularitySpec();

    Assert.assertEquals(
        "getIntervals",
        Lists.newArrayList(new Interval("2012-02-01/P1D")),
        granularitySpec.getIntervals());

    Assert.assertEquals("getGranularity", "DAY", granularitySpec.getGranularity().toString());
  }
  @Test
  public void testDbUpdaterJobSpec() throws Exception {
    final HadoopDruidIndexerConfig cfg;

    cfg =
        jsonReadWriteRead(
            "{"
                + "\"updaterJobSpec\":{\n"
                + "    \"type\" : \"db\",\n"
                + "    \"connectURI\" : \"jdbc:mysql://localhost/druid\",\n"
                + "    \"user\" : \"rofl\",\n"
                + "    \"password\" : \"p4ssw0rd\",\n"
                + "    \"segmentTable\" : \"segments\"\n"
                + "  }"
                + "}",
            HadoopDruidIndexerConfig.class);

    final DbUpdaterJobSpec spec = (DbUpdaterJobSpec) cfg.getUpdaterJobSpec();
    Assert.assertEquals("segments", spec.getSegmentTable());
    Assert.assertEquals("jdbc:mysql://localhost/druid", spec.getDatabaseConnectURI());
    Assert.assertEquals("rofl", spec.getDatabaseUser());
    Assert.assertEquals("p4ssw0rd", spec.getDatabasePassword());
    Assert.assertEquals(false, spec.useValidationQuery());
  }
Example #19
    public void emitDimValueCounts(
        TaskInputOutputContext<?, ?, BytesWritable, Text> context,
        DateTime timestamp,
        Map<String, Iterable<String>> dims)
        throws IOException, InterruptedException {
      final Optional<Interval> maybeInterval =
          config.getGranularitySpec().bucketInterval(timestamp);

      if (!maybeInterval.isPresent()) {
        throw new ISE("WTF?! No bucket found for timestamp: %s", timestamp);
      }

      final Interval interval = maybeInterval.get();
      final int intervalIndex = intervalIndexes.get(interval.getStart());

      final ByteBuffer buf = ByteBuffer.allocate(4 + 8);
      buf.putInt(intervalIndex);
      buf.putLong(interval.getStartMillis());
      final byte[] groupKey = buf.array();

      // Emit row-counter value.
      write(context, groupKey, new DimValueCount("", "", 1));

      for (final Map.Entry<String, Iterable<String>> dimAndValues : dims.entrySet()) {
        final String dim = dimAndValues.getKey();

        if (partitionDimension == null || partitionDimension.equals(dim)) {
          final Iterable<String> dimValues = dimAndValues.getValue();

          if (Iterables.size(dimValues) == 1) {
            // Emit this value.
            write(
                context, groupKey, new DimValueCount(dim, Iterables.getOnlyElement(dimValues), 1));
          } else {
            // This dimension is unsuitable for partitioning. Poison it by emitting a negative
            // value.
            write(context, groupKey, new DimValueCount(dim, "", -1));
          }
        }
      }
    }
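
The group key built above is a fixed 12-byte encoding: a 4-byte interval index followed by the interval's 8-byte start timestamp in milliseconds. A standalone sketch of that encoding and the corresponding decode:

import java.nio.ByteBuffer;

// Sketch of the 12-byte group key used in emitDimValueCounts above:
// a 4-byte interval index followed by the 8-byte start-of-interval millis.
public class GroupKeySketch {
  static byte[] encode(int intervalIndex, long intervalStartMillis) {
    ByteBuffer buf = ByteBuffer.allocate(4 + 8);
    buf.putInt(intervalIndex);
    buf.putLong(intervalStartMillis);
    return buf.array();
  }

  public static void main(String[] args) {
    byte[] groupKey = encode(3, 1325376000000L); // 2012-01-01T00:00:00Z
    ByteBuffer buf = ByteBuffer.wrap(groupKey);
    System.out.println("index=" + buf.getInt() + ", startMillis=" + buf.getLong());
  }
}
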
Example #20
  public boolean run() {
    try {
      /*
       * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
       * in the final segment.
       */

      if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
        throw new ISE(
            "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
            config.getPartitionsSpec());
      }

      if (!config.getPartitionsSpec().isAssumeGrouped()) {
        final Job groupByJob =
            Job.getInstance(
                new Configuration(),
                String.format(
                    "%s-determine_partitions_groupby-%s",
                    config.getDataSource(), config.getIntervals()));

        JobHelper.injectSystemProperties(groupByJob);
        config.addJobProperties(groupByJob);

        groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
        groupByJob.setMapOutputKeyClass(BytesWritable.class);
        groupByJob.setMapOutputValueClass(NullWritable.class);
        groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
        groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
        groupByJob.setOutputKeyClass(BytesWritable.class);
        groupByJob.setOutputValueClass(NullWritable.class);
        groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        JobHelper.setupClasspath(
            JobHelper.distributedClassPath(config.getWorkingPath()),
            JobHelper.distributedClassPath(config.makeIntermediatePath()),
            groupByJob);

        config.addInputPaths(groupByJob);
        config.intoConfiguration(groupByJob);
        FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

        groupByJob.submit();
        log.info(
            "Job %s submitted, status available at: %s",
            groupByJob.getJobName(), groupByJob.getTrackingURL());

        if (!groupByJob.waitForCompletion(true)) {
          log.error("Job failed: %s", groupByJob.getJobID());
          return false;
        }
      } else {
        log.info("Skipping group-by job.");
      }

      /*
       * Read grouped data and determine appropriate partitions.
       */
      final Job dimSelectionJob =
          Job.getInstance(
              new Configuration(),
              String.format(
                  "%s-determine_partitions_dimselection-%s",
                  config.getDataSource(), config.getIntervals()));

      dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

      JobHelper.injectSystemProperties(dimSelectionJob);
      config.addJobProperties(dimSelectionJob);

      if (!config.getPartitionsSpec().isAssumeGrouped()) {
        // Read grouped data from the groupByJob.
        dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
        dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
      } else {
        // Directly read the source data, since we assume it's already grouped.
        dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
        config.addInputPaths(dimSelectionJob);
      }

      SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
      dimSelectionJob.setMapOutputValueClass(Text.class);
      dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
      dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
      dimSelectionJob.setOutputKeyClass(BytesWritable.class);
      dimSelectionJob.setOutputValueClass(Text.class);
      dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
      dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
      dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
      JobHelper.setupClasspath(
          JobHelper.distributedClassPath(config.getWorkingPath()),
          JobHelper.distributedClassPath(config.makeIntermediatePath()),
          dimSelectionJob);

      config.intoConfiguration(dimSelectionJob);
      FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

      dimSelectionJob.submit();
      log.info(
          "Job %s submitted, status available at: %s",
          dimSelectionJob.getJobName(), dimSelectionJob.getTrackingURL());

      if (!dimSelectionJob.waitForCompletion(true)) {
        log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
        return false;
      }

      /*
       * Load partitions determined by the previous job.
       */

      log.info(
          "Job completed, loading up partitions for intervals[%s].",
          config.getSegmentGranularIntervals());
      FileSystem fileSystem = null;
      Map<DateTime, List<HadoopyShardSpec>> shardSpecs =
          Maps.newTreeMap(DateTimeComparator.getInstance());
      int shardCount = 0;
      for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
        final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
        if (fileSystem == null) {
          fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
        }
        if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
          List<ShardSpec> specs =
              config.JSON_MAPPER.readValue(
                  Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                  new TypeReference<List<ShardSpec>>() {});

          List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
          for (int i = 0; i < specs.size(); ++i) {
            actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
            log.info(
                "DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i, actualSpecs.get(i));
          }

          shardSpecs.put(segmentGranularity.getStart(), actualSpecs);
        } else {
          log.info("Path[%s] didn't exist!?", partitionInfoPath);
        }
      }
      config.setShardSpecs(shardSpecs);

      return true;
    } catch (Exception e) {
      throw Throwables.propagate(e);
    }
  }
Example #21
 public DbUpdaterJob(HadoopDruidIndexerConfig config) {
   this.config = config;
   this.dbi = new DbConnector(config.getUpdaterJobSpec(), null).getDBI();
 }
Example #22
 @Override
 protected void setup(Context context) throws IOException, InterruptedException {
   config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
   parser = config.getParser();
   granularitySpec = config.getGranularitySpec();
 }