/**
 * Runs the given jobs in order and then, depending on the tuning config, deletes the
 * intermediate working path.
 *
 * <p>No further job is run once one has returned {@code false}. Cleanup is skipped when the
 * tuning config asks to leave intermediate data in place; otherwise it happens after success,
 * or after failure when cleanup-on-failure is enabled. An {@link IOException} during deletion
 * is only logged.
 *
 * @param jobs jobs to execute sequentially
 * @param config indexer configuration providing the tuning flags and the intermediate path
 * @return {@code true} when no job reported failure
 * @throws ISE when a job's {@code run()} returned {@code false}
 */
public static boolean runJobs(List<Jobby> jobs, HadoopDruidIndexerConfig config) {
  String failure = null;
  for (Jobby candidate : jobs) {
    if (!candidate.run()) {
      failure = String.format("Job[%s] failed!", candidate.getClass());
      break;
    }
  }

  final boolean cleanupWanted =
      !config.getSchema().getTuningConfig().isLeaveIntermediate()
          && (failure == null || config.getSchema().getTuningConfig().isCleanupOnFailure());
  if (cleanupWanted) {
    final Path intermediate = config.makeIntermediatePath();
    log.info("Deleting path[%s]", intermediate);
    try {
      final Configuration hadoopConf = injectSystemProperties(new Configuration());
      intermediate.getFileSystem(hadoopConf).delete(intermediate, true);
    } catch (IOException e) {
      // Best effort: a failed cleanup must not mask the job outcome.
      log.error(e, "Failed to cleanup path[%s]", intermediate);
    }
  }

  if (failure == null) {
    return true;
  }
  throw new ISE(failure);
}
/**
 * Checks that a {@code partitionsSpec} carrying a target size, a max size and a partition
 * dimension survives a JSON read/write/read cycle with those values intact.
 */
@Test
public void testPartitionsSpecMaxPartitionSize() {
  final HadoopDruidIndexerConfig cfg;
  try {
    cfg =
        jsonReadWriteRead(
            "{"
                + "\"partitionsSpec\":{"
                + " \"targetPartitionSize\":100,"
                + " \"maxPartitionSize\":200,"
                + " \"partitionDimension\":\"foo\""
                + " }"
                + "}",
            HadoopDruidIndexerConfig.class);
  } catch (Exception e) {
    throw Throwables.propagate(e);
  }

  final PartitionsSpec partitionsSpec = cfg.getPartitionsSpec();

  // JUnit's contract is (message, expected, actual); keeping that order makes failure
  // messages report the right value as "expected".
  Assert.assertTrue("isDeterminingPartitions", partitionsSpec.isDeterminingPartitions());
  Assert.assertEquals("getTargetPartitionSize", 100, partitionsSpec.getTargetPartitionSize());
  Assert.assertEquals("getMaxPartitionSize", 200, partitionsSpec.getMaxPartitionSize());
  Assert.assertEquals("getPartitionDimension", "foo", partitionsSpec.getPartitionDimension());
}
/**
 * With a lone {@link DatasourcePathSpec} restricted to the partial test interval, the updated
 * path spec must list the test segment windowed to that same partial interval.
 */
@Test
public void
    testupdateSegmentListIfDatasourcePathSpecIsUsedWithJustDatasourcePathSpecAndPartialInterval()
        throws Exception {
  final DatasourceIngestionSpec ingestionSpec =
      new DatasourceIngestionSpec(
          testDatasource, testDatasourceIntervalPartial, null, null, null, null, null, null, false);
  final PathSpec inputSpec = new DatasourcePathSpec(jsonMapper, null, ingestionSpec, null);

  final HadoopDruidIndexerConfig updated =
      testRunUpdateSegmentListIfDatasourcePathSpecIsUsed(inputSpec, testDatasourceIntervalPartial);

  final ImmutableList<WindowedDataSegment> expectedSegments =
      ImmutableList.of(new WindowedDataSegment(SEGMENT, testDatasourceIntervalPartial));
  final DatasourcePathSpec resultSpec = (DatasourcePathSpec) updated.getPathSpec();
  Assert.assertEquals(expectedSegments, resultSpec.getSegments());
}
/**
 * With a {@link MultiplePathSpec} holding a static path followed by a datasource path, the
 * datasource child (index 1) must end up listing the test segment.
 */
@Test
public void testupdateSegmentListIfDatasourcePathSpecIsUsedWithMultiplePathSpec()
    throws Exception {
  final DatasourceIngestionSpec ingestionSpec =
      new DatasourceIngestionSpec(
          testDatasource, testDatasourceInterval, null, null, null, null, null, null, false);
  final PathSpec inputSpec =
      new MultiplePathSpec(
          ImmutableList.of(
              new StaticPathSpec("/xyz", null),
              new DatasourcePathSpec(jsonMapper, null, ingestionSpec, null)));

  final HadoopDruidIndexerConfig updated =
      testRunUpdateSegmentListIfDatasourcePathSpecIsUsed(inputSpec, testDatasourceInterval);

  final ImmutableList<WindowedDataSegment> expectedSegments =
      ImmutableList.of(WindowedDataSegment.of(SEGMENT));
  final MultiplePathSpec resultSpec = (MultiplePathSpec) updated.getPathSpec();
  final DatasourcePathSpec datasourceChild = (DatasourcePathSpec) resultSpec.getChildren().get(1);
  Assert.assertEquals(expectedSegments, datasourceChild.getSegments());
}
/**
 * Checks that a uniform {@code granularitySpec} with hourly granularity and a one-day interval
 * is preserved across a JSON read/write/read cycle.
 */
@Test
public void testGranularitySpec() {
  final String json =
      "{"
          + " \"granularitySpec\":{"
          + " \"type\":\"uniform\","
          + " \"gran\":\"hour\","
          + " \"intervals\":[\"2012-01-01/P1D\"]"
          + " }"
          + "}";

  final HadoopDruidIndexerConfig parsed;
  try {
    parsed = jsonReadWriteRead(json, HadoopDruidIndexerConfig.class);
  } catch (Exception e) {
    throw Throwables.propagate(e);
  }

  final UniformGranularitySpec uniformSpec = (UniformGranularitySpec) parsed.getGranularitySpec();
  Assert.assertEquals(
      "getIntervals",
      Lists.newArrayList(new Interval("2012-01-01/P1D")),
      uniformSpec.getIntervals());
  Assert.assertEquals("getGranularity", "HOUR", uniformSpec.getGranularity().toString());
}
@Test public void testGranularitySpecPostConstructorIntervals() { // Deprecated and replaced by granularitySpec, but still supported final HadoopDruidIndexerConfig cfg; try { cfg = jsonMapper.readValue( "{" + "\"segmentGranularity\":\"day\"" + "}", HadoopDruidIndexerConfig.class); } catch (Exception e) { throw Throwables.propagate(e); } cfg.setIntervals(Lists.newArrayList(new Interval("2012-03-01/P1D"))); final UniformGranularitySpec granularitySpec = (UniformGranularitySpec) cfg.getGranularitySpec(); Assert.assertEquals( "getIntervals", Lists.newArrayList(new Interval("2012-03-01/P1D")), granularitySpec.getIntervals()); Assert.assertEquals("getGranularity", "DAY", granularitySpec.getGranularity().toString()); }
/**
 * When the ingestion spec only has a {@link StaticPathSpec}, the path spec in the resulting
 * config must still be a {@link StaticPathSpec}.
 */
@Test
public void testupdateSegmentListIfDatasourcePathSpecIsUsedWithNoDatasourcePathSpec()
    throws Exception {
  final PathSpec staticSpec = new StaticPathSpec("/xyz", null);
  final HadoopDruidIndexerConfig updated =
      testRunUpdateSegmentListIfDatasourcePathSpecIsUsed(staticSpec, null);

  final boolean stillStatic = updated.getPathSpec() instanceof StaticPathSpec;
  Assert.assertTrue(stillStatic);
}
/**
 * Chooses the input format class for {@code job}.
 *
 * <p>Precedence: an explicitly configured input format class, then
 * {@link CombineTextInputFormat} when combine-text is enabled, otherwise
 * {@link TextInputFormat}.
 *
 * @param job the Hadoop job to configure
 * @param indexerConfig configuration consulted for the input format choice
 */
public static void setInputFormat(Job job, HadoopDruidIndexerConfig indexerConfig) {
  if (indexerConfig.getInputFormatClass() != null) {
    job.setInputFormatClass(indexerConfig.getInputFormatClass());
    return;
  }
  if (indexerConfig.isCombineText()) {
    job.setInputFormatClass(CombineTextInputFormat.class);
    return;
  }
  job.setInputFormatClass(TextInputFormat.class);
}
/**
 * Builds the dimension-selection helper from the job configuration.
 *
 * <p>The partitions spec is cast to {@link SingleDimensionPartitionsSpec}; any other spec type
 * fails here with a {@link ClassCastException}.
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  final HadoopDruidIndexerConfig indexerConfig =
      HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
  final String dimension =
      ((SingleDimensionPartitionsSpec) indexerConfig.getPartitionsSpec()).getPartitionDimension();
  helper = new DeterminePartitionsDimSelectionMapperHelper(indexerConfig, dimension);
}
/**
 * Checks that an explicit {@code "cleanupOnFailure": false} survives a JSON read/write/read
 * cycle.
 */
@Test
public void testNoCleanupOnFailure() {
  final HadoopDruidIndexerConfig cfg;
  try {
    cfg = jsonReadWriteRead("{\"cleanupOnFailure\":false}", HadoopDruidIndexerConfig.class);
  } catch (Exception e) {
    throw Throwables.propagate(e);
  }

  // The original passed (message, actual, expected); assertFalse states the intent directly
  // and avoids the swapped-argument failure message.
  Assert.assertFalse("cleanupOnFailure", cfg.isCleanupOnFailure());
}
/**
 * Shared driver for the {@code updateSegmentListIfDatasourcePathSpecIsUsed} tests.
 *
 * <p>Builds an ingestion spec around the given path spec, round-trips it through JSON, runs the
 * segment-list update against a mocked {@link UsedSegmentLister} that returns {@code SEGMENT}
 * for {@code jobInterval}, and returns the config parsed from the updated spec's JSON.
 *
 * @param datasourcePathSpec path spec to embed in the IO config
 * @param jobInterval interval the mock lister is primed to be asked about
 * @return indexer config built from the updated ingestion spec
 */
private HadoopDruidIndexerConfig testRunUpdateSegmentListIfDatasourcePathSpecIsUsed(
    PathSpec datasourcePathSpec, Interval jobInterval) throws Exception {
  final DataSchema dataSchema =
      new DataSchema(
          "foo",
          null,
          new AggregatorFactory[0],
          new UniformGranularitySpec(
              Granularity.DAY, null, ImmutableList.of(new Interval("2010-01-01/P1D"))),
          jsonMapper);
  final HadoopIOConfig ioConfig =
      new HadoopIOConfig(jsonMapper.convertValue(datasourcePathSpec, Map.class), null, null);
  final HadoopIngestionSpec initialSpec = new HadoopIngestionSpec(dataSchema, ioConfig, null);

  // Round-trip through JSON so the update runs on a deserialized spec.
  final HadoopIngestionSpec roundTripped =
      jsonMapper.readValue(jsonMapper.writeValueAsString(initialSpec), HadoopIngestionSpec.class);

  final UsedSegmentLister lister = EasyMock.createMock(UsedSegmentLister.class);
  EasyMock.expect(
          lister.getUsedSegmentsForIntervals(testDatasource, Lists.newArrayList(jobInterval)))
      .andReturn(ImmutableList.of(SEGMENT));
  EasyMock.replay(lister);

  final HadoopIngestionSpec updatedSpec =
      HadoopIngestionSpec.updateSegmentListIfDatasourcePathSpecIsUsed(
          roundTripped, jsonMapper, lister);
  return HadoopDruidIndexerConfig.fromString(jsonMapper.writeValueAsString(updatedSpec));
}
@Override protected void map(Object key, Object value, Context context) throws IOException, InterruptedException { try { final InputRow inputRow; try { inputRow = parseInputRow(value, parser); } catch (Exception e) { if (config.isIgnoreInvalidRows()) { log.debug(e, "Ignoring invalid row [%s] due to parsing error", value.toString()); context .getCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER) .increment(1); return; // we're ignoring this invalid row } else { throw e; } } if (!granularitySpec.bucketIntervals().isPresent() || granularitySpec .bucketInterval(new DateTime(inputRow.getTimestampFromEpoch())) .isPresent()) { innerMap(inputRow, value, context); } } catch (RuntimeException e) { throw new RE(e, "Failure on row[%s]", value); } }
/**
 * Checks the defaults produced by an empty JSON object: cleanup-on-failure enabled, overwrite
 * disabled, and a partitions spec that does not determine partitions.
 */
@Test
public void testDefaultSettings() {
  final HadoopDruidIndexerConfig cfg;
  try {
    cfg = jsonReadWriteRead("{}", HadoopDruidIndexerConfig.class);
  } catch (Exception e) {
    throw Throwables.propagate(e);
  }

  // The original passed (message, actual, expected); boolean-specific assertions express the
  // expectation without relying on argument order.
  Assert.assertTrue("cleanupOnFailure", cfg.isCleanupOnFailure());
  Assert.assertFalse("overwriteFiles", cfg.isOverwriteFiles());
  Assert.assertFalse(
      "isDeterminingPartitions", cfg.getPartitionsSpec().isDeterminingPartitions());
}
public static void ensurePaths(HadoopDruidIndexerConfig config) { // config.addInputPaths() can have side-effects ( boo! :( ), so this stuff needs to be done // before anything else try { Job job = Job.getInstance( new Configuration(), String.format( "%s-determine_partitions-%s", config.getDataSource(), config.getIntervals())); job.getConfiguration().set("io.sort.record.percent", "0.19"); injectSystemProperties(job); config.addInputPaths(job); } catch (IOException e) { throw Throwables.propagate(e); } }
/**
 * Loads {@code config} from the job configuration the first time it is needed.
 *
 * <p>The field is checked once without the lock and again while holding the class-level lock,
 * so only the first caller performs the load.
 * NOTE(review): safe publication depends on how {@code config} is declared, which is not
 * visible here.
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  if (config != null) {
    return;
  }
  synchronized (DeterminePartitionsDimSelectionBaseReducer.class) {
    if (config != null) {
      return;
    }
    config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
  }
}
/**
 * Creates a helper bound to {@code config} and an optional partition dimension.
 *
 * <p>Also builds a lookup from each bucket interval's start time to its position in the
 * iteration order of the granularity spec's bucket intervals.
 *
 * @param config indexer configuration supplying the granularity spec
 * @param partitionDimension dimension to restrict counting to; stored as given
 */
public DeterminePartitionsDimSelectionMapperHelper(
    HadoopDruidIndexerConfig config, String partitionDimension) {
  this.config = config;
  this.partitionDimension = partitionDimension;

  final ImmutableMap.Builder<DateTime, Integer> indexByStart = ImmutableMap.builder();
  int position = 0;
  for (final Interval bucket : config.getGranularitySpec().bucketIntervals().get()) {
    indexByStart.put(bucket.getStart(), position++);
  }
  this.intervalIndexes = indexByStart.build();
}
@Test public void testGranularitySpecLegacy() { // Deprecated and replaced by granularitySpec, but still supported final HadoopDruidIndexerConfig cfg; try { cfg = jsonReadWriteRead( "{" + "\"segmentGranularity\":\"day\"," + "\"intervals\":[\"2012-02-01/P1D\"]" + "}", HadoopDruidIndexerConfig.class); } catch (Exception e) { throw Throwables.propagate(e); } final UniformGranularitySpec granularitySpec = (UniformGranularitySpec) cfg.getGranularitySpec(); Assert.assertEquals( "getIntervals", Lists.newArrayList(new Interval("2012-02-01/P1D")), granularitySpec.getIntervals()); Assert.assertEquals("getGranularity", "DAY", granularitySpec.getGranularity().toString()); }
/**
 * Checks that a {@code "db"} updater job spec keeps its table, connection URI, credentials and
 * validation-query flag across a JSON read/write/read cycle.
 */
@Test
public void testDbUpdaterJobSpec() throws Exception {
  final String json =
      "{"
          + "\"updaterJobSpec\":{\n"
          + " \"type\" : \"db\",\n"
          + " \"connectURI\" : \"jdbc:mysql://localhost/druid\",\n"
          + " \"user\" : \"rofl\",\n"
          + " \"password\" : \"p4ssw0rd\",\n"
          + " \"segmentTable\" : \"segments\"\n"
          + " }"
          + "}";
  final HadoopDruidIndexerConfig parsed = jsonReadWriteRead(json, HadoopDruidIndexerConfig.class);

  final DbUpdaterJobSpec updaterSpec = (DbUpdaterJobSpec) parsed.getUpdaterJobSpec();
  Assert.assertEquals("segments", updaterSpec.getSegmentTable());
  Assert.assertEquals("jdbc:mysql://localhost/druid", updaterSpec.getDatabaseConnectURI());
  Assert.assertEquals("rofl", updaterSpec.getDatabaseUser());
  Assert.assertEquals("p4ssw0rd", updaterSpec.getDatabasePassword());
  Assert.assertEquals(false, updaterSpec.useValidationQuery());
}
public void emitDimValueCounts( TaskInputOutputContext<?, ?, BytesWritable, Text> context, DateTime timestamp, Map<String, Iterable<String>> dims) throws IOException, InterruptedException { final Optional<Interval> maybeInterval = config.getGranularitySpec().bucketInterval(timestamp); if (!maybeInterval.isPresent()) { throw new ISE("WTF?! No bucket found for timestamp: %s", timestamp); } final Interval interval = maybeInterval.get(); final int intervalIndex = intervalIndexes.get(interval.getStart()); final ByteBuffer buf = ByteBuffer.allocate(4 + 8); buf.putInt(intervalIndex); buf.putLong(interval.getStartMillis()); final byte[] groupKey = buf.array(); // Emit row-counter value. write(context, groupKey, new DimValueCount("", "", 1)); for (final Map.Entry<String, Iterable<String>> dimAndValues : dims.entrySet()) { final String dim = dimAndValues.getKey(); if (partitionDimension == null || partitionDimension.equals(dim)) { final Iterable<String> dimValues = dimAndValues.getValue(); if (Iterables.size(dimValues) == 1) { // Emit this value. write( context, groupKey, new DimValueCount(dim, Iterables.getOnlyElement(dimValues), 1)); } else { // This dimension is unsuitable for partitioning. Poison it by emitting a negative // value. write(context, groupKey, new DimValueCount(dim, "", -1)); } } } }
/**
 * Determines partitions in up to three steps: an optional group-by job, a dimension-selection
 * job, and a final pass that reads the per-interval partition info files and stores the
 * resulting shard specs on {@code config}.
 *
 * <p>Every exception raised inside the method, including the {@code ISE} for an unsupported
 * partitions spec, is routed through {@code Throwables.propagate}.
 *
 * @return {@code true} once shard specs have been set on the config; {@code false} when either
 *     Hadoop job reports that it did not complete successfully
 */
public boolean run() {
  try {
    // Only single-dimension partitioning is supported by this job; reject anything else early.
    if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
      throw new ISE(
          "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
          config.getPartitionsSpec());
    }

    /*
     * Group by (timestamp, dimensions) so we can correctly count dimension values as they would
     * appear in the final segment. Skipped when the partitions spec assumes grouped input.
     */
    if (!config.getPartitionsSpec().isAssumeGrouped()) {
      final Job groupByJob =
          Job.getInstance(
              new Configuration(),
              String.format(
                  "%s-determine_partitions_groupby-%s",
                  config.getDataSource(), config.getIntervals()));

      JobHelper.injectSystemProperties(groupByJob);
      config.addJobProperties(groupByJob);

      groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
      groupByJob.setMapOutputKeyClass(BytesWritable.class);
      groupByJob.setMapOutputValueClass(NullWritable.class);
      // The same reducer class serves as combiner and reducer.
      groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
      groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
      groupByJob.setOutputKeyClass(BytesWritable.class);
      groupByJob.setOutputValueClass(NullWritable.class);
      groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
      JobHelper.setupClasspath(
          JobHelper.distributedClassPath(config.getWorkingPath()),
          JobHelper.distributedClassPath(config.makeIntermediatePath()),
          groupByJob);

      config.addInputPaths(groupByJob);
      config.intoConfiguration(groupByJob);
      // Grouped output lands in the grouped-data dir, which the next job reads back.
      FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

      groupByJob.submit();
      log.info(
          "Job %s submitted, status available at: %s",
          groupByJob.getJobName(), groupByJob.getTrackingURL());

      if (!groupByJob.waitForCompletion(true)) {
        log.error("Job failed: %s", groupByJob.getJobID());
        return false;
      }
    } else {
      log.info("Skipping group-by job.");
    }

    /*
     * Read grouped data and determine appropriate partitions.
     */
    final Job dimSelectionJob =
        Job.getInstance(
            new Configuration(),
            String.format(
                "%s-determine_partitions_dimselection-%s",
                config.getDataSource(), config.getIntervals()));

    dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

    JobHelper.injectSystemProperties(dimSelectionJob);
    config.addJobProperties(dimSelectionJob);

    if (!config.getPartitionsSpec().isAssumeGrouped()) {
      // Read grouped data from the groupByJob.
      dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
      dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
      FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
    } else {
      // Directly read the source data, since we assume it's already grouped.
      dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
      config.addInputPaths(dimSelectionJob);
    }

    SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
    dimSelectionJob.setMapOutputValueClass(Text.class);
    dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
    dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
    dimSelectionJob.setOutputKeyClass(BytesWritable.class);
    dimSelectionJob.setOutputValueClass(Text.class);
    dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
    dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
    // One reduce task per bucket interval of the granularity spec.
    dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
    JobHelper.setupClasspath(
        JobHelper.distributedClassPath(config.getWorkingPath()),
        JobHelper.distributedClassPath(config.makeIntermediatePath()),
        dimSelectionJob);

    config.intoConfiguration(dimSelectionJob);
    FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

    dimSelectionJob.submit();
    log.info(
        "Job %s submitted, status available at: %s",
        dimSelectionJob.getJobName(), dimSelectionJob.getTrackingURL());

    if (!dimSelectionJob.waitForCompletion(true)) {
      log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
      return false;
    }

    /*
     * Load partitions determined by the previous job.
     */
    log.info(
        "Job completed, loading up partitions for intervals[%s].",
        config.getSegmentGranularIntervals());
    // Resolved lazily from the first partition info path and reused for the rest.
    FileSystem fileSystem = null;
    Map<DateTime, List<HadoopyShardSpec>> shardSpecs =
        Maps.newTreeMap(DateTimeComparator.getInstance());
    // Running shard number across all intervals, not reset per interval.
    int shardCount = 0;
    for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
      final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
      if (fileSystem == null) {
        fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
      }
      if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
        List<ShardSpec> specs =
            config.JSON_MAPPER.readValue(
                Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                new TypeReference<List<ShardSpec>>() {});

        List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
        for (int i = 0; i < specs.size(); ++i) {
          actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
          log.info(
              "DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i, actualSpecs.get(i));
        }

        shardSpecs.put(segmentGranularity.getStart(), actualSpecs);
      } else {
        // An interval with no partition info file gets no entry in the shard spec map.
        log.info("Path[%s] didn't exist!?", partitionInfoPath);
      }
    }
    config.setShardSpecs(shardSpecs);

    return true;
  } catch (Exception e) {
    throw Throwables.propagate(e);
  }
}
public DbUpdaterJob(HadoopDruidIndexerConfig config) { this.config = config; this.dbi = new DbConnector(config.getUpdaterJobSpec(), null).getDBI(); }
/**
 * Loads the indexer config from the job configuration and stores it together with the parser
 * and granularity spec it provides.
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  final HadoopDruidIndexerConfig loaded =
      HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
  config = loaded;
  parser = loaded.getParser();
  granularitySpec = loaded.getGranularitySpec();
}