@Test
public void testGranularitySpec()
{
  final HadoopDruidIndexerConfig cfg;

  try {
    cfg = jsonReadWriteRead(
        "{"
        + " \"granularitySpec\":{"
        + "   \"type\":\"uniform\","
        + "   \"gran\":\"hour\","
        + "   \"intervals\":[\"2012-01-01/P1D\"]"
        + " }"
        + "}",
        HadoopDruidIndexerConfig.class
    );
  }
  catch (Exception e) {
    throw Throwables.propagate(e);
  }

  final UniformGranularitySpec granularitySpec = (UniformGranularitySpec) cfg.getGranularitySpec();

  Assert.assertEquals(
      "getIntervals",
      Lists.newArrayList(new Interval("2012-01-01/P1D")),
      granularitySpec.getIntervals()
  );

  Assert.assertEquals(
      "getGranularity",
      "HOUR",
      granularitySpec.getGranularity().toString()
  );
}
@Test
public void testGranularitySpecPostConstructorIntervals()
{
  // Deprecated and replaced by granularitySpec, but still supported
  final HadoopDruidIndexerConfig cfg;

  try {
    cfg = jsonMapper.readValue(
        "{"
        + "\"segmentGranularity\":\"day\""
        + "}",
        HadoopDruidIndexerConfig.class
    );
  }
  catch (Exception e) {
    throw Throwables.propagate(e);
  }

  cfg.setIntervals(Lists.newArrayList(new Interval("2012-03-01/P1D")));

  final UniformGranularitySpec granularitySpec = (UniformGranularitySpec) cfg.getGranularitySpec();

  Assert.assertEquals(
      "getIntervals",
      Lists.newArrayList(new Interval("2012-03-01/P1D")),
      granularitySpec.getIntervals()
  );

  Assert.assertEquals(
      "getGranularity",
      "DAY",
      granularitySpec.getGranularity().toString()
  );
}
public DeterminePartitionsDimSelectionMapperHelper(HadoopDruidIndexerConfig config, String partitionDimension)
{
  this.config = config;
  this.partitionDimension = partitionDimension;

  final ImmutableMap.Builder<DateTime, Integer> timeIndexBuilder = ImmutableMap.builder();
  int idx = 0;
  for (final Interval bucketInterval : config.getGranularitySpec().bucketIntervals().get()) {
    timeIndexBuilder.put(bucketInterval.getStart(), idx);
    idx++;
  }

  this.intervalIndexes = timeIndexBuilder.build();
}
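// Illustrative only: with the uniform hourly spec from the config test above ("gran":"hour",
// intervals ["2012-01-01/P1D"]), the constructor above would build a 24-entry map, roughly:
//
//   2012-01-01T00:00:00.000 -> 0
//   2012-01-01T01:00:00.000 -> 1
//   ...
//   2012-01-01T23:00:00.000 -> 23
//
// emitDimValueCounts() below packs this small index, together with the bucket start, into the
// group key rather than serializing the whole Interval.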
public void emitDimValueCounts(
    TaskInputOutputContext<?, ?, BytesWritable, Text> context,
    DateTime timestamp,
    Map<String, Iterable<String>> dims
) throws IOException, InterruptedException
{
  final Optional<Interval> maybeInterval = config.getGranularitySpec().bucketInterval(timestamp);

  if (!maybeInterval.isPresent()) {
    throw new ISE("WTF?! No bucket found for timestamp: %s", timestamp);
  }

  final Interval interval = maybeInterval.get();
  final int intervalIndex = intervalIndexes.get(interval.getStart());

  final ByteBuffer buf = ByteBuffer.allocate(4 + 8);
  buf.putInt(intervalIndex);
  buf.putLong(interval.getStartMillis());
  final byte[] groupKey = buf.array();

  // Emit row-counter value.
  write(context, groupKey, new DimValueCount("", "", 1));

  for (final Map.Entry<String, Iterable<String>> dimAndValues : dims.entrySet()) {
    final String dim = dimAndValues.getKey();

    if (partitionDimension == null || partitionDimension.equals(dim)) {
      final Iterable<String> dimValues = dimAndValues.getValue();

      if (Iterables.size(dimValues) == 1) {
        // Emit this value.
        write(context, groupKey, new DimValueCount(dim, Iterables.getOnlyElement(dimValues), 1));
      } else {
        // This dimension is unsuitable for partitioning. Poison it by emitting a negative value.
        write(context, groupKey, new DimValueCount(dim, "", -1));
      }
    }
  }
}
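// A minimal sketch, not part of the original helper: how the 12-byte group key built in
// emitDimValueCounts() above could be decoded on the consuming side. The layout is exactly what
// the ByteBuffer writes: a 4-byte interval index followed by the 8-byte bucket start in millis.
// The method name is hypothetical.
private static DateTime decodeGroupKeyBucketStart(byte[] groupKey)
{
  final ByteBuffer buf = ByteBuffer.wrap(groupKey);
  buf.getInt(); // skip the 4-byte interval index written by buf.putInt(intervalIndex)
  final long startMillis = buf.getLong(); // written by buf.putLong(interval.getStartMillis())
  return new DateTime(startMillis);
}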
@Test
public void testGranularitySpecLegacy()
{
  // Deprecated and replaced by granularitySpec, but still supported
  final HadoopDruidIndexerConfig cfg;

  try {
    cfg = jsonReadWriteRead(
        "{"
        + "\"segmentGranularity\":\"day\","
        + "\"intervals\":[\"2012-02-01/P1D\"]"
        + "}",
        HadoopDruidIndexerConfig.class
    );
  }
  catch (Exception e) {
    throw Throwables.propagate(e);
  }

  final UniformGranularitySpec granularitySpec = (UniformGranularitySpec) cfg.getGranularitySpec();

  Assert.assertEquals(
      "getIntervals",
      Lists.newArrayList(new Interval("2012-02-01/P1D")),
      granularitySpec.getIntervals()
  );

  Assert.assertEquals(
      "getGranularity",
      "DAY",
      granularitySpec.getGranularity().toString()
  );
}
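// A minimal sketch, not part of the original suite: assuming jsonReadWriteRead behaves as in
// testGranularitySpec above, the legacy "segmentGranularity"/"intervals" form exercised in the
// previous test should be equivalent to the explicit uniform granularitySpec below. The test
// name is hypothetical.
@Test
public void testGranularitySpecLegacyEquivalentUniformForm()
{
  final HadoopDruidIndexerConfig cfg;

  try {
    cfg = jsonReadWriteRead(
        "{"
        + " \"granularitySpec\":{"
        + "   \"type\":\"uniform\","
        + "   \"gran\":\"day\","
        + "   \"intervals\":[\"2012-02-01/P1D\"]"
        + " }"
        + "}",
        HadoopDruidIndexerConfig.class
    );
  }
  catch (Exception e) {
    throw Throwables.propagate(e);
  }

  final UniformGranularitySpec granularitySpec = (UniformGranularitySpec) cfg.getGranularitySpec();
  Assert.assertEquals("getGranularity", "DAY", granularitySpec.getGranularity().toString());
  Assert.assertEquals(
      "getIntervals",
      Lists.newArrayList(new Interval("2012-02-01/P1D")),
      granularitySpec.getIntervals()
  );
}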
public boolean run()
{
  try {
    /*
     * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
     * in the final segment.
     */
    if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
      throw new ISE(
          "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
          config.getPartitionsSpec()
      );
    }

    if (!config.getPartitionsSpec().isAssumeGrouped()) {
      final Job groupByJob = Job.getInstance(
          new Configuration(),
          String.format("%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals())
      );

      JobHelper.injectSystemProperties(groupByJob);
      config.addJobProperties(groupByJob);

      groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
      groupByJob.setMapOutputKeyClass(BytesWritable.class);
      groupByJob.setMapOutputValueClass(NullWritable.class);
      groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
      groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
      groupByJob.setOutputKeyClass(BytesWritable.class);
      groupByJob.setOutputValueClass(NullWritable.class);
      groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
      JobHelper.setupClasspath(
          JobHelper.distributedClassPath(config.getWorkingPath()),
          JobHelper.distributedClassPath(config.makeIntermediatePath()),
          groupByJob
      );

      config.addInputPaths(groupByJob);
      config.intoConfiguration(groupByJob);
      FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

      groupByJob.submit();
      log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());

      if (!groupByJob.waitForCompletion(true)) {
        log.error("Job failed: %s", groupByJob.getJobID());
        return false;
      }
    } else {
      log.info("Skipping group-by job.");
    }

    /*
     * Read grouped data and determine appropriate partitions.
     */
    final Job dimSelectionJob = Job.getInstance(
        new Configuration(),
        String.format("%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals())
    );

    dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

    JobHelper.injectSystemProperties(dimSelectionJob);
    config.addJobProperties(dimSelectionJob);

    if (!config.getPartitionsSpec().isAssumeGrouped()) {
      // Read grouped data from the groupByJob.
      dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
      dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
      FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
    } else {
      // Directly read the source data, since we assume it's already grouped.
      dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
      config.addInputPaths(dimSelectionJob);
    }

    SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
    dimSelectionJob.setMapOutputValueClass(Text.class);
    dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
    dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
    dimSelectionJob.setOutputKeyClass(BytesWritable.class);
    dimSelectionJob.setOutputValueClass(Text.class);
    dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
    dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
    dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
    JobHelper.setupClasspath(
        JobHelper.distributedClassPath(config.getWorkingPath()),
        JobHelper.distributedClassPath(config.makeIntermediatePath()),
        dimSelectionJob
    );

    config.intoConfiguration(dimSelectionJob);
    FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

    dimSelectionJob.submit();
    log.info(
        "Job %s submitted, status available at: %s",
        dimSelectionJob.getJobName(),
        dimSelectionJob.getTrackingURL()
    );

    if (!dimSelectionJob.waitForCompletion(true)) {
      log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
      return false;
    }

    /*
     * Load partitions determined by the previous job.
     */
    log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
    FileSystem fileSystem = null;
    Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
    int shardCount = 0;
    for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
      final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
      if (fileSystem == null) {
        fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
      }
      if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
        List<ShardSpec> specs = config.JSON_MAPPER.readValue(
            Utils.openInputStream(dimSelectionJob, partitionInfoPath),
            new TypeReference<List<ShardSpec>>() {}
        );

        List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
        for (int i = 0; i < specs.size(); ++i) {
          actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
          log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i, actualSpecs.get(i));
        }

        shardSpecs.put(segmentGranularity.getStart(), actualSpecs);
      } else {
        log.info("Path[%s] didn't exist!?", partitionInfoPath);
      }
    }
    config.setShardSpecs(shardSpecs);

    return true;
  }
  catch (Exception e) {
    throw Throwables.propagate(e);
  }
}
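// A minimal usage sketch, not from the original source. It assumes the enclosing
// DeterminePartitionsJob exposes a constructor taking HadoopDruidIndexerConfig; the helper
// name is hypothetical.
private static void determinePartitions(HadoopDruidIndexerConfig config)
{
  final DeterminePartitionsJob job = new DeterminePartitionsJob(config);
  if (!job.run()) {
    throw new ISE("DeterminePartitionsJob failed for intervals[%s]", config.getIntervals());
  }
  // On success, run() has already called config.setShardSpecs(...), so downstream indexing
  // jobs can pick up the per-bucket HadoopyShardSpec assignments from the config.
}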
@Override
protected void setup(Context context) throws IOException, InterruptedException
{
  config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
  parser = config.getParser();
  granularitySpec = config.getGranularitySpec();
}