/** Read the metadata table to get tablets and match up ranges to them. */ public static List<InputSplit> getSplits( final BasicHBaseOperations operations, final DistributableQuery query, final QueryOptions queryOptions, final AdapterStore adapterStore, final DataStatisticsStore statsStore, final IndexStore indexStore, final AdapterIndexMappingStore adapterIndexMappingStore, final Integer minSplits, final Integer maxSplits) throws IOException, InterruptedException { final Map<PrimaryIndex, RowRangeHistogramStatistics<?>> statsCache = new HashMap<PrimaryIndex, RowRangeHistogramStatistics<?>>(); final List<InputSplit> retVal = new ArrayList<InputSplit>(); final TreeSet<IntermediateSplitInfo> splits = new TreeSet<IntermediateSplitInfo>(); for (final Pair<PrimaryIndex, List<DataAdapter<Object>>> indexAdapterPair : queryOptions.getAdaptersWithMinimalSetOfIndices( adapterStore, adapterIndexMappingStore, indexStore)) { populateIntermediateSplits( splits, operations, indexAdapterPair.getLeft(), indexAdapterPair.getValue(), statsCache, adapterStore, statsStore, maxSplits, query, queryOptions.getAuthorizations()); } // this is an incremental algorithm, it may be better use the target // split count to drive it (ie. to get 3 splits this will split 1 // large // range into two down the middle and then split one of those ranges // down the middle to get 3, rather than splitting one range into // thirds) if (!statsCache.isEmpty() && !splits.isEmpty() && (minSplits != null) && (splits.size() < minSplits)) { // set the ranges to at least min splits do { // remove the highest range, split it into 2 and add both // back, // increasing the size by 1 final IntermediateSplitInfo highestSplit = splits.pollLast(); final IntermediateSplitInfo otherSplit = highestSplit.split(statsCache); splits.add(highestSplit); if (otherSplit == null) { LOGGER.warn("Cannot meet minimum splits"); break; } splits.add(otherSplit); } while (splits.size() < minSplits); } else if (((maxSplits != null) && (maxSplits > 0)) && (splits.size() > maxSplits)) { // merge splits to fit within max splits do { // this is the naive approach, remove the lowest two ranges // and // merge them, decreasing the size by 1 // TODO Ideally merge takes into account locations (as well // as // possibly the index as a secondary criteria) to limit the // number of locations/indices final IntermediateSplitInfo lowestSplit = splits.pollFirst(); final IntermediateSplitInfo nextLowestSplit = splits.pollFirst(); lowestSplit.merge(nextLowestSplit); splits.add(lowestSplit); } while (splits.size() > maxSplits); } for (final IntermediateSplitInfo split : splits) { retVal.add(split.toFinalSplit()); } return retVal; }
/** Initialize a scanner over the given input split using this task attempt configuration. */ @Override public void initialize(final InputSplit inSplit, final TaskAttemptContext attempt) throws IOException { split = (GeoWaveAccumuloInputSplit) inSplit; numKeysRead = 0; final Map<RangeLocationPair, CloseableIterator<?>> iteratorsPerRange = new LinkedHashMap<RangeLocationPair, CloseableIterator<?>>(); final Set<PrimaryIndex> indices = split.getIndices(); BigDecimal sum = BigDecimal.ZERO; final Map<RangeLocationPair, BigDecimal> incrementalRangeSums = new LinkedHashMap<RangeLocationPair, BigDecimal>(); for (final PrimaryIndex i : indices) { final List<RangeLocationPair> ranges = split.getRanges(i); List<QueryFilter> queryFilters = null; if (query != null) { queryFilters = query.createFilters(i.getIndexModel()); } for (final RangeLocationPair r : ranges) { final QueryOptions rangeQueryOptions = new QueryOptions(queryOptions); rangeQueryOptions.setIndex(i); iteratorsPerRange.put( r, new InputFormatAccumuloRangeQuery( adapterStore, i, r.getRange(), queryFilters, isOutputWritable, rangeQueryOptions) .query( accumuloOperations, adapterStore, rangeQueryOptions.getMaxResolutionSubsamplingPerDimension(), rangeQueryOptions.getLimit())); incrementalRangeSums.put(r, sum); sum = sum.add(BigDecimal.valueOf(r.getCardinality())); } } // finally we can compute percent progress progressPerRange = new LinkedHashMap<RangeLocationPair, ProgressPerRange>(); RangeLocationPair prevRangeIndex = null; float prevProgress = 0f; for (final Entry<RangeLocationPair, BigDecimal> entry : incrementalRangeSums.entrySet()) { final BigDecimal value = entry.getValue(); final float progress = value.divide(sum, RoundingMode.HALF_UP).floatValue(); if (prevRangeIndex != null) { progressPerRange.put(prevRangeIndex, new ProgressPerRange(prevProgress, progress)); } prevRangeIndex = entry.getKey(); prevProgress = progress; } progressPerRange.put(prevRangeIndex, new ProgressPerRange(prevProgress, 1f)); // concatenate iterators iterator = new CloseableIteratorWrapper<Object>( new Closeable() { @Override public void close() throws IOException { for (final CloseableIterator<?> it : iteratorsPerRange.values()) { it.close(); } } }, concatenateWithCallback( iteratorsPerRange.entrySet().iterator(), new NextRangeCallback() { @Override public void setRange(final RangeLocationPair indexPair) { currentGeoWaveRangeIndexPair = indexPair; } })); }