    public void mapOverShards(final Bundle<F>[] bundles) {

        /*
         * Sort the binding sets in the chunk by the fromKey associated with
         * each asBound predicate.
         */
        Arrays.sort(bundles);

        // The most recently discovered locator.
        PartitionLocator current = null;

        // The key order for [current].
        IKeyOrder<?> currentKeyOrder = null;

//        // The list of binding sets which are bound for the current locator.
//        List<IBindingSet> list = new LinkedList<IBindingSet>();

        final Iterator<Bundle<F>> bitr = Arrays.asList(bundles).iterator();

        while (bitr.hasNext()) {

            final Bundle<F> bundle = bitr.next();

            if (current != null
                    && currentKeyOrder == bundle.keyOrder // same scale-out index
                    && BytesUtil.rangeCheck(bundle.fromKey,
                            current.getLeftSeparatorKey(),
                            current.getRightSeparatorKey())
                    && BytesUtil.rangeCheck(bundle.toKey,
                            current.getLeftSeparatorKey(),
                            current.getRightSeparatorKey())) {

                /*
                 * Optimization when the bundle fits inside of the last index
                 * partition scanned (this optimization is only possible when
                 * the asBound predicate will be mapped onto a single index
                 * partition, but this is a very common case since we try to
                 * choose selective indices for access paths).
                 *
                 * Note: The bundle MUST be for the scale-out index associated
                 * with the last PartitionLocator. We enforce this constraint
                 * by tracking the IKeyOrder for the last PartitionLocator and
                 * verifying that the Bundle is associated with the same
                 * IKeyOrder.
                 *
                 * Note: Bundle#compareTo() is written to group together the
                 * [Bundle]s first by their IKeyOrder and then by their
                 * fromKey. That provides the maximum possibility of reuse of
                 * the last PartitionLocator. It also provides ordered locator
                 * scans within each scale-out index.
                 */
                final IBuffer<IBindingSet[]> sink = op.getBuffer(current);

                sink.add(new IBindingSet[] { bundle.bindingSet });

                continue;

            }

            /*
             * Locator scan for the index partitions for that predicate as
             * bound.
             */
            final Iterator<PartitionLocator> itr = op.locatorScan(
                    bundle.keyOrder, bundle.fromKey, bundle.toKey);

            // Clear the old partition locator.
            current = null;

            // Update the key order for the partitions that we are scanning.
            currentKeyOrder = bundle.keyOrder;

            // Scan locators.
            while (itr.hasNext()) {

                final PartitionLocator locator = current = itr.next();

                if (log.isTraceEnabled())
                    log.trace("adding bindingSet to buffer" //
                            + ": asBound=" + bundle.asBound //
                            + ", partitionId=" + locator.getPartitionId() //
                            + ", dataService=" + locator.getDataServiceUUID() //
                            + ", bindingSet=" + bundle.bindingSet);

                final IBuffer<IBindingSet[]> sink = op.getBuffer(locator);

                sink.add(new IBindingSet[] { bundle.bindingSet });

            }

        }

    }
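    /*
     * Illustrative sketch only (not part of the original class): the ordering
     * that Bundle#compareTo() is described as providing above -- group the
     * bundles by their IKeyOrder first, then order them by their fromKey
     * using an unsigned byte[] comparison -- could be expressed along the
     * lines of the commented-out comparator below. The field names (keyOrder,
     * fromKey) are taken from the usage in mapOverShards(); the use of
     * IKeyOrder#getIndexName() and BytesUtil#compareBytes() to realize the
     * grouping is an assumption, not the actual Bundle implementation.
     */
//    private static int compareBundles(final Bundle<?> a, final Bundle<?> b) {
//
//        // Group by scale-out index (same IKeyOrder) first.
//        final int c = a.keyOrder.getIndexName().compareTo(
//                b.keyOrder.getIndexName());
//
//        if (c != 0)
//            return c;
//
//        // Then order by the fromKey (unsigned byte[] comparison) so that
//        // bundles mapped onto the same index partition are adjacent, which
//        // maximizes reuse of the last PartitionLocator.
//        return BytesUtil.compareBytes(a.fromKey, b.fromKey);
//
//    }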
    /**
     * Now that the index partition is compact, decide if we will take any
     * after action, such as {move, join, split, tailSplit, scatterSplit,
     * etc.}. All of these operations are much cheaper while the index is
     * compact, which is why we do them here.
     * <p>
     * Note: asynchronous overflow processing WILL NOT complete until the
     * CompactingMergeTask is done. This means that we will still be reading
     * from the same journal. As long as we are reading from the same ordered
     * set of resources, the lastCommitTime chosen here is somewhat arbitrary.
     * <p>
     * The updated view metadata as of the last commit time on the live
     * journal.
     * <p>
     * FIXME Concurrent operations can replace the view definition. However,
     * what would not be good is if they changed the set of resources in the
     * view. The AtomicUpdate of the after action task MUST check for this
     * precondition (same set of resources in the view) and abort (and clean
     * up any intermediate files) if the precondition has been violated (no
     * harm is done if we abort, just some lost work).
     *
     * @todo split + move and friends seem unnecessarily complicated. We can
     *       just move anything that is compact. [Clean up the tasks to remove
     *       this stuff.]
     *
     * @todo We might be better off running {@link #chooseAfterActionTask()}
     *       from inside of the atomic update and then doing any work there
     *       while we have the lock on the shard. This will prevent any new
     *       data from building up and can help ensure that the preconditions
     *       for the operation remain valid. This might also help simplify the
     *       HA design.
     *
     * @todo Once we have flow control on writes we can save the DS a lot of
     *       work by not accepting new writes for an index partition when we
     *       are going to compact it, move it, split it, etc.
     */
    private AbstractTask<?> chooseAfterActionTask() {

        final ViewMetadata vmd = new ViewMetadata(resourceManager,
                resourceManager.getLiveJournal().getLastCommitTime(),
                this.vmd.name, resourceManager.getIndexCounters(this.vmd.name));

        /*
         * Scatter split?
         *
         * Note: Scatter splits are considered before tail splits and normal
         * splits since they can only be taken when there is a single index
         * partition for a scale-out index. The other kinds of splits are used
         * once the index has already been distributed onto the cluster by a
         * scatter split.
         */
        {

            final ScatterSplitConfiguration ssc = vmd.indexMetadata
                    .getScatterSplitConfiguration();

            if (// only a single index partition?
                (vmd.getIndexPartitionCount() == 1L)
                // scatter splits enabled for the service
                && resourceManager.scatterSplitEnabled
                // scatter splits enabled for the index
                && ssc.isEnabled()
                // the view is compact (only one segment).
                && vmd.compactView
                // trigger the scatter split before too much data builds up in
                // one place.
                && vmd.getPercentOfSplit() >= ssc.getPercentOfSplitThreshold()) {

                // Target data services for the new index partitions.
                final UUID[] moveTargets = getScatterSplitTargets(ssc);

                if (moveTargets != null) {

                    // #of splits.
                    final int nsplits = ssc.getIndexPartitionCount() == 0 //
                            ? (2 * moveTargets.length) // two per data service.
                            : ssc.getIndexPartitionCount() //
                            ;

                    if (log.isInfoEnabled())
                        log.info("will scatter: " + vmd);

                    // scatter split task.
                    return new ScatterSplitTask(vmd, nsplits, moveTargets);

                }

            }

        }

        /*
         * Tail split?
         *
         * Note: We can do a tail split as long as we are "close" to a full
         * index partition. We have an expectation that the head of the split
         * will be over the minimum capacity.
         * While the tail of the split MIGHT be under the minimum capacity, if
         * there are continued heavy writes on the tail then it should reach
         * the minimum capacity for an index partition by the time the live
         * journal overflows again.
         */
        if (vmd.isTailSplit() && false) {

            /*
             * FIXME The current tailSplit implementation operates against the
             * BTree, NOT the FusedView and NOT the IndexSegment. It needs to
             * be refactored before it can be an after action for a compacting
             * merge.
             *
             * It is written to identify the separator key based on an
             * examination of the mutable BTree. Once it has the separator key
             * it then does a normal build for each key-range. [@todo It
             * probably should use a compacting merge in order to avoid
             * sharing index segments across shards.]
             */
            if (log.isInfoEnabled())
                log.info("will tailSplit: " + vmd.name);

            return new SplitTailTask(vmd, null /* moveTarget */);

        }

        /*
         * Should split?
         *
         * Note: Split is NOT allowed if the index is currently being moved
         * onto this data service. Split, join, and move are all disallowed
         * until the index partition move is complete since each of them
         * would cause the index partition to become invalidated.
         */
        if (vmd.getPercentOfSplit() > 1.0) {

            if (log.isInfoEnabled())
                log.info("will split: " + vmd);

            return new SplitIndexPartitionTask(vmd, (UUID) null /* moveTarget */);

        }

        /*
         * Join undercapacity shard (either with local rightSibling or move to
         * join with remote rightSibling).
         *
         * If the rightSibling of an undercapacity index partition is also
         * local then a {@link JoinIndexPartitionTask} is used to join those
         * index partitions.
         *
         * If the rightSibling of an undercapacity index partition is remote,
         * then a {@link MoveTask} is created to move the undercapacity index
         * partition to the remote data service.
         *
         * Note: joins are only considered when the rightSibling of an index
         * partition exists. The last index partition has [rightSeparatorKey
         * == null] and there is no rightSibling for that index partition.
         *
         * @todo What kinds of guarantees do we have that a local rightSibling
         * will be around by the time the JoinIndexPartitionTask runs?
         *
         * @todo This has even more assumptions about [lastCommitTime] than
         * the other tasks. All these tasks need to be reviewed to make sure
         * that there are no gaps created by this refactor. Running these
         * after action tasks while we hold the write lock on the source shard
         * could probably help us to reduce the possibility of any such
         * problems but might require a revisit / refactor / simplification of
         * the tasks.
         *
         * FIXME Make sure that we are not running compacting merges as part
         * of the split, scatter split and other tasks. Some tasks used to do
         * this in order to have a compact view.
         */
        if (resourceManager.joinsEnabled
                && vmd.pmd.getRightSeparatorKey() != null
                && vmd.getPercentOfSplit() < resourceManager.percentOfJoinThreshold) {

            final String scaleOutIndexName = vmd.indexMetadata.getName();

            final PartitionLocator rightSiblingLocator = getRightSiblingLocator(
                    scaleOutIndexName, vmd.commitTime);

            if (rightSiblingLocator != null) {

                final UUID targetDataServiceUUID = rightSiblingLocator
                        .getDataServiceUUID();

                final String[] resources = new String[2];

                // the underutilized index partition.
                resources[0] = DataService.getIndexPartitionName(
                        scaleOutIndexName, vmd.pmd.getPartitionId());

                // its right sibling (may be local or remote).
                resources[1] = DataService.getIndexPartitionName(
                        scaleOutIndexName, rightSiblingLocator.getPartitionId());

                if (resourceManager.getDataServiceUUID().equals(
                        targetDataServiceUUID)) {

                    /*
                     * JOIN underutilized index partition with its local
                     * rightSibling.
                     *
                     * Note: This is only joining two index partitions at a
                     * time. It's possible to do more than that if it happens
                     * that N > 2 underutilized sibling index partitions are
                     * on the same data service, but that is a relatively
                     * unlikely combination of events.
                     */
                    if (log.isInfoEnabled())
                        log.info("Will JOIN: " + Arrays.toString(resources));

                    final String rightSiblingName = DataService
                            .getIndexPartitionName(scaleOutIndexName,
                                    rightSiblingLocator.getPartitionId());

                    final ViewMetadata vmd2 = new ViewMetadata(resourceManager,
                            vmd.commitTime, rightSiblingName,
                            resourceManager.getIndexCounters(rightSiblingName));

                    return new JoinIndexPartitionTask(resourceManager,
                            vmd.commitTime, resources,
                            new ViewMetadata[] { vmd, vmd2 });

                } else {

                    /*
                     * MOVE underutilized index partition to the data service
                     * hosting its right sibling.
                     *
                     * @todo The decision to join shards is asymmetric (an
                     * undercapacity shard is moved to its rightSibling).
                     * However, it is possible that its rightSibling was also
                     * undercapacity and was either moved to or locally joined
                     * with its rightSibling (in which case its partition
                     * identifier would have been changed). To avoid these
                     * edge cases there could be a global synchronous
                     * agreement for move/join decisions.
                     */
                    if (log.isInfoEnabled()) {

                        // get the target service name.
                        String targetDataServiceName;
                        try {
                            targetDataServiceName = resourceManager
                                    .getFederation()
                                    .getDataService(targetDataServiceUUID)
                                    .getServiceName();
                        } catch (Throwable t) {
                            targetDataServiceName = targetDataServiceUUID
                                    .toString();
                        }

                        log.info("willMoveToJoinWithRightSibling" //
                                + "( " + vmd.name + " -> " + targetDataServiceName //
                                + ", leftSibling=" + resources[0] //
                                + ", rightSibling=" + resources[1] //
                                + ")");

                    }

                    return new MoveTask(vmd, targetDataServiceUUID);

                }

            } // rightSibling != null

        } // if (join)

        /*
         * Move (to shed or redistribute load).
         *
         * @todo We should prefer to move smaller shards (faster to move) or
         * "hotter" shards (sheds more workload). There should be a way to
         * estimate how much workload will be transferred so we know when we
         * are done.
         *
         * FIXME We should limit the #of shards that we move in a given period
         * of time to allow both this host and the target host an opportunity
         * to adapt to their new load. [An exception would be if this host was
         * critically overloaded, but that should probably be handled by
         * different logic.]
         */
        ILoadBalancerService loadBalancerService = null;

        if (vmd.getPercentOfSplit() < resourceManager.maximumMovePercentOfSplit
                && resourceManager.maximumMovesPerTarget != 0
                && resourceManager.getLiveJournal().getName2Addr().rangeCount() > resourceManager.minimumActiveIndexPartitions
                && (loadBalancerService = getLoadBalancerService()) != null
                && shouldMove(loadBalancerService)) {

            // the UUID of this data service.
            final UUID sourceServiceUUID = resourceManager.getDataServiceUUID();

            // Obtain the UUID of a relatively underutilized data service.
            final UUID targetDataServiceUUID = getMoveTarget(sourceServiceUUID,
                    loadBalancerService);

            if (targetDataServiceUUID != null) {

                if (log.isInfoEnabled()) {

                    // get the target service name.
                    String targetDataServiceName;
                    try {
                        targetDataServiceName = resourceManager
                                .getFederation()
                                .getDataService(targetDataServiceUUID)
                                .getServiceName();
                    } catch (Throwable t) {
                        targetDataServiceName = targetDataServiceUUID.toString();
                    }

                    log.info("willMove" + "( " + vmd.name + " -> "
                            + targetDataServiceName + ")");

                }

                // Move the shard to the target host.
                return new MoveTask(vmd, targetDataServiceUUID);

            }

        }

        // No after action was chosen.
        return null;

    }
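    /*
     * Illustrative sketch only (not part of the original class): the FIXME in
     * the javadoc above requires the atomic update of the chosen after action
     * task to verify that the set of resources in the view has not changed
     * since the compacting merge, and to abort (cleaning up any intermediate
     * files) if it has. A helper along these lines could perform that
     * precondition check; the method name is hypothetical and comparing the
     * resources by their UUIDs is an assumption about how view identity would
     * be established.
     */
//    private static boolean sameResources(final IResourceMetadata[] expected,
//            final IResourceMetadata[] actual) {
//
//        if (expected.length != actual.length)
//            return false;
//
//        for (int i = 0; i < expected.length; i++) {
//
//            // Compare by resource UUID (journal or index segment identity).
//            if (!expected[i].getUUID().equals(actual[i].getUUID()))
//                return false;
//
//        }
//
//        return true;
//
//    }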