/**
 * Routes the binding set of each bundle to the buffer(s) of the index partition(s) spanned by
 * the bundle's [fromKey, toKey) key range.
 *
 * <p>Note: The array is sorted in place, so the caller observes the reordering.
 *
 * @param bundles The bundles to be mapped across the index partitions.
 */
public void mapOverShards(final Bundle<F>[] bundles) {

    /*
     * Put the bundles into their natural order. Bundle#compareTo() is
     * written to group the bundles first by their IKeyOrder and then by
     * their fromKey, which maximizes the opportunity to reuse the last
     * PartitionLocator and also gives ordered locator scans within each
     * scale-out index.
     */
    Arrays.sort(bundles);

    // The last locator visited by the most recent locator scan (if any).
    PartitionLocator lastLocator = null;

    // The key order (scale-out index) to which [lastLocator] belongs.
    IKeyOrder<?> lastKeyOrder = null;

    for (final Bundle<F> b : bundles) {

      /*
       * Fast path: the bundle is for the same scale-out index as the last
       * locator AND both its fromKey and toKey pass the range check against
       * that locator's separator keys. In that case the asBound predicate
       * maps onto that single index partition and no locator scan is
       * required. The IKeyOrder test is mandatory: a locator for one
       * scale-out index must never be reused for a different index.
       */
      final boolean fitsLastPartition =
          lastLocator != null
              && lastKeyOrder == b.keyOrder
              && BytesUtil.rangeCheck(
                  b.fromKey, lastLocator.getLeftSeparatorKey(), lastLocator.getRightSeparatorKey())
              && BytesUtil.rangeCheck(
                  b.toKey, lastLocator.getLeftSeparatorKey(), lastLocator.getRightSeparatorKey());

      if (fitsLastPartition) {

        op.getBuffer(lastLocator).add(new IBindingSet[] {b.bindingSet});

        continue;
      }

      // Slow path: scan the locators spanned by the predicate as bound.
      final Iterator<PartitionLocator> scan = op.locatorScan(b.keyOrder, b.fromKey, b.toKey);

      // Forget the old locator; it stays null if the scan visits nothing.
      lastLocator = null;

      // Remember which scale-out index the scan is for.
      lastKeyOrder = b.keyOrder;

      while (scan.hasNext()) {

        final PartitionLocator loc = scan.next();

        // Track the most recently visited locator for the fast path.
        lastLocator = loc;

        if (log.isTraceEnabled()) {
          log.trace(
              "adding bindingSet to buffer: asBound="
                  + b.asBound
                  + ", partitionId="
                  + loc.getPartitionId()
                  + ", dataService="
                  + loc.getDataServiceUUID()
                  + ", bindingSet="
                  + b.bindingSet);
        }

        // The binding set is copied to every partition spanned by the scan.
        op.getBuffer(loc).add(new IBindingSet[] {b.bindingSet});
      }
    }
  }
Example #2
0
  /**
   * Now that the index partition is compact, decide whether to take an after action such as a
   * move, join, split, tailSplit or scatterSplit. All of these operations are much cheaper while
   * the index is compact, which is why we do them here.
   *
   * <p>The candidates are considered in a fixed order and the first one whose preconditions are
   * satisfied wins: scatter split, tail split (currently disabled), split, join (local join or
   * move-to-join with the right sibling), and finally a move to shed load.
   *
   * <p>Note: asynchronous overflow processing WILL NOT complete until the CompactingMergeTask is
   * done. This means that we will still be reading from the same journal. As long as we are reading
   * from the same ordered set of resources the lastCommitTime chosen here is somewhat arbitrary.
   *
   * <p>FIXME Concurrent operations can replace the view definition. However, what would not be good
   * is if they changed the set of resources in the view. The AtomicUpdate of the after action task
   * MUST check for this precondition (same set of resources in the view) and abort (and clean up
   * any intermediate files) if the precondition has been violated (no harm is done if we abort,
   * just some lost work).
   *
   * @return The task to be executed as the after action -or- <code>null</code> if no after action
   *     was chosen.
   * @todo split + move and friends seem unnecessarily complicated. We can just move anything that
   *     is compact. [Clean up the tasks to remove this stuff.]
   * @todo We might be better off running {@link #chooseAfterActionTask()} from inside of the atomic
   *     update and then doing any work there while we have the lock on the shard. This will prevent
   *     any new data from building up and can help ensure that the preconditions for the operation
   *     remain valid. This might also help simplify the HA design.
   * @todo Once we have flow control on writes we can save the DS a lot of work by not accepting new
   *     writes for an index partition when we are going to compact it, move it, split it, etc.
   */
  private AbstractTask<?> chooseAfterActionTask() {

    /*
     * The updated view metadata as of the last commit time on the live
     * journal.
     *
     * Note: This local shadows the [vmd] field. Everything below uses the
     * refreshed view; the field is only consulted for the index name.
     */
    final ViewMetadata vmd =
        new ViewMetadata(
            resourceManager,
            resourceManager.getLiveJournal().getLastCommitTime(),
            this.vmd.name,
            resourceManager.getIndexCounters(this.vmd.name));

    /*
     * Scatter split?
     *
     * Note: Scatter splits are considered before tail splits and normal
     * splits since they can only be taken when there is a single index
     * partition for a scale-out index. The other kinds of splits are used
     * once the index has already been distributed onto the cluster by a
     * scatter split.
     */
    {
      final ScatterSplitConfiguration ssc = vmd.indexMetadata.getScatterSplitConfiguration();

      if ( // only a single index partition?
      (vmd.getIndexPartitionCount() == 1L) //
          // scatter splits enabled for service
          && resourceManager.scatterSplitEnabled //
          // scatter splits enabled for index
          && ssc.isEnabled() //
          // The view is compact (only one segment).
          && vmd.compactView //
          // trigger scatter split before too much data builds up in one place.
          && vmd.getPercentOfSplit() >= ssc.getPercentOfSplitThreshold()) {

        // Target data services for the new index partitions.
        final UUID[] moveTargets = getScatterSplitTargets(ssc);

        if (moveTargets != null) {

          // #of splits (a configured count of ZERO means "two per data service").
          final int nsplits =
              ssc.getIndexPartitionCount() == 0 //
                  ? (2 * moveTargets.length) // two per data service.
                  : ssc.getIndexPartitionCount() //
              ;

          if (log.isInfoEnabled()) {
            log.info("will scatter: " + vmd);
          }

          // scatter split task.
          return new ScatterSplitTask(vmd, nsplits, moveTargets);
        }
      }
    }

    /*
     * Tail split?
     *
     * Note: We can do a tail split as long as we are "close" to a full
     * index partition. We have an expectation that the head of the split
     * will be over the minimum capacity. While the tail of the split MIGHT
     * be under the minimum capacity, if there are continued heavy writes on
     * the tail then it should reach the minimum capacity for an index
     * partition by the time the live journal overflows again.
     *
     * Note: This branch is explicitly disabled by the [&& false] term. See
     * the FIXME below.
     */
    if (vmd.isTailSplit() && false) {

      /*
       * FIXME The current tailSplit implementation operates against the
       * BTree, NOT the FusedView and NOT the IndexSegment. It needs to be
       * refactored before it can be an after action for a compacting
       * merge.
       *
       * It is written to identify the separator key based on an
       * examination of the mutable BTree. Once it has the separator key
       * it then does a normal build for each key-range. [@todo It
       * probably should use a compacting merge in order to avoid sharing
       * index segments across shards.]
       */

      if (log.isInfoEnabled()) {
        log.info("Will tailSplit: " + vmd.name);
      }

      return new SplitTailTask(vmd, null /* moveTarget */);
    }

    /*
     * Should split?
     *
     * Note: Split is NOT allowed if the index is currently being moved
     * onto this data service. Split, join, and move are all disallowed
     * until the index partition move is complete since each of them
     * would cause the index partition to become invalidated.
     */
    if (vmd.getPercentOfSplit() > 1.0) {

      if (log.isInfoEnabled()) {
        log.info("will split  : " + vmd);
      }

      return new SplitIndexPartitionTask(vmd, (UUID) null /* moveTarget */);
    }

    /*
     * Join undercapacity shard (either with local rightSibling or move to
     * join with remote rightSibling).
     *
     * If the rightSibling of an undercapacity index partition is also local
     * then a {@link JoinIndexPartitionTask} is used to join those index
     * partitions.
     *
     * If the rightSibling of an undercapacity index partition is remote,
     * then a {@link MoveTask} is created to move the undercapacity index
     * partition to the remote data service.
     *
     * Note: joins are only considered when the rightSibling of an index
     * partition exists. The last index partition has [rightSeparatorKey ==
     * null] and there is no rightSibling for that index partition.
     *
     * @todo What kinds of guarantees do we have that a local rightSibling
     * will be around by the time the JoinIndexPartitionTask runs?
     *
     * @todo This has even more assumptions about [lastCommitTime] than the
     * other tasks. All these tasks need to be reviewed to make sure that
     * there are no gaps created by this refactor. Running these after
     * action tasks while we hold the write lock on the source shard could
     * probably help us to reduce the possibility of any such problems but
     * might require a revisit / refactor / simplification of the tasks.
     *
     * FIXME Make sure that we are not running compacting merges as part of
     * the split, scatter split and other tasks. Some tasks used to do this
     * in order to have a compact view.
     */
    if (resourceManager.joinsEnabled
        && vmd.pmd.getRightSeparatorKey() != null
        && vmd.getPercentOfSplit() < resourceManager.percentOfJoinThreshold) {

      final String scaleOutIndexName = vmd.indexMetadata.getName();

      final PartitionLocator rightSiblingLocator =
          getRightSiblingLocator(scaleOutIndexName, vmd.commitTime);

      if (rightSiblingLocator != null) {

        final UUID targetDataServiceUUID = rightSiblingLocator.getDataServiceUUID();

        // the underutilized index partition.
        final String leftSiblingName =
            DataService.getIndexPartitionName(scaleOutIndexName, vmd.pmd.getPartitionId());

        // its right sibling (may be local or remote).
        final String rightSiblingName =
            DataService.getIndexPartitionName(
                scaleOutIndexName, rightSiblingLocator.getPartitionId());

        final String[] resources = new String[] {leftSiblingName, rightSiblingName};

        if (resourceManager.getDataServiceUUID().equals(targetDataServiceUUID)) {

          /*
           * JOIN underutilized index partition with its local
           * rightSibling.
           *
           * Note: This is only joining two index partitions at a
           * time. It's possible to do more than that if it happens
           * that N > 2 underutilized sibling index partitions are on
           * the same data service, but that is a relatively unlikely
           * combination of events.
           */

          if (log.isInfoEnabled()) {
            log.info("Will JOIN: " + Arrays.toString(resources));
          }

          // View of the right sibling as of the same commit time.
          final ViewMetadata vmd2 =
              new ViewMetadata(
                  resourceManager,
                  vmd.commitTime,
                  rightSiblingName,
                  resourceManager.getIndexCounters(rightSiblingName));

          return new JoinIndexPartitionTask(
              resourceManager, vmd.commitTime, resources, new ViewMetadata[] {vmd, vmd2});

        } else {

          /*
           * MOVE underutilized index partition to data service
           * hosting the right sibling.
           *
           * @todo The decision to join shards is asymmetric (an
           * undercapacity shard is moved to its rightSibling).
           * However, it is possible that its rightSibling was also
           * undercapacity and was either moved to or locally joined
           * with its rightSibling (in which case its partition
           * identifier would have been changed). To avoid these edge
           * cases there could be a global synchronous agreement for
           * move/join decisions
           */

          if (log.isInfoEnabled()) {

            // Only resolve the service name when it will be logged.
            final String targetDataServiceName = getDataServiceNameForLog(targetDataServiceUUID);

            log.info(
                "willMoveToJoinWithRightSibling"
                    + "( "
                    + vmd.name
                    + " -> "
                    + targetDataServiceName //
                    + ", leftSibling="
                    + resources[0] //
                    + ", rightSibling="
                    + resources[1] //
                    + ")");
          }

          return new MoveTask(vmd, targetDataServiceUUID);
        }
      } // rightSibling != null
    } // if(join)

    /*
     * Move (to shed or redistribute load).
     *
     * @todo We should prefer to move smaller shards (faster to move) or
     * "hotter" shards (sheds more workload). There should be a way to
     * estimate how much workload will be transferred so we know when we are
     * done.
     *
     * FIXME We should limit the #of shards that we move in a given period
     * of time to allow both this host and the target host an opportunity to
     * adapt to their new load. [An exception would be if this host was
     * critically overloaded, but that should probably be handled by
     * different logic.]
     */
    // Note: assigned inside of the condition so the load balancer is only
    // looked up once the cheaper local tests have passed.
    ILoadBalancerService loadBalancerService = null;
    if (vmd.getPercentOfSplit() < resourceManager.maximumMovePercentOfSplit
        && resourceManager.maximumMovesPerTarget != 0
        && resourceManager.getLiveJournal().getName2Addr().rangeCount()
            > resourceManager.minimumActiveIndexPartitions
        && (loadBalancerService = getLoadBalancerService()) != null
        && shouldMove(loadBalancerService)) {

      // the UUID of this data service.
      final UUID sourceServiceUUID = resourceManager.getDataServiceUUID();

      // Obtain UUID of a relatively underutilized data service.
      final UUID targetDataServiceUUID = getMoveTarget(sourceServiceUUID, loadBalancerService);

      if (targetDataServiceUUID != null) {

        if (log.isInfoEnabled()) {

          // Only resolve the service name when it will be logged.
          final String targetDataServiceName = getDataServiceNameForLog(targetDataServiceUUID);

          log.info("willMove" + "( " + vmd.name + " -> " + targetDataServiceName + ")");
        }

        // Move the shard to the target host.
        return new MoveTask(vmd, targetDataServiceUUID);
      }
    }

    // No after action was chosen.
    return null;
  }

  /**
   * Best effort resolution of a label for a data service for use in log messages.
   *
   * @param dataServiceUUID The data service of interest.
   * @return The service name reported by the data service -or- the string value of the UUID if
   *     the service name could not be obtained for any reason.
   */
  private String getDataServiceNameForLog(final UUID dataServiceUUID) {

    try {

      return resourceManager.getFederation().getDataService(dataServiceUUID).getServiceName();

    } catch (Throwable t) {

      /*
       * Deliberately ignored. The name is only used to decorate a log
       * message, so a failure to resolve it must not prevent the after
       * action from being chosen.
       */
      return dataServiceUUID.toString();
    }
  }