/**
 * Algorithm:
 * - we put the query into the execution pool
 * - after x ms, if we don't have a result, we add the queries for the secondary replicas
 * - we take the first answer
 * - when done, we cancel what's left. Cancelling means:
 *   - removing a call from the pool if it has not started yet
 *   - interrupting a call if it has started
 * Client side, we need to take into account that:
 * - a call is not executed immediately after being put into the pool
 * - a call is a thread; let's not multiply the number of threads by the number of replicas
 * Server side, if we can cancel a call while it's still in the handler pool, it's much
 * better, as a call can take some I/O.
 *
 * <p>Globally, the number of retries, timeouts and so on still applies, but per replica,
 * not globally. We continue until all retries are done, or all timeouts are exceeded.
 */
public synchronized Result call()
    throws DoNotRetryIOException, InterruptedIOException, RetriesExhaustedException {
  boolean isTargetReplicaSpecified = (get.getReplicaId() >= 0);

  RegionLocations rl = getRegionLocations(true,
      (isTargetReplicaSpecified ? get.getReplicaId() : RegionReplicaUtil.DEFAULT_REPLICA_ID),
      cConnection, tableName, get.getRow());
  ResultBoundedCompletionService<Result> cs =
      new ResultBoundedCompletionService<Result>(this.rpcRetryingCallerFactory, pool, rl.size());

  if (isTargetReplicaSpecified) {
    addCallsForReplica(cs, rl, get.getReplicaId(), get.getReplicaId());
  } else {
    addCallsForReplica(cs, rl, 0, 0);
    try {
      // wait for the timeout to see whether the primary responds back
      Future<Result> f = cs.poll(timeBeforeReplicas, TimeUnit.MICROSECONDS); // Yes, microseconds
      if (f != null) {
        return f.get(); // great, we got a response
      }
    } catch (ExecutionException e) {
      throwEnrichedException(e, retries);
    } catch (CancellationException e) {
      throw new InterruptedIOException();
    } catch (InterruptedException e) {
      throw new InterruptedIOException();
    }

    // submit calls for all of the secondaries at once
    addCallsForReplica(cs, rl, 1, rl.size() - 1);
  }

  try {
    try {
      Future<Result> f = cs.take();
      return f.get();
    } catch (ExecutionException e) {
      throwEnrichedException(e, retries);
    }
  } catch (CancellationException e) {
    throw new InterruptedIOException();
  } catch (InterruptedException e) {
    throw new InterruptedIOException();
  } finally {
    // We get here because we were interrupted or because one or more of the
    // calls succeeded or failed. In all cases, we stop all our tasks.
    cs.cancelAll();
  }

  return null; // unreachable
}
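// What follows is a minimal, self-contained sketch of the hedged-read pattern implemented
// above, using only java.util.concurrent and none of the HBase types. The class name
// SpeculativeReadSketch and the primary/secondaries/primaryTimeoutMs names are hypothetical;
// the real implementation also threads retries and per-replica cancellation through
// ResultBoundedCompletionService.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

class SpeculativeReadSketch {
  /**
   * Submits the primary call, waits primaryTimeoutMs for it, then hedges by submitting
   * the secondary calls; the first completed call wins and everything else is cancelled.
   */
  static <T> T call(ExecutorService pool, Callable<T> primary, List<Callable<T>> secondaries,
      long primaryTimeoutMs) throws Exception {
    CompletionService<T> cs = new ExecutorCompletionService<T>(pool);
    List<Future<T>> submitted = new ArrayList<Future<T>>();
    submitted.add(cs.submit(primary));
    try {
      Future<T> first = cs.poll(primaryTimeoutMs, TimeUnit.MILLISECONDS);
      if (first != null) {
        return first.get(); // the primary answered in time
      }
      for (Callable<T> secondary : secondaries) {
        submitted.add(cs.submit(secondary)); // fan out to the replicas
      }
      return cs.take().get(); // first answer wins, primary or secondary
    } finally {
      // Interrupt calls that started and drop the ones still queued; cancelling a
      // call that already completed is a harmless no-op.
      for (Future<T> f : submitted) {
        f.cancel(true);
      }
    }
  }
}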
/**
 * Creates the calls and submits them.
 *
 * @param cs the completion service to use for submitting
 * @param rl the region locations
 * @param min the id of the first replica, inclusive
 * @param max the id of the last replica, inclusive
 */
private void addCallsForReplica(ResultBoundedCompletionService<Result> cs, RegionLocations rl,
    int min, int max) {
  for (int id = min; id <= max; id++) {
    HRegionLocation hrl = rl.getRegionLocation(id);
    ReplicaRegionServerCallable callOnReplica = new ReplicaRegionServerCallable(id, hrl);
    cs.submit(callOnReplica, callTimeout, id);
  }
}
private void printLocations(Result r) {
  if (r == null) {
    LOG.info("FAILED FOR null Result");
    return;
  }
  LOG.info("FAILED FOR " + resultToString(r) + " Stale " + r.isStale());
  if (r.getRow() == null) {
    return;
  }
  RegionLocations rl = null;
  try {
    rl = ((ClusterConnection) connection).locateRegion(tableName, r.getRow(), true, true);
  } catch (IOException e) {
    LOG.warn("Couldn't get locations for row " + Bytes.toString(r.getRow()));
    return; // without locations there is nothing more to print
  }
  HRegionLocation[] locations = rl.getRegionLocations();
  for (HRegionLocation h : locations) {
    LOG.info("LOCATION " + h);
  }
}
/**
 * Two responsibilities:
 * - if the call is already completed (by another replica), stop the retries
 * - set the location to the right region, depending on the replica
 */
@Override
public void prepare(final boolean reload) throws IOException {
  if (controller.isCanceled()) return;

  if (Thread.interrupted()) {
    throw new InterruptedIOException();
  }

  if (reload || location == null) {
    RegionLocations rl = getRegionLocations(false, id, cConnection, tableName, get.getRow());
    location = id < rl.size() ? rl.getRegionLocation(id) : null;
  }

  if (location == null || location.getServerName() == null) {
    // With this exception, there will be a retry. The location can be null for a replica
    // when the table is created or after a split.
    throw new HBaseIOException("There is no location for replica id #" + id);
  }

  ServerName dest = location.getServerName();

  setStub(cConnection.getClient(dest));
}
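// The controller.isCanceled() check at the top of prepare() is what lets the first replica
// to answer stop the retry loops of its siblings. Below is a stripped-down sketch of that
// contract, under a hypothetical CancellableCall type (the real caller invokes prepare()
// before every retry attempt):

import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.concurrent.atomic.AtomicBoolean;

abstract class CancellableCall {
  // Shared among all replicas of one logical read; set by whichever replica answers first.
  private final AtomicBoolean cancelled;

  CancellableCall(AtomicBoolean cancelled) {
    this.cancelled = cancelled;
  }

  /** Invoked before each retry; exits early once another replica has already won. */
  void prepare() throws IOException {
    if (cancelled.get()) {
      return; // the result was produced elsewhere: skip the lookup and the RPC
    }
    if (Thread.interrupted()) {
      throw new InterruptedIOException();
    }
    locate(); // resolve this replica's location; throwing here triggers a retry
  }

  /** Location lookup; may legitimately fail right after a create or a split. */
  abstract void locate() throws IOException;
}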
@Override
public Result[] call(int timeout) throws IOException {
  // If the active replica callable was closed somewhere, invoke the RPC to
  // really close it. This applies to regular scanners: we make a couple
  // of RPCs to a RegionServer, and when that region is exhausted, we set
  // the closed flag. Then an RPC is required to actually close the scanner.
  if (currentScannerCallable != null && currentScannerCallable.closed) {
    // For closing we target that exact scanner (and do not fall back to another
    // replica as in the case of normal reads)
    if (LOG.isDebugEnabled()) {
      LOG.debug("Closing scanner " + currentScannerCallable.scannerId);
    }
    Result[] r = currentScannerCallable.call(timeout);
    currentScannerCallable = null;
    return r;
  }
  // We need to do the following:
  // 1. When a scan goes out to a certain replica (default or not), we need to
  //    continue to hit it until there is a failure. So store the last successfully
  //    invoked replica.
  // 2. We should close the "losing" scanners (scanners other than the ones we hear
  //    back from first).
  RegionLocations rl = RpcRetryingCallerWithReadReplicas.getRegionLocations(true,
      RegionReplicaUtil.DEFAULT_REPLICA_ID, cConnection, tableName,
      currentScannerCallable.getRow());

  // Allocate a bounded completion pool of some multiple of the number of replicas.
  // We want to accommodate some RPCs for redundant replica scans that are still in progress.
  BoundedCompletionService<Pair<Result[], ScannerCallable>> cs =
      new BoundedCompletionService<Pair<Result[], ScannerCallable>>(pool, rl.size() * 5);

  List<ExecutionException> exceptions = null;
  int submitted = 0, completed = 0;
  AtomicBoolean done = new AtomicBoolean(false);
  replicaSwitched.set(false);
  // submit the call for the primary replica
  submitted += addCallsForCurrentReplica(cs, rl);

  try {
    // wait for the timeout to see whether the primary responds back
    Future<Pair<Result[], ScannerCallable>> f =
        cs.poll(timeBeforeReplicas, TimeUnit.MICROSECONDS); // Yes, microseconds
    if (f != null) {
      Pair<Result[], ScannerCallable> r = f.get();
      if (r != null && r.getSecond() != null) {
        updateCurrentlyServingReplica(r.getSecond(), r.getFirst(), done, pool);
      }
      return r == null ? null : r.getFirst(); // great, we got a response
    }
  } catch (ExecutionException e) {
    // The primary call failed with RetriesExhaustedException or DoNotRetryIOException,
    // but the secondaries might still succeed. Continue on the replica RPCs.
    exceptions = new ArrayList<ExecutionException>(rl.size());
    exceptions.add(e);
    completed++;
  } catch (CancellationException e) {
    throw new InterruptedIOException(e.getMessage());
  } catch (InterruptedException e) {
    throw new InterruptedIOException(e.getMessage());
  }

  // submit calls for all of the secondaries at once
  // TODO: this may be overkill for large region replication
  submitted += addCallsForOtherReplicas(cs, rl, 0, rl.size() - 1);

  try {
    while (completed < submitted) {
      try {
        Future<Pair<Result[], ScannerCallable>> f = cs.take();
        Pair<Result[], ScannerCallable> r = f.get();
        if (r != null && r.getSecond() != null) {
          updateCurrentlyServingReplica(r.getSecond(), r.getFirst(), done, pool);
        }
        return r == null ? null : r.getFirst(); // great, we got an answer
      } catch (ExecutionException e) {
        // If not cancelled or interrupted, wait until all RPCs are done.
        // One of the tasks failed; save the exception for later.
        if (exceptions == null) {
          exceptions = new ArrayList<ExecutionException>(rl.size());
        }
        exceptions.add(e);
        completed++;
      }
    }
  } catch (CancellationException e) {
    throw new InterruptedIOException(e.getMessage());
  } catch (InterruptedException e) {
    throw new InterruptedIOException(e.getMessage());
  } finally {
    // We get here because we were interrupted or because one or more of the
    // calls succeeded or failed. In all cases, we stop all our tasks.
    cs.cancelAll(true);
  }

  if (exceptions != null && !exceptions.isEmpty()) {
    RpcRetryingCallerWithReadReplicas.throwEnrichedException(exceptions.get(0),
        retries); // just rethrow the first exception for now.
  }
  return null; // unreachable
}
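// Points 1 and 2 in the comments above ("stick with the winning replica, close the losers")
// reduce to the sketch below. The names ScannerSwitchSketch and onFirstResponse are
// hypothetical; the real updateCurrentlyServingReplica also hands the scan state over to
// currentScannerCallable.

import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicBoolean;

class ScannerSwitchSketch {
  // Replica we heard back from first; subsequent next() RPCs are pinned to it.
  volatile int servingReplica;

  void onFirstResponse(final int winnerId, List<AutoCloseable> scanners,
      ExecutorService pool, AtomicBoolean done) {
    if (!done.compareAndSet(false, true)) {
      return; // another response already won the race
    }
    servingReplica = winnerId;
    for (int id = 0; id < scanners.size(); id++) {
      if (id == winnerId) {
        continue;
      }
      final AutoCloseable loser = scanners.get(id);
      // Closing a scanner is itself an RPC, so do it off the caller's thread.
      pool.submit(new Runnable() {
        @Override
        public void run() {
          try {
            loser.close();
          } catch (Exception e) {
            // best effort: the server will expire the scanner lease anyway
          }
        }
      });
    }
  }
}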
public void append(TableName tableName, byte[] encodedRegionName, byte[] row,
    List<Entry> entries) throws IOException {
  if (disabledAndDroppedTables.getIfPresent(tableName) != null) {
    if (LOG.isTraceEnabled()) {
      LOG.trace("Skipping " + entries.size() + " entries because table " + tableName
          + " is cached as a disabled or dropped table");
      for (Entry entry : entries) {
        LOG.trace("Skipping : " + entry);
      }
    }
    sink.getSkippedEditsCounter().addAndGet(entries.size());
    return;
  }

  // If the table is disabled or dropped, we should not replay the entries. However, we
  // might not know whether the table is disabled until we invalidate the cache and check
  // from meta.
  RegionLocations locations = null;
  boolean useCache = true;
  while (true) {
    // get the replicas of the primary region
    try {
      locations = RegionReplicaReplayCallable.getRegionLocations(connection, tableName, row,
          useCache, 0);

      if (locations == null) {
        throw new HBaseIOException("Cannot locate locations for " + tableName + ", row:"
            + Bytes.toStringBinary(row));
      }
    } catch (TableNotFoundException e) {
      if (LOG.isTraceEnabled()) {
        LOG.trace("Skipping " + entries.size() + " entries because table " + tableName
            + " is dropped. Adding table to cache.");
        for (Entry entry : entries) {
          LOG.trace("Skipping : " + entry);
        }
      }
      disabledAndDroppedTables.put(tableName, Boolean.TRUE); // put to cache. Value ignored
      // skip this entry
      sink.getSkippedEditsCounter().addAndGet(entries.size());
      return;
    }

    // Check whether we should still replay this entry. If the regions have changed, or the
    // entry does not come from the primary region, filter it out.
    HRegionLocation primaryLocation = locations.getDefaultRegionLocation();
    if (!Bytes.equals(primaryLocation.getRegionInfo().getEncodedNameAsBytes(),
        encodedRegionName)) {
      if (useCache) {
        useCache = false;
        continue; // this will retry the location lookup
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace("Skipping " + entries.size() + " entries in table " + tableName
            + " because located region " + primaryLocation.getRegionInfo().getEncodedName()
            + " is different from the original region "
            + Bytes.toStringBinary(encodedRegionName) + " from WALEdit");
        for (Entry entry : entries) {
          LOG.trace("Skipping : " + entry);
        }
      }
      sink.getSkippedEditsCounter().addAndGet(entries.size());
      return;
    }
    break;
  }

  if (locations.size() == 1) {
    // only the primary region exists; nothing to replicate to
    return;
  }

  ArrayList<Future<ReplicateWALEntryResponse>> tasks =
      new ArrayList<Future<ReplicateWALEntryResponse>>(locations.size() - 1);

  // All the passed entries should belong to a single region, because they come from the
  // EntryBuffers, which is split per region. But the region might have split or merged
  // (unlike the log recovery case).
  for (int replicaId = 0; replicaId < locations.size(); replicaId++) {
    HRegionLocation location = locations.getRegionLocation(replicaId);
    if (!RegionReplicaUtil.isDefaultReplica(replicaId)) {
      HRegionInfo regionInfo = location == null
          ? RegionReplicaUtil.getRegionInfoForReplica(
              locations.getDefaultRegionLocation().getRegionInfo(), replicaId)
          : location.getRegionInfo();
      RegionReplicaReplayCallable callable = new RegionReplicaReplayCallable(connection,
          rpcControllerFactory, tableName, location, regionInfo, row, entries,
          sink.getSkippedEditsCounter());
      Future<ReplicateWALEntryResponse> task = pool.submit(
          new RetryingRpcCallable<ReplicateWALEntryResponse>(rpcRetryingCallerFactory,
              callable, operationTimeout));
      tasks.add(task);
    }
  }

  boolean tasksCancelled = false;
  for (Future<ReplicateWALEntryResponse> task : tasks) {
    try {
      task.get();
    } catch (InterruptedException e) {
      throw new InterruptedIOException(e.getMessage());
    } catch (ExecutionException e) {
      Throwable cause = e.getCause();
      if (cause instanceof IOException) {
        // The table can be disabled or dropped at this time. For disabled tables, we have
        // no cheap mechanism to detect this case because meta does not contain this
        // information. HConnection.isTableDisabled() is a zk call which we cannot do for
        // every replay RPC. So instead we start the replay RPC with retries and check
        // whether the table is dropped or disabled, which might cause
        // SocketTimeoutException, RetriesExhaustedException or similar if we get an IOE.
        if (cause instanceof TableNotFoundException
            || connection.isTableDisabled(tableName)) {
          if (LOG.isTraceEnabled()) {
            LOG.trace("Skipping " + entries.size() + " entries in table " + tableName
                + " because received exception for dropped or disabled table", cause);
            for (Entry entry : entries) {
              LOG.trace("Skipping : " + entry);
            }
          }
          disabledAndDroppedTables.put(tableName, Boolean.TRUE); // put to cache for later.
          if (!tasksCancelled) {
            sink.getSkippedEditsCounter().addAndGet(entries.size());
            tasksCancelled = true; // so that we do not add to the skipped counter again
          }
          continue;
        }
        // otherwise rethrow
        throw (IOException) cause;
      }
      // unexpected exception
      throw new IOException(cause);
    }
  }
}
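// The while (true) / useCache loop in append() is a "trust the cache once, then force a
// refresh" pattern. Below is a generic sketch under hypothetical names, where
// lookup(useCache) stands in for RegionReplicaReplayCallable.getRegionLocations and
// matchesExpectation for the encoded-region-name comparison:

import java.io.IOException;

abstract class CachedLookupSketch<T> {
  /** Location lookup; when useCache is false, goes back to the source of truth (meta). */
  abstract T lookup(boolean useCache) throws IOException;

  /** True if the located region is the one the WAL entry was written against. */
  abstract boolean matchesExpectation(T location);

  /** Returns a matching location, or null if a fresh lookup still disagrees. */
  T resolve() throws IOException {
    boolean useCache = true;
    while (true) {
      T location = lookup(useCache);
      if (matchesExpectation(location)) {
        return location;
      }
      if (useCache) {
        useCache = false; // the cached entry may be stale: retry against meta directly
        continue;
      }
      return null; // still different after a refresh: the region really has changed
    }
  }
}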