/** * Retry the given RPC. * * @param rpc an RPC to retry or fail * @param exception an exception to propagate with the RPC */ private void failOrRetryRpc(final KuduRpc<?> rpc, final ConnectionResetException exception) { AsyncKuduClient.RemoteTablet tablet = rpc.getTablet(); // Note As of the time of writing (03/11/16), a null tablet doesn't make sense, if we see a null // tablet it's because we didn't set it properly before calling sendRpc(). if (tablet == null) { // Can't retry, dunno where this RPC should go. rpc.errback(exception); } else { kuduClient.handleRetryableError(rpc, exception); } }
/** * Provides different handling for various kinds of master errors: re-uses the mechanisms already * in place for handling tablet server errors as much as possible. * * @param rpc The original RPC call that triggered the error. * @param error The error the master sent. * @return An exception if we couldn't dispatch the error, or null. */ private Exception dispatchMasterErrorOrReturnException(KuduRpc rpc, Master.MasterErrorPB error) { WireProtocol.AppStatusPB.ErrorCode code = error.getStatus().getCode(); MasterErrorException ex = new MasterErrorException(uuid, error); if (error.getCode() == Master.MasterErrorPB.Code.NOT_THE_LEADER) { kuduClient.handleNotLeader(rpc, ex, this); } else if (code == WireProtocol.AppStatusPB.ErrorCode.SERVICE_UNAVAILABLE && (!(rpc instanceof GetMasterRegistrationRequest))) { // TODO: This is a crutch until we either don't have to retry RPCs going to the // same server or use retry policies. kuduClient.handleRetryableError(rpc, ex); } else { return ex; } return null; }
/** * Takes care of a few kinds of TS errors that we handle differently, like tablets or leaders * moving. Builds and returns an exception if we don't know what to do with it. * * @param rpc The original RPC call that triggered the error. * @param error The error the TS sent. * @return An exception if we couldn't dispatch the error, or null. */ private Exception dispatchTSErrorOrReturnException( KuduRpc rpc, Tserver.TabletServerErrorPB error) { WireProtocol.AppStatusPB.ErrorCode code = error.getStatus().getCode(); TabletServerErrorException ex = new TabletServerErrorException(uuid, error.getStatus()); if (error.getCode() == Tserver.TabletServerErrorPB.Code.TABLET_NOT_FOUND) { kuduClient.handleTabletNotFound(rpc, ex, this); // we're not calling rpc.callback() so we rely on the client to retry that RPC } else if (code == WireProtocol.AppStatusPB.ErrorCode.SERVICE_UNAVAILABLE) { kuduClient.handleRetryableError(rpc, ex); // The following two error codes are an indication that the tablet isn't a leader. } else if (code == WireProtocol.AppStatusPB.ErrorCode.ILLEGAL_STATE || code == WireProtocol.AppStatusPB.ErrorCode.ABORTED) { kuduClient.handleNotLeader(rpc, ex, this); } else { return ex; } return null; }
/** * The reason we are suppressing the unchecked conversions is because the KuduRpc is coming from a * collection that has RPCs with different generics, and there's no way to get "decoded" casted * correctly. The best we can do is to rely on the RPC to decode correctly, and to not pass an * Exception in the callback. */ @Override @SuppressWarnings("unchecked") protected Object decode( ChannelHandlerContext ctx, Channel chan, ChannelBuffer buf, VoidEnum voidEnum) { final long start = System.nanoTime(); final int rdx = buf.readerIndex(); LOG.debug("------------------>> ENTERING DECODE >>------------------"); try { buf = secureRpcHelper.handleResponse(buf, chan); } catch (SaslException e) { String message = getPeerUuidLoggingString() + "Couldn't complete the SASL handshake"; LOG.error(message); throw new NonRecoverableException(message, e); } if (buf == null) { return null; } CallResponse response = new CallResponse(buf); RpcHeader.ResponseHeader header = response.getHeader(); if (!header.hasCallId()) { final int size = response.getTotalResponseSize(); final String msg = getPeerUuidLoggingString() + "RPC response (size: " + size + ") doesn't" + " have a call ID: " + header + ", buf=" + Bytes.pretty(buf); LOG.error(msg); throw new NonRecoverableException(msg); } final int rpcid = header.getCallId(); @SuppressWarnings("rawtypes") final KuduRpc rpc = rpcs_inflight.get(rpcid); if (rpc == null) { final String msg = getPeerUuidLoggingString() + "Invalid rpcid: " + rpcid + " found in " + buf + '=' + Bytes.pretty(buf); LOG.error(msg); // The problem here is that we don't know which Deferred corresponds to // this RPC, since we don't have a valid ID. So we're hopeless, we'll // never be able to recover because responses are not framed, we don't // know where the next response will start... We have to give up here // and throw this outside of our Netty handler, so Netty will call our // exception handler where we'll close this channel, which will cause // all RPCs in flight to be failed. throw new NonRecoverableException(msg); } Pair<Object, Object> decoded = null; Exception exception = null; KuduException retryableHeaderException = null; if (header.hasIsError() && header.getIsError()) { RpcHeader.ErrorStatusPB.Builder errorBuilder = RpcHeader.ErrorStatusPB.newBuilder(); KuduRpc.readProtobuf(response.getPBMessage(), errorBuilder); RpcHeader.ErrorStatusPB error = errorBuilder.build(); if (error.getCode().equals(RpcHeader.ErrorStatusPB.RpcErrorCodePB.ERROR_SERVER_TOO_BUSY)) { // We can't return right away, we still need to remove ourselves from 'rpcs_inflight', so we // populate 'retryableHeaderException'. retryableHeaderException = new TabletServerErrorException(uuid, error); } else { String message = getPeerUuidLoggingString() + "Tablet server sent error " + error.getMessage(); exception = new NonRecoverableException(message); LOG.error(message); // can be useful } } else { try { decoded = rpc.deserialize(response, this.uuid); } catch (Exception ex) { exception = ex; } } if (LOG.isDebugEnabled()) { LOG.debug( getPeerUuidLoggingString() + "rpcid=" + rpcid + ", response size=" + (buf.readerIndex() - rdx) + " bytes" + ", " + actualReadableBytes() + " readable bytes left" + ", rpc=" + rpc); } { final KuduRpc<?> removed = rpcs_inflight.remove(rpcid); if (removed == null) { // The RPC we were decoding was cleaned up already, give up. throw new NonRecoverableException("RPC not found"); } } // This check is specifically for the ERROR_SERVER_TOO_BUSY case above. if (retryableHeaderException != null) { kuduClient.handleRetryableError(rpc, retryableHeaderException); return null; } // We can get this Message from within the RPC's expected type, // so convert it into an exception and nullify decoded so that we use the errback route. // Have to do it for both TS and Master errors. if (decoded != null) { if (decoded.getSecond() instanceof Tserver.TabletServerErrorPB) { Tserver.TabletServerErrorPB error = (Tserver.TabletServerErrorPB) decoded.getSecond(); exception = dispatchTSErrorOrReturnException(rpc, error); if (exception == null) { // It was taken care of. return null; } else { // We're going to errback. decoded = null; } } else if (decoded.getSecond() instanceof Master.MasterErrorPB) { Master.MasterErrorPB error = (Master.MasterErrorPB) decoded.getSecond(); exception = dispatchMasterErrorOrReturnException(rpc, error); if (exception == null) { // Exception was taken care of. return null; } else { decoded = null; } } } try { if (decoded != null) { assert !(decoded.getFirst() instanceof Exception); if (kuduClient.isStatisticsEnabled()) { rpc.updateStatistics(kuduClient.getStatistics(), decoded.getFirst()); } rpc.callback(decoded.getFirst()); } else { if (kuduClient.isStatisticsEnabled()) { rpc.updateStatistics(kuduClient.getStatistics(), null); } rpc.errback(exception); } } catch (Exception e) { LOG.debug( getPeerUuidLoggingString() + "Unexpected exception while handling RPC #" + rpcid + ", rpc=" + rpc + ", buf=" + Bytes.pretty(buf), e); } if (LOG.isDebugEnabled()) { LOG.debug( "------------------<< LEAVING DECODE <<------------------" + " time elapsed: " + ((System.nanoTime() - start) / 1000) + "us"); } return null; // Stop processing here. The Deferred does everything else. }