private DiscoveryNode findMaster() { ZenPing.PingResponse[] fullPingResponses = pingService.pingAndWait(pingTimeout); if (fullPingResponses == null) { logger.trace("No full ping responses"); return null; } if (logger.isTraceEnabled()) { StringBuilder sb = new StringBuilder("full ping responses:"); if (fullPingResponses.length == 0) { sb.append(" {none}"); } else { for (ZenPing.PingResponse pingResponse : fullPingResponses) { sb.append("\n\t--> ") .append("target [") .append(pingResponse.target()) .append("], master [") .append(pingResponse.master()) .append("]"); } } logger.trace(sb.toString()); } // filter responses List<ZenPing.PingResponse> pingResponses = Lists.newArrayList(); for (ZenPing.PingResponse pingResponse : fullPingResponses) { DiscoveryNode node = pingResponse.target(); if (masterElectionFilterClientNodes && (node.clientNode() || (!node.masterNode() && !node.dataNode()))) { // filter out the client node, which is a client node, or also one that is not data and not // master (effectively, client) } else if (masterElectionFilterDataNodes && (!node.masterNode() && node.dataNode())) { // filter out data node that is not also master } else { pingResponses.add(pingResponse); } } if (logger.isDebugEnabled()) { StringBuilder sb = new StringBuilder("filtered ping responses: (filter_client[") .append(masterElectionFilterClientNodes) .append("], filter_data[") .append(masterElectionFilterDataNodes) .append("])"); if (pingResponses.isEmpty()) { sb.append(" {none}"); } else { for (ZenPing.PingResponse pingResponse : pingResponses) { sb.append("\n\t--> ") .append("target [") .append(pingResponse.target()) .append("], master [") .append(pingResponse.master()) .append("]"); } } logger.debug(sb.toString()); } List<DiscoveryNode> pingMasters = newArrayList(); for (ZenPing.PingResponse pingResponse : pingResponses) { if (pingResponse.master() != null) { pingMasters.add(pingResponse.master()); } } Set<DiscoveryNode> possibleMasterNodes = Sets.newHashSet(); possibleMasterNodes.add(localNode); for (ZenPing.PingResponse pingResponse : pingResponses) { possibleMasterNodes.add(pingResponse.target()); } // if we don't have enough master nodes, we bail, even if we get a response that indicates // there is a master by other node, we don't see enough... if (!electMaster.hasEnoughMasterNodes(possibleMasterNodes)) { return null; } if (pingMasters.isEmpty()) { // lets tie break between discovered nodes DiscoveryNode electedMaster = electMaster.electMaster(possibleMasterNodes); if (localNode.equals(electedMaster)) { return localNode; } } else { DiscoveryNode electedMaster = electMaster.electMaster(pingMasters); if (electedMaster != null) { return electedMaster; } } return null; }
@Override public void messageReceived(ChannelHandlerContext ctx, MessageEvent e) throws Exception { Object m = e.getMessage(); if (!(m instanceof ChannelBuffer)) { ctx.sendUpstream(e); return; } ChannelBuffer buffer = (ChannelBuffer) m; int size = buffer.getInt(buffer.readerIndex() - 4); transportServiceAdapter.received(size + 6); // we have additional bytes to read, outside of the header boolean hasMessageBytesToRead = (size - (NettyHeader.HEADER_SIZE - 6)) != 0; int markedReaderIndex = buffer.readerIndex(); int expectedIndexReader = markedReaderIndex + size; // netty always copies a buffer, either in NioWorker in its read handler, where it copies to a // fresh // buffer, or in the cumlation buffer, which is cleaned each time StreamInput streamIn = ChannelBufferStreamInputFactory.create(buffer, size); long requestId = buffer.readLong(); byte status = buffer.readByte(); Version version = Version.fromId(buffer.readInt()); StreamInput wrappedStream; if (TransportStatus.isCompress(status) && hasMessageBytesToRead && buffer.readable()) { Compressor compressor = CompressorFactory.compressor(buffer); if (compressor == null) { int maxToRead = Math.min(buffer.readableBytes(), 10); int offset = buffer.readerIndex(); StringBuilder sb = new StringBuilder("stream marked as compressed, but no compressor found, first [") .append(maxToRead) .append("] content bytes out of [") .append(buffer.readableBytes()) .append("] readable bytes with message size [") .append(size) .append("] ") .append("] are ["); for (int i = 0; i < maxToRead; i++) { sb.append(buffer.getByte(offset + i)).append(","); } sb.append("]"); throw new ElasticsearchIllegalStateException(sb.toString()); } wrappedStream = CachedStreamInput.cachedHandlesCompressed(compressor, streamIn); } else { wrappedStream = CachedStreamInput.cachedHandles(streamIn); } wrappedStream.setVersion(version); if (TransportStatus.isRequest(status)) { String action = handleRequest(ctx.getChannel(), wrappedStream, requestId, version); if (buffer.readerIndex() != expectedIndexReader) { if (buffer.readerIndex() < expectedIndexReader) { logger.warn( "Message not fully read (request) for [{}] and action [{}], resetting", requestId, action); } else { logger.warn( "Message read past expected size (request) for [{}] and action [{}], resetting", requestId, action); } buffer.readerIndex(expectedIndexReader); } } else { TransportResponseHandler handler = transportServiceAdapter.remove(requestId); // ignore if its null, the adapter logs it if (handler != null) { if (TransportStatus.isError(status)) { handlerResponseError(wrappedStream, handler); } else { handleResponse(ctx.getChannel(), wrappedStream, handler); } } else { // if its null, skip those bytes buffer.readerIndex(markedReaderIndex + size); } if (buffer.readerIndex() != expectedIndexReader) { if (buffer.readerIndex() < expectedIndexReader) { logger.warn( "Message not fully read (response) for [{}] handler {}, error [{}], resetting", requestId, handler, TransportStatus.isError(status)); } else { logger.warn( "Message read past expected size (response) for [{}] handler {}, error [{}], resetting", requestId, handler, TransportStatus.isError(status)); } buffer.readerIndex(expectedIndexReader); } } wrappedStream.close(); }
private void doRecovery(final RecoveryStatus recoveryStatus) { assert recoveryStatus.sourceNode() != null : "can't do a recovery without a source node"; logger.trace("collecting local files for {}", recoveryStatus); Store.MetadataSnapshot metadataSnapshot = null; try { metadataSnapshot = recoveryStatus.store().getMetadataOrEmpty(); } catch (IOException e) { logger.warn("error while listing local files, recover as if there are none", e); metadataSnapshot = Store.MetadataSnapshot.EMPTY; } catch (Exception e) { // this will be logged as warning later on... logger.trace("unexpected error while listing local files, failing recovery", e); onGoingRecoveries.failRecovery( recoveryStatus.recoveryId(), new RecoveryFailedException(recoveryStatus.state(), "failed to list local files", e), true); return; } final StartRecoveryRequest request = new StartRecoveryRequest( recoveryStatus.shardId(), recoveryStatus.sourceNode(), clusterService.localNode(), false, metadataSnapshot, recoveryStatus.state().getType(), recoveryStatus.recoveryId()); final AtomicReference<RecoveryResponse> responseHolder = new AtomicReference<>(); try { logger.trace( "[{}][{}] starting recovery from {}", request.shardId().index().name(), request.shardId().id(), request.sourceNode()); recoveryStatus.indexShard().prepareForIndexRecovery(); recoveryStatus .CancellableThreads() .execute( new CancellableThreads.Interruptable() { @Override public void run() throws InterruptedException { responseHolder.set( transportService .submitRequest( request.sourceNode(), RecoverySource.Actions.START_RECOVERY, request, new FutureTransportResponseHandler<RecoveryResponse>() { @Override public RecoveryResponse newInstance() { return new RecoveryResponse(); } }) .txGet()); } }); final RecoveryResponse recoveryResponse = responseHolder.get(); assert responseHolder != null; final TimeValue recoveryTime = new TimeValue(recoveryStatus.state().getTimer().time()); // do this through ongoing recoveries to remove it from the collection onGoingRecoveries.markRecoveryAsDone(recoveryStatus.recoveryId()); if (logger.isTraceEnabled()) { StringBuilder sb = new StringBuilder(); sb.append('[') .append(request.shardId().index().name()) .append(']') .append('[') .append(request.shardId().id()) .append("] "); sb.append("recovery completed from ") .append(request.sourceNode()) .append(", took[") .append(recoveryTime) .append("]\n"); sb.append(" phase1: recovered_files [") .append(recoveryResponse.phase1FileNames.size()) .append("]") .append(" with total_size of [") .append(new ByteSizeValue(recoveryResponse.phase1TotalSize)) .append("]") .append(", took [") .append(timeValueMillis(recoveryResponse.phase1Time)) .append("], throttling_wait [") .append(timeValueMillis(recoveryResponse.phase1ThrottlingWaitTime)) .append(']') .append("\n"); sb.append(" : reusing_files [") .append(recoveryResponse.phase1ExistingFileNames.size()) .append("] with total_size of [") .append(new ByteSizeValue(recoveryResponse.phase1ExistingTotalSize)) .append("]\n"); sb.append(" phase2: start took [") .append(timeValueMillis(recoveryResponse.startTime)) .append("]\n"); sb.append(" : recovered [") .append(recoveryResponse.phase2Operations) .append("]") .append(" transaction log operations") .append(", took [") .append(timeValueMillis(recoveryResponse.phase2Time)) .append("]") .append("\n"); logger.trace(sb.toString()); } else { logger.debug( "{} recovery done from [{}], took [{}]", request.shardId(), recoveryStatus.sourceNode(), recoveryTime); } } catch (CancellableThreads.ExecutionCancelledException e) { logger.trace("recovery cancelled", e); } catch (Throwable e) { if (logger.isTraceEnabled()) { logger.trace( "[{}][{}] Got exception on recovery", e, request.shardId().index().name(), request.shardId().id()); } Throwable cause = ExceptionsHelper.unwrapCause(e); if (cause instanceof RecoveryEngineException) { // unwrap an exception that was thrown as part of the recovery cause = cause.getCause(); } // do it twice, in case we have double transport exception cause = ExceptionsHelper.unwrapCause(cause); if (cause instanceof RecoveryEngineException) { // unwrap an exception that was thrown as part of the recovery cause = cause.getCause(); } // here, we would add checks against exception that need to be retried (and not removeAndClean // in this case) if (cause instanceof IllegalIndexShardStateException || cause instanceof IndexNotFoundException || cause instanceof ShardNotFoundException) { // if the target is not ready yet, retry retryRecovery( recoveryStatus, "remote shard not ready", recoverySettings.retryDelayStateSync(), request); return; } if (cause instanceof DelayRecoveryException) { retryRecovery(recoveryStatus, cause, recoverySettings.retryDelayStateSync(), request); return; } if (cause instanceof ConnectTransportException) { logger.debug( "delaying recovery of {} for [{}] due to networking error [{}]", recoveryStatus.shardId(), recoverySettings.retryDelayNetwork(), cause.getMessage()); retryRecovery( recoveryStatus, cause.getMessage(), recoverySettings.retryDelayNetwork(), request); return; } if (cause instanceof IndexShardClosedException) { onGoingRecoveries.failRecovery( recoveryStatus.recoveryId(), new RecoveryFailedException(request, "source shard is closed", cause), false); return; } if (cause instanceof AlreadyClosedException) { onGoingRecoveries.failRecovery( recoveryStatus.recoveryId(), new RecoveryFailedException(request, "source shard is closed", cause), false); return; } onGoingRecoveries.failRecovery( recoveryStatus.recoveryId(), new RecoveryFailedException(request, e), true); } }