/** @author kimchy (shay.banon) */
public class RecoveryStatus {

  public static enum Stage {
    INIT,
    INDEX,
    TRANSLOG,
    FINALIZE,
    DONE
  }

  ConcurrentMap<String, IndexOutput> openIndexOutputs = ConcurrentCollections.newConcurrentMap();
  ConcurrentMap<String, String> checksums = ConcurrentCollections.newConcurrentMap();

  final long startTime = System.currentTimeMillis();
  long time;
  List<String> phase1FileNames;
  List<Long> phase1FileSizes;
  List<String> phase1ExistingFileNames;
  List<Long> phase1ExistingFileSizes;
  long phase1TotalSize;
  long phase1ExistingTotalSize;
  volatile Stage stage = Stage.INIT;
  volatile long currentTranslogOperations = 0;
  AtomicLong currentFilesSize = new AtomicLong();

  public long startTime() {
    return startTime;
  }

  public long time() {
    return this.time;
  }

  public long phase1TotalSize() {
    return phase1TotalSize;
  }

  public long phase1ExistingTotalSize() {
    return phase1ExistingTotalSize;
  }

  public Stage stage() {
    return stage;
  }

  public long currentTranslogOperations() {
    return currentTranslogOperations;
  }

  public long currentFilesSize() {
    return currentFilesSize.get();
  }
}
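/*
 * Illustrative sketch, not part of the original source: one plausible way recovery
 * code could drive the RecoveryStatus above through its stages. The transition
 * order and the byte/operation counts are assumptions for illustration, and the
 * sketch assumes same-package access to the package-private fields above.
 */
class RecoveryStatusUsageSketch {
  static void runThroughStages(RecoveryStatus status) {
    status.stage = RecoveryStatus.Stage.INDEX; // phase 1: copy index files
    status.currentFilesSize.addAndGet(1024L); // bytes received so far
    status.stage = RecoveryStatus.Stage.TRANSLOG; // phase 2: replay translog operations
    status.currentTranslogOperations = 100; // operations replayed so far
    status.stage = RecoveryStatus.Stage.FINALIZE;
    status.time = System.currentTimeMillis() - status.startTime(); // total elapsed time
    status.stage = RecoveryStatus.Stage.DONE;
  }
}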
/**
 * A {@link ZenPing} implementation which returns results based on a static in-memory map. This
 * allows pinging to be immediate and can be used to speed up tests.
 */
public final class MockZenPing extends AbstractComponent implements ZenPing {

  /**
   * A marker plugin used by {@link org.elasticsearch.node.MockNode} to indicate this mock zen
   * ping should be used.
   */
  public static class TestPlugin extends Plugin {}

  static final Map<ClusterName, Set<MockZenPing>> activeNodesPerCluster =
      ConcurrentCollections.newConcurrentMap();

  private volatile PingContextProvider contextProvider;

  @Inject
  public MockZenPing(Settings settings) {
    super(settings);
  }

  @Override
  public void start(PingContextProvider contextProvider) {
    this.contextProvider = contextProvider;
    assert contextProvider != null;
    boolean added = getActiveNodesForCurrentCluster().add(this);
    assert added;
  }

  @Override
  public void ping(PingListener listener, TimeValue timeout) {
    logger.info("pinging using mock zen ping");
    List<PingResponse> responseList =
        getActiveNodesForCurrentCluster()
            .stream()
            .filter(p -> p != this) // remove this as pings are not expected to return the local node
            .map(MockZenPing::getPingResponse)
            .collect(Collectors.toList());
    listener.onPing(responseList);
  }

  private ClusterName getClusterName() {
    return contextProvider.clusterState().getClusterName();
  }

  private PingResponse getPingResponse() {
    final ClusterState clusterState = contextProvider.clusterState();
    return new PingResponse(
        clusterState.nodes().getLocalNode(), clusterState.nodes().getMasterNode(), clusterState);
  }

  private Set<MockZenPing> getActiveNodesForCurrentCluster() {
    return activeNodesPerCluster.computeIfAbsent(
        getClusterName(), clusterName -> ConcurrentCollections.newConcurrentSet());
  }

  @Override
  public void close() {
    boolean found = getActiveNodesForCurrentCluster().remove(this);
    assert found;
  }
}
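/*
 * Self-contained sketch of the registry pattern MockZenPing relies on: a static
 * concurrent map from cluster name to the set of live nodes, so "pinging" is a
 * plain map lookup that excludes the local node. All names here are hypothetical
 * stand-ins, not the Elasticsearch API.
 */
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

final class InMemoryPingRegistrySketch {
  private static final Map<String, Set<InMemoryPingRegistrySketch>> CLUSTERS =
      new ConcurrentHashMap<>();

  private final String clusterName;

  InMemoryPingRegistrySketch(String clusterName) {
    this.clusterName = clusterName;
    // computeIfAbsent is atomic, so concurrently starting nodes cannot lose a registration
    CLUSTERS.computeIfAbsent(clusterName, n -> ConcurrentHashMap.newKeySet()).add(this);
  }

  /** Returns every other live node in this cluster, analogous to MockZenPing#ping. */
  Set<InMemoryPingRegistrySketch> ping() {
    Set<InMemoryPingRegistrySketch> others = ConcurrentHashMap.newKeySet();
    others.addAll(CLUSTERS.getOrDefault(clusterName, Set.of()));
    others.remove(this); // pings are not expected to return the local node
    return others;
  }

  void close() {
    Set<InMemoryPingRegistrySketch> members = CLUSTERS.get(clusterName);
    if (members != null) {
      members.remove(this);
    }
  }
}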
@Override
public BloomFilter filter(IndexReader reader, String fieldName, boolean asyncLoad) {
  int currentNumDocs = reader.numDocs();
  if (currentNumDocs == 0) {
    return BloomFilter.EMPTY;
  }
  ConcurrentMap<String, BloomFilterEntry> fieldCache = cache.get(reader.getFieldCacheKey());
  if (fieldCache == null) {
    synchronized (creationMutex) {
      fieldCache = cache.get(reader.getFieldCacheKey());
      if (fieldCache == null) {
        fieldCache = ConcurrentCollections.newConcurrentMap();
        cache.put(reader.getFieldCacheKey(), fieldCache);
      }
    }
  }
  BloomFilterEntry filter = fieldCache.get(fieldName);
  if (filter == null) {
    synchronized (fieldCache) {
      filter = fieldCache.get(fieldName);
      if (filter == null) {
        filter = new BloomFilterEntry(reader.numDocs(), BloomFilter.NONE);
        filter.loading.set(true);
        fieldCache.put(fieldName, filter);
        // now, do the async load of it...
        BloomFilterLoader loader = new BloomFilterLoader(reader, fieldName);
        if (asyncLoad) {
          threadPool.cached().execute(loader);
        } else {
          loader.run();
          filter = fieldCache.get(fieldName);
        }
      }
    }
  }
  // if we have too many deletes, we need to reload the bloom filter so it will be more effective
  // (note the explicit float division: with integer division the 0.6 threshold would be useless)
  if (filter.numDocs > 1000 && ((float) currentNumDocs / filter.numDocs) < 0.6f) {
    if (filter.loading.compareAndSet(false, true)) {
      // do the async loading
      BloomFilterLoader loader = new BloomFilterLoader(reader, fieldName);
      if (asyncLoad) {
        threadPool.cached().execute(loader);
      } else {
        loader.run();
        filter = fieldCache.get(fieldName);
      }
    }
  }
  return filter.filter;
}
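/*
 * Self-contained sketch of the double-checked pattern used in the method above:
 * probe the concurrent map without locking, and only on a miss take a lock,
 * re-check, and insert, so the expensive load runs at most once per key. The
 * class and method names are assumptions for illustration.
 */
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.function.Supplier;

final class OncePerKeyCacheSketch<K, V> {
  private final ConcurrentMap<K, V> cache = new ConcurrentHashMap<>();
  private final Object mutex = new Object();

  V get(K key, Supplier<V> loader) {
    V value = cache.get(key); // fast path, no locking
    if (value == null) {
      synchronized (mutex) {
        value = cache.get(key); // re-check: another thread may have won the race
        if (value == null) {
          value = loader.get(); // expensive load happens at most once per key
          cache.put(key, value);
        }
      }
    }
    return value;
  }
}
// On Java 8+, ConcurrentMap.computeIfAbsent collapses this into a single call; the
// explicit form above mirrors the filter method, which also wants the option of
// kicking the load off asynchronously instead of blocking the caller.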
/**
 * The scan context allows us to optimize readers we already processed during scanning. We do that
 * by keeping track of the count per reader, and once we are done with a reader, we no longer
 * process it by using a filter that returns a null docIdSet for that reader.
 */
public class ScanContext {

  private final ConcurrentMap<IndexReader, ReaderState> readerStates =
      ConcurrentCollections.newConcurrentMap();

  public void clear() {
    readerStates.clear();
  }

  public TopDocs execute(SearchContext context) throws IOException {
    ScanCollector collector =
        new ScanCollector(readerStates, context.from(), context.size(), context.trackScores());
    Query query = new FilteredQuery(context.query(), new ScanFilter(readerStates, collector));
    try {
      context.searcher().search(query, collector);
    } catch (ScanCollector.StopCollectingException e) {
      // all is well
    }
    return collector.topDocs();
  }

  static class ScanCollector extends SimpleCollector {

    private final ConcurrentMap<IndexReader, ReaderState> readerStates;
    private final int from;
    private final int to;
    private final ArrayList<ScoreDoc> docs;
    private final boolean trackScores;
    private Scorer scorer;
    private int docBase;
    private int counter;
    private IndexReader currentReader;
    private ReaderState readerState;

    ScanCollector(
        ConcurrentMap<IndexReader, ReaderState> readerStates,
        int from,
        int size,
        boolean trackScores) {
      this.readerStates = readerStates;
      this.from = from;
      this.to = from + size;
      this.trackScores = trackScores;
      this.docs = new ArrayList<>(size);
    }

    void incCounter(int count) {
      this.counter += count;
    }

    public TopDocs topDocs() {
      return new TopDocs(docs.size(), docs.toArray(new ScoreDoc[docs.size()]), 0f);
    }

    @Override
    public void setScorer(Scorer scorer) throws IOException {
      this.scorer = scorer;
    }

    @Override
    public void collect(int doc) throws IOException {
      if (counter >= from) {
        docs.add(new ScoreDoc(docBase + doc, trackScores ? scorer.score() : 0f));
      }
      readerState.count++;
      counter++;
      if (counter >= to) {
        throw StopCollectingException;
      }
    }

    @Override
    public void doSetNextReader(LeafReaderContext context) throws IOException {
      // if we have a reader state, and we haven't registered one already, register it
      // we need to check readerStates since even when the filter returns null, setNextReader is
      // still called for that reader (before)
      if (currentReader != null && !readerStates.containsKey(currentReader)) {
        assert readerState != null;
        readerState.done = true;
        readerStates.put(currentReader, readerState);
      }
      this.currentReader = context.reader();
      this.docBase = context.docBase;
      this.readerState = new ReaderState();
    }

    public static final RuntimeException StopCollectingException = new StopCollectingException();

    static class StopCollectingException extends RuntimeException {
      @Override
      public Throwable fillInStackTrace() {
        return null;
      }
    }
  }

  public static class ScanFilter extends Filter {

    private final ConcurrentMap<IndexReader, ReaderState> readerStates;
    private final ScanCollector scanCollector;

    public ScanFilter(
        ConcurrentMap<IndexReader, ReaderState> readerStates, ScanCollector scanCollector) {
      this.readerStates = readerStates;
      this.scanCollector = scanCollector;
    }

    @Override
    public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptedDocs) throws IOException {
      ReaderState readerState = readerStates.get(context.reader());
      if (readerState != null && readerState.done) {
        scanCollector.incCounter(readerState.count);
        return null;
      }
      return BitsFilteredDocIdSet.wrap(new AllDocIdSet(context.reader().maxDoc()), acceptedDocs);
    }
  }

  static class ReaderState {
    public int count;
    public boolean done;
  }
}
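/*
 * Sketch of the early-termination trick ScanCollector uses: one preallocated,
 * stack-trace-free exception aborts collection cheaply once enough documents
 * have been gathered, and the caller treats it as a normal "done" signal.
 * Names below are illustrative, not the original API.
 */
final class EarlyTerminationSketch {
  static final RuntimeException STOP = new StopException();

  static final class StopException extends RuntimeException {
    @Override
    public Throwable fillInStackTrace() {
      return null; // no stack capture: this exception is control flow, not an error
    }
  }

  static int collectUpTo(int[] docs, int limit) {
    int count = 0;
    try {
      for (int doc : docs) {
        count++;
        if (count >= limit) {
          throw STOP; // cheap to throw because no stack trace is ever filled in
        }
      }
    } catch (StopException expected) {
      // all is well: we simply stopped early
    }
    return count;
  }
}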
public class IndexFieldDataService extends AbstractIndexComponent { private static final String DISABLED_FORMAT = "disabled"; private static final String DOC_VALUES_FORMAT = "doc_values"; private static final String ARRAY_FORMAT = "array"; private static final String PAGED_BYTES_FORMAT = "paged_bytes"; private static final String FST_FORMAT = "fst"; private static final String COMPRESSED_FORMAT = "compressed"; private static final ImmutableMap<String, IndexFieldData.Builder> buildersByType; private static final ImmutableMap<String, IndexFieldData.Builder> docValuesBuildersByType; private static final ImmutableMap<Tuple<String, String>, IndexFieldData.Builder> buildersByTypeAndFormat; private final CircuitBreakerService circuitBreakerService; private final IndicesFieldDataCacheListener indicesFieldDataCacheListener; static { buildersByType = MapBuilder.<String, IndexFieldData.Builder>newMapBuilder() .put("string", new PagedBytesIndexFieldData.Builder()) .put("float", new FloatArrayIndexFieldData.Builder()) .put("double", new DoubleArrayIndexFieldData.Builder()) .put( "byte", new PackedArrayIndexFieldData.Builder() .setNumericType(IndexNumericFieldData.NumericType.BYTE)) .put( "short", new PackedArrayIndexFieldData.Builder() .setNumericType(IndexNumericFieldData.NumericType.SHORT)) .put( "int", new PackedArrayIndexFieldData.Builder() .setNumericType(IndexNumericFieldData.NumericType.INT)) .put( "long", new PackedArrayIndexFieldData.Builder() .setNumericType(IndexNumericFieldData.NumericType.LONG)) .put("geo_point", new GeoPointDoubleArrayIndexFieldData.Builder()) .put(ParentFieldMapper.NAME, new ParentChildIndexFieldData.Builder()) .put("binary", new DisabledIndexFieldData.Builder()) .immutableMap(); docValuesBuildersByType = MapBuilder.<String, IndexFieldData.Builder>newMapBuilder() .put("string", new DocValuesIndexFieldData.Builder()) .put( "float", new DocValuesIndexFieldData.Builder() .numericType(IndexNumericFieldData.NumericType.FLOAT)) .put( "double", new DocValuesIndexFieldData.Builder() .numericType(IndexNumericFieldData.NumericType.DOUBLE)) .put( "byte", new DocValuesIndexFieldData.Builder() .numericType(IndexNumericFieldData.NumericType.BYTE)) .put( "short", new DocValuesIndexFieldData.Builder() .numericType(IndexNumericFieldData.NumericType.SHORT)) .put( "int", new DocValuesIndexFieldData.Builder() .numericType(IndexNumericFieldData.NumericType.INT)) .put( "long", new DocValuesIndexFieldData.Builder() .numericType(IndexNumericFieldData.NumericType.LONG)) .put("geo_point", new GeoPointBinaryDVIndexFieldData.Builder()) .put("binary", new BytesBinaryDVIndexFieldData.Builder()) .immutableMap(); buildersByTypeAndFormat = MapBuilder.<Tuple<String, String>, IndexFieldData.Builder>newMapBuilder() .put(Tuple.tuple("string", PAGED_BYTES_FORMAT), new PagedBytesIndexFieldData.Builder()) .put(Tuple.tuple("string", FST_FORMAT), new FSTBytesIndexFieldData.Builder()) .put(Tuple.tuple("string", DOC_VALUES_FORMAT), new DocValuesIndexFieldData.Builder()) .put(Tuple.tuple("string", DISABLED_FORMAT), new DisabledIndexFieldData.Builder()) .put(Tuple.tuple("float", ARRAY_FORMAT), new FloatArrayIndexFieldData.Builder()) .put( Tuple.tuple("float", DOC_VALUES_FORMAT), new DocValuesIndexFieldData.Builder() .numericType(IndexNumericFieldData.NumericType.FLOAT)) .put(Tuple.tuple("float", DISABLED_FORMAT), new DisabledIndexFieldData.Builder()) .put(Tuple.tuple("double", ARRAY_FORMAT), new DoubleArrayIndexFieldData.Builder()) .put( Tuple.tuple("double", DOC_VALUES_FORMAT), new DocValuesIndexFieldData.Builder() 
.numericType(IndexNumericFieldData.NumericType.DOUBLE)) .put(Tuple.tuple("double", DISABLED_FORMAT), new DisabledIndexFieldData.Builder()) .put( Tuple.tuple("byte", ARRAY_FORMAT), new PackedArrayIndexFieldData.Builder() .setNumericType(IndexNumericFieldData.NumericType.BYTE)) .put( Tuple.tuple("byte", DOC_VALUES_FORMAT), new DocValuesIndexFieldData.Builder() .numericType(IndexNumericFieldData.NumericType.BYTE)) .put(Tuple.tuple("byte", DISABLED_FORMAT), new DisabledIndexFieldData.Builder()) .put( Tuple.tuple("short", ARRAY_FORMAT), new PackedArrayIndexFieldData.Builder() .setNumericType(IndexNumericFieldData.NumericType.SHORT)) .put( Tuple.tuple("short", DOC_VALUES_FORMAT), new DocValuesIndexFieldData.Builder() .numericType(IndexNumericFieldData.NumericType.SHORT)) .put(Tuple.tuple("short", DISABLED_FORMAT), new DisabledIndexFieldData.Builder()) .put( Tuple.tuple("int", ARRAY_FORMAT), new PackedArrayIndexFieldData.Builder() .setNumericType(IndexNumericFieldData.NumericType.INT)) .put( Tuple.tuple("int", DOC_VALUES_FORMAT), new DocValuesIndexFieldData.Builder() .numericType(IndexNumericFieldData.NumericType.INT)) .put(Tuple.tuple("int", DISABLED_FORMAT), new DisabledIndexFieldData.Builder()) .put( Tuple.tuple("long", ARRAY_FORMAT), new PackedArrayIndexFieldData.Builder() .setNumericType(IndexNumericFieldData.NumericType.LONG)) .put( Tuple.tuple("long", DOC_VALUES_FORMAT), new DocValuesIndexFieldData.Builder() .numericType(IndexNumericFieldData.NumericType.LONG)) .put(Tuple.tuple("long", DISABLED_FORMAT), new DisabledIndexFieldData.Builder()) .put( Tuple.tuple("geo_point", ARRAY_FORMAT), new GeoPointDoubleArrayIndexFieldData.Builder()) .put( Tuple.tuple("geo_point", DOC_VALUES_FORMAT), new GeoPointBinaryDVIndexFieldData.Builder()) .put(Tuple.tuple("geo_point", DISABLED_FORMAT), new DisabledIndexFieldData.Builder()) .put( Tuple.tuple("geo_point", COMPRESSED_FORMAT), new GeoPointCompressedIndexFieldData.Builder()) .put( Tuple.tuple("binary", DOC_VALUES_FORMAT), new BytesBinaryDVIndexFieldData.Builder()) .put(Tuple.tuple("binary", DISABLED_FORMAT), new DisabledIndexFieldData.Builder()) .immutableMap(); } private final IndicesFieldDataCache indicesFieldDataCache; private final ConcurrentMap<String, IndexFieldData<?>> loadedFieldData = ConcurrentCollections.newConcurrentMap(); private final Map<String, IndexFieldDataCache> fieldDataCaches = Maps.newHashMap(); // no need for concurrency support, always used under lock IndexService indexService; // public for testing public IndexFieldDataService(Index index, CircuitBreakerService circuitBreakerService) { this( index, ImmutableSettings.Builder.EMPTY_SETTINGS, new IndicesFieldDataCache( ImmutableSettings.Builder.EMPTY_SETTINGS, new IndicesFieldDataCacheListener(circuitBreakerService)), circuitBreakerService, new IndicesFieldDataCacheListener(circuitBreakerService)); } // public for testing public IndexFieldDataService( Index index, CircuitBreakerService circuitBreakerService, IndicesFieldDataCache indicesFieldDataCache) { this( index, ImmutableSettings.Builder.EMPTY_SETTINGS, indicesFieldDataCache, circuitBreakerService, new IndicesFieldDataCacheListener(circuitBreakerService)); } @Inject public IndexFieldDataService( Index index, @IndexSettings Settings indexSettings, IndicesFieldDataCache indicesFieldDataCache, CircuitBreakerService circuitBreakerService, IndicesFieldDataCacheListener indicesFieldDataCacheListener) { super(index, indexSettings); this.indicesFieldDataCache = indicesFieldDataCache; this.circuitBreakerService = circuitBreakerService; 
this.indicesFieldDataCacheListener = indicesFieldDataCacheListener; } // we need to "inject" the index service to not create cyclic dep public void setIndexService(IndexService indexService) { this.indexService = indexService; } public void clear() { synchronized (loadedFieldData) { for (IndexFieldData<?> fieldData : loadedFieldData.values()) { fieldData.clear(); } loadedFieldData.clear(); for (IndexFieldDataCache cache : fieldDataCaches.values()) { cache.clear(); } fieldDataCaches.clear(); } } public void clearField(String fieldName) { synchronized (loadedFieldData) { IndexFieldData<?> fieldData = loadedFieldData.remove(fieldName); if (fieldData != null) { fieldData.clear(); } IndexFieldDataCache cache = fieldDataCaches.remove(fieldName); if (cache != null) { cache.clear(); } } } public void clear(IndexReader reader) { synchronized (loadedFieldData) { for (IndexFieldData<?> indexFieldData : loadedFieldData.values()) { indexFieldData.clear(reader); } for (IndexFieldDataCache cache : fieldDataCaches.values()) { cache.clear(reader); } } } public void onMappingUpdate() { // synchronize to make sure to not miss field data instances that are being loaded synchronized (loadedFieldData) { // important: do not clear fieldDataCaches: the cache may be reused loadedFieldData.clear(); } } public <IFD extends IndexFieldData<?>> IFD getForField(FieldMapper<?> mapper) { final FieldMapper.Names fieldNames = mapper.names(); final FieldDataType type = mapper.fieldDataType(); final boolean docValues = mapper.hasDocValues(); IndexFieldData<?> fieldData = loadedFieldData.get(fieldNames.indexName()); if (fieldData == null) { synchronized (loadedFieldData) { fieldData = loadedFieldData.get(fieldNames.indexName()); if (fieldData == null) { IndexFieldData.Builder builder = null; String format = type.getFormat(indexSettings); if (format != null && FieldDataType.DOC_VALUES_FORMAT_VALUE.equals(format) && !docValues) { logger.warn( "field [" + fieldNames.fullName() + "] has no doc values, will use default field data format"); format = null; } if (format != null) { builder = buildersByTypeAndFormat.get(Tuple.tuple(type.getType(), format)); if (builder == null) { logger.warn( "failed to find format [" + format + "] for field [" + fieldNames.fullName() + "], will use default"); } } if (builder == null && docValues) { builder = docValuesBuildersByType.get(type.getType()); } if (builder == null) { builder = buildersByType.get(type.getType()); } if (builder == null) { throw new ElasticsearchIllegalArgumentException( "failed to find field data builder for field " + fieldNames.fullName() + ", and type " + type.getType()); } IndexFieldDataCache cache = fieldDataCaches.get(fieldNames.indexName()); if (cache == null) { // we default to node level cache, which in turn defaults to be unbounded // this means changing the node level settings is simple, just set the bounds there String cacheType = type.getSettings().get("cache", indexSettings.get("index.fielddata.cache", "node")); if ("resident".equals(cacheType)) { cache = new IndexFieldDataCache.Resident( indexService, fieldNames, type, indicesFieldDataCacheListener); } else if ("soft".equals(cacheType)) { cache = new IndexFieldDataCache.Soft( indexService, fieldNames, type, indicesFieldDataCacheListener); } else if ("node".equals(cacheType)) { cache = indicesFieldDataCache.buildIndexFieldDataCache( indexService, index, fieldNames, type); } else { throw new ElasticsearchIllegalArgumentException( "cache type not supported [" + cacheType + "] for field [" + fieldNames.fullName() + 
"]"); } fieldDataCaches.put(fieldNames.indexName(), cache); } GlobalOrdinalsBuilder globalOrdinalBuilder = new InternalGlobalOrdinalsBuilder(index(), indexSettings); fieldData = builder.build( index, indexSettings, mapper, cache, circuitBreakerService, indexService.mapperService(), globalOrdinalBuilder); loadedFieldData.put(fieldNames.indexName(), fieldData); } } } return (IFD) fieldData; } }
public class SearchService extends AbstractLifecycleComponent<SearchService> { public static final String NORMS_LOADING_KEY = "index.norms.loading"; public static final String DEFAULT_KEEPALIVE_KEY = "search.default_keep_alive"; public static final String KEEPALIVE_INTERVAL_KEY = "search.keep_alive_interval"; private final ThreadPool threadPool; private final ClusterService clusterService; private final IndicesService indicesService; private final IndicesWarmer indicesWarmer; private final ScriptService scriptService; private final PageCacheRecycler pageCacheRecycler; private final BigArrays bigArrays; private final DfsPhase dfsPhase; private final QueryPhase queryPhase; private final FetchPhase fetchPhase; private final IndicesQueryCache indicesQueryCache; private final long defaultKeepAlive; private final ScheduledFuture<?> keepAliveReaper; private final AtomicLong idGenerator = new AtomicLong(); private final ConcurrentMapLong<SearchContext> activeContexts = ConcurrentCollections.newConcurrentMapLongWithAggressiveConcurrency(); private final ImmutableMap<String, SearchParseElement> elementParsers; @Inject public SearchService( Settings settings, ClusterService clusterService, IndicesService indicesService, IndicesWarmer indicesWarmer, ThreadPool threadPool, ScriptService scriptService, PageCacheRecycler pageCacheRecycler, BigArrays bigArrays, DfsPhase dfsPhase, QueryPhase queryPhase, FetchPhase fetchPhase, IndicesQueryCache indicesQueryCache) { super(settings); this.threadPool = threadPool; this.clusterService = clusterService; this.indicesService = indicesService; indicesService .indicesLifecycle() .addListener( new IndicesLifecycle.Listener() { @Override public void afterIndexDeleted(Index index, @IndexSettings Settings indexSettings) { // once an index is closed we can just clean up all the pending search context // information // to release memory and let references to the filesystem go etc. 
freeAllContextForIndex(index); } }); this.indicesWarmer = indicesWarmer; this.scriptService = scriptService; this.pageCacheRecycler = pageCacheRecycler; this.bigArrays = bigArrays; this.dfsPhase = dfsPhase; this.queryPhase = queryPhase; this.fetchPhase = fetchPhase; this.indicesQueryCache = indicesQueryCache; TimeValue keepAliveInterval = settings.getAsTime(KEEPALIVE_INTERVAL_KEY, timeValueMinutes(1)); // we can have 5 minutes here, since we make sure to clean with search requests and when // shard/index closes this.defaultKeepAlive = settings.getAsTime(DEFAULT_KEEPALIVE_KEY, timeValueMinutes(5)).millis(); Map<String, SearchParseElement> elementParsers = new HashMap<>(); elementParsers.putAll(dfsPhase.parseElements()); elementParsers.putAll(queryPhase.parseElements()); elementParsers.putAll(fetchPhase.parseElements()); elementParsers.put("stats", new StatsGroupsParseElement()); this.elementParsers = ImmutableMap.copyOf(elementParsers); this.keepAliveReaper = threadPool.scheduleWithFixedDelay(new Reaper(), keepAliveInterval); this.indicesWarmer.addListener(new NormsWarmer()); this.indicesWarmer.addListener(new FieldDataWarmer()); this.indicesWarmer.addListener(new SearchWarmer()); } protected void putContext(SearchContext context) { final SearchContext previous = activeContexts.put(context.id(), context); assert previous == null; } protected SearchContext removeContext(long id) { return activeContexts.remove(id); } @Override protected void doStart() {} @Override protected void doStop() { for (final SearchContext context : activeContexts.values()) { freeContext(context.id()); } } @Override protected void doClose() { doStop(); FutureUtils.cancel(keepAliveReaper); } public DfsSearchResult executeDfsPhase(ShardSearchRequest request) { final SearchContext context = createAndPutContext(request); try { contextProcessing(context); dfsPhase.execute(context); contextProcessedSuccessfully(context); return context.dfsResult(); } catch (Throwable e) { logger.trace("Dfs phase failed", e); processFailure(context, e); throw ExceptionsHelper.convertToRuntime(e); } finally { cleanContext(context); } } public QuerySearchResult executeScan(ShardSearchRequest request) { final SearchContext context = createAndPutContext(request); final int originalSize = context.size(); try { if (context.aggregations() != null) { throw new IllegalArgumentException("aggregations are not supported with search_type=scan"); } if (context.scroll() == null) { throw new ElasticsearchException("Scroll must be provided when scanning..."); } assert context.searchType() == SearchType.SCAN; context.searchType( SearchType .QUERY_THEN_FETCH); // move to QUERY_THEN_FETCH, and then, when scrolling, move to // SCAN context.size(0); // set size to 0 so that we only count matches assert context.searchType() == SearchType.QUERY_THEN_FETCH; contextProcessing(context); queryPhase.execute(context); contextProcessedSuccessfully(context); return context.queryResult(); } catch (Throwable e) { logger.trace("Scan phase failed", e); processFailure(context, e); throw ExceptionsHelper.convertToRuntime(e); } finally { context.size(originalSize); cleanContext(context); } } public ScrollQueryFetchSearchResult executeScan(InternalScrollSearchRequest request) { final SearchContext context = findContext(request.id()); contextProcessing(context); try { processScroll(request, context); if (context.searchType() == SearchType.QUERY_THEN_FETCH) { // first scanning, reset the from to 0 context.searchType(SearchType.SCAN); context.from(0); } queryPhase.execute(context); 
shortcutDocIdsToLoadForScanning(context); fetchPhase.execute(context); if (context.scroll() == null || context.fetchResult().hits().hits().length < context.size()) { freeContext(request.id()); } else { contextProcessedSuccessfully(context); } return new ScrollQueryFetchSearchResult( new QueryFetchSearchResult(context.queryResult(), context.fetchResult()), context.shardTarget()); } catch (Throwable e) { logger.trace("Scan phase failed", e); processFailure(context, e); throw ExceptionsHelper.convertToRuntime(e); } finally { cleanContext(context); } } /** * Try to load the query results from the cache or execute the query phase directly if the cache * cannot be used. */ private void loadOrExecuteQueryPhase( final ShardSearchRequest request, final SearchContext context, final QueryPhase queryPhase) throws Exception { final boolean canCache = indicesQueryCache.canCache(request, context); if (canCache) { indicesQueryCache.loadIntoContext(request, context, queryPhase); } else { queryPhase.execute(context); } } public QuerySearchResultProvider executeQueryPhase(ShardSearchRequest request) { final SearchContext context = createAndPutContext(request); final ShardSearchStats shardSearchStats = context.indexShard().searchService(); try { shardSearchStats.onPreQueryPhase(context); long time = System.nanoTime(); contextProcessing(context); loadOrExecuteQueryPhase(request, context, queryPhase); if (context.queryResult().topDocs().scoreDocs.length == 0 && context.scroll() == null) { freeContext(context.id()); } else { contextProcessedSuccessfully(context); } shardSearchStats.onQueryPhase(context, System.nanoTime() - time); return context.queryResult(); } catch (Throwable e) { // execution exception can happen while loading the cache, strip it if (e instanceof ExecutionException) { e = e.getCause(); } shardSearchStats.onFailedQueryPhase(context); logger.trace("Query phase failed", e); processFailure(context, e); throw ExceptionsHelper.convertToRuntime(e); } finally { cleanContext(context); } } public ScrollQuerySearchResult executeQueryPhase(InternalScrollSearchRequest request) { final SearchContext context = findContext(request.id()); ShardSearchStats shardSearchStats = context.indexShard().searchService(); try { shardSearchStats.onPreQueryPhase(context); long time = System.nanoTime(); contextProcessing(context); processScroll(request, context); queryPhase.execute(context); contextProcessedSuccessfully(context); shardSearchStats.onQueryPhase(context, System.nanoTime() - time); return new ScrollQuerySearchResult(context.queryResult(), context.shardTarget()); } catch (Throwable e) { shardSearchStats.onFailedQueryPhase(context); logger.trace("Query phase failed", e); processFailure(context, e); throw ExceptionsHelper.convertToRuntime(e); } finally { cleanContext(context); } } public QuerySearchResult executeQueryPhase(QuerySearchRequest request) { final SearchContext context = findContext(request.id()); contextProcessing(context); try { final IndexCache indexCache = context.indexShard().indexService().cache(); context .searcher() .dfSource( new CachedDfSource( context.searcher().getIndexReader(), request.dfs(), context.similarityService().similarity(), indexCache.filter(), indexCache.filterPolicy())); } catch (Throwable e) { processFailure(context, e); cleanContext(context); throw new QueryPhaseExecutionException(context, "Failed to set aggregated df", e); } ShardSearchStats shardSearchStats = context.indexShard().searchService(); try { shardSearchStats.onPreQueryPhase(context); long time = 
System.nanoTime(); queryPhase.execute(context); if (context.queryResult().topDocs().scoreDocs.length == 0 && context.scroll() == null) { // no hits, we can release the context since there will be no fetch phase freeContext(context.id()); } else { contextProcessedSuccessfully(context); } shardSearchStats.onQueryPhase(context, System.nanoTime() - time); return context.queryResult(); } catch (Throwable e) { shardSearchStats.onFailedQueryPhase(context); logger.trace("Query phase failed", e); processFailure(context, e); throw ExceptionsHelper.convertToRuntime(e); } finally { cleanContext(context); } } public QueryFetchSearchResult executeFetchPhase(ShardSearchRequest request) { final SearchContext context = createAndPutContext(request); contextProcessing(context); try { ShardSearchStats shardSearchStats = context.indexShard().searchService(); shardSearchStats.onPreQueryPhase(context); long time = System.nanoTime(); try { loadOrExecuteQueryPhase(request, context, queryPhase); } catch (Throwable e) { shardSearchStats.onFailedQueryPhase(context); throw ExceptionsHelper.convertToRuntime(e); } long time2 = System.nanoTime(); shardSearchStats.onQueryPhase(context, time2 - time); shardSearchStats.onPreFetchPhase(context); try { shortcutDocIdsToLoad(context); fetchPhase.execute(context); if (context.scroll() == null) { freeContext(context.id()); } else { contextProcessedSuccessfully(context); } } catch (Throwable e) { shardSearchStats.onFailedFetchPhase(context); throw ExceptionsHelper.convertToRuntime(e); } shardSearchStats.onFetchPhase(context, System.nanoTime() - time2); return new QueryFetchSearchResult(context.queryResult(), context.fetchResult()); } catch (Throwable e) { logger.trace("Fetch phase failed", e); processFailure(context, e); throw ExceptionsHelper.convertToRuntime(e); } finally { cleanContext(context); } } public QueryFetchSearchResult executeFetchPhase(QuerySearchRequest request) { final SearchContext context = findContext(request.id()); contextProcessing(context); try { final IndexCache indexCache = context.indexShard().indexService().cache(); context .searcher() .dfSource( new CachedDfSource( context.searcher().getIndexReader(), request.dfs(), context.similarityService().similarity(), indexCache.filter(), indexCache.filterPolicy())); } catch (Throwable e) { freeContext(context.id()); cleanContext(context); throw new QueryPhaseExecutionException(context, "Failed to set aggregated df", e); } try { ShardSearchStats shardSearchStats = context.indexShard().searchService(); shardSearchStats.onPreQueryPhase(context); long time = System.nanoTime(); try { queryPhase.execute(context); } catch (Throwable e) { shardSearchStats.onFailedQueryPhase(context); throw ExceptionsHelper.convertToRuntime(e); } long time2 = System.nanoTime(); shardSearchStats.onQueryPhase(context, time2 - time); shardSearchStats.onPreFetchPhase(context); try { shortcutDocIdsToLoad(context); fetchPhase.execute(context); if (context.scroll() == null) { freeContext(request.id()); } else { contextProcessedSuccessfully(context); } } catch (Throwable e) { shardSearchStats.onFailedFetchPhase(context); throw ExceptionsHelper.convertToRuntime(e); } shardSearchStats.onFetchPhase(context, System.nanoTime() - time2); return new QueryFetchSearchResult(context.queryResult(), context.fetchResult()); } catch (Throwable e) { logger.trace("Fetch phase failed", e); processFailure(context, e); throw ExceptionsHelper.convertToRuntime(e); } finally { cleanContext(context); } } public ScrollQueryFetchSearchResult 
executeFetchPhase(InternalScrollSearchRequest request) { final SearchContext context = findContext(request.id()); contextProcessing(context); try { ShardSearchStats shardSearchStats = context.indexShard().searchService(); processScroll(request, context); shardSearchStats.onPreQueryPhase(context); long time = System.nanoTime(); try { queryPhase.execute(context); } catch (Throwable e) { shardSearchStats.onFailedQueryPhase(context); throw ExceptionsHelper.convertToRuntime(e); } long time2 = System.nanoTime(); shardSearchStats.onQueryPhase(context, time2 - time); shardSearchStats.onPreFetchPhase(context); try { shortcutDocIdsToLoad(context); fetchPhase.execute(context); if (context.scroll() == null) { freeContext(request.id()); } else { contextProcessedSuccessfully(context); } } catch (Throwable e) { shardSearchStats.onFailedFetchPhase(context); throw ExceptionsHelper.convertToRuntime(e); } shardSearchStats.onFetchPhase(context, System.nanoTime() - time2); return new ScrollQueryFetchSearchResult( new QueryFetchSearchResult(context.queryResult(), context.fetchResult()), context.shardTarget()); } catch (Throwable e) { logger.trace("Fetch phase failed", e); processFailure(context, e); throw ExceptionsHelper.convertToRuntime(e); } finally { cleanContext(context); } } public FetchSearchResult executeFetchPhase(ShardFetchRequest request) { final SearchContext context = findContext(request.id()); contextProcessing(context); final ShardSearchStats shardSearchStats = context.indexShard().searchService(); try { if (request.lastEmittedDoc() != null) { context.lastEmittedDoc(request.lastEmittedDoc()); } context.docIdsToLoad(request.docIds(), 0, request.docIdsSize()); shardSearchStats.onPreFetchPhase(context); long time = System.nanoTime(); fetchPhase.execute(context); if (context.scroll() == null) { freeContext(request.id()); } else { contextProcessedSuccessfully(context); } shardSearchStats.onFetchPhase(context, System.nanoTime() - time); return context.fetchResult(); } catch (Throwable e) { shardSearchStats.onFailedFetchPhase(context); logger.trace("Fetch phase failed", e); processFailure(context, e); throw ExceptionsHelper.convertToRuntime(e); } finally { cleanContext(context); } } private SearchContext findContext(long id) throws SearchContextMissingException { SearchContext context = activeContexts.get(id); if (context == null) { throw new SearchContextMissingException(id); } SearchContext.setCurrent(context); return context; } final SearchContext createAndPutContext(ShardSearchRequest request) { SearchContext context = createContext(request, null); boolean success = false; try { putContext(context); context.indexShard().searchService().onNewContext(context); success = true; return context; } finally { if (!success) { freeContext(context.id()); } } } final SearchContext createContext( ShardSearchRequest request, @Nullable Engine.Searcher searcher) { IndexService indexService = indicesService.indexServiceSafe(request.index()); IndexShard indexShard = indexService.shardSafe(request.shardId()); SearchShardTarget shardTarget = new SearchShardTarget(clusterService.localNode().id(), request.index(), request.shardId()); Engine.Searcher engineSearcher = searcher == null ? 
indexShard.acquireSearcher("search") : searcher; SearchContext context = new DefaultSearchContext( idGenerator.incrementAndGet(), request, shardTarget, engineSearcher, indexService, indexShard, scriptService, pageCacheRecycler, bigArrays, threadPool.estimatedTimeInMillisCounter()); SearchContext.setCurrent(context); try { context.scroll(request.scroll()); parseTemplate(request); parseSource(context, request.source()); parseSource(context, request.extraSource()); // if the from and size are still not set, default them if (context.from() == -1) { context.from(0); } if (context.searchType() == SearchType.COUNT) { // so that the optimizations we apply to size=0 also apply to search_type=COUNT // and that we close contexts when done with the query phase context.searchType(SearchType.QUERY_THEN_FETCH); context.size(0); } else if (context.size() == -1) { context.size(10); } // pre process dfsPhase.preProcess(context); queryPhase.preProcess(context); fetchPhase.preProcess(context); // compute the context keep alive long keepAlive = defaultKeepAlive; if (request.scroll() != null && request.scroll().keepAlive() != null) { keepAlive = request.scroll().keepAlive().millis(); } context.keepAlive(keepAlive); } catch (Throwable e) { context.close(); throw ExceptionsHelper.convertToRuntime(e); } return context; } private void freeAllContextForIndex(Index index) { assert index != null; for (SearchContext ctx : activeContexts.values()) { if (index.equals(ctx.indexShard().shardId().index())) { freeContext(ctx.id()); } } } public boolean freeContext(long id) { final SearchContext context = removeContext(id); if (context != null) { try { context.indexShard().searchService().onFreeContext(context); } finally { context.close(); } return true; } return false; } public void freeAllScrollContexts() { for (SearchContext searchContext : activeContexts.values()) { if (searchContext.scroll() != null) { freeContext(searchContext.id()); } } } private void contextProcessing(SearchContext context) { // disable timeout while executing a search context.accessed(-1); } private void contextProcessedSuccessfully(SearchContext context) { context.accessed(threadPool.estimatedTimeInMillis()); } private void cleanContext(SearchContext context) { assert context == SearchContext.current(); context.clearReleasables(Lifetime.PHASE); SearchContext.removeCurrent(); } private void processFailure(SearchContext context, Throwable t) { freeContext(context.id()); try { if (Lucene.isCorruptionException(t)) { context.indexShard().failShard("search execution corruption failure", t); } } catch (Throwable e) { logger.warn( "failed to process shard failure to (potentially) send back shard failure on corruption", e); } } private void parseTemplate(ShardSearchRequest request) { BytesReference processedQuery; if (request.template() != null) { ExecutableScript executable = this.scriptService.executable(request.template(), ScriptContext.Standard.SEARCH); processedQuery = (BytesReference) executable.run(); } else { if (!hasLength(request.templateSource())) { return; } XContentParser parser = null; Template template = null; try { parser = XContentFactory.xContent(request.templateSource()) .createParser(request.templateSource()); template = TemplateQueryParser.parse(parser, "params", "template"); if (template.getType() == ScriptService.ScriptType.INLINE) { // Try to double parse for nested template id/file parser = null; try { ExecutableScript executable = this.scriptService.executable(template, ScriptContext.Standard.SEARCH); processedQuery = 
(BytesReference) executable.run(); parser = XContentFactory.xContent(processedQuery).createParser(processedQuery); } catch (ElasticsearchParseException epe) { // This was a non-nested template, the parse failure was due to this, so it is safe to // assume this refers to a file // for backwards compatibility and keep going template = new Template( template.getScript(), ScriptService.ScriptType.FILE, MustacheScriptEngineService.NAME, null, template.getParams()); ExecutableScript executable = this.scriptService.executable(template, ScriptContext.Standard.SEARCH); processedQuery = (BytesReference) executable.run(); } if (parser != null) { try { Template innerTemplate = TemplateQueryParser.parse(parser); if (hasLength(innerTemplate.getScript()) && !innerTemplate.getType().equals(ScriptService.ScriptType.INLINE)) { // An inner template referring to a filename or id template = new Template( innerTemplate.getScript(), innerTemplate.getType(), MustacheScriptEngineService.NAME, null, template.getParams()); ExecutableScript executable = this.scriptService.executable(template, ScriptContext.Standard.SEARCH); processedQuery = (BytesReference) executable.run(); } } catch (ScriptParseException e) { // No inner template found, use original template from above } } } else { ExecutableScript executable = this.scriptService.executable(template, ScriptContext.Standard.SEARCH); processedQuery = (BytesReference) executable.run(); } } catch (IOException e) { throw new ElasticsearchParseException("Failed to parse template", e); } finally { Releasables.closeWhileHandlingException(parser); } if (!hasLength(template.getScript())) { throw new ElasticsearchParseException("Template must have [template] field configured"); } } request.source(processedQuery); } private void parseSource(SearchContext context, BytesReference source) throws SearchParseException { // nothing to parse... if (source == null || source.length() == 0) { return; } XContentParser parser = null; try { parser = XContentFactory.xContent(source).createParser(source); XContentParser.Token token; token = parser.nextToken(); if (token != XContentParser.Token.START_OBJECT) { throw new ElasticsearchParseException( "Expected START_OBJECT but got " + token.name() + " " + parser.currentName()); } while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { if (token == XContentParser.Token.FIELD_NAME) { String fieldName = parser.currentName(); parser.nextToken(); SearchParseElement element = elementParsers.get(fieldName); if (element == null) { throw new SearchParseException( context, "No parser for element [" + fieldName + "]", parser.getTokenLocation()); } element.parse(parser, context); } else { if (token == null) { throw new ElasticsearchParseException( "End of query source reached but query is not complete."); } else { throw new ElasticsearchParseException( "Expected field name but got " + token.name() + " \"" + parser.currentName() + "\""); } } } } catch (Throwable e) { String sSource = "_na_"; try { sSource = XContentHelper.convertToJson(source, false); } catch (Throwable e1) { // ignore } throw new SearchParseException( context, "Failed to parse source [" + sSource + "]", parser.getTokenLocation(), e); } finally { if (parser != null) { parser.close(); } } } private static final int[] EMPTY_DOC_IDS = new int[0]; /** * Shortcut ids to load, we load only "from" and up to "size".
The phase controller handles this * as well since the result is always size * shards for Q_A_F */ private void shortcutDocIdsToLoad(SearchContext context) { if (context.request().scroll() != null) { TopDocs topDocs = context.queryResult().topDocs(); int[] docIdsToLoad = new int[topDocs.scoreDocs.length]; for (int i = 0; i < topDocs.scoreDocs.length; i++) { docIdsToLoad[i] = topDocs.scoreDocs[i].doc; } context.docIdsToLoad(docIdsToLoad, 0, docIdsToLoad.length); } else { TopDocs topDocs = context.queryResult().topDocs(); if (topDocs.scoreDocs.length < context.from()) { // no more docs... context.docIdsToLoad(EMPTY_DOC_IDS, 0, 0); return; } int totalSize = context.from() + context.size(); int[] docIdsToLoad = new int[Math.min(topDocs.scoreDocs.length - context.from(), context.size())]; int counter = 0; for (int i = context.from(); i < totalSize; i++) { if (i < topDocs.scoreDocs.length) { docIdsToLoad[counter] = topDocs.scoreDocs[i].doc; } else { break; } counter++; } context.docIdsToLoad(docIdsToLoad, 0, counter); } } private void shortcutDocIdsToLoadForScanning(SearchContext context) { TopDocs topDocs = context.queryResult().topDocs(); if (topDocs.scoreDocs.length == 0) { // no more docs... context.docIdsToLoad(EMPTY_DOC_IDS, 0, 0); return; } int[] docIdsToLoad = new int[topDocs.scoreDocs.length]; for (int i = 0; i < docIdsToLoad.length; i++) { docIdsToLoad[i] = topDocs.scoreDocs[i].doc; } context.docIdsToLoad(docIdsToLoad, 0, docIdsToLoad.length); } private void processScroll(InternalScrollSearchRequest request, SearchContext context) { // process scroll context.from(context.from() + context.size()); context.scroll(request.scroll()); // update the context keep alive based on the new scroll value if (request.scroll() != null && request.scroll().keepAlive() != null) { context.keepAlive(request.scroll().keepAlive().millis()); } } /** Returns the number of active contexts in this SearchService */ public int getActiveContexts() { return this.activeContexts.size(); } static class NormsWarmer extends IndicesWarmer.Listener { @Override public TerminationHandle warmNewReaders( final IndexShard indexShard, IndexMetaData indexMetaData, final WarmerContext context, ThreadPool threadPool) { final Loading defaultLoading = Loading.parse(indexMetaData.settings().get(NORMS_LOADING_KEY), Loading.LAZY); final MapperService mapperService = indexShard.mapperService(); final ObjectSet<String> warmUp = new ObjectHashSet<>(); for (DocumentMapper docMapper : mapperService.docMappers(false)) { for (FieldMapper fieldMapper : docMapper.mappers()) { final String indexName = fieldMapper.fieldType().names().indexName(); Loading normsLoading = fieldMapper.fieldType().normsLoading(); if (normsLoading == null) { normsLoading = defaultLoading; } if (fieldMapper.fieldType().indexOptions() != IndexOptions.NONE && !fieldMapper.fieldType().omitNorms() && normsLoading == Loading.EAGER) { warmUp.add(indexName); } } } final CountDownLatch latch = new CountDownLatch(1); // Norms loading may be I/O intensive but is not CPU intensive, so we execute it in a single // task threadPool .executor(executor()) .execute( new Runnable() { @Override public void run() { try { for (ObjectCursor<String> stringObjectCursor : warmUp) { final String indexName = stringObjectCursor.value; final long start = System.nanoTime(); for (final LeafReaderContext ctx : context.searcher().reader().leaves()) { final NumericDocValues values = ctx.reader().getNormValues(indexName); if (values != null) { values.get(0); } } if 
(indexShard.warmerService().logger().isTraceEnabled()) { indexShard .warmerService() .logger() .trace( "warmed norms for [{}], took [{}]", indexName, TimeValue.timeValueNanos(System.nanoTime() - start)); } } } catch (Throwable t) { indexShard.warmerService().logger().warn("failed to warm-up norms", t); } finally { latch.countDown(); } } }); return new TerminationHandle() { @Override public void awaitTermination() throws InterruptedException { latch.await(); } }; } @Override public TerminationHandle warmTopReader( IndexShard indexShard, IndexMetaData indexMetaData, WarmerContext context, ThreadPool threadPool) { return TerminationHandle.NO_WAIT; } } static class FieldDataWarmer extends IndicesWarmer.Listener { @Override public TerminationHandle warmNewReaders( final IndexShard indexShard, IndexMetaData indexMetaData, final WarmerContext context, ThreadPool threadPool) { final MapperService mapperService = indexShard.mapperService(); final Map<String, MappedFieldType> warmUp = new HashMap<>(); for (DocumentMapper docMapper : mapperService.docMappers(false)) { for (FieldMapper fieldMapper : docMapper.mappers()) { final FieldDataType fieldDataType = fieldMapper.fieldType().fieldDataType(); if (fieldDataType == null) { continue; } if (fieldDataType.getLoading() == Loading.LAZY) { continue; } final String indexName = fieldMapper.fieldType().names().indexName(); if (warmUp.containsKey(indexName)) { continue; } warmUp.put(indexName, fieldMapper.fieldType()); } } final IndexFieldDataService indexFieldDataService = indexShard.indexFieldDataService(); final Executor executor = threadPool.executor(executor()); final CountDownLatch latch = new CountDownLatch(context.searcher().reader().leaves().size() * warmUp.size()); for (final LeafReaderContext ctx : context.searcher().reader().leaves()) { for (final MappedFieldType fieldType : warmUp.values()) { executor.execute( new Runnable() { @Override public void run() { try { final long start = System.nanoTime(); indexFieldDataService.getForField(fieldType).load(ctx); if (indexShard.warmerService().logger().isTraceEnabled()) { indexShard .warmerService() .logger() .trace( "warmed fielddata for [{}], took [{}]", fieldType.names().fullName(), TimeValue.timeValueNanos(System.nanoTime() - start)); } } catch (Throwable t) { indexShard .warmerService() .logger() .warn( "failed to warm-up fielddata for [{}]", t, fieldType.names().fullName()); } finally { latch.countDown(); } } }); } } return new TerminationHandle() { @Override public void awaitTermination() throws InterruptedException { latch.await(); } }; } @Override public TerminationHandle warmTopReader( final IndexShard indexShard, IndexMetaData indexMetaData, final WarmerContext context, ThreadPool threadPool) { final MapperService mapperService = indexShard.mapperService(); final Map<String, MappedFieldType> warmUpGlobalOrdinals = new HashMap<>(); for (DocumentMapper docMapper : mapperService.docMappers(false)) { for (FieldMapper fieldMapper : docMapper.mappers()) { final FieldDataType fieldDataType = fieldMapper.fieldType().fieldDataType(); if (fieldDataType == null) { continue; } if (fieldDataType.getLoading() != Loading.EAGER_GLOBAL_ORDINALS) { continue; } final String indexName = fieldMapper.fieldType().names().indexName(); if (warmUpGlobalOrdinals.containsKey(indexName)) { continue; } warmUpGlobalOrdinals.put(indexName, fieldMapper.fieldType()); } } final IndexFieldDataService indexFieldDataService = indexShard.indexFieldDataService(); final Executor executor = threadPool.executor(executor()); final 
CountDownLatch latch = new CountDownLatch(warmUpGlobalOrdinals.size()); for (final MappedFieldType fieldType : warmUpGlobalOrdinals.values()) { executor.execute( new Runnable() { @Override public void run() { try { final long start = System.nanoTime(); IndexFieldData.Global ifd = indexFieldDataService.getForField(fieldType); ifd.loadGlobal(context.reader()); if (indexShard.warmerService().logger().isTraceEnabled()) { indexShard .warmerService() .logger() .trace( "warmed global ordinals for [{}], took [{}]", fieldType.names().fullName(), TimeValue.timeValueNanos(System.nanoTime() - start)); } } catch (Throwable t) { indexShard .warmerService() .logger() .warn( "failed to warm-up global ordinals for [{}]", t, fieldType.names().fullName()); } finally { latch.countDown(); } } }); } return new TerminationHandle() { @Override public void awaitTermination() throws InterruptedException { latch.await(); } }; } } class SearchWarmer extends IndicesWarmer.Listener { @Override public TerminationHandle warmNewReaders( IndexShard indexShard, IndexMetaData indexMetaData, WarmerContext context, ThreadPool threadPool) { return internalWarm(indexShard, indexMetaData, context, threadPool, false); } @Override public TerminationHandle warmTopReader( IndexShard indexShard, IndexMetaData indexMetaData, WarmerContext context, ThreadPool threadPool) { return internalWarm(indexShard, indexMetaData, context, threadPool, true); } public TerminationHandle internalWarm( final IndexShard indexShard, final IndexMetaData indexMetaData, final IndicesWarmer.WarmerContext warmerContext, ThreadPool threadPool, final boolean top) { IndexWarmersMetaData custom = indexMetaData.custom(IndexWarmersMetaData.TYPE); if (custom == null) { return TerminationHandle.NO_WAIT; } final Executor executor = threadPool.executor(executor()); final CountDownLatch latch = new CountDownLatch(custom.entries().size()); for (final IndexWarmersMetaData.Entry entry : custom.entries()) { executor.execute( new Runnable() { @Override public void run() { SearchContext context = null; try { long now = System.nanoTime(); ShardSearchRequest request = new ShardSearchLocalRequest( indexShard.shardId(), indexMetaData.numberOfShards(), SearchType.QUERY_THEN_FETCH, entry.source(), entry.types(), entry.queryCache()); context = createContext(request, warmerContext.searcher()); // if we use sort, we need to do query to sort on it and load relevant field data // if not, we might as well set size=0 (and cache if needed) if (context.sort() == null) { context.size(0); } boolean canCache = indicesQueryCache.canCache(request, context); // early terminate when we can cache, since we can only do proper caching on top // level searcher // also, if we can't cache, and its top, we don't need to execute it, since we // already did when its not top if (canCache != top) { return; } loadOrExecuteQueryPhase(request, context, queryPhase); long took = System.nanoTime() - now; if (indexShard.warmerService().logger().isTraceEnabled()) { indexShard .warmerService() .logger() .trace( "warmed [{}], took [{}]", entry.name(), TimeValue.timeValueNanos(took)); } } catch (Throwable t) { indexShard.warmerService().logger().warn("warmer [{}] failed", t, entry.name()); } finally { try { if (context != null) { freeContext(context.id()); cleanContext(context); } } finally { latch.countDown(); } } } }); } return new TerminationHandle() { @Override public void awaitTermination() throws InterruptedException { latch.await(); } }; } } class Reaper implements Runnable { @Override public void run() { final 
long time = threadPool.estimatedTimeInMillis(); for (SearchContext context : activeContexts.values()) { // Use the same value for both checks since lastAccessTime can // be modified by another thread between checks! final long lastAccessTime = context.lastAccessTime(); if (lastAccessTime == -1L) { // it's being processed or the timeout is disabled continue; } if ((time - lastAccessTime > context.keepAlive())) { logger.debug( "freeing search context [{}], time [{}], lastAccessTime [{}], keepAlive [{}]", context.id(), time, lastAccessTime, context.keepAlive()); freeContext(context.id()); } } } } }
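/*
 * Minimal sketch of the keep-alive reaper pattern the Reaper above implements:
 * a scheduled task scans live contexts and frees any idle past its keep-alive,
 * with -1 as the "currently in use, timeout disabled" sentinel. The types and
 * the one-minute interval here are assumptions for illustration.
 */
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

final class ReaperSketch {
  static final class Context {
    final AtomicLong lastAccessMillis = new AtomicLong(System.currentTimeMillis());
    final long keepAliveMillis;

    Context(long keepAliveMillis) {
      this.keepAliveMillis = keepAliveMillis;
    }
  }

  final Map<Long, Context> active = new ConcurrentHashMap<>();
  final ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();

  void start() {
    scheduler.scheduleWithFixedDelay(
        () -> {
          long now = System.currentTimeMillis();
          for (Map.Entry<Long, Context> e : active.entrySet()) {
            // read once: another thread may update lastAccess between two checks
            long last = e.getValue().lastAccessMillis.get();
            if (last == -1L) {
              continue; // being processed right now, timeout disabled
            }
            if (now - last > e.getValue().keepAliveMillis) {
              active.remove(e.getKey()); // free the idle context
            }
          }
        },
        1,
        1,
        TimeUnit.MINUTES);
  }
}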
/** * Each shard will have a percolator registry even if there isn't a {@link * PercolatorService#TYPE_NAME} document type in the index. For shards with indices that have no * {@link PercolatorService#TYPE_NAME} document type, this will hold no percolate queries. * * <p>Once a document type has been created, the real-time percolator will start to listen to write * events and update this registry with queries in real time. */ public class PercolatorQueriesRegistry extends AbstractIndexShardComponent { // This is a shard level service, but these below are index level service: private final IndexQueryParserService queryParserService; private final MapperService mapperService; private final IndicesLifecycle indicesLifecycle; private final IndexCache indexCache; private final IndexFieldDataService indexFieldDataService; private final ShardIndexingService indexingService; private final ShardPercolateService shardPercolateService; private final ConcurrentMap<HashedBytesRef, Query> percolateQueries = ConcurrentCollections.newConcurrentMapWithAggressiveConcurrency(); private final ShardLifecycleListener shardLifecycleListener = new ShardLifecycleListener(); private final RealTimePercolatorOperationListener realTimePercolatorOperationListener = new RealTimePercolatorOperationListener(); private final PercolateTypeListener percolateTypeListener = new PercolateTypeListener(); private final AtomicBoolean realTimePercolatorEnabled = new AtomicBoolean(false); @Inject public PercolatorQueriesRegistry( ShardId shardId, @IndexSettings Settings indexSettings, IndexQueryParserService queryParserService, ShardIndexingService indexingService, IndicesLifecycle indicesLifecycle, MapperService mapperService, IndexCache indexCache, IndexFieldDataService indexFieldDataService, ShardPercolateService shardPercolateService) { super(shardId, indexSettings); this.queryParserService = queryParserService; this.mapperService = mapperService; this.indicesLifecycle = indicesLifecycle; this.indexingService = indexingService; this.indexCache = indexCache; this.indexFieldDataService = indexFieldDataService; this.shardPercolateService = shardPercolateService; indicesLifecycle.addListener(shardLifecycleListener); mapperService.addTypeListener(percolateTypeListener); } public ConcurrentMap<HashedBytesRef, Query> percolateQueries() { return percolateQueries; } public void close() { mapperService.removeTypeListener(percolateTypeListener); indicesLifecycle.removeListener(shardLifecycleListener); indexingService.removeListener(realTimePercolatorOperationListener); clear(); } public void clear() { percolateQueries.clear(); } void enableRealTimePercolator() { if (realTimePercolatorEnabled.compareAndSet(false, true)) { indexingService.addListener(realTimePercolatorOperationListener); } } void disableRealTimePercolator() { if (realTimePercolatorEnabled.compareAndSet(true, false)) { indexingService.removeListener(realTimePercolatorOperationListener); } } public void addPercolateQuery(String idAsString, BytesReference source) { Query newquery = parsePercolatorDocument(idAsString, source); HashedBytesRef id = new HashedBytesRef(new BytesRef(idAsString)); Query previousQuery = percolateQueries.put(id, newquery); shardPercolateService.addedQuery(id, previousQuery, newquery); } public void removePercolateQuery(String idAsString) { HashedBytesRef id = new HashedBytesRef(idAsString); Query query = percolateQueries.remove(id); if (query != null) { shardPercolateService.removedQuery(id, query); } } Query parsePercolatorDocument(String id,
BytesReference source) { String type = null; BytesReference querySource = null; XContentParser parser = null; try { parser = XContentHelper.createParser(source); String currentFieldName = null; XContentParser.Token token = parser.nextToken(); // move the START_OBJECT if (token != XContentParser.Token.START_OBJECT) { throw new ElasticsearchException( "failed to parse query [" + id + "], not starting with OBJECT"); } while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { if (token == XContentParser.Token.FIELD_NAME) { currentFieldName = parser.currentName(); } else if (token == XContentParser.Token.START_OBJECT) { if ("query".equals(currentFieldName)) { if (type != null) { return parseQuery(type, null, parser); } else { XContentBuilder builder = XContentFactory.contentBuilder(parser.contentType()); builder.copyCurrentStructure(parser); querySource = builder.bytes(); builder.close(); } } else { parser.skipChildren(); } } else if (token == XContentParser.Token.START_ARRAY) { parser.skipChildren(); } else if (token.isValue()) { if ("type".equals(currentFieldName)) { type = parser.text(); } } } return parseQuery(type, querySource, null); } catch (Exception e) { throw new PercolatorException(shardId().index(), "failed to parse query [" + id + "]", e); } finally { if (parser != null) { parser.close(); } } } private Query parseQuery(String type, BytesReference querySource, XContentParser parser) { if (type == null) { if (parser != null) { return queryParserService.parse(parser).query(); } else { return queryParserService.parse(querySource).query(); } } String[] previousTypes = QueryParseContext.setTypesWithPrevious(new String[] {type}); try { if (parser != null) { return queryParserService.parse(parser).query(); } else { return queryParserService.parse(querySource).query(); } } finally { QueryParseContext.setTypes(previousTypes); } } private class PercolateTypeListener implements DocumentTypeListener { @Override public void beforeCreate(DocumentMapper mapper) { if (PercolatorService.TYPE_NAME.equals(mapper.type())) { enableRealTimePercolator(); } } @Override public void afterRemove(DocumentMapper mapper) { if (PercolatorService.TYPE_NAME.equals(mapper.type())) { disableRealTimePercolator(); clear(); } } } private class ShardLifecycleListener extends IndicesLifecycle.Listener { @Override public void afterIndexShardCreated(IndexShard indexShard) { if (hasPercolatorType(indexShard)) { enableRealTimePercolator(); } } @Override public void afterIndexShardPostRecovery(IndexShard indexShard) { if (hasPercolatorType(indexShard)) { // percolator index has started, fetch what we can from it and initialize the indices // we have logger.debug( "loading percolator queries for index [{}] and shard[{}]...", shardId.index(), shardId.id()); loadQueries(indexShard); logger.trace( "done loading percolator queries for index [{}] and shard[{}]", shardId.index(), shardId.id()); } } private boolean hasPercolatorType(IndexShard indexShard) { ShardId otherShardId = indexShard.shardId(); return shardId.equals(otherShardId) && mapperService.hasMapping(PercolatorService.TYPE_NAME); } private void loadQueries(IndexShard shard) { try { shard.refresh(new Engine.Refresh("percolator_load_queries").force(true)); // Maybe add a mode load? This isn't really a write. 
We need write b/c state=post_recovery Engine.Searcher searcher = shard.acquireSearcher("percolator_load_queries", IndexShard.Mode.WRITE); try { Query query = new XConstantScoreQuery( indexCache .filter() .cache( new TermFilter( new Term(TypeFieldMapper.NAME, PercolatorService.TYPE_NAME)))); QueriesLoaderCollector queryCollector = new QueriesLoaderCollector( PercolatorQueriesRegistry.this, logger, mapperService, indexFieldDataService); searcher.searcher().search(query, queryCollector); Map<HashedBytesRef, Query> queries = queryCollector.queries(); for (Map.Entry<HashedBytesRef, Query> entry : queries.entrySet()) { Query previousQuery = percolateQueries.put(entry.getKey(), entry.getValue()); shardPercolateService.addedQuery(entry.getKey(), previousQuery, entry.getValue()); } } finally { searcher.release(); } } catch (Exception e) { throw new PercolatorException( shardId.index(), "failed to load queries from percolator index", e); } } } private class RealTimePercolatorOperationListener extends IndexingOperationListener { @Override public Engine.Create preCreate(Engine.Create create) { // validate the query here, before we index if (PercolatorService.TYPE_NAME.equals(create.type())) { parsePercolatorDocument(create.id(), create.source()); } return create; } @Override public void postCreateUnderLock(Engine.Create create) { // add the query under a doc lock if (PercolatorService.TYPE_NAME.equals(create.type())) { addPercolateQuery(create.id(), create.source()); } } @Override public Engine.Index preIndex(Engine.Index index) { // validate the query here, before we index if (PercolatorService.TYPE_NAME.equals(index.type())) { parsePercolatorDocument(index.id(), index.source()); } return index; } @Override public void postIndexUnderLock(Engine.Index index) { // add the query under a doc lock if (PercolatorService.TYPE_NAME.equals(index.type())) { addPercolateQuery(index.id(), index.source()); } } @Override public void postDeleteUnderLock(Engine.Delete delete) { // remove the query under a lock if (PercolatorService.TYPE_NAME.equals(delete.type())) { removePercolateQuery(delete.id()); } } // Updating the live percolate queries for a delete by query is tricky with the current way // delete by queries // are handled. It is only possible if we put a big lock around the post delete by query hook... // If we implement delete by query, that just runs a query and generates delete operations in a // bulk, then // updating the live percolator is automatically supported for delete by query. // @Override // public void postDeleteByQuery(Engine.DeleteByQuery deleteByQuery) { // } } }
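/*
 * Illustrative sketch, not part of the original sources: PercolatorQueriesRegistry above
 * guards listener registration with AtomicBoolean.compareAndSet so that, even when shard
 * lifecycle events and percolator type creation race, the real-time listener is added and
 * removed exactly once. The ListenerToggleExample class below (and its Listener interface)
 * is a hypothetical stand-in for ShardIndexingService and IndexingOperationListener.
 */
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicBoolean;

class ListenerToggleExample {
  interface Listener {}

  private final List<Listener> listeners = new CopyOnWriteArrayList<>();
  private final AtomicBoolean enabled = new AtomicBoolean(false);
  private final Listener listener = new Listener() {};

  void enable() {
    // only the single thread that flips false -> true registers the listener,
    // so concurrent enable() calls cannot add it twice
    if (enabled.compareAndSet(false, true)) {
      listeners.add(listener);
    }
  }

  void disable() {
    // symmetric: only the thread that flips true -> false removes it
    if (enabled.compareAndSet(true, false)) {
      listeners.remove(listener);
    }
  }
}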
final class MockTransport implements Transport { Set<DiscoveryNode> connectedNodes = ConcurrentCollections.newConcurrentSet(); volatile boolean randomConnectionExceptions = false; @Override public void transportServiceAdapter(TransportServiceAdapter service) {} @Override public BoundTransportAddress boundAddress() { return null; } @Override public Map<String, BoundTransportAddress> profileBoundAddresses() { return null; } @Override public TransportAddress[] addressesFromString(String address, int perAddressLimit) throws UnknownHostException { return new TransportAddress[0]; } @Override public boolean nodeConnected(DiscoveryNode node) { return connectedNodes.contains(node); } @Override public void connectToNode(DiscoveryNode node, ConnectionProfile connectionProfile) throws ConnectTransportException { if (connectionProfile == null) { if (connectedNodes.contains(node) == false && randomConnectionExceptions && randomBoolean()) { throw new ConnectTransportException(node, "simulated"); } connectedNodes.add(node); } } @Override public void disconnectFromNode(DiscoveryNode node) { connectedNodes.remove(node); } @Override public Connection getConnection(DiscoveryNode node) { return new Connection() { @Override public DiscoveryNode getNode() { return node; } @Override public void sendRequest( long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException, TransportException {} @Override public void close() throws IOException {} }; } @Override public Connection openConnection(DiscoveryNode node, ConnectionProfile profile) throws IOException { return getConnection(node); } @Override public long serverOpen() { return 0; } @Override public List<String> getLocalAddresses() { return null; } @Override public Lifecycle.State lifecycleState() { return null; } @Override public void addLifecycleListener(LifecycleListener listener) {} @Override public void removeLifecycleListener(LifecycleListener listener) {} @Override public void start() {} @Override public void stop() {} @Override public void close() {} }
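/*
 * Standalone sketch, not part of the original sources: the pattern MockTransport above
 * relies on is a concurrent set as the single source of truth for "connected" nodes, plus
 * optional randomly injected failures so tests can exercise retry paths. FlakyConnections
 * and its use of String node ids are hypothetical simplifications of Transport and
 * DiscoveryNode.
 */
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ThreadLocalRandom;

class FlakyConnections {
  private final Set<String> connected = ConcurrentHashMap.newKeySet();
  volatile boolean randomConnectionExceptions = false;

  void connect(String node) {
    // mirror MockTransport.connectToNode: already-connected nodes never fail,
    // otherwise fail randomly while failure injection is enabled
    if (!connected.contains(node) && randomConnectionExceptions && ThreadLocalRandom.current().nextBoolean()) {
      throw new IllegalStateException("simulated connection failure to " + node);
    }
    connected.add(node);
  }

  boolean isConnected(String node) {
    return connected.contains(node);
  }

  void disconnect(String node) {
    connected.remove(node);
  }
}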
public class IndicesClusterStateService extends AbstractLifecycleComponent<IndicesClusterStateService> implements ClusterStateListener { private final IndicesService indicesService; private final ClusterService clusterService; private final ThreadPool threadPool; private final RecoveryTarget recoveryTarget; private final ShardStateAction shardStateAction; private final NodeIndexDeletedAction nodeIndexDeletedAction; private final NodeMappingRefreshAction nodeMappingRefreshAction; // a map of mapping types we have seen per index due to cluster state // we need this so we won't remove types automatically created as part of the indexing process private final ConcurrentMap<Tuple<String, String>, Boolean> seenMappings = ConcurrentCollections.newConcurrentMap(); // a map of shards that failed during recovery // we keep track of these shards in order to prevent repeated recovery of these shards on each // cluster state update private final ConcurrentMap<ShardId, FailedShard> failedShards = ConcurrentCollections.newConcurrentMap(); static class FailedShard { public final long version; public final long timestamp; FailedShard(long version) { this.version = version; this.timestamp = System.currentTimeMillis(); } } private final Object mutex = new Object(); private final FailedEngineHandler failedEngineHandler = new FailedEngineHandler(); private final boolean sendRefreshMapping; @Inject public IndicesClusterStateService( Settings settings, IndicesService indicesService, ClusterService clusterService, ThreadPool threadPool, RecoveryTarget recoveryTarget, ShardStateAction shardStateAction, NodeIndexDeletedAction nodeIndexDeletedAction, NodeMappingRefreshAction nodeMappingRefreshAction) { super(settings); this.indicesService = indicesService; this.clusterService = clusterService; this.threadPool = threadPool; this.recoveryTarget = recoveryTarget; this.shardStateAction = shardStateAction; this.nodeIndexDeletedAction = nodeIndexDeletedAction; this.nodeMappingRefreshAction = nodeMappingRefreshAction; this.sendRefreshMapping = this.settings.getAsBoolean("indices.cluster.send_refresh_mapping", true); } @Override protected void doStart() { clusterService.addFirst(this); } @Override protected void doStop() { clusterService.remove(this); } @Override protected void doClose() {} @Override public void clusterChanged(final ClusterChangedEvent event) { if (!indicesService.changesAllowed()) { return; } if (!lifecycle.started()) { return; } synchronized (mutex) { // we need to clean the shards and indices we have on this node, since we // are going to recover them again once state persistence is disabled (no master / not // recovered) // TODO: this feels a bit hacky here, a block disables state persistence, and then we clean // the allocated shards, maybe another flag in blocks?
if (event.state().blocks().disableStatePersistence()) { for (IndexService indexService : indicesService) { String index = indexService.index().getName(); for (Integer shardId : indexService.shardIds()) { logger.debug("[{}][{}] removing shard (disabled block persistence)", index, shardId); try { indexService.removeShard(shardId, "removing shard (disabled block persistence)"); } catch (Throwable e) { logger.warn("[{}] failed to remove shard (disabled block persistence)", e, index); } } removeIndex(index, "cleaning index (disabled block persistence)"); } return; } cleanFailedShards(event); applyDeletedIndices(event); applyNewIndices(event); applyMappings(event); applyAliases(event); applyNewOrUpdatedShards(event); applyDeletedShards(event); applyCleanedIndices(event); applySettings(event); } } private void applyCleanedIndices(final ClusterChangedEvent event) { // handle closed indices, since they are not allocated on a node once they are closed // so applyDeletedIndices might not take them into account for (IndexService indexService : indicesService) { String index = indexService.index().getName(); IndexMetaData indexMetaData = event.state().metaData().index(index); if (indexMetaData != null && indexMetaData.state() == IndexMetaData.State.CLOSE) { for (Integer shardId : indexService.shardIds()) { logger.debug("[{}][{}] removing shard (index is closed)", index, shardId); try { indexService.removeShard(shardId, "removing shard (index is closed)"); } catch (Throwable e) { logger.warn("[{}] failed to remove shard (index is closed)", e, index); } } } } for (IndexService indexService : indicesService) { String index = indexService.index().getName(); if (indexService.shardIds().isEmpty()) { if (logger.isDebugEnabled()) { logger.debug("[{}] cleaning index (no shards allocated)", index); } // clean the index removeIndex(index, "removing index (no shards allocated)"); } } } private void applyDeletedIndices(final ClusterChangedEvent event) { final ClusterState previousState = event.previousState(); final String localNodeId = event.state().nodes().localNodeId(); assert localNodeId != null; for (IndexService indexService : indicesService) { IndexMetaData indexMetaData = event.state().metaData().index(indexService.index().name()); if (indexMetaData != null) { if (!indexMetaData.isSameUUID(indexService.indexUUID())) { logger.debug( "[{}] mismatch on index UUIDs between cluster state and local state, cleaning the index so it will be recreated", indexMetaData.index()); deleteIndex( indexMetaData.index(), "mismatch on index UUIDs between cluster state and local state, cleaning the index so it will be recreated"); } } } for (String index : event.indicesDeleted()) { if (logger.isDebugEnabled()) { logger.debug("[{}] cleaning index, no longer part of the metadata", index); } final Settings indexSettings; final IndexService idxService = indicesService.indexService(index); if (idxService != null) { indexSettings = idxService.getIndexSettings(); deleteIndex(index, "index no longer part of the metadata"); } else { final IndexMetaData metaData = previousState.metaData().index(index); assert metaData != null; indexSettings = metaData.settings(); indicesService.deleteClosedIndex( "closed index no longer part of the metadata", metaData, event.state()); } try { nodeIndexDeletedAction.nodeIndexDeleted(event.state(), index, indexSettings, localNodeId); } catch (Throwable e) { logger.debug("failed to send to master index {} deleted event", e, index); } } } private void applyDeletedShards(final ClusterChangedEvent event) { 
RoutingNodes.RoutingNodeIterator routingNode = event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId()); if (routingNode == null) { return; } IntHashSet newShardIds = new IntHashSet(); for (IndexService indexService : indicesService) { String index = indexService.index().name(); IndexMetaData indexMetaData = event.state().metaData().index(index); if (indexMetaData == null) { continue; } // now, go over and delete shards that needs to get deleted newShardIds.clear(); for (ShardRouting shard : routingNode) { if (shard.index().equals(index)) { newShardIds.add(shard.id()); } } for (Integer existingShardId : indexService.shardIds()) { if (!newShardIds.contains(existingShardId)) { if (indexMetaData.state() == IndexMetaData.State.CLOSE) { if (logger.isDebugEnabled()) { logger.debug("[{}][{}] removing shard (index is closed)", index, existingShardId); } indexService.removeShard(existingShardId, "removing shard (index is closed)"); } else { // we can just remove the shard, without cleaning it locally, since we will clean it // when all shards are allocated in the IndicesStore if (logger.isDebugEnabled()) { logger.debug("[{}][{}] removing shard (not allocated)", index, existingShardId); } indexService.removeShard(existingShardId, "removing shard (not allocated)"); } } } } } private void applyNewIndices(final ClusterChangedEvent event) { // we only create indices for shards that are allocated RoutingNodes.RoutingNodeIterator routingNode = event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId()); if (routingNode == null) { return; } for (ShardRouting shard : routingNode) { if (!indicesService.hasIndex(shard.index())) { final IndexMetaData indexMetaData = event.state().metaData().index(shard.index()); if (logger.isDebugEnabled()) { logger.debug("[{}] creating index", indexMetaData.index()); } try { indicesService.createIndex( indexMetaData.index(), indexMetaData.settings(), event.state().nodes().localNode().id()); } catch (Throwable e) { sendFailShard(shard, indexMetaData.getIndexUUID(), "failed to create index", e); } } } } private void applySettings(ClusterChangedEvent event) { if (!event.metaDataChanged()) { return; } for (IndexMetaData indexMetaData : event.state().metaData()) { if (!indicesService.hasIndex(indexMetaData.index())) { // we only create / update here continue; } // if the index meta data didn't change, no need check for refreshed settings if (!event.indexMetaDataChanged(indexMetaData)) { continue; } String index = indexMetaData.index(); IndexService indexService = indicesService.indexService(index); if (indexService == null) { // already deleted on us, ignore it continue; } IndexSettingsService indexSettingsService = indexService.injector().getInstance(IndexSettingsService.class); indexSettingsService.refreshSettings(indexMetaData.settings()); } } private void applyMappings(ClusterChangedEvent event) { // go over and update mappings for (IndexMetaData indexMetaData : event.state().metaData()) { if (!indicesService.hasIndex(indexMetaData.index())) { // we only create / update here continue; } List<String> typesToRefresh = Lists.newArrayList(); String index = indexMetaData.index(); IndexService indexService = indicesService.indexService(index); if (indexService == null) { // got deleted on us, ignore (closing the node) return; } try { MapperService mapperService = indexService.mapperService(); // first, go over and update the _default_ mapping (if exists) if 
(indexMetaData.mappings().containsKey(MapperService.DEFAULT_MAPPING)) { boolean requireRefresh = processMapping( index, mapperService, MapperService.DEFAULT_MAPPING, indexMetaData.mapping(MapperService.DEFAULT_MAPPING).source()); if (requireRefresh) { typesToRefresh.add(MapperService.DEFAULT_MAPPING); } } // go over and add the relevant mappings (or update them) for (ObjectCursor<MappingMetaData> cursor : indexMetaData.mappings().values()) { MappingMetaData mappingMd = cursor.value; String mappingType = mappingMd.type(); CompressedXContent mappingSource = mappingMd.source(); if (mappingType.equals(MapperService.DEFAULT_MAPPING)) { // we processed _default_ first continue; } boolean requireRefresh = processMapping(index, mapperService, mappingType, mappingSource); if (requireRefresh) { typesToRefresh.add(mappingType); } } if (!typesToRefresh.isEmpty() && sendRefreshMapping) { nodeMappingRefreshAction.nodeMappingRefresh( event.state(), new NodeMappingRefreshAction.NodeMappingRefreshRequest( index, indexMetaData.indexUUID(), typesToRefresh.toArray(new String[typesToRefresh.size()]), event.state().nodes().localNodeId())); } } catch (Throwable t) { // if we failed the mappings anywhere, we need to fail the shards for this index; note, we // safeguard by processing the mappings on the master, or on the node the mapping was // introduced on, so this failure typically means wrong node-level configuration or something // similar for (IndexShard indexShard : indexService) { ShardRouting shardRouting = indexShard.routingEntry(); failAndRemoveShard(shardRouting, indexService, true, "failed to update mappings", t); } } } } private boolean processMapping( String index, MapperService mapperService, String mappingType, CompressedXContent mappingSource) throws Throwable { if (!seenMappings.containsKey(new Tuple<>(index, mappingType))) { seenMappings.put(new Tuple<>(index, mappingType), true); } // a refresh mapping can happen for two reasons. The first is less urgent: it happens when the // mapping on this node is ahead of the one in the cluster state (an update-mapping has already // been sent, it just hasn't been processed and published yet). Eventually the mappings will // converge, and the refresh mapping sent is more of a safekeeping (in case the update mapping // failed to reach the master, ...). // The second case is when parsing/merging the mapping from the metadata doesn't result in the // same mapping; in this case, we ask the master to refresh its own version of the mappings (to // conform with its merged version, which it does when refreshing the mappings), and warn-log it.
boolean requiresRefresh = false; try { if (!mapperService.hasMapping(mappingType)) { if (logger.isDebugEnabled() && mappingSource.compressed().length < 512) { logger.debug( "[{}] adding mapping [{}], source [{}]", index, mappingType, mappingSource.string()); } else if (logger.isTraceEnabled()) { logger.trace( "[{}] adding mapping [{}], source [{}]", index, mappingType, mappingSource.string()); } else { logger.debug( "[{}] adding mapping [{}] (source suppressed due to length, use TRACE level if needed)", index, mappingType); } // we don't apply default, since it has been applied when the mappings were parsed initially mapperService.merge(mappingType, mappingSource, false, true); if (!mapperService.documentMapper(mappingType).mappingSource().equals(mappingSource)) { logger.debug( "[{}] parsed mapping [{}], and got different sources\noriginal:\n{}\nparsed:\n{}", index, mappingType, mappingSource, mapperService.documentMapper(mappingType).mappingSource()); requiresRefresh = true; } } else { DocumentMapper existingMapper = mapperService.documentMapper(mappingType); if (!mappingSource.equals(existingMapper.mappingSource())) { // mapping changed, update it if (logger.isDebugEnabled() && mappingSource.compressed().length < 512) { logger.debug( "[{}] updating mapping [{}], source [{}]", index, mappingType, mappingSource.string()); } else if (logger.isTraceEnabled()) { logger.trace( "[{}] updating mapping [{}], source [{}]", index, mappingType, mappingSource.string()); } else { logger.debug( "[{}] updating mapping [{}] (source suppressed due to length, use TRACE level if needed)", index, mappingType); } // we don't apply default, since it has been applied when the mappings were parsed // initially mapperService.merge(mappingType, mappingSource, false, true); if (!mapperService.documentMapper(mappingType).mappingSource().equals(mappingSource)) { requiresRefresh = true; logger.debug( "[{}] parsed mapping [{}], and got different sources\noriginal:\n{}\nparsed:\n{}", index, mappingType, mappingSource, mapperService.documentMapper(mappingType).mappingSource()); } } } } catch (Throwable e) { logger.warn( "[{}] failed to add mapping [{}], source [{}]", e, index, mappingType, mappingSource); throw e; } return requiresRefresh; } private boolean aliasesChanged(ClusterChangedEvent event) { return !event.state().metaData().aliases().equals(event.previousState().metaData().aliases()) || !event.state().routingTable().equals(event.previousState().routingTable()); } private void applyAliases(ClusterChangedEvent event) { // check if aliases changed if (aliasesChanged(event)) { // go over and update aliases for (IndexMetaData indexMetaData : event.state().metaData()) { String index = indexMetaData.index(); IndexService indexService = indicesService.indexService(index); if (indexService == null) { // we only create / update here continue; } IndexAliasesService indexAliasesService = indexService.aliasesService(); indexAliasesService.setAliases(indexMetaData.getAliases()); } } } private void applyNewOrUpdatedShards(final ClusterChangedEvent event) { if (!indicesService.changesAllowed()) { return; } RoutingTable routingTable = event.state().routingTable(); RoutingNodes.RoutingNodeIterator routingNode = event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId()); if (routingNode == null) { failedShards.clear(); return; } DiscoveryNodes nodes = event.state().nodes(); for (final ShardRouting shardRouting : routingNode) { final IndexService indexService = 
indicesService.indexService(shardRouting.index()); if (indexService == null) { // got deleted on us, ignore continue; } final IndexMetaData indexMetaData = event.state().metaData().index(shardRouting.index()); if (indexMetaData == null) { // the index got deleted on the metadata, we will clean it later in the apply deleted method // call continue; } final int shardId = shardRouting.id(); if (!indexService.hasShard(shardId) && shardRouting.started()) { if (failedShards.containsKey(shardRouting.shardId())) { if (nodes.masterNode() != null) { shardStateAction.resendShardFailed( shardRouting, indexMetaData.getIndexUUID(), "master " + nodes.masterNode() + " marked shard as started, but shard has previous failed. resending shard failure.", nodes.masterNode()); } } else { // the master thinks we are started, but we don't have this shard at all, mark it as // failed sendFailShard( shardRouting, indexMetaData.getIndexUUID(), "master [" + nodes.masterNode() + "] marked shard as started, but shard has not been created, mark shard as failed", null); } continue; } IndexShard indexShard = indexService.shard(shardId); if (indexShard != null) { ShardRouting currentRoutingEntry = indexShard.routingEntry(); // if the current and global routing are initializing, but are still not the same, its a // different "shard" being allocated // for example: a shard that recovers from one node and now needs to recover to another // node, // or a replica allocated and then allocating a primary because the primary // failed on another node boolean shardHasBeenRemoved = false; if (currentRoutingEntry.initializing() && shardRouting.initializing() && !currentRoutingEntry.equals(shardRouting)) { logger.debug( "[{}][{}] removing shard (different instance of it allocated on this node, current [{}], global [{}])", shardRouting.index(), shardRouting.id(), currentRoutingEntry, shardRouting); // closing the shard will also cancel any ongoing recovery. indexService.removeShard( shardRouting.id(), "removing shard (different instance of it allocated on this node)"); shardHasBeenRemoved = true; } else if (isPeerRecovery(shardRouting)) { final DiscoveryNode sourceNode = findSourceNodeForPeerRecovery(routingTable, nodes, shardRouting); // check if there is an existing recovery going, and if so, and the source node is not the // same, cancel the recovery to restart it final Predicate<RecoveryStatus> shouldCancel = new Predicate<RecoveryStatus>() { @Override public boolean apply(@Nullable RecoveryStatus status) { return status.sourceNode().equals(sourceNode) == false; } }; if (recoveryTarget.cancelRecoveriesForShard( indexShard.shardId(), "recovery source node changed", shouldCancel)) { logger.debug( "[{}][{}] removing shard (recovery source changed), current [{}], global [{}])", shardRouting.index(), shardRouting.id(), currentRoutingEntry, shardRouting); // closing the shard will also cancel any ongoing recovery. 
indexService.removeShard( shardRouting.id(), "removing shard (recovery source node changed)"); shardHasBeenRemoved = true; } } if (shardHasBeenRemoved == false && (shardRouting.equals(indexShard.routingEntry()) == false || shardRouting.version() > indexShard.routingEntry().version())) { if (shardRouting.primary() && indexShard.routingEntry().primary() == false && shardRouting.initializing() && indexShard.allowsPrimaryPromotion() == false) { logger.debug("{} reinitialize shard on primary promotion", indexShard.shardId()); indexService.removeShard(shardId, "promoted to primary"); } else { // if we happen to remove the shardRouting by id above we don't need to jump in here! indexShard.updateRoutingEntry( shardRouting, event.state().blocks().disableStatePersistence() == false); } } } if (shardRouting.initializing()) { applyInitializingShard(event.state(), indexMetaData, shardRouting); } } } private void cleanFailedShards(final ClusterChangedEvent event) { RoutingTable routingTable = event.state().routingTable(); RoutingNodes.RoutingNodeIterator routingNode = event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId()); if (routingNode == null) { failedShards.clear(); return; } DiscoveryNodes nodes = event.state().nodes(); long now = System.currentTimeMillis(); String localNodeId = nodes.localNodeId(); Iterator<Map.Entry<ShardId, FailedShard>> iterator = failedShards.entrySet().iterator(); shards: while (iterator.hasNext()) { Map.Entry<ShardId, FailedShard> entry = iterator.next(); FailedShard failedShard = entry.getValue(); IndexRoutingTable indexRoutingTable = routingTable.index(entry.getKey().getIndex()); if (indexRoutingTable != null) { IndexShardRoutingTable shardRoutingTable = indexRoutingTable.shard(entry.getKey().id()); if (shardRoutingTable != null) { for (ShardRouting shardRouting : shardRoutingTable.assignedShards()) { if (localNodeId.equals(shardRouting.currentNodeId())) { // we have a timeout here just to make sure we don't have dangling failed shards for // some reason // it's just another safety layer if (shardRouting.version() == failedShard.version && ((now - failedShard.timestamp) < TimeValue.timeValueMinutes(60).millis())) { // It's the same failed shard - keep it if it hasn't timed out continue shards; } else { // Different version or expired, remove it break; } } } } } iterator.remove(); } } private void applyInitializingShard( final ClusterState state, final IndexMetaData indexMetaData, final ShardRouting shardRouting) { final IndexService indexService = indicesService.indexService(shardRouting.index()); if (indexService == null) { // got deleted on us, ignore return; } final RoutingTable routingTable = state.routingTable(); final DiscoveryNodes nodes = state.getNodes(); final int shardId = shardRouting.id(); if (indexService.hasShard(shardId)) { IndexShard indexShard = indexService.shardSafe(shardId); if (indexShard.state() == IndexShardState.STARTED || indexShard.state() == IndexShardState.POST_RECOVERY) { // the master thinks we are initializing, but we are already started or on POST_RECOVERY and // waiting // for master to confirm a shard started message (either master failover, or a cluster event // before // we managed to tell the master we started), mark us as started if (logger.isTraceEnabled()) { logger.trace( "{} master marked shard as initializing, but shard has state [{}], resending shard started to {}", indexShard.shardId(), indexShard.state(), nodes.masterNode()); } if (nodes.masterNode() != null) { shardStateAction.shardStarted(
shardRouting, indexMetaData.getIndexUUID(), "master " + nodes.masterNode() + " marked shard as initializing, but shard state is [" + indexShard.state() + "], mark shard as started", nodes.masterNode()); } return; } else { if (indexShard.ignoreRecoveryAttempt()) { logger.trace( "ignoring recovery instruction for an existing shard {} (shard state: [{}])", indexShard.shardId(), indexShard.state()); return; } } } // if we're in peer recovery, try to find out the source node now so in case it fails, we will // not create the index shard DiscoveryNode sourceNode = null; if (isPeerRecovery(shardRouting)) { sourceNode = findSourceNodeForPeerRecovery(routingTable, nodes, shardRouting); if (sourceNode == null) { logger.trace( "ignoring initializing shard {} - no source node can be found.", shardRouting.shardId()); return; } } // if there is no shard, create it if (!indexService.hasShard(shardId)) { if (failedShards.containsKey(shardRouting.shardId())) { if (nodes.masterNode() != null) { shardStateAction.resendShardFailed( shardRouting, indexMetaData.getIndexUUID(), "master " + nodes.masterNode() + " marked shard as initializing, but shard is marked as failed, resend shard failure", nodes.masterNode()); } return; } try { if (logger.isDebugEnabled()) { logger.debug("[{}][{}] creating shard", shardRouting.index(), shardId); } IndexShard indexShard = indexService.createShard(shardId, shardRouting.primary()); indexShard.updateRoutingEntry( shardRouting, state.blocks().disableStatePersistence() == false); indexShard.addFailedEngineListener(failedEngineHandler); } catch (IndexShardAlreadyExistsException e) { // ignore this, the method call can happen several times } catch (Throwable e) { failAndRemoveShard(shardRouting, indexService, true, "failed to create shard", e); return; } } final IndexShard indexShard = indexService.shardSafe(shardId); if (indexShard.ignoreRecoveryAttempt()) { // we are already recovering (we can get to this state since the cluster event can happen // several // times while we recover) logger.trace( "ignoring recovery instruction for shard {} (shard state: [{}])", indexShard.shardId(), indexShard.state()); return; } if (isPeerRecovery(shardRouting)) { try { assert sourceNode != null : "peer recovery started but sourceNode is null"; // we don't mark this one as relocated at the end. // For primaries: requests in any case are routed to both when its relocating and that way // we handle // the edge case where its mark as relocated, and we might need to roll it back... // For replicas: we are recovering a backup from a primary RecoveryState.Type type = shardRouting.primary() ? 
RecoveryState.Type.RELOCATION : RecoveryState.Type.REPLICA; recoveryTarget.startRecovery( indexShard, type, sourceNode, new PeerRecoveryListener(shardRouting, indexService, indexMetaData)); } catch (Throwable e) { indexShard.failShard("corrupted preexisting index", e); handleRecoveryFailure(indexService, shardRouting, true, e); } } else { final IndexShardRoutingTable indexShardRouting = routingTable.index(shardRouting.index()).shard(shardRouting.id()); indexService .shard(shardId) .recoverFromStore( indexShardRouting, new StoreRecoveryService.RecoveryListener() { @Override public void onRecoveryDone() { shardStateAction.shardStarted( shardRouting, indexMetaData.getIndexUUID(), "after recovery from store"); } @Override public void onIgnoreRecovery(String reason) {} @Override public void onRecoveryFailed(IndexShardRecoveryException e) { handleRecoveryFailure(indexService, shardRouting, true, e); } }); } } /** * Finds the routing source node for peer recovery, return null if its not found. Note, this * method expects the shard routing to *require* peer recovery, use {@link * #isPeerRecovery(org.elasticsearch.cluster.routing.ShardRouting)} to check if its needed or not. */ private DiscoveryNode findSourceNodeForPeerRecovery( RoutingTable routingTable, DiscoveryNodes nodes, ShardRouting shardRouting) { DiscoveryNode sourceNode = null; if (!shardRouting.primary()) { IndexShardRoutingTable shardRoutingTable = routingTable.index(shardRouting.index()).shard(shardRouting.id()); for (ShardRouting entry : shardRoutingTable) { if (entry.primary() && entry.active()) { // only recover from started primary, if we can't find one, we will do it next round sourceNode = nodes.get(entry.currentNodeId()); if (sourceNode == null) { logger.trace( "can't find replica source node because primary shard {} is assigned to an unknown node.", entry); return null; } break; } } if (sourceNode == null) { logger.trace( "can't find replica source node for {} because a primary shard can not be found.", shardRouting.shardId()); } } else if (shardRouting.relocatingNodeId() != null) { sourceNode = nodes.get(shardRouting.relocatingNodeId()); if (sourceNode == null) { logger.trace( "can't find relocation source node for shard {} because it is assigned to an unknown node [{}].", shardRouting.shardId(), shardRouting.relocatingNodeId()); } } else { throw new IllegalStateException( "trying to find source node for peer recovery when routing state means no peer recovery: " + shardRouting); } return sourceNode; } private boolean isPeerRecovery(ShardRouting shardRouting) { return !shardRouting.primary() || shardRouting.relocatingNodeId() != null; } private class PeerRecoveryListener implements RecoveryTarget.RecoveryListener { private final ShardRouting shardRouting; private final IndexService indexService; private final IndexMetaData indexMetaData; private PeerRecoveryListener( ShardRouting shardRouting, IndexService indexService, IndexMetaData indexMetaData) { this.shardRouting = shardRouting; this.indexService = indexService; this.indexMetaData = indexMetaData; } @Override public void onRecoveryDone(RecoveryState state) { shardStateAction.shardStarted( shardRouting, indexMetaData.getIndexUUID(), "after recovery (replica) from node [" + state.getSourceNode() + "]"); } @Override public void onRecoveryFailure( RecoveryState state, RecoveryFailedException e, boolean sendShardFailure) { handleRecoveryFailure(indexService, shardRouting, sendShardFailure, e); } } private void handleRecoveryFailure( IndexService indexService, ShardRouting 
shardRouting, boolean sendShardFailure, Throwable failure) { synchronized (mutex) { failAndRemoveShard(shardRouting, indexService, sendShardFailure, "failed recovery", failure); } } private void removeIndex(String index, String reason) { try { indicesService.removeIndex(index, reason); } catch (Throwable e) { logger.warn("failed to clean index ({})", e, reason); } clearSeenMappings(index); } private void clearSeenMappings(String index) { // clear seen mappings as well for (Tuple<String, String> tuple : seenMappings.keySet()) { if (tuple.v1().equals(index)) { seenMappings.remove(tuple); } } } private void deleteIndex(String index, String reason) { try { indicesService.deleteIndex(index, reason); } catch (Throwable e) { logger.warn("failed to delete index ({})", e, reason); } // clear seen mappings as well clearSeenMappings(index); } private void failAndRemoveShard( ShardRouting shardRouting, IndexService indexService, boolean sendShardFailure, String message, @Nullable Throwable failure) { if (indexService.hasShard(shardRouting.getId())) { try { indexService.removeShard(shardRouting.getId(), message); } catch (ShardNotFoundException e) { // the node got closed on us, ignore it } catch (Throwable e1) { logger.warn( "[{}][{}] failed to remove shard after failure ([{}])", e1, shardRouting.getIndex(), shardRouting.getId(), message); } } if (sendShardFailure) { sendFailShard(shardRouting, indexService.indexUUID(), message, failure); } } private void sendFailShard( ShardRouting shardRouting, String indexUUID, String message, @Nullable Throwable failure) { try { logger.warn( "[{}] marking and sending shard failed due to [{}]", failure, shardRouting.shardId(), message); failedShards.put(shardRouting.shardId(), new FailedShard(shardRouting.version())); shardStateAction.shardFailed( shardRouting, indexUUID, "shard failure [" + message + "]" + (failure == null ? "" : "[" + detailedMessage(failure) + "]")); } catch (Throwable e1) { logger.warn( "[{}][{}] failed to mark shard as failed (because of [{}])", e1, shardRouting.getIndex(), shardRouting.getId(), message); } } private class FailedEngineHandler implements Engine.FailedEngineListener { @Override public void onFailedEngine( final ShardId shardId, final String reason, final @Nullable Throwable failure) { ShardRouting shardRouting = null; final IndexService indexService = indicesService.indexService(shardId.index().name()); if (indexService != null) { IndexShard indexShard = indexService.shard(shardId.id()); if (indexShard != null) { shardRouting = indexShard.routingEntry(); } } if (shardRouting == null) { logger.warn( "[{}][{}] engine failed, but can't find index shard. failure reason: [{}]", failure, shardId.index().name(), shardId.id(), reason); return; } final ShardRouting fShardRouting = shardRouting; threadPool .generic() .execute( new Runnable() { @Override public void run() { synchronized (mutex) { failAndRemoveShard( fShardRouting, indexService, true, "engine failure, reason [" + reason + "]", failure); } } }); } } }
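/*
 * Illustrative sketch, not part of the original sources: the failedShards bookkeeping that
 * cleanFailedShards above maintains. An entry survives only while the routing table still
 * assigns the same shard version to this node and the entry is younger than a fixed
 * timeout; anything else is dropped so the shard may be failed and recovered again.
 * FailedShardTracker and its use of plain long shard keys are hypothetical simplifications
 * of the ShardId/FailedShard types.
 */
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;

class FailedShardTracker {
  static final long TIMEOUT_MILLIS = TimeUnit.MINUTES.toMillis(60);

  static final class Entry {
    final long version;
    final long timestamp = System.currentTimeMillis();

    Entry(long version) {
      this.version = version;
    }
  }

  private final Map<Long, Entry> failed = new ConcurrentHashMap<>();

  void markFailed(long shardKey, long routingVersion) {
    failed.put(shardKey, new Entry(routingVersion));
  }

  /** Drops entries whose routing version moved on or whose timeout expired. */
  void clean(Map<Long, Long> currentRoutingVersions) {
    long now = System.currentTimeMillis();
    Iterator<Map.Entry<Long, Entry>> it = failed.entrySet().iterator();
    while (it.hasNext()) {
      Map.Entry<Long, Entry> e = it.next();
      Long current = currentRoutingVersions.get(e.getKey());
      boolean sameFailedShard = current != null && current == e.getValue().version;
      boolean expired = (now - e.getValue().timestamp) >= TIMEOUT_MILLIS;
      if (!sameFailedShard || expired) {
        it.remove(); // allow this shard to be failed/recovered again
      }
    }
  }
}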
public class ClusterService extends AbstractLifecycleComponent { public static final Setting<TimeValue> CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING = Setting.positiveTimeSetting( "cluster.service.slow_task_logging_threshold", TimeValue.timeValueSeconds(30), Property.Dynamic, Property.NodeScope); public static final String UPDATE_THREAD_NAME = "clusterService#updateTask"; private final ThreadPool threadPool; private final ClusterName clusterName; private BiConsumer<ClusterChangedEvent, Discovery.AckListener> clusterStatePublisher; private final OperationRouting operationRouting; private final ClusterSettings clusterSettings; private TimeValue slowTaskLoggingThreshold; private volatile PrioritizedEsThreadPoolExecutor updateTasksExecutor; /** Those 3 state listeners are changing infrequently - CopyOnWriteArrayList is just fine */ private final Collection<ClusterStateListener> priorityClusterStateListeners = new CopyOnWriteArrayList<>(); private final Collection<ClusterStateListener> clusterStateListeners = new CopyOnWriteArrayList<>(); private final Collection<ClusterStateListener> lastClusterStateListeners = new CopyOnWriteArrayList<>(); private final Map<ClusterStateTaskExecutor, List<UpdateTask>> updateTasksPerExecutor = new HashMap<>(); // TODO this is rather frequently changing I guess a Synced Set would be better here and a // dedicated remove API private final Collection<ClusterStateListener> postAppliedListeners = new CopyOnWriteArrayList<>(); private final Iterable<ClusterStateListener> preAppliedListeners = Iterables.concat( priorityClusterStateListeners, clusterStateListeners, lastClusterStateListeners); private final LocalNodeMasterListeners localNodeMasterListeners; private final Queue<NotifyTimeout> onGoingTimeouts = ConcurrentCollections.newQueue(); private volatile ClusterState clusterState; private final ClusterBlocks.Builder initialBlocks; private NodeConnectionsService nodeConnectionsService; public ClusterService(Settings settings, ClusterSettings clusterSettings, ThreadPool threadPool) { super(settings); this.operationRouting = new OperationRouting(settings, clusterSettings); this.threadPool = threadPool; this.clusterSettings = clusterSettings; this.clusterName = ClusterName.CLUSTER_NAME_SETTING.get(settings); // will be replaced on doStart. 
this.clusterState = ClusterState.builder(clusterName).build(); this.clusterSettings.addSettingsUpdateConsumer( CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING, this::setSlowTaskLoggingThreshold); this.slowTaskLoggingThreshold = CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING.get(settings); localNodeMasterListeners = new LocalNodeMasterListeners(threadPool); initialBlocks = ClusterBlocks.builder(); } private void setSlowTaskLoggingThreshold(TimeValue slowTaskLoggingThreshold) { this.slowTaskLoggingThreshold = slowTaskLoggingThreshold; } public synchronized void setClusterStatePublisher( BiConsumer<ClusterChangedEvent, Discovery.AckListener> publisher) { clusterStatePublisher = publisher; } public synchronized void setLocalNode(DiscoveryNode localNode) { assert clusterState.nodes().getLocalNodeId() == null : "local node is already set"; DiscoveryNodes.Builder nodeBuilder = DiscoveryNodes.builder(clusterState.nodes()).add(localNode).localNodeId(localNode.getId()); this.clusterState = ClusterState.builder(clusterState).nodes(nodeBuilder).build(); } public synchronized void setNodeConnectionsService( NodeConnectionsService nodeConnectionsService) { assert this.nodeConnectionsService == null : "nodeConnectionsService is already set"; this.nodeConnectionsService = nodeConnectionsService; } /** Adds an initial block to be set on the first cluster state created. */ public synchronized void addInitialStateBlock(ClusterBlock block) throws IllegalStateException { if (lifecycle.started()) { throw new IllegalStateException("can't set initial block when started"); } initialBlocks.addGlobalBlock(block); } /** Remove an initial block to be set on the first cluster state created. */ public synchronized void removeInitialStateBlock(ClusterBlock block) throws IllegalStateException { removeInitialStateBlock(block.id()); } /** Remove an initial block to be set on the first cluster state created. 
*/ public synchronized void removeInitialStateBlock(int blockId) throws IllegalStateException { if (lifecycle.started()) { throw new IllegalStateException("can't remove initial block when started"); } initialBlocks.removeGlobalBlock(blockId); } @Override protected synchronized void doStart() { Objects.requireNonNull( clusterStatePublisher, "please set a cluster state publisher before starting"); Objects.requireNonNull( clusterState.nodes().getLocalNode(), "please set the local node before starting"); Objects.requireNonNull( nodeConnectionsService, "please set the node connection service before starting"); add(localNodeMasterListeners); this.clusterState = ClusterState.builder(clusterState).blocks(initialBlocks).build(); this.updateTasksExecutor = EsExecutors.newSinglePrioritizing( UPDATE_THREAD_NAME, daemonThreadFactory(settings, UPDATE_THREAD_NAME), threadPool.getThreadContext()); } @Override protected synchronized void doStop() { for (NotifyTimeout onGoingTimeout : onGoingTimeouts) { onGoingTimeout.cancel(); try { onGoingTimeout.listener.onClose(); } catch (Exception ex) { logger.debug("failed to notify listeners on shutdown", ex); } } ThreadPool.terminate(updateTasksExecutor, 10, TimeUnit.SECONDS); // close timeout listeners that did not have an ongoing timeout postAppliedListeners .stream() .filter(listener -> listener instanceof TimeoutClusterStateListener) .map(listener -> (TimeoutClusterStateListener) listener) .forEach(TimeoutClusterStateListener::onClose); remove(localNodeMasterListeners); } @Override protected synchronized void doClose() {} /** The local node. */ public DiscoveryNode localNode() { DiscoveryNode localNode = clusterState.getNodes().getLocalNode(); if (localNode == null) { throw new IllegalStateException("No local node found. Is the node started?"); } return localNode; } public OperationRouting operationRouting() { return operationRouting; } /** The current state. */ public ClusterState state() { return this.clusterState; } /** Adds a priority listener for updated cluster states. */ public void addFirst(ClusterStateListener listener) { priorityClusterStateListeners.add(listener); } /** Adds a listener that is notified last for updated cluster states. */ public void addLast(ClusterStateListener listener) { lastClusterStateListeners.add(listener); } /** Adds a listener for updated cluster states. */ public void add(ClusterStateListener listener) { clusterStateListeners.add(listener); } /** Removes a listener for updated cluster states. */ public void remove(ClusterStateListener listener) { clusterStateListeners.remove(listener); priorityClusterStateListeners.remove(listener); lastClusterStateListeners.remove(listener); postAppliedListeners.remove(listener); for (Iterator<NotifyTimeout> it = onGoingTimeouts.iterator(); it.hasNext(); ) { NotifyTimeout timeout = it.next(); if (timeout.listener.equals(listener)) { timeout.cancel(); it.remove(); } } } /** Add a listener for on/off local node master events */ public void add(LocalNodeMasterListener listener) { localNodeMasterListeners.add(listener); } /** Remove the given listener for on/off local master events */ public void remove(LocalNodeMasterListener listener) { localNodeMasterListeners.remove(listener); } /** * Adds a cluster state listener that will time out after the provided timeout, and is executed * after the cluster state has been successfully applied, i.e.
is in state {@link * org.elasticsearch.cluster.ClusterState.ClusterStateStatus#APPLIED} NOTE: a {@code null} timeout * means that the listener will never be removed automatically */ public void add(@Nullable final TimeValue timeout, final TimeoutClusterStateListener listener) { if (lifecycle.stoppedOrClosed()) { listener.onClose(); return; } // call the post added notification on the same event thread try { updateTasksExecutor.execute( new SourcePrioritizedRunnable(Priority.HIGH, "_add_listener_") { @Override public void run() { if (timeout != null) { NotifyTimeout notifyTimeout = new NotifyTimeout(listener, timeout); notifyTimeout.future = threadPool.schedule(timeout, ThreadPool.Names.GENERIC, notifyTimeout); onGoingTimeouts.add(notifyTimeout); } postAppliedListeners.add(listener); listener.postAdded(); } }); } catch (EsRejectedExecutionException e) { if (lifecycle.stoppedOrClosed()) { listener.onClose(); } else { throw e; } } } /** * Submits a cluster state update task; unlike {@link #submitStateUpdateTask(String, Object, * ClusterStateTaskConfig, ClusterStateTaskExecutor, ClusterStateTaskListener)}, submitted updates * will not be batched. * * @param source the source of the cluster state update task * @param updateTask the full context for the cluster state update task */ public void submitStateUpdateTask(final String source, final ClusterStateUpdateTask updateTask) { submitStateUpdateTask(source, updateTask, updateTask, updateTask, updateTask); } /** * Submits a cluster state update task; submitted updates will be batched across the same instance * of executor. The exact batching semantics depend on the underlying implementation but a rough * guideline is that if the update task is submitted while there are pending update tasks for the * same executor, these update tasks will all be executed on the executor in a single batch * * @param source the source of the cluster state update task * @param task the state needed for the cluster state update task * @param config the cluster state update task configuration * @param executor the cluster state update task executor; tasks that share the same executor will * be executed batches on this executor * @param listener callback after the cluster state update task completes * @param <T> the type of the cluster state update task state */ public <T> void submitStateUpdateTask( final String source, final T task, final ClusterStateTaskConfig config, final ClusterStateTaskExecutor<T> executor, final ClusterStateTaskListener listener) { submitStateUpdateTasks(source, Collections.singletonMap(task, listener), config, executor); } /** * Submits a batch of cluster state update tasks; submitted updates are guaranteed to be processed * together, potentially with more tasks of the same executor. 
* * @param source the source of the cluster state update task * @param tasks a map of update tasks and their corresponding listeners * @param config the cluster state update task configuration * @param executor the cluster state update task executor; tasks that share the same executor will * be executed batches on this executor * @param <T> the type of the cluster state update task state */ public <T> void submitStateUpdateTasks( final String source, final Map<T, ClusterStateTaskListener> tasks, final ClusterStateTaskConfig config, final ClusterStateTaskExecutor<T> executor) { if (!lifecycle.started()) { return; } if (tasks.isEmpty()) { return; } try { // convert to an identity map to check for dups based on update tasks semantics of using // identity instead of equal final IdentityHashMap<T, ClusterStateTaskListener> tasksIdentity = new IdentityHashMap<>(tasks); final List<UpdateTask<T>> updateTasks = tasksIdentity .entrySet() .stream() .map( entry -> new UpdateTask<>( source, entry.getKey(), config, executor, safe(entry.getValue(), logger))) .collect(Collectors.toList()); synchronized (updateTasksPerExecutor) { List<UpdateTask> existingTasks = updateTasksPerExecutor.computeIfAbsent(executor, k -> new ArrayList<>()); for (@SuppressWarnings("unchecked") UpdateTask<T> existing : existingTasks) { if (tasksIdentity.containsKey(existing.task)) { throw new IllegalStateException( "task [" + executor.describeTasks(Collections.singletonList(existing.task)) + "] with source [" + source + "] is already queued"); } } existingTasks.addAll(updateTasks); } final UpdateTask<T> firstTask = updateTasks.get(0); if (config.timeout() != null) { updateTasksExecutor.execute( firstTask, threadPool.scheduler(), config.timeout(), () -> threadPool .generic() .execute( () -> { for (UpdateTask<T> task : updateTasks) { if (task.processed.getAndSet(true) == false) { logger.debug( "cluster state update task [{}] timed out after [{}]", source, config.timeout()); task.listener.onFailure( source, new ProcessClusterEventTimeoutException( config.timeout(), source)); } } })); } else { updateTasksExecutor.execute(firstTask); } } catch (EsRejectedExecutionException e) { // ignore cases where we are shutting down..., there is really nothing interesting // to be done here... if (!lifecycle.stoppedOrClosed()) { throw e; } } } /** Returns the tasks that are pending. */ public List<PendingClusterTask> pendingTasks() { PrioritizedEsThreadPoolExecutor.Pending[] pendings = updateTasksExecutor.getPending(); List<PendingClusterTask> pendingClusterTasks = new ArrayList<>(pendings.length); for (PrioritizedEsThreadPoolExecutor.Pending pending : pendings) { final String source; final long timeInQueue; // we have to capture the task as it will be nulled after execution and we don't want to // change while we check things here. final Object task = pending.task; if (task == null) { continue; } else if (task instanceof SourcePrioritizedRunnable) { SourcePrioritizedRunnable runnable = (SourcePrioritizedRunnable) task; source = runnable.source(); timeInQueue = runnable.getAgeInMillis(); } else { assert false : "expected SourcePrioritizedRunnable got " + task.getClass(); source = "unknown [" + task.getClass() + "]"; timeInQueue = 0; } pendingClusterTasks.add( new PendingClusterTask( pending.insertionOrder, pending.priority, new Text(source), timeInQueue, pending.executing)); } return pendingClusterTasks; } /** Returns the number of currently pending tasks. 
*/ public int numberOfPendingTasks() { return updateTasksExecutor.getNumberOfPendingTasks(); } /** * Returns the maximum wait time for tasks in the queue * * @return A zero time value if the queue is empty, otherwise the time value oldest task waiting * in the queue */ public TimeValue getMaxTaskWaitTime() { return updateTasksExecutor.getMaxTaskWaitTime(); } /** asserts that the current thread is the cluster state update thread */ public static boolean assertClusterStateThread() { assert Thread.currentThread().getName().contains(ClusterService.UPDATE_THREAD_NAME) : "not called from the cluster state update thread"; return true; } public ClusterName getClusterName() { return clusterName; } abstract static class SourcePrioritizedRunnable extends PrioritizedRunnable { protected final String source; public SourcePrioritizedRunnable(Priority priority, String source) { super(priority); this.source = source; } public String source() { return source; } } <T> void runTasksForExecutor(ClusterStateTaskExecutor<T> executor) { final ArrayList<UpdateTask<T>> toExecute = new ArrayList<>(); final Map<String, ArrayList<T>> processTasksBySource = new HashMap<>(); synchronized (updateTasksPerExecutor) { List<UpdateTask> pending = updateTasksPerExecutor.remove(executor); if (pending != null) { for (UpdateTask<T> task : pending) { if (task.processed.getAndSet(true) == false) { logger.trace("will process {}", task.toString(executor)); toExecute.add(task); processTasksBySource .computeIfAbsent(task.source, s -> new ArrayList<>()) .add(task.task); } else { logger.trace("skipping {}, already processed", task.toString(executor)); } } } } if (toExecute.isEmpty()) { return; } final String tasksSummary = processTasksBySource .entrySet() .stream() .map( entry -> { String tasks = executor.describeTasks(entry.getValue()); return tasks.isEmpty() ? 
entry.getKey() : entry.getKey() + "[" + tasks + "]"; }) .reduce((s1, s2) -> s1 + ", " + s2) .orElse(""); if (!lifecycle.started()) { logger.debug("processing [{}]: ignoring, cluster_service not started", tasksSummary); return; } logger.debug("processing [{}]: execute", tasksSummary); ClusterState previousClusterState = clusterState; if (!previousClusterState.nodes().isLocalNodeElectedMaster() && executor.runOnlyOnMaster()) { logger.debug("failing [{}]: local node is no longer master", tasksSummary); toExecute.stream().forEach(task -> task.listener.onNoLongerMaster(task.source)); return; } ClusterStateTaskExecutor.BatchResult<T> batchResult; long startTimeNS = currentTimeInNanos(); try { List<T> inputs = toExecute.stream().map(tUpdateTask -> tUpdateTask.task).collect(Collectors.toList()); batchResult = executor.execute(previousClusterState, inputs); } catch (Exception e) { TimeValue executionTime = TimeValue.timeValueMillis( Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS))); if (logger.isTraceEnabled()) { logger.trace( (Supplier<?>) () -> new ParameterizedMessage( "failed to execute cluster state update in [{}], state:\nversion [{}], source [{}]\n{}{}{}", executionTime, previousClusterState.version(), tasksSummary, previousClusterState.nodes().prettyPrint(), previousClusterState.routingTable().prettyPrint(), previousClusterState.getRoutingNodes().prettyPrint()), e); } warnAboutSlowTaskIfNeeded(executionTime, tasksSummary); batchResult = ClusterStateTaskExecutor.BatchResult.<T>builder() .failures(toExecute.stream().map(updateTask -> updateTask.task)::iterator, e) .build(previousClusterState); } assert batchResult.executionResults != null; assert batchResult.executionResults.size() == toExecute.size() : String.format( Locale.ROOT, "expected [%d] task result%s but was [%d]", toExecute.size(), toExecute.size() == 1 ? 
"" : "s", batchResult.executionResults.size()); boolean assertsEnabled = false; assert (assertsEnabled = true); if (assertsEnabled) { for (UpdateTask<T> updateTask : toExecute) { assert batchResult.executionResults.containsKey(updateTask.task) : "missing task result for " + updateTask.toString(executor); } } ClusterState newClusterState = batchResult.resultingState; final ArrayList<UpdateTask<T>> proccessedListeners = new ArrayList<>(); // fail all tasks that have failed and extract those that are waiting for results for (UpdateTask<T> updateTask : toExecute) { assert batchResult.executionResults.containsKey(updateTask.task) : "missing " + updateTask.toString(executor); final ClusterStateTaskExecutor.TaskResult executionResult = batchResult.executionResults.get(updateTask.task); executionResult.handle( () -> proccessedListeners.add(updateTask), ex -> { logger.debug( (Supplier<?>) () -> new ParameterizedMessage( "cluster state update task {} failed", updateTask.toString(executor)), ex); updateTask.listener.onFailure(updateTask.source, ex); }); } if (previousClusterState == newClusterState) { for (UpdateTask<T> task : proccessedListeners) { if (task.listener instanceof AckedClusterStateTaskListener) { // no need to wait for ack if nothing changed, the update can be counted as acknowledged ((AckedClusterStateTaskListener) task.listener).onAllNodesAcked(null); } task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState); } TimeValue executionTime = TimeValue.timeValueMillis( Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS))); logger.debug( "processing [{}]: took [{}] no change in cluster_state", tasksSummary, executionTime); warnAboutSlowTaskIfNeeded(executionTime, tasksSummary); return; } try { ArrayList<Discovery.AckListener> ackListeners = new ArrayList<>(); if (newClusterState.nodes().isLocalNodeElectedMaster()) { // only the master controls the version numbers Builder builder = ClusterState.builder(newClusterState).incrementVersion(); if (previousClusterState.routingTable() != newClusterState.routingTable()) { builder.routingTable( RoutingTable.builder(newClusterState.routingTable()) .version(newClusterState.routingTable().version() + 1) .build()); } if (previousClusterState.metaData() != newClusterState.metaData()) { builder.metaData( MetaData.builder(newClusterState.metaData()) .version(newClusterState.metaData().version() + 1)); } newClusterState = builder.build(); for (UpdateTask<T> task : proccessedListeners) { if (task.listener instanceof AckedClusterStateTaskListener) { final AckedClusterStateTaskListener ackedListener = (AckedClusterStateTaskListener) task.listener; if (ackedListener.ackTimeout() == null || ackedListener.ackTimeout().millis() == 0) { ackedListener.onAckTimeout(); } else { try { ackListeners.add( new AckCountDownListener( ackedListener, newClusterState.version(), newClusterState.nodes(), threadPool)); } catch (EsRejectedExecutionException ex) { if (logger.isDebugEnabled()) { logger.debug( "Couldn't schedule timeout thread - node might be shutting down", ex); } // timeout straightaway, otherwise we could wait forever as the timeout thread has // not started ackedListener.onAckTimeout(); } } } } } final Discovery.AckListener ackListener = new DelegetingAckListener(ackListeners); newClusterState.status(ClusterState.ClusterStateStatus.BEING_APPLIED); if (logger.isTraceEnabled()) { logger.trace( "cluster state updated, source [{}]\n{}", tasksSummary, newClusterState.prettyPrint()); } else if (logger.isDebugEnabled()) { 
logger.debug( "cluster state updated, version [{}], source [{}]", newClusterState.version(), tasksSummary); } ClusterChangedEvent clusterChangedEvent = new ClusterChangedEvent(tasksSummary, newClusterState, previousClusterState); // new cluster state, notify all listeners final DiscoveryNodes.Delta nodesDelta = clusterChangedEvent.nodesDelta(); if (nodesDelta.hasChanges() && logger.isInfoEnabled()) { String summary = nodesDelta.shortSummary(); if (summary.length() > 0) { logger.info("{}, reason: {}", summary, tasksSummary); } } nodeConnectionsService.connectToAddedNodes(clusterChangedEvent); // if we are the master, publish the new state to all nodes // we publish here before we send a notification to all the listeners, since if it fails // we don't want to notify if (newClusterState.nodes().isLocalNodeElectedMaster()) { logger.debug("publishing cluster state version [{}]", newClusterState.version()); try { clusterStatePublisher.accept(clusterChangedEvent, ackListener); } catch (Discovery.FailedToCommitClusterStateException t) { final long version = newClusterState.version(); logger.warn( (Supplier<?>) () -> new ParameterizedMessage( "failing [{}]: failed to commit cluster state version [{}]", tasksSummary, version), t); proccessedListeners.forEach(task -> task.listener.onFailure(task.source, t)); return; } } // update the current cluster state clusterState = newClusterState; logger.debug("set local cluster state to version {}", newClusterState.version()); try { // nothing to do until we actually recover from the gateway or any other block indicates we // need to disable persistency if (clusterChangedEvent.state().blocks().disableStatePersistence() == false && clusterChangedEvent.metaDataChanged()) { final Settings incomingSettings = clusterChangedEvent.state().metaData().settings(); clusterSettings.applySettings(incomingSettings); } } catch (Exception ex) { logger.warn("failed to apply cluster settings", ex); } for (ClusterStateListener listener : preAppliedListeners) { try { listener.clusterChanged(clusterChangedEvent); } catch (Exception ex) { logger.warn("failed to notify ClusterStateListener", ex); } } nodeConnectionsService.disconnectFromRemovedNodes(clusterChangedEvent); newClusterState.status(ClusterState.ClusterStateStatus.APPLIED); for (ClusterStateListener listener : postAppliedListeners) { try { listener.clusterChanged(clusterChangedEvent); } catch (Exception ex) { logger.warn("failed to notify ClusterStateListener", ex); } } // manual ack only from the master at the end of the publish if (newClusterState.nodes().isLocalNodeElectedMaster()) { try { ackListener.onNodeAck(newClusterState.nodes().getLocalNode(), null); } catch (Exception e) { final DiscoveryNode localNode = newClusterState.nodes().getLocalNode(); logger.debug( (Supplier<?>) () -> new ParameterizedMessage( "error while processing ack for master node [{}]", localNode), e); } } for (UpdateTask<T> task : proccessedListeners) { task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState); } try { executor.clusterStatePublished(clusterChangedEvent); } catch (Exception e) { logger.error( (Supplier<?>) () -> new ParameterizedMessage( "exception thrown while notifying executor of new cluster state publication [{}]", tasksSummary), e); } TimeValue executionTime = TimeValue.timeValueMillis( Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS))); logger.debug( "processing [{}]: took [{}] done applying updated cluster_state (version: {}, uuid: {})", tasksSummary, executionTime, 
newClusterState.version(), newClusterState.stateUUID()); warnAboutSlowTaskIfNeeded(executionTime, tasksSummary); } catch (Exception e) { TimeValue executionTime = TimeValue.timeValueMillis( Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS))); final long version = newClusterState.version(); final String stateUUID = newClusterState.stateUUID(); final String prettyPrint = newClusterState.prettyPrint(); logger.warn( (Supplier<?>) () -> new ParameterizedMessage( "failed to apply updated cluster state in [{}]:\nversion [{}], uuid [{}], source [{}]\n{}", executionTime, version, stateUUID, tasksSummary, prettyPrint), e); // TODO: do we want to call updateTask.onFailure here? } } // this one is overridden in tests so we can control time protected long currentTimeInNanos() { return System.nanoTime(); } private static SafeClusterStateTaskListener safe( ClusterStateTaskListener listener, Logger logger) { if (listener instanceof AckedClusterStateTaskListener) { return new SafeAckedClusterStateTaskListener( (AckedClusterStateTaskListener) listener, logger); } else { return new SafeClusterStateTaskListener(listener, logger); } } private static class SafeClusterStateTaskListener implements ClusterStateTaskListener { private final ClusterStateTaskListener listener; private final Logger logger; public SafeClusterStateTaskListener(ClusterStateTaskListener listener, Logger logger) { this.listener = listener; this.logger = logger; } @Override public void onFailure(String source, Exception e) { try { listener.onFailure(source, e); } catch (Exception inner) { inner.addSuppressed(e); logger.error( (Supplier<?>) () -> new ParameterizedMessage( "exception thrown by listener notifying of failure from [{}]", source), inner); } } @Override public void onNoLongerMaster(String source) { try { listener.onNoLongerMaster(source); } catch (Exception e) { logger.error( (Supplier<?>) () -> new ParameterizedMessage( "exception thrown by listener while notifying no longer master from [{}]", source), e); } } @Override public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) { try { listener.clusterStateProcessed(source, oldState, newState); } catch (Exception e) { logger.error( (Supplier<?>) () -> new ParameterizedMessage( "exception thrown by listener while notifying of cluster state processed from [{}], old cluster state:\n" + "{}\nnew cluster state:\n{}", source, oldState.prettyPrint(), newState.prettyPrint()), e); } } } private static class SafeAckedClusterStateTaskListener extends SafeClusterStateTaskListener implements AckedClusterStateTaskListener { private final AckedClusterStateTaskListener listener; private final Logger logger; public SafeAckedClusterStateTaskListener( AckedClusterStateTaskListener listener, Logger logger) { super(listener, logger); this.listener = listener; this.logger = logger; } @Override public boolean mustAck(DiscoveryNode discoveryNode) { return listener.mustAck(discoveryNode); } @Override public void onAllNodesAcked(@Nullable Exception e) { try { listener.onAllNodesAcked(e); } catch (Exception inner) { inner.addSuppressed(e); logger.error("exception thrown by listener while notifying on all nodes acked", inner); } } @Override public void onAckTimeout() { try { listener.onAckTimeout(); } catch (Exception e) { logger.error("exception thrown by listener while notifying on ack timeout", e); } } @Override public TimeValue ackTimeout() { return listener.ackTimeout(); } } class UpdateTask<T> extends SourcePrioritizedRunnable { public final T 
task; public final ClusterStateTaskConfig config; public final ClusterStateTaskExecutor<T> executor; public final ClusterStateTaskListener listener; public final AtomicBoolean processed = new AtomicBoolean(); UpdateTask( String source, T task, ClusterStateTaskConfig config, ClusterStateTaskExecutor<T> executor, ClusterStateTaskListener listener) { super(config.priority(), source); this.task = task; this.config = config; this.executor = executor; this.listener = listener; } @Override public void run() { // if this task is already processed, the executor shouldn't execute other tasks (that arrived // later), // to give other executors a chance to execute their tasks. if (processed.get() == false) { runTasksForExecutor(executor); } } public String toString(ClusterStateTaskExecutor<T> executor) { String taskDescription = executor.describeTasks(Collections.singletonList(task)); if (taskDescription.isEmpty()) { return "[" + source + "]"; } else { return "[" + source + "[" + taskDescription + "]]"; } } } private void warnAboutSlowTaskIfNeeded(TimeValue executionTime, String source) { if (executionTime.getMillis() > slowTaskLoggingThreshold.getMillis()) { logger.warn( "cluster state update task [{}] took [{}] above the warn threshold of {}", source, executionTime, slowTaskLoggingThreshold); } } class NotifyTimeout implements Runnable { final TimeoutClusterStateListener listener; final TimeValue timeout; volatile ScheduledFuture future; NotifyTimeout(TimeoutClusterStateListener listener, TimeValue timeout) { this.listener = listener; this.timeout = timeout; } public void cancel() { FutureUtils.cancel(future); } @Override public void run() { if (future != null && future.isCancelled()) { return; } if (lifecycle.stoppedOrClosed()) { listener.onClose(); } else { listener.onTimeout(this.timeout); } // note, we rely on the listener to remove itself in case of timeout if needed } } private static class LocalNodeMasterListeners implements ClusterStateListener { private final List<LocalNodeMasterListener> listeners = new CopyOnWriteArrayList<>(); private final ThreadPool threadPool; private volatile boolean master = false; private LocalNodeMasterListeners(ThreadPool threadPool) { this.threadPool = threadPool; } @Override public void clusterChanged(ClusterChangedEvent event) { if (!master && event.localNodeMaster()) { master = true; for (LocalNodeMasterListener listener : listeners) { Executor executor = threadPool.executor(listener.executorName()); executor.execute(new OnMasterRunnable(listener)); } return; } if (master && !event.localNodeMaster()) { master = false; for (LocalNodeMasterListener listener : listeners) { Executor executor = threadPool.executor(listener.executorName()); executor.execute(new OffMasterRunnable(listener)); } } } private void add(LocalNodeMasterListener listener) { listeners.add(listener); } private void remove(LocalNodeMasterListener listener) { listeners.remove(listener); } private void clear() { listeners.clear(); } } private static class OnMasterRunnable implements Runnable { private final LocalNodeMasterListener listener; private OnMasterRunnable(LocalNodeMasterListener listener) { this.listener = listener; } @Override public void run() { listener.onMaster(); } } private static class OffMasterRunnable implements Runnable { private final LocalNodeMasterListener listener; private OffMasterRunnable(LocalNodeMasterListener listener) { this.listener = listener; } @Override public void run() { listener.offMaster(); } } private static class DelegetingAckListener implements 
Discovery.AckListener { private final List<Discovery.AckListener> listeners; private DelegetingAckListener(List<Discovery.AckListener> listeners) { this.listeners = listeners; } @Override public void onNodeAck(DiscoveryNode node, @Nullable Exception e) { for (Discovery.AckListener listener : listeners) { listener.onNodeAck(node, e); } } @Override public void onTimeout() { throw new UnsupportedOperationException("no timeout delegation"); } } private static class AckCountDownListener implements Discovery.AckListener { private static final Logger logger = Loggers.getLogger(AckCountDownListener.class); private final AckedClusterStateTaskListener ackedTaskListener; private final CountDown countDown; private final DiscoveryNodes nodes; private final long clusterStateVersion; private final Future<?> ackTimeoutCallback; private Exception lastFailure; AckCountDownListener( AckedClusterStateTaskListener ackedTaskListener, long clusterStateVersion, DiscoveryNodes nodes, ThreadPool threadPool) { this.ackedTaskListener = ackedTaskListener; this.clusterStateVersion = clusterStateVersion; this.nodes = nodes; int countDown = 0; for (DiscoveryNode node : nodes) { if (ackedTaskListener.mustAck(node)) { countDown++; } } // we always wait for at least 1 node (the master) countDown = Math.max(1, countDown); logger.trace( "expecting {} acknowledgements for cluster_state update (version: {})", countDown, clusterStateVersion); this.countDown = new CountDown(countDown); this.ackTimeoutCallback = threadPool.schedule( ackedTaskListener.ackTimeout(), ThreadPool.Names.GENERIC, new Runnable() { @Override public void run() { onTimeout(); } }); } @Override public void onNodeAck(DiscoveryNode node, @Nullable Exception e) { if (!ackedTaskListener.mustAck(node)) { // we always wait for the master ack anyway if (!node.equals(nodes.getMasterNode())) { return; } } if (e == null) { logger.trace( "ack received from node [{}], cluster_state update (version: {})", node, clusterStateVersion); } else { this.lastFailure = e; logger.debug( (Supplier<?>) () -> new ParameterizedMessage( "ack received from node [{}], cluster_state update (version: {})", node, clusterStateVersion), e); } if (countDown.countDown()) { logger.trace( "all expected nodes acknowledged cluster_state update (version: {})", clusterStateVersion); FutureUtils.cancel(ackTimeoutCallback); ackedTaskListener.onAllNodesAcked(lastFailure); } } @Override public void onTimeout() { if (countDown.fastForward()) { logger.trace( "timeout waiting for acknowledgement for cluster_state update (version: {})", clusterStateVersion); ackedTaskListener.onAckTimeout(); } } } public ClusterSettings getClusterSettings() { return clusterSettings; } public Settings getSettings() { return settings; } }
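The ack bookkeeping in AckCountDownListener above reduces to a small count-down state machine: count the nodes whose acknowledgement is required (but always wait for at least one, the master), decrement once per ack, and let the timeout fast-forward the counter so late acks become no-ops. Below is a minimal, self-contained sketch of that shape in plain Java; the String nodes, the mustAck predicate, and the callbacks are hypothetical stand-ins, not the Elasticsearch types.

import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Predicate;

// Minimal sketch of the ack count-down pattern used by AckCountDownListener above.
final class AckCountDownSketch {
    private final AtomicInteger remaining;
    private final Runnable onAllAcked;
    private final Runnable onTimedOut;

    AckCountDownSketch(List<String> nodes, Predicate<String> mustAck,
                       Runnable onAllAcked, Runnable onTimedOut) {
        int count = 0;
        for (String node : nodes) {
            if (mustAck.test(node)) {
                count++;
            }
        }
        // we always wait for at least one ack (the master's own), mirroring Math.max(1, countDown)
        this.remaining = new AtomicInteger(Math.max(1, count));
        this.onAllAcked = onAllAcked;
        this.onTimedOut = onTimedOut;
    }

    void onNodeAck() {
        // exactly one caller can observe the transition to zero and fire the callback
        if (remaining.decrementAndGet() == 0) {
            onAllAcked.run();
        }
    }

    void timeout() {
        // fast-forward: force the counter to zero so acks arriving later are ignored
        if (remaining.getAndSet(0) > 0) {
            onTimedOut.run();
        }
    }
}

The decrementAndGet/getAndSet pair guarantees that exactly one of the two callbacks fires even when an ack and the timeout race, which is the same property the CountDown helper's countDown()/fastForward() provides above.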
/** * The indices request cache caches shard-level, request-stage responses, speeding up similar * requests that are potentially expensive (because of aggs, for example). The cache is fully * coherent with the semantics of NRT (the index reader version is part of the cache key), and * relies on size-based eviction to evict cache entries associated with old readers, as well as a * scheduled reaper to clean up entries for readers that are no longer used and for closed shards. * * <p>Currently, the cache is only enabled for count requests, and can only be opted into via an * index-level setting that can be changed dynamically and defaults to false. * * <p>There are still several TODOs left in this class, some easily addressable, some more complex, * but the support is functional. */ public class IndicesRequestCache extends AbstractComponent implements RemovalListener<IndicesRequestCache.Key, IndicesRequestCache.Value> { /** * A setting to enable or disable request caching on an index level. It is dynamic by default * since we always check the IndexMetaData from the cluster state. */ public static final String INDEX_CACHE_REQUEST_ENABLED = "index.requests.cache.enable"; @Deprecated public static final String DEPRECATED_INDEX_CACHE_REQUEST_ENABLED = "index.cache.query.enable"; public static final String INDICES_CACHE_REQUEST_CLEAN_INTERVAL = "indices.requests.cache.clean_interval"; public static final String INDICES_CACHE_QUERY_SIZE = "indices.requests.cache.size"; @Deprecated public static final String DEPRECATED_INDICES_CACHE_QUERY_SIZE = "indices.cache.query.size"; public static final String INDICES_CACHE_QUERY_EXPIRE = "indices.requests.cache.expire"; public static final String INDICES_CACHE_QUERY_CONCURRENCY_LEVEL = "indices.requests.cache.concurrency_level"; private static final Set<SearchType> CACHEABLE_SEARCH_TYPES = EnumSet.of(SearchType.QUERY_THEN_FETCH, SearchType.QUERY_AND_FETCH); private final ThreadPool threadPool; private final ClusterService clusterService; private final TimeValue cleanInterval; private final Reaper reaper; final ConcurrentMap<CleanupKey, Boolean> registeredClosedListeners = ConcurrentCollections.newConcurrentMap(); final Set<CleanupKey> keysToClean = ConcurrentCollections.newConcurrentSet(); // TODO make these settings configurable on the cluster level private final String size; private final TimeValue expire; private final int concurrencyLevel; private volatile Cache<Key, Value> cache; @Inject public IndicesRequestCache( Settings settings, ClusterService clusterService, ThreadPool threadPool) { super(settings); this.clusterService = clusterService; this.threadPool = threadPool; this.cleanInterval = settings.getAsTime(INDICES_CACHE_REQUEST_CLEAN_INTERVAL, TimeValue.timeValueSeconds(60)); String size = settings.get(INDICES_CACHE_QUERY_SIZE); if (size == null) { size = settings.get(DEPRECATED_INDICES_CACHE_QUERY_SIZE); if (size != null) { deprecationLogger.deprecated( "The [" + DEPRECATED_INDICES_CACHE_QUERY_SIZE + "] setting is now deprecated, use [" + INDICES_CACHE_QUERY_SIZE + "] instead"); } } if (size == null) { // this cache can be very small yet still be very effective size = "1%"; } this.size = size; this.expire = settings.getAsTime(INDICES_CACHE_QUERY_EXPIRE, null); // defaults to 4, but this is a busy map for all indices, increase it a bit by default this.concurrencyLevel = settings.getAsInt(INDICES_CACHE_QUERY_CONCURRENCY_LEVEL, 16); if (concurrencyLevel <= 0) { throw new IllegalArgumentException( "concurrency_level must be > 0 but was: " + concurrencyLevel); } 
buildCache(); this.reaper = new Reaper(); threadPool.schedule(cleanInterval, ThreadPool.Names.SAME, reaper); } private boolean isCacheEnabled(Settings settings, boolean defaultEnable) { Boolean enable = settings.getAsBoolean(INDEX_CACHE_REQUEST_ENABLED, null); if (enable == null) { enable = settings.getAsBoolean(DEPRECATED_INDEX_CACHE_REQUEST_ENABLED, null); if (enable != null) { deprecationLogger.deprecated( "The [" + DEPRECATED_INDEX_CACHE_REQUEST_ENABLED + "] setting is now deprecated, use [" + INDEX_CACHE_REQUEST_ENABLED + "] instead"); } } if (enable == null) { enable = defaultEnable; } return enable; } private void buildCache() { long sizeInBytes = MemorySizeValue.parseBytesSizeValueOrHeapRatio(size, INDICES_CACHE_QUERY_SIZE).bytes(); CacheBuilder<Key, Value> cacheBuilder = CacheBuilder.newBuilder() .maximumWeight(sizeInBytes) .weigher(new QueryCacheWeigher()) .removalListener(this); cacheBuilder.concurrencyLevel(concurrencyLevel); if (expire != null) { cacheBuilder.expireAfterAccess(expire.millis(), TimeUnit.MILLISECONDS); } cache = cacheBuilder.build(); } private static class QueryCacheWeigher implements Weigher<Key, Value> { @Override public int weigh(Key key, Value value) { return (int) (key.ramBytesUsed() + value.ramBytesUsed()); } } public void close() { reaper.close(); cache.invalidateAll(); } public void clear(IndexShard shard) { if (shard == null) { return; } keysToClean.add(new CleanupKey(shard, -1)); logger.trace("{} explicit cache clear", shard.shardId()); reaper.reap(); } @Override public void onRemoval(RemovalNotification<Key, Value> notification) { if (notification.getKey() == null) { return; } notification.getKey().shard.requestCache().onRemoval(notification); } /** Can the shard request be cached at all? */ public boolean canCache(ShardSearchRequest request, SearchContext context) { // TODO: for now, template is not supported, though we could use the generated bytes as the key if (hasLength(request.templateSource())) { return false; } // for now, only enable it for requests with no hits if (context.size() != 0) { return false; } // We cannot cache with DFS because results depend not only on the content of the index but also // on the overridden statistics. 
So if you ran two queries on the same index with different // stats // (because another shard was updated) you would get wrong results because of the scores // (think about top_hits aggs or scripts using the score) if (!CACHEABLE_SEARCH_TYPES.contains(context.searchType())) { return false; } IndexMetaData index = clusterService.state().getMetaData().index(request.index()); if (index == null) { // in case we didn't yet have the cluster state, or it just got deleted return false; } // if not explicitly set on the request, use the index setting; otherwise, use the request value if (request.requestCache() == null) { if (!isCacheEnabled(index.settings(), Boolean.FALSE)) { return false; } } else if (!request.requestCache()) { return false; } // if the reader is not a directory reader, we can't get the version from it if (!(context.searcher().getIndexReader() instanceof DirectoryReader)) { return false; } // if now in millis is used (or, in the future, a more generic "isDeterministic" flag) // then we can't cache based on the "now" key within the search request, as it is not deterministic if (context.nowInMillisUsed()) { return false; } return true; } /** * Loads the cache result, computing it if needed by executing the query phase and otherwise * deserializing the cached value into the {@link SearchContext#queryResult() context's query * result}. The combination of load + compute allows a single load operation that will cause * other requests with the same key to wait until it is loaded and reuse the same cached value. */ public void loadIntoContext( final ShardSearchRequest request, final SearchContext context, final QueryPhase queryPhase) throws Exception { assert canCache(request, context); Key key = buildKey(request, context); Loader loader = new Loader(queryPhase, context, key); Value value = cache.get(key, loader); if (loader.isLoaded()) { key.shard.requestCache().onMiss(); // see if it's the first time we see this reader, and make sure to register a cleanup key CleanupKey cleanupKey = new CleanupKey( context.indexShard(), ((DirectoryReader) context.searcher().getIndexReader()).getVersion()); if (!registeredClosedListeners.containsKey(cleanupKey)) { Boolean previous = registeredClosedListeners.putIfAbsent(cleanupKey, Boolean.TRUE); if (previous == null) { context.searcher().getIndexReader().addReaderClosedListener(cleanupKey); } } } else { key.shard.requestCache().onHit(); // restore the cached query result into the context final QuerySearchResult result = context.queryResult(); result.readFromWithId(context.id(), value.reference.streamInput()); result.shardTarget(context.shardTarget()); } } private static class Loader implements Callable<Value> { private final QueryPhase queryPhase; private final SearchContext context; private final IndicesRequestCache.Key key; private boolean loaded; Loader(QueryPhase queryPhase, SearchContext context, IndicesRequestCache.Key key) { this.queryPhase = queryPhase; this.context = context; this.key = key; } public boolean isLoaded() { return this.loaded; } @Override public Value call() throws Exception { queryPhase.execute(context); /* BytesStreamOutput allows passing the expected size, but by default uses * BigArrays.PAGE_SIZE_IN_BYTES, which is 16k. A common cached result, i.e. * a date histogram with 3 buckets, is ~100 bytes, so 16k might be very wasteful * since we don't shrink to the actual size once we are done serializing. 
* By passing 512 as the expected size we will resize the byte array in the stream * slowly until we hit the page size and don't waste too much memory for small query * results.*/ final int expectedSizeInBytes = 512; try (BytesStreamOutput out = new BytesStreamOutput(expectedSizeInBytes)) { context.queryResult().writeToNoId(out); // for now, keep the paged data structure, which might have unused bytes to fill a page, but // better to keep // the memory properly paged instead of having variable-sized byte arrays final BytesReference reference = out.bytes(); loaded = true; Value value = new Value(reference, out.ramBytesUsed()); key.shard.requestCache().onCached(key, value); return value; } } } public static class Value implements Accountable { final BytesReference reference; final long ramBytesUsed; public Value(BytesReference reference, long ramBytesUsed) { this.reference = reference; this.ramBytesUsed = ramBytesUsed; } @Override public long ramBytesUsed() { return ramBytesUsed; } @Override public Collection<Accountable> getChildResources() { return Collections.emptyList(); } } public static class Key implements Accountable { public final IndexShard shard; // use as identity equality public final long readerVersion; // use the reader version to not keep a reference to a "short" lived reader // until it's reaped public final BytesReference value; Key(IndexShard shard, long readerVersion, BytesReference value) { this.shard = shard; this.readerVersion = readerVersion; this.value = value; } @Override public long ramBytesUsed() { return RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_LONG + value.length(); } @Override public Collection<Accountable> getChildResources() { // TODO: more detailed ram usage? return Collections.emptyList(); } @Override public boolean equals(Object o) { if (this == o) return true; Key key = (Key) o; if (readerVersion != key.readerVersion) return false; if (!shard.equals(key.shard)) return false; if (!value.equals(key.value)) return false; return true; } @Override public int hashCode() { int result = shard.hashCode(); result = 31 * result + (int) (readerVersion ^ (readerVersion >>> 32)); result = 31 * result + value.hashCode(); return result; } } private class CleanupKey implements IndexReader.ReaderClosedListener { IndexShard indexShard; long readerVersion; // use the reader version to not keep a reference to a "short" lived reader // until it's reaped private CleanupKey(IndexShard indexShard, long readerVersion) { this.indexShard = indexShard; this.readerVersion = readerVersion; } @Override public void onClose(IndexReader reader) { Boolean remove = registeredClosedListeners.remove(this); if (remove != null) { keysToClean.add(this); } } @Override public boolean equals(Object o) { if (this == o) return true; CleanupKey that = (CleanupKey) o; if (readerVersion != that.readerVersion) return false; if (!indexShard.equals(that.indexShard)) return false; return true; } @Override public int hashCode() { int result = indexShard.hashCode(); result = 31 * result + (int) (readerVersion ^ (readerVersion >>> 32)); return result; } } private class Reaper implements Runnable { private final ObjectSet<CleanupKey> currentKeysToClean = new ObjectHashSet<>(); private final ObjectSet<IndexShard> currentFullClean = new ObjectHashSet<>(); private volatile boolean closed; void close() { closed = true; } @Override public void run() { if (closed) { return; } if (keysToClean.isEmpty()) { schedule(); return; } try { threadPool .executor(ThreadPool.Names.GENERIC) .execute( new 
Runnable() { @Override public void run() { reap(); schedule(); } }); } catch (EsRejectedExecutionException ex) { logger.debug("Can not run ReaderCleaner - execution rejected", ex); } } private void schedule() { try { threadPool.schedule(cleanInterval, ThreadPool.Names.SAME, this); } catch (EsRejectedExecutionException ex) { logger.debug("Can not schedule ReaderCleaner - execution rejected", ex); } } synchronized void reap() { currentKeysToClean.clear(); currentFullClean.clear(); for (Iterator<CleanupKey> iterator = keysToClean.iterator(); iterator.hasNext(); ) { CleanupKey cleanupKey = iterator.next(); iterator.remove(); if (cleanupKey.readerVersion == -1 || cleanupKey.indexShard.state() == IndexShardState.CLOSED) { // -1 indicates full cleanup, as does a closed shard currentFullClean.add(cleanupKey.indexShard); } else { currentKeysToClean.add(cleanupKey); } } if (!currentKeysToClean.isEmpty() || !currentFullClean.isEmpty()) { CleanupKey lookupKey = new CleanupKey(null, -1); for (Iterator<Key> iterator = cache.asMap().keySet().iterator(); iterator.hasNext(); ) { Key key = iterator.next(); if (currentFullClean.contains(key.shard)) { iterator.remove(); } else { lookupKey.indexShard = key.shard; lookupKey.readerVersion = key.readerVersion; if (currentKeysToClean.contains(lookupKey)) { iterator.remove(); } } } } cache.cleanUp(); currentKeysToClean.clear(); currentFullClean.clear(); } } private static Key buildKey(ShardSearchRequest request, SearchContext context) throws Exception { // TODO: for now, this will create different keys for different JSON order // TODO: tricky to get around this, need to parse and order all, which can be expensive return new Key( context.indexShard(), ((DirectoryReader) context.searcher().getIndexReader()).getVersion(), request.cacheKey()); } }
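As buildKey above shows, the NRT coherence promised in the class javadoc comes entirely from the key structure: the shard (compared by identity), the DirectoryReader version, and the raw request bytes. Any refresh bumps the reader version, so stale entries simply stop being hit and are later removed by eviction or the reaper. Here is a rough sketch of such a composite key, using plain Java stand-ins (Object for IndexShard, byte[] for BytesReference), not the classes above.

import java.util.Arrays;

// Sketch of the request-cache key idea: shard identity + point-in-time reader
// version + serialized request bytes. A refresh yields a new reader version,
// so entries for the old reader can never be hit again.
final class RequestCacheKeySketch {
    final Object shard;          // compared by identity, like IndexShard above
    final long readerVersion;    // NRT reader version at query time
    final byte[] requestBytes;   // serialized request, the logical cache key

    RequestCacheKeySketch(Object shard, long readerVersion, byte[] requestBytes) {
        this.shard = shard;
        this.readerVersion = readerVersion;
        this.requestBytes = requestBytes;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof RequestCacheKeySketch)) return false;
        RequestCacheKeySketch k = (RequestCacheKeySketch) o;
        return shard == k.shard                        // identity, not equals()
            && readerVersion == k.readerVersion
            && Arrays.equals(requestBytes, k.requestBytes);
    }

    @Override
    public int hashCode() {
        int h = System.identityHashCode(shard);
        h = 31 * h + Long.hashCode(readerVersion);
        h = 31 * h + Arrays.hashCode(requestBytes);
        return h;
    }
}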
// we do our best to return the shard failures, but it's ok if it's not fully concurrency safe // we simply try to return as much as possible protected final void addShardFailure(ShardSearchFailure failure) { if (shardFailures == null) { shardFailures = ConcurrentCollections.newQueue(); } shardFailures.add(failure); }
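The comment above describes a deliberate benign race: the null check is unsynchronized, so two threads can both create a queue and one failure may be lost, which is accepted in exchange for zero overhead on the common no-failure path. A sketch of the same idiom in isolation, with String standing in for ShardSearchFailure:

import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;

// Lazy, best-effort failure collection: the unsynchronized null check means a
// racing writer can replace a freshly created queue and drop a failure, a
// trade-off made to keep the common (no-failure) path allocation-free.
class ShardFailureCollectorSketch {
    private volatile Queue<String> failures;

    void addFailure(String failure) {
        if (failures == null) {
            failures = new ConcurrentLinkedQueue<>(); // benign race: last writer wins
        }
        failures.add(failure);
    }
}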
/** * A node-level registry of analyzers, to be reused by the different indices that use the default * analyzers. * * @author kimchy (shay.banon) */ public class IndicesAnalysisService extends AbstractComponent { private final Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories = ConcurrentCollections.newConcurrentMap(); public IndicesAnalysisService() { super(EMPTY_SETTINGS); } @Inject public IndicesAnalysisService(Settings settings) { super(settings); analyzerProviderFactories.put( "standard", new PreBuiltAnalyzerProviderFactory( "standard", AnalyzerScope.INDICES, new StandardAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put( "keyword", new PreBuiltAnalyzerProviderFactory( "keyword", AnalyzerScope.INDICES, new KeywordAnalyzer())); analyzerProviderFactories.put( "stop", new PreBuiltAnalyzerProviderFactory( "stop", AnalyzerScope.INDICES, new StopAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put( "whitespace", new PreBuiltAnalyzerProviderFactory( "whitespace", AnalyzerScope.INDICES, new WhitespaceAnalyzer())); analyzerProviderFactories.put( "simple", new PreBuiltAnalyzerProviderFactory("simple", AnalyzerScope.INDICES, new SimpleAnalyzer())); // extended ones analyzerProviderFactories.put( "arabic", new PreBuiltAnalyzerProviderFactory( "arabic", AnalyzerScope.INDICES, new ArabicAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put( "brazilian", new PreBuiltAnalyzerProviderFactory( "brazilian", AnalyzerScope.INDICES, new BrazilianAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put( "chinese", new PreBuiltAnalyzerProviderFactory( "chinese", AnalyzerScope.INDICES, new ChineseAnalyzer())); analyzerProviderFactories.put( "cjk", new PreBuiltAnalyzerProviderFactory("cjk", AnalyzerScope.INDICES, new CJKAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put( "czech", new PreBuiltAnalyzerProviderFactory( "czech", AnalyzerScope.INDICES, new CzechAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put( "dutch", new PreBuiltAnalyzerProviderFactory( "dutch", AnalyzerScope.INDICES, new DutchAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put( "french", new PreBuiltAnalyzerProviderFactory( "french", AnalyzerScope.INDICES, new FrenchAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put( "german", new PreBuiltAnalyzerProviderFactory( "german", AnalyzerScope.INDICES, new GermanAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put( "greek", new PreBuiltAnalyzerProviderFactory( "greek", AnalyzerScope.INDICES, new GreekAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put( "persian", new PreBuiltAnalyzerProviderFactory( "persian", AnalyzerScope.INDICES, new PersianAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put( "russian", new PreBuiltAnalyzerProviderFactory( "russian", AnalyzerScope.INDICES, new RussianAnalyzer(Lucene.ANALYZER_VERSION))); analyzerProviderFactories.put( "thai", new PreBuiltAnalyzerProviderFactory( "thai", AnalyzerScope.INDICES, new ThaiAnalyzer(Lucene.ANALYZER_VERSION))); } public PreBuiltAnalyzerProviderFactory analyzerProviderFactory(String name) { return analyzerProviderFactories.get(name); } public boolean hasAnalyzer(String name) { return analyzer(name) != null; } public Analyzer analyzer(String name) { PreBuiltAnalyzerProviderFactory analyzerProviderFactory = analyzerProviderFactory(name); if (analyzerProviderFactory == null) { return null; } return analyzerProviderFactory.analyzer(); } public void close() { for 
(PreBuiltAnalyzerProviderFactory analyzerProviderFactory : analyzerProviderFactories.values()) { analyzerProviderFactory.analyzer().close(); } } }
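The registry above is a build-once, share-everywhere map: each pre-built analyzer is constructed a single time at node startup, looked up by name, and closed once when the node shuts down. A stripped-down sketch of that lifecycle, using AutoCloseable as a stand-in for the analyzer factories:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Node-level shared registry sketch: instances are registered once and reused
// by every consumer, and closed exactly once at shutdown, mirroring
// IndicesAnalysisService above.
class SharedRegistrySketch {
    private final Map<String, AutoCloseable> shared = new ConcurrentHashMap<>();

    void register(String name, AutoCloseable instance) {
        shared.put(name, instance);
    }

    boolean has(String name) {
        return shared.containsKey(name);
    }

    AutoCloseable get(String name) {
        return shared.get(name); // null if the name was never registered
    }

    void close() throws Exception {
        for (AutoCloseable c : shared.values()) {
            c.close(); // each shared instance is closed once, at node shutdown
        }
    }
}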
public class LocalGatewayMetaState extends AbstractComponent implements ClusterStateListener { static enum AutoImportDangledState { NO() { @Override public boolean shouldImport() { return false; } }, YES() { @Override public boolean shouldImport() { return true; } }, CLOSED() { @Override public boolean shouldImport() { return true; } }; public abstract boolean shouldImport(); public static AutoImportDangledState fromString(String value) { if ("no".equalsIgnoreCase(value)) { return NO; } else if ("yes".equalsIgnoreCase(value)) { return YES; } else if ("closed".equalsIgnoreCase(value)) { return CLOSED; } else { throw new ElasticSearchIllegalArgumentException( "failed to parse [" + value + "], not a valid auto dangling import type"); } } } private final NodeEnvironment nodeEnv; private final ThreadPool threadPool; private final LocalAllocateDangledIndices allocateDangledIndices; @Nullable private volatile MetaData currentMetaData; private final XContentType format; private final ToXContent.Params formatParams; private final AutoImportDangledState autoImportDangled; private final TimeValue danglingTimeout; private final Map<String, DanglingIndex> danglingIndices = ConcurrentCollections.newConcurrentMap(); private final Object danglingMutex = new Object(); @Inject public LocalGatewayMetaState( Settings settings, ThreadPool threadPool, NodeEnvironment nodeEnv, TransportNodesListGatewayMetaState nodesListGatewayMetaState, LocalAllocateDangledIndices allocateDangledIndices) throws Exception { super(settings); this.nodeEnv = nodeEnv; this.threadPool = threadPool; this.format = XContentType.fromRestContentType(settings.get("format", "smile")); this.allocateDangledIndices = allocateDangledIndices; nodesListGatewayMetaState.init(this); if (this.format == XContentType.SMILE) { Map<String, String> params = Maps.newHashMap(); params.put("binary", "true"); formatParams = new ToXContent.MapParams(params); } else { formatParams = ToXContent.EMPTY_PARAMS; } this.autoImportDangled = AutoImportDangledState.fromString( settings.get( "gateway.local.auto_import_dangled", AutoImportDangledState.YES.toString())); this.danglingTimeout = settings.getAsTime("gateway.local.dangling_timeout", TimeValue.timeValueHours(2)); logger.debug( "using gateway.local.auto_import_dangled [{}], with gateway.local.dangling_timeout [{}]", this.autoImportDangled, this.danglingTimeout); if (DiscoveryNode.masterNode(settings)) { try { pre019Upgrade(); long start = System.currentTimeMillis(); loadState(); logger.debug( "took {} to load state", TimeValue.timeValueMillis(System.currentTimeMillis() - start)); } catch (Exception e) { logger.error("failed to read local state, exiting...", e); throw e; } } } public MetaData loadMetaState() throws Exception { return loadState(); } public boolean isDangling(String index) { return danglingIndices.containsKey(index); } @Override public void clusterChanged(ClusterChangedEvent event) { if (event.state().blocks().disableStatePersistence()) { // reset the current metadata, we need to start fresh... this.currentMetaData = null; return; } MetaData newMetaData = event.state().metaData(); // we don't check if metaData changed, since we might be called several times and we need to // check dangling... boolean success = true; // only applied to master node, writing the global and index level states if (event.state().nodes().localNode().masterNode()) { // check if the global state changed? 
if (currentMetaData == null || !MetaData.isGlobalStateEquals(currentMetaData, newMetaData)) { try { writeGlobalState("changed", newMetaData, currentMetaData); } catch (Exception e) { success = false; } } // check and write changes in indices for (IndexMetaData indexMetaData : newMetaData) { String writeReason = null; IndexMetaData currentIndexMetaData; if (currentMetaData == null) { // a new event..., check from the state stored currentIndexMetaData = loadIndex(indexMetaData.index()); } else { currentIndexMetaData = currentMetaData.index(indexMetaData.index()); } if (currentIndexMetaData == null) { writeReason = "freshly created"; } else if (currentIndexMetaData.version() != indexMetaData.version()) { writeReason = "version changed from [" + currentIndexMetaData.version() + "] to [" + indexMetaData.version() + "]"; } // we update the writeReason only if we really need to write it if (writeReason == null) { continue; } try { writeIndex(writeReason, indexMetaData, currentIndexMetaData); } catch (Exception e) { success = false; } } } // delete indices that were there before, but are deleted now // we need to do it so they won't be detected as dangling if (nodeEnv.hasNodeFile()) { if (currentMetaData != null) { // only delete indices when we already received a state (currentMetaData != null) // and we had a go at processing dangling indices at least once // this will also delete the _state of the index itself for (IndexMetaData current : currentMetaData) { if (danglingIndices.containsKey(current.index())) { continue; } if (!newMetaData.hasIndex(current.index())) { logger.debug( "[{}] deleting index that is no longer part of the metadata (indices: [{}])", current.index(), newMetaData.indices().keySet()); FileSystemUtils.deleteRecursively(nodeEnv.indexLocations(new Index(current.index()))); } } } } // handle dangling indices, we handle those for all nodes that have a node file (data or master) if (nodeEnv.hasNodeFile()) { if (danglingTimeout.millis() >= 0) { synchronized (danglingMutex) { for (String danglingIndex : danglingIndices.keySet()) { if (newMetaData.hasIndex(danglingIndex)) { logger.debug("[{}] no longer dangling (created), removing", danglingIndex); DanglingIndex removed = danglingIndices.remove(danglingIndex); removed.future.cancel(false); } } // delete indices that are no longer part of the metadata try { for (String indexName : nodeEnv.findAllIndices()) { // if we have the index on the metadata, don't delete it if (newMetaData.hasIndex(indexName)) { continue; } if (danglingIndices.containsKey(indexName)) { // already dangling, continue continue; } IndexMetaData indexMetaData = loadIndex(indexName); if (indexMetaData != null) { if (danglingTimeout.millis() == 0) { logger.info( "[{}] dangling index, exists on local file system, but not in cluster metadata, timeout set to 0, deleting now", indexName); FileSystemUtils.deleteRecursively(nodeEnv.indexLocations(new Index(indexName))); } else { logger.info( "[{}] dangling index, exists on local file system, but not in cluster metadata, scheduling to delete in [{}], auto import to cluster state [{}]", indexName, danglingTimeout, autoImportDangled); danglingIndices.put( indexName, new DanglingIndex( indexName, threadPool.schedule( danglingTimeout, ThreadPool.Names.SAME, new RemoveDanglingIndex(indexName)))); } } } } catch (Exception e) { logger.warn("failed to find dangling indices", e); } } } if (autoImportDangled.shouldImport() && !danglingIndices.isEmpty()) { final List<IndexMetaData> dangled = Lists.newArrayList(); for (String indexName 
: danglingIndices.keySet()) { IndexMetaData indexMetaData = loadIndex(indexName); if (indexMetaData == null) { logger.debug("failed to find state for dangling index [{}]", indexName); continue; } // we might have someone copying over an index, renaming the directory, handle that if (!indexMetaData.index().equals(indexName)) { logger.info( "dangled index directory name is [{}], state name is [{}], renaming to directory name", indexName, indexMetaData.index()); indexMetaData = IndexMetaData.newIndexMetaDataBuilder(indexMetaData).index(indexName).build(); } if (autoImportDangled == AutoImportDangledState.CLOSED) { indexMetaData = IndexMetaData.newIndexMetaDataBuilder(indexMetaData) .state(IndexMetaData.State.CLOSE) .build(); } if (indexMetaData != null) { dangled.add(indexMetaData); } } IndexMetaData[] dangledIndices = dangled.toArray(new IndexMetaData[dangled.size()]); try { allocateDangledIndices.allocateDangled( dangledIndices, new LocalAllocateDangledIndices.Listener() { @Override public void onResponse( LocalAllocateDangledIndices.AllocateDangledResponse response) { logger.trace("allocated dangled"); } @Override public void onFailure(Throwable e) { logger.info("failed to allocate dangled indices", e); } }); } catch (Exception e) { logger.warn("failed to send allocate dangled request", e); } } } if (success) { currentMetaData = newMetaData; } } private void deleteIndex(String index) { logger.trace("[{}] delete index state", index); File[] indexLocations = nodeEnv.indexLocations(new Index(index)); for (File indexLocation : indexLocations) { if (!indexLocation.exists()) { continue; } FileSystemUtils.deleteRecursively(new File(indexLocation, "_state")); } } private void writeIndex( String reason, IndexMetaData indexMetaData, @Nullable IndexMetaData previousIndexMetaData) throws Exception { logger.trace("[{}] writing state, reason [{}]", indexMetaData.index(), reason); XContentBuilder builder = XContentFactory.contentBuilder(format, new BytesStreamOutput()); builder.startObject(); IndexMetaData.Builder.toXContent(indexMetaData, builder, formatParams); builder.endObject(); builder.flush(); String stateFileName = "state-" + indexMetaData.version(); Exception lastFailure = null; boolean wroteAtLeastOnce = false; for (File indexLocation : nodeEnv.indexLocations(new Index(indexMetaData.index()))) { File stateLocation = new File(indexLocation, "_state"); FileSystemUtils.mkdirs(stateLocation); File stateFile = new File(stateLocation, stateFileName); FileOutputStream fos = null; try { fos = new FileOutputStream(stateFile); BytesReference bytes = builder.bytes(); fos.write(bytes.array(), bytes.arrayOffset(), bytes.length()); fos.getChannel().force(true); fos.close(); wroteAtLeastOnce = true; } catch (Exception e) { lastFailure = e; } finally { IOUtils.closeWhileHandlingException(fos); } } if (!wroteAtLeastOnce) { logger.warn("[{}]: failed to write state", lastFailure, indexMetaData.index()); throw new IOException( "failed to write state for [" + indexMetaData.index() + "]", lastFailure); } // delete the old files if (previousIndexMetaData != null && previousIndexMetaData.version() != indexMetaData.version()) { for (File indexLocation : nodeEnv.indexLocations(new Index(indexMetaData.index()))) { File[] files = new File(indexLocation, "_state").listFiles(); if (files == null) { continue; } for (File file : files) { if (!file.getName().startsWith("state-")) { continue; } if (file.getName().equals(stateFileName)) { continue; } file.delete(); } } } } private void writeGlobalState( String reason, MetaData metaData, 
@Nullable MetaData previousMetaData) throws Exception { logger.trace("[_global] writing state, reason [{}]", reason); // create metadata to write with just the global state MetaData globalMetaData = MetaData.builder().metaData(metaData).removeAllIndices().build(); XContentBuilder builder = XContentFactory.contentBuilder(format); builder.startObject(); MetaData.Builder.toXContent(globalMetaData, builder, formatParams); builder.endObject(); builder.flush(); String globalFileName = "global-" + globalMetaData.version(); Exception lastFailure = null; boolean wroteAtLeastOnce = false; for (File dataLocation : nodeEnv.nodeDataLocations()) { File stateLocation = new File(dataLocation, "_state"); FileSystemUtils.mkdirs(stateLocation); File stateFile = new File(stateLocation, globalFileName); FileOutputStream fos = null; try { fos = new FileOutputStream(stateFile); BytesReference bytes = builder.bytes(); fos.write(bytes.array(), bytes.arrayOffset(), bytes.length()); fos.getChannel().force(true); fos.close(); wroteAtLeastOnce = true; } catch (Exception e) { lastFailure = e; } finally { IOUtils.closeWhileHandlingException(fos); } } if (!wroteAtLeastOnce) { logger.warn("[_global]: failed to write global state", lastFailure); throw new IOException("failed to write global state", lastFailure); } // delete the old files for (File dataLocation : nodeEnv.nodeDataLocations()) { File[] files = new File(dataLocation, "_state").listFiles(); if (files == null) { continue; } for (File file : files) { if (!file.getName().startsWith("global-")) { continue; } if (file.getName().equals(globalFileName)) { continue; } file.delete(); } } } private MetaData loadState() throws Exception { MetaData.Builder metaDataBuilder = MetaData.builder(); MetaData globalMetaData = loadGlobalState(); if (globalMetaData != null) { metaDataBuilder.metaData(globalMetaData); } Set<String> indices = nodeEnv.findAllIndices(); for (String index : indices) { IndexMetaData indexMetaData = loadIndex(index); if (indexMetaData == null) { logger.debug("[{}] failed to find metadata for existing index location", index); } else { metaDataBuilder.put(indexMetaData, false); } } return metaDataBuilder.build(); } @Nullable private IndexMetaData loadIndex(String index) { long highestVersion = -1; IndexMetaData indexMetaData = null; for (File indexLocation : nodeEnv.indexLocations(new Index(index))) { File stateDir = new File(indexLocation, "_state"); if (!stateDir.exists() || !stateDir.isDirectory()) { continue; } // now, iterate over the current versions, and find latest one File[] stateFiles = stateDir.listFiles(); if (stateFiles == null) { continue; } for (File stateFile : stateFiles) { if (!stateFile.getName().startsWith("state-")) { continue; } try { long version = Long.parseLong(stateFile.getName().substring("state-".length())); if (version > highestVersion) { byte[] data = Streams.copyToByteArray(new FileInputStream(stateFile)); if (data.length == 0) { logger.debug( "[{}]: no data for [" + stateFile.getAbsolutePath() + "], ignoring...", index); continue; } XContentParser parser = null; try { parser = XContentHelper.createParser(data, 0, data.length); parser.nextToken(); // move to START_OBJECT indexMetaData = IndexMetaData.Builder.fromXContent(parser); highestVersion = version; } finally { if (parser != null) { parser.close(); } } } } catch (Exception e) { logger.debug( "[{}]: failed to read [" + stateFile.getAbsolutePath() + "], ignoring...", e, index); } } } return indexMetaData; } private MetaData loadGlobalState() { long highestVersion = -1; 
MetaData metaData = null; for (File dataLocation : nodeEnv.nodeDataLocations()) { File stateLocation = new File(dataLocation, "_state"); if (!stateLocation.exists()) { continue; } File[] stateFiles = stateLocation.listFiles(); if (stateFiles == null) { continue; } for (File stateFile : stateFiles) { String name = stateFile.getName(); if (!name.startsWith("global-")) { continue; } try { long version = Long.parseLong(stateFile.getName().substring("global-".length())); if (version > highestVersion) { byte[] data = Streams.copyToByteArray(new FileInputStream(stateFile)); if (data.length == 0) { logger.debug( "[_global] no data for [" + stateFile.getAbsolutePath() + "], ignoring..."); continue; } XContentParser parser = null; try { parser = XContentHelper.createParser(data, 0, data.length); metaData = MetaData.Builder.fromXContent(parser); highestVersion = version; } finally { if (parser != null) { parser.close(); } } } } catch (Exception e) { logger.debug("failed to load global state from [{}]", e, stateFile.getAbsolutePath()); } } } return metaData; } private void pre019Upgrade() throws Exception { long index = -1; File metaDataFile = null; MetaData metaData = null; long version = -1; for (File dataLocation : nodeEnv.nodeDataLocations()) { File stateLocation = new File(dataLocation, "_state"); if (!stateLocation.exists()) { continue; } File[] stateFiles = stateLocation.listFiles(); if (stateFiles == null) { continue; } for (File stateFile : stateFiles) { if (logger.isTraceEnabled()) { logger.trace("[upgrade]: processing [" + stateFile.getName() + "]"); } String name = stateFile.getName(); if (!name.startsWith("metadata-")) { continue; } long fileIndex = Long.parseLong(name.substring(name.indexOf('-') + 1)); if (fileIndex >= index) { // try and read the meta data try { byte[] data = Streams.copyToByteArray(new FileInputStream(stateFile)); if (data.length == 0) { continue; } XContentParser parser = XContentHelper.createParser(data, 0, data.length); try { String currentFieldName = null; XContentParser.Token token = parser.nextToken(); if (token != null) { while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { if (token == XContentParser.Token.FIELD_NAME) { currentFieldName = parser.currentName(); } else if (token == XContentParser.Token.START_OBJECT) { if ("meta-data".equals(currentFieldName)) { metaData = MetaData.Builder.fromXContent(parser); } } else if (token.isValue()) { if ("version".equals(currentFieldName)) { version = parser.longValue(); } } } } } finally { parser.close(); } index = fileIndex; metaDataFile = stateFile; } catch (IOException e) { logger.warn("failed to read pre 0.19 state from [" + name + "], ignoring...", e); } } } } if (metaData == null) { return; } logger.info( "found old metadata state, loading metadata from [{}] and converting to new metadata location and structure...", metaDataFile.getAbsolutePath()); writeGlobalState( "upgrade", MetaData.builder().metaData(metaData).version(version).build(), null); for (IndexMetaData indexMetaData : metaData) { IndexMetaData.Builder indexMetaDataBuilder = IndexMetaData.newIndexMetaDataBuilder(indexMetaData).version(version); // set the created version to 0.18 indexMetaDataBuilder.settings( ImmutableSettings.settingsBuilder() .put(indexMetaData.settings()) .put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_0_18_0)); writeIndex("upgrade", indexMetaDataBuilder.build(), null); } // rename the old metadata state file to a backup file File backupFile = new File(metaDataFile.getParentFile(), "backup-" + metaDataFile.getName()); if 
(!metaDataFile.renameTo(backupFile)) { throw new IOException( "failed to rename old state to backup state [" + metaDataFile.getAbsolutePath() + "]"); } // delete all other old metadata state files for (File dataLocation : nodeEnv.nodeDataLocations()) { File stateLocation = new File(dataLocation, "_state"); if (!stateLocation.exists()) { continue; } File[] stateFiles = stateLocation.listFiles(); if (stateFiles == null) { continue; } for (File stateFile : stateFiles) { String name = stateFile.getName(); if (!name.startsWith("metadata-")) { continue; } stateFile.delete(); } } logger.info( "conversion to new metadata location and format done, backup created at [{}]", backupFile.getAbsolutePath()); } class RemoveDanglingIndex implements Runnable { private final String index; RemoveDanglingIndex(String index) { this.index = index; } @Override public void run() { synchronized (danglingMutex) { DanglingIndex remove = danglingIndices.remove(index); // no longer there... if (remove == null) { return; } logger.info("[{}] deleting dangling index", index); FileSystemUtils.deleteRecursively(nodeEnv.indexLocations(new Index(index))); } } } static class DanglingIndex { public final String index; public final ScheduledFuture future; DanglingIndex(String index, ScheduledFuture future) { this.index = index; this.future = future; } } }
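writeIndex and writeGlobalState above share one crash-safety pattern: write the new state-<version> file, force it to disk, and only then delete older versions, so at any instant at least one complete state file survives, and loadIndex/loadGlobalState recover by picking the highest version on disk. A condensed sketch of the write side, assuming a similar (hypothetical) directory layout:

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

// Sketch of the versioned state-file pattern: write "state-<version>", fsync,
// and only afterwards delete older versions, so a crash mid-write always
// leaves at least one complete file for the loader to pick up.
final class VersionedStateWriterSketch {
    static void write(File stateDir, long version, byte[] data) throws IOException {
        String fileName = "state-" + version;
        File stateFile = new File(stateDir, fileName);
        try (FileOutputStream fos = new FileOutputStream(stateFile)) {
            fos.write(data);
            fos.getChannel().force(true); // fsync before trusting the new file
        }
        File[] files = stateDir.listFiles();
        if (files == null) {
            return;
        }
        for (File file : files) {
            // delete stale versions only after the new one is safely on disk
            if (file.getName().startsWith("state-") && !file.getName().equals(fileName)) {
                file.delete();
            }
        }
    }
}

The reader side then scans for the highest version and ignores empty or unparsable files, exactly as loadIndex and loadGlobalState do above.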
@Override protected void doSample() { // the nodes we are going to ping include the core listed nodes that were added // and the last round of discovered nodes Set<DiscoveryNode> nodesToPing = Sets.newHashSet(); for (DiscoveryNode node : listedNodes) { nodesToPing.add(node); } for (DiscoveryNode node : nodes) { nodesToPing.add(node); } final CountDownLatch latch = new CountDownLatch(nodesToPing.size()); final ConcurrentMap<DiscoveryNode, ClusterStateResponse> clusterStateResponses = ConcurrentCollections.newConcurrentMap(); for (final DiscoveryNode listedNode : nodesToPing) { threadPool .executor(ThreadPool.Names.MANAGEMENT) .execute( new Runnable() { @Override public void run() { try { if (!transportService.nodeConnected(listedNode)) { try { // if its one of the actual nodes we will talk to, not to listed nodes, // fully connect if (nodes.contains(listedNode)) { logger.trace("connecting to cluster node [{}]", listedNode); transportService.connectToNode(listedNode); } else { // its a listed node, light connect to it... logger.trace("connecting to listed node (light) [{}]", listedNode); transportService.connectToNodeLight(listedNode); } } catch (Exception e) { logger.debug( "failed to connect to node [{}], ignoring...", e, listedNode); latch.countDown(); return; } } transportService.sendRequest( listedNode, ClusterStateAction.NAME, headers.applyTo( Requests.clusterStateRequest().clear().nodes(true).local(true)), TransportRequestOptions.options() .withType(TransportRequestOptions.Type.STATE) .withTimeout(pingTimeout), new BaseTransportResponseHandler<ClusterStateResponse>() { @Override public ClusterStateResponse newInstance() { return new ClusterStateResponse(); } @Override public String executor() { return ThreadPool.Names.SAME; } @Override public void handleResponse(ClusterStateResponse response) { clusterStateResponses.put(listedNode, response); latch.countDown(); } @Override public void handleException(TransportException e) { logger.info( "failed to get local cluster state for {}, disconnecting...", e, listedNode); transportService.disconnectFromNode(listedNode); latch.countDown(); } }); } catch (Throwable e) { logger.info( "failed to get local cluster state info for {}, disconnecting...", e, listedNode); transportService.disconnectFromNode(listedNode); latch.countDown(); } } }); } try { latch.await(); } catch (InterruptedException e) { return; } HashSet<DiscoveryNode> newNodes = new HashSet<>(); HashSet<DiscoveryNode> newFilteredNodes = new HashSet<>(); for (Map.Entry<DiscoveryNode, ClusterStateResponse> entry : clusterStateResponses.entrySet()) { if (!ignoreClusterName && !clusterName.equals(entry.getValue().getClusterName())) { logger.warn( "node {} not part of the cluster {}, ignoring...", entry.getValue().getState().nodes().localNode(), clusterName); newFilteredNodes.add(entry.getKey()); continue; } for (ObjectCursor<DiscoveryNode> cursor : entry.getValue().getState().nodes().dataNodes().values()) { newNodes.add(cursor.value); } } nodes = validateNewNodes(newNodes); filteredNodes = ImmutableList.copyOf(newFilteredNodes); }
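The doSample() above is a scatter/gather loop: one management-pool task per node, responses accumulated in a concurrent map, and the latch counted down on every exit path inside the task (response, transport failure, connect failure) so that latch.await() cannot hang on a dead node. A stripped-down sketch of that control flow, with String nodes and a fake ping standing in for the transport layer:

import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;

// Scatter/gather sketch: fan work out across an executor, gather results into
// a concurrent map, and count the latch down on every path inside the task so
// await() always terminates.
final class ScatterGatherSketch {
    static Map<String, String> pingAll(List<String> nodes, ExecutorService executor)
            throws InterruptedException {
        final CountDownLatch latch = new CountDownLatch(nodes.size());
        final Map<String, String> responses = new ConcurrentHashMap<>();
        for (final String node : nodes) {
            executor.execute(() -> {
                try {
                    responses.put(node, "pong from " + node); // stand-in for the real ping
                } finally {
                    latch.countDown(); // runs on success and on failure alike
                }
            });
        }
        latch.await();
        return responses;
    }
}

The try/finally is the compact equivalent of the explicit latch.countDown() calls on each success and failure branch in doSample() above.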
public class TransportService extends AbstractLifecycleComponent<TransportService> { private final AtomicBoolean started = new AtomicBoolean(false); protected final Transport transport; protected final ThreadPool threadPool; volatile ImmutableMap<String, TransportRequestHandler> serverHandlers = ImmutableMap.of(); final Object serverHandlersMutex = new Object(); final ConcurrentMapLong<RequestHolder> clientHandlers = ConcurrentCollections.newConcurrentMapLongWithAggressiveConcurrency(); final AtomicLong requestIds = new AtomicLong(); final CopyOnWriteArrayList<TransportConnectionListener> connectionListeners = new CopyOnWriteArrayList<>(); // An LRU (we don't really care about concurrency here) that holds the latest timed-out requests, so that if they do show up, we can print more descriptive information about them final Map<Long, TimeoutInfoHolder> timeoutInfoHandlers = Collections.synchronizedMap( new LinkedHashMap<Long, TimeoutInfoHolder>(100, .75F, true) { protected boolean removeEldestEntry(Map.Entry eldest) { return size() > 100; } }); private final TransportService.Adapter adapter = new Adapter(); public TransportService(Transport transport, ThreadPool threadPool) { this(EMPTY_SETTINGS, transport, threadPool); } @Inject public TransportService(Settings settings, Transport transport, ThreadPool threadPool) { super(settings); this.transport = transport; this.threadPool = threadPool; } @Override protected void doStart() throws ElasticsearchException { adapter.rxMetric.clear(); adapter.txMetric.clear(); transport.transportServiceAdapter(adapter); transport.start(); if (transport.boundAddress() != null && logger.isInfoEnabled()) { logger.info("{}", transport.boundAddress()); } boolean setStarted = started.compareAndSet(false, true); assert setStarted : "service was already started"; } @Override protected void doStop() throws ElasticsearchException { final boolean setStopped = started.compareAndSet(true, false); assert setStopped : "service has already been stopped"; try { transport.stop(); } finally { // in case the transport is not connected to our local node (which would have cleaned these on node disconnect), make sure to clean any leftover ongoing handlers for (Map.Entry<Long, RequestHolder> entry : clientHandlers.entrySet()) { final RequestHolder holderToNotify = clientHandlers.remove(entry.getKey()); if (holderToNotify != null) { // callback that an exception happened, but on a different thread since we don't want handlers to worry about stack overflows threadPool .generic() .execute( new Runnable() { @Override public void run() { holderToNotify .handler() .handleException( new TransportException( "transport stopped, action: " + holderToNotify.action())); } }); } } } } @Override protected void doClose() throws ElasticsearchException { transport.close(); } public boolean addressSupported(Class<?
extends TransportAddress> address) { return transport.addressSupported(address); } public TransportInfo info() { BoundTransportAddress boundTransportAddress = boundAddress(); if (boundTransportAddress == null) { return null; } return new TransportInfo(boundTransportAddress); } public TransportStats stats() { return new TransportStats( transport.serverOpen(), adapter.rxMetric.count(), adapter.rxMetric.sum(), adapter.txMetric.count(), adapter.txMetric.sum()); } public BoundTransportAddress boundAddress() { return transport.boundAddress(); } public boolean nodeConnected(DiscoveryNode node) { return transport.nodeConnected(node); } public void connectToNode(DiscoveryNode node) throws ConnectTransportException { transport.connectToNode(node); } public void connectToNodeLight(DiscoveryNode node) throws ConnectTransportException { transport.connectToNodeLight(node); } public void disconnectFromNode(DiscoveryNode node) { transport.disconnectFromNode(node); } public void addConnectionListener(TransportConnectionListener listener) { connectionListeners.add(listener); } public void removeConnectionListener(TransportConnectionListener listener) { connectionListeners.remove(listener); } public <T extends TransportResponse> TransportFuture<T> submitRequest( DiscoveryNode node, String action, TransportRequest request, TransportResponseHandler<T> handler) throws TransportException { return submitRequest(node, action, request, TransportRequestOptions.EMPTY, handler); } public <T extends TransportResponse> TransportFuture<T> submitRequest( DiscoveryNode node, String action, TransportRequest request, TransportRequestOptions options, TransportResponseHandler<T> handler) throws TransportException { PlainTransportFuture<T> futureHandler = new PlainTransportFuture<>(handler); sendRequest(node, action, request, options, futureHandler); return futureHandler; } public <T extends TransportResponse> void sendRequest( final DiscoveryNode node, final String action, final TransportRequest request, final TransportResponseHandler<T> handler) { sendRequest(node, action, request, TransportRequestOptions.EMPTY, handler); } public <T extends TransportResponse> void sendRequest( final DiscoveryNode node, final String action, final TransportRequest request, final TransportRequestOptions options, TransportResponseHandler<T> handler) { if (node == null) { throw new ElasticsearchIllegalStateException("can't send request to a null node"); } final long requestId = newRequestId(); // create the timeout handler up front so the stored RequestHolder can actually cancel it later TimeoutHandler timeoutHandler = null; if (options.timeout() != null) { timeoutHandler = new TimeoutHandler(requestId); } try { clientHandlers.put(requestId, new RequestHolder<>(handler, node, action, timeoutHandler)); if (started.get() == false) { // if we are not started, the exception handling will remove the RequestHolder again and call the handler to notify the caller; it will only notify if the doStop code hasn't done the work yet.
throw new TransportException("TransportService is stopped, can't send request"); } if (timeoutHandler != null) { timeoutHandler.future = threadPool.schedule(options.timeout(), ThreadPool.Names.GENERIC, timeoutHandler); } transport.sendRequest(node, requestId, action, request, options); } catch (final Throwable e) { // usually happens either because we failed to connect to the node or because we failed serializing the message final RequestHolder holderToNotify = clientHandlers.remove(requestId); // if the scheduler raises an EsRejectedExecutionException (due to shutdown), we may have a timeout handler, but no future if (timeoutHandler != null) { FutureUtils.cancel(timeoutHandler.future); } // If holderToNotify == null then the handler has already been taken care of. if (holderToNotify != null) { // callback that an exception happened, but on a different thread since we don't want handlers to worry about stack overflows final SendRequestTransportException sendRequestException = new SendRequestTransportException(node, action, e); threadPool .executor(ThreadPool.Names.GENERIC) .execute( new Runnable() { @Override public void run() { holderToNotify.handler().handleException(sendRequestException); } }); } } } private long newRequestId() { return requestIds.getAndIncrement(); } public TransportAddress[] addressesFromString(String address) throws Exception { return transport.addressesFromString(address); } public void registerHandler(String action, TransportRequestHandler handler) { synchronized (serverHandlersMutex) { TransportRequestHandler handlerReplaced = serverHandlers.get(action); serverHandlers = MapBuilder.newMapBuilder(serverHandlers).put(action, handler).immutableMap(); if (handlerReplaced != null) { logger.warn( "Registered two transport handlers for action {}, handlers: {}, {}", action, handler, handlerReplaced); } } } public void removeHandler(String action) { synchronized (serverHandlersMutex) { serverHandlers = MapBuilder.newMapBuilder(serverHandlers).remove(action).immutableMap(); } } protected TransportRequestHandler getHandler(String action) { return serverHandlers.get(action); } class Adapter implements TransportServiceAdapter { final MeanMetric rxMetric = new MeanMetric(); final MeanMetric txMetric = new MeanMetric(); @Override public void received(long size) { rxMetric.inc(size); } @Override public void sent(long size) { txMetric.inc(size); } @Override public TransportRequestHandler handler(String action, Version version) { return serverHandlers.get(ActionNames.incomingAction(action, version)); } @Override public TransportResponseHandler remove(long requestId) { RequestHolder holder = clientHandlers.remove(requestId); if (holder == null) { // let's see if it's in the timeout holder TimeoutInfoHolder timeoutInfoHolder = timeoutInfoHandlers.remove(requestId); if (timeoutInfoHolder != null) { long time = System.currentTimeMillis(); logger.warn( "Received response for a request that has timed out, sent [{}ms] ago, timed out [{}ms] ago, action [{}], node [{}], id [{}]", time - timeoutInfoHolder.sentTime(), time - timeoutInfoHolder.timeoutTime(), timeoutInfoHolder.action(), timeoutInfoHolder.node(), requestId); } else { logger.warn("Transport response handler not found for id [{}]", requestId); } return null; } holder.cancel(); return holder.handler(); } @Override public void raiseNodeConnected(final DiscoveryNode node) { threadPool .generic() .execute( new Runnable() { @Override public void run() { for
(TransportConnectionListener connectionListener : connectionListeners) { connectionListener.onNodeConnected(node); } } }); } @Override public void raiseNodeDisconnected(final DiscoveryNode node) { try { for (final TransportConnectionListener connectionListener : connectionListeners) { threadPool .generic() .execute( new Runnable() { @Override public void run() { connectionListener.onNodeDisconnected(node); } }); } for (Map.Entry<Long, RequestHolder> entry : clientHandlers.entrySet()) { RequestHolder holder = entry.getValue(); if (holder.node().equals(node)) { final RequestHolder holderToNotify = clientHandlers.remove(entry.getKey()); if (holderToNotify != null) { // callback that an exception happened, but on a different thread since we don't // want handlers to worry about stack overflows threadPool .generic() .execute( new Runnable() { @Override public void run() { holderToNotify .handler() .handleException( new NodeDisconnectedException(node, holderToNotify.action())); } }); } } } } catch (EsRejectedExecutionException ex) { logger.debug("Rejected execution on NodeDisconnected", ex); } } @Override public String action(String action, Version version) { return ActionNames.outgoingAction(action, version); } } class TimeoutHandler implements Runnable { private final long requestId; private final long sentTime = System.currentTimeMillis(); ScheduledFuture future; TimeoutHandler(long requestId) { this.requestId = requestId; } public long sentTime() { return sentTime; } @Override public void run() { if (future.isCancelled()) { return; } final RequestHolder holder = clientHandlers.remove(requestId); if (holder != null) { // add it to the timeout information holder, in case we are going to get a response later long timeoutTime = System.currentTimeMillis(); timeoutInfoHandlers.put( requestId, new TimeoutInfoHolder(holder.node(), holder.action(), sentTime, timeoutTime)); holder .handler() .handleException( new ReceiveTimeoutTransportException( holder.node(), holder.action(), "request_id [" + requestId + "] timed out after [" + (timeoutTime - sentTime) + "ms]")); } } } static class TimeoutInfoHolder { private final DiscoveryNode node; private final String action; private final long sentTime; private final long timeoutTime; TimeoutInfoHolder(DiscoveryNode node, String action, long sentTime, long timeoutTime) { this.node = node; this.action = action; this.sentTime = sentTime; this.timeoutTime = timeoutTime; } public DiscoveryNode node() { return node; } public String action() { return action; } public long sentTime() { return sentTime; } public long timeoutTime() { return timeoutTime; } } static class RequestHolder<T extends TransportResponse> { private final TransportResponseHandler<T> handler; private final DiscoveryNode node; private final String action; private final TimeoutHandler timeout; RequestHolder( TransportResponseHandler<T> handler, DiscoveryNode node, String action, TimeoutHandler timeout) { this.handler = handler; this.node = node; this.action = action; this.timeout = timeout; } public TransportResponseHandler<T> handler() { return handler; } public DiscoveryNode node() { return this.node; } public String action() { return this.action; } public void cancel() { if (timeout != null) { FutureUtils.cancel(timeout.future); } } } }
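/*
 * A minimal, JDK-only sketch (illustrative names, not the Elasticsearch API) of the request-tracking
 * idiom TransportService uses above: a monotonically increasing request id correlates each in-flight
 * handler in a concurrent map, while a small access-ordered LRU remembers recently timed-out requests
 * purely so that late responses can be explained in the logs.
 */
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicLong;

class RequestTrackerSketch<H> {
  private final AtomicLong requestIds = new AtomicLong();
  private final ConcurrentMap<Long, H> inFlight = new ConcurrentHashMap<>();
  // Bounded LRU (capacity 100, access order); synchronized because LinkedHashMap is not thread-safe.
  private final Map<Long, String> recentlyTimedOut =
      Collections.synchronizedMap(
          new LinkedHashMap<Long, String>(100, .75F, true) {
            @Override
            protected boolean removeEldestEntry(Map.Entry<Long, String> eldest) {
              return size() > 100;
            }
          });

  long register(H handler) {
    long id = requestIds.getAndIncrement();
    inFlight.put(id, handler);
    return id;
  }

  /** Returns the waiting handler, or null if the request already timed out or was never known. */
  H onResponse(long id) {
    H handler = inFlight.remove(id);
    if (handler == null && recentlyTimedOut.remove(id) != null) {
      // the real code logs how long ago the request was sent and how long ago it timed out
    }
    return handler;
  }

  void onTimeout(long id, String description) {
    if (inFlight.remove(id) != null) {
      recentlyTimedOut.put(id, description);
    }
  }
}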
/** * Inner class responsible for sending the requests to all replica shards and managing the responses. */ final class ReplicationPhase extends AbstractRunnable { private final ReplicaRequest replicaRequest; private final Response finalResponse; private final ShardIterator shardIt; private final ActionListener<Response> listener; private final AtomicBoolean finished = new AtomicBoolean(false); private final AtomicInteger success = new AtomicInteger(1); // We already wrote into the primary shard private final ConcurrentMap<String, Throwable> shardReplicaFailures = ConcurrentCollections.newConcurrentMap(); private final IndexMetaData indexMetaData; private final ShardRouting originalPrimaryShard; private final AtomicInteger pending; private final int totalShards; private final ClusterStateObserver observer; private final Releasable indexShardReference; private final TimeValue shardFailedTimeout; /** * the constructor doesn't take any action, just calculates state. Call {@link #run()} to start * replicating. */ public ReplicationPhase( ShardIterator originalShardIt, ReplicaRequest replicaRequest, Response finalResponse, ClusterStateObserver observer, ShardRouting originalPrimaryShard, InternalRequest internalRequest, ActionListener<Response> listener, Releasable indexShardReference, TimeValue shardFailedTimeout) { this.replicaRequest = replicaRequest; this.listener = listener; this.finalResponse = finalResponse; this.originalPrimaryShard = originalPrimaryShard; this.observer = observer; indexMetaData = observer.observedState().metaData().index(internalRequest.concreteIndex()); this.indexShardReference = indexShardReference; this.shardFailedTimeout = shardFailedTimeout; ShardRouting shard; // we double check on the state: if it got changed, we need to make sure we take the latest one, because maybe a replica shard started its recovery process and we need to apply it there... // we also need to make sure that if the new state has a new primary shard (that we indexed to before) started and assigned to another node (while the indexing happened), we apply it on the new primary shard as well... ClusterState newState = clusterService.state(); int numberOfUnassignedOrIgnoredReplicas = 0; int numberOfPendingShardInstances = 0; if (observer.observedState() != newState) { observer.reset(newState); shardIt = shards(newState, internalRequest); while ((shard = shardIt.nextOrNull()) != null) { if (shard.primary()) { if (originalPrimaryShard.currentNodeId().equals(shard.currentNodeId()) == false) { // there is a new primary, we'll have to replicate to it.
numberOfPendingShardInstances++; } if (shard.relocating()) { numberOfPendingShardInstances++; } } else if (shouldExecuteReplication(indexMetaData.getSettings()) == false) { // If the replicas use shadow replicas, there is no reason to perform the action on the replica, so skip it and immediately return. This delays mapping updates on replicas because they have to wait until they get the new mapping through the cluster state, which is why we recommend pre-defined mappings for indices using shadow replicas numberOfUnassignedOrIgnoredReplicas++; } else if (shard.unassigned()) { numberOfUnassignedOrIgnoredReplicas++; } else if (shard.relocating()) { // we need to send to two copies numberOfPendingShardInstances += 2; } else { numberOfPendingShardInstances++; } } } else { shardIt = originalShardIt; shardIt.reset(); while ((shard = shardIt.nextOrNull()) != null) { if (shard.unassigned()) { numberOfUnassignedOrIgnoredReplicas++; } else if (shard.primary()) { if (shard.relocating()) { // we have to replicate to the other copy numberOfPendingShardInstances += 1; } } else if (shouldExecuteReplication(indexMetaData.getSettings()) == false) { // If the replicas use shadow replicas, there is no reason to perform the action on the replica, so skip it and immediately return. This delays mapping updates on replicas because they have to wait until they get the new mapping through the cluster state, which is why we recommend pre-defined mappings for indices using shadow replicas numberOfUnassignedOrIgnoredReplicas++; } else if (shard.relocating()) { // we need to send to two copies numberOfPendingShardInstances += 2; } else { numberOfPendingShardInstances++; } } } // one for the primary, already done this.totalShards = 1 + numberOfPendingShardInstances + numberOfUnassignedOrIgnoredReplicas; this.pending = new AtomicInteger(numberOfPendingShardInstances); } /** total shard copies */ int totalShards() { return totalShards; } /** total successful operations so far */ int successful() { return success.get(); } /** number of pending operations */ int pending() { return pending.get(); } @Override public void onFailure(Throwable t) { logger.error( "unexpected error while replicating for action [{}]. shard [{}]. ", t, actionName, shardIt.shardId()); forceFinishAsFailed(t); } /** start sending current requests to replicas */ @Override protected void doRun() { if (pending.get() == 0) { doFinish(); return; } ShardRouting shard; shardIt.reset(); // reset the iterator while ((shard = shardIt.nextOrNull()) != null) { // if it's unassigned, nothing to do here... if (shard.unassigned()) { continue; } // we index on a replica that is initializing as well, since we might not have got the event yet that it was started. We will get an IllegalShardState exception if it's not started, and that's fine, we will ignore it if (shard.primary()) { if (originalPrimaryShard.currentNodeId().equals(shard.currentNodeId()) == false) { // there is a new primary, we'll have to replicate to it.
performOnReplica(shard, shard.currentNodeId()); } if (shard.relocating()) { performOnReplica(shard, shard.relocatingNodeId()); } } else if (shouldExecuteReplication(indexMetaData.getSettings())) { performOnReplica(shard, shard.currentNodeId()); if (shard.relocating()) { performOnReplica(shard, shard.relocatingNodeId()); } } } } /** send operation to the given node or perform it if local */ void performOnReplica(final ShardRouting shard, final String nodeId) { // if we don't have that node, it means that it might have failed and will be created again; in this case, we don't have to do the operation, and can just let it fail over if (!observer.observedState().nodes().nodeExists(nodeId)) { onReplicaFailure(nodeId, null); return; } replicaRequest.internalShardId = shardIt.shardId(); if (!nodeId.equals(observer.observedState().nodes().localNodeId())) { final DiscoveryNode node = observer.observedState().nodes().get(nodeId); transportService.sendRequest( node, transportReplicaAction, replicaRequest, transportOptions, new EmptyTransportResponseHandler(ThreadPool.Names.SAME) { @Override public void handleResponse(TransportResponse.Empty vResponse) { onReplicaSuccess(); } @Override public void handleException(TransportException exp) { logger.trace( "[{}] transport failure during replica request [{}] ", exp, node, replicaRequest); if (ignoreReplicaException(exp)) { onReplicaFailure(nodeId, exp); } else { logger.warn( "{} failed to perform {} on node {}", exp, shardIt.shardId(), actionName, node); shardStateAction.shardFailed( shard, indexMetaData.getIndexUUID(), "failed to perform " + actionName + " on replica on node " + node, exp, shardFailedTimeout, new ReplicationFailedShardStateListener(nodeId, exp)); } } }); } else { try { threadPool .executor(executor) .execute( new AbstractRunnable() { @Override protected void doRun() { try { shardOperationOnReplica(shard.shardId(), replicaRequest); onReplicaSuccess(); } catch (Throwable e) { onReplicaFailure(nodeId, e); failReplicaIfNeeded(shard.index(), shard.id(), e); } } // we must never reject because of thread pool capacity on replicas @Override public boolean isForceExecution() { return true; } @Override public void onFailure(Throwable t) { onReplicaFailure(nodeId, t); } }); } catch (Throwable e) { failReplicaIfNeeded(shard.index(), shard.id(), e); onReplicaFailure(nodeId, e); } } } void onReplicaFailure(String nodeId, @Nullable Throwable e) { // only version conflicts should be excluded from the _shards header?
if (e != null && ignoreReplicaException(e) == false) { shardReplicaFailures.put(nodeId, e); } decPendingAndFinishIfNeeded(); } void onReplicaSuccess() { success.incrementAndGet(); decPendingAndFinishIfNeeded(); } private void decPendingAndFinishIfNeeded() { if (pending.decrementAndGet() <= 0) { doFinish(); } } private void forceFinishAsFailed(Throwable t) { if (finished.compareAndSet(false, true)) { Releasables.close(indexShardReference); listener.onFailure(t); } } private void doFinish() { if (finished.compareAndSet(false, true)) { Releasables.close(indexShardReference); final ShardId shardId = shardIt.shardId(); final ActionWriteResponse.ShardInfo.Failure[] failuresArray; if (!shardReplicaFailures.isEmpty()) { int slot = 0; failuresArray = new ActionWriteResponse.ShardInfo.Failure[shardReplicaFailures.size()]; for (Map.Entry<String, Throwable> entry : shardReplicaFailures.entrySet()) { RestStatus restStatus = ExceptionsHelper.status(entry.getValue()); failuresArray[slot++] = new ActionWriteResponse.ShardInfo.Failure( shardId.getIndex(), shardId.getId(), entry.getKey(), entry.getValue(), restStatus, false); } } else { failuresArray = ActionWriteResponse.EMPTY; } finalResponse.setShardInfo( new ActionWriteResponse.ShardInfo(totalShards, success.get(), failuresArray)); listener.onResponse(finalResponse); } } public class ReplicationFailedShardStateListener implements ShardStateAction.Listener { private final String nodeId; private Throwable failure; public ReplicationFailedShardStateListener(String nodeId, Throwable failure) { this.nodeId = nodeId; this.failure = failure; } @Override public void onSuccess() { onReplicaFailure(nodeId, failure); } @Override public void onShardFailedNoMaster() { onReplicaFailure(nodeId, failure); } @Override public void onShardFailedFailure(DiscoveryNode master, TransportException e) { if (e instanceof ReceiveTimeoutTransportException) { logger.trace("timeout sending shard failure to master [{}]", e, master); } onReplicaFailure(nodeId, failure); } } }
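/*
 * ReplicationPhase completes exactly once: an AtomicInteger tracks pending replica operations and an
 * AtomicBoolean guards the terminal callback, so a forced failure and the final decrement cannot both
 * notify the listener. A reduced sketch of that idiom; the Listener type here is a made-up placeholder.
 */
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

class FinishOnceSketch {
  interface Listener {
    void onDone(int successes);
    void onFailure(Throwable t);
  }

  private final AtomicInteger pending;
  private final AtomicInteger success = new AtomicInteger(1); // the primary already succeeded
  private final AtomicBoolean finished = new AtomicBoolean(false);
  private final Listener listener;

  FinishOnceSketch(int pendingReplicaOps, Listener listener) {
    this.pending = new AtomicInteger(pendingReplicaOps);
    this.listener = listener;
  }

  void onReplicaSuccess() {
    success.incrementAndGet();
    decPendingAndFinishIfNeeded();
  }

  void onReplicaFailure() {
    decPendingAndFinishIfNeeded(); // failures are recorded elsewhere; they still count down
  }

  void forceFinishAsFailed(Throwable t) {
    if (finished.compareAndSet(false, true)) { // the loser of the race does nothing
      listener.onFailure(t);
    }
  }

  private void decPendingAndFinishIfNeeded() {
    if (pending.decrementAndGet() <= 0 && finished.compareAndSet(false, true)) {
      listener.onDone(success.get());
    }
  }
}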
/** * Ensures that the mapping in the cluster state is the same as the mapping in our mapper service. * If the mapping is not in sync, sends a request to update it in the cluster state and blocks * until it has finished being updated. */ private void updateMappingOnMaster() { // we check that the cluster state is in sync with our in-memory mapping stored by the mapperService; we have to do it under the "cluster state update" thread to make sure that no one modifies it while we're checking final BlockingQueue<DocumentMapper> documentMappersToUpdate = ConcurrentCollections.newBlockingQueue(); final CountDownLatch latch = new CountDownLatch(1); final AtomicReference<Throwable> mappingCheckException = new AtomicReference<>(); // we use IMMEDIATE as this is a very lightweight check and we don't want to delay recovery clusterService.submitStateUpdateTask( "recovery_mapping_check", Priority.IMMEDIATE, new MappingUpdateTask( clusterService, indexService, recoverySettings, latch, documentMappersToUpdate, mappingCheckException, this.cancellableThreads)); cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { latch.await(); } }); if (mappingCheckException.get() != null) { logger.warn("error during mapping check, failing recovery", mappingCheckException.get()); throw new ElasticsearchException("error during mapping check", mappingCheckException.get()); } if (documentMappersToUpdate.isEmpty()) { return; } final CountDownLatch updatedOnMaster = new CountDownLatch(documentMappersToUpdate.size()); MappingUpdatedAction.MappingUpdateListener listener = new MappingUpdatedAction.MappingUpdateListener() { @Override public void onMappingUpdate() { updatedOnMaster.countDown(); } @Override public void onFailure(Throwable t) { logger.debug( "{} recovery to {}: failed to update mapping on master", request.shardId(), request.targetNode(), t); updatedOnMaster.countDown(); } }; for (DocumentMapper documentMapper : documentMappersToUpdate) { mappingUpdatedAction.updateMappingOnMaster( indexService.index().getName(), documentMapper, indexService.indexUUID(), listener); } cancellableThreads.execute( new Interruptable() { @Override public void run() throws InterruptedException { try { if (!updatedOnMaster.await( recoverySettings.internalActionTimeout().millis(), TimeUnit.MILLISECONDS)) { logger.debug( "[{}][{}] recovery [phase2] to {}: waiting on pending mapping update timed out. waited [{}]", indexName, shardId, request.targetNode(), recoverySettings.internalActionTimeout()); } } catch (InterruptedException e) { Thread.currentThread().interrupt(); logger.debug("interrupted while waiting for mapping to update on master"); } } }); }
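/*
 * updateMappingOnMaster blocks on two CountDownLatches: an unbounded await for the cluster-state
 * task and a bounded await for the master acknowledgements that only logs on timeout. A compilable
 * sketch of the bounded wait; the 30-second value is an assumed stand-in for
 * recoverySettings.internalActionTimeout(), and both callbacks count down so the waiter cannot hang.
 */
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

class BoundedWaitSketch {
  public static void main(String[] args) throws InterruptedException {
    int pendingUpdates = 3; // e.g. one per DocumentMapper that needs syncing
    CountDownLatch updatedOnMaster = new CountDownLatch(pendingUpdates);

    // Success and failure listeners alike count down, mirroring MappingUpdateListener above.
    for (int i = 0; i < pendingUpdates; i++) {
      new Thread(updatedOnMaster::countDown).start();
    }

    if (!updatedOnMaster.await(30, TimeUnit.SECONDS)) { // assumed timeout value
      System.out.println("waiting on pending mapping update timed out");
    } else {
      System.out.println("all mapping updates acknowledged");
    }
  }
}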
/** * The tribe service holds a list of node clients connected to a list of tribe members, and uses * their cluster state events to update this local node's cluster state with the merged view of them. * * <p>The {@link #processSettings(org.elasticsearch.common.settings.Settings)} method should be * called before starting the node, so it will make sure to configure this current node properly * with the relevant tribe node settings. * * <p>The tribe node settings make sure the discovery used is "local", but with no master elected. * This means no write-level master node operations will work ({@link * org.elasticsearch.discovery.MasterNotDiscoveredException} will be thrown), and state-level * metadata operations will automatically use the local flag. * * <p>The state merged from the different clusters includes the list of nodes, metadata, and routing * table. Each node merged will have in its attributes which tribe member it came from. Each index merged * will have in its settings which tribe member it came from. In case an index has already been * merged from one cluster, and an index with the same name is discovered in another cluster, the * conflicting one will be discarded. This happens because we need to have the correct index name to * propagate to the relevant cluster. */ public class TribeService extends AbstractLifecycleComponent<TribeService> { public static final ClusterBlock TRIBE_METADATA_BLOCK = new ClusterBlock( 10, "tribe node, metadata not allowed", false, false, RestStatus.BAD_REQUEST, EnumSet.of(ClusterBlockLevel.METADATA_READ, ClusterBlockLevel.METADATA_WRITE)); public static final ClusterBlock TRIBE_WRITE_BLOCK = new ClusterBlock( 11, "tribe node, write not allowed", false, false, RestStatus.BAD_REQUEST, EnumSet.of(ClusterBlockLevel.WRITE)); public static Settings processSettings(Settings settings) { if (settings.get(TRIBE_NAME) != null) { // if it's a node client started by this service as a tribe, remove any tribe group setting to avoid recursive configuration Settings.Builder sb = Settings.builder().put(settings); for (String s : settings.getAsMap().keySet()) { if (s.startsWith("tribe.") && !s.equals(TRIBE_NAME)) { sb.remove(s); } } return sb.build(); } Map<String, Settings> nodesSettings = settings.getGroups("tribe", true); if (nodesSettings.isEmpty()) { return settings; } // it's a tribe-configured node..., force settings Settings.Builder sb = Settings.builder().put(settings); sb.put(Node.NODE_CLIENT_SETTING.getKey(), true); // this node should just act as a node client sb.put( DiscoveryModule.DISCOVERY_TYPE_SETTING.getKey(), "local"); // a tribe node should not use zen discovery sb.put( DiscoveryService.INITIAL_STATE_TIMEOUT_SETTING.getKey(), 0); // nothing is going to be discovered, since no master will be elected if (sb.get("cluster.name") == null) { sb.put( "cluster.name", "tribe_" + Strings.randomBase64UUID()); // make sure it won't join other tribe nodes in the same JVM } sb.put(TransportMasterNodeReadAction.FORCE_LOCAL_SETTING, true); return sb.build(); } public static final String TRIBE_NAME = "tribe.name"; private final ClusterService clusterService; private final String[] blockIndicesWrite; private final String[] blockIndicesRead; private final String[] blockIndicesMetadata; private static final String ON_CONFLICT_ANY = "any", ON_CONFLICT_DROP = "drop", ON_CONFLICT_PREFER = "prefer_"; private final String onConflict; private final Set<String> droppedIndices = ConcurrentCollections.newConcurrentSet(); private final List<Node> nodes = new CopyOnWriteArrayList<>(); @Inject public
TribeService( Settings settings, ClusterService clusterService, DiscoveryService discoveryService) { super(settings); this.clusterService = clusterService; Map<String, Settings> nodesSettings = new HashMap<>(settings.getGroups("tribe", true)); nodesSettings.remove("blocks"); // remove prefix settings that don't indicate a client nodesSettings.remove("on_conflict"); // remove prefix settings that don't indicate a client for (Map.Entry<String, Settings> entry : nodesSettings.entrySet()) { Settings.Builder sb = Settings.builder().put(entry.getValue()); sb.put("name", settings.get("name") + "/" + entry.getKey()); sb.put( Environment.PATH_HOME_SETTING.getKey(), Environment.PATH_HOME_SETTING.get(settings)); // pass through ES home dir sb.put(TRIBE_NAME, entry.getKey()); if (sb.get("http.enabled") == null) { sb.put("http.enabled", false); } sb.put(Node.NODE_CLIENT_SETTING.getKey(), true); nodes.add(new TribeClientNode(sb.build())); } String[] blockIndicesWrite = Strings.EMPTY_ARRAY; String[] blockIndicesRead = Strings.EMPTY_ARRAY; String[] blockIndicesMetadata = Strings.EMPTY_ARRAY; if (!nodes.isEmpty()) { // remove the initial election / recovery blocks since we are not going to have a master elected in this single tribe node local "cluster" clusterService.removeInitialStateBlock(discoveryService.getNoMasterBlock()); clusterService.removeInitialStateBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK); if (settings.getAsBoolean("tribe.blocks.write", false)) { clusterService.addInitialStateBlock(TRIBE_WRITE_BLOCK); } blockIndicesWrite = settings.getAsArray("tribe.blocks.write.indices", Strings.EMPTY_ARRAY); if (settings.getAsBoolean("tribe.blocks.metadata", false)) { clusterService.addInitialStateBlock(TRIBE_METADATA_BLOCK); } blockIndicesMetadata = settings.getAsArray("tribe.blocks.metadata.indices", Strings.EMPTY_ARRAY); blockIndicesRead = settings.getAsArray("tribe.blocks.read.indices", Strings.EMPTY_ARRAY); for (Node node : nodes) { node.injector().getInstance(ClusterService.class).add(new TribeClusterStateListener(node)); } } this.blockIndicesMetadata = blockIndicesMetadata; this.blockIndicesRead = blockIndicesRead; this.blockIndicesWrite = blockIndicesWrite; this.onConflict = settings.get("tribe.on_conflict", ON_CONFLICT_ANY); } @Override protected void doStart() { for (Node node : nodes) { try { node.start(); } catch (Throwable e) { // calling close is safe for non-started nodes, we can just iterate over all for (Node otherNode : nodes) { try { otherNode.close(); } catch (Throwable t) { logger.warn("failed to close node {} on failed start", t, otherNode); } } if (e instanceof RuntimeException) { throw (RuntimeException) e; } throw new ElasticsearchException(e); } } } @Override protected void doStop() { doClose(); } @Override protected void doClose() { for (Node node : nodes) { try { node.close(); } catch (Throwable t) { logger.warn("failed to close node {}", t, node); } } } class TribeClusterStateListener implements ClusterStateListener { private final String tribeName; private final TribeNodeClusterStateTaskExecutor executor; TribeClusterStateListener(Node tribeNode) { String tribeName = tribeNode.settings().get(TRIBE_NAME); this.tribeName = tribeName; executor = new TribeNodeClusterStateTaskExecutor(tribeName); } @Override public void clusterChanged(final ClusterChangedEvent event) { logger.debug("[{}] received cluster event, [{}]", tribeName, event.source()); clusterService.submitStateUpdateTask( "cluster event from " + tribeName + ", " + event.source(), event,
ClusterStateTaskConfig.build(Priority.NORMAL), executor, (source, t) -> logger.warn("failed to process [{}]", t, source)); } } class TribeNodeClusterStateTaskExecutor implements ClusterStateTaskExecutor<ClusterChangedEvent> { private final String tribeName; TribeNodeClusterStateTaskExecutor(String tribeName) { this.tribeName = tribeName; } @Override public boolean runOnlyOnMaster() { return false; } @Override public BatchResult<ClusterChangedEvent> execute( ClusterState currentState, List<ClusterChangedEvent> tasks) throws Exception { ClusterState accumulator = ClusterState.builder(currentState).build(); BatchResult.Builder<ClusterChangedEvent> builder = BatchResult.builder(); try { // we only need to apply the latest cluster state update accumulator = applyUpdate(accumulator, tasks.get(tasks.size() - 1)); builder.successes(tasks); } catch (Throwable t) { builder.failures(tasks, t); } return builder.build(accumulator); } private ClusterState applyUpdate(ClusterState currentState, ClusterChangedEvent task) { boolean clusterStateChanged = false; ClusterState tribeState = task.state(); DiscoveryNodes.Builder nodes = DiscoveryNodes.builder(currentState.nodes()); // -- merge nodes // go over existing nodes, and see if they need to be removed for (DiscoveryNode discoNode : currentState.nodes()) { String markedTribeName = discoNode.attributes().get(TRIBE_NAME); if (markedTribeName != null && markedTribeName.equals(tribeName)) { if (tribeState.nodes().get(discoNode.id()) == null) { clusterStateChanged = true; logger.info("[{}] removing node [{}]", tribeName, discoNode); nodes.remove(discoNode.id()); } } } // go over tribe nodes, and see if they need to be added for (DiscoveryNode tribe : tribeState.nodes()) { if (currentState.nodes().get(tribe.id()) == null) { // a new node, add it, but also add the tribe name to the attributes Map<String, String> tribeAttr = new HashMap<>(); for (ObjectObjectCursor<String, String> attr : tribe.attributes()) { tribeAttr.put(attr.key, attr.value); } tribeAttr.put(TRIBE_NAME, tribeName); DiscoveryNode discoNode = new DiscoveryNode( tribe.name(), tribe.id(), tribe.getHostName(), tribe.getHostAddress(), tribe.address(), unmodifiableMap(tribeAttr), tribe.version()); clusterStateChanged = true; logger.info("[{}] adding node [{}]", tribeName, discoNode); nodes.put(discoNode); } } // -- merge metadata ClusterBlocks.Builder blocks = ClusterBlocks.builder().blocks(currentState.blocks()); MetaData.Builder metaData = MetaData.builder(currentState.metaData()); RoutingTable.Builder routingTable = RoutingTable.builder(currentState.routingTable()); // go over existing indices, and see if they need to be removed for (IndexMetaData index : currentState.metaData()) { String markedTribeName = index.getSettings().get(TRIBE_NAME); if (markedTribeName != null && markedTribeName.equals(tribeName)) { IndexMetaData tribeIndex = tribeState.metaData().index(index.getIndex()); clusterStateChanged = true; if (tribeIndex == null || tribeIndex.getState() == IndexMetaData.State.CLOSE) { logger.info("[{}] removing index [{}]", tribeName, index.getIndex()); removeIndex(blocks, metaData, routingTable, index); } else { // always make sure to update the metadata and routing table, in case // there are changes in them (new mapping, shards moving from initializing to started) routingTable.add(tribeState.routingTable().index(index.getIndex())); Settings tribeSettings = Settings.builder().put(tribeIndex.getSettings()).put(TRIBE_NAME, tribeName).build(); 
metaData.put(IndexMetaData.builder(tribeIndex).settings(tribeSettings)); } } } // go over tribe indices, and see if they need to be added for (IndexMetaData tribeIndex : tribeState.metaData()) { // if there is no routing table yet, do nothing with it... IndexRoutingTable table = tribeState.routingTable().index(tribeIndex.getIndex()); if (table == null) { continue; } final IndexMetaData indexMetaData = currentState.metaData().index(tribeIndex.getIndex()); if (indexMetaData == null) { if (!droppedIndices.contains(tribeIndex.getIndex())) { // a new index, add it, and add the tribe name as a setting clusterStateChanged = true; logger.info("[{}] adding index [{}]", tribeName, tribeIndex.getIndex()); addNewIndex(tribeState, blocks, metaData, routingTable, tribeIndex); } } else { String existingFromTribe = indexMetaData.getSettings().get(TRIBE_NAME); if (!tribeName.equals(existingFromTribe)) { // we have a potential conflict on index names, decide what to do... if (ON_CONFLICT_ANY.equals(onConflict)) { // we chose any tribe, carry on } else if (ON_CONFLICT_DROP.equals(onConflict)) { // drop the indices, there is a conflict clusterStateChanged = true; logger.info( "[{}] dropping index [{}] due to conflict with [{}]", tribeName, tribeIndex.getIndex(), existingFromTribe); removeIndex(blocks, metaData, routingTable, tribeIndex); droppedIndices.add(tribeIndex.getIndex()); } else if (onConflict.startsWith(ON_CONFLICT_PREFER)) { // on conflict, prefer a tribe... String preferredTribeName = onConflict.substring(ON_CONFLICT_PREFER.length()); if (tribeName.equals(preferredTribeName)) { // the new one is the preferred one, replace... clusterStateChanged = true; logger.info( "[{}] adding index [{}], preferred over [{}]", tribeName, tribeIndex.getIndex(), existingFromTribe); removeIndex(blocks, metaData, routingTable, tribeIndex); addNewIndex(tribeState, blocks, metaData, routingTable, tribeIndex); } // else: either the existing one is the preferred one, or we haven't seen one, carry on } } } } if (!clusterStateChanged) { return currentState; } else { return ClusterState.builder(currentState) .incrementVersion() .blocks(blocks) .nodes(nodes) .metaData(metaData) .routingTable(routingTable.build()) .build(); } } private void removeIndex( ClusterBlocks.Builder blocks, MetaData.Builder metaData, RoutingTable.Builder routingTable, IndexMetaData index) { metaData.remove(index.getIndex()); routingTable.remove(index.getIndex()); blocks.removeIndexBlocks(index.getIndex()); } private void addNewIndex( ClusterState tribeState, ClusterBlocks.Builder blocks, MetaData.Builder metaData, RoutingTable.Builder routingTable, IndexMetaData tribeIndex) { Settings tribeSettings = Settings.builder().put(tribeIndex.getSettings()).put(TRIBE_NAME, tribeName).build(); metaData.put(IndexMetaData.builder(tribeIndex).settings(tribeSettings)); routingTable.add(tribeState.routingTable().index(tribeIndex.getIndex())); if (Regex.simpleMatch(blockIndicesMetadata, tribeIndex.getIndex())) { blocks.addIndexBlock(tribeIndex.getIndex(), IndexMetaData.INDEX_METADATA_BLOCK); } if (Regex.simpleMatch(blockIndicesRead, tribeIndex.getIndex())) { blocks.addIndexBlock(tribeIndex.getIndex(), IndexMetaData.INDEX_READ_BLOCK); } if (Regex.simpleMatch(blockIndicesWrite, tribeIndex.getIndex())) { blocks.addIndexBlock(tribeIndex.getIndex(), IndexMetaData.INDEX_WRITE_BLOCK); } } } }
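/*
 * TribeService settles duplicate index names through the tribe.on_conflict setting: "any", "drop",
 * or "prefer_<tribe>". A standalone sketch of just that decision, using an illustrative enum result
 * instead of the real cluster-state mutation.
 */
class OnConflictSketch {
  enum Action { KEEP_EXISTING, DROP, REPLACE_WITH_NEW }

  /** onConflict is "any", "drop", or "prefer_" + a tribe name; newTribe offers the duplicate index. */
  static Action resolve(String onConflict, String newTribe) {
    if ("drop".equals(onConflict)) {
      return Action.DROP; // the index is removed and remembered in droppedIndices
    }
    if (onConflict.startsWith("prefer_")) {
      String preferred = onConflict.substring("prefer_".length());
      return newTribe.equals(preferred) ? Action.REPLACE_WITH_NEW : Action.KEEP_EXISTING;
    }
    return Action.KEEP_EXISTING; // "any": whichever tribe registered the name first wins
  }

  public static void main(String[] args) {
    System.out.println(resolve("prefer_t1", "t1")); // REPLACE_WITH_NEW
    System.out.println(resolve("drop", "t2"));      // DROP
    System.out.println(resolve("any", "t2"));       // KEEP_EXISTING
  }
}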
public class GatewayAllocator extends AbstractComponent { public static final String INDEX_RECOVERY_INITIAL_SHARDS = "index.recovery.initial_shards"; private final TransportNodesListGatewayStartedShards listGatewayStartedShards; private final TransportNodesListShardStoreMetaData listShardStoreMetaData; private final ConcurrentMap< ShardId, Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>> cachedStores = ConcurrentCollections.newConcurrentMap(); private final ConcurrentMap<ShardId, ObjectLongOpenHashMap<DiscoveryNode>> cachedShardsState = ConcurrentCollections.newConcurrentMap(); private final TimeValue listTimeout; private final String initialShards; @Inject public GatewayAllocator( Settings settings, TransportNodesListGatewayStartedShards listGatewayStartedShards, TransportNodesListShardStoreMetaData listShardStoreMetaData) { super(settings); this.listGatewayStartedShards = listGatewayStartedShards; this.listShardStoreMetaData = listShardStoreMetaData; this.listTimeout = componentSettings.getAsTime( "list_timeout", settings.getAsTime("gateway.local.list_timeout", TimeValue.timeValueSeconds(30))); this.initialShards = componentSettings.get( "initial_shards", settings.get("gateway.local.initial_shards", "quorum")); logger.debug("using initial_shards [{}], list_timeout [{}]", initialShards, listTimeout); } public void applyStartedShards(StartedRerouteAllocation allocation) { for (ShardRouting shardRouting : allocation.startedShards()) { cachedStores.remove(shardRouting.shardId()); cachedShardsState.remove(shardRouting.shardId()); } } public void applyFailedShards(FailedRerouteAllocation allocation) { for (ShardRouting failedShard : allocation.failedShards()) { cachedStores.remove(failedShard.shardId()); cachedShardsState.remove(failedShard.shardId()); } } public boolean allocateUnassigned(RoutingAllocation allocation) { boolean changed = false; DiscoveryNodes nodes = allocation.nodes(); RoutingNodes routingNodes = allocation.routingNodes(); // First, handle primaries: they must find a place to be allocated Iterator<MutableShardRouting> unassignedIterator = routingNodes.unassigned().iterator(); while (unassignedIterator.hasNext()) { MutableShardRouting shard = unassignedIterator.next(); if (!shard.primary()) { continue; } // this is an API allocation, ignore since we know there is no data...
if (!routingNodes .routingTable() .index(shard.index()) .shard(shard.id()) .primaryAllocatedPostApi()) { continue; } ObjectLongOpenHashMap<DiscoveryNode> nodesState = buildShardStates(nodes, shard); int numberOfAllocationsFound = 0; long highestVersion = -1; Set<DiscoveryNode> nodesWithHighestVersion = Sets.newHashSet(); final boolean[] states = nodesState.allocated; final Object[] keys = nodesState.keys; final long[] values = nodesState.values; for (int i = 0; i < states.length; i++) { if (!states[i]) { continue; } DiscoveryNode node = (DiscoveryNode) keys[i]; long version = values[i]; // since we don't check in NO allocation, we need to double check here if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) { continue; } if (version != -1) { numberOfAllocationsFound++; if (highestVersion == -1) { nodesWithHighestVersion.add(node); highestVersion = version; } else { if (version > highestVersion) { nodesWithHighestVersion.clear(); nodesWithHighestVersion.add(node); highestVersion = version; } else if (version == highestVersion) { nodesWithHighestVersion.add(node); } } } } // check if the count meets the minimum required int requiredAllocation = 1; // if we restore from a repository, one copy is more than enough if (shard.restoreSource() == null) { try { IndexMetaData indexMetaData = routingNodes.metaData().index(shard.index()); String initialShards = indexMetaData .settings() .get( INDEX_RECOVERY_INITIAL_SHARDS, settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards)); if ("quorum".equals(initialShards)) { if (indexMetaData.numberOfReplicas() > 1) { requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1; } } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) { if (indexMetaData.numberOfReplicas() > 2) { requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2); } } else if ("one".equals(initialShards)) { requiredAllocation = 1; } else if ("full".equals(initialShards) || "all".equals(initialShards)) { requiredAllocation = indexMetaData.numberOfReplicas() + 1; } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) { if (indexMetaData.numberOfReplicas() > 1) { requiredAllocation = indexMetaData.numberOfReplicas(); } } else { requiredAllocation = Integer.parseInt(initialShards); } } catch (Exception e) { logger.warn( "[{}][{}] failed to derive initial_shards from value {}, ignoring allocation for {}", shard.index(), shard.id(), initialShards, shard); } } // not enough found for this shard, continue...
if (numberOfAllocationsFound < requiredAllocation) { // if we are restoring this shard we still can allocate if (shard.restoreSource() == null) { // we can't really allocate, so ignore it and continue unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]", shard.index(), shard.id(), numberOfAllocationsFound, requiredAllocation); } } else if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: missing local data, will restore from [{}]", shard.index(), shard.id(), shard.restoreSource()); } continue; } Set<DiscoveryNode> throttledNodes = Sets.newHashSet(); Set<DiscoveryNode> noNodes = Sets.newHashSet(); for (DiscoveryNode discoNode : nodesWithHighestVersion) { RoutingNode node = routingNodes.node(discoNode.id()); if (node == null) { continue; } Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.THROTTLE) { throttledNodes.add(discoNode); } else if (decision.type() == Decision.Type.NO) { noNodes.add(discoNode); } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, discoNode); } // we found a match changed = true; // make sure we create one with the version from the recovered state allocation .routingNodes() .assign(new MutableShardRouting(shard, highestVersion), node.nodeId()); unassignedIterator.remove(); // found a node, so no throttling, no "no", and break out of the loop throttledNodes.clear(); noNodes.clear(); break; } } if (throttledNodes.isEmpty()) { // if we have a node that we "can't" allocate to, force allocation, since this is our master // data! 
if (!noNodes.isEmpty()) { DiscoveryNode discoNode = noNodes.iterator().next(); RoutingNode node = routingNodes.node(discoNode.id()); if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, discoNode); } // we found a match changed = true; // make sure we create one with the version from the recovered state allocation .routingNodes() .assign(new MutableShardRouting(shard, highestVersion), node.nodeId()); unassignedIterator.remove(); } } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, throttledNodes); } // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); } } if (!routingNodes.hasUnassigned()) { return changed; } // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was // allocated on unassignedIterator = routingNodes.unassigned().iterator(); while (unassignedIterator.hasNext()) { MutableShardRouting shard = unassignedIterator.next(); // pre-check if it can be allocated to any node that currently exists, so we won't list the // store for it for nothing boolean canBeAllocatedToAtLeastOneNode = false; for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) { RoutingNode node = routingNodes.node(cursor.value.id()); if (node == null) { continue; } // if we can't allocate it on a node, ignore it, for example, this handles // cases for only allocating a replica after a primary Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.YES) { canBeAllocatedToAtLeastOneNode = true; break; } } if (!canBeAllocatedToAtLeastOneNode) { continue; } Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores = buildShardStores(nodes, shard); long lastSizeMatched = 0; DiscoveryNode lastDiscoNodeMatched = null; RoutingNode lastNodeMatched = null; for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> nodeStoreEntry : shardStores.entrySet()) { DiscoveryNode discoNode = nodeStoreEntry.getKey(); TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue(); logger.trace("{}: checking node [{}]", shard, discoNode); if (storeFilesMetaData == null) { // already allocated on that node... continue; } RoutingNode node = routingNodes.node(discoNode.id()); if (node == null) { continue; } // check if we can allocate on that node... // we only check for NO, since if this node is THROTTLING and it has enough "same data" // then we will try and assign it next time Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.NO) { continue; } // if it is already allocated, we can't assign to it... 
if (storeFilesMetaData.allocated()) { continue; } if (!shard.primary()) { MutableShardRouting primaryShard = routingNodes.activePrimary(shard); if (primaryShard != null) { assert primaryShard.active(); DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId()); if (primaryNode != null) { TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore = shardStores.get(primaryNode); if (primaryNodeStore != null && primaryNodeStore.allocated()) { long sizeMatched = 0; for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) { if (primaryNodeStore.fileExists(storeFileMetaData.name()) && primaryNodeStore .file(storeFileMetaData.name()) .isSame(storeFileMetaData)) { sizeMatched += storeFileMetaData.length(); } } logger.trace( "{}: node [{}] has [{}/{}] bytes of re-usable data", shard, discoNode.name(), new ByteSizeValue(sizeMatched), sizeMatched); if (sizeMatched > lastSizeMatched) { lastSizeMatched = sizeMatched; lastDiscoNodeMatched = discoNode; lastNodeMatched = node; } } } } } } if (lastNodeMatched != null) { // we only check on THROTTLE since we checked before on NO Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation); if (decision.type() == Decision.Type.THROTTLE) { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched)); } // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched)); } // we found a match changed = true; allocation.routingNodes().assign(shard, lastNodeMatched.nodeId()); unassignedIterator.remove(); } } } return changed; } private ObjectLongOpenHashMap<DiscoveryNode> buildShardStates( final DiscoveryNodes nodes, MutableShardRouting shard) { ObjectLongOpenHashMap<DiscoveryNode> shardStates = cachedShardsState.get(shard.shardId()); ObjectOpenHashSet<String> nodeIds; if (shardStates == null) { shardStates = new ObjectLongOpenHashMap<>(); cachedShardsState.put(shard.shardId(), shardStates); nodeIds = ObjectOpenHashSet.from(nodes.dataNodes().keys()); } else { // clean nodes that have failed shardStates .keys() .removeAll( new ObjectPredicate<DiscoveryNode>() { @Override public boolean apply(DiscoveryNode node) { return !nodes.nodeExists(node.id()); } }); nodeIds = ObjectOpenHashSet.newInstance(); // we have cached results from before; see if the nodes changed, and if they have, go fetch again for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) { DiscoveryNode node = cursor.value; if (!shardStates.containsKey(node)) { nodeIds.add(node.id()); } } } if (nodeIds.isEmpty()) { return shardStates; } String[] nodesIdsArray = nodeIds.toArray(String.class); TransportNodesListGatewayStartedShards.NodesGatewayStartedShards response = listGatewayStartedShards.list(shard.shardId(), nodesIdsArray, listTimeout).actionGet(); if (logger.isDebugEnabled()) { if (response.failures().length > 0) { StringBuilder sb = new StringBuilder(shard + ": failures when trying to list shards on nodes:"); for (int i = 0; i < response.failures().length; i++) { Throwable cause =
ExceptionsHelper.unwrapCause(response.failures()[i]); if (cause instanceof ConnectTransportException) { continue; } sb.append("\n -> ").append(response.failures()[i].getDetailedMessage()); } logger.debug(sb.toString()); } } for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState : response) { // a version of -1 means the shard does not exist on that node, which is what the API returns and what we expect logger.trace( "[{}] on node [{}] has version [{}] of shard", shard, nodeShardState.getNode(), nodeShardState.version()); shardStates.put(nodeShardState.getNode(), nodeShardState.version()); } return shardStates; } private Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> buildShardStores(DiscoveryNodes nodes, MutableShardRouting shard) { Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores = cachedStores.get(shard.shardId()); ObjectOpenHashSet<String> nodesIds; if (shardStores == null) { shardStores = Maps.newHashMap(); cachedStores.put(shard.shardId(), shardStores); nodesIds = ObjectOpenHashSet.from(nodes.dataNodes().keys()); } else { nodesIds = ObjectOpenHashSet.newInstance(); // clean nodes that have failed for (Iterator<DiscoveryNode> it = shardStores.keySet().iterator(); it.hasNext(); ) { DiscoveryNode node = it.next(); if (!nodes.nodeExists(node.id())) { it.remove(); } } for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) { DiscoveryNode node = cursor.value; if (!shardStores.containsKey(node)) { nodesIds.add(node.id()); } } } if (!nodesIds.isEmpty()) { String[] nodesIdsArray = nodesIds.toArray(String.class); TransportNodesListShardStoreMetaData.NodesStoreFilesMetaData nodesStoreFilesMetaData = listShardStoreMetaData .list(shard.shardId(), false, nodesIdsArray, listTimeout) .actionGet(); if (logger.isTraceEnabled()) { if (nodesStoreFilesMetaData.failures().length > 0) { StringBuilder sb = new StringBuilder(shard + ": failures when trying to list stores on nodes:"); for (int i = 0; i < nodesStoreFilesMetaData.failures().length; i++) { Throwable cause = ExceptionsHelper.unwrapCause(nodesStoreFilesMetaData.failures()[i]); if (cause instanceof ConnectTransportException) { continue; } sb.append("\n -> ") .append(nodesStoreFilesMetaData.failures()[i].getDetailedMessage()); } logger.trace(sb.toString()); } } for (TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData nodeStoreFilesMetaData : nodesStoreFilesMetaData) { if (nodeStoreFilesMetaData.storeFilesMetaData() != null) { shardStores.put( nodeStoreFilesMetaData.getNode(), nodeStoreFilesMetaData.storeFilesMetaData()); } } } return shardStores; } }
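/*
 * The initial_shards values parsed in allocateUnassigned reduce to a small arithmetic table over the
 * replica count; e.g. "quorum" over a primary plus two replicas requires ((1 + 2) / 2) + 1 = 2 copies
 * found on disk. A sketch mirroring that branch ladder as a standalone method (not the real API).
 */
class InitialShardsSketch {
  /** numberOfReplicas excludes the primary; mirrors the guards used in allocateUnassigned. */
  static int requiredAllocation(String initialShards, int numberOfReplicas) {
    switch (initialShards) {
      case "quorum":
        return numberOfReplicas > 1 ? ((1 + numberOfReplicas) / 2) + 1 : 1;
      case "quorum-1":
      case "half":
        return numberOfReplicas > 2 ? (1 + numberOfReplicas) / 2 : 1;
      case "one":
        return 1;
      case "full":
      case "all":
        return numberOfReplicas + 1;
      case "full-1":
      case "all-1":
        return numberOfReplicas > 1 ? numberOfReplicas : 1;
      default:
        return Integer.parseInt(initialShards); // an explicit number of copies
    }
  }

  public static void main(String[] args) {
    System.out.println(requiredAllocation("quorum", 2)); // 2 of 3 copies
    System.out.println(requiredAllocation("all", 2));    // all 3 copies
  }
}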
public class GatewayAllocator extends AbstractComponent { private RoutingService routingService; private final PrimaryShardAllocator primaryShardAllocator; private final ReplicaShardAllocator replicaShardAllocator; private final ConcurrentMap< ShardId, AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>> asyncFetchStarted = ConcurrentCollections.newConcurrentMap(); private final ConcurrentMap< ShardId, AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>> asyncFetchStore = ConcurrentCollections.newConcurrentMap(); @Inject public GatewayAllocator( Settings settings, final TransportNodesListGatewayStartedShards startedAction, final TransportNodesListShardStoreMetaData storeAction) { super(settings); this.primaryShardAllocator = new InternalPrimaryShardAllocator(settings, startedAction); this.replicaShardAllocator = new InternalReplicaShardAllocator(settings, storeAction); } public void setReallocation( final ClusterService clusterService, final RoutingService routingService) { this.routingService = routingService; clusterService.add( new ClusterStateListener() { @Override public void clusterChanged(ClusterChangedEvent event) { boolean cleanCache = false; DiscoveryNode localNode = event.state().nodes().localNode(); if (localNode != null) { if (localNode.masterNode() == true && event.localNodeMaster() == false) { cleanCache = true; } } else { cleanCache = true; } if (cleanCache) { Releasables.close(asyncFetchStarted.values()); asyncFetchStarted.clear(); Releasables.close(asyncFetchStore.values()); asyncFetchStore.clear(); } } }); } public int getNumberOfInFlightFetch() { int count = 0; for (AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch : asyncFetchStarted.values()) { count += fetch.getNumberOfInFlightFetches(); } for (AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch : asyncFetchStore.values()) { count += fetch.getNumberOfInFlightFetches(); } return count; } public void applyStartedShards(StartedRerouteAllocation allocation) { for (ShardRouting shard : allocation.startedShards()) { Releasables.close(asyncFetchStarted.remove(shard.shardId())); Releasables.close(asyncFetchStore.remove(shard.shardId())); } } public void applyFailedShards(FailedRerouteAllocation allocation) { for (FailedRerouteAllocation.FailedShard shard : allocation.failedShards()) { Releasables.close(asyncFetchStarted.remove(shard.shard.shardId())); Releasables.close(asyncFetchStore.remove(shard.shard.shardId())); } } public boolean allocateUnassigned(final RoutingAllocation allocation) { boolean changed = false; RoutingNodes.UnassignedShards unassigned = allocation.routingNodes().unassigned(); unassigned.sort( PriorityComparator.getAllocationComparator(allocation)); // sort for priority ordering changed |= primaryShardAllocator.allocateUnassigned(allocation); changed |= replicaShardAllocator.processExistingRecoveries(allocation); changed |= replicaShardAllocator.allocateUnassigned(allocation); return changed; } class InternalAsyncFetch<T extends BaseNodeResponse> extends AsyncShardFetch<T> { public InternalAsyncFetch( ESLogger logger, String type, ShardId shardId, List<? 
extends BaseNodesResponse<T>, T> action) { super(logger, type, shardId, action); } @Override protected void reroute(ShardId shardId, String reason) { logger.trace("{} scheduling reroute for {}", shardId, reason); routingService.reroute("async_shard_fetch"); } } class InternalPrimaryShardAllocator extends PrimaryShardAllocator { private final TransportNodesListGatewayStartedShards startedAction; public InternalPrimaryShardAllocator( Settings settings, TransportNodesListGatewayStartedShards startedAction) { super(settings); this.startedAction = startedAction; } @Override protected AsyncShardFetch.FetchResult< TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetchData(ShardRouting shard, RoutingAllocation allocation) { AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch = asyncFetchStarted.get(shard.shardId()); if (fetch == null) { fetch = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction); asyncFetchStarted.put(shard.shardId(), fetch); } AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> shardState = fetch.fetchData( allocation.nodes(), allocation.metaData(), allocation.getIgnoreNodes(shard.shardId())); if (shardState.hasData() == true) { shardState.processAllocation(allocation); } return shardState; } } class InternalReplicaShardAllocator extends ReplicaShardAllocator { private final TransportNodesListShardStoreMetaData storeAction; public InternalReplicaShardAllocator( Settings settings, TransportNodesListShardStoreMetaData storeAction) { super(settings); this.storeAction = storeAction; } @Override protected AsyncShardFetch.FetchResult< TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetchData(ShardRouting shard, RoutingAllocation allocation) { AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch = asyncFetchStore.get(shard.shardId()); if (fetch == null) { fetch = new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction); asyncFetchStore.put(shard.shardId(), fetch); } AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> shardStores = fetch.fetchData( allocation.nodes(), allocation.metaData(), allocation.getIgnoreNodes(shard.shardId())); if (shardStores.hasData() == true) { shardStores.processAllocation(allocation); } return shardStores; } } }
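/*
 * The rewritten GatewayAllocator keeps one AsyncShardFetch per ShardId and creates it lazily on
 * first use; with java.util.concurrent that get-or-create step can collapse to a single
 * computeIfAbsent. A JDK-only sketch; FetchState is a hypothetical stand-in for the fetch object.
 */
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

class PerShardFetchCacheSketch {
  /** Hypothetical placeholder for AsyncShardFetch; the real one also carries the node-level action. */
  static final class FetchState {
    final String shardId;
    FetchState(String shardId) { this.shardId = shardId; }
  }

  private final ConcurrentMap<String, FetchState> fetches = new ConcurrentHashMap<>();

  /** Atomic get-or-create, replacing the get()/null-check/put() sequence in fetchData. */
  FetchState fetchFor(String shardId) {
    return fetches.computeIfAbsent(shardId, FetchState::new);
  }

  /** Started or failed shards drop their entry, as applyStartedShards/applyFailedShards do above. */
  void invalidate(String shardId) {
    fetches.remove(shardId);
  }
}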
public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implements Discovery, DiscoveryNodesProvider { private final ThreadPool threadPool; private final TransportService transportService; private final ClusterService clusterService; private AllocationService allocationService; private final ClusterName clusterName; private final DiscoveryNodeService discoveryNodeService; private final ZenPingService pingService; private final MasterFaultDetection masterFD; private final NodesFaultDetection nodesFD; private final PublishClusterStateAction publishClusterState; private final MembershipAction membership; private final Version version; private final TimeValue pingTimeout; // a flag that should be used only for testing private final boolean sendLeaveRequest; private final ElectMasterService electMaster; private final boolean masterElectionFilterClientNodes; private final boolean masterElectionFilterDataNodes; private DiscoveryNode localNode; private final CopyOnWriteArrayList<InitialStateDiscoveryListener> initialStateListeners = new CopyOnWriteArrayList<InitialStateDiscoveryListener>(); private volatile boolean master = false; private volatile DiscoveryNodes latestDiscoNodes; private volatile Thread currentJoinThread; private final AtomicBoolean initialStateSent = new AtomicBoolean(); @Nullable private NodeService nodeService; @Inject public ZenDiscovery( Settings settings, ClusterName clusterName, ThreadPool threadPool, TransportService transportService, ClusterService clusterService, NodeSettingsService nodeSettingsService, DiscoveryNodeService discoveryNodeService, ZenPingService pingService, Version version, DiscoverySettings discoverySettings) { super(settings); this.clusterName = clusterName; this.threadPool = threadPool; this.clusterService = clusterService; this.transportService = transportService; this.discoveryNodeService = discoveryNodeService; this.pingService = pingService; this.version = version; // also support direct discovery.zen settings, for cases when it gets extended this.pingTimeout = settings.getAsTime( "discovery.zen.ping.timeout", settings.getAsTime( "discovery.zen.ping_timeout", componentSettings.getAsTime( "ping_timeout", componentSettings.getAsTime("initial_ping_timeout", timeValueSeconds(3))))); this.sendLeaveRequest = componentSettings.getAsBoolean("send_leave_request", true); this.masterElectionFilterClientNodes = settings.getAsBoolean("discovery.zen.master_election.filter_client", true); this.masterElectionFilterDataNodes = settings.getAsBoolean("discovery.zen.master_election.filter_data", false); logger.debug( "using ping.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]", pingTimeout, masterElectionFilterClientNodes, masterElectionFilterDataNodes); this.electMaster = new ElectMasterService(settings); nodeSettingsService.addListener(new ApplySettings()); this.masterFD = new MasterFaultDetection(settings, threadPool, transportService, this); this.masterFD.addListener(new MasterNodeFailureListener()); this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService); this.nodesFD.addListener(new NodeFailureListener()); this.publishClusterState = new PublishClusterStateAction( settings, transportService, this, new NewClusterStateListener(), discoverySettings); this.pingService.setNodesProvider(this); this.membership = new MembershipAction(settings, transportService, this, new MembershipListener()); transportService.registerHandler( RejoinClusterRequestHandler.ACTION, new RejoinClusterRequestHandler()); 
} @Override public void setNodeService(@Nullable NodeService nodeService) { this.nodeService = nodeService; } @Override public void setAllocationService(AllocationService allocationService) { this.allocationService = allocationService; } @Override protected void doStart() throws ElasticsearchException { Map<String, String> nodeAttributes = discoveryNodeService.buildAttributes(); // note, we rely on the fact that it's a new id each time we start, see FD and "kill -9" handling final String nodeId = DiscoveryService.generateNodeId(settings); localNode = new DiscoveryNode( settings.get("name"), nodeId, transportService.boundAddress().publishAddress(), nodeAttributes, version); latestDiscoNodes = new DiscoveryNodes.Builder().put(localNode).localNodeId(localNode.id()).build(); nodesFD.updateNodes(latestDiscoNodes); pingService.start(); // do the join on a different thread, the DiscoveryService waits for 30s anyhow till it is // discovered asyncJoinCluster(); } @Override protected void doStop() throws ElasticsearchException { pingService.stop(); masterFD.stop("zen disco stop"); nodesFD.stop(); initialStateSent.set(false); if (sendLeaveRequest) { if (!master && latestDiscoNodes.masterNode() != null) { try { membership.sendLeaveRequestBlocking( latestDiscoNodes.masterNode(), localNode, TimeValue.timeValueSeconds(1)); } catch (Exception e) { logger.debug( "failed to send leave request to master [{}]", e, latestDiscoNodes.masterNode()); } } else { DiscoveryNode[] possibleMasters = electMaster.nextPossibleMasters(latestDiscoNodes.nodes().values(), 5); for (DiscoveryNode possibleMaster : possibleMasters) { if (localNode.equals(possibleMaster)) { continue; } try { membership.sendLeaveRequest(latestDiscoNodes.masterNode(), possibleMaster); } catch (Exception e) { logger.debug( "failed to send leave request from master [{}] to possible master [{}]", e, latestDiscoNodes.masterNode(), possibleMaster); } } } } master = false; if (currentJoinThread != null) { try { currentJoinThread.interrupt(); } catch (Exception e) { // ignore } } } @Override protected void doClose() throws ElasticsearchException { masterFD.close(); nodesFD.close(); publishClusterState.close(); membership.close(); pingService.close(); } @Override public DiscoveryNode localNode() { return localNode; } @Override public void addListener(InitialStateDiscoveryListener listener) { this.initialStateListeners.add(listener); } @Override public void removeListener(InitialStateDiscoveryListener listener) { this.initialStateListeners.remove(listener); } @Override public String nodeDescription() { return clusterName.value() + "/" + localNode.id(); } @Override public DiscoveryNodes nodes() { DiscoveryNodes latestNodes = this.latestDiscoNodes; if (latestNodes != null) { return latestNodes; } // have not decided yet, just send the local node return DiscoveryNodes.builder().put(localNode).localNodeId(localNode.id()).build(); } @Override public NodeService nodeService() { return this.nodeService; } @Override public void publish(ClusterState clusterState, AckListener ackListener) { if (!master) { throw new ElasticsearchIllegalStateException("Shouldn't publish state when not master"); } latestDiscoNodes = clusterState.nodes(); nodesFD.updateNodes(clusterState.nodes()); publishClusterState.publish(clusterState, ackListener); } private void asyncJoinCluster() { if (currentJoinThread != null) { // we are already joining, ignore...
logger.trace("a join thread already running"); return; } threadPool .generic() .execute( new Runnable() { @Override public void run() { currentJoinThread = Thread.currentThread(); try { innerJoinCluster(); } finally { currentJoinThread = null; } } }); } private void innerJoinCluster() { boolean retry = true; while (retry) { if (lifecycle.stoppedOrClosed()) { return; } retry = false; DiscoveryNode masterNode = findMaster(); if (masterNode == null) { logger.trace("no masterNode returned"); retry = true; continue; } if (localNode.equals(masterNode)) { this.master = true; nodesFD.start(); // start the nodes FD clusterService.submitStateUpdateTask( "zen-disco-join (elected_as_master)", Priority.URGENT, new ProcessedClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { DiscoveryNodes.Builder builder = new DiscoveryNodes.Builder() .localNodeId(localNode.id()) .masterNodeId(localNode.id()) // put our local node .put(localNode); // update the fact that we are the master... latestDiscoNodes = builder.build(); ClusterBlocks clusterBlocks = ClusterBlocks.builder() .blocks(currentState.blocks()) .removeGlobalBlock(NO_MASTER_BLOCK) .build(); return ClusterState.builder(currentState) .nodes(latestDiscoNodes) .blocks(clusterBlocks) .build(); } @Override public void onFailure(String source, Throwable t) { logger.error("unexpected failure during [{}]", t, source); } @Override public void clusterStateProcessed( String source, ClusterState oldState, ClusterState newState) { sendInitialStateEventIfNeeded(); } }); } else { this.master = false; try { // first, make sure we can connect to the master transportService.connectToNode(masterNode); } catch (Exception e) { logger.warn("failed to connect to master [{}], retrying...", e, masterNode); retry = true; continue; } // send join request try { membership.sendJoinRequestBlocking(masterNode, localNode, pingTimeout); } catch (Exception e) { if (e instanceof ElasticsearchException) { logger.info( "failed to send join request to master [{}], reason [{}]", masterNode, ((ElasticsearchException) e).getDetailedMessage()); } else { logger.info( "failed to send join request to master [{}], reason [{}]", masterNode, e.getMessage()); } if (logger.isTraceEnabled()) { logger.trace("detailed failed reason", e); } // failed to send the join request, retry retry = true; continue; } masterFD.start(masterNode, "initial_join"); // no need to submit the received cluster state, we will get it from the master when it // publishes // the fact that we joined } } } private void handleLeaveRequest(final DiscoveryNode node) { if (lifecycleState() != Lifecycle.State.STARTED) { // not started, ignore a node failure return; } if (master) { clusterService.submitStateUpdateTask( "zen-disco-node_left(" + node + ")", Priority.URGENT, new ClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { DiscoveryNodes.Builder builder = DiscoveryNodes.builder(currentState.nodes()).remove(node.id()); latestDiscoNodes = builder.build(); currentState = ClusterState.builder(currentState).nodes(latestDiscoNodes).build(); // check if we have enough master nodes, if not, we need to move into joining the // cluster again if (!electMaster.hasEnoughMasterNodes(currentState.nodes())) { return rejoin(currentState, "not enough master nodes"); } // eagerly run reroute to remove dead nodes from routing table RoutingAllocation.Result routingResult = allocationService.reroute(ClusterState.builder(currentState).build()); return 
ClusterState.builder(currentState).routingResult(routingResult).build(); } @Override public void onFailure(String source, Throwable t) { logger.error("unexpected failure during [{}]", t, source); } }); } else { handleMasterGone(node, "shut_down"); } } private void handleNodeFailure(final DiscoveryNode node, String reason) { if (lifecycleState() != Lifecycle.State.STARTED) { // not started, ignore a node failure return; } if (!master) { // nothing to do here... return; } clusterService.submitStateUpdateTask( "zen-disco-node_failed(" + node + "), reason " + reason, Priority.URGENT, new ProcessedClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { DiscoveryNodes.Builder builder = DiscoveryNodes.builder(currentState.nodes()).remove(node.id()); latestDiscoNodes = builder.build(); currentState = ClusterState.builder(currentState).nodes(latestDiscoNodes).build(); // check if we have enough master nodes, if not, we need to move into joining the // cluster again if (!electMaster.hasEnoughMasterNodes(currentState.nodes())) { return rejoin(currentState, "not enough master nodes"); } // eagerly run reroute to remove dead nodes from routing table RoutingAllocation.Result routingResult = allocationService.reroute(ClusterState.builder(currentState).build()); return ClusterState.builder(currentState).routingResult(routingResult).build(); } @Override public void onFailure(String source, Throwable t) { logger.error("unexpected failure during [{}]", t, source); } @Override public void clusterStateProcessed( String source, ClusterState oldState, ClusterState newState) { sendInitialStateEventIfNeeded(); } }); } private void handleMinimumMasterNodesChanged(final int minimumMasterNodes) { if (lifecycleState() != Lifecycle.State.STARTED) { // not started, ignore a node failure return; } if (!master) { // nothing to do here... 
return; } clusterService.submitStateUpdateTask( "zen-disco-minimum_master_nodes_changed", Priority.URGENT, new ProcessedClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { final int prevMinimumMasterNode = ZenDiscovery.this.electMaster.minimumMasterNodes(); ZenDiscovery.this.electMaster.minimumMasterNodes(minimumMasterNodes); // check if we have enough master nodes, if not, we need to move into joining the // cluster again if (!electMaster.hasEnoughMasterNodes(currentState.nodes())) { return rejoin( currentState, "not enough master nodes on change of minimum_master_nodes from [" + prevMinimumMasterNode + "] to [" + minimumMasterNodes + "]"); } return currentState; } @Override public void onFailure(String source, Throwable t) { logger.error("unexpected failure during [{}]", t, source); } @Override public void clusterStateProcessed( String source, ClusterState oldState, ClusterState newState) { sendInitialStateEventIfNeeded(); } }); } private void handleMasterGone(final DiscoveryNode masterNode, final String reason) { if (lifecycleState() != Lifecycle.State.STARTED) { // not started, ignore a master failure return; } if (master) { // we might get this on both a master telling us shutting down, and then the disconnect // failure return; } logger.info("master_left [{}], reason [{}]", masterNode, reason); clusterService.submitStateUpdateTask( "zen-disco-master_failed (" + masterNode + ")", Priority.URGENT, new ProcessedClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { if (!masterNode.id().equals(currentState.nodes().masterNodeId())) { // master got switched on us, no need to send anything return currentState; } DiscoveryNodes discoveryNodes = DiscoveryNodes.builder(currentState.nodes()) // make sure the old master node, which has failed, is not part of the nodes we // publish .remove(masterNode.id()) .masterNodeId(null) .build(); if (!electMaster.hasEnoughMasterNodes(discoveryNodes)) { return rejoin( ClusterState.builder(currentState).nodes(discoveryNodes).build(), "not enough master nodes after master left (reason = " + reason + ")"); } final DiscoveryNode electedMaster = electMaster.electMaster(discoveryNodes); // elect master if (localNode.equals(electedMaster)) { master = true; masterFD.stop( "got elected as new master since master left (reason = " + reason + ")"); nodesFD.start(); discoveryNodes = DiscoveryNodes.builder(discoveryNodes).masterNodeId(localNode.id()).build(); latestDiscoNodes = discoveryNodes; return ClusterState.builder(currentState).nodes(latestDiscoNodes).build(); } else { nodesFD.stop(); if (electedMaster != null) { discoveryNodes = DiscoveryNodes.builder(discoveryNodes).masterNodeId(electedMaster.id()).build(); masterFD.restart( electedMaster, "possible elected master since master left (reason = " + reason + ")"); latestDiscoNodes = discoveryNodes; return ClusterState.builder(currentState).nodes(latestDiscoNodes).build(); } else { return rejoin( ClusterState.builder(currentState).nodes(discoveryNodes).build(), "master_left and no other node elected to become master"); } } } @Override public void onFailure(String source, Throwable t) { logger.error("unexpected failure during [{}]", t, source); } @Override public void clusterStateProcessed( String source, ClusterState oldState, ClusterState newState) { sendInitialStateEventIfNeeded(); } }); } static class ProcessClusterState { final ClusterState clusterState; final PublishClusterStateAction.NewClusterStateListener.NewStateProcessed 
newStateProcessed; volatile boolean processed; ProcessClusterState( ClusterState clusterState, PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { this.clusterState = clusterState; this.newStateProcessed = newStateProcessed; } } private final BlockingQueue<ProcessClusterState> processNewClusterStates = ConcurrentCollections.newBlockingQueue(); void handleNewClusterStateFromMaster( ClusterState newClusterState, final PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) { if (master) { final ClusterState newState = newClusterState; clusterService.submitStateUpdateTask( "zen-disco-master_receive_cluster_state_from_another_master [" + newState.nodes().masterNode() + "]", Priority.URGENT, new ProcessedClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { if (newState.version() > currentState.version()) { logger.warn( "received cluster state from [{}] which is also master but with a newer cluster_state, rejoining to cluster...", newState.nodes().masterNode()); return rejoin( currentState, "zen-disco-master_receive_cluster_state_from_another_master [" + newState.nodes().masterNode() + "]"); } else { logger.warn( "received cluster state from [{}] which is also master but with an older cluster_state, telling [{}] to rejoin the cluster", newState.nodes().masterNode(), newState.nodes().masterNode()); transportService.sendRequest( newState.nodes().masterNode(), RejoinClusterRequestHandler.ACTION, new RejoinClusterRequest(currentState.nodes().localNodeId()), new EmptyTransportResponseHandler(ThreadPool.Names.SAME) { @Override public void handleException(TransportException exp) { logger.warn( "failed to send rejoin request to [{}]", exp, newState.nodes().masterNode()); } }); return currentState; } } @Override public void clusterStateProcessed( String source, ClusterState oldState, ClusterState newState) { newStateProcessed.onNewClusterStateProcessed(); } @Override public void onFailure(String source, Throwable t) { logger.error("unexpected failure during [{}]", t, source); newStateProcessed.onNewClusterStateFailed(t); } }); } else { if (newClusterState.nodes().localNode() == null) { logger.warn( "received a cluster state from [{}] but we are not part of the cluster, should not happen", newClusterState.nodes().masterNode()); newStateProcessed.onNewClusterStateFailed( new ElasticsearchIllegalStateException( "received state from a node that is not part of the cluster")); } else { if (currentJoinThread != null) { logger.debug( "got a new state from master node, though we are already trying to rejoin the cluster"); } final ProcessClusterState processClusterState = new ProcessClusterState(newClusterState, newStateProcessed); processNewClusterStates.add(processClusterState); clusterService.submitStateUpdateTask( "zen-disco-receive(from master [" + newClusterState.nodes().masterNode() + "])", Priority.URGENT, new ProcessedClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { // we already processed it in a previous event if (processClusterState.processed) { return currentState; } // TODO: one improvement that we can do is change the message structure to include // version and masterNodeId // at the start, this will allow us to keep the "compressed bytes" around, and only // parse the first page // to figure out if we need to use it or not, and only once we picked the latest // one, parse the whole state // try and get the state with the highest version out of all the
ones with the same // master node id ProcessClusterState stateToProcess = processNewClusterStates.poll(); if (stateToProcess == null) { return currentState; } stateToProcess.processed = true; while (true) { ProcessClusterState potentialState = processNewClusterStates.peek(); // nothing else in the queue, bail if (potentialState == null) { break; } // if its not from the same master, then bail if (!Objects.equal( stateToProcess.clusterState.nodes().masterNodeId(), potentialState.clusterState.nodes().masterNodeId())) { break; } // we are going to use it for sure, poll (remove) it potentialState = processNewClusterStates.poll(); potentialState.processed = true; if (potentialState.clusterState.version() > stateToProcess.clusterState.version()) { // we found a new one stateToProcess = potentialState; } } ClusterState updatedState = stateToProcess.clusterState; // if the new state has a smaller version, and it has the same master node, then no // need to process it if (updatedState.version() < currentState.version() && Objects.equal( updatedState.nodes().masterNodeId(), currentState.nodes().masterNodeId())) { return currentState; } // we don't need to do this, since we ping the master, and get notified when it has // moved from being a master // because it doesn't have enough master nodes... // if (!electMaster.hasEnoughMasterNodes(newState.nodes())) { // return disconnectFromCluster(newState, "not enough master nodes on new cluster // state received from [" + newState.nodes().masterNode() + "]"); // } latestDiscoNodes = updatedState.nodes(); // check to see that we monitor the correct master of the cluster if (masterFD.masterNode() == null || !masterFD.masterNode().equals(latestDiscoNodes.masterNode())) { masterFD.restart( latestDiscoNodes.masterNode(), "new cluster state received and we are monitoring the wrong master [" + masterFD.masterNode() + "]"); } ClusterState.Builder builder = ClusterState.builder(updatedState); // if the routing table did not change, use the original one if (updatedState.routingTable().version() == currentState.routingTable().version()) { builder.routingTable(currentState.routingTable()); } // same for metadata if (updatedState.metaData().version() == currentState.metaData().version()) { builder.metaData(currentState.metaData()); } else { // if its not the same version, only copy over new indices or ones that changed // the version MetaData.Builder metaDataBuilder = MetaData.builder(updatedState.metaData()).removeAllIndices(); for (IndexMetaData indexMetaData : updatedState.metaData()) { IndexMetaData currentIndexMetaData = currentState.metaData().index(indexMetaData.index()); if (currentIndexMetaData == null || currentIndexMetaData.version() != indexMetaData.version()) { metaDataBuilder.put(indexMetaData, false); } else { metaDataBuilder.put(currentIndexMetaData, false); } } builder.metaData(metaDataBuilder); } return builder.build(); } @Override public void onFailure(String source, Throwable t) { logger.error("unexpected failure during [{}]", t, source); newStateProcessed.onNewClusterStateFailed(t); } @Override public void clusterStateProcessed( String source, ClusterState oldState, ClusterState newState) { sendInitialStateEventIfNeeded(); newStateProcessed.onNewClusterStateProcessed(); } }); } } } private ClusterState handleJoinRequest(final DiscoveryNode node) { if (!master) { throw new ElasticsearchIllegalStateException( "Node [" + localNode + "] not master for join request from [" + node + "]"); } ClusterState state = clusterService.state(); if 
(!transportService.addressSupported(node.address().getClass())) { // TODO, what should we do now? Maybe inform that node that its address type is not supported? logger.warn("received a wrong address type from [{}], ignoring...", node); } else { // try and connect to the node, if it fails, we can raise an exception back to the client... transportService.connectToNode(node); state = clusterService.state(); // validate the join request, will throw a failure if it fails, which will get back to the // node calling the join request membership.sendValidateJoinRequestBlocking(node, state, pingTimeout); clusterService.submitStateUpdateTask( "zen-disco-receive(join from node[" + node + "])", Priority.URGENT, new ClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { if (currentState.nodes().nodeExists(node.id())) { // the node already exists in the cluster logger.warn("received a join request for an existing node [{}]", node); // still send a new cluster state, so it will be republished and possibly update // the other node return ClusterState.builder(currentState).build(); } DiscoveryNodes.Builder builder = DiscoveryNodes.builder(currentState.nodes()); for (DiscoveryNode existingNode : currentState.nodes()) { if (node.address().equals(existingNode.address())) { builder.remove(existingNode.id()); logger.warn( "received join request from node [{}], but found existing node {} with same address, removing existing node", node, existingNode); } } latestDiscoNodes = builder.build(); // add the new node now (will update latestDiscoNodes on publish) return ClusterState.builder(currentState) .nodes(latestDiscoNodes.newNode(node)) .build(); } @Override public void onFailure(String source, Throwable t) { logger.error("unexpected failure during [{}]", t, source); } }); } return state; } private DiscoveryNode findMaster() { ZenPing.PingResponse[] fullPingResponses = pingService.pingAndWait(pingTimeout); if (fullPingResponses == null) { logger.trace("No full ping responses"); return null; } if (logger.isTraceEnabled()) { StringBuilder sb = new StringBuilder("full ping responses:"); if (fullPingResponses.length == 0) { sb.append(" {none}"); } else { for (ZenPing.PingResponse pingResponse : fullPingResponses) { sb.append("\n\t--> ") .append("target [") .append(pingResponse.target()) .append("], master [") .append(pingResponse.master()) .append("]"); } } logger.trace(sb.toString()); } // filter responses List<ZenPing.PingResponse> pingResponses = Lists.newArrayList(); for (ZenPing.PingResponse pingResponse : fullPingResponses) { DiscoveryNode node = pingResponse.target(); if (masterElectionFilterClientNodes && (node.clientNode() || (!node.masterNode() && !node.dataNode()))) { // filter out client nodes, and nodes that are neither data nor master (effectively, // clients) } else if (masterElectionFilterDataNodes && (!node.masterNode() && node.dataNode())) { // filter out data nodes that are not also master eligible } else { pingResponses.add(pingResponse); } } if (logger.isDebugEnabled()) { StringBuilder sb = new StringBuilder("filtered ping responses: (filter_client[") .append(masterElectionFilterClientNodes) .append("], filter_data[") .append(masterElectionFilterDataNodes) .append("])"); if (pingResponses.isEmpty()) { sb.append(" {none}"); } else { for (ZenPing.PingResponse pingResponse : pingResponses) { sb.append("\n\t--> ") .append("target [") .append(pingResponse.target()) .append("], master [") .append(pingResponse.master()) .append("]"); } } logger.debug(sb.toString()); }
List<DiscoveryNode> pingMasters = newArrayList(); for (ZenPing.PingResponse pingResponse : pingResponses) { if (pingResponse.master() != null) { pingMasters.add(pingResponse.master()); } } Set<DiscoveryNode> possibleMasterNodes = Sets.newHashSet(); possibleMasterNodes.add(localNode); for (ZenPing.PingResponse pingResponse : pingResponses) { possibleMasterNodes.add(pingResponse.target()); } // if we don't have enough master nodes, we bail; even if a response indicates that // another node sees a master, we ourselves don't see enough master-eligible nodes... if (!electMaster.hasEnoughMasterNodes(possibleMasterNodes)) { return null; } if (pingMasters.isEmpty()) { // let's tie-break between the discovered nodes DiscoveryNode electedMaster = electMaster.electMaster(possibleMasterNodes); if (localNode.equals(electedMaster)) { return localNode; } } else { DiscoveryNode electedMaster = electMaster.electMaster(pingMasters); if (electedMaster != null) { return electedMaster; } } return null; } private ClusterState rejoin(ClusterState clusterState, String reason) { logger.warn(reason + ", current nodes: {}", clusterState.nodes()); nodesFD.stop(); masterFD.stop(reason); master = false; ClusterBlocks clusterBlocks = ClusterBlocks.builder() .blocks(clusterState.blocks()) .addGlobalBlock(NO_MASTER_BLOCK) .addGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK) .build(); // clear the routing table, we have no master, so we need to recreate the routing when we reform // the cluster RoutingTable routingTable = RoutingTable.builder().build(); // we also clean the metadata, since we are going to recover it if we become master MetaData metaData = MetaData.builder().build(); // clean the nodes, we are now not connected to anybody, since we try and reform the cluster latestDiscoNodes = new DiscoveryNodes.Builder().put(localNode).localNodeId(localNode.id()).build(); asyncJoinCluster(); return ClusterState.builder(clusterState) .blocks(clusterBlocks) .nodes(latestDiscoNodes) .routingTable(routingTable) .metaData(metaData) .build(); } private void sendInitialStateEventIfNeeded() { if (initialStateSent.compareAndSet(false, true)) { for (InitialStateDiscoveryListener listener : initialStateListeners) { listener.initialStateProcessed(); } } } private class NewClusterStateListener implements PublishClusterStateAction.NewClusterStateListener { @Override public void onNewClusterState(ClusterState clusterState, NewStateProcessed newStateProcessed) { handleNewClusterStateFromMaster(clusterState, newStateProcessed); } } private class MembershipListener implements MembershipAction.MembershipListener { @Override public ClusterState onJoin(DiscoveryNode node) { return handleJoinRequest(node); } @Override public void onLeave(DiscoveryNode node) { handleLeaveRequest(node); } } private class NodeFailureListener implements NodesFaultDetection.Listener { @Override public void onNodeFailure(DiscoveryNode node, String reason) { handleNodeFailure(node, reason); } } private class MasterNodeFailureListener implements MasterFaultDetection.Listener { @Override public void onMasterFailure(DiscoveryNode masterNode, String reason) { handleMasterGone(masterNode, reason); } @Override public void onDisconnectedFromMaster() { // got disconnected from the master, send a join request DiscoveryNode masterNode = latestDiscoNodes.masterNode(); try { membership.sendJoinRequest(masterNode, localNode); } catch (Exception e) { logger.warn("failed to send join request on disconnection from master [{}]", masterNode); } } } static class RejoinClusterRequest extends
TransportRequest { private String fromNodeId; RejoinClusterRequest(String fromNodeId) { this.fromNodeId = fromNodeId; } RejoinClusterRequest() {} @Override public void readFrom(StreamInput in) throws IOException { super.readFrom(in); fromNodeId = in.readOptionalString(); } @Override public void writeTo(StreamOutput out) throws IOException { super.writeTo(out); out.writeOptionalString(fromNodeId); } } class RejoinClusterRequestHandler extends BaseTransportRequestHandler<RejoinClusterRequest> { static final String ACTION = "discovery/zen/rejoin"; @Override public RejoinClusterRequest newInstance() { return new RejoinClusterRequest(); } @Override public void messageReceived(final RejoinClusterRequest request, final TransportChannel channel) throws Exception { clusterService.submitStateUpdateTask( "received a request to rejoin the cluster from [" + request.fromNodeId + "]", Priority.URGENT, new ClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { try { channel.sendResponse(TransportResponse.Empty.INSTANCE); } catch (Exception e) { logger.warn("failed to send response on rejoin cluster request handling", e); } return rejoin( currentState, "received a request to rejoin the cluster from [" + request.fromNodeId + "]"); } @Override public void onFailure(String source, Throwable t) { logger.error("unexpected failure during [{}]", t, source); } }); } @Override public String executor() { return ThreadPool.Names.SAME; } } class ApplySettings implements NodeSettingsService.Listener { @Override public void onRefreshSettings(Settings settings) { int minimumMasterNodes = settings.getAsInt( "discovery.zen.minimum_master_nodes", ZenDiscovery.this.electMaster.minimumMasterNodes()); if (minimumMasterNodes != ZenDiscovery.this.electMaster.minimumMasterNodes()) { logger.info( "updating discovery.zen.minimum_master_nodes from [{}] to [{}]", ZenDiscovery.this.electMaster.minimumMasterNodes(), minimumMasterNodes); handleMinimumMasterNodesChanged(minimumMasterNodes); } } } }
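ZenDiscovery above refuses to elect or retain a master whenever electMaster.hasEnoughMasterNodes(...) fails, and ApplySettings re-reads discovery.zen.minimum_master_nodes at runtime. The following standalone sketch (hypothetical class and method names, not part of the Elasticsearch API) illustrates why that setting is conventionally a strict majority of the master-eligible nodes: any smaller value lets two disjoint partitions pass the check simultaneously and elect independent masters (split-brain).

public class MinimumMasterNodesSketch {
  // strict majority of master-eligible nodes
  static int quorum(int masterEligibleNodes) {
    return masterEligibleNodes / 2 + 1;
  }

  // mirrors the spirit of ElectMasterService#hasEnoughMasterNodes
  static boolean hasEnoughMasterNodes(int visibleMasterEligibleNodes, int minimumMasterNodes) {
    return visibleMasterEligibleNodes >= minimumMasterNodes;
  }

  public static void main(String[] args) {
    int masterEligible = 5;
    int minimum = quorum(masterEligible); // 3
    // a 3/2 network partition: only the majority side may proceed with an election
    System.out.println(hasEnoughMasterNodes(3, minimum)); // true
    System.out.println(hasEnoughMasterNodes(2, minimum)); // false
    // with minimum_master_nodes left at 1, both sides would elect a master (split-brain)
    System.out.println(hasEnoughMasterNodes(3, 1) && hasEnoughMasterNodes(2, 1)); // true
  }
}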
public class GatewayAllocator extends AbstractComponent { public static final String INDEX_RECOVERY_INITIAL_SHARDS = "index.recovery.initial_shards"; private final String initialShards; private final TransportNodesListGatewayStartedShards startedAction; private final TransportNodesListShardStoreMetaData storeAction; private RoutingService routingService; private final ConcurrentMap< ShardId, AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>> asyncFetchStarted = ConcurrentCollections.newConcurrentMap(); private final ConcurrentMap< ShardId, AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>> asyncFetchStore = ConcurrentCollections.newConcurrentMap(); @Inject public GatewayAllocator( Settings settings, TransportNodesListGatewayStartedShards startedAction, TransportNodesListShardStoreMetaData storeAction) { super(settings); this.startedAction = startedAction; this.storeAction = storeAction; this.initialShards = settings.get( "gateway.initial_shards", settings.get("gateway.local.initial_shards", "quorum")); logger.debug("using initial_shards [{}]", initialShards); } public void setReallocation( final ClusterService clusterService, final RoutingService routingService) { this.routingService = routingService; clusterService.add( new ClusterStateListener() { @Override public void clusterChanged(ClusterChangedEvent event) { boolean cleanCache = false; DiscoveryNode localNode = event.state().nodes().localNode(); if (localNode != null) { if (localNode.masterNode() == true && event.localNodeMaster() == false) { cleanCache = true; } } else { cleanCache = true; } if (cleanCache) { Releasables.close(asyncFetchStarted.values()); asyncFetchStarted.clear(); Releasables.close(asyncFetchStore.values()); asyncFetchStore.clear(); } } }); } public int getNumberOfInFlightFetch() { int count = 0; for (AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch : asyncFetchStarted.values()) { count += fetch.getNumberOfInFlightFetches(); } for (AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch : asyncFetchStore.values()) { count += fetch.getNumberOfInFlightFetches(); } return count; } public void applyStartedShards(StartedRerouteAllocation allocation) { for (ShardRouting shard : allocation.startedShards()) { Releasables.close(asyncFetchStarted.remove(shard.shardId())); Releasables.close(asyncFetchStore.remove(shard.shardId())); } } public void applyFailedShards(FailedRerouteAllocation allocation) { for (FailedRerouteAllocation.FailedShard shard : allocation.failedShards()) { Releasables.close(asyncFetchStarted.remove(shard.shard.shardId())); Releasables.close(asyncFetchStore.remove(shard.shard.shardId())); } } /** Return {@code true} if the index is configured to allow shards to be recovered on any node */ private boolean recoverOnAnyNode(@IndexSettings Settings idxSettings) { return IndexMetaData.isOnSharedFilesystem(idxSettings) && idxSettings.getAsBoolean( IndexMetaData.SETTING_SHARED_FS_ALLOW_RECOVERY_ON_ANY_NODE, false); } public boolean allocateUnassigned(RoutingAllocation allocation) { boolean changed = false; DiscoveryNodes nodes = allocation.nodes(); RoutingNodes routingNodes = allocation.routingNodes(); // First, handle primaries, they must find a place to be allocated on here final MetaData metaData = routingNodes.metaData(); RoutingNodes.UnassignedShards unassigned = routingNodes.unassigned(); unassigned.sort( new PriorityComparator() { @Override protected Settings getIndexSettings(String 
index) { IndexMetaData indexMetaData = metaData.index(index); return indexMetaData.getSettings(); } }); // sort for priority ordering Iterator<ShardRouting> unassignedIterator = unassigned.iterator(); while (unassignedIterator.hasNext()) { ShardRouting shard = unassignedIterator.next(); if (!shard.primary()) { continue; } // this is an API allocation, ignore since we know there is no data... if (!routingNodes .routingTable() .index(shard.index()) .shard(shard.id()) .primaryAllocatedPostApi()) { continue; } AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch = asyncFetchStarted.get(shard.shardId()); if (fetch == null) { fetch = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction); asyncFetchStarted.put(shard.shardId(), fetch); } AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> shardState = fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId())); if (shardState.hasData() == false) { logger.trace("{}: ignoring allocation, still fetching shard started state", shard); unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); continue; } shardState.processAllocation(allocation); IndexMetaData indexMetaData = metaData.index(shard.getIndex()); /** * Build a map of DiscoveryNodes to shard state number for the given shard. A state of -1 * means the shard does not exist on the node, where any shard state >= 0 is the state version * of the shard on that node's disk. * * <p>A shard on shared storage will return at least shard state 0 for all nodes, indicating * that the shard can be allocated to any node. */ ObjectLongHashMap<DiscoveryNode> nodesState = new ObjectLongHashMap<>(); for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState : shardState.getData().values()) { long version = nodeShardState.version(); // a version of -1 means the shard does not exist on the node, which is what the API returns // and what we expect logger.trace( "[{}] on node [{}] has version [{}] of shard", shard, nodeShardState.getNode(), version); nodesState.put(nodeShardState.getNode(), version); } int numberOfAllocationsFound = 0; long highestVersion = -1; final Map<DiscoveryNode, Long> nodesWithVersion = Maps.newHashMap(); assert !nodesState.containsKey(null); final Object[] keys = nodesState.keys; final long[] values = nodesState.values; Settings idxSettings = indexMetaData.settings(); for (int i = 0; i < keys.length; i++) { if (keys[i] == null) { continue; } DiscoveryNode node = (DiscoveryNode) keys[i]; long version = values[i]; // since we don't check in NO allocation, we need to double check here if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) { continue; } if (recoverOnAnyNode(idxSettings)) { numberOfAllocationsFound++; if (version > highestVersion) { highestVersion = version; } // We always put the node without clearing the map nodesWithVersion.put(node, version); } else if (version != -1) { numberOfAllocationsFound++; // If we've found a new "best" candidate, clear the // current candidates and add it if (version > highestVersion) { highestVersion = version; nodesWithVersion.clear(); nodesWithVersion.put(node, version); } else if (version == highestVersion) { // If the candidate is the same, add it to the // list, but keep the current candidate nodesWithVersion.put(node, version); } } } // Now that we have a map of nodes to versions along with the // number of allocations found (and not ignored), we need to sort // it so the node with the highest
version is at the beginning List<DiscoveryNode> nodesWithHighestVersion = Lists.newArrayList(); nodesWithHighestVersion.addAll(nodesWithVersion.keySet()); CollectionUtil.timSort( nodesWithHighestVersion, new Comparator<DiscoveryNode>() { @Override public int compare(DiscoveryNode o1, DiscoveryNode o2) { return Long.compare(nodesWithVersion.get(o2), nodesWithVersion.get(o1)); } }); if (logger.isDebugEnabled()) { logger.debug( "[{}][{}] found {} allocations of {}, highest version: [{}]", shard.index(), shard.id(), numberOfAllocationsFound, shard, highestVersion); } if (logger.isTraceEnabled()) { StringBuilder sb = new StringBuilder("["); for (DiscoveryNode n : nodesWithHighestVersion) { sb.append("["); sb.append(n.getName()); sb.append("]"); sb.append(" -> "); sb.append(nodesWithVersion.get(n)); sb.append(", "); } sb.append("]"); logger.trace("{} candidates for allocation: {}", shard, sb.toString()); } // check if the count meets the minimum set int requiredAllocation = 1; // if we restore from a repository one copy is more than enough if (shard.restoreSource() == null) { try { String initialShards = indexMetaData .settings() .get( INDEX_RECOVERY_INITIAL_SHARDS, settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards)); if ("quorum".equals(initialShards)) { if (indexMetaData.numberOfReplicas() > 1) { requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1; } } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) { if (indexMetaData.numberOfReplicas() > 2) { requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2); } } else if ("one".equals(initialShards)) { requiredAllocation = 1; } else if ("full".equals(initialShards) || "all".equals(initialShards)) { requiredAllocation = indexMetaData.numberOfReplicas() + 1; } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) { if (indexMetaData.numberOfReplicas() > 1) { requiredAllocation = indexMetaData.numberOfReplicas(); } } else { requiredAllocation = Integer.parseInt(initialShards); } } catch (Exception e) { logger.warn( "[{}][{}] failed to derive initial_shards from value {}, ignoring allocation for {}", shard.index(), shard.id(), initialShards, shard); } } // not enough found for this shard, continue...
if (numberOfAllocationsFound < requiredAllocation) { // if we are restoring this shard we still can allocate if (shard.restoreSource() == null) { // we can't really allocate, so ignore it and continue unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]", shard.index(), shard.id(), numberOfAllocationsFound, requiredAllocation); } } else if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: missing local data, will restore from [{}]", shard.index(), shard.id(), shard.restoreSource()); } continue; } Set<DiscoveryNode> throttledNodes = Sets.newHashSet(); Set<DiscoveryNode> noNodes = Sets.newHashSet(); for (DiscoveryNode discoNode : nodesWithHighestVersion) { RoutingNode node = routingNodes.node(discoNode.id()); if (node == null) { continue; } Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.THROTTLE) { throttledNodes.add(discoNode); } else if (decision.type() == Decision.Type.NO) { noNodes.add(discoNode); } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, discoNode); } // we found a match changed = true; // make sure we create one with the version from the recovered state routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId()); unassignedIterator.remove(); // found a node, so no throttling, no "no", and break out of the loop throttledNodes.clear(); noNodes.clear(); break; } } if (throttledNodes.isEmpty()) { // if we have a node that we "can't" allocate to, force allocation, since this is our master // data! if (!noNodes.isEmpty()) { DiscoveryNode discoNode = noNodes.iterator().next(); RoutingNode node = routingNodes.node(discoNode.id()); if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, discoNode); } // we found a match changed = true; // make sure we create one with the version from the recovered state routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId()); unassignedIterator.remove(); } } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, throttledNodes); } // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); } } if (!routingNodes.hasUnassigned()) { return changed; } // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was // allocated on unassignedIterator = unassigned.iterator(); while (unassignedIterator.hasNext()) { ShardRouting shard = unassignedIterator.next(); if (shard.primary()) { continue; } // pre-check if it can be allocated to any node that currently exists, so we won't list the // store for it for nothing boolean canBeAllocatedToAtLeastOneNode = false; for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) { RoutingNode node = routingNodes.node(cursor.value.id()); if (node == null) { continue; } // if we can't allocate it on a node, ignore it, for example, this handles // cases for only allocating a replica after a primary Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.YES) { 
canBeAllocatedToAtLeastOneNode = true; break; } } if (!canBeAllocatedToAtLeastOneNode) { logger.trace("{}: ignoring allocation, can't be allocated on any node", shard); unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); continue; } AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch = asyncFetchStore.get(shard.shardId()); if (fetch == null) { fetch = new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction); asyncFetchStore.put(shard.shardId(), fetch); } AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> shardStores = fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId())); if (shardStores.hasData() == false) { logger.trace("{}: ignoring allocation, still fetching shard stores", shard); unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); continue; // still fetching } shardStores.processAllocation(allocation); long lastSizeMatched = 0; DiscoveryNode lastDiscoNodeMatched = null; RoutingNode lastNodeMatched = null; boolean hasReplicaData = false; IndexMetaData indexMetaData = metaData.index(shard.getIndex()); for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> nodeStoreEntry : shardStores.getData().entrySet()) { DiscoveryNode discoNode = nodeStoreEntry.getKey(); TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue().storeFilesMetaData(); logger.trace("{}: checking node [{}]", shard, discoNode); if (storeFilesMetaData == null) { // already allocated on that node... continue; } RoutingNode node = routingNodes.node(discoNode.id()); if (node == null) { continue; } // check if we can allocate on that node... // we only check for NO, since if this node is THROTTLING and it has enough "same data" // then we will try and assign it next time Decision decision = allocation.deciders().canAllocate(shard, node, allocation); if (decision.type() == Decision.Type.NO) { continue; } // if it is already allocated, we can't assign to it... 
if (storeFilesMetaData.allocated()) { continue; } if (!shard.primary()) { hasReplicaData |= storeFilesMetaData.iterator().hasNext(); ShardRouting primaryShard = routingNodes.activePrimary(shard); if (primaryShard != null) { assert primaryShard.active(); DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId()); if (primaryNode != null) { TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore = shardStores.getData().get(primaryNode); if (primaryNodeFilesStore != null) { TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore = primaryNodeFilesStore.storeFilesMetaData(); if (primaryNodeStore != null && primaryNodeStore.allocated()) { long sizeMatched = 0; String primarySyncId = primaryNodeStore.syncId(); String replicaSyncId = storeFilesMetaData.syncId(); // see if we have a sync id we can make use of if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) { logger.trace( "{}: node [{}] has same sync id {} as primary", shard, discoNode.name(), replicaSyncId); lastNodeMatched = node; lastSizeMatched = Long.MAX_VALUE; lastDiscoNodeMatched = discoNode; } else { for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) { String metaDataFileName = storeFileMetaData.name(); if (primaryNodeStore.fileExists(metaDataFileName) && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) { sizeMatched += storeFileMetaData.length(); } } logger.trace( "{}: node [{}] has [{}/{}] bytes of re-usable data", shard, discoNode.name(), new ByteSizeValue(sizeMatched), sizeMatched); if (sizeMatched > lastSizeMatched) { lastSizeMatched = sizeMatched; lastDiscoNodeMatched = discoNode; lastNodeMatched = node; } } } } } } } } if (lastNodeMatched != null) { // we only check on THROTTLE since we already checked on NO before Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation); if (decision.type() == Decision.Type.THROTTLE) { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched)); } // we are throttling this, but we have enough to allocate to this node, ignore it for now unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); } else { if (logger.isDebugEnabled()) { logger.debug( "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched)); } // we found a match changed = true; routingNodes.initialize(shard, lastNodeMatched.nodeId()); unassignedIterator.remove(); } } else if (hasReplicaData == false) { // if we didn't manage to find *any* data (regardless of matching sizes), check if the // allocation // of the replica shard needs to be delayed, and if so, add it to the ignore unassigned list // note: we only care about replica in delayed allocation, since if we have an unassigned // primary it // will anyhow wait to find an existing copy of the shard to be allocated // note: the other side of the equation is scheduling a reroute in a timely manner, which // happens in the RoutingService long delay = shard .unassignedInfo() .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings()); if (delay > 0) { logger.debug( "[{}][{}]: delaying allocation of [{}] for [{}]", shard.index(), shard.id(), shard, TimeValue.timeValueMillis(delay)); /** * mark it
as changed, since we want to kick off a publish to schedule a future allocation, * see {@link * org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}. */ changed = true; unassignedIterator.remove(); routingNodes.ignoredUnassigned().add(shard); } } } return changed; } class InternalAsyncFetch<T extends BaseNodeResponse> extends AsyncShardFetch<T> { public InternalAsyncFetch( ESLogger logger, String type, ShardId shardId, List<? extends BaseNodesResponse<T>, T> action) { super(logger, type, shardId, action); } @Override protected void reroute(ShardId shardId, String reason) { logger.trace("{} scheduling reroute for {}", shardId, reason); routingService.reroute("async_shard_fetch"); } } }
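The requiredAllocation logic in allocateUnassigned above maps the index.recovery.initial_shards setting onto a minimum number of on-disk shard copies that must be found before a primary is allocated. A minimal standalone sketch of that mapping (hypothetical class name; the arithmetic mirrors the branches above, with numberOfReplicas + 1 total copies):

public class InitialShardsSketch {
  static int requiredAllocation(String initialShards, int numberOfReplicas) {
    int copies = numberOfReplicas + 1; // primary plus replicas
    switch (initialShards) {
      case "quorum":
        return numberOfReplicas > 1 ? copies / 2 + 1 : 1;
      case "quorum-1":
      case "half":
        return numberOfReplicas > 2 ? copies / 2 : 1;
      case "one":
        return 1;
      case "full":
      case "all":
        return copies;
      case "full-1":
      case "all-1":
        return numberOfReplicas > 1 ? numberOfReplicas : 1;
      default:
        return Integer.parseInt(initialShards); // an explicit number of copies
    }
  }

  public static void main(String[] args) {
    System.out.println(requiredAllocation("quorum", 2)); // 2 of 3 copies must be found
    System.out.println(requiredAllocation("quorum", 1)); // 1 (quorum only kicks in above 1 replica)
    System.out.println(requiredAllocation("full", 2));   // all 3 copies must be found
  }
}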
/** * The dangling indices state is responsible for finding new dangling indices (indices that have * their state written on disk, but don't exist in the metadata of the cluster), and importing them * into the cluster. */ public class DanglingIndicesState extends AbstractComponent { private final NodeEnvironment nodeEnv; private final MetaStateService metaStateService; private final LocalAllocateDangledIndices allocateDangledIndices; private final Map<String, IndexMetaData> danglingIndices = ConcurrentCollections.newConcurrentMap(); @Inject public DanglingIndicesState( Settings settings, NodeEnvironment nodeEnv, MetaStateService metaStateService, LocalAllocateDangledIndices allocateDangledIndices) { super(settings); this.nodeEnv = nodeEnv; this.metaStateService = metaStateService; this.allocateDangledIndices = allocateDangledIndices; } /** * Process dangling indices based on the provided meta data, handling cleanup, finding new * dangling indices, and allocating outstanding ones. */ public void processDanglingIndices(MetaData metaData) { if (nodeEnv.hasNodeFile() == false) { return; } cleanupAllocatedDangledIndices(metaData); findNewAndAddDanglingIndices(metaData); allocateDanglingIndices(); } /** The current set of dangling indices. */ Map<String, IndexMetaData> getDanglingIndices() { return ImmutableMap.copyOf(danglingIndices); } /** Cleans dangling indices if they are already allocated on the provided meta data. */ void cleanupAllocatedDangledIndices(MetaData metaData) { for (String danglingIndex : danglingIndices.keySet()) { if (metaData.hasIndex(danglingIndex)) { logger.debug( "[{}] no longer dangling (created), removing from dangling list", danglingIndex); danglingIndices.remove(danglingIndex); } } } /** * Finds ({@link #findNewDanglingIndices}) and adds the new dangling indices to the * currently tracked dangling indices. */ void findNewAndAddDanglingIndices(MetaData metaData) { danglingIndices.putAll(findNewDanglingIndices(metaData)); } /** * Finds new dangling indices by iterating over the indices and trying to find indices that have * state on disk, but are not part of the provided meta data, and are not already detected as * dangling. */ Map<String, IndexMetaData> findNewDanglingIndices(MetaData metaData) { final Set<String> indices; try { indices = nodeEnv.findAllIndices(); } catch (Throwable e) { logger.warn("failed to list dangling indices", e); return ImmutableMap.of(); } Map<String, IndexMetaData> newIndices = Maps.newHashMap(); for (String indexName : indices) { if (metaData.hasIndex(indexName) == false && danglingIndices.containsKey(indexName) == false) { try { IndexMetaData indexMetaData = metaStateService.loadIndexState(indexName); if (indexMetaData != null) { logger.info( "[{}] dangling index, exists on local file system, but not in cluster metadata, auto import to cluster state", indexName); if (!indexMetaData.index().equals(indexName)) { logger.info( "dangled index directory name is [{}], state name is [{}], renaming to directory name", indexName, indexMetaData.index()); indexMetaData = IndexMetaData.builder(indexMetaData).index(indexName).build(); } newIndices.put(indexName, indexMetaData); } else { logger.debug("[{}] dangling index directory detected, but no state found", indexName); } } catch (Throwable t) { logger.warn("[{}] failed to load index state for detected dangled index", t, indexName); } } } return newIndices; } /** * Allocates the provided list of the dangled indices by sending them to the master node for * allocation.
*/ private void allocateDanglingIndices() { if (danglingIndices.isEmpty() == true) { return; } try { allocateDangledIndices.allocateDangled( ImmutableList.copyOf(danglingIndices.values()), new LocalAllocateDangledIndices.Listener() { @Override public void onResponse(LocalAllocateDangledIndices.AllocateDangledResponse response) { logger.trace("allocated dangled"); } @Override public void onFailure(Throwable e) { logger.info("failed to send allocated dangled", e); } }); } catch (Throwable e) { logger.warn("failed to send allocate dangled", e); } } }
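At its core, findNewDanglingIndices computes a set difference: indices whose state exists on disk, minus those present in the cluster metadata, minus those already tracked. A toy illustration with plain collections standing in for NodeEnvironment and MetaData (all names and values hypothetical):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class DanglingIndicesSketch {
  public static void main(String[] args) {
    Set<String> onDisk = new HashSet<>(Arrays.asList("logs-2015", "users", "orders"));
    Set<String> inClusterMetaData = new HashSet<>(Arrays.asList("users"));
    Set<String> alreadyTracked = new HashSet<>(Arrays.asList("orders"));

    Set<String> newDangling = new HashSet<>(onDisk);
    newDangling.removeAll(inClusterMetaData); // not part of the provided meta data
    newDangling.removeAll(alreadyTracked);    // not detected as dangling already

    System.out.println(newDangling); // [logs-2015]
  }
}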
/** * Creates a new BlockingClusterStatePublishResponseHandler * * @param publishingToNodes the set of nodes to which the cluster state will be published and * should respond */ public BlockingClusterStatePublishResponseHandler(Set<DiscoveryNode> publishingToNodes) { this.pendingNodes = ConcurrentCollections.newConcurrentSet(); this.pendingNodes.addAll(publishingToNodes); this.latch = new CountDownLatch(pendingNodes.size()); }
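The constructor above sizes a CountDownLatch to the pending node set so a publisher can block until every node has acknowledged the new cluster state (or a timeout fires). A self-contained sketch of the same pattern using only JDK types (class and method names hypothetical):

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

public class PublishAckSketch {
  private final Set<String> pendingNodes = ConcurrentHashMap.newKeySet();
  private final CountDownLatch latch;

  PublishAckSketch(Set<String> publishingToNodes) {
    pendingNodes.addAll(publishingToNodes);
    latch = new CountDownLatch(pendingNodes.size());
  }

  void onNodeResponse(String node) {
    if (pendingNodes.remove(node)) { // count each node down at most once
      latch.countDown();
    }
  }

  boolean awaitAllNodes(long timeoutMillis) throws InterruptedException {
    return latch.await(timeoutMillis, TimeUnit.MILLISECONDS);
  }

  public static void main(String[] args) throws InterruptedException {
    PublishAckSketch handler = new PublishAckSketch(Set.of("node1", "node2"));
    handler.onNodeResponse("node1");
    handler.onNodeResponse("node2");
    System.out.println(handler.awaitAllNodes(100)); // true: all nodes acked
  }
}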
public class ShardStateAction extends AbstractComponent { private final TransportService transportService; private final ClusterService clusterService; private final AllocationService allocationService; private final ThreadPool threadPool; private final BlockingQueue<ShardRouting> startedShardsQueue = ConcurrentCollections.newBlockingQueue(); @Inject public ShardStateAction( Settings settings, ClusterService clusterService, TransportService transportService, AllocationService allocationService, ThreadPool threadPool) { super(settings); this.clusterService = clusterService; this.transportService = transportService; this.allocationService = allocationService; this.threadPool = threadPool; transportService.registerHandler( ShardStartedTransportHandler.ACTION, new ShardStartedTransportHandler()); transportService.registerHandler( ShardFailedTransportHandler.ACTION, new ShardFailedTransportHandler()); } public void shardFailed(final ShardRouting shardRouting, final String reason) throws ElasticSearchException { logger.warn("sending failed shard for {}, reason [{}]", shardRouting, reason); DiscoveryNodes nodes = clusterService.state().nodes(); if (nodes.localNodeMaster()) { innerShardFailed(shardRouting, reason); } else { transportService.sendRequest( clusterService.state().nodes().masterNode(), ShardFailedTransportHandler.ACTION, new ShardRoutingEntry(shardRouting, reason), new VoidTransportResponseHandler(ThreadPool.Names.SAME) { @Override public void handleException(TransportException exp) { logger.warn( "failed to send failed shard to [{}]", exp, clusterService.state().nodes().masterNode()); } }); } } public void shardStarted(final ShardRouting shardRouting, final String reason) throws ElasticSearchException { if (logger.isDebugEnabled()) { logger.debug("sending shard started for {}, reason [{}]", shardRouting, reason); } DiscoveryNodes nodes = clusterService.state().nodes(); if (nodes.localNodeMaster()) { innerShardStarted(shardRouting, reason); } else { transportService.sendRequest( clusterService.state().nodes().masterNode(), ShardStartedTransportHandler.ACTION, new ShardRoutingEntry(shardRouting, reason), new VoidTransportResponseHandler(ThreadPool.Names.SAME) { @Override public void handleException(TransportException exp) { logger.warn( "failed to send shard started to [{}]", exp, clusterService.state().nodes().masterNode()); } }); } } private void innerShardFailed(final ShardRouting shardRouting, final String reason) { logger.warn("received shard failed for {}, reason [{}]", shardRouting, reason); clusterService.submitStateUpdateTask( "shard-failed (" + shardRouting + "), reason [" + reason + "]", new ClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { if (logger.isDebugEnabled()) { logger.debug("Received failed shard {}, reason [{}]", shardRouting, reason); } RoutingAllocation.Result routingResult = allocationService.applyFailedShard(currentState, shardRouting); if (!routingResult.changed()) { return currentState; } if (logger.isDebugEnabled()) { logger.debug("Applying failed shard {}, reason [{}]", shardRouting, reason); } return newClusterStateBuilder() .state(currentState) .routingResult(routingResult) .build(); } }); } private void innerShardStarted(final ShardRouting shardRouting, final String reason) { if (logger.isDebugEnabled()) { logger.debug("received shard started for {}, reason [{}]", shardRouting, reason); } // buffer shard started requests, and the state update tasks will simply drain it // this is to optimize the number of 
"started" events we generate, and batch them // possibly, we can do time based batching as well, but usually, we would want to // process started events as fast as possible, to make shards available startedShardsQueue.add(shardRouting); clusterService.submitStateUpdateTask( "shard-started (" + shardRouting + "), reason [" + reason + "]", new ClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { List<ShardRouting> shards = new ArrayList<ShardRouting>(); startedShardsQueue.drainTo(shards); // nothing to process (a previous event has process it already) if (shards.isEmpty()) { return currentState; } RoutingTable routingTable = currentState.routingTable(); for (int i = 0; i < shards.size(); i++) { ShardRouting shardRouting = shards.get(i); IndexRoutingTable indexRoutingTable = routingTable.index(shardRouting.index()); // if there is no routing table, the index has been deleted while it was being // allocated // which is fine, we should just ignore this if (indexRoutingTable == null) { shards.remove(i); } else { // find the one that maps to us, if its already started, no need to do anything... // the shard might already be started since the nodes that is starting the shards // might get cluster events // with the shard still initializing, and it will try and start it again (until the // verification comes) IndexShardRoutingTable indexShardRoutingTable = indexRoutingTable.shard(shardRouting.id()); for (ShardRouting entry : indexShardRoutingTable) { if (shardRouting.currentNodeId().equals(entry.currentNodeId())) { // we found the same shard that exists on the same node id if (entry.started()) { // already started, do nothing here... shards.remove(i); } } } } } if (shards.isEmpty()) { return currentState; } if (logger.isDebugEnabled()) { logger.debug("applying started shards {}, reason [{}]", shards, reason); } RoutingAllocation.Result routingResult = allocationService.applyStartedShards(currentState, shards); if (!routingResult.changed()) { return currentState; } return newClusterStateBuilder() .state(currentState) .routingResult(routingResult) .build(); } }); } private class ShardFailedTransportHandler extends BaseTransportRequestHandler<ShardRoutingEntry> { static final String ACTION = "cluster/shardFailure"; @Override public ShardRoutingEntry newInstance() { return new ShardRoutingEntry(); } @Override public void messageReceived(ShardRoutingEntry request, TransportChannel channel) throws Exception { innerShardFailed(request.shardRouting, request.reason); channel.sendResponse(VoidStreamable.INSTANCE); } @Override public String executor() { return ThreadPool.Names.SAME; } } class ShardStartedTransportHandler extends BaseTransportRequestHandler<ShardRoutingEntry> { static final String ACTION = "cluster/shardStarted"; @Override public ShardRoutingEntry newInstance() { return new ShardRoutingEntry(); } @Override public void messageReceived(ShardRoutingEntry request, TransportChannel channel) throws Exception { innerShardStarted(request.shardRouting, request.reason); channel.sendResponse(VoidStreamable.INSTANCE); } @Override public String executor() { return ThreadPool.Names.SAME; } } static class ShardRoutingEntry extends TransportRequest { private ShardRouting shardRouting; private String reason; private ShardRoutingEntry() {} private ShardRoutingEntry(ShardRouting shardRouting, String reason) { this.shardRouting = shardRouting; this.reason = reason; } @Override public void readFrom(StreamInput in) throws IOException { super.readFrom(in); shardRouting = 
readShardRoutingEntry(in); reason = in.readString(); } @Override public void writeTo(StreamOutput out) throws IOException { super.writeTo(out); shardRouting.writeTo(out); out.writeString(reason); } } }
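The started-shard path above leans on a drain-and-batch idiom: producers enqueue events on a BlockingQueue, and whichever cluster-state update task runs first drains the entire queue and applies the whole batch, so any sibling tasks queued behind it find the queue empty and return the state unchanged. The following standalone sketch (hypothetical names, JDK types only, not part of the class above) illustrates that behavior in isolation.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class DrainToBatchingSketch {

    private final BlockingQueue<String> queue = new LinkedBlockingQueue<>();

    /** Producer side: enqueue an event; non-blocking for an unbounded queue. */
    public void submit(String event) {
        queue.add(event);
    }

    /**
     * Consumer side: drain everything queued so far in one shot. If another consumer
     * ran first, the drained list is empty and the batch processing is skipped.
     */
    public void processPending() {
        List<String> batch = new ArrayList<>();
        queue.drainTo(batch);
        if (batch.isEmpty()) {
            return; // a previous run already handled these events
        }
        // apply the whole batch at once, amortizing the per-run cost
        System.out.println("processing " + batch.size() + " events: " + batch);
    }

    public static void main(String[] args) {
        DrainToBatchingSketch sketch = new DrainToBatchingSketch();
        sketch.submit("shard-1 started");
        sketch.submit("shard-2 started");
        sketch.processPending(); // processes both events in one batch
        sketch.processPending(); // no-op: the queue was already drained
    }
}

Because drainTo moves every pending element in a single call, each event is consumed exactly once no matter how many update tasks were submitted, which is why the execute method above can safely bail out early on an empty batch.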
/**
 * A transport class that doesn't send anything but rather captures all requests for inspection
 * from tests.
 */
public class CapturingTransport implements Transport {

    public static class CapturedRequest {
        public final DiscoveryNode node;
        public final long requestId;
        public final String action;
        public final TransportRequest request;

        public CapturedRequest(DiscoveryNode node, long requestId, String action, TransportRequest request) {
            this.node = node;
            this.requestId = requestId;
            this.action = action;
            this.request = request;
        }
    }

    private TransportServiceAdapter adapter;
    private final ConcurrentMap<Long, Tuple<DiscoveryNode, String>> requests = new ConcurrentHashMap<>();
    private final BlockingQueue<CapturedRequest> capturedRequests = ConcurrentCollections.newBlockingQueue();

    /**
     * Returns all requests captured so far. Doesn't clear the captured request list. See
     * {@link #clear()}.
     */
    public CapturedRequest[] capturedRequests() {
        return capturedRequests.toArray(new CapturedRequest[0]);
    }

    /**
     * Returns all requests captured so far. This method does clear the captured requests list. If
     * you do not want the captured requests list cleared, use {@link #capturedRequests()}.
     *
     * @return the captured requests
     */
    public CapturedRequest[] getCapturedRequestsAndClear() {
        CapturedRequest[] capturedRequests = capturedRequests();
        clear();
        return capturedRequests;
    }

    /**
     * Returns all requests captured so far, grouped by target node. Doesn't clear the captured
     * request list. See {@link #clear()}.
     */
    public Map<String, List<CapturedRequest>> capturedRequestsByTargetNode() {
        Map<String, List<CapturedRequest>> map = new HashMap<>();
        for (CapturedRequest request : capturedRequests) {
            map.computeIfAbsent(request.node.getId(), nodeId -> new ArrayList<>()).add(request);
        }
        return map;
    }

    /**
     * Returns all requests captured so far, grouped by target node. This method does clear the
     * captured request list. If you do not want the captured requests list cleared, use
     * {@link #capturedRequestsByTargetNode()}.
     *
     * @return the captured requests grouped by target node
     */
    public Map<String, List<CapturedRequest>> getCapturedRequestsByTargetNodeAndClear() {
        Map<String, List<CapturedRequest>> map = capturedRequestsByTargetNode();
        clear();
        return map;
    }

    /** Clears the captured requests. */
    public void clear() {
        capturedRequests.clear();
    }

    /** Simulates a response for the given requestId. */
    public void handleResponse(final long requestId, final TransportResponse response) {
        adapter.onResponseReceived(requestId).handleResponse(response);
    }

    /**
     * Simulates a local error for the given requestId; the failure will be wrapped in a
     * {@link SendRequestTransportException}.
     *
     * @param requestId the id corresponding to the captured send request
     * @param t the failure to wrap
     */
    public void handleLocalError(final long requestId, final Throwable t) {
        Tuple<DiscoveryNode, String> request = requests.get(requestId);
        assert request != null;
        this.handleError(requestId, new SendRequestTransportException(request.v1(), request.v2(), t));
    }

    /**
     * Simulates a remote error for the given requestId; the failure will be wrapped in a
     * {@link RemoteTransportException}.
     *
     * @param requestId the id corresponding to the captured send request
     * @param t the failure to wrap
     */
    public void handleRemoteError(final long requestId, final Throwable t) {
        final RemoteTransportException remoteException;
        if (rarely(Randomness.get())) {
            remoteException = new RemoteTransportException("remote failure, coming from local node", t);
        } else {
            try (BytesStreamOutput output = new BytesStreamOutput()) {
                output.writeException(t);
                remoteException = new RemoteTransportException("remote failure", output.bytes().streamInput().readException());
            } catch (IOException ioException) {
                throw new ElasticsearchException("failed to serialize/deserialize supplied exception " + t, ioException);
            }
        }
        this.handleError(requestId, remoteException);
    }

    /**
     * Simulates an error for the given requestId. Unlike {@link #handleLocalError(long, Throwable)}
     * and {@link #handleRemoteError(long, Throwable)}, the provided exception will not be wrapped
     * but will be delivered to the transport layer as is.
     *
     * @param requestId the id corresponding to the captured send request
     * @param e the failure
     */
    public void handleError(final long requestId, final TransportException e) {
        adapter.onResponseReceived(requestId).handleException(e);
    }

    @Override
    public Connection openConnection(DiscoveryNode node, ConnectionProfile profile) throws IOException {
        return new Connection() {
            @Override
            public DiscoveryNode getNode() {
                return node;
            }

            @Override
            public void sendRequest(long requestId, String action, TransportRequest request, TransportRequestOptions options)
                    throws IOException, TransportException {
                requests.put(requestId, Tuple.tuple(node, action));
                capturedRequests.add(new CapturedRequest(node, requestId, action, request));
            }

            @Override
            public void close() throws IOException {
            }
        };
    }

    @Override
    public void transportServiceAdapter(TransportServiceAdapter adapter) {
        this.adapter = adapter;
    }

    @Override
    public BoundTransportAddress boundAddress() {
        return null;
    }

    @Override
    public Map<String, BoundTransportAddress> profileBoundAddresses() {
        return null;
    }

    @Override
    public TransportAddress[] addressesFromString(String address, int perAddressLimit) throws UnknownHostException {
        return new TransportAddress[0];
    }

    @Override
    public boolean nodeConnected(DiscoveryNode node) {
        return true;
    }

    @Override
    public void connectToNode(DiscoveryNode node, ConnectionProfile connectionProfile) throws ConnectTransportException {
    }

    @Override
    public void disconnectFromNode(DiscoveryNode node) {
    }

    @Override
    public long serverOpen() {
        return 0;
    }

    @Override
    public Lifecycle.State lifecycleState() {
        return null;
    }

    @Override
    public void addLifecycleListener(LifecycleListener listener) {
    }

    @Override
    public void removeLifecycleListener(LifecycleListener listener) {
    }

    @Override
    public void start() {
    }

    @Override
    public void stop() {
    }

    @Override
    public void close() {
    }

    @Override
    public List<String> getLocalAddresses() {
        return Collections.emptyList();
    }

    @Override
    public Connection getConnection(DiscoveryNode node) {
        try {
            return openConnection(node, null);
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }
}
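A minimal sketch of how a test might drive CapturingTransport, assuming a TransportService has been wired to it elsewhere and has already sent one request to `node`. The class and method names below are hypothetical, imports are elided as in the surrounding classes, and TransportResponse.Empty is assumed to be the standard empty response type.

public class CapturingTransportUsageSketch {

    public void verifyRequestAndRespond(CapturingTransport transport, DiscoveryNode node) {
        // inspect what was captured so far and clear the list in one step
        CapturingTransport.CapturedRequest[] captured = transport.getCapturedRequestsAndClear();
        assert captured.length == 1;
        assert captured[0].node.equals(node);

        // complete the in-flight request with a simulated empty response; the
        // requestId ties the response back to the original sendRequest call
        transport.handleResponse(captured[0].requestId, TransportResponse.Empty.INSTANCE);
    }
}

The same requestId could instead be fed to handleLocalError, handleRemoteError, or handleError to exercise the caller's failure handling without any real networking.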