Example #1
/** @author kimchy (shay.banon) */
public class RecoveryStatus {

  public static enum Stage {
    INIT,
    INDEX,
    TRANSLOG,
    FINALIZE,
    DONE
  }

  ConcurrentMap<String, IndexOutput> openIndexOutputs = ConcurrentCollections.newConcurrentMap();
  ConcurrentMap<String, String> checksums = ConcurrentCollections.newConcurrentMap();

  final long startTime = System.currentTimeMillis();
  long time;
  List<String> phase1FileNames;
  List<Long> phase1FileSizes;
  List<String> phase1ExistingFileNames;
  List<Long> phase1ExistingFileSizes;
  long phase1TotalSize;
  long phase1ExistingTotalSize;

  volatile Stage stage = Stage.INIT;
  volatile long currentTranslogOperations = 0;
  AtomicLong currentFilesSize = new AtomicLong();

  public long startTime() {
    return startTime;
  }

  public long time() {
    return this.time;
  }

  public long phase1TotalSize() {
    return phase1TotalSize;
  }

  public long phase1ExistingTotalSize() {
    return phase1ExistingTotalSize;
  }

  public Stage stage() {
    return stage;
  }

  public long currentTranslogOperations() {
    return currentTranslogOperations;
  }

  public long currentFilesSize() {
    return currentFilesSize.get();
  }
}
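
RecoveryStatus publishes progress through volatile fields, a concurrent map, and an AtomicLong, so writer threads and monitoring threads never need a shared lock. Below is a minimal, self-contained sketch of that pattern; the names are hypothetical and a plain ConcurrentHashMap stands in for ConcurrentCollections.newConcurrentMap().

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicLong;

public class RecoveryProgressSketch {
  // per-file byte counts, updated by recovery worker threads
  private final ConcurrentMap<String, Long> bytesPerFile = new ConcurrentHashMap<>();
  // global byte count, readable by a monitoring thread without locking
  private final AtomicLong totalBytes = new AtomicLong();

  void onChunk(String fileName, long chunkBytes) {
    bytesPerFile.merge(fileName, chunkBytes, Long::sum); // atomic per-key accumulation
    totalBytes.addAndGet(chunkBytes); // atomic global counter
  }

  public static void main(String[] args) {
    RecoveryProgressSketch s = new RecoveryProgressSketch();
    s.onChunk("segments_1", 128);
    s.onChunk("segments_1", 64);
    System.out.println(s.totalBytes.get()); // 192
  }
}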
Example #2
/**
 * A {@link ZenPing} implementation which returns results based on a static in-memory map. This
 * allows pinging to be immediate and can be used to speed up tests.
 */
public final class MockZenPing extends AbstractComponent implements ZenPing {

  /**
   * A marker plugin used by {@link org.elasticsearch.node.MockNode} to indicate this mock zen ping
   * should be used.
   */
  public static class TestPlugin extends Plugin {}

  static final Map<ClusterName, Set<MockZenPing>> activeNodesPerCluster =
      ConcurrentCollections.newConcurrentMap();

  private volatile PingContextProvider contextProvider;

  @Inject
  public MockZenPing(Settings settings) {
    super(settings);
  }

  @Override
  public void start(PingContextProvider contextProvider) {
    this.contextProvider = contextProvider;
    assert contextProvider != null;
    boolean added = getActiveNodesForCurrentCluster().add(this);
    assert added;
  }

  @Override
  public void ping(PingListener listener, TimeValue timeout) {
    logger.info("pinging using mock zen ping");
    List<PingResponse> responseList =
        getActiveNodesForCurrentCluster()
            .stream()
            .filter(p -> p != this) // filter out the local node; pings are not expected to return it
            .map(MockZenPing::getPingResponse)
            .collect(Collectors.toList());
    listener.onPing(responseList);
  }

  private ClusterName getClusterName() {
    return contextProvider.clusterState().getClusterName();
  }

  private PingResponse getPingResponse() {
    final ClusterState clusterState = contextProvider.clusterState();
    return new PingResponse(
        clusterState.nodes().getLocalNode(), clusterState.nodes().getMasterNode(), clusterState);
  }

  private Set<MockZenPing> getActiveNodesForCurrentCluster() {
    return activeNodesPerCluster.computeIfAbsent(
        getClusterName(), clusterName -> ConcurrentCollections.newConcurrentSet());
  }

  @Override
  public void close() {
    boolean found = getActiveNodesForCurrentCluster().remove(this);
    assert found;
  }
}
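
The per-cluster registry above is created lazily with computeIfAbsent, so concurrent start() calls still observe a single shared set. A minimal, runnable sketch of the same lifecycle follows; names are hypothetical, and ConcurrentHashMap.newKeySet() stands in for ConcurrentCollections.newConcurrentSet().

import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;

public class MockPingSketch {
  // shared registry of active nodes; safe to mutate from any thread
  static final Set<MockPingSketch> active = ConcurrentHashMap.newKeySet();

  final String name;

  MockPingSketch(String name) {
    this.name = name;
    active.add(this); // "start": register this instance
  }

  List<String> ping() {
    return active.stream()
        .filter(p -> p != this) // a node never pings itself
        .map(p -> p.name)
        .collect(Collectors.toList());
  }

  void close() {
    active.remove(this); // deregister on close
  }

  public static void main(String[] args) {
    MockPingSketch a = new MockPingSketch("a");
    MockPingSketch b = new MockPingSketch("b");
    System.out.println(a.ping()); // [b]
    b.close();
    System.out.println(a.ping()); // []
  }
}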
Example #3
 @Override
 public BloomFilter filter(IndexReader reader, String fieldName, boolean asyncLoad) {
   int currentNumDocs = reader.numDocs();
   if (currentNumDocs == 0) {
     return BloomFilter.EMPTY;
   }
   ConcurrentMap<String, BloomFilterEntry> fieldCache = cache.get(reader.getFieldCacheKey());
   if (fieldCache == null) {
     synchronized (creationMutex) {
       fieldCache = cache.get(reader.getFieldCacheKey());
       if (fieldCache == null) {
         fieldCache = ConcurrentCollections.newConcurrentMap();
         cache.put(reader.getFieldCacheKey(), fieldCache);
       }
     }
   }
   BloomFilterEntry filter = fieldCache.get(fieldName);
   if (filter == null) {
     synchronized (fieldCache) {
       filter = fieldCache.get(fieldName);
       if (filter == null) {
         filter = new BloomFilterEntry(reader.numDocs(), BloomFilter.NONE);
         filter.loading.set(true);
         fieldCache.put(fieldName, filter);
         // now, do the async load of it...
         BloomFilterLoader loader = new BloomFilterLoader(reader, fieldName);
         if (asyncLoad) {
           threadPool.cached().execute(loader);
         } else {
           loader.run();
           filter = fieldCache.get(fieldName);
         }
       }
     }
   }
    // if we have too many deletes, we need to reload the bloom filter so it stays effective
    if (filter.numDocs > 1000 && (float) currentNumDocs / filter.numDocs < 0.6f) {
     if (filter.loading.compareAndSet(false, true)) {
       // do the async loading
       BloomFilterLoader loader = new BloomFilterLoader(reader, fieldName);
       if (asyncLoad) {
         threadPool.cached().execute(loader);
       } else {
         loader.run();
         filter = fieldCache.get(fieldName);
       }
     }
   }
   return filter.filter;
 }
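
The method above creates each per-reader cache with the classic check / synchronize / re-check idiom. On Java 8+ a ConcurrentMap gives the same once-only guarantee via computeIfAbsent; here is a minimal sketch under that assumption, with hypothetical names.

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

public class PerKeyCacheSketch {
  private final ConcurrentMap<String, ConcurrentMap<String, String>> cache =
      new ConcurrentHashMap<>();

  // equivalent to: get; if null, lock, re-check, create, put
  ConcurrentMap<String, String> fieldCacheFor(String readerKey) {
    return cache.computeIfAbsent(readerKey, key -> new ConcurrentHashMap<>());
  }

  public static void main(String[] args) {
    PerKeyCacheSketch s = new PerKeyCacheSketch();
    // concurrent callers always observe the same inner map instance
    System.out.println(s.fieldCacheFor("r1") == s.fieldCacheFor("r1")); // true
  }
}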
Example #4
/**
 * The scan context lets us skip readers we have already fully processed during scanning. We do
 * that by keeping track of the hit count per reader; once we are done with a reader, we stop
 * processing it by using a filter that returns a null docIdSet for it.
 */
public class ScanContext {

  private final ConcurrentMap<IndexReader, ReaderState> readerStates =
      ConcurrentCollections.newConcurrentMap();

  public void clear() {
    readerStates.clear();
  }

  public TopDocs execute(SearchContext context) throws IOException {
    ScanCollector collector =
        new ScanCollector(readerStates, context.from(), context.size(), context.trackScores());
    Query query = new FilteredQuery(context.query(), new ScanFilter(readerStates, collector));
    try {
      context.searcher().search(query, collector);
    } catch (ScanCollector.StopCollectingException e) {
      // all is well
    }
    return collector.topDocs();
  }

  static class ScanCollector extends SimpleCollector {

    private final ConcurrentMap<IndexReader, ReaderState> readerStates;

    private final int from;

    private final int to;

    private final ArrayList<ScoreDoc> docs;

    private final boolean trackScores;

    private Scorer scorer;

    private int docBase;

    private int counter;

    private IndexReader currentReader;
    private ReaderState readerState;

    ScanCollector(
        ConcurrentMap<IndexReader, ReaderState> readerStates,
        int from,
        int size,
        boolean trackScores) {
      this.readerStates = readerStates;
      this.from = from;
      this.to = from + size;
      this.trackScores = trackScores;
      this.docs = new ArrayList<>(size);
    }

    void incCounter(int count) {
      this.counter += count;
    }

    public TopDocs topDocs() {
      return new TopDocs(docs.size(), docs.toArray(new ScoreDoc[docs.size()]), 0f);
    }

    @Override
    public void setScorer(Scorer scorer) throws IOException {
      this.scorer = scorer;
    }

    @Override
    public void collect(int doc) throws IOException {
      if (counter >= from) {
        docs.add(new ScoreDoc(docBase + doc, trackScores ? scorer.score() : 0f));
      }
      readerState.count++;
      counter++;
      if (counter >= to) {
        throw StopCollectingException;
      }
    }

    @Override
    public void doSetNextReader(LeafReaderContext context) throws IOException {
      // if we have a reader state and we haven't registered it already, register it.
      // we need to check readerStates because, even when the filter returns null, setNextReader
      // is still called for that reader (beforehand)
      if (currentReader != null && !readerStates.containsKey(currentReader)) {
        assert readerState != null;
        readerState.done = true;
        readerStates.put(currentReader, readerState);
      }
      this.currentReader = context.reader();
      this.docBase = context.docBase;
      this.readerState = new ReaderState();
    }

    public static final RuntimeException StopCollectingException = new StopCollectingException();

    static class StopCollectingException extends RuntimeException {
      @Override
      public Throwable fillInStackTrace() {
        return null;
      }
    }
  }

  public static class ScanFilter extends Filter {

    private final ConcurrentMap<IndexReader, ReaderState> readerStates;

    private final ScanCollector scanCollector;

    public ScanFilter(
        ConcurrentMap<IndexReader, ReaderState> readerStates, ScanCollector scanCollector) {
      this.readerStates = readerStates;
      this.scanCollector = scanCollector;
    }

    @Override
    public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptedDocs) throws IOException {
      ReaderState readerState = readerStates.get(context.reader());
      if (readerState != null && readerState.done) {
        scanCollector.incCounter(readerState.count);
        return null;
      }
      return BitsFilteredDocIdSet.wrap(new AllDocIdSet(context.reader().maxDoc()), acceptedDocs);
    }
  }

  static class ReaderState {
    public int count;
    public boolean done;
  }
}
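
ScanCollector aborts collection by throwing a preallocated exception whose fillInStackTrace() returns null, so early termination costs no stack-trace capture. A minimal, runnable sketch of that trick:

public class StopSketch {
  static class StopException extends RuntimeException {
    @Override
    public Throwable fillInStackTrace() {
      return null; // skip the expensive stack capture; this exception is pure control flow
    }
  }

  static final StopException STOP = new StopException();

  public static void main(String[] args) {
    int collected = 0;
    try {
      for (int doc = 0; doc < 1_000_000; doc++) {
        collected++;
        if (collected >= 10) {
          throw STOP; // abort the loop cheaply
        }
      }
    } catch (StopException e) {
      // expected: enough documents collected
    }
    System.out.println(collected); // 10
  }
}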
Example #5
public class IndexFieldDataService extends AbstractIndexComponent {

  private static final String DISABLED_FORMAT = "disabled";
  private static final String DOC_VALUES_FORMAT = "doc_values";
  private static final String ARRAY_FORMAT = "array";
  private static final String PAGED_BYTES_FORMAT = "paged_bytes";
  private static final String FST_FORMAT = "fst";
  private static final String COMPRESSED_FORMAT = "compressed";

  private static final ImmutableMap<String, IndexFieldData.Builder> buildersByType;
  private static final ImmutableMap<String, IndexFieldData.Builder> docValuesBuildersByType;
  private static final ImmutableMap<Tuple<String, String>, IndexFieldData.Builder>
      buildersByTypeAndFormat;
  private final CircuitBreakerService circuitBreakerService;
  private final IndicesFieldDataCacheListener indicesFieldDataCacheListener;

  static {
    buildersByType =
        MapBuilder.<String, IndexFieldData.Builder>newMapBuilder()
            .put("string", new PagedBytesIndexFieldData.Builder())
            .put("float", new FloatArrayIndexFieldData.Builder())
            .put("double", new DoubleArrayIndexFieldData.Builder())
            .put(
                "byte",
                new PackedArrayIndexFieldData.Builder()
                    .setNumericType(IndexNumericFieldData.NumericType.BYTE))
            .put(
                "short",
                new PackedArrayIndexFieldData.Builder()
                    .setNumericType(IndexNumericFieldData.NumericType.SHORT))
            .put(
                "int",
                new PackedArrayIndexFieldData.Builder()
                    .setNumericType(IndexNumericFieldData.NumericType.INT))
            .put(
                "long",
                new PackedArrayIndexFieldData.Builder()
                    .setNumericType(IndexNumericFieldData.NumericType.LONG))
            .put("geo_point", new GeoPointDoubleArrayIndexFieldData.Builder())
            .put(ParentFieldMapper.NAME, new ParentChildIndexFieldData.Builder())
            .put("binary", new DisabledIndexFieldData.Builder())
            .immutableMap();

    docValuesBuildersByType =
        MapBuilder.<String, IndexFieldData.Builder>newMapBuilder()
            .put("string", new DocValuesIndexFieldData.Builder())
            .put(
                "float",
                new DocValuesIndexFieldData.Builder()
                    .numericType(IndexNumericFieldData.NumericType.FLOAT))
            .put(
                "double",
                new DocValuesIndexFieldData.Builder()
                    .numericType(IndexNumericFieldData.NumericType.DOUBLE))
            .put(
                "byte",
                new DocValuesIndexFieldData.Builder()
                    .numericType(IndexNumericFieldData.NumericType.BYTE))
            .put(
                "short",
                new DocValuesIndexFieldData.Builder()
                    .numericType(IndexNumericFieldData.NumericType.SHORT))
            .put(
                "int",
                new DocValuesIndexFieldData.Builder()
                    .numericType(IndexNumericFieldData.NumericType.INT))
            .put(
                "long",
                new DocValuesIndexFieldData.Builder()
                    .numericType(IndexNumericFieldData.NumericType.LONG))
            .put("geo_point", new GeoPointBinaryDVIndexFieldData.Builder())
            .put("binary", new BytesBinaryDVIndexFieldData.Builder())
            .immutableMap();

    buildersByTypeAndFormat =
        MapBuilder.<Tuple<String, String>, IndexFieldData.Builder>newMapBuilder()
            .put(Tuple.tuple("string", PAGED_BYTES_FORMAT), new PagedBytesIndexFieldData.Builder())
            .put(Tuple.tuple("string", FST_FORMAT), new FSTBytesIndexFieldData.Builder())
            .put(Tuple.tuple("string", DOC_VALUES_FORMAT), new DocValuesIndexFieldData.Builder())
            .put(Tuple.tuple("string", DISABLED_FORMAT), new DisabledIndexFieldData.Builder())
            .put(Tuple.tuple("float", ARRAY_FORMAT), new FloatArrayIndexFieldData.Builder())
            .put(
                Tuple.tuple("float", DOC_VALUES_FORMAT),
                new DocValuesIndexFieldData.Builder()
                    .numericType(IndexNumericFieldData.NumericType.FLOAT))
            .put(Tuple.tuple("float", DISABLED_FORMAT), new DisabledIndexFieldData.Builder())
            .put(Tuple.tuple("double", ARRAY_FORMAT), new DoubleArrayIndexFieldData.Builder())
            .put(
                Tuple.tuple("double", DOC_VALUES_FORMAT),
                new DocValuesIndexFieldData.Builder()
                    .numericType(IndexNumericFieldData.NumericType.DOUBLE))
            .put(Tuple.tuple("double", DISABLED_FORMAT), new DisabledIndexFieldData.Builder())
            .put(
                Tuple.tuple("byte", ARRAY_FORMAT),
                new PackedArrayIndexFieldData.Builder()
                    .setNumericType(IndexNumericFieldData.NumericType.BYTE))
            .put(
                Tuple.tuple("byte", DOC_VALUES_FORMAT),
                new DocValuesIndexFieldData.Builder()
                    .numericType(IndexNumericFieldData.NumericType.BYTE))
            .put(Tuple.tuple("byte", DISABLED_FORMAT), new DisabledIndexFieldData.Builder())
            .put(
                Tuple.tuple("short", ARRAY_FORMAT),
                new PackedArrayIndexFieldData.Builder()
                    .setNumericType(IndexNumericFieldData.NumericType.SHORT))
            .put(
                Tuple.tuple("short", DOC_VALUES_FORMAT),
                new DocValuesIndexFieldData.Builder()
                    .numericType(IndexNumericFieldData.NumericType.SHORT))
            .put(Tuple.tuple("short", DISABLED_FORMAT), new DisabledIndexFieldData.Builder())
            .put(
                Tuple.tuple("int", ARRAY_FORMAT),
                new PackedArrayIndexFieldData.Builder()
                    .setNumericType(IndexNumericFieldData.NumericType.INT))
            .put(
                Tuple.tuple("int", DOC_VALUES_FORMAT),
                new DocValuesIndexFieldData.Builder()
                    .numericType(IndexNumericFieldData.NumericType.INT))
            .put(Tuple.tuple("int", DISABLED_FORMAT), new DisabledIndexFieldData.Builder())
            .put(
                Tuple.tuple("long", ARRAY_FORMAT),
                new PackedArrayIndexFieldData.Builder()
                    .setNumericType(IndexNumericFieldData.NumericType.LONG))
            .put(
                Tuple.tuple("long", DOC_VALUES_FORMAT),
                new DocValuesIndexFieldData.Builder()
                    .numericType(IndexNumericFieldData.NumericType.LONG))
            .put(Tuple.tuple("long", DISABLED_FORMAT), new DisabledIndexFieldData.Builder())
            .put(
                Tuple.tuple("geo_point", ARRAY_FORMAT),
                new GeoPointDoubleArrayIndexFieldData.Builder())
            .put(
                Tuple.tuple("geo_point", DOC_VALUES_FORMAT),
                new GeoPointBinaryDVIndexFieldData.Builder())
            .put(Tuple.tuple("geo_point", DISABLED_FORMAT), new DisabledIndexFieldData.Builder())
            .put(
                Tuple.tuple("geo_point", COMPRESSED_FORMAT),
                new GeoPointCompressedIndexFieldData.Builder())
            .put(
                Tuple.tuple("binary", DOC_VALUES_FORMAT), new BytesBinaryDVIndexFieldData.Builder())
            .put(Tuple.tuple("binary", DISABLED_FORMAT), new DisabledIndexFieldData.Builder())
            .immutableMap();
  }

  private final IndicesFieldDataCache indicesFieldDataCache;
  private final ConcurrentMap<String, IndexFieldData<?>> loadedFieldData =
      ConcurrentCollections.newConcurrentMap();
  private final Map<String, IndexFieldDataCache> fieldDataCaches =
      Maps.newHashMap(); // no need for concurrency support, always used under lock

  IndexService indexService;

  // public for testing
  public IndexFieldDataService(Index index, CircuitBreakerService circuitBreakerService) {
    this(
        index,
        ImmutableSettings.Builder.EMPTY_SETTINGS,
        new IndicesFieldDataCache(
            ImmutableSettings.Builder.EMPTY_SETTINGS,
            new IndicesFieldDataCacheListener(circuitBreakerService)),
        circuitBreakerService,
        new IndicesFieldDataCacheListener(circuitBreakerService));
  }

  // public for testing
  public IndexFieldDataService(
      Index index,
      CircuitBreakerService circuitBreakerService,
      IndicesFieldDataCache indicesFieldDataCache) {
    this(
        index,
        ImmutableSettings.Builder.EMPTY_SETTINGS,
        indicesFieldDataCache,
        circuitBreakerService,
        new IndicesFieldDataCacheListener(circuitBreakerService));
  }

  @Inject
  public IndexFieldDataService(
      Index index,
      @IndexSettings Settings indexSettings,
      IndicesFieldDataCache indicesFieldDataCache,
      CircuitBreakerService circuitBreakerService,
      IndicesFieldDataCacheListener indicesFieldDataCacheListener) {
    super(index, indexSettings);
    this.indicesFieldDataCache = indicesFieldDataCache;
    this.circuitBreakerService = circuitBreakerService;
    this.indicesFieldDataCacheListener = indicesFieldDataCacheListener;
  }

  // we need to "inject" the index service to avoid creating a cyclic dependency
  public void setIndexService(IndexService indexService) {
    this.indexService = indexService;
  }

  public void clear() {
    synchronized (loadedFieldData) {
      for (IndexFieldData<?> fieldData : loadedFieldData.values()) {
        fieldData.clear();
      }
      loadedFieldData.clear();
      for (IndexFieldDataCache cache : fieldDataCaches.values()) {
        cache.clear();
      }
      fieldDataCaches.clear();
    }
  }

  public void clearField(String fieldName) {
    synchronized (loadedFieldData) {
      IndexFieldData<?> fieldData = loadedFieldData.remove(fieldName);
      if (fieldData != null) {
        fieldData.clear();
      }
      IndexFieldDataCache cache = fieldDataCaches.remove(fieldName);
      if (cache != null) {
        cache.clear();
      }
    }
  }

  public void clear(IndexReader reader) {
    synchronized (loadedFieldData) {
      for (IndexFieldData<?> indexFieldData : loadedFieldData.values()) {
        indexFieldData.clear(reader);
      }
      for (IndexFieldDataCache cache : fieldDataCaches.values()) {
        cache.clear(reader);
      }
    }
  }

  public void onMappingUpdate() {
    // synchronize to make sure to not miss field data instances that are being loaded
    synchronized (loadedFieldData) {
      // important: do not clear fieldDataCaches: the cache may be reused
      loadedFieldData.clear();
    }
  }

  public <IFD extends IndexFieldData<?>> IFD getForField(FieldMapper<?> mapper) {
    final FieldMapper.Names fieldNames = mapper.names();
    final FieldDataType type = mapper.fieldDataType();
    final boolean docValues = mapper.hasDocValues();
    IndexFieldData<?> fieldData = loadedFieldData.get(fieldNames.indexName());
    if (fieldData == null) {
      synchronized (loadedFieldData) {
        fieldData = loadedFieldData.get(fieldNames.indexName());
        if (fieldData == null) {
          IndexFieldData.Builder builder = null;
          String format = type.getFormat(indexSettings);
          if (format != null
              && FieldDataType.DOC_VALUES_FORMAT_VALUE.equals(format)
              && !docValues) {
            logger.warn(
                "field ["
                    + fieldNames.fullName()
                    + "] has no doc values, will use default field data format");
            format = null;
          }
          if (format != null) {
            builder = buildersByTypeAndFormat.get(Tuple.tuple(type.getType(), format));
            if (builder == null) {
              logger.warn(
                  "failed to find format ["
                      + format
                      + "] for field ["
                      + fieldNames.fullName()
                      + "], will use default");
            }
          }
          if (builder == null && docValues) {
            builder = docValuesBuildersByType.get(type.getType());
          }
          if (builder == null) {
            builder = buildersByType.get(type.getType());
          }
          if (builder == null) {
            throw new ElasticsearchIllegalArgumentException(
                "failed to find field data builder for field "
                    + fieldNames.fullName()
                    + ", and type "
                    + type.getType());
          }

          IndexFieldDataCache cache = fieldDataCaches.get(fieldNames.indexName());
          if (cache == null) {
            // we default to the node level cache, which in turn defaults to being unbounded;
            // this means changing the node level settings is simple: just set the bounds there
            String cacheType =
                type.getSettings().get("cache", indexSettings.get("index.fielddata.cache", "node"));
            if ("resident".equals(cacheType)) {
              cache =
                  new IndexFieldDataCache.Resident(
                      indexService, fieldNames, type, indicesFieldDataCacheListener);
            } else if ("soft".equals(cacheType)) {
              cache =
                  new IndexFieldDataCache.Soft(
                      indexService, fieldNames, type, indicesFieldDataCacheListener);
            } else if ("node".equals(cacheType)) {
              cache =
                  indicesFieldDataCache.buildIndexFieldDataCache(
                      indexService, index, fieldNames, type);
            } else {
              throw new ElasticsearchIllegalArgumentException(
                  "cache type not supported ["
                      + cacheType
                      + "] for field ["
                      + fieldNames.fullName()
                      + "]");
            }
            fieldDataCaches.put(fieldNames.indexName(), cache);
          }

          GlobalOrdinalsBuilder globalOrdinalBuilder =
              new InternalGlobalOrdinalsBuilder(index(), indexSettings);
          fieldData =
              builder.build(
                  index,
                  indexSettings,
                  mapper,
                  cache,
                  circuitBreakerService,
                  indexService.mapperService(),
                  globalOrdinalBuilder);
          loadedFieldData.put(fieldNames.indexName(), fieldData);
        }
      }
    }
    return (IFD) fieldData;
  }
}
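
getForField resolves a builder in three steps: an explicit, valid format first, then the doc-values builder for the type, then the per-type default. Below is a minimal sketch of that precedence; all names are hypothetical, and plain strings stand in for the Tuple-keyed builder maps above.

import java.util.HashMap;
import java.util.Map;

public class BuilderResolutionSketch {
  static final Map<String, String> byTypeAndFormat = new HashMap<>();
  static final Map<String, String> docValuesByType = new HashMap<>();
  static final Map<String, String> byType = new HashMap<>();

  static String resolve(String type, String format, boolean hasDocValues) {
    String builder = null;
    if (format != null) {
      builder = byTypeAndFormat.get(type + "/" + format); // 1. an explicit format wins
    }
    if (builder == null && hasDocValues) {
      builder = docValuesByType.get(type); // 2. then the doc-values builder
    }
    if (builder == null) {
      builder = byType.get(type); // 3. then the per-type default
    }
    if (builder == null) {
      throw new IllegalArgumentException("no field data builder for type " + type);
    }
    return builder;
  }

  public static void main(String[] args) {
    byType.put("string", "paged_bytes");
    docValuesByType.put("string", "doc_values");
    System.out.println(resolve("string", null, true)); // doc_values
    System.out.println(resolve("string", null, false)); // paged_bytes
  }
}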
Example #6
public class SearchService extends AbstractLifecycleComponent<SearchService> {

  public static final String NORMS_LOADING_KEY = "index.norms.loading";
  public static final String DEFAULT_KEEPALIVE_KEY = "search.default_keep_alive";
  public static final String KEEPALIVE_INTERVAL_KEY = "search.keep_alive_interval";

  private final ThreadPool threadPool;

  private final ClusterService clusterService;

  private final IndicesService indicesService;

  private final IndicesWarmer indicesWarmer;

  private final ScriptService scriptService;

  private final PageCacheRecycler pageCacheRecycler;

  private final BigArrays bigArrays;

  private final DfsPhase dfsPhase;

  private final QueryPhase queryPhase;

  private final FetchPhase fetchPhase;

  private final IndicesQueryCache indicesQueryCache;

  private final long defaultKeepAlive;

  private final ScheduledFuture<?> keepAliveReaper;

  private final AtomicLong idGenerator = new AtomicLong();

  private final ConcurrentMapLong<SearchContext> activeContexts =
      ConcurrentCollections.newConcurrentMapLongWithAggressiveConcurrency();

  private final ImmutableMap<String, SearchParseElement> elementParsers;

  @Inject
  public SearchService(
      Settings settings,
      ClusterService clusterService,
      IndicesService indicesService,
      IndicesWarmer indicesWarmer,
      ThreadPool threadPool,
      ScriptService scriptService,
      PageCacheRecycler pageCacheRecycler,
      BigArrays bigArrays,
      DfsPhase dfsPhase,
      QueryPhase queryPhase,
      FetchPhase fetchPhase,
      IndicesQueryCache indicesQueryCache) {
    super(settings);
    this.threadPool = threadPool;
    this.clusterService = clusterService;
    this.indicesService = indicesService;
    indicesService
        .indicesLifecycle()
        .addListener(
            new IndicesLifecycle.Listener() {

              @Override
              public void afterIndexDeleted(Index index, @IndexSettings Settings indexSettings) {
                // once an index is closed we can just clean up all the pending search context
                // information to release memory and let references to the filesystem go etc.
                freeAllContextForIndex(index);
              }
            });
    this.indicesWarmer = indicesWarmer;
    this.scriptService = scriptService;
    this.pageCacheRecycler = pageCacheRecycler;
    this.bigArrays = bigArrays;
    this.dfsPhase = dfsPhase;
    this.queryPhase = queryPhase;
    this.fetchPhase = fetchPhase;
    this.indicesQueryCache = indicesQueryCache;

    TimeValue keepAliveInterval = settings.getAsTime(KEEPALIVE_INTERVAL_KEY, timeValueMinutes(1));
    // 5 minutes is fine here, since we make sure to clean up contexts on search requests and
    // when a shard/index closes
    this.defaultKeepAlive = settings.getAsTime(DEFAULT_KEEPALIVE_KEY, timeValueMinutes(5)).millis();

    Map<String, SearchParseElement> elementParsers = new HashMap<>();
    elementParsers.putAll(dfsPhase.parseElements());
    elementParsers.putAll(queryPhase.parseElements());
    elementParsers.putAll(fetchPhase.parseElements());
    elementParsers.put("stats", new StatsGroupsParseElement());
    this.elementParsers = ImmutableMap.copyOf(elementParsers);

    this.keepAliveReaper = threadPool.scheduleWithFixedDelay(new Reaper(), keepAliveInterval);

    this.indicesWarmer.addListener(new NormsWarmer());
    this.indicesWarmer.addListener(new FieldDataWarmer());
    this.indicesWarmer.addListener(new SearchWarmer());
  }

  protected void putContext(SearchContext context) {
    final SearchContext previous = activeContexts.put(context.id(), context);
    assert previous == null;
  }

  protected SearchContext removeContext(long id) {
    return activeContexts.remove(id);
  }

  @Override
  protected void doStart() {}

  @Override
  protected void doStop() {
    for (final SearchContext context : activeContexts.values()) {
      freeContext(context.id());
    }
  }

  @Override
  protected void doClose() {
    doStop();
    FutureUtils.cancel(keepAliveReaper);
  }

  public DfsSearchResult executeDfsPhase(ShardSearchRequest request) {
    final SearchContext context = createAndPutContext(request);
    try {
      contextProcessing(context);
      dfsPhase.execute(context);
      contextProcessedSuccessfully(context);
      return context.dfsResult();
    } catch (Throwable e) {
      logger.trace("Dfs phase failed", e);
      processFailure(context, e);
      throw ExceptionsHelper.convertToRuntime(e);
    } finally {
      cleanContext(context);
    }
  }

  public QuerySearchResult executeScan(ShardSearchRequest request) {
    final SearchContext context = createAndPutContext(request);
    final int originalSize = context.size();
    try {
      if (context.aggregations() != null) {
        throw new IllegalArgumentException("aggregations are not supported with search_type=scan");
      }

      if (context.scroll() == null) {
        throw new ElasticsearchException("Scroll must be provided when scanning...");
      }

      assert context.searchType() == SearchType.SCAN;
      // move to QUERY_THEN_FETCH, and then, when scrolling, move to SCAN
      context.searchType(SearchType.QUERY_THEN_FETCH);
      context.size(0); // set size to 0 so that we only count matches
      assert context.searchType() == SearchType.QUERY_THEN_FETCH;

      contextProcessing(context);
      queryPhase.execute(context);
      contextProcessedSuccessfully(context);
      return context.queryResult();
    } catch (Throwable e) {
      logger.trace("Scan phase failed", e);
      processFailure(context, e);
      throw ExceptionsHelper.convertToRuntime(e);
    } finally {
      context.size(originalSize);
      cleanContext(context);
    }
  }

  public ScrollQueryFetchSearchResult executeScan(InternalScrollSearchRequest request) {
    final SearchContext context = findContext(request.id());
    contextProcessing(context);
    try {
      processScroll(request, context);
      if (context.searchType() == SearchType.QUERY_THEN_FETCH) {
        // first scanning, reset the from to 0
        context.searchType(SearchType.SCAN);
        context.from(0);
      }
      queryPhase.execute(context);
      shortcutDocIdsToLoadForScanning(context);
      fetchPhase.execute(context);
      if (context.scroll() == null || context.fetchResult().hits().hits().length < context.size()) {
        freeContext(request.id());
      } else {
        contextProcessedSuccessfully(context);
      }
      return new ScrollQueryFetchSearchResult(
          new QueryFetchSearchResult(context.queryResult(), context.fetchResult()),
          context.shardTarget());
    } catch (Throwable e) {
      logger.trace("Scan phase failed", e);
      processFailure(context, e);
      throw ExceptionsHelper.convertToRuntime(e);
    } finally {
      cleanContext(context);
    }
  }

  /**
   * Try to load the query results from the cache or execute the query phase directly if the cache
   * cannot be used.
   */
  private void loadOrExecuteQueryPhase(
      final ShardSearchRequest request, final SearchContext context, final QueryPhase queryPhase)
      throws Exception {
    final boolean canCache = indicesQueryCache.canCache(request, context);
    if (canCache) {
      indicesQueryCache.loadIntoContext(request, context, queryPhase);
    } else {
      queryPhase.execute(context);
    }
  }

  public QuerySearchResultProvider executeQueryPhase(ShardSearchRequest request) {
    final SearchContext context = createAndPutContext(request);
    final ShardSearchStats shardSearchStats = context.indexShard().searchService();
    try {
      shardSearchStats.onPreQueryPhase(context);
      long time = System.nanoTime();
      contextProcessing(context);

      loadOrExecuteQueryPhase(request, context, queryPhase);

      if (context.queryResult().topDocs().scoreDocs.length == 0 && context.scroll() == null) {
        freeContext(context.id());
      } else {
        contextProcessedSuccessfully(context);
      }
      shardSearchStats.onQueryPhase(context, System.nanoTime() - time);

      return context.queryResult();
    } catch (Throwable e) {
      // execution exception can happen while loading the cache, strip it
      if (e instanceof ExecutionException) {
        e = e.getCause();
      }
      shardSearchStats.onFailedQueryPhase(context);
      logger.trace("Query phase failed", e);
      processFailure(context, e);
      throw ExceptionsHelper.convertToRuntime(e);
    } finally {
      cleanContext(context);
    }
  }

  public ScrollQuerySearchResult executeQueryPhase(InternalScrollSearchRequest request) {
    final SearchContext context = findContext(request.id());
    ShardSearchStats shardSearchStats = context.indexShard().searchService();
    try {
      shardSearchStats.onPreQueryPhase(context);
      long time = System.nanoTime();
      contextProcessing(context);
      processScroll(request, context);
      queryPhase.execute(context);
      contextProcessedSuccessfully(context);
      shardSearchStats.onQueryPhase(context, System.nanoTime() - time);
      return new ScrollQuerySearchResult(context.queryResult(), context.shardTarget());
    } catch (Throwable e) {
      shardSearchStats.onFailedQueryPhase(context);
      logger.trace("Query phase failed", e);
      processFailure(context, e);
      throw ExceptionsHelper.convertToRuntime(e);
    } finally {
      cleanContext(context);
    }
  }

  public QuerySearchResult executeQueryPhase(QuerySearchRequest request) {
    final SearchContext context = findContext(request.id());
    contextProcessing(context);
    try {
      final IndexCache indexCache = context.indexShard().indexService().cache();
      context
          .searcher()
          .dfSource(
              new CachedDfSource(
                  context.searcher().getIndexReader(),
                  request.dfs(),
                  context.similarityService().similarity(),
                  indexCache.filter(),
                  indexCache.filterPolicy()));
    } catch (Throwable e) {
      processFailure(context, e);
      cleanContext(context);
      throw new QueryPhaseExecutionException(context, "Failed to set aggregated df", e);
    }
    ShardSearchStats shardSearchStats = context.indexShard().searchService();
    try {
      shardSearchStats.onPreQueryPhase(context);
      long time = System.nanoTime();
      queryPhase.execute(context);
      if (context.queryResult().topDocs().scoreDocs.length == 0 && context.scroll() == null) {
        // no hits, we can release the context since there will be no fetch phase
        freeContext(context.id());
      } else {
        contextProcessedSuccessfully(context);
      }
      shardSearchStats.onQueryPhase(context, System.nanoTime() - time);
      return context.queryResult();
    } catch (Throwable e) {
      shardSearchStats.onFailedQueryPhase(context);
      logger.trace("Query phase failed", e);
      processFailure(context, e);
      throw ExceptionsHelper.convertToRuntime(e);
    } finally {
      cleanContext(context);
    }
  }

  public QueryFetchSearchResult executeFetchPhase(ShardSearchRequest request) {
    final SearchContext context = createAndPutContext(request);
    contextProcessing(context);
    try {
      ShardSearchStats shardSearchStats = context.indexShard().searchService();
      shardSearchStats.onPreQueryPhase(context);
      long time = System.nanoTime();
      try {
        loadOrExecuteQueryPhase(request, context, queryPhase);
      } catch (Throwable e) {
        shardSearchStats.onFailedQueryPhase(context);
        throw ExceptionsHelper.convertToRuntime(e);
      }
      long time2 = System.nanoTime();
      shardSearchStats.onQueryPhase(context, time2 - time);
      shardSearchStats.onPreFetchPhase(context);
      try {
        shortcutDocIdsToLoad(context);
        fetchPhase.execute(context);
        if (context.scroll() == null) {
          freeContext(context.id());
        } else {
          contextProcessedSuccessfully(context);
        }
      } catch (Throwable e) {
        shardSearchStats.onFailedFetchPhase(context);
        throw ExceptionsHelper.convertToRuntime(e);
      }
      shardSearchStats.onFetchPhase(context, System.nanoTime() - time2);
      return new QueryFetchSearchResult(context.queryResult(), context.fetchResult());
    } catch (Throwable e) {
      logger.trace("Fetch phase failed", e);
      processFailure(context, e);
      throw ExceptionsHelper.convertToRuntime(e);
    } finally {
      cleanContext(context);
    }
  }

  public QueryFetchSearchResult executeFetchPhase(QuerySearchRequest request) {
    final SearchContext context = findContext(request.id());
    contextProcessing(context);
    try {
      final IndexCache indexCache = context.indexShard().indexService().cache();
      context
          .searcher()
          .dfSource(
              new CachedDfSource(
                  context.searcher().getIndexReader(),
                  request.dfs(),
                  context.similarityService().similarity(),
                  indexCache.filter(),
                  indexCache.filterPolicy()));
    } catch (Throwable e) {
      freeContext(context.id());
      cleanContext(context);
      throw new QueryPhaseExecutionException(context, "Failed to set aggregated df", e);
    }
    try {
      ShardSearchStats shardSearchStats = context.indexShard().searchService();
      shardSearchStats.onPreQueryPhase(context);
      long time = System.nanoTime();
      try {
        queryPhase.execute(context);
      } catch (Throwable e) {
        shardSearchStats.onFailedQueryPhase(context);
        throw ExceptionsHelper.convertToRuntime(e);
      }
      long time2 = System.nanoTime();
      shardSearchStats.onQueryPhase(context, time2 - time);
      shardSearchStats.onPreFetchPhase(context);
      try {
        shortcutDocIdsToLoad(context);
        fetchPhase.execute(context);
        if (context.scroll() == null) {
          freeContext(request.id());
        } else {
          contextProcessedSuccessfully(context);
        }
      } catch (Throwable e) {
        shardSearchStats.onFailedFetchPhase(context);
        throw ExceptionsHelper.convertToRuntime(e);
      }
      shardSearchStats.onFetchPhase(context, System.nanoTime() - time2);
      return new QueryFetchSearchResult(context.queryResult(), context.fetchResult());
    } catch (Throwable e) {
      logger.trace("Fetch phase failed", e);
      processFailure(context, e);
      throw ExceptionsHelper.convertToRuntime(e);
    } finally {
      cleanContext(context);
    }
  }

  public ScrollQueryFetchSearchResult executeFetchPhase(InternalScrollSearchRequest request) {
    final SearchContext context = findContext(request.id());
    contextProcessing(context);
    try {
      ShardSearchStats shardSearchStats = context.indexShard().searchService();
      processScroll(request, context);
      shardSearchStats.onPreQueryPhase(context);
      long time = System.nanoTime();
      try {
        queryPhase.execute(context);
      } catch (Throwable e) {
        shardSearchStats.onFailedQueryPhase(context);
        throw ExceptionsHelper.convertToRuntime(e);
      }
      long time2 = System.nanoTime();
      shardSearchStats.onQueryPhase(context, time2 - time);
      shardSearchStats.onPreFetchPhase(context);
      try {
        shortcutDocIdsToLoad(context);
        fetchPhase.execute(context);
        if (context.scroll() == null) {
          freeContext(request.id());
        } else {
          contextProcessedSuccessfully(context);
        }
      } catch (Throwable e) {
        shardSearchStats.onFailedFetchPhase(context);
        throw ExceptionsHelper.convertToRuntime(e);
      }
      shardSearchStats.onFetchPhase(context, System.nanoTime() - time2);
      return new ScrollQueryFetchSearchResult(
          new QueryFetchSearchResult(context.queryResult(), context.fetchResult()),
          context.shardTarget());
    } catch (Throwable e) {
      logger.trace("Fetch phase failed", e);
      processFailure(context, e);
      throw ExceptionsHelper.convertToRuntime(e);
    } finally {
      cleanContext(context);
    }
  }

  public FetchSearchResult executeFetchPhase(ShardFetchRequest request) {
    final SearchContext context = findContext(request.id());
    contextProcessing(context);
    final ShardSearchStats shardSearchStats = context.indexShard().searchService();
    try {
      if (request.lastEmittedDoc() != null) {
        context.lastEmittedDoc(request.lastEmittedDoc());
      }
      context.docIdsToLoad(request.docIds(), 0, request.docIdsSize());
      shardSearchStats.onPreFetchPhase(context);
      long time = System.nanoTime();
      fetchPhase.execute(context);
      if (context.scroll() == null) {
        freeContext(request.id());
      } else {
        contextProcessedSuccessfully(context);
      }
      shardSearchStats.onFetchPhase(context, System.nanoTime() - time);
      return context.fetchResult();
    } catch (Throwable e) {
      shardSearchStats.onFailedFetchPhase(context);
      logger.trace("Fetch phase failed", e);
      processFailure(context, e);
      throw ExceptionsHelper.convertToRuntime(e);
    } finally {
      cleanContext(context);
    }
  }

  private SearchContext findContext(long id) throws SearchContextMissingException {
    SearchContext context = activeContexts.get(id);
    if (context == null) {
      throw new SearchContextMissingException(id);
    }
    SearchContext.setCurrent(context);
    return context;
  }

  final SearchContext createAndPutContext(ShardSearchRequest request) {
    SearchContext context = createContext(request, null);
    boolean success = false;
    try {
      putContext(context);
      context.indexShard().searchService().onNewContext(context);
      success = true;
      return context;
    } finally {
      if (!success) {
        freeContext(context.id());
      }
    }
  }

  final SearchContext createContext(
      ShardSearchRequest request, @Nullable Engine.Searcher searcher) {
    IndexService indexService = indicesService.indexServiceSafe(request.index());
    IndexShard indexShard = indexService.shardSafe(request.shardId());

    SearchShardTarget shardTarget =
        new SearchShardTarget(clusterService.localNode().id(), request.index(), request.shardId());

    Engine.Searcher engineSearcher =
        searcher == null ? indexShard.acquireSearcher("search") : searcher;
    SearchContext context =
        new DefaultSearchContext(
            idGenerator.incrementAndGet(),
            request,
            shardTarget,
            engineSearcher,
            indexService,
            indexShard,
            scriptService,
            pageCacheRecycler,
            bigArrays,
            threadPool.estimatedTimeInMillisCounter());
    SearchContext.setCurrent(context);
    try {
      context.scroll(request.scroll());

      parseTemplate(request);
      parseSource(context, request.source());
      parseSource(context, request.extraSource());

      // if the from and size are still not set, default them
      if (context.from() == -1) {
        context.from(0);
      }
      if (context.searchType() == SearchType.COUNT) {
        // so that the optimizations we apply to size=0 also apply to search_type=COUNT
        // and that we close contexts when done with the query phase
        context.searchType(SearchType.QUERY_THEN_FETCH);
        context.size(0);
      } else if (context.size() == -1) {
        context.size(10);
      }

      // pre process
      dfsPhase.preProcess(context);
      queryPhase.preProcess(context);
      fetchPhase.preProcess(context);

      // compute the context keep alive
      long keepAlive = defaultKeepAlive;
      if (request.scroll() != null && request.scroll().keepAlive() != null) {
        keepAlive = request.scroll().keepAlive().millis();
      }
      context.keepAlive(keepAlive);
    } catch (Throwable e) {
      context.close();
      throw ExceptionsHelper.convertToRuntime(e);
    }

    return context;
  }

  private void freeAllContextForIndex(Index index) {
    assert index != null;
    for (SearchContext ctx : activeContexts.values()) {
      if (index.equals(ctx.indexShard().shardId().index())) {
        freeContext(ctx.id());
      }
    }
  }

  public boolean freeContext(long id) {
    final SearchContext context = removeContext(id);
    if (context != null) {
      try {
        context.indexShard().searchService().onFreeContext(context);
      } finally {
        context.close();
      }
      return true;
    }
    return false;
  }

  public void freeAllScrollContexts() {
    for (SearchContext searchContext : activeContexts.values()) {
      if (searchContext.scroll() != null) {
        freeContext(searchContext.id());
      }
    }
  }

  private void contextProcessing(SearchContext context) {
    // disable timeout while executing a search
    context.accessed(-1);
  }

  private void contextProcessedSuccessfully(SearchContext context) {
    context.accessed(threadPool.estimatedTimeInMillis());
  }

  private void cleanContext(SearchContext context) {
    assert context == SearchContext.current();
    context.clearReleasables(Lifetime.PHASE);
    SearchContext.removeCurrent();
  }

  private void processFailure(SearchContext context, Throwable t) {
    freeContext(context.id());
    try {
      if (Lucene.isCorruptionException(t)) {
        context.indexShard().failShard("search execution corruption failure", t);
      }
    } catch (Throwable e) {
      logger.warn(
          "failed to process shard failure to (potentially) send back shard failure on corruption",
          e);
    }
  }

  private void parseTemplate(ShardSearchRequest request) {

    BytesReference processedQuery;
    if (request.template() != null) {
      ExecutableScript executable =
          this.scriptService.executable(request.template(), ScriptContext.Standard.SEARCH);
      processedQuery = (BytesReference) executable.run();
    } else {
      if (!hasLength(request.templateSource())) {
        return;
      }
      XContentParser parser = null;
      Template template = null;

      try {
        parser =
            XContentFactory.xContent(request.templateSource())
                .createParser(request.templateSource());
        template = TemplateQueryParser.parse(parser, "params", "template");

        if (template.getType() == ScriptService.ScriptType.INLINE) {
          // Try to double parse for nested template id/file
          parser = null;
          try {
            ExecutableScript executable =
                this.scriptService.executable(template, ScriptContext.Standard.SEARCH);
            processedQuery = (BytesReference) executable.run();
            parser = XContentFactory.xContent(processedQuery).createParser(processedQuery);
          } catch (ElasticsearchParseException epe) {
            // This was a non-nested template; the parse failure was due to that, so for
            // backwards compatibility it is safe to assume it refers to a file and keep going
            template =
                new Template(
                    template.getScript(),
                    ScriptService.ScriptType.FILE,
                    MustacheScriptEngineService.NAME,
                    null,
                    template.getParams());
            ExecutableScript executable =
                this.scriptService.executable(template, ScriptContext.Standard.SEARCH);
            processedQuery = (BytesReference) executable.run();
          }
          if (parser != null) {
            try {
              Template innerTemplate = TemplateQueryParser.parse(parser);
              if (hasLength(innerTemplate.getScript())
                  && !innerTemplate.getType().equals(ScriptService.ScriptType.INLINE)) {
                // An inner template referring to a filename or id
                template =
                    new Template(
                        innerTemplate.getScript(),
                        innerTemplate.getType(),
                        MustacheScriptEngineService.NAME,
                        null,
                        template.getParams());
                ExecutableScript executable =
                    this.scriptService.executable(template, ScriptContext.Standard.SEARCH);
                processedQuery = (BytesReference) executable.run();
              }
            } catch (ScriptParseException e) {
              // No inner template found, use original template from above
            }
          }
        } else {
          ExecutableScript executable =
              this.scriptService.executable(template, ScriptContext.Standard.SEARCH);
          processedQuery = (BytesReference) executable.run();
        }
      } catch (IOException e) {
        throw new ElasticsearchParseException("Failed to parse template", e);
      } finally {
        Releasables.closeWhileHandlingException(parser);
      }

      if (!hasLength(template.getScript())) {
        throw new ElasticsearchParseException("Template must have [template] field configured");
      }
    }
    request.source(processedQuery);
  }

  private void parseSource(SearchContext context, BytesReference source)
      throws SearchParseException {
    // nothing to parse...
    if (source == null || source.length() == 0) {
      return;
    }
    XContentParser parser = null;
    try {
      parser = XContentFactory.xContent(source).createParser(source);
      XContentParser.Token token;
      token = parser.nextToken();
      if (token != XContentParser.Token.START_OBJECT) {
        throw new ElasticsearchParseException(
            "Expected START_OBJECT but got " + token.name() + " " + parser.currentName());
      }
      while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
        if (token == XContentParser.Token.FIELD_NAME) {
          String fieldName = parser.currentName();
          parser.nextToken();
          SearchParseElement element = elementParsers.get(fieldName);
          if (element == null) {
            throw new SearchParseException(
                context, "No parser for element [" + fieldName + "]", parser.getTokenLocation());
          }
          element.parse(parser, context);
        } else {
          if (token == null) {
            throw new ElasticsearchParseException(
                "End of query source reached but query is not complete.");
          } else {
            throw new ElasticsearchParseException(
                "Expected field name but got "
                    + token.name()
                    + " \""
                    + parser.currentName()
                    + "\"");
          }
        }
      }
    } catch (Throwable e) {
      String sSource = "_na_";
      try {
        sSource = XContentHelper.convertToJson(source, false);
      } catch (Throwable e1) {
        // ignore
      }
      throw new SearchParseException(
          context, "Failed to parse source [" + sSource + "]", parser.getTokenLocation(), e);
    } finally {
      if (parser != null) {
        parser.close();
      }
    }
  }

  private static final int[] EMPTY_DOC_IDS = new int[0];

  /**
   * Shortcut the doc ids to load: we load only from "from" and up to "size". The phase controller
   * handles this as well, since the result is always size * shards for QUERY_AND_FETCH
   */
  private void shortcutDocIdsToLoad(SearchContext context) {
    if (context.request().scroll() != null) {
      TopDocs topDocs = context.queryResult().topDocs();
      int[] docIdsToLoad = new int[topDocs.scoreDocs.length];
      for (int i = 0; i < topDocs.scoreDocs.length; i++) {
        docIdsToLoad[i] = topDocs.scoreDocs[i].doc;
      }
      context.docIdsToLoad(docIdsToLoad, 0, docIdsToLoad.length);
    } else {
      TopDocs topDocs = context.queryResult().topDocs();
      if (topDocs.scoreDocs.length < context.from()) {
        // no more docs...
        context.docIdsToLoad(EMPTY_DOC_IDS, 0, 0);
        return;
      }
      int totalSize = context.from() + context.size();
      int[] docIdsToLoad =
          new int[Math.min(topDocs.scoreDocs.length - context.from(), context.size())];
      int counter = 0;
      for (int i = context.from(); i < totalSize; i++) {
        if (i < topDocs.scoreDocs.length) {
          docIdsToLoad[counter] = topDocs.scoreDocs[i].doc;
        } else {
          break;
        }
        counter++;
      }
      context.docIdsToLoad(docIdsToLoad, 0, counter);
    }
  }

  private void shortcutDocIdsToLoadForScanning(SearchContext context) {
    TopDocs topDocs = context.queryResult().topDocs();
    if (topDocs.scoreDocs.length == 0) {
      // no more docs...
      context.docIdsToLoad(EMPTY_DOC_IDS, 0, 0);
      return;
    }
    int[] docIdsToLoad = new int[topDocs.scoreDocs.length];
    for (int i = 0; i < docIdsToLoad.length; i++) {
      docIdsToLoad[i] = topDocs.scoreDocs[i].doc;
    }
    context.docIdsToLoad(docIdsToLoad, 0, docIdsToLoad.length);
  }

  private void processScroll(InternalScrollSearchRequest request, SearchContext context) {
    // process scroll
    context.from(context.from() + context.size());
    context.scroll(request.scroll());
    // update the context keep alive based on the new scroll value
    if (request.scroll() != null && request.scroll().keepAlive() != null) {
      context.keepAlive(request.scroll().keepAlive().millis());
    }
  }

  /** Returns the number of active contexts in this SearchService */
  public int getActiveContexts() {
    return this.activeContexts.size();
  }

  static class NormsWarmer extends IndicesWarmer.Listener {

    @Override
    public TerminationHandle warmNewReaders(
        final IndexShard indexShard,
        IndexMetaData indexMetaData,
        final WarmerContext context,
        ThreadPool threadPool) {
      final Loading defaultLoading =
          Loading.parse(indexMetaData.settings().get(NORMS_LOADING_KEY), Loading.LAZY);
      final MapperService mapperService = indexShard.mapperService();
      final ObjectSet<String> warmUp = new ObjectHashSet<>();
      for (DocumentMapper docMapper : mapperService.docMappers(false)) {
        for (FieldMapper fieldMapper : docMapper.mappers()) {
          final String indexName = fieldMapper.fieldType().names().indexName();
          Loading normsLoading = fieldMapper.fieldType().normsLoading();
          if (normsLoading == null) {
            normsLoading = defaultLoading;
          }
          if (fieldMapper.fieldType().indexOptions() != IndexOptions.NONE
              && !fieldMapper.fieldType().omitNorms()
              && normsLoading == Loading.EAGER) {
            warmUp.add(indexName);
          }
        }
      }

      final CountDownLatch latch = new CountDownLatch(1);
      // Norms loading may be I/O intensive but is not CPU intensive, so we execute it in a single
      // task
      threadPool
          .executor(executor())
          .execute(
              new Runnable() {
                @Override
                public void run() {
                  try {
                    for (ObjectCursor<String> stringObjectCursor : warmUp) {
                      final String indexName = stringObjectCursor.value;
                      final long start = System.nanoTime();
                      for (final LeafReaderContext ctx : context.searcher().reader().leaves()) {
                        final NumericDocValues values = ctx.reader().getNormValues(indexName);
                        if (values != null) {
                          values.get(0);
                        }
                      }
                      if (indexShard.warmerService().logger().isTraceEnabled()) {
                        indexShard
                            .warmerService()
                            .logger()
                            .trace(
                                "warmed norms for [{}], took [{}]",
                                indexName,
                                TimeValue.timeValueNanos(System.nanoTime() - start));
                      }
                    }
                  } catch (Throwable t) {
                    indexShard.warmerService().logger().warn("failed to warm-up norms", t);
                  } finally {
                    latch.countDown();
                  }
                }
              });

      return new TerminationHandle() {
        @Override
        public void awaitTermination() throws InterruptedException {
          latch.await();
        }
      };
    }

    @Override
    public TerminationHandle warmTopReader(
        IndexShard indexShard,
        IndexMetaData indexMetaData,
        WarmerContext context,
        ThreadPool threadPool) {
      return TerminationHandle.NO_WAIT;
    }
  }

  static class FieldDataWarmer extends IndicesWarmer.Listener {

    @Override
    public TerminationHandle warmNewReaders(
        final IndexShard indexShard,
        IndexMetaData indexMetaData,
        final WarmerContext context,
        ThreadPool threadPool) {
      final MapperService mapperService = indexShard.mapperService();
      final Map<String, MappedFieldType> warmUp = new HashMap<>();
      for (DocumentMapper docMapper : mapperService.docMappers(false)) {
        for (FieldMapper fieldMapper : docMapper.mappers()) {
          final FieldDataType fieldDataType = fieldMapper.fieldType().fieldDataType();
          if (fieldDataType == null) {
            continue;
          }
          if (fieldDataType.getLoading() == Loading.LAZY) {
            continue;
          }

          final String indexName = fieldMapper.fieldType().names().indexName();
          if (warmUp.containsKey(indexName)) {
            continue;
          }
          warmUp.put(indexName, fieldMapper.fieldType());
        }
      }
      final IndexFieldDataService indexFieldDataService = indexShard.indexFieldDataService();
      final Executor executor = threadPool.executor(executor());
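      // fan-out: one warm-up task per (segment, field) pair; the latch opens once every pair has
      // been loaded, which is what the returned TerminationHandle waits on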
      final CountDownLatch latch =
          new CountDownLatch(context.searcher().reader().leaves().size() * warmUp.size());
      for (final LeafReaderContext ctx : context.searcher().reader().leaves()) {
        for (final MappedFieldType fieldType : warmUp.values()) {
          executor.execute(
              new Runnable() {

                @Override
                public void run() {
                  try {
                    final long start = System.nanoTime();
                    indexFieldDataService.getForField(fieldType).load(ctx);
                    if (indexShard.warmerService().logger().isTraceEnabled()) {
                      indexShard
                          .warmerService()
                          .logger()
                          .trace(
                              "warmed fielddata for [{}], took [{}]",
                              fieldType.names().fullName(),
                              TimeValue.timeValueNanos(System.nanoTime() - start));
                    }
                  } catch (Throwable t) {
                    indexShard
                        .warmerService()
                        .logger()
                        .warn(
                            "failed to warm-up fielddata for [{}]",
                            t,
                            fieldType.names().fullName());
                  } finally {
                    latch.countDown();
                  }
                }
              });
        }
      }
      return new TerminationHandle() {
        @Override
        public void awaitTermination() throws InterruptedException {
          latch.await();
        }
      };
    }

    @Override
    public TerminationHandle warmTopReader(
        final IndexShard indexShard,
        IndexMetaData indexMetaData,
        final WarmerContext context,
        ThreadPool threadPool) {
      final MapperService mapperService = indexShard.mapperService();
      final Map<String, MappedFieldType> warmUpGlobalOrdinals = new HashMap<>();
      for (DocumentMapper docMapper : mapperService.docMappers(false)) {
        for (FieldMapper fieldMapper : docMapper.mappers()) {
          final FieldDataType fieldDataType = fieldMapper.fieldType().fieldDataType();
          if (fieldDataType == null) {
            continue;
          }
          if (fieldDataType.getLoading() != Loading.EAGER_GLOBAL_ORDINALS) {
            continue;
          }
          final String indexName = fieldMapper.fieldType().names().indexName();
          if (warmUpGlobalOrdinals.containsKey(indexName)) {
            continue;
          }
          warmUpGlobalOrdinals.put(indexName, fieldMapper.fieldType());
        }
      }
      final IndexFieldDataService indexFieldDataService = indexShard.indexFieldDataService();
      final Executor executor = threadPool.executor(executor());
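      // global ordinals are built once against the top-level reader, so the fan-out here is one
      // task per field rather than one per (segment, field) pair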
      final CountDownLatch latch = new CountDownLatch(warmUpGlobalOrdinals.size());
      for (final MappedFieldType fieldType : warmUpGlobalOrdinals.values()) {
        executor.execute(
            new Runnable() {
              @Override
              public void run() {
                try {
                  final long start = System.nanoTime();
                  IndexFieldData.Global ifd = indexFieldDataService.getForField(fieldType);
                  ifd.loadGlobal(context.reader());
                  if (indexShard.warmerService().logger().isTraceEnabled()) {
                    indexShard
                        .warmerService()
                        .logger()
                        .trace(
                            "warmed global ordinals for [{}], took [{}]",
                            fieldType.names().fullName(),
                            TimeValue.timeValueNanos(System.nanoTime() - start));
                  }
                } catch (Throwable t) {
                  indexShard
                      .warmerService()
                      .logger()
                      .warn(
                          "failed to warm-up global ordinals for [{}]",
                          t,
                          fieldType.names().fullName());
                } finally {
                  latch.countDown();
                }
              }
            });
      }
      return new TerminationHandle() {
        @Override
        public void awaitTermination() throws InterruptedException {
          latch.await();
        }
      };
    }
  }

  class SearchWarmer extends IndicesWarmer.Listener {

    @Override
    public TerminationHandle warmNewReaders(
        IndexShard indexShard,
        IndexMetaData indexMetaData,
        WarmerContext context,
        ThreadPool threadPool) {
      return internalWarm(indexShard, indexMetaData, context, threadPool, false);
    }

    @Override
    public TerminationHandle warmTopReader(
        IndexShard indexShard,
        IndexMetaData indexMetaData,
        WarmerContext context,
        ThreadPool threadPool) {
      return internalWarm(indexShard, indexMetaData, context, threadPool, true);
    }

    public TerminationHandle internalWarm(
        final IndexShard indexShard,
        final IndexMetaData indexMetaData,
        final IndicesWarmer.WarmerContext warmerContext,
        ThreadPool threadPool,
        final boolean top) {
      IndexWarmersMetaData custom = indexMetaData.custom(IndexWarmersMetaData.TYPE);
      if (custom == null) {
        return TerminationHandle.NO_WAIT;
      }
      final Executor executor = threadPool.executor(executor());
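      // one task per configured warmer entry; the returned TerminationHandle waits on the latch
      // until every entry has either executed or been skipped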
      final CountDownLatch latch = new CountDownLatch(custom.entries().size());
      for (final IndexWarmersMetaData.Entry entry : custom.entries()) {
        executor.execute(
            new Runnable() {

              @Override
              public void run() {
                SearchContext context = null;
                try {
                  long now = System.nanoTime();
                  ShardSearchRequest request =
                      new ShardSearchLocalRequest(
                          indexShard.shardId(),
                          indexMetaData.numberOfShards(),
                          SearchType.QUERY_THEN_FETCH,
                          entry.source(),
                          entry.types(),
                          entry.queryCache());
                  context = createContext(request, warmerContext.searcher());
                  // if the warmer uses a sort, we must execute the query so the relevant field
                  // data gets loaded; if not, we might as well set size=0 (and cache if needed)
                  if (context.sort() == null) {
                    context.size(0);
                  }
                  boolean canCache = indicesQueryCache.canCache(request, context);
                  // Run each warmer entry exactly once: cacheable requests are only meaningful
                  // against the top-level searcher (where the query cache applies), while
                  // non-cacheable requests already ran against the new per-segment readers and
                  // need not run again on top.
                  if (canCache != top) {
                    return;
                  }
                  loadOrExecuteQueryPhase(request, context, queryPhase);
                  long took = System.nanoTime() - now;
                  if (indexShard.warmerService().logger().isTraceEnabled()) {
                    indexShard
                        .warmerService()
                        .logger()
                        .trace(
                            "warmed [{}], took [{}]", entry.name(), TimeValue.timeValueNanos(took));
                  }
                } catch (Throwable t) {
                  indexShard.warmerService().logger().warn("warmer [{}] failed", t, entry.name());
                } finally {
                  try {
                    if (context != null) {
                      freeContext(context.id());
                      cleanContext(context);
                    }
                  } finally {
                    latch.countDown();
                  }
                }
              }
            });
      }
      return new TerminationHandle() {
        @Override
        public void awaitTermination() throws InterruptedException {
          latch.await();
        }
      };
    }
  }

  class Reaper implements Runnable {
    @Override
    public void run() {
      final long time = threadPool.estimatedTimeInMillis();
      for (SearchContext context : activeContexts.values()) {
        // Use the same value for both checks since lastAccessTime can
        // be modified by another thread between checks!
        final long lastAccessTime = context.lastAccessTime();
        if (lastAccessTime == -1L) { // it's being processed or the timeout is disabled
          continue;
        }
        if (time - lastAccessTime > context.keepAlive()) {
          logger.debug(
              "freeing search context [{}], time [{}], lastAccessTime [{}], keepAlive [{}]",
              context.id(),
              time,
              lastAccessTime,
              context.keepAlive());
          freeContext(context.id());
        }
      }
    }
  }
}
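
The Reaper above evicts idle search contexts: it reads each context's lastAccessTime exactly once and frees any context that has been idle longer than its keep-alive. Below is a minimal standalone sketch of the same idle-eviction pattern, using plain java.util.concurrent scheduling instead of the Elasticsearch ThreadPool; all names in it are illustrative, not the original API.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

// Minimal sketch of the Reaper's idle-eviction pattern (illustrative names, not the ES API).
class IdleReaperSketch {
  static final class Context {
    volatile long lastAccessMillis = System.currentTimeMillis();
    final long keepAliveMillis = 5_000;
  }

  public static void main(String[] args) {
    Map<Long, Context> activeContexts = new ConcurrentHashMap<>();
    activeContexts.put(1L, new Context());

    ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
    scheduler.scheduleWithFixedDelay(() -> {
      long now = System.currentTimeMillis();
      for (Map.Entry<Long, Context> entry : activeContexts.entrySet()) {
        // read lastAccessMillis once, as the original comment advises: another thread may
        // touch the context between two reads, and we must not evict on stale data
        long lastAccess = entry.getValue().lastAccessMillis;
        if (now - lastAccess > entry.getValue().keepAliveMillis) {
          activeContexts.remove(entry.getKey());
        }
      }
    }, 1, 1, TimeUnit.SECONDS);
  }
}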
Example #7
/**
 * Each shard will have a percolator registry even if there isn't a {@link
 * PercolatorService#TYPE_NAME} document type in the index. For shards with indices that have no
 * {@link PercolatorService#TYPE_NAME} document type, this will hold no percolate queries.
 *
 * <p>Once a document type has been created, the real-time percolator will start to listen to write
 * events and update this registry with queries in real time.
 */
public class PercolatorQueriesRegistry extends AbstractIndexShardComponent {

  // This is a shard-level service, but the fields below are index-level services:
  private final IndexQueryParserService queryParserService;
  private final MapperService mapperService;
  private final IndicesLifecycle indicesLifecycle;
  private final IndexCache indexCache;
  private final IndexFieldDataService indexFieldDataService;

  private final ShardIndexingService indexingService;
  private final ShardPercolateService shardPercolateService;

  private final ConcurrentMap<HashedBytesRef, Query> percolateQueries =
      ConcurrentCollections.newConcurrentMapWithAggressiveConcurrency();
  private final ShardLifecycleListener shardLifecycleListener = new ShardLifecycleListener();
  private final RealTimePercolatorOperationListener realTimePercolatorOperationListener =
      new RealTimePercolatorOperationListener();
  private final PercolateTypeListener percolateTypeListener = new PercolateTypeListener();
  private final AtomicBoolean realTimePercolatorEnabled = new AtomicBoolean(false);

  @Inject
  public PercolatorQueriesRegistry(
      ShardId shardId,
      @IndexSettings Settings indexSettings,
      IndexQueryParserService queryParserService,
      ShardIndexingService indexingService,
      IndicesLifecycle indicesLifecycle,
      MapperService mapperService,
      IndexCache indexCache,
      IndexFieldDataService indexFieldDataService,
      ShardPercolateService shardPercolateService) {
    super(shardId, indexSettings);
    this.queryParserService = queryParserService;
    this.mapperService = mapperService;
    this.indicesLifecycle = indicesLifecycle;
    this.indexingService = indexingService;
    this.indexCache = indexCache;
    this.indexFieldDataService = indexFieldDataService;
    this.shardPercolateService = shardPercolateService;

    indicesLifecycle.addListener(shardLifecycleListener);
    mapperService.addTypeListener(percolateTypeListener);
  }

  public ConcurrentMap<HashedBytesRef, Query> percolateQueries() {
    return percolateQueries;
  }

  public void close() {
    mapperService.removeTypeListener(percolateTypeListener);
    indicesLifecycle.removeListener(shardLifecycleListener);
    indexingService.removeListener(realTimePercolatorOperationListener);
    clear();
  }

  public void clear() {
    percolateQueries.clear();
  }

  void enableRealTimePercolator() {
    if (realTimePercolatorEnabled.compareAndSet(false, true)) {
      indexingService.addListener(realTimePercolatorOperationListener);
    }
  }

  void disableRealTimePercolator() {
    if (realTimePercolatorEnabled.compareAndSet(true, false)) {
      indexingService.removeListener(realTimePercolatorOperationListener);
    }
  }

  public void addPercolateQuery(String idAsString, BytesReference source) {
    Query newQuery = parsePercolatorDocument(idAsString, source);
    HashedBytesRef id = new HashedBytesRef(new BytesRef(idAsString));
    Query previousQuery = percolateQueries.put(id, newQuery);
    shardPercolateService.addedQuery(id, previousQuery, newQuery);
  }

  public void removePercolateQuery(String idAsString) {
    HashedBytesRef id = new HashedBytesRef(idAsString);
    Query query = percolateQueries.remove(id);
    if (query != null) {
      shardPercolateService.removedQuery(id, query);
    }
  }

  Query parsePercolatorDocument(String id, BytesReference source) {
    String type = null;
    BytesReference querySource = null;

    XContentParser parser = null;
    try {
      parser = XContentHelper.createParser(source);
      String currentFieldName = null;
      XContentParser.Token token = parser.nextToken(); // advance to the START_OBJECT token
      if (token != XContentParser.Token.START_OBJECT) {
        throw new ElasticsearchException(
            "failed to parse query [" + id + "], not starting with OBJECT");
      }
      while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
        if (token == XContentParser.Token.FIELD_NAME) {
          currentFieldName = parser.currentName();
        } else if (token == XContentParser.Token.START_OBJECT) {
          if ("query".equals(currentFieldName)) {
            if (type != null) {
              return parseQuery(type, null, parser);
            } else {
              XContentBuilder builder = XContentFactory.contentBuilder(parser.contentType());
              builder.copyCurrentStructure(parser);
              querySource = builder.bytes();
              builder.close();
            }
          } else {
            parser.skipChildren();
          }
        } else if (token == XContentParser.Token.START_ARRAY) {
          parser.skipChildren();
        } else if (token.isValue()) {
          if ("type".equals(currentFieldName)) {
            type = parser.text();
          }
        }
      }
      return parseQuery(type, querySource, null);
    } catch (Exception e) {
      throw new PercolatorException(shardId().index(), "failed to parse query [" + id + "]", e);
    } finally {
      if (parser != null) {
        parser.close();
      }
    }
  }

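  // Parses the percolator query; when the document declared a type, the parse context is
  // temporarily scoped to it so type-dependent constructs resolve correctly, and the previous
  // types are always restored.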
  private Query parseQuery(String type, BytesReference querySource, XContentParser parser) {
    if (type == null) {
      if (parser != null) {
        return queryParserService.parse(parser).query();
      } else {
        return queryParserService.parse(querySource).query();
      }
    }

    String[] previousTypes = QueryParseContext.setTypesWithPrevious(new String[] {type});
    try {
      if (parser != null) {
        return queryParserService.parse(parser).query();
      } else {
        return queryParserService.parse(querySource).query();
      }
    } finally {
      QueryParseContext.setTypes(previousTypes);
    }
  }

  private class PercolateTypeListener implements DocumentTypeListener {

    @Override
    public void beforeCreate(DocumentMapper mapper) {
      if (PercolatorService.TYPE_NAME.equals(mapper.type())) {
        enableRealTimePercolator();
      }
    }

    @Override
    public void afterRemove(DocumentMapper mapper) {
      if (PercolatorService.TYPE_NAME.equals(mapper.type())) {
        disableRealTimePercolator();
        clear();
      }
    }
  }

  private class ShardLifecycleListener extends IndicesLifecycle.Listener {

    @Override
    public void afterIndexShardCreated(IndexShard indexShard) {
      if (hasPercolatorType(indexShard)) {
        enableRealTimePercolator();
      }
    }

    @Override
    public void afterIndexShardPostRecovery(IndexShard indexShard) {
      if (hasPercolatorType(indexShard)) {
        // the percolator index has started; fetch what queries we can from it
        // and initialize this registry
        logger.debug(
            "loading percolator queries for index [{}] and shard[{}]...",
            shardId.index(),
            shardId.id());
        loadQueries(indexShard);
        logger.trace(
            "done loading percolator queries for index [{}] and shard[{}]",
            shardId.index(),
            shardId.id());
      }
    }

    private boolean hasPercolatorType(IndexShard indexShard) {
      ShardId otherShardId = indexShard.shardId();
      return shardId.equals(otherShardId) && mapperService.hasMapping(PercolatorService.TYPE_NAME);
    }

    private void loadQueries(IndexShard shard) {
      try {
        shard.refresh(new Engine.Refresh("percolator_load_queries").force(true));
        // Maybe add a load mode? This isn't really a write, but we need WRITE because state=post_recovery
        Engine.Searcher searcher =
            shard.acquireSearcher("percolator_load_queries", IndexShard.Mode.WRITE);
        try {
          Query query =
              new XConstantScoreQuery(
                  indexCache
                      .filter()
                      .cache(
                          new TermFilter(
                              new Term(TypeFieldMapper.NAME, PercolatorService.TYPE_NAME))));
          QueriesLoaderCollector queryCollector =
              new QueriesLoaderCollector(
                  PercolatorQueriesRegistry.this, logger, mapperService, indexFieldDataService);
          searcher.searcher().search(query, queryCollector);
          Map<HashedBytesRef, Query> queries = queryCollector.queries();
          for (Map.Entry<HashedBytesRef, Query> entry : queries.entrySet()) {
            Query previousQuery = percolateQueries.put(entry.getKey(), entry.getValue());
            shardPercolateService.addedQuery(entry.getKey(), previousQuery, entry.getValue());
          }
        } finally {
          searcher.release();
        }
      } catch (Exception e) {
        throw new PercolatorException(
            shardId.index(), "failed to load queries from percolator index", e);
      }
    }
  }

  private class RealTimePercolatorOperationListener extends IndexingOperationListener {

    @Override
    public Engine.Create preCreate(Engine.Create create) {
      // validate the query here, before we index
      if (PercolatorService.TYPE_NAME.equals(create.type())) {
        parsePercolatorDocument(create.id(), create.source());
      }
      return create;
    }

    @Override
    public void postCreateUnderLock(Engine.Create create) {
      // add the query under a doc lock
      if (PercolatorService.TYPE_NAME.equals(create.type())) {
        addPercolateQuery(create.id(), create.source());
      }
    }

    @Override
    public Engine.Index preIndex(Engine.Index index) {
      // validate the query here, before we index
      if (PercolatorService.TYPE_NAME.equals(index.type())) {
        parsePercolatorDocument(index.id(), index.source());
      }
      return index;
    }

    @Override
    public void postIndexUnderLock(Engine.Index index) {
      // add the query under a doc lock
      if (PercolatorService.TYPE_NAME.equals(index.type())) {
        addPercolateQuery(index.id(), index.source());
      }
    }

    @Override
    public void postDeleteUnderLock(Engine.Delete delete) {
      // remove the query under a lock
      if (PercolatorService.TYPE_NAME.equals(delete.type())) {
        removePercolateQuery(delete.id());
      }
    }

    // Updating the live percolate queries for a delete-by-query is tricky with the way
    // delete-by-query is currently handled: it is only possible if we put a big lock around the
    // post-delete-by-query hook...

    // If we implement delete-by-query as a query that generates delete operations in a bulk,
    // then updating the live percolator is automatically supported for delete-by-query.
    //        @Override
    //        public void postDeleteByQuery(Engine.DeleteByQuery deleteByQuery) {
    //        }
  }
}
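
As a reading aid, here is a hedged sketch of the document shape that parsePercolatorDocument accepts; the helper name and JSON body are illustrative, not part of the original class.

// Hedged sketch (illustrative names): only the optional "type" field and the "query" object are
// consumed by parsePercolatorDocument; any other object or array is skipped via skipChildren().
static BytesReference examplePercolatorSource() {
  String json = "{\"type\": \"my_type\", \"query\": {\"term\": {\"field\": \"value\"}}}";
  return new BytesArray(json); // org.elasticsearch.common.bytes.BytesArray
}
// usage (assuming a registry instance): registry.addPercolateQuery("query-1", examplePercolatorSource());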
  final class MockTransport implements Transport {

    Set<DiscoveryNode> connectedNodes = ConcurrentCollections.newConcurrentSet();
    volatile boolean randomConnectionExceptions = false;

    @Override
    public void transportServiceAdapter(TransportServiceAdapter service) {}

    @Override
    public BoundTransportAddress boundAddress() {
      return null;
    }

    @Override
    public Map<String, BoundTransportAddress> profileBoundAddresses() {
      return null;
    }

    @Override
    public TransportAddress[] addressesFromString(String address, int perAddressLimit)
        throws UnknownHostException {
      return new TransportAddress[0];
    }

    @Override
    public boolean nodeConnected(DiscoveryNode node) {
      return connectedNodes.contains(node);
    }

    @Override
    public void connectToNode(DiscoveryNode node, ConnectionProfile connectionProfile)
        throws ConnectTransportException {
      if (connectionProfile == null) {
        if (connectedNodes.contains(node) == false
            && randomConnectionExceptions
            && randomBoolean()) {
          throw new ConnectTransportException(node, "simulated");
        }
        connectedNodes.add(node);
      }
    }

    @Override
    public void disconnectFromNode(DiscoveryNode node) {
      connectedNodes.remove(node);
    }

    @Override
    public Connection getConnection(DiscoveryNode node) {
      return new Connection() {
        @Override
        public DiscoveryNode getNode() {
          return node;
        }

        @Override
        public void sendRequest(
            long requestId,
            String action,
            TransportRequest request,
            TransportRequestOptions options)
            throws IOException, TransportException {}

        @Override
        public void close() throws IOException {}
      };
    }

    @Override
    public Connection openConnection(DiscoveryNode node, ConnectionProfile profile)
        throws IOException {
      return getConnection(node);
    }

    @Override
    public long serverOpen() {
      return 0;
    }

    @Override
    public List<String> getLocalAddresses() {
      return null;
    }

    @Override
    public Lifecycle.State lifecycleState() {
      return null;
    }

    @Override
    public void addLifecycleListener(LifecycleListener listener) {}

    @Override
    public void removeLifecycleListener(LifecycleListener listener) {}

    @Override
    public void start() {}

    @Override
    public void stop() {}

    @Override
    public void close() {}
  }
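
A hedged sketch of how a test might exercise MockTransport's simulated-failure path; `node` is assumed to be a DiscoveryNode built elsewhere in the test.

// Hedged test sketch: with randomConnectionExceptions enabled, a first-time connectToNode call
// with a null ConnectionProfile fails roughly half the time with a simulated exception.
MockTransport transport = new MockTransport();
transport.randomConnectionExceptions = true;
try {
  transport.connectToNode(node, null); // only the null-profile path is tracked by this mock
} catch (ConnectTransportException e) {
  // expected intermittently; a test would typically retry or assert on the failure
}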
public class IndicesClusterStateService
    extends AbstractLifecycleComponent<IndicesClusterStateService> implements ClusterStateListener {

  private final IndicesService indicesService;
  private final ClusterService clusterService;
  private final ThreadPool threadPool;
  private final RecoveryTarget recoveryTarget;
  private final ShardStateAction shardStateAction;
  private final NodeIndexDeletedAction nodeIndexDeletedAction;
  private final NodeMappingRefreshAction nodeMappingRefreshAction;

  // a map of mapping types we have seen per index via cluster state updates;
  // we need this so we won't remove types automatically created as part of the indexing process
  private final ConcurrentMap<Tuple<String, String>, Boolean> seenMappings =
      ConcurrentCollections.newConcurrentMap();

  // a list of shards that failed during recovery
  // we keep track of these shards in order to prevent repeated recovery of these shards on each
  // cluster state update
  private final ConcurrentMap<ShardId, FailedShard> failedShards =
      ConcurrentCollections.newConcurrentMap();

  static class FailedShard {
    public final long version;
    public final long timestamp;

    FailedShard(long version) {
      this.version = version;
      this.timestamp = System.currentTimeMillis();
    }
  }

  private final Object mutex = new Object();
  private final FailedEngineHandler failedEngineHandler = new FailedEngineHandler();

  private final boolean sendRefreshMapping;

  @Inject
  public IndicesClusterStateService(
      Settings settings,
      IndicesService indicesService,
      ClusterService clusterService,
      ThreadPool threadPool,
      RecoveryTarget recoveryTarget,
      ShardStateAction shardStateAction,
      NodeIndexDeletedAction nodeIndexDeletedAction,
      NodeMappingRefreshAction nodeMappingRefreshAction) {
    super(settings);
    this.indicesService = indicesService;
    this.clusterService = clusterService;
    this.threadPool = threadPool;
    this.recoveryTarget = recoveryTarget;
    this.shardStateAction = shardStateAction;
    this.nodeIndexDeletedAction = nodeIndexDeletedAction;
    this.nodeMappingRefreshAction = nodeMappingRefreshAction;

    this.sendRefreshMapping =
        this.settings.getAsBoolean("indices.cluster.send_refresh_mapping", true);
  }

  @Override
  protected void doStart() {
    clusterService.addFirst(this);
  }

  @Override
  protected void doStop() {
    clusterService.remove(this);
  }

  @Override
  protected void doClose() {}

  @Override
  public void clusterChanged(final ClusterChangedEvent event) {
    if (!indicesService.changesAllowed()) {
      return;
    }

    if (!lifecycle.started()) {
      return;
    }

    synchronized (mutex) {
      // we need to clean the shards and indices we have on this node, since we
      // are going to recover them again once state persistence is disabled (no master / not
      // recovered)
      // TODO: this feels a bit hacky here, a block disables state persistence, and then we clean
      // the allocated shards, maybe another flag in blocks?
      if (event.state().blocks().disableStatePersistence()) {
        for (IndexService indexService : indicesService) {
          String index = indexService.index().getName();
          for (Integer shardId : indexService.shardIds()) {
            logger.debug("[{}][{}] removing shard (disabled block persistence)", index, shardId);
            try {
              indexService.removeShard(shardId, "removing shard (disabled block persistence)");
            } catch (Throwable e) {
              logger.warn("[{}] failed to remove shard (disabled block persistence)", e, index);
            }
          }
          removeIndex(index, "cleaning index (disabled block persistence)");
        }
        return;
      }

      cleanFailedShards(event);

      applyDeletedIndices(event);
      applyNewIndices(event);
      applyMappings(event);
      applyAliases(event);
      applyNewOrUpdatedShards(event);
      applyDeletedShards(event);
      applyCleanedIndices(event);
      applySettings(event);
    }
  }

  private void applyCleanedIndices(final ClusterChangedEvent event) {
    // handle closed indices, since they are not allocated on a node once they are closed
    // so applyDeletedIndices might not take them into account
    for (IndexService indexService : indicesService) {
      String index = indexService.index().getName();
      IndexMetaData indexMetaData = event.state().metaData().index(index);
      if (indexMetaData != null && indexMetaData.state() == IndexMetaData.State.CLOSE) {
        for (Integer shardId : indexService.shardIds()) {
          logger.debug("[{}][{}] removing shard (index is closed)", index, shardId);
          try {
            indexService.removeShard(shardId, "removing shard (index is closed)");
          } catch (Throwable e) {
            logger.warn("[{}] failed to remove shard (index is closed)", e, index);
          }
        }
      }
    }
    for (IndexService indexService : indicesService) {
      String index = indexService.index().getName();
      if (indexService.shardIds().isEmpty()) {
        if (logger.isDebugEnabled()) {
          logger.debug("[{}] cleaning index (no shards allocated)", index);
        }
        // clean the index
        removeIndex(index, "removing index (no shards allocated)");
      }
    }
  }

  private void applyDeletedIndices(final ClusterChangedEvent event) {
    final ClusterState previousState = event.previousState();
    final String localNodeId = event.state().nodes().localNodeId();
    assert localNodeId != null;

    for (IndexService indexService : indicesService) {
      IndexMetaData indexMetaData = event.state().metaData().index(indexService.index().name());
      if (indexMetaData != null) {
        if (!indexMetaData.isSameUUID(indexService.indexUUID())) {
          logger.debug(
              "[{}] mismatch on index UUIDs between cluster state and local state, cleaning the index so it will be recreated",
              indexMetaData.index());
          deleteIndex(
              indexMetaData.index(),
              "mismatch on index UUIDs between cluster state and local state, cleaning the index so it will be recreated");
        }
      }
    }

    for (String index : event.indicesDeleted()) {
      if (logger.isDebugEnabled()) {
        logger.debug("[{}] cleaning index, no longer part of the metadata", index);
      }
      final Settings indexSettings;
      final IndexService idxService = indicesService.indexService(index);
      if (idxService != null) {
        indexSettings = idxService.getIndexSettings();
        deleteIndex(index, "index no longer part of the metadata");
      } else {
        final IndexMetaData metaData = previousState.metaData().index(index);
        assert metaData != null;
        indexSettings = metaData.settings();
        indicesService.deleteClosedIndex(
            "closed index no longer part of the metadata", metaData, event.state());
      }
      try {
        nodeIndexDeletedAction.nodeIndexDeleted(event.state(), index, indexSettings, localNodeId);
      } catch (Throwable e) {
        logger.debug("failed to send to master index {} deleted event", e, index);
      }
    }
  }

  private void applyDeletedShards(final ClusterChangedEvent event) {
    RoutingNodes.RoutingNodeIterator routingNode =
        event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId());
    if (routingNode == null) {
      return;
    }
    IntHashSet newShardIds = new IntHashSet();
    for (IndexService indexService : indicesService) {
      String index = indexService.index().name();
      IndexMetaData indexMetaData = event.state().metaData().index(index);
      if (indexMetaData == null) {
        continue;
      }
      // now, go over and delete shards that need to be deleted
      newShardIds.clear();
      for (ShardRouting shard : routingNode) {
        if (shard.index().equals(index)) {
          newShardIds.add(shard.id());
        }
      }
      for (Integer existingShardId : indexService.shardIds()) {
        if (!newShardIds.contains(existingShardId)) {
          if (indexMetaData.state() == IndexMetaData.State.CLOSE) {
            if (logger.isDebugEnabled()) {
              logger.debug("[{}][{}] removing shard (index is closed)", index, existingShardId);
            }
            indexService.removeShard(existingShardId, "removing shard (index is closed)");
          } else {
            // we can just remove the shard, without cleaning it locally, since we will clean it
            // when all shards are allocated in the IndicesStore
            if (logger.isDebugEnabled()) {
              logger.debug("[{}][{}] removing shard (not allocated)", index, existingShardId);
            }
            indexService.removeShard(existingShardId, "removing shard (not allocated)");
          }
        }
      }
    }
  }

  private void applyNewIndices(final ClusterChangedEvent event) {
    // we only create indices for shards that are allocated
    RoutingNodes.RoutingNodeIterator routingNode =
        event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId());
    if (routingNode == null) {
      return;
    }
    for (ShardRouting shard : routingNode) {
      if (!indicesService.hasIndex(shard.index())) {
        final IndexMetaData indexMetaData = event.state().metaData().index(shard.index());
        if (logger.isDebugEnabled()) {
          logger.debug("[{}] creating index", indexMetaData.index());
        }
        try {
          indicesService.createIndex(
              indexMetaData.index(),
              indexMetaData.settings(),
              event.state().nodes().localNode().id());
        } catch (Throwable e) {
          sendFailShard(shard, indexMetaData.getIndexUUID(), "failed to create index", e);
        }
      }
    }
  }

  private void applySettings(ClusterChangedEvent event) {
    if (!event.metaDataChanged()) {
      return;
    }
    for (IndexMetaData indexMetaData : event.state().metaData()) {
      if (!indicesService.hasIndex(indexMetaData.index())) {
        // we only create / update here
        continue;
      }
      // if the index meta data didn't change, no need to check for refreshed settings
      if (!event.indexMetaDataChanged(indexMetaData)) {
        continue;
      }
      String index = indexMetaData.index();
      IndexService indexService = indicesService.indexService(index);
      if (indexService == null) {
        // already deleted on us, ignore it
        continue;
      }
      IndexSettingsService indexSettingsService =
          indexService.injector().getInstance(IndexSettingsService.class);
      indexSettingsService.refreshSettings(indexMetaData.settings());
    }
  }

  private void applyMappings(ClusterChangedEvent event) {
    // go over and update mappings
    for (IndexMetaData indexMetaData : event.state().metaData()) {
      if (!indicesService.hasIndex(indexMetaData.index())) {
        // we only create / update here
        continue;
      }
      List<String> typesToRefresh = Lists.newArrayList();
      String index = indexMetaData.index();
      IndexService indexService = indicesService.indexService(index);
      if (indexService == null) {
        // got deleted on us, ignore (closing the node)
        return;
      }
      try {
        MapperService mapperService = indexService.mapperService();
        // first, go over and update the _default_ mapping (if exists)
        if (indexMetaData.mappings().containsKey(MapperService.DEFAULT_MAPPING)) {
          boolean requireRefresh =
              processMapping(
                  index,
                  mapperService,
                  MapperService.DEFAULT_MAPPING,
                  indexMetaData.mapping(MapperService.DEFAULT_MAPPING).source());
          if (requireRefresh) {
            typesToRefresh.add(MapperService.DEFAULT_MAPPING);
          }
        }

        // go over and add the relevant mappings (or update them)
        for (ObjectCursor<MappingMetaData> cursor : indexMetaData.mappings().values()) {
          MappingMetaData mappingMd = cursor.value;
          String mappingType = mappingMd.type();
          CompressedXContent mappingSource = mappingMd.source();
          if (mappingType.equals(MapperService.DEFAULT_MAPPING)) { // we processed _default_ first
            continue;
          }
          boolean requireRefresh = processMapping(index, mapperService, mappingType, mappingSource);
          if (requireRefresh) {
            typesToRefresh.add(mappingType);
          }
        }
        if (!typesToRefresh.isEmpty() && sendRefreshMapping) {
          nodeMappingRefreshAction.nodeMappingRefresh(
              event.state(),
              new NodeMappingRefreshAction.NodeMappingRefreshRequest(
                  index,
                  indexMetaData.indexUUID(),
                  typesToRefresh.toArray(new String[typesToRefresh.size()]),
                  event.state().nodes().localNodeId()));
        }
      } catch (Throwable t) {
        // if we failed the mappings anywhere, we need to fail the shards for this index; note, we
        // safeguard by processing the mappings on the master, or on the node the mapping was
        // introduced on, so this failure typically means a wrong node-level configuration or
        // something similar
        for (IndexShard indexShard : indexService) {
          ShardRouting shardRouting = indexShard.routingEntry();
          failAndRemoveShard(shardRouting, indexService, true, "failed to update mappings", t);
        }
      }
    }
  }

  private boolean processMapping(
      String index,
      MapperService mapperService,
      String mappingType,
      CompressedXContent mappingSource)
      throws Throwable {
    // putIfAbsent makes the check-and-insert atomic on the concurrent map
    seenMappings.putIfAbsent(new Tuple<>(index, mappingType), true);

    // a refresh-mapping can happen for two reasons. The first is less urgent: the mapping on
    // this node is ahead of what is in the cluster state (an update-mapping has already been
    // sent, it just hasn't been processed and published yet). Eventually the mappings converge,
    // and the refresh mapping sent is more of a safekeeping (in case the update-mapping failed
    // to reach the master, ...). The second case is where parsing/merging the mapping from the
    // metadata doesn't result in the same mapping; in that case we ask the master to refresh its
    // own version of the mappings (to conform with the merged version, which it does when
    // refreshing the mappings), and warn-log it.
    boolean requiresRefresh = false;
    try {
      if (!mapperService.hasMapping(mappingType)) {
        if (logger.isDebugEnabled() && mappingSource.compressed().length < 512) {
          logger.debug(
              "[{}] adding mapping [{}], source [{}]", index, mappingType, mappingSource.string());
        } else if (logger.isTraceEnabled()) {
          logger.trace(
              "[{}] adding mapping [{}], source [{}]", index, mappingType, mappingSource.string());
        } else {
          logger.debug(
              "[{}] adding mapping [{}] (source suppressed due to length, use TRACE level if needed)",
              index,
              mappingType);
        }
        // we don't apply default, since it has been applied when the mappings were parsed initially
        mapperService.merge(mappingType, mappingSource, false, true);
        if (!mapperService.documentMapper(mappingType).mappingSource().equals(mappingSource)) {
          logger.debug(
              "[{}] parsed mapping [{}], and got different sources\noriginal:\n{}\nparsed:\n{}",
              index,
              mappingType,
              mappingSource,
              mapperService.documentMapper(mappingType).mappingSource());
          requiresRefresh = true;
        }
      } else {
        DocumentMapper existingMapper = mapperService.documentMapper(mappingType);
        if (!mappingSource.equals(existingMapper.mappingSource())) {
          // mapping changed, update it
          if (logger.isDebugEnabled() && mappingSource.compressed().length < 512) {
            logger.debug(
                "[{}] updating mapping [{}], source [{}]",
                index,
                mappingType,
                mappingSource.string());
          } else if (logger.isTraceEnabled()) {
            logger.trace(
                "[{}] updating mapping [{}], source [{}]",
                index,
                mappingType,
                mappingSource.string());
          } else {
            logger.debug(
                "[{}] updating mapping [{}] (source suppressed due to length, use TRACE level if needed)",
                index,
                mappingType);
          }
          // we don't apply default, since it has been applied when the mappings were parsed
          // initially
          mapperService.merge(mappingType, mappingSource, false, true);
          if (!mapperService.documentMapper(mappingType).mappingSource().equals(mappingSource)) {
            requiresRefresh = true;
            logger.debug(
                "[{}] parsed mapping [{}], and got different sources\noriginal:\n{}\nparsed:\n{}",
                index,
                mappingType,
                mappingSource,
                mapperService.documentMapper(mappingType).mappingSource());
          }
        }
      }
    } catch (Throwable e) {
      logger.warn(
          "[{}] failed to add mapping [{}], source [{}]", e, index, mappingType, mappingSource);
      throw e;
    }
    return requiresRefresh;
  }

  private boolean aliasesChanged(ClusterChangedEvent event) {
    return !event.state().metaData().aliases().equals(event.previousState().metaData().aliases())
        || !event.state().routingTable().equals(event.previousState().routingTable());
  }

  private void applyAliases(ClusterChangedEvent event) {
    // check if aliases changed
    if (aliasesChanged(event)) {
      // go over and update aliases
      for (IndexMetaData indexMetaData : event.state().metaData()) {
        String index = indexMetaData.index();
        IndexService indexService = indicesService.indexService(index);
        if (indexService == null) {
          // we only create / update here
          continue;
        }
        IndexAliasesService indexAliasesService = indexService.aliasesService();
        indexAliasesService.setAliases(indexMetaData.getAliases());
      }
    }
  }

  private void applyNewOrUpdatedShards(final ClusterChangedEvent event) {
    if (!indicesService.changesAllowed()) {
      return;
    }

    RoutingTable routingTable = event.state().routingTable();
    RoutingNodes.RoutingNodeIterator routingNode =
        event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId());

    if (routingNode == null) {
      failedShards.clear();
      return;
    }
    DiscoveryNodes nodes = event.state().nodes();

    for (final ShardRouting shardRouting : routingNode) {
      final IndexService indexService = indicesService.indexService(shardRouting.index());
      if (indexService == null) {
        // got deleted on us, ignore
        continue;
      }
      final IndexMetaData indexMetaData = event.state().metaData().index(shardRouting.index());
      if (indexMetaData == null) {
        // the index got deleted on the metadata, we will clean it later in the apply deleted method
        // call
        continue;
      }

      final int shardId = shardRouting.id();

      if (!indexService.hasShard(shardId) && shardRouting.started()) {
        if (failedShards.containsKey(shardRouting.shardId())) {
          if (nodes.masterNode() != null) {
            shardStateAction.resendShardFailed(
                shardRouting,
                indexMetaData.getIndexUUID(),
                "master "
                    + nodes.masterNode()
                    + " marked shard as started, but shard has previous failed. resending shard failure.",
                nodes.masterNode());
          }
        } else {
          // the master thinks we are started, but we don't have this shard at all, mark it as
          // failed
          sendFailShard(
              shardRouting,
              indexMetaData.getIndexUUID(),
              "master ["
                  + nodes.masterNode()
                  + "] marked shard as started, but shard has not been created, mark shard as failed",
              null);
        }
        continue;
      }

      IndexShard indexShard = indexService.shard(shardId);
      if (indexShard != null) {
        ShardRouting currentRoutingEntry = indexShard.routingEntry();
        // if the current and global routing are both initializing but are still not the same, it's
        // a different "shard" being allocated.
        // for example: a shard that recovers from one node and now needs to recover to another
        // node, or a replica allocated and then allocating a primary because the primary failed
        // on another node
        boolean shardHasBeenRemoved = false;
        if (currentRoutingEntry.initializing()
            && shardRouting.initializing()
            && !currentRoutingEntry.equals(shardRouting)) {
          logger.debug(
              "[{}][{}] removing shard (different instance of it allocated on this node, current [{}], global [{}])",
              shardRouting.index(),
              shardRouting.id(),
              currentRoutingEntry,
              shardRouting);
          // closing the shard will also cancel any ongoing recovery.
          indexService.removeShard(
              shardRouting.id(),
              "removing shard (different instance of it allocated on this node)");
          shardHasBeenRemoved = true;
        } else if (isPeerRecovery(shardRouting)) {
          final DiscoveryNode sourceNode =
              findSourceNodeForPeerRecovery(routingTable, nodes, shardRouting);
          // check if there is an existing recovery going, and if so, and the source node is not the
          // same, cancel the recovery to restart it
          final Predicate<RecoveryStatus> shouldCancel =
              new Predicate<RecoveryStatus>() {
                @Override
                public boolean apply(@Nullable RecoveryStatus status) {
                  return status.sourceNode().equals(sourceNode) == false;
                }
              };
          if (recoveryTarget.cancelRecoveriesForShard(
              indexShard.shardId(), "recovery source node changed", shouldCancel)) {
            logger.debug(
                "[{}][{}] removing shard (recovery source changed), current [{}], global [{}])",
                shardRouting.index(),
                shardRouting.id(),
                currentRoutingEntry,
                shardRouting);
            // closing the shard will also cancel any ongoing recovery.
            indexService.removeShard(
                shardRouting.id(), "removing shard (recovery source node changed)");
            shardHasBeenRemoved = true;
          }
        }
        if (shardHasBeenRemoved == false
            && (shardRouting.equals(indexShard.routingEntry()) == false
                || shardRouting.version() > indexShard.routingEntry().version())) {
          if (shardRouting.primary()
              && indexShard.routingEntry().primary() == false
              && shardRouting.initializing()
              && indexShard.allowsPrimaryPromotion() == false) {
            logger.debug("{} reinitialize shard on primary promotion", indexShard.shardId());
            indexService.removeShard(shardId, "promoted to primary");
          } else {
            // if we happen to remove the shardRouting by id above we don't need to jump in here!
            indexShard.updateRoutingEntry(
                shardRouting, event.state().blocks().disableStatePersistence() == false);
          }
        }
      }

      if (shardRouting.initializing()) {
        applyInitializingShard(event.state(), indexMetaData, shardRouting);
      }
    }
  }

  private void cleanFailedShards(final ClusterChangedEvent event) {
    RoutingTable routingTable = event.state().routingTable();
    RoutingNodes.RoutingNodeIterator routingNode =
        event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId());
    if (routingNode == null) {
      failedShards.clear();
      return;
    }
    DiscoveryNodes nodes = event.state().nodes();
    long now = System.currentTimeMillis();
    String localNodeId = nodes.localNodeId();
    Iterator<Map.Entry<ShardId, FailedShard>> iterator = failedShards.entrySet().iterator();
    shards:
    while (iterator.hasNext()) {
      Map.Entry<ShardId, FailedShard> entry = iterator.next();
      FailedShard failedShard = entry.getValue();
      IndexRoutingTable indexRoutingTable = routingTable.index(entry.getKey().getIndex());
      if (indexRoutingTable != null) {
        IndexShardRoutingTable shardRoutingTable = indexRoutingTable.shard(entry.getKey().id());
        if (shardRoutingTable != null) {
          for (ShardRouting shardRouting : shardRoutingTable.assignedShards()) {
            if (localNodeId.equals(shardRouting.currentNodeId())) {
              // we have a timeout here just to make sure we don't keep dangling failed shards
              // around for some reason; it's just another safety layer
              if (shardRouting.version() == failedShard.version
                  && ((now - failedShard.timestamp) < TimeValue.timeValueMinutes(60).millis())) {
                // It's the same failed shard - keep it if it hasn't timed out
                continue shards;
              } else {
                // Different version or expired, remove it
                break;
              }
            }
          }
        }
      }
      iterator.remove();
    }
  }

  private void applyInitializingShard(
      final ClusterState state,
      final IndexMetaData indexMetaData,
      final ShardRouting shardRouting) {
    final IndexService indexService = indicesService.indexService(shardRouting.index());
    if (indexService == null) {
      // got deleted on us, ignore
      return;
    }
    final RoutingTable routingTable = state.routingTable();
    final DiscoveryNodes nodes = state.getNodes();
    final int shardId = shardRouting.id();

    if (indexService.hasShard(shardId)) {
      IndexShard indexShard = indexService.shardSafe(shardId);
      if (indexShard.state() == IndexShardState.STARTED
          || indexShard.state() == IndexShardState.POST_RECOVERY) {
        // the master thinks we are initializing, but we are already started or on POST_RECOVERY and
        // waiting
        // for master to confirm a shard started message (either master failover, or a cluster event
        // before
        // we managed to tell the master we started), mark us as started
        if (logger.isTraceEnabled()) {
          logger.trace(
              "{} master marked shard as initializing, but shard has state [{}], resending shard started to {}",
              indexShard.shardId(),
              indexShard.state(),
              nodes.masterNode());
        }
        if (nodes.masterNode() != null) {
          shardStateAction.shardStarted(
              shardRouting,
              indexMetaData.getIndexUUID(),
              "master "
                  + nodes.masterNode()
                  + " marked shard as initializing, but shard state is ["
                  + indexShard.state()
                  + "], mark shard as started",
              nodes.masterNode());
        }
        return;
      } else {
        if (indexShard.ignoreRecoveryAttempt()) {
          logger.trace(
              "ignoring recovery instruction for an existing shard {} (shard state: [{}])",
              indexShard.shardId(),
              indexShard.state());
          return;
        }
      }
    }

    // if we're in peer recovery, try to find out the source node now so in case it fails, we will
    // not create the index shard
    DiscoveryNode sourceNode = null;
    if (isPeerRecovery(shardRouting)) {
      sourceNode = findSourceNodeForPeerRecovery(routingTable, nodes, shardRouting);
      if (sourceNode == null) {
        logger.trace(
            "ignoring initializing shard {} - no source node can be found.",
            shardRouting.shardId());
        return;
      }
    }

    // if there is no shard, create it
    if (!indexService.hasShard(shardId)) {
      if (failedShards.containsKey(shardRouting.shardId())) {
        if (nodes.masterNode() != null) {
          shardStateAction.resendShardFailed(
              shardRouting,
              indexMetaData.getIndexUUID(),
              "master "
                  + nodes.masterNode()
                  + " marked shard as initializing, but shard is marked as failed, resend shard failure",
              nodes.masterNode());
        }
        return;
      }
      try {
        if (logger.isDebugEnabled()) {
          logger.debug("[{}][{}] creating shard", shardRouting.index(), shardId);
        }
        IndexShard indexShard = indexService.createShard(shardId, shardRouting.primary());
        indexShard.updateRoutingEntry(
            shardRouting, state.blocks().disableStatePersistence() == false);
        indexShard.addFailedEngineListener(failedEngineHandler);
      } catch (IndexShardAlreadyExistsException e) {
        // ignore this, the method call can happen several times
      } catch (Throwable e) {
        failAndRemoveShard(shardRouting, indexService, true, "failed to create shard", e);
        return;
      }
    }
    final IndexShard indexShard = indexService.shardSafe(shardId);

    if (indexShard.ignoreRecoveryAttempt()) {
      // we are already recovering (we can get to this state since the cluster event can happen
      // several
      // times while we recover)
      logger.trace(
          "ignoring recovery instruction for shard {} (shard state: [{}])",
          indexShard.shardId(),
          indexShard.state());
      return;
    }

    if (isPeerRecovery(shardRouting)) {
      try {

        assert sourceNode != null : "peer recovery started but sourceNode is null";

        // we don't mark this one as relocated at the end.
        // For primaries: requests are routed to both copies while it is relocating; that way we
        //    handle the edge case where it is marked as relocated and we might need to roll it
        //    back...
        // For replicas: we are recovering a backup from a primary
        RecoveryState.Type type =
            shardRouting.primary() ? RecoveryState.Type.RELOCATION : RecoveryState.Type.REPLICA;
        recoveryTarget.startRecovery(
            indexShard,
            type,
            sourceNode,
            new PeerRecoveryListener(shardRouting, indexService, indexMetaData));
      } catch (Throwable e) {
        indexShard.failShard("corrupted preexisting index", e);
        handleRecoveryFailure(indexService, shardRouting, true, e);
      }
    } else {
      final IndexShardRoutingTable indexShardRouting =
          routingTable.index(shardRouting.index()).shard(shardRouting.id());
      indexService
          .shard(shardId)
          .recoverFromStore(
              indexShardRouting,
              new StoreRecoveryService.RecoveryListener() {
                @Override
                public void onRecoveryDone() {
                  shardStateAction.shardStarted(
                      shardRouting, indexMetaData.getIndexUUID(), "after recovery from store");
                }

                @Override
                public void onIgnoreRecovery(String reason) {}

                @Override
                public void onRecoveryFailed(IndexShardRecoveryException e) {
                  handleRecoveryFailure(indexService, shardRouting, true, e);
                }
              });
    }
  }

  /**
   * Finds the routing source node for peer recovery, returning null if it is not found. Note, this
   * method expects the shard routing to *require* peer recovery; use {@link
   * #isPeerRecovery(org.elasticsearch.cluster.routing.ShardRouting)} to check whether it is needed.
   */
  private DiscoveryNode findSourceNodeForPeerRecovery(
      RoutingTable routingTable, DiscoveryNodes nodes, ShardRouting shardRouting) {
    DiscoveryNode sourceNode = null;
    if (!shardRouting.primary()) {
      IndexShardRoutingTable shardRoutingTable =
          routingTable.index(shardRouting.index()).shard(shardRouting.id());
      for (ShardRouting entry : shardRoutingTable) {
        if (entry.primary() && entry.active()) {
          // only recover from a started primary; if we can't find one, we will do it next round
          sourceNode = nodes.get(entry.currentNodeId());
          if (sourceNode == null) {
            logger.trace(
                "can't find replica source node because primary shard {} is assigned to an unknown node.",
                entry);
            return null;
          }
          break;
        }
      }

      if (sourceNode == null) {
        logger.trace(
            "can't find replica source node for {} because a primary shard can not be found.",
            shardRouting.shardId());
      }
    } else if (shardRouting.relocatingNodeId() != null) {
      sourceNode = nodes.get(shardRouting.relocatingNodeId());
      if (sourceNode == null) {
        logger.trace(
            "can't find relocation source node for shard {} because it is assigned to an unknown node [{}].",
            shardRouting.shardId(),
            shardRouting.relocatingNodeId());
      }
    } else {
      throw new IllegalStateException(
          "trying to find source node for peer recovery when routing state means no peer recovery: "
              + shardRouting);
    }
    return sourceNode;
  }

  private boolean isPeerRecovery(ShardRouting shardRouting) {
    return !shardRouting.primary() || shardRouting.relocatingNodeId() != null;
  }
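
  // isPeerRecovery above captures two cases: a replica copies its data from the active primary,
  // and a relocating primary copies from the node it is moving away from; a fresh,
  // non-relocating primary instead recovers locally from its own store.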

  private class PeerRecoveryListener implements RecoveryTarget.RecoveryListener {

    private final ShardRouting shardRouting;
    private final IndexService indexService;
    private final IndexMetaData indexMetaData;

    private PeerRecoveryListener(
        ShardRouting shardRouting, IndexService indexService, IndexMetaData indexMetaData) {
      this.shardRouting = shardRouting;
      this.indexService = indexService;
      this.indexMetaData = indexMetaData;
    }

    @Override
    public void onRecoveryDone(RecoveryState state) {
      shardStateAction.shardStarted(
          shardRouting,
          indexMetaData.getIndexUUID(),
          "after recovery (replica) from node [" + state.getSourceNode() + "]");
    }

    @Override
    public void onRecoveryFailure(
        RecoveryState state, RecoveryFailedException e, boolean sendShardFailure) {
      handleRecoveryFailure(indexService, shardRouting, sendShardFailure, e);
    }
  }

  private void handleRecoveryFailure(
      IndexService indexService,
      ShardRouting shardRouting,
      boolean sendShardFailure,
      Throwable failure) {
    synchronized (mutex) {
      failAndRemoveShard(shardRouting, indexService, sendShardFailure, "failed recovery", failure);
    }
  }

  private void removeIndex(String index, String reason) {
    try {
      indicesService.removeIndex(index, reason);
    } catch (Throwable e) {
      logger.warn("failed to clean index ({})", e, reason);
    }
    clearSeenMappings(index);
  }

  private void clearSeenMappings(String index) {
    // clear seen mappings as well
    for (Tuple<String, String> tuple : seenMappings.keySet()) {
      if (tuple.v1().equals(index)) {
        seenMappings.remove(tuple);
      }
    }
  }

  private void deleteIndex(String index, String reason) {
    try {
      indicesService.deleteIndex(index, reason);
    } catch (Throwable e) {
      logger.warn("failed to delete index ({})", e, reason);
    }
    // clear seen mappings as well
    clearSeenMappings(index);
  }

  private void failAndRemoveShard(
      ShardRouting shardRouting,
      IndexService indexService,
      boolean sendShardFailure,
      String message,
      @Nullable Throwable failure) {
    if (indexService.hasShard(shardRouting.getId())) {
      try {
        indexService.removeShard(shardRouting.getId(), message);
      } catch (ShardNotFoundException e) {
        // the node got closed on us, ignore it
      } catch (Throwable e1) {
        logger.warn(
            "[{}][{}] failed to remove shard after failure ([{}])",
            e1,
            shardRouting.getIndex(),
            shardRouting.getId(),
            message);
      }
    }
    if (sendShardFailure) {
      sendFailShard(shardRouting, indexService.indexUUID(), message, failure);
    }
  }

  private void sendFailShard(
      ShardRouting shardRouting, String indexUUID, String message, @Nullable Throwable failure) {
    try {
      logger.warn(
          "[{}] marking and sending shard failed due to [{}]",
          failure,
          shardRouting.shardId(),
          message);
      failedShards.put(shardRouting.shardId(), new FailedShard(shardRouting.version()));
      shardStateAction.shardFailed(
          shardRouting,
          indexUUID,
          "shard failure ["
              + message
              + "]"
              + (failure == null ? "" : "[" + detailedMessage(failure) + "]"));
    } catch (Throwable e1) {
      logger.warn(
          "[{}][{}] failed to mark shard as failed (because of [{}])",
          e1,
          shardRouting.getIndex(),
          shardRouting.getId(),
          message);
    }
  }

  private class FailedEngineHandler implements Engine.FailedEngineListener {
    @Override
    public void onFailedEngine(
        final ShardId shardId, final String reason, final @Nullable Throwable failure) {
      ShardRouting shardRouting = null;
      final IndexService indexService = indicesService.indexService(shardId.index().name());
      if (indexService != null) {
        IndexShard indexShard = indexService.shard(shardId.id());
        if (indexShard != null) {
          shardRouting = indexShard.routingEntry();
        }
      }
      if (shardRouting == null) {
        logger.warn(
            "[{}][{}] engine failed, but can't find index shard. failure reason: [{}]",
            failure,
            shardId.index().name(),
            shardId.id(),
            reason);
        return;
      }
      final ShardRouting fShardRouting = shardRouting;
      threadPool
          .generic()
          .execute(
              new Runnable() {
                @Override
                public void run() {
                  synchronized (mutex) {
                    failAndRemoveShard(
                        fShardRouting,
                        indexService,
                        true,
                        "engine failure, reason [" + reason + "]",
                        failure);
                  }
                }
              });
    }
  }
}
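
The listing above keeps "recovery finished" and "recovery failed, maybe escalate to the master" on
separate callback paths. Below is a minimal, self-contained sketch of that listener idiom in plain
Java; MasterNotifier, RecoveryListener, and ShardRecoverer are hypothetical stand-ins for the
Elasticsearch types, not the real API.

import java.util.Objects;

// Hypothetical stand-in for shardStateAction: reports shard lifecycle events to the master.
interface MasterNotifier {
  void shardStarted(String shardId, String reason);
  void shardFailed(String shardId, String reason, Throwable cause);
}

// The two-path callback pattern from the listing: success, or a failure that may escalate.
interface RecoveryListener {
  void onRecoveryDone();
  void onRecoveryFailed(Throwable failure, boolean sendShardFailure);
}

class ShardRecoverer {
  private final MasterNotifier master;

  ShardRecoverer(MasterNotifier master) {
    this.master = Objects.requireNonNull(master);
  }

  void recover(String shardId, Runnable recoveryWork) {
    RecoveryListener listener = new RecoveryListener() {
      @Override
      public void onRecoveryDone() {
        master.shardStarted(shardId, "after recovery from store");
      }

      @Override
      public void onRecoveryFailed(Throwable failure, boolean sendShardFailure) {
        // mirror failAndRemoveShard -> sendFailShard: clean up locally first, then
        // optionally tell the master that the shard failed
        if (sendShardFailure) {
          master.shardFailed(shardId, "failed recovery", failure);
        }
      }
    };
    try {
      recoveryWork.run();
      listener.onRecoveryDone();
    } catch (Throwable t) {
      listener.onRecoveryFailed(t, true);
    }
  }
}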
예제 #10
public class ClusterService extends AbstractLifecycleComponent {

  public static final Setting<TimeValue> CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING =
      Setting.positiveTimeSetting(
          "cluster.service.slow_task_logging_threshold",
          TimeValue.timeValueSeconds(30),
          Property.Dynamic,
          Property.NodeScope);

  public static final String UPDATE_THREAD_NAME = "clusterService#updateTask";
  private final ThreadPool threadPool;
  private final ClusterName clusterName;

  private BiConsumer<ClusterChangedEvent, Discovery.AckListener> clusterStatePublisher;

  private final OperationRouting operationRouting;

  private final ClusterSettings clusterSettings;

  private TimeValue slowTaskLoggingThreshold;

  private volatile PrioritizedEsThreadPoolExecutor updateTasksExecutor;

  /** These three state listener collections change infrequently - CopyOnWriteArrayList is just fine. */
  private final Collection<ClusterStateListener> priorityClusterStateListeners =
      new CopyOnWriteArrayList<>();

  private final Collection<ClusterStateListener> clusterStateListeners =
      new CopyOnWriteArrayList<>();
  private final Collection<ClusterStateListener> lastClusterStateListeners =
      new CopyOnWriteArrayList<>();
  private final Map<ClusterStateTaskExecutor, List<UpdateTask>> updateTasksPerExecutor =
      new HashMap<>();
  // TODO this changes rather frequently; a synchronized set would be better here, plus a
  // dedicated remove API
  private final Collection<ClusterStateListener> postAppliedListeners =
      new CopyOnWriteArrayList<>();
  private final Iterable<ClusterStateListener> preAppliedListeners =
      Iterables.concat(
          priorityClusterStateListeners, clusterStateListeners, lastClusterStateListeners);

  private final LocalNodeMasterListeners localNodeMasterListeners;

  private final Queue<NotifyTimeout> onGoingTimeouts = ConcurrentCollections.newQueue();

  private volatile ClusterState clusterState;

  private final ClusterBlocks.Builder initialBlocks;

  private NodeConnectionsService nodeConnectionsService;

  public ClusterService(Settings settings, ClusterSettings clusterSettings, ThreadPool threadPool) {
    super(settings);
    this.operationRouting = new OperationRouting(settings, clusterSettings);
    this.threadPool = threadPool;
    this.clusterSettings = clusterSettings;
    this.clusterName = ClusterName.CLUSTER_NAME_SETTING.get(settings);
    // will be replaced on doStart.
    this.clusterState = ClusterState.builder(clusterName).build();

    this.clusterSettings.addSettingsUpdateConsumer(
        CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING, this::setSlowTaskLoggingThreshold);

    this.slowTaskLoggingThreshold =
        CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING.get(settings);

    localNodeMasterListeners = new LocalNodeMasterListeners(threadPool);

    initialBlocks = ClusterBlocks.builder();
  }

  private void setSlowTaskLoggingThreshold(TimeValue slowTaskLoggingThreshold) {
    this.slowTaskLoggingThreshold = slowTaskLoggingThreshold;
  }

  public synchronized void setClusterStatePublisher(
      BiConsumer<ClusterChangedEvent, Discovery.AckListener> publisher) {
    clusterStatePublisher = publisher;
  }

  public synchronized void setLocalNode(DiscoveryNode localNode) {
    assert clusterState.nodes().getLocalNodeId() == null : "local node is already set";
    DiscoveryNodes.Builder nodeBuilder =
        DiscoveryNodes.builder(clusterState.nodes()).add(localNode).localNodeId(localNode.getId());
    this.clusterState = ClusterState.builder(clusterState).nodes(nodeBuilder).build();
  }

  public synchronized void setNodeConnectionsService(
      NodeConnectionsService nodeConnectionsService) {
    assert this.nodeConnectionsService == null : "nodeConnectionsService is already set";
    this.nodeConnectionsService = nodeConnectionsService;
  }

  /** Adds an initial block to be set on the first cluster state created. */
  public synchronized void addInitialStateBlock(ClusterBlock block) throws IllegalStateException {
    if (lifecycle.started()) {
      throw new IllegalStateException("can't set initial block when started");
    }
    initialBlocks.addGlobalBlock(block);
  }

  /** Removes an initial block to be set on the first cluster state created. */
  public synchronized void removeInitialStateBlock(ClusterBlock block)
      throws IllegalStateException {
    removeInitialStateBlock(block.id());
  }

  /** Removes an initial block to be set on the first cluster state created. */
  public synchronized void removeInitialStateBlock(int blockId) throws IllegalStateException {
    if (lifecycle.started()) {
      throw new IllegalStateException("can't set initial block when started");
    }
    initialBlocks.removeGlobalBlock(blockId);
  }

  @Override
  protected synchronized void doStart() {
    Objects.requireNonNull(
        clusterStatePublisher, "please set a cluster state publisher before starting");
    Objects.requireNonNull(
        clusterState.nodes().getLocalNode(), "please set the local node before starting");
    Objects.requireNonNull(
        nodeConnectionsService, "please set the node connection service before starting");
    add(localNodeMasterListeners);
    this.clusterState = ClusterState.builder(clusterState).blocks(initialBlocks).build();
    this.updateTasksExecutor =
        EsExecutors.newSinglePrioritizing(
            UPDATE_THREAD_NAME,
            daemonThreadFactory(settings, UPDATE_THREAD_NAME),
            threadPool.getThreadContext());
  }

  @Override
  protected synchronized void doStop() {
    for (NotifyTimeout onGoingTimeout : onGoingTimeouts) {
      try {
        onGoingTimeout.cancel();
        onGoingTimeout.listener.onClose();
      } catch (Exception ex) {
        logger.debug("failed to notify listeners on shutdown", ex);
      }
    }
    ThreadPool.terminate(updateTasksExecutor, 10, TimeUnit.SECONDS);
    // close timeout listeners that did not have an ongoing timeout
    postAppliedListeners
        .stream()
        .filter(listener -> listener instanceof TimeoutClusterStateListener)
        .map(listener -> (TimeoutClusterStateListener) listener)
        .forEach(TimeoutClusterStateListener::onClose);
    remove(localNodeMasterListeners);
  }

  @Override
  protected synchronized void doClose() {}

  /** The local node. */
  public DiscoveryNode localNode() {
    DiscoveryNode localNode = clusterState.getNodes().getLocalNode();
    if (localNode == null) {
      throw new IllegalStateException("No local node found. Is the node started?");
    }
    return localNode;
  }

  public OperationRouting operationRouting() {
    return operationRouting;
  }

  /** The current state. */
  public ClusterState state() {
    return this.clusterState;
  }

  /** Adds a priority listener for updated cluster states. */
  public void addFirst(ClusterStateListener listener) {
    priorityClusterStateListeners.add(listener);
  }

  /** Adds a listener that is called last for updated cluster states. */
  public void addLast(ClusterStateListener listener) {
    lastClusterStateListeners.add(listener);
  }

  /** Adds a listener for updated cluster states. */
  public void add(ClusterStateListener listener) {
    clusterStateListeners.add(listener);
  }

  /** Removes a listener for updated cluster states. */
  public void remove(ClusterStateListener listener) {
    clusterStateListeners.remove(listener);
    priorityClusterStateListeners.remove(listener);
    lastClusterStateListeners.remove(listener);
    postAppliedListeners.remove(listener);
    for (Iterator<NotifyTimeout> it = onGoingTimeouts.iterator(); it.hasNext(); ) {
      NotifyTimeout timeout = it.next();
      if (timeout.listener.equals(listener)) {
        timeout.cancel();
        it.remove();
      }
    }
  }

  /** Add a listener for on/off local node master events */
  public void add(LocalNodeMasterListener listener) {
    localNodeMasterListeners.add(listener);
  }

  /** Remove the given listener for on/off local master events */
  public void remove(LocalNodeMasterListener listener) {
    localNodeMasterListeners.remove(listener);
  }

  /**
   * Adds a cluster state listener that will time out after the provided timeout, and is executed
   * after the cluster state has been successfully applied, i.e. is in state {@link
   * org.elasticsearch.cluster.ClusterState.ClusterStateStatus#APPLIED}. NOTE: a {@code null}
   * timeout means that the listener will never be removed automatically.
   */
  public void add(@Nullable final TimeValue timeout, final TimeoutClusterStateListener listener) {
    if (lifecycle.stoppedOrClosed()) {
      listener.onClose();
      return;
    }
    // call the post added notification on the same event thread
    try {
      updateTasksExecutor.execute(
          new SourcePrioritizedRunnable(Priority.HIGH, "_add_listener_") {
            @Override
            public void run() {
              if (timeout != null) {
                NotifyTimeout notifyTimeout = new NotifyTimeout(listener, timeout);
                notifyTimeout.future =
                    threadPool.schedule(timeout, ThreadPool.Names.GENERIC, notifyTimeout);
                onGoingTimeouts.add(notifyTimeout);
              }
              postAppliedListeners.add(listener);
              listener.postAdded();
            }
          });
    } catch (EsRejectedExecutionException e) {
      if (lifecycle.stoppedOrClosed()) {
        listener.onClose();
      } else {
        throw e;
      }
    }
  }

  /**
   * Submits a cluster state update task; unlike {@link #submitStateUpdateTask(String, Object,
   * ClusterStateTaskConfig, ClusterStateTaskExecutor, ClusterStateTaskListener)}, submitted updates
   * will not be batched.
   *
   * @param source the source of the cluster state update task
   * @param updateTask the full context for the cluster state update task
   */
  public void submitStateUpdateTask(final String source, final ClusterStateUpdateTask updateTask) {
    submitStateUpdateTask(source, updateTask, updateTask, updateTask, updateTask);
  }

  /**
   * Submits a cluster state update task; submitted updates will be batched across the same instance
   * of executor. The exact batching semantics depend on the underlying implementation but a rough
   * guideline is that if the update task is submitted while there are pending update tasks for the
   * same executor, these update tasks will all be executed on the executor in a single batch.
   *
   * @param source the source of the cluster state update task
   * @param task the state needed for the cluster state update task
   * @param config the cluster state update task configuration
   * @param executor the cluster state update task executor; tasks that share the same executor
   *     will be executed in batches on this executor
   * @param listener callback after the cluster state update task completes
   * @param <T> the type of the cluster state update task state
   */
  public <T> void submitStateUpdateTask(
      final String source,
      final T task,
      final ClusterStateTaskConfig config,
      final ClusterStateTaskExecutor<T> executor,
      final ClusterStateTaskListener listener) {
    submitStateUpdateTasks(source, Collections.singletonMap(task, listener), config, executor);
  }

  /**
   * Submits a batch of cluster state update tasks; submitted updates are guaranteed to be processed
   * together, potentially with more tasks of the same executor.
   *
   * @param source the source of the cluster state update task
   * @param tasks a map of update tasks and their corresponding listeners
   * @param config the cluster state update task configuration
   * @param executor the cluster state update task executor; tasks that share the same executor
   *     will be executed in batches on this executor
   * @param <T> the type of the cluster state update task state
   */
  public <T> void submitStateUpdateTasks(
      final String source,
      final Map<T, ClusterStateTaskListener> tasks,
      final ClusterStateTaskConfig config,
      final ClusterStateTaskExecutor<T> executor) {
    if (!lifecycle.started()) {
      return;
    }
    if (tasks.isEmpty()) {
      return;
    }
    try {
      // convert to an identity map to check for duplicates based on update task semantics,
      // which use identity instead of equals
      final IdentityHashMap<T, ClusterStateTaskListener> tasksIdentity =
          new IdentityHashMap<>(tasks);
      final List<UpdateTask<T>> updateTasks =
          tasksIdentity
              .entrySet()
              .stream()
              .map(
                  entry ->
                      new UpdateTask<>(
                          source, entry.getKey(), config, executor, safe(entry.getValue(), logger)))
              .collect(Collectors.toList());

      synchronized (updateTasksPerExecutor) {
        List<UpdateTask> existingTasks =
            updateTasksPerExecutor.computeIfAbsent(executor, k -> new ArrayList<>());
        for (@SuppressWarnings("unchecked") UpdateTask<T> existing : existingTasks) {
          if (tasksIdentity.containsKey(existing.task)) {
            throw new IllegalStateException(
                "task ["
                    + executor.describeTasks(Collections.singletonList(existing.task))
                    + "] with source ["
                    + source
                    + "] is already queued");
          }
        }
        existingTasks.addAll(updateTasks);
      }

      final UpdateTask<T> firstTask = updateTasks.get(0);
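      // only the first task is handed to the executor; when it runs, runTasksForExecutor drains
      // the whole pending batch, and the optional timeout below fails any task in the batch that
      // has not been marked processed by then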

      if (config.timeout() != null) {
        updateTasksExecutor.execute(
            firstTask,
            threadPool.scheduler(),
            config.timeout(),
            () ->
                threadPool
                    .generic()
                    .execute(
                        () -> {
                          for (UpdateTask<T> task : updateTasks) {
                            if (task.processed.getAndSet(true) == false) {
                              logger.debug(
                                  "cluster state update task [{}] timed out after [{}]",
                                  source,
                                  config.timeout());
                              task.listener.onFailure(
                                  source,
                                  new ProcessClusterEventTimeoutException(
                                      config.timeout(), source));
                            }
                          }
                        }));
      } else {
        updateTasksExecutor.execute(firstTask);
      }
    } catch (EsRejectedExecutionException e) {
      // ignore cases where we are shutting down..., there is really nothing interesting
      // to be done here...
      if (!lifecycle.stoppedOrClosed()) {
        throw e;
      }
    }
  }

  /** Returns the tasks that are pending. */
  public List<PendingClusterTask> pendingTasks() {
    PrioritizedEsThreadPoolExecutor.Pending[] pendings = updateTasksExecutor.getPending();
    List<PendingClusterTask> pendingClusterTasks = new ArrayList<>(pendings.length);
    for (PrioritizedEsThreadPoolExecutor.Pending pending : pendings) {
      final String source;
      final long timeInQueue;
      // we have to capture the task as it will be nulled after execution, and we don't want it
      // to change while we check things here
      final Object task = pending.task;
      if (task == null) {
        continue;
      } else if (task instanceof SourcePrioritizedRunnable) {
        SourcePrioritizedRunnable runnable = (SourcePrioritizedRunnable) task;
        source = runnable.source();
        timeInQueue = runnable.getAgeInMillis();
      } else {
        assert false : "expected SourcePrioritizedRunnable got " + task.getClass();
        source = "unknown [" + task.getClass() + "]";
        timeInQueue = 0;
      }

      pendingClusterTasks.add(
          new PendingClusterTask(
              pending.insertionOrder,
              pending.priority,
              new Text(source),
              timeInQueue,
              pending.executing));
    }
    return pendingClusterTasks;
  }

  /** Returns the number of currently pending tasks. */
  public int numberOfPendingTasks() {
    return updateTasksExecutor.getNumberOfPendingTasks();
  }

  /**
   * Returns the maximum wait time for tasks in the queue
   *
   * @return A zero time value if the queue is empty, otherwise the wait time of the oldest task
   *     in the queue
   */
  public TimeValue getMaxTaskWaitTime() {
    return updateTasksExecutor.getMaxTaskWaitTime();
  }

  /** asserts that the current thread is the cluster state update thread */
  public static boolean assertClusterStateThread() {
    assert Thread.currentThread().getName().contains(ClusterService.UPDATE_THREAD_NAME)
        : "not called from the cluster state update thread";
    return true;
  }
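
  // (the method above is invoked as "assert assertClusterStateThread();", so the thread-name
  // check runs only when assertions are enabled (-ea) and costs nothing in production)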

  public ClusterName getClusterName() {
    return clusterName;
  }

  abstract static class SourcePrioritizedRunnable extends PrioritizedRunnable {
    protected final String source;

    public SourcePrioritizedRunnable(Priority priority, String source) {
      super(priority);
      this.source = source;
    }

    public String source() {
      return source;
    }
  }

  <T> void runTasksForExecutor(ClusterStateTaskExecutor<T> executor) {
    final ArrayList<UpdateTask<T>> toExecute = new ArrayList<>();
    final Map<String, ArrayList<T>> processTasksBySource = new HashMap<>();
    synchronized (updateTasksPerExecutor) {
      List<UpdateTask> pending = updateTasksPerExecutor.remove(executor);
      if (pending != null) {
        for (UpdateTask<T> task : pending) {
          if (task.processed.getAndSet(true) == false) {
            logger.trace("will process {}", task.toString(executor));
            toExecute.add(task);
            processTasksBySource
                .computeIfAbsent(task.source, s -> new ArrayList<>())
                .add(task.task);
          } else {
            logger.trace("skipping {}, already processed", task.toString(executor));
          }
        }
      }
    }
    if (toExecute.isEmpty()) {
      return;
    }
    final String tasksSummary =
        processTasksBySource
            .entrySet()
            .stream()
            .map(
                entry -> {
                  String tasks = executor.describeTasks(entry.getValue());
                  return tasks.isEmpty() ? entry.getKey() : entry.getKey() + "[" + tasks + "]";
                })
            .reduce((s1, s2) -> s1 + ", " + s2)
            .orElse("");

    if (!lifecycle.started()) {
      logger.debug("processing [{}]: ignoring, cluster_service not started", tasksSummary);
      return;
    }
    logger.debug("processing [{}]: execute", tasksSummary);
    ClusterState previousClusterState = clusterState;
    if (!previousClusterState.nodes().isLocalNodeElectedMaster() && executor.runOnlyOnMaster()) {
      logger.debug("failing [{}]: local node is no longer master", tasksSummary);
      toExecute.stream().forEach(task -> task.listener.onNoLongerMaster(task.source));
      return;
    }
    ClusterStateTaskExecutor.BatchResult<T> batchResult;
    long startTimeNS = currentTimeInNanos();
    try {
      List<T> inputs =
          toExecute.stream().map(tUpdateTask -> tUpdateTask.task).collect(Collectors.toList());
      batchResult = executor.execute(previousClusterState, inputs);
    } catch (Exception e) {
      TimeValue executionTime =
          TimeValue.timeValueMillis(
              Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
      if (logger.isTraceEnabled()) {
        logger.trace(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "failed to execute cluster state update in [{}], state:\nversion [{}], source [{}]\n{}{}{}",
                        executionTime,
                        previousClusterState.version(),
                        tasksSummary,
                        previousClusterState.nodes().prettyPrint(),
                        previousClusterState.routingTable().prettyPrint(),
                        previousClusterState.getRoutingNodes().prettyPrint()),
            e);
      }
      warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
      batchResult =
          ClusterStateTaskExecutor.BatchResult.<T>builder()
              .failures(toExecute.stream().map(updateTask -> updateTask.task)::iterator, e)
              .build(previousClusterState);
    }

    assert batchResult.executionResults != null;
    assert batchResult.executionResults.size() == toExecute.size()
        : String.format(
            Locale.ROOT,
            "expected [%d] task result%s but was [%d]",
            toExecute.size(),
            toExecute.size() == 1 ? "" : "s",
            batchResult.executionResults.size());
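    // idiom: the assignment inside the assert below executes only when assertions are enabled
    // (-ea), so assertsEnabled ends up true exactly when assert statements are active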
    boolean assertsEnabled = false;
    assert (assertsEnabled = true);
    if (assertsEnabled) {
      for (UpdateTask<T> updateTask : toExecute) {
        assert batchResult.executionResults.containsKey(updateTask.task)
            : "missing task result for " + updateTask.toString(executor);
      }
    }

    ClusterState newClusterState = batchResult.resultingState;
    final ArrayList<UpdateTask<T>> processedListeners = new ArrayList<>();
    // fail all tasks that have failed and extract those that are waiting for results
    for (UpdateTask<T> updateTask : toExecute) {
      assert batchResult.executionResults.containsKey(updateTask.task)
          : "missing " + updateTask.toString(executor);
      final ClusterStateTaskExecutor.TaskResult executionResult =
          batchResult.executionResults.get(updateTask.task);
      executionResult.handle(
          () -> processedListeners.add(updateTask),
          ex -> {
            logger.debug(
                (Supplier<?>)
                    () ->
                        new ParameterizedMessage(
                            "cluster state update task {} failed", updateTask.toString(executor)),
                ex);
            updateTask.listener.onFailure(updateTask.source, ex);
          });
    }

    if (previousClusterState == newClusterState) {
      for (UpdateTask<T> task : processedListeners) {
        if (task.listener instanceof AckedClusterStateTaskListener) {
          // no need to wait for ack if nothing changed, the update can be counted as acknowledged
          ((AckedClusterStateTaskListener) task.listener).onAllNodesAcked(null);
        }
        task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState);
      }
      TimeValue executionTime =
          TimeValue.timeValueMillis(
              Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
      logger.debug(
          "processing [{}]: took [{}] no change in cluster_state", tasksSummary, executionTime);
      warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
      return;
    }

    try {
      ArrayList<Discovery.AckListener> ackListeners = new ArrayList<>();
      if (newClusterState.nodes().isLocalNodeElectedMaster()) {
        // only the master controls the version numbers
        Builder builder = ClusterState.builder(newClusterState).incrementVersion();
        if (previousClusterState.routingTable() != newClusterState.routingTable()) {
          builder.routingTable(
              RoutingTable.builder(newClusterState.routingTable())
                  .version(newClusterState.routingTable().version() + 1)
                  .build());
        }
        if (previousClusterState.metaData() != newClusterState.metaData()) {
          builder.metaData(
              MetaData.builder(newClusterState.metaData())
                  .version(newClusterState.metaData().version() + 1));
        }
        newClusterState = builder.build();
        for (UpdateTask<T> task : processedListeners) {
          if (task.listener instanceof AckedClusterStateTaskListener) {
            final AckedClusterStateTaskListener ackedListener =
                (AckedClusterStateTaskListener) task.listener;
            if (ackedListener.ackTimeout() == null || ackedListener.ackTimeout().millis() == 0) {
              ackedListener.onAckTimeout();
            } else {
              try {
                ackListeners.add(
                    new AckCountDownListener(
                        ackedListener,
                        newClusterState.version(),
                        newClusterState.nodes(),
                        threadPool));
              } catch (EsRejectedExecutionException ex) {
                if (logger.isDebugEnabled()) {
                  logger.debug(
                      "Couldn't schedule timeout thread - node might be shutting down", ex);
                }
                // timeout straightaway, otherwise we could wait forever as the timeout thread has
                // not started
                ackedListener.onAckTimeout();
              }
            }
          }
        }
      }
      final Discovery.AckListener ackListener = new DelegatingAckListener(ackListeners);

      newClusterState.status(ClusterState.ClusterStateStatus.BEING_APPLIED);

      if (logger.isTraceEnabled()) {
        logger.trace(
            "cluster state updated, source [{}]\n{}", tasksSummary, newClusterState.prettyPrint());
      } else if (logger.isDebugEnabled()) {
        logger.debug(
            "cluster state updated, version [{}], source [{}]",
            newClusterState.version(),
            tasksSummary);
      }

      ClusterChangedEvent clusterChangedEvent =
          new ClusterChangedEvent(tasksSummary, newClusterState, previousClusterState);
      // new cluster state, notify all listeners
      final DiscoveryNodes.Delta nodesDelta = clusterChangedEvent.nodesDelta();
      if (nodesDelta.hasChanges() && logger.isInfoEnabled()) {
        String summary = nodesDelta.shortSummary();
        if (summary.length() > 0) {
          logger.info("{}, reason: {}", summary, tasksSummary);
        }
      }

      nodeConnectionsService.connectToAddedNodes(clusterChangedEvent);

      // if we are the master, publish the new state to all nodes
      // we publish here before we send a notification to all the listeners, since if it fails
      // we don't want to notify
      if (newClusterState.nodes().isLocalNodeElectedMaster()) {
        logger.debug("publishing cluster state version [{}]", newClusterState.version());
        try {
          clusterStatePublisher.accept(clusterChangedEvent, ackListener);
        } catch (Discovery.FailedToCommitClusterStateException t) {
          final long version = newClusterState.version();
          logger.warn(
              (Supplier<?>)
                  () ->
                      new ParameterizedMessage(
                          "failing [{}]: failed to commit cluster state version [{}]",
                          tasksSummary,
                          version),
              t);
          processedListeners.forEach(task -> task.listener.onFailure(task.source, t));
          return;
        }
      }

      // update the current cluster state
      clusterState = newClusterState;
      logger.debug("set local cluster state to version {}", newClusterState.version());
      try {
        // nothing to do until we actually recover from the gateway, or some other block
        // indicates we need to disable state persistence
        if (clusterChangedEvent.state().blocks().disableStatePersistence() == false
            && clusterChangedEvent.metaDataChanged()) {
          final Settings incomingSettings = clusterChangedEvent.state().metaData().settings();
          clusterSettings.applySettings(incomingSettings);
        }
      } catch (Exception ex) {
        logger.warn("failed to apply cluster settings", ex);
      }
      for (ClusterStateListener listener : preAppliedListeners) {
        try {
          listener.clusterChanged(clusterChangedEvent);
        } catch (Exception ex) {
          logger.warn("failed to notify ClusterStateListener", ex);
        }
      }

      nodeConnectionsService.disconnectFromRemovedNodes(clusterChangedEvent);

      newClusterState.status(ClusterState.ClusterStateStatus.APPLIED);

      for (ClusterStateListener listener : postAppliedListeners) {
        try {
          listener.clusterChanged(clusterChangedEvent);
        } catch (Exception ex) {
          logger.warn("failed to notify ClusterStateListener", ex);
        }
      }

      // manual ack only from the master at the end of the publish
      if (newClusterState.nodes().isLocalNodeElectedMaster()) {
        try {
          ackListener.onNodeAck(newClusterState.nodes().getLocalNode(), null);
        } catch (Exception e) {
          final DiscoveryNode localNode = newClusterState.nodes().getLocalNode();
          logger.debug(
              (Supplier<?>)
                  () ->
                      new ParameterizedMessage(
                          "error while processing ack for master node [{}]", localNode),
              e);
        }
      }

      for (UpdateTask<T> task : processedListeners) {
        task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState);
      }

      try {
        executor.clusterStatePublished(clusterChangedEvent);
      } catch (Exception e) {
        logger.error(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "exception thrown while notifying executor of new cluster state publication [{}]",
                        tasksSummary),
            e);
      }

      TimeValue executionTime =
          TimeValue.timeValueMillis(
              Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
      logger.debug(
          "processing [{}]: took [{}] done applying updated cluster_state (version: {}, uuid: {})",
          tasksSummary,
          executionTime,
          newClusterState.version(),
          newClusterState.stateUUID());
      warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
    } catch (Exception e) {
      TimeValue executionTime =
          TimeValue.timeValueMillis(
              Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
      final long version = newClusterState.version();
      final String stateUUID = newClusterState.stateUUID();
      final String prettyPrint = newClusterState.prettyPrint();
      logger.warn(
          (Supplier<?>)
              () ->
                  new ParameterizedMessage(
                      "failed to apply updated cluster state in [{}]:\nversion [{}], uuid [{}], source [{}]\n{}",
                      executionTime,
                      version,
                      stateUUID,
                      tasksSummary,
                      prettyPrint),
          e);
      // TODO: do we want to call updateTask.onFailure here?
    }
  }

  // this one is overridden in tests so we can control time
  protected long currentTimeInNanos() {
    return System.nanoTime();
  }

  private static SafeClusterStateTaskListener safe(
      ClusterStateTaskListener listener, Logger logger) {
    if (listener instanceof AckedClusterStateTaskListener) {
      return new SafeAckedClusterStateTaskListener(
          (AckedClusterStateTaskListener) listener, logger);
    } else {
      return new SafeClusterStateTaskListener(listener, logger);
    }
  }

  private static class SafeClusterStateTaskListener implements ClusterStateTaskListener {
    private final ClusterStateTaskListener listener;
    private final Logger logger;

    public SafeClusterStateTaskListener(ClusterStateTaskListener listener, Logger logger) {
      this.listener = listener;
      this.logger = logger;
    }

    @Override
    public void onFailure(String source, Exception e) {
      try {
        listener.onFailure(source, e);
      } catch (Exception inner) {
        inner.addSuppressed(e);
        logger.error(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "exception thrown by listener notifying of failure from [{}]", source),
            inner);
      }
    }

    @Override
    public void onNoLongerMaster(String source) {
      try {
        listener.onNoLongerMaster(source);
      } catch (Exception e) {
        logger.error(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "exception thrown by listener while notifying no longer master from [{}]",
                        source),
            e);
      }
    }

    @Override
    public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
      try {
        listener.clusterStateProcessed(source, oldState, newState);
      } catch (Exception e) {
        logger.error(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "exception thrown by listener while notifying of cluster state processed from [{}], old cluster state:\n"
                            + "{}\nnew cluster state:\n{}",
                        source,
                        oldState.prettyPrint(),
                        newState.prettyPrint()),
            e);
      }
    }
  }

  private static class SafeAckedClusterStateTaskListener extends SafeClusterStateTaskListener
      implements AckedClusterStateTaskListener {
    private final AckedClusterStateTaskListener listener;
    private final Logger logger;

    public SafeAckedClusterStateTaskListener(
        AckedClusterStateTaskListener listener, Logger logger) {
      super(listener, logger);
      this.listener = listener;
      this.logger = logger;
    }

    @Override
    public boolean mustAck(DiscoveryNode discoveryNode) {
      return listener.mustAck(discoveryNode);
    }

    @Override
    public void onAllNodesAcked(@Nullable Exception e) {
      try {
        listener.onAllNodesAcked(e);
      } catch (Exception inner) {
        inner.addSuppressed(e);
        logger.error("exception thrown by listener while notifying on all nodes acked", inner);
      }
    }

    @Override
    public void onAckTimeout() {
      try {
        listener.onAckTimeout();
      } catch (Exception e) {
        logger.error("exception thrown by listener while notifying on ack timeout", e);
      }
    }

    @Override
    public TimeValue ackTimeout() {
      return listener.ackTimeout();
    }
  }

  class UpdateTask<T> extends SourcePrioritizedRunnable {

    public final T task;
    public final ClusterStateTaskConfig config;
    public final ClusterStateTaskExecutor<T> executor;
    public final ClusterStateTaskListener listener;
    public final AtomicBoolean processed = new AtomicBoolean();

    UpdateTask(
        String source,
        T task,
        ClusterStateTaskConfig config,
        ClusterStateTaskExecutor<T> executor,
        ClusterStateTaskListener listener) {
      super(config.priority(), source);
      this.task = task;
      this.config = config;
      this.executor = executor;
      this.listener = listener;
    }

    @Override
    public void run() {
      // if this task is already processed, the executor shouldn't execute other tasks (that
      // arrived later), to give other executors a chance to execute their tasks.
      if (processed.get() == false) {
        runTasksForExecutor(executor);
      }
    }

    public String toString(ClusterStateTaskExecutor<T> executor) {
      String taskDescription = executor.describeTasks(Collections.singletonList(task));
      if (taskDescription.isEmpty()) {
        return "[" + source + "]";
      } else {
        return "[" + source + "[" + taskDescription + "]]";
      }
    }
  }

  private void warnAboutSlowTaskIfNeeded(TimeValue executionTime, String source) {
    if (executionTime.getMillis() > slowTaskLoggingThreshold.getMillis()) {
      logger.warn(
          "cluster state update task [{}] took [{}] above the warn threshold of {}",
          source,
          executionTime,
          slowTaskLoggingThreshold);
    }
  }

  class NotifyTimeout implements Runnable {
    final TimeoutClusterStateListener listener;
    final TimeValue timeout;
    volatile ScheduledFuture future;

    NotifyTimeout(TimeoutClusterStateListener listener, TimeValue timeout) {
      this.listener = listener;
      this.timeout = timeout;
    }

    public void cancel() {
      FutureUtils.cancel(future);
    }

    @Override
    public void run() {
      if (future != null && future.isCancelled()) {
        return;
      }
      if (lifecycle.stoppedOrClosed()) {
        listener.onClose();
      } else {
        listener.onTimeout(this.timeout);
      }
      // note, we rely on the listener to remove itself in case of timeout if needed
    }
  }

  private static class LocalNodeMasterListeners implements ClusterStateListener {

    private final List<LocalNodeMasterListener> listeners = new CopyOnWriteArrayList<>();
    private final ThreadPool threadPool;
    private volatile boolean master = false;

    private LocalNodeMasterListeners(ThreadPool threadPool) {
      this.threadPool = threadPool;
    }

    @Override
    public void clusterChanged(ClusterChangedEvent event) {
      if (!master && event.localNodeMaster()) {
        master = true;
        for (LocalNodeMasterListener listener : listeners) {
          Executor executor = threadPool.executor(listener.executorName());
          executor.execute(new OnMasterRunnable(listener));
        }
        return;
      }

      if (master && !event.localNodeMaster()) {
        master = false;
        for (LocalNodeMasterListener listener : listeners) {
          Executor executor = threadPool.executor(listener.executorName());
          executor.execute(new OffMasterRunnable(listener));
        }
      }
    }

    private void add(LocalNodeMasterListener listener) {
      listeners.add(listener);
    }

    private void remove(LocalNodeMasterListener listener) {
      listeners.remove(listener);
    }

    private void clear() {
      listeners.clear();
    }
  }

  private static class OnMasterRunnable implements Runnable {

    private final LocalNodeMasterListener listener;

    private OnMasterRunnable(LocalNodeMasterListener listener) {
      this.listener = listener;
    }

    @Override
    public void run() {
      listener.onMaster();
    }
  }

  private static class OffMasterRunnable implements Runnable {

    private final LocalNodeMasterListener listener;

    private OffMasterRunnable(LocalNodeMasterListener listener) {
      this.listener = listener;
    }

    @Override
    public void run() {
      listener.offMaster();
    }
  }

  private static class DelegatingAckListener implements Discovery.AckListener {

    private final List<Discovery.AckListener> listeners;

    private DelegatingAckListener(List<Discovery.AckListener> listeners) {
      this.listeners = listeners;
    }

    @Override
    public void onNodeAck(DiscoveryNode node, @Nullable Exception e) {
      for (Discovery.AckListener listener : listeners) {
        listener.onNodeAck(node, e);
      }
    }

    @Override
    public void onTimeout() {
      throw new UnsupportedOperationException("no timeout delegation");
    }
  }

  private static class AckCountDownListener implements Discovery.AckListener {

    private static final Logger logger = Loggers.getLogger(AckCountDownListener.class);

    private final AckedClusterStateTaskListener ackedTaskListener;
    private final CountDown countDown;
    private final DiscoveryNodes nodes;
    private final long clusterStateVersion;
    private final Future<?> ackTimeoutCallback;
    private Exception lastFailure;

    AckCountDownListener(
        AckedClusterStateTaskListener ackedTaskListener,
        long clusterStateVersion,
        DiscoveryNodes nodes,
        ThreadPool threadPool) {
      this.ackedTaskListener = ackedTaskListener;
      this.clusterStateVersion = clusterStateVersion;
      this.nodes = nodes;
      int countDown = 0;
      for (DiscoveryNode node : nodes) {
        if (ackedTaskListener.mustAck(node)) {
          countDown++;
        }
      }
      // we always wait for at least 1 node (the master)
      countDown = Math.max(1, countDown);
      logger.trace(
          "expecting {} acknowledgements for cluster_state update (version: {})",
          countDown,
          clusterStateVersion);
      this.countDown = new CountDown(countDown);
      this.ackTimeoutCallback =
          threadPool.schedule(
              ackedTaskListener.ackTimeout(),
              ThreadPool.Names.GENERIC,
              new Runnable() {
                @Override
                public void run() {
                  onTimeout();
                }
              });
    }

    @Override
    public void onNodeAck(DiscoveryNode node, @Nullable Exception e) {
      if (!ackedTaskListener.mustAck(node)) {
        // we always wait for the master ack anyway
        if (!node.equals(nodes.getMasterNode())) {
          return;
        }
      }
      if (e == null) {
        logger.trace(
            "ack received from node [{}], cluster_state update (version: {})",
            node,
            clusterStateVersion);
      } else {
        this.lastFailure = e;
        logger.debug(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "ack received from node [{}], cluster_state update (version: {})",
                        node,
                        clusterStateVersion),
            e);
      }

      if (countDown.countDown()) {
        logger.trace(
            "all expected nodes acknowledged cluster_state update (version: {})",
            clusterStateVersion);
        FutureUtils.cancel(ackTimeoutCallback);
        ackedTaskListener.onAllNodesAcked(lastFailure);
      }
    }

    @Override
    public void onTimeout() {
      if (countDown.fastForward()) {
        logger.trace(
            "timeout waiting for acknowledgement for cluster_state update (version: {})",
            clusterStateVersion);
        ackedTaskListener.onAckTimeout();
      }
    }
  }

  public ClusterSettings getClusterSettings() {
    return clusterSettings;
  }

  public Settings getSettings() {
    return settings;
  }
}
예제 #11
/**
 * The indices request cache allows caching shard-level request stage responses, helping to speed
 * up similar requests that are potentially expensive (because of aggs, for example). The cache is
 * fully coherent with the semantics of NRT (the index reader version is part of the cache key),
 * and relies on size-based eviction to evict cache entries associated with old readers, as well
 * as a scheduled reaper to clean up readers that are no longer used or shards that have been
 * closed.
 *
 * <p>Currently, the cache is only enabled for count requests, and can only be opted into via an
 * index-level setting that can be dynamically changed and defaults to false.
 *
 * <p>There are still several TODOs left in this class, some easily addressable, some more
 * complex, but the support is functional.
 */
public class IndicesRequestCache extends AbstractComponent
    implements RemovalListener<IndicesRequestCache.Key, IndicesRequestCache.Value> {

  /**
   * A setting to enable or disable request caching on an index level. It is dynamic by default
   * since we always check the IndexMetaData from the cluster state.
   */
  public static final String INDEX_CACHE_REQUEST_ENABLED = "index.requests.cache.enable";

  @Deprecated
  public static final String DEPRECATED_INDEX_CACHE_REQUEST_ENABLED = "index.cache.query.enable";

  public static final String INDICES_CACHE_REQUEST_CLEAN_INTERVAL =
      "indices.requests.cache.clean_interval";

  public static final String INDICES_CACHE_QUERY_SIZE = "indices.requests.cache.size";

  @Deprecated
  public static final String DEPRECATED_INDICES_CACHE_QUERY_SIZE = "indices.cache.query.size";

  public static final String INDICES_CACHE_QUERY_EXPIRE = "indices.requests.cache.expire";
  public static final String INDICES_CACHE_QUERY_CONCURRENCY_LEVEL =
      "indices.requests.cache.concurrency_level";

  private static final Set<SearchType> CACHEABLE_SEARCH_TYPES =
      EnumSet.of(SearchType.QUERY_THEN_FETCH, SearchType.QUERY_AND_FETCH);

  private final ThreadPool threadPool;
  private final ClusterService clusterService;

  private final TimeValue cleanInterval;
  private final Reaper reaper;

  final ConcurrentMap<CleanupKey, Boolean> registeredClosedListeners =
      ConcurrentCollections.newConcurrentMap();
  final Set<CleanupKey> keysToClean = ConcurrentCollections.newConcurrentSet();

  // TODO make these changes configurable on the cluster level
  private final String size;
  private final TimeValue expire;
  private final int concurrencyLevel;

  private volatile Cache<Key, Value> cache;

  @Inject
  public IndicesRequestCache(
      Settings settings, ClusterService clusterService, ThreadPool threadPool) {
    super(settings);
    this.clusterService = clusterService;
    this.threadPool = threadPool;
    this.cleanInterval =
        settings.getAsTime(INDICES_CACHE_REQUEST_CLEAN_INTERVAL, TimeValue.timeValueSeconds(60));

    String size = settings.get(INDICES_CACHE_QUERY_SIZE);
    if (size == null) {
      size = settings.get(DEPRECATED_INDICES_CACHE_QUERY_SIZE);
      if (size != null) {
        deprecationLogger.deprecated(
            "The ["
                + DEPRECATED_INDICES_CACHE_QUERY_SIZE
                + "] settings is now deprecated, use ["
                + INDICES_CACHE_QUERY_SIZE
                + "] instead");
      }
    }
    if (size == null) {
      // this cache can be very small yet still be very effective
      size = "1%";
    }
    this.size = size;

    this.expire = settings.getAsTime(INDICES_CACHE_QUERY_EXPIRE, null);
    // defaults to 4, but this is a busy map shared by all indices, so increase it a bit by default
    this.concurrencyLevel = settings.getAsInt(INDICES_CACHE_QUERY_CONCURRENCY_LEVEL, 16);
    if (concurrencyLevel <= 0) {
      throw new IllegalArgumentException(
          "concurrency_level must be > 0 but was: " + concurrencyLevel);
    }
    buildCache();

    this.reaper = new Reaper();
    threadPool.schedule(cleanInterval, ThreadPool.Names.SAME, reaper);
  }

  private boolean isCacheEnabled(Settings settings, boolean defaultEnable) {
    Boolean enable = settings.getAsBoolean(INDEX_CACHE_REQUEST_ENABLED, null);
    if (enable == null) {
      enable = settings.getAsBoolean(DEPRECATED_INDEX_CACHE_REQUEST_ENABLED, null);
      if (enable != null) {
        deprecationLogger.deprecated(
            "The ["
                + DEPRECATED_INDEX_CACHE_REQUEST_ENABLED
                + "] settings is now deprecated, use ["
                + INDEX_CACHE_REQUEST_ENABLED
                + "] instead");
      }
    }
    if (enable == null) {
      enable = defaultEnable;
    }
    return enable;
  }

  private void buildCache() {
    long sizeInBytes =
        MemorySizeValue.parseBytesSizeValueOrHeapRatio(size, INDICES_CACHE_QUERY_SIZE).bytes();

    CacheBuilder<Key, Value> cacheBuilder =
        CacheBuilder.newBuilder()
            .maximumWeight(sizeInBytes)
            .weigher(new QueryCacheWeigher())
            .removalListener(this);
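    // maximumWeight plus a Weigher bounds the cache by approximate bytes (key + value) rather
    // than by entry count, so eviction tracks memory footprint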
    cacheBuilder.concurrencyLevel(concurrencyLevel);

    if (expire != null) {
      cacheBuilder.expireAfterAccess(expire.millis(), TimeUnit.MILLISECONDS);
    }

    cache = cacheBuilder.build();
  }

  private static class QueryCacheWeigher implements Weigher<Key, Value> {

    @Override
    public int weigh(Key key, Value value) {
      return (int) (key.ramBytesUsed() + value.ramBytesUsed());
    }
  }

  public void close() {
    reaper.close();
    cache.invalidateAll();
  }

  public void clear(IndexShard shard) {
    if (shard == null) {
      return;
    }
    keysToClean.add(new CleanupKey(shard, -1));
    logger.trace("{} explicit cache clear", shard.shardId());
    reaper.reap();
  }

  @Override
  public void onRemoval(RemovalNotification<Key, Value> notification) {
    if (notification.getKey() == null) {
      return;
    }
    notification.getKey().shard.requestCache().onRemoval(notification);
  }

  /** Can the shard request be cached at all? */
  public boolean canCache(ShardSearchRequest request, SearchContext context) {
    // TODO: for now, template is not supported, though we could use the generated bytes as the key
    if (hasLength(request.templateSource())) {
      return false;
    }

    // for now, only enable it for requests with no hits
    if (context.size() != 0) {
      return false;
    }

    // We cannot cache with DFS because results depend not only on the content of the index but
    // also on the overridden statistics. So if you ran two queries on the same index with
    // different stats (because another shard was updated) you would get wrong results because of
    // the scores (think about top_hits aggs or scripts using the score)
    if (!CACHEABLE_SEARCH_TYPES.contains(context.searchType())) {
      return false;
    }

    IndexMetaData index = clusterService.state().getMetaData().index(request.index());
    if (index == null) { // in case we didn't yet have the cluster state, or it just got deleted
      return false;
    }
    // if not explicitly set in the request, fall back to the index setting; otherwise honor
    // the request flag
    if (request.requestCache() == null) {
      if (!isCacheEnabled(index.settings(), Boolean.FALSE)) {
        return false;
      }
    } else if (!request.requestCache()) {
      return false;
    }
    // if the reader is not a directory reader, we can't get the version from it
    if (!(context.searcher().getIndexReader() instanceof DirectoryReader)) {
      return false;
    }
    // if now in millis is used (or in the future, a more generic "isDeterministic" flag
    // then we can't cache based on "now" key within the search request, as it is not deterministic
    if (context.nowInMillisUsed()) {
      return false;
    }
    return true;
  }

  /**
   * Loads the cache result, computing it if needed by executing the query phase and otherwise
   * deserializing the cached value into the {@link SearchContext#queryResult() context's query
   * result}. The combination of load + compute allows a single load operation that causes other
   * requests with the same key to wait until it is loaded, then reuse the same cached value.
   */
  public void loadIntoContext(
      final ShardSearchRequest request, final SearchContext context, final QueryPhase queryPhase)
      throws Exception {
    assert canCache(request, context);
    Key key = buildKey(request, context);
    Loader loader = new Loader(queryPhase, context, key);
    Value value = cache.get(key, loader);
    if (loader.isLoaded()) {
      key.shard.requestCache().onMiss();
      // see if it's the first time we see this reader, and make sure to register a cleanup key
      CleanupKey cleanupKey =
          new CleanupKey(
              context.indexShard(),
              ((DirectoryReader) context.searcher().getIndexReader()).getVersion());
      if (!registeredClosedListeners.containsKey(cleanupKey)) {
        Boolean previous = registeredClosedListeners.putIfAbsent(cleanupKey, Boolean.TRUE);
        if (previous == null) {
          context.searcher().getIndexReader().addReaderClosedListener(cleanupKey);
        }
      }
    } else {
      key.shard.requestCache().onHit();
      // restore the cached query result into the context
      final QuerySearchResult result = context.queryResult();
      result.readFromWithId(context.id(), value.reference.streamInput());
      result.shardTarget(context.shardTarget());
    }
  }

  private static class Loader implements Callable<Value> {

    private final QueryPhase queryPhase;
    private final SearchContext context;
    private final IndicesRequestCache.Key key;
    private boolean loaded;

    Loader(QueryPhase queryPhase, SearchContext context, IndicesRequestCache.Key key) {
      this.queryPhase = queryPhase;
      this.context = context;
      this.key = key;
    }

    public boolean isLoaded() {
      return this.loaded;
    }

    @Override
    public Value call() throws Exception {
      queryPhase.execute(context);

      /* BytesStreamOutput allows passing the expected size, but by default uses
       * BigArrays.PAGE_SIZE_IN_BYTES, which is 16k. A common cached result, e.g.
       * a date histogram with 3 buckets, is ~100 bytes, so 16k would be very wasteful
       * since we don't shrink to the actual size once we are done serializing.
       * By passing 512 as the expected size we resize the byte array in the stream
       * slowly until we hit the page size and don't waste too much memory for small query
       * results.*/
      final int expectedSizeInBytes = 512;
      try (BytesStreamOutput out = new BytesStreamOutput(expectedSizeInBytes)) {
        context.queryResult().writeToNoId(out);
        // for now, keep the paged data structure, which might have unused bytes to fill a page,
        // but better to keep the memory properly paged instead of having varied sized bytes
        final BytesReference reference = out.bytes();
        loaded = true;
        Value value = new Value(reference, out.ramBytesUsed());
        key.shard.requestCache().onCached(key, value);
        return value;
      }
    }
  }

  public static class Value implements Accountable {
    final BytesReference reference;
    final long ramBytesUsed;

    public Value(BytesReference reference, long ramBytesUsed) {
      this.reference = reference;
      this.ramBytesUsed = ramBytesUsed;
    }

    @Override
    public long ramBytesUsed() {
      return ramBytesUsed;
    }

    @Override
    public Collection<Accountable> getChildResources() {
      return Collections.emptyList();
    }
  }

  public static class Key implements Accountable {
    public final IndexShard shard; // use as identity equality
    // use the reader version so as not to keep a reference to a "short"-lived reader until it's
    // reaped
    public final long readerVersion;
    public final BytesReference value;

    Key(IndexShard shard, long readerVersion, BytesReference value) {
      this.shard = shard;
      this.readerVersion = readerVersion;
      this.value = value;
    }

    @Override
    public long ramBytesUsed() {
      return RamUsageEstimator.NUM_BYTES_OBJECT_REF
          + RamUsageEstimator.NUM_BYTES_LONG
          + value.length();
    }

    @Override
    public Collection<Accountable> getChildResources() {
      // TODO: more detailed ram usage?
      return Collections.emptyList();
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) return true;
      if (o == null || getClass() != o.getClass()) return false;
      Key key = (Key) o;
      if (readerVersion != key.readerVersion) return false;
      if (!shard.equals(key.shard)) return false;
      if (!value.equals(key.value)) return false;
      return true;
    }

    @Override
    public int hashCode() {
      int result = shard.hashCode();
      result = 31 * result + (int) (readerVersion ^ (readerVersion >>> 32));
      result = 31 * result + value.hashCode();
      return result;
    }
  }

  private class CleanupKey implements IndexReader.ReaderClosedListener {
    IndexShard indexShard;
    // use the reader version so as not to keep a reference to a "short"-lived reader until it's
    // reaped
    long readerVersion;

    private CleanupKey(IndexShard indexShard, long readerVersion) {
      this.indexShard = indexShard;
      this.readerVersion = readerVersion;
    }

    @Override
    public void onClose(IndexReader reader) {
      Boolean remove = registeredClosedListeners.remove(this);
      if (remove != null) {
        keysToClean.add(this);
      }
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) return true;
      if (o == null || getClass() != o.getClass()) return false;
      CleanupKey that = (CleanupKey) o;
      if (readerVersion != that.readerVersion) return false;
      if (!indexShard.equals(that.indexShard)) return false;
      return true;
    }

    @Override
    public int hashCode() {
      int result = indexShard.hashCode();
      result = 31 * result + (int) (readerVersion ^ (readerVersion >>> 32));
      return result;
    }
  }

  private class Reaper implements Runnable {

    private final ObjectSet<CleanupKey> currentKeysToClean = new ObjectHashSet<>();
    private final ObjectSet<IndexShard> currentFullClean = new ObjectHashSet<>();

    private volatile boolean closed;

    void close() {
      closed = true;
    }

    @Override
    public void run() {
      if (closed) {
        return;
      }
      if (keysToClean.isEmpty()) {
        schedule();
        return;
      }
      try {
        threadPool
            .executor(ThreadPool.Names.GENERIC)
            .execute(
                new Runnable() {
                  @Override
                  public void run() {
                    reap();
                    schedule();
                  }
                });
      } catch (EsRejectedExecutionException ex) {
        logger.debug("Can not run ReaderCleaner - execution rejected", ex);
      }
    }

    private void schedule() {
      try {
        threadPool.schedule(cleanInterval, ThreadPool.Names.SAME, this);
      } catch (EsRejectedExecutionException ex) {
        logger.debug("Can not schedule ReaderCleaner - execution rejected", ex);
      }
    }

    synchronized void reap() {
      currentKeysToClean.clear();
      currentFullClean.clear();
      for (Iterator<CleanupKey> iterator = keysToClean.iterator(); iterator.hasNext(); ) {
        CleanupKey cleanupKey = iterator.next();
        iterator.remove();
        if (cleanupKey.readerVersion == -1
            || cleanupKey.indexShard.state() == IndexShardState.CLOSED) {
          // -1 indicates full cleanup, as does a closed shard
          currentFullClean.add(cleanupKey.indexShard);
        } else {
          currentKeysToClean.add(cleanupKey);
        }
      }

      if (!currentKeysToClean.isEmpty() || !currentFullClean.isEmpty()) {
        CleanupKey lookupKey = new CleanupKey(null, -1);
        for (Iterator<Key> iterator = cache.asMap().keySet().iterator(); iterator.hasNext(); ) {
          Key key = iterator.next();
          if (currentFullClean.contains(key.shard)) {
            iterator.remove();
          } else {
            lookupKey.indexShard = key.shard;
            lookupKey.readerVersion = key.readerVersion;
            if (currentKeysToClean.contains(lookupKey)) {
              iterator.remove();
            }
          }
        }
      }

      cache.cleanUp();
      currentKeysToClean.clear();
      currentFullClean.clear();
    }
  }

  private static Key buildKey(ShardSearchRequest request, SearchContext context) throws Exception {
    // TODO: for now, this will create different keys for different JSON order
    // TODO: tricky to get around this, need to parse and order all, which can be expensive
    return new Key(
        context.indexShard(),
        ((DirectoryReader) context.searcher().getIndexReader()).getVersion(),
        request.cacheKey());
  }
}
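A note on the pattern above: the weigher plus the load-through get lean on Guava's cache semantics. cache.get(key, loader) guarantees that concurrent callers with the same key block on a single loader execution and then share its result, while the weigher lets maximumWeight cap the cache by bytes rather than by entry count. A minimal, self-contained sketch of that pattern (all names below are illustrative, not from this codebase):

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.Weigher;

import java.util.concurrent.Callable;

public class WeighedLoadingCacheSketch {

  public static void main(String[] args) throws Exception {
    // weigh entries by payload size so maximumWeight caps total bytes, not entry count
    Cache<String, byte[]> cache =
        CacheBuilder.newBuilder()
            .maximumWeight(1024 * 1024) // ~1 MiB budget
            .weigher(
                new Weigher<String, byte[]>() {
                  @Override
                  public int weigh(String key, byte[] value) {
                    return key.length() + value.length;
                  }
                })
            .build();

    // concurrent get(key, loader) calls for the same key run the loader only once;
    // the other callers block and then reuse the freshly cached value
    byte[] value =
        cache.get(
            "query-hash",
            new Callable<byte[]>() {
              @Override
              public byte[] call() {
                return new byte[512]; // stand-in for a serialized query result
              }
            });
    System.out.println("cached " + value.length + " bytes");
  }
}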
 // we do our best to return the shard failures, but it's ok if it's not fully concurrently safe;
 // we simply try to return as much as possible
 protected final void addShardFailure(ShardSearchFailure failure) {
   if (shardFailures == null) {
     shardFailures = ConcurrentCollections.newQueue();
   }
   shardFailures.add(failure);
 }
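As the comment above says, the lazy initialization is deliberately tolerant of races: two threads may both observe a null shardFailures, and the failures added to the losing queue are dropped. If that trade-off were ever unacceptable, a compareAndSet-based variant closes the gap. A sketch with illustrative names, not from this codebase:

import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicReference;

public class FailureCollectorSketch {

  private final AtomicReference<Queue<String>> failuresRef = new AtomicReference<>();

  // race-free lazy init: losers of the compareAndSet reuse the winner's queue,
  // so no failure can be dropped between the null check and the add
  void addFailure(String failure) {
    Queue<String> queue = failuresRef.get();
    if (queue == null) {
      failuresRef.compareAndSet(null, new ConcurrentLinkedQueue<String>());
      queue = failuresRef.get();
    }
    queue.add(failure);
  }
}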
/**
 * A node level registry of analyzers, to be reused by different indices which use default
 * analyzers.
 *
 * @author kimchy (shay.banon)
 */
public class IndicesAnalysisService extends AbstractComponent {

  private final Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories =
      ConcurrentCollections.newConcurrentMap();

  public IndicesAnalysisService() {
    super(EMPTY_SETTINGS);
  }

  @Inject
  public IndicesAnalysisService(Settings settings) {
    super(settings);

    analyzerProviderFactories.put(
        "standard",
        new PreBuiltAnalyzerProviderFactory(
            "standard", AnalyzerScope.INDICES, new StandardAnalyzer(Lucene.ANALYZER_VERSION)));
    analyzerProviderFactories.put(
        "keyword",
        new PreBuiltAnalyzerProviderFactory(
            "keyword", AnalyzerScope.INDICES, new KeywordAnalyzer()));
    analyzerProviderFactories.put(
        "stop",
        new PreBuiltAnalyzerProviderFactory(
            "stop", AnalyzerScope.INDICES, new StopAnalyzer(Lucene.ANALYZER_VERSION)));
    analyzerProviderFactories.put(
        "whitespace",
        new PreBuiltAnalyzerProviderFactory(
            "whitespace", AnalyzerScope.INDICES, new WhitespaceAnalyzer()));
    analyzerProviderFactories.put(
        "simple",
        new PreBuiltAnalyzerProviderFactory("simple", AnalyzerScope.INDICES, new SimpleAnalyzer()));

    // extended ones
    analyzerProviderFactories.put(
        "arabic",
        new PreBuiltAnalyzerProviderFactory(
            "arabic", AnalyzerScope.INDICES, new ArabicAnalyzer(Lucene.ANALYZER_VERSION)));
    analyzerProviderFactories.put(
        "brazilian",
        new PreBuiltAnalyzerProviderFactory(
            "brazilian", AnalyzerScope.INDICES, new BrazilianAnalyzer(Lucene.ANALYZER_VERSION)));
    analyzerProviderFactories.put(
        "chinese",
        new PreBuiltAnalyzerProviderFactory(
            "chinese", AnalyzerScope.INDICES, new ChineseAnalyzer()));
    analyzerProviderFactories.put(
        "cjk",
        new PreBuiltAnalyzerProviderFactory("cjk", AnalyzerScope.INDICES, new ChineseAnalyzer()));
    analyzerProviderFactories.put(
        "czech",
        new PreBuiltAnalyzerProviderFactory(
            "czech", AnalyzerScope.INDICES, new CzechAnalyzer(Lucene.ANALYZER_VERSION)));
    analyzerProviderFactories.put(
        "dutch",
        new PreBuiltAnalyzerProviderFactory(
            "dutch", AnalyzerScope.INDICES, new DutchAnalyzer(Lucene.ANALYZER_VERSION)));
    analyzerProviderFactories.put(
        "french",
        new PreBuiltAnalyzerProviderFactory(
            "french", AnalyzerScope.INDICES, new FrenchAnalyzer(Lucene.ANALYZER_VERSION)));
    analyzerProviderFactories.put(
        "german",
        new PreBuiltAnalyzerProviderFactory(
            "german", AnalyzerScope.INDICES, new GermanAnalyzer(Lucene.ANALYZER_VERSION)));
    analyzerProviderFactories.put(
        "greek",
        new PreBuiltAnalyzerProviderFactory(
            "greek", AnalyzerScope.INDICES, new GreekAnalyzer(Lucene.ANALYZER_VERSION)));
    analyzerProviderFactories.put(
        "persian",
        new PreBuiltAnalyzerProviderFactory(
            "persian", AnalyzerScope.INDICES, new PersianAnalyzer(Lucene.ANALYZER_VERSION)));
    analyzerProviderFactories.put(
        "russian",
        new PreBuiltAnalyzerProviderFactory(
            "russian", AnalyzerScope.INDICES, new RussianAnalyzer(Lucene.ANALYZER_VERSION)));
    analyzerProviderFactories.put(
        "thai",
        new PreBuiltAnalyzerProviderFactory(
            "thai", AnalyzerScope.INDICES, new ThaiAnalyzer(Lucene.ANALYZER_VERSION)));
  }

  public PreBuiltAnalyzerProviderFactory analyzerProviderFactory(String name) {
    return analyzerProviderFactories.get(name);
  }

  public boolean hasAnalyzer(String name) {
    return analyzer(name) != null;
  }

  public Analyzer analyzer(String name) {
    PreBuiltAnalyzerProviderFactory analyzerProviderFactory = analyzerProviderFactory(name);
    if (analyzerProviderFactory == null) {
      return null;
    }
    return analyzerProviderFactory.analyzer();
  }

  public void close() {
    for (PreBuiltAnalyzerProviderFactory analyzerProviderFactory :
        analyzerProviderFactories.values()) {
      analyzerProviderFactory.analyzer().close();
    }
  }
}
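Callers typically look up one of these shared, node-level analyzers by name and run text through it. The sketch below shows the standard Lucene tokenization loop (reset, incrementToken, end, close), assuming the usual TokenStream contract; the class and method names are illustrative:

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalysisSketch {

  // prints the tokens an analyzer produces for the given text
  static void printTokens(Analyzer analyzer, String text) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString());
    }
    stream.end();
    stream.close();
  }
}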
public class LocalGatewayMetaState extends AbstractComponent implements ClusterStateListener {

  static enum AutoImportDangledState {
    NO() {
      @Override
      public boolean shouldImport() {
        return false;
      }
    },
    YES() {
      @Override
      public boolean shouldImport() {
        return true;
      }
    },
    CLOSED() {
      @Override
      public boolean shouldImport() {
        return true;
      }
    };

    public abstract boolean shouldImport();

    public static AutoImportDangledState fromString(String value) {
      if ("no".equalsIgnoreCase(value)) {
        return NO;
      } else if ("yes".equalsIgnoreCase(value)) {
        return YES;
      } else if ("closed".equalsIgnoreCase(value)) {
        return CLOSED;
      } else {
        throw new ElasticSearchIllegalArgumentException(
            "failed to parse [" + value + "], not a valid auto dangling import type");
      }
    }
  }

  private final NodeEnvironment nodeEnv;
  private final ThreadPool threadPool;

  private final LocalAllocateDangledIndices allocateDangledIndices;

  @Nullable private volatile MetaData currentMetaData;

  private final XContentType format;
  private final ToXContent.Params formatParams;

  private final AutoImportDangledState autoImportDangled;
  private final TimeValue danglingTimeout;
  private final Map<String, DanglingIndex> danglingIndices =
      ConcurrentCollections.newConcurrentMap();
  private final Object danglingMutex = new Object();

  @Inject
  public LocalGatewayMetaState(
      Settings settings,
      ThreadPool threadPool,
      NodeEnvironment nodeEnv,
      TransportNodesListGatewayMetaState nodesListGatewayMetaState,
      LocalAllocateDangledIndices allocateDangledIndices)
      throws Exception {
    super(settings);
    this.nodeEnv = nodeEnv;
    this.threadPool = threadPool;
    this.format = XContentType.fromRestContentType(settings.get("format", "smile"));
    this.allocateDangledIndices = allocateDangledIndices;
    nodesListGatewayMetaState.init(this);

    if (this.format == XContentType.SMILE) {
      Map<String, String> params = Maps.newHashMap();
      params.put("binary", "true");
      formatParams = new ToXContent.MapParams(params);
    } else {
      formatParams = ToXContent.EMPTY_PARAMS;
    }

    this.autoImportDangled =
        AutoImportDangledState.fromString(
            settings.get(
                "gateway.local.auto_import_dangled", AutoImportDangledState.YES.toString()));
    this.danglingTimeout =
        settings.getAsTime("gateway.local.dangling_timeout", TimeValue.timeValueHours(2));

    logger.debug(
        "using gateway.local.auto_import_dangled [{}], with gateway.local.dangling_timeout [{}]",
        this.autoImportDangled,
        this.danglingTimeout);

    if (DiscoveryNode.masterNode(settings)) {
      try {
        pre019Upgrade();
        long start = System.currentTimeMillis();
        loadState();
        logger.debug(
            "took {} to load state", TimeValue.timeValueMillis(System.currentTimeMillis() - start));
      } catch (Exception e) {
        logger.error("failed to read local state, exiting...", e);
        throw e;
      }
    }
  }

  public MetaData loadMetaState() throws Exception {
    return loadState();
  }

  public boolean isDangling(String index) {
    return danglingIndices.containsKey(index);
  }

  @Override
  public void clusterChanged(ClusterChangedEvent event) {
    if (event.state().blocks().disableStatePersistence()) {
      // reset the current metadata, we need to start fresh...
      this.currentMetaData = null;
      return;
    }

    MetaData newMetaData = event.state().metaData();
    // we don't check if metaData changed, since we might be called several times and we need to
    // check dangling...

    boolean success = true;
    // only applied to master node, writing the global and index level states
    if (event.state().nodes().localNode().masterNode()) {
      // check if the global state changed
      if (currentMetaData == null || !MetaData.isGlobalStateEquals(currentMetaData, newMetaData)) {
        try {
          writeGlobalState("changed", newMetaData, currentMetaData);
        } catch (Exception e) {
          success = false;
        }
      }

      // check and write changes in indices
      for (IndexMetaData indexMetaData : newMetaData) {
        String writeReason = null;
        IndexMetaData currentIndexMetaData;
        if (currentMetaData == null) {
          // a new event, check against the stored state
          currentIndexMetaData = loadIndex(indexMetaData.index());
        } else {
          currentIndexMetaData = currentMetaData.index(indexMetaData.index());
        }
        if (currentIndexMetaData == null) {
          writeReason = "freshly created";
        } else if (currentIndexMetaData.version() != indexMetaData.version()) {
          writeReason =
              "version changed from ["
                  + currentIndexMetaData.version()
                  + "] to ["
                  + indexMetaData.version()
                  + "]";
        }

        // we update the writeReason only if we really need to write it
        if (writeReason == null) {
          continue;
        }

        try {
          writeIndex(writeReason, indexMetaData, currentIndexMetaData);
        } catch (Exception e) {
          success = false;
        }
      }
    }

    // delete indices that were there before, but are deleted now
    // we need to do it so they won't be detected as dangling
    if (nodeEnv.hasNodeFile()) {
      if (currentMetaData != null) {
        // only delete indices when we already received a state (currentMetaData != null)
        // and we had a go at processing dangling indices at least once
        // this will also delete the _state of the index itself
        for (IndexMetaData current : currentMetaData) {
          if (danglingIndices.containsKey(current.index())) {
            continue;
          }
          if (!newMetaData.hasIndex(current.index())) {
            logger.debug(
                "[{}] deleting index that is no longer part of the metadata (indices: [{}])",
                current.index(),
                newMetaData.indices().keySet());
            FileSystemUtils.deleteRecursively(nodeEnv.indexLocations(new Index(current.index())));
          }
        }
      }
    }

    // handle dangling indices, we handle those for all nodes that have a node file (data or master)
    if (nodeEnv.hasNodeFile()) {
      if (danglingTimeout.millis() >= 0) {
        synchronized (danglingMutex) {
          for (String danglingIndex : danglingIndices.keySet()) {
            if (newMetaData.hasIndex(danglingIndex)) {
              logger.debug("[{}] no longer dangling (created), removing", danglingIndex);
              DanglingIndex removed = danglingIndices.remove(danglingIndex);
              removed.future.cancel(false);
            }
          }
          // delete indices that are no longer part of the metadata
          try {
            for (String indexName : nodeEnv.findAllIndices()) {
              // if we have the index on the metadata, don't delete it
              if (newMetaData.hasIndex(indexName)) {
                continue;
              }
              if (danglingIndices.containsKey(indexName)) {
                // already dangling, continue
                continue;
              }
              IndexMetaData indexMetaData = loadIndex(indexName);
              if (indexMetaData != null) {
                if (danglingTimeout.millis() == 0) {
                  logger.info(
                      "[{}] dangling index, exists on local file system, but not in cluster metadata, timeout set to 0, deleting now",
                      indexName);
                  FileSystemUtils.deleteRecursively(nodeEnv.indexLocations(new Index(indexName)));
                } else {
                  logger.info(
                      "[{}] dangling index, exists on local file system, but not in cluster metadata, scheduling to delete in [{}], auto import to cluster state [{}]",
                      indexName,
                      danglingTimeout,
                      autoImportDangled);
                  danglingIndices.put(
                      indexName,
                      new DanglingIndex(
                          indexName,
                          threadPool.schedule(
                              danglingTimeout,
                              ThreadPool.Names.SAME,
                              new RemoveDanglingIndex(indexName))));
                }
              }
            }
          } catch (Exception e) {
            logger.warn("failed to find dangling indices", e);
          }
        }
      }
      if (autoImportDangled.shouldImport() && !danglingIndices.isEmpty()) {
        final List<IndexMetaData> dangled = Lists.newArrayList();
        for (String indexName : danglingIndices.keySet()) {
          IndexMetaData indexMetaData = loadIndex(indexName);
          if (indexMetaData == null) {
            logger.debug("failed to find state for dangling index [{}]", indexName);
            continue;
          }
          // we might have someone copying over an index, renaming the directory, handle that
          if (!indexMetaData.index().equals(indexName)) {
            logger.info(
                "dangled index directory name is [{}], state name is [{}], renaming to directory name",
                indexName,
                indexMetaData.index());
            indexMetaData =
                IndexMetaData.newIndexMetaDataBuilder(indexMetaData).index(indexName).build();
          }
          if (autoImportDangled == AutoImportDangledState.CLOSED) {
            indexMetaData =
                IndexMetaData.newIndexMetaDataBuilder(indexMetaData)
                    .state(IndexMetaData.State.CLOSE)
                    .build();
          }
          if (indexMetaData != null) {
            dangled.add(indexMetaData);
          }
        }
        IndexMetaData[] dangledIndices = dangled.toArray(new IndexMetaData[dangled.size()]);
        try {
          allocateDangledIndices.allocateDangled(
              dangledIndices,
              new LocalAllocateDangledIndices.Listener() {
                @Override
                public void onResponse(
                    LocalAllocateDangledIndices.AllocateDangledResponse response) {
                  logger.trace("allocated dangled");
                }

                @Override
                public void onFailure(Throwable e) {
                  logger.info("failed to send allocated dangled", e);
                }
              });
        } catch (Exception e) {
          logger.warn("failed to send allocate dangled", e);
        }
      }
    }

    if (success) {
      currentMetaData = newMetaData;
    }
  }

  private void deleteIndex(String index) {
    logger.trace("[{}] delete index state", index);
    File[] indexLocations = nodeEnv.indexLocations(new Index(index));
    for (File indexLocation : indexLocations) {
      if (!indexLocation.exists()) {
        continue;
      }
      FileSystemUtils.deleteRecursively(new File(indexLocation, "_state"));
    }
  }

  private void writeIndex(
      String reason, IndexMetaData indexMetaData, @Nullable IndexMetaData previousIndexMetaData)
      throws Exception {
    logger.trace("[{}] writing state, reason [{}]", indexMetaData.index(), reason);
    XContentBuilder builder = XContentFactory.contentBuilder(format, new BytesStreamOutput());
    builder.startObject();
    IndexMetaData.Builder.toXContent(indexMetaData, builder, formatParams);
    builder.endObject();
    builder.flush();

    String stateFileName = "state-" + indexMetaData.version();
    Exception lastFailure = null;
    boolean wroteAtLeastOnce = false;
    for (File indexLocation : nodeEnv.indexLocations(new Index(indexMetaData.index()))) {
      File stateLocation = new File(indexLocation, "_state");
      FileSystemUtils.mkdirs(stateLocation);
      File stateFile = new File(stateLocation, stateFileName);

      FileOutputStream fos = null;
      try {
        fos = new FileOutputStream(stateFile);
        BytesReference bytes = builder.bytes();
        fos.write(bytes.array(), bytes.arrayOffset(), bytes.length());
        fos.getChannel().force(true);
        fos.close();
        wroteAtLeastOnce = true;
      } catch (Exception e) {
        lastFailure = e;
      } finally {
        IOUtils.closeWhileHandlingException(fos);
      }
    }

    if (!wroteAtLeastOnce) {
      logger.warn("[{}]: failed to state", lastFailure, indexMetaData.index());
      throw new IOException(
          "failed to write state for [" + indexMetaData.index() + "]", lastFailure);
    }

    // delete the old files
    if (previousIndexMetaData != null
        && previousIndexMetaData.version() != indexMetaData.version()) {
      for (File indexLocation : nodeEnv.indexLocations(new Index(indexMetaData.index()))) {
        File[] files = new File(indexLocation, "_state").listFiles();
        if (files == null) {
          continue;
        }
        for (File file : files) {
          if (!file.getName().startsWith("state-")) {
            continue;
          }
          if (file.getName().equals(stateFileName)) {
            continue;
          }
          file.delete();
        }
      }
    }
  }

  private void writeGlobalState(
      String reason, MetaData metaData, @Nullable MetaData previousMetaData) throws Exception {
    logger.trace("[_global] writing state, reason [{}]", reason);
    // create metadata to write with just the global state
    MetaData globalMetaData = MetaData.builder().metaData(metaData).removeAllIndices().build();

    XContentBuilder builder = XContentFactory.contentBuilder(format);
    builder.startObject();
    MetaData.Builder.toXContent(globalMetaData, builder, formatParams);
    builder.endObject();
    builder.flush();

    String globalFileName = "global-" + globalMetaData.version();
    Exception lastFailure = null;
    boolean wroteAtLeastOnce = false;
    for (File dataLocation : nodeEnv.nodeDataLocations()) {
      File stateLocation = new File(dataLocation, "_state");
      FileSystemUtils.mkdirs(stateLocation);
      File stateFile = new File(stateLocation, globalFileName);

      FileOutputStream fos = null;
      try {
        fos = new FileOutputStream(stateFile);
        BytesReference bytes = builder.bytes();
        fos.write(bytes.array(), bytes.arrayOffset(), bytes.length());
        fos.getChannel().force(true);
        fos.close();
        wroteAtLeastOnce = true;
      } catch (Exception e) {
        lastFailure = e;
      } finally {
        IOUtils.closeWhileHandlingException(fos);
      }
    }

    if (!wroteAtLeastOnce) {
      logger.warn("[_global]: failed to write global state", lastFailure);
      throw new IOException("failed to write global state", lastFailure);
    }

    // delete the old files
    for (File dataLocation : nodeEnv.nodeDataLocations()) {
      File[] files = new File(dataLocation, "_state").listFiles();
      if (files == null) {
        continue;
      }
      for (File file : files) {
        if (!file.getName().startsWith("global-")) {
          continue;
        }
        if (file.getName().equals(globalFileName)) {
          continue;
        }
        file.delete();
      }
    }
  }

  private MetaData loadState() throws Exception {
    MetaData.Builder metaDataBuilder = MetaData.builder();
    MetaData globalMetaData = loadGlobalState();
    if (globalMetaData != null) {
      metaDataBuilder.metaData(globalMetaData);
    }

    Set<String> indices = nodeEnv.findAllIndices();
    for (String index : indices) {
      IndexMetaData indexMetaData = loadIndex(index);
      if (indexMetaData == null) {
        logger.debug("[{}] failed to find metadata for existing index location", index);
      } else {
        metaDataBuilder.put(indexMetaData, false);
      }
    }
    return metaDataBuilder.build();
  }

  @Nullable
  private IndexMetaData loadIndex(String index) {
    long highestVersion = -1;
    IndexMetaData indexMetaData = null;
    for (File indexLocation : nodeEnv.indexLocations(new Index(index))) {
      File stateDir = new File(indexLocation, "_state");
      if (!stateDir.exists() || !stateDir.isDirectory()) {
        continue;
      }
      // now, iterate over the current versions, and find latest one
      File[] stateFiles = stateDir.listFiles();
      if (stateFiles == null) {
        continue;
      }
      for (File stateFile : stateFiles) {
        if (!stateFile.getName().startsWith("state-")) {
          continue;
        }
        try {
          long version = Long.parseLong(stateFile.getName().substring("state-".length()));
          if (version > highestVersion) {
            byte[] data = Streams.copyToByteArray(new FileInputStream(stateFile));
            if (data.length == 0) {
              logger.debug(
                  "[{}]: no data for [" + stateFile.getAbsolutePath() + "], ignoring...", index);
              continue;
            }
            XContentParser parser = null;
            try {
              parser = XContentHelper.createParser(data, 0, data.length);
              parser.nextToken(); // move to START_OBJECT
              indexMetaData = IndexMetaData.Builder.fromXContent(parser);
              highestVersion = version;
            } finally {
              if (parser != null) {
                parser.close();
              }
            }
          }
        } catch (Exception e) {
          logger.debug(
              "[{}]: failed to read [" + stateFile.getAbsolutePath() + "], ignoring...", e, index);
        }
      }
    }
    return indexMetaData;
  }

  private MetaData loadGlobalState() {
    long highestVersion = -1;
    MetaData metaData = null;
    for (File dataLocation : nodeEnv.nodeDataLocations()) {
      File stateLocation = new File(dataLocation, "_state");
      if (!stateLocation.exists()) {
        continue;
      }
      File[] stateFiles = stateLocation.listFiles();
      if (stateFiles == null) {
        continue;
      }
      for (File stateFile : stateFiles) {
        String name = stateFile.getName();
        if (!name.startsWith("global-")) {
          continue;
        }
        try {
          long version = Long.parseLong(stateFile.getName().substring("global-".length()));
          if (version > highestVersion) {
            byte[] data = Streams.copyToByteArray(new FileInputStream(stateFile));
            if (data.length == 0) {
              logger.debug(
                  "[_global] no data for [" + stateFile.getAbsolutePath() + "], ignoring...");
              continue;
            }

            XContentParser parser = null;
            try {
              parser = XContentHelper.createParser(data, 0, data.length);
              metaData = MetaData.Builder.fromXContent(parser);
              highestVersion = version;
            } finally {
              if (parser != null) {
                parser.close();
              }
            }
          }
        } catch (Exception e) {
          logger.debug("failed to load global state from [{}]", e, stateFile.getAbsolutePath());
        }
      }
    }

    return metaData;
  }

  private void pre019Upgrade() throws Exception {
    long index = -1;
    File metaDataFile = null;
    MetaData metaData = null;
    long version = -1;
    for (File dataLocation : nodeEnv.nodeDataLocations()) {
      File stateLocation = new File(dataLocation, "_state");
      if (!stateLocation.exists()) {
        continue;
      }
      File[] stateFiles = stateLocation.listFiles();
      if (stateFiles == null) {
        continue;
      }
      for (File stateFile : stateFiles) {
        if (logger.isTraceEnabled()) {
          logger.trace("[upgrade]: processing [" + stateFile.getName() + "]");
        }
        String name = stateFile.getName();
        if (!name.startsWith("metadata-")) {
          continue;
        }
        long fileIndex = Long.parseLong(name.substring(name.indexOf('-') + 1));
        if (fileIndex >= index) {
          // try and read the meta data
          try {
            byte[] data = Streams.copyToByteArray(new FileInputStream(stateFile));
            if (data.length == 0) {
              continue;
            }
            XContentParser parser = XContentHelper.createParser(data, 0, data.length);
            try {
              String currentFieldName = null;
              XContentParser.Token token = parser.nextToken();
              if (token != null) {
                while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
                  if (token == XContentParser.Token.FIELD_NAME) {
                    currentFieldName = parser.currentName();
                  } else if (token == XContentParser.Token.START_OBJECT) {
                    if ("meta-data".equals(currentFieldName)) {
                      metaData = MetaData.Builder.fromXContent(parser);
                    }
                  } else if (token.isValue()) {
                    if ("version".equals(currentFieldName)) {
                      version = parser.longValue();
                    }
                  }
                }
              }
            } finally {
              parser.close();
            }
            index = fileIndex;
            metaDataFile = stateFile;
          } catch (IOException e) {
            logger.warn("failed to read pre 0.19 state from [" + name + "], ignoring...", e);
          }
        }
      }
    }
    if (metaData == null) {
      return;
    }

    logger.info(
        "found old metadata state, loading metadata from [{}] and converting to new metadata location and strucutre...",
        metaDataFile.getAbsolutePath());

    writeGlobalState(
        "upgrade", MetaData.builder().metaData(metaData).version(version).build(), null);
    for (IndexMetaData indexMetaData : metaData) {
      IndexMetaData.Builder indexMetaDataBuilder =
          IndexMetaData.newIndexMetaDataBuilder(indexMetaData).version(version);
      // set the created version to 0.18
      indexMetaDataBuilder.settings(
          ImmutableSettings.settingsBuilder()
              .put(indexMetaData.settings())
              .put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_0_18_0));
      writeIndex("upgrade", indexMetaDataBuilder.build(), null);
    }

    // rename the old metadata state file to a backup file
    File backupFile = new File(metaDataFile.getParentFile(), "backup-" + metaDataFile.getName());
    if (!metaDataFile.renameTo(backupFile)) {
      throw new IOException(
          "failed to rename old state to backup state [" + metaDataFile.getAbsolutePath() + "]");
    }

    // delete all other old metadata state files
    for (File dataLocation : nodeEnv.nodeDataLocations()) {
      File stateLocation = new File(dataLocation, "_state");
      if (!stateLocation.exists()) {
        continue;
      }
      File[] stateFiles = stateLocation.listFiles();
      if (stateFiles == null) {
        continue;
      }
      for (File stateFile : stateFiles) {
        String name = stateFile.getName();
        if (!name.startsWith("metadata-")) {
          continue;
        }
        stateFile.delete();
      }
    }

    logger.info(
        "conversion to new metadata location and format done, backup create at [{}]",
        backupFile.getAbsolutePath());
  }

  class RemoveDanglingIndex implements Runnable {

    private final String index;

    RemoveDanglingIndex(String index) {
      this.index = index;
    }

    @Override
    public void run() {
      synchronized (danglingMutex) {
        DanglingIndex remove = danglingIndices.remove(index);
        // no longer there...
        if (remove == null) {
          return;
        }
        logger.info("[{}] deleting dangling index", index);
        FileSystemUtils.deleteRecursively(nodeEnv.indexLocations(new Index(index)));
      }
    }
  }

  static class DanglingIndex {
    public final String index;
    public final ScheduledFuture future;

    DanglingIndex(String index, ScheduledFuture future) {
      this.index = index;
      this.future = future;
    }
  }
}
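loadIndex() and loadGlobalState() above implement the same versioned-file scan: parse the numeric suffix of each state file name and keep the highest version whose contents parse cleanly. A standalone sketch of just the selection step (names are illustrative):

import java.io.File;

public class StateFileSketch {

  // among files named "state-<version>", return the one with the highest parseable version
  static File latestStateFile(File stateDir) {
    File[] files = stateDir.listFiles();
    if (files == null) {
      return null;
    }
    long highestVersion = -1;
    File latest = null;
    for (File file : files) {
      String name = file.getName();
      if (!name.startsWith("state-")) {
        continue;
      }
      try {
        long version = Long.parseLong(name.substring("state-".length()));
        if (version > highestVersion) {
          highestVersion = version;
          latest = file;
        }
      } catch (NumberFormatException e) {
        // ignore files with a malformed version suffix
      }
    }
    return latest;
  }
}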
    @Override
    protected void doSample() {
      // the nodes we are going to ping include the core listed nodes that were added
      // and the last round of discovered nodes
      Set<DiscoveryNode> nodesToPing = Sets.newHashSet();
      for (DiscoveryNode node : listedNodes) {
        nodesToPing.add(node);
      }
      for (DiscoveryNode node : nodes) {
        nodesToPing.add(node);
      }

      final CountDownLatch latch = new CountDownLatch(nodesToPing.size());
      final ConcurrentMap<DiscoveryNode, ClusterStateResponse> clusterStateResponses =
          ConcurrentCollections.newConcurrentMap();
      for (final DiscoveryNode listedNode : nodesToPing) {
        threadPool
            .executor(ThreadPool.Names.MANAGEMENT)
            .execute(
                new Runnable() {
                  @Override
                  public void run() {
                    try {
                      if (!transportService.nodeConnected(listedNode)) {
                        try {

                          // if it's one of the actual nodes we will talk to, not just a listed
                          // node, fully connect
                          if (nodes.contains(listedNode)) {
                            logger.trace("connecting to cluster node [{}]", listedNode);
                            transportService.connectToNode(listedNode);
                          } else {
                            // it's a listed node, light connect to it...
                            logger.trace("connecting to listed node (light) [{}]", listedNode);
                            transportService.connectToNodeLight(listedNode);
                          }
                        } catch (Exception e) {
                          logger.debug(
                              "failed to connect to node [{}], ignoring...", e, listedNode);
                          latch.countDown();
                          return;
                        }
                      }
                      transportService.sendRequest(
                          listedNode,
                          ClusterStateAction.NAME,
                          headers.applyTo(
                              Requests.clusterStateRequest().clear().nodes(true).local(true)),
                          TransportRequestOptions.options()
                              .withType(TransportRequestOptions.Type.STATE)
                              .withTimeout(pingTimeout),
                          new BaseTransportResponseHandler<ClusterStateResponse>() {

                            @Override
                            public ClusterStateResponse newInstance() {
                              return new ClusterStateResponse();
                            }

                            @Override
                            public String executor() {
                              return ThreadPool.Names.SAME;
                            }

                            @Override
                            public void handleResponse(ClusterStateResponse response) {
                              clusterStateResponses.put(listedNode, response);
                              latch.countDown();
                            }

                            @Override
                            public void handleException(TransportException e) {
                              logger.info(
                                  "failed to get local cluster state for {}, disconnecting...",
                                  e,
                                  listedNode);
                              transportService.disconnectFromNode(listedNode);
                              latch.countDown();
                            }
                          });
                    } catch (Throwable e) {
                      logger.info(
                          "failed to get local cluster state info for {}, disconnecting...",
                          e,
                          listedNode);
                      transportService.disconnectFromNode(listedNode);
                      latch.countDown();
                    }
                  }
                });
      }

      try {
        latch.await();
      } catch (InterruptedException e) {
        return;
      }

      HashSet<DiscoveryNode> newNodes = new HashSet<>();
      HashSet<DiscoveryNode> newFilteredNodes = new HashSet<>();
      for (Map.Entry<DiscoveryNode, ClusterStateResponse> entry :
          clusterStateResponses.entrySet()) {
        if (!ignoreClusterName && !clusterName.equals(entry.getValue().getClusterName())) {
          logger.warn(
              "node {} not part of the cluster {}, ignoring...",
              entry.getValue().getState().nodes().localNode(),
              clusterName);
          newFilteredNodes.add(entry.getKey());
          continue;
        }
        for (ObjectCursor<DiscoveryNode> cursor :
            entry.getValue().getState().nodes().dataNodes().values()) {
          newNodes.add(cursor.value);
        }
      }

      nodes = validateNewNodes(newNodes);
      filteredNodes = ImmutableList.copyOf(newFilteredNodes);
    }
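doSample() above uses a classic fan-out/fan-in shape: one async probe per node, results collected into a concurrent map, and a CountDownLatch decremented on every path (success, connect failure, transport failure) so that await() can never hang on a missed decrement. A compact sketch of the same shape, with illustrative names:

import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;

public class FanOutSketch {

  // fires one async probe per target and waits for all of them to report
  static Map<String, String> probeAll(ExecutorService executor, List<String> targets)
      throws InterruptedException {
    final CountDownLatch latch = new CountDownLatch(targets.size());
    final Map<String, String> responses = new ConcurrentHashMap<>();
    for (final String target : targets) {
      executor.execute(
          new Runnable() {
            @Override
            public void run() {
              try {
                responses.put(target, "ok"); // stand-in for a transport round trip
              } finally {
                // count down on every outcome so await() always completes
                latch.countDown();
              }
            }
          });
    }
    latch.await();
    return responses;
  }
}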
Example #16
0
public class TransportService extends AbstractLifecycleComponent<TransportService> {

  private final AtomicBoolean started = new AtomicBoolean(false);
  protected final Transport transport;
  protected final ThreadPool threadPool;

  volatile ImmutableMap<String, TransportRequestHandler> serverHandlers = ImmutableMap.of();
  final Object serverHandlersMutex = new Object();

  final ConcurrentMapLong<RequestHolder> clientHandlers =
      ConcurrentCollections.newConcurrentMapLongWithAggressiveConcurrency();

  final AtomicLong requestIds = new AtomicLong();

  final CopyOnWriteArrayList<TransportConnectionListener> connectionListeners =
      new CopyOnWriteArrayList<>();

  // An LRU (we don't really care about concurrency here) that holds the latest timed-out
  // requests, so if they do show up, we can print more descriptive information about them
  final Map<Long, TimeoutInfoHolder> timeoutInfoHandlers =
      Collections.synchronizedMap(
          new LinkedHashMap<Long, TimeoutInfoHolder>(100, .75F, true) {
            protected boolean removeEldestEntry(Map.Entry eldest) {
              return size() > 100;
            }
          });

  private final TransportService.Adapter adapter = new Adapter();

  public TransportService(Transport transport, ThreadPool threadPool) {
    this(EMPTY_SETTINGS, transport, threadPool);
  }

  @Inject
  public TransportService(Settings settings, Transport transport, ThreadPool threadPool) {
    super(settings);
    this.transport = transport;
    this.threadPool = threadPool;
  }

  @Override
  protected void doStart() throws ElasticsearchException {
    adapter.rxMetric.clear();
    adapter.txMetric.clear();
    transport.transportServiceAdapter(adapter);
    transport.start();
    if (transport.boundAddress() != null && logger.isInfoEnabled()) {
      logger.info("{}", transport.boundAddress());
    }
    boolean setStarted = started.compareAndSet(false, true);
    assert setStarted : "service was already started";
  }

  @Override
  protected void doStop() throws ElasticsearchException {
    final boolean setStopped = started.compareAndSet(true, false);
    assert setStopped : "service has already been stopped";
    try {
      transport.stop();
    } finally {
      // in case the transport is not connected to our local node (thus cleaned on node disconnect)
      // make sure to clean any leftover ongoing handlers
      for (Map.Entry<Long, RequestHolder> entry : clientHandlers.entrySet()) {
        final RequestHolder holderToNotify = clientHandlers.remove(entry.getKey());
        if (holderToNotify != null) {
          // callback that an exception happened, but on a different thread since we don't
          // want handlers to worry about stack overflows
          threadPool
              .generic()
              .execute(
                  new Runnable() {
                    @Override
                    public void run() {
                      holderToNotify
                          .handler()
                          .handleException(
                              new TransportException(
                                  "transport stopped, action: " + holderToNotify.action()));
                    }
                  });
        }
      }
    }
  }

  @Override
  protected void doClose() throws ElasticsearchException {
    transport.close();
  }

  public boolean addressSupported(Class<? extends TransportAddress> address) {
    return transport.addressSupported(address);
  }

  public TransportInfo info() {
    BoundTransportAddress boundTransportAddress = boundAddress();
    if (boundTransportAddress == null) {
      return null;
    }
    return new TransportInfo(boundTransportAddress);
  }

  public TransportStats stats() {
    return new TransportStats(
        transport.serverOpen(),
        adapter.rxMetric.count(),
        adapter.rxMetric.sum(),
        adapter.txMetric.count(),
        adapter.txMetric.sum());
  }

  public BoundTransportAddress boundAddress() {
    return transport.boundAddress();
  }

  public boolean nodeConnected(DiscoveryNode node) {
    return transport.nodeConnected(node);
  }

  public void connectToNode(DiscoveryNode node) throws ConnectTransportException {
    transport.connectToNode(node);
  }

  public void connectToNodeLight(DiscoveryNode node) throws ConnectTransportException {
    transport.connectToNodeLight(node);
  }

  public void disconnectFromNode(DiscoveryNode node) {
    transport.disconnectFromNode(node);
  }

  public void addConnectionListener(TransportConnectionListener listener) {
    connectionListeners.add(listener);
  }

  public void removeConnectionListener(TransportConnectionListener listener) {
    connectionListeners.remove(listener);
  }

  public <T extends TransportResponse> TransportFuture<T> submitRequest(
      DiscoveryNode node,
      String action,
      TransportRequest request,
      TransportResponseHandler<T> handler)
      throws TransportException {
    return submitRequest(node, action, request, TransportRequestOptions.EMPTY, handler);
  }

  public <T extends TransportResponse> TransportFuture<T> submitRequest(
      DiscoveryNode node,
      String action,
      TransportRequest request,
      TransportRequestOptions options,
      TransportResponseHandler<T> handler)
      throws TransportException {
    PlainTransportFuture<T> futureHandler = new PlainTransportFuture<>(handler);
    sendRequest(node, action, request, options, futureHandler);
    return futureHandler;
  }

  public <T extends TransportResponse> void sendRequest(
      final DiscoveryNode node,
      final String action,
      final TransportRequest request,
      final TransportResponseHandler<T> handler) {
    sendRequest(node, action, request, TransportRequestOptions.EMPTY, handler);
  }

  public <T extends TransportResponse> void sendRequest(
      final DiscoveryNode node,
      final String action,
      final TransportRequest request,
      final TransportRequestOptions options,
      TransportResponseHandler<T> handler) {
    if (node == null) {
      throw new ElasticsearchIllegalStateException("can't send request to a null node");
    }
    final long requestId = newRequestId();
    TimeoutHandler timeoutHandler = null;
    try {
      if (options.timeout() != null) {
        // create the timeout handler before registering the holder, so the holder can cancel the
        // scheduled timeout when the response arrives
        timeoutHandler = new TimeoutHandler(requestId);
      }
      clientHandlers.put(requestId, new RequestHolder<>(handler, node, action, timeoutHandler));
      if (started.get() == false) {
        // if we are not started, the exception handling will remove the RequestHolder again and
        // call the handler to notify the caller; it will only notify if the doStop code hasn't
        // done the work yet.
        throw new TransportException("TransportService is stopped, can't send request");
      }
      if (timeoutHandler != null) {
        timeoutHandler.future =
            threadPool.schedule(options.timeout(), ThreadPool.Names.GENERIC, timeoutHandler);
      }
      transport.sendRequest(node, requestId, action, request, options);
    } catch (final Throwable e) {
      // usually happen either because we failed to connect to the node
      // or because we failed serializing the message
      final RequestHolder holderToNotify = clientHandlers.remove(requestId);
      // if the scheduler raised an EsRejectedExecutionException (due to shutdown), we may have a
      // timeout handler, but no future
      if (timeoutHandler != null) {
        FutureUtils.cancel(timeoutHandler.future);
      }

      // If holderToNotify == null then handler has already been taken care of.
      if (holderToNotify != null) {
        // callback that an exception happened, but on a different thread since we don't
        // want handlers to worry about stack overflows
        final SendRequestTransportException sendRequestException =
            new SendRequestTransportException(node, action, e);
        threadPool
            .executor(ThreadPool.Names.GENERIC)
            .execute(
                new Runnable() {
                  @Override
                  public void run() {
                    holderToNotify.handler().handleException(sendRequestException);
                  }
                });
      }
    }
  }

  private long newRequestId() {
    return requestIds.getAndIncrement();
  }

  public TransportAddress[] addressesFromString(String address) throws Exception {
    return transport.addressesFromString(address);
  }

  public void registerHandler(String action, TransportRequestHandler handler) {
    synchronized (serverHandlersMutex) {
      TransportRequestHandler handlerReplaced = serverHandlers.get(action);
      serverHandlers = MapBuilder.newMapBuilder(serverHandlers).put(action, handler).immutableMap();
      if (handlerReplaced != null) {
        logger.warn(
            "Registered two transport handlers for action {}, handlers: {}, {}",
            action,
            handler,
            handlerReplaced);
      }
    }
  }

  public void removeHandler(String action) {
    synchronized (serverHandlersMutex) {
      serverHandlers = MapBuilder.newMapBuilder(serverHandlers).remove(action).immutableMap();
    }
  }

  protected TransportRequestHandler getHandler(String action) {
    return serverHandlers.get(action);
  }

  class Adapter implements TransportServiceAdapter {

    final MeanMetric rxMetric = new MeanMetric();
    final MeanMetric txMetric = new MeanMetric();

    @Override
    public void received(long size) {
      rxMetric.inc(size);
    }

    @Override
    public void sent(long size) {
      txMetric.inc(size);
    }

    @Override
    public TransportRequestHandler handler(String action, Version version) {
      return serverHandlers.get(ActionNames.incomingAction(action, version));
    }

    @Override
    public TransportResponseHandler remove(long requestId) {
      RequestHolder holder = clientHandlers.remove(requestId);
      if (holder == null) {
        // let's see if it's in the timeout holder
        TimeoutInfoHolder timeoutInfoHolder = timeoutInfoHandlers.remove(requestId);
        if (timeoutInfoHolder != null) {
          long time = System.currentTimeMillis();
          logger.warn(
              "Received response for a request that has timed out, sent [{}ms] ago, timed out [{}ms] ago, action [{}], node [{}], id [{}]",
              time - timeoutInfoHolder.sentTime(),
              time - timeoutInfoHolder.timeoutTime(),
              timeoutInfoHolder.action(),
              timeoutInfoHolder.node(),
              requestId);
        } else {
          logger.warn("Transport response handler not found of id [{}]", requestId);
        }
        return null;
      }
      holder.cancel();
      return holder.handler();
    }

    @Override
    public void raiseNodeConnected(final DiscoveryNode node) {
      threadPool
          .generic()
          .execute(
              new Runnable() {
                @Override
                public void run() {
                  for (TransportConnectionListener connectionListener : connectionListeners) {
                    connectionListener.onNodeConnected(node);
                  }
                }
              });
    }

    @Override
    public void raiseNodeDisconnected(final DiscoveryNode node) {
      try {
        for (final TransportConnectionListener connectionListener : connectionListeners) {
          threadPool
              .generic()
              .execute(
                  new Runnable() {
                    @Override
                    public void run() {
                      connectionListener.onNodeDisconnected(node);
                    }
                  });
        }
        for (Map.Entry<Long, RequestHolder> entry : clientHandlers.entrySet()) {
          RequestHolder holder = entry.getValue();
          if (holder.node().equals(node)) {
            final RequestHolder holderToNotify = clientHandlers.remove(entry.getKey());
            if (holderToNotify != null) {
              // callback that an exception happened, but on a different thread since we don't
              // want handlers to worry about stack overflows
              threadPool
                  .generic()
                  .execute(
                      new Runnable() {
                        @Override
                        public void run() {
                          holderToNotify
                              .handler()
                              .handleException(
                                  new NodeDisconnectedException(node, holderToNotify.action()));
                        }
                      });
            }
          }
        }
      } catch (EsRejectedExecutionException ex) {
        logger.debug("Rejected execution on NodeDisconnected", ex);
      }
    }

    @Override
    public String action(String action, Version version) {
      return ActionNames.outgoingAction(action, version);
    }
  }

  class TimeoutHandler implements Runnable {

    private final long requestId;

    private final long sentTime = System.currentTimeMillis();

    ScheduledFuture future;

    TimeoutHandler(long requestId) {
      this.requestId = requestId;
    }

    public long sentTime() {
      return sentTime;
    }

    @Override
    public void run() {
      if (future.isCancelled()) {
        return;
      }
      final RequestHolder holder = clientHandlers.remove(requestId);
      if (holder != null) {
        // add it to the timeout information holder, in case we are going to get a response later
        long timeoutTime = System.currentTimeMillis();
        timeoutInfoHandlers.put(
            requestId,
            new TimeoutInfoHolder(holder.node(), holder.action(), sentTime, timeoutTime));
        holder
            .handler()
            .handleException(
                new ReceiveTimeoutTransportException(
                    holder.node(),
                    holder.action(),
                    "request_id ["
                        + requestId
                        + "] timed out after ["
                        + (timeoutTime - sentTime)
                        + "ms]"));
      }
    }
  }

  static class TimeoutInfoHolder {

    private final DiscoveryNode node;

    private final String action;

    private final long sentTime;

    private final long timeoutTime;

    TimeoutInfoHolder(DiscoveryNode node, String action, long sentTime, long timeoutTime) {
      this.node = node;
      this.action = action;
      this.sentTime = sentTime;
      this.timeoutTime = timeoutTime;
    }

    public DiscoveryNode node() {
      return node;
    }

    public String action() {
      return action;
    }

    public long sentTime() {
      return sentTime;
    }

    public long timeoutTime() {
      return timeoutTime;
    }
  }

  static class RequestHolder<T extends TransportResponse> {

    private final TransportResponseHandler<T> handler;

    private final DiscoveryNode node;

    private final String action;

    private final TimeoutHandler timeout;

    RequestHolder(
        TransportResponseHandler<T> handler,
        DiscoveryNode node,
        String action,
        TimeoutHandler timeout) {
      this.handler = handler;
      this.node = node;
      this.action = action;
      this.timeout = timeout;
    }

    public TransportResponseHandler<T> handler() {
      return handler;
    }

    public DiscoveryNode node() {
      return this.node;
    }

    public String action() {
      return this.action;
    }

    public void cancel() {
      if (timeout != null) {
        FutureUtils.cancel(timeout.future);
      }
    }
  }
}
  /**
   * inner class responsible for sending the requests to all replica shards and managing the
   * responses
   */
  final class ReplicationPhase extends AbstractRunnable {

    private final ReplicaRequest replicaRequest;
    private final Response finalResponse;
    private final ShardIterator shardIt;
    private final ActionListener<Response> listener;
    private final AtomicBoolean finished = new AtomicBoolean(false);
    private final AtomicInteger success =
        new AtomicInteger(1); // We already wrote into the primary shard
    private final ConcurrentMap<String, Throwable> shardReplicaFailures =
        ConcurrentCollections.newConcurrentMap();
    private final IndexMetaData indexMetaData;
    private final ShardRouting originalPrimaryShard;
    private final AtomicInteger pending;
    private final int totalShards;
    private final ClusterStateObserver observer;
    private final Releasable indexShardReference;
    private final TimeValue shardFailedTimeout;

    /**
     * the constructor doesn't take any action, just calculates state. Call {@link #run()} to start
     * replicating.
     */
    public ReplicationPhase(
        ShardIterator originalShardIt,
        ReplicaRequest replicaRequest,
        Response finalResponse,
        ClusterStateObserver observer,
        ShardRouting originalPrimaryShard,
        InternalRequest internalRequest,
        ActionListener<Response> listener,
        Releasable indexShardReference,
        TimeValue shardFailedTimeout) {
      this.replicaRequest = replicaRequest;
      this.listener = listener;
      this.finalResponse = finalResponse;
      this.originalPrimaryShard = originalPrimaryShard;
      this.observer = observer;
      indexMetaData = observer.observedState().metaData().index(internalRequest.concreteIndex());
      this.indexShardReference = indexShardReference;
      this.shardFailedTimeout = shardFailedTimeout;

      ShardRouting shard;
      // we double-check the state; if it changed, we need to make sure we take the latest one,
      // because a replica shard may have started its recovery process and we need to apply the
      // operation there as well...

      // we also need to check whether the new state has a new primary shard (that we indexed to
      // before) that has started and been assigned to another node (while the indexing happened).
      // In that case, we want to apply the operation on the new primary shard as well...
      ClusterState newState = clusterService.state();

      int numberOfUnassignedOrIgnoredReplicas = 0;
      int numberOfPendingShardInstances = 0;
      if (observer.observedState() != newState) {
        observer.reset(newState);
        shardIt = shards(newState, internalRequest);
        while ((shard = shardIt.nextOrNull()) != null) {
          if (shard.primary()) {
            if (originalPrimaryShard.currentNodeId().equals(shard.currentNodeId()) == false) {
              // there is a new primary, we'll have to replicate to it.
              numberOfPendingShardInstances++;
            }
            if (shard.relocating()) {
              numberOfPendingShardInstances++;
            }
          } else if (shouldExecuteReplication(indexMetaData.getSettings()) == false) {
            // If the replicas use shadow replicas, there is no reason to
            // perform the action on the replica, so skip it and
            // immediately return

            // this delays mapping updates on replicas because they have
            // to wait until they get the new mapping through the cluster
            // state, which is why we recommend pre-defined mappings for
            // indices using shadow replicas
            numberOfUnassignedOrIgnoredReplicas++;
          } else if (shard.unassigned()) {
            numberOfUnassignedOrIgnoredReplicas++;
          } else if (shard.relocating()) {
            // we need to send to two copies
            numberOfPendingShardInstances += 2;
          } else {
            numberOfPendingShardInstances++;
          }
        }
      } else {
        shardIt = originalShardIt;
        shardIt.reset();
        while ((shard = shardIt.nextOrNull()) != null) {
          if (shard.unassigned()) {
            numberOfUnassignedOrIgnoredReplicas++;
          } else if (shard.primary()) {
            if (shard.relocating()) {
              // we have to replicate to the other copy
              numberOfPendingShardInstances += 1;
            }
          } else if (shouldExecuteReplication(indexMetaData.getSettings()) == false) {
            // If the replicas use shadow replicas, there is no reason to
            // perform the action on the replica, so skip it and
            // immediately return

            // this delays mapping updates on replicas because they have
            // to wait until they get the new mapping through the cluster
            // state, which is why we recommend pre-defined mappings for
            // indices using shadow replicas
            numberOfUnassignedOrIgnoredReplicas++;
          } else if (shard.relocating()) {
            // we need to send to two copies
            numberOfPendingShardInstances += 2;
          } else {
            numberOfPendingShardInstances++;
          }
        }
      }

      // one for the primary already done
      this.totalShards = 1 + numberOfPendingShardInstances + numberOfUnassignedOrIgnoredReplicas;
      this.pending = new AtomicInteger(numberOfPendingShardInstances);
    }
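    // Worked example of the accounting above (hypothetical): with one relocating replica and one
    // started replica, numberOfPendingShardInstances = 2 + 1 = 3 and
    // numberOfUnassignedOrIgnoredReplicas = 0, so totalShards = 1 (primary) + 3 = 4 and pending
    // starts at 3.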

    /** total shard copies */
    int totalShards() {
      return totalShards;
    }

    /** total successful operations so far */
    int successful() {
      return success.get();
    }

    /** number of pending operations */
    int pending() {
      return pending.get();
    }

    @Override
    public void onFailure(Throwable t) {
      logger.error(
          "unexpected error while replicating for action [{}]. shard [{}]. ",
          t,
          actionName,
          shardIt.shardId());
      forceFinishAsFailed(t);
    }

    /** start sending current requests to replicas */
    @Override
    protected void doRun() {
      if (pending.get() == 0) {
        doFinish();
        return;
      }
      ShardRouting shard;
      shardIt.reset(); // reset the iterator
      while ((shard = shardIt.nextOrNull()) != null) {
        // if it's unassigned, nothing to do here...
        if (shard.unassigned()) {
          continue;
        }

        // we index on a replica that is initializing as well, since we might not have gotten the
        // event yet that it was started. We will get an IllegalIndexShardState exception if it is
        // not started, and that's fine, we will ignore it
        if (shard.primary()) {
          if (originalPrimaryShard.currentNodeId().equals(shard.currentNodeId()) == false) {
            // there is a new primary, we'll have to replicate to it.
            performOnReplica(shard, shard.currentNodeId());
          }
          if (shard.relocating()) {
            performOnReplica(shard, shard.relocatingNodeId());
          }
        } else if (shouldExecuteReplication(indexMetaData.getSettings())) {
          performOnReplica(shard, shard.currentNodeId());
          if (shard.relocating()) {
            performOnReplica(shard, shard.relocatingNodeId());
          }
        }
      }
    }

    /** send operation to the given node or perform it if local */
    void performOnReplica(final ShardRouting shard, final String nodeId) {
      // if we don't have that node, it means that it might have failed and will be created again;
      // in this case, we don't have to do the operation, and can just let it fail over
      if (!observer.observedState().nodes().nodeExists(nodeId)) {
        onReplicaFailure(nodeId, null);
        return;
      }

      replicaRequest.internalShardId = shardIt.shardId();

      if (!nodeId.equals(observer.observedState().nodes().localNodeId())) {
        final DiscoveryNode node = observer.observedState().nodes().get(nodeId);
        transportService.sendRequest(
            node,
            transportReplicaAction,
            replicaRequest,
            transportOptions,
            new EmptyTransportResponseHandler(ThreadPool.Names.SAME) {
              @Override
              public void handleResponse(TransportResponse.Empty vResponse) {
                onReplicaSuccess();
              }

              @Override
              public void handleException(TransportException exp) {
                logger.trace(
                    "[{}] transport failure during replica request [{}] ",
                    exp,
                    node,
                    replicaRequest);
                if (ignoreReplicaException(exp)) {
                  onReplicaFailure(nodeId, exp);
                } else {
                  logger.warn(
                      "{} failed to perform {} on node {}",
                      exp,
                      shardIt.shardId(),
                      actionName,
                      node);
                  shardStateAction.shardFailed(
                      shard,
                      indexMetaData.getIndexUUID(),
                      "failed to perform " + actionName + " on replica on node " + node,
                      exp,
                      shardFailedTimeout,
                      new ReplicationFailedShardStateListener(nodeId, exp));
                }
              }
            });
      } else {
        try {
          threadPool
              .executor(executor)
              .execute(
                  new AbstractRunnable() {
                    @Override
                    protected void doRun() {
                      try {
                        shardOperationOnReplica(shard.shardId(), replicaRequest);
                        onReplicaSuccess();
                      } catch (Throwable e) {
                        onReplicaFailure(nodeId, e);
                        failReplicaIfNeeded(shard.index(), shard.id(), e);
                      }
                    }

                    // we must never reject execution because of thread pool capacity on replicas
                    @Override
                    public boolean isForceExecution() {
                      return true;
                    }

                    @Override
                    public void onFailure(Throwable t) {
                      onReplicaFailure(nodeId, t);
                    }
                  });
        } catch (Throwable e) {
          failReplicaIfNeeded(shard.index(), shard.id(), e);
          onReplicaFailure(nodeId, e);
        }
      }
    }

    void onReplicaFailure(String nodeId, @Nullable Throwable e) {
      // only version conflicts should be ignored and kept out of the _shards header
      if (e != null && ignoreReplicaException(e) == false) {
        shardReplicaFailures.put(nodeId, e);
      }
      decPendingAndFinishIfNeeded();
    }

    void onReplicaSuccess() {
      success.incrementAndGet();
      decPendingAndFinishIfNeeded();
    }

    private void decPendingAndFinishIfNeeded() {
      if (pending.decrementAndGet() <= 0) {
        doFinish();
      }
    }

    private void forceFinishAsFailed(Throwable t) {
      if (finished.compareAndSet(false, true)) {
        Releasables.close(indexShardReference);
        listener.onFailure(t);
      }
    }

    private void doFinish() {
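      // "finished" guards against double completion (e.g. a forced failure racing with the last
      // pending replica response); only the first transition releases the shard reference and
      // notifies the listener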
      if (finished.compareAndSet(false, true)) {
        Releasables.close(indexShardReference);
        final ShardId shardId = shardIt.shardId();
        final ActionWriteResponse.ShardInfo.Failure[] failuresArray;
        if (!shardReplicaFailures.isEmpty()) {
          int slot = 0;
          failuresArray = new ActionWriteResponse.ShardInfo.Failure[shardReplicaFailures.size()];
          for (Map.Entry<String, Throwable> entry : shardReplicaFailures.entrySet()) {
            RestStatus restStatus = ExceptionsHelper.status(entry.getValue());
            failuresArray[slot++] =
                new ActionWriteResponse.ShardInfo.Failure(
                    shardId.getIndex(),
                    shardId.getId(),
                    entry.getKey(),
                    entry.getValue(),
                    restStatus,
                    false);
          }
        } else {
          failuresArray = ActionWriteResponse.EMPTY;
        }
        finalResponse.setShardInfo(
            new ActionWriteResponse.ShardInfo(totalShards, success.get(), failuresArray));

        listener.onResponse(finalResponse);
      }
    }

    public class ReplicationFailedShardStateListener implements ShardStateAction.Listener {
      private final String nodeId;
      private Throwable failure;

      public ReplicationFailedShardStateListener(String nodeId, Throwable failure) {
        this.nodeId = nodeId;
        this.failure = failure;
      }

      @Override
      public void onSuccess() {
        onReplicaFailure(nodeId, failure);
      }

      @Override
      public void onShardFailedNoMaster() {
        onReplicaFailure(nodeId, failure);
      }

      @Override
      public void onShardFailedFailure(DiscoveryNode master, TransportException e) {
        if (e instanceof ReceiveTimeoutTransportException) {
          logger.trace("timeout sending shard failure to master [{}]", e, master);
        }
        onReplicaFailure(nodeId, failure);
      }
    }
  }
  /**
   * Ensures that the mapping in the cluster state is the same as the mapping in our mapper service.
   * If the mapping is not in sync, sends a request to update it in the cluster state and blocks
   * until it has finished being updated.
   */
  private void updateMappingOnMaster() {
    // we check that the cluster state is in sync with the in-memory mapping stored by the
    // mapperService; we have to do it on the "cluster state update" thread to make sure that
    // nobody modifies it while we're checking
    final BlockingQueue<DocumentMapper> documentMappersToUpdate =
        ConcurrentCollections.newBlockingQueue();
    final CountDownLatch latch = new CountDownLatch(1);
    final AtomicReference<Throwable> mappingCheckException = new AtomicReference<>();

    // we use IMMEDIATE priority as this is a very lightweight check and we don't want to delay
    // recovery
    clusterService.submitStateUpdateTask(
        "recovery_mapping_check",
        Priority.IMMEDIATE,
        new MappingUpdateTask(
            clusterService,
            indexService,
            recoverySettings,
            latch,
            documentMappersToUpdate,
            mappingCheckException,
            this.cancellableThreads));
    cancellableThreads.execute(
        new Interruptable() {
          @Override
          public void run() throws InterruptedException {
            latch.await();
          }
        });
    if (mappingCheckException.get() != null) {
      logger.warn("error during mapping check, failing recovery", mappingCheckException.get());
      throw new ElasticsearchException("error during mapping check", mappingCheckException.get());
    }
    if (documentMappersToUpdate.isEmpty()) {
      return;
    }
    final CountDownLatch updatedOnMaster = new CountDownLatch(documentMappersToUpdate.size());
    MappingUpdatedAction.MappingUpdateListener listener =
        new MappingUpdatedAction.MappingUpdateListener() {
          @Override
          public void onMappingUpdate() {
            updatedOnMaster.countDown();
          }

          @Override
          public void onFailure(Throwable t) {
            logger.debug(
                "{} recovery to {}: failed to update mapping on master",
                request.shardId(),
                request.targetNode(),
                t);
            updatedOnMaster.countDown();
          }
        };
    for (DocumentMapper documentMapper : documentMappersToUpdate) {
      mappingUpdatedAction.updateMappingOnMaster(
          indexService.index().getName(), documentMapper, indexService.indexUUID(), listener);
    }
    cancellableThreads.execute(
        new Interruptable() {
          @Override
          public void run() throws InterruptedException {
            try {
              if (!updatedOnMaster.await(
                  recoverySettings.internalActionTimeout().millis(), TimeUnit.MILLISECONDS)) {
                logger.debug(
                    "[{}][{}] recovery [phase2] to {}: waiting on pending mapping update timed out. waited [{}]",
                    indexName,
                    shardId,
                    request.targetNode(),
                    recoverySettings.internalActionTimeout());
              }
            } catch (InterruptedException e) {
              Thread.currentThread().interrupt();
              logger.debug("interrupted while waiting for mapping to update on master");
            }
          }
        });
  }
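
A minimal sketch of the latch-and-listener pattern used above, in isolation (the listener interface and task submission are hypothetical stand-ins for MappingUpdatedAction):

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

class LatchAndListenerSketch {
  interface UpdateListener {
    void onDone(); // invoked on success and on failure alike, so the latch always drains
  }

  static boolean waitForUpdates(int updates, long timeoutMillis) throws InterruptedException {
    CountDownLatch latch = new CountDownLatch(updates);
    UpdateListener listener = latch::countDown;
    // ... submit `updates` asynchronous tasks, each eventually calling listener.onDone() ...
    // like updateMappingOnMaster() above, a timeout here is logged and tolerated, not fatal
    return latch.await(timeoutMillis, TimeUnit.MILLISECONDS);
  }
}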
Example #19
/**
 * The tribe service holds a list of node clients connected to a list of tribe members, and uses
 * their cluster state events to update this local node cluster state with the merged view of it.
 *
 * <p>The {@link #processSettings(org.elasticsearch.common.settings.Settings)} method should be
 * called before starting the node, so it will make sure to configure this current node properly
 * with the relevant tribe node settings.
 *
 * <p>The tribe node settings make sure the discovery type used is "local", but with no master
 * elected. This means no write-level master node operations will work ({@link
 * org.elasticsearch.discovery.MasterNotDiscoveredException} will be thrown), and state-level
 * metadata operations will automatically use the local flag.
 *
 * <p>The state merged from the different clusters includes the list of nodes, metadata, and
 * routing table. Each merged node will record in its attributes which tribe member it came from.
 * Each merged index will record in its settings which tribe member it came from. In case an index
 * has already been merged from one cluster, and an index with the same name is discovered in
 * another cluster, the conflicting one will be discarded. This happens because we need to have the
 * correct index name to propagate to the relevant cluster.
 */
public class TribeService extends AbstractLifecycleComponent<TribeService> {

  public static final ClusterBlock TRIBE_METADATA_BLOCK =
      new ClusterBlock(
          10,
          "tribe node, metadata not allowed",
          false,
          false,
          RestStatus.BAD_REQUEST,
          EnumSet.of(ClusterBlockLevel.METADATA_READ, ClusterBlockLevel.METADATA_WRITE));
  public static final ClusterBlock TRIBE_WRITE_BLOCK =
      new ClusterBlock(
          11,
          "tribe node, write not allowed",
          false,
          false,
          RestStatus.BAD_REQUEST,
          EnumSet.of(ClusterBlockLevel.WRITE));

  public static Settings processSettings(Settings settings) {
    if (settings.get(TRIBE_NAME) != null) {
      // if it's a node client started by this service as a tribe, remove any tribe group setting
      // to avoid recursive configuration
      Settings.Builder sb = Settings.builder().put(settings);
      for (String s : settings.getAsMap().keySet()) {
        if (s.startsWith("tribe.") && !s.equals(TRIBE_NAME)) {
          sb.remove(s);
        }
      }
      return sb.build();
    }
    Map<String, Settings> nodesSettings = settings.getGroups("tribe", true);
    if (nodesSettings.isEmpty()) {
      return settings;
    }
    // it's a tribe-configured node..., force settings
    Settings.Builder sb = Settings.builder().put(settings);
    sb.put(Node.NODE_CLIENT_SETTING.getKey(), true); // this node should just act as a node client
    sb.put(
        DiscoveryModule.DISCOVERY_TYPE_SETTING.getKey(),
        "local"); // a tribe node should not use zen discovery
    sb.put(
        DiscoveryService.INITIAL_STATE_TIMEOUT_SETTING.getKey(),
        0); // nothing is going to be discovered, since no master will be elected
    if (sb.get("cluster.name") == null) {
      sb.put(
          "cluster.name",
          "tribe_"
              + Strings
                  .randomBase64UUID()); // make sure it won't join other tribe nodes in the same JVM
    }
    sb.put(TransportMasterNodeReadAction.FORCE_LOCAL_SETTING, true);
    return sb.build();
  }

  public static final String TRIBE_NAME = "tribe.name";

  private final ClusterService clusterService;
  private final String[] blockIndicesWrite;
  private final String[] blockIndicesRead;
  private final String[] blockIndicesMetadata;

  private static final String ON_CONFLICT_ANY = "any",
      ON_CONFLICT_DROP = "drop",
      ON_CONFLICT_PREFER = "prefer_";
  private final String onConflict;
  private final Set<String> droppedIndices = ConcurrentCollections.newConcurrentSet();

  private final List<Node> nodes = new CopyOnWriteArrayList<>();

  @Inject
  public TribeService(
      Settings settings, ClusterService clusterService, DiscoveryService discoveryService) {
    super(settings);
    this.clusterService = clusterService;
    Map<String, Settings> nodesSettings = new HashMap<>(settings.getGroups("tribe", true));
    nodesSettings.remove("blocks"); // remove prefix settings that don't indicate a client
    nodesSettings.remove("on_conflict"); // remove prefix settings that don't indicate a client
    for (Map.Entry<String, Settings> entry : nodesSettings.entrySet()) {
      Settings.Builder sb = Settings.builder().put(entry.getValue());
      sb.put("name", settings.get("name") + "/" + entry.getKey());
      sb.put(
          Environment.PATH_HOME_SETTING.getKey(),
          Environment.PATH_HOME_SETTING.get(settings)); // pass through ES home dir
      sb.put(TRIBE_NAME, entry.getKey());
      if (sb.get("http.enabled") == null) {
        sb.put("http.enabled", false);
      }
      sb.put(Node.NODE_CLIENT_SETTING.getKey(), true);
      nodes.add(new TribeClientNode(sb.build()));
    }

    String[] blockIndicesWrite = Strings.EMPTY_ARRAY;
    String[] blockIndicesRead = Strings.EMPTY_ARRAY;
    String[] blockIndicesMetadata = Strings.EMPTY_ARRAY;
    if (!nodes.isEmpty()) {
      // remove the initial election / recovery blocks since we are not going to have a
      // master elected in this single tribe node local "cluster"
      clusterService.removeInitialStateBlock(discoveryService.getNoMasterBlock());
      clusterService.removeInitialStateBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK);
      if (settings.getAsBoolean("tribe.blocks.write", false)) {
        clusterService.addInitialStateBlock(TRIBE_WRITE_BLOCK);
      }
      blockIndicesWrite = settings.getAsArray("tribe.blocks.write.indices", Strings.EMPTY_ARRAY);
      if (settings.getAsBoolean("tribe.blocks.metadata", false)) {
        clusterService.addInitialStateBlock(TRIBE_METADATA_BLOCK);
      }
      blockIndicesMetadata =
          settings.getAsArray("tribe.blocks.metadata.indices", Strings.EMPTY_ARRAY);
      blockIndicesRead = settings.getAsArray("tribe.blocks.read.indices", Strings.EMPTY_ARRAY);
      for (Node node : nodes) {
        node.injector().getInstance(ClusterService.class).add(new TribeClusterStateListener(node));
      }
    }
    this.blockIndicesMetadata = blockIndicesMetadata;
    this.blockIndicesRead = blockIndicesRead;
    this.blockIndicesWrite = blockIndicesWrite;

    this.onConflict = settings.get("tribe.on_conflict", ON_CONFLICT_ANY);
  }

  @Override
  protected void doStart() {
    for (Node node : nodes) {
      try {
        node.start();
      } catch (Throwable e) {
        // calling close is safe for non started nodes, we can just iterate over all
        for (Node otherNode : nodes) {
          try {
            otherNode.close();
          } catch (Throwable t) {
            logger.warn("failed to close node {} on failed start", otherNode, t);
          }
        }
        if (e instanceof RuntimeException) {
          throw (RuntimeException) e;
        }
        throw new ElasticsearchException(e);
      }
    }
  }

  @Override
  protected void doStop() {
    doClose();
  }

  @Override
  protected void doClose() {
    for (Node node : nodes) {
      try {
        node.close();
      } catch (Throwable t) {
        logger.warn("failed to close node {}", t, node);
      }
    }
  }

  class TribeClusterStateListener implements ClusterStateListener {
    private final String tribeName;
    private final TribeNodeClusterStateTaskExecutor executor;

    TribeClusterStateListener(Node tribeNode) {
      String tribeName = tribeNode.settings().get(TRIBE_NAME);
      this.tribeName = tribeName;
      executor = new TribeNodeClusterStateTaskExecutor(tribeName);
    }

    @Override
    public void clusterChanged(final ClusterChangedEvent event) {
      logger.debug("[{}] received cluster event, [{}]", tribeName, event.source());
      clusterService.submitStateUpdateTask(
          "cluster event from " + tribeName + ", " + event.source(),
          event,
          ClusterStateTaskConfig.build(Priority.NORMAL),
          executor,
          (source, t) -> logger.warn("failed to process [{}]", t, source));
    }
  }

  class TribeNodeClusterStateTaskExecutor implements ClusterStateTaskExecutor<ClusterChangedEvent> {
    private final String tribeName;

    TribeNodeClusterStateTaskExecutor(String tribeName) {
      this.tribeName = tribeName;
    }

    @Override
    public boolean runOnlyOnMaster() {
      return false;
    }

    @Override
    public BatchResult<ClusterChangedEvent> execute(
        ClusterState currentState, List<ClusterChangedEvent> tasks) throws Exception {
      ClusterState accumulator = ClusterState.builder(currentState).build();
      BatchResult.Builder<ClusterChangedEvent> builder = BatchResult.builder();

      try {
        // we only need to apply the latest cluster state update
        accumulator = applyUpdate(accumulator, tasks.get(tasks.size() - 1));
        builder.successes(tasks);
      } catch (Throwable t) {
        builder.failures(tasks, t);
      }

      return builder.build(accumulator);
    }

    private ClusterState applyUpdate(ClusterState currentState, ClusterChangedEvent task) {
      boolean clusterStateChanged = false;
      ClusterState tribeState = task.state();
      DiscoveryNodes.Builder nodes = DiscoveryNodes.builder(currentState.nodes());
      // -- merge nodes
      // go over existing nodes, and see if they need to be removed
      for (DiscoveryNode discoNode : currentState.nodes()) {
        String markedTribeName = discoNode.attributes().get(TRIBE_NAME);
        if (markedTribeName != null && markedTribeName.equals(tribeName)) {
          if (tribeState.nodes().get(discoNode.id()) == null) {
            clusterStateChanged = true;
            logger.info("[{}] removing node [{}]", tribeName, discoNode);
            nodes.remove(discoNode.id());
          }
        }
      }
      // go over tribe nodes, and see if they need to be added
      for (DiscoveryNode tribe : tribeState.nodes()) {
        if (currentState.nodes().get(tribe.id()) == null) {
          // a new node, add it, but also add the tribe name to the attributes
          Map<String, String> tribeAttr = new HashMap<>();
          for (ObjectObjectCursor<String, String> attr : tribe.attributes()) {
            tribeAttr.put(attr.key, attr.value);
          }
          tribeAttr.put(TRIBE_NAME, tribeName);
          DiscoveryNode discoNode =
              new DiscoveryNode(
                  tribe.name(),
                  tribe.id(),
                  tribe.getHostName(),
                  tribe.getHostAddress(),
                  tribe.address(),
                  unmodifiableMap(tribeAttr),
                  tribe.version());
          clusterStateChanged = true;
          logger.info("[{}] adding node [{}]", tribeName, discoNode);
          nodes.put(discoNode);
        }
      }

      // -- merge metadata
      ClusterBlocks.Builder blocks = ClusterBlocks.builder().blocks(currentState.blocks());
      MetaData.Builder metaData = MetaData.builder(currentState.metaData());
      RoutingTable.Builder routingTable = RoutingTable.builder(currentState.routingTable());
      // go over existing indices, and see if they need to be removed
      for (IndexMetaData index : currentState.metaData()) {
        String markedTribeName = index.getSettings().get(TRIBE_NAME);
        if (markedTribeName != null && markedTribeName.equals(tribeName)) {
          IndexMetaData tribeIndex = tribeState.metaData().index(index.getIndex());
          clusterStateChanged = true;
          if (tribeIndex == null || tribeIndex.getState() == IndexMetaData.State.CLOSE) {
            logger.info("[{}] removing index [{}]", tribeName, index.getIndex());
            removeIndex(blocks, metaData, routingTable, index);
          } else {
            // always make sure to update the metadata and routing table, in case
            // there are changes in them (new mapping, shards moving from initializing to started)
            routingTable.add(tribeState.routingTable().index(index.getIndex()));
            Settings tribeSettings =
                Settings.builder().put(tribeIndex.getSettings()).put(TRIBE_NAME, tribeName).build();
            metaData.put(IndexMetaData.builder(tribeIndex).settings(tribeSettings));
          }
        }
      }
      // go over the tribe indices, and see if they need to be added
      for (IndexMetaData tribeIndex : tribeState.metaData()) {
        // if there is no routing table yet, do nothing with it...
        IndexRoutingTable table = tribeState.routingTable().index(tribeIndex.getIndex());
        if (table == null) {
          continue;
        }
        final IndexMetaData indexMetaData = currentState.metaData().index(tribeIndex.getIndex());
        if (indexMetaData == null) {
          if (!droppedIndices.contains(tribeIndex.getIndex())) {
            // a new index, add it, and add the tribe name as a setting
            clusterStateChanged = true;
            logger.info("[{}] adding index [{}]", tribeName, tribeIndex.getIndex());
            addNewIndex(tribeState, blocks, metaData, routingTable, tribeIndex);
          }
        } else {
          String existingFromTribe = indexMetaData.getSettings().get(TRIBE_NAME);
          if (!tribeName.equals(existingFromTribe)) {
            // we have a potential conflict on index names, decide what to do...
            if (ON_CONFLICT_ANY.equals(onConflict)) {
              // we chose any tribe, carry on
            } else if (ON_CONFLICT_DROP.equals(onConflict)) {
              // drop the indices, there is a conflict
              clusterStateChanged = true;
              logger.info(
                  "[{}] dropping index [{}] due to conflict with [{}]",
                  tribeName,
                  tribeIndex.getIndex(),
                  existingFromTribe);
              removeIndex(blocks, metaData, routingTable, tribeIndex);
              droppedIndices.add(tribeIndex.getIndex());
            } else if (onConflict.startsWith(ON_CONFLICT_PREFER)) {
              // on conflict, prefer a tribe...
              String preferredTribeName = onConflict.substring(ON_CONFLICT_PREFER.length());
              if (tribeName.equals(preferredTribeName)) {
                // the new one is the preferred one, replace...
                clusterStateChanged = true;
                logger.info(
                    "[{}] adding index [{}], preferred over [{}]",
                    tribeName,
                    tribeIndex.getIndex(),
                    existingFromTribe);
                removeIndex(blocks, metaData, routingTable, tribeIndex);
                addNewIndex(tribeState, blocks, metaData, routingTable, tribeIndex);
              } // else: either the existing one is the preferred one, or we haven't seen one, carry
                // on
            }
          }
        }
      }

      if (!clusterStateChanged) {
        return currentState;
      } else {
        return ClusterState.builder(currentState)
            .incrementVersion()
            .blocks(blocks)
            .nodes(nodes)
            .metaData(metaData)
            .routingTable(routingTable.build())
            .build();
      }
    }

    private void removeIndex(
        ClusterBlocks.Builder blocks,
        MetaData.Builder metaData,
        RoutingTable.Builder routingTable,
        IndexMetaData index) {
      metaData.remove(index.getIndex());
      routingTable.remove(index.getIndex());
      blocks.removeIndexBlocks(index.getIndex());
    }

    private void addNewIndex(
        ClusterState tribeState,
        ClusterBlocks.Builder blocks,
        MetaData.Builder metaData,
        RoutingTable.Builder routingTable,
        IndexMetaData tribeIndex) {
      Settings tribeSettings =
          Settings.builder().put(tribeIndex.getSettings()).put(TRIBE_NAME, tribeName).build();
      metaData.put(IndexMetaData.builder(tribeIndex).settings(tribeSettings));
      routingTable.add(tribeState.routingTable().index(tribeIndex.getIndex()));
      if (Regex.simpleMatch(blockIndicesMetadata, tribeIndex.getIndex())) {
        blocks.addIndexBlock(tribeIndex.getIndex(), IndexMetaData.INDEX_METADATA_BLOCK);
      }
      if (Regex.simpleMatch(blockIndicesRead, tribeIndex.getIndex())) {
        blocks.addIndexBlock(tribeIndex.getIndex(), IndexMetaData.INDEX_READ_BLOCK);
      }
      if (Regex.simpleMatch(blockIndicesWrite, tribeIndex.getIndex())) {
        blocks.addIndexBlock(tribeIndex.getIndex(), IndexMetaData.INDEX_WRITE_BLOCK);
      }
    }
  }
}
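
For illustration, a minimal sketch of what processSettings does to a hypothetical two-tribe configuration (the tribe names t1/t2 and the cluster names are made up):

Settings input =
    Settings.builder()
        .put("tribe.t1.cluster.name", "cluster_one")
        .put("tribe.t2.cluster.name", "cluster_two")
        .build();
Settings output = TribeService.processSettings(input);
// output keeps the tribe.* groups but additionally forces node.client=true, a "local"
// discovery type, a zero initial-state timeout, and (if unset) a random "tribe_<uuid>"
// cluster.name so the tribe node won't join other tribe nodes in the same JVM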
Example #20
public class GatewayAllocator extends AbstractComponent {

  public static final String INDEX_RECOVERY_INITIAL_SHARDS = "index.recovery.initial_shards";

  private final TransportNodesListGatewayStartedShards listGatewayStartedShards;

  private final TransportNodesListShardStoreMetaData listShardStoreMetaData;

  private final ConcurrentMap<
          ShardId, Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>>
      cachedStores = ConcurrentCollections.newConcurrentMap();

  private final ConcurrentMap<ShardId, ObjectLongOpenHashMap<DiscoveryNode>> cachedShardsState =
      ConcurrentCollections.newConcurrentMap();

  private final TimeValue listTimeout;

  private final String initialShards;

  @Inject
  public GatewayAllocator(
      Settings settings,
      TransportNodesListGatewayStartedShards listGatewayStartedShards,
      TransportNodesListShardStoreMetaData listShardStoreMetaData) {
    super(settings);
    this.listGatewayStartedShards = listGatewayStartedShards;
    this.listShardStoreMetaData = listShardStoreMetaData;

    this.listTimeout =
        componentSettings.getAsTime(
            "list_timeout",
            settings.getAsTime("gateway.local.list_timeout", TimeValue.timeValueSeconds(30)));
    this.initialShards =
        componentSettings.get(
            "initial_shards", settings.get("gateway.local.initial_shards", "quorum"));

    logger.debug("using initial_shards [{}], list_timeout [{}]", initialShards, listTimeout);
  }

  public void applyStartedShards(StartedRerouteAllocation allocation) {
    for (ShardRouting shardRouting : allocation.startedShards()) {
      cachedStores.remove(shardRouting.shardId());
      cachedShardsState.remove(shardRouting.shardId());
    }
  }

  public void applyFailedShards(FailedRerouteAllocation allocation) {
    for (ShardRouting failedShard : allocation.failedShards()) {
      cachedStores.remove(failedShard.shardId());
      cachedShardsState.remove(failedShard.shardId());
    }
  }

  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    DiscoveryNodes nodes = allocation.nodes();
    RoutingNodes routingNodes = allocation.routingNodes();

    // First, handle primaries; they must find a place to be allocated here
    Iterator<MutableShardRouting> unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      if (!shard.primary()) {
        continue;
      }

      // this is an API allocation, ignore since we know there is no data...
      if (!routingNodes
          .routingTable()
          .index(shard.index())
          .shard(shard.id())
          .primaryAllocatedPostApi()) {
        continue;
      }

      ObjectLongOpenHashMap<DiscoveryNode> nodesState = buildShardStates(nodes, shard);

      int numberOfAllocationsFound = 0;
      long highestVersion = -1;
      Set<DiscoveryNode> nodesWithHighestVersion = Sets.newHashSet();
      final boolean[] states = nodesState.allocated;
      final Object[] keys = nodesState.keys;
      final long[] values = nodesState.values;
      for (int i = 0; i < states.length; i++) {
        if (!states[i]) {
          continue;
        }

        DiscoveryNode node = (DiscoveryNode) keys[i];
        long version = values[i];
        // since we don't check in NO allocation, we need to double check here
        if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
          continue;
        }
        if (version != -1) {
          numberOfAllocationsFound++;
          if (highestVersion == -1) {
            nodesWithHighestVersion.add(node);
            highestVersion = version;
          } else {
            if (version > highestVersion) {
              nodesWithHighestVersion.clear();
              nodesWithHighestVersion.add(node);
              highestVersion = version;
            } else if (version == highestVersion) {
              nodesWithHighestVersion.add(node);
            }
          }
        }
      }

      // check if the count meets the minimum required
      int requiredAllocation = 1;
      // if we restore from a repository, one copy is more than enough
      if (shard.restoreSource() == null) {
        try {
          IndexMetaData indexMetaData = routingNodes.metaData().index(shard.index());
          String initialShards =
              indexMetaData
                  .settings()
                  .get(
                      INDEX_RECOVERY_INITIAL_SHARDS,
                      settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
          if ("quorum".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
            }
          } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 2) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
            }
          } else if ("one".equals(initialShards)) {
            requiredAllocation = 1;
          } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
            requiredAllocation = indexMetaData.numberOfReplicas() + 1;
          } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = indexMetaData.numberOfReplicas();
            }
          } else {
            requiredAllocation = Integer.parseInt(initialShards);
          }
        } catch (Exception e) {
          logger.warn(
              "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}",
              shard.index(),
              shard.id(),
              initialShards,
              shard);
        }
      }
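      // worked examples for the initial_shards arithmetic above (hypothetical replica counts):
      //   number_of_replicas = 2, "quorum"   -> ((1 + 2) / 2) + 1 = 2
      //   number_of_replicas = 3, "quorum-1" -> (1 + 3) / 2 = 2
      //   number_of_replicas = 2, "full"     -> 2 + 1 = 3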

      // not enough found for this shard, continue...
      if (numberOfAllocationsFound < requiredAllocation) {
        // if we are restoring this shard we still can allocate
        if (shard.restoreSource() == null) {
          // we can't really allocate, so ignore it and continue
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
                shard.index(),
                shard.id(),
                numberOfAllocationsFound,
                requiredAllocation);
          }
        } else if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: missing local data, will restore from [{}]",
              shard.index(),
              shard.id(),
              shard.restoreSource());
        }
        continue;
      }

      Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
      Set<DiscoveryNode> noNodes = Sets.newHashSet();
      for (DiscoveryNode discoNode : nodesWithHighestVersion) {
        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          throttledNodes.add(discoNode);
        } else if (decision.type() == Decision.Type.NO) {
          noNodes.add(discoNode);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();

          // found a node, so no throttling, no "no", and break out of the loop
          throttledNodes.clear();
          noNodes.clear();
          break;
        }
      }
      if (throttledNodes.isEmpty()) {
        // if we have a node that we "can't" allocate to, force allocation, since this is our master
        // data!
        if (!noNodes.isEmpty()) {
          DiscoveryNode discoNode = noNodes.iterator().next();
          RoutingNode node = routingNodes.node(discoNode.id());
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();
        }
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              throttledNodes);
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      }
    }

    if (!routingNodes.hasUnassigned()) {
      return changed;
    }

    // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was
    // allocated on
    unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        continue;
      }

      Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores =
          buildShardStores(nodes, shard);

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>
          nodeStoreEntry : shardStores.entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          MutableShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                  shardStores.get(primaryNode);
              if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                long sizeMatched = 0;

                for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                  if (primaryNodeStore.fileExists(storeFileMetaData.name())
                      && primaryNodeStore
                          .file(storeFileMetaData.name())
                          .isSame(storeFileMetaData)) {
                    sizeMatched += storeFileMetaData.length();
                  }
                }
                logger.trace(
                    "{}: node [{}] has [{}/{}] bytes of re-usable data",
                    shard,
                    discoNode.name(),
                    new ByteSizeValue(sizeMatched),
                    sizeMatched);
                if (sizeMatched > lastSizeMatched) {
                  lastSizeMatched = sizeMatched;
                  lastDiscoNodeMatched = discoNode;
                  lastNodeMatched = node;
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we already checked on NO before
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          allocation.routingNodes().assign(shard, lastNodeMatched.nodeId());
          unassignedIterator.remove();
        }
      }
    }
    return changed;
  }

  private ObjectLongOpenHashMap<DiscoveryNode> buildShardStates(
      final DiscoveryNodes nodes, MutableShardRouting shard) {
    ObjectLongOpenHashMap<DiscoveryNode> shardStates = cachedShardsState.get(shard.shardId());
    ObjectOpenHashSet<String> nodeIds;
    if (shardStates == null) {
      shardStates = new ObjectLongOpenHashMap<>();
      cachedShardsState.put(shard.shardId(), shardStates);
      nodeIds = ObjectOpenHashSet.from(nodes.dataNodes().keys());
    } else {
      // clean nodes that have failed
      shardStates
          .keys()
          .removeAll(
              new ObjectPredicate<DiscoveryNode>() {
                @Override
                public boolean apply(DiscoveryNode node) {
                  return !nodes.nodeExists(node.id());
                }
              });
      nodeIds = ObjectOpenHashSet.newInstance();
      // we have a cached state from before; see if the nodes changed, and if they have, go fetch
      // again
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        DiscoveryNode node = cursor.value;
        if (!shardStates.containsKey(node)) {
          nodeIds.add(node.id());
        }
      }
    }
    if (nodeIds.isEmpty()) {
      return shardStates;
    }

    String[] nodesIdsArray = nodeIds.toArray(String.class);
    TransportNodesListGatewayStartedShards.NodesGatewayStartedShards response =
        listGatewayStartedShards.list(shard.shardId(), nodesIdsArray, listTimeout).actionGet();
    if (logger.isDebugEnabled()) {
      if (response.failures().length > 0) {
        StringBuilder sb =
            new StringBuilder(shard + ": failures when trying to list shards on nodes:");
        for (int i = 0; i < response.failures().length; i++) {
          Throwable cause = ExceptionsHelper.unwrapCause(response.failures()[i]);
          if (cause instanceof ConnectTransportException) {
            continue;
          }
          sb.append("\n    -> ").append(response.failures()[i].getDetailedMessage());
        }
        logger.debug(sb.toString());
      }
    }

    for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState :
        response) {
      // a -1 version means the shard does not exist, which is what the API returns and what we
      // expect
      logger.trace(
          "[{}] on node [{}] has version [{}] of shard",
          shard,
          nodeShardState.getNode(),
          nodeShardState.version());
      shardStates.put(nodeShardState.getNode(), nodeShardState.version());
    }
    return shardStates;
  }

  private Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>
      buildShardStores(DiscoveryNodes nodes, MutableShardRouting shard) {
    Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores =
        cachedStores.get(shard.shardId());
    ObjectOpenHashSet<String> nodesIds;
    if (shardStores == null) {
      shardStores = Maps.newHashMap();
      cachedStores.put(shard.shardId(), shardStores);
      nodesIds = ObjectOpenHashSet.from(nodes.dataNodes().keys());
    } else {
      nodesIds = ObjectOpenHashSet.newInstance();
      // clean nodes that have failed
      for (Iterator<DiscoveryNode> it = shardStores.keySet().iterator(); it.hasNext(); ) {
        DiscoveryNode node = it.next();
        if (!nodes.nodeExists(node.id())) {
          it.remove();
        }
      }

      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        DiscoveryNode node = cursor.value;
        if (!shardStores.containsKey(node)) {
          nodesIds.add(node.id());
        }
      }
    }

    if (!nodesIds.isEmpty()) {
      String[] nodesIdsArray = nodesIds.toArray(String.class);
      TransportNodesListShardStoreMetaData.NodesStoreFilesMetaData nodesStoreFilesMetaData =
          listShardStoreMetaData
              .list(shard.shardId(), false, nodesIdsArray, listTimeout)
              .actionGet();
      if (logger.isTraceEnabled()) {
        if (nodesStoreFilesMetaData.failures().length > 0) {
          StringBuilder sb =
              new StringBuilder(shard + ": failures when trying to list stores on nodes:");
          for (int i = 0; i < nodesStoreFilesMetaData.failures().length; i++) {
            Throwable cause = ExceptionsHelper.unwrapCause(nodesStoreFilesMetaData.failures()[i]);
            if (cause instanceof ConnectTransportException) {
              continue;
            }
            sb.append("\n    -> ")
                .append(nodesStoreFilesMetaData.failures()[i].getDetailedMessage());
          }
          logger.trace(sb.toString());
        }
      }

      for (TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData nodeStoreFilesMetaData :
          nodesStoreFilesMetaData) {
        if (nodeStoreFilesMetaData.storeFilesMetaData() != null) {
          shardStores.put(
              nodeStoreFilesMetaData.getNode(), nodeStoreFilesMetaData.storeFilesMetaData());
        }
      }
    }

    return shardStores;
  }
}
Example #21
public class GatewayAllocator extends AbstractComponent {

  private RoutingService routingService;

  private final PrimaryShardAllocator primaryShardAllocator;
  private final ReplicaShardAllocator replicaShardAllocator;

  private final ConcurrentMap<
          ShardId, AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>>
      asyncFetchStarted = ConcurrentCollections.newConcurrentMap();
  private final ConcurrentMap<
          ShardId, AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>>
      asyncFetchStore = ConcurrentCollections.newConcurrentMap();

  @Inject
  public GatewayAllocator(
      Settings settings,
      final TransportNodesListGatewayStartedShards startedAction,
      final TransportNodesListShardStoreMetaData storeAction) {
    super(settings);
    this.primaryShardAllocator = new InternalPrimaryShardAllocator(settings, startedAction);
    this.replicaShardAllocator = new InternalReplicaShardAllocator(settings, storeAction);
  }

  public void setReallocation(
      final ClusterService clusterService, final RoutingService routingService) {
    this.routingService = routingService;
    clusterService.add(
        new ClusterStateListener() {
          @Override
          public void clusterChanged(ClusterChangedEvent event) {
            boolean cleanCache = false;
            DiscoveryNode localNode = event.state().nodes().localNode();
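            // clear the async-fetch caches when this master-eligible node is not the acting
            // master (or there is no local node yet); only the active master needs them, and a
            // newly elected master will fetch fresh shard state itself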
            if (localNode != null) {
              if (localNode.masterNode() == true && event.localNodeMaster() == false) {
                cleanCache = true;
              }
            } else {
              cleanCache = true;
            }
            if (cleanCache) {
              Releasables.close(asyncFetchStarted.values());
              asyncFetchStarted.clear();
              Releasables.close(asyncFetchStore.values());
              asyncFetchStore.clear();
            }
          }
        });
  }

  public int getNumberOfInFlightFetch() {
    int count = 0;
    for (AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch :
        asyncFetchStarted.values()) {
      count += fetch.getNumberOfInFlightFetches();
    }
    for (AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch :
        asyncFetchStore.values()) {
      count += fetch.getNumberOfInFlightFetches();
    }
    return count;
  }

  public void applyStartedShards(StartedRerouteAllocation allocation) {
    for (ShardRouting shard : allocation.startedShards()) {
      Releasables.close(asyncFetchStarted.remove(shard.shardId()));
      Releasables.close(asyncFetchStore.remove(shard.shardId()));
    }
  }

  public void applyFailedShards(FailedRerouteAllocation allocation) {
    for (FailedRerouteAllocation.FailedShard shard : allocation.failedShards()) {
      Releasables.close(asyncFetchStarted.remove(shard.shard.shardId()));
      Releasables.close(asyncFetchStore.remove(shard.shard.shardId()));
    }
  }

  public boolean allocateUnassigned(final RoutingAllocation allocation) {
    boolean changed = false;

    RoutingNodes.UnassignedShards unassigned = allocation.routingNodes().unassigned();
    unassigned.sort(
        PriorityComparator.getAllocationComparator(allocation)); // sort for priority ordering

    changed |= primaryShardAllocator.allocateUnassigned(allocation);
    changed |= replicaShardAllocator.processExistingRecoveries(allocation);
    changed |= replicaShardAllocator.allocateUnassigned(allocation);
    return changed;
  }

  class InternalAsyncFetch<T extends BaseNodeResponse> extends AsyncShardFetch<T> {

    public InternalAsyncFetch(
        ESLogger logger,
        String type,
        ShardId shardId,
        Lister<? extends BaseNodesResponse<T>, T> action) {
      super(logger, type, shardId, action);
    }

    @Override
    protected void reroute(ShardId shardId, String reason) {
      logger.trace("{} scheduling reroute for {}", shardId, reason);
      routingService.reroute("async_shard_fetch");
    }
  }

  class InternalPrimaryShardAllocator extends PrimaryShardAllocator {

    private final TransportNodesListGatewayStartedShards startedAction;

    public InternalPrimaryShardAllocator(
        Settings settings, TransportNodesListGatewayStartedShards startedAction) {
      super(settings);
      this.startedAction = startedAction;
    }

    @Override
    protected AsyncShardFetch.FetchResult<
            TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>
        fetchData(ShardRouting shard, RoutingAllocation allocation) {
      AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch =
          asyncFetchStarted.get(shard.shardId());
      if (fetch == null) {
        fetch = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction);
        asyncFetchStarted.put(shard.shardId(), fetch);
      }
      AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>
          shardState =
              fetch.fetchData(
                  allocation.nodes(),
                  allocation.metaData(),
                  allocation.getIgnoreNodes(shard.shardId()));

      if (shardState.hasData() == true) {
        shardState.processAllocation(allocation);
      }
      return shardState;
    }
  }

  class InternalReplicaShardAllocator extends ReplicaShardAllocator {

    private final TransportNodesListShardStoreMetaData storeAction;

    public InternalReplicaShardAllocator(
        Settings settings, TransportNodesListShardStoreMetaData storeAction) {
      super(settings);
      this.storeAction = storeAction;
    }

    @Override
    protected AsyncShardFetch.FetchResult<
            TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
        fetchData(ShardRouting shard, RoutingAllocation allocation) {
      AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch =
          asyncFetchStore.get(shard.shardId());
      if (fetch == null) {
        fetch = new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction);
        asyncFetchStore.put(shard.shardId(), fetch);
      }
      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          shardStores =
              fetch.fetchData(
                  allocation.nodes(),
                  allocation.metaData(),
                  allocation.getIgnoreNodes(shard.shardId()));
      if (shardStores.hasData() == true) {
        shardStores.processAllocation(allocation);
      }
      return shardStores;
    }
  }
}
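GatewayAllocator above keeps at most one AsyncShardFetch per ShardId (in asyncFetchStarted and asyncFetchStore) and releases it once the shard is started or failed. Below is a minimal sketch of that lazy per-shard cache; the key and fetch types are generic stand-ins, not the real Elasticsearch classes.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;

// Sketch only: one cached fetch object per shard, created lazily and
// removed once the shard no longer needs fetching.
class PerShardFetchCache<K, F> {
  private final Map<K, F> fetches = new ConcurrentHashMap<>();
  private final Function<K, F> newFetch;

  PerShardFetchCache(Function<K, F> newFetch) {
    this.newFetch = newFetch;
  }

  // mirrors the get()/put() pair in fetchData(); computeIfAbsent makes the
  // create-if-missing step atomic
  F getOrCreate(K shardId) {
    return fetches.computeIfAbsent(shardId, newFetch);
  }

  // mirrors applyStartedShards()/applyFailedShards(): the caller closes the
  // returned fetch, as the original does via Releasables.close(...)
  F release(K shardId) {
    return fetches.remove(shardId);
  }
}

The original pairs get() and put() non-atomically, which appears safe only because allocation decisions are made on a single cluster-state update thread; computeIfAbsent keeps the same pattern correct even with concurrent callers.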
Example #22
public class ZenDiscovery extends AbstractLifecycleComponent<Discovery>
    implements Discovery, DiscoveryNodesProvider {

  private final ThreadPool threadPool;
  private final TransportService transportService;
  private final ClusterService clusterService;
  private AllocationService allocationService;
  private final ClusterName clusterName;
  private final DiscoveryNodeService discoveryNodeService;
  private final ZenPingService pingService;
  private final MasterFaultDetection masterFD;
  private final NodesFaultDetection nodesFD;
  private final PublishClusterStateAction publishClusterState;
  private final MembershipAction membership;
  private final Version version;

  private final TimeValue pingTimeout;

  // a flag that should be used only for testing
  private final boolean sendLeaveRequest;

  private final ElectMasterService electMaster;

  private final boolean masterElectionFilterClientNodes;
  private final boolean masterElectionFilterDataNodes;

  private DiscoveryNode localNode;

  private final CopyOnWriteArrayList<InitialStateDiscoveryListener> initialStateListeners =
      new CopyOnWriteArrayList<InitialStateDiscoveryListener>();

  private volatile boolean master = false;

  private volatile DiscoveryNodes latestDiscoNodes;

  private volatile Thread currentJoinThread;

  private final AtomicBoolean initialStateSent = new AtomicBoolean();

  @Nullable private NodeService nodeService;

  @Inject
  public ZenDiscovery(
      Settings settings,
      ClusterName clusterName,
      ThreadPool threadPool,
      TransportService transportService,
      ClusterService clusterService,
      NodeSettingsService nodeSettingsService,
      DiscoveryNodeService discoveryNodeService,
      ZenPingService pingService,
      Version version,
      DiscoverySettings discoverySettings) {
    super(settings);
    this.clusterName = clusterName;
    this.threadPool = threadPool;
    this.clusterService = clusterService;
    this.transportService = transportService;
    this.discoveryNodeService = discoveryNodeService;
    this.pingService = pingService;
    this.version = version;

    // also support direct discovery.zen settings, for cases when it gets extended
    this.pingTimeout =
        settings.getAsTime(
            "discovery.zen.ping.timeout",
            settings.getAsTime(
                "discovery.zen.ping_timeout",
                componentSettings.getAsTime(
                    "ping_timeout",
                    componentSettings.getAsTime("initial_ping_timeout", timeValueSeconds(3)))));
    this.sendLeaveRequest = componentSettings.getAsBoolean("send_leave_request", true);

    this.masterElectionFilterClientNodes =
        settings.getAsBoolean("discovery.zen.master_election.filter_client", true);
    this.masterElectionFilterDataNodes =
        settings.getAsBoolean("discovery.zen.master_election.filter_data", false);

    logger.debug(
        "using ping.timeout [{}], master_election.filter_client [{}], master_election.filter_data [{}]",
        pingTimeout,
        masterElectionFilterClientNodes,
        masterElectionFilterDataNodes);

    this.electMaster = new ElectMasterService(settings);
    nodeSettingsService.addListener(new ApplySettings());

    this.masterFD = new MasterFaultDetection(settings, threadPool, transportService, this);
    this.masterFD.addListener(new MasterNodeFailureListener());

    this.nodesFD = new NodesFaultDetection(settings, threadPool, transportService);
    this.nodesFD.addListener(new NodeFailureListener());

    this.publishClusterState =
        new PublishClusterStateAction(
            settings, transportService, this, new NewClusterStateListener(), discoverySettings);
    this.pingService.setNodesProvider(this);
    this.membership =
        new MembershipAction(settings, transportService, this, new MembershipListener());

    transportService.registerHandler(
        RejoinClusterRequestHandler.ACTION, new RejoinClusterRequestHandler());
  }

  @Override
  public void setNodeService(@Nullable NodeService nodeService) {
    this.nodeService = nodeService;
  }

  @Override
  public void setAllocationService(AllocationService allocationService) {
    this.allocationService = allocationService;
  }

  @Override
  protected void doStart() throws ElasticsearchException {
    Map<String, String> nodeAttributes = discoveryNodeService.buildAttributes();
    // note: we rely on the fact that it's a new id each time we start; see FD and "kill -9" handling
    final String nodeId = DiscoveryService.generateNodeId(settings);
    localNode =
        new DiscoveryNode(
            settings.get("name"),
            nodeId,
            transportService.boundAddress().publishAddress(),
            nodeAttributes,
            version);
    latestDiscoNodes =
        new DiscoveryNodes.Builder().put(localNode).localNodeId(localNode.id()).build();
    nodesFD.updateNodes(latestDiscoNodes);
    pingService.start();

    // do the join on a different thread, the DiscoveryService waits for 30s anyhow till it is
    // discovered
    asyncJoinCluster();
  }

  @Override
  protected void doStop() throws ElasticsearchException {
    pingService.stop();
    masterFD.stop("zen disco stop");
    nodesFD.stop();
    initialStateSent.set(false);
    if (sendLeaveRequest) {
      if (!master && latestDiscoNodes.masterNode() != null) {
        try {
          membership.sendLeaveRequestBlocking(
              latestDiscoNodes.masterNode(), localNode, TimeValue.timeValueSeconds(1));
        } catch (Exception e) {
          logger.debug(
              "failed to send leave request to master [{}]", e, latestDiscoNodes.masterNode());
        }
      } else {
        DiscoveryNode[] possibleMasters =
            electMaster.nextPossibleMasters(latestDiscoNodes.nodes().values(), 5);
        for (DiscoveryNode possibleMaster : possibleMasters) {
          if (localNode.equals(possibleMaster)) {
            continue;
          }
          try {
            membership.sendLeaveRequest(latestDiscoNodes.masterNode(), possibleMaster);
          } catch (Exception e) {
            logger.debug(
                "failed to send leave request from master [{}] to possible master [{}]",
                e,
                latestDiscoNodes.masterNode(),
                possibleMaster);
          }
        }
      }
    }
    master = false;
    if (currentJoinThread != null) {
      try {
        currentJoinThread.interrupt();
      } catch (Exception e) {
        // ignore
      }
    }
  }

  @Override
  protected void doClose() throws ElasticsearchException {
    masterFD.close();
    nodesFD.close();
    publishClusterState.close();
    membership.close();
    pingService.close();
  }

  @Override
  public DiscoveryNode localNode() {
    return localNode;
  }

  @Override
  public void addListener(InitialStateDiscoveryListener listener) {
    this.initialStateListeners.add(listener);
  }

  @Override
  public void removeListener(InitialStateDiscoveryListener listener) {
    this.initialStateListeners.remove(listener);
  }

  @Override
  public String nodeDescription() {
    return clusterName.value() + "/" + localNode.id();
  }

  @Override
  public DiscoveryNodes nodes() {
    DiscoveryNodes latestNodes = this.latestDiscoNodes;
    if (latestNodes != null) {
      return latestNodes;
    }
    // have not decided yet, just send the local node
    return DiscoveryNodes.builder().put(localNode).localNodeId(localNode.id()).build();
  }

  @Override
  public NodeService nodeService() {
    return this.nodeService;
  }

  @Override
  public void publish(ClusterState clusterState, AckListener ackListener) {
    if (!master) {
      throw new ElasticsearchIllegalStateException("Shouldn't publish state when not master");
    }
    latestDiscoNodes = clusterState.nodes();
    nodesFD.updateNodes(clusterState.nodes());
    publishClusterState.publish(clusterState, ackListener);
  }

  private void asyncJoinCluster() {
    if (currentJoinThread != null) {
      // we are already joining, ignore...
      logger.trace("a join thread already running");
      return;
    }
    threadPool
        .generic()
        .execute(
            new Runnable() {
              @Override
              public void run() {
                currentJoinThread = Thread.currentThread();
                try {
                  innerJoinCluster();
                } finally {
                  currentJoinThread = null;
                }
              }
            });
  }

  private void innerJoinCluster() {
    boolean retry = true;
    while (retry) {
      if (lifecycle.stoppedOrClosed()) {
        return;
      }
      retry = false;
      DiscoveryNode masterNode = findMaster();
      if (masterNode == null) {
        logger.trace("no masterNode returned");
        retry = true;
        continue;
      }
      if (localNode.equals(masterNode)) {
        this.master = true;
        nodesFD.start(); // start the nodes FD
        clusterService.submitStateUpdateTask(
            "zen-disco-join (elected_as_master)",
            Priority.URGENT,
            new ProcessedClusterStateUpdateTask() {
              @Override
              public ClusterState execute(ClusterState currentState) {
                DiscoveryNodes.Builder builder =
                    new DiscoveryNodes.Builder()
                        .localNodeId(localNode.id())
                        .masterNodeId(localNode.id())
                        // put our local node
                        .put(localNode);
                // update the fact that we are the master...
                latestDiscoNodes = builder.build();
                ClusterBlocks clusterBlocks =
                    ClusterBlocks.builder()
                        .blocks(currentState.blocks())
                        .removeGlobalBlock(NO_MASTER_BLOCK)
                        .build();
                return ClusterState.builder(currentState)
                    .nodes(latestDiscoNodes)
                    .blocks(clusterBlocks)
                    .build();
              }

              @Override
              public void onFailure(String source, Throwable t) {
                logger.error("unexpected failure during [{}]", t, source);
              }

              @Override
              public void clusterStateProcessed(
                  String source, ClusterState oldState, ClusterState newState) {
                sendInitialStateEventIfNeeded();
              }
            });
      } else {
        this.master = false;
        try {
          // first, make sure we can connect to the master
          transportService.connectToNode(masterNode);
        } catch (Exception e) {
          logger.warn("failed to connect to master [{}], retrying...", e, masterNode);
          retry = true;
          continue;
        }
        // send join request
        try {
          membership.sendJoinRequestBlocking(masterNode, localNode, pingTimeout);
        } catch (Exception e) {
          if (e instanceof ElasticsearchException) {
            logger.info(
                "failed to send join request to master [{}], reason [{}]",
                masterNode,
                ((ElasticsearchException) e).getDetailedMessage());
          } else {
            logger.info(
                "failed to send join request to master [{}], reason [{}]",
                masterNode,
                e.getMessage());
          }
          if (logger.isTraceEnabled()) {
            logger.trace("detailed failed reason", e);
          }
          // failed to send the join request, retry
          retry = true;
          continue;
        }
        masterFD.start(masterNode, "initial_join");
        // no need to submit the received cluster state, we will get it from the master when it
        // publishes
        // the fact that we joined
      }
    }
  }

  private void handleLeaveRequest(final DiscoveryNode node) {
    if (lifecycleState() != Lifecycle.State.STARTED) {
      // not started, ignore the leave request
      return;
    }
    if (master) {
      clusterService.submitStateUpdateTask(
          "zen-disco-node_left(" + node + ")",
          Priority.URGENT,
          new ClusterStateUpdateTask() {
            @Override
            public ClusterState execute(ClusterState currentState) {
              DiscoveryNodes.Builder builder =
                  DiscoveryNodes.builder(currentState.nodes()).remove(node.id());
              latestDiscoNodes = builder.build();
              currentState = ClusterState.builder(currentState).nodes(latestDiscoNodes).build();
              // check if we have enough master nodes, if not, we need to move into joining the
              // cluster again
              if (!electMaster.hasEnoughMasterNodes(currentState.nodes())) {
                return rejoin(currentState, "not enough master nodes");
              }
              // eagerly run reroute to remove dead nodes from routing table
              RoutingAllocation.Result routingResult =
                  allocationService.reroute(ClusterState.builder(currentState).build());
              return ClusterState.builder(currentState).routingResult(routingResult).build();
            }

            @Override
            public void onFailure(String source, Throwable t) {
              logger.error("unexpected failure during [{}]", t, source);
            }
          });
    } else {
      handleMasterGone(node, "shut_down");
    }
  }

  private void handleNodeFailure(final DiscoveryNode node, String reason) {
    if (lifecycleState() != Lifecycle.State.STARTED) {
      // not started, ignore a node failure
      return;
    }
    if (!master) {
      // nothing to do here...
      return;
    }
    clusterService.submitStateUpdateTask(
        "zen-disco-node_failed(" + node + "), reason " + reason,
        Priority.URGENT,
        new ProcessedClusterStateUpdateTask() {
          @Override
          public ClusterState execute(ClusterState currentState) {
            DiscoveryNodes.Builder builder =
                DiscoveryNodes.builder(currentState.nodes()).remove(node.id());
            latestDiscoNodes = builder.build();
            currentState = ClusterState.builder(currentState).nodes(latestDiscoNodes).build();
            // check if we have enough master nodes, if not, we need to move into joining the
            // cluster again
            if (!electMaster.hasEnoughMasterNodes(currentState.nodes())) {
              return rejoin(currentState, "not enough master nodes");
            }
            // eagerly run reroute to remove dead nodes from routing table
            RoutingAllocation.Result routingResult =
                allocationService.reroute(ClusterState.builder(currentState).build());
            return ClusterState.builder(currentState).routingResult(routingResult).build();
          }

          @Override
          public void onFailure(String source, Throwable t) {
            logger.error("unexpected failure during [{}]", t, source);
          }

          @Override
          public void clusterStateProcessed(
              String source, ClusterState oldState, ClusterState newState) {
            sendInitialStateEventIfNeeded();
          }
        });
  }

  private void handleMinimumMasterNodesChanged(final int minimumMasterNodes) {
    if (lifecycleState() != Lifecycle.State.STARTED) {
      // not started, ignore the setting change
      return;
    }
    if (!master) {
      // nothing to do here...
      return;
    }
    clusterService.submitStateUpdateTask(
        "zen-disco-minimum_master_nodes_changed",
        Priority.URGENT,
        new ProcessedClusterStateUpdateTask() {
          @Override
          public ClusterState execute(ClusterState currentState) {
            final int prevMinimumMasterNode = ZenDiscovery.this.electMaster.minimumMasterNodes();
            ZenDiscovery.this.electMaster.minimumMasterNodes(minimumMasterNodes);
            // check if we have enough master nodes, if not, we need to move into joining the
            // cluster again
            if (!electMaster.hasEnoughMasterNodes(currentState.nodes())) {
              return rejoin(
                  currentState,
                  "not enough master nodes on change of minimum_master_nodes from ["
                      + prevMinimumMasterNode
                      + "] to ["
                      + minimumMasterNodes
                      + "]");
            }
            return currentState;
          }

          @Override
          public void onFailure(String source, Throwable t) {
            logger.error("unexpected failure during [{}]", t, source);
          }

          @Override
          public void clusterStateProcessed(
              String source, ClusterState oldState, ClusterState newState) {
            sendInitialStateEventIfNeeded();
          }
        });
  }

  private void handleMasterGone(final DiscoveryNode masterNode, final String reason) {
    if (lifecycleState() != Lifecycle.State.STARTED) {
      // not started, ignore a master failure
      return;
    }
    if (master) {
      // we might get this both when the master tells us it is shutting down and from the
      // subsequent disconnect failure
      return;
    }

    logger.info("master_left [{}], reason [{}]", masterNode, reason);

    clusterService.submitStateUpdateTask(
        "zen-disco-master_failed (" + masterNode + ")",
        Priority.URGENT,
        new ProcessedClusterStateUpdateTask() {
          @Override
          public ClusterState execute(ClusterState currentState) {
            if (!masterNode.id().equals(currentState.nodes().masterNodeId())) {
              // master got switched on us, no need to send anything
              return currentState;
            }

            DiscoveryNodes discoveryNodes =
                DiscoveryNodes.builder(currentState.nodes())
                    // make sure the old master node, which has failed, is not part of the nodes we
                    // publish
                    .remove(masterNode.id())
                    .masterNodeId(null)
                    .build();

            if (!electMaster.hasEnoughMasterNodes(discoveryNodes)) {
              return rejoin(
                  ClusterState.builder(currentState).nodes(discoveryNodes).build(),
                  "not enough master nodes after master left (reason = " + reason + ")");
            }

            final DiscoveryNode electedMaster =
                electMaster.electMaster(discoveryNodes); // elect master
            if (localNode.equals(electedMaster)) {
              master = true;
              masterFD.stop(
                  "got elected as new master since master left (reason = " + reason + ")");
              nodesFD.start();
              discoveryNodes =
                  DiscoveryNodes.builder(discoveryNodes).masterNodeId(localNode.id()).build();
              latestDiscoNodes = discoveryNodes;
              return ClusterState.builder(currentState).nodes(latestDiscoNodes).build();
            } else {
              nodesFD.stop();
              if (electedMaster != null) {
                discoveryNodes =
                    DiscoveryNodes.builder(discoveryNodes).masterNodeId(electedMaster.id()).build();
                masterFD.restart(
                    electedMaster,
                    "possible elected master since master left (reason = " + reason + ")");
                latestDiscoNodes = discoveryNodes;
                return ClusterState.builder(currentState).nodes(latestDiscoNodes).build();
              } else {
                return rejoin(
                    ClusterState.builder(currentState).nodes(discoveryNodes).build(),
                    "master_left and no other node elected to become master");
              }
            }
          }

          @Override
          public void onFailure(String source, Throwable t) {
            logger.error("unexpected failure during [{}]", t, source);
          }

          @Override
          public void clusterStateProcessed(
              String source, ClusterState oldState, ClusterState newState) {
            sendInitialStateEventIfNeeded();
          }
        });
  }

  static class ProcessClusterState {
    final ClusterState clusterState;
    final PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed;
    volatile boolean processed;

    ProcessClusterState(
        ClusterState clusterState,
        PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) {
      this.clusterState = clusterState;
      this.newStateProcessed = newStateProcessed;
    }
  }

  private final BlockingQueue<ProcessClusterState> processNewClusterStates =
      ConcurrentCollections.newBlockingQueue();

  void handleNewClusterStateFromMaster(
      ClusterState newClusterState,
      final PublishClusterStateAction.NewClusterStateListener.NewStateProcessed newStateProcessed) {
    if (master) {
      final ClusterState newState = newClusterState;
      clusterService.submitStateUpdateTask(
          "zen-disco-master_receive_cluster_state_from_another_master ["
              + newState.nodes().masterNode()
              + "]",
          Priority.URGENT,
          new ProcessedClusterStateUpdateTask() {
            @Override
            public ClusterState execute(ClusterState currentState) {
              if (newState.version() > currentState.version()) {
                logger.warn(
                    "received cluster state from [{}] which is also master but with a newer cluster_state, rejoining to cluster...",
                    newState.nodes().masterNode());
                return rejoin(
                    currentState,
                    "zen-disco-master_receive_cluster_state_from_another_master ["
                        + newState.nodes().masterNode()
                        + "]");
              } else {
                logger.warn(
                    "received cluster state from [{}] which is also master but with an older cluster_state, telling [{}] to rejoin the cluster",
                    newState.nodes().masterNode(),
                    newState.nodes().masterNode());
                transportService.sendRequest(
                    newState.nodes().masterNode(),
                    RejoinClusterRequestHandler.ACTION,
                    new RejoinClusterRequest(currentState.nodes().localNodeId()),
                    new EmptyTransportResponseHandler(ThreadPool.Names.SAME) {
                      @Override
                      public void handleException(TransportException exp) {
                        logger.warn(
                            "failed to send rejoin request to [{}]",
                            exp,
                            newState.nodes().masterNode());
                      }
                    });
                return currentState;
              }
            }

            @Override
            public void clusterStateProcessed(
                String source, ClusterState oldState, ClusterState newState) {
              newStateProcessed.onNewClusterStateProcessed();
            }

            @Override
            public void onFailure(String source, Throwable t) {
              logger.error("unexpected failure during [{}]", t, source);
              newStateProcessed.onNewClusterStateFailed(t);
            }
          });
    } else {
      if (newClusterState.nodes().localNode() == null) {
        logger.warn(
            "received a cluster state from [{}] and not part of the cluster, should not happen",
            newClusterState.nodes().masterNode());
        newStateProcessed.onNewClusterStateFailed(
            new ElasticsearchIllegalStateException(
                "received state from a node that is not part of the cluster"));
      } else {
        if (currentJoinThread != null) {
          logger.debug(
              "got a new state from master node, though we are already trying to rejoin the cluster");
        }

        final ProcessClusterState processClusterState =
            new ProcessClusterState(newClusterState, newStateProcessed);
        processNewClusterStates.add(processClusterState);

        clusterService.submitStateUpdateTask(
            "zen-disco-receive(from master [" + newClusterState.nodes().masterNode() + "])",
            Priority.URGENT,
            new ProcessedClusterStateUpdateTask() {
              @Override
              public ClusterState execute(ClusterState currentState) {
                // we already processed it in a previous event
                if (processClusterState.processed) {
                  return currentState;
                }

                // TODO: one improvement we can make is to change the message structure to include
                // the version and masterNodeId at the start; this would let us keep the
                // "compressed bytes" around, parse only the first page to figure out whether we
                // need the state at all, and parse the whole state only once we have picked the
                // latest one

                // try and get the state with the highest version out of all the ones with the same
                // master node id
                ProcessClusterState stateToProcess = processNewClusterStates.poll();
                if (stateToProcess == null) {
                  return currentState;
                }
                stateToProcess.processed = true;
                while (true) {
                  ProcessClusterState potentialState = processNewClusterStates.peek();
                  // nothing else in the queue, bail
                  if (potentialState == null) {
                    break;
                  }
                  // if it's not from the same master, then bail
                  if (!Objects.equal(
                      stateToProcess.clusterState.nodes().masterNodeId(),
                      potentialState.clusterState.nodes().masterNodeId())) {
                    break;
                  }

                  // we are going to use it for sure, poll (remove) it
                  potentialState = processNewClusterStates.poll();
                  potentialState.processed = true;

                  if (potentialState.clusterState.version()
                      > stateToProcess.clusterState.version()) {
                    // we found a new one
                    stateToProcess = potentialState;
                  }
                }

                ClusterState updatedState = stateToProcess.clusterState;

                // if the new state has a smaller version, and it has the same master node, then no
                // need to process it
                if (updatedState.version() < currentState.version()
                    && Objects.equal(
                        updatedState.nodes().masterNodeId(), currentState.nodes().masterNodeId())) {
                  return currentState;
                }

                // we don't need to do this, since we ping the master, and get notified when it has
                // moved from being a master
                // because it doesn't have enough master nodes...
                // if (!electMaster.hasEnoughMasterNodes(newState.nodes())) {
                //    return disconnectFromCluster(newState, "not enough master nodes on new cluster
                // state received from [" + newState.nodes().masterNode() + "]");
                // }

                latestDiscoNodes = updatedState.nodes();

                // check to see that we monitor the correct master of the cluster
                if (masterFD.masterNode() == null
                    || !masterFD.masterNode().equals(latestDiscoNodes.masterNode())) {
                  masterFD.restart(
                      latestDiscoNodes.masterNode(),
                      "new cluster state received and we are monitoring the wrong master ["
                          + masterFD.masterNode()
                          + "]");
                }

                ClusterState.Builder builder = ClusterState.builder(updatedState);
                // if the routing table did not change, use the original one
                if (updatedState.routingTable().version()
                    == currentState.routingTable().version()) {
                  builder.routingTable(currentState.routingTable());
                }
                // same for metadata
                if (updatedState.metaData().version() == currentState.metaData().version()) {
                  builder.metaData(currentState.metaData());
                } else {
                  // if it's not the same version, only copy over new indices or ones whose
                  // version changed
                  MetaData.Builder metaDataBuilder =
                      MetaData.builder(updatedState.metaData()).removeAllIndices();
                  for (IndexMetaData indexMetaData : updatedState.metaData()) {
                    IndexMetaData currentIndexMetaData =
                        currentState.metaData().index(indexMetaData.index());
                    if (currentIndexMetaData == null
                        || currentIndexMetaData.version() != indexMetaData.version()) {
                      metaDataBuilder.put(indexMetaData, false);
                    } else {
                      metaDataBuilder.put(currentIndexMetaData, false);
                    }
                  }
                  builder.metaData(metaDataBuilder);
                }

                return builder.build();
              }

              @Override
              public void onFailure(String source, Throwable t) {
                logger.error("unexpected failure during [{}]", t, source);
                newStateProcessed.onNewClusterStateFailed(t);
              }

              @Override
              public void clusterStateProcessed(
                  String source, ClusterState oldState, ClusterState newState) {
                sendInitialStateEventIfNeeded();
                newStateProcessed.onNewClusterStateProcessed();
              }
            });
      }
    }
  }

  private ClusterState handleJoinRequest(final DiscoveryNode node) {
    if (!master) {
      throw new ElasticsearchIllegalStateException(
          "Node [" + localNode + "] not master for join request from [" + node + "]");
    }

    ClusterState state = clusterService.state();
    if (!transportService.addressSupported(node.address().getClass())) {
      // TODO: what should we do now? Maybe inform that node that its address type is unsupported?
      logger.warn("received a wrong address type from [{}], ignoring...", node);
    } else {
      // try and connect to the node, if it fails, we can raise an exception back to the client...
      transportService.connectToNode(node);
      state = clusterService.state();

      // validate the join request, will throw a failure if it fails, which will get back to the
      // node calling the join request
      membership.sendValidateJoinRequestBlocking(node, state, pingTimeout);

      clusterService.submitStateUpdateTask(
          "zen-disco-receive(join from node[" + node + "])",
          Priority.URGENT,
          new ClusterStateUpdateTask() {
            @Override
            public ClusterState execute(ClusterState currentState) {
              if (currentState.nodes().nodeExists(node.id())) {
                // the node already exists in the cluster
                logger.warn("received a join request for an existing node [{}]", node);
                // still send a new cluster state, so it will be republished and possibly
                // update the other node
                return ClusterState.builder(currentState).build();
              }
              DiscoveryNodes.Builder builder = DiscoveryNodes.builder(currentState.nodes());
              for (DiscoveryNode existingNode : currentState.nodes()) {
                if (node.address().equals(existingNode.address())) {
                  builder.remove(existingNode.id());
                  logger.warn(
                      "received join request from node [{}], but found existing node {} with same address, removing existing node",
                      node,
                      existingNode);
                }
              }
              latestDiscoNodes = builder.build();
              // add the new node now (will update latestDiscoNodes on publish)
              return ClusterState.builder(currentState)
                  .nodes(latestDiscoNodes.newNode(node))
                  .build();
            }

            @Override
            public void onFailure(String source, Throwable t) {
              logger.error("unexpected failure during [{}]", t, source);
            }
          });
    }
    return state;
  }

  private DiscoveryNode findMaster() {
    ZenPing.PingResponse[] fullPingResponses = pingService.pingAndWait(pingTimeout);
    if (fullPingResponses == null) {
      logger.trace("No full ping responses");
      return null;
    }
    if (logger.isTraceEnabled()) {
      StringBuilder sb = new StringBuilder("full ping responses:");
      if (fullPingResponses.length == 0) {
        sb.append(" {none}");
      } else {
        for (ZenPing.PingResponse pingResponse : fullPingResponses) {
          sb.append("\n\t--> ")
              .append("target [")
              .append(pingResponse.target())
              .append("], master [")
              .append(pingResponse.master())
              .append("]");
        }
      }
      logger.trace(sb.toString());
    }

    // filter responses
    List<ZenPing.PingResponse> pingResponses = Lists.newArrayList();
    for (ZenPing.PingResponse pingResponse : fullPingResponses) {
      DiscoveryNode node = pingResponse.target();
      if (masterElectionFilterClientNodes
          && (node.clientNode() || (!node.masterNode() && !node.dataNode()))) {
        // filter out client nodes: explicit client nodes, and nodes that are neither data nor
        // master (effectively clients)
      } else if (masterElectionFilterDataNodes && (!node.masterNode() && node.dataNode())) {
        // filter out data node that is not also master
      } else {
        pingResponses.add(pingResponse);
      }
    }

    if (logger.isDebugEnabled()) {
      StringBuilder sb =
          new StringBuilder("filtered ping responses: (filter_client[")
              .append(masterElectionFilterClientNodes)
              .append("], filter_data[")
              .append(masterElectionFilterDataNodes)
              .append("])");
      if (pingResponses.isEmpty()) {
        sb.append(" {none}");
      } else {
        for (ZenPing.PingResponse pingResponse : pingResponses) {
          sb.append("\n\t--> ")
              .append("target [")
              .append(pingResponse.target())
              .append("], master [")
              .append(pingResponse.master())
              .append("]");
        }
      }
      logger.debug(sb.toString());
    }
    List<DiscoveryNode> pingMasters = newArrayList();
    for (ZenPing.PingResponse pingResponse : pingResponses) {
      if (pingResponse.master() != null) {
        pingMasters.add(pingResponse.master());
      }
    }

    Set<DiscoveryNode> possibleMasterNodes = Sets.newHashSet();
    possibleMasterNodes.add(localNode);
    for (ZenPing.PingResponse pingResponse : pingResponses) {
      possibleMasterNodes.add(pingResponse.target());
    }
    // if we don't see enough master-eligible nodes, we bail, even if some response indicates
    // that another node sees a master
    if (!electMaster.hasEnoughMasterNodes(possibleMasterNodes)) {
      return null;
    }

    if (pingMasters.isEmpty()) {
      // let's tie-break between the discovered nodes
      DiscoveryNode electedMaster = electMaster.electMaster(possibleMasterNodes);
      if (localNode.equals(electedMaster)) {
        return localNode;
      }
    } else {
      DiscoveryNode electedMaster = electMaster.electMaster(pingMasters);
      if (electedMaster != null) {
        return electedMaster;
      }
    }
    return null;
  }

  private ClusterState rejoin(ClusterState clusterState, String reason) {
    logger.warn(reason + ", current nodes: {}", clusterState.nodes());
    nodesFD.stop();
    masterFD.stop(reason);
    master = false;

    ClusterBlocks clusterBlocks =
        ClusterBlocks.builder()
            .blocks(clusterState.blocks())
            .addGlobalBlock(NO_MASTER_BLOCK)
            .addGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)
            .build();

    // clear the routing table, we have no master, so we need to recreate the routing when we reform
    // the cluster
    RoutingTable routingTable = RoutingTable.builder().build();
    // we also clean the metadata, since we are going to recover it if we become master
    MetaData metaData = MetaData.builder().build();

    // clean the nodes, we are now not connected to anybody, since we try and reform the cluster
    latestDiscoNodes =
        new DiscoveryNodes.Builder().put(localNode).localNodeId(localNode.id()).build();

    asyncJoinCluster();

    return ClusterState.builder(clusterState)
        .blocks(clusterBlocks)
        .nodes(latestDiscoNodes)
        .routingTable(routingTable)
        .metaData(metaData)
        .build();
  }

  private void sendInitialStateEventIfNeeded() {
    if (initialStateSent.compareAndSet(false, true)) {
      for (InitialStateDiscoveryListener listener : initialStateListeners) {
        listener.initialStateProcessed();
      }
    }
  }

  private class NewClusterStateListener
      implements PublishClusterStateAction.NewClusterStateListener {

    @Override
    public void onNewClusterState(ClusterState clusterState, NewStateProcessed newStateProcessed) {
      handleNewClusterStateFromMaster(clusterState, newStateProcessed);
    }
  }

  private class MembershipListener implements MembershipAction.MembershipListener {
    @Override
    public ClusterState onJoin(DiscoveryNode node) {
      return handleJoinRequest(node);
    }

    @Override
    public void onLeave(DiscoveryNode node) {
      handleLeaveRequest(node);
    }
  }

  private class NodeFailureListener implements NodesFaultDetection.Listener {

    @Override
    public void onNodeFailure(DiscoveryNode node, String reason) {
      handleNodeFailure(node, reason);
    }
  }

  private class MasterNodeFailureListener implements MasterFaultDetection.Listener {

    @Override
    public void onMasterFailure(DiscoveryNode masterNode, String reason) {
      handleMasterGone(masterNode, reason);
    }

    @Override
    public void onDisconnectedFromMaster() {
      // got disconnected from the master, send a join request
      DiscoveryNode masterNode = latestDiscoNodes.masterNode();
      try {
        membership.sendJoinRequest(masterNode, localNode);
      } catch (Exception e) {
        logger.warn("failed to send join request on disconnection from master [{}]", masterNode);
      }
    }
  }

  static class RejoinClusterRequest extends TransportRequest {

    private String fromNodeId;

    RejoinClusterRequest(String fromNodeId) {
      this.fromNodeId = fromNodeId;
    }

    RejoinClusterRequest() {}

    @Override
    public void readFrom(StreamInput in) throws IOException {
      super.readFrom(in);
      fromNodeId = in.readOptionalString();
    }

    @Override
    public void writeTo(StreamOutput out) throws IOException {
      super.writeTo(out);
      out.writeOptionalString(fromNodeId);
    }
  }

  class RejoinClusterRequestHandler extends BaseTransportRequestHandler<RejoinClusterRequest> {

    static final String ACTION = "discovery/zen/rejoin";

    @Override
    public RejoinClusterRequest newInstance() {
      return new RejoinClusterRequest();
    }

    @Override
    public void messageReceived(final RejoinClusterRequest request, final TransportChannel channel)
        throws Exception {
      clusterService.submitStateUpdateTask(
          "received a request to rejoin the cluster from [" + request.fromNodeId + "]",
          Priority.URGENT,
          new ClusterStateUpdateTask() {
            @Override
            public ClusterState execute(ClusterState currentState) {
              try {
                channel.sendResponse(TransportResponse.Empty.INSTANCE);
              } catch (Exception e) {
                logger.warn("failed to send response on rejoin cluster request handling", e);
              }
              return rejoin(
                  currentState,
                  "received a request to rejoin the cluster from [" + request.fromNodeId + "]");
            }

            @Override
            public void onFailure(String source, Throwable t) {
              logger.error("unexpected failure during [{}]", t, source);
            }
          });
    }

    @Override
    public String executor() {
      return ThreadPool.Names.SAME;
    }
  }

  class ApplySettings implements NodeSettingsService.Listener {
    @Override
    public void onRefreshSettings(Settings settings) {
      int minimumMasterNodes =
          settings.getAsInt(
              "discovery.zen.minimum_master_nodes",
              ZenDiscovery.this.electMaster.minimumMasterNodes());
      if (minimumMasterNodes != ZenDiscovery.this.electMaster.minimumMasterNodes()) {
        logger.info(
            "updating discovery.zen.minimum_master_nodes from [{}] to [{}]",
            ZenDiscovery.this.electMaster.minimumMasterNodes(),
            minimumMasterNodes);
        handleMinimumMasterNodesChanged(minimumMasterNodes);
      }
    }
  }
}
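The election flow in findMaster() above reduces to three steps: filter the ping responses, bail unless a quorum of master-eligible nodes is visible (minimum_master_nodes), then either join a master that some node already reports or deterministically tie-break among the candidates. The sketch below is a simplified model of those steps, not the real ElectMasterService (which orders DiscoveryNodes with a comparator and picks the first); here the ordering is approximated by node id alone so that every node elects the same winner.

import java.util.Comparator;
import java.util.List;
import java.util.Optional;

// Sketch only: quorum check plus deterministic tie-break.
class SimpleElectMaster {
  private final int minimumMasterNodes;

  SimpleElectMaster(int minimumMasterNodes) {
    this.minimumMasterNodes = minimumMasterNodes;
  }

  Optional<Node> findMaster(List<Node> activeMasters, List<Node> masterCandidates) {
    // bail if we cannot see enough master-eligible nodes, even if some ping
    // response claims a master exists (mirrors hasEnoughMasterNodes above)
    if (masterCandidates.size() < minimumMasterNodes) {
      return Optional.empty();
    }
    if (!activeMasters.isEmpty()) {
      // some node already reports an active master: join it
      return activeMasters.stream().min(Comparator.comparing((Node n) -> n.id));
    }
    // no active master: tie-break deterministically so that all nodes agree
    return masterCandidates.stream().min(Comparator.comparing((Node n) -> n.id));
  }

  static class Node {
    final String id;

    Node(String id) {
      this.id = id;
    }
  }
}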
Example #23
public class GatewayAllocator extends AbstractComponent {

  public static final String INDEX_RECOVERY_INITIAL_SHARDS = "index.recovery.initial_shards";

  private final String initialShards;

  private final TransportNodesListGatewayStartedShards startedAction;
  private final TransportNodesListShardStoreMetaData storeAction;
  private RoutingService routingService;

  private final ConcurrentMap<
          ShardId, AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>>
      asyncFetchStarted = ConcurrentCollections.newConcurrentMap();
  private final ConcurrentMap<
          ShardId, AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>>
      asyncFetchStore = ConcurrentCollections.newConcurrentMap();

  @Inject
  public GatewayAllocator(
      Settings settings,
      TransportNodesListGatewayStartedShards startedAction,
      TransportNodesListShardStoreMetaData storeAction) {
    super(settings);
    this.startedAction = startedAction;
    this.storeAction = storeAction;

    this.initialShards =
        settings.get(
            "gateway.initial_shards", settings.get("gateway.local.initial_shards", "quorum"));

    logger.debug("using initial_shards [{}]", initialShards);
  }

  public void setReallocation(
      final ClusterService clusterService, final RoutingService routingService) {
    this.routingService = routingService;
    clusterService.add(
        new ClusterStateListener() {
          @Override
          public void clusterChanged(ClusterChangedEvent event) {
            boolean cleanCache = false;
            DiscoveryNode localNode = event.state().nodes().localNode();
            if (localNode != null) {
              if (localNode.masterNode() == true && event.localNodeMaster() == false) {
                cleanCache = true;
              }
            } else {
              cleanCache = true;
            }
            if (cleanCache) {
              Releasables.close(asyncFetchStarted.values());
              asyncFetchStarted.clear();
              Releasables.close(asyncFetchStore.values());
              asyncFetchStore.clear();
            }
          }
        });
  }

  public int getNumberOfInFlightFetch() {
    int count = 0;
    for (AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch :
        asyncFetchStarted.values()) {
      count += fetch.getNumberOfInFlightFetches();
    }
    for (AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch :
        asyncFetchStore.values()) {
      count += fetch.getNumberOfInFlightFetches();
    }
    return count;
  }

  public void applyStartedShards(StartedRerouteAllocation allocation) {
    for (ShardRouting shard : allocation.startedShards()) {
      Releasables.close(asyncFetchStarted.remove(shard.shardId()));
      Releasables.close(asyncFetchStore.remove(shard.shardId()));
    }
  }

  public void applyFailedShards(FailedRerouteAllocation allocation) {
    for (FailedRerouteAllocation.FailedShard shard : allocation.failedShards()) {
      Releasables.close(asyncFetchStarted.remove(shard.shard.shardId()));
      Releasables.close(asyncFetchStore.remove(shard.shard.shardId()));
    }
  }

  /** Return {@code true} if the index is configured to allow shards to be recovered on any node */
  private boolean recoverOnAnyNode(@IndexSettings Settings idxSettings) {
    return IndexMetaData.isOnSharedFilesystem(idxSettings)
        && idxSettings.getAsBoolean(
            IndexMetaData.SETTING_SHARED_FS_ALLOW_RECOVERY_ON_ANY_NODE, false);
  }

  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    DiscoveryNodes nodes = allocation.nodes();
    RoutingNodes routingNodes = allocation.routingNodes();

    // First, handle primaries, they must find a place to be allocated on here
    final MetaData metaData = routingNodes.metaData();
    RoutingNodes.UnassignedShards unassigned = routingNodes.unassigned();
    unassigned.sort(
        new PriorityComparator() {

          @Override
          protected Settings getIndexSettings(String index) {
            IndexMetaData indexMetaData = metaData.index(index);
            return indexMetaData.getSettings();
          }
        }); // sort for priority ordering
    Iterator<ShardRouting> unassignedIterator = unassigned.iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();

      if (!shard.primary()) {
        continue;
      }

      // this is an API allocation, ignore since we know there is no data...
      if (!routingNodes
          .routingTable()
          .index(shard.index())
          .shard(shard.id())
          .primaryAllocatedPostApi()) {
        continue;
      }

      AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch =
          asyncFetchStarted.get(shard.shardId());
      if (fetch == null) {
        fetch = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction);
        asyncFetchStarted.put(shard.shardId(), fetch);
      }
      AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>
          shardState = fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
      if (shardState.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard started state", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue;
      }
      shardState.processAllocation(allocation);

      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      /**
       * Build a map of DiscoveryNodes to shard state number for the given shard. A state of -1
       * means the shard does not exist on the node, where any shard state >= 0 is the state version
       * of the shard on that node's disk.
       *
       * <p>A shard on shared storage will return at least shard state 0 for all nodes, indicating
       * that the shard can be allocated to any node.
       */
      ObjectLongHashMap<DiscoveryNode> nodesState = new ObjectLongHashMap<>();
      for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState :
          shardState.getData().values()) {
        long version = nodeShardState.version();
        // a version of -1 means the shard does not exist on that node, which is what the API
        // returns and what we expect
        logger.trace(
            "[{}] on node [{}] has version [{}] of shard",
            shard,
            nodeShardState.getNode(),
            version);
        nodesState.put(nodeShardState.getNode(), version);
      }

      int numberOfAllocationsFound = 0;
      long highestVersion = -1;
      final Map<DiscoveryNode, Long> nodesWithVersion = Maps.newHashMap();

      assert !nodesState.containsKey(null);
      final Object[] keys = nodesState.keys;
      final long[] values = nodesState.values;
      Settings idxSettings = indexMetaData.settings();
      for (int i = 0; i < keys.length; i++) {
        if (keys[i] == null) {
          continue;
        }

        DiscoveryNode node = (DiscoveryNode) keys[i];
        long version = values[i];
        // since we don't check in NO allocation, we need to double check here
        if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
          continue;
        }
        if (recoverOnAnyNode(idxSettings)) {
          numberOfAllocationsFound++;
          if (version > highestVersion) {
            highestVersion = version;
          }
          // We always put the node without clearing the map
          nodesWithVersion.put(node, version);
        } else if (version != -1) {
          numberOfAllocationsFound++;
          // If we've found a new "best" candidate, clear the
          // current candidates and add it
          if (version > highestVersion) {
            highestVersion = version;
            nodesWithVersion.clear();
            nodesWithVersion.put(node, version);
          } else if (version == highestVersion) {
            // If the candidate is the same, add it to the
            // list, but keep the current candidate
            nodesWithVersion.put(node, version);
          }
        }
      }
      // Now that we have a map of nodes to versions along with the
      // number of allocations found (and not ignored), we need to sort
      // it so the node with the highest version is at the beginning
      List<DiscoveryNode> nodesWithHighestVersion = Lists.newArrayList();
      nodesWithHighestVersion.addAll(nodesWithVersion.keySet());
      CollectionUtil.timSort(
          nodesWithHighestVersion,
          new Comparator<DiscoveryNode>() {
            @Override
            public int compare(DiscoveryNode o1, DiscoveryNode o2) {
              return Long.compare(nodesWithVersion.get(o2), nodesWithVersion.get(o1));
            }
          });

      if (logger.isDebugEnabled()) {
        logger.debug(
            "[{}][{}] found {} allocations of {}, highest version: [{}]",
            shard.index(),
            shard.id(),
            numberOfAllocationsFound,
            shard,
            highestVersion);
      }
      if (logger.isTraceEnabled()) {
        StringBuilder sb = new StringBuilder("[");
        for (DiscoveryNode n : nodesWithHighestVersion) {
          sb.append("[");
          sb.append(n.getName());
          sb.append("]");
          sb.append(" -> ");
          sb.append(nodesWithVersion.get(n));
          sb.append(", ");
        }
        sb.append("]");
        logger.trace("{} candidates for allocation: {}", shard, sb.toString());
      }

      // check whether the count meets the required minimum
      int requiredAllocation = 1;
      // if we restore from a repository, one copy is more than enough
      if (shard.restoreSource() == null) {
        try {
          String initialShards =
              indexMetaData
                  .settings()
                  .get(
                      INDEX_RECOVERY_INITIAL_SHARDS,
                      settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
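          // worked example (illustrative, not part of the original source): with
          // 2 replicas there are 3 copies in total, so "quorum" requires
          // ((1 + 2) / 2) + 1 = 2 copies found before allocating the primary,
          // "full"/"all" would require all 3, and "one" requires a single copy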
          if ("quorum".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
            }
          } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 2) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
            }
          } else if ("one".equals(initialShards)) {
            requiredAllocation = 1;
          } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
            requiredAllocation = indexMetaData.numberOfReplicas() + 1;
          } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = indexMetaData.numberOfReplicas();
            }
          } else {
            requiredAllocation = Integer.parseInt(initialShards);
          }
        } catch (Exception e) {
          logger.warn(
              "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}",
              shard.index(),
              shard.id(),
              initialShards,
              shard);
        }
      }
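      // Worked example (editorial note, not in the original source): with
      // initial_shards == "quorum" and numberOfReplicas == 2,
      // requiredAllocation = ((1 + 2) / 2) + 1 = 2, so at least two of the
      // three copies must have been found on disk before the primary allocates.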

      // not enough found for this shard, continue...
      if (numberOfAllocationsFound < requiredAllocation) {
        // if we are restoring this shard we still can allocate
        if (shard.restoreSource() == null) {
          // we can't really allocate, so ignore it and continue
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
                shard.index(),
                shard.id(),
                numberOfAllocationsFound,
                requiredAllocation);
          }
        } else if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: missing local data, will restore from [{}]",
              shard.index(),
              shard.id(),
              shard.restoreSource());
        }
        continue;
      }

      Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
      Set<DiscoveryNode> noNodes = Sets.newHashSet();
      for (DiscoveryNode discoNode : nodesWithHighestVersion) {
        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          throttledNodes.add(discoNode);
        } else if (decision.type() == Decision.Type.NO) {
          noNodes.add(discoNode);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();

          // found a node, so no throttling, no "no", and break out of the loop
          throttledNodes.clear();
          noNodes.clear();
          break;
        }
      }
      if (throttledNodes.isEmpty()) {
        // if we have a node that we "can't" allocate to, force allocation,
        // since this is our master data!
        if (!noNodes.isEmpty()) {
          DiscoveryNode discoNode = noNodes.iterator().next();
          RoutingNode node = routingNodes.node(discoNode.id());
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();
        }
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              throttledNodes);
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      }
    }

    if (!routingNodes.hasUnassigned()) {
      return changed;
    }

    // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was
    // allocated on
    unassignedIterator = unassigned.iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();
      if (shard.primary()) {
        continue;
      }

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue;
      }

      AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch =
          asyncFetchStore.get(shard.shardId());
      if (fetch == null) {
        fetch = new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction);
        asyncFetchStore.put(shard.shardId(), fetch);
      }
      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          shardStores =
              fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
      if (shardStores.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue; // still fetching
      }
      shardStores.processAllocation(allocation);

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;
      boolean hasReplicaData = false;
      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          nodeStoreEntry : shardStores.getData().entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue().storeFilesMetaData();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          hasReplicaData |= storeFilesMetaData.iterator().hasNext();
          ShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore =
                  shardStores.getData().get(primaryNode);
              if (primaryNodeFilesStore != null) {
                TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                    primaryNodeFilesStore.storeFilesMetaData();
                if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                  long sizeMatched = 0;

                  String primarySyncId = primaryNodeStore.syncId();
                  String replicaSyncId = storeFilesMetaData.syncId();
                  // see if we have a sync id we can make use of
                  if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
                    logger.trace(
                        "{}: node [{}] has same sync id {} as primary",
                        shard,
                        discoNode.name(),
                        replicaSyncId);
                    lastNodeMatched = node;
                    lastSizeMatched = Long.MAX_VALUE;
                    lastDiscoNodeMatched = discoNode;
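                    // editorial note: Long.MAX_VALUE guarantees a sync-id hit
                    // beats any byte-count match computed in the else branch below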
                  } else {
                    for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                      String metaDataFileName = storeFileMetaData.name();
                      if (primaryNodeStore.fileExists(metaDataFileName)
                          && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) {
                        sizeMatched += storeFileMetaData.length();
                      }
                    }
                    logger.trace(
                        "{}: node [{}] has [{}/{}] bytes of re-usable data",
                        shard,
                        discoNode.name(),
                        new ByteSizeValue(sizeMatched),
                        sizeMatched);
                    if (sizeMatched > lastSizeMatched) {
                      lastSizeMatched = sizeMatched;
                      lastDiscoNodeMatched = discoNode;
                      lastNodeMatched = node;
                    }
                  }
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check THROTTLE here since we already checked for NO above
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          routingNodes.initialize(shard, lastNodeMatched.nodeId());
          unassignedIterator.remove();
        }
      } else if (hasReplicaData == false) {
        // if we didn't manage to find *any* data (regardless of matching sizes), check if the
        // allocation of the replica shard needs to be delayed, and if so, add it to the ignored
        // unassigned list
        // note: we only care about replicas in delayed allocation, since an unassigned primary
        //       will anyhow wait to find an existing copy of the shard to be allocated
        // note: the other side of the equation is scheduling a reroute in a timely manner, which
        //       happens in the RoutingService
        long delay =
            shard
                .unassignedInfo()
                .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings());
        if (delay > 0) {
          logger.debug(
              "[{}][{}]: delaying allocation of [{}] for [{}]",
              shard.index(),
              shard.id(),
              shard,
              TimeValue.timeValueMillis(delay));
          // mark it as changed, since we want to kick a publishing to schedule future allocation;
          // see {@link
          // org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}
          changed = true;
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        }
      }
    }
    return changed;
  }

  class InternalAsyncFetch<T extends BaseNodeResponse> extends AsyncShardFetch<T> {

    public InternalAsyncFetch(
        ESLogger logger,
        String type,
        ShardId shardId,
        Lister<? extends BaseNodesResponse<T>, T> action) {
      super(logger, type, shardId, action);
    }

    @Override
    protected void reroute(ShardId shardId, String reason) {
      logger.trace("{} scheduling reroute for {}", shardId, reason);
      routingService.reroute("async_shard_fetch");
    }
  }
}
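The primary pass above keeps only the nodes holding the freshest copy of a shard: a new highest version clears the candidate map, a tie adds to it. Below is a minimal, self-contained restatement of that selection rule, with plain JDK types standing in for DiscoveryNode and the node-to-version map (an editorial sketch, not Elasticsearch code):

import java.util.HashMap;
import java.util.Map;

// Editorial sketch of the primary candidate selection above: track the highest
// shard-copy version seen, clearing the candidate set on a new maximum and
// adding on a tie; -1 marks nodes with no copy of the shard.
public class BestVersionCandidatesSketch {

  public static Map<String, Long> bestCandidates(Map<String, Long> versionPerNode) {
    long highestVersion = -1;
    Map<String, Long> nodesWithVersion = new HashMap<>();
    for (Map.Entry<String, Long> entry : versionPerNode.entrySet()) {
      long version = entry.getValue();
      if (version == -1) {
        continue; // no copy on this node
      }
      if (version > highestVersion) {
        highestVersion = version; // new best: restart the candidate set
        nodesWithVersion.clear();
        nodesWithVersion.put(entry.getKey(), version);
      } else if (version == highestVersion) {
        nodesWithVersion.put(entry.getKey(), version); // tie: keep both
      }
    }
    return nodesWithVersion;
  }

  public static void main(String[] args) {
    Map<String, Long> input = Map.of("node-1", 3L, "node-2", 7L, "node-3", 7L, "node-4", -1L);
    System.out.println(bestCandidates(input)); // node-2 and node-3, both at version 7
  }
}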
/**
 * The dangling indices state is responsible for finding new dangling indices (indices that have
 * their state written on disk, but don't exist in the metadata of the cluster), and importing them
 * into the cluster.
 */
public class DanglingIndicesState extends AbstractComponent {

  private final NodeEnvironment nodeEnv;
  private final MetaStateService metaStateService;
  private final LocalAllocateDangledIndices allocateDangledIndices;

  private final Map<String, IndexMetaData> danglingIndices =
      ConcurrentCollections.newConcurrentMap();

  @Inject
  public DanglingIndicesState(
      Settings settings,
      NodeEnvironment nodeEnv,
      MetaStateService metaStateService,
      LocalAllocateDangledIndices allocateDangledIndices) {
    super(settings);
    this.nodeEnv = nodeEnv;
    this.metaStateService = metaStateService;
    this.allocateDangledIndices = allocateDangledIndices;
  }

  /**
   * Process dangling indices based on the provided meta data, handling cleanup, finding new
   * dangling indices, and allocating outstanding ones.
   */
  public void processDanglingIndices(MetaData metaData) {
    if (nodeEnv.hasNodeFile() == false) {
      return;
    }
    cleanupAllocatedDangledIndices(metaData);
    findNewAndAddDanglingIndices(metaData);
    allocateDanglingIndices();
  }

  /** The current set of dangling indices. */
  Map<String, IndexMetaData> getDanglingIndices() {
    return ImmutableMap.copyOf(danglingIndices);
  }

  /** Cleans dangling indices if they are already allocated on the provided meta data. */
  void cleanupAllocatedDangledIndices(MetaData metaData) {
    for (String danglingIndex : danglingIndices.keySet()) {
      if (metaData.hasIndex(danglingIndex)) {
        logger.debug(
            "[{}] no longer dangling (created), removing from dangling list", danglingIndex);
        danglingIndices.remove(danglingIndex);
      }
    }
  }
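  // Editorial note: removing entries from danglingIndices while iterating its
  // keySet() is safe here because the map comes from
  // ConcurrentCollections.newConcurrentMap(): concurrent-map iterators are
  // weakly consistent and never throw ConcurrentModificationException. The
  // same loop over a plain HashMap would need an explicit Iterator.remove().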

  /**
   * Finds ({@link #findNewDanglingIndices}) and adds the new dangling indices to the currently
   * tracked dangling indices.
   */
  void findNewAndAddDanglingIndices(MetaData metaData) {
    danglingIndices.putAll(findNewDanglingIndices(metaData));
  }

  /**
   * Finds new dangling indices by iterating over the indices and trying to find indices that have
   * state on disk, but are not part of the provided meta data and have not already been detected
   * as dangling.
   */
  Map<String, IndexMetaData> findNewDanglingIndices(MetaData metaData) {
    final Set<String> indices;
    try {
      indices = nodeEnv.findAllIndices();
    } catch (Throwable e) {
      logger.warn("failed to list dangling indices", e);
      return ImmutableMap.of();
    }

    Map<String, IndexMetaData> newIndices = Maps.newHashMap();
    for (String indexName : indices) {
      if (metaData.hasIndex(indexName) == false
          && danglingIndices.containsKey(indexName) == false) {
        try {
          IndexMetaData indexMetaData = metaStateService.loadIndexState(indexName);
          if (indexMetaData != null) {
            logger.info(
                "[{}] dangling index, exists on local file system, but not in cluster metadata, auto import to cluster state",
                indexName);
            if (!indexMetaData.index().equals(indexName)) {
              logger.info(
                  "dangled index directory name is [{}], state name is [{}], renaming to directory name",
                  indexName,
                  indexMetaData.index());
              indexMetaData = IndexMetaData.builder(indexMetaData).index(indexName).build();
            }
            newIndices.put(indexName, indexMetaData);
          } else {
            logger.debug("[{}] dangling index directory detected, but no state found", indexName);
          }
        } catch (Throwable t) {
          logger.warn("[{}] failed to load index state for detected dangled index", t, indexName);
        }
      }
    }
    return newIndices;
  }

  /**
   * Allocates the current set of dangling indices by sending them to the master node for
   * allocation.
   */
  private void allocateDanglingIndices() {
    if (danglingIndices.isEmpty()) {
      return;
    }
    try {
      allocateDangledIndices.allocateDangled(
          ImmutableList.copyOf(danglingIndices.values()),
          new LocalAllocateDangledIndices.Listener() {
            @Override
            public void onResponse(LocalAllocateDangledIndices.AllocateDangledResponse response) {
              logger.trace("allocated dangled");
            }

            @Override
            public void onFailure(Throwable e) {
              logger.info("failed to send allocated dangled", e);
            }
          });
    } catch (Throwable e) {
      logger.warn("failed to send allocate dangled", e);
    }
  }
}
 /**
  * Creates a new BlockingClusterStatePublishResponseHandler
  *
  * @param publishingToNodes the set of nodes to which the cluster state will be published and
  *     should respond
  */
 public BlockingClusterStatePublishResponseHandler(Set<DiscoveryNode> publishingToNodes) {
   this.pendingNodes = ConcurrentCollections.newConcurrentSet();
   this.pendingNodes.addAll(publishingToNodes);
   this.latch = new CountDownLatch(pendingNodes.size());
 }
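Only the constructor of BlockingClusterStatePublishResponseHandler appears above: a concurrent pending-node set paired with a CountDownLatch sized to match it. The sketch below fills in how such a handler plausibly completes; the onResponse and awaitAllNodes methods here are illustrative assumptions, not the original API:

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

// Editorial sketch of the handler pattern: the pending set and latch from the
// constructor above, plus illustrative completion methods.
public class PublishResponseHandlerSketch {

  private final Set<String> pendingNodes = ConcurrentHashMap.newKeySet();
  private final CountDownLatch latch;

  public PublishResponseHandlerSketch(Set<String> publishingToNodes) {
    this.pendingNodes.addAll(publishingToNodes);
    this.latch = new CountDownLatch(pendingNodes.size());
  }

  // hypothetical: invoked once per node that acknowledged the cluster state
  public void onResponse(String node) {
    boolean removed = pendingNodes.remove(node);
    assert removed : "each node should respond exactly once";
    latch.countDown();
  }

  // hypothetical: blocks the publisher until every node responded or the timeout expires
  public boolean awaitAllNodes(long timeoutMillis) throws InterruptedException {
    return latch.await(timeoutMillis, TimeUnit.MILLISECONDS);
  }
}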
Example #26
 private Set<MockZenPing> getActiveNodesForCurrentCluster() {
   return activeNodesPerCluster.computeIfAbsent(
       getClusterName(), clusterName -> ConcurrentCollections.newConcurrentSet());
 }
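Example #26 is the core idiom this listing keeps returning to: computeIfAbsent lazily creates one concurrent set per cluster name and hands every caller the same instance, with no explicit locking. A runnable sketch of the same idiom on plain JDK types:

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

// Editorial sketch: computeIfAbsent creates the per-cluster set at most once,
// even under concurrent callers, and always returns the same instance.
public class PerClusterSetsSketch {

  static final ConcurrentMap<String, Set<String>> activeNodesPerCluster =
      new ConcurrentHashMap<>();

  static Set<String> activeNodes(String clusterName) {
    return activeNodesPerCluster.computeIfAbsent(
        clusterName, name -> ConcurrentHashMap.newKeySet());
  }

  public static void main(String[] args) {
    activeNodes("test-cluster").add("node-1");
    activeNodes("test-cluster").add("node-2");
    System.out.println(activeNodes("test-cluster").size()); // 2
    System.out.println(activeNodes("test-cluster") == activeNodes("test-cluster")); // true
  }
}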
Example #27
public class ShardStateAction extends AbstractComponent {

  private final TransportService transportService;

  private final ClusterService clusterService;

  private final AllocationService allocationService;

  private final ThreadPool threadPool;

  private final BlockingQueue<ShardRouting> startedShardsQueue =
      ConcurrentCollections.newBlockingQueue();

  @Inject
  public ShardStateAction(
      Settings settings,
      ClusterService clusterService,
      TransportService transportService,
      AllocationService allocationService,
      ThreadPool threadPool) {
    super(settings);
    this.clusterService = clusterService;
    this.transportService = transportService;
    this.allocationService = allocationService;
    this.threadPool = threadPool;

    transportService.registerHandler(
        ShardStartedTransportHandler.ACTION, new ShardStartedTransportHandler());
    transportService.registerHandler(
        ShardFailedTransportHandler.ACTION, new ShardFailedTransportHandler());
  }

  public void shardFailed(final ShardRouting shardRouting, final String reason)
      throws ElasticSearchException {
    logger.warn("sending failed shard for {}, reason [{}]", shardRouting, reason);
    DiscoveryNodes nodes = clusterService.state().nodes();
    if (nodes.localNodeMaster()) {
      innerShardFailed(shardRouting, reason);
    } else {
      transportService.sendRequest(
          clusterService.state().nodes().masterNode(),
          ShardFailedTransportHandler.ACTION,
          new ShardRoutingEntry(shardRouting, reason),
          new VoidTransportResponseHandler(ThreadPool.Names.SAME) {
            @Override
            public void handleException(TransportException exp) {
              logger.warn(
                  "failed to send failed shard to [{}]",
                  exp,
                  clusterService.state().nodes().masterNode());
            }
          });
    }
  }

  public void shardStarted(final ShardRouting shardRouting, final String reason)
      throws ElasticSearchException {
    if (logger.isDebugEnabled()) {
      logger.debug("sending shard started for {}, reason [{}]", shardRouting, reason);
    }
    DiscoveryNodes nodes = clusterService.state().nodes();
    if (nodes.localNodeMaster()) {
      innerShardStarted(shardRouting, reason);
    } else {
      transportService.sendRequest(
          clusterService.state().nodes().masterNode(),
          ShardStartedTransportHandler.ACTION,
          new ShardRoutingEntry(shardRouting, reason),
          new VoidTransportResponseHandler(ThreadPool.Names.SAME) {
            @Override
            public void handleException(TransportException exp) {
              logger.warn(
                  "failed to send shard started to [{}]",
                  exp,
                  clusterService.state().nodes().masterNode());
            }
          });
    }
  }

  private void innerShardFailed(final ShardRouting shardRouting, final String reason) {
    logger.warn("received shard failed for {}, reason [{}]", shardRouting, reason);
    clusterService.submitStateUpdateTask(
        "shard-failed (" + shardRouting + "), reason [" + reason + "]",
        new ClusterStateUpdateTask() {
          @Override
          public ClusterState execute(ClusterState currentState) {
            if (logger.isDebugEnabled()) {
              logger.debug("Received failed shard {}, reason [{}]", shardRouting, reason);
            }
            RoutingAllocation.Result routingResult =
                allocationService.applyFailedShard(currentState, shardRouting);
            if (!routingResult.changed()) {
              return currentState;
            }
            if (logger.isDebugEnabled()) {
              logger.debug("Applying failed shard {}, reason [{}]", shardRouting, reason);
            }
            return newClusterStateBuilder()
                .state(currentState)
                .routingResult(routingResult)
                .build();
          }
        });
  }

  private void innerShardStarted(final ShardRouting shardRouting, final String reason) {
    if (logger.isDebugEnabled()) {
      logger.debug("received shard started for {}, reason [{}]", shardRouting, reason);
    }
    // buffer shard started requests, and the state update tasks will simply drain it
    // this is to optimize the number of "started" events we generate, and batch them
    // possibly, we can do time based batching as well, but usually, we would want to
    // process started events as fast as possible, to make shards available
    startedShardsQueue.add(shardRouting);

    clusterService.submitStateUpdateTask(
        "shard-started (" + shardRouting + "), reason [" + reason + "]",
        new ClusterStateUpdateTask() {
          @Override
          public ClusterState execute(ClusterState currentState) {

            List<ShardRouting> shards = new ArrayList<ShardRouting>();
            startedShardsQueue.drainTo(shards);

            // nothing to process (a previous task has already processed it)
            if (shards.isEmpty()) {
              return currentState;
            }

            RoutingTable routingTable = currentState.routingTable();

            for (int i = 0; i < shards.size(); i++) {
              ShardRouting shardRouting = shards.get(i);
              IndexRoutingTable indexRoutingTable = routingTable.index(shardRouting.index());
              // if there is no routing table, the index has been deleted while it was being
              // allocated, which is fine, we should just ignore this
              if (indexRoutingTable == null) {
                shards.remove(i--); // step back so the element shifted into slot i isn't skipped
              } else {
                // find the one that maps to us, if it's already started, no need to do anything...
                // the shard might already be started since the node that is starting the shards
                // might get cluster events with the shard still initializing, and it will try and
                // start it again (until the verification comes)
                IndexShardRoutingTable indexShardRoutingTable =
                    indexRoutingTable.shard(shardRouting.id());
                for (ShardRouting entry : indexShardRoutingTable) {
                  if (shardRouting.currentNodeId().equals(entry.currentNodeId())) {
                    // we found the same shard that exists on the same node id
                    if (entry.started()) {
                      // already started, drop it from the batch
                      shards.remove(i--); // step back so the next element isn't skipped
                      break;
                    }
                  }
                }
              }
            }

            if (shards.isEmpty()) {
              return currentState;
            }

            if (logger.isDebugEnabled()) {
              logger.debug("applying started shards {}, reason [{}]", shards, reason);
            }
            RoutingAllocation.Result routingResult =
                allocationService.applyStartedShards(currentState, shards);
            if (!routingResult.changed()) {
              return currentState;
            }
            return newClusterStateBuilder()
                .state(currentState)
                .routingResult(routingResult)
                .build();
          }
        });
  }

  private class ShardFailedTransportHandler extends BaseTransportRequestHandler<ShardRoutingEntry> {

    static final String ACTION = "cluster/shardFailure";

    @Override
    public ShardRoutingEntry newInstance() {
      return new ShardRoutingEntry();
    }

    @Override
    public void messageReceived(ShardRoutingEntry request, TransportChannel channel)
        throws Exception {
      innerShardFailed(request.shardRouting, request.reason);
      channel.sendResponse(VoidStreamable.INSTANCE);
    }

    @Override
    public String executor() {
      return ThreadPool.Names.SAME;
    }
  }

  class ShardStartedTransportHandler extends BaseTransportRequestHandler<ShardRoutingEntry> {

    static final String ACTION = "cluster/shardStarted";

    @Override
    public ShardRoutingEntry newInstance() {
      return new ShardRoutingEntry();
    }

    @Override
    public void messageReceived(ShardRoutingEntry request, TransportChannel channel)
        throws Exception {
      innerShardStarted(request.shardRouting, request.reason);
      channel.sendResponse(VoidStreamable.INSTANCE);
    }

    @Override
    public String executor() {
      return ThreadPool.Names.SAME;
    }
  }

  static class ShardRoutingEntry extends TransportRequest {

    private ShardRouting shardRouting;

    private String reason;

    private ShardRoutingEntry() {}

    private ShardRoutingEntry(ShardRouting shardRouting, String reason) {
      this.shardRouting = shardRouting;
      this.reason = reason;
    }

    @Override
    public void readFrom(StreamInput in) throws IOException {
      super.readFrom(in);
      shardRouting = readShardRoutingEntry(in);
      reason = in.readString();
    }

    @Override
    public void writeTo(StreamOutput out) throws IOException {
      super.writeTo(out);
      shardRouting.writeTo(out);
      out.writeString(reason);
    }
  }
}
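The buffering in innerShardStarted above is a queue-and-drain batching pattern: producers append cheaply to a blocking queue, and whichever cluster-state update task runs first drains everything accumulated so far, collapsing many "shard started" events into a single routing update; a task that finds the queue empty leaves the state unchanged. A self-contained sketch of just that pattern:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

// Editorial sketch of the queue-and-drain batching used by innerShardStarted:
// producers enqueue cheaply; the first task to run drains the whole backlog
// and applies it as one batch, so later tasks may find nothing left to do.
public class DrainBatchingSketch {

  static final BlockingQueue<String> startedShardsQueue = new LinkedBlockingQueue<>();

  static void onShardStarted(String shardId) {
    startedShardsQueue.add(shardId); // no heavy work on the event path
  }

  static void runStateUpdateTask() {
    List<String> shards = new ArrayList<>();
    startedShardsQueue.drainTo(shards);
    if (shards.isEmpty()) {
      return; // a previous task already processed the backlog
    }
    System.out.println("applying started shards " + shards + " in a single update");
  }

  public static void main(String[] args) {
    onShardStarted("idx[0]");
    onShardStarted("idx[1]");
    runStateUpdateTask(); // applying started shards [idx[0], idx[1]] in a single update
    runStateUpdateTask(); // no-op
  }
}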
Example #28
/**
 * A transport class that doesn't send anything but rather captures all requests for inspection from
 * tests
 */
public class CapturingTransport implements Transport {

  private TransportServiceAdapter adapter;

  public static class CapturedRequest {
    public final DiscoveryNode node;
    public final long requestId;
    public final String action;
    public final TransportRequest request;

    public CapturedRequest(
        DiscoveryNode node, long requestId, String action, TransportRequest request) {
      this.node = node;
      this.requestId = requestId;
      this.action = action;
      this.request = request;
    }
  }

  private ConcurrentMap<Long, Tuple<DiscoveryNode, String>> requests = new ConcurrentHashMap<>();
  private BlockingQueue<CapturedRequest> capturedRequests =
      ConcurrentCollections.newBlockingQueue();

  /**
   * returns all requests captured so far. Doesn't clear the captured request list. See {@link
   * #clear()}
   */
  public CapturedRequest[] capturedRequests() {
    return capturedRequests.toArray(new CapturedRequest[0]);
  }

  /**
   * Returns all requests captured so far. This method does clear the captured requests list. If you
   * do not want the captured requests list cleared, use {@link #capturedRequests()}.
   *
   * @return the captured requests
   */
  public CapturedRequest[] getCapturedRequestsAndClear() {
    CapturedRequest[] capturedRequests = capturedRequests();
    clear();
    return capturedRequests;
  }

  /**
   * returns all requests captured so far, grouped by target node. Doesn't clear the captured
   * request list. See {@link #clear()}
   */
  public Map<String, List<CapturedRequest>> capturedRequestsByTargetNode() {
    Map<String, List<CapturedRequest>> map = new HashMap<>();
    for (CapturedRequest request : capturedRequests) {
      List<CapturedRequest> nodeList = map.get(request.node.getId());
      if (nodeList == null) {
        nodeList = new ArrayList<>();
        map.put(request.node.getId(), nodeList);
      }
      nodeList.add(request);
    }
    return map;
  }

  /**
   * Returns all requests captured so far, grouped by target node. This method does clear the
   * captured request list. If you do not want the captured requests list cleared, use {@link
   * #capturedRequestsByTargetNode()}.
   *
   * @return the captured requests grouped by target node
   */
  public Map<String, List<CapturedRequest>> getCapturedRequestsByTargetNodeAndClear() {
    Map<String, List<CapturedRequest>> map = capturedRequestsByTargetNode();
    clear();
    return map;
  }

  /** clears captured requests */
  public void clear() {
    capturedRequests.clear();
  }

  /** simulate a response for the given requestId */
  public void handleResponse(final long requestId, final TransportResponse response) {
    adapter.onResponseReceived(requestId).handleResponse(response);
  }

  /**
   * simulate a local error for the given requestId, will be wrapped by a {@link
   * SendRequestTransportException}
   *
   * @param requestId the id corresponding to the captured send request
   * @param t the failure to wrap
   */
  public void handleLocalError(final long requestId, final Throwable t) {
    Tuple<DiscoveryNode, String> request = requests.get(requestId);
    assert request != null;
    this.handleError(requestId, new SendRequestTransportException(request.v1(), request.v2(), t));
  }

  /**
   * simulate a remote error for the given requestId, will be wrapped by a {@link
   * RemoteTransportException}
   *
   * @param requestId the id corresponding to the captured send request
   * @param t the failure to wrap
   */
  public void handleRemoteError(final long requestId, final Throwable t) {
    final RemoteTransportException remoteException;
    if (rarely(Randomness.get())) {
      remoteException = new RemoteTransportException("remote failure, coming from local node", t);
    } else {
      try (BytesStreamOutput output = new BytesStreamOutput()) {
        output.writeException(t);
        remoteException =
            new RemoteTransportException(
                "remote failure", output.bytes().streamInput().readException());
      } catch (IOException ioException) {
        throw new ElasticsearchException(
            "failed to serialize/deserialize supplied exception " + t, ioException);
      }
    }
    this.handleError(requestId, remoteException);
  }

  /**
   * simulate an error for the given requestId, unlike {@link #handleLocalError(long, Throwable)}
   * and {@link #handleRemoteError(long, Throwable)}, the provided exception will not be wrapped but
   * will be delivered to the transport layer as is
   *
   * @param requestId the id corresponding to the captured send request
   * @param e the failure
   */
  public void handleError(final long requestId, final TransportException e) {
    adapter.onResponseReceived(requestId).handleException(e);
  }

  @Override
  public Connection openConnection(DiscoveryNode node, ConnectionProfile profile)
      throws IOException {
    return new Connection() {
      @Override
      public DiscoveryNode getNode() {
        return node;
      }

      @Override
      public void sendRequest(
          long requestId, String action, TransportRequest request, TransportRequestOptions options)
          throws IOException, TransportException {
        requests.put(requestId, Tuple.tuple(node, action));
        capturedRequests.add(new CapturedRequest(node, requestId, action, request));
      }

      @Override
      public void close() throws IOException {}
    };
  }

  @Override
  public void transportServiceAdapter(TransportServiceAdapter adapter) {
    this.adapter = adapter;
  }

  @Override
  public BoundTransportAddress boundAddress() {
    return null;
  }

  @Override
  public Map<String, BoundTransportAddress> profileBoundAddresses() {
    return null;
  }

  @Override
  public TransportAddress[] addressesFromString(String address, int perAddressLimit)
      throws UnknownHostException {
    return new TransportAddress[0];
  }

  @Override
  public boolean nodeConnected(DiscoveryNode node) {
    return true;
  }

  @Override
  public void connectToNode(DiscoveryNode node, ConnectionProfile connectionProfile)
      throws ConnectTransportException {}

  @Override
  public void disconnectFromNode(DiscoveryNode node) {}

  @Override
  public long serverOpen() {
    return 0;
  }

  @Override
  public Lifecycle.State lifecycleState() {
    return null;
  }

  @Override
  public void addLifecycleListener(LifecycleListener listener) {}

  @Override
  public void removeLifecycleListener(LifecycleListener listener) {}

  @Override
  public void start() {}

  @Override
  public void stop() {}

  @Override
  public void close() {}

  @Override
  public List<String> getLocalAddresses() {
    return Collections.emptyList();
  }

  @Override
  public Connection getConnection(DiscoveryNode node) {
    try {
      return openConnection(node, null);
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }
}
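A closing modernization note: the grouping loop in capturedRequestsByTargetNode() uses the pre-Java-8 get/null-check/put sequence; on Java 8+ the same grouping is one computeIfAbsent call per request. A self-contained sketch, with a simplified stand-in for CapturedRequest so it runs on its own:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Editorial sketch: grouping captured requests by node id with computeIfAbsent;
// SimpleRequest is a stand-in for CapturedRequest so the example is runnable.
public class GroupByNodeSketch {

  static final class SimpleRequest {
    final String nodeId;
    final String action;

    SimpleRequest(String nodeId, String action) {
      this.nodeId = nodeId;
      this.action = action;
    }
  }

  static Map<String, List<SimpleRequest>> byTargetNode(List<SimpleRequest> captured) {
    Map<String, List<SimpleRequest>> map = new HashMap<>();
    for (SimpleRequest request : captured) {
      // one call replaces the get / null-check / put sequence
      map.computeIfAbsent(request.nodeId, id -> new ArrayList<>()).add(request);
    }
    return map;
  }

  public static void main(String[] args) {
    List<SimpleRequest> captured =
        List.of(new SimpleRequest("node-1", "ping"), new SimpleRequest("node-1", "state"));
    System.out.println(byTargetNode(captured).get("node-1").size()); // 2
  }
}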