예제 #1
0
 /**
  * Creates a builder seeded with the contents of an existing {@link IndexMetaData}.
  *
  * <p>The index name is handed to the single-argument constructor; settings, state, version,
  * mappings, aliases and customs are then copied over from the given instance.
  *
  * @param indexMetaData the instance whose contents are copied into this builder
  */
 public Builder(IndexMetaData indexMetaData) {
   this(indexMetaData.index());
   settings(indexMetaData.settings());

   // Scalar fields.
   this.version = indexMetaData.version;
   this.state = indexMetaData.state;

   // Collection fields: entries are copied into this builder's own maps.
   this.customs.putAll(indexMetaData.customs);
   this.aliases.putAll(indexMetaData.aliases);
   this.mappings.putAll(indexMetaData.mappings);
 }
예제 #2
0
    /**
     * Writes the given index meta data to {@code builder} as an object named after the index.
     *
     * <p>The object contains, in order: a {@code version} field, a lower-cased {@code state}
     * field, a {@code settings} object, a {@code mappings} array, one object per custom entry
     * (named by the custom's key) and an {@code aliases} object.
     *
     * @param indexMetaData the meta data to serialize
     * @param builder the builder to write into
     * @param params serialization parameters; when the boolean parameter {@code binary} is true
     *     each mapping is written as its compressed source bytes, otherwise the uncompressed
     *     source is parsed and written as a map
     * @throws IOException if writing to the builder or parsing a mapping source fails
     */
    public static void toXContent(
        IndexMetaData indexMetaData, XContentBuilder builder, ToXContent.Params params)
        throws IOException {
      builder.startObject(indexMetaData.index(), XContentBuilder.FieldCaseConversion.NONE);

      builder.field("version", indexMetaData.version());
      builder.field("state", indexMetaData.state().toString().toLowerCase(Locale.ENGLISH));

      boolean binary = params.paramAsBoolean("binary", false);

      builder.startObject("settings");
      for (Map.Entry<String, String> entry : indexMetaData.settings().getAsMap().entrySet()) {
        builder.field(entry.getKey(), entry.getValue());
      }
      builder.endObject();

      builder.startArray("mappings");
      for (Map.Entry<String, MappingMetaData> entry : indexMetaData.mappings().entrySet()) {
        if (binary) {
          builder.value(entry.getValue().source().compressed());
        } else {
          byte[] data = entry.getValue().source().uncompressed();
          XContentParser parser = XContentFactory.xContent(data).createParser(data);
          Map<String, Object> mapping;
          try {
            mapping = parser.mapOrdered();
          } finally {
            // Always release the parser, even when the mapping source cannot be parsed.
            parser.close();
          }
          builder.map(mapping);
        }
      }
      builder.endArray();

      // Each custom is serialized by the factory registered for its key.
      for (Map.Entry<String, Custom> entry : indexMetaData.customs().entrySet()) {
        builder.startObject(entry.getKey(), XContentBuilder.FieldCaseConversion.NONE);
        lookupFactorySafe(entry.getKey()).toXContent(entry.getValue(), builder, params);
        builder.endObject();
      }

      builder.startObject("aliases");
      for (AliasMetaData alias : indexMetaData.aliases().values()) {
        AliasMetaData.Builder.toXContent(alias, builder, params);
      }
      builder.endObject();

      builder.endObject();
    }
예제 #3
0
 /**
  * Writes the given index meta data to a stream.
  *
  * <p>Layout, in order: index name, version, state id, settings, then three counted sections
  * (mappings, aliases, customs). Each custom is preceded by its key and written by the factory
  * registered for that key.
  *
  * @param indexMetaData the meta data to write
  * @param out the stream to write to
  * @throws IOException if writing to the stream fails
  */
 public static void writeTo(IndexMetaData indexMetaData, StreamOutput out) throws IOException {
   // Fixed header.
   out.writeUTF(indexMetaData.index());
   out.writeLong(indexMetaData.version());
   out.writeByte(indexMetaData.state().id());
   writeSettingsToStream(indexMetaData.settings(), out);

   // Mappings: count followed by each entry.
   out.writeVInt(indexMetaData.mappings().size());
   for (MappingMetaData mapping : indexMetaData.mappings().values()) {
     MappingMetaData.writeTo(mapping, out);
   }

   // Aliases: count followed by each entry.
   out.writeVInt(indexMetaData.aliases().size());
   for (AliasMetaData alias : indexMetaData.aliases().values()) {
     AliasMetaData.Builder.writeTo(alias, out);
   }

   // Customs: count followed by (key, payload) pairs.
   out.writeVInt(indexMetaData.customs().size());
   for (Map.Entry<String, Custom> custom : indexMetaData.customs().entrySet()) {
     final String type = custom.getKey();
     out.writeUTF(type);
     lookupFactorySafe(type).writeTo(custom.getValue(), out);
   }
 }
  /**
   * Decides whether the result of a shard-level search request is eligible for caching.
   *
   * @param request the shard search request
   * @param context the search context the request executes in
   * @return {@code true} only when none of the disqualifying conditions checked below applies
   */
  public boolean canCache(ShardSearchRequest request, SearchContext context) {
    // TODO: for now, template is not supported, though we could use the generated bytes as the key
    // Also, for now, caching is only enabled for requests with no hits.
    if (hasLength(request.templateSource()) || context.size() != 0) {
      return false;
    }

    // DFS cannot be cached: results depend not only on the content of the index but also on the
    // overridden statistics. Two queries on the same index with different stats (because another
    // shard was updated) would yield wrong results because of the scores (think about top_hits
    // aggs or scripts using the score).
    if (!CACHEABLE_SEARCH_TYPES.contains(context.searchType())) {
      return false;
    }

    final IndexMetaData indexMetaData =
        clusterService.state().getMetaData().index(request.index());
    if (indexMetaData == null) {
      // we didn't yet have the cluster state, or the index just got deleted
      return false;
    }

    // An explicit flag on the request wins; otherwise fall back to the index setting.
    final Boolean requested = request.requestCache();
    final boolean cachingWanted =
        requested == null ? isCacheEnabled(indexMetaData.settings(), Boolean.FALSE) : requested;
    if (!cachingWanted) {
      return false;
    }

    // Without a directory reader we can't get the version from the reader.
    if (!(context.searcher().getIndexReader() instanceof DirectoryReader)) {
      return false;
    }

    // If "now" in millis is used (or, in the future, a more generic "isDeterministic" flag) the
    // request is not deterministic and therefore cannot be cached.
    return !context.nowInMillisUsed();
  }
  /**
   * Removes local indices that the new cluster state no longer describes.
   *
   * <p>First, every local index whose UUID differs from the UUID in the cluster state is deleted.
   * Then, for every index reported as deleted by the event, the local copy (open or closed) is
   * removed and a node-index-deleted notification is sent; a failure to send it is only logged.
   *
   * @param event the cluster change being applied
   */
  private void applyDeletedIndices(final ClusterChangedEvent event) {
    final ClusterState previousState = event.previousState();
    final String localNodeId = event.state().nodes().localNodeId();
    assert localNodeId != null;

    // Pass 1: local indices whose UUID no longer matches the cluster state.
    for (IndexService localIndex : indicesService) {
      IndexMetaData current = event.state().metaData().index(localIndex.index().name());
      if (current == null || current.isSameUUID(localIndex.indexUUID())) {
        continue;
      }
      logger.debug(
          "[{}] mismatch on index UUIDs between cluster state and local state, cleaning the index so it will be recreated",
          current.index());
      deleteIndex(
          current.index(),
          "mismatch on index UUIDs between cluster state and local state, cleaning the index so it will be recreated");
    }

    // Pass 2: indices removed from the metadata.
    for (String deleted : event.indicesDeleted()) {
      if (logger.isDebugEnabled()) {
        logger.debug("[{}] cleaning index, no longer part of the metadata", deleted);
      }
      final IndexService existing = indicesService.indexService(deleted);
      final Settings indexSettings;
      if (existing == null) {
        // No live index service: take the settings from the previous cluster state instead.
        final IndexMetaData previous = previousState.metaData().index(deleted);
        assert previous != null;
        indexSettings = previous.settings();
        indicesService.deleteClosedIndex(
            "closed index no longer part of the metadata", previous, event.state());
      } else {
        indexSettings = existing.getIndexSettings();
        deleteIndex(deleted, "index no longer part of the metadata");
      }
      try {
        nodeIndexDeletedAction.nodeIndexDeleted(event.state(), deleted, indexSettings, localNodeId);
      } catch (Throwable e) {
        logger.debug("failed to send to master index {} deleted event", e, deleted);
      }
    }
  }
 /**
  * Pushes changed index settings from the new cluster state to the matching local indices.
  *
  * <p>Does nothing unless the event reports a metadata change. Indices that do not exist
  * locally, or whose meta data did not change, are skipped.
  *
  * @param event the cluster change being applied
  */
 private void applySettings(ClusterChangedEvent event) {
   if (!event.metaDataChanged()) {
     return;
   }
   for (IndexMetaData candidate : event.state().metaData()) {
     final String name = candidate.index();
     // we only create / update here, and only when the index meta data actually changed
     if (!indicesService.hasIndex(name) || !event.indexMetaDataChanged(candidate)) {
       continue;
     }
     final IndexService service = indicesService.indexService(name);
     if (service != null) { // null means it was already deleted on us, ignore it
       service
           .injector()
           .getInstance(IndexSettingsService.class)
           .refreshSettings(candidate.settings());
     }
   }
 }
 /**
  * Creates, on this node, every index that has a shard routed here but does not yet exist
  * locally.
  *
  * <p>If creating an index fails, the shard that triggered the creation is reported as failed.
  *
  * @param event the cluster change being applied
  */
 private void applyNewIndices(final ClusterChangedEvent event) {
   // we only create indices for shards that are allocated
   RoutingNodes.RoutingNodeIterator localShards =
       event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId());
   if (localShards == null) {
     return;
   }
   for (ShardRouting routed : localShards) {
     if (indicesService.hasIndex(routed.index())) {
       continue;
     }
     final IndexMetaData metaData = event.state().metaData().index(routed.index());
     if (logger.isDebugEnabled()) {
       logger.debug("[{}] creating index", metaData.index());
     }
     try {
       indicesService.createIndex(
           metaData.index(), metaData.settings(), event.state().nodes().localNode().id());
     } catch (Throwable e) {
       sendFailShard(routed, metaData.getIndexUUID(), "failed to create index", e);
     }
   }
 }
예제 #8
0
  /**
   * Tries to place unassigned shards on nodes that already hold a copy of their data.
   *
   * <p>Works in two passes over the unassigned list. Primaries come first: the per-node shard
   * state versions are fetched, the nodes with the highest version are collected, the number of
   * copies found is checked against the {@code INDEX_RECOVERY_INITIAL_SHARDS} requirement, and the
   * shard is initialized on the first candidate the deciders accept (or forced onto a candidate
   * that answered NO when none is throttled). Replicas come second: the node whose store shares a
   * sync id with the active primary's store, or otherwise has the most identical bytes, is chosen.
   * Shards that cannot be handled in this round are moved to the ignored-unassigned list.
   *
   * @param allocation the current allocation round; supplies nodes, routing nodes and deciders
   * @return {@code true} if a shard was initialized or a replica's allocation was delayed
   */
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    DiscoveryNodes nodes = allocation.nodes();
    RoutingNodes routingNodes = allocation.routingNodes();

    // First, handle primaries, they must find a place to be allocated on here
    final MetaData metaData = routingNodes.metaData();
    RoutingNodes.UnassignedShards unassigned = routingNodes.unassigned();
    unassigned.sort(
        new PriorityComparator() {

          @Override
          protected Settings getIndexSettings(String index) {
            IndexMetaData indexMetaData = metaData.index(index);
            return indexMetaData.getSettings();
          }
        }); // sort for priority ordering
    Iterator<ShardRouting> unassignedIterator = unassigned.iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();

      if (!shard.primary()) {
        continue;
      }

      // this is an API allocation, ignore since we know there is no data...
      if (!routingNodes
          .routingTable()
          .index(shard.index())
          .shard(shard.id())
          .primaryAllocatedPostApi()) {
        continue;
      }

      AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch =
          asyncFetchStarted.get(shard.shardId());
      if (fetch == null) {
        fetch = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction);
        asyncFetchStarted.put(shard.shardId(), fetch);
      }
      AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>
          shardState = fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
      if (shardState.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard started state", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue;
      }
      shardState.processAllocation(allocation);

      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      /**
       * Build a map of DiscoveryNodes to shard state number for the given shard. A state of -1
       * means the shard does not exist on the node, where any shard state >= 0 is the state version
       * of the shard on that node's disk.
       *
       * <p>A shard on shared storage will return at least shard state 0 for all nodes, indicating
       * that the shard can be allocated to any node.
       */
      ObjectLongHashMap<DiscoveryNode> nodesState = new ObjectLongHashMap<>();
      for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState :
          shardState.getData().values()) {
        long version = nodeShardState.version();
        // -1 version means it does not exists, which is what the API returns, and what we expect to
        logger.trace(
            "[{}] on node [{}] has version [{}] of shard",
            shard,
            nodeShardState.getNode(),
            version);
        nodesState.put(nodeShardState.getNode(), version);
      }

      int numberOfAllocationsFound = 0;
      long highestVersion = -1;
      final Map<DiscoveryNode, Long> nodesWithVersion = Maps.newHashMap();

      assert !nodesState.containsKey(null);
      // Walk the hash map's backing arrays directly; unused slots have a null key.
      final Object[] keys = nodesState.keys;
      final long[] values = nodesState.values;
      Settings idxSettings = indexMetaData.settings();
      for (int i = 0; i < keys.length; i++) {
        if (keys[i] == null) {
          continue;
        }

        DiscoveryNode node = (DiscoveryNode) keys[i];
        long version = values[i];
        // since we don't check in NO allocation, we need to double check here
        if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
          continue;
        }
        if (recoverOnAnyNode(idxSettings)) {
          numberOfAllocationsFound++;
          if (version > highestVersion) {
            highestVersion = version;
          }
          // We always put the node without clearing the map
          nodesWithVersion.put(node, version);
        } else if (version != -1) {
          numberOfAllocationsFound++;
          // If we've found a new "best" candidate, clear the
          // current candidates and add it
          if (version > highestVersion) {
            highestVersion = version;
            nodesWithVersion.clear();
            nodesWithVersion.put(node, version);
          } else if (version == highestVersion) {
            // If the candidate is the same, add it to the
            // list, but keep the current candidate
            nodesWithVersion.put(node, version);
          }
        }
      }
      // Now that we have a map of nodes to versions along with the
      // number of allocations found (and not ignored), we need to sort
      // it so the node with the highest version is at the beginning
      List<DiscoveryNode> nodesWithHighestVersion = Lists.newArrayList();
      nodesWithHighestVersion.addAll(nodesWithVersion.keySet());
      CollectionUtil.timSort(
          nodesWithHighestVersion,
          new Comparator<DiscoveryNode>() {
            @Override
            public int compare(DiscoveryNode o1, DiscoveryNode o2) {
              return Long.compare(nodesWithVersion.get(o2), nodesWithVersion.get(o1));
            }
          });

      if (logger.isDebugEnabled()) {
        logger.debug(
            "[{}][{}] found {} allocations of {}, highest version: [{}]",
            shard.index(),
            shard.id(),
            numberOfAllocationsFound,
            shard,
            highestVersion);
      }
      if (logger.isTraceEnabled()) {
        StringBuilder sb = new StringBuilder("[");
        for (DiscoveryNode n : nodesWithHighestVersion) {
          sb.append("[");
          sb.append(n.getName());
          sb.append("]");
          sb.append(" -> ");
          sb.append(nodesWithVersion.get(n));
          sb.append(", ");
        }
        sb.append("]");
        logger.trace("{} candidates for allocation: {}", shard, sb.toString());
      }

      // check if the counts meets the minimum set
      int requiredAllocation = 1;
      // if we restore from a repository one copy is more then enough
      if (shard.restoreSource() == null) {
        // Declared outside the try block so the catch clause can report the value that was
        // actually resolved for this index. A name declared inside the try is out of scope in the
        // catch, where a bare `initialShards` would refer to the instance field instead.
        String initialShardsSetting = null;
        try {
          initialShardsSetting =
              indexMetaData
                  .settings()
                  .get(
                      INDEX_RECOVERY_INITIAL_SHARDS,
                      settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
          if ("quorum".equals(initialShardsSetting)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
            }
          } else if ("quorum-1".equals(initialShardsSetting)
              || "half".equals(initialShardsSetting)) {
            if (indexMetaData.numberOfReplicas() > 2) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
            }
          } else if ("one".equals(initialShardsSetting)) {
            requiredAllocation = 1;
          } else if ("full".equals(initialShardsSetting) || "all".equals(initialShardsSetting)) {
            requiredAllocation = indexMetaData.numberOfReplicas() + 1;
          } else if ("full-1".equals(initialShardsSetting)
              || "all-1".equals(initialShardsSetting)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = indexMetaData.numberOfReplicas();
            }
          } else {
            requiredAllocation = Integer.parseInt(initialShardsSetting);
          }
        } catch (Exception e) {
          // NOTE: requiredAllocation keeps the value it had before the failure, so processing of
          // this shard continues below with that value.
          logger.warn(
              "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}",
              shard.index(),
              shard.id(),
              initialShardsSetting,
              shard);
        }
      }

      // not enough found for this shard, continue...
      if (numberOfAllocationsFound < requiredAllocation) {
        // if we are restoring this shard we still can allocate
        if (shard.restoreSource() == null) {
          // we can't really allocate, so ignore it and continue
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
                shard.index(),
                shard.id(),
                numberOfAllocationsFound,
                requiredAllocation);
          }
        } else if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: missing local data, will restore from [{}]",
              shard.index(),
              shard.id(),
              shard.restoreSource());
        }
        continue;
      }

      Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
      Set<DiscoveryNode> noNodes = Sets.newHashSet();
      for (DiscoveryNode discoNode : nodesWithHighestVersion) {
        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          throttledNodes.add(discoNode);
        } else if (decision.type() == Decision.Type.NO) {
          noNodes.add(discoNode);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();

          // found a node, so no throttling, no "no", and break out of the loop
          throttledNodes.clear();
          noNodes.clear();
          break;
        }
      }
      if (throttledNodes.isEmpty()) {
        // if we have a node that we "can't" allocate to, force allocation, since this is our master
        // data!
        if (!noNodes.isEmpty()) {
          DiscoveryNode discoNode = noNodes.iterator().next();
          RoutingNode node = routingNodes.node(discoNode.id());
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();
        }
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              throttledNodes);
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      }
    }

    if (!routingNodes.hasUnassigned()) {
      return changed;
    }

    // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was
    // allocated on
    unassignedIterator = unassigned.iterator();
    while (unassignedIterator.hasNext()) {
      ShardRouting shard = unassignedIterator.next();
      if (shard.primary()) {
        continue;
      }

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue;
      }

      AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch =
          asyncFetchStore.get(shard.shardId());
      if (fetch == null) {
        fetch = new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction);
        asyncFetchStore.put(shard.shardId(), fetch);
      }
      AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          shardStores =
              fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
      if (shardStores.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
        continue; // still fetching
      }
      shardStores.processAllocation(allocation);

      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;
      boolean hasReplicaData = false;
      IndexMetaData indexMetaData = metaData.index(shard.getIndex());

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>
          nodeStoreEntry : shardStores.getData().entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue().storeFilesMetaData();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          hasReplicaData |= storeFilesMetaData.iterator().hasNext();
          ShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore =
                  shardStores.getData().get(primaryNode);
              if (primaryNodeFilesStore != null) {
                TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                    primaryNodeFilesStore.storeFilesMetaData();
                if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                  long sizeMatched = 0;

                  String primarySyncId = primaryNodeStore.syncId();
                  String replicaSyncId = storeFilesMetaData.syncId();
                  // see if we have a sync id we can make use of
                  if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
                    logger.trace(
                        "{}: node [{}] has same sync id {} as primary",
                        shard,
                        discoNode.name(),
                        replicaSyncId);
                    lastNodeMatched = node;
                    // A sync id match outranks any byte count found on other nodes.
                    lastSizeMatched = Long.MAX_VALUE;
                    lastDiscoNodeMatched = discoNode;
                  } else {
                    for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                      String metaDataFileName = storeFileMetaData.name();
                      if (primaryNodeStore.fileExists(metaDataFileName)
                          && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) {
                        sizeMatched += storeFileMetaData.length();
                      }
                    }
                    logger.trace(
                        "{}: node [{}] has [{}/{}] bytes of re-usable data",
                        shard,
                        discoNode.name(),
                        new ByteSizeValue(sizeMatched),
                        sizeMatched);
                    if (sizeMatched > lastSizeMatched) {
                      lastSizeMatched = sizeMatched;
                      lastDiscoNodeMatched = discoNode;
                      lastNodeMatched = node;
                    }
                  }
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we checked before before on NO
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          routingNodes.initialize(shard, lastNodeMatched.nodeId());
          unassignedIterator.remove();
        }
      } else if (hasReplicaData == false) {
        // if we didn't manage to find *any* data (regardless of matching sizes), check if the
        // allocation of the replica shard needs to be delayed, and if so, add it to the ignore
        // unassigned list
        // note: we only care about replica in delayed allocation, since if we have an unassigned
        //       primary it will anyhow wait to find an existing copy of the shard to be allocated
        // note: the other side of the equation is scheduling a reroute in a timely manner, which
        //       happens in the RoutingService
        long delay =
            shard
                .unassignedInfo()
                .getDelayAllocationExpirationIn(settings, indexMetaData.getSettings());
        if (delay > 0) {
          logger.debug(
              "[{}][{}]: delaying allocation of [{}] for [{}]",
              shard.index(),
              shard.id(),
              shard,
              TimeValue.timeValueMillis(delay));
          /**
           * mark it as changed, since we want to kick a publishing to schedule future allocation,
           * see {@link
           * org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}).
           */
          changed = true;
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        }
      }
    }
    return changed;
  }
예제 #9
0
    /**
     * Warms the norms of every field configured for eager norms loading.
     *
     * <p>The set of field names is collected synchronously from the shard's document mappers; a
     * field qualifies when it is indexed, does not omit norms, and its norms loading (or the index
     * default read from {@code NORMS_LOADING_KEY}, falling back to lazy) is eager. The actual
     * loading is submitted as a single task to the warmer's executor.
     *
     * @return a handle whose {@code awaitTermination()} blocks until the submitted task finished
     */
    @Override
    public TerminationHandle warmNewReaders(
        final IndexShard indexShard,
        IndexMetaData indexMetaData,
        final WarmerContext context,
        ThreadPool threadPool) {
      final Loading defaultLoading =
          Loading.parse(indexMetaData.settings().get(NORMS_LOADING_KEY), Loading.LAZY);
      final MapperService mapperService = indexShard.mapperService();

      // Collect the index names of all fields whose norms must be loaded eagerly.
      final ObjectSet<String> eagerFields = new ObjectHashSet<>();
      for (DocumentMapper docMapper : mapperService.docMappers(false)) {
        for (FieldMapper fieldMapper : docMapper.mappers()) {
          final String indexName = fieldMapper.fieldType().names().indexName();
          Loading loading = fieldMapper.fieldType().normsLoading();
          if (loading == null) {
            loading = defaultLoading;
          }
          if (fieldMapper.fieldType().indexOptions() == IndexOptions.NONE
              || fieldMapper.fieldType().omitNorms()
              || loading != Loading.EAGER) {
            continue;
          }
          eagerFields.add(indexName);
        }
      }

      final CountDownLatch done = new CountDownLatch(1);
      // Norms loading may be I/O intensive but is not CPU intensive, so we execute it in a single
      // task
      final Runnable warmTask =
          new Runnable() {
            @Override
            public void run() {
              try {
                for (ObjectCursor<String> cursor : eagerFields) {
                  final String field = cursor.value;
                  final long startNanos = System.nanoTime();
                  for (final LeafReaderContext leaf : context.searcher().reader().leaves()) {
                    final NumericDocValues norms = leaf.reader().getNormValues(field);
                    if (norms != null) {
                      norms.get(0);
                    }
                  }
                  if (indexShard.warmerService().logger().isTraceEnabled()) {
                    indexShard
                        .warmerService()
                        .logger()
                        .trace(
                            "warmed norms for [{}], took [{}]",
                            field,
                            TimeValue.timeValueNanos(System.nanoTime() - startNanos));
                  }
                }
              } catch (Throwable t) {
                indexShard.warmerService().logger().warn("failed to warm-up norms", t);
              } finally {
                // Release waiters whether warming succeeded or failed.
                done.countDown();
              }
            }
          };
      threadPool.executor(executor()).execute(warmTask);

      return new TerminationHandle() {
        @Override
        public void awaitTermination() throws InterruptedException {
          done.await();
        }
      };
    }
  /**
   * Round-trips a {@link MetaData} instance through its JSON form and verifies that the parsed
   * copy carries the same per-index shard counts, settings and mappings.
   *
   * <p>Four indices are built: one bare, one with custom settings, one with a single mapping and
   * one with both custom settings and two mappings. All per-index assertions run against the
   * <em>parsed</em> metadata, so a field lost during serialization or parsing fails the test.
   *
   * @throws IOException if serializing or parsing the JSON fails
   */
  @Test
  public void testSimpleJsonFromAndTo() throws IOException {
    MetaData metaData =
        newMetaDataBuilder()
            .maxNumberOfShardsPerNode(2)
            .put(newIndexMetaDataBuilder("test1").numberOfShards(1).numberOfReplicas(2))
            .put(
                newIndexMetaDataBuilder("test2")
                    .settings(settingsBuilder().put("setting1", "value1").put("setting2", "value2"))
                    .numberOfShards(2)
                    .numberOfReplicas(3))
            .put(
                newIndexMetaDataBuilder("test3")
                    .numberOfShards(1)
                    .numberOfReplicas(2)
                    .putMapping("mapping1", MAPPING_SOURCE1))
            .put(
                newIndexMetaDataBuilder("test4")
                    .settings(settingsBuilder().put("setting1", "value1").put("setting2", "value2"))
                    .numberOfShards(1)
                    .numberOfReplicas(2)
                    .putMapping("mapping1", MAPPING_SOURCE1)
                    .putMapping("mapping2", MAPPING_SOURCE2))
            .build();

    String metaDataSource = MetaData.Builder.toJson(metaData);
    System.out.println("ToJson: " + metaDataSource);

    MetaData parsedMetaData =
        MetaData.Builder.fromJson(
            Jackson.defaultJsonFactory().createJsonParser(metaDataSource), null);
    assertThat(parsedMetaData.maxNumberOfShardsPerNode(), equalTo(2));

    // From here on, inspect the parsed copy rather than the instance we serialized; asserting on
    // the original would pass even if the JSON round trip dropped data.

    // test1: no explicit settings, no mappings
    // (presumably the two settings counted here are the shard and replica counts -- confirm)
    IndexMetaData indexMetaData = parsedMetaData.index("test1");
    assertThat(indexMetaData.numberOfShards(), equalTo(1));
    assertThat(indexMetaData.numberOfReplicas(), equalTo(2));
    assertThat(indexMetaData.settings().getAsMap().size(), equalTo(2));
    assertThat(indexMetaData.mappings().size(), equalTo(0));

    // test2: two custom settings, no mappings
    indexMetaData = parsedMetaData.index("test2");
    assertThat(indexMetaData.numberOfShards(), equalTo(2));
    assertThat(indexMetaData.numberOfReplicas(), equalTo(3));
    assertThat(indexMetaData.settings().getAsMap().size(), equalTo(4));
    assertThat(indexMetaData.settings().get("setting1"), equalTo("value1"));
    assertThat(indexMetaData.settings().get("setting2"), equalTo("value2"));
    assertThat(indexMetaData.mappings().size(), equalTo(0));

    // test3: no custom settings, one mapping
    indexMetaData = parsedMetaData.index("test3");
    assertThat(indexMetaData.numberOfShards(), equalTo(1));
    assertThat(indexMetaData.numberOfReplicas(), equalTo(2));
    assertThat(indexMetaData.settings().getAsMap().size(), equalTo(2));
    assertThat(indexMetaData.mappings().size(), equalTo(1));
    assertThat(indexMetaData.mappings().get("mapping1"), equalTo(MAPPING_SOURCE1));

    // test4: two custom settings and two mappings
    indexMetaData = parsedMetaData.index("test4");
    assertThat(indexMetaData.numberOfShards(), equalTo(1));
    assertThat(indexMetaData.numberOfReplicas(), equalTo(2));
    assertThat(indexMetaData.settings().getAsMap().size(), equalTo(4));
    assertThat(indexMetaData.settings().get("setting1"), equalTo("value1"));
    assertThat(indexMetaData.settings().get("setting2"), equalTo("value2"));
    assertThat(indexMetaData.mappings().size(), equalTo(2));
    assertThat(indexMetaData.mappings().get("mapping1"), equalTo(MAPPING_SOURCE1));
    assertThat(indexMetaData.mappings().get("mapping2"), equalTo(MAPPING_SOURCE2));
  }
  /**
   * Tries to assign currently unassigned shards to nodes based on the shard data those nodes
   * report.
   *
   * <p>The method works in two passes over {@code routingNodes.unassigned()}:
   *
   * <ol>
   *   <li><b>Primaries</b>: for each unassigned primary that was previously allocated, the shard
   *       state reported per node is collected and the nodes holding the highest version are
   *       chosen as candidates. If fewer copies are found than the {@code initial_shards} setting
   *       requires (and the shard is not being restored), the shard is moved to the ignored list.
   *       Otherwise the first candidate the deciders accept is used; if all candidates are
   *       rejected with NO and none is throttled, allocation is forced onto one of them; if any
   *       candidate is throttled the shard is moved to the ignored list for now.
   *   <li><b>Remaining shards</b>: for each shard that at least one data node can accept, the
   *       node store listings are compared with the active primary's store, and the shard is
   *       assigned to the node with the largest number of matching bytes (or ignored for now if
   *       that node is throttled).
   * </ol>
   *
   * @param allocation the current allocation round, providing nodes, routing nodes and deciders
   * @return {@code true} if at least one shard was assigned to a node, {@code false} otherwise
   */
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    DiscoveryNodes nodes = allocation.nodes();
    RoutingNodes routingNodes = allocation.routingNodes();

    // First, handle primaries, they must find a place to be allocated on here
    Iterator<MutableShardRouting> unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      if (!shard.primary()) {
        continue;
      }

      // this is an API allocation, ignore since we know there is no data...
      if (!routingNodes
          .routingTable()
          .index(shard.index())
          .shard(shard.id())
          .primaryAllocatedPostApi()) {
        continue;
      }

      ObjectLongOpenHashMap<DiscoveryNode> nodesState = buildShardStates(nodes, shard);

      int numberOfAllocationsFound = 0;
      long highestVersion = -1;
      Set<DiscoveryNode> nodesWithHighestVersion = Sets.newHashSet();
      // iterate the open-addressing map through its backing arrays; only slots flagged as
      // allocated hold a real entry
      final boolean[] states = nodesState.allocated;
      final Object[] keys = nodesState.keys;
      final long[] values = nodesState.values;
      for (int i = 0; i < states.length; i++) {
        if (!states[i]) {
          continue;
        }

        DiscoveryNode node = (DiscoveryNode) keys[i];
        long version = values[i];
        // since we don't check in NO allocation, we need to double check here
        if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
          continue;
        }
        // a version of -1 means the node reported no copy of this shard
        if (version != -1) {
          numberOfAllocationsFound++;
          if (highestVersion == -1) {
            nodesWithHighestVersion.add(node);
            highestVersion = version;
          } else {
            if (version > highestVersion) {
              nodesWithHighestVersion.clear();
              nodesWithHighestVersion.add(node);
              highestVersion = version;
            } else if (version == highestVersion) {
              nodesWithHighestVersion.add(node);
            }
          }
        }
      }

      // check if the counts meets the minimum set
      int requiredAllocation = 1;
      // if we restore from a repository one copy is more then enough
      if (shard.restoreSource() == null) {
        // Declared outside the try block so the catch clause can report the value that actually
        // failed to parse. A local declared inside the try is out of scope in the catch, where
        // the bare name would silently resolve to the this.initialShards field instead.
        String resolvedInitialShards = null;
        try {
          IndexMetaData indexMetaData = routingNodes.metaData().index(shard.index());
          // index-level setting wins, then the node-level setting, then the configured default
          resolvedInitialShards =
              indexMetaData
                  .settings()
                  .get(
                      INDEX_RECOVERY_INITIAL_SHARDS,
                      settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
          if ("quorum".equals(resolvedInitialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
            }
          } else if ("quorum-1".equals(resolvedInitialShards)
              || "half".equals(resolvedInitialShards)) {
            if (indexMetaData.numberOfReplicas() > 2) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
            }
          } else if ("one".equals(resolvedInitialShards)) {
            requiredAllocation = 1;
          } else if ("full".equals(resolvedInitialShards) || "all".equals(resolvedInitialShards)) {
            requiredAllocation = indexMetaData.numberOfReplicas() + 1;
          } else if ("full-1".equals(resolvedInitialShards)
              || "all-1".equals(resolvedInitialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = indexMetaData.numberOfReplicas();
            }
          } else {
            requiredAllocation = Integer.parseInt(resolvedInitialShards);
          }
        } catch (Exception e) {
          // requiredAllocation keeps its default of 1 here; processing of the shard continues
          logger.warn(
              "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}",
              shard.index(),
              shard.id(),
              resolvedInitialShards,
              shard);
        }
      }

      // not enough found for this shard, continue...
      if (numberOfAllocationsFound < requiredAllocation) {
        // if we are restoring this shard we still can allocate
        if (shard.restoreSource() == null) {
          // we can't really allocate, so ignore it and continue
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
                shard.index(),
                shard.id(),
                numberOfAllocationsFound,
                requiredAllocation);
          }
        } else if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: missing local data, will restore from [{}]",
              shard.index(),
              shard.id(),
              shard.restoreSource());
        }
        continue;
      }

      Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
      Set<DiscoveryNode> noNodes = Sets.newHashSet();
      for (DiscoveryNode discoNode : nodesWithHighestVersion) {
        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          throttledNodes.add(discoNode);
        } else if (decision.type() == Decision.Type.NO) {
          noNodes.add(discoNode);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();

          // found a node, so no throttling, no "no", and break out of the loop
          throttledNodes.clear();
          noNodes.clear();
          break;
        }
      }
      if (throttledNodes.isEmpty()) {
        // if we have a node that we "can't" allocate to, force allocation, since this is our master
        // data!
        if (!noNodes.isEmpty()) {
          DiscoveryNode discoNode = noNodes.iterator().next();
          // non-null: only nodes with a routing node were added to noNodes above
          RoutingNode node = routingNodes.node(discoNode.id());
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();
        }
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              throttledNodes);
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      }
    }

    if (!routingNodes.hasUnassigned()) {
      return changed;
    }

    // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was
    // allocated on
    unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        continue;
      }

      Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores =
          buildShardStores(nodes, shard);

      // best candidate so far: the node whose store shares the most bytes with the primary's
      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>
          nodeStoreEntry : shardStores.entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          MutableShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                  shardStores.get(primaryNode);
              if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                long sizeMatched = 0;

                // sum the length of every file on this node that the primary also has, unchanged
                for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                  if (primaryNodeStore.fileExists(storeFileMetaData.name())
                      && primaryNodeStore
                          .file(storeFileMetaData.name())
                          .isSame(storeFileMetaData)) {
                    sizeMatched += storeFileMetaData.length();
                  }
                }
                logger.trace(
                    "{}: node [{}] has [{}/{}] bytes of re-usable data",
                    shard,
                    discoNode.name(),
                    new ByteSizeValue(sizeMatched),
                    sizeMatched);
                if (sizeMatched > lastSizeMatched) {
                  lastSizeMatched = sizeMatched;
                  lastDiscoNodeMatched = discoNode;
                  lastNodeMatched = node;
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we checked before before on NO
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          allocation.routingNodes().assign(shard, lastNodeMatched.nodeId());
          unassignedIterator.remove();
        }
      }
    }
    return changed;
  }
  /**
   * Converts metadata state written in the pre-0.19 layout into the current layout.
   *
   * <p>Scans the {@code _state} directory of every node data location for files named {@code
   * metadata-<index>}, and keeps the metadata and version read from the file with the highest
   * index that could be read and actually contains a {@code meta-data} object. If none is found
   * the method returns without doing anything. Otherwise it writes the global state and one state
   * per index (stamped with created-version 0.18), renames the chosen file to {@code
   * backup-<name>} and deletes every remaining {@code metadata-*} file.
   *
   * @throws Exception if writing the new state fails or the old state file cannot be renamed to
   *     its backup name
   */
  private void pre019Upgrade() throws Exception {
    long index = -1;
    File metaDataFile = null;
    MetaData metaData = null;
    long version = -1;
    for (File dataLocation : nodeEnv.nodeDataLocations()) {
      File stateLocation = new File(dataLocation, "_state");
      if (!stateLocation.exists()) {
        continue;
      }
      File[] stateFiles = stateLocation.listFiles();
      if (stateFiles == null) {
        continue;
      }
      for (File stateFile : stateFiles) {
        if (logger.isTraceEnabled()) {
          logger.trace("[upgrade]: processing [" + stateFile.getName() + "]");
        }
        String name = stateFile.getName();
        if (!name.startsWith("metadata-")) {
          continue;
        }
        // the numeric suffix after the first '-' orders the state files; highest wins
        long fileIndex = Long.parseLong(name.substring(name.indexOf('-') + 1));
        if (fileIndex >= index) {
          // try and read the meta data
          try {
            byte[] data = Streams.copyToByteArray(new FileInputStream(stateFile));
            if (data.length == 0) {
              continue;
            }
            // Parse into temporaries first. Writing straight into the method-wide variables
            // would let a file that fails half-way leave its version paired with the metadata
            // of a previously read file.
            MetaData parsedMetaData = null;
            long parsedVersion = -1;
            XContentParser parser = XContentHelper.createParser(data, 0, data.length);
            try {
              String currentFieldName = null;
              XContentParser.Token token = parser.nextToken();
              if (token != null) {
                while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
                  if (token == XContentParser.Token.FIELD_NAME) {
                    currentFieldName = parser.currentName();
                  } else if (token == XContentParser.Token.START_OBJECT) {
                    if ("meta-data".equals(currentFieldName)) {
                      parsedMetaData = MetaData.Builder.fromXContent(parser);
                    }
                  } else if (token.isValue()) {
                    if ("version".equals(currentFieldName)) {
                      parsedVersion = parser.longValue();
                    }
                  }
                }
              }
            } finally {
              parser.close();
            }
            if (parsedMetaData == null) {
              // nothing usable in this file; keep whatever an earlier file provided so that
              // metaDataFile always names the file the metadata really came from
              continue;
            }
            // commit everything together so metadata, version and source file stay consistent
            metaData = parsedMetaData;
            version = parsedVersion;
            index = fileIndex;
            metaDataFile = stateFile;
          } catch (IOException e) {
            logger.warn("failed to read pre 0.19 state from [" + name + "], ignoring...", e);
          }
        }
      }
    }
    if (metaData == null) {
      return;
    }

    logger.info(
        "found old metadata state, loading metadata from [{}] and converting to new metadata location and strucutre...",
        metaDataFile.getAbsolutePath());

    writeGlobalState(
        "upgrade", MetaData.builder().metaData(metaData).version(version).build(), null);
    for (IndexMetaData indexMetaData : metaData) {
      IndexMetaData.Builder indexMetaDataBuilder =
          IndexMetaData.newIndexMetaDataBuilder(indexMetaData).version(version);
      // set the created version to 0.18
      indexMetaDataBuilder.settings(
          ImmutableSettings.settingsBuilder()
              .put(indexMetaData.settings())
              .put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_0_18_0));
      writeIndex("upgrade", indexMetaDataBuilder.build(), null);
    }

    // rename shards state to backup state
    File backupFile = new File(metaDataFile.getParentFile(), "backup-" + metaDataFile.getName());
    if (!metaDataFile.renameTo(backupFile)) {
      throw new IOException(
          "failed to rename old state to backup state [" + metaDataFile.getAbsolutePath() + "]");
    }

    // delete all other shards state files
    // (the backup survives this sweep because its name starts with "backup-", not "metadata-")
    for (File dataLocation : nodeEnv.nodeDataLocations()) {
      File stateLocation = new File(dataLocation, "_state");
      if (!stateLocation.exists()) {
        continue;
      }
      File[] stateFiles = stateLocation.listFiles();
      if (stateFiles == null) {
        continue;
      }
      for (File stateFile : stateFiles) {
        String name = stateFile.getName();
        if (!name.startsWith("metadata-")) {
          continue;
        }
        stateFile.delete();
      }
    }

    logger.info(
        "conversion to new metadata location and format done, backup create at [{}]",
        backupFile.getAbsolutePath());
  }