/**
 * Aggregates JVM information over all nodes: counts nodes per JVM version and sums thread
 * counts and heap usage, while tracking the longest uptime seen.
 *
 * @param nodeInfos per-node info, used for the JVM version histogram
 * @param nodeStatsList per-node stats; entries without JVM stats are ignored
 */
private JvmStats(List<NodeInfo> nodeInfos, List<NodeStats> nodeStatsList) {
  this.versions = new ObjectIntHashMap<>();
  for (NodeInfo info : nodeInfos) {
    versions.addTo(new JvmVersion(info.getJvm()), 1);
  }

  long threadTotal = 0;
  long longestUptime = 0;
  long heapUsedTotal = 0;
  long heapMaxTotal = 0;
  for (NodeStats perNode : nodeStatsList) {
    org.elasticsearch.monitor.jvm.JvmStats jvm = perNode.getJvm();
    if (jvm != null) {
      if (jvm.getThreads() != null) {
        threadTotal += jvm.getThreads().getCount();
      }
      long uptimeMillis = jvm.getUptime().millis();
      if (uptimeMillis > longestUptime) {
        longestUptime = uptimeMillis;
      }
      if (jvm.getMem() != null) {
        heapUsedTotal += jvm.getMem().getHeapUsed().getBytes();
        heapMaxTotal += jvm.getMem().getHeapMax().getBytes();
      }
    }
  }

  this.threads = threadTotal;
  this.maxUptime = longestUptime;
  this.heapUsed = heapUsedTotal;
  this.heapMax = heapMaxTotal;
}
/**
 * Aggregates operating-system information over all nodes: processor counts and an OS-name
 * histogram from the node infos, total and free memory from the node stats.
 *
 * @param nodeInfos per-node info supplying processor counts and OS names
 * @param nodeStatsList per-node stats; entries without OS stats are ignored
 */
private OsStats(List<NodeInfo> nodeInfos, List<NodeStats> nodeStatsList) {
  this.names = new ObjectIntHashMap<>();
  int availableTotal = 0;
  int allocatedTotal = 0;
  for (NodeInfo info : nodeInfos) {
    availableTotal += info.getOs().getAvailableProcessors();
    allocatedTotal += info.getOs().getAllocatedProcessors();
    // nodes that do not report an OS name are left out of the histogram
    if (info.getOs().getName() != null) {
      names.addTo(info.getOs().getName(), 1);
    }
  }
  this.availableProcessors = availableTotal;
  this.allocatedProcessors = allocatedTotal;

  long memoryTotal = 0;
  long memoryFree = 0;
  for (NodeStats perNode : nodeStatsList) {
    if (perNode.getOs() == null) {
      continue;
    }
    // only strictly positive readings contribute to the sums
    long reportedTotal = perNode.getOs().getMem().getTotal().getBytes();
    memoryTotal += reportedTotal > 0 ? reportedTotal : 0;
    long reportedFree = perNode.getOs().getMem().getFree().getBytes();
    memoryFree += reportedFree > 0 ? reportedFree : 0;
  }
  this.mem = new org.elasticsearch.monitor.os.OsStats.Mem(memoryTotal, memoryFree);
}
/**
 * Aggregates process statistics over all nodes that report them: node count, summed CPU
 * percentage, and total / minimum / maximum open file descriptors.
 *
 * @param nodeStatsList per-node stats; entries without process stats are ignored
 */
private ProcessStats(List<NodeStats> nodeStatsList) {
  int reportingNodes = 0;
  int cpuSum = 0;
  long fdSum = 0;
  long fdLowest = Long.MAX_VALUE;
  long fdHighest = Long.MIN_VALUE;

  for (NodeStats perNode : nodeStatsList) {
    if (perNode.getProcess() != null) {
      reportingNodes++;
      if (perNode.getProcess().getCpu() != null) {
        cpuSum += perNode.getProcess().getCpu().getPercent();
      }
      long openFds = perNode.getProcess().getOpenFileDescriptors();
      // a platform without descriptor support reports -1, which must not reduce the total
      fdSum += openFds > 0 ? openFds : 0;
      // min/max deliberately include -1 so an unsupported node remains visible in the result
      if (openFds < fdLowest) {
        fdLowest = openFds;
      }
      if (openFds > fdHighest) {
        fdHighest = openFds;
      }
    }
  }

  this.count = reportingNodes;
  this.cpuPercent = cpuSum;
  this.totalOpenFileDescriptors = fdSum;
  this.minOpenFileDescriptors = fdLowest;
  this.maxOpenFileDescriptors = fdHighest;
}
/** Asserts that every node in the cluster reports zero open search contexts. */
private void assertSearchContextsClosed() {
  for (NodeStats perNode :
      client().admin().cluster().prepareNodesStats().setIndices(true).get().getNodes()) {
    long openContexts = perNode.getIndices().getSearch().getOpenContexts();
    assertThat(openContexts, equalTo(0L));
  }
}
public void testCustomCircuitBreakerRegistration() throws Exception { Iterable<CircuitBreakerService> serviceIter = internalCluster().getInstances(CircuitBreakerService.class); final String breakerName = "customBreaker"; BreakerSettings breakerSettings = new BreakerSettings(breakerName, 8, 1.03); CircuitBreaker breaker = null; for (CircuitBreakerService s : serviceIter) { s.registerBreaker(breakerSettings); breaker = s.getBreaker(breakerSettings.getName()); } if (breaker != null) { try { breaker.addEstimateBytesAndMaybeBreak(16, "test"); } catch (CircuitBreakingException e) { // ignore, we forced a circuit break } } NodesStatsResponse stats = client().admin().cluster().prepareNodesStats().clear().setBreaker(true).get(); int breaks = 0; for (NodeStats stat : stats.getNodes()) { CircuitBreakerStats breakerStats = stat.getBreaker().getStats(breakerName); breaks += breakerStats.getTrippedCount(); } assertThat(breaks, greaterThanOrEqualTo(1)); }
public void testRamAccountingTermsEnum() throws Exception { if (noopBreakerUsed()) { logger.info("--> noop breakers used, skipping test"); return; } final Client client = client(); // Create an index where the mappings have a field data filter assertAcked( prepareCreate("ramtest") .setSource( "{\"mappings\": {\"type\": {\"properties\": {\"test\": " + "{\"type\": \"text\",\"fielddata\": true,\"fielddata_frequency_filter\": {\"max\": 10000}}}}}}")); ensureGreen("ramtest"); // index some different terms so we have some field data for loading int docCount = scaledRandomIntBetween(300, 1000); List<IndexRequestBuilder> reqs = new ArrayList<>(); for (long id = 0; id < docCount; id++) { reqs.add( client .prepareIndex("ramtest", "type", Long.toString(id)) .setSource("test", "value" + id)); } indexRandom(true, false, true, reqs); // execute a search that loads field data (sorting on the "test" field) client.prepareSearch("ramtest").setQuery(matchAllQuery()).addSort("test", SortOrder.DESC).get(); // clear field data cache (thus setting the loaded field data back to 0) clearFieldData(); // Update circuit breaker settings Settings settings = Settings.builder() .put( HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_LIMIT_SETTING.getKey(), "100b") .put( HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_OVERHEAD_SETTING.getKey(), 1.05) .build(); assertAcked(client.admin().cluster().prepareUpdateSettings().setTransientSettings(settings)); // execute a search that loads field data (sorting on the "test" field) // again, this time it should trip the breaker assertFailures( client.prepareSearch("ramtest").setQuery(matchAllQuery()).addSort("test", SortOrder.DESC), RestStatus.INTERNAL_SERVER_ERROR, containsString("Data too large, data for [test] would be larger than limit of [100/100b]")); NodesStatsResponse stats = client.admin().cluster().prepareNodesStats().setBreaker(true).get(); int breaks = 0; for (NodeStats stat : stats.getNodes()) { CircuitBreakerStats 
breakerStats = stat.getBreaker().getStats(CircuitBreaker.FIELDDATA); breaks += breakerStats.getTrippedCount(); } assertThat(breaks, greaterThanOrEqualTo(1)); }
/**
 * Sums the {@code getTotalOpen()} HTTP counter reported by every node in the cluster.
 *
 * @return the cluster-wide sum of the per-node total-opened HTTP connection counters
 */
private long getTotalHttpConnections() {
  NodesStatsResponse nodeStats =
      client().admin().cluster().prepareNodesStats().setHttp(true).get();
  // Accumulate in a long to match the return type: with an int accumulator, the compound
  // assignment would silently narrow each addition and could overflow or truncate the sum.
  long totalOpenConnections = 0;
  for (NodeStats stats : nodeStats.getNodes()) {
    totalOpenConnections += stats.getHttp().getTotalOpen();
  }
  return totalOpenConnections;
}
/**
 * Checks whether any node runs the request, in-flight-requests or field data breaker with the
 * noop breaker's limit.
 *
 * @return {@code true} as soon as one node reports {@code NoopCircuitBreaker.LIMIT} for one of
 *     those three breakers, {@code false} otherwise
 */
private boolean noopBreakerUsed() {
  NodesStatsResponse response =
      client().admin().cluster().prepareNodesStats().setBreaker(true).get();
  for (NodeStats perNode : response.getNodes()) {
    boolean anyNoop =
        perNode.getBreaker().getStats(CircuitBreaker.REQUEST).getLimit()
                == NoopCircuitBreaker.LIMIT
            || perNode.getBreaker().getStats(CircuitBreaker.IN_FLIGHT_REQUESTS).getLimit()
                == NoopCircuitBreaker.LIMIT
            || perNode.getBreaker().getStats(CircuitBreaker.FIELDDATA).getLimit()
                == NoopCircuitBreaker.LIMIT;
    if (anyNoop) {
      return true;
    }
  }
  return false;
}
public void addNodeStats(NodeStats nodeStats) { if (nodeStats.getProcess() == null) { return; } count++; if (nodeStats.getProcess().cpu() != null) { // with no sigar, this may not be available cpuPercent += nodeStats.getProcess().cpu().getPercent(); } long fd = nodeStats.getProcess().openFileDescriptors(); if (fd > 0) { // fd can be -1 if not supported on platform totalOpenFileDescriptors += fd; } // we still do min max calc on -1, so we'll have an indication of it not being supported on // one of the nodes. minOpenFileDescriptors = Math.min(minOpenFileDescriptors, fd); maxOpenFileDescriptors = Math.max(maxOpenFileDescriptors, fd); }
/**
 * Folds one node into the JVM aggregates: always counts its JVM version, and adds thread,
 * uptime and heap figures when the node reports JVM stats.
 *
 * @param nodeInfo the node info supplying the JVM version
 * @param nodeStats the node stats supplying thread, uptime and heap figures
 */
public void addNodeInfoStats(NodeInfo nodeInfo, NodeStats nodeStats) {
  versions.addTo(new JvmVersion(nodeInfo.getJvm()), 1);

  org.elasticsearch.monitor.jvm.JvmStats jvm = nodeStats.getJvm();
  if (jvm != null) {
    if (jvm.threads() != null) {
      threads += jvm.threads().count();
    }
    long uptimeMillis = jvm.uptime().millis();
    if (uptimeMillis > maxUptime) {
      maxUptime = uptimeMillis;
    }
    if (jvm.mem() != null) {
      heapUsed += jvm.mem().getHeapUsed().bytes();
      heapMax += jvm.mem().getHeapMax().bytes();
    }
  }
}
@Override public void execute() throws Exception { // If Elasticsearch is started then only start the monitoring if (!ElasticsearchProcessMonitor.isElasticsearchRunning()) { String exceptionMsg = "Elasticsearch is not yet started, check back again later"; logger.info(exceptionMsg); return; } ThreadPoolStatsBean tpStatsBean = new ThreadPoolStatsBean(); try { NodesStatsResponse ndsStatsResponse = ESTransportClient.getNodesStatsResponse(config); ThreadPoolStats tpstats = null; NodeStats ndStat = null; if (ndsStatsResponse.getNodes().length > 0) { ndStat = ndsStatsResponse.getAt(0); } if (ndStat == null) { logger.info("NodeStats is null,hence returning (No ThreadPoolStats)."); return; } tpstats = ndStat.getThreadPool(); if (tpstats == null) { logger.info("ThreadPoolStats is null,hence returning (No ThreadPoolStats)."); return; } Iterator<ThreadPoolStats.Stats> iter = tpstats.iterator(); while (iter.hasNext()) { ThreadPoolStats.Stats stat = iter.next(); if (stat.getName().equals("index")) { tpStatsBean.indexThreads = stat.getThreads(); tpStatsBean.indexQueue = stat.getQueue(); tpStatsBean.indexActive = stat.getActive(); tpStatsBean.indexRejected = stat.getRejected(); tpStatsBean.indexLargest = stat.getLargest(); tpStatsBean.indexCompleted = stat.getCompleted(); } else if (stat.getName().equals("get")) { tpStatsBean.getThreads = stat.getThreads(); tpStatsBean.getQueue = stat.getQueue(); tpStatsBean.getActive = stat.getActive(); tpStatsBean.getRejected = stat.getRejected(); tpStatsBean.getLargest = stat.getLargest(); tpStatsBean.getCompleted = stat.getCompleted(); } else if (stat.getName().equals("search")) { tpStatsBean.searchThreads = stat.getThreads(); tpStatsBean.searchQueue = stat.getQueue(); tpStatsBean.searchActive = stat.getActive(); tpStatsBean.searchRejected = stat.getRejected(); tpStatsBean.searchLargest = stat.getLargest(); tpStatsBean.searchCompleted = stat.getCompleted(); } else if (stat.getName().equals("bulk")) { tpStatsBean.bulkThreads = 
stat.getThreads(); tpStatsBean.bulkQueue = stat.getQueue(); tpStatsBean.bulkActive = stat.getActive(); tpStatsBean.bulkRejected = stat.getRejected(); tpStatsBean.bulkLargest = stat.getLargest(); tpStatsBean.bulkCompleted = stat.getCompleted(); } } } catch (Exception e) { logger.warn("failed to load Thread Pool stats data", e); } tpStatsReporter.threadPoolBean.set(tpStatsBean); }
public void testLimitsRequestSize() throws Exception { ByteSizeValue inFlightRequestsLimit = new ByteSizeValue(8, ByteSizeUnit.KB); if (noopBreakerUsed()) { logger.info("--> noop breakers used, skipping test"); return; } internalCluster().ensureAtLeastNumDataNodes(2); NodesStatsResponse nodeStats = client().admin().cluster().prepareNodesStats().get(); List<NodeStats> dataNodeStats = new ArrayList<>(); for (NodeStats stat : nodeStats.getNodes()) { if (stat.getNode().isDataNode()) { dataNodeStats.add(stat); } } assertThat(dataNodeStats.size(), greaterThanOrEqualTo(2)); Collections.shuffle(dataNodeStats, random()); // send bulk request from source node to target node later. The sole shard is bound to the // target node. NodeStats targetNode = dataNodeStats.get(0); NodeStats sourceNode = dataNodeStats.get(1); assertAcked( prepareCreate("index") .setSettings( Settings.builder() .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0) .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1) .put("index.routing.allocation.include._name", targetNode.getNode().getName()) .put( EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Rebalance.NONE))); Client client = client(sourceNode.getNode().getName()); // we use the limit size as a (very) rough indication on how many requests we should sent to hit // the limit int numRequests = inFlightRequestsLimit.bytesAsInt(); BulkRequest bulkRequest = new BulkRequest(); for (int i = 0; i < numRequests; i++) { IndexRequest indexRequest = new IndexRequest("index", "type", Integer.toString(i)); indexRequest.source("field", "value", "num", i); bulkRequest.add(indexRequest); } Settings limitSettings = Settings.builder() .put( HierarchyCircuitBreakerService.IN_FLIGHT_REQUESTS_CIRCUIT_BREAKER_LIMIT_SETTING .getKey(), inFlightRequestsLimit) .build(); assertAcked( client().admin().cluster().prepareUpdateSettings().setTransientSettings(limitSettings)); // can either fail directly with an exception or the response 
contains exceptions (depending on // client) try { BulkResponse response = client.bulk(bulkRequest).actionGet(); if (!response.hasFailures()) { fail("Should have thrown CircuitBreakingException"); } else { // each item must have failed with CircuitBreakingException for (BulkItemResponse bulkItemResponse : response) { Throwable cause = ExceptionsHelper.unwrapCause(bulkItemResponse.getFailure().getCause()); assertThat(cause, instanceOf(CircuitBreakingException.class)); assertEquals( ((CircuitBreakingException) cause).getByteLimit(), inFlightRequestsLimit.bytes()); } } } catch (CircuitBreakingException ex) { assertEquals(ex.getByteLimit(), inFlightRequestsLimit.bytes()); } }
/**
 * Builds the table with one row per node of the cluster state, filling each row from the
 * node's entry in the info and stats responses.
 *
 * <p>A cell is {@code null} whenever the info/stats object it is read from is missing for that
 * node. The cells are added positionally; presumably their order has to match the columns
 * declared by {@code getTableWithHeader(req)} - confirm before reordering anything here.
 *
 * @param req the REST request; its {@code full_id} parameter selects full or 4-character ids
 * @param state cluster state response supplying the nodes and the master node id
 * @param nodesInfo node info response, looked up by node id
 * @param nodesStats node stats response, looked up by node id
 * @return the populated table
 */
private Table buildTable(
    RestRequest req,
    ClusterStateResponse state,
    NodesInfoResponse nodesInfo,
    NodesStatsResponse nodesStats) {
  boolean fullId = req.paramAsBoolean("full_id", false);
  DiscoveryNodes nodes = state.getState().nodes();
  String masterId = nodes.masterNodeId();
  Table table = getTableWithHeader(req);
  for (DiscoveryNode node : nodes) {
    // Either lookup may come back null; every derived object below is null-guarded for that.
    NodeInfo info = nodesInfo.getNodesMap().get(node.id());
    NodeStats stats = nodesStats.getNodesMap().get(node.id());
    JvmInfo jvmInfo = info == null ? null : info.getJvm();
    JvmStats jvmStats = stats == null ? null : stats.getJvm();
    FsInfo fsInfo = stats == null ? null : stats.getFs();
    OsStats osStats = stats == null ? null : stats.getOs();
    ProcessStats processStats = stats == null ? null : stats.getProcess();
    NodeIndicesStats indicesStats = stats == null ? null : stats.getIndices();
    table.startRow();

    // --- identity: id, pid, host, address, port ---
    table.addCell(fullId ? node.id() : Strings.substring(node.getId(), 0, 4));
    table.addCell(info == null ? null : info.getProcess().getId());
    table.addCell(node.getHostName());
    table.addCell(node.getHostAddress());
    // a port is only available for socket-based transport addresses
    if (node.address() instanceof InetSocketTransportAddress) {
      table.addCell(((InetSocketTransportAddress) node.address()).address().getPort());
    } else {
      table.addCell("-");
    }

    // --- versions ---
    table.addCell(node.getVersion().number());
    table.addCell(info == null ? null : info.getBuild().shortHash());
    table.addCell(jvmInfo == null ? null : jvmInfo.version());

    // --- disk, heap and OS memory ---
    table.addCell(fsInfo == null ? null : fsInfo.getTotal().getAvailable());
    // NOTE(review): jvmStats.getMem() is dereferenced without a null check here, whereas
    // osStats.getMem() below is checked - confirm getMem() cannot be null for JVM stats.
    table.addCell(jvmStats == null ? null : jvmStats.getMem().getHeapUsed());
    table.addCell(jvmStats == null ? null : jvmStats.getMem().getHeapUsedPercent());
    table.addCell(jvmInfo == null ? null : jvmInfo.getMem().getHeapMax());
    table.addCell(
        osStats == null ? null : osStats.getMem() == null ? null : osStats.getMem().getUsed());
    table.addCell(
        osStats == null
            ? null
            : osStats.getMem() == null ? null : osStats.getMem().getUsedPercent());
    table.addCell(
        osStats == null ? null : osStats.getMem() == null ? null : osStats.getMem().getTotal());

    // --- file descriptors: open, open as percentage of max, max ---
    table.addCell(processStats == null ? null : processStats.getOpenFileDescriptors());
    table.addCell(
        processStats == null
            ? null
            : calculatePercentage(
                processStats.getOpenFileDescriptors(), processStats.getMaxFileDescriptors()));
    table.addCell(processStats == null ? null : processStats.getMaxFileDescriptors());

    // --- load, uptime, role flags, name ---
    table.addCell(
        osStats == null ? null : String.format(Locale.ROOT, "%.2f", osStats.getLoadAverage()));
    table.addCell(jvmStats == null ? null : jvmStats.getUptime());
    // "c" for a client node, otherwise "d" for a data node, otherwise "-"
    table.addCell(node.clientNode() ? "c" : node.dataNode() ? "d" : "-");
    // "x" when no master is known, "*" for the elected master, "m" for a master-eligible
    // node, "-" otherwise
    table.addCell(
        masterId == null
            ? "x"
            : masterId.equals(node.id()) ? "*" : node.masterNode() ? "m" : "-");
    table.addCell(node.name());

    // --- completion ---
    CompletionStats completionStats =
        indicesStats == null ? null : stats.getIndices().getCompletion();
    table.addCell(completionStats == null ? null : completionStats.getSize());

    // --- field data ---
    FieldDataStats fdStats = indicesStats == null ? null : stats.getIndices().getFieldData();
    table.addCell(fdStats == null ? null : fdStats.getMemorySize());
    table.addCell(fdStats == null ? null : fdStats.getEvictions());

    // --- query cache (the local is named fcStats but holds the query cache stats) ---
    QueryCacheStats fcStats = indicesStats == null ? null : indicesStats.getQueryCache();
    table.addCell(fcStats == null ? null : fcStats.getMemorySize());
    table.addCell(fcStats == null ? null : fcStats.getEvictions());

    // --- request cache (the local is named qcStats but holds the request cache stats) ---
    RequestCacheStats qcStats = indicesStats == null ? null : indicesStats.getRequestCache();
    table.addCell(qcStats == null ? null : qcStats.getMemorySize());
    table.addCell(qcStats == null ? null : qcStats.getEvictions());
    table.addCell(qcStats == null ? null : qcStats.getHitCount());
    table.addCell(qcStats == null ? null : qcStats.getMissCount());

    // --- flush ---
    FlushStats flushStats = indicesStats == null ? null : indicesStats.getFlush();
    table.addCell(flushStats == null ? null : flushStats.getTotal());
    table.addCell(flushStats == null ? null : flushStats.getTotalTime());

    // --- get ---
    GetStats getStats = indicesStats == null ? null : indicesStats.getGet();
    table.addCell(getStats == null ? null : getStats.current());
    table.addCell(getStats == null ? null : getStats.getTime());
    table.addCell(getStats == null ? null : getStats.getCount());
    table.addCell(getStats == null ? null : getStats.getExistsTime());
    table.addCell(getStats == null ? null : getStats.getExistsCount());
    table.addCell(getStats == null ? null : getStats.getMissingTime());
    table.addCell(getStats == null ? null : getStats.getMissingCount());

    // --- indexing: delete figures first, then index figures ---
    IndexingStats indexingStats = indicesStats == null ? null : indicesStats.getIndexing();
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getDeleteCurrent());
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getDeleteTime());
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getDeleteCount());
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexCurrent());
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexTime());
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexCount());
    table.addCell(indexingStats == null ? null : indexingStats.getTotal().getIndexFailedCount());

    // --- merge ---
    MergeStats mergeStats = indicesStats == null ? null : indicesStats.getMerge();
    table.addCell(mergeStats == null ? null : mergeStats.getCurrent());
    table.addCell(mergeStats == null ? null : mergeStats.getCurrentNumDocs());
    table.addCell(mergeStats == null ? null : mergeStats.getCurrentSize());
    table.addCell(mergeStats == null ? null : mergeStats.getTotal());
    table.addCell(mergeStats == null ? null : mergeStats.getTotalNumDocs());
    table.addCell(mergeStats == null ? null : mergeStats.getTotalSize());
    table.addCell(mergeStats == null ? null : mergeStats.getTotalTime());

    // --- percolate ---
    PercolateStats percolateStats = indicesStats == null ? null : indicesStats.getPercolate();
    table.addCell(percolateStats == null ? null : percolateStats.getCurrent());
    table.addCell(percolateStats == null ? null : percolateStats.getMemorySize());
    table.addCell(percolateStats == null ? null : percolateStats.getNumQueries());
    table.addCell(percolateStats == null ? null : percolateStats.getTime());
    table.addCell(percolateStats == null ? null : percolateStats.getCount());

    // --- refresh ---
    RefreshStats refreshStats = indicesStats == null ? null : indicesStats.getRefresh();
    table.addCell(refreshStats == null ? null : refreshStats.getTotal());
    table.addCell(refreshStats == null ? null : refreshStats.getTotalTime());

    // --- script (read from the node stats directly, not from the indices stats) ---
    ScriptStats scriptStats = stats == null ? null : stats.getScriptStats();
    table.addCell(scriptStats == null ? null : scriptStats.getCompilations());
    table.addCell(scriptStats == null ? null : scriptStats.getCacheEvictions());

    // --- search: fetch, open contexts, query, scroll ---
    SearchStats searchStats = indicesStats == null ? null : indicesStats.getSearch();
    table.addCell(searchStats == null ? null : searchStats.getTotal().getFetchCurrent());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getFetchTime());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getFetchCount());
    table.addCell(searchStats == null ? null : searchStats.getOpenContexts());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryCurrent());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryTime());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryCount());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollCurrent());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollTime());
    table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollCount());

    // --- segments ---
    SegmentsStats segmentsStats = indicesStats == null ? null : indicesStats.getSegments();
    table.addCell(segmentsStats == null ? null : segmentsStats.getCount());
    table.addCell(segmentsStats == null ? null : segmentsStats.getMemory());
    table.addCell(segmentsStats == null ? null : segmentsStats.getIndexWriterMemory());
    table.addCell(segmentsStats == null ? null : segmentsStats.getIndexWriterMaxMemory());
    table.addCell(segmentsStats == null ? null : segmentsStats.getVersionMapMemory());
    table.addCell(segmentsStats == null ? null : segmentsStats.getBitsetMemory());

    // --- suggest ---
    SuggestStats suggestStats = indicesStats == null ? null : indicesStats.getSuggest();
    table.addCell(suggestStats == null ? null : suggestStats.getCurrent());
    table.addCell(suggestStats == null ? null : suggestStats.getTime());
    table.addCell(suggestStats == null ? null : suggestStats.getCount());
    table.endRow();
  }
  return table;
}
private Table buildTable( RestRequest req, ClusterStateResponse state, NodesInfoResponse nodesInfo, NodesStatsResponse nodesStats) { final String[] threadPools = req.paramAsStringArray("thread_pool_patterns", new String[] {"*"}); final DiscoveryNodes nodes = state.getState().nodes(); final Table table = getTableWithHeader(req); // collect all thread pool names that we see across the nodes final Set<String> candidates = new HashSet<>(); for (final NodeStats nodeStats : nodesStats.getNodes()) { for (final ThreadPoolStats.Stats threadPoolStats : nodeStats.getThreadPool()) { candidates.add(threadPoolStats.getName()); } } // collect all thread pool names that match the specified thread pool patterns final Set<String> included = new HashSet<>(); for (final String candidate : candidates) { if (Regex.simpleMatch(threadPools, candidate)) { included.add(candidate); } } for (final DiscoveryNode node : nodes) { final NodeInfo info = nodesInfo.getNodesMap().get(node.getId()); final NodeStats stats = nodesStats.getNodesMap().get(node.getId()); final Map<String, ThreadPoolStats.Stats> poolThreadStats; final Map<String, ThreadPool.Info> poolThreadInfo; if (stats == null) { poolThreadStats = Collections.emptyMap(); poolThreadInfo = Collections.emptyMap(); } else { // we use a sorted map to ensure that thread pools are sorted by name poolThreadStats = new TreeMap<>(); poolThreadInfo = new HashMap<>(); ThreadPoolStats threadPoolStats = stats.getThreadPool(); for (ThreadPoolStats.Stats threadPoolStat : threadPoolStats) { poolThreadStats.put(threadPoolStat.getName(), threadPoolStat); } if (info != null) { for (ThreadPool.Info threadPoolInfo : info.getThreadPool()) { poolThreadInfo.put(threadPoolInfo.getName(), threadPoolInfo); } } } for (Map.Entry<String, ThreadPoolStats.Stats> entry : poolThreadStats.entrySet()) { if (!included.contains(entry.getKey())) continue; table.startRow(); table.addCell(node.getName()); table.addCell(node.getId()); table.addCell(node.getEphemeralId()); 
table.addCell(info == null ? null : info.getProcess().getId()); table.addCell(node.getHostName()); table.addCell(node.getHostAddress()); table.addCell(node.getAddress().address().getPort()); final ThreadPoolStats.Stats poolStats = entry.getValue(); final ThreadPool.Info poolInfo = poolThreadInfo.get(entry.getKey()); Long maxQueueSize = null; String keepAlive = null; Integer minThreads = null; Integer maxThreads = null; if (poolInfo != null) { if (poolInfo.getQueueSize() != null) { maxQueueSize = poolInfo.getQueueSize().singles(); } if (poolInfo.getKeepAlive() != null) { keepAlive = poolInfo.getKeepAlive().toString(); } if (poolInfo.getMin() >= 0) { minThreads = poolInfo.getMin(); } if (poolInfo.getMax() >= 0) { maxThreads = poolInfo.getMax(); } } table.addCell(entry.getKey()); table.addCell(poolInfo == null ? null : poolInfo.getThreadPoolType().getType()); table.addCell(poolStats == null ? null : poolStats.getActive()); table.addCell(poolStats == null ? null : poolStats.getThreads()); table.addCell(poolStats == null ? null : poolStats.getQueue()); table.addCell(maxQueueSize); table.addCell(poolStats == null ? null : poolStats.getRejected()); table.addCell(poolStats == null ? null : poolStats.getLargest()); table.addCell(poolStats == null ? null : poolStats.getCompleted()); table.addCell(minThreads); table.addCell(maxThreads); table.addCell(keepAlive); table.endRow(); } } return table; }
public void testRerouteRecovery() throws Exception { logger.info("--> start node A"); final String nodeA = internalCluster().startNode(); logger.info("--> create index on node: {}", nodeA); ByteSizeValue shardSize = createAndPopulateIndex(INDEX_NAME, 1, SHARD_COUNT, REPLICA_COUNT) .getShards()[0] .getStats() .getStore() .size(); logger.info("--> start node B"); final String nodeB = internalCluster().startNode(); ensureGreen(); logger.info("--> slowing down recoveries"); slowDownRecovery(shardSize); logger.info("--> move shard from: {} to: {}", nodeA, nodeB); client() .admin() .cluster() .prepareReroute() .add(new MoveAllocationCommand(INDEX_NAME, 0, nodeA, nodeB)) .execute() .actionGet() .getState(); logger.info("--> waiting for recovery to start both on source and target"); final Index index = resolveIndex(INDEX_NAME); assertBusy( new Runnable() { @Override public void run() { IndicesService indicesService = internalCluster().getInstance(IndicesService.class, nodeA); assertThat( indicesService .indexServiceSafe(index) .getShard(0) .recoveryStats() .currentAsSource(), equalTo(1)); indicesService = internalCluster().getInstance(IndicesService.class, nodeB); assertThat( indicesService .indexServiceSafe(index) .getShard(0) .recoveryStats() .currentAsTarget(), equalTo(1)); } }); logger.info("--> request recoveries"); RecoveryResponse response = client().admin().indices().prepareRecoveries(INDEX_NAME).execute().actionGet(); List<RecoveryState> recoveryStates = response.shardRecoveryStates().get(INDEX_NAME); List<RecoveryState> nodeARecoveryStates = findRecoveriesForTargetNode(nodeA, recoveryStates); assertThat(nodeARecoveryStates.size(), equalTo(1)); List<RecoveryState> nodeBRecoveryStates = findRecoveriesForTargetNode(nodeB, recoveryStates); assertThat(nodeBRecoveryStates.size(), equalTo(1)); assertRecoveryState( nodeARecoveryStates.get(0), 0, StoreRecoverySource.EMPTY_STORE_INSTANCE, true, Stage.DONE, null, nodeA); 
validateIndexRecoveryState(nodeARecoveryStates.get(0).getIndex()); assertOnGoingRecoveryState( nodeBRecoveryStates.get(0), 0, PeerRecoverySource.INSTANCE, true, nodeA, nodeB); validateIndexRecoveryState(nodeBRecoveryStates.get(0).getIndex()); logger.info("--> request node recovery stats"); NodesStatsResponse statsResponse = client() .admin() .cluster() .prepareNodesStats() .clear() .setIndices(new CommonStatsFlags(CommonStatsFlags.Flag.Recovery)) .get(); long nodeAThrottling = Long.MAX_VALUE; long nodeBThrottling = Long.MAX_VALUE; for (NodeStats nodeStats : statsResponse.getNodes()) { final RecoveryStats recoveryStats = nodeStats.getIndices().getRecoveryStats(); if (nodeStats.getNode().getName().equals(nodeA)) { assertThat( "node A should have ongoing recovery as source", recoveryStats.currentAsSource(), equalTo(1)); assertThat( "node A should not have ongoing recovery as target", recoveryStats.currentAsTarget(), equalTo(0)); nodeAThrottling = recoveryStats.throttleTime().millis(); } if (nodeStats.getNode().getName().equals(nodeB)) { assertThat( "node B should not have ongoing recovery as source", recoveryStats.currentAsSource(), equalTo(0)); assertThat( "node B should have ongoing recovery as target", recoveryStats.currentAsTarget(), equalTo(1)); nodeBThrottling = recoveryStats.throttleTime().millis(); } } logger.info("--> checking throttling increases"); final long finalNodeAThrottling = nodeAThrottling; final long finalNodeBThrottling = nodeBThrottling; assertBusy( new Runnable() { @Override public void run() { NodesStatsResponse statsResponse = client() .admin() .cluster() .prepareNodesStats() .clear() .setIndices(new CommonStatsFlags(CommonStatsFlags.Flag.Recovery)) .get(); assertThat(statsResponse.getNodes(), hasSize(2)); for (NodeStats nodeStats : statsResponse.getNodes()) { final RecoveryStats recoveryStats = nodeStats.getIndices().getRecoveryStats(); if (nodeStats.getNode().getName().equals(nodeA)) { assertThat( "node A throttling should increase", 
recoveryStats.throttleTime().millis(), greaterThan(finalNodeAThrottling)); } if (nodeStats.getNode().getName().equals(nodeB)) { assertThat( "node B throttling should increase", recoveryStats.throttleTime().millis(), greaterThan(finalNodeBThrottling)); } } } }); logger.info("--> speeding up recoveries"); restoreRecoverySpeed(); // wait for it to be finished ensureGreen(); response = client().admin().indices().prepareRecoveries(INDEX_NAME).execute().actionGet(); recoveryStates = response.shardRecoveryStates().get(INDEX_NAME); assertThat(recoveryStates.size(), equalTo(1)); assertRecoveryState( recoveryStates.get(0), 0, PeerRecoverySource.INSTANCE, true, Stage.DONE, nodeA, nodeB); validateIndexRecoveryState(recoveryStates.get(0).getIndex()); statsResponse = client() .admin() .cluster() .prepareNodesStats() .clear() .setIndices(new CommonStatsFlags(CommonStatsFlags.Flag.Recovery)) .get(); assertThat(statsResponse.getNodes(), hasSize(2)); for (NodeStats nodeStats : statsResponse.getNodes()) { final RecoveryStats recoveryStats = nodeStats.getIndices().getRecoveryStats(); assertThat(recoveryStats.currentAsSource(), equalTo(0)); assertThat(recoveryStats.currentAsTarget(), equalTo(0)); if (nodeStats.getNode().getName().equals(nodeA)) { assertThat( "node A throttling should be >0", recoveryStats.throttleTime().millis(), greaterThan(0L)); } if (nodeStats.getNode().getName().equals(nodeB)) { assertThat( "node B throttling should be >0 ", recoveryStats.throttleTime().millis(), greaterThan(0L)); } } logger.info("--> bump replica count"); client() .admin() .indices() .prepareUpdateSettings(INDEX_NAME) .setSettings(Settings.builder().put("number_of_replicas", 1)) .execute() .actionGet(); ensureGreen(); statsResponse = client() .admin() .cluster() .prepareNodesStats() .clear() .setIndices(new CommonStatsFlags(CommonStatsFlags.Flag.Recovery)) .get(); assertThat(statsResponse.getNodes(), hasSize(2)); for (NodeStats nodeStats : statsResponse.getNodes()) { final RecoveryStats 
recoveryStats = nodeStats.getIndices().getRecoveryStats(); assertThat(recoveryStats.currentAsSource(), equalTo(0)); assertThat(recoveryStats.currentAsTarget(), equalTo(0)); if (nodeStats.getNode().getName().equals(nodeA)) { assertThat( "node A throttling should be >0", recoveryStats.throttleTime().millis(), greaterThan(0L)); } if (nodeStats.getNode().getName().equals(nodeB)) { assertThat( "node B throttling should be >0 ", recoveryStats.throttleTime().millis(), greaterThan(0L)); } } logger.info("--> start node C"); String nodeC = internalCluster().startNode(); assertFalse(client().admin().cluster().prepareHealth().setWaitForNodes("3").get().isTimedOut()); logger.info("--> slowing down recoveries"); slowDownRecovery(shardSize); logger.info("--> move replica shard from: {} to: {}", nodeA, nodeC); client() .admin() .cluster() .prepareReroute() .add(new MoveAllocationCommand(INDEX_NAME, 0, nodeA, nodeC)) .execute() .actionGet() .getState(); response = client().admin().indices().prepareRecoveries(INDEX_NAME).execute().actionGet(); recoveryStates = response.shardRecoveryStates().get(INDEX_NAME); nodeARecoveryStates = findRecoveriesForTargetNode(nodeA, recoveryStates); assertThat(nodeARecoveryStates.size(), equalTo(1)); nodeBRecoveryStates = findRecoveriesForTargetNode(nodeB, recoveryStates); assertThat(nodeBRecoveryStates.size(), equalTo(1)); List<RecoveryState> nodeCRecoveryStates = findRecoveriesForTargetNode(nodeC, recoveryStates); assertThat(nodeCRecoveryStates.size(), equalTo(1)); assertRecoveryState( nodeARecoveryStates.get(0), 0, PeerRecoverySource.INSTANCE, false, Stage.DONE, nodeB, nodeA); validateIndexRecoveryState(nodeARecoveryStates.get(0).getIndex()); assertRecoveryState( nodeBRecoveryStates.get(0), 0, PeerRecoverySource.INSTANCE, true, Stage.DONE, nodeA, nodeB); validateIndexRecoveryState(nodeBRecoveryStates.get(0).getIndex()); // relocations of replicas are marked as REPLICA and the source node is the node holding the // primary (B) 
assertOnGoingRecoveryState( nodeCRecoveryStates.get(0), 0, PeerRecoverySource.INSTANCE, false, nodeB, nodeC); validateIndexRecoveryState(nodeCRecoveryStates.get(0).getIndex()); if (randomBoolean()) { // shutdown node with relocation source of replica shard and check if recovery continues internalCluster().stopRandomNode(InternalTestCluster.nameFilter(nodeA)); ensureStableCluster(2); response = client().admin().indices().prepareRecoveries(INDEX_NAME).execute().actionGet(); recoveryStates = response.shardRecoveryStates().get(INDEX_NAME); nodeARecoveryStates = findRecoveriesForTargetNode(nodeA, recoveryStates); assertThat(nodeARecoveryStates.size(), equalTo(0)); nodeBRecoveryStates = findRecoveriesForTargetNode(nodeB, recoveryStates); assertThat(nodeBRecoveryStates.size(), equalTo(1)); nodeCRecoveryStates = findRecoveriesForTargetNode(nodeC, recoveryStates); assertThat(nodeCRecoveryStates.size(), equalTo(1)); assertRecoveryState( nodeBRecoveryStates.get(0), 0, PeerRecoverySource.INSTANCE, true, Stage.DONE, nodeA, nodeB); validateIndexRecoveryState(nodeBRecoveryStates.get(0).getIndex()); assertOnGoingRecoveryState( nodeCRecoveryStates.get(0), 0, PeerRecoverySource.INSTANCE, false, nodeB, nodeC); validateIndexRecoveryState(nodeCRecoveryStates.get(0).getIndex()); } logger.info("--> speeding up recoveries"); restoreRecoverySpeed(); ensureGreen(); response = client().admin().indices().prepareRecoveries(INDEX_NAME).execute().actionGet(); recoveryStates = response.shardRecoveryStates().get(INDEX_NAME); nodeARecoveryStates = findRecoveriesForTargetNode(nodeA, recoveryStates); assertThat(nodeARecoveryStates.size(), equalTo(0)); nodeBRecoveryStates = findRecoveriesForTargetNode(nodeB, recoveryStates); assertThat(nodeBRecoveryStates.size(), equalTo(1)); nodeCRecoveryStates = findRecoveriesForTargetNode(nodeC, recoveryStates); assertThat(nodeCRecoveryStates.size(), equalTo(1)); assertRecoveryState( nodeBRecoveryStates.get(0), 0, PeerRecoverySource.INSTANCE, true, Stage.DONE, 
nodeA, nodeB); validateIndexRecoveryState(nodeBRecoveryStates.get(0).getIndex()); // relocations of replicas are marked as REPLICA and the source node is the node holding the // primary (B) assertRecoveryState( nodeCRecoveryStates.get(0), 0, PeerRecoverySource.INSTANCE, false, Stage.DONE, nodeB, nodeC); validateIndexRecoveryState(nodeCRecoveryStates.get(0).getIndex()); }
/** * Tests corruption that happens on the network layer and that the primary does not get affected * by corruption that happens on the way to the replica. The file on disk stays uncorrupted */ public void testCorruptionOnNetworkLayer() throws ExecutionException, InterruptedException { int numDocs = scaledRandomIntBetween(100, 1000); internalCluster().ensureAtLeastNumDataNodes(2); if (cluster().numDataNodes() < 3) { internalCluster() .startNode( Settings.builder() .put(Node.NODE_DATA_SETTING.getKey(), true) .put(Node.NODE_MASTER_SETTING.getKey(), false)); } NodesStatsResponse nodeStats = client().admin().cluster().prepareNodesStats().get(); List<NodeStats> dataNodeStats = new ArrayList<>(); for (NodeStats stat : nodeStats.getNodes()) { if (stat.getNode().isDataNode()) { dataNodeStats.add(stat); } } assertThat(dataNodeStats.size(), greaterThanOrEqualTo(2)); Collections.shuffle(dataNodeStats, random()); NodeStats primariesNode = dataNodeStats.get(0); NodeStats unluckyNode = dataNodeStats.get(1); assertAcked( prepareCreate("test") .setSettings( Settings.builder() .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0") .put( IndexMetaData.SETTING_NUMBER_OF_SHARDS, between(1, 4)) // don't go crazy here it must recovery fast // This does corrupt files on the replica, so we can't check: .put(MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false) .put( "index.routing.allocation.include._name", primariesNode.getNode().getName()) .put( EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Rebalance.NONE))); ensureGreen(); IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs]; for (int i = 0; i < builders.length; i++) { builders[i] = client().prepareIndex("test", "type").setSource("field", "value"); } indexRandom(true, builders); ensureGreen(); assertAllSuccessful( client() .admin() .indices() .prepareFlush() .setForce(true) .setWaitIfOngoing(true) .execute() .actionGet()); // we have to flush at least once here 
since we don't corrupt the translog SearchResponse countResponse = client().prepareSearch().setSize(0).get(); assertHitCount(countResponse, numDocs); final boolean truncate = randomBoolean(); for (NodeStats dataNode : dataNodeStats) { MockTransportService mockTransportService = ((MockTransportService) internalCluster().getInstance(TransportService.class, dataNode.getNode().getName())); mockTransportService.addDelegate( internalCluster().getInstance(TransportService.class, unluckyNode.getNode().getName()), new MockTransportService.DelegateTransport(mockTransportService.original()) { @Override public void sendRequest( DiscoveryNode node, long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException, TransportException { if (action.equals(RecoveryTargetService.Actions.FILE_CHUNK)) { RecoveryFileChunkRequest req = (RecoveryFileChunkRequest) request; if (truncate && req.length() > 1) { BytesRef bytesRef = req.content().toBytesRef(); BytesArray array = new BytesArray(bytesRef.bytes, bytesRef.offset, (int) req.length() - 1); request = new RecoveryFileChunkRequest( req.recoveryId(), req.shardId(), req.metadata(), req.position(), array, req.lastChunk(), req.totalTranslogOps(), req.sourceThrottleTimeInNanos()); } else { assert req.content().toBytesRef().bytes == req.content().toBytesRef().bytes : "no internal reference!!"; final byte[] array = req.content().toBytesRef().bytes; int i = randomIntBetween(0, req.content().length() - 1); array[i] = (byte) ~array[i]; // flip one byte in the content } } super.sendRequest(node, requestId, action, request, options); } }); } Settings build = Settings.builder() .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1") .put("index.routing.allocation.include._name", "*") .build(); client().admin().indices().prepareUpdateSettings("test").setSettings(build).get(); client().admin().cluster().prepareReroute().get(); ClusterHealthResponse actionGet = client() .admin() .cluster() 
.health(Requests.clusterHealthRequest("test").waitForGreenStatus()) .actionGet(); if (actionGet.isTimedOut()) { logger.info( "ensureGreen timed out, cluster state:\n{}\n{}", client().admin().cluster().prepareState().get().getState().prettyPrint(), client().admin().cluster().preparePendingClusterTasks().get().prettyPrint()); assertThat("timed out waiting for green state", actionGet.isTimedOut(), equalTo(false)); } // we are green so primaries got not corrupted. // ensure that no shard is actually allocated on the unlucky node ClusterStateResponse clusterStateResponse = client().admin().cluster().prepareState().get(); for (IndexShardRoutingTable table : clusterStateResponse.getState().getRoutingTable().index("test")) { for (ShardRouting routing : table) { if (unluckyNode.getNode().getId().equals(routing.currentNodeId())) { assertThat(routing.state(), not(equalTo(ShardRoutingState.STARTED))); assertThat(routing.state(), not(equalTo(ShardRoutingState.RELOCATING))); } } } final int numIterations = scaledRandomIntBetween(5, 20); for (int i = 0; i < numIterations; i++) { SearchResponse response = client().prepareSearch().setSize(numDocs).get(); assertHitCount(response, numDocs); } }
/** * This test triggers a corrupt index exception during finalization size if an empty commit point * is transferred during recovery we don't know the version of the segments_N file because it has * no segments we can take it from. This simulates recoveries from old indices or even without * checksums and makes sure if we fail during finalization we also check if the primary is ok. * Without the relevant checks this test fails with a RED cluster */ public void testCorruptionOnNetworkLayerFinalizingRecovery() throws ExecutionException, InterruptedException, IOException { internalCluster().ensureAtLeastNumDataNodes(2); NodesStatsResponse nodeStats = client().admin().cluster().prepareNodesStats().get(); List<NodeStats> dataNodeStats = new ArrayList<>(); for (NodeStats stat : nodeStats.getNodes()) { if (stat.getNode().isDataNode()) { dataNodeStats.add(stat); } } assertThat(dataNodeStats.size(), greaterThanOrEqualTo(2)); Collections.shuffle(dataNodeStats, random()); NodeStats primariesNode = dataNodeStats.get(0); NodeStats unluckyNode = dataNodeStats.get(1); assertAcked( prepareCreate("test") .setSettings( Settings.builder() .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0") .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1) .put( "index.routing.allocation.include._name", primariesNode.getNode().getName()) .put( EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Rebalance.NONE) .put("index.allocation.max_retries", Integer.MAX_VALUE) // keep on retrying )); ensureGreen(); // allocated with empty commit final AtomicBoolean corrupt = new AtomicBoolean(true); final CountDownLatch hasCorrupted = new CountDownLatch(1); for (NodeStats dataNode : dataNodeStats) { MockTransportService mockTransportService = ((MockTransportService) internalCluster().getInstance(TransportService.class, dataNode.getNode().getName())); mockTransportService.addDelegate( internalCluster().getInstance(TransportService.class, unluckyNode.getNode().getName()), 
new MockTransportService.DelegateTransport(mockTransportService.original()) { @Override public void sendRequest( DiscoveryNode node, long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException, TransportException { if (corrupt.get() && action.equals(RecoveryTargetService.Actions.FILE_CHUNK)) { RecoveryFileChunkRequest req = (RecoveryFileChunkRequest) request; byte[] array = BytesRef.deepCopyOf(req.content().toBytesRef()).bytes; int i = randomIntBetween(0, req.content().length() - 1); array[i] = (byte) ~array[i]; // flip one byte in the content hasCorrupted.countDown(); } super.sendRequest(node, requestId, action, request, options); } }); } Settings build = Settings.builder() .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1") .put( "index.routing.allocation.include._name", primariesNode.getNode().getName() + "," + unluckyNode.getNode().getName()) .build(); client().admin().indices().prepareUpdateSettings("test").setSettings(build).get(); client().admin().cluster().prepareReroute().get(); hasCorrupted.await(); corrupt.set(false); ensureGreen(); }