@Override
public void run() {
    long startTimeNanos = System.nanoTime();

    logger.trace("AggregationRunnable starting: hashCode={}", this.hashCode());

    PresenceService presenceService = getContext().getService("presence");
    CoordinationService coordinationService = getContext().getService("coord");
    ZkClient zkClient = getContext().getZkClient();
    PathScheme pathScheme = getContext().getPathScheme();

    // list all services in cluster
    List<String> clusterIds = presenceService.getClusters();
    for (String clusterId : clusterIds) {

        // only proceed if this node is a member of the cluster and it is not the framework cluster
        if (!presenceService.isMemberOf(clusterId)
                || clusterId.equals(getContext().getPathScheme().getFrameworkClusterId())) {
            continue;
        }

        List<String> allServiceIds = presenceService.getServices(clusterId);
        List<String> memberServiceIds = new ArrayList<String>(allServiceIds.size());
        for (String serviceId : allServiceIds) {
            // only aggregate if this node is a member of the service
            if (presenceService.isMemberOf(clusterId, serviceId)) {
                memberServiceIds.add(serviceId);
            }
        }

        // go through member service list in deterministic order so locks are acquired in the same order across nodes
        Collections.sort(memberServiceIds);
        for (int i = 0; i < memberServiceIds.size(); i++) {
            long currentTimestamp = System.currentTimeMillis();

            String serviceId = memberServiceIds.get(i);

            logger.trace("Finding data nodes: clusterId={}; serviceId={}", clusterId, serviceId);

            // get lock for a service
            DistributedLock lock = coordinationService.getLock("reign", "metrics-" + clusterId + "-" + serviceId);
            if (!lock.tryLock()) {
                continue;
            }
            try {
                // get all data nodes for a service
                String dataParentPath = pathScheme.getAbsolutePath(PathType.METRICS,
                        pathScheme.joinTokens(clusterId, serviceId));
                List<String> dataNodes = zkClient.getChildren(dataParentPath, false);

                // iterate through service data nodes and gather up data to aggregate
                Map<String, List<CounterData>> counterMap = new HashMap<String, List<CounterData>>(
                        dataNodes.size() + 1, 1.0f);
                Map<String, List<GaugeData>> gaugeMap = new HashMap<String, List<GaugeData>>(
                        dataNodes.size() + 1, 1.0f);
                Map<String, List<HistogramData>> histogramMap = new HashMap<String, List<HistogramData>>(
                        dataNodes.size() + 1, 1.0f);
                Map<String, List<MeterData>> meterMap = new HashMap<String, List<MeterData>>(
                        dataNodes.size() + 1, 1.0f);
                Map<String, List<TimerData>> timerMap = new HashMap<String, List<TimerData>>(
                        dataNodes.size() + 1, 1.0f);

                int dataNodeCount = 0;
                int dataNodeInWindowCount = 0;
                Integer intervalLength = null;
                TimeUnit intervalLengthUnit = null;
                for (String dataNode : dataNodes) {
                    dataNodeCount++;

                    logger.trace("Found data node: clusterId={}; serviceId={}; nodeId={}", clusterId, serviceId,
                            dataNode);

                    String dataPath = null;
                    MetricsData metricsData = null;

                    dataPath = pathScheme.getAbsolutePath(PathType.METRICS,
                            pathScheme.joinTokens(clusterId, serviceId, dataNode));
                    try {
                        metricsData = getMetricsFromDataNode(clusterId, serviceId, dataNode);
                        if (metricsData == null) {
                            continue;
                        }
                    } catch (Exception e) {
                        logger.warn("Error trying to aggregate data directory for service: clusterId=" + clusterId
                                + "; serviceId=" + serviceId + ": " + e, e);
                        continue;
                    }

                    // skip data node if not within interval
                    long millisToExpiry = millisToExpiry(metricsData, currentTimestamp);
                    if (millisToExpiry <= 0) {
                        continue;
                    }

                    intervalLength = metricsData.getIntervalLength();
                    intervalLengthUnit = metricsData.getIntervalLengthUnit();

                    // aggregate service stats for data nodes that are within the current rotation interval
                    logger.trace("Aggregating data node: path={}; millisToExpiry={}", dataPath, millisToExpiry);

                    // increment count of nodes within the current window
                    dataNodeInWindowCount++;

                    // counters
                    Map<String, CounterData> counters = metricsData.getCounters();
                    for (String key : counters.keySet()) {
                        CounterData counter = counters.get(key);
                        List<CounterData> counterList = counterMap.get(key);
                        if (counterList == null) {
                            counterList = new ArrayList<CounterData>(dataNodes.size());
                            counterMap.put(key, counterList);
                        }
                        counterList.add(counter);
                    }

                    // gauges
                    Map<String, GaugeData> gauges = metricsData.getGauges();
                    for (String key : gauges.keySet()) {
                        GaugeData gauge = gauges.get(key);
                        List<GaugeData> gaugeList = gaugeMap.get(key);
                        if (gaugeList == null) {
                            gaugeList = new ArrayList<GaugeData>(dataNodes.size());
                            gaugeMap.put(key, gaugeList);
                        }
                        gaugeList.add(gauge);
                    }

                    // histograms
                    Map<String, HistogramData> histograms = metricsData.getHistograms();
                    for (String key : histograms.keySet()) {
                        HistogramData histogram = histograms.get(key);
                        List<HistogramData> histogramList = histogramMap.get(key);
                        if (histogramList == null) {
                            histogramList = new ArrayList<HistogramData>(dataNodes.size());
                            histogramMap.put(key, histogramList);
                        }
                        histogramList.add(histogram);
                    }

                    // meters
                    Map<String, MeterData> meters = metricsData.getMeters();
                    for (String key : meters.keySet()) {
                        MeterData meter = meters.get(key);
                        List<MeterData> meterList = meterMap.get(key);
                        if (meterList == null) {
                            meterList = new ArrayList<MeterData>(dataNodes.size());
                            meterMap.put(key, meterList);
                        }
                        meterList.add(meter);
                    }

                    // timers
                    Map<String, TimerData> timers = metricsData.getTimers();
                    for (String key : timers.keySet()) {
                        TimerData timer = timers.get(key);
                        List<TimerData> timerList = timerMap.get(key);
                        if (timerList == null) {
                            timerList = new ArrayList<TimerData>(dataNodes.size());
                            timerMap.put(key, timerList);
                        }
                        timerList.add(timer);
                    }

                } // for dataNodes

                // aggregate data and write to ZK
                MetricsData serviceMetricsData = new MetricsData();

                // counters
                Map<String, CounterData> counters = new HashMap<String, CounterData>(counterMap.size() + 1, 1.0f);
                for (String key : counterMap.keySet()) {
                    List<CounterData> counterList = counterMap.get(key);
                    // if (counterList.size() != dataNodeCount) {
                    //     logger.warn("counterList size does not match nodeCount: counterList.size={}; nodeCount={}",
                    //             counterList.size(), dataNodeCount);
                    // }
                    CounterData counterData = CounterData.merge(counterList);
                    counters.put(key, counterData);
                }
                serviceMetricsData.setCounters(counters);

                // gauges
                Map<String, GaugeData> gauges = new HashMap<String, GaugeData>(gaugeMap.size() + 1, 1.0f);
                for (String key : gaugeMap.keySet()) {
                    List<GaugeData> gaugeList = gaugeMap.get(key);
                    // if (gaugeList.size() != dataNodeCount) {
                    //     logger.warn("gaugeList size does not match nodeCount: gaugeList.size={}; nodeCount={}",
                    //             gaugeList.size(), dataNodeCount);
                    // }
                    GaugeData gaugeData = GaugeData.merge(gaugeList);
                    gauges.put(key, gaugeData);
                }
                serviceMetricsData.setGauges(gauges);

                // histograms
                Map<String, HistogramData> histograms = new HashMap<String, HistogramData>(
                        histogramMap.size() + 1, 1.0f);
                for (String key : histogramMap.keySet()) {
                    List<HistogramData> histogramList = histogramMap.get(key);
                    // if (histogramList.size() != dataNodeCount) {
                    //     logger.warn("histogramList size does not match nodeCount: histogramList.size={}; nodeCount={}",
                    //             histogramList.size(), dataNodeCount);
                    // }
                    HistogramData histogramData = HistogramData.merge(histogramList);
                    histograms.put(key, histogramData);
                }
                serviceMetricsData.setHistograms(histograms);

                // meters
                Map<String, MeterData> meters = new HashMap<String, MeterData>(meterMap.size() + 1, 1.0f);
                for (String key : meterMap.keySet()) {
                    List<MeterData> meterList = meterMap.get(key);
                    // if (meterList.size() != dataNodeCount) {
                    //     logger.warn("meterList size does not match nodeCount: meterList.size={}; nodeCount={}",
                    //             meterList.size(), dataNodeCount);
                    // }
                    MeterData meterData = MeterData.merge(meterList);
                    meters.put(key, meterData);
                }
                serviceMetricsData.setMeters(meters);

                // timers
                Map<String, TimerData> timers = new HashMap<String, TimerData>(timerMap.size() + 1, 1.0f);
                for (String key : timerMap.keySet()) {
                    List<TimerData> timerList = timerMap.get(key);
                    // if (timerList.size() != dataNodeCount) {
                    //     logger.warn("timerList size does not match nodeCount: timerList.size={}; nodeCount={}",
                    //             timerList.size(), dataNodeCount);
                    // }
                    TimerData timerData = TimerData.merge(timerList);
                    timers.put(key, timerData);
                }
                serviceMetricsData.setTimers(timers);

                serviceMetricsData.setDataNodeCount(dataNodeCount);
                serviceMetricsData.setDataNodeInWindowCount(dataNodeInWindowCount);
                serviceMetricsData.setClusterId(clusterId);
                serviceMetricsData.setServiceId(serviceId);
                serviceMetricsData.setIntervalLength(intervalLength);
                serviceMetricsData.setIntervalLengthUnit(intervalLengthUnit);
                serviceMetricsData.setLastUpdatedTimestamp(System.currentTimeMillis());

                // store aggregated results in ZK at the service level
                String dataPath = pathScheme.getAbsolutePath(PathType.METRICS,
                        pathScheme.joinTokens(clusterId, serviceId));
                String serviceMetricsDataString = JacksonUtil.getObjectMapper()
                        .writeValueAsString(serviceMetricsData);
                zkClientUtil.updatePath(getContext().getZkClient(), getContext().getPathScheme(), dataPath,
                        serviceMetricsDataString.getBytes(UTF_8), getContext().getDefaultZkAclList(),
                        CreateMode.PERSISTENT, -1);

                // after the last service, sleep while holding the lock so that updates do not happen too
                // frequently as more nodes join the service
                if (i == memberServiceIds.size() - 1) {
                    try {
                        long elapsedMillis = (System.nanoTime() - startTimeNanos) / 1000000;
                        long sleepIntervalMillis = (updateIntervalMillis - elapsedMillis) / 2;
                        if (sleepIntervalMillis < 0) {
                            sleepIntervalMillis = updateIntervalMillis;
                        }
                        logger.debug(
                                "AggregationRunnable SLEEPING btw. services: sleepIntervalMillis={}; memberServiceIds.size={}",
                                sleepIntervalMillis, memberServiceIds.size());
                        Thread.sleep(sleepIntervalMillis);
                    } catch (InterruptedException e) {
                        logger.warn("Interrupted while sleeping at end of aggregation: " + e, e);
                    }
                }

            } catch (KeeperException e) {
                if (e.code() != KeeperException.Code.NONODE) {
                    logger.warn("Error trying to aggregate data directory for service: clusterId=" + clusterId
                            + "; serviceId=" + serviceId + ": " + e, e);
                }
            } catch (Exception e) {
                logger.warn("Error trying to aggregate data directory for service: clusterId=" + clusterId
                        + "; serviceId=" + serviceId + ": " + e, e);
            } finally {
                logger.trace("Releasing lock: metrics-{}-{}", clusterId, serviceId);
                lock.unlock();
                lock.destroy();
                logger.trace("Released and destroyed lock: metrics-{}-{}", clusterId, serviceId);
            } // try
        } // for service
    } // for cluster
} // run()
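
// Both runnables rely on millisToExpiry(metricsData, referenceTimestamp) to decide whether a data node still
// falls within its rotation interval. That helper is not shown in this section; the sketch below is a minimal,
// hypothetical implementation inferred from how it is called (interval start plus interval length, compared
// against the supplied reference timestamp) and is not necessarily the actual framework code.
long millisToExpiry(MetricsData metricsData, long referenceTimestamp) {
    Integer intervalLength = metricsData.getIntervalLength();
    TimeUnit intervalLengthUnit = metricsData.getIntervalLengthUnit();
    if (intervalLength == null || intervalLengthUnit == null) {
        // treat malformed data as already expired
        return -1;
    }
    // end of the rotation interval = interval start timestamp + interval length in milliseconds
    long intervalEndTimestamp = metricsData.getIntervalStartTimestamp()
            + intervalLengthUnit.toMillis(intervalLength);
    // positive while the data node is still inside its interval; <= 0 once it has expired
    return intervalEndTimestamp - referenceTimestamp;
}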
@Override public void run() { logger.trace("CleanerRunnable starting: hashCode={}", this.hashCode()); PresenceService presenceService = getContext().getService("presence"); CoordinationService coordinationService = getContext().getService("coord"); ZkClient zkClient = getContext().getZkClient(); PathScheme pathScheme = getContext().getPathScheme(); // list all services in cluster List<String> clusterIds = presenceService.getClusters(); for (String clusterId : clusterIds) { // only proceed if in cluster if (!presenceService.isMemberOf(clusterId) || clusterId.equals(getContext().getPathScheme().getFrameworkClusterId())) { continue; } List<String> serviceIds = presenceService.getServices(clusterId); for (String serviceId : serviceIds) { logger.trace( "Checking data nodes expiry: clusterId={}; serviceId={}", clusterId, serviceId); // only proceed if in service if (!presenceService.isMemberOf(clusterId, serviceId)) { continue; } long currentTimestamp = System.currentTimeMillis(); // get lock for a service DistributedLock lock = coordinationService.getLock("reign", "metrics-" + clusterId + "-" + serviceId); if (!lock.tryLock()) { continue; } String dataPath = null; try { // get all data nodes for a service String dataParentPath = pathScheme.getAbsolutePath( PathType.METRICS, pathScheme.joinTokens(clusterId, serviceId)); List<String> dataNodes = zkClient.getChildren(dataParentPath, false); // remove all nodes that are older than rotation // interval for (String dataNode : dataNodes) { try { logger.trace( "Checking data node expiry: clusterId={}; serviceId={}; nodeId={}", clusterId, serviceId, dataNode); dataPath = pathScheme.getAbsolutePath( PathType.METRICS, pathScheme.joinTokens(clusterId, serviceId, dataNode)); MetricsData metricsData = getMetricsFromDataNode(clusterId, serviceId, dataNode); if (metricsData == null) { logger.warn( "Removing unrecognized/corrupted/deprecated data node: path={}", dataPath); zkClient.delete(dataPath, -1); continue; } // keep last few hours worth of data long millisToExpiry = millisToExpiry(metricsData, currentTimestamp - (86400000 / 6)); // delete data that is older than some threshold boolean dataTooOld = currentTimestamp - metricsData.getIntervalStartTimestamp() > 86400000; // delete old and expired data if (millisToExpiry <= 0 || dataTooOld) { logger.info( "Removing expired data node: path={}; millisToExpiry={}", dataPath, millisToExpiry); zkClient.delete(dataPath, -1); } else { logger.trace( "Data node is not yet expired: path={}; millisToExpiry={}", dataPath, millisToExpiry); } } catch (Exception e) { logger.warn( "Error trying to clean up data directory for service: clusterId=" + clusterId + "; serviceId=" + serviceId + "; dataPath=" + dataPath + ": " + e, e); } // try } // for } catch (KeeperException e) { if (e.code() != KeeperException.Code.NONODE) { logger.warn( "Error trying to clean up data directory for service: clusterId=" + clusterId + "; serviceId=" + serviceId + "; dataPath=" + dataPath + ": " + e, e); } } catch (Exception e) { logger.warn( "Error trying to clean up data directory for service: clusterId=" + clusterId + "; serviceId=" + serviceId + "; dataPath=" + dataPath + ": " + e, e); } finally { lock.unlock(); lock.destroy(); } // try } // for service } // for cluster } // run()