@Override public void run() { logger.trace("CleanerRunnable starting: hashCode={}", this.hashCode()); PresenceService presenceService = getContext().getService("presence"); CoordinationService coordinationService = getContext().getService("coord"); ZkClient zkClient = getContext().getZkClient(); PathScheme pathScheme = getContext().getPathScheme(); // list all services in cluster List<String> clusterIds = presenceService.getClusters(); for (String clusterId : clusterIds) { // only proceed if in cluster if (!presenceService.isMemberOf(clusterId) || clusterId.equals(getContext().getPathScheme().getFrameworkClusterId())) { continue; } List<String> serviceIds = presenceService.getServices(clusterId); for (String serviceId : serviceIds) { logger.trace( "Checking data nodes expiry: clusterId={}; serviceId={}", clusterId, serviceId); // only proceed if in service if (!presenceService.isMemberOf(clusterId, serviceId)) { continue; } long currentTimestamp = System.currentTimeMillis(); // get lock for a service DistributedLock lock = coordinationService.getLock("reign", "metrics-" + clusterId + "-" + serviceId); if (!lock.tryLock()) { continue; } String dataPath = null; try { // get all data nodes for a service String dataParentPath = pathScheme.getAbsolutePath( PathType.METRICS, pathScheme.joinTokens(clusterId, serviceId)); List<String> dataNodes = zkClient.getChildren(dataParentPath, false); // remove all nodes that are older than rotation // interval for (String dataNode : dataNodes) { try { logger.trace( "Checking data node expiry: clusterId={}; serviceId={}; nodeId={}", clusterId, serviceId, dataNode); dataPath = pathScheme.getAbsolutePath( PathType.METRICS, pathScheme.joinTokens(clusterId, serviceId, dataNode)); MetricsData metricsData = getMetricsFromDataNode(clusterId, serviceId, dataNode); if (metricsData == null) { logger.warn( "Removing unrecognized/corrupted/deprecated data node: path={}", dataPath); zkClient.delete(dataPath, -1); continue; } // keep last few hours worth of data long millisToExpiry = millisToExpiry(metricsData, currentTimestamp - (86400000 / 6)); // delete data that is older than some threshold boolean dataTooOld = currentTimestamp - metricsData.getIntervalStartTimestamp() > 86400000; // delete old and expired data if (millisToExpiry <= 0 || dataTooOld) { logger.info( "Removing expired data node: path={}; millisToExpiry={}", dataPath, millisToExpiry); zkClient.delete(dataPath, -1); } else { logger.trace( "Data node is not yet expired: path={}; millisToExpiry={}", dataPath, millisToExpiry); } } catch (Exception e) { logger.warn( "Error trying to clean up data directory for service: clusterId=" + clusterId + "; serviceId=" + serviceId + "; dataPath=" + dataPath + ": " + e, e); } // try } // for } catch (KeeperException e) { if (e.code() != KeeperException.Code.NONODE) { logger.warn( "Error trying to clean up data directory for service: clusterId=" + clusterId + "; serviceId=" + serviceId + "; dataPath=" + dataPath + ": " + e, e); } } catch (Exception e) { logger.warn( "Error trying to clean up data directory for service: clusterId=" + clusterId + "; serviceId=" + serviceId + "; dataPath=" + dataPath + ": " + e, e); } finally { lock.unlock(); lock.destroy(); } // try } // for service } // for cluster } // run()