@TransactionAttribute(TransactionAttributeType.NEVER)
public MeasurementBaseline calculateAutoBaseline(Subject subject, int groupId, int definitionId, long startDate,
    long endDate, boolean save) throws BaselineCreationException, MeasurementNotFoundException {

    MeasurementBaseline result = measurementBaselineManager.calculateAutoBaselineForGroupInNewTransaction(subject,
        groupId, definitionId, startDate, endDate, save);

    if (save) {
        // note, this executes in a new transaction so the baseline must already be committed to the database
        agentStatusManager.updateByMeasurementBaseline(result.getId());
    }

    return result;
}
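/*
 * For context, a minimal sketch of the transaction split the method above relies on: the caller is
 * NEVER (it runs outside any transaction) while the callee runs in its own REQUIRES_NEW transaction,
 * so the baseline is committed before the caller resumes. That ordering is what makes it safe to pass
 * result.getId() to agentStatusManager.updateByMeasurementBaseline() afterwards. The annotation and
 * body below are an assumption about the "...InNewTransaction" callee, not its actual implementation;
 * computeBaselineForGroup() and entityManager are hypothetical.
 */
@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
public MeasurementBaseline calculateAutoBaselineForGroupInNewTransaction(Subject subject, int groupId,
    int definitionId, long startDate, long endDate, boolean save) throws BaselineCreationException,
    MeasurementNotFoundException {
    // hypothetical body: compute the baseline over [startDate, endDate] for the group's definition
    MeasurementBaseline baseline = computeBaselineForGroup(groupId, definitionId, startDate, endDate);
    if (save) {
        entityManager.persist(baseline); // committed when this REQUIRES_NEW transaction ends
    }
    return baseline;
}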
@TransactionAttribute(TransactionAttributeType.NEVER)
public long calculateAutoBaselines(long amountOfData, long baselinesOlderThanTime) {
    try {
        log.info("Calculating auto baselines");
        log.info("Deleting baseline computations older than " + new Date(baselinesOlderThanTime));
        log.info("Inserting new baselines using last " + (amountOfData / (24 * 60 * 60 * 1000L))
            + " days of 1H data");

        long now = System.currentTimeMillis();
        long computeTime = now;

        log.debug("computeTime = " + computeTime);

        int deleted = measurementBaselineManager._calculateAutoBaselinesDELETE(baselinesOlderThanTime);
        log.info("Removed [" + deleted + "] old baselines - they will now be recalculated ("
            + (System.currentTimeMillis() - now) + ")ms");

        now = System.currentTimeMillis();
        int totalInserted = 0;
        while (true) {
            /*
             * Each call is done in a separate xtn of at most 100K inserted rows; this helps to keep the xtn
             * shorter to avoid timeouts in scenarios where baseline calculations bunch together. The idea was that
             * by basing a batch of baseline calculations off of the import time of the resource into inventory,
             * the total work would naturally be staggered throughout the day. In practice, this didn't always
             * work as intended, for one of several reasons:
             *
             * 1) all servers in the cloud were down for a few days (maybe a slow product upgrade, maybe a cold
             *    data center relocation)
             * 2) issues with running the job itself, e.g. if quartz had locking issues under severe load and
             *    somehow this job wasn't executed for a few hours / days
             * 3) the user tended to import all new resources / platforms at the same time of day, thus bypassing
             *    the implicit optimization of trying to stagger the calculations by resource commit time
             *
             * 2/18/2010 NOTE: Limits weren't / aren't actually achieving the effect we want. The baseline query
             * follows the general form of "insert into...select from <big query> having <subquery> limit X".
             * In this case, the limit was reducing the number of rows inserted, but it was still taking the full
             * cost of calculating everything that should have been inserted. The limit was intended as a cheap
             * method of chunking or partitioning the work, but wasn't properly chunking the expensive
             * part - the "big query". What we actually want to do is come up with a strategy that lessens the
             * amount of data we need to select, thereby reducing the amount of time it takes to calculate the
             * insertion list.
             *
             * One proposed strategy for this would be to chunk on the scheduleId. So if there were, say,
             * 5M scheduleIds in the system, we might take 500K of them at a time and then execute the
             * baseline insertion job 10 times against a much smaller set of data each time. But the
             * complication here is how to calculate precise groups of 500K schedules at a time, and then
             * walk that chunked list.
             *
             * Another strategy would be to divvy things up by resource type. Since a measurementSchedule is
             * linked to a measurementDefinition which is linked to a resourceType, we could very easily chunk
             * the insertion based off the schedules that belong to each resourceType. This would create
             * one insert statement for each type of resource in the system. The complication here, however,
             * is that you may have millions of resources of one type, but hardly any resources of another.
             * So there's still a chance that some insertions proceed slowly (in the worst case).
             *
             * In any event, an appropriate chunking solution needs to be found, and that partitioning strategy
             * needs to replace the limits in the query today. A sketch of the scheduleId approach follows this
             * method.
             */
            int inserted = measurementBaselineManager._calculateAutoBaselinesINSERT(amountOfData);
            totalInserted += inserted;
            // since we're batching 100K inserts at a time, we're done if we didn't have that many to insert
            if (inserted < 100000) {
                break;
            }
        }

        log.info("Calculated and inserted [" + totalInserted + "] new baselines. ("
            + (System.currentTimeMillis() - now) + ")ms");

        MeasurementMonitor.getMBean().incrementBaselineCalculationTime(System.currentTimeMillis() - computeTime);

        agentStatusManager.updateByAutoBaselineCalculationJob();

        return computeTime;
    } catch (Exception e) {
        log.error("Failed to auto-calculate baselines", e);
        throw new RuntimeException("Auto-calculation failure", e);
    }
}
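/*
 * A minimal sketch of the scheduleId-chunking strategy proposed in the comment above. Everything here
 * is hypothetical: the min/max bounds lookups and the range-limited INSERT overload do not exist on the
 * current managers. The point is only to show how bounding the "big query" by a scheduleId range chunks
 * the expensive SELECT itself, rather than just limiting the number of rows inserted.
 */
private int calculateAutoBaselinesByScheduleIdChunks(long amountOfData, int chunkSize) {
    int totalInserted = 0;
    int minId = measurementBaselineManager._getMinScheduleId(); // hypothetical helper
    int maxId = measurementBaselineManager._getMaxScheduleId(); // hypothetical helper
    for (int lowId = minId; lowId <= maxId; lowId += chunkSize) {
        int highId = Math.min(lowId + chunkSize - 1, maxId);
        // each chunk would still run in its own transaction, like today's 100K batches, but an added
        // "scheduleId BETWEEN lowId AND highId" predicate also bounds the cost of the SELECT
        totalInserted += measurementBaselineManager._calculateAutoBaselinesINSERT(amountOfData, lowId, highId);
    }
    return totalInserted;
}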
@Override
public Set<ResourceMeasurementScheduleRequest> postProcessNewlyCommittedResources(Set<Integer> resourceIds) {
    if (log.isDebugEnabled()) {
        log.debug("Post-processing " + resourceIds.size() + " newly committed resources");
        log.debug("Ids were: " + resourceIds);
    }

    Subject overlord = LookupUtil.getSubjectManager().getOverlord();

    AlertTemplateManagerLocal alertTemplateManager = LookupUtil.getAlertTemplateManager();
    MeasurementScheduleManagerLocal scheduleManager = LookupUtil.getMeasurementScheduleManager();
    AgentManagerLocal agentManager = LookupUtil.getAgentManager();
    StatusManagerLocal statusManager = LookupUtil.getStatusManager();

    long start = System.currentTimeMillis();

    // do this in one fell swoop, instead of one resource at a time
    Set<ResourceMeasurementScheduleRequest> results = scheduleManager.findSchedulesForResourceAndItsDescendants(
        ArrayUtils.unwrapCollection(resourceIds), false);

    long time = (System.currentTimeMillis() - start);

    if (time >= 10000L) {
        log.info("Performance: commit resource, create schedules timing: resourceCount/millis="
            + resourceIds.size() + '/' + time);
    } else if (log.isDebugEnabled()) {
        log.debug("Performance: commit resource, create schedules timing: resourceCount/millis="
            + resourceIds.size() + '/' + time);
    }

    start = System.currentTimeMillis();

    for (Integer resourceId : resourceIds) {
        // apply alert templates
        try {
            alertTemplateManager.updateAlertDefinitionsForResource(overlord, resourceId);
        } catch (AlertDefinitionCreationException adce) {
            /* should never happen because AlertDefinitionCreationException is only ever
             * thrown if updateAlertDefinitionsForResource isn't called as the overlord
             *
             * but we'll log it anyway, just in case, so it isn't just swallowed */
            log.error(adce);
        } catch (Throwable t) {
            log.debug("Could not apply alert templates for resourceId = " + resourceId, t);
        }
    }

    try {
        if (resourceIds.size() > 0) {
            // they all come from the same agent, so pick any old one
            int anyResourceIdFromNewlyCommittedSet = resourceIds.iterator().next();
            int agentId = agentManager.getAgentIdByResourceId(anyResourceIdFromNewlyCommittedSet);
            statusManager.updateByAgent(agentId);
        }
    } catch (Throwable t) {
        log.debug("Could not reload caches for newly committed resources", t);
    }

    time = (System.currentTimeMillis() - start);

    if (time >= 10000L) {
        log.info("Performance: commit resource, apply alert templates timing: resourceCount/millis="
            + resourceIds.size() + '/' + time);
    } else if (log.isDebugEnabled()) {
        log.debug("Performance: commit resource, apply alert templates timing: resourceCount/millis="
            + resourceIds.size() + '/' + time);
    }

    return results;
}
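/*
 * A small illustrative helper (not part of the existing class) that captures the repeated timing-log
 * pattern above: operations taking 10 seconds or more are logged at INFO, everything else at DEBUG.
 * The two timing blocks in postProcessNewlyCommittedResources() could delegate to something like this.
 */
private void logCommitTiming(String operation, int resourceCount, long millis) {
    String message = "Performance: commit resource, " + operation + " timing: resourceCount/millis="
        + resourceCount + '/' + millis;
    if (millis >= 10000L) {
        log.info(message);
    } else if (log.isDebugEnabled()) {
        log.debug(message);
    }
}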