public void processDocuments(
    int harvestType,
    List<DocumentPojo> toAdd,
    List<DocumentPojo> toUpdate_subsetOfAdd,
    List<DocumentPojo> toDelete) {
  PropertiesManager props = new PropertiesManager();

  // Note: toAdd = toAdd(old) + toUpdate
  // Need to treat updates as follows:
  // - Delete (incl. children, e.g. events) but get fields to keep (currently _id, created; in the
  //   future comments etc)

  // Delete toUpdate and toAdd (also overwriting "created" for updated docs, well all actually...)
  toDelete.addAll(toUpdate_subsetOfAdd);
  StoreAndIndexManager storageManager = new StoreAndIndexManager();
  storageManager.removeFromDatastore_byURL(toDelete, (harvestType != InfiniteEnums.DATABASE));
  // (note: expands toDelete if any sourceUrl "docs" are present, see FileHarvester)

  // (Storing docs messes up the doc/event/entity objects, so don't do that just yet...)

  // Aggregation:
  // 1+2. Create aggregate entities/events ("features") and write them to the DB
  //      (then can store feeds - doesn't matter that the events/entities have been modified by the
  //      aggregation)
  // 3. (Scheduled for efficiency) Update all documents' frequencies based on new entities and
  //    events
  // 4. (Scheduled for efficiency) Synchronize with index [after this, queries can find them - so
  //    (2) must have happened]
  //    (Synchronization currently "corrupts" the entities so needs to be run last)

  AggregationManager perSourceAggregation = null;
  if (!props.getAggregationDisabled()) {
    perSourceAggregation = new AggregationManager();
  }

  // 1+2]
  if (null != perSourceAggregation) {
    perSourceAggregation.doAggregation(toAdd, toDelete);
    perSourceAggregation.createOrUpdateFeatureEntries();
  }

  // Save feeds to the feeds collection in MongoDB
  // (second field determines if content gets saved)
  if (null != perSourceAggregation) {
    perSourceAggregation.applyAggregationToDocs(toAdd);
    // (First save aggregated statistics back to the docs' entity/event instances)
  }
  storeFeeds(toAdd, (harvestType != InfiniteEnums.DATABASE));

  // Then finish aggregation:
  if (null != perSourceAggregation) {
    // 3]
    perSourceAggregation.runScheduledDocumentUpdates();

    // 4] This needs to happen last because it "corrupts" the entities and events
    perSourceAggregation.runScheduledSynchronization();
  }
} // TESTED (by eye - logic is v simple)
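// ---------------------------------------------------------------------------------------------
// Illustrative usage (a sketch, not part of the original code base): a harvester that has just
// finished processing a source might hand its per-source output to processDocuments() roughly as
// follows. The list variables are hypothetical placeholders; only DocumentPojo and
// processDocuments() itself come from the code above.
//
//   List<DocumentPojo> toAdd = ...;    // new docs plus re-harvested (updated) ones
//   List<DocumentPojo> toUpdate = ...; // the re-harvested subset of toAdd
//   List<DocumentPojo> toDelete = ...; // docs no longer present at the source
//   processDocuments(harvestType, toAdd, toUpdate, toDelete);
// ---------------------------------------------------------------------------------------------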
/**
 * Used to start the harvest service
 *
 * @throws IOException
 * @throws InterruptedException
 */
public void startService(LinkedList<SourcePojo> sources) throws IOException, InterruptedException {
  // Let the client know the server is starting
  System.out.println("[SERVER] Harvest server is coming online");

  // Initialize/update the generic process controller (do this here so that it blocks before
  // threading fun starts)
  new GenericProcessingController().Initialize();

  // Start the background aggregation threads (will do nothing if disabled)
  EntityBackgroundAggregationManager.startThread();
  AssociationBackgroundAggregationManager.startThread();

  _mainThread = Thread.currentThread();
  String hostname = "unknown.host";
  try {
    hostname = java.net.InetAddress.getLocalHost().getHostName();
  } catch (Exception e) {
  }

  // Add the shutdown hook
  ShutdownHook shutdownHook = new ShutdownHook();
  Runtime.getRuntime().addShutdownHook(shutdownHook);

  Date startDate = new Date();
  _logger.info("Starting harvest process at: " + startDate + ", host=" + hostname);

  // Perform processing

  PropertiesManager threadConfig = new PropertiesManager();
  String sThreadConfig = threadConfig.getHarvestThreadConfig();

  HashSet<String> types = new HashSet<String>();
  try {
    String harvestTypes =
        new com.ikanow.infinit.e.harvest.utils.PropertiesManager().getHarvesterTypes();
    for (String s : harvestTypes.split("\\s*,\\s*")) {
      types.add(s.toLowerCase());
    }
  } catch (Exception e) {
    _logger.error(
        Globals.populateStackTrace(new StringBuffer("Failed to register all harvest types"), e));
  } // TESTED (by hand)

  // Max time for harvester (defaults to 25 mins)
  long maxTime_secs = threadConfig.getMaximumHarvestTime();
  if (maxTime_secs > 0) {
    new Timer().schedule(new InternalShutdown(), maxTime_secs * 1000); // (arg in ms)
  } // TOTEST

  try {
    // All source types in a single thread pool
    int nThreads = Integer.parseInt(sThreadConfig);
    SourceTypeHarvesterRunnable allTypes = new SourceTypeHarvesterRunnable(sources, nThreads);
    _logger.info("(Launching " + nThreads + " threads for all source types)");
    allTypes.run();
  } catch (NumberFormatException e) {
    // The thread config must be a comma-separated list of type:threads
    // (step over each type and launch that number of threads for that type)
    String[] sConfigBlocks = sThreadConfig.split("\\s*,\\s*");
    ExecutorService exec = Executors.newFixedThreadPool(sConfigBlocks.length);
    for (String sConfigBlock : sConfigBlocks) {
      String[] sTypeOrNumThreads = sConfigBlock.split("\\s*:\\s*");
      if (2 == sTypeOrNumThreads.length) {
        try {
          int nThreads = Integer.parseInt(sTypeOrNumThreads[1]);
          types.remove(sTypeOrNumThreads[0].toLowerCase());
          SourceTypeHarvesterRunnable typeRunner =
              new SourceTypeHarvesterRunnable(sources, nThreads, sTypeOrNumThreads[0]);
          _logger.info(
              "(Launching " + nThreads + " threads for " + sTypeOrNumThreads[0] + " source types)");
          exec.submit(typeRunner);
        } catch (NumberFormatException e2) {
          _logger.error("Error in harvester thread configuration: " + sThreadConfig);
        }
      } else {
        _logger.error("Error in harvester thread configuration: " + sThreadConfig);
      }
    } // (end loop over different source types)

    // (generate one thread for everything else)
    for (String unusedType : types) { // (note case unimportant)
      SourceTypeHarvesterRunnable typeRunner =
          new SourceTypeHarvesterRunnable(sources, 1, unusedType);
      _logger.info("(Launching 1 thread for " + unusedType + " source types)");
      exec.submit(typeRunner);
    } // TESTED (by hand)

    exec.shutdown();
    int i = 0;
    while (!exec.isTerminated()) {
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e3) {
      }
      if (_bStopHarvest) i++;
      if (i > 14400) { // emergency shutdown time...
        _logger.error("Emergency shutdown after 4 hours of waiting for manual shutdown");
        System.exit(0);
      }
    }
  }

  com.ikanow.infinit.e.processing.generic.utils.PropertiesManager aggProps =
      new com.ikanow.infinit.e.processing.generic.utils.PropertiesManager();
  boolean bAggDisabled = aggProps.getAggregationDisabled();
  StoreAndIndexManager dataStore = new StoreAndIndexManager();
  boolean bResizedDB = dataStore.resizeDB();
  boolean deletedDocs = true;
  if (!bAggDisabled) {
    deletedDocs = AggregationManager.updateEntitiesFromDeletedDocuments(dataStore.getUUID());
  }
  if (deletedDocs) { // (or if agg disabled, in which case we don't know)
    dataStore.removeSoftDeletedDocuments();
    if (!bAggDisabled) {
      AggregationManager.updateDocEntitiesFromDeletedDocuments(dataStore.getUUID());
    }
  }
  if (bResizedDB) {
    _logger.info("(resized DB, now " + dataStore.getDatabaseSize() + " documents)");
  }

  HarvestController.logHarvesterStats();
  _logger.info("Completed harvest process at: " + new Date().toString());

  Date endDate = new Date();
  // Not allowed to cycle harvester runs too quickly
  // Sleep for some period:
  long nDiff = endDate.getTime() - startDate.getTime();
  long nToSleep = threadConfig.getMinimumHarvestTimeMs() - nDiff;
  if ((nToSleep > 0) && !_bCurrentlySleepingBeforeExit) {
    try {
      _bCurrentlySleepingBeforeExit = true;
      // (don't really care there's a minor race condition here)
      Thread.sleep(nToSleep);
    } catch (InterruptedException e) {
      // Do nothing, probably got a signal
    }
  } // TESTED (cut and paste from tested Beta code)

  // Stop background aggregation
  EntityBackgroundAggregationManager.stopThreadAndWait();
  AssociationBackgroundAggregationManager.stopThreadAndWait();

  _logger.info("Harvest server is going offline");
  _bStopHarvest = true;
  _bReadyToTerminate = true; // (if we were terminated manually tell the shutdown hook it can stop)
  System.exit(0);
} // TESTED
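// Note on the thread configuration parsed in startService() above (an inference from the parsing
// logic, not an authoritative spec): getHarvestThreadConfig() is expected to return either a
// single integer, e.g. "10", meaning one pool of that many threads covering every source type, or
// a comma-separated list of <sourceType>:<numThreads> pairs, e.g. "feed:5,file:2" (type names
// illustrative); any registered harvest type not named in the list is then given a single thread
// of its own.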