public void processDocuments( int harvestType, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate_subsetOfAdd, List<DocumentPojo> toDelete) { PropertiesManager props = new PropertiesManager(); // Note: toAdd = toAdd(old) + toUpdate // Need to treat updates as follows: // - Delete (inc children, eg events) but get fields to keep (currently _id, created; in the // future comments etc) // Delete toUpdate and toAdd (also overwriting "created" for updated docs, well all actually...) toDelete.addAll(toUpdate_subsetOfAdd); StoreAndIndexManager storageManager = new StoreAndIndexManager(); storageManager.removeFromDatastore_byURL(toDelete, (harvestType != InfiniteEnums.DATABASE)); // (note: expands toDelete if any sourceUrl "docs" are present, see FileHarvester) // (Storing docs messes up the doc/event/entity objects, so don't do that just yet...) // Aggregation: // 1+2. Create aggregate entities/events ("features") and write them to the DB // (then can store feeds - doesn't matter that the event/entities have been modified by the // aggregation) // 3. (Scheduled for efficiency) Update all documents' frequencies based on new entities and // events // 4. (Scheduled for efficiency) Synchronize with index [after this, queries can find them - so // (2) must have happened] // (Syncronization currently "corrupts" the entities so needs to be run last) AggregationManager perSourceAggregation = null; if (!props.getAggregationDisabled()) { perSourceAggregation = new AggregationManager(); } // 1+2] if (null != perSourceAggregation) { perSourceAggregation.doAggregation(toAdd, toDelete); perSourceAggregation.createOrUpdateFeatureEntries(); } // Save feeds to feeds collection in MongoDb // (second field determines if content gets saved) if (null != perSourceAggregation) { perSourceAggregation.applyAggregationToDocs(toAdd); // (First save aggregated statistics back to the docs' entity/event instances) } storeFeeds(toAdd, (harvestType != InfiniteEnums.DATABASE)); // Then finish aggregation: if (null != perSourceAggregation) { // 3] perSourceAggregation.runScheduledDocumentUpdates(); // 4] This needs to happen last because it "corrupts" the entities and events perSourceAggregation.runScheduledSynchronization(); } } // TESTED (by eye - logic is v simple)