public static void updateMatchingEntities(EntityFeaturePojo entFeature, ObjectId communityId) { String index = entFeature.getIndex(); long totalFreq = entFeature.getTotalfreq(); long docCount = entFeature.getDoccount(); try { DBCollection docDb = DbManager.getDocument().getMetadata(); BasicDBObject query1 = new BasicDBObject(); query1.put(EntityPojo.docQuery_index_, index); query1.put(DocumentPojo.communityId_, communityId); BasicDBObject multiopB = new BasicDBObject(); multiopB.put(EntityPojo.docUpdate_totalfrequency_, totalFreq); multiopB.put(EntityPojo.docUpdate_doccount_, docCount); BasicDBObject multiopA = new BasicDBObject(MongoDbManager.set_, multiopB); if (_diagnosticMode) { System.out.println( "EntityAggregationUtils.updateMatchingEntities: " + query1.toString() + " / " + multiopA.toString()); } else { synchronized (GenericProcessingController.class) { // Because this op can be slow, and traverse a lot of disk, need to ensure that // we don't allow all the threads to hammer it at once (the updates all yield to each // other // enough that the disk goes totally crazy) docDb.update(query1, multiopA, false, true); DbManager.getDocument().getLastError(DbManager.getDocument().getMetadata().getName()); // (enforce consecutive accesses for this potentially very slow operation) } // Was originally checked updatedExisting but for INF-1406, it sometimes seemed to be // checking the wrong command. I suspect the reason we had this code in here has gone away, // and it doesn't matter if this update occasionally fails anyway, it will just be out of // date // so the check/retry has been removed. } } catch (Exception ex) { logger.error(ex.getMessage(), ex); } } // TESTED (by eye, mostly cut-and-paste from test Beta)
public static void synchronizeEntityFeature( EntityFeaturePojo entityFeature, ObjectId communityId) { DBCollection entityFeatureDb = DbManager.getFeature().getEntity(); // NOTE: Important that feeds update occurs before synchronization, since the sync "corrupts" // the entity if (_diagnosticMode || (null != entityFeature .getDbSyncTime())) { // Else this is a new feature so don't need to update the // feature DB, only the index long nCurrTime = System.currentTimeMillis(); // (query from top of the function, basically lookup on gaz_index) BasicDBObject update2 = new BasicDBObject(); update2.put(EntityFeaturePojo.db_sync_time_, Long.toString(nCurrTime)); update2.put(EntityFeaturePojo.db_sync_doccount_, entityFeature.getDoccount()); BasicDBObject update = new BasicDBObject(MongoDbManager.set_, update2); BasicDBObject query = new BasicDBObject(EntityFeaturePojo.index_, entityFeature.getIndex()); query.put(EntityFeaturePojo.communityId_, communityId); if (_diagnosticMode) { System.out.println( "EntityAggregationUtils.synchronizeEntityFeature, featureDB: " + query.toString() + " / " + update.toString()); } else { entityFeatureDb.update(query, update, false, true); } } if (_diagnosticMode) { System.out.println( "EntityAggregationUtils.synchronizeEntityFeature, synchronize: " + new StringBuffer(entityFeature.getIndex()) .append(':') .append(communityId) .toString() + " = " + IndexManager.mapToIndex(entityFeature, new EntityFeaturePojoIndexMap())); } else { ElasticSearchManager esm = IndexManager.getIndex(EntityFeaturePojoIndexMap.indexName_); esm.addDocument(entityFeature, new EntityFeaturePojoIndexMap(), null, true); // (_id is set by the index map to index:communityId) } } // TESTED (by eye, mostly cut-and-paste from test Beta)
/**
 * Assigns a synchronization priority to an association feature and stores it.
 *
 * <p>The priority is {@code 100 * doccount / (0.01 + db_sync_doccount)}; the 0.01 offset keeps
 * the division defined when the synced count is zero. The value is set on the in-memory feature
 * and written to the association feature collection. In diagnostic mode the write is replaced
 * by an optional printout.
 *
 * @param assocFeature feature whose priority is computed; its {@code db_sync_prio} is updated
 * @param communityId community used, together with the feature index, to select the DB entry
 */
public static void markAssociationFeatureForSync(
    AssociationFeaturePojo assocFeature, ObjectId communityId) {
  DBCollection assocCollection = DbManager.getFeature().getAssociation();

  double currentDocCount = (double) assocFeature.getDoccount();
  double scaledDocCount = 100.0 * currentDocCount;
  double syncedDocCount = (double) assocFeature.getDb_sync_doccount();
  double syncPriority = scaledDocCount / (0.01 + syncedDocCount);
  assocFeature.setDb_sync_prio(syncPriority);

  BasicDBObject selector =
      new BasicDBObject(AssociationFeaturePojo.index_, assocFeature.getIndex());
  selector.put(AssociationFeaturePojo.communityId_, communityId);

  BasicDBObject priorityField =
      new BasicDBObject(AssociationFeaturePojo.db_sync_prio_, syncPriority);
  BasicDBObject setOperation = new BasicDBObject(MongoDbManager.set_, priorityField);

  if (!_diagnosticMode) {
    assocCollection.update(selector, setOperation, false, true);
  } else if (_logInDiagnosticMode) {
    System.out.println(
        "EntityAggregationUtils.markAssociationFeatureForSync, featureDB: "
            + selector.toString()
            + " / "
            + setOperation.toString());
  }
} // TESTED
public void InitializeDatabase() { // Add indices: try { PropertiesManager pm = new PropertiesManager(); DbManager.getDocument() .getContent() .ensureIndex(new BasicDBObject(DocumentPojo.url_, 1)); // (annoyingly necessary) DbManager.getDocument() .getMetadata() .ensureIndex( new BasicDBObject(DocumentPojo.sourceUrl_, 2), new BasicDBObject(MongoDbManager.sparse_, true)); try { DbManager.getDocument() .getMetadata() .dropIndex(new BasicDBObject(DocumentPojo.sourceUrl_, 1)); } catch (Exception e) { } // (leave this in for a while until all legacy DBs are removed) // Compound index lets me access {url, sourceKey}, {url} efficiently ... but need sourceKey // separately to do {sourceKey} BasicDBObject compIndex = new BasicDBObject(DocumentPojo.url_, 1); compIndex.put(DocumentPojo.sourceKey_, 1); DbManager.getDocument().getMetadata().ensureIndex(compIndex); // Add {_id:-1} to "standalone" sourceKey, sort docs matching source key by "time" (sort of!) compIndex = new BasicDBObject(DocumentPojo.sourceKey_, 1); compIndex.put(DocumentPojo._id_, -1); DbManager.getDocument().getMetadata().ensureIndex(compIndex); try { DbManager.getDocument() .getMetadata() .dropIndex(new BasicDBObject(DocumentPojo.sourceKey_, 1)); } catch (Exception e) { } // (leave this in for a while until all legacy DBs are removed) // Title simply not needed, that was a mistake from an early iteration: try { DbManager.getDocument().getMetadata().dropIndex(new BasicDBObject(DocumentPojo.title_, 1)); } catch (Exception e) { } // (leave this in for a while until all legacy DBs are removed) DbManager.getDocument() .getMetadata() .ensureIndex( new BasicDBObject(DocumentPojo.updateId_, 2), new BasicDBObject(MongoDbManager.sparse_, true)); try { DbManager.getDocument() .getMetadata() .dropIndex(new BasicDBObject(DocumentPojo.updateId_, 1)); } catch (Exception e) { } // (leave this in for a while until all legacy DBs are removed) if (!pm.getAggregationDisabled()) { compIndex = new 
BasicDBObject(EntityPojo.docQuery_index_, 1); compIndex.put(DocumentPojo.communityId_, 1); DbManager.getDocument().getMetadata().ensureIndex(compIndex); } compIndex = new BasicDBObject(DocCountPojo._id_, 1); compIndex.put(DocCountPojo.doccount_, 1); DbManager.getDocument().getCounts().ensureIndex(compIndex); DbManager.getFeature() .getEntity() .ensureIndex(new BasicDBObject(EntityFeaturePojo.disambiguated_name_, 1)); DbManager.getFeature() .getEntity() .ensureIndex(new BasicDBObject(EntityFeaturePojo.index_, 1)); DbManager.getFeature() .getEntity() .ensureIndex(new BasicDBObject(EntityFeaturePojo.alias_, 1)); DbManager.getFeature() .getEntity() .ensureIndex( new BasicDBObject(EntityFeaturePojo.db_sync_prio_, 2), new BasicDBObject(MongoDbManager.sparse_, true)); DbManager.getFeature() .getAssociation() .ensureIndex(new BasicDBObject(AssociationFeaturePojo.index_, 1)); DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("country", 1)); DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("search_field", 1)); DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("geoindex", "2d")); DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.key_, 1)); DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.communityIds_, 1)); DbManager.getIngest() .getSource() .ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvested_, 1)); DbManager.getIngest() .getSource() .ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_synced_, 1)); // Compound index lets me access {type, communities._id}, {type} efficiently compIndex = new BasicDBObject("type", 1); compIndex.put("communities._id", 1); DbManager.getSocial().getShare().ensureIndex(compIndex); try { DbManager.getSocial().getShare().dropIndex(new BasicDBObject("type", 1)); } catch (Exception e) { } // (leave this in for a while until all legacy DBs are removed) DbManager.getSocial() .getCookies() .ensureIndex( new BasicDBObject("apiKey", 
2), new BasicDBObject(MongoDbManager.sparse_, true)); try { DbManager.getSocial().getCookies().dropIndex(new BasicDBObject("apiKey", 1)); } catch (Exception e) { } // (leave this in for a while until all legacy DBs are removed) DbManager.getCustom() .getLookup() .ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobtitle_, 1)); // TODO (): MOVE THESE TO SPARSE INDEXES AFTER YOU'VE UPDATED THE LOGIC (SWAP THE 1 AND 2) DbManager.getCustom() .getLookup() .ensureIndex( new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 1), new BasicDBObject(MongoDbManager.sparse_, false)); try { DbManager.getCustom() .getLookup() .dropIndex(new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 2)); } catch (Exception e) { } // (leave this in for a while until all legacy DBs are removed) // //DbManager.getCustom().getLookup().ensureIndex(new // BasicDBObject(CustomMapReduceJobPojo.jobidS_, 2), new // BasicDBObject(MongoDbManager.sparse_, true)); // DbManager.getCustom().getLookup().ensureIndex(new // BasicDBObject(CustomMapReduceJobPojo.jobidS_, 2), new BasicDBObject(MongoDbManager.sparse_, // true)); // try { DbManager.getCustom().getLookup().dropIndex(new // BasicDBObject(CustomMapReduceJobPojo.jobidS_, 1)); } catch (Exception e) {} // (leave this // in for a while until all legacy DBs are removed) DbManager.getCustom() .getLookup() .ensureIndex( new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 1), new BasicDBObject(MongoDbManager.sparse_, false)); try { DbManager.getCustom() .getLookup() .dropIndex(new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 2)); } catch (Exception e) { } // (leave this in for a while until all legacy DBs are removed) // DbManager.getCustom().getLookup().ensureIndex(new // BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 2), new // BasicDBObject(MongoDbManager.sparse_, true)); // try { DbManager.getCustom().getLookup().dropIndex(new // BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 1)); } catch (Exception e) {} // (leave // this in for a while until all 
legacy DBs are removed) } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e.getMessage()); } } // TESTED (not changed since by-eye test in Beta)
public void InitializeIndex( boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature, boolean bRebuildDocsIndex) { try { // create elasticsearch indexes PropertiesManager pm = new PropertiesManager(); if (!pm.getAggregationDisabled()) { Builder localSettingsEvent = ImmutableSettings.settingsBuilder(); localSettingsEvent.put("number_of_shards", 1).put("number_of_replicas", 0); localSettingsEvent.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard"); localSettingsEvent.putArray( "index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase"); localSettingsEvent.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard"); localSettingsEvent.putArray( "index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase"); Builder localSettingsGaz = ImmutableSettings.settingsBuilder(); localSettingsGaz.put("number_of_shards", 1).put("number_of_replicas", 0); localSettingsGaz.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard"); localSettingsGaz.putArray( "index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase"); // event feature String eventGazMapping = new Gson() .toJson( new AssociationFeaturePojoIndexMap.Mapping(), AssociationFeaturePojoIndexMap.Mapping.class); ElasticSearchManager eventIndex = IndexManager.createIndex( AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping, localSettingsEvent); if (bDeleteEventFeature) { eventIndex.deleteMe(); eventIndex = IndexManager.createIndex( AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping, localSettingsEvent); } // entity feature String gazMapping = new Gson() .toJson( new EntityFeaturePojoIndexMap.Mapping(), EntityFeaturePojoIndexMap.Mapping.class); ElasticSearchManager entityIndex = IndexManager.createIndex( EntityFeaturePojoIndexMap.indexName_, null, false, null, gazMapping, localSettingsGaz); if (bDeleteEntityFeature) { entityIndex.deleteMe(); entityIndex = 
IndexManager.createIndex( EntityFeaturePojoIndexMap.indexName_, null, false, null, gazMapping, localSettingsGaz); } } // DOCS - much more complicated than anything else boolean bPingMainIndexFailed = !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_); // (ie if main doc index doesn't exist then always rebuild all indexes) if (bPingMainIndexFailed) { // extra level of robustness... sleep for a minute then double // check the index is really missing... try { Thread.sleep(60000); } catch (Exception e) { } bPingMainIndexFailed = !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_); } bRebuildDocsIndex |= bPingMainIndexFailed; createCommunityDocIndex( DocumentPojoIndexMap.globalDocumentIndex_, null, false, true, bDeleteDocs); createCommunityDocIndex( DocumentPojoIndexMap.manyGeoDocumentIndex_, null, false, false, bDeleteDocs); // Some hardwired dummy communities createCommunityDocIndex( "4e3706c48d26852237078005", null, true, false, bDeleteDocs); // (admin) createCommunityDocIndex( "4e3706c48d26852237079004", null, true, false, bDeleteDocs); // (test user) // (create dummy index used to keep personal group aliases) // OK, going to have different shards for different communities: // Get a list of all the communities: BasicDBObject query = new BasicDBObject(); BasicDBObject fieldsToDrop = new BasicDBObject("members", 0); fieldsToDrop.put("communityAttributes", 0); fieldsToDrop.put("userAttributes", 0); DBCursor dbc = DbManager.getSocial().getCommunity().find(query, fieldsToDrop); if (bRebuildDocsIndex || bDeleteDocs) { List<DBObject> tmparray = dbc.toArray(); // (brings the entire thing into memory so don't get cursor timeouts) int i = 0; System.out.println("Initializing " + dbc.size() + " indexes:"); for (int j = 0; j < 2; ++j) { for (DBObject dbotmp : tmparray) { if ((++i % 100) == 0) { System.out.println("Initialized " + i + " indexes."); } BasicDBObject dbo = (BasicDBObject) dbotmp; // OK, going to see if there are any 
sources with this group id, create a new index if // so: // (Don't use CommunityPojo data model here for performance reasons.... // (Also, haven't gotten round to porting CommunityPojo field access to using static // fields)) ObjectId communityId = (ObjectId) dbo.get("_id"); boolean bPersonalGroup = dbo.getBoolean("isPersonalCommunity", false); boolean bSystemGroup = dbo.getBoolean("isSystemCommunity", false); ObjectId parentCommunityId = (ObjectId) dbo.get("parentId"); createCommunityDocIndex( communityId.toString(), parentCommunityId, bPersonalGroup, bSystemGroup, bDeleteDocs, j == 0); } // end loop over communities } // end loop over communities - first time parents only } // (end if need to do big loop over all sources) } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e.getMessage()); } } // TESTED (not changed since by-eye test in Beta - retested after moving code into
public static void synchronizeEventFeature( AssociationFeaturePojo eventFeature, ObjectId communityId) { DBCollection eventFeatureDb = DbManager.getFeature().getAssociation(); // NOTE: Important that feeds update occurs before synchronization, since the sync "corrupts" // the event if (_diagnosticMode || (null != eventFeature.getDb_sync_time()) || (null != eventFeature.getDb_sync_prio())) { // Else this is a new feature so don't need to update the feature DB, only the index (if // db_sync_prio null then have to update to avoid b/g aggergation loop) // (note that db_sync_prio will in practice not be set when this is a new feature because it // will have same sync_doccount as doc_count) long nCurrTime = System.currentTimeMillis(); // (query from top of the function, basically lookup on gaz_index) BasicDBObject update2 = new BasicDBObject(); update2.put(AssociationFeaturePojo.db_sync_time_, Long.toString(nCurrTime)); update2.put(AssociationFeaturePojo.db_sync_doccount_, eventFeature.getDoccount()); BasicDBObject update = new BasicDBObject(MongoDbManager.set_, update2); // (also can be added to below) BasicDBObject update3 = new BasicDBObject(EntityFeaturePojo.db_sync_prio_, 1); update.put(MongoDbManager.unset_, update3); BasicDBObject query = new BasicDBObject(AssociationFeaturePojo.index_, eventFeature.getIndex()); query.put(AssociationFeaturePojo.communityId_, communityId); // Keep the number of entity1 and entity2 sets down to a reasonable number // (In the end would like to be able to do this based on date rather than (essentially) // completely randomly) int nSize; BasicDBObject toPull = null; if (null != eventFeature.getEntity1()) { if ((nSize = eventFeature.getEntity1().size()) > AssociationFeaturePojo.entity_MAXFIELDS) { if (null == toPull) toPull = new BasicDBObject(); ArrayList<String> ent1ToRemove = new ArrayList<String>( eventFeature.getEntity1().size() - AssociationFeaturePojo.entity_MAXFIELDS); Iterator<String> it = eventFeature.getEntity1().iterator(); 
while (it.hasNext() && (nSize > AssociationFeaturePojo.entity_MAXFIELDS)) { String ent = it.next(); if (-1 == ent.indexOf('/')) { // (ie don't remove the index) nSize--; it.remove(); // (this removes from the index) ent1ToRemove.add(ent); } } toPull.put(AssociationFeaturePojo.entity1_, ent1ToRemove); // (this removes from the database) } } if (null != eventFeature.getEntity2()) { if ((nSize = eventFeature.getEntity2().size()) > AssociationFeaturePojo.entity_MAXFIELDS) { if (null == toPull) toPull = new BasicDBObject(); ArrayList<String> ent2ToRemove = new ArrayList<String>( eventFeature.getEntity2().size() - AssociationFeaturePojo.entity_MAXFIELDS); Iterator<String> it = eventFeature.getEntity2().iterator(); while (it.hasNext() && (nSize > AssociationFeaturePojo.entity_MAXFIELDS)) { String ent = it.next(); if (-1 == ent.indexOf('/')) { // (ie don't remove the index) nSize--; it.remove(); // (this removes from the index) ent2ToRemove.add(ent); } } toPull.put(AssociationFeaturePojo.entity2_, ent2ToRemove); // (this removes from the database) } } if (null != toPull) { update.put(MongoDbManager.pullAll_, toPull); // (this removes from the database) } // TESTED (2.1.4.3b, including no index removal clause) if (_diagnosticMode) { if ((null != eventFeature.getDb_sync_time()) || (null != eventFeature.getDb_sync_prio())) { if (_logInDiagnosticMode) System.out.println( "AssociationAggregationUtils.synchronizeEventFeature, featureDB: " + query.toString() + " / " + update.toString()); } else { if (_logInDiagnosticMode) System.out.println( "(WOULD NOT RUN) EventAggregationUtils.synchronizeEventFeature, featureDB: " + query.toString() + " / " + update.toString()); } } else { eventFeatureDb.update(query, update, false, true); } } if (_diagnosticMode) { if (_logInDiagnosticMode) System.out.println( "AssociationAggregationUtils.synchronizeEventFeature, synchronize: " + new StringBuffer(eventFeature.getIndex()) .append(':') .append(communityId) .toString() + " = " + 
IndexManager.mapToIndex(eventFeature, new AssociationFeaturePojoIndexMap())); } else { ElasticSearchManager esm = IndexManager.getIndex(AssociationFeaturePojoIndexMap.indexName_); esm.addDocument(eventFeature, new AssociationFeaturePojoIndexMap(), null, true); } } // TESTED
/** * Add events to the elastic search index for events and the mongodb collection so they are * searchable for searchsuggest * * <p>Step 1.a, try to just update alias's Step 1.b, if fail, create new entry * * <p>Step 2, Update totalfreq and doccount * * <p>Step 3, After updating totalfreq and doccount, write to ES for every group * * @param events */ public static void updateEventFeatures( Map<String, Map<ObjectId, AssociationFeaturePojo>> eventFeatures) { // Some diagnostic counters: int numCacheMisses = 0; int numCacheHits = 0; int numNewAssocs = 0; long entityAggregationTime = new Date().getTime(); DBCollection col = DbManager.getFeature().getAssociation(); // (This fn is normally run for a single community id) CommunityFeatureCaches.CommunityFeatureCache currCache = null; String savedSyncTime = null; for (Map<ObjectId, AssociationFeaturePojo> evtCommunity : eventFeatures.values()) { Iterator<Map.Entry<ObjectId, AssociationFeaturePojo>> it = evtCommunity.entrySet().iterator(); while (it.hasNext()) { Map.Entry<ObjectId, AssociationFeaturePojo> evtFeatureKV = it.next(); try { AssociationFeaturePojo evtFeature = evtFeatureKV.getValue(); long nSavedDocCount = evtFeature.getDoccount(); ObjectId communityID = evtFeature.getCommunityId(); if ((null == currCache) || !currCache.getCommunityId().equals(evtFeatureKV.getKey())) { currCache = CommunityFeatureCaches.getCommunityFeatureCache(evtFeatureKV.getKey()); if (_diagnosticMode) { if (_logInDiagnosticMode) System.out.println( "AssociationAggregationUtils.updateEventFeatures, Opened cache for community: " + evtFeatureKV.getKey()); } } // TESTED (by hand) // Is this in our cache? 
If so can short cut a bunch of the DB interaction: AssociationFeaturePojo cachedAssoc = currCache.getCachedAssocFeature(evtFeature); if (null != cachedAssoc) { if (_incrementalMode) { if (_diagnosticMode) { if (_logInDiagnosticMode) System.out.println( "AssociationAggregationUtils.updateEventFeatures, skip cached: " + cachedAssoc.toDb()); // TODO (INF-2825): should be continue-ing here so can use delta more efficiently... } } else if (_diagnosticMode) { if (_logInDiagnosticMode) System.out.println( "AssociationAggregationUtils.updateEventFeatures, grabbed cached: " + cachedAssoc.toDb()); } numCacheHits++; } // TESTED (by hand) else { numCacheMisses++; } // try to update BasicDBObject query = new BasicDBObject(AssociationFeaturePojo.index_, evtFeature.getIndex()); query.put(AssociationFeaturePojo.communityId_, communityID); // Step1 try to update alias // update arrays BasicDBObject multiopAliasArrays = new BasicDBObject(); // Entity1 Alias: if (null != evtFeature.getEntity1_index()) { evtFeature.addEntity1(evtFeature.getEntity1_index()); } if (null != evtFeature.getEntity1()) { if ((null == cachedAssoc) || (null == cachedAssoc.getEntity1()) || !cachedAssoc.getEntity1().containsAll(evtFeature.getEntity1())) { BasicDBObject multiopE = new BasicDBObject(MongoDbManager.each_, evtFeature.getEntity1()); multiopAliasArrays.put(AssociationFeaturePojo.entity1_, multiopE); } } // TESTED (by hand) // Entity2 Alias: if (null != evtFeature.getEntity2_index()) { evtFeature.addEntity2(evtFeature.getEntity2_index()); } if (null != evtFeature.getEntity2()) { if ((null == cachedAssoc) || (null == cachedAssoc.getEntity2()) || !cachedAssoc.getEntity2().containsAll(evtFeature.getEntity2())) { BasicDBObject multiopE = new BasicDBObject(MongoDbManager.each_, evtFeature.getEntity2()); multiopAliasArrays.put(AssociationFeaturePojo.entity2_, multiopE); } } // TESTED (by hand) // verb/verb cat alias: if (null != evtFeature.getVerb_category()) { 
evtFeature.addVerb(evtFeature.getVerb_category()); } if (null != evtFeature.getVerb()) { if ((null == cachedAssoc) || (null == cachedAssoc.getVerb()) || !cachedAssoc.getVerb().containsAll(evtFeature.getVerb())) { BasicDBObject multiopE = new BasicDBObject(MongoDbManager.each_, evtFeature.getVerb()); multiopAliasArrays.put(AssociationFeaturePojo.verb_, multiopE); } } // TESTED (by hand) // OK - now we can copy across the fields into the cache: if (null != cachedAssoc) { currCache.updateCachedAssocFeatureStatistics( cachedAssoc, evtFeature); // (evtFeature is now fully up to date) } // TESTED (by hand) BasicDBObject updateOp = new BasicDBObject(); if (!multiopAliasArrays.isEmpty()) { updateOp.put(MongoDbManager.addToSet_, multiopAliasArrays); } // Document count for this event BasicDBObject updateFreqDocCount = new BasicDBObject(AssociationFeaturePojo.doccount_, nSavedDocCount); updateOp.put(MongoDbManager.inc_, updateFreqDocCount); BasicDBObject fields = new BasicDBObject(AssociationFeaturePojo.doccount_, 1); fields.put(AssociationFeaturePojo.entity1_, 1); fields.put(AssociationFeaturePojo.entity2_, 1); fields.put(AssociationFeaturePojo.verb_, 1); // (slightly annoying, since only want these if updating dc but won't know // until after i've got this object) fields.put(AssociationFeaturePojo.db_sync_time_, 1); fields.put(AssociationFeaturePojo.db_sync_doccount_, 1); DBObject dboUpdate = null; if (_diagnosticMode) { if (null == cachedAssoc) { dboUpdate = col.findOne(query, fields); } } else { if (null != cachedAssoc) { col.update(query, updateOp, false, false); } else { // Not cached - so have to grab the feature we're either getting or creating dboUpdate = col.findAndModify( query, fields, new BasicDBObject(), false, updateOp, false, true); // (can use findAndModify because specify index, ie the shard key) // (returns event before the changes above, update the feature object below) // (also atomically creates the object if it doesn't exist so is "distributed-safe") } 
} if ((null != cachedAssoc) || ((dboUpdate != null) && !dboUpdate.keySet().isEmpty())) // (feature already exists) { AssociationFeaturePojo egp = cachedAssoc; if (null == egp) { egp = AssociationFeaturePojo.fromDb(dboUpdate, AssociationFeaturePojo.class); evtFeature.setDoccount(egp.getDoccount() + nSavedDocCount); evtFeature.setDb_sync_doccount(egp.getDb_sync_doccount()); evtFeature.setDb_sync_time(egp.getDb_sync_time()); if (null != egp.getEntity1()) { for (String ent : egp.getEntity1()) evtFeature.addEntity1(ent); } if (null != egp.getEntity2()) { for (String ent : egp.getEntity2()) evtFeature.addEntity2(ent); } if (null != egp.getVerb()) { for (String verb : egp.getVerb()) evtFeature.addVerb(verb); } } // TESTED (cached and non-cached cases) // (in the cached case, evtFeature has already been updated by // updateCachedAssocFeatureStatistics) if (_diagnosticMode) { if (_logInDiagnosticMode) System.out.println( "AssociationAggregationUtils.updateEventFeatures, found: " + ((BasicDBObject) egp.toDb()).toString()); if (_logInDiagnosticMode) System.out.println( "AssociationAggregationUtils.updateEventFeatures, ^^^ found from query: " + query.toString() + " / " + updateOp.toString()); } // (In background aggregation mode we update db_sync_prio when checking the -otherwise // unused, unlike entities- document update schedule) } else // (the object in memory is now an accurate representation of the database, minus // some fields we'll now add) { numNewAssocs++; // Synchronization settings for the newly created object evtFeature.setDb_sync_doccount(nSavedDocCount); if (null == savedSyncTime) { savedSyncTime = Long.toString(System.currentTimeMillis()); } evtFeature.setDb_sync_time(savedSyncTime); // This is all "distributed safe" (apart from the db_syc_xxx and it doesn't matter if // that is // out of date, the update will just be slightly out-of-date at worst) since (otherwise) // these fields are // only set here, and the findAndModify is atomic BasicDBObject baseFields 
= new BasicDBObject(); if (null != evtFeature.getEntity1_index()) { baseFields.put(AssociationFeaturePojo.entity1_index_, evtFeature.getEntity1_index()); } if (null != evtFeature.getEntity2_index()) { baseFields.put(AssociationFeaturePojo.entity2_index_, evtFeature.getEntity2_index()); } if (null != evtFeature.getVerb_category()) { baseFields.put(AssociationFeaturePojo.verb_category_, evtFeature.getVerb_category()); } baseFields.put(AssociationFeaturePojo.assoc_type_, evtFeature.getAssociation_type()); baseFields.put( AssociationFeaturePojo.db_sync_doccount_, evtFeature.getDb_sync_doccount()); baseFields.put(AssociationFeaturePojo.db_sync_time_, evtFeature.getDb_sync_time()); baseFields.put( AssociationFeaturePojo.db_sync_prio_, 1000.0); // (ensures new objects are quickly index-synchronized) if (!_diagnosticMode) { // Store the object col.update(query, new BasicDBObject(MongoDbManager.set_, baseFields)); } else { if (_logInDiagnosticMode) System.out.println( "AssociationAggregationUtils.updateEventFeatures, not found: " + query.toString() + " / " + baseFields.toString() + "/ orig_update= " + updateOp.toString()); } // (Note even in background aggregation mode we still perform the feature // synchronization // for new entities - and it has to be right at the end because it "corrupts" the // objects) } // (end if first time seen) if (null == cachedAssoc) { // First time we've seen this locally, so add to cache currCache.addCachedAssocFeature(evtFeature); if (_diagnosticMode) { if (_logInDiagnosticMode) System.out.println( "AssociationAggregationUtils.updateEventFeatures, added to cache: " + evtFeature.toDb()); } } // TESTED (by hand) } catch (Exception e) { // Exception, remove from feature list it.remove(); // If an exception occurs log the error logger.error("Exception Message: " + e.getMessage(), e); } } // (end loop over all communities for the set of features sharing and index) } // (end loop over indexes) if ((numCacheHits > 0) || (numCacheMisses > 0)) { // ie 
some assocs were grabbed int cacheSize = 0; if (null != currCache) { cacheSize = currCache.getAssocCacheSize(); } StringBuffer logMsg = new StringBuffer() // (should append key, but don't have that...) .append(" assoc_agg_time_ms=") .append(new Date().getTime() - entityAggregationTime) .append(" total_assocs=") .append(eventFeatures.size()) .append(" new_assocs=") .append(numNewAssocs) .append(" cache_misses=") .append(numCacheMisses) .append(" cache_hits=") .append(numCacheHits) .append(" cache_size=") .append(cacheSize); logger.info(logMsg.toString()); } } // TESTED (by eye, reasonably significant changes, but still based on proven Beta code)
/** * Updates the feature entries for the list of entities that was just extracted including changing * frequency, adding aliases etc * * <p>This method now has 3 steps: 1. Try to update alias 1.a If fail, create new gaz 2. Update * totalfreq and doccount * * @param ents List of entities to update in the entity feature */ public static void updateEntityFeatures( Map<String, Map<ObjectId, EntityFeaturePojo>> entFeatures) { DBCollection col = DbManager.getFeature().getEntity(); String savedSyncTime = null; for (Map<ObjectId, EntityFeaturePojo> entCommunity : entFeatures.values()) { Iterator<Map.Entry<ObjectId, EntityFeaturePojo>> it = entCommunity.entrySet().iterator(); while (it.hasNext()) { Map.Entry<ObjectId, EntityFeaturePojo> entFeatureKV = it.next(); try { EntityFeaturePojo entFeature = entFeatureKV.getValue(); long nSavedDocCount = entFeature.getDoccount(); long nSavedFreqCount = entFeature.getTotalfreq(); // (these should be constant across all communities but keep it here // so can assign it using entFeature, it's v cheap so no need to get once like for sync // vars) ObjectId communityID = entFeature.getCommunityId(); if (null != communityID) { // For each community, see if the entity feature already exists *for that community* BasicDBObject query = new BasicDBObject(EntityFeaturePojo.index_, entFeature.getIndex()); query.put(EntityFeaturePojo.communityId_, communityID); BasicDBObject updateOp = new BasicDBObject(); // Add aliases: BasicDBObject updateOpA = new BasicDBObject(); BasicDBObject multiopE = new BasicDBObject(MongoDbManager.each_, entFeature.getAlias()); updateOpA.put(EntityFeaturePojo.alias_, multiopE); // Add link data, if there is any: if ((null != entFeature.getSemanticLinks()) && !entFeature.getSemanticLinks().isEmpty()) { BasicDBObject multiopF = new BasicDBObject(MongoDbManager.each_, entFeature.getSemanticLinks()); updateOpA.put(EntityFeaturePojo.linkdata_, multiopF); } updateOp.put(MongoDbManager.addToSet_, updateOpA); // Update 
frequency: BasicDBObject updateOpB = new BasicDBObject(); updateOpB.put(EntityFeaturePojo.totalfreq_, nSavedFreqCount); updateOpB.put(EntityFeaturePojo.doccount_, nSavedDocCount); updateOp.put(MongoDbManager.inc_, updateOpB); // try to use find/modify to see if something comes back and set doc freq/totalfreq BasicDBObject fields = new BasicDBObject(EntityFeaturePojo.totalfreq_, 1); fields.put(EntityFeaturePojo.doccount_, 1); fields.put(EntityFeaturePojo.alias_, 1); fields.put(EntityFeaturePojo.linkdata_, 1); // (slightly annoying, since only want these 2 largish fields if updating freq but won't // know // until after i've got this object) fields.put(EntityFeaturePojo.db_sync_time_, 1); fields.put(EntityFeaturePojo.db_sync_doccount_, 1); DBObject dboUpdate = null; if (_diagnosticMode) { dboUpdate = col.findOne(query, fields); } else { dboUpdate = col.findAndModify( query, fields, new BasicDBObject(), false, updateOp, false, true); // (can use findAndModify because specify index, ie the shard key) // (returns entity before the changes above, update the feature object below) // (also atomically creates the object if it doesn't exist so is "distributed-safe") } if ((dboUpdate != null) && !dboUpdate.keySet().isEmpty()) { // (Update the entity feature to be correct so that it can be accurately synchronized // with the index) EntityFeaturePojo gp = EntityFeaturePojo.fromDb(dboUpdate, EntityFeaturePojo.class); entFeature.setTotalfreq(gp.getTotalfreq() + nSavedFreqCount); entFeature.setDoccount(gp.getDoccount() + nSavedDocCount); entFeature.setDbSyncDoccount(gp.getDbSyncDoccount()); entFeature.setDbSyncTime(gp.getDbSyncTime()); if (null != gp.getAlias()) { entFeature.addAllAlias(gp.getAlias()); } if (null != gp.getSemanticLinks()) { entFeature.addToSemanticLinks(gp.getSemanticLinks()); } if (_diagnosticMode) { System.out.println( "EntityAggregationUtils.updateEntityFeatures, found: " + ((BasicDBObject) gp.toDb()).toString()); System.out.println( 
"EntityAggregationUtils.updateEntityFeatures, ^^^ found from query: " + query.toString() + " / " + updateOp.toString()); } } else // (the object in memory is now an accurate representation of the database, minus // some fields we'll now add) { // Synchronization settings for the newly created object if (null == savedSyncTime) { savedSyncTime = Long.toString(System.currentTimeMillis()); } entFeature.setDbSyncDoccount(nSavedDocCount); entFeature.setDbSyncTime(savedSyncTime); // This is all "distributed safe" (apart from the db_syc_xxx and it doesn't matter if // that is // out of date, the update will just be slightly out-of-date at worst) since // (otherwise) these fields are // only set here, and the findAndModify is atomic // (Do in raw MongoDB for performance) BasicDBObject baseFields = new BasicDBObject(); baseFields.put(EntityFeaturePojo.dimension_, entFeature.getDimension().toString()); baseFields.put(EntityFeaturePojo.type_, entFeature.getType()); baseFields.put( EntityFeaturePojo.disambiguated_name_, entFeature.getDisambiguatedName()); baseFields.put(EntityFeaturePojo.db_sync_doccount_, entFeature.getDbSyncDoccount()); baseFields.put(EntityFeaturePojo.db_sync_time_, entFeature.getDbSyncTime()); if ((null != entFeature.getSemanticLinks()) && !entFeature.getSemanticLinks().isEmpty()) { baseFields.put(EntityFeaturePojo.linkdata_, entFeature.getSemanticLinks()); } // attempt to add geotag (makes necessary checks on util side) // also add ontology type if geotag is found EntityGeotagAggregationUtils.addEntityGeo(entFeature); if (entFeature.getGeotag() != null) { BasicDBObject geo = new BasicDBObject(GeoPojo.lat_, entFeature.getGeotag().lat); geo.put(GeoPojo.lon_, entFeature.getGeotag().lon); baseFields.put(EntityFeaturePojo.geotag_, geo); if (entFeature.getOntology_type() != null) { baseFields.put(EntityFeaturePojo.ontology_type_, entFeature.getOntology_type()); } } if (!_diagnosticMode) { // Store the object col.update(query, new 
BasicDBObject(MongoDbManager.set_, baseFields)); } else { System.out.println( "EntityAggregationUtils.updateEntityFeatures, not found: " + query.toString() + ": " + baseFields.toString()); } entFeature.setDbSyncTime(null); // (ensures that index re-sync will occur) } } } catch (Exception e) { // Exception, remove from feature list it.remove(); // If an exception occurs log the error logger.error("Exception Message: " + e.getMessage(), e); } } // (end loop over communities) } // (end loop over indexes) } // TESTED (just by eye - made few changes during re-factoring)