// Exemplo n.º 1
// 0
  public static void updateMatchingEntities(EntityFeaturePojo entFeature, ObjectId communityId) {
    // Propagates the aggregate statistics (total frequency, doc count) of an entity feature
    // onto the matching embedded entities of every document in the given community.
    String entIndex = entFeature.getIndex();
    long newTotalFreq = entFeature.getTotalfreq();
    long newDocCount = entFeature.getDoccount();

    try {
      // Match every document in this community that contains an entity with this index:
      BasicDBObject docQuery = new BasicDBObject(EntityPojo.docQuery_index_, entIndex);
      docQuery.put(DocumentPojo.communityId_, communityId);

      // $set the new frequency/doccount onto the matching (positional) entity:
      BasicDBObject newStats = new BasicDBObject();
      newStats.put(EntityPojo.docUpdate_totalfrequency_, newTotalFreq);
      newStats.put(EntityPojo.docUpdate_doccount_, newDocCount);
      BasicDBObject setOp = new BasicDBObject(MongoDbManager.set_, newStats);

      if (_diagnosticMode) {
        System.out.println(
            "EntityAggregationUtils.updateMatchingEntities: "
                + docQuery.toString()
                + " / "
                + setOp.toString());
      } else {
        // Because this op can be slow, and traverse a lot of disk, need to ensure that
        // we don't allow all the threads to hammer it at once (the updates all yield to each
        // other enough that the disk goes totally crazy), hence the class-wide lock:
        synchronized (GenericProcessingController.class) {
          DbManager.getDocument().getMetadata().update(docQuery, setOp, false, true);
          // (getLastError enforces consecutive accesses for this potentially very slow operation)
          DbManager.getDocument().getLastError(DbManager.getDocument().getMetadata().getName());
        }

        // Was originally checked updatedExisting but for INF-1406, it sometimes seemed to be
        // checking the wrong command. I suspect the reason we had this code in here has gone away,
        // and it doesn't matter if this update occasionally fails anyway, it will just be out of
        // date - so the check/retry has been removed.
      }
    } catch (Exception ex) {
      logger.error(ex.getMessage(), ex);
    }
  } // TESTED (by eye, mostly cut-and-paste from test Beta)
// Exemplo n.º 2
// 0
  public static void synchronizeEntityFeature(
      EntityFeaturePojo entityFeature, ObjectId communityId) {
    // Writes the entity feature's sync bookkeeping (time + doccount) to the feature DB and
    // then (re)indexes the feature into elasticsearch.
    DBCollection entityFeatureDb = DbManager.getFeature().getEntity();

    // NOTE: Important that feeds update occurs before synchronization, since the sync "corrupts"
    // the entity

    // A feature with no sync time yet is brand new, so only the index write below is needed
    // (diagnostic mode always logs the would-be DB update):
    if (_diagnosticMode || (null != entityFeature.getDbSyncTime())) {
      // (query from top of the function, basically lookup on gaz_index)
      BasicDBObject syncFields = new BasicDBObject();
      syncFields.put(EntityFeaturePojo.db_sync_time_, Long.toString(System.currentTimeMillis()));
      syncFields.put(EntityFeaturePojo.db_sync_doccount_, entityFeature.getDoccount());
      BasicDBObject setOp = new BasicDBObject(MongoDbManager.set_, syncFields);

      BasicDBObject featureQuery =
          new BasicDBObject(EntityFeaturePojo.index_, entityFeature.getIndex());
      featureQuery.put(EntityFeaturePojo.communityId_, communityId);

      if (_diagnosticMode) {
        System.out.println(
            "EntityAggregationUtils.synchronizeEntityFeature, featureDB: "
                + featureQuery.toString()
                + " / "
                + setOp.toString());
      } else {
        entityFeatureDb.update(featureQuery, setOp, false, true);
      }
    }

    if (_diagnosticMode) {
      String syncKey =
          new StringBuffer(entityFeature.getIndex()).append(':').append(communityId).toString();
      System.out.println(
          "EntityAggregationUtils.synchronizeEntityFeature, synchronize: "
              + syncKey
              + " = "
              + IndexManager.mapToIndex(entityFeature, new EntityFeaturePojoIndexMap()));
    } else {
      ElasticSearchManager esm = IndexManager.getIndex(EntityFeaturePojoIndexMap.indexName_);
      esm.addDocument(entityFeature, new EntityFeaturePojoIndexMap(), null, true);
      // (_id is set by the index map to index:communityId)
    }
  } // TESTED (by eye, mostly cut-and-paste from test Beta)
 public static void markAssociationFeatureForSync(
     AssociationFeaturePojo assocFeature, ObjectId communityId) {
   // Computes a sync priority for the association (roughly: how much its doc count has grown
   // relative to the last synced count) and persists it to the feature DB.
   double syncPrio =
       100.0
           * (double) assocFeature.getDoccount()
           / (0.01 + (double) assocFeature.getDb_sync_doccount());
   assocFeature.setDb_sync_prio(syncPrio);

   BasicDBObject featureQuery =
       new BasicDBObject(AssociationFeaturePojo.index_, assocFeature.getIndex());
   featureQuery.put(AssociationFeaturePojo.communityId_, communityId);
   BasicDBObject prioUpdate =
       new BasicDBObject(
           MongoDbManager.set_,
           new BasicDBObject(AssociationFeaturePojo.db_sync_prio_, syncPrio));

   if (_diagnosticMode) {
     if (_logInDiagnosticMode) {
       System.out.println(
           "EntityAggregationUtils.markAssociationFeatureForSync, featureDB: "
               + featureQuery.toString()
               + " / "
               + prioUpdate.toString());
     }
   } else {
     DbManager.getFeature().getAssociation().update(featureQuery, prioUpdate, false, true);
   }
 } // TESTED
  public void InitializeDatabase() {
    // Ensures every MongoDB index the platform relies on exists, and drops a number of legacy
    // indexes left over from older schema versions. Any failure is fatal and is rethrown as a
    // RuntimeException (with the original exception preserved as the cause).
    try {
      PropertiesManager pm = new PropertiesManager();

      // --- Document content/metadata indexes ---
      DbManager.getDocument()
          .getContent()
          .ensureIndex(new BasicDBObject(DocumentPojo.url_, 1)); // (annoyingly necessary)
      DbManager.getDocument()
          .getMetadata()
          .ensureIndex(
              new BasicDBObject(DocumentPojo.sourceUrl_, 2),
              new BasicDBObject(MongoDbManager.sparse_, true));
      try {
        DbManager.getDocument()
            .getMetadata()
            .dropIndex(new BasicDBObject(DocumentPojo.sourceUrl_, 1));
      } catch (Exception ignored) {
      } // (legacy non-sparse index - leave this in for a while until all legacy DBs are removed)

      // Compound index lets me access {url, sourceKey}, {url} efficiently ... but need sourceKey
      // separately to do {sourceKey}
      BasicDBObject compIndex = new BasicDBObject(DocumentPojo.url_, 1);
      compIndex.put(DocumentPojo.sourceKey_, 1);
      DbManager.getDocument().getMetadata().ensureIndex(compIndex);
      // Add {_id:-1} to "standalone" sourceKey, sort docs matching source key by "time" (sort of!)
      compIndex = new BasicDBObject(DocumentPojo.sourceKey_, 1);
      compIndex.put(DocumentPojo._id_, -1);
      DbManager.getDocument().getMetadata().ensureIndex(compIndex);
      try {
        DbManager.getDocument()
            .getMetadata()
            .dropIndex(new BasicDBObject(DocumentPojo.sourceKey_, 1));
      } catch (Exception ignored) {
      } // (leave this in for a while until all legacy DBs are removed)
      // Title simply not needed, that was a mistake from an early iteration:
      try {
        DbManager.getDocument().getMetadata().dropIndex(new BasicDBObject(DocumentPojo.title_, 1));
      } catch (Exception ignored) {
      } // (leave this in for a while until all legacy DBs are removed)
      DbManager.getDocument()
          .getMetadata()
          .ensureIndex(
              new BasicDBObject(DocumentPojo.updateId_, 2),
              new BasicDBObject(MongoDbManager.sparse_, true));
      try {
        DbManager.getDocument()
            .getMetadata()
            .dropIndex(new BasicDBObject(DocumentPojo.updateId_, 1));
      } catch (Exception ignored) {
      } // (leave this in for a while until all legacy DBs are removed)
      if (!pm.getAggregationDisabled()) {
        // Needed by the aggregation code to find docs by embedded entity index per community:
        compIndex = new BasicDBObject(EntityPojo.docQuery_index_, 1);
        compIndex.put(DocumentPojo.communityId_, 1);
        DbManager.getDocument().getMetadata().ensureIndex(compIndex);
      }
      compIndex = new BasicDBObject(DocCountPojo._id_, 1);
      compIndex.put(DocCountPojo.doccount_, 1);
      DbManager.getDocument().getCounts().ensureIndex(compIndex);

      // --- Feature (entity/association/geo) indexes ---
      DbManager.getFeature()
          .getEntity()
          .ensureIndex(new BasicDBObject(EntityFeaturePojo.disambiguated_name_, 1));
      DbManager.getFeature()
          .getEntity()
          .ensureIndex(new BasicDBObject(EntityFeaturePojo.index_, 1));
      DbManager.getFeature()
          .getEntity()
          .ensureIndex(new BasicDBObject(EntityFeaturePojo.alias_, 1));
      DbManager.getFeature()
          .getEntity()
          .ensureIndex(
              new BasicDBObject(EntityFeaturePojo.db_sync_prio_, 2),
              new BasicDBObject(MongoDbManager.sparse_, true));
      DbManager.getFeature()
          .getAssociation()
          .ensureIndex(new BasicDBObject(AssociationFeaturePojo.index_, 1));
      DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("country", 1));
      DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("search_field", 1));
      DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("geoindex", "2d"));

      // --- Ingest (source) indexes ---
      DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.key_, 1));
      DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.communityIds_, 1));
      DbManager.getIngest()
          .getSource()
          .ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvested_, 1));
      DbManager.getIngest()
          .getSource()
          .ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_synced_, 1));

      // --- Social (share/cookie) indexes ---
      // Compound index lets me access {type, communities._id}, {type} efficiently
      compIndex = new BasicDBObject("type", 1);
      compIndex.put("communities._id", 1);
      DbManager.getSocial().getShare().ensureIndex(compIndex);
      try {
        DbManager.getSocial().getShare().dropIndex(new BasicDBObject("type", 1));
      } catch (Exception ignored) {
      } // (leave this in for a while until all legacy DBs are removed)
      DbManager.getSocial()
          .getCookies()
          .ensureIndex(
              new BasicDBObject("apiKey", 2), new BasicDBObject(MongoDbManager.sparse_, true));
      try {
        DbManager.getSocial().getCookies().dropIndex(new BasicDBObject("apiKey", 1));
      } catch (Exception ignored) {
      } // (leave this in for a while until all legacy DBs are removed)

      // --- Custom map/reduce job indexes ---
      DbManager.getCustom()
          .getLookup()
          .ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobtitle_, 1));
      // TODO (): MOVE THESE TO SPARSE INDEXES AFTER YOU'VE UPDATED THE LOGIC (SWAP THE 1 AND 2)
      DbManager.getCustom()
          .getLookup()
          .ensureIndex(
              new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 1),
              new BasicDBObject(MongoDbManager.sparse_, false));
      try {
        DbManager.getCustom()
            .getLookup()
            .dropIndex(new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 2));
      } catch (Exception ignored) {
      } // (leave this in for a while until all legacy DBs are removed)
      DbManager.getCustom()
          .getLookup()
          .ensureIndex(
              new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 1),
              new BasicDBObject(MongoDbManager.sparse_, false));
      try {
        DbManager.getCustom()
            .getLookup()
            .dropIndex(new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 2));
      } catch (Exception ignored) {
      } // (leave this in for a while until all legacy DBs are removed)
      // (historical commented-out sparse-index variants of the jobidS_/waitingOn_ indexes removed;
      //  see the TODO above - the sparse versions go live once the lookup logic is updated)
    } catch (Exception e) {
      e.printStackTrace();
      // Preserve the original exception as the cause (previously only the message survived,
      // which made index-creation failures very hard to debug):
      throw new RuntimeException(e.getMessage(), e);
    }
  } // TESTED (not changed since by-eye test in Beta)
  public void InitializeIndex(
      boolean bDeleteDocs,
      boolean bDeleteEntityFeature,
      boolean bDeleteEventFeature,
      boolean bRebuildDocsIndex) {
    // Creates (and optionally deletes/recreates) the elasticsearch indexes used by the platform:
    // the association ("event") and entity feature indexes, plus the per-community document
    // indexes. Any failure is rethrown as a RuntimeException with the cause preserved.
    //
    // @param bDeleteDocs delete and recreate the document indexes
    // @param bDeleteEntityFeature delete and recreate the entity feature index
    // @param bDeleteEventFeature delete and recreate the association feature index
    // @param bRebuildDocsIndex force the per-community document index rebuild loop
    try { // create elasticsearch indexes

      PropertiesManager pm = new PropertiesManager();

      if (!pm.getAggregationDisabled()) {

        // Single shard / no replicas, with a "suggestAnalyzer" (standard tokenizer + lowercase):
        // (the duplicated tokenizer/filter settings lines from the original have been removed -
        //  they set identical key/value pairs twice)
        Builder localSettingsEvent = ImmutableSettings.settingsBuilder();
        localSettingsEvent.put("number_of_shards", 1).put("number_of_replicas", 0);
        localSettingsEvent.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
        localSettingsEvent.putArray(
            "index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");

        Builder localSettingsGaz = ImmutableSettings.settingsBuilder();
        localSettingsGaz.put("number_of_shards", 1).put("number_of_replicas", 0);
        localSettingsGaz.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
        localSettingsGaz.putArray(
            "index.analysis.analyzer.suggestAnalyzer.filter", "standard", "lowercase");

        // event (association) feature index:
        String eventGazMapping =
            new Gson()
                .toJson(
                    new AssociationFeaturePojoIndexMap.Mapping(),
                    AssociationFeaturePojoIndexMap.Mapping.class);
        ElasticSearchManager eventIndex =
            IndexManager.createIndex(
                AssociationFeaturePojoIndexMap.indexName_,
                null,
                false,
                null,
                eventGazMapping,
                localSettingsEvent);
        if (bDeleteEventFeature) {
          eventIndex.deleteMe();
          eventIndex =
              IndexManager.createIndex(
                  AssociationFeaturePojoIndexMap.indexName_,
                  null,
                  false,
                  null,
                  eventGazMapping,
                  localSettingsEvent);
        }
        // entity feature index:
        String gazMapping =
            new Gson()
                .toJson(
                    new EntityFeaturePojoIndexMap.Mapping(),
                    EntityFeaturePojoIndexMap.Mapping.class);
        ElasticSearchManager entityIndex =
            IndexManager.createIndex(
                EntityFeaturePojoIndexMap.indexName_,
                null,
                false,
                null,
                gazMapping,
                localSettingsGaz);
        if (bDeleteEntityFeature) {
          entityIndex.deleteMe();
          entityIndex =
              IndexManager.createIndex(
                  EntityFeaturePojoIndexMap.indexName_,
                  null,
                  false,
                  null,
                  gazMapping,
                  localSettingsGaz);
        }
      }

      // DOCS - much more complicated than anything else

      boolean bPingMainIndexFailed =
          !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_);
      // (ie if main doc index doesn't exist then always rebuild all indexes)

      if (bPingMainIndexFailed) {
        // extra level of robustness... sleep for a minute then double check the index is
        // really missing...
        try {
          Thread.sleep(60000);
        } catch (InterruptedException e) {
          // Re-assert the interrupt status rather than silently swallowing it, so callers
          // further up the stack can still observe the interruption:
          Thread.currentThread().interrupt();
        }
        bPingMainIndexFailed =
            !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_);
      }
      bRebuildDocsIndex |= bPingMainIndexFailed;

      createCommunityDocIndex(
          DocumentPojoIndexMap.globalDocumentIndex_, null, false, true, bDeleteDocs);
      createCommunityDocIndex(
          DocumentPojoIndexMap.manyGeoDocumentIndex_, null, false, false, bDeleteDocs);

      // Some hardwired dummy communities
      createCommunityDocIndex(
          "4e3706c48d26852237078005", null, true, false, bDeleteDocs); // (admin)
      createCommunityDocIndex(
          "4e3706c48d26852237079004", null, true, false, bDeleteDocs); // (test user)
      // (create dummy index used to keep personal group aliases)

      // OK, going to have different shards for different communities:
      // Get a list of all the communities (dropping the bulky member/attribute fields):
      BasicDBObject query = new BasicDBObject();
      BasicDBObject fieldsToDrop = new BasicDBObject("members", 0);
      fieldsToDrop.put("communityAttributes", 0);
      fieldsToDrop.put("userAttributes", 0);
      DBCursor dbc = DbManager.getSocial().getCommunity().find(query, fieldsToDrop);

      if (bRebuildDocsIndex || bDeleteDocs) {

        List<DBObject> tmparray =
            dbc.toArray(); // (brings the entire thing into memory so don't get cursor timeouts)
        int i = 0;
        System.out.println("Initializing " + dbc.size() + " indexes:");
        // Two passes over the communities: the first (j == 0) handles parents only so that
        // child communities can attach to an already-created parent index on the second pass:
        for (int j = 0; j < 2; ++j) {
          for (DBObject dbotmp : tmparray) {
            if ((++i % 100) == 0) {
              System.out.println("Initialized " + i + " indexes.");
            }
            BasicDBObject dbo = (BasicDBObject) dbotmp;

            // OK, going to see if there are any sources with this group id, create a new index
            // if so:
            // (Don't use CommunityPojo data model here for performance reasons....
            //  (Also, haven't gotten round to porting CommunityPojo field access to using static
            //   fields))
            ObjectId communityId = (ObjectId) dbo.get("_id");
            boolean bPersonalGroup = dbo.getBoolean("isPersonalCommunity", false);
            boolean bSystemGroup = dbo.getBoolean("isSystemCommunity", false);
            ObjectId parentCommunityId = (ObjectId) dbo.get("parentId");

            createCommunityDocIndex(
                communityId.toString(),
                parentCommunityId,
                bPersonalGroup,
                bSystemGroup,
                bDeleteDocs,
                j == 0);
          } // end loop over communities
        } // end loop over communities - first time parents only
      } // (end if need to do big loop over all sources)
    } catch (Exception e) {
      e.printStackTrace();
      // Preserve the original exception as the cause (previously only the message survived):
      throw new RuntimeException(e.getMessage(), e);
    }
  } // TESTED (not changed since by-eye test in Beta - retested after moving code into
  /**
   * Mutates the supplied entity alias collection in place so it holds at most
   * entity_MAXFIELDS entries, removing only aliases that do not contain '/' (ie never the
   * index forms), and returns the removed entries so they can be $pullAll'd from the DB copy.
   *
   * @param entityAliases the feature's entity1/entity2 alias collection (mutated in place)
   * @return the removed aliases (possibly empty) if the collection was over the cap, else null
   */
  private static ArrayList<String> trimEntityAliases(java.util.Collection<String> entityAliases) {
    int nSize = entityAliases.size();
    if (nSize <= AssociationFeaturePojo.entity_MAXFIELDS) {
      return null;
    }
    ArrayList<String> removed =
        new ArrayList<String>(nSize - AssociationFeaturePojo.entity_MAXFIELDS);
    Iterator<String> it = entityAliases.iterator();
    while (it.hasNext() && (nSize > AssociationFeaturePojo.entity_MAXFIELDS)) {
      String ent = it.next();
      if (-1 == ent.indexOf('/')) { // (ie don't remove the index)
        nSize--;
        it.remove(); // (this removes from the index)
        removed.add(ent);
      }
    }
    return removed;
  }

  /**
   * Writes the event (association) feature's sync bookkeeping to the feature DB, trims
   * oversized entity1/entity2 alias sets, and then (re)indexes the feature into elasticsearch.
   */
  public static void synchronizeEventFeature(
      AssociationFeaturePojo eventFeature, ObjectId communityId) {
    DBCollection eventFeatureDb = DbManager.getFeature().getAssociation();

    // NOTE: Important that feeds update occurs before synchronization, since the sync "corrupts"
    // the event

    if (_diagnosticMode
        || (null != eventFeature.getDb_sync_time())
        || (null != eventFeature.getDb_sync_prio())) {
      // Else this is a new feature so don't need to update the feature DB, only the index (if
      // db_sync_prio null then have to update to avoid b/g aggregation loop)
      // (note that db_sync_prio will in practice not be set when this is a new feature because it
      // will have same sync_doccount as doc_count)

      long nCurrTime = System.currentTimeMillis();
      // (query from top of the function, basically lookup on gaz_index)
      BasicDBObject update2 = new BasicDBObject();
      update2.put(AssociationFeaturePojo.db_sync_time_, Long.toString(nCurrTime));
      update2.put(AssociationFeaturePojo.db_sync_doccount_, eventFeature.getDoccount());
      BasicDBObject update = new BasicDBObject(MongoDbManager.set_, update2);
      // (also can be added to below)
      // NOTE(review): this uses EntityFeaturePojo.db_sync_prio_ in an association update -
      // presumably both constants resolve to the same field name; confirm before "fixing".
      BasicDBObject update3 = new BasicDBObject(EntityFeaturePojo.db_sync_prio_, 1);
      update.put(MongoDbManager.unset_, update3);
      BasicDBObject query =
          new BasicDBObject(AssociationFeaturePojo.index_, eventFeature.getIndex());
      query.put(AssociationFeaturePojo.communityId_, communityId);

      // Keep the number of entity1 and entity2 sets down to a reasonable number
      // (In the end would like to be able to do this based on date rather than (essentially)
      // completely randomly)
      BasicDBObject toPull = null;
      if (null != eventFeature.getEntity1()) {
        ArrayList<String> ent1ToRemove = trimEntityAliases(eventFeature.getEntity1());
        if (null != ent1ToRemove) {
          if (null == toPull) toPull = new BasicDBObject();
          toPull.put(AssociationFeaturePojo.entity1_, ent1ToRemove);
          // (this removes from the database)
        }
      }
      if (null != eventFeature.getEntity2()) {
        ArrayList<String> ent2ToRemove = trimEntityAliases(eventFeature.getEntity2());
        if (null != ent2ToRemove) {
          if (null == toPull) toPull = new BasicDBObject();
          toPull.put(AssociationFeaturePojo.entity2_, ent2ToRemove);
          // (this removes from the database)
        }
      }
      if (null != toPull) {
        update.put(MongoDbManager.pullAll_, toPull);
        // (this removes from the database)
      }
      // TESTED (2.1.4.3b, including no index removal clause)

      if (_diagnosticMode) {
        if ((null != eventFeature.getDb_sync_time()) || (null != eventFeature.getDb_sync_prio())) {
          if (_logInDiagnosticMode)
            System.out.println(
                "AssociationAggregationUtils.synchronizeEventFeature, featureDB: "
                    + query.toString()
                    + " / "
                    + update.toString());
        } else {
          if (_logInDiagnosticMode)
            System.out.println(
                "(WOULD NOT RUN) EventAggregationUtils.synchronizeEventFeature, featureDB: "
                    + query.toString()
                    + " / "
                    + update.toString());
        }
      } else {
        eventFeatureDb.update(query, update, false, true);
      }
    }

    if (_diagnosticMode) {
      if (_logInDiagnosticMode)
        System.out.println(
            "AssociationAggregationUtils.synchronizeEventFeature, synchronize: "
                + new StringBuffer(eventFeature.getIndex())
                    .append(':')
                    .append(communityId)
                    .toString()
                + " = "
                + IndexManager.mapToIndex(eventFeature, new AssociationFeaturePojoIndexMap()));
    } else {
      ElasticSearchManager esm = IndexManager.getIndex(AssociationFeaturePojoIndexMap.indexName_);
      esm.addDocument(eventFeature, new AssociationFeaturePojoIndexMap(), null, true);
    }
  } // TESTED
  /**
   * Add events to the elasticsearch index for events and to the mongodb collection so they are
   * searchable for searchsuggest.
   *
   * <p>Step 1.a: try to just update the aliases; Step 1.b: if that fails, create a new entry.
   *
   * <p>Step 2: update totalfreq and doccount.
   *
   * <p>Step 3: after updating totalfreq and doccount, write to ES for every group.
   *
   * @param eventFeatures the association features to update, keyed by index then community id
   */
  public static void updateEventFeatures(
      Map<String, Map<ObjectId, AssociationFeaturePojo>> eventFeatures) {
    // Some diagnostic counters:
    int numCacheMisses = 0;
    int numCacheHits = 0;
    int numNewAssocs = 0;
    long entityAggregationTime = new Date().getTime();

    DBCollection col = DbManager.getFeature().getAssociation();

    // (This fn is normally run for a single community id)
    CommunityFeatureCaches.CommunityFeatureCache currCache = null;

    String savedSyncTime = null;
    for (Map<ObjectId, AssociationFeaturePojo> evtCommunity : eventFeatures.values()) {

      Iterator<Map.Entry<ObjectId, AssociationFeaturePojo>> it = evtCommunity.entrySet().iterator();
      while (it.hasNext()) {
        Map.Entry<ObjectId, AssociationFeaturePojo> evtFeatureKV = it.next();
        try {
          AssociationFeaturePojo evtFeature = evtFeatureKV.getValue();
          long nSavedDocCount = evtFeature.getDoccount();

          ObjectId communityID = evtFeature.getCommunityId();

          if ((null == currCache) || !currCache.getCommunityId().equals(evtFeatureKV.getKey())) {
            currCache = CommunityFeatureCaches.getCommunityFeatureCache(evtFeatureKV.getKey());
            if (_diagnosticMode) {
              if (_logInDiagnosticMode)
                System.out.println(
                    "AssociationAggregationUtils.updateEventFeatures, Opened cache for community: "
                        + evtFeatureKV.getKey());
            }
          } // TESTED (by hand)

          // Is this in our cache? If so can short cut a bunch of the DB interaction:
          AssociationFeaturePojo cachedAssoc = currCache.getCachedAssocFeature(evtFeature);
          if (null != cachedAssoc) {
            if (_incrementalMode) {
              if (_diagnosticMode) {
                if (_logInDiagnosticMode)
                  System.out.println(
                      "AssociationAggregationUtils.updateEventFeatures, skip cached: "
                          + cachedAssoc.toDb());
                // TODO (INF-2825): should be continue-ing here so can use delta more efficiently...
              }
            } else if (_diagnosticMode) {
              if (_logInDiagnosticMode)
                System.out.println(
                    "AssociationAggregationUtils.updateEventFeatures, grabbed cached: "
                        + cachedAssoc.toDb());
            }
            numCacheHits++;
          } // TESTED (by hand)
          else {
            numCacheMisses++;
          }

          // try to update
          BasicDBObject query =
              new BasicDBObject(AssociationFeaturePojo.index_, evtFeature.getIndex());
          query.put(AssociationFeaturePojo.communityId_, communityID);

          // Step1 try to update alias
          // update arrays
          BasicDBObject multiopAliasArrays = new BasicDBObject();
          // Entity1 Alias:
          if (null != evtFeature.getEntity1_index()) {
            evtFeature.addEntity1(evtFeature.getEntity1_index());
          }
          if (null != evtFeature.getEntity1()) {
            if ((null == cachedAssoc)
                || (null == cachedAssoc.getEntity1())
                || !cachedAssoc.getEntity1().containsAll(evtFeature.getEntity1())) {
              BasicDBObject multiopE =
                  new BasicDBObject(MongoDbManager.each_, evtFeature.getEntity1());
              multiopAliasArrays.put(AssociationFeaturePojo.entity1_, multiopE);
            }
          } // TESTED (by hand)

          // Entity2 Alias:
          if (null != evtFeature.getEntity2_index()) {
            evtFeature.addEntity2(evtFeature.getEntity2_index());
          }
          if (null != evtFeature.getEntity2()) {
            if ((null == cachedAssoc)
                || (null == cachedAssoc.getEntity2())
                || !cachedAssoc.getEntity2().containsAll(evtFeature.getEntity2())) {
              BasicDBObject multiopE =
                  new BasicDBObject(MongoDbManager.each_, evtFeature.getEntity2());
              multiopAliasArrays.put(AssociationFeaturePojo.entity2_, multiopE);
            }
          } // TESTED (by hand)

          // verb/verb cat alias:
          if (null != evtFeature.getVerb_category()) {
            evtFeature.addVerb(evtFeature.getVerb_category());
          }
          if (null != evtFeature.getVerb()) {
            if ((null == cachedAssoc)
                || (null == cachedAssoc.getVerb())
                || !cachedAssoc.getVerb().containsAll(evtFeature.getVerb())) {
              BasicDBObject multiopE =
                  new BasicDBObject(MongoDbManager.each_, evtFeature.getVerb());
              multiopAliasArrays.put(AssociationFeaturePojo.verb_, multiopE);
            }
          } // TESTED (by hand)

          // OK - now we can copy across the fields into the cache:
          if (null != cachedAssoc) {
            currCache.updateCachedAssocFeatureStatistics(
                cachedAssoc, evtFeature); // (evtFeature is now fully up to date)
          } // TESTED (by hand)

          BasicDBObject updateOp = new BasicDBObject();
          if (!multiopAliasArrays.isEmpty()) {
            updateOp.put(MongoDbManager.addToSet_, multiopAliasArrays);
          }
          // Document count for this event
          BasicDBObject updateFreqDocCount =
              new BasicDBObject(AssociationFeaturePojo.doccount_, nSavedDocCount);
          updateOp.put(MongoDbManager.inc_, updateFreqDocCount);

          BasicDBObject fields = new BasicDBObject(AssociationFeaturePojo.doccount_, 1);
          fields.put(AssociationFeaturePojo.entity1_, 1);
          fields.put(AssociationFeaturePojo.entity2_, 1);
          fields.put(AssociationFeaturePojo.verb_, 1);
          // (slightly annoying, since only want these if updating dc but won't know
          // until after i've got this object)

          fields.put(AssociationFeaturePojo.db_sync_time_, 1);
          fields.put(AssociationFeaturePojo.db_sync_doccount_, 1);

          DBObject dboUpdate = null;
          if (_diagnosticMode) {
            if (null == cachedAssoc) {
              dboUpdate = col.findOne(query, fields);
            }
          } else {
            if (null != cachedAssoc) {
              col.update(query, updateOp, false, false);
            } else { // Not cached - so have to grab the feature we're either getting or creating
              dboUpdate =
                  col.findAndModify(
                      query, fields, new BasicDBObject(), false, updateOp, false, true);
              // (can use findAndModify because specify index, ie the shard key)
              // (returns event before the changes above, update the feature object below)
              // (also atomically creates the object if it doesn't exist so is "distributed-safe")
            }
          }
          if ((null != cachedAssoc)
              || ((dboUpdate != null) && !dboUpdate.keySet().isEmpty())) // (feature already exists)
          {
            AssociationFeaturePojo egp = cachedAssoc;

            if (null == egp) {
              egp = AssociationFeaturePojo.fromDb(dboUpdate, AssociationFeaturePojo.class);
              evtFeature.setDoccount(egp.getDoccount() + nSavedDocCount);
              evtFeature.setDb_sync_doccount(egp.getDb_sync_doccount());
              evtFeature.setDb_sync_time(egp.getDb_sync_time());
              if (null != egp.getEntity1()) {
                for (String ent : egp.getEntity1()) evtFeature.addEntity1(ent);
              }
              if (null != egp.getEntity2()) {
                for (String ent : egp.getEntity2()) evtFeature.addEntity2(ent);
              }
              if (null != egp.getVerb()) {
                for (String verb : egp.getVerb()) evtFeature.addVerb(verb);
              }
            } // TESTED (cached and non-cached cases)
            // (in the cached case, evtFeature has already been updated by
            // updateCachedAssocFeatureStatistics)

            if (_diagnosticMode) {
              if (_logInDiagnosticMode)
                System.out.println(
                    "AssociationAggregationUtils.updateEventFeatures, found: "
                        + ((BasicDBObject) egp.toDb()).toString());
              if (_logInDiagnosticMode)
                System.out.println(
                    "AssociationAggregationUtils.updateEventFeatures, ^^^ found from query: "
                        + query.toString()
                        + " / "
                        + updateOp.toString());
            }
            // (In background aggregation mode we update db_sync_prio when checking the -otherwise
            // unused, unlike entities- document update schedule)
          } else // (the object in memory is now an accurate representation of the database, minus
                 // some fields we'll now add)
          {
            numNewAssocs++;

            // Synchronization settings for the newly created object
            evtFeature.setDb_sync_doccount(nSavedDocCount);
            if (null == savedSyncTime) {
              savedSyncTime = Long.toString(System.currentTimeMillis());
            }
            evtFeature.setDb_sync_time(savedSyncTime);

            // This is all "distributed safe" (apart from the db_syc_xxx and it doesn't matter if
            // that is
            // out of date, the update will just be slightly out-of-date at worst) since (otherwise)
            // these fields are
            // only set here, and the findAndModify is atomic

            BasicDBObject baseFields = new BasicDBObject();
            if (null != evtFeature.getEntity1_index()) {
              baseFields.put(AssociationFeaturePojo.entity1_index_, evtFeature.getEntity1_index());
            }
            if (null != evtFeature.getEntity2_index()) {
              baseFields.put(AssociationFeaturePojo.entity2_index_, evtFeature.getEntity2_index());
            }
            if (null != evtFeature.getVerb_category()) {
              baseFields.put(AssociationFeaturePojo.verb_category_, evtFeature.getVerb_category());
            }
            baseFields.put(AssociationFeaturePojo.assoc_type_, evtFeature.getAssociation_type());
            baseFields.put(
                AssociationFeaturePojo.db_sync_doccount_, evtFeature.getDb_sync_doccount());
            baseFields.put(AssociationFeaturePojo.db_sync_time_, evtFeature.getDb_sync_time());
            baseFields.put(
                AssociationFeaturePojo.db_sync_prio_,
                1000.0); // (ensures new objects are quickly index-synchronized)

            if (!_diagnosticMode) {
              // Store the object
              col.update(query, new BasicDBObject(MongoDbManager.set_, baseFields));
            } else {
              if (_logInDiagnosticMode)
                System.out.println(
                    "AssociationAggregationUtils.updateEventFeatures, not found: "
                        + query.toString()
                        + " / "
                        + baseFields.toString()
                        + "/ orig_update= "
                        + updateOp.toString());
            }

            // (Note even in background aggregation mode we still perform the feature
            // synchronization
            //  for new entities - and it has to be right at the end because it "corrupts" the
            // objects)

          } // (end if first time seen)

          if (null == cachedAssoc) { // First time we've seen this locally, so add to cache
            currCache.addCachedAssocFeature(evtFeature);
            if (_diagnosticMode) {
              if (_logInDiagnosticMode)
                System.out.println(
                    "AssociationAggregationUtils.updateEventFeatures, added to cache: "
                        + evtFeature.toDb());
            }
          } // TESTED (by hand)
        } catch (Exception e) {
          // Exception, remove from feature list
          it.remove();

          // If an exception occurs log the error
          logger.error("Exception Message: " + e.getMessage(), e);
        }
      } // (end loop over all communities for the set of features sharing and index)
    } // (end loop over indexes)

    if ((numCacheHits > 0) || (numCacheMisses > 0)) { // ie some assocs were grabbed
      int cacheSize = 0;
      if (null != currCache) {
        cacheSize = currCache.getAssocCacheSize();
      }
      StringBuffer logMsg =
          new StringBuffer() // (should append key, but don't have that...)
              .append(" assoc_agg_time_ms=")
              .append(new Date().getTime() - entityAggregationTime)
              .append(" total_assocs=")
              .append(eventFeatures.size())
              .append(" new_assocs=")
              .append(numNewAssocs)
              .append(" cache_misses=")
              .append(numCacheMisses)
              .append(" cache_hits=")
              .append(numCacheHits)
              .append(" cache_size=")
              .append(cacheSize);

      logger.info(logMsg.toString());
    }
  } // TESTED (by eye, reasonably significant changes, but still based on proven Beta code)
// Exemplo n.º 8 (0)
  /**
   * Updates (or creates) the feature entries for the entities that were just extracted: increments
   * total frequency and document count, and merges in any new aliases / semantic link data.
   *
   * <p>Per (index, community) feature this does: 1. Build a single Mongo update combining $addToSet
   * (aliases, linkdata) and $inc (totalfreq, doccount). 2. Apply it atomically via findAndModify,
   * which also creates the document if absent ("distributed-safe" upsert). 3. If the feature
   * already existed, fold the DB's previous totals/aliases back into the in-memory pojo; otherwise
   * $set the one-time base fields (dimension, type, geotag, sync bookkeeping) on the new document.
   *
   * <p>In _diagnosticMode no writes are performed: the lookup is a read-only findOne and the
   * would-be operations are printed instead.
   *
   * <p>Features that throw during processing are removed from the input map (via Iterator.remove)
   * and logged, so callers see only the successfully-updated features afterwards.
   *
   * @param entFeatures map of entity index -> (community id -> feature pojo) to update in the
   *     entity feature collection; mutated in place (counts/aliases merged, failures removed)
   */
  public static void updateEntityFeatures(
      Map<String, Map<ObjectId, EntityFeaturePojo>> entFeatures) {
    DBCollection col = DbManager.getFeature().getEntity();
    // Single timestamp shared by every feature created in this call (set lazily on first creation)
    String savedSyncTime = null;
    for (Map<ObjectId, EntityFeaturePojo> entCommunity : entFeatures.values()) {

      // Explicit iterator (not for-each) so the catch block can remove failing entries in place
      Iterator<Map.Entry<ObjectId, EntityFeaturePojo>> it = entCommunity.entrySet().iterator();
      while (it.hasNext()) {
        Map.Entry<ObjectId, EntityFeaturePojo> entFeatureKV = it.next();
        try {
          EntityFeaturePojo entFeature = entFeatureKV.getValue();

          // Snapshot this batch's deltas before the pojo is overwritten with DB-merged totals below
          long nSavedDocCount = entFeature.getDoccount();
          long nSavedFreqCount = entFeature.getTotalfreq();
          // (these should be constant across all communities but keep it here
          //  so can assign it using entFeature, it's v cheap so no need to get once like for sync
          // vars)

          ObjectId communityID = entFeature.getCommunityId();
          if (null != communityID) {
            // For each community, see if the entity feature already exists *for that community*
            // (index + communityId together identify one feature document)

            BasicDBObject query =
                new BasicDBObject(EntityFeaturePojo.index_, entFeature.getIndex());
            query.put(EntityFeaturePojo.communityId_, communityID);
            BasicDBObject updateOp = new BasicDBObject();
            // Add aliases ($addToSet with $each so the whole list is merged without duplicates):
            BasicDBObject updateOpA = new BasicDBObject();
            BasicDBObject multiopE = new BasicDBObject(MongoDbManager.each_, entFeature.getAlias());
            updateOpA.put(EntityFeaturePojo.alias_, multiopE);
            // Add link data, if there is any:
            if ((null != entFeature.getSemanticLinks())
                && !entFeature.getSemanticLinks().isEmpty()) {
              BasicDBObject multiopF =
                  new BasicDBObject(MongoDbManager.each_, entFeature.getSemanticLinks());
              updateOpA.put(EntityFeaturePojo.linkdata_, multiopF);
            }
            updateOp.put(MongoDbManager.addToSet_, updateOpA);
            // Update frequency ($inc both counters in the same atomic op):
            BasicDBObject updateOpB = new BasicDBObject();
            updateOpB.put(EntityFeaturePojo.totalfreq_, nSavedFreqCount);
            updateOpB.put(EntityFeaturePojo.doccount_, nSavedDocCount);
            updateOp.put(MongoDbManager.inc_, updateOpB);

            // Projection for find/modify: just the fields needed to merge DB state back into the
            // pojo and to decide whether this is a brand-new feature
            BasicDBObject fields = new BasicDBObject(EntityFeaturePojo.totalfreq_, 1);
            fields.put(EntityFeaturePojo.doccount_, 1);
            fields.put(EntityFeaturePojo.alias_, 1);
            fields.put(EntityFeaturePojo.linkdata_, 1);
            // (slightly annoying, since only want these 2 largish fields if updating freq but won't
            // know
            // until after i've got this object)
            fields.put(EntityFeaturePojo.db_sync_time_, 1);
            fields.put(EntityFeaturePojo.db_sync_doccount_, 1);

            DBObject dboUpdate = null;
            if (_diagnosticMode) {
              // Diagnostic mode: read-only lookup, no modification
              dboUpdate = col.findOne(query, fields);
            } else {
              dboUpdate =
                  col.findAndModify(
                      query, fields, new BasicDBObject(), false, updateOp, false, true);
              // (can use findAndModify because specify index, ie the shard key)
              // (returns entity before the changes above, update the feature object below)
              // (also atomically creates the object if it doesn't exist so is "distributed-safe")
            }
            if ((dboUpdate != null) && !dboUpdate.keySet().isEmpty()) {
              // Feature already existed: merge the DB's pre-update state into the in-memory pojo
              // (Update the entity feature to be correct so that it can be accurately synchronized
              // with the index)
              EntityFeaturePojo gp = EntityFeaturePojo.fromDb(dboUpdate, EntityFeaturePojo.class);
              // New in-memory totals = DB's previous totals + this batch's deltas
              entFeature.setTotalfreq(gp.getTotalfreq() + nSavedFreqCount);
              entFeature.setDoccount(gp.getDoccount() + nSavedDocCount);
              entFeature.setDbSyncDoccount(gp.getDbSyncDoccount());
              entFeature.setDbSyncTime(gp.getDbSyncTime());
              if (null != gp.getAlias()) {
                entFeature.addAllAlias(gp.getAlias());
              }
              if (null != gp.getSemanticLinks()) {
                entFeature.addToSemanticLinks(gp.getSemanticLinks());
              }
              if (_diagnosticMode) {
                System.out.println(
                    "EntityAggregationUtils.updateEntityFeatures, found: "
                        + ((BasicDBObject) gp.toDb()).toString());
                System.out.println(
                    "EntityAggregationUtils.updateEntityFeatures, ^^^ found from query: "
                        + query.toString()
                        + " / "
                        + updateOp.toString());
              }
            } else // (the object in memory is now an accurate representation of the database, minus
                   // some fields we'll now add)
            {
              // Synchronization settings for the newly created object
              if (null == savedSyncTime) {
                savedSyncTime = Long.toString(System.currentTimeMillis());
              }
              entFeature.setDbSyncDoccount(nSavedDocCount);
              entFeature.setDbSyncTime(savedSyncTime);

              // This is all "distributed safe" (apart from the db_syc_xxx and it doesn't matter if
              // that is
              // out of date, the update will just be slightly out-of-date at worst) since
              // (otherwise) these fields are
              // only set here, and the findAndModify is atomic

              // $set the one-time fields the upsert above couldn't populate
              // (Do in raw MongoDB for performance)
              BasicDBObject baseFields = new BasicDBObject();
              baseFields.put(EntityFeaturePojo.dimension_, entFeature.getDimension().toString());
              baseFields.put(EntityFeaturePojo.type_, entFeature.getType());
              baseFields.put(
                  EntityFeaturePojo.disambiguated_name_, entFeature.getDisambiguatedName());
              baseFields.put(EntityFeaturePojo.db_sync_doccount_, entFeature.getDbSyncDoccount());
              baseFields.put(EntityFeaturePojo.db_sync_time_, entFeature.getDbSyncTime());
              if ((null != entFeature.getSemanticLinks())
                  && !entFeature.getSemanticLinks().isEmpty()) {
                baseFields.put(EntityFeaturePojo.linkdata_, entFeature.getSemanticLinks());
              }

              // attempt to add geotag (makes necessary checks on util side)
              // also add ontology type if geotag is found
              EntityGeotagAggregationUtils.addEntityGeo(entFeature);
              if (entFeature.getGeotag() != null) {
                BasicDBObject geo = new BasicDBObject(GeoPojo.lat_, entFeature.getGeotag().lat);
                geo.put(GeoPojo.lon_, entFeature.getGeotag().lon);
                baseFields.put(EntityFeaturePojo.geotag_, geo);

                if (entFeature.getOntology_type() != null) {
                  baseFields.put(EntityFeaturePojo.ontology_type_, entFeature.getOntology_type());
                }
              }

              if (!_diagnosticMode) {
                // Store the object
                col.update(query, new BasicDBObject(MongoDbManager.set_, baseFields));
              } else {
                System.out.println(
                    "EntityAggregationUtils.updateEntityFeatures, not found: "
                        + query.toString()
                        + ": "
                        + baseFields.toString());
              }
              entFeature.setDbSyncTime(null); // (ensures that index re-sync will occur)
            }
          }
        } catch (Exception e) {
          // Exception, remove from feature list so callers don't act on a half-updated feature
          it.remove();

          // If an exception occurs log the error
          logger.error("Exception Message: " + e.getMessage(), e);
        }
      } // (end loop over communities)
    } // (end loop over indexes)
  } // TESTED (just by eye - made few changes during re-factoring)