Ejemplo n.º 1
0
  /**
   * Translates the aggregation section of a query into facets/aggregations on the search request.
   *
   * <p>Depending on which fields of {@code aggregation} are set this adds, in order: a "geo" terms
   * facet, a "time" date-histogram facet, the "moments" aggregations and per-entity facets, the
   * "events" and "facts" terms facets over the association index, and the "sourceTags" /
   * "sourceTypes" / "sourceKeys" terms facets. Nothing is added when {@code aggregation} is null.
   *
   * <p>Side effect: {@code aggregation.timesInterval} and {@code aggregation.moments.timesInterval}
   * are rewritten in place (any value containing "m" becomes "month"; a missing moments interval
   * is taken from the top-level interval, or defaults to "m" before that rewrite).
   *
   * @param aggregation the requested aggregations; may be null (method is then a no-op)
   * @param aliasLookup optional alias table consulted for each "moments" entity; may be null
   * @param geoLowAccuracy when true, the "moments" geo aggregation is not added
   * @param entTypeFilterStrings optional entity-type filter for events/facts; a leading '-' on the
   *     first element selects negative filtering; null or empty means "no filter"
   * @param assocVerbFilterStrings optional verb filter for events/facts, same '-' convention; null
   *     or empty means "no filter"
   * @param searchSettings the request builder that receives the facets and aggregations
   * @param parentFilterObj optional filter applied to every facet (it is not applied to the
   *     "moments" aggregations); may be null
   * @param communityIdStrs passed to {@code validateAssociationMapping} before the association
   *     moments aggregation is added
   */
  public static void parseOutputAggregation(
      AdvancedQueryPojo.QueryOutputPojo.AggregationOutputPojo aggregation,
      AliasLookupTable aliasLookup,
      boolean geoLowAccuracy,
      String[] entTypeFilterStrings,
      String[] assocVerbFilterStrings,
      SearchRequestBuilder searchSettings,
      BoolFilterBuilder parentFilterObj,
      String[] communityIdStrs) {
    // Every section below requires a non-null aggregation, so check it once up front
    if (null == aggregation) {
      return;
    }

    // 1.] Go through aggregation list

    // 1.1] Apply "simple specifications" if necessary

    // Geo

    if ((null != aggregation.geoNumReturn) && (aggregation.geoNumReturn > 0)) {
      CrossVersionFacetBuilder.TermsFacetBuilder fb =
          CrossVersionFacetBuilders.termsFacet("geo")
              .field(DocumentPojo.locs_)
              .size(aggregation.geoNumReturn);
      // Gross raw handling for facets
      if (null != parentFilterObj) {
        fb = fb.facetFilter(parentFilterObj);
      }
      searchSettings.addFacet(fb);
    }

    // Temporal

    if (null != aggregation.timesInterval) {
      // Anything containing "m" is normalized to a monthly interval
      if (aggregation.timesInterval.contains("m")) {
        aggregation.timesInterval = "month";
      }
      CrossVersionFacetBuilder.DateHistogramFacetBuilder fb =
          CrossVersionFacetBuilders.dateHistogramFacet("time")
              .field(DocumentPojo.publishedDate_)
              .interval(aggregation.timesInterval);
      // Gross raw handling for facets
      if (null != parentFilterObj) {
        fb = fb.facetFilter(parentFilterObj);
      }
      searchSettings.addFacet(fb);

      // TODO (INF-2688): if using certain types of moments then don't want this?
    }

    // Temporal Moments

    if (null != aggregation.moments) {
      // Default the moments interval from the top-level one, falling back to "m"
      if (null == aggregation.moments.timesInterval) {
        if (null != aggregation.timesInterval) {
          aggregation.moments.timesInterval = aggregation.timesInterval;
        } else {
          aggregation.moments.timesInterval = "m";
        }
      }
      if (aggregation.moments.timesInterval.contains("m")) {
        aggregation.moments.timesInterval = "month";
      }

      // TODO (INF-2688): Other cross filter type things
      if (!geoLowAccuracy
          && (null != aggregation.moments.geoNumReturn)
          && (aggregation.moments.geoNumReturn > 0)) {
        // Date histogram with a per-bucket terms sub-aggregation over locations
        DateHistogramBuilder timeAgg =
            AggregationBuilders.dateHistogram("moments")
                .field(DocumentPojo.publishedDate_)
                .interval(new Interval(aggregation.moments.timesInterval));
        TermsBuilder geoAgg =
            AggregationBuilders.terms("geo")
                .field(DocumentPojo.locs_)
                .size(aggregation.moments.geoNumReturn);
        timeAgg.subAggregation(geoAgg);
        searchSettings.addAggregation(timeAgg);
      }

      // TODO (CORE-89)
      // (note: unlike the other counts, zero is accepted here)
      if (null != aggregation.moments.associationsNumReturn
          && aggregation.moments.associationsNumReturn >= 0) {
        // TODO need to check if indexes mapping use doc.associations.assoc_index == docValue
        // fail out or don't include those communities if they don't
        if (validateAssociationMapping(communityIdStrs)) {
          DateHistogramBuilder assocTimeAgg =
              AggregationBuilders.dateHistogram("moments.assoc")
                  .field(DocumentPojo.publishedDate_)
                  .interval(new Interval(aggregation.moments.timesInterval));
          TermsBuilder assocAgg =
              AggregationBuilders.terms("assoc")
                  .field(AssociationPojo.assoc_index_)
                  .size(aggregation.moments.associationsNumReturn);
          NestedBuilder nested =
              AggregationBuilders.nested("moments.assoc.nested")
                  .path(DocumentPojo.associations_)
                  .subAggregation(assocAgg);
          assocTimeAgg.subAggregation(nested);
          searchSettings.addAggregation(assocTimeAgg);
        }
      }

      if (null != aggregation.moments.entityList) {
        // One date-histogram facet per requested entity, filtered to documents mentioning it
        for (String entIndex : aggregation.moments.entityList) {

          CrossVersionFacetBuilder.DateHistogramFacetBuilder fb =
              CrossVersionFacetBuilders.dateHistogramFacet("moments." + entIndex)
                  .field(DocumentPojo.publishedDate_)
                  .interval(aggregation.moments.timesInterval);

          EntityFeaturePojo alias = null;
          if (null != aliasLookup) {
            alias = aliasLookup.getAliases(entIndex);
          }
          if (null == alias) { // no alias: match on the entity index only
            fb =
                fb.facetFilter(
                    FilterBuilders.nestedFilter(
                        DocumentPojo.entities_,
                        FilterBuilders.termFilter(EntityPojo.index_, entIndex)));
          } else {
            // Semantic links are matched as phrases against the document full text
            QueryFilterBuilder qfb = null;
            if ((null != alias.getSemanticLinks()) && !alias.getSemanticLinks().isEmpty()) {
              BoolQueryBuilder qb = QueryBuilders.boolQuery();
              for (String textAlias : alias.getSemanticLinks()) {
                qb =
                    qb.should(
                        CrossVersionQueryBuilders.matchPhraseQuery(
                            DocumentPojo.fullText_, textAlias));
              }
              qfb = FilterBuilders.queryFilter(qb);
            }
            if (!alias.getAlias().isEmpty()) {
              NestedFilterBuilder nfb =
                  FilterBuilders.nestedFilter(
                      DocumentPojo.entities_,
                      FilterBuilders.termsFilter(EntityPojo.index_, entIndex, alias.getAlias()));
              if (null == qfb) {
                fb = fb.facetFilter(nfb);
              } else {
                // Either the entity/alias terms or one of the semantic links may match
                BoolFilterBuilder bfb = FilterBuilders.boolFilter().should(nfb).should(qfb);
                fb = fb.facetFilter(bfb);
              }
            } else if (null != qfb) {
              fb = fb.facetFilter(qfb);
            }
          }

          // Gross raw handling for facets
          // NOTE(review): presumably facetFilter combines with the entity filter set above rather
          // than replacing it - confirm against CrossVersionFacetBuilder
          if (null != parentFilterObj) {
            fb = fb.facetFilter(parentFilterObj);
          }
          searchSettings.addFacet(fb);
        }
      } // (end list over entities)
    }

    // Entities - due to problems with significance, handled on a document by document basis, see
    // Significance helper class

    // Associations (Events/Facts)

    boolean eventsRequested =
        (null != aggregation.eventsNumReturn) && (aggregation.eventsNumReturn > 0);
    boolean factsRequested =
        (null != aggregation.factsNumReturn) && (aggregation.factsNumReturn > 0);

    // Entity type / verb category filters, shared by the events and facts facets
    StringBuilder entTypeRegex = null;
    StringBuilder verbCatRegex = null;
    if (eventsRequested || factsRequested) {
      // (entity types are matched non-greedily after a '/', because of the 2nd instance of the
      // entity type later in the association index)
      entTypeRegex = buildAssocFilterRegex(entTypeFilterStrings, "", ".*?/", true);
      verbCatRegex = buildAssocFilterRegex(assocVerbFilterStrings, "\\|", "", false);
    }

    if (eventsRequested) {
      CrossVersionFacetBuilder.TermsFacetBuilder fb =
          CrossVersionFacetBuilders.termsFacet("events")
              .field(AssociationPojo.assoc_index_)
              .size(aggregation.eventsNumReturn)
              .nested(DocumentPojo.associations_);
      fb.regex(buildAssocIndexRegex("Event", entTypeRegex, verbCatRegex));

      // Gross raw handling for facets
      if (null != parentFilterObj) {
        fb = fb.facetFilter(parentFilterObj);
      }
      searchSettings.addFacet(fb);
    }
    if (factsRequested) {
      CrossVersionFacetBuilder.TermsFacetBuilder fb =
          CrossVersionFacetBuilders.termsFacet("facts")
              .field(AssociationPojo.assoc_index_)
              .size(aggregation.factsNumReturn)
              .nested(DocumentPojo.associations_);
      fb.regex(buildAssocIndexRegex("Fact", entTypeRegex, verbCatRegex));

      // Gross raw handling for facets
      if (null != parentFilterObj) {
        fb = fb.facetFilter(parentFilterObj);
      }
      searchSettings.addFacet(fb);
    }

    // Source management/monitoring

    if ((null != aggregation.sourceMetadata) && (aggregation.sourceMetadata > 0)) {
      CrossVersionFacetBuilder.TermsFacetBuilder fb =
          CrossVersionFacetBuilders.termsFacet("sourceTags")
              .field(DocumentPojo.tags_)
              .size(aggregation.sourceMetadata);
      CrossVersionFacetBuilder.TermsFacetBuilder fb1 =
          CrossVersionFacetBuilders.termsFacet("sourceTypes")
              .field(DocumentPojo.mediaType_)
              .size(aggregation.sourceMetadata);
      // Gross raw handling for facets (only ever applied when a parent filter exists, as for
      // every other facet in this method)
      if (null != parentFilterObj) {
        fb = fb.facetFilter(parentFilterObj);
        fb1 = fb1.facetFilter(parentFilterObj);
      }
      searchSettings.addFacet(fb);
      searchSettings.addFacet(fb1);
    }

    if ((null != aggregation.sources) && (aggregation.sources > 0)) {
      CrossVersionFacetBuilder.TermsFacetBuilder fb =
          CrossVersionFacetBuilders.termsFacet("sourceKeys")
              .field(DocumentPojo.sourceKey_)
              .size(aggregation.sources);
      // Gross raw handling for facets
      if (null != parentFilterObj) {
        fb = fb.facetFilter(parentFilterObj);
      }
      searchSettings.addFacet(fb);
    }
  }

  /**
   * Builds the regex fragment for one association filter list (entity types or verb categories).
   *
   * <p>The result is {@code groupPrefix} followed by a non-capturing group of alternatives, or,
   * when the first filter string starts with '-', a negative lookahead over the alternatives
   * followed by {@code [^|]*} to consume the actual field. In negative mode a leading '-' is
   * stripped from each filter string. Any '|' inside a filter string is replaced with "%7C", and
   * every alternative is quoted with {@link Pattern#quote(String)}.
   *
   * @param filterStrings the filter strings; null or empty yields null
   * @param groupPrefix literal regex text emitted before the group
   * @param termPrefix literal regex text emitted before each quoted alternative
   * @param lowerCaseTerms whether each alternative is lower-cased before quoting
   * @return the regex fragment, or null when there is nothing to filter on
   */
  private static StringBuilder buildAssocFilterRegex(
      String[] filterStrings, String groupPrefix, String termPrefix, boolean lowerCaseTerms) {
    if ((null == filterStrings) || (0 == filterStrings.length)) {
      return null;
    }
    // startsWith (rather than charAt(0)) so an empty filter string cannot throw
    boolean negative = filterStrings[0].startsWith("-");
    StringBuilder regex = new StringBuilder(groupPrefix).append(negative ? "(?!" : "(?:");
    for (String filterString : filterStrings) {
      String term = filterString;
      if (negative && term.startsWith("-")) {
        term = term.substring(1);
      }
      term = term.replace("|", "%7C");
      if (lowerCaseTerms) {
        term = term.toLowerCase();
      }
      regex.append(termPrefix).append(Pattern.quote(term)).append('|');
    }
    regex.setLength(regex.length() - 1); // (remove trailing |)
    regex.append(")");
    if (negative) {
      regex.append("[^|]*"); // (now the actual field, since the lookahead consumed nothing)
    }
    return regex;
  }

  /**
   * Assembles the full association-index regex for one association type.
   *
   * <p>The pattern has the shape {@code ^<type>|<entity1>|<verb>|<entity2>...}, where each of the
   * entity and verb positions is either the supplied filter fragment or a wildcard.
   *
   * @param assocType the literal association type that starts the index ("Event" or "Fact")
   * @param entTypeRegex entity-type fragment applied to both entity positions, or null
   * @param verbCatRegex verb fragment (including its leading separator), or null
   * @return the regex to apply to the association index terms
   */
  private static String buildAssocIndexRegex(
      String assocType, StringBuilder entTypeRegex, StringBuilder verbCatRegex) {
    StringBuilder regex = new StringBuilder("^").append(assocType).append("\\|");
    if (null != entTypeRegex) {
      regex.append(entTypeRegex);
    } else {
      regex.append("[^|]*");
    }
    if (null != verbCatRegex) {
      regex.append(verbCatRegex);
    } else if (null != entTypeRegex) {
      regex.append("\\|[^|]*");
    } else {
      regex.append(".*");
    }
    if (null != entTypeRegex) {
      regex.append("\\|").append(entTypeRegex);
      regex.append(".*");
    } else {
      regex.append("\\|.*");
    }
    return regex.toString();
  }
Ejemplo n.º 2
0
  /**
   * Pushes the supplied entity features to the entity feature collection and folds the stored
   * state back into the in-memory objects.
   *
   * <p>For every feature that has a community id:
   *
   * <ol>
   *   <li>An atomic find-and-modify (upsert) adds the feature's aliases and semantic links to the
   *       stored sets and increments the stored total frequency and document count. In diagnostic
   *       mode a plain read is performed instead and nothing is written.
   *   <li>If a stored object was returned, the in-memory feature is updated with the combined
   *       counts, the stored sync fields, and the stored aliases/links.
   *   <li>Otherwise the feature is treated as newly created: its static fields (dimension, type,
   *       disambiguated name, sync fields, links, optional geotag and ontology type) are written
   *       with a follow-up update, and its sync time is cleared in memory.
   * </ol>
   *
   * <p>Features without a community id are skipped. If processing a feature throws, the feature
   * is removed from its map and the exception is logged.
   *
   * @param entFeatures the features to update, as a map of maps keyed by {@code ObjectId}
   *     (presumably entity index, then community id - confirm against the caller); modified in
   *     place
   */
  public static void updateEntityFeatures(
      Map<String, Map<ObjectId, EntityFeaturePojo>> entFeatures) {
    DBCollection entityCollection = DbManager.getFeature().getEntity();
    // Stamped lazily the first time an object is created, then reused for the rest of the call
    String newObjectSyncTime = null;

    for (Map<ObjectId, EntityFeaturePojo> perCommunity : entFeatures.values()) {
      for (Iterator<Map.Entry<ObjectId, EntityFeaturePojo>> entryIt =
              perCommunity.entrySet().iterator();
          entryIt.hasNext(); ) {
        Map.Entry<ObjectId, EntityFeaturePojo> entry = entryIt.next();
        try {
          EntityFeaturePojo feature = entry.getValue();

          // The counts currently held in memory are the increments to apply to the database
          long docCountDelta = feature.getDoccount();
          long totalFreqDelta = feature.getTotalfreq();

          ObjectId communityId = feature.getCommunityId();
          if (null == communityId) {
            continue; // nothing to do without a community
          }

          // Look the feature up *for this community*
          BasicDBObject selector = new BasicDBObject(EntityFeaturePojo.index_, feature.getIndex());
          selector.put(EntityFeaturePojo.communityId_, communityId);

          // $addToSet: aliases, plus semantic links when there are any
          BasicDBObject addToSetDoc = new BasicDBObject();
          addToSetDoc.put(
              EntityFeaturePojo.alias_, new BasicDBObject(MongoDbManager.each_, feature.getAlias()));
          if ((null != feature.getSemanticLinks()) && !feature.getSemanticLinks().isEmpty()) {
            addToSetDoc.put(
                EntityFeaturePojo.linkdata_,
                new BasicDBObject(MongoDbManager.each_, feature.getSemanticLinks()));
          }

          // $inc: frequency and document count
          BasicDBObject incrementDoc = new BasicDBObject();
          incrementDoc.put(EntityFeaturePojo.totalfreq_, totalFreqDelta);
          incrementDoc.put(EntityFeaturePojo.doccount_, docCountDelta);

          BasicDBObject modification = new BasicDBObject();
          modification.put(MongoDbManager.addToSet_, addToSetDoc);
          modification.put(MongoDbManager.inc_, incrementDoc);

          // Fields wanted back from the stored object
          // (alias/linkdata are largish and only needed when the object already existed, but
          // that isn't known until the object has been fetched)
          BasicDBObject projection = new BasicDBObject(EntityFeaturePojo.totalfreq_, 1);
          projection.put(EntityFeaturePojo.doccount_, 1);
          projection.put(EntityFeaturePojo.alias_, 1);
          projection.put(EntityFeaturePojo.linkdata_, 1);
          projection.put(EntityFeaturePojo.db_sync_time_, 1);
          projection.put(EntityFeaturePojo.db_sync_doccount_, 1);

          // Diagnostic mode only reads. Otherwise findAndModify is used:
          // (can use findAndModify because specify index, ie the shard key)
          // (returns entity before the changes above, update the feature object below)
          // (also atomically creates the object if it doesn't exist so is "distributed-safe")
          DBObject previous =
              _diagnosticMode
                  ? entityCollection.findOne(selector, projection)
                  : entityCollection.findAndModify(
                      selector, projection, new BasicDBObject(), false, modification, false, true);

          boolean createdNow = (null == previous) || previous.keySet().isEmpty();
          if (createdNow) {
            // (the object in memory is now an accurate representation of the database, minus
            // some fields we'll now add)

            // Synchronization settings for the newly created object
            if (null == newObjectSyncTime) {
              newObjectSyncTime = Long.toString(System.currentTimeMillis());
            }
            feature.setDbSyncDoccount(docCountDelta);
            feature.setDbSyncTime(newObjectSyncTime);

            // This is all "distributed safe" (apart from the db_sync_xxx fields, and it doesn't
            // matter if those are out of date, the update will just be slightly out-of-date at
            // worst) since (otherwise) these fields are only set here, and the findAndModify is
            // atomic

            // (Do in raw MongoDB for performance)
            BasicDBObject staticFields = new BasicDBObject();
            staticFields.put(EntityFeaturePojo.dimension_, feature.getDimension().toString());
            staticFields.put(EntityFeaturePojo.type_, feature.getType());
            staticFields.put(EntityFeaturePojo.disambiguated_name_, feature.getDisambiguatedName());
            staticFields.put(EntityFeaturePojo.db_sync_doccount_, feature.getDbSyncDoccount());
            staticFields.put(EntityFeaturePojo.db_sync_time_, feature.getDbSyncTime());
            if ((null != feature.getSemanticLinks()) && !feature.getSemanticLinks().isEmpty()) {
              staticFields.put(EntityFeaturePojo.linkdata_, feature.getSemanticLinks());
            }

            // attempt to add geotag (makes necessary checks on util side)
            // also add ontology type if geotag is found
            EntityGeotagAggregationUtils.addEntityGeo(feature);
            if (feature.getGeotag() != null) {
              BasicDBObject geoDoc = new BasicDBObject(GeoPojo.lat_, feature.getGeotag().lat);
              geoDoc.put(GeoPojo.lon_, feature.getGeotag().lon);
              staticFields.put(EntityFeaturePojo.geotag_, geoDoc);

              if (feature.getOntology_type() != null) {
                staticFields.put(EntityFeaturePojo.ontology_type_, feature.getOntology_type());
              }
            }

            if (_diagnosticMode) {
              System.out.println(
                  "EntityAggregationUtils.updateEntityFeatures, not found: "
                      + selector.toString()
                      + ": "
                      + staticFields.toString());
            } else {
              // Store the object
              entityCollection.update(selector, new BasicDBObject(MongoDbManager.set_, staticFields));
            }
            feature.setDbSyncTime(null); // (ensures that index re-sync will occur)
          } else {
            // (Update the entity feature to be correct so that it can be accurately synchronized
            // with the index)
            EntityFeaturePojo stored = EntityFeaturePojo.fromDb(previous, EntityFeaturePojo.class);
            feature.setTotalfreq(stored.getTotalfreq() + totalFreqDelta);
            feature.setDoccount(stored.getDoccount() + docCountDelta);
            feature.setDbSyncDoccount(stored.getDbSyncDoccount());
            feature.setDbSyncTime(stored.getDbSyncTime());
            if (null != stored.getAlias()) {
              feature.addAllAlias(stored.getAlias());
            }
            if (null != stored.getSemanticLinks()) {
              feature.addToSemanticLinks(stored.getSemanticLinks());
            }
            if (_diagnosticMode) {
              System.out.println(
                  "EntityAggregationUtils.updateEntityFeatures, found: "
                      + ((BasicDBObject) stored.toDb()).toString());
              System.out.println(
                  "EntityAggregationUtils.updateEntityFeatures, ^^^ found from query: "
                      + selector.toString()
                      + " / "
                      + modification.toString());
            }
          }
        } catch (Exception e) {
          // Drop the failed feature from the list, then record what went wrong
          entryIt.remove();
          logger.error("Exception Message: " + e.getMessage(), e);
        }
      } // (end loop over communities)
    } // (end loop over indexes)
  }