Exemplo n.º 1
0
  private static List<BasicDBObject> parseEventAggregationOutput(
      String sEventOrFact,
      TermsFacet facet,
      ScoringUtils scoreStats,
      AliasLookupTable aliasLookup,
      String[] entityTypeFilterStrings,
      String[] assocVerbFilterStrings) {
    ArrayList<BasicDBObject> facetList = new ArrayList<BasicDBObject>(facet.getEntries().size());

    // (These 2 might be needed if we alias and there are filter strings specified)
    HashSet<String> entTypeFilter = null;
    // TEST CASES:
    //		String term1 = "mark kelly/person|family relation|gabrielle giffords/person|";
    //		String term2 = "|family relation|gabrielle giffords/person|";
    //		String term3 = "mark kelly/person||gabrielle giffords/person|";
    //		String term4 = "mark kelly/person|family relation||";
    //		String term5 = "mark kelly/person|family relation|gabrielle giffords/person|loca,tion/city";
    //		List<String> terms = Arrays.asList(term1, term2, term3, term4, term5);

    @SuppressWarnings("unused")
    int nFacetEl = 0; // (this will get used later)

    for (TermsFacet.Entry facetEl : facet.getEntries()) {
      // DEBUG
      // System.out.println("TERM= " + FacetUtils.getTerm(facetEl));

      String term =
          FacetUtils.getTerm(facetEl)
              .substring(sEventOrFact.length() + 1); // (step over "Fact|" or "Event|"
      // TEST CASES:
      //			if (nFacetEl < terms.size()) {
      //				term = terms.get(nFacetEl);
      //			}

      // Parse the string
      Matcher m = eventIndexParser.matcher(term);
      if (m.matches()) {
        BasicDBObject json = new BasicDBObject();
        json.put(AssociationPojo.assoc_type_, sEventOrFact);
        String sEnt1_index = m.group(1);
        if (null != sEnt1_index) {
          sEnt1_index = sEnt1_index.replaceAll("%7C", "|");
        }
        String sVerbCat = m.group(2);
        if (null != sVerbCat)
          json.put(AssociationPojo.verb_category_, sVerbCat.replaceAll("%7C", "|"));
        String sEnt2_index = m.group(3);
        if (null != sEnt2_index) {
          sEnt2_index = sEnt2_index.replaceAll("%7C", "|");
        }
        String sGeoIndex = m.group(4);
        if (null != sGeoIndex) {
          sGeoIndex = sGeoIndex.replaceAll("%7C", "|");
        }
        json.put(AssociationPojo.doccount_, facetEl.getCount());

        // Add significance if possible
        if ((null == scoreStats)
            || !scoreStats.calcAssocationSignificance(sEnt1_index, sEnt2_index, sGeoIndex, json)) {
          // These fields are optional:
          // json.put("entity1_sig", 0.0);
          // json.put("entity2_sig", 0.0);
          // json.put("geo_sig", 0.0);
          // Mandatory:
          json.put(AssociationPojo.assoc_sig_, 0.0);
        }

        boolean bTransformedByAlias = false; // when true need to re-check vs entity type filter

        // Now write the last few values (adjusted for aliases if necessary) into the JSON object
        if (null != sEnt1_index) {
          if (null != aliasLookup) {
            EntityFeaturePojo alias = aliasLookup.getAliasMaster(sEnt1_index);
            if (null != alias) {
              sEnt1_index = alias.getIndex();
              if (sEnt1_index.equalsIgnoreCase("discard")) {
                continue;
              } // TESTED
              bTransformedByAlias = true;
            }
          }
          json.put(AssociationPojo.entity1_index_, sEnt1_index);
        }
        if (null != sEnt2_index) {
          if (null != aliasLookup) {
            EntityFeaturePojo alias = aliasLookup.getAliasMaster(sEnt2_index);
            if (null != alias) {
              sEnt2_index = alias.getIndex();
              if (sEnt2_index.equalsIgnoreCase("discard")) {
                continue;
              } // TESTED (cut and paste of ent index1)
              bTransformedByAlias = true;
            }
          }
          json.put(AssociationPojo.entity2_index_, sEnt2_index);
        }
        if (null != sGeoIndex) {
          if (null != aliasLookup) {
            EntityFeaturePojo alias = aliasLookup.getAliasMaster(sGeoIndex);
            if (null != alias) {
              sGeoIndex = alias.getIndex();
              if (sGeoIndex.equalsIgnoreCase("discard")) {
                if ((sEnt1_index != null) && (sEnt2_index != null)) {
                  sGeoIndex = null; // event/fact is still valid even without the geo								
                } // TESTED
                else continue; // event/fact now meaningless
              }
              bTransformedByAlias = true;
            }
          }
          json.put(AssociationPojo.geo_index_, sGeoIndex);
        }
        // TESTED

        // Whenever aliases are applied, need to re-check whether is this now a filter item
        // ideally have a single code block for doing this in scoringutils_association.
        if (bTransformedByAlias) {
          if ((null == entTypeFilter) && (null != entityTypeFilterStrings)) {
            entTypeFilter = new HashSet<String>();
          }
          // (only create the map once, and only if needed)

          boolean bKeep =
              recheckFiltersAfterTransform(
                  json, aliasLookup, entityTypeFilterStrings, entTypeFilter);

          if (!bKeep) {
            continue; // ie just bypass the facetList.add and the nFacetEl
          }
        } // TESTED

        facetList.add(json);
      }
      nFacetEl++;
    }
    return facetList;
  } // TESTED (see cases above - difficult to make this test case standalone because of
Exemplo n.º 2
0
  public static void parseOutputAggregation(
      AdvancedQueryPojo.QueryOutputPojo.AggregationOutputPojo aggregation,
      AliasLookupTable aliasLookup,
      boolean geoLowAccuracy,
      String[] entTypeFilterStrings,
      String[] assocVerbFilterStrings,
      SearchRequestBuilder searchSettings,
      BoolFilterBuilder parentFilterObj,
      String[] communityIdStrs) {
    // 1.] Go through aggregation list

    // 1.1] Apply "simple specifications" if necessary

    // Geo

    if ((null != aggregation)
        && (null != aggregation.geoNumReturn)
        && (aggregation.geoNumReturn > 0)) {
      CrossVersionFacetBuilder.TermsFacetBuilder fb =
          CrossVersionFacetBuilders.termsFacet("geo")
              .field(DocumentPojo.locs_)
              .size(aggregation.geoNumReturn);
      // Gross raw handling for facets
      if (null != parentFilterObj) {
        fb = fb.facetFilter(parentFilterObj);
      }
      searchSettings.addFacet(fb);
    } // (TESTED)

    // Temporal

    if ((null != aggregation) && (null != aggregation.timesInterval)) {
      if (aggregation.timesInterval.contains("m")) {
        aggregation.timesInterval = "month";
      }
      CrossVersionFacetBuilder.DateHistogramFacetBuilder fb =
          CrossVersionFacetBuilders.dateHistogramFacet("time")
              .field(DocumentPojo.publishedDate_)
              .interval(aggregation.timesInterval);
      // Gross raw handling for facets
      if (null != parentFilterObj) {
        fb = fb.facetFilter(parentFilterObj);
      }
      searchSettings.addFacet(fb);

      // TODO (INF-2688): if using certain types of moments then don't want this?
    } // (TESTED)

    // Temporal Moments

    if ((null != aggregation) && (null != aggregation.moments)) {
      if (null == aggregation.moments.timesInterval) {
        if (null != aggregation.timesInterval) {
          aggregation.moments.timesInterval = aggregation.timesInterval;
        } else {
          aggregation.moments.timesInterval = "m";
        }
      }
      if (aggregation.moments.timesInterval.contains("m")) {
        aggregation.moments.timesInterval = "month";
      }

      // TODO (INF-2688): Other cross filter type things
      if (!geoLowAccuracy
          && (null != aggregation.moments.geoNumReturn)
          && (aggregation.moments.geoNumReturn > 0)) {
        DateHistogramBuilder timeAgg =
            AggregationBuilders.dateHistogram("moments")
                .field(DocumentPojo.publishedDate_)
                .interval(new Interval(aggregation.moments.timesInterval));
        TermsBuilder geoAgg =
            AggregationBuilders.terms("geo")
                .field(DocumentPojo.locs_)
                .size(aggregation.moments.geoNumReturn);
        timeAgg.subAggregation(geoAgg);
        searchSettings.addAggregation(timeAgg);
      }

      // TODO (CORE-89)
      if (null != aggregation.moments.associationsNumReturn
          && aggregation.moments.associationsNumReturn >= 0) {
        // TODO need to check if indexes mapping use doc.associations.assoc_index == docValue
        // fail out or don't include those communities if they don't
        if (validateAssociationMapping(communityIdStrs)) {
          DateHistogramBuilder assocTimeAgg =
              AggregationBuilders.dateHistogram("moments.assoc")
                  .field(DocumentPojo.publishedDate_)
                  .interval(new Interval(aggregation.moments.timesInterval));
          TermsBuilder assocAgg =
              AggregationBuilders.terms("assoc")
                  .field(AssociationPojo.assoc_index_)
                  .size(aggregation.moments.associationsNumReturn);
          NestedBuilder nested =
              AggregationBuilders.nested("moments.assoc.nested")
                  .path(DocumentPojo.associations_)
                  .subAggregation(assocAgg);
          assocTimeAgg.subAggregation(nested);
          searchSettings.addAggregation(assocTimeAgg);
        }
      }

      if (null != aggregation.moments.entityList) {
        for (String entIndex : aggregation.moments.entityList) {

          CrossVersionFacetBuilder.DateHistogramFacetBuilder fb =
              CrossVersionFacetBuilders.dateHistogramFacet("moments." + entIndex)
                  .field(DocumentPojo.publishedDate_)
                  .interval(aggregation.moments.timesInterval);

          EntityFeaturePojo alias = null;
          if (null != aliasLookup) {
            alias = aliasLookup.getAliases(entIndex);
          }
          if (null == alias) { // no alias
            fb =
                fb.facetFilter(
                    FilterBuilders.nestedFilter(
                        DocumentPojo.entities_,
                        FilterBuilders.termFilter(EntityPojo.index_, entIndex)));
          } // TESTED
          else {
            QueryFilterBuilder qfb = null;
            if ((null != alias.getSemanticLinks()) && !alias.getSemanticLinks().isEmpty()) {
              BoolQueryBuilder qb = QueryBuilders.boolQuery();
              for (String textAlias : alias.getSemanticLinks()) {
                qb =
                    qb.should(
                        CrossVersionQueryBuilders.matchPhraseQuery(
                            DocumentPojo.fullText_, textAlias));
              }
              qfb = FilterBuilders.queryFilter(qb);
            } // TESTED
            if (!alias.getAlias().isEmpty()) {
              NestedFilterBuilder nfb =
                  FilterBuilders.nestedFilter(
                      DocumentPojo.entities_,
                      FilterBuilders.termsFilter(EntityPojo.index_, entIndex, alias.getAlias()));
              if (null == qfb) {
                fb = fb.facetFilter(nfb);
              } // TESTED
              else {
                BoolFilterBuilder bfb = FilterBuilders.boolFilter().should(nfb).should(qfb);
                fb = fb.facetFilter(bfb);
              } // TESTED
            } else if (null != qfb) {
              fb = fb.facetFilter(qfb);
            } // TESTED
          } // TESTED

          // Gross raw handling for facets
          if (null != parentFilterObj) {
            fb = fb.facetFilter(parentFilterObj);
          }
          searchSettings.addFacet(fb);
        }
      } // (end list over entities)
    } // TESTED

    // Entities - due to problems with significance, handled on a document by document basis, see
    // Significance helper class

    // Associations (Events/Facts)

    // Association verb category filter
    StringBuilder verbCatRegex = null;
    StringBuilder entTypeRegex = null;

    if (((null != aggregation)
            && (null != aggregation.eventsNumReturn)
            && (aggregation.eventsNumReturn > 0))
        || ((null != aggregation)
            && (null != aggregation.factsNumReturn)
            && (aggregation.factsNumReturn > 0))) {
      if (null != entTypeFilterStrings) {
        boolean bNegative = false;
        if ('-' != entTypeFilterStrings[0].charAt(0)) { // positive filtering
          entTypeRegex = new StringBuilder("(?:");
        } else {
          bNegative = true;
          entTypeRegex = new StringBuilder("(?!");
          // (this is a lookahead but will be fine because of the .*/ in front of it)
        }
        for (String entType : entTypeFilterStrings) {
          if (bNegative && ('-' == entType.charAt(0))) {
            entType = entType.substring(1);
          }
          entType = entType.replace("|", "%7C");
          entTypeRegex.append(".*?/").append(Pattern.quote(entType.toLowerCase())).append('|');
          // (can't match greedily because of the 2nd instance of entity type)
        }
        entTypeRegex.setLength(entTypeRegex.length() - 1); // (remove trailing |)
        entTypeRegex.append(")");
        if (bNegative) {
          entTypeRegex.append("[^|]*"); // (now the actual verb, if a -ve lookahead)				
        }
      } // TESTED

      if (null != assocVerbFilterStrings) {
        boolean bNegative = false;
        if ('-' != assocVerbFilterStrings[0].charAt(0)) { // positive filtering
          verbCatRegex = new StringBuilder("\\|(?:");
        } else {
          bNegative = true;
          verbCatRegex = new StringBuilder("\\|(?!");
          // (this is a lookahead but will be fine because of the "^[^|]*\\" in front of it)

          // eg say I have -VERB then subject|VERB|object will match because if the
        }
        for (String assocVerbFilterString : assocVerbFilterStrings) {
          if (bNegative && ('-' == assocVerbFilterString.charAt(0))) {
            assocVerbFilterString = assocVerbFilterString.substring(1);
          }
          assocVerbFilterString = assocVerbFilterString.replace("|", "%7C");
          verbCatRegex.append(Pattern.quote(assocVerbFilterString)).append('|');
        }
        verbCatRegex.setLength(verbCatRegex.length() - 1); // (remove trailing |)
        verbCatRegex.append(")");
        if (bNegative) {
          verbCatRegex.append("[^|]*"); // (now the actual verb, if a -ve lookahead)
        }
      } // TESTED
    }
    // TESTED (all combinations of 1/2 people, 1/2 verbs)

    if ((null != aggregation)
        && (null != aggregation.eventsNumReturn)
        && (aggregation.eventsNumReturn > 0)) {
      StringBuffer regex = new StringBuffer("^Event\\|");
      if (null != entTypeRegex) {
        regex.append(entTypeRegex);
      } else {
        regex.append("[^|]*");
      }
      if (null != verbCatRegex) {
        regex.append(verbCatRegex);
      } else if (null != entTypeRegex) {
        regex.append("\\|[^|]*");
      } else {
        regex.append(".*");
      }
      if (null != entTypeRegex) {
        regex.append("\\|").append(entTypeRegex);
        regex.append(".*");
      } else {
        regex.append("\\|.*");
      }
      // DEBUG
      // System.out.println("REGEX==" + regex.toString());
      // TESTED (all combinations of 1/2 people, 1/2 verbs)

      CrossVersionFacetBuilder.TermsFacetBuilder fb =
          CrossVersionFacetBuilders.termsFacet("events")
              .field(AssociationPojo.assoc_index_)
              .size(aggregation.eventsNumReturn)
              .nested(DocumentPojo.associations_);
      fb.regex(regex.toString());

      // Gross raw handling for facets
      if (null != parentFilterObj) {
        fb = fb.facetFilter(parentFilterObj);
      }
      searchSettings.addFacet(fb);
    }
    if ((null != aggregation)
        && (null != aggregation.factsNumReturn)
        && (aggregation.factsNumReturn > 0)) {
      StringBuffer regex = new StringBuffer("^Fact\\|");
      if (null != entTypeRegex) {
        regex.append(entTypeRegex);
      } else {
        regex.append("[^|]*");
      }
      if (null != verbCatRegex) {
        regex.append(verbCatRegex);
      } else if (null != entTypeRegex) {
        regex.append("\\|[^|]*");
      } else {
        regex.append(".*");
      }
      if (null != entTypeRegex) {
        regex.append("\\|").append(entTypeRegex);
        regex.append(".*");
      } else {
        regex.append("\\|.*");
      }
      // DEBUG
      // System.out.println("REGEX==" + regex.toString());
      // TESTED (all combinations of 1/2 people, 1/2 verbs)

      CrossVersionFacetBuilder.TermsFacetBuilder fb =
          CrossVersionFacetBuilders.termsFacet("facts")
              .field(AssociationPojo.assoc_index_)
              .size(aggregation.factsNumReturn)
              .nested(DocumentPojo.associations_);
      fb.regex(regex.toString());

      // Gross raw handling for facets
      if (null != parentFilterObj) {
        fb = fb.facetFilter(parentFilterObj);
      }
      searchSettings.addFacet(fb);
    }

    // Source management/monitoring

    if ((null != aggregation)
        && (null != aggregation.sourceMetadata)
        && (aggregation.sourceMetadata > 0)) {
      CrossVersionFacetBuilder.TermsFacetBuilder fb =
          CrossVersionFacetBuilders.termsFacet("sourceTags")
              .field(DocumentPojo.tags_)
              .size(aggregation.sourceMetadata)
              .facetFilter(parentFilterObj);
      CrossVersionFacetBuilder.TermsFacetBuilder fb1 =
          CrossVersionFacetBuilders.termsFacet("sourceTypes")
              .field(DocumentPojo.mediaType_)
              .size(aggregation.sourceMetadata)
              .facetFilter(parentFilterObj);
      // Gross raw handling for facets
      if (null != parentFilterObj) {
        fb = fb.facetFilter(parentFilterObj);
        fb1 = fb1.facetFilter(parentFilterObj);
      }
      searchSettings.addFacet(fb);
      searchSettings.addFacet(fb1);
    }

    if ((null != aggregation) && (null != aggregation.sources) && (aggregation.sources > 0)) {
      CrossVersionFacetBuilder.TermsFacetBuilder fb =
          CrossVersionFacetBuilders.termsFacet("sourceKeys")
              .field(DocumentPojo.sourceKey_)
              .size(aggregation.sources);
      // Gross raw handling for facets
      if (null != parentFilterObj) {
        fb = fb.facetFilter(parentFilterObj);
      }
      searchSettings.addFacet(fb);
    }
  } // TESTED