@Override
  public Facet reduce(List<Facet> facets) {
    if (facets.size() == 1) {
      return facets.get(0);
    }

    InternalLongTermsFacet first = null;

    TLongIntHashMap aggregated = CacheRecycler.popLongIntMap();
    long missing = 0;
    long total = 0;
    for (Facet facet : facets) {
      TermsFacet termsFacet = (TermsFacet) facet;
      // termsFacet could be of type InternalStringTermsFacet representing unmapped fields
      if (first == null && termsFacet instanceof InternalLongTermsFacet) {
        first = (InternalLongTermsFacet) termsFacet;
      }
      missing += termsFacet.getMissingCount();
      total += termsFacet.getTotalCount();
      for (Entry entry : termsFacet.getEntries()) {
        aggregated.adjustOrPutValue(((LongEntry) entry).term, entry.getCount(), entry.getCount());
      }
    }

    BoundedTreeSet<LongEntry> ordered =
        new BoundedTreeSet<LongEntry>(first.comparatorType.comparator(), first.requiredSize);
    for (TLongIntIterator it = aggregated.iterator(); it.hasNext(); ) {
      it.advance();
      ordered.add(new LongEntry(it.key(), it.value()));
    }
    first.entries = ordered;
    first.missing = missing;
    first.total = total;

    CacheRecycler.pushLongIntMap(aggregated);

    return first;
  }
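
  // A minimal sketch of the same merge-then-bound pattern as reduce() above,
  // using plain java.util collections in place of Trove's TLongIntHashMap and
  // the recycled BoundedTreeSet. It covers the descending-count comparator
  // case only; all names here are illustrative, not part of the original code.
  static List<Map.Entry<Long, Integer>> reduceSketch(
      List<Map<Long, Integer>> shardCounts, int requiredSize) {
    Map<Long, Integer> aggregated = new HashMap<Long, Integer>();
    for (Map<Long, Integer> shard : shardCounts) {
      for (Map.Entry<Long, Integer> e : shard.entrySet()) {
        // adjustOrPutValue equivalent: add to an existing count, or insert
        Integer prev = aggregated.get(e.getKey());
        aggregated.put(e.getKey(), prev == null ? e.getValue() : prev + e.getValue());
      }
    }
    // Sort by descending count, then truncate to requiredSize (the role the
    // BoundedTreeSet plays above).
    List<Map.Entry<Long, Integer>> ordered =
        new ArrayList<Map.Entry<Long, Integer>>(aggregated.entrySet());
    Collections.sort(
        ordered,
        new Comparator<Map.Entry<Long, Integer>>() {
          public int compare(Map.Entry<Long, Integer> a, Map.Entry<Long, Integer> b) {
            return b.getValue().compareTo(a.getValue());
          }
        });
    return ordered.subList(0, Math.min(requiredSize, ordered.size()));
  }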
Example #2
  private static List<BasicDBObject> parseEventAggregationOutput(
      String sEventOrFact,
      TermsFacet facet,
      ScoringUtils scoreStats,
      AliasLookupTable aliasLookup,
      String[] entityTypeFilterStrings,
      String[] assocVerbFilterStrings) {
    ArrayList<BasicDBObject> facetList = new ArrayList<BasicDBObject>(facet.getEntries().size());

    // (This might be needed if we alias and there are filter strings specified)
    HashSet<String> entTypeFilter = null;
    // TEST CASES:
    //		String term1 = "mark kelly/person|family relation|gabrielle giffords/person|";
    //		String term2 = "|family relation|gabrielle giffords/person|";
    //		String term3 = "mark kelly/person||gabrielle giffords/person|";
    //		String term4 = "mark kelly/person|family relation||";
    //		String term5 = "mark kelly/person|family relation|gabrielle giffords/person|loca,tion/city";
    //		List<String> terms = Arrays.asList(term1, term2, term3, term4, term5);

    @SuppressWarnings("unused")
    int nFacetEl = 0; // (this will get used later)

    for (TermsFacet.Entry facetEl : facet.getEntries()) {
      // DEBUG
      // System.out.println("TERM= " + FacetUtils.getTerm(facetEl));

      String term =
          FacetUtils.getTerm(facetEl)
              .substring(sEventOrFact.length() + 1); // (step over "Fact|" or "Event|")
      // TEST CASES:
      //			if (nFacetEl < terms.size()) {
      //				term = terms.get(nFacetEl);
      //			}

      // Parse the string
      Matcher m = eventIndexParser.matcher(term);
      if (m.matches()) {
        BasicDBObject json = new BasicDBObject();
        json.put(AssociationPojo.assoc_type_, sEventOrFact);
        String sEnt1_index = m.group(1);
        if (null != sEnt1_index) {
          sEnt1_index = sEnt1_index.replaceAll("%7C", "|");
        }
        String sVerbCat = m.group(2);
        if (null != sVerbCat)
          json.put(AssociationPojo.verb_category_, sVerbCat.replaceAll("%7C", "|"));
        String sEnt2_index = m.group(3);
        if (null != sEnt2_index) {
          sEnt2_index = sEnt2_index.replaceAll("%7C", "|");
        }
        String sGeoIndex = m.group(4);
        if (null != sGeoIndex) {
          sGeoIndex = sGeoIndex.replaceAll("%7C", "|");
        }
        json.put(AssociationPojo.doccount_, facetEl.getCount());

        // Add significance if possible
        if ((null == scoreStats)
            || !scoreStats.calcAssocationSignificance(sEnt1_index, sEnt2_index, sGeoIndex, json)) {
          // These fields are optional:
          // json.put("entity1_sig", 0.0);
          // json.put("entity2_sig", 0.0);
          // json.put("geo_sig", 0.0);
          // Mandatory:
          json.put(AssociationPojo.assoc_sig_, 0.0);
        }

        boolean bTransformedByAlias = false; // when true need to re-check vs entity type filter

        // Now write the last few values (adjusted for aliases if necessary) into the JSON object
        if (null != sEnt1_index) {
          if (null != aliasLookup) {
            EntityFeaturePojo alias = aliasLookup.getAliasMaster(sEnt1_index);
            if (null != alias) {
              sEnt1_index = alias.getIndex();
              if (sEnt1_index.equalsIgnoreCase("discard")) {
                continue;
              } // TESTED
              bTransformedByAlias = true;
            }
          }
          json.put(AssociationPojo.entity1_index_, sEnt1_index);
        }
        if (null != sEnt2_index) {
          if (null != aliasLookup) {
            EntityFeaturePojo alias = aliasLookup.getAliasMaster(sEnt2_index);
            if (null != alias) {
              sEnt2_index = alias.getIndex();
              if (sEnt2_index.equalsIgnoreCase("discard")) {
                continue;
              } // TESTED (cut and paste of ent index1)
              bTransformedByAlias = true;
            }
          }
          json.put(AssociationPojo.entity2_index_, sEnt2_index);
        }
        if (null != sGeoIndex) {
          if (null != aliasLookup) {
            EntityFeaturePojo alias = aliasLookup.getAliasMaster(sGeoIndex);
            if (null != alias) {
              sGeoIndex = alias.getIndex();
              if (sGeoIndex.equalsIgnoreCase("discard")) {
                if ((sEnt1_index != null) && (sEnt2_index != null)) {
                  sGeoIndex = null; // event/fact is still valid even without the geo
                } // TESTED
                else continue; // event/fact now meaningless
              }
              bTransformedByAlias = true;
            }
          }
          json.put(AssociationPojo.geo_index_, sGeoIndex);
        }
        // TESTED

        // Whenever aliases are applied, need to re-check whether this is now a filtered item;
        // ideally this would be a single shared code block in scoringutils_association.
        if (bTransformedByAlias) {
          if ((null == entTypeFilter) && (null != entityTypeFilterStrings)) {
            entTypeFilter = new HashSet<String>();
          }
          // (only create the map once, and only if needed)

          boolean bKeep =
              recheckFiltersAfterTransform(
                  json, aliasLookup, entityTypeFilterStrings, entTypeFilter);

          if (!bKeep) {
            continue; // ie just bypass the facetList.add and the nFacetEl
          }
        } // TESTED

        facetList.add(json);
      }
      nFacetEl++;
    }
    return facetList;
  } // TESTED (see cases above - difficult to make this test case standalone because of
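
  // A minimal sketch of the term format parsed above. The real eventIndexParser
  // regex is not shown here, so this pattern is an assumption for illustration
  // only: four pipe-delimited groups (entity1, verb category, entity2, optional
  // geo index), with literal '|' characters inside a group escaped as "%7C".
  private static final Pattern eventIndexParserSketch =
      Pattern.compile("([^|]*)\\|([^|]*)\\|([^|]*)\\|?([^|]*)");

  private static void demoEventTermParse() {
    // One of the test-case terms listed above:
    String term = "mark kelly/person|family relation|gabrielle giffords/person|";
    Matcher m = eventIndexParserSketch.matcher(term);
    if (m.matches()) {
      String ent1 = m.group(1).replaceAll("%7C", "|"); // "mark kelly/person"
      String verbCat = m.group(2).replaceAll("%7C", "|"); // "family relation"
      String ent2 = m.group(3).replaceAll("%7C", "|"); // "gabrielle giffords/person"
      String geo = m.group(4).replaceAll("%7C", "|"); // "" (no geo on this term)
      // (unlike the real parser, empty groups here are "" rather than null)
      System.out.println(ent1 + " / " + verbCat + " / " + ent2 + " / " + geo);
    }
  }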
Example #3
  public static void loadAggregationResults(
      ResponsePojo rp,
      Facets facets,
      Aggregations aggs,
      AggregationOutputPojo aggOutParams,
      ScoringUtils scoreStats,
      AliasLookupTable aliasLookup,
      String[] entityTypeFilterStrings,
      String[] assocVerbFilterStrings,
      AggregationUtils.GeoContainer extraAliasAggregatedGeo) {
    HashMap<String, List<? extends Object>> moments = null;

    if ((null != facets) && (null != facets.getFacets()))
      for (Map.Entry<String, Facet> facet : facets.getFacets().entrySet()) {
        // Geo

        if (facet.getKey().equals("geo")) {
          TermsFacet geoFacet = (TermsFacet) facet.getValue();
          Set<GeoAggregationPojo> geoCounts = null;
          int nHighestCount = -1;
          int nLowestCount = Integer.MAX_VALUE;
          // If we've got some geotags from the alias masters then start with them:
          if ((null != extraAliasAggregatedGeo) && (null != extraAliasAggregatedGeo.geotags)) {
            geoCounts = extraAliasAggregatedGeo.geotags;
            nHighestCount = (int) extraAliasAggregatedGeo.maxCount;
            nLowestCount = (int) extraAliasAggregatedGeo.minCount;
          } else {
            geoCounts = new TreeSet<GeoAggregationPojo>();
          }
          for (TermsFacet.Entry geo : geoFacet.getEntries()) {
            String geohash = FacetUtils.getTerm(geo).substring(2);
            double[] loc = GeoHashUtils.decode(geohash);
            GeoAggregationPojo geoObj = new GeoAggregationPojo(loc[0], loc[1]);
            geoObj.count = geo.getCount();
            geoObj.type = GeoOntologyMapping.decodeOntologyCode(FacetUtils.getTerm(geo).charAt(0));
            geoCounts.add(geoObj);
            // (note this aggregates geo points whose decoded lat/lons are the same, which can
            // result in slightly fewer records than requested)
            // (note the aggregation writes the aggregated count into geoObj.count)

            if (geoObj.count
                > nHighestCount) { // (the counts can be modified by the add command above)
              nHighestCount = geoObj.count;
            }
            if (geoObj.count < nLowestCount) {
              nLowestCount = geoObj.count;
            }
          }
          rp.setGeo(geoCounts, nHighestCount, nLowestCount);
        } // (TESTED)
        if (facet.getKey().equals("time")) {
          DateHistogramFacet timeFacet = (DateHistogramFacet) facet.getValue();
          rp.setTimes(
              timeFacet.getEntries(), QueryHandler.getInterval(aggOutParams.timesInterval, 'm'));
        } // (TESTED)

        if (facet.getKey().equals("events")) {
          TermsFacet eventsFacet = (TermsFacet) facet.getValue();
          rp.setEvents(
              parseEventAggregationOutput(
                  "Event",
                  eventsFacet,
                  scoreStats,
                  aliasLookup,
                  entityTypeFilterStrings,
                  assocVerbFilterStrings));
        }
        if (facet.getKey().equals("facts")) {
          TermsFacet factsFacet = (TermsFacet) facet.getValue();
          rp.setFacts(
              parseEventAggregationOutput(
                  "Fact",
                  factsFacet,
                  scoreStats,
                  aliasLookup,
                  entityTypeFilterStrings,
                  assocVerbFilterStrings));
        }
        // TESTED x2

        if (facet.getKey().equals("sourceTags")) {
          TermsFacet tagsFacet = (TermsFacet) facet.getValue();
          rp.setSourceMetaTags(tagsFacet.getEntries());
        }
        if (facet.getKey().equals("sourceTypes")) {
          TermsFacet typesFacet = (TermsFacet) facet.getValue();
          rp.setSourceMetaTypes(typesFacet.getEntries());
        }
        if (facet.getKey().equals("sourceKeys")) {
          TermsFacet keysFacet = (TermsFacet) facet.getValue();
          rp.setSources(keysFacet.getEntries());
        }
        // TESTED x3

        // Moments (basic functionality)

        if (facet.getKey().startsWith("moments.")) {
          DateHistogramFacet momentFacet = (DateHistogramFacet) facet.getValue();
          if (null == moments) {
            moments = new HashMap<String, List<? extends Object>>();
          }
          moments.put(facet.getKey().substring(8), momentFacet.getEntries());
        } // TESTED
      } // (end loop over generated facets)

    if ((null != aggs) && (null != aggs.asMap()))
      for (Map.Entry<String, Aggregation> agg : aggs.asMap().entrySet()) {

        if (agg.getKey().equals("moments")) {
          if (null == moments) {
            moments = new HashMap<String, List<? extends Object>>();
          }

          DateHistogram val = (DateHistogram) agg.getValue();

          // TODO (INF-2688): Finalize format
          BasicDBList dbl = new BasicDBList();
          for (DateHistogram.Bucket dateBucket : val.getBuckets()) {
            if (dateBucket.getKeyAsNumber().longValue() > 0) {
              BasicDBObject dataBucketDbo = new BasicDBObject();
              dataBucketDbo.put("time", dateBucket.getKeyAsNumber().longValue());
              dataBucketDbo.put("count", dateBucket.getDocCount());
              for (Map.Entry<String, Aggregation> dateAggs :
                  dateBucket.getAggregations().asMap().entrySet()) {
                if (dateAggs.getKey().equals("geo")) {

                  BasicDBList dbl_geo = new BasicDBList();
                  MultiBucketsAggregation geoVal = (MultiBucketsAggregation) dateAggs.getValue();

                  long nHighestCount = Long.MIN_VALUE;
                  for (MultiBucketsAggregation.Bucket geoBucket : geoVal.getBuckets()) {
                    String geohash = geoBucket.getKey().substring(2);
                    double[] loc = GeoHashUtils.decode(geohash);
                    GeoAggregationPojo geoObj = new GeoAggregationPojo(loc[0], loc[1]);
                    BasicDBObject geoDbo = new BasicDBObject(4);
                    geoDbo.put("lat", geoObj.lat);
                    geoDbo.put("lon", geoObj.lon);
                    geoDbo.put("count", geoBucket.getDocCount());
                    geoDbo.put(
                        "type",
                        GeoOntologyMapping.decodeOntologyCode(geoBucket.getKey().charAt(0)));
                    dbl_geo.add(geoDbo);

                    if (geoBucket.getDocCount() > nHighestCount) {
                      nHighestCount = geoBucket.getDocCount();
                    }
                  }
                  dataBucketDbo.put("maxGeoCount", nHighestCount);
                  dataBucketDbo.put("geo", dbl_geo);
                }
              }
              dbl.add(dataBucketDbo);
            }
          }
          moments.put("times", dbl);
        } else {
          if (null == moments) {
            moments = new HashMap<String, List<? extends Object>>();
          }
          DateHistogram val = (DateHistogram) agg.getValue();
          BasicDBList dbl = new BasicDBList();
          for (DateHistogram.Bucket dateBucket : val.getBuckets()) {
            if (dateBucket.getKeyAsNumber().longValue() > 0) {
              BasicDBObject dataBucketDbo = new BasicDBObject();
              dataBucketDbo.put("time", dateBucket.getKeyAsNumber().longValue());
              dataBucketDbo.put("count", dateBucket.getDocCount());
              for (Map.Entry<String, Aggregation> dateAggs :
                  dateBucket.getAggregations().asMap().entrySet()) {
                if (dateAggs.getKey().equals("moments.assoc.nested")) {

                  BasicDBList dbl_assoc = new BasicDBList();
                  Nested nestedVal = (Nested) dateAggs.getValue();
                  MultiBucketsAggregation assocVal =
                      (MultiBucketsAggregation) nestedVal.getAggregations().asList().get(0);
                  long nHighestCount = Long.MIN_VALUE;

                  for (MultiBucketsAggregation.Bucket assocBucket : assocVal.getBuckets()) {
                    BasicDBObject assocDbo = new BasicDBObject(2);
                    assocDbo.put("key", assocBucket.getKey());
                    assocDbo.put("docCount", assocBucket.getDocCount());
                    dbl_assoc.add(assocDbo);

                    if (assocBucket.getDocCount() > nHighestCount) {
                      nHighestCount = assocBucket.getDocCount();
                    }
                  }

                  dataBucketDbo.put("maxAssocCount", nHighestCount);
                  dataBucketDbo.put("assoc", dbl_assoc);
                }
              }
              dbl.add(dataBucketDbo);
            }
          }
          moments.put("assocs", dbl);
        }
      } // (end loop over generated aggregations)

    if ((null != moments) && !moments.isEmpty()) {
      rp.setMoments(moments, QueryHandler.getInterval(aggOutParams.moments.timesInterval, 'm'));
    }
  } // TESTED
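
  // A self-contained sketch of standard geohash decoding, to illustrate what
  // GeoHashUtils.decode(...) does with the facet terms above (ontology code in
  // charAt(0), geohash from index 2 onward, per the substring(2) calls). This
  // is the textbook algorithm, not the ES implementation itself; it returns
  // {lat, lon} midpoints of the final cell.
  static double[] decodeGeohashSketch(String geohash) {
    final String base32 = "0123456789bcdefghjkmnpqrstuvwxyz";
    double latMin = -90, latMax = 90, lonMin = -180, lonMax = 180;
    boolean isLonBit = true; // bits alternate, starting with longitude
    for (int i = 0; i < geohash.length(); i++) {
      int cd = base32.indexOf(geohash.charAt(i)); // 5 bits per character
      for (int bit = 4; bit >= 0; bit--) {
        int bitVal = (cd >> bit) & 1;
        if (isLonBit) { // longitude bit: halve the lon interval
          double mid = (lonMin + lonMax) / 2;
          if (bitVal == 1) lonMin = mid;
          else lonMax = mid;
        } else { // latitude bit: halve the lat interval
          double mid = (latMin + latMax) / 2;
          if (bitVal == 1) latMin = mid;
          else latMax = mid;
        }
        isLonBit = !isLonBit;
      }
    }
    return new double[] {(latMin + latMax) / 2, (lonMin + lonMax) / 2};
  }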
  @Test
  @Slow
  public void testTermFacet_stringFields() throws Throwable {
    prepareCreate("test")
        .addMapping(
            "type1",
            jsonBuilder()
                .startObject()
                .startObject("type1")
                .startObject("properties")
                .startObject("field1_paged")
                .field("type", "string")
                .field("index", "not_analyzed")
                .startObject("fielddata")
                .field("format", "paged_bytes")
                .endObject()
                .endObject()
                .startObject("field1_fst")
                .field("type", "string")
                .field("index", "not_analyzed")
                .startObject("fielddata")
                .field("format", "fst")
                .endObject()
                .endObject()
                .startObject("field2")
                .field("type", "string")
                .field("index", "not_analyzed")
                .startObject("fielddata")
                .field("format", "fst")
                .endObject()
                .endObject()
                .startObject("q_field")
                .field("type", "string")
                .field("index", "not_analyzed")
                .endObject()
                .endObject()
                .endObject()
                .endObject())
        .execute()
        .actionGet();

    Random random = getRandom();
    int numOfQueryValues = 50;
    String[] queryValues = new String[numOfQueryValues];
    for (int i = 0; i < numOfQueryValues; i++) {
      queryValues[i] = randomAsciiOfLength(5);
    }

    Set<String> uniqueValuesSet = new HashSet<String>();
    int numOfVals = 400;
    for (int i = 0; i < numOfVals; i++) {
      uniqueValuesSet.add(randomAsciiOfLength(10));
    }
    String[] allUniqueFieldValues = uniqueValuesSet.toArray(new String[uniqueValuesSet.size()]);

    Set<String> allField1Values = new HashSet<String>();
    Set<String> allField1AndField2Values = new HashSet<String>();
    Map<String, Map<String, Integer>> queryValToField1FacetEntries =
        new HashMap<String, Map<String, Integer>>();
    Map<String, Map<String, Integer>> queryValToField1and2FacetEntries =
        new HashMap<String, Map<String, Integer>>();
    for (int i = 1; i <= numDocs(); i++) {
      int numField1Values = random.nextInt(17);
      Set<String> field1Values = new HashSet<String>(numField1Values);
      for (int j = 0; j <= numField1Values; j++) {
        boolean added = false;
        while (!added) {
          added = field1Values.add(allUniqueFieldValues[random.nextInt(numOfVals)]);
        }
      }
      allField1Values.addAll(field1Values);
      allField1AndField2Values.addAll(field1Values);
      String field2Val = allUniqueFieldValues[random.nextInt(numOfVals)];
      allField1AndField2Values.add(field2Val);
      String queryVal = queryValues[random.nextInt(numOfQueryValues)];
      client()
          .prepareIndex("test", "type1", Integer.toString(i))
          .setSource(
              jsonBuilder()
                  .startObject()
                  .field("field1_paged", field1Values)
                  .field("field1_fst", field1Values)
                  .field("field2", field2Val)
                  .field("q_field", queryVal)
                  .endObject())
          .execute()
          .actionGet();

      if (random.nextInt(2000) == 854) {
        client().admin().indices().prepareFlush("test").execute().actionGet();
      }
      addControlValues(queryValToField1FacetEntries, field1Values, queryVal);
      addControlValues(queryValToField1and2FacetEntries, field1Values, queryVal);
      addControlValues(queryValToField1and2FacetEntries, field2Val, queryVal);
    }

    client().admin().indices().prepareRefresh().execute().actionGet();
    String[] facetFields = new String[] {"field1_paged", "field1_fst"};
    TermsFacet.ComparatorType[] compTypes = TermsFacet.ComparatorType.values();
    for (String facetField : facetFields) {
      for (String queryVal : queryValToField1FacetEntries.keySet()) {
        Set<String> allFieldValues;
        Map<String, Integer> queryControlFacets;
        TermsFacet.ComparatorType compType = compTypes[random.nextInt(compTypes.length)];
        TermsFacetBuilder termsFacetBuilder = FacetBuilders.termsFacet("facet1").order(compType);

        boolean useFields;
        if (random.nextInt(4) == 3) {
          useFields = true;
          queryControlFacets = queryValToField1and2FacetEntries.get(queryVal);
          allFieldValues = allField1AndField2Values;
          termsFacetBuilder.fields(facetField, "field2");
        } else {
          queryControlFacets = queryValToField1FacetEntries.get(queryVal);
          allFieldValues = allField1Values;
          useFields = false;
          termsFacetBuilder.field(facetField);
        }
        int size;
        if (numberOfShards() == 1
            || compType == TermsFacet.ComparatorType.TERM
            || compType == TermsFacet.ComparatorType.REVERSE_TERM) {
          size = random.nextInt(queryControlFacets.size());
        } else {
          size = allFieldValues.size();
        }
        termsFacetBuilder.size(size);

        if (random.nextBoolean()) {
          termsFacetBuilder.executionHint("map");
        }
        List<String> excludes = new ArrayList<String>();
        if (random.nextBoolean()) {
          int numExcludes = random.nextInt(5) + 1;
          List<String> facetValues = new ArrayList<String>(queryControlFacets.keySet());
          for (int i = 0; i < numExcludes; i++) {
            excludes.add(facetValues.get(random.nextInt(facetValues.size())));
          }
          termsFacetBuilder.exclude(excludes.toArray());
        }
        String regex = null;
        if (random.nextBoolean()) {
          List<String> facetValues = new ArrayList<String>(queryControlFacets.keySet());
          regex = facetValues.get(random.nextInt(facetValues.size()));
          regex = "^" + regex.substring(0, regex.length() / 2) + ".*";
          termsFacetBuilder.regex(regex);
        }

        boolean allTerms = random.nextInt(10) == 3;
        termsFacetBuilder.allTerms(allTerms);

        SearchResponse response =
            client()
                .prepareSearch("test")
                .setQuery(QueryBuilders.termQuery("q_field", queryVal))
                .addFacet(termsFacetBuilder)
                .execute()
                .actionGet();
        TermsFacet actualFacetEntries = response.getFacets().facet("facet1");

        List<Tuple<Text, Integer>> expectedFacetEntries =
            getExpectedFacetEntries(
                allFieldValues, queryControlFacets, size, compType, excludes, regex, allTerms);
        String reason =
            String.format(
                Locale.ROOT,
                "query: [%s] field: [%s] size: [%d] order: [%s] all_terms: [%s] fields: [%s] regex: [%s] excludes: [%s]",
                queryVal,
                facetField,
                size,
                compType,
                allTerms,
                useFields,
                regex,
                excludes);
        assertThat(
            reason, actualFacetEntries.getEntries().size(), equalTo(expectedFacetEntries.size()));
        for (int i = 0; i < expectedFacetEntries.size(); i++) {
          assertThat(
              reason,
              actualFacetEntries.getEntries().get(i).getTerm(),
              equalTo(expectedFacetEntries.get(i).v1()));
          assertThat(
              reason,
              actualFacetEntries.getEntries().get(i).getCount(),
              equalTo(expectedFacetEntries.get(i).v2()));
        }
      }
    }
  }
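
  // addControlValues(...) is called above but its body is not shown. A plausible
  // sketch consistent with both call sites (one overload takes a Set of field
  // values, the other a single value): per query value, count how often each
  // field value occurs, building the expected facet entries that the assertions
  // compare against. Hypothetical reconstruction, for illustration only.
  private static void addControlValuesSketch(
      Map<String, Map<String, Integer>> controlFacets,
      Set<String> fieldValues,
      String queryVal) {
    for (String fieldValue : fieldValues) {
      addControlValuesSketch(controlFacets, fieldValue, queryVal);
    }
  }

  private static void addControlValuesSketch(
      Map<String, Map<String, Integer>> controlFacets, String fieldValue, String queryVal) {
    Map<String, Integer> entries = controlFacets.get(queryVal);
    if (entries == null) {
      entries = new HashMap<String, Integer>();
      controlFacets.put(queryVal, entries);
    }
    Integer count = entries.get(fieldValue);
    entries.put(fieldValue, count == null ? 1 : count + 1);
  }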