@Override
public Facet reduce(List<Facet> facets) {
    if (facets.size() == 1) {
        return facets.get(0);
    }
    InternalLongTermsFacet first = null;
    TLongIntHashMap aggregated = CacheRecycler.popLongIntMap();
    long missing = 0;
    long total = 0;
    for (Facet facet : facets) {
        TermsFacet termsFacet = (TermsFacet) facet;
        // termsFacet could be of type InternalStringTermsFacet representing unmapped fields
        if (first == null && termsFacet instanceof InternalLongTermsFacet) {
            first = (InternalLongTermsFacet) termsFacet;
        }
        missing += termsFacet.getMissingCount();
        total += termsFacet.getTotalCount();
        for (Entry entry : termsFacet.getEntries()) {
            aggregated.adjustOrPutValue(((LongEntry) entry).term, entry.getCount(), entry.getCount());
        }
    }

    BoundedTreeSet<LongEntry> ordered = new BoundedTreeSet<LongEntry>(first.comparatorType.comparator(), first.requiredSize);
    for (TLongIntIterator it = aggregated.iterator(); it.hasNext(); ) {
        it.advance();
        ordered.add(new LongEntry(it.key(), it.value()));
    }
    first.entries = ordered;
    first.missing = missing;
    first.total = total;
    CacheRecycler.pushLongIntMap(aggregated);
    return first;
}
private static List<BasicDBObject> parseEventAggregationOutput(
        String sEventOrFact,
        TermsFacet facet,
        ScoringUtils scoreStats,
        AliasLookupTable aliasLookup,
        String[] entityTypeFilterStrings,
        String[] assocVerbFilterStrings) {
    ArrayList<BasicDBObject> facetList = new ArrayList<BasicDBObject>(facet.getEntries().size());

    // (These 2 might be needed if we alias and there are filter strings specified)
    HashSet<String> entTypeFilter = null;

    // TEST CASES:
    // String term1 = "mark kelly/person|family relation|gabrielle giffords/person|";
    // String term2 = "|family relation|gabrielle giffords/person|";
    // String term3 = "mark kelly/person||gabrielle giffords/person|";
    // String term4 = "mark kelly/person|family relation||";
    // String term5 = "mark kelly/person|family relation|gabrielle giffords/person|loca,tion/city";
    // List<String> terms = Arrays.asList(term1, term2, term3, term4, term5);

    @SuppressWarnings("unused")
    int nFacetEl = 0; // (this will get used later)

    for (TermsFacet.Entry facetEl : facet.getEntries()) {
        // DEBUG
        // System.out.println("TERM= " + FacetUtils.getTerm(facetEl));

        String term = FacetUtils.getTerm(facetEl).substring(sEventOrFact.length() + 1); // (step over "Fact|" or "Event|")

        // TEST CASES:
        // if (nFacetEl < terms.size()) {
        //     term = terms.get(nFacetEl);
        // }

        // Parse the string
        Matcher m = eventIndexParser.matcher(term);
        if (m.matches()) {
            BasicDBObject json = new BasicDBObject();
            json.put(AssociationPojo.assoc_type_, sEventOrFact);
            String sEnt1_index = m.group(1);
            if (null != sEnt1_index) {
                sEnt1_index = sEnt1_index.replaceAll("%7C", "|");
            }
            String sVerbCat = m.group(2);
            if (null != sVerbCat) json.put(AssociationPojo.verb_category_, sVerbCat.replaceAll("%7C", "|"));
            String sEnt2_index = m.group(3);
            if (null != sEnt2_index) {
                sEnt2_index = sEnt2_index.replaceAll("%7C", "|");
            }
            String sGeoIndex = m.group(4);
            if (null != sGeoIndex) {
                sGeoIndex = sGeoIndex.replaceAll("%7C", "|");
            }
            json.put(AssociationPojo.doccount_, facetEl.getCount());

            // Add significance if possible
            if ((null == scoreStats) || !scoreStats.calcAssocationSignificance(sEnt1_index, sEnt2_index, sGeoIndex, json)) {
                // These fields are optional:
                // json.put("entity1_sig", 0.0);
                // json.put("entity2_sig", 0.0);
                // json.put("geo_sig", 0.0);
                // Mandatory:
                json.put(AssociationPojo.assoc_sig_, 0.0);
            }

            boolean bTransformedByAlias = false; // when true need to re-check vs entity type filter

            // Now write the last few values (adjusted for aliases if necessary) into the JSON object
            if (null != sEnt1_index) {
                if (null != aliasLookup) {
                    EntityFeaturePojo alias = aliasLookup.getAliasMaster(sEnt1_index);
                    if (null != alias) {
                        sEnt1_index = alias.getIndex();
                        if (sEnt1_index.equalsIgnoreCase("discard")) {
                            continue;
                        } // TESTED
                        bTransformedByAlias = true;
                    }
                }
                json.put(AssociationPojo.entity1_index_, sEnt1_index);
            }
            if (null != sEnt2_index) {
                if (null != aliasLookup) {
                    EntityFeaturePojo alias = aliasLookup.getAliasMaster(sEnt2_index);
                    if (null != alias) {
                        sEnt2_index = alias.getIndex();
                        if (sEnt2_index.equalsIgnoreCase("discard")) {
                            continue;
                        } // TESTED (cut and paste of ent index1)
                        bTransformedByAlias = true;
                    }
                }
                json.put(AssociationPojo.entity2_index_, sEnt2_index);
            }
            if (null != sGeoIndex) {
                if (null != aliasLookup) {
                    EntityFeaturePojo alias = aliasLookup.getAliasMaster(sGeoIndex);
                    if (null != alias) {
                        sGeoIndex = alias.getIndex();
                        if (sGeoIndex.equalsIgnoreCase("discard")) {
                            if ((sEnt1_index != null) && (sEnt2_index != null)) {
                                sGeoIndex = null; // event/fact is still valid even without the geo
                            } // TESTED
                            else continue; // event/fact now meaningless
                        }
                        bTransformedByAlias = true;
                    }
                }
                json.put(AssociationPojo.geo_index_, sGeoIndex);
            } // TESTED

            // Whenever aliases are applied, need to re-check whether this is now a filter item
            // (ideally have a single code block for doing this in scoringutils_association)
            if (bTransformedByAlias) {
                if ((null == entTypeFilter) && (null != entityTypeFilterStrings)) {
                    entTypeFilter = new HashSet<String>();
                } // (only create the map once, and only if needed)
                boolean bKeep = recheckFiltersAfterTransform(json, aliasLookup, entityTypeFilterStrings, entTypeFilter);
                if (!bKeep) {
                    continue; // ie just bypass the facetList.add and the nFacetEl
                }
            } // TESTED

            facetList.add(json);
        }
        nFacetEl++;
    }
    return facetList;
} // TESTED (see cases above - difficult to make this test case standalone because of
public static void loadAggregationResults(
        ResponsePojo rp,
        Facets facets,
        Aggregations aggs,
        AggregationOutputPojo aggOutParams,
        ScoringUtils scoreStats,
        AliasLookupTable aliasLookup,
        String[] entityTypeFilterStrings,
        String[] assocVerbFilterStrings,
        AggregationUtils.GeoContainer extraAliasAggregatedGeo) {
    HashMap<String, List<? extends Object>> moments = null;

    if ((null != facets) && (null != facets.getFacets()))
        for (Map.Entry<String, Facet> facet : facets.getFacets().entrySet()) {
            // Geo
            if (facet.getKey().equals("geo")) {
                TermsFacet geoFacet = (TermsFacet) facet.getValue();
                Set<GeoAggregationPojo> geoCounts = null;
                int nHighestCount = -1;
                int nLowestCount = Integer.MAX_VALUE;
                // If we've got some geotags from the alias masters then start with them:
                if ((null != extraAliasAggregatedGeo) && (null != extraAliasAggregatedGeo.geotags)) {
                    geoCounts = extraAliasAggregatedGeo.geotags;
                    nHighestCount = (int) extraAliasAggregatedGeo.minCount;
                    nLowestCount = (int) extraAliasAggregatedGeo.maxCount;
                } else {
                    geoCounts = new TreeSet<GeoAggregationPojo>();
                }
                for (TermsFacet.Entry geo : geoFacet.getEntries()) {
                    String geohash = FacetUtils.getTerm(geo).substring(2);
                    double[] loc = GeoHashUtils.decode(geohash);
                    GeoAggregationPojo geoObj = new GeoAggregationPojo(loc[0], loc[1]);
                    geoObj.count = geo.getCount();
                    geoObj.type = GeoOntologyMapping.decodeOntologyCode(FacetUtils.getTerm(geo).charAt(0));
                    geoCounts.add(geoObj);
                    // (note this aggregates geo points whose decoded lat/longs are the same, which can
                    //  result in slightly fewer records than requested)
                    // (note the aggregation writes the aggregated count into geoObj.count)
                    if (geoObj.count > nHighestCount) { // (the counts can be modified by the add command above)
                        nHighestCount = geo.getCount();
                    }
                    if (geoObj.count < nLowestCount) {
                        nLowestCount = geo.getCount();
                    }
                }
                rp.setGeo(geoCounts, nHighestCount, nLowestCount);
            } // (TESTED)

            if (facet.getKey().equals("time")) {
                DateHistogramFacet timeFacet = (DateHistogramFacet) facet.getValue();
                rp.setTimes(timeFacet.getEntries(), QueryHandler.getInterval(aggOutParams.timesInterval, 'm'));
            } // (TESTED)

            if (facet.getKey().equals("events")) {
                TermsFacet eventsFacet = (TermsFacet) facet.getValue();
                rp.setEvents(parseEventAggregationOutput("Event", eventsFacet, scoreStats, aliasLookup,
                        entityTypeFilterStrings, assocVerbFilterStrings));
            }
            if (facet.getKey().equals("facts")) {
                TermsFacet factsFacet = (TermsFacet) facet.getValue();
                rp.setFacts(parseEventAggregationOutput("Fact", factsFacet, scoreStats, aliasLookup,
                        entityTypeFilterStrings, assocVerbFilterStrings));
            } // TESTED x2

            if (facet.getKey().equals("sourceTags")) {
                TermsFacet tagsFacet = (TermsFacet) facet.getValue();
                rp.setSourceMetaTags(tagsFacet.getEntries());
            }
            if (facet.getKey().equals("sourceTypes")) {
                TermsFacet typesFacet = (TermsFacet) facet.getValue();
                rp.setSourceMetaTypes(typesFacet.getEntries());
            }
            if (facet.getKey().equals("sourceKeys")) {
                TermsFacet keysFacet = (TermsFacet) facet.getValue();
                rp.setSources(keysFacet.getEntries());
            } // TESTED x3

            // Moments (basic functionality)
            if (facet.getKey().startsWith("moments.")) {
                DateHistogramFacet momentFacet = (DateHistogramFacet) facet.getValue();
                if (null == moments) {
                    moments = new HashMap<String, List<? extends Object>>();
                }
                moments.put(facet.getKey().substring(8), momentFacet.getEntries());
            } // TESTED
        } // (end loop over generated facets)

    if ((null != aggs) && (null != aggs.asMap()))
        for (Map.Entry<String, Aggregation> agg : aggs.asMap().entrySet()) {
            if (agg.getKey().equals("moments")) {
                if (null == moments) {
                    moments = new HashMap<String, List<? extends Object>>();
                }
                DateHistogram val = (DateHistogram) agg.getValue();
                // TODO (INF-2688): Finalize format
                BasicDBList dbl = new BasicDBList();
                for (DateHistogram.Bucket dateBucket : val.getBuckets()) {
                    if (dateBucket.getKeyAsNumber().longValue() > 0) {
                        BasicDBObject dataBucketDbo = new BasicDBObject();
                        dataBucketDbo.put("time", dateBucket.getKeyAsNumber().longValue());
                        dataBucketDbo.put("count", dateBucket.getDocCount());
                        for (Map.Entry<String, Aggregation> dateAggs : dateBucket.getAggregations().asMap().entrySet()) {
                            if (dateAggs.getKey().equals("geo")) {
                                BasicDBList dbl_geo = new BasicDBList();
                                MultiBucketsAggregation geoVal = (MultiBucketsAggregation) dateAggs.getValue();
                                long nHighestCount = Long.MIN_VALUE;
                                for (MultiBucketsAggregation.Bucket geoBucket : geoVal.getBuckets()) {
                                    String geohash = geoBucket.getKey().substring(2);
                                    double[] loc = GeoHashUtils.decode(geohash);
                                    GeoAggregationPojo geoObj = new GeoAggregationPojo(loc[0], loc[1]);
                                    BasicDBObject geoDbo = new BasicDBObject(4);
                                    geoDbo.put("lat", geoObj.lat);
                                    geoDbo.put("lon", geoObj.lon);
                                    geoDbo.put("count", geoBucket.getDocCount());
                                    geoDbo.put("type", GeoOntologyMapping.decodeOntologyCode(geoBucket.getKey().charAt(0)));
                                    dbl_geo.add(geoDbo);
                                    if (geoBucket.getDocCount() > nHighestCount) { // (the counts can be modified by the add command above)
                                        nHighestCount = geoBucket.getDocCount();
                                    }
                                }
                                dataBucketDbo.put("maxGeoCount", nHighestCount);
                                dataBucketDbo.put("geo", dbl_geo);
                            }
                        }
                        dbl.add(dataBucketDbo);
                    }
                }
                moments.put("times", dbl);
            } else {
                if (null == moments) {
                    moments = new HashMap<String, List<? extends Object>>();
                }
                DateHistogram val = (DateHistogram) agg.getValue();
                BasicDBList dbl = new BasicDBList();
                for (DateHistogram.Bucket dateBucket : val.getBuckets()) {
                    if (dateBucket.getKeyAsNumber().longValue() > 0) {
                        BasicDBObject dataBucketDbo = new BasicDBObject();
                        dataBucketDbo.put("time", dateBucket.getKeyAsNumber().longValue());
                        dataBucketDbo.put("count", dateBucket.getDocCount());
                        for (Map.Entry<String, Aggregation> dateAggs : dateBucket.getAggregations().asMap().entrySet()) {
                            if (dateAggs.getKey().equals("moments.assoc.nested")) {
                                BasicDBList dbl_assoc = new BasicDBList();
                                Nested nestedVal = (Nested) dateAggs.getValue();
                                MultiBucketsAggregation assocVal = (MultiBucketsAggregation) nestedVal.getAggregations().asList().get(0);
                                long nHighestCount = Long.MIN_VALUE;
                                for (MultiBucketsAggregation.Bucket assocBucket : assocVal.getBuckets()) {
                                    BasicDBObject assocDbo = new BasicDBObject(2);
                                    assocDbo.put("key", assocBucket.getKey());
                                    assocDbo.put("docCount", assocBucket.getDocCount());
                                    dbl_assoc.add(assocDbo);
                                    if (assocBucket.getDocCount() > nHighestCount) { // (the counts can be modified by the add command above)
                                        nHighestCount = assocBucket.getDocCount();
                                    }
                                }
                                dataBucketDbo.put("maxAssocCount", nHighestCount);
                                dataBucketDbo.put("assoc", dbl_assoc);
                            }
                        }
                        dbl.add(dataBucketDbo);
                    }
                    moments.put("assocs", dbl);
                }
            }
        } // (end loop over generated aggregations)

    if ((null != moments) && !moments.isEmpty()) {
        rp.setMoments(moments, QueryHandler.getInterval(aggOutParams.moments.timesInterval, 'm'));
    }
} // TESTED
@Test
@Slow
public void testTermFacet_stringFields() throws Throwable {
    prepareCreate("test")
            .addMapping(
                    "type1",
                    jsonBuilder()
                            .startObject()
                            .startObject("type1")
                            .startObject("properties")
                            .startObject("field1_paged")
                            .field("type", "string")
                            .field("index", "not_analyzed")
                            .startObject("fielddata")
                            .field("format", "paged_bytes")
                            .endObject()
                            .endObject()
                            .startObject("field1_fst")
                            .field("type", "string")
                            .field("index", "not_analyzed")
                            .startObject("fielddata")
                            .field("format", "fst")
                            .endObject()
                            .endObject()
                            .startObject("field2")
                            .field("type", "string")
                            .field("index", "not_analyzed")
                            .startObject("fielddata")
                            .field("format", "fst")
                            .endObject()
                            .endObject()
                            .startObject("q_field")
                            .field("type", "string")
                            .field("index", "not_analyzed")
                            .endObject()
                            .endObject()
                            .endObject()
                            .endObject())
            .execute()
            .actionGet();

    Random random = getRandom();
    int numOfQueryValues = 50;
    String[] queryValues = new String[numOfQueryValues];
    for (int i = 0; i < numOfQueryValues; i++) {
        queryValues[i] = randomAsciiOfLength(5);
    }

    Set<String> uniqueValuesSet = new HashSet<String>();
    int numOfVals = 400;
    for (int i = 0; i < numOfVals; i++) {
        uniqueValuesSet.add(randomAsciiOfLength(10));
    }
    String[] allUniqueFieldValues = uniqueValuesSet.toArray(new String[uniqueValuesSet.size()]);

    Set<String> allField1Values = new HashSet<String>();
    Set<String> allField1AndField2Values = new HashSet<String>();
    Map<String, Map<String, Integer>> queryValToField1FacetEntries = new HashMap<String, Map<String, Integer>>();
    Map<String, Map<String, Integer>> queryValToField1and2FacetEntries = new HashMap<String, Map<String, Integer>>();
    for (int i = 1; i <= numDocs(); i++) {
        int numField1Values = random.nextInt(17);
        Set<String> field1Values = new HashSet<String>(numField1Values);
        for (int j = 0; j <= numField1Values; j++) {
            boolean added = false;
            while (!added) {
                added = field1Values.add(allUniqueFieldValues[random.nextInt(numOfVals)]);
            }
        }
        allField1Values.addAll(field1Values);
        allField1AndField2Values.addAll(field1Values);
        String field2Val = allUniqueFieldValues[random.nextInt(numOfVals)];
        allField1AndField2Values.add(field2Val);
        String queryVal = queryValues[random.nextInt(numOfQueryValues)];
        client()
                .prepareIndex("test", "type1", Integer.toString(i))
                .setSource(
                        jsonBuilder()
                                .startObject()
                                .field("field1_paged", field1Values)
                                .field("field1_fst", field1Values)
                                .field("field2", field2Val)
                                .field("q_field", queryVal)
                                .endObject())
                .execute()
                .actionGet();

        if (random.nextInt(2000) == 854) {
            client().admin().indices().prepareFlush("test").execute().actionGet();
        }
        addControlValues(queryValToField1FacetEntries, field1Values, queryVal);
        addControlValues(queryValToField1and2FacetEntries, field1Values, queryVal);
        addControlValues(queryValToField1and2FacetEntries, field2Val, queryVal);
    }
    client().admin().indices().prepareRefresh().execute().actionGet();

    String[] facetFields = new String[] {"field1_paged", "field1_fst"};
    TermsFacet.ComparatorType[] compTypes = TermsFacet.ComparatorType.values();
    for (String facetField : facetFields) {
        for (String queryVal : queryValToField1FacetEntries.keySet()) {
            Set<String> allFieldValues;
            Map<String, Integer> queryControlFacets;
            TermsFacet.ComparatorType compType = compTypes[random.nextInt(compTypes.length)];
            TermsFacetBuilder termsFacetBuilder = FacetBuilders.termsFacet("facet1").order(compType);
            boolean useFields;
            if (random.nextInt(4) == 3) {
                useFields = true;
                queryControlFacets = queryValToField1and2FacetEntries.get(queryVal);
                allFieldValues = allField1AndField2Values;
                termsFacetBuilder.fields(facetField, "field2");
            } else {
                queryControlFacets = queryValToField1FacetEntries.get(queryVal);
                allFieldValues = allField1Values;
                useFields = false;
                termsFacetBuilder.field(facetField);
            }

            int size;
            if (numberOfShards() == 1
                    || compType == TermsFacet.ComparatorType.TERM
                    || compType == TermsFacet.ComparatorType.REVERSE_TERM) {
                size = random.nextInt(queryControlFacets.size());
            } else {
                size = allFieldValues.size();
            }
            termsFacetBuilder.size(size);

            if (random.nextBoolean()) {
                termsFacetBuilder.executionHint("map");
            }

            List<String> excludes = new ArrayList<String>();
            if (random.nextBoolean()) {
                int numExcludes = random.nextInt(5) + 1;
                List<String> facetValues = new ArrayList<String>(queryControlFacets.keySet());
                for (int i = 0; i < numExcludes; i++) {
                    excludes.add(facetValues.get(random.nextInt(facetValues.size())));
                }
                termsFacetBuilder.exclude(excludes.toArray());
            }

            String regex = null;
            if (random.nextBoolean()) {
                List<String> facetValues = new ArrayList<String>(queryControlFacets.keySet());
                regex = facetValues.get(random.nextInt(facetValues.size()));
                regex = "^" + regex.substring(0, regex.length() / 2) + ".*";
                termsFacetBuilder.regex(regex);
            }

            boolean allTerms = random.nextInt(10) == 3;
            termsFacetBuilder.allTerms(allTerms);

            SearchResponse response = client()
                    .prepareSearch("test")
                    .setQuery(QueryBuilders.termQuery("q_field", queryVal))
                    .addFacet(termsFacetBuilder)
                    .execute()
                    .actionGet();
            TermsFacet actualFacetEntries = response.getFacets().facet("facet1");
            List<Tuple<Text, Integer>> expectedFacetEntries = getExpectedFacetEntries(
                    allFieldValues, queryControlFacets, size, compType, excludes, regex, allTerms);

            String reason = String.format(
                    Locale.ROOT,
                    "query: [%s] field: [%s] size: [%d] order: [%s] all_terms: [%s] fields: [%s] regex: [%s] excludes: [%s]",
                    queryVal, facetField, size, compType, allTerms, useFields, regex, excludes);
            assertThat(reason, actualFacetEntries.getEntries().size(), equalTo(expectedFacetEntries.size()));
            for (int i = 0; i < expectedFacetEntries.size(); i++) {
                assertThat(
                        reason,
                        actualFacetEntries.getEntries().get(i).getTerm(),
                        equalTo(expectedFacetEntries.get(i).v1()));
                assertThat(
                        reason,
                        actualFacetEntries.getEntries().get(i).getCount(),
                        equalTo(expectedFacetEntries.get(i).v2()));
            }
        }
    }
}