/** * Find words for a more-like-this query former. * * @param docNum the id of the lucene document from which to find terms */ private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException { Map<String, Int> termFreqMap = new HashMap<>(); for (String fieldName : fieldNames) { final Fields vectors = ir.getTermVectors(docNum); final Terms vector; if (vectors != null) { vector = vectors.terms(fieldName); } else { vector = null; } // field does not store term vector info if (vector == null) { StoredDocument d = ir.document(docNum); StorableField[] fields = d.getFields(fieldName); for (StorableField field : fields) { final String stringValue = field.stringValue(); if (stringValue != null) { addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName); } } } else { addTermFrequencies(termFreqMap, vector); } } return createQueue(termFreqMap); }
/**
 * With {@code KM_USE_LAST_OCCURRENCE}, every hit must be the last document enumerated by the
 * postings of that hit's key value.
 */
public void testKeepsLastFilter() throws Throwable {
  final DuplicateFilter filter = new DuplicateFilter(KEY_FIELD);
  filter.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);

  final ScoreDoc[] hits = searcher.search(tq, filter, 1000).scoreDocs;
  assertTrue("Filtered searching should have found some matches", hits.length > 0);

  for (int i = 0; i < hits.length; i++) {
    final int hitDoc = hits[i].doc;
    final String url = searcher.doc(hitDoc).get(KEY_FIELD);

    // Walk all documents carrying this key and remember the final one.
    final DocsEnum postings =
        _TestUtil.docs(
            random(),
            reader,
            KEY_FIELD,
            new BytesRef(url),
            MultiFields.getLiveDocs(reader),
            null,
            0);
    int lastDoc = 0;
    while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      lastDoc = postings.docID();
    }

    assertEquals("Duplicate urls should return last doc", lastDoc, hitDoc);
  }
}
/**
 * A {@code DuplicateFilter} left in its default configuration must return each key value at
 * most once.
 *
 * <p>The result set is also required to be non-empty, as in the other filter tests, so that the
 * duplicate check cannot pass vacuously when the search returns nothing.
 */
public void testDefaultFilter() throws Throwable {
  DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
  HashSet<String> results = new HashSet<String>();
  ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
  assertTrue("Filtered searching should have found some matches", hits.length > 0);

  for (ScoreDoc hit : hits) {
    StoredDocument d = searcher.doc(hit.doc);
    String url = d.get(KEY_FIELD);
    assertFalse("No duplicate urls should be returned", results.contains(url));
    results.add(url);
  }
}
/** * Match up search results and add corresponding data for each result (if we have query results * available). */ @Override @SuppressWarnings({"rawtypes", "unchecked"}) public void process(ResponseBuilder rb) throws IOException { SolrParams params = rb.req.getParams(); if (!params.getBool(getName(), false)) { return; } XJoinResults<?> results = (XJoinResults<?>) rb.req.getContext().get(getResultsTag()); if (results == null || rb.getResults() == null) { return; } // general results FieldAppender appender = new FieldAppender( (String) params.get(getName() + "." + XJoinParameters.RESULTS_FIELD_LIST, "*")); NamedList general = appender.addNamedList(rb.rsp.getValues(), getName(), results); // per join id results FieldAppender docAppender = new FieldAppender( (String) params.get(getName() + "." + XJoinParameters.DOC_FIELD_LIST, "*")); Set<String> joinFields = new HashSet<>(); joinFields.add(joinField); List<String> joinIds = new ArrayList<>(); for (Iterator<Integer> it = docIterator(rb); it.hasNext(); ) { StoredDocument doc = rb.req.getSearcher().doc(it.next(), joinFields); for (String joinId : doc.getValues(joinField)) { if (!joinIds.contains(joinId)) { joinIds.add(joinId); } } } for (String joinId : joinIds) { Object object = results.getResult(joinId); if (object == null) continue; NamedList external = new NamedList<>(); general.add("external", external); external.add("joinId", joinId); if (object instanceof Iterable) { for (Object item : (Iterable) object) { docAppender.addNamedList(external, "doc", item); } } else { docAppender.addNamedList(external, "doc", object); } } }
/**
 * Without any filter the search must return hits, and at least one key value must occur more
 * than once among them.
 */
public void testNoFilter() throws Throwable {
  final HashSet<String> seenUrls = new HashSet<String>();
  final ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs;
  assertTrue("Default searching should have found some matches", hits.length > 0);

  boolean sawDuplicate = false;
  for (int i = 0; i < hits.length; i++) {
    final String url = searcher.doc(hits[i].doc).get(KEY_FIELD);
    // add() returns false when the url was already in the set
    if (!seenUrls.add(url)) {
      sawDuplicate = true;
    }
  }

  assertTrue("Default searching should have found duplicate urls", sawDuplicate);
}
private static SolrDocument toSolrDoc(SolrInputDocument sdoc, IndexSchema schema) { // TODO: do something more performant than this double conversion Document doc = DocumentBuilder.toDocument(sdoc, schema); // copy the stored fields only StoredDocument out = new StoredDocument(); for (IndexableField f : doc.getFields()) { if (f.fieldType().stored()) { out.add((StorableField) f); } } return toSolrDoc(out, schema); }
/**
 * With {@code PM_FAST_INVALIDATION} the filtered search must return hits, no key value may
 * repeat, and exactly two distinct key values are expected.
 */
public void testFastFilter() throws Throwable {
  final DuplicateFilter filter = new DuplicateFilter(KEY_FIELD);
  filter.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);

  final HashSet<String> seenUrls = new HashSet<String>();
  final ScoreDoc[] hits = searcher.search(tq, filter, 1000).scoreDocs;
  assertTrue("Filtered searching should have found some matches", hits.length > 0);

  for (int i = 0; i < hits.length; i++) {
    final String url = searcher.doc(hits[i].doc).get(KEY_FIELD);
    assertFalse("No duplicate urls should be returned", seenUrls.contains(url));
    seenUrls.add(url);
  }

  assertEquals("Two urls found", 2, seenUrls.size());
}
/**
 * Serializes the top documents of a query command result into a {@code NamedList}.
 *
 * <p>The list holds {@code "matches"}, {@code "totalHits"}, {@code "maxScore"} (only when the
 * grouping spec needs scores) and {@code "documents"}. Each document entry holds the external
 * form of its unique key as {@code "id"}, its {@code "score"} (only when scores are needed) and,
 * for {@code FieldDoc} hits, the marshalled {@code "sortValues"}.
 *
 * @param result the query command result to serialize
 * @return the serialized representation
 * @throws IOException propagated from {@code retrieveDocument}
 */
protected NamedList serializeTopDocs(QueryCommandResult result) throws IOException {
  NamedList<Object> queryResult = new NamedList<>();
  queryResult.add("matches", result.getMatches());
  queryResult.add("totalHits", result.getTopDocs().totalHits);

  // Whether scores are needed does not change while serializing, so look it up once.
  final boolean needScore = rb.getGroupingSpec().isNeedScore();
  if (needScore) {
    queryResult.add("maxScore", result.getTopDocs().getMaxScore());
  }

  List<NamedList<Object>> documents = new ArrayList<>();
  queryResult.add("documents", documents);

  final IndexSchema schema = rb.req.getSearcher().getSchema();
  SchemaField uniqueField = schema.getUniqueKeyField();
  for (ScoreDoc scoreDoc : result.getTopDocs().scoreDocs) {
    NamedList<Object> document = new NamedList<>();
    documents.add(document);

    StoredDocument doc = retrieveDocument(uniqueField, scoreDoc.doc);
    document.add("id", uniqueField.getType().toExternal(doc.getField(uniqueField.getName())));
    if (needScore) {
      document.add("score", scoreDoc.score);
    }
    if (!(scoreDoc instanceof FieldDoc)) {
      // only FieldDoc hits carry sort values
      continue;
    }

    FieldDoc fieldDoc = (FieldDoc) scoreDoc;
    Sort groupSort = rb.getGroupingSpec().getGroupSort();
    Object[] convertedSortValues = new Object[fieldDoc.fields.length];
    for (int j = 0; j < fieldDoc.fields.length; j++) {
      Object sortValue = fieldDoc.fields[j];
      String sortFieldName = groupSort.getSort()[j].getField();
      SchemaField field = sortFieldName != null ? schema.getFieldOrNull(sortFieldName) : null;
      // Sort values are marshalled only when the sort field is known to the schema.
      if (field != null && sortValue != null) {
        sortValue = field.getType().marshalSortValue(sortValue);
      }
      convertedSortValues[j] = sortValue;
    }
    document.add("sortValues", convertedSortValues);
  }

  return queryResult;
}
/**
 * Indexes documents whose suggestion key ends in the same number that is stored in
 * {@code int_field}, then checks that every suggestion's doc id resolves to the document holding
 * that number.
 */
@Test
public void testReturnedDocID() throws Exception {
  final Analyzer analyzer = new MockAnalyzer(random());
  final RandomIndexWriter writer =
      new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));

  final int num = Math.min(1000, atLeast(10));
  int next = 0;
  while (next < num) {
    final Document document = new Document();
    document.add(new SuggestField("suggest_field", "abc_" + next, num));
    document.add(new StoredField("int_field", next));
    writer.addDocument(document);

    // commit at random points
    if (random().nextBoolean()) {
      writer.commit();
    }
    next++;
  }

  final DirectoryReader reader = writer.getReader();
  final SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader);
  final PrefixCompletionQuery query =
      new PrefixCompletionQuery(analyzer, new Term("suggest_field", "abc_"));
  final TopSuggestDocs suggest = indexSearcher.suggest(query, num);
  assertEquals(num, suggest.totalHits);

  for (SuggestScoreDoc suggestion : suggest.scoreLookupDocs()) {
    final String key = suggestion.key.toString();
    assertTrue(key.startsWith("abc_"));

    // the numeric suffix of the key is the value stored with the document
    final int fieldValue = Integer.parseInt(key.substring(4));
    final StoredDocument doc = reader.document(suggestion.doc);
    assertEquals(doc.getField("int_field").numericValue().intValue(), fieldValue);
  }

  reader.close();
  writer.close();
}
private static SolrInputDocument toSolrInputDocument(StoredDocument doc, IndexSchema schema) { SolrInputDocument out = new SolrInputDocument(); for (StorableField f : doc.getFields()) { String fname = f.name(); SchemaField sf = schema.getFieldOrNull(f.name()); Object val = null; if (sf != null) { if (!sf.stored() || schema.isCopyFieldTarget(sf)) continue; val = sf.getType().toObject(f); // object or external string? } else { val = f.stringValue(); if (val == null) val = f.numericValue(); if (val == null) val = f.binaryValue(); if (val == null) val = f; } // todo: how to handle targets of copy fields (including polyfield sub-fields)? out.addField(fname, val); } return out; }
private static SolrDocument toSolrDoc(StoredDocument doc, IndexSchema schema) { SolrDocument out = new SolrDocument(); for (StorableField f : doc.getFields()) { // Make sure multivalued fields are represented as lists Object existing = out.get(f.name()); if (existing == null) { SchemaField sf = schema.getFieldOrNull(f.name()); // don't return copyField targets if (sf != null && schema.isCopyFieldTarget(sf)) continue; if (sf != null && sf.multiValued()) { List<Object> vals = new ArrayList<Object>(); vals.add(f); out.setField(f.name(), vals); } else { out.setField(f.name(), f); } } else { out.addField(f.name(), f); } } return out; }
/**
 * Serializes top groups into a {@code NamedList}.
 *
 * <p>The list holds {@code "totalGroupedHitCount"}, {@code "totalHitCount"}, {@code
 * "totalGroupCount"} (only when non-null) and one entry per group, keyed by the readable group
 * value (a {@code null} key when the group value is null). Each group entry holds {@code
 * "totalHits"}, {@code "maxScore"} (only when not NaN) and {@code "documents"}. Each document
 * holds the external form of its unique key as {@code "id"}, its {@code "score"} (only when not
 * NaN) and, for {@code FieldDoc} hits, the marshalled {@code "sortValues"}.
 *
 * @param data the top groups to serialize
 * @param groupField the schema field used to make group values readable
 * @return the serialized representation
 * @throws IOException propagated from {@code retrieveDocument}
 */
protected NamedList serializeTopGroups(TopGroups<BytesRef> data, SchemaField groupField)
    throws IOException {
  NamedList<Object> result = new NamedList<>();
  result.add("totalGroupedHitCount", data.totalGroupedHitCount);
  result.add("totalHitCount", data.totalHitCount);
  if (data.totalGroupCount != null) {
    result.add("totalGroupCount", data.totalGroupCount);
  }

  final IndexSchema schema = rb.req.getSearcher().getSchema();
  SchemaField uniqueField = schema.getUniqueKeyField();
  for (GroupDocs<BytesRef> searchGroup : data.groups) {
    NamedList<Object> groupResult = new NamedList<>();
    groupResult.add("totalHits", searchGroup.totalHits);
    if (!Float.isNaN(searchGroup.maxScore)) {
      groupResult.add("maxScore", searchGroup.maxScore);
    }

    List<NamedList<Object>> documents = new ArrayList<>();
    for (int i = 0; i < searchGroup.scoreDocs.length; i++) {
      NamedList<Object> document = new NamedList<>();
      documents.add(document);

      StoredDocument doc = retrieveDocument(uniqueField, searchGroup.scoreDocs[i].doc);
      document.add("id", uniqueField.getType().toExternal(doc.getField(uniqueField.getName())));
      if (!Float.isNaN(searchGroup.scoreDocs[i].score)) {
        document.add("score", searchGroup.scoreDocs[i].score);
      }
      if (!(searchGroup.scoreDocs[i] instanceof FieldDoc)) {
        // only FieldDoc hits carry sort values
        continue;
      }

      FieldDoc fieldDoc = (FieldDoc) searchGroup.scoreDocs[i];
      // The within-group sort is the same for every sort value of this hit.
      Sort sortWithinGroup = rb.getGroupingSpec().getSortWithinGroup();
      Object[] convertedSortValues = new Object[fieldDoc.fields.length];
      for (int j = 0; j < fieldDoc.fields.length; j++) {
        Object sortValue = fieldDoc.fields[j];
        String sortFieldName = sortWithinGroup.getSort()[j].getField();
        SchemaField field = sortFieldName != null ? schema.getFieldOrNull(sortFieldName) : null;
        // Sort values are marshalled only when the sort field is known to the schema.
        if (field != null && sortValue != null) {
          sortValue = field.getType().marshalSortValue(sortValue);
        }
        convertedSortValues[j] = sortValue;
      }
      document.add("sortValues", convertedSortValues);
    }
    groupResult.add("documents", documents);

    String groupValue =
        searchGroup.groupValue != null
            ? groupField.getType().indexedToReadable(searchGroup.groupValue.utf8ToString())
            : null;
    result.add(groupValue, groupResult);
  }

  return result;
}