/** * Find words for a more-like-this query former. * * @param docNum the id of the lucene document from which to find terms */ private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException { Map<String, Int> termFreqMap = new HashMap<>(); for (String fieldName : fieldNames) { final Fields vectors = ir.getTermVectors(docNum); final Terms vector; if (vectors != null) { vector = vectors.terms(fieldName); } else { vector = null; } // field does not store term vector info if (vector == null) { Document d = ir.document(docNum); IndexableField fields[] = d.getFields(fieldName); for (IndexableField field : fields) { final String stringValue = field.stringValue(); if (stringValue != null) { addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName); } } } else { addTermFrequencies(termFreqMap, vector, fieldName); } } return createQueue(termFreqMap); }
/** * Assert that the {@code scoreType} operates as expected and parents are found in the expected * order. * * <p>This will use the test index's parent/child types to create parents with multiple children. * Each child will have a randomly generated scored stored in {@link #CHILD_SCORE_NAME}, which is * used to score based on the {@code scoreType} by using a {@link MockScorer} to determine the * expected scores. * * @param scoreType The score type to use within the query to score parents relative to their * children. * @throws IOException if any unexpected error occurs */ private void assertScoreType(ScoreType scoreType) throws IOException { SearchContext context = SearchContext.current(); Directory directory = newDirectory(); IndexWriter writer = new IndexWriter(directory, newIndexWriterConfig(new MockAnalyzer(random()))); // calculates the expected score per parent MockScorer scorer = new MockScorer(scoreType); scorer.scores = new FloatArrayList(10); // number of parents to generate int parentDocs = scaledRandomIntBetween(2, 10); // unique child ID int childDocId = 0; // Parent ID to expected score Map<String, Float> parentScores = new TreeMap<>(); // Add a few random parents to ensure that the children's score is appropriately taken into // account for (int parentDocId = 0; parentDocId < parentDocs; ++parentDocId) { String parent = Integer.toString(parentDocId); // Create the parent Document parentDocument = new Document(); parentDocument.add( new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES)); parentDocument.add(new StringField(IdFieldMapper.NAME, parent, Field.Store.YES)); parentDocument.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO)); // add the parent to the index writer.addDocument(parentDocument); int numChildDocs = scaledRandomIntBetween(1, 10); // forget any parent's previous scores scorer.scores.clear(); // associate children with the parent for (int i = 0; i < numChildDocs; ++i) { int 
childScore = random().nextInt(128); Document childDocument = new Document(); childDocument.add( new StringField( UidFieldMapper.NAME, Uid.createUid("child", Integer.toString(childDocId++)), Field.Store.NO)); childDocument.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO)); // parent association: childDocument.add( new StringField( ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO)); childDocument.add(new DoubleField(CHILD_SCORE_NAME, childScore, Field.Store.NO)); // remember the score to be calculated scorer.scores.add(childScore); // add the associated child to the index writer.addDocument(childDocument); } // this score that should be returned for this parent parentScores.put(parent, scorer.score()); } writer.commit(); IndexReader reader = DirectoryReader.open(writer, true); IndexSearcher searcher = new IndexSearcher(reader); // setup to read the parent/child map Engine.Searcher engineSearcher = new Engine.Searcher(ChildrenQueryTests.class.getSimpleName(), searcher); ((TestSearchContext) context).setSearcher(new ContextIndexSearcher(context, engineSearcher)); // child query that returns the score as the value of "childScore" for each child document, with // the parent's score determined by the score type QueryBuilder childQueryBuilder = functionScoreQuery(typeFilter("child")) .add(new FieldValueFactorFunctionBuilder(CHILD_SCORE_NAME)); QueryBuilder queryBuilder = hasChildQuery("child", childQueryBuilder) .scoreType(scoreType.name().toLowerCase(Locale.ENGLISH)) .setShortCircuitCutoff(parentDocs); // Perform the search for the documents using the selected score type TopDocs docs = searcher.search(parseQuery(queryBuilder), parentDocs); assertThat("Expected all parents", docs.totalHits, is(parentDocs)); // score should be descending (just a sanity check) float topScore = docs.scoreDocs[0].score; // ensure each score is returned as expected for (int i = 0; i < parentDocs; ++i) { ScoreDoc scoreDoc = docs.scoreDocs[i]; // get the 
ID from the document to get its expected score; remove it so we cannot double-count // it float score = parentScores.remove(reader.document(scoreDoc.doc).get(IdFieldMapper.NAME)); // expect exact match assertThat("Unexpected score", scoreDoc.score, is(score)); assertThat("Not descending", score, lessThanOrEqualTo(topScore)); // it had better keep descending topScore = score; } reader.close(); writer.close(); directory.close(); }
/**
 * Command-line entry point: opens the index named by {@code -index <path>} and then executes
 * the remaining arguments as commands, in the order given.
 *
 * <p>Recognized commands: {@code -list-edocid <docid>}, {@code -list-docids},
 * {@code -list-fields}, {@code -list-postings <a> <b>}, {@code -list-postings-sample <a> <b>},
 * {@code -list-stats}, {@code -list-terms <a>}, {@code -list-termvector <a>} and
 * {@code -list-termvector-field <a> <b>}. A command that is missing its operands prints the
 * usage text and stops further processing; an unknown argument produces a warning and is
 * skipped.
 *
 * <p>If no index is given, or it cannot be opened, a message is written to standard error and
 * the JVM exits with status 1.
 *
 * @param args the command-line arguments
 * @throws IOException if reading from the opened index fails
 */
public static void main(String[] args) throws IOException {
  IndexReader reader = null;

  /*
   * Opening the index first simplifies the processing of the
   * rest of the command line arguments.
   */
  for (int i = 0; i < args.length; i++) {
    if ("-index".equals(args[i]) && (i + 1) < args.length) {
      try {
        reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1])));
      } catch (IOException e) {
        // open() reports failure by throwing, never by returning null, so the diagnostic
        // has to be produced here to be reachable at all.
        System.err.println("Error: Can't open index " + args[i + 1]);
        System.err.println("Cause: " + e);
        System.exit(1);
      }
      break;
    }
  }

  if (reader == null) {
    System.err.println(usage);
    System.exit(1);
  }

  /*
   * Process the command line arguments sequentially. The reader is closed in the finally
   * block so that it is released even when a command throws.
   */
  try {
    for (int i = 0; i < args.length; i++) {
      if ("-index".equals(args[i])) {
        /*
         * Handled in the previous loop, so just skip the argument.
         */
        i++;
      } else if ("-list-edocid".equals(args[i])) {
        System.out.println("-list-edocid:");
        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        Document d = reader.document(Integer.parseInt(args[i + 1]));
        System.out.println(
            "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId"));
        i += 1;
      } else if ("-list-docids".equals(args[i])) {
        System.out.println("-list-docids:");
        // NOTE(review): iterates ids below numDocs(); presumably the index has no deleted
        // documents, otherwise ids up to maxDoc() would be skipped -- confirm.
        for (int j = 0; j < reader.numDocs(); j++) {
          Document d = reader.document(j);
          System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId"));
        }
      } else if ("-list-fields".equals(args[i])) {
        Fields fields = MultiFields.getFields(reader);
        System.out.print("\nNumber of fields: ");
        if (fields == null) {
          System.out.println("0");
        } else {
          System.out.println(fields.size());
          for (String fieldName : fields) {
            System.out.println("\t" + fieldName);
          }
        }
      } else if ("-list-postings".equals(args[i])) {
        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE);
        i += 2;
      } else if ("-list-postings-sample".equals(args[i])) {
        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        listPostings(reader, args[i + 1], args[i + 2], 5);
        i += 2;
      } else if ("-list-stats".equals(args[i])) {
        System.out.println("Corpus statistics:");
        System.out.println("\tnumdocs\t\t" + reader.numDocs());
        // Labels carry their own padding so the columns line up exactly as before.
        printFieldStatistics(reader, "\turl:\t", "url");
        printFieldStatistics(reader, "\tkeywords:", "keywords");
        printFieldStatistics(reader, "\ttitle:\t", "title");
        printFieldStatistics(reader, "\tbody:\t", "body");
        printFieldStatistics(reader, "\tinlink:\t", "inlink");
      } else if ("-list-terms".equals(args[i])) {
        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        listTermDictionary(reader, args[i + 1]);
        i += 1;
      } else if ("-list-termvector".equals(args[i])) {
        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        listTermVectors(reader, args[i + 1]);
        i += 1;
      } else if ("-list-termvector-field".equals(args[i])) {
        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        listTermVectorField(reader, args[i + 1], args[i + 2]);
        i += 2;
      } else {
        System.err.println("\nWarning: Unknown argument " + args[i] + " ignored.");
      }
    }
  } finally {
    /*
     * Close the index and exit gracefully.
     */
    reader.close();
  }
}

/**
 * Prints one line of corpus statistics for a single field: the number of documents reported
 * for the field, its summed total term frequency, and the quotient of the two as "avglen".
 *
 * @param reader the open index reader to query
 * @param label the already-padded label that starts the output line
 * @param field the name of the field whose statistics are printed
 * @throws IOException if the reader fails while computing the statistics
 */
private static void printFieldStatistics(IndexReader reader, String label, String field)
    throws IOException {
  int docCount = reader.getDocCount(field);
  long sumTotalTermFreq = reader.getSumTotalTermFreq(field);
  // Float division: a zero document count yields NaN/Infinity rather than an exception.
  System.out.println(
      label
          + "\tnumdocs="
          + docCount
          + "\tsumTotalTF="
          + sumTotalTermFreq
          + "\tavglen="
          + sumTotalTermFreq / (float) docCount);
}