/** * Find words for a more-like-this query former. * * @param docNum the id of the lucene document from which to find terms */ private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException { Map<String, Int> termFreqMap = new HashMap<>(); for (String fieldName : fieldNames) { final Fields vectors = ir.getTermVectors(docNum); final Terms vector; if (vectors != null) { vector = vectors.terms(fieldName); } else { vector = null; } // field does not store term vector info if (vector == null) { Document d = ir.document(docNum); IndexableField fields[] = d.getFields(fieldName); for (IndexableField field : fields) { final String stringValue = field.stringValue(); if (stringValue != null) { addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName); } } } else { addTermFrequencies(termFreqMap, vector, fieldName); } } return createQueue(termFreqMap); }
/* * listTermVectors displays the term vectors for all of the fields * in a document in an index (specified by reader). */ static void listTermVectors(IndexReader reader, String docidString) throws IOException { System.out.println("\nTermVector: docid " + docidString); int docid = Integer.parseInt(docidString); if ((docid < 0) || (docid >= reader.numDocs())) { System.out.println("ERROR: " + docidString + " is a bad document id."); return; } ; /* * Iterate over the fields in this document. */ Fields fields = reader.getTermVectors(docid); Iterator<String> fieldIterator = fields.iterator(); while (fieldIterator.hasNext()) { String fieldName = fieldIterator.next(); System.out.println(" Field: " + fieldName); Terms terms = fields.terms(fieldName); termVectorDisplay(terms); } ; }
/** * Return a query that will return docs like the passed Fields. * * @return a query that will return docs like the passed Fields. */ public Query like(Fields... likeFields) throws IOException { // get all field names Set<String> fieldNames = new HashSet<>(); for (Fields fields : likeFields) { for (String fieldName : fields) { fieldNames.add(fieldName); } } // term selection is per field, then appended to a single boolean query BooleanQuery bq = new BooleanQuery(); for (String fieldName : fieldNames) { Map<String, Int> termFreqMap = new HashMap<>(); for (Fields fields : likeFields) { Terms vector = fields.terms(fieldName); if (vector != null) { addTermFrequencies(termFreqMap, vector, fieldName); } } addToQuery(createQueue(termFreqMap, fieldName), bq); } return bq; }
public static void main(String[] args) throws IOException { IndexReader reader = null; /* * Opening the index first simplifies the processing of the * rest of the command line arguments. */ for (int i = 0; i < args.length; i++) { if (("-index".equals(args[i])) && ((i + 1) < args.length)) { reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1]))); if (reader == null) { System.err.println("Error: Can't open index " + args[i + 1]); System.exit(1); } ; break; } ; } ; if (reader == null) { System.err.println(usage); System.exit(1); } ; /* * Process the command line arguments sequentially. */ for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { /* * Handled in the previous loop, so just skip the argument. */ i++; } else if ("-list-edocid".equals(args[i])) { System.out.println("-list-edocid:"); if ((i + 1) >= args.length) { System.out.println(usage); break; } ; Document d = reader.document(Integer.parseInt(args[i + 1])); System.out.println( "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId")); i += 1; } else if ("-list-docids".equals(args[i])) { System.out.println("-list-docids:"); for (int j = 0; j < reader.numDocs(); j++) { Document d = reader.document(j); System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId")); } ; } else if ("-list-fields".equals(args[i])) { Fields fields = MultiFields.getFields(reader); System.out.print("\nNumber of fields: "); if (fields == null) System.out.println("0"); else { System.out.println(fields.size()); Iterator<String> is = fields.iterator(); while (is.hasNext()) { System.out.println("\t" + is.next()); } ; } ; } else if ("-list-postings".equals(args[i])) { if ((i + 2) >= args.length) { System.out.println(usage); break; } ; listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE); i += 2; } else if ("-list-postings-sample".equals(args[i])) { if ((i + 2) >= args.length) { System.out.println(usage); break; } ; listPostings(reader, args[i + 1], args[i + 2], 5); i += 2; } else if ("-list-stats".equals(args[i])) { System.out.println("Corpus statistics:"); System.out.println("\tnumdocs\t\t" + reader.numDocs()); System.out.println( "\turl:\t" + "\tnumdocs=" + reader.getDocCount("url") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("url") + "\tavglen=" + reader.getSumTotalTermFreq("url") / (float) reader.getDocCount("url")); System.out.println( "\tkeywords:" + "\tnumdocs=" + reader.getDocCount("keywords") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("keywords") + "\tavglen=" + reader.getSumTotalTermFreq("keywords") / (float) reader.getDocCount("keywords")); System.out.println( "\ttitle:\t" + "\tnumdocs=" + reader.getDocCount("title") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("title") + "\tavglen=" + reader.getSumTotalTermFreq("title") / (float) reader.getDocCount("title")); System.out.println( "\tbody:\t" + "\tnumdocs=" + reader.getDocCount("body") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("body") + "\tavglen=" + reader.getSumTotalTermFreq("body") / (float) reader.getDocCount("body")); System.out.println( "\tinlink:\t" + "\tnumdocs=" + reader.getDocCount("inlink") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("inlink") + "\tavglen=" + reader.getSumTotalTermFreq("inlink") / (float) reader.getDocCount("inlink")); } else if ("-list-terms".equals(args[i])) { if ((i + 1) >= args.length) { System.out.println(usage); break; } ; listTermDictionary(reader, args[i + 1]); i += 1; } else if ("-list-termvector".equals(args[i])) { if ((i + 1) >= args.length) { System.out.println(usage); break; } ; listTermVectors(reader, args[i + 1]); i += 1; } else if ("-list-termvector-field".equals(args[i])) { if ((i + 2) >= args.length) { System.out.println(usage); break; } ; listTermVectorField(reader, args[i + 1], args[i + 2]); i += 2; } else System.err.println("\nWarning: Unknown argument " + args[i] + " ignored."); } ; /* * Close the index and exit gracefully. */ reader.close(); }