/** * Prepare a document reconstructor. * * @param reader IndexReader to read from. * @param fieldNames if non-null or not empty, data will be collected only from these fields, * otherwise data will be collected from all fields * @param numTerms total number of terms in the index, or -1 if unknown (will be calculated) * @throws Exception */ public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) throws Exception { if (reader == null) { throw new Exception("IndexReader cannot be null."); } this.reader = reader; if (fieldNames == null || fieldNames.length == 0) { // collect fieldNames this.fieldNames = (String[]) reader.getFieldNames(FieldOption.ALL).toArray(new String[0]); } else { this.fieldNames = fieldNames; } if (numTerms == -1) { Fields fields = MultiFields.getFields(reader); numTerms = 0; FieldsEnum fe = fields.iterator(); String fld = null; while ((fld = fe.next()) != null) { TermsEnum te = fe.terms(); while (te.next() != null) { numTerms++; } } this.numTerms = numTerms; } deleted = MultiFields.getDeletedDocs(reader); }
private Fields generateTermVectors( Collection<GetField> getFields, boolean withOffsets, @Nullable Map<String, String> perFieldAnalyzer) throws IOException { /* store document in memory index */ MemoryIndex index = new MemoryIndex(withOffsets); for (GetField getField : getFields) { String field = getField.getName(); Analyzer analyzer = getAnalyzerAtField(field, perFieldAnalyzer); for (Object text : getField.getValues()) { index.addField(field, text.toString(), analyzer); } } /* and read vectors from it */ return MultiFields.getFields(index.createSearcher().getIndexReader()); }
public TermVectorResponse getTermVector(TermVectorRequest request, String concreteIndex) { final Engine.Searcher searcher = indexShard.acquireSearcher("term_vector"); IndexReader topLevelReader = searcher.reader(); final TermVectorResponse termVectorResponse = new TermVectorResponse(concreteIndex, request.type(), request.id()); final Term uidTerm = new Term(UidFieldMapper.NAME, Uid.createUidAsBytes(request.type(), request.id())); Engine.GetResult get = indexShard.get(new Engine.Get(request.realtime(), uidTerm)); boolean docFromTranslog = get.source() != null; AggregatedDfs dfs = null; /* fetched from translog is treated as an artificial document */ if (docFromTranslog) { request.doc(get.source().source, false); termVectorResponse.setDocVersion(get.version()); } /* handle potential wildcards in fields */ if (request.selectedFields() != null) { handleFieldWildcards(request); } try { Fields topLevelFields = MultiFields.getFields(topLevelReader); Versions.DocIdAndVersion docIdAndVersion = get.docIdAndVersion(); /* from an artificial document */ if (request.doc() != null) { Fields termVectorsByField = generateTermVectorsFromDoc(request, !docFromTranslog); // if no document indexed in shard, take the queried document itself for stats if (topLevelFields == null) { topLevelFields = termVectorsByField; } if (termVectorsByField != null && useDfs(request)) { dfs = getAggregatedDfs(termVectorsByField, request); } termVectorResponse.setFields( termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs); termVectorResponse.setExists(true); termVectorResponse.setArtificial(!docFromTranslog); } /* or from an existing document */ else if (docIdAndVersion != null) { // fields with stored term vectors Fields termVectorsByField = docIdAndVersion.context.reader().getTermVectors(docIdAndVersion.docId); Set<String> selectedFields = request.selectedFields(); // generate tvs for fields where analyzer is overridden if (selectedFields == null && request.perFieldAnalyzer() != null) { selectedFields = getFieldsToGenerate(request.perFieldAnalyzer(), termVectorsByField); } // fields without term vectors if (selectedFields != null) { termVectorsByField = addGeneratedTermVectors(get, termVectorsByField, request, selectedFields); } if (termVectorsByField != null && useDfs(request)) { dfs = getAggregatedDfs(termVectorsByField, request); } termVectorResponse.setFields( termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs); termVectorResponse.setDocVersion(docIdAndVersion.version); termVectorResponse.setExists(true); } else { termVectorResponse.setExists(false); } } catch (Throwable ex) { throw new ElasticsearchException("failed to execute term vector request", ex); } finally { searcher.close(); get.release(); } return termVectorResponse; }
public static void main(String[] args) throws IOException { IndexReader reader = null; /* * Opening the index first simplifies the processing of the * rest of the command line arguments. */ for (int i = 0; i < args.length; i++) { if (("-index".equals(args[i])) && ((i + 1) < args.length)) { reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1]))); if (reader == null) { System.err.println("Error: Can't open index " + args[i + 1]); System.exit(1); } ; break; } ; } ; if (reader == null) { System.err.println(usage); System.exit(1); } ; /* * Process the command line arguments sequentially. */ for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { /* * Handled in the previous loop, so just skip the argument. */ i++; } else if ("-list-edocid".equals(args[i])) { System.out.println("-list-edocid:"); if ((i + 1) >= args.length) { System.out.println(usage); break; } ; Document d = reader.document(Integer.parseInt(args[i + 1])); System.out.println( "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId")); i += 1; } else if ("-list-docids".equals(args[i])) { System.out.println("-list-docids:"); for (int j = 0; j < reader.numDocs(); j++) { Document d = reader.document(j); System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId")); } ; } else if ("-list-fields".equals(args[i])) { Fields fields = MultiFields.getFields(reader); System.out.print("\nNumber of fields: "); if (fields == null) System.out.println("0"); else { System.out.println(fields.size()); Iterator<String> is = fields.iterator(); while (is.hasNext()) { System.out.println("\t" + is.next()); } ; } ; } else if ("-list-postings".equals(args[i])) { if ((i + 2) >= args.length) { System.out.println(usage); break; } ; listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE); i += 2; } else if ("-list-postings-sample".equals(args[i])) { if ((i + 2) >= args.length) { System.out.println(usage); break; } ; listPostings(reader, args[i + 1], args[i + 2], 5); i += 2; } else if ("-list-stats".equals(args[i])) { System.out.println("Corpus statistics:"); System.out.println("\tnumdocs\t\t" + reader.numDocs()); System.out.println( "\turl:\t" + "\tnumdocs=" + reader.getDocCount("url") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("url") + "\tavglen=" + reader.getSumTotalTermFreq("url") / (float) reader.getDocCount("url")); System.out.println( "\tkeywords:" + "\tnumdocs=" + reader.getDocCount("keywords") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("keywords") + "\tavglen=" + reader.getSumTotalTermFreq("keywords") / (float) reader.getDocCount("keywords")); System.out.println( "\ttitle:\t" + "\tnumdocs=" + reader.getDocCount("title") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("title") + "\tavglen=" + reader.getSumTotalTermFreq("title") / (float) reader.getDocCount("title")); System.out.println( "\tbody:\t" + "\tnumdocs=" + reader.getDocCount("body") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("body") + "\tavglen=" + reader.getSumTotalTermFreq("body") / (float) reader.getDocCount("body")); System.out.println( "\tinlink:\t" + "\tnumdocs=" + reader.getDocCount("inlink") + "\tsumTotalTF=" + reader.getSumTotalTermFreq("inlink") + "\tavglen=" + reader.getSumTotalTermFreq("inlink") / (float) reader.getDocCount("inlink")); } else if ("-list-terms".equals(args[i])) { if ((i + 1) >= args.length) { System.out.println(usage); break; } ; listTermDictionary(reader, args[i + 1]); i += 1; } else if ("-list-termvector".equals(args[i])) { if ((i + 1) >= args.length) { System.out.println(usage); break; } ; listTermVectors(reader, args[i + 1]); i += 1; } else if ("-list-termvector-field".equals(args[i])) { if ((i + 2) >= args.length) { System.out.println(usage); break; } ; listTermVectorField(reader, args[i + 1], args[i + 2]); i += 2; } else System.err.println("\nWarning: Unknown argument " + args[i] + " ignored."); } ; /* * Close the index and exit gracefully. */ reader.close(); }