private static SimpleOrderedMap<Object> getDocumentFieldsInfo( Document doc, int docId, IndexReader reader, IndexSchema schema) throws IOException { SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>(); for (Object o : doc.getFields()) { Fieldable fieldable = (Fieldable) o; SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>(); SchemaField sfield = schema.getFieldOrNull(fieldable.name()); FieldType ftype = (sfield == null) ? null : sfield.getType(); f.add("type", (ftype == null) ? null : ftype.getTypeName()); f.add("schema", getFieldFlags(sfield)); f.add("flags", getFieldFlags(fieldable)); Term t = new Term( fieldable.name(), ftype != null ? ftype.storedToIndexed(fieldable) : fieldable.stringValue()); f.add("value", (ftype == null) ? null : ftype.toExternal(fieldable)); // TODO: this really should be "stored" f.add("internal", fieldable.stringValue()); // may be a binary number byte[] arr = fieldable.getBinaryValue(); if (arr != null) { f.add("binary", Base64.byteArrayToBase64(arr, 0, arr.length)); } f.add("boost", fieldable.getBoost()); f.add( "docFreq", t.text() == null ? 0 : reader.docFreq(t)); // this can be 0 for non-indexed fields // If we have a term vector, return that if (fieldable.isTermVectorStored()) { try { TermFreqVector v = reader.getTermFreqVector(docId, fieldable.name()); if (v != null) { SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>(); for (int i = 0; i < v.size(); i++) { tfv.add(v.getTerms()[i], v.getTermFrequencies()[i]); } f.add("termVector", tfv); } } catch (Exception ex) { log.warn("error writing term vector", ex); } } finfo.add(fieldable.name(), f); } return finfo; }
/** * Reconstruct document fields. * * @param docNum document number. If this document is deleted, but the index is not optimized yet, * the reconstruction process may still yield the reconstructed field content even from * deleted documents. * @return reconstructed document * @throws Exception */ public Reconstructed reconstruct(int docNum) throws Exception { if (docNum < 0 || docNum > reader.maxDoc()) { throw new Exception("Document number outside of valid range."); } Reconstructed res = new Reconstructed(); if (deleted != null && deleted.get(docNum)) { throw new Exception("Document is deleted."); } else { Document doc = reader.document(docNum); for (int i = 0; i < fieldNames.length; i++) { Field[] fs = doc.getFields(fieldNames[i]); if (fs != null && fs.length > 0) { res.getStoredFields().put(fieldNames[i], fs); } } } // collect values from unstored fields HashSet<String> fields = new HashSet<String>(Arrays.asList(fieldNames)); // try to use term vectors if available progress.maxValue = fieldNames.length; progress.curValue = 0; progress.minValue = 0; for (int i = 0; i < fieldNames.length; i++) { TermFreqVector tvf = reader.getTermFreqVector(docNum, fieldNames[i]); if (tvf != null && tvf.size() > 0 && (tvf instanceof TermPositionVector)) { TermPositionVector tpv = (TermPositionVector) tvf; progress.message = "Reading term vectors ..."; progress.curValue = i; setChanged(); notifyObservers(progress); BytesRef[] tv = tpv.getTerms(); for (int k = 0; k < tv.length; k++) { // do we have positions? int[] posArr = tpv.getTermPositions(k); if (posArr == null) { // only offsets TermVectorOffsetInfo[] offsets = tpv.getOffsets(k); if (offsets.length == 0) { continue; } // convert offsets into positions posArr = convertOffsets(offsets); } GrowableStringArray gsa = res.getReconstructedFields().get(fieldNames[i]); if (gsa == null) { gsa = new GrowableStringArray(); res.getReconstructedFields().put(fieldNames[i], gsa); } for (int m = 0; m < posArr.length; m++) { gsa.append(posArr[m], "|", tv[k].utf8ToString()); } } fields.remove(fieldNames[i]); // got what we wanted } } // this loop collects data only from left-over fields // not yet collected through term vectors progress.maxValue = fields.size(); progress.curValue = 0; progress.minValue = 0; for (String fld : fields) { progress.message = "Collecting terms in " + fld + " ..."; progress.curValue++; setChanged(); notifyObservers(progress); Terms terms = MultiFields.getTerms(reader, fld); if (terms == null) { // no terms in this field continue; } TermsEnum te = terms.iterator(); while (te.next() != null) { DocsAndPositionsEnum dpe = te.docsAndPositions(deleted, null); if (dpe == null) { // no position info for this field break; } int num = dpe.advance(docNum); if (num != docNum) { // either greater than or NO_MORE_DOCS continue; // no data for this term in this doc } String term = te.term().utf8ToString(); GrowableStringArray gsa = (GrowableStringArray) res.getReconstructedFields().get(fld); if (gsa == null) { gsa = new GrowableStringArray(); res.getReconstructedFields().put(fld, gsa); } for (int k = 0; k < dpe.freq(); k++) { int pos = dpe.nextPosition(); gsa.append(pos, "|", term); } } } progress.message = "Done."; progress.curValue = 100; setChanged(); notifyObservers(progress); return res; }