/**
 * Copies every int field and every string field of {@code fdx} into {@code w}, then closes
 * {@code w}.
 *
 * <p>Fields are copied one at a time: for each term produced by the field's term iterator the
 * term is written, followed by all of its doc ids, which are pulled from a single shared
 * {@link DocIdStream} in chunks of {@code DOC_ID_BUFFER_SIZE}.
 *
 * <p>The doc id stream, each term iterator and each field writer are released even when the
 * copy fails part-way. {@code w} itself is closed only after every field has been copied
 * successfully; on failure it is left open for the caller to deal with.
 *
 * @param fdx reader supplying the fields, terms and doc ids to copy
 * @param w writer receiving the copy; closed by this method on success
 * @throws IOException if reading from {@code fdx} or writing to {@code w} fails
 */
public static void writeFlamdex(final FlamdexReader fdx, final FlamdexWriter w) throws IOException {
    final DocIdStream dis = fdx.getDocIdStream();
    try {
        final int[] docIdBuf = new int[DOC_ID_BUFFER_SIZE];

        for (final String intField : fdx.getIntFields()) {
            final IntFieldWriter ifw = w.getIntFieldWriter(intField);
            try {
                final IntTermIterator iter = fdx.getIntTermIterator(intField);
                try {
                    while (iter.next()) {
                        ifw.nextTerm(iter.term());
                        dis.reset(iter);
                        // A short (non-full) buffer marks the end of this term's doc list.
                        int n;
                        do {
                            n = dis.fillDocIdBuffer(docIdBuf);
                            for (int i = 0; i < n; ++i) {
                                ifw.nextDoc(docIdBuf[i]);
                            }
                        } while (n >= docIdBuf.length);
                    }
                } finally {
                    iter.close();
                }
            } finally {
                ifw.close();
            }
        }

        for (final String stringField : fdx.getStringFields()) {
            final StringFieldWriter sfw = w.getStringFieldWriter(stringField);
            try {
                final StringTermIterator iter = fdx.getStringTermIterator(stringField);
                try {
                    while (iter.next()) {
                        sfw.nextTerm(iter.term());
                        dis.reset(iter);
                        // A short (non-full) buffer marks the end of this term's doc list.
                        int n;
                        do {
                            n = dis.fillDocIdBuffer(docIdBuf);
                            for (int i = 0; i < n; ++i) {
                                sfw.nextDoc(docIdBuf[i]);
                            }
                        } while (n >= docIdBuf.length);
                    }
                } finally {
                    iter.close();
                }
            } finally {
                sfw.close();
            }
        }
    } finally {
        dis.close();
    }
    // Reached only when every field was copied without error.
    w.close();
}
public static void addField( String indexDir, String newFieldName, FlamdexReader docReader, final String[] values) throws IOException { final int[] indices = new int[docReader.getNumDocs()]; for (int i = 0; i < indices.length; i++) { indices[i] = i; } log.debug("sorting"); Quicksortables.sort( new Quicksortable() { @Override public void swap(int i, int j) { Quicksortables.swap(indices, i, j); } @Override public int compare(int i, int j) { // Sorting logic: Primarily by value (String), secondarily by document ID (indices[i]) final String left = values[indices[i]]; final String right = values[indices[j]]; if (left.compareTo(right) < 0) { return -1; } else if (left.compareTo(right) > 0) { return 1; } else { // left == right if (indices[i] < indices[j]) { return -1; } else if (indices[i] > indices[j]) { return 1; } else { return 0; // Both value & doc ID match } } } }, values.length); log.debug("writing field " + newFieldName); final SimpleFlamdexWriter w = new SimpleFlamdexWriter(indexDir, docReader.getNumDocs(), false); final StringFieldWriter sfw = w.getStringFieldWriter(newFieldName, true); final IntArrayList docList = new IntArrayList(); docList.add(indices[0]); for (int i = 1; i < indices.length; ++i) { final String prev = values[indices[i - 1]]; final String cur = values[indices[i]]; if (cur.compareTo(prev) != 0) { sfw.nextTerm(prev); for (int j = 0; j < docList.size(); ++j) { sfw.nextDoc(docList.getInt(j)); } docList.clear(); } docList.add(indices[i]); } if (docList.size() > 0) { sfw.nextTerm(values[indices[indices.length - 1]]); for (int j = 0; j < docList.size(); ++j) { sfw.nextDoc(docList.getInt(j)); } } sfw.close(); w.close(); }
/**
 * Merges the int and string fields of several readers into {@code w}, concatenating their
 * document id spaces.
 *
 * <p>Reader {@code i} is assigned a start offset equal to the total number of documents in the
 * readers before it; every doc id read from reader {@code i} is written as
 * {@code docId + offset}. For each field, the readers' term iterators are combined through a
 * priority queue: all iterators positioned on the current smallest term are collected, the
 * term is written once, and then each contributing reader's doc ids are written in the order
 * the readers were taken from the queue.
 * NOTE(review): this relies on the iterator wrappers' natural ordering sorting by term (and
 * presumably by reader index on ties) -- confirm against the wrapper classes.
 *
 * <p>Doc id streams, term iterators and field writers opened here are released even when the
 * merge fails part-way. This method does not close {@code w} or any of the readers.
 *
 * @param readers readers to merge, in the order their documents should appear in the output
 * @param w writer receiving the merged fields; not closed by this method
 * @throws IOException if reading from a reader or writing to {@code w} fails
 */
public static void merge(FlamdexReader[] readers, FlamdexWriter w) throws IOException {
    final DocIdStream[] docIdStreams = new DocIdStream[readers.length];
    try {
        final int[] segmentStartDocs = new int[readers.length];
        int totalNumDocs = 0;
        for (int i = 0; i < readers.length; ++i) {
            docIdStreams[i] = readers[i].getDocIdStream();
            segmentStartDocs[i] = totalNumDocs;
            totalNumDocs += readers[i].getNumDocs();
        }

        log.info("merging " + readers.length + " readers with a total of " + totalNumDocs + " docs");

        // indexBuf holds the reader indexes that contain the term currently being written.
        final int[] indexBuf = new int[readers.length];
        final int[] docIdBuf = new int[64];

        for (final String intField : mergeIntFields(readers)) {
            final IntFieldWriter ifw = w.getIntFieldWriter(intField);
            // A non-null slot is an iterator that is still open and owned by this loop.
            final IntTermIteratorWrapper[] iterators = new IntTermIteratorWrapper[readers.length];
            try {
                final IndirectPriorityQueue<IntTermIteratorWrapper> pq =
                        new ObjectHeapSemiIndirectPriorityQueue<IntTermIteratorWrapper>(
                                iterators, iterators.length);
                for (int i = 0; i < readers.length; ++i) {
                    if (!readers[i].getIntFields().contains(intField)) {
                        continue;
                    }
                    final IntTermIterator it = readers[i].getIntTermIterator(intField);
                    boolean queued = false;
                    try {
                        if (it.next()) {
                            iterators[i] = new IntTermIteratorWrapper(it, i);
                            pq.enqueue(i);
                            queued = true;
                        }
                    } finally {
                        // Covers both an empty iterator and a failure while priming it.
                        if (!queued) {
                            it.close();
                        }
                    }
                }

                while (!pq.isEmpty()) {
                    final long term = iterators[pq.first()].it.term();
                    int numIndexes = 0;
                    IntTermIteratorWrapper wrap;
                    while (!pq.isEmpty() && (wrap = iterators[pq.first()]).it.term() == term) {
                        final int index = wrap.index;
                        // Position the reader's doc id stream on this term before the
                        // iterator moves on to its next term.
                        docIdStreams[index].reset(wrap.it);
                        indexBuf[numIndexes++] = index;
                        if (wrap.it.next()) {
                            pq.changed();
                        } else {
                            // Exhausted: remove it from the queue and from the cleanup set
                            // first, so it is never closed twice.
                            pq.dequeue();
                            iterators[index] = null;
                            wrap.it.close();
                        }
                    }

                    ifw.nextTerm(term);
                    for (int i = 0; i < numIndexes; ++i) {
                        final int index = indexBuf[i];
                        final int startDoc = segmentStartDocs[index];
                        final DocIdStream dis = docIdStreams[index];
                        // A short (non-full) buffer marks the end of this term's doc list.
                        int n;
                        do {
                            n = dis.fillDocIdBuffer(docIdBuf);
                            for (int j = 0; j < n; ++j) {
                                ifw.nextDoc(docIdBuf[j] + startDoc);
                            }
                        } while (n >= docIdBuf.length);
                    }
                }
            } finally {
                try {
                    // Only non-empty after a failure; on success every slot was cleared above.
                    for (final IntTermIteratorWrapper open : iterators) {
                        if (open != null) {
                            open.it.close();
                        }
                    }
                } finally {
                    ifw.close();
                }
            }
        }

        for (final String stringField : mergeStringFields(readers)) {
            final StringFieldWriter sfw = w.getStringFieldWriter(stringField);
            // A non-null slot is an iterator that is still open and owned by this loop.
            final StringTermIteratorWrapper[] iterators = new StringTermIteratorWrapper[readers.length];
            try {
                final IndirectPriorityQueue<StringTermIteratorWrapper> pq =
                        new ObjectHeapSemiIndirectPriorityQueue<StringTermIteratorWrapper>(
                                iterators, iterators.length);
                for (int i = 0; i < readers.length; ++i) {
                    if (!readers[i].getStringFields().contains(stringField)) {
                        continue;
                    }
                    final StringTermIterator it = readers[i].getStringTermIterator(stringField);
                    boolean queued = false;
                    try {
                        if (it.next()) {
                            iterators[i] = new StringTermIteratorWrapper(it, i);
                            pq.enqueue(i);
                            queued = true;
                        }
                    } finally {
                        // Covers both an empty iterator and a failure while priming it.
                        if (!queued) {
                            it.close();
                        }
                    }
                }

                while (!pq.isEmpty()) {
                    final String term = iterators[pq.first()].it.term();
                    int numIndexes = 0;
                    StringTermIteratorWrapper wrap;
                    while (!pq.isEmpty() && (wrap = iterators[pq.first()]).it.term().equals(term)) {
                        final int index = wrap.index;
                        // Position the reader's doc id stream on this term before the
                        // iterator moves on to its next term.
                        docIdStreams[index].reset(wrap.it);
                        indexBuf[numIndexes++] = index;
                        if (wrap.it.next()) {
                            pq.changed();
                        } else {
                            // Exhausted: remove it from the queue and from the cleanup set
                            // first, so it is never closed twice.
                            pq.dequeue();
                            iterators[index] = null;
                            wrap.it.close();
                        }
                    }

                    sfw.nextTerm(term);
                    for (int i = 0; i < numIndexes; ++i) {
                        final int index = indexBuf[i];
                        final int startDoc = segmentStartDocs[index];
                        final DocIdStream dis = docIdStreams[index];
                        // A short (non-full) buffer marks the end of this term's doc list.
                        int n;
                        do {
                            n = dis.fillDocIdBuffer(docIdBuf);
                            for (int j = 0; j < n; ++j) {
                                sfw.nextDoc(docIdBuf[j] + startDoc);
                            }
                        } while (n >= docIdBuf.length);
                    }
                }
            } finally {
                try {
                    // Only non-empty after a failure; on success every slot was cleared above.
                    for (final StringTermIteratorWrapper open : iterators) {
                        if (open != null) {
                            open.it.close();
                        }
                    }
                } finally {
                    sfw.close();
                }
            }
        }
    } finally {
        // Slots are null for readers whose stream was never opened because setup failed.
        for (final DocIdStream dis : docIdStreams) {
            if (dis != null) {
                dis.close();
            }
        }
    }
}