Esempio n. 1
0
  /**
   * Copies every int field and every string field of {@code fdx} into {@code w}, term by term in
   * the order the reader's term iterators produce them, then closes {@code w}.
   *
   * <p>The term iterator and field writer opened for each field, and the reader's doc-ID stream,
   * are closed in {@code finally} blocks so they are released even when the copy fails part-way.
   * {@code w} itself is closed only when the whole copy completes without an exception.
   *
   * @param fdx reader to copy from
   * @param w writer to copy into; closed by this method on success
   * @throws IOException if reading from {@code fdx}, writing to {@code w}, or closing any of the
   *     involved resources fails
   */
  public static void writeFlamdex(final FlamdexReader fdx, final FlamdexWriter w)
      throws IOException {
    final int[] docIdBuf = new int[DOC_ID_BUFFER_SIZE];
    // A single doc-ID stream is re-pointed at each term in turn via reset().
    final DocIdStream dis = fdx.getDocIdStream();
    try {
      for (final String intField : fdx.getIntFields()) {
        final IntFieldWriter ifw = w.getIntFieldWriter(intField);
        try {
          final IntTermIterator iter = fdx.getIntTermIterator(intField);
          try {
            while (iter.next()) {
              ifw.nextTerm(iter.term());
              dis.reset(iter);
              // Drain the term's doc list; a fill that returns fewer IDs than the buffer holds
              // is treated as the end of the list.
              int n;
              do {
                n = dis.fillDocIdBuffer(docIdBuf);
                for (int i = 0; i < n; ++i) {
                  ifw.nextDoc(docIdBuf[i]);
                }
              } while (n >= docIdBuf.length);
            }
          } finally {
            iter.close();
          }
        } finally {
          ifw.close();
        }
      }

      for (final String stringField : fdx.getStringFields()) {
        final StringFieldWriter sfw = w.getStringFieldWriter(stringField);
        try {
          final StringTermIterator iter = fdx.getStringTermIterator(stringField);
          try {
            while (iter.next()) {
              sfw.nextTerm(iter.term());
              dis.reset(iter);
              // Same drain protocol as for int fields.
              int n;
              do {
                n = dis.fillDocIdBuffer(docIdBuf);
                for (int i = 0; i < n; ++i) {
                  sfw.nextDoc(docIdBuf[i]);
                }
              } while (n >= docIdBuf.length);
            }
          } finally {
            iter.close();
          }
        } finally {
          sfw.close();
        }
      }
    } finally {
      dis.close();
    }

    // Reached only when every field was copied successfully.
    w.close();
  }
Esempio n. 2
0
  /**
   * Writes a string field named {@code newFieldName} through a {@link SimpleFlamdexWriter} opened
   * on {@code indexDir}, where document {@code d} gets the single term {@code values[d]}.
   *
   * <p>Document IDs are sorted by (value, document ID). Each run of equal values is then emitted
   * as one term followed by its document IDs in ascending order. An index with zero documents
   * produces a field with no terms.
   *
   * @param indexDir directory handed to the {@link SimpleFlamdexWriter} constructor
   * @param newFieldName name of the string field to write
   * @param docReader reader consulted only for its document count
   * @param values one non-null value per document, indexed by document ID; its length must equal
   *     {@code docReader.getNumDocs()}
   * @throws IllegalArgumentException if {@code values.length} differs from the document count
   * @throws IOException if writing or closing the field fails
   */
  public static void addField(
      String indexDir, String newFieldName, FlamdexReader docReader, final String[] values)
      throws IOException {
    final int numDocs = docReader.getNumDocs();
    // Fail fast: a mismatch would otherwise surface later as an index-out-of-bounds error,
    // after the writer has already been opened.
    if (values.length != numDocs) {
      throw new IllegalArgumentException(
          "values.length ("
              + values.length
              + ") must equal the number of documents ("
              + numDocs
              + ")");
    }

    final int[] indices = new int[numDocs];
    for (int i = 0; i < indices.length; i++) {
      indices[i] = i;
    }
    log.debug("sorting");
    Quicksortables.sort(
        new Quicksortable() {
          @Override
          public void swap(int i, int j) {
            Quicksortables.swap(indices, i, j);
          }

          @Override
          public int compare(int i, int j) {
            // Order by value first, then by document ID so docs within a term are ascending.
            final int leftDoc = indices[i];
            final int rightDoc = indices[j];
            final int byValue = values[leftDoc].compareTo(values[rightDoc]);
            if (byValue != 0) {
              return byValue < 0 ? -1 : 1;
            }
            if (leftDoc < rightDoc) {
              return -1;
            }
            return leftDoc > rightDoc ? 1 : 0;
          }
        },
        indices.length);

    log.debug("writing field " + newFieldName);
    // NOTE(review): the meaning of the boolean flags below is defined by SimpleFlamdexWriter
    // and is not visible from this method; they are passed through unchanged.
    final SimpleFlamdexWriter w = new SimpleFlamdexWriter(indexDir, numDocs, false);
    try {
      final StringFieldWriter sfw = w.getStringFieldWriter(newFieldName, true);
      try {
        // indices[runStart..i) is the current run of documents sharing one value. A run is
        // flushed when the value changes or the end of the array is reached, so an empty
        // index writes nothing.
        int runStart = 0;
        for (int i = 1; i <= indices.length; ++i) {
          final String runValue = values[indices[runStart]];
          if (i == indices.length || !values[indices[i]].equals(runValue)) {
            sfw.nextTerm(runValue);
            for (int j = runStart; j < i; ++j) {
              sfw.nextDoc(indices[j]);
            }
            runStart = i;
          }
        }
      } finally {
        sfw.close();
      }
    } finally {
      w.close();
    }
  }
Esempio n. 3
0
  /**
   * Merges the int and string fields of several readers into a single writer.
   *
   * <p>Readers are treated as consecutive segments: document IDs coming from {@code readers[i]}
   * are shifted by the total document count of {@code readers[0..i-1]} before being written. For
   * each field, the per-reader term iterators are combined through a priority queue so that each
   * distinct term is written once, followed by the (shifted) documents of every reader that
   * contains it.
   *
   * <p>This method closes the term iterators and doc-ID streams it opens, and each field writer
   * it obtains, but it does not close {@code w}.
   *
   * <p>NOTE(review): all cleanup happens on the normal path only; if an exception is thrown
   * part-way, the open iterators, field writer and doc-ID streams are not closed here.
   *
   * @param readers segments to merge, in output order
   * @param w writer receiving the merged fields; left open for the caller
   * @throws IOException if reading a segment or writing the merged output fails
   */
  public static void merge(FlamdexReader[] readers, FlamdexWriter w) throws IOException {
    // One reusable doc-ID stream per reader, plus the doc-ID offset of each reader's segment.
    final DocIdStream[] docIdStreams = new DocIdStream[readers.length];
    final int[] segmentStartDocs = new int[readers.length];
    int totalNumDocs = 0;
    for (int i = 0; i < readers.length; ++i) {
      docIdStreams[i] = readers[i].getDocIdStream();
      segmentStartDocs[i] = totalNumDocs;
      totalNumDocs += readers[i].getNumDocs();
    }

    log.info("merging " + readers.length + " readers with a total of " + totalNumDocs + " docs");

    // indexBuf collects the reader indexes that contain the term currently being written.
    final int[] indexBuf = new int[readers.length];
    final int[] docIdBuf = new int[64];

    // presumably mergeIntFields returns the combined set of int field names across readers;
    // it is defined elsewhere in the file — verify there.
    for (final String intField : mergeIntFields(readers)) {
      final IntFieldWriter ifw = w.getIntFieldWriter(intField);

      // iterators[i] holds reader i's iterator for this field, or stays null when the reader
      // lacks the field or has no terms for it. The queue stores indexes into this array.
      final IntTermIteratorWrapper[] iterators = new IntTermIteratorWrapper[readers.length];
      final IndirectPriorityQueue<IntTermIteratorWrapper> pq =
          new ObjectHeapSemiIndirectPriorityQueue<IntTermIteratorWrapper>(
              iterators, iterators.length);
      for (int i = 0; i < readers.length; ++i) {
        if (!readers[i].getIntFields().contains(intField)) continue;
        final IntTermIterator it = readers[i].getIntTermIterator(intField);
        if (it.next()) {
          iterators[i] = new IntTermIteratorWrapper(it, i);
          pq.enqueue(i);
        } else {
          // No terms at all for this field in this reader.
          it.close();
        }
      }

      while (!pq.isEmpty()) {
        // NOTE(review): assumes the queue orders wrappers by current term, so pq.first() is the
        // smallest outstanding term — the ordering is defined by IntTermIteratorWrapper, not
        // visible here.
        final long term = iterators[pq.first()].it.term();
        int numIndexes = 0;
        IntTermIteratorWrapper wrap;
        // Pull every reader currently positioned on this term off the head of the queue.
        while (!pq.isEmpty() && (wrap = iterators[pq.first()]).it.term() == term) {
          final int index = wrap.index;
          // The stream is pointed at the term BEFORE the iterator is advanced (or closed); the
          // documents are read from it further below. Do not reorder these statements.
          docIdStreams[index].reset(wrap.it);
          indexBuf[numIndexes++] = index;
          if (wrap.it.next()) {
            // The head's term changed; tell the queue so it can restore its ordering.
            pq.changed();
          } else {
            // Iterator exhausted: close it and remove it from the queue.
            wrap.it.close();
            pq.dequeue();
          }
        }

        ifw.nextTerm(term);
        // Emit the documents of each contributing reader, in the order the readers were
        // collected above, shifted into the merged doc-ID space.
        for (int i = 0; i < numIndexes; ++i) {
          final int index = indexBuf[i];
          final int startDoc = segmentStartDocs[index];
          final DocIdStream dis = docIdStreams[index];
          while (true) {
            final int n = dis.fillDocIdBuffer(docIdBuf);

            for (int j = 0; j < n; ++j) {
              ifw.nextDoc(docIdBuf[j] + startDoc);
            }

            // A fill that returns fewer IDs than the buffer holds ends this term's doc list.
            if (n < docIdBuf.length) break;
          }
        }
      }

      ifw.close();
    }

    // String fields follow exactly the same procedure as int fields above, with terms compared
    // via equals() instead of ==.
    // presumably mergeStringFields returns the combined set of string field names — verify.
    for (final String stringField : mergeStringFields(readers)) {
      final StringFieldWriter sfw = w.getStringFieldWriter(stringField);

      final StringTermIteratorWrapper[] iterators = new StringTermIteratorWrapper[readers.length];
      final IndirectPriorityQueue<StringTermIteratorWrapper> pq =
          new ObjectHeapSemiIndirectPriorityQueue<StringTermIteratorWrapper>(
              iterators, iterators.length);
      for (int i = 0; i < readers.length; ++i) {
        if (!readers[i].getStringFields().contains(stringField)) continue;
        final StringTermIterator it = readers[i].getStringTermIterator(stringField);
        if (it.next()) {
          iterators[i] = new StringTermIteratorWrapper(it, i);
          pq.enqueue(i);
        } else {
          // No terms at all for this field in this reader.
          it.close();
        }
      }

      while (!pq.isEmpty()) {
        final String term = iterators[pq.first()].it.term();
        int numIndexes = 0;
        StringTermIteratorWrapper wrap;
        // Pull every reader currently positioned on this term off the head of the queue.
        while (!pq.isEmpty() && (wrap = iterators[pq.first()]).it.term().equals(term)) {
          final int index = wrap.index;
          // Reset the stream before advancing the iterator; see the int-field loop above.
          docIdStreams[index].reset(wrap.it);
          indexBuf[numIndexes++] = index;
          if (wrap.it.next()) {
            pq.changed();
          } else {
            wrap.it.close();
            pq.dequeue();
          }
        }

        sfw.nextTerm(term);
        for (int i = 0; i < numIndexes; ++i) {
          final int index = indexBuf[i];
          final int startDoc = segmentStartDocs[index];
          final DocIdStream dis = docIdStreams[index];
          while (true) {
            final int n = dis.fillDocIdBuffer(docIdBuf);

            for (int j = 0; j < n; ++j) {
              sfw.nextDoc(docIdBuf[j] + startDoc);
            }

            // A fill that returns fewer IDs than the buffer holds ends this term's doc list.
            if (n < docIdBuf.length) break;
          }
        }
      }

      sfw.close();
    }

    // Release the per-reader doc-ID streams; the writer w is intentionally left open.
    for (final DocIdStream dis : docIdStreams) {
      dis.close();
    }
  }