private FieldInfo addOrUpdateInternal(
        String name,
        int preferredFieldNumber,
        boolean storeTermVector,
        boolean omitNorms,
        boolean storePayloads,
        IndexOptions indexOptions,
        DocValuesType docValues) {
      if (docValues == null) {
        throw new NullPointerException("DocValuesType cannot be null");
      }
      FieldInfo fi = fieldInfo(name);
      if (fi == null) {
        // This field wasn't yet added to this in-RAM
        // segment's FieldInfo, so now we get a global
        // number for this field.  If the field was seen
        // before then we'll get the same name and number,
        // else we'll allocate a new one:
        final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, docValues);
        fi =
            new FieldInfo(
                name,
                fieldNumber,
                storeTermVector,
                omitNorms,
                storePayloads,
                indexOptions,
                docValues,
                -1,
                null);
        assert !byName.containsKey(fi.name);
        globalFieldNumbers.verifyConsistent(
            Integer.valueOf(fi.number), fi.name, fi.getDocValuesType());
        byName.put(fi.name, fi);
      } else {
        fi.update(storeTermVector, omitNorms, storePayloads, indexOptions);

        if (docValues != DocValuesType.NONE) {
          // Only pay the synchronization cost if fi does not already have a DVType
          boolean updateGlobal = fi.getDocValuesType() == DocValuesType.NONE;
          if (updateGlobal) {
            // Must also update docValuesType map so it's
            // aware of this field's DocValuesType.  This will throw IllegalArgumentException if
            // an illegal type change was attempted.
            globalFieldNumbers.setDocValuesType(fi.number, name, docValues);
          }

          fi.setDocValuesType(docValues); // this will also perform the consistency check.
        }
      }
      return fi;
    }
 @Override
 public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
   SortedSetDocValues dv = super.getSortedSetDocValues(field);
   FieldInfo fi = getFieldInfos().fieldInfo(field);
   if (dv != null) {
     assert fi != null;
     assert fi.getDocValuesType() == DocValuesType.SORTED_SET;
     return new AssertingSortedSetDocValues(dv, maxDoc());
   } else {
     assert fi == null || fi.getDocValuesType() != DocValuesType.SORTED_SET;
     return null;
   }
 }
 @Override
 public BinaryDocValues getBinaryDocValues(String field) throws IOException {
   BinaryDocValues dv = super.getBinaryDocValues(field);
   FieldInfo fi = getFieldInfos().fieldInfo(field);
   if (dv != null) {
     assert fi != null;
     assert fi.getDocValuesType() == DocValuesType.BINARY;
     return new AssertingBinaryDocValues(dv, maxDoc());
   } else {
     assert fi == null || fi.getDocValuesType() != DocValuesType.BINARY;
     return null;
   }
 }
 @Override
 public NumericDocValues getNumericDocValues(String field) throws IOException {
   NumericDocValues dv = super.getNumericDocValues(field);
   FieldInfo fi = getFieldInfos().fieldInfo(field);
   if (dv != null) {
     assert fi != null;
     assert fi.getDocValuesType() == DocValuesType.NUMERIC;
     return new AssertingNumericDocValues(dv, maxDoc());
   } else {
     assert fi == null || fi.getDocValuesType() != DocValuesType.NUMERIC;
     return null;
   }
 }
 @Override
 public Bits getDocsWithField(FieldInfo field) throws IOException {
   if (field.getDocValuesType() == DocValuesType.SORTED_SET) {
     return DocValues.docsWithValue(getSortedSet(field), maxDoc);
   } else {
     return new Bits.MatchAllBits(maxDoc);
   }
 }
  /** Constructs a new FieldInfos from an array of FieldInfo objects */
  public FieldInfos(FieldInfo[] infos) {
    boolean hasVectors = false;
    boolean hasProx = false;
    boolean hasPayloads = false;
    boolean hasOffsets = false;
    boolean hasFreq = false;
    boolean hasNorms = false;
    boolean hasDocValues = false;

    for (FieldInfo info : infos) {
      if (info.number < 0) {
        throw new IllegalArgumentException(
            "illegal field number: " + info.number + " for field " + info.name);
      }
      FieldInfo previous = byNumber.put(info.number, info);
      if (previous != null) {
        throw new IllegalArgumentException(
            "duplicate field numbers: "
                + previous.name
                + " and "
                + info.name
                + " have: "
                + info.number);
      }
      previous = byName.put(info.name, info);
      if (previous != null) {
        throw new IllegalArgumentException(
            "duplicate field names: "
                + previous.number
                + " and "
                + info.number
                + " have: "
                + info.name);
      }

      hasVectors |= info.hasVectors();
      hasProx |= info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
      hasFreq |= info.getIndexOptions() != IndexOptions.DOCS;
      hasOffsets |=
          info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
              >= 0;
      hasNorms |= info.hasNorms();
      hasDocValues |= info.getDocValuesType() != DocValuesType.NONE;
      hasPayloads |= info.hasPayloads();
    }

    this.hasVectors = hasVectors;
    this.hasProx = hasProx;
    this.hasPayloads = hasPayloads;
    this.hasOffsets = hasOffsets;
    this.hasFreq = hasFreq;
    this.hasNorms = hasNorms;
    this.hasDocValues = hasDocValues;
    this.values = Collections.unmodifiableCollection(byNumber.values());
  }
 public FieldInfo add(FieldInfo fi) {
   // IMPORTANT - reuse the field number if possible for consistent field numbers across segments
   return addOrUpdateInternal(
       fi.name,
       fi.number,
       fi.hasVectors(),
       fi.omitsNorms(),
       fi.hasPayloads(),
       fi.getIndexOptions(),
       fi.getDocValuesType());
 }
示例#8
0
    private FieldInfo addOrUpdateInternal(
        String name,
        int preferredFieldNumber,
        boolean isIndexed,
        boolean storeTermVector,
        boolean omitNorms,
        boolean storePayloads,
        IndexOptions indexOptions,
        DocValuesType docValues,
        DocValuesType normType) {
      FieldInfo fi = fieldInfo(name);
      if (fi == null) {
        // This field wasn't yet added to this in-RAM
        // segment's FieldInfo, so now we get a global
        // number for this field.  If the field was seen
        // before then we'll get the same name and number,
        // else we'll allocate a new one:
        final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, docValues);
        fi =
            new FieldInfo(
                name,
                isIndexed,
                fieldNumber,
                storeTermVector,
                omitNorms,
                storePayloads,
                indexOptions,
                docValues,
                normType,
                null);
        assert !byName.containsKey(fi.name);
        assert globalFieldNumbers.containsConsistent(
            Integer.valueOf(fi.number), fi.name, fi.getDocValuesType());
        byName.put(fi.name, fi);
      } else {
        fi.update(isIndexed, storeTermVector, omitNorms, storePayloads, indexOptions);

        if (docValues != null) {
          // only pay the synchronization cost if fi does not already have a DVType
          boolean updateGlobal = !fi.hasDocValues();
          fi.setDocValuesType(docValues); // this will also perform the consistency check.
          if (updateGlobal) {
            // must also update docValuesType map so it's
            // aware of this field's DocValueType
            globalFieldNumbers.setDocValuesType(fi.number, name, docValues);
          }
        }

        if (!fi.omitsNorms() && normType != null) {
          fi.setNormValueType(normType);
        }
      }
      return fi;
    }
  @Override
  public void write(
      Directory directory,
      SegmentInfo segmentInfo,
      String segmentSuffix,
      FieldInfos infos,
      IOContext context)
      throws IOException {
    final String fileName =
        IndexFileNames.segmentFileName(
            segmentInfo.name, segmentSuffix, Lucene46FieldInfosFormat.EXTENSION);
    try (IndexOutput output = directory.createOutput(fileName, context)) {
      CodecUtil.writeHeader(
          output, Lucene46FieldInfosFormat.CODEC_NAME, Lucene46FieldInfosFormat.FORMAT_CURRENT);
      output.writeVInt(infos.size());
      for (FieldInfo fi : infos) {
        IndexOptions indexOptions = fi.getIndexOptions();
        byte bits = 0x0;
        if (fi.hasVectors()) bits |= Lucene46FieldInfosFormat.STORE_TERMVECTOR;
        if (fi.omitsNorms()) bits |= Lucene46FieldInfosFormat.OMIT_NORMS;
        if (fi.hasPayloads()) bits |= Lucene46FieldInfosFormat.STORE_PAYLOADS;
        if (fi.getIndexOptions() != IndexOptions.NONE) {
          bits |= Lucene46FieldInfosFormat.IS_INDEXED;
          assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0
              || !fi.hasPayloads();
          if (indexOptions == IndexOptions.DOCS) {
            bits |= Lucene46FieldInfosFormat.OMIT_TERM_FREQ_AND_POSITIONS;
          } else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
            bits |= Lucene46FieldInfosFormat.STORE_OFFSETS_IN_POSTINGS;
          } else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
            bits |= Lucene46FieldInfosFormat.OMIT_POSITIONS;
          }
        }
        output.writeString(fi.name);
        output.writeVInt(fi.number);
        output.writeByte(bits);

        // pack the DV types in one byte
        final byte dv = docValuesByte(fi.getDocValuesType());
        final byte nrm = docValuesByte(fi.hasNorms() ? DocValuesType.NUMERIC : DocValuesType.NONE);
        assert (dv & (~0xF)) == 0 && (nrm & (~0x0F)) == 0;
        byte val = (byte) (0xff & ((nrm << 4) | dv));
        output.writeByte(val);
        output.writeLong(fi.getDocValuesGen());
        output.writeStringStringMap(fi.attributes());
      }
      CodecUtil.writeFooter(output);
    }
  }
示例#10
0
  /** Call this only once (if you subclass!) */
  protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix)
      throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
      throw new IllegalStateException(
          "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    // System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    final long startTime = System.nanoTime();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    final int maxDoc = reader.maxDoc();
    final int[] index =
        new int
            [maxDoc]; // immediate term numbers, or the index into the byte[] representing the last
    // number
    final int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    final byte[][] bytes =
        new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    final Terms terms = reader.terms(field);
    if (terms == null) {
      // No terms
      return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    // System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
      // No terms match
      return;
    }

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ; ) {
      final BytesRef t = te.term();
      if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
        break;
      }
      // System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

      visitTerm(te, termNum);

      if ((termNum & indexIntervalMask) == 0) {
        // Index this term
        sizeOfIndexedStrings += t.length;
        BytesRef indexedTerm = new BytesRef();
        indexedTermsBytes.copy(t, indexedTerm);
        // TODO: really should 1) strip off useless suffix,
        // and 2) use FST not array/PagedBytes
        indexedTerms.add(indexedTerm);
      }

      final int df = te.docFreq();
      if (df <= maxTermDocFreq) {

        postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

        // dF, but takes deletions into account
        int actualDF = 0;

        for (; ; ) {
          int doc = postingsEnum.nextDoc();
          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          }
          // System.out.println("  chunk=" + chunk + " docs");

          actualDF++;
          termInstances++;

          // System.out.println("    docID=" + doc);
          // add TNUM_OFFSET to the term number to make room for special reserved values:
          // 0 (end term) and 1 (index into byte array follows)
          int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
          lastTerm[doc] = termNum;
          int val = index[doc];

          if ((val & 0xff) == 1) {
            // index into byte array (actually the end of
            // the doc-specific byte[] when building)
            int pos = val >>> 8;
            int ilen = vIntSize(delta);
            byte[] arr = bytes[doc];
            int newend = pos + ilen;
            if (newend > arr.length) {
              // We avoid a doubling strategy to lower memory usage.
              // this faceting method isn't for docs with many terms.
              // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit
              // boundary.
              // TODO: figure out what array lengths we can round up to w/o actually using more
              // memory
              // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
              // It should be safe to round up to the nearest 32 bits in any case.
              int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
              byte[] newarr = new byte[newLen];
              System.arraycopy(arr, 0, newarr, 0, pos);
              arr = newarr;
              bytes[doc] = newarr;
            }
            pos = writeInt(delta, arr, pos);
            index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
          } else {
            // OK, this int has data in it... find the end (a zero starting byte - not
            // part of another number, hence not following a byte with the high bit set).
            int ipos;
            if (val == 0) {
              ipos = 0;
            } else if ((val & 0x0000ff80) == 0) {
              ipos = 1;
            } else if ((val & 0x00ff8000) == 0) {
              ipos = 2;
            } else if ((val & 0xff800000) == 0) {
              ipos = 3;
            } else {
              ipos = 4;
            }

            // System.out.println("      ipos=" + ipos);

            int endPos = writeInt(delta, tempArr, ipos);
            // System.out.println("      endpos=" + endPos);
            if (endPos <= 4) {
              // System.out.println("      fits!");
              // value will fit in the integer... move bytes back
              for (int j = ipos; j < endPos; j++) {
                val |= (tempArr[j] & 0xff) << (j << 3);
              }
              index[doc] = val;
            } else {
              // value won't fit... move integer into byte[]
              for (int j = 0; j < ipos; j++) {
                tempArr[j] = (byte) val;
                val >>>= 8;
              }
              // point at the end index in the byte[]
              index[doc] = (endPos << 8) | 1;
              bytes[doc] = tempArr;
              tempArr = new byte[12];
            }
          }
        }
        setActualDocFreq(termNum, actualDF);
      }

      termNum++;
      if (te.next() == null) {
        break;
      }
    }

    numTermsInField = termNum;

    long midPoint = System.nanoTime();

    if (termInstances == 0) {
      // we didn't invert anything
      // lower memory consumption.
      tnums = null;
    } else {

      this.index = index;

      //
      // transform intermediate form into the final form, building a single byte[]
      // at a time, and releasing the intermediate byte[]s as we go to avoid
      // increasing the memory footprint.
      //

      for (int pass = 0; pass < 256; pass++) {
        byte[] target = tnums[pass];
        int pos = 0; // end in target;
        if (target != null) {
          pos = target.length;
        } else {
          target = new byte[4096];
        }

        // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
        // where pp is the pass (which array we are building), and xx is all values.
        // each pass shares the same byte[] for termNumber lists.
        for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
          int lim = Math.min(docbase + (1 << 16), maxDoc);
          for (int doc = docbase; doc < lim; doc++) {
            // System.out.println("  pass="******" process docID=" + doc);
            int val = index[doc];
            if ((val & 0xff) == 1) {
              int len = val >>> 8;
              // System.out.println("    ptr pos=" + pos);
              index[doc] = (pos << 8) | 1; // change index to point to start of array
              if ((pos & 0xff000000) != 0) {
                // we only have 24 bits for the array index
                throw new IllegalStateException(
                    "Too many values for UnInvertedField faceting on field " + field);
              }
              byte[] arr = bytes[doc];
              /*
              for(byte b : arr) {
                //System.out.println("      b=" + Integer.toHexString((int) b));
              }
              */
              bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
              if (target.length <= pos + len) {
                int newlen = target.length;
                /**
                 * * we don't have to worry about the array getting too large since the "pos" param
                 * will overflow first (only 24 bits available) if ((newlen<<1) <= 0) { //
                 * overflow... newlen = Integer.MAX_VALUE; if (newlen <= pos + len) { throw new
                 * SolrException(400,"Too many terms to uninvert field!"); } } else { while (newlen
                 * <= pos + len) newlen<<=1; // doubling strategy } **
                 */
                while (newlen <= pos + len) newlen <<= 1; // doubling strategy
                byte[] newtarget = new byte[newlen];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
              }
              System.arraycopy(arr, 0, target, pos, len);
              pos += len + 1; // skip single byte at end and leave it 0 for terminator
            }
          }
        }

        // shrink array
        if (pos < target.length) {
          byte[] newtarget = new byte[pos];
          System.arraycopy(target, 0, newtarget, 0, pos);
          target = newtarget;
        }

        tnums[pass] = target;

        if ((pass << 16) > maxDoc) break;
      }
    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

    long endTime = System.nanoTime();

    total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
    phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
  }