SepDocsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits liveDocs)
        throws IOException {
      this.liveDocs = liveDocs;
      this.indexOptions = fieldInfo.getIndexOptions();
      omitTF = indexOptions == IndexOptions.DOCS_ONLY;
      storePayloads = fieldInfo.hasPayloads();

      // TODO: can't we only do this if consumer
      // skipped consuming the previous docs?
      docIndex.copyFrom(termState.docIndex);
      docIndex.seek(docReader);

      if (!omitTF) {
        freqIndex.copyFrom(termState.freqIndex);
        freqIndex.seek(freqReader);
      }

      docFreq = termState.docFreq;
      // NOTE: unused if docFreq < skipMinimum:
      skipFP = termState.skipFP;
      count = 0;
      doc = -1;
      accum = 0;
      freq = 1;
      skipped = false;

      return this;
    }
 private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
   for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) {
     FieldInfo info = infos.fieldInfo(fieldNumber);
     if (info == null) {
       throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
     } else if (!info.hasNorms()) {
       throw new CorruptIndexException("Invalid field: " + info.name, meta);
     }
     NormsEntry entry = new NormsEntry();
     entry.docsWithFieldOffset = meta.readLong();
     entry.numDocsWithField = meta.readInt();
     entry.bytesPerNorm = meta.readByte();
     switch (entry.bytesPerNorm) {
       case 0:
       case 1:
       case 2:
       case 4:
       case 8:
         break;
       default:
         throw new CorruptIndexException(
             "Invalid bytesPerValue: " + entry.bytesPerNorm + ", field: " + info.name, meta);
     }
     entry.normsOffset = meta.readLong();
     norms.put(info.number, entry);
   }
 }
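For orientation, readFields consumes one fixed-order record per field, terminated by a -1 field number. A minimal sketch of the writer side this loop implies (hypothetical helper, not the actual Lucene norms writer):

  static void writeNormsEntry(IndexOutput meta, int fieldNumber, long docsWithFieldOffset,
      int numDocsWithField, byte bytesPerNorm, long normsOffset) throws IOException {
    meta.writeInt(fieldNumber); // consumed by readInt() above; write -1 after the last field
    meta.writeLong(docsWithFieldOffset);
    meta.writeInt(numDocsWithField);
    meta.writeByte(bytesPerNorm); // must be 0, 1, 2, 4, or 8
    meta.writeLong(normsOffset);
  }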
 public void testDocValuesUnstored() throws IOException {
   Directory dir = newDirectory();
   IndexWriterConfig iwconfig =
       newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
   iwconfig.setMergePolicy(newLogMergePolicy());
   IndexWriter writer = new IndexWriter(dir, iwconfig);
   for (int i = 0; i < 50; i++) {
     Document doc = new Document();
     doc.add(new NumericDocValuesField("dv", i));
     doc.add(new TextField("docId", "" + i, Field.Store.YES));
     writer.addDocument(doc);
   }
   DirectoryReader r = writer.getReader();
   SlowCompositeReaderWrapper slow = new SlowCompositeReaderWrapper(r);
   FieldInfos fi = slow.getFieldInfos();
   FieldInfo dvInfo = fi.fieldInfo("dv");
   assertTrue(dvInfo.hasDocValues());
   NumericDocValues dv = slow.getNumericDocValues("dv");
   for (int i = 0; i < 50; i++) {
     assertEquals(i, dv.get(i));
     StoredDocument d = slow.document(i);
     // cannot use d.get("dv") due to another bug!
     assertNull(d.getField("dv"));
     assertEquals(Integer.toString(i), d.get("docId"));
   }
   slow.close();
   writer.close();
   dir.close();
 }
Example 4
 // Currently, this instance is re-used across fields, so
 // our parent calls setField whenever the field changes
 @Override
 public void setField(FieldInfo fieldInfo) {
   this.indexOptions = fieldInfo.getIndexOptions();
   if (DEBUG) System.out.println("PW field=" + fieldInfo.name + " indexOptions=" + indexOptions);
   storePayloads = fieldInfo.hasPayloads();
   wrappedPostingsWriter.setField(fieldInfo);
   // DEBUG = BlockTreeTermsWriter.DEBUG;
 }
 public FieldInfo add(FieldInfo fi) {
   // IMPORTANT - reuse the field number if possible for consistent field numbers across segments
   return addOrUpdateInternal(
       fi.name,
       fi.number,
       fi.hasVectors(),
       fi.omitsNorms(),
       fi.hasPayloads(),
       fi.getIndexOptions(),
       fi.getDocValuesType());
 }
    private FieldInfo addOrUpdateInternal(
        String name,
        int preferredFieldNumber,
        boolean storeTermVector,
        boolean omitNorms,
        boolean storePayloads,
        IndexOptions indexOptions,
        DocValuesType docValues) {
      if (docValues == null) {
        throw new NullPointerException("DocValuesType cannot be null");
      }
      FieldInfo fi = fieldInfo(name);
      if (fi == null) {
        // This field wasn't yet added to this in-RAM
        // segment's FieldInfo, so now we get a global
        // number for this field.  If the field was seen
        // before then we'll get the same name and number,
        // else we'll allocate a new one:
        final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, docValues);
        fi =
            new FieldInfo(
                name,
                fieldNumber,
                storeTermVector,
                omitNorms,
                storePayloads,
                indexOptions,
                docValues,
                -1,
                null);
        assert !byName.containsKey(fi.name);
        globalFieldNumbers.verifyConsistent(
            Integer.valueOf(fi.number), fi.name, fi.getDocValuesType());
        byName.put(fi.name, fi);
      } else {
        fi.update(storeTermVector, omitNorms, storePayloads, indexOptions);

        if (docValues != DocValuesType.NONE) {
          // Only pay the synchronization cost if fi does not already have a DVType
          boolean updateGlobal = fi.getDocValuesType() == DocValuesType.NONE;
          if (updateGlobal) {
            // Must also update docValuesType map so it's
            // aware of this field's DocValuesType.  This will throw IllegalArgumentException if
            // an illegal type change was attempted.
            globalFieldNumbers.setDocValuesType(fi.number, name, docValues);
          }

          fi.setDocValuesType(docValues); // this will also perform the consistency check.
        }
      }
      return fi;
    }
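To make the global-numbering contract concrete, a hypothetical sequence (field name invented):

  // segment _0 sees "title" first: globalFieldNumbers.addOrGet("title", -1, dvType) -> 0
  // segment _1 calls add(fi) for "title" with preferredFieldNumber = fi.number = 0:
  //   addOrGet returns the same global number 0, so "title" is number 0 in every segment;
  //   a field never seen before would be allocated a fresh number instead.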
Example 7
 @Override
 public NumericDocValues getNormValues(String field) throws IOException {
   NumericDocValues dv = super.getNormValues(field);
   FieldInfo fi = getFieldInfos().fieldInfo(field);
   if (dv != null) {
     assert fi != null;
     assert fi.hasNorms();
     return new AssertingNumericDocValues(dv, maxDoc());
   } else {
     assert fi == null || fi.hasNorms() == false;
     return null;
   }
 }
Example 8
 @Override
 public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
   SortedSetDocValues dv = super.getSortedSetDocValues(field);
   FieldInfo fi = getFieldInfos().fieldInfo(field);
   if (dv != null) {
     assert fi != null;
     assert fi.getDocValuesType() == DocValuesType.SORTED_SET;
     return new AssertingSortedSetDocValues(dv, maxDoc());
   } else {
     assert fi == null || fi.getDocValuesType() != DocValuesType.SORTED_SET;
     return null;
   }
 }
Example 9
 @Override
 public BinaryDocValues getBinaryDocValues(String field) throws IOException {
   BinaryDocValues dv = super.getBinaryDocValues(field);
   FieldInfo fi = getFieldInfos().fieldInfo(field);
   if (dv != null) {
     assert fi != null;
     assert fi.getDocValuesType() == DocValuesType.BINARY;
     return new AssertingBinaryDocValues(dv, maxDoc());
   } else {
     assert fi == null || fi.getDocValuesType() != DocValuesType.BINARY;
     return null;
   }
 }
Example 10
 @Override
 public NumericDocValues getNumericDocValues(String field) throws IOException {
   NumericDocValues dv = super.getNumericDocValues(field);
   FieldInfo fi = getFieldInfos().fieldInfo(field);
   if (dv != null) {
     assert fi != null;
     assert fi.getDocValuesType() == DocValuesType.NUMERIC;
     return new AssertingNumericDocValues(dv, maxDoc());
   } else {
     assert fi == null || fi.getDocValuesType() != DocValuesType.NUMERIC;
     return null;
   }
 }
Example 11
  @Override
  public void writeField(FieldInfo fieldInfo, PointsReader reader) throws IOException {

    PointValues values = reader.getValues(fieldInfo.name);
    boolean singleValuePerDoc = values.size() == values.getDocCount();

    try (BKDWriter writer =
        new BKDWriter(
            writeState.segmentInfo.maxDoc(),
            writeState.directory,
            writeState.segmentInfo.name,
            fieldInfo.getPointDimensionCount(),
            fieldInfo.getPointNumBytes(),
            maxPointsInLeafNode,
            maxMBSortInHeap,
            values.size(),
            singleValuePerDoc)) {

      if (values instanceof MutablePointValues) {
        final long fp = writer.writeField(dataOut, fieldInfo.name, (MutablePointValues) values);
        if (fp != -1) {
          indexFPs.put(fieldInfo.name, fp);
        }
        return;
      }

      values.intersect(
          new IntersectVisitor() {
            @Override
            public void visit(int docID) {
              throw new IllegalStateException();
            }

            @Override
            public void visit(int docID, byte[] packedValue) throws IOException {
              writer.add(packedValue, docID);
            }

            @Override
            public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
              return Relation.CELL_CROSSES_QUERY;
            }
          });

      // We could have 0 points on merge since all docs with dimensional fields may be deleted:
      if (writer.getPointCount() > 0) {
        indexFPs.put(fieldInfo.name, writer.finish(dataOut));
      }
    }
  }
    SepDocsAndPositionsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits liveDocs)
        throws IOException {
      this.liveDocs = liveDocs;
      storePayloads = fieldInfo.hasPayloads();
      // System.out.println("Sep D&P init");

      // TODO: can't we only do this if consumer
      // skipped consuming the previous docs?
      docIndex.copyFrom(termState.docIndex);
      docIndex.seek(docReader);
      // System.out.println("  docIndex=" + docIndex);

      freqIndex.copyFrom(termState.freqIndex);
      freqIndex.seek(freqReader);
      // System.out.println("  freqIndex=" + freqIndex);

      posIndex.copyFrom(termState.posIndex);
      // System.out.println("  posIndex=" + posIndex);
      posSeekPending = true;
      payloadPending = false;

      payloadFP = termState.payloadFP;
      skipFP = termState.skipFP;
      // System.out.println("  skipFP=" + skipFP);

      docFreq = termState.docFreq;
      count = 0;
      doc = -1;
      accum = 0;
      pendingPosCount = 0;
      pendingPayloadBytes = 0;
      skipped = false;

      return this;
    }
  @Override
  public DocsAndPositionsEnum docsAndPositions(
      FieldInfo fieldInfo,
      BlockTermState _termState,
      Bits liveDocs,
      DocsAndPositionsEnum reuse,
      int flags)
      throws IOException {

    assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    final SepTermState termState = (SepTermState) _termState;
    SepDocsAndPositionsEnum postingsEnum;
    if (reuse == null || !(reuse instanceof SepDocsAndPositionsEnum)) {
      postingsEnum = new SepDocsAndPositionsEnum();
    } else {
      postingsEnum = (SepDocsAndPositionsEnum) reuse;
      if (postingsEnum.startDocIn != docIn) {
        // If you are using ParallelReader, and pass in a
        // reused DocsAndPositionsEnum, it could have come
        // from another reader also using sep codec
        postingsEnum = new SepDocsAndPositionsEnum();
      }
    }

    return postingsEnum.init(fieldInfo, termState, liveDocs);
  }
Example 14
 @Override
 public boolean hasOffsets() {
   return fieldInfo
           .getIndexOptions()
           .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
       >= 0;
 }
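hasOffsets leans on the declaration order of the IndexOptions enum: each constant indexes strictly more detail than the one before it, so compareTo(...) >= 0 reads as "indexes at least this much". The same idiom for positions (sketch; the helper name is hypothetical):

  // Declaration order, roughly (the first constant is NONE or DOCS_ONLY depending on version):
  //   DOCS < DOCS_AND_FREQS < DOCS_AND_FREQS_AND_POSITIONS
  //        < DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
  static boolean indexesPositions(FieldInfo fi) {
    return fi.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
  }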
 @Override
 public Bits getDocsWithField(FieldInfo field) throws IOException {
   if (field.getDocValuesType() == DocValuesType.SORTED_SET) {
     return DocValues.docsWithValue(getSortedSet(field), maxDoc);
   } else {
     return new Bits.MatchAllBits(maxDoc);
   }
 }
Example 16
 InsaneReader(LeafReader in, String insaneField) {
   super(in);
   this.insaneField = insaneField;
   ArrayList<FieldInfo> filteredInfos = new ArrayList<>();
   for (FieldInfo fi : in.getFieldInfos()) {
     if (fi.name.equals(insaneField)) {
       filteredInfos.add(
           new FieldInfo(
               fi.name,
               fi.number,
               fi.hasVectors(),
               fi.omitsNorms(),
               fi.hasPayloads(),
               fi.getIndexOptions(),
               DocValuesType.NONE,
               -1,
               Collections.emptyMap(),
               fi.getPointDimensionCount(),
               fi.getPointNumBytes()));
     } else {
       filteredInfos.add(fi);
     }
   }
   fieldInfos = new FieldInfos(filteredInfos.toArray(new FieldInfo[filteredInfos.size()]));
 }
Example 17
  public FSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader)
      throws IOException {
    final String termsFileName =
        IndexFileNames.segmentFileName(
            state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_EXTENSION);

    this.postingsReader = postingsReader;
    final IndexInput in = state.directory.openInput(termsFileName, state.context);

    boolean success = false;
    try {
      version = readHeader(in);
      if (version >= FSTTermsWriter.TERMS_VERSION_CHECKSUM) {
        CodecUtil.checksumEntireFile(in);
      }
      this.postingsReader.init(in);
      seekDir(in);

      final FieldInfos fieldInfos = state.fieldInfos;
      final int numFields = in.readVInt();
      for (int i = 0; i < numFields; i++) {
        int fieldNumber = in.readVInt();
        FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
        long numTerms = in.readVLong();
        long sumTotalTermFreq =
            fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
        long sumDocFreq = in.readVLong();
        int docCount = in.readVInt();
        int longsSize = in.readVInt();
        TermsReader current =
            new TermsReader(
                fieldInfo, in, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize);
        TermsReader previous = fields.put(fieldInfo.name, current);
        checkFieldSummary(state.segmentInfo, in, current, previous);
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(in);
      } else {
        IOUtils.closeWhileHandlingException(in);
      }
    }
  }
  void createDocumentNode(final DocumentDescriptor inDescriptor) throws IOException {
    try {
      _document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
      _rootNode = _document.createElement("document");
    } catch (ParserConfigurationException e) {
      e.printStackTrace();
      System.exit(1);
    }
    AtomicReader segmentReader = _indexReader.leaves().get(inDescriptor.segmentNumber).reader();
    _rootNode.setAttribute("id", DocumentIdOperations.documentDescriptorToId(inDescriptor));
    // TODO: implement the proper way of building a title from the production report
    _rootNode.setAttribute("title", buildDocumentTitle(segmentReader, inDescriptor));
    _rootNode.setAttribute("path", "ruscorpora.ru");
    _rootNode.setAttribute(
        "tagging", Attributes.getDocAttribute(segmentReader, inDescriptor.localId, "tagging"));
    _rootNode.setAttribute("snippets", "0");
    Element attributesNode = _document.createElement("attributes");

    _rootNode.appendChild(attributesNode);

    FieldInfos fields = segmentReader.getFieldInfos();
    for (int fieldIndex = 0; fieldIndex != fields.size(); ++fieldIndex) {
      FieldInfo field = fields.fieldInfo(fieldIndex);
      // TODO: understand why field may turn into null
      if (field == null) {
        continue;
      }
      String name = field.name;
      if (Attributes.ATTRIBUTES.contains(name)
          || Attributes.ATTRIBUTES_FOR_REPORT.contains(name)
          || Attributes.ATTRIBUTES_FOR_WORD_INFO.contains(name)
          || !field.hasDocValues()) {
        // it's a word attribute
        continue;
      }
      Element attrNode = _document.createElement("attr");
      attrNode.setAttribute("name", name);
      attrNode.setAttribute(
          "value", Attributes.getDocAttribute(segmentReader, inDescriptor.localId, name));
      attributesNode.appendChild(attrNode);
    }
  }
 @Override
 public void finishDoc() throws IOException {
   assert state == PostingsConsumerState.START;
   state = PostingsConsumerState.INITIAL;
   if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
     assert positionCount == 0; // we should not have fed any positions!
   } else {
     assert positionCount == freq;
   }
   in.finishDoc();
 }
  /** Constructs a new FieldInfos from an array of FieldInfo objects */
  public FieldInfos(FieldInfo[] infos) {
    boolean hasVectors = false;
    boolean hasProx = false;
    boolean hasPayloads = false;
    boolean hasOffsets = false;
    boolean hasFreq = false;
    boolean hasNorms = false;
    boolean hasDocValues = false;

    for (FieldInfo info : infos) {
      if (info.number < 0) {
        throw new IllegalArgumentException(
            "illegal field number: " + info.number + " for field " + info.name);
      }
      FieldInfo previous = byNumber.put(info.number, info);
      if (previous != null) {
        throw new IllegalArgumentException(
            "duplicate field numbers: "
                + previous.name
                + " and "
                + info.name
                + " have: "
                + info.number);
      }
      previous = byName.put(info.name, info);
      if (previous != null) {
        throw new IllegalArgumentException(
            "duplicate field names: "
                + previous.number
                + " and "
                + info.number
                + " have: "
                + info.name);
      }

      hasVectors |= info.hasVectors();
      hasProx |= info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
      hasFreq |= info.getIndexOptions() != IndexOptions.DOCS;
      hasOffsets |=
          info.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
              >= 0;
      hasNorms |= info.hasNorms();
      hasDocValues |= info.getDocValuesType() != DocValuesType.NONE;
      hasPayloads |= info.hasPayloads();
    }

    this.hasVectors = hasVectors;
    this.hasProx = hasProx;
    this.hasPayloads = hasPayloads;
    this.hasOffsets = hasOffsets;
    this.hasFreq = hasFreq;
    this.hasNorms = hasNorms;
    this.hasDocValues = hasDocValues;
    this.values = Collections.unmodifiableCollection(byNumber.values());
  }
Example 21
 @BeforeClass
 public static void beforeClass() throws Exception {
   testDoc = new Document();
   fieldInfos = new FieldInfos.Builder();
   DocHelper.setupDoc(testDoc);
   for (IndexableField field : testDoc.getFields()) {
     FieldInfo fieldInfo = fieldInfos.getOrAdd(field.name());
     IndexableFieldType ift = field.fieldType();
     fieldInfo.setIndexOptions(ift.indexOptions());
     if (ift.omitNorms()) {
       fieldInfo.setOmitsNorms();
     }
     fieldInfo.setDocValuesType(ift.docValuesType());
   }
   dir = newDirectory();
   IndexWriterConfig conf =
       newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy());
   conf.getMergePolicy().setNoCFSRatio(0.0);
   IndexWriter writer = new IndexWriter(dir, conf);
   writer.addDocument(testDoc);
   writer.close();
 }
Example 22
    private FieldInfo addOrUpdateInternal(
        String name,
        int preferredFieldNumber,
        boolean isIndexed,
        boolean storeTermVector,
        boolean omitNorms,
        boolean storePayloads,
        IndexOptions indexOptions,
        DocValuesType docValues,
        DocValuesType normType) {
      FieldInfo fi = fieldInfo(name);
      if (fi == null) {
        // This field wasn't yet added to this in-RAM
        // segment's FieldInfo, so now we get a global
        // number for this field.  If the field was seen
        // before then we'll get the same name and number,
        // else we'll allocate a new one:
        final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, docValues);
        fi =
            new FieldInfo(
                name,
                isIndexed,
                fieldNumber,
                storeTermVector,
                omitNorms,
                storePayloads,
                indexOptions,
                docValues,
                normType,
                null);
        assert !byName.containsKey(fi.name);
        assert globalFieldNumbers.containsConsistent(
            Integer.valueOf(fi.number), fi.name, fi.getDocValuesType());
        byName.put(fi.name, fi);
      } else {
        fi.update(isIndexed, storeTermVector, omitNorms, storePayloads, indexOptions);

        if (docValues != null) {
          // only pay the synchronization cost if fi does not already have a DVType
          boolean updateGlobal = !fi.hasDocValues();
          fi.setDocValuesType(docValues); // this will also perform the consistency check.
          if (updateGlobal) {
            // must also update docValuesType map so it's
            // aware of this field's DocValueType
            globalFieldNumbers.setDocValuesType(fi.number, name, docValues);
          }
        }

        if (!fi.omitsNorms() && normType != null) {
          fi.setNormValueType(normType);
        }
      }
      return fi;
    }
  @Override
  public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
    final SepTermState termState = (SepTermState) _termState;
    final boolean isFirstTerm = termState.termBlockOrd == 0;
    // System.out.println("SEPR.nextTerm termCount=" + termState.termBlockOrd + " isFirstTerm=" +
    // isFirstTerm + " bytesReader.pos=" + termState.bytesReader.getPosition());
    // System.out.println("  docFreq=" + termState.docFreq);
    termState.docIndex.read(termState.bytesReader, isFirstTerm);
    // System.out.println("  docIndex=" + termState.docIndex);
    if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
      termState.freqIndex.read(termState.bytesReader, isFirstTerm);
      if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
        // System.out.println("  freqIndex=" + termState.freqIndex);
        termState.posIndex.read(termState.bytesReader, isFirstTerm);
        // System.out.println("  posIndex=" + termState.posIndex);
        if (fieldInfo.hasPayloads()) {
          if (isFirstTerm) {
            termState.payloadFP = termState.bytesReader.readVLong();
          } else {
            termState.payloadFP += termState.bytesReader.readVLong();
          }
          // System.out.println("  payloadFP=" + termState.payloadFP);
        }
      }
    }

    if (termState.docFreq >= skipMinimum) {
      // System.out.println("   readSkip @ " + termState.bytesReader.getPosition());
      if (isFirstTerm) {
        termState.skipFP = termState.bytesReader.readVLong();
      } else {
        termState.skipFP += termState.bytesReader.readVLong();
      }
      // System.out.println("  skipFP=" + termState.skipFP);
    } else if (isFirstTerm) {
      termState.skipFP = 0;
    }
  }
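Note the file-pointer convention nextTerm decodes: each VLong is absolute for the first term in a block and a delta against the previous term afterwards, which keeps the encoded values small. A worked illustration (offsets hypothetical):

  // three terms in a block whose skip data starts at file offsets 100, 160, 230:
  //   term 0 (isFirstTerm): reads 100 -> skipFP = 100
  //   term 1:               reads  60 -> skipFP = 100 + 60 = 160
  //   term 2:               reads  70 -> skipFP = 160 + 70 = 230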
 @Override
 public void addPosition(int position, BytesRef payload, int startOffset, int endOffset)
     throws IOException {
   assert state == PostingsConsumerState.START;
   assert positionCount < freq;
   positionCount++;
   assert position >= lastPosition
       || position == -1; /* we still allow -1 from old 3.x indexes */
   lastPosition = position;
   if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
     assert startOffset >= 0;
     assert startOffset >= lastStartOffset;
     lastStartOffset = startOffset;
     assert endOffset >= startOffset;
   } else {
     assert startOffset == -1;
     assert endOffset == -1;
   }
   if (payload != null) {
     assert fieldInfo.hasPayloads();
   }
   in.addPosition(position, payload, startOffset, endOffset);
 }
 @Override
 public void finishTerm(BytesRef text, TermStats stats) throws IOException {
   assert state == TermsConsumerState.START;
   state = TermsConsumerState.INITIAL;
   assert text.equals(lastTerm);
   assert stats.docFreq > 0; // otherwise, this method should not be called.
   assert stats.docFreq == lastPostingsConsumer.docFreq;
   sumDocFreq += stats.docFreq;
   if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
     assert stats.totalTermFreq == -1;
   } else {
     assert stats.totalTermFreq == lastPostingsConsumer.totalTermFreq;
     sumTotalTermFreq += stats.totalTermFreq;
   }
   in.finishTerm(text, stats);
 }
 @Override
 public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
   assert state == TermsConsumerState.INITIAL
       || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0;
   state = TermsConsumerState.FINISHED;
   assert docCount >= 0;
   assert docCount == visitedDocs.cardinality();
   assert sumDocFreq >= docCount;
   assert sumDocFreq == this.sumDocFreq;
   if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
     assert sumTotalTermFreq == -1;
   } else {
     assert sumTotalTermFreq >= sumDocFreq;
     assert sumTotalTermFreq == this.sumTotalTermFreq;
   }
   in.finish(sumTotalTermFreq, sumDocFreq, docCount);
 }
Example 27
    // Finishes all terms in this field
    @Override
    public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
      if (numTerms > 0) {
        blockBuilder.finish();

        // We better have one final "root" block:
        assert pending.size() == 1 && !pending.get(0).isTerm
            : "pending.size()=" + pending.size() + " pending=" + pending;
        final PendingBlock root = (PendingBlock) pending.get(0);
        assert root.prefix.length == 0;
        assert root.index.getEmptyOutput() != null;

        this.sumTotalTermFreq = sumTotalTermFreq;
        this.sumDocFreq = sumDocFreq;
        this.docCount = docCount;

        // Write FST to index
        indexStartFP = indexOut.getFilePointer();
        root.index.save(indexOut);
        // System.out.println("  write FST " + indexStartFP + " field=" + fieldInfo.name);

        // if (SAVE_DOT_FILES || DEBUG) {
        //   final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
        //   Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
        //   Util.toDot(root.index, w, false, false);
        //   System.out.println("SAVED to " + dotFileName);
        //   w.close();
        // }

        fields.add(
            new FieldMetaData(
                fieldInfo,
                ((PendingBlock) pending.get(0)).index.getEmptyOutput(),
                numTerms,
                indexStartFP,
                sumTotalTermFreq,
                sumDocFreq,
                docCount));
      } else {
        assert sumTotalTermFreq == 0
            || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1;
        assert sumDocFreq == 0;
        assert docCount == 0;
      }
    }
 @Override
 public void startDoc(int docID, int freq) throws IOException {
   assert state == PostingsConsumerState.INITIAL;
   state = PostingsConsumerState.START;
   assert docID >= 0;
   if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
     assert freq == -1;
     this.freq = 0; // we don't expect any positions here
   } else {
     assert freq > 0;
     this.freq = freq;
     totalTermFreq += freq;
   }
   this.positionCount = 0;
   this.lastPosition = 0;
   this.lastStartOffset = 0;
   docFreq++;
   visitedDocs.set(docID);
   in.startDoc(docID, freq);
 }
  /** Produce _X.nrm if any document had a field with norms not disabled */
  @Override
  public void flush(
      Map<String, InvertedDocEndConsumerPerField> fieldsToFlush, SegmentWriteState state)
      throws IOException {
    boolean success = false;
    boolean anythingFlushed = false;
    try {
      if (state.fieldInfos.hasNorms()) {
        for (FieldInfo fi : state.fieldInfos) {
          final NormsConsumerPerField toWrite = (NormsConsumerPerField) fieldsToFlush.get(fi.name);
          // we must check the final value of omitNorms for the fieldinfo, it could have
          // changed for this field since the first time we added it.
          if (!fi.omitsNorms()) {
            if (toWrite != null && toWrite.initialized()) {
              anythingFlushed = true;
              final Type type = toWrite.flush(state.segmentInfo.getDocCount());
              assert fi.getNormType() == type;
            } else if (fi.isIndexed()) {
              anythingFlushed = true;
              assert fi.getNormType() == null : "got " + fi.getNormType() + "; field=" + fi.name;
            }
          }
        }
      }

      success = true;
      if (!anythingFlushed && consumer != null) {
        consumer.abort();
      }
    } finally {
      if (success) {
        IOUtils.close(consumer);
      } else {
        IOUtils.closeWhileHandlingException(consumer);
      }
    }
  }
Example 30
  /** Call this only once (if you subclass!) */
  protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix)
      throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
      throw new IllegalStateException(
          "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    // System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    final long startTime = System.nanoTime();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    final int maxDoc = reader.maxDoc();
    // immediate term numbers, or the index into the byte[] representing the last number
    final int[] index = new int[maxDoc];
    // last term we saw for this document
    final int[] lastTerm = new int[maxDoc];
    // list of term numbers for the doc (delta encoded vInts)
    final byte[][] bytes = new byte[maxDoc][];

    final Terms terms = reader.terms(field);
    if (terms == null) {
      // No terms
      return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    // System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
      // No terms match
      return;
    }

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];
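    // (9 = up to 4 bytes already inlined in index[doc] plus one worst-case 5-byte
    // vInt; see the inline/pointer handling below.)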

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.
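
    // Illustration of the two index[doc] encodings built below (values hypothetical):
    //   low byte != 1: index[doc] itself holds up to 4 delta-coded vInt bytes inline
    //   low byte == 1: index[doc] >>> 8 is the end offset of the deltas in bytes[doc]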

    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ; ) {
      final BytesRef t = te.term();
      if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
        break;
      }
      // System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

      visitTerm(te, termNum);

      if ((termNum & indexIntervalMask) == 0) {
        // Index this term
        sizeOfIndexedStrings += t.length;
        BytesRef indexedTerm = new BytesRef();
        indexedTermsBytes.copy(t, indexedTerm);
        // TODO: really should 1) strip off useless suffix,
        // and 2) use FST not array/PagedBytes
        indexedTerms.add(indexedTerm);
      }

      final int df = te.docFreq();
      if (df <= maxTermDocFreq) {

        postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

        // dF, but takes deletions into account
        int actualDF = 0;

        for (; ; ) {
          int doc = postingsEnum.nextDoc();
          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          }
          // System.out.println("  chunk=" + chunk + " docs");

          actualDF++;
          termInstances++;

          // System.out.println("    docID=" + doc);
          // add TNUM_OFFSET to the term number to make room for special reserved values:
          // 0 (end term) and 1 (index into byte array follows)
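          // e.g. assuming TNUM_OFFSET = 2: termNum = 5, lastTerm[doc] = 3
          //   -> delta = 5 - 3 + 2 = 4, decoded later as 4 - TNUM_OFFSET = 2 terms apart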
          int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
          lastTerm[doc] = termNum;
          int val = index[doc];

          if ((val & 0xff) == 1) {
            // index into byte array (actually the end of
            // the doc-specific byte[] when building)
            int pos = val >>> 8;
            int ilen = vIntSize(delta);
            byte[] arr = bytes[doc];
            int newend = pos + ilen;
            if (newend > arr.length) {
              // We avoid a doubling strategy to lower memory usage; this faceting
              // method isn't for docs with many terms. In HotSpot, objects have 2
              // words of overhead, then fields, rounded up to a 64-bit boundary.
              // TODO: figure out what array lengths we can round up to w/o actually
              // using more memory (how much space does a byte[] take up? Is data
              // preceded by a 32-bit length only?). It should be safe to round up
              // to the nearest 32 bits in any case.
              int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
              byte[] newarr = new byte[newLen];
              System.arraycopy(arr, 0, newarr, 0, pos);
              arr = newarr;
              bytes[doc] = newarr;
            }
            pos = writeInt(delta, arr, pos);
            index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
          } else {
            // OK, this int has data in it... find the end (a zero starting byte - not
            // part of another number, hence not following a byte with the high bit set).
            int ipos;
            if (val == 0) {
              ipos = 0;
            } else if ((val & 0x0000ff80) == 0) {
              ipos = 1;
            } else if ((val & 0x00ff8000) == 0) {
              ipos = 2;
            } else if ((val & 0xff800000) == 0) {
              ipos = 3;
            } else {
              ipos = 4;
            }
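            // ipos = number of vInt bytes already packed into val: a vInt byte with
            // its high bit set continues into the next one, so the masks above find
            // the first all-zero tail where new bytes can be appended.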

            // System.out.println("      ipos=" + ipos);

            int endPos = writeInt(delta, tempArr, ipos);
            // System.out.println("      endpos=" + endPos);
            if (endPos <= 4) {
              // System.out.println("      fits!");
              // value will fit in the integer... move bytes back
              for (int j = ipos; j < endPos; j++) {
                val |= (tempArr[j] & 0xff) << (j << 3);
              }
              index[doc] = val;
            } else {
              // value won't fit... move integer into byte[]
              for (int j = 0; j < ipos; j++) {
                tempArr[j] = (byte) val;
                val >>>= 8;
              }
              // point at the end index in the byte[]
              index[doc] = (endPos << 8) | 1;
              bytes[doc] = tempArr;
              tempArr = new byte[12];
            }
          }
        }
        setActualDocFreq(termNum, actualDF);
      }

      termNum++;
      if (te.next() == null) {
        break;
      }
    }

    numTermsInField = termNum;

    long midPoint = System.nanoTime();

    if (termInstances == 0) {
      // we didn't invert anything
      // lower memory consumption.
      tnums = null;
    } else {

      this.index = index;

      //
      // transform intermediate form into the final form, building a single byte[]
      // at a time, and releasing the intermediate byte[]s as we go to avoid
      // increasing the memory footprint.
      //

      for (int pass = 0; pass < 256; pass++) {
        byte[] target = tnums[pass];
        int pos = 0; // end in target;
        if (target != null) {
          pos = target.length;
        } else {
          target = new byte[4096];
        }

        // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
        // where pp is the pass (which array we are building), and xx is all values.
        // each pass shares the same byte[] for termNumber lists.
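        // Equivalently, a doc belongs to pass ((doc >>> 16) & 0xff), so the 256 tnums
        // arrays partition documents by bits 16-23 of their docIDs.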
        for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
          int lim = Math.min(docbase + (1 << 16), maxDoc);
          for (int doc = docbase; doc < lim; doc++) {
            // System.out.println("  pass="******" process docID=" + doc);
            int val = index[doc];
            if ((val & 0xff) == 1) {
              int len = val >>> 8;
              // System.out.println("    ptr pos=" + pos);
              index[doc] = (pos << 8) | 1; // change index to point to start of array
              if ((pos & 0xff000000) != 0) {
                // we only have 24 bits for the array index
                throw new IllegalStateException(
                    "Too many values for UnInvertedField faceting on field " + field);
              }
              byte[] arr = bytes[doc];
              /*
              for(byte b : arr) {
                //System.out.println("      b=" + Integer.toHexString((int) b));
              }
              */
              bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
              if (target.length <= pos + len) {
                int newlen = target.length;
                /*
                 * We don't have to worry about the array getting too large since the
                 * "pos" param will overflow first (only 24 bits available):
                 *   if ((newlen << 1) <= 0) { // overflow...
                 *     newlen = Integer.MAX_VALUE;
                 *     if (newlen <= pos + len) {
                 *       throw new SolrException(400, "Too many terms to uninvert field!");
                 *     }
                 *   } else {
                 *     while (newlen <= pos + len) newlen <<= 1; // doubling strategy
                 *   }
                 */
                while (newlen <= pos + len) newlen <<= 1; // doubling strategy
                byte[] newtarget = new byte[newlen];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
              }
              System.arraycopy(arr, 0, target, pos, len);
              pos += len + 1; // skip single byte at end and leave it 0 for terminator
            }
          }
        }

        // shrink array
        if (pos < target.length) {
          byte[] newtarget = new byte[pos];
          System.arraycopy(target, 0, newtarget, 0, pos);
          target = newtarget;
        }

        tnums[pass] = target;

        if ((pass << 16) > maxDoc) break;
      }
    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

    long endTime = System.nanoTime();

    total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
    phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
  }