Пример #1
0
  /**
   * Adds terms and frequencies found in vector into the Map termFreqMap
   *
   * @param termFreqMap a Map of terms and their frequencies
   * @param vector List of terms and their frequencies for a doc/field
   * @param fieldName Optional field name of the terms for skip terms
   */
  private void addTermFrequencies(
      Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
      spare.copyUTF8Bytes(text);
      final String term = spare.toString();
      if (isNoiseWord(term)) {
        continue;
      }
      if (isSkipTerm(fieldName, term)) {
        continue;
      }

      final PostingsEnum docs = termsEnum.postings(null, null);
      int freq = 0;
      while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        freq += docs.freq();
      }

      // increment frequency
      Int cnt = termFreqMap.get(term);
      if (cnt == null) {
        cnt = new Int();
        termFreqMap.put(term, cnt);
        cnt.x = freq;
      } else {
        cnt.x += freq;
      }
    }
  }
  public void testBasic() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    Field f = newField("foo", "this is a test test", ft);
    doc.add(f);
    for (int i = 0; i < 100; i++) {
      w.addDocument(doc);
    }

    IndexReader reader = w.getReader();
    w.close();

    assertNotNull(MultiFields.getTermPositionsEnum(reader, "foo", new BytesRef("test")));

    PostingsEnum de =
        TestUtil.docs(random(), reader, "foo", new BytesRef("test"), null, PostingsEnum.FREQS);
    while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      assertEquals(2, de.freq());
    }

    reader.close();
    dir.close();
  }
Пример #3
0
 public static int[] toArray(PostingsEnum postingsEnum) throws IOException {
   List<Integer> docs = new ArrayList<>();
   while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
     int docID = postingsEnum.docID();
     docs.add(docID);
   }
   return ArrayUtil.toIntArray(docs);
 }
Пример #4
0
  public TermInfo collect(String term) throws IOException {
    TermInfo info = new TermInfo();
    BytesRef luceneTerm = new BytesRef(term.getBytes());
    // this gives documents in which the term is found, but no offset information can be retrieved
    PostingsEnum postings =
        MultiFields.getTermDocsEnum(indexReader, ngramInfoFieldname, luceneTerm);
    // now go through each document
    int docId = postings.nextDoc();
    while (docId != PostingsEnum.NO_MORE_DOCS) {
      // get the term vector for that document.
      TermsEnum it = indexReader.getTermVector(docId, ngramInfoFieldname).iterator();
      // find the term of interest
      it.seekExact(luceneTerm);
      // get its posting info. this will contain offset info
      PostingsEnum postingsInDoc = it.postings(null, PostingsEnum.OFFSETS);
      postingsInDoc.nextDoc();

      Document doc = indexReader.document(docId);
      String id = doc.get(idFieldname);
      JATEDocument jd = new JATEDocument(id);
      Set<int[]> offsets = new HashSet<>();
      int totalFreq = postingsInDoc.freq();
      for (int i = 0; i < totalFreq; i++) {
        postingsInDoc.nextPosition();
        offsets.add(new int[] {postingsInDoc.startOffset(), postingsInDoc.endOffset()});
      }
      info.getOffsets().put(jd, offsets);

      docId = postings.nextDoc();
    }

    return info;
  }
Пример #5
0
 private void verifyCount(IndexReader ir) throws Exception {
   Fields fields = MultiFields.getFields(ir);
   for (String field : fields) {
     Terms terms = fields.terms(field);
     if (terms == null) {
       continue;
     }
     int docCount = terms.getDocCount();
     FixedBitSet visited = new FixedBitSet(ir.maxDoc());
     TermsEnum te = terms.iterator();
     while (te.next() != null) {
       PostingsEnum de = TestUtil.docs(random(), te, null, PostingsEnum.NONE);
       while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
         visited.set(de.docID());
       }
     }
     assertEquals(visited.cardinality(), docCount);
   }
 }
Пример #6
0
 public AssertingPostingsEnum(PostingsEnum in) {
   super(in);
   this.doc = in.docID();
 }
Пример #7
0
  public void testMerge() throws IOException {
    final Codec codec = Codec.getDefault();
    final SegmentInfo si =
        new SegmentInfo(
            mergedDir,
            Version.LATEST,
            mergedSegment,
            -1,
            false,
            codec,
            Collections.emptyMap(),
            StringHelper.randomId(),
            new HashMap<>());

    SegmentMerger merger =
        new SegmentMerger(
            Arrays.<CodecReader>asList(reader1, reader2),
            si,
            InfoStream.getDefault(),
            mergedDir,
            new FieldInfos.FieldNumbers(),
            newIOContext(random(), new IOContext(new MergeInfo(-1, -1, false, -1))));
    MergeState mergeState = merger.merge();
    int docsMerged = mergeState.segmentInfo.maxDoc();
    assertTrue(docsMerged == 2);
    // Should be able to open a new SegmentReader against the new directory
    SegmentReader mergedReader =
        new SegmentReader(
            new SegmentCommitInfo(mergeState.segmentInfo, 0, -1L, -1L, -1L),
            newIOContext(random()));
    assertTrue(mergedReader != null);
    assertTrue(mergedReader.numDocs() == 2);
    Document newDoc1 = mergedReader.document(0);
    assertTrue(newDoc1 != null);
    // There are 2 unstored fields on the document
    assertTrue(
        DocHelper.numFields(newDoc1) == DocHelper.numFields(doc1) - DocHelper.unstored.size());
    Document newDoc2 = mergedReader.document(1);
    assertTrue(newDoc2 != null);
    assertTrue(
        DocHelper.numFields(newDoc2) == DocHelper.numFields(doc2) - DocHelper.unstored.size());

    PostingsEnum termDocs =
        TestUtil.docs(
            random(), mergedReader, DocHelper.TEXT_FIELD_2_KEY, new BytesRef("field"), null, 0);
    assertTrue(termDocs != null);
    assertTrue(termDocs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);

    int tvCount = 0;
    for (FieldInfo fieldInfo : mergedReader.getFieldInfos()) {
      if (fieldInfo.hasVectors()) {
        tvCount++;
      }
    }

    // System.out.println("stored size: " + stored.size());
    assertEquals("We do not have 3 fields that were indexed with term vector", 3, tvCount);

    Terms vector = mergedReader.getTermVectors(0).terms(DocHelper.TEXT_FIELD_2_KEY);
    assertNotNull(vector);
    assertEquals(3, vector.size());
    TermsEnum termsEnum = vector.iterator();

    int i = 0;
    while (termsEnum.next() != null) {
      String term = termsEnum.term().utf8ToString();
      int freq = (int) termsEnum.totalTermFreq();
      // System.out.println("Term: " + term + " Freq: " + freq);
      assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1);
      assertTrue(DocHelper.FIELD_2_FREQS[i] == freq);
      i++;
    }

    TestSegmentReader.checkNorms(mergedReader);
    mergedReader.close();
  }
  @Test
  public void testRandom() throws Exception {
    Directory directory = newDirectory();
    final Random r = random();
    final IndexWriterConfig iwc =
        LuceneTestCase.newIndexWriterConfig(r, new MockAnalyzer(r))
            .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
            .setRAMBufferSizeMB(
                scaledRandomIntBetween(16, 64)); // we might index a lot - don't go crazy here
    RandomIndexWriter indexWriter = new RandomIndexWriter(r, directory, iwc);
    int numUniqueChildValues = scaledRandomIntBetween(100, 2000);
    String[] childValues = new String[numUniqueChildValues];
    for (int i = 0; i < numUniqueChildValues; i++) {
      childValues[i] = Integer.toString(i);
    }

    IntOpenHashSet filteredOrDeletedDocs = new IntOpenHashSet();
    int childDocId = 0;
    int numParentDocs = scaledRandomIntBetween(1, numUniqueChildValues);
    ObjectObjectOpenHashMap<String, NavigableSet<String>> childValueToParentIds =
        new ObjectObjectOpenHashMap<>();
    for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) {
      boolean markParentAsDeleted = rarely();
      boolean filterMe = rarely();
      String parent = Integer.toString(parentDocId);
      Document document = new Document();
      document.add(
          new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES));
      document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
      if (markParentAsDeleted) {
        filteredOrDeletedDocs.add(parentDocId);
        document.add(new StringField("delete", "me", Field.Store.NO));
      }
      if (filterMe) {
        filteredOrDeletedDocs.add(parentDocId);
        document.add(new StringField("filter", "me", Field.Store.NO));
      }
      indexWriter.addDocument(document);

      final int numChildDocs = scaledRandomIntBetween(0, 100);
      for (int i = 0; i < numChildDocs; i++) {
        boolean markChildAsDeleted = rarely();
        String childValue = childValues[random().nextInt(childValues.length)];

        document = new Document();
        document.add(
            new StringField(
                UidFieldMapper.NAME,
                Uid.createUid("child", Integer.toString(childDocId++)),
                Field.Store.NO));
        document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO));
        document.add(
            new StringField(
                ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO));
        document.add(new StringField("field1", childValue, Field.Store.NO));
        if (markChildAsDeleted) {
          document.add(new StringField("delete", "me", Field.Store.NO));
        }
        indexWriter.addDocument(document);

        if (!markChildAsDeleted) {
          NavigableSet<String> parentIds;
          if (childValueToParentIds.containsKey(childValue)) {
            parentIds = childValueToParentIds.lget();
          } else {
            childValueToParentIds.put(childValue, parentIds = new TreeSet<>());
          }
          if (!markParentAsDeleted && !filterMe) {
            parentIds.add(parent);
          }
        }
      }
    }

    // Delete docs that are marked to be deleted.
    indexWriter.deleteDocuments(new Term("delete", "me"));

    indexWriter.commit();
    IndexReader indexReader = DirectoryReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(indexReader);
    Engine.Searcher engineSearcher =
        new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher);
    ((TestSearchContext) SearchContext.current())
        .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));

    int max = numUniqueChildValues / 4;
    for (int i = 0; i < max; i++) {
      // Simulate a parent update
      if (random().nextBoolean()) {
        final int numberOfUpdatableParents = numParentDocs - filteredOrDeletedDocs.size();
        int numberOfUpdates = scaledRandomIntBetween(0, numberOfUpdatableParents);
        for (int j = 0; j < numberOfUpdates; j++) {
          int parentId;
          do {
            parentId = random().nextInt(numParentDocs);
          } while (filteredOrDeletedDocs.contains(parentId));

          String parentUid = Uid.createUid("parent", Integer.toString(parentId));
          indexWriter.deleteDocuments(new Term(UidFieldMapper.NAME, parentUid));

          Document document = new Document();
          document.add(new StringField(UidFieldMapper.NAME, parentUid, Field.Store.YES));
          document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
          indexWriter.addDocument(document);
        }

        indexReader.close();
        indexReader = DirectoryReader.open(indexWriter.w, true);
        searcher = new IndexSearcher(indexReader);
        engineSearcher =
            new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher);
        ((TestSearchContext) SearchContext.current())
            .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));
      }

      String childValue = childValues[random().nextInt(numUniqueChildValues)];
      int shortCircuitParentDocSet = random().nextInt(numParentDocs);
      QueryBuilder queryBuilder;
      if (random().nextBoolean()) {
        queryBuilder =
            hasChildQuery("child", termQuery("field1", childValue))
                .setShortCircuitCutoff(shortCircuitParentDocSet);
      } else {
        queryBuilder =
            constantScoreQuery(
                hasChildFilter("child", termQuery("field1", childValue))
                    .setShortCircuitCutoff(shortCircuitParentDocSet));
      }
      // Using a FQ, will invoke / test the Scorer#advance(..) and also let the Weight#scorer not
      // get live docs as acceptedDocs
      queryBuilder = filteredQuery(queryBuilder, notFilter(termFilter("filter", "me")));
      Query query = parseQuery(queryBuilder);

      BitSetCollector collector = new BitSetCollector(indexReader.maxDoc());
      searcher.search(query, collector);
      FixedBitSet actualResult = collector.getResult();

      FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc());
      if (childValueToParentIds.containsKey(childValue)) {
        LeafReader slowLeafReader = SlowCompositeReaderWrapper.wrap(indexReader);
        Terms terms = slowLeafReader.terms(UidFieldMapper.NAME);
        if (terms != null) {
          NavigableSet<String> parentIds = childValueToParentIds.lget();
          TermsEnum termsEnum = terms.iterator(null);
          PostingsEnum docsEnum = null;
          for (String id : parentIds) {
            TermsEnum.SeekStatus seekStatus =
                termsEnum.seekCeil(Uid.createUidAsBytes("parent", id));
            if (seekStatus == TermsEnum.SeekStatus.FOUND) {
              docsEnum =
                  termsEnum.postings(slowLeafReader.getLiveDocs(), docsEnum, PostingsEnum.NONE);
              expectedResult.set(docsEnum.nextDoc());
            } else if (seekStatus == TermsEnum.SeekStatus.END) {
              break;
            }
          }
        }
      }

      assertBitSet(actualResult, expectedResult, searcher);
    }

    indexWriter.close();
    indexReader.close();
    directory.close();
  }