public TIntDoubleHashMap getConceptVector(String phrase, TIntSet validIds) throws IOException {
   synchronized (phraseCache) {
     if (phraseCache.containsKey(phrase)) {
       return phraseCache.get(phrase);
     }
   }
   QueryParser parser = new QueryParser(Version.LUCENE_42, "text", analyzer);
   TopDocs docs = null;
   try {
     docs = searcher.search(parser.parse(phrase), esaHelper.getWpIdFilter(validIds), 5000);
   } catch (org.apache.lucene.queryparser.classic.ParseException e) {
     LOG.log(Level.WARNING, "parsing of phrase " + phrase + " failed", e);
     return null;
   }
   pruneSimilar(docs);
   TIntDoubleHashMap result = expandScores(docs.scoreDocs);
   synchronized (phraseCache) {
     phraseCache.put(phrase, result);
   }
   return result;
   //        System.out.println("top docs for " + phrase + " are:");
   //        for (int i = 0; i < 50 && i < docs.scoreDocs.length; i++) {
   //            ScoreDoc sd = docs.scoreDocs[i];
   //            Document d = reader.document(sd.doc);
   //
   //            System.out.println("\t" + sd.score + ": " +
   //                    d.get("title") + ", " + d.get("text").split("\\s+").length +
   //                    ", " + d.get("inlinks"));
   //        }
 }
 @Override
 public DocScoreList mostSimilar(int wpId, int maxResults, TIntSet validIds) throws IOException {
   if (hasCachedMostSimilar(wpId)) {
     return getCachedMostSimilar(wpId, maxResults, validIds);
   }
   MoreLikeThis mlt = getMoreLikeThis();
   int luceneId = esaHelper.wpIdToLuceneId(wpId);
   Query query;
   if (luceneId >= 0) {
     query = mlt.like(luceneId);
   } else if (textHelper != null && textHelper.wpIdToLuceneId(wpId) >= 0) {
     Document d = textHelper.wpIdToLuceneDoc(wpId);
     String text = d.get(Page.FIELD_TEXT);
     query = mlt.like(new StringReader(text), Page.FIELD_TEXT);
   } else {
     return null;
   }
   TopDocs similarDocs = searcher.search(query, esaHelper.getWpIdFilter(validIds), maxResults);
   pruneSimilar(similarDocs);
   DocScoreList scores = new DocScoreList(similarDocs.scoreDocs.length);
   for (int i = 0; i < similarDocs.scoreDocs.length; i++) {
     ScoreDoc sd = similarDocs.scoreDocs[i];
     scores.set(i, esaHelper.luceneIdToWpId(sd.doc), similarDocs.scoreDocs[i].score);
   }
   return normalize(scores);
 }
  public double singleSearch(int docNum)
      throws IOException, InstantiationException, IllegalAccessException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));

    // -----------

    String query = reader.document(docNum).getValues("hash")[0];
    CEDD ceddQuery = new CEDD();
    ceddQuery.setByteArrayRepresentation(
        reader.document(docNum).getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue().bytes,
        reader.document(docNum).getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue().offset,
        reader.document(docNum).getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue().length);

    // -----------

    HashSet<String> gold = new HashSet<String>(numImagesEval);
    ImageSearcher cis = ImageSearcherFactory.createCEDDImageSearcher(100);
    ImageSearchHits hits = cis.search(reader.document(docNum), reader);
    for (int i = 0; i < 10; i++) {
      gold.add(hits.doc(i).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]);
    }

    // ------------

    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(
        new SimilarityBase() {
          @Override
          protected float score(BasicStats basicStats, float freq, float v2) {
            return 1;
          }

          @Override
          public String toString() {
            return null;
          }
        });
    TopDocs topDocs = searcher.search(createQuery(query), 500);
    topDocs = rerank(topDocs, ceddQuery, reader);
    //        System.out.println("topDocs.scoreDocs.length = " + topDocs.scoreDocs.length);
    double numMatches = 0;
    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
      ScoreDoc scoreDoc = topDocs.scoreDocs[i];
      //            System.out.print(scoreDoc.score + ": ");
      String file =
          reader.document(scoreDoc.doc).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0];
      //            System.out.println(file.substring(file.lastIndexOf('/') + 1) +
      // (gold.contains(file)?" x":" o"));
      if (gold.contains(file)) numMatches++;
    }
    return numMatches;
  }
  @Override
  public double similarity(int wpId1, int wpId2) throws IOException {
    int doc1 = esaHelper.wpIdToLuceneId(wpId1);
    int doc2 = esaHelper.wpIdToLuceneId(wpId2);

    if (doc1 < 0 || doc2 < 0) {
      return normalize(0.0);
    }

    MoreLikeThis mlt = getMoreLikeThis();
    TopDocs similarDocs =
        searcher.search(mlt.like(doc1), new FieldCacheTermsFilter("id", "" + wpId2), 1);
    if (similarDocs.scoreDocs.length == 0) {
      return normalize(0);
    } else {
      assert (similarDocs.scoreDocs.length == 1);
      assert (similarDocs.scoreDocs[0].doc == doc2);
      return normalize(similarDocs.scoreDocs[0].score);
    }
  }
  public void testOutputSearchResults()
      throws IOException, InstantiationException, IllegalAccessException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int docNum = 0; // doc to search for.
    // -----------

    String query = reader.document(docNum).getValues("hash")[0];
    CEDD ceddQuery = new CEDD();
    ceddQuery.setByteArrayRepresentation(
        reader.document(docNum).getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue().bytes,
        reader.document(docNum).getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue().offset,
        reader.document(docNum).getField(DocumentBuilder.FIELD_NAME_CEDD).binaryValue().length);

    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs topDocs = searcher.search(createQuery(query), numImagesEval);
    FileUtils.saveImageResultsToPng(
        "result_lsh",
        topDocs,
        reader.document(docNum).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0],
        reader);
  }
Ejemplo n.º 6
0
  /**
   * Assert that the {@code scoreType} operates as expected and parents are found in the expected
   * order.
   *
   * <p>This will use the test index's parent/child types to create parents with multiple children.
   * Each child will have a randomly generated scored stored in {@link #CHILD_SCORE_NAME}, which is
   * used to score based on the {@code scoreType} by using a {@link MockScorer} to determine the
   * expected scores.
   *
   * @param scoreType The score type to use within the query to score parents relative to their
   *     children.
   * @throws IOException if any unexpected error occurs
   */
  private void assertScoreType(ScoreType scoreType) throws IOException {
    SearchContext context = SearchContext.current();
    Directory directory = newDirectory();
    IndexWriter writer =
        new IndexWriter(directory, newIndexWriterConfig(new MockAnalyzer(random())));

    // calculates the expected score per parent
    MockScorer scorer = new MockScorer(scoreType);
    scorer.scores = new FloatArrayList(10);

    // number of parents to generate
    int parentDocs = scaledRandomIntBetween(2, 10);
    // unique child ID
    int childDocId = 0;

    // Parent ID to expected score
    Map<String, Float> parentScores = new TreeMap<>();

    // Add a few random parents to ensure that the children's score is appropriately taken into
    // account
    for (int parentDocId = 0; parentDocId < parentDocs; ++parentDocId) {
      String parent = Integer.toString(parentDocId);

      // Create the parent
      Document parentDocument = new Document();

      parentDocument.add(
          new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES));
      parentDocument.add(new StringField(IdFieldMapper.NAME, parent, Field.Store.YES));
      parentDocument.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));

      // add the parent to the index
      writer.addDocument(parentDocument);

      int numChildDocs = scaledRandomIntBetween(1, 10);

      // forget any parent's previous scores
      scorer.scores.clear();

      // associate children with the parent
      for (int i = 0; i < numChildDocs; ++i) {
        int childScore = random().nextInt(128);

        Document childDocument = new Document();

        childDocument.add(
            new StringField(
                UidFieldMapper.NAME,
                Uid.createUid("child", Integer.toString(childDocId++)),
                Field.Store.NO));
        childDocument.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO));
        // parent association:
        childDocument.add(
            new StringField(
                ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO));
        childDocument.add(new DoubleField(CHILD_SCORE_NAME, childScore, Field.Store.NO));

        // remember the score to be calculated
        scorer.scores.add(childScore);

        // add the associated child to the index
        writer.addDocument(childDocument);
      }

      // this score that should be returned for this parent
      parentScores.put(parent, scorer.score());
    }

    writer.commit();

    IndexReader reader = DirectoryReader.open(writer, true);
    IndexSearcher searcher = new IndexSearcher(reader);

    // setup to read the parent/child map
    Engine.Searcher engineSearcher =
        new Engine.Searcher(ChildrenQueryTests.class.getSimpleName(), searcher);
    ((TestSearchContext) context).setSearcher(new ContextIndexSearcher(context, engineSearcher));

    // child query that returns the score as the value of "childScore" for each child document, with
    // the parent's score determined by the score type
    QueryBuilder childQueryBuilder =
        functionScoreQuery(typeFilter("child"))
            .add(new FieldValueFactorFunctionBuilder(CHILD_SCORE_NAME));
    QueryBuilder queryBuilder =
        hasChildQuery("child", childQueryBuilder)
            .scoreType(scoreType.name().toLowerCase(Locale.ENGLISH))
            .setShortCircuitCutoff(parentDocs);

    // Perform the search for the documents using the selected score type
    TopDocs docs = searcher.search(parseQuery(queryBuilder), parentDocs);
    assertThat("Expected all parents", docs.totalHits, is(parentDocs));

    // score should be descending (just a sanity check)
    float topScore = docs.scoreDocs[0].score;

    // ensure each score is returned as expected
    for (int i = 0; i < parentDocs; ++i) {
      ScoreDoc scoreDoc = docs.scoreDocs[i];
      // get the ID from the document to get its expected score; remove it so we cannot double-count
      // it
      float score = parentScores.remove(reader.document(scoreDoc.doc).get(IdFieldMapper.NAME));

      // expect exact match
      assertThat("Unexpected score", scoreDoc.score, is(score));
      assertThat("Not descending", score, lessThanOrEqualTo(topScore));

      // it had better keep descending
      topScore = score;
    }

    reader.close();
    writer.close();
    directory.close();
  }
Ejemplo n.º 7
0
  @Test
  public void testRandom() throws Exception {
    Directory directory = newDirectory();
    final Random r = random();
    final IndexWriterConfig iwc =
        LuceneTestCase.newIndexWriterConfig(r, new MockAnalyzer(r))
            .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
            .setRAMBufferSizeMB(
                scaledRandomIntBetween(16, 64)); // we might index a lot - don't go crazy here
    RandomIndexWriter indexWriter = new RandomIndexWriter(r, directory, iwc);
    int numUniqueChildValues = scaledRandomIntBetween(100, 2000);
    String[] childValues = new String[numUniqueChildValues];
    for (int i = 0; i < numUniqueChildValues; i++) {
      childValues[i] = Integer.toString(i);
    }

    IntOpenHashSet filteredOrDeletedDocs = new IntOpenHashSet();

    int childDocId = 0;
    int numParentDocs = scaledRandomIntBetween(1, numUniqueChildValues);
    ObjectObjectOpenHashMap<String, NavigableMap<String, FloatArrayList>> childValueToParentIds =
        new ObjectObjectOpenHashMap<>();
    for (int parentDocId = 0; parentDocId < numParentDocs; parentDocId++) {
      boolean markParentAsDeleted = rarely();
      boolean filterMe = rarely();
      String parent = Integer.toString(parentDocId);
      Document document = new Document();
      document.add(
          new StringField(UidFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.YES));
      document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
      if (markParentAsDeleted) {
        filteredOrDeletedDocs.add(parentDocId);
        document.add(new StringField("delete", "me", Field.Store.NO));
      }
      if (filterMe) {
        filteredOrDeletedDocs.add(parentDocId);
        document.add(new StringField("filter", "me", Field.Store.NO));
      }
      indexWriter.addDocument(document);

      int numChildDocs = scaledRandomIntBetween(0, 100);
      for (int i = 0; i < numChildDocs; i++) {
        boolean markChildAsDeleted = rarely();
        String childValue = childValues[random().nextInt(childValues.length)];

        document = new Document();
        document.add(
            new StringField(
                UidFieldMapper.NAME,
                Uid.createUid("child", Integer.toString(childDocId++)),
                Field.Store.NO));
        document.add(new StringField(TypeFieldMapper.NAME, "child", Field.Store.NO));
        document.add(
            new StringField(
                ParentFieldMapper.NAME, Uid.createUid("parent", parent), Field.Store.NO));
        document.add(new StringField("field1", childValue, Field.Store.NO));
        if (markChildAsDeleted) {
          document.add(new StringField("delete", "me", Field.Store.NO));
        }
        indexWriter.addDocument(document);

        if (!markChildAsDeleted) {
          NavigableMap<String, FloatArrayList> parentIdToChildScores;
          if (childValueToParentIds.containsKey(childValue)) {
            parentIdToChildScores = childValueToParentIds.lget();
          } else {
            childValueToParentIds.put(childValue, parentIdToChildScores = new TreeMap<>());
          }
          if (!markParentAsDeleted && !filterMe) {
            FloatArrayList childScores = parentIdToChildScores.get(parent);
            if (childScores == null) {
              parentIdToChildScores.put(parent, childScores = new FloatArrayList());
            }
            childScores.add(1f);
          }
        }
      }
    }

    // Delete docs that are marked to be deleted.
    indexWriter.deleteDocuments(new Term("delete", "me"));
    indexWriter.commit();

    IndexReader indexReader = DirectoryReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(indexReader);
    Engine.Searcher engineSearcher =
        new Engine.Searcher(ChildrenQueryTests.class.getSimpleName(), searcher);
    ((TestSearchContext) SearchContext.current())
        .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));

    int max = numUniqueChildValues / 4;
    for (int i = 0; i < max; i++) {
      // Simulate a parent update
      if (random().nextBoolean()) {
        final int numberOfUpdatableParents = numParentDocs - filteredOrDeletedDocs.size();
        int numberOfUpdates =
            RandomInts.randomIntBetween(
                random(), 0, Math.min(numberOfUpdatableParents, TEST_NIGHTLY ? 25 : 5));
        for (int j = 0; j < numberOfUpdates; j++) {
          int parentId;
          do {
            parentId = random().nextInt(numParentDocs);
          } while (filteredOrDeletedDocs.contains(parentId));

          String parentUid = Uid.createUid("parent", Integer.toString(parentId));
          indexWriter.deleteDocuments(new Term(UidFieldMapper.NAME, parentUid));

          Document document = new Document();
          document.add(new StringField(UidFieldMapper.NAME, parentUid, Field.Store.YES));
          document.add(new StringField(TypeFieldMapper.NAME, "parent", Field.Store.NO));
          indexWriter.addDocument(document);
        }

        indexReader.close();
        indexReader = DirectoryReader.open(indexWriter.w, true);
        searcher = new IndexSearcher(indexReader);
        engineSearcher =
            new Engine.Searcher(ChildrenConstantScoreQueryTests.class.getSimpleName(), searcher);
        ((TestSearchContext) SearchContext.current())
            .setSearcher(new ContextIndexSearcher(SearchContext.current(), engineSearcher));
      }

      String childValue = childValues[random().nextInt(numUniqueChildValues)];
      int shortCircuitParentDocSet = random().nextInt(numParentDocs);
      ScoreType scoreType = ScoreType.values()[random().nextInt(ScoreType.values().length)];
      // leave min/max set to 0 half the time
      int minChildren = random().nextInt(2) * scaledRandomIntBetween(0, 110);
      int maxChildren = random().nextInt(2) * scaledRandomIntBetween(minChildren, 110);

      QueryBuilder queryBuilder =
          hasChildQuery("child", constantScoreQuery(termQuery("field1", childValue)))
              .scoreType(scoreType.name().toLowerCase(Locale.ENGLISH))
              .minChildren(minChildren)
              .maxChildren(maxChildren)
              .setShortCircuitCutoff(shortCircuitParentDocSet);
      // Using a FQ, will invoke / test the Scorer#advance(..) and also let the Weight#scorer not
      // get live docs as acceptedDocs
      queryBuilder = filteredQuery(queryBuilder, notFilter(termFilter("filter", "me")));
      Query query = parseQuery(queryBuilder);
      BitSetCollector collector = new BitSetCollector(indexReader.maxDoc());
      int numHits = 1 + random().nextInt(25);
      TopScoreDocCollector actualTopDocsCollector = TopScoreDocCollector.create(numHits);
      searcher.search(query, MultiCollector.wrap(collector, actualTopDocsCollector));
      FixedBitSet actualResult = collector.getResult();

      FixedBitSet expectedResult = new FixedBitSet(indexReader.maxDoc());
      TopScoreDocCollector expectedTopDocsCollector = TopScoreDocCollector.create(numHits);
      if (childValueToParentIds.containsKey(childValue)) {
        LeafReader slowLeafReader = SlowCompositeReaderWrapper.wrap(indexReader);
        final FloatArrayList[] scores = new FloatArrayList[slowLeafReader.maxDoc()];
        Terms terms = slowLeafReader.terms(UidFieldMapper.NAME);
        if (terms != null) {
          NavigableMap<String, FloatArrayList> parentIdToChildScores = childValueToParentIds.lget();
          TermsEnum termsEnum = terms.iterator(null);
          DocsEnum docsEnum = null;
          for (Map.Entry<String, FloatArrayList> entry : parentIdToChildScores.entrySet()) {
            int count = entry.getValue().elementsCount;
            if (count >= minChildren && (maxChildren == 0 || count <= maxChildren)) {
              TermsEnum.SeekStatus seekStatus =
                  termsEnum.seekCeil(Uid.createUidAsBytes("parent", entry.getKey()));
              if (seekStatus == TermsEnum.SeekStatus.FOUND) {
                docsEnum =
                    termsEnum.docs(slowLeafReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
                expectedResult.set(docsEnum.nextDoc());
                scores[docsEnum.docID()] = new FloatArrayList(entry.getValue());
              } else if (seekStatus == TermsEnum.SeekStatus.END) {
                break;
              }
            }
          }
        }
        MockScorer mockScorer = new MockScorer(scoreType);
        final LeafCollector leafCollector =
            expectedTopDocsCollector.getLeafCollector(slowLeafReader.getContext());
        leafCollector.setScorer(mockScorer);
        for (int doc = expectedResult.nextSetBit(0);
            doc < slowLeafReader.maxDoc();
            doc =
                doc + 1 >= expectedResult.length()
                    ? DocIdSetIterator.NO_MORE_DOCS
                    : expectedResult.nextSetBit(doc + 1)) {
          mockScorer.scores = scores[doc];
          leafCollector.collect(doc);
        }
      }

      assertBitSet(actualResult, expectedResult, searcher);
      assertTopDocs(actualTopDocsCollector.topDocs(), expectedTopDocsCollector.topDocs());
    }

    indexWriter.close();
    indexReader.close();
    directory.close();
  }
Ejemplo n.º 8
0
  /**
   * Searches pages using a particular combination of flags.
   *
   * @param query The query to perform in Lucene query language
   * @param flags A set of flags
   * @return A Collection of SearchResult instances
   * @throws ProviderException if there is a problem with the backend
   */
  public Collection findPages(String query, int flags) throws ProviderException {
    IndexSearcher searcher = null;
    ArrayList<SearchResult> list = null;
    Highlighter highlighter = null;

    try {
      String[] queryfields = {
        LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME, LUCENE_AUTHOR, LUCENE_ATTACHMENTS
      };
      QueryParser qp =
          new MultiFieldQueryParser(Version.LUCENE_36, queryfields, getLuceneAnalyzer());

      // QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() );
      Query luceneQuery = qp.parse(query);

      if ((flags & FLAG_CONTEXTS) != 0) {
        highlighter =
            new Highlighter(
                new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"),
                new SimpleHTMLEncoder(),
                new QueryScorer(luceneQuery));
      }

      try {
        File dir = new File(m_luceneDirectory);
        Directory luceneDir = new SimpleFSDirectory(dir, null);
        IndexReader reader = IndexReader.open(luceneDir);
        searcher = new IndexSearcher(reader);
      } catch (Exception ex) {
        log.info("Lucene not yet ready; indexing not started", ex);
        return null;
      }

      ScoreDoc[] hits = searcher.search(luceneQuery, MAX_SEARCH_HITS).scoreDocs;

      list = new ArrayList<SearchResult>(hits.length);
      for (int curr = 0; curr < hits.length; curr++) {
        int docID = hits[curr].doc;
        Document doc = searcher.doc(docID);
        String pageName = doc.get(LUCENE_ID);
        WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION);

        if (page != null) {
          if (page instanceof Attachment) {
            // Currently attachments don't look nice on the search-results page
            // When the search-results are cleaned up this can be enabled again.
          }

          int score = (int) (hits[curr].score * 100);

          // Get highlighted search contexts
          String text = doc.get(LUCENE_PAGE_CONTENTS);

          String[] fragments = new String[0];
          if (text != null && highlighter != null) {
            TokenStream tokenStream =
                getLuceneAnalyzer().tokenStream(LUCENE_PAGE_CONTENTS, new StringReader(text));
            fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS);
          }

          SearchResult result = new SearchResultImpl(page, score, fragments);
          list.add(result);
        } else {
          log.error(
              "Lucene found a result page '"
                  + pageName
                  + "' that could not be loaded, removing from Lucene cache");
          pageRemoved(new WikiPage(m_engine, pageName));
        }
      }
    } catch (IOException e) {
      log.error("Failed during lucene search", e);
    } catch (ParseException e) {
      log.info("Broken query; cannot parse query ", e);

      throw new ProviderException(
          "You have entered a query Lucene cannot process: " + e.getMessage());
    } catch (InvalidTokenOffsetsException e) {
      log.error("Tokens are incompatible with provided text ", e);
    } finally {
      if (searcher != null) {
        try {
          searcher.close();
        } catch (IOException e) {
          log.error(e);
        }
      }
    }

    return list;
  }