@Override
  public void setScorer(Scorer scorer) {
    // System.out.println("C.setScorer scorer=" + scorer);
    // Since we invoke .score(), and the comparators likely
    // do as well, cache it so it's only "really" computed
    // once:
    this.scorer = new ScoreCachingWrappingScorer(scorer);
    for (int compIDX = 0; compIDX < comparators.length; compIDX++) {
      comparators[compIDX].setScorer(this.scorer);
    }
    Arrays.fill(joinScorers, null);

    Queue<Scorer> queue = new LinkedList<>();
    // System.out.println("\nqueue: add top scorer=" + scorer);
    queue.add(scorer);
    while ((scorer = queue.poll()) != null) {
      // System.out.println("  poll: " + scorer + "; " + scorer.getWeight().getQuery());
      if (scorer instanceof ToParentBlockJoinQuery.BlockJoinScorer) {
        enroll(
            (ToParentBlockJoinQuery) scorer.getWeight().getQuery(),
            (ToParentBlockJoinQuery.BlockJoinScorer) scorer);
      }

      for (ChildScorer sub : scorer.getChildren()) {
        // System.out.println("  add sub: " + sub.child + "; " + sub.child.getWeight().getQuery());
        queue.add(sub.child);
      }
    }
  }
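The method caches the score via ScoreCachingWrappingScorer so that score() is computed at most once per hit, then walks the entire scorer tree breadth-first to enroll every BlockJoinScorer. The traversal itself is an ordinary queue-based BFS; here is a minimal standalone sketch of the same pattern over a hypothetical Node type (Node and its children list are illustrative stand-ins, not Lucene API):

class Node {
  final String name;
  final java.util.List<Node> children;

  Node(String name, java.util.List<Node> children) {
    this.name = name;
    this.children = children;
  }
}

class ScorerTreeWalk {
  // visit every node reachable from root, parents before children
  static void visitBreadthFirst(Node root) {
    java.util.Queue<Node> queue = new java.util.ArrayDeque<>();
    queue.add(root);
    Node node;
    while ((node = queue.poll()) != null) {
      System.out.println(node.name); // the real code enrolls block-join scorers here
      queue.addAll(node.children);
    }
  }
}

ArrayDeque appears in the sketch only because it is the usual Queue choice; the snippet's LinkedList yields the same traversal order.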
Example #2
  // Skip encoding for updating the index
  void createIndex2(int nDocs, String... fields) throws IOException {
    Set<String> fieldSet = new HashSet<String>(Arrays.asList(fields));

    SolrQueryRequest req = lrf.makeRequest();
    SolrQueryResponse rsp = new SolrQueryResponse();
    UpdateRequestProcessorChain processorChain = req.getCore().getUpdateProcessingChain(null);
    UpdateRequestProcessor processor = processorChain.createProcessor(req, rsp);

    boolean foomany_s = fieldSet.contains("foomany_s");
    boolean foo1_s = fieldSet.contains("foo1_s");
    boolean foo2_s = fieldSet.contains("foo2_s");
    boolean foo4_s = fieldSet.contains("foo4_s");
    boolean foo8_s = fieldSet.contains("foo8_s");
    boolean t10_100_ws = fieldSet.contains("t10_100_ws");

    for (int i = 0; i < nDocs; i++) {
      SolrInputDocument doc = new SolrInputDocument();
      doc.addField("id", Float.toString(i));
      if (foomany_s) {
        doc.addField("foomany_s", t(r.nextInt(nDocs * 10)));
      }
      if (foo1_s) {
        doc.addField("foo1_s", t(0));
      }
      if (foo2_s) {
        doc.addField("foo2_s", r.nextInt(2));
      }
      if (foo4_s) {
        doc.addField("foo4_s", r.nextInt(4));
      }
      if (foo8_s) {
        doc.addField("foo8_s", r.nextInt(8));
      }
      if (t10_100_ws) {
        StringBuilder sb = new StringBuilder(9 * 100);
        for (int j = 0; j < 100; j++) {
          sb.append(' ');
          sb.append(t(r.nextInt(10)));
        }
        doc.addField("t10_100_ws", sb.toString());
      }

      AddUpdateCommand cmd = new AddUpdateCommand();
      cmd.solrDoc = doc;
      processor.processAdd(cmd);
    }
    processor.finish();
    req.close();

    assertU(commit());

    req = lrf.makeRequest();
    assertEquals(nDocs, req.getSearcher().maxDoc());
    req.close();
  }
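The helper drives randomly generated documents through the core's UpdateRequestProcessorChain and then checks maxDoc. For comparison, a minimal sketch of indexing the same kind of low-cardinality random documents from outside the test harness, assuming SolrJ's HttpSolrClient (the URL and field name are illustrative):

import java.io.IOException;
import java.util.Random;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.SolrInputDocument;

class RandomIndexer {
  private static final Random r = new Random();

  static void index(String url, int nDocs) throws SolrServerException, IOException {
    try (SolrClient client = new HttpSolrClient.Builder(url).build()) {
      for (int i = 0; i < nDocs; i++) {
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", Integer.toString(i));
        doc.addField("foo8_s", Integer.toString(r.nextInt(8))); // one of eight distinct terms
        client.add(doc);
      }
      client.commit(); // make the documents visible, like assertU(commit()) above
    }
  }
}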
 @Override
 public DocScoreList mostSimilar(String phrase, int maxResults, TIntSet validIds)
     throws IOException {
   final TIntDoubleHashMap scores = getConceptVector(phrase, validIds);
   Integer[] luceneIds = ArrayUtils.toObject(scores.keys());
   Arrays.sort(
       luceneIds,
       new Comparator<Integer>() {
         @Override
         public int compare(Integer id1, Integer id2) {
           // highest score first; Double.compare replaces the deprecated new Double(...) idiom
           return Double.compare(scores.get(id2), scores.get(id1));
         }
       });
   DocScoreList result = new DocScoreList(Math.min(luceneIds.length, maxResults));
   for (int i = 0; i < result.numDocs(); i++) {
     result.set(i, esaHelper.luceneIdToWpId(luceneIds[i]), scores.get(luceneIds[i]));
   }
   return normalize(result);
 }
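mostSimilar sorts every candidate id and then keeps only the first maxResults entries. When the candidate set is much larger than maxResults, a bounded min-heap does the same selection in O(n log k) rather than O(n log n); a sketch with standard collections, where a plain Map stands in for the trove TIntDoubleHashMap:

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;

class TopKSelector {
  // return the k ids with the highest scores, best first
  static List<Integer> topK(Map<Integer, Double> scores, int k) {
    // min-heap of size <= k: the root is the weakest of the current top k
    PriorityQueue<Integer> heap =
        new PriorityQueue<>((a, b) -> Double.compare(scores.get(a), scores.get(b)));
    for (Integer id : scores.keySet()) {
      heap.add(id);
      if (heap.size() > k) {
        heap.poll(); // evict the weakest candidate
      }
    }
    List<Integer> result = new ArrayList<>(heap);
    result.sort((a, b) -> Double.compare(scores.get(b), scores.get(a))); // best first
    return result;
  }
}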
    public void seek(TermEnum terms) throws IOException {
      original.seek(terms);

      docFreq = terms.docFreq();
      pointer = -1;

      if (docFreq > postingMaps.length) { // grow postingMaps
        PostingMap[] newMap = new PostingMap[docFreq];
        System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
        for (int i = postingMaps.length; i < docFreq; i++) {
          newMap[i] = new PostingMap();
        }
        postingMaps = newMap;
      }

      out.reset(); // rewind the buffered output so this term's postings start at offset 0

      int i = 0;
      while (original.next()) {
        PostingMap map = postingMaps[i++];
        map.newDoc = oldToNew[original.doc()]; // remap the newDoc id
        map.offset = out.getFilePointer(); // save pointer to buffer

        final int tf = original.freq(); // buffer tf & positions
        out.writeVInt(tf);
        int prevPosition = 0;
        for (int j = tf; j > 0; j--) { // delta encode positions
          int p = original.nextPosition();
          out.writeVInt(p - prevPosition);
          prevPosition = p;
        }
      }
      out.flush();
      docFreq = i; // allow for deletions

      Arrays.sort(postingMaps, 0, docFreq); // resort by mapped doc ids
      // HeapSorter.sort(postingMaps,docFreq); // TODO MC - due to the lack of space

      // NOTE: this might be substantially faster if RAMInputStream were public
      // and supported a reset() operation.
      in = tempDir.openInput(TEMP_FILE);
    }
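Positions are written as deltas from the previous position, so dense position lists become runs of small integers that a VInt stores in a single byte. A standalone sketch of the same tf-then-delta layout, with a writeVInt mirroring Lucene's 7-bits-per-byte format (ByteArrayOutputStream stands in for the RAM-backed file):

import java.io.ByteArrayOutputStream;

class PostingEncoder {
  // low 7 bits per byte, high bit set while more bytes follow (Lucene's VInt layout)
  static void writeVInt(ByteArrayOutputStream out, int value) {
    while ((value & ~0x7F) != 0) {
      out.write((value & 0x7F) | 0x80);
      value >>>= 7;
    }
    out.write(value);
  }

  // encode one posting: tf first, then each position as a delta from the previous one
  static byte[] encodePositions(int[] positions) { // positions must be ascending
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    writeVInt(out, positions.length);
    int prev = 0;
    for (int p : positions) {
      writeVInt(out, p - prev);
      prev = p;
    }
    return out.toByteArray();
  }
}

For positions {5, 9, 11} this emits the VInts 3, 5, 4, 2, each fitting in a single byte.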
  // private static int[] oldToNew(IndexReader reader, Searcher searcher) throws IOException {
  private static DocScore[] newToOld(IndexReader reader, Searcher searcher) throws IOException {
    int readerMax = reader.maxDoc();
    DocScore[] newToOld = new DocScore[readerMax];

    // use site, an indexed, un-tokenized field to get boost
    // byte[] boosts = reader.norms("site"); TODO MC
    /* TODO MC */
    Document docMeta;
    Pattern includes = Pattern.compile("\\|");
    String value = NutchConfiguration.create().get(INCLUDE_EXTENSIONS_KEY, "");
    String[] includeExtensions = includes.split(value);
    Set<String> validExtensions = new HashSet<>();
    for (int i = 0; i < includeExtensions.length; i++) {
      validExtensions.add(includeExtensions[i]);
      System.out.println("extension boosted " + includeExtensions[i]);
    }
    /* TODO MC */

    for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
      float score;
      if (reader.isDeleted(oldDoc)) {
        // score = 0.0f;
        score = -1f; // TODO MC
      } else {
        // score = Similarity.decodeNorm(boosts[oldDoc]); TODO MC
        /* TODO MC */
        docMeta = searcher.doc(oldDoc);
        // searched extensions will have higher scores
        if (!validExtensions.contains(docMeta.get("subType"))) {
          score = -0.5f;
        } else {
          score = Integer.parseInt(docMeta.get("inlinks"));
          /*
          if (score==0) {
          	score=0.001f; // TODO MC - to not erase
          }
          */
        }
        /* TODO MC */
        // System.out.println("Score for old document "+oldDoc+" is "+score+" and type
        // "+docMeta.get("subType")); // TODO MC debug remove
      }
      DocScore docScore = new DocScore();
      docScore.doc = oldDoc;
      docScore.score = score;
      newToOld[oldDoc] = docScore;
    }

    System.out.println("Sorting " + newToOld.length + " documents.");
    Arrays.sort(newToOld);
    // HeapSorter.sort(newToOld); // TODO MC - due to the lack of space

    /* TODO MC
    int[] oldToNew = new int[readerMax];
    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
      DocScore docScore = newToOld[newDoc];
      //oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1; // TODO MC
      oldToNew[docScore.oldDoc] = newDoc; // TODO MC
    }
    */


    // return oldToNew; TODO MC
    return newToOld; // TODO MC
  }
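Stripped of the TODOs, the method gives every old document a score (-1 for deleted docs, -0.5 for non-boosted extensions, otherwise the inlink count), sorts by score, and reads the new ordering off the sorted array. A minimal sketch of that remapping, with a local DocScore stand-in that sorts descending by score, plus the inversion to oldToNew that the commented-out block describes:

import java.util.Arrays;

class IndexRemap {
  static class DocScore implements Comparable<DocScore> {
    int doc; // old document id
    float score;

    @Override
    public int compareTo(DocScore other) {
      return Float.compare(other.score, this.score); // higher scores sort first
    }
  }

  // newToOld[newDoc].doc is the old id that lands at position newDoc
  static DocScore[] newToOld(float[] scores) {
    DocScore[] order = new DocScore[scores.length];
    for (int oldDoc = 0; oldDoc < scores.length; oldDoc++) {
      DocScore ds = new DocScore();
      ds.doc = oldDoc;
      ds.score = scores[oldDoc];
      order[oldDoc] = ds;
    }
    Arrays.sort(order);
    return order;
  }

  // invert the permutation, as the commented-out block in the original sketches
  static int[] oldToNew(DocScore[] newToOld) {
    int[] result = new int[newToOld.length];
    for (int newDoc = 0; newDoc < newToOld.length; newDoc++) {
      result[newToOld[newDoc].doc] = newDoc;
    }
    return result;
  }
}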
  /**
   * @param scrollSort Whether to ignore the from offset and sort all hits in each shard result;
   *     only used for scroll search
   * @param resultsArr Shard result holder
   */
  public ScoreDoc[] sortDocs(
      boolean scrollSort, AtomicArray<? extends QuerySearchResultProvider> resultsArr)
      throws IOException {
    List<? extends AtomicArray.Entry<? extends QuerySearchResultProvider>> results =
        resultsArr.asList();
    if (results.isEmpty()) {
      return EMPTY_DOCS;
    }

    if (optimizeSingleShard) {
      boolean canOptimize = false;
      QuerySearchResult result = null;
      int shardIndex = -1;
      if (results.size() == 1) {
        canOptimize = true;
        result = results.get(0).value.queryResult();
        shardIndex = results.get(0).index;
      } else {
        // let's see if we only got hits from a single shard; if so, we can optimize...
        for (AtomicArray.Entry<? extends QuerySearchResultProvider> entry : results) {
          if (entry.value.queryResult().topDocs().scoreDocs.length > 0) {
            if (result != null) { // we already have one, can't really optimize
              canOptimize = false;
              break;
            }
            canOptimize = true;
            result = entry.value.queryResult();
            shardIndex = entry.index;
          }
        }
      }
      if (canOptimize) {
        int offset = result.from();
        if (scrollSort) {
          offset = 0;
        }
        ScoreDoc[] scoreDocs = result.topDocs().scoreDocs;
        if (scoreDocs.length == 0 || scoreDocs.length < offset) {
          return EMPTY_DOCS;
        }

        int resultDocsSize = result.size();
        if ((scoreDocs.length - offset) < resultDocsSize) {
          resultDocsSize = scoreDocs.length - offset;
        }
        ScoreDoc[] docs = new ScoreDoc[resultDocsSize];
        for (int i = 0; i < resultDocsSize; i++) {
          ScoreDoc scoreDoc = scoreDocs[offset + i];
          scoreDoc.shardIndex = shardIndex;
          docs[i] = scoreDoc;
        }
        return docs;
      }
    }

    @SuppressWarnings("unchecked")
    AtomicArray.Entry<? extends QuerySearchResultProvider>[] sortedResults =
        results.toArray(new AtomicArray.Entry[results.size()]);
    Arrays.sort(sortedResults, QUERY_RESULT_ORDERING);
    QuerySearchResultProvider firstResult = sortedResults[0].value;

    final Sort sort;
    if (firstResult.queryResult().topDocs() instanceof TopFieldDocs) {
      TopFieldDocs firstTopDocs = (TopFieldDocs) firstResult.queryResult().topDocs();
      sort = new Sort(firstTopDocs.fields);
    } else {
      sort = null;
    }

    int topN = firstResult.queryResult().size();
    // Need to use the length of the resultsArr array, since the slots will be based on the position
    // in the resultsArr array
    TopDocs[] shardTopDocs = new TopDocs[resultsArr.length()];
    if (firstResult.includeFetch()) {
      // if we did both query and fetch on the same go, we have fetched all the docs from each
      // shard already, use them...
      // this is also important since we shortcut and fetch only docs from "from" and up to "size"
      topN *= sortedResults.length;
    }
    for (AtomicArray.Entry<? extends QuerySearchResultProvider> sortedResult : sortedResults) {
      TopDocs topDocs = sortedResult.value.queryResult().topDocs();
      // the 'index' field is the position in the resultsArr atomic array
      shardTopDocs[sortedResult.index] = topDocs;
    }
    int from = firstResult.queryResult().from();
    if (scrollSort) {
      from = 0;
    }
    // TopDocs#merge can't deal with null shard TopDocs
    for (int i = 0; i < shardTopDocs.length; i++) {
      if (shardTopDocs[i] == null) {
        shardTopDocs[i] = Lucene.EMPTY_TOP_DOCS;
      }
    }
    TopDocs mergedTopDocs = TopDocs.merge(sort, from, topN, shardTopDocs);
    return mergedTopDocs.scoreDocs;
  }
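Once every slot in shardTopDocs is filled, TopDocs.merge performs a k-way merge of the already-sorted per-shard results and applies the from offset. A toy standalone sketch of that merge over plain descending-sorted score arrays (a simplified stand-in: the real TopDocs.merge also handles field sorts and breaks ties by shard index):

import java.util.Arrays;
import java.util.PriorityQueue;

class ShardScoreMerge {
  // k-way merge of per-shard score arrays, each already sorted descending;
  // skips the first 'from' hits of the merged stream, then returns up to 'size' hits
  static float[] merge(float[][] shards, int from, int size) {
    // each entry is {shardIndex, offsetWithinShard}; the head holds the highest score
    PriorityQueue<int[]> heads =
        new PriorityQueue<>((a, b) -> Float.compare(shards[b[0]][b[1]], shards[a[0]][a[1]]));
    for (int s = 0; s < shards.length; s++) {
      if (shards[s].length > 0) {
        heads.add(new int[] {s, 0});
      }
    }
    float[] merged = new float[size];
    int taken = 0;
    int seen = 0;
    while (!heads.isEmpty() && taken < size) {
      int[] head = heads.poll();
      if (seen++ >= from) {
        merged[taken++] = shards[head[0]][head[1]];
      }
      if (head[1] + 1 < shards[head[0]].length) {
        heads.add(new int[] {head[0], head[1] + 1}); // advance that shard's cursor
      }
    }
    return Arrays.copyOf(merged, taken);
  }
}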