@Override
public DocScoreList mostSimilar(String phrase, int maxResults, TIntSet validIds) throws IOException {
  final TIntDoubleHashMap scores = getConceptVector(phrase, validIds);
  Integer[] luceneIds = ArrayUtils.toObject(scores.keys());
  // sort candidate Lucene doc ids by descending concept-vector score
  Arrays.sort(
      luceneIds,
      new Comparator<Integer>() {
        @Override
        public int compare(Integer id1, Integer id2) {
          return Double.compare(scores.get(id2), scores.get(id1));
        }
      });
  // keep the top maxResults hits, translating Lucene doc ids to Wikipedia page ids
  DocScoreList result = new DocScoreList(Math.min(luceneIds.length, maxResults));
  for (int i = 0; i < result.numDocs(); i++) {
    result.set(i, esaHelper.luceneIdToWpId(luceneIds[i]), scores.get(luceneIds[i]));
  }
  return normalize(result);
}
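// A minimal, self-contained sketch of the ranking step above: sort candidate doc ids by
// descending score from a Trove TIntDoubleHashMap. It assumes Trove 3 and Commons Lang 3 on
// the classpath; the class and method names below are hypothetical and are not part of the
// original code. Double.compare avoids the boxing done by new Double(...).compareTo(...).
import java.util.Arrays;
import java.util.Comparator;

import gnu.trove.map.hash.TIntDoubleHashMap;
import org.apache.commons.lang3.ArrayUtils;

public class ScoreRankingSketch {

  // Return the map's keys ordered by descending score.
  public static Integer[] rankByScoreDesc(final TIntDoubleHashMap scores) {
    Integer[] ids = ArrayUtils.toObject(scores.keys());
    Arrays.sort(
        ids,
        new Comparator<Integer>() {
          @Override
          public int compare(Integer a, Integer b) {
            return Double.compare(scores.get(b), scores.get(a)); // higher score first
          }
        });
    return ids;
  }

  public static void main(String[] args) {
    TIntDoubleHashMap scores = new TIntDoubleHashMap();
    scores.put(10, 0.3);
    scores.put(42, 0.9);
    scores.put(7, 0.1);
    System.out.println(Arrays.toString(rankByScoreDesc(scores))); // [42, 10, 7]
  }
}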
public void seek(TermEnum terms) throws IOException {
  original.seek(terms);
  docFreq = terms.docFreq();
  pointer = -1;
  if (docFreq > postingMaps.length) { // grow postingsMap
    PostingMap[] newMap = new PostingMap[docFreq];
    System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
    for (int i = postingMaps.length; i < docFreq; i++) {
      newMap[i] = new PostingMap();
    }
    postingMaps = newMap;
  }

  out.reset();

  int i = 0;
  while (original.next()) {
    PostingMap map = postingMaps[i++];
    map.newDoc = oldToNew[original.doc()]; // remap the newDoc id
    map.offset = out.getFilePointer(); // save pointer to buffer

    final int tf = original.freq(); // buffer tf & positions
    out.writeVInt(tf);
    int prevPosition = 0;
    for (int j = tf; j > 0; j--) { // delta encode positions
      int p = original.nextPosition();
      out.writeVInt(p - prevPosition);
      prevPosition = p;
    }
  }
  out.flush();
  docFreq = i; // allow for deletions
  Arrays.sort(postingMaps, 0, docFreq); // resort by mapped doc ids
  // HeapSorter.sort(postingMaps, docFreq); // TODO MC - due to the lack of space

  // NOTE: this might be substantially faster if RAMInputStream were public
  // and supported a reset() operation.
  in = tempDir.openInput(TEMP_FILE);
}
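// Illustrative, self-contained sketch (not from the original source) of the buffering step
// above: per-document positions are delta-encoded and written as variable-length ints,
// 7 bits per byte with the high bit meaning "more bytes follow", the same scheme Lucene's
// writeVInt uses. Class and method names here are hypothetical.
import java.io.ByteArrayOutputStream;

public class DeltaVIntSketch {

  // Write one value in Lucene-style VInt form (7 bits per byte, high bit = continuation).
  static void writeVInt(ByteArrayOutputStream out, int value) {
    while ((value & ~0x7F) != 0) {
      out.write((value & 0x7F) | 0x80);
      value >>>= 7;
    }
    out.write(value);
  }

  // Buffer a term's postings for one document: tf first, then delta-encoded positions.
  static byte[] bufferPositions(int[] positions) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    writeVInt(out, positions.length); // tf
    int prev = 0;
    for (int p : positions) { // delta encode positions
      writeVInt(out, p - prev);
      prev = p;
    }
    return out.toByteArray();
  }

  public static void main(String[] args) {
    byte[] buf = bufferPositions(new int[] {3, 10, 300});
    System.out.println(buf.length + " bytes buffered"); // small gaps cost one byte each
  }
}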
// private static int[] oldToNew(IndexReader reader, Searcher searcher) throws IOException {
private static DocScore[] newToOld(IndexReader reader, Searcher searcher) throws IOException {
  int readerMax = reader.maxDoc();
  DocScore[] newToOld = new DocScore[readerMax];

  // use site, an indexed, un-tokenized field to get boost
  // byte[] boosts = reader.norms("site"); TODO MC

  /* TODO MC */
  Document docMeta;
  Pattern includes = Pattern.compile("\\|");
  String value = NutchConfiguration.create().get(INCLUDE_EXTENSIONS_KEY, "");
  String[] includeExtensions = includes.split(value);
  Hashtable<String, Boolean> validExtensions = new Hashtable<String, Boolean>();
  for (int i = 0; i < includeExtensions.length; i++) {
    validExtensions.put(includeExtensions[i], true);
    System.out.println("extension boosted " + includeExtensions[i]);
  }
  /* TODO MC */

  for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
    float score;
    if (reader.isDeleted(oldDoc)) {
      // score = 0.0f;
      score = -1f; // TODO MC
    } else {
      // score = Similarity.decodeNorm(boosts[oldDoc]); TODO MC
      /* TODO MC */
      docMeta = searcher.doc(oldDoc);
      if (validExtensions.get(docMeta.get("subType")) == null) {
        // searched extensions will have higher scores
        score = -0.5f;
      } else {
        score = Integer.parseInt(docMeta.get("inlinks"));
        /*
        if (score == 0) {
          score = 0.001f; // TODO MC - to not erase
        }
        */
      }
      /* TODO MC */
      // System.out.println("Score for old document " + oldDoc + " is " + score
      //     + " and type " + docMeta.get("subType")); // TODO MC debug remove
    }
    DocScore docScore = new DocScore();
    docScore.doc = oldDoc;
    docScore.score = score;
    newToOld[oldDoc] = docScore;
  }

  System.out.println("Sorting " + newToOld.length + " documents.");
  Arrays.sort(newToOld);
  // HeapSorter.sort(newToOld); // TODO MC - due to the lack of space

  /* TODO MC
  int[] oldToNew = new int[readerMax];
  for (int newDoc = 0; newDoc < readerMax; newDoc++) {
    DocScore docScore = newToOld[newDoc];
    // oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1; // TODO MC
    oldToNew[docScore.oldDoc] = newDoc; // TODO MC
  }
  */

  /* TODO MC
  for (int newDoc = 0; newDoc < readerMax; newDoc++) {
    DocScore docScore = newToOld[newDoc];
    System.out.println("Score for new document " + newDoc + " is " + docScore.score); // TODO MC debug remove
  }
  TODO MC */

  // return oldToNew; TODO MC
  return newToOld; // TODO MC
}
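// Self-contained sketch (not from the original source) of the reordering idea above: every
// old doc id gets a score (inlink count for boosted extensions, a negative value for deleted
// or non-boosted documents), and sorting the (doc, score) pairs by descending score yields
// the new document order. The class below is hypothetical; the real DocScore is defined in
// the surrounding sorter class.
import java.util.Arrays;

public class DocScoreSketch {

  static class DocScore implements Comparable<DocScore> {
    int doc;
    float score;

    DocScore(int doc, float score) {
      this.doc = doc;
      this.score = score;
    }

    @Override
    public int compareTo(DocScore other) {
      return Float.compare(other.score, this.score); // higher score first
    }

    @Override
    public String toString() {
      return doc + ":" + score;
    }
  }

  public static void main(String[] args) {
    DocScore[] newToOld = {
      new DocScore(0, -1f),   // deleted document
      new DocScore(1, 12f),   // 12 inlinks, boosted extension
      new DocScore(2, -0.5f), // extension not in the include list
    };
    Arrays.sort(newToOld);
    // newToOld[newDoc] now holds the old doc that ends up at each new position
    System.out.println(Arrays.toString(newToOld)); // [1:12.0, 2:-0.5, 0:-1.0]
  }
}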
/**
 * @param scrollSort Whether to ignore the from and sort all hits in each shard result. Only used
 *     for scroll search
 * @param resultsArr Shard result holder
 */
public ScoreDoc[] sortDocs(
    boolean scrollSort, AtomicArray<? extends QuerySearchResultProvider> resultsArr)
    throws IOException {
  List<? extends AtomicArray.Entry<? extends QuerySearchResultProvider>> results =
      resultsArr.asList();
  if (results.isEmpty()) {
    return EMPTY_DOCS;
  }

  if (optimizeSingleShard) {
    boolean canOptimize = false;
    QuerySearchResult result = null;
    int shardIndex = -1;
    if (results.size() == 1) {
      canOptimize = true;
      result = results.get(0).value.queryResult();
      shardIndex = results.get(0).index;
    } else {
      // lets see if we only got hits from a single shard, if so, we can optimize...
      for (AtomicArray.Entry<? extends QuerySearchResultProvider> entry : results) {
        if (entry.value.queryResult().topDocs().scoreDocs.length > 0) {
          if (result != null) { // we already have one, can't really optimize
            canOptimize = false;
            break;
          }
          canOptimize = true;
          result = entry.value.queryResult();
          shardIndex = entry.index;
        }
      }
    }
    if (canOptimize) {
      int offset = result.from();
      if (scrollSort) {
        offset = 0;
      }
      ScoreDoc[] scoreDocs = result.topDocs().scoreDocs;
      if (scoreDocs.length == 0 || scoreDocs.length < offset) {
        return EMPTY_DOCS;
      }

      int resultDocsSize = result.size();
      if ((scoreDocs.length - offset) < resultDocsSize) {
        resultDocsSize = scoreDocs.length - offset;
      }
      ScoreDoc[] docs = new ScoreDoc[resultDocsSize];
      for (int i = 0; i < resultDocsSize; i++) {
        ScoreDoc scoreDoc = scoreDocs[offset + i];
        scoreDoc.shardIndex = shardIndex;
        docs[i] = scoreDoc;
      }
      return docs;
    }
  }

  @SuppressWarnings("unchecked")
  AtomicArray.Entry<? extends QuerySearchResultProvider>[] sortedResults =
      results.toArray(new AtomicArray.Entry[results.size()]);
  Arrays.sort(sortedResults, QUERY_RESULT_ORDERING);
  QuerySearchResultProvider firstResult = sortedResults[0].value;

  final Sort sort;
  if (firstResult.queryResult().topDocs() instanceof TopFieldDocs) {
    TopFieldDocs firstTopDocs = (TopFieldDocs) firstResult.queryResult().topDocs();
    sort = new Sort(firstTopDocs.fields);
  } else {
    sort = null;
  }

  int topN = firstResult.queryResult().size();
  // Need to use the length of the resultsArr array, since the slots will be based on the
  // position in the resultsArr array
  TopDocs[] shardTopDocs = new TopDocs[resultsArr.length()];
  if (firstResult.includeFetch()) {
    // if we did both query and fetch on the same go, we have fetched all the docs from each
    // shards already, use them...
    // this is also important since we shortcut and fetch only docs from "from" and up to "size"
    topN *= sortedResults.length;
  }
  for (AtomicArray.Entry<? extends QuerySearchResultProvider> sortedResult : sortedResults) {
    TopDocs topDocs = sortedResult.value.queryResult().topDocs();
    // the 'index' field is the position in the resultsArr atomic array
    shardTopDocs[sortedResult.index] = topDocs;
  }
  int from = firstResult.queryResult().from();
  if (scrollSort) {
    from = 0;
  }
  // TopDocs#merge can't deal with null shard TopDocs
  for (int i = 0; i < shardTopDocs.length; i++) {
    if (shardTopDocs[i] == null) {
      shardTopDocs[i] = Lucene.EMPTY_TOP_DOCS;
    }
  }
  TopDocs mergedTopDocs = TopDocs.merge(sort, from, topN, shardTopDocs);
  return mergedTopDocs.scoreDocs;
}
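// Sketch (not from the original source) of the single-shard fast path above: when all hits
// come from one shard there is nothing to merge, so the result is just the
// [offset, offset + size) window of that shard's ScoreDocs, with the shard index stamped on
// each hit so later phases know where it came from. Only Lucene's ScoreDoc is used; the
// class and method names below are hypothetical.
import org.apache.lucene.search.ScoreDoc;

public class SingleShardWindowSketch {

  static ScoreDoc[] window(ScoreDoc[] scoreDocs, int offset, int size, int shardIndex) {
    if (scoreDocs.length == 0 || scoreDocs.length < offset) {
      return new ScoreDoc[0];
    }
    int resultSize = Math.min(size, scoreDocs.length - offset);
    ScoreDoc[] docs = new ScoreDoc[resultSize];
    for (int i = 0; i < resultSize; i++) {
      ScoreDoc doc = scoreDocs[offset + i];
      doc.shardIndex = shardIndex; // remember which shard this hit came from
      docs[i] = doc;
    }
    return docs;
  }

  public static void main(String[] args) {
    ScoreDoc[] hits = {new ScoreDoc(3, 2.0f), new ScoreDoc(7, 1.5f), new ScoreDoc(9, 1.1f)};
    ScoreDoc[] page = window(hits, 1, 10, 0);
    System.out.println(page.length + " hits, first doc=" + page[0].doc); // 2 hits, first doc=7
  }
}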