private void doHighlightingByHighlighter(
    Query query,
    SolrQueryRequest req,
    NamedList docSummaries,
    int docId,
    Document doc,
    String fieldName)
    throws IOException {
  SolrParams params = req.getParams();
  String[] docTexts = doc.getValues(fieldName);
  // according to the Document javadoc, doc.getValues() never returns null;
  // check for empty instead of null
  if (docTexts.length == 0) return;

  SolrIndexSearcher searcher = req.getSearcher();
  IndexSchema schema = searcher.getSchema();
  TokenStream tstream = null;
  int numFragments = getMaxSnippets(fieldName, params);
  boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

  String[] summaries = null;
  List<TextFragment> frags = new ArrayList<TextFragment>();

  TermOffsetsTokenStream tots = null; // non-null iff we're using the TermOffsets optimization
  try {
    TokenStream tvStream =
        TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName);
    if (tvStream != null) {
      tots = new TermOffsetsTokenStream(tvStream);
    }
  } catch (IllegalArgumentException e) {
    // No problem, but we can't use the TermOffsets optimization.
  }

  for (int j = 0; j < docTexts.length; j++) {
    if (tots != null) {
      // if we're using the TermOffsets optimization, get the next field value's
      // TokenStream (i.e. field j's TokenStream) from tots:
      tstream = tots.getMultiValuedTokenStream(docTexts[j].length());
    } else {
      // fall back to the analyzer
      tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
    }

    Highlighter highlighter;
    if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
      // TODO: this is not always necessary - eventually we would like to avoid this wrap
      // when it is not needed.
      tstream = new CachingTokenFilter(tstream);

      // get highlighter
      highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

      // after highlighter initialization, reset tstream since construction of the
      // highlighter already used it
      tstream.reset();
    } else {
      // use "the old way"
      highlighter = getHighlighter(query, fieldName, req);
    }

    int maxCharsToAnalyze =
        params.getFieldInt(
            fieldName, HighlightParams.MAX_CHARS, Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
    if (maxCharsToAnalyze < 0) {
      highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
    } else {
      highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
    }

    try {
      TextFragment[] bestTextFragments =
          highlighter.getBestTextFragments(
              tstream, docTexts[j], mergeContiguousFragments, numFragments);
      for (int k = 0; k < bestTextFragments.length; k++) {
        if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
          frags.add(bestTextFragments[k]);
        }
      }
    } catch (InvalidTokenOffsetsException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    }
  }

  // sort such that the fragments with the highest score come first
  Collections.sort(
      frags,
      new Comparator<TextFragment>() {
        public int compare(TextFragment arg0, TextFragment arg1) {
          return Math.round(arg1.getScore() - arg0.getScore());
        }
      });

  // convert fragments back into text
  // TODO: we can include score and position information in the output as snippet attributes
  if (frags.size() > 0) {
    ArrayList<String> fragTexts = new ArrayList<String>();
    for (TextFragment fragment : frags) {
      if ((fragment != null) && (fragment.getScore() > 0)) {
        fragTexts.add(fragment.toString());
      }
      if (fragTexts.size() >= numFragments) break;
    }
    summaries = fragTexts.toArray(new String[0]);
    if (summaries.length > 0) docSummaries.add(fieldName, summaries);
  }

  // no summaries were made, so copy text from the alternate field
  if (summaries == null || summaries.length == 0) {
    alternateField(docSummaries, params, doc, fieldName);
  }
}
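// A minimal, self-contained sketch (not part of the original class) of the core Lucene
// Highlighter calls the method above wraps: analyze the stored text, score fragments
// against the query with QueryScorer, and keep the best-scoring ones. The field name
// "content" and the helper name are hypothetical; imports mirror those of the class above.
private static String[] bestFragmentsSketch(Query query, Analyzer analyzer, String sampleText)
    throws IOException, InvalidTokenOffsetsException {
  Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
  highlighter.setMaxDocCharsToAnalyze(Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
  TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(sampleText));
  // false = don't merge contiguous fragments; 3 = maximum number of fragments
  TextFragment[] fragments = highlighter.getBestTextFragments(tokenStream, sampleText, false, 3);
  List<String> out = new ArrayList<String>();
  for (TextFragment fragment : fragments) {
    if (fragment != null && fragment.getScore() > 0) {
      out.add(fragment.toString());
    }
  }
  return out.toArray(new String[0]);
}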
private void doHighlightingByHighlighter(
    Query query,
    SolrQueryRequest req,
    NamedList docSummaries,
    int docId,
    Document doc,
    String fieldName)
    throws IOException {
  final SolrIndexSearcher searcher = req.getSearcher();
  final IndexSchema schema = searcher.getSchema();

  // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) -
  // so we disable them until fixed (see LUCENE-3080)!
  // BEGIN: Hack
  final SchemaField schemaField = schema.getFieldOrNull(fieldName);
  if (schemaField != null
      && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField)
          || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField))) return;
  // END: Hack

  SolrParams params = req.getParams();
  IndexableField[] docFields = doc.getFields(fieldName);
  List<String> listFields = new ArrayList<String>();
  for (IndexableField field : docFields) {
    listFields.add(field.stringValue());
  }

  // preserve order of values in a multiValued list
  boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);

  String[] docTexts = listFields.toArray(new String[listFields.size()]);
  // according to the Document javadoc, doc.getValues() never returns null;
  // check for empty instead of null
  if (docTexts.length == 0) return;

  TokenStream tstream = null;
  int numFragments = getMaxSnippets(fieldName, params);
  boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

  String[] summaries = null;
  List<TextFragment> frags = new ArrayList<TextFragment>();

  TermOffsetsTokenStream tots = null; // non-null iff we're using the TermOffsets optimization
  TokenStream tvStream =
      TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
  if (tvStream != null) {
    tots = new TermOffsetsTokenStream(tvStream);
  }

  for (int j = 0; j < docTexts.length; j++) {
    if (tots != null) {
      // if we're using the TermOffsets optimization, get the next field value's
      // TokenStream (i.e. field j's TokenStream) from tots:
      tstream = tots.getMultiValuedTokenStream(docTexts[j].length());
    } else {
      // fall back to the analyzer
      tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
    }

    int maxCharsToAnalyze =
        params.getFieldInt(
            fieldName, HighlightParams.MAX_CHARS, Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

    Highlighter highlighter;
    if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
      if (maxCharsToAnalyze < 0) {
        tstream = new CachingTokenFilter(tstream);
      } else {
        tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
      }

      // get highlighter
      highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

      // after highlighter initialization, reset tstream since construction of the
      // highlighter already used it
      tstream.reset();
    } else {
      // use "the old way"
      highlighter = getHighlighter(query, fieldName, req);
    }

    if (maxCharsToAnalyze < 0) {
      highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
    } else {
      highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
    }

    try {
      TextFragment[] bestTextFragments =
          highlighter.getBestTextFragments(
              tstream, docTexts[j], mergeContiguousFragments, numFragments);
      for (int k = 0; k < bestTextFragments.length; k++) {
        if (preserveMulti) {
          if (bestTextFragments[k] != null) {
            frags.add(bestTextFragments[k]);
          }
        } else {
          if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
            frags.add(bestTextFragments[k]);
          }
        }
      }
    } catch (InvalidTokenOffsetsException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    }
  }

  // sort such that the fragments with the highest score come first
  if (!preserveMulti) {
    Collections.sort(
        frags,
        new Comparator<TextFragment>() {
          @Override
          public int compare(TextFragment arg0, TextFragment arg1) {
            return Math.round(arg1.getScore() - arg0.getScore());
          }
        });
  }

  // convert fragments back into text
  // TODO: we can include score and position information in the output as snippet attributes
  if (frags.size() > 0) {
    ArrayList<String> fragTexts = new ArrayList<String>();
    for (TextFragment fragment : frags) {
      if (preserveMulti) {
        if (fragment != null) {
          fragTexts.add(fragment.toString());
        }
      } else {
        if ((fragment != null) && (fragment.getScore() > 0)) {
          fragTexts.add(fragment.toString());
        }
      }
      if (fragTexts.size() >= numFragments && !preserveMulti) break;
    }
    summaries = fragTexts.toArray(new String[0]);
    if (summaries.length > 0) docSummaries.add(fieldName, summaries);
  }

  // no summaries were made, so copy text from the alternate field
  if (summaries == null || summaries.length == 0) {
    alternateField(docSummaries, params, doc, fieldName);
  }
}
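// Hedged client-side sketch (SolrJ; an assumed counterpart, not part of this class): these
// are the request parameters the method above reads. The query text and the field name
// "content" are placeholders.
private static SolrQuery highlightRequestSketch() {
  SolrQuery q = new SolrQuery("quick brown fox");
  q.setHighlight(true); // hl=true
  q.addHighlightField("content"); // hl.fl=content
  q.set(HighlightParams.USE_PHRASE_HIGHLIGHTER, true); // already the default above
  q.set(HighlightParams.MAX_CHARS, 51200); // hl.maxAnalyzedChars; negative means whole value
  q.set(HighlightParams.PRESERVE_MULTI, true); // keep multiValued values in index order
  q.set(HighlightParams.SNIPPETS, 3); // consumed via getMaxSnippets(fieldName, params)
  return q;
}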
public String getContext(String sample) throws IOException {
  String result = "";
  try {
    String highlight_query_str = cfg.searchField + ":" + cfg.quotes + sample + cfg.quotes;
    String query_str = "padded_length:[" + String.format("%09d", cfg.minDocLen) + " TO *]";
    if (cfg.enableTitleSearch) {
      query_str +=
          " AND (title:" + cfg.quotes + sample + cfg.quotes
              + " OR " + cfg.searchField + ":" + cfg.quotes + sample + cfg.quotes + ")";
    } else {
      query_str += " AND (" + cfg.searchField + ":" + cfg.quotes + sample + cfg.quotes + ")";
    }
    Query query = parser.parse(query_str);
    Query highlight_query = parser.parse(highlight_query_str);
    if (cfg.debug) System.out.println("Searching (" + query + ").....");

    TopDocs topDocs =
        searcher.search(query, cfg.maxHits != 0 ? cfg.maxHits : Integer.MAX_VALUE);
    if (topDocs.totalHits > 0) {
      ScoreDoc[] hits = topDocs.scoreDocs;
      if (cfg.debug) System.out.println("Results (" + hits.length + ") :)");

      String data;
      int indx;
      SimpleHTMLFormatter htmlFormatter = null;
      Highlighter highlighter = null;
      if (cfg.displayHighlights) {
        htmlFormatter = new SimpleHTMLFormatter();
        highlighter = new Highlighter(htmlFormatter, new QueryScorer(highlight_query));
      }
      for (int i = 0; i < hits.length; i++) {
        if (cfg.displayDID) {
          result += String.format("\t%d", hits[i].doc);
        }
        if (cfg.displayScore) {
          result += String.format("\t%f", hits[i].score);
        }
        if (cfg.displayLen) {
          result += "\t" + indexReader.document(hits[i].doc).getField("length").stringValue();
        }
        if (cfg.displayTitle) {
          data = indexReader.document(hits[i].doc).getField("title").stringValue();
          // optionally strip a trailing parenthetical, e.g. "Title (disambiguation)"
          if (cfg.removeParen && (indx = data.indexOf(" (")) != -1) {
            data = data.substring(0, indx);
          }
          result += "\t" + data;
        }
        if (cfg.displayTxt || cfg.displayHighlights) {
          String text = indexReader.document(hits[i].doc).getField("text").stringValue();
          if (cfg.displayTxt) result += "\t" + text;
          if (cfg.displayHighlights) {
            TokenStream tokenStream =
                TokenSources.getAnyTokenStream(
                    searcher.getIndexReader(), hits[i].doc, "text", stdAnalyzer);
            TextFragment[] frag;
            try {
              frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
              for (int j = 0; j < frag.length; j++) {
                if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                  result += "\t" + frag[j].toString();
                }
              }
            } catch (InvalidTokenOffsetsException e) {
              e.printStackTrace();
            }
            // highlighter.getBestFragments(tokenStream, text, 3, "...");
          }
        }
        if (cfg.displayCategories) {
          IndexableField[] categories = indexReader.document(hits[i].doc).getFields("category");
          for (int j = 0;
              j < categories.length && (cfg.numCategories == 0 || j < cfg.numCategories);
              j++) {
            result += "\t" + categories[j].stringValue();
          }
        }
        result += System.lineSeparator() + System.lineSeparator() + System.lineSeparator();
      }
    } else if (cfg.debug) {
      System.out.println("No results found :(");
    }
  } catch (ParseException e) {
    e.printStackTrace();
  }
  return result;
}
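// Hypothetical usage sketch: assuming cfg.searchField = "text", cfg.quotes = "\"",
// cfg.minDocLen = 50, and cfg.enableTitleSearch = true, the call below searches with
//   padded_length:[000000050 TO *] AND (title:"Barack Obama" OR text:"Barack Obama")
// highlights against text:"Barack Obama", and prints one tab-separated block per hit.
public void printContextExample() throws IOException {
  System.out.println(getContext("Barack Obama"));
}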
public Search search(String queryString)
    throws ParseException, IOException, InvalidTokenOffsetsException {
  List<Result> finalRes = new ArrayList<Result>();
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
  Directory indexDir = FSDirectory.open(new File(pathOfIndex));
  Date inizio = new Date();

  IndexReader reader = DirectoryReader.open(indexDir);
  IndexSearcher indexSearcher = new IndexSearcher(reader);
  QueryParser queryParser = new QueryParser(Version.LUCENE_47, "words", analyzer);
  Query query = queryParser.parse(queryString);
  TopDocs hits = indexSearcher.search(query, reader.maxDoc());
  // TopDocs hitsSuggested = new TopDocs();
  int numberOfResults = hits.totalHits;
  Date fine = new Date();
  long time = fine.getTime() - inizio.getTime();

  SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
  Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));

  // extract the snippets
  for (int i = 0; i < hits.totalHits; i++) {
    int id = hits.scoreDocs[i].doc;
    Document doc = indexSearcher.doc(id);
    String text = doc.get("words");
    // MoreLikeThis mlt = new MoreLikeThis(reader);
    // Query querySuggested = mlt.like(hits.scoreDocs[i].doc);
    // hitsSuggested = indexSearcher.search(querySuggested, reader.maxDoc());
    TokenStream tokenStream =
        TokenSources.getAnyTokenStream(indexSearcher.getIndexReader(), id, "words", analyzer);
    TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 4);

    Result r = new Result();
    r.setTitle(doc.get("title"));
    String path = doc.get("path");
    r.setPath(path);
    r.setScore(hits.scoreDocs[i].score);

    String snippet = "";
    for (int j = 0; j < frag.length; j++) {
      if ((frag[j] != null) && (frag[j].getScore() > 0)) {
        snippet += frag[j].toString();
      }
    }
    // normalize the highlighter's uppercase <B> tags to lowercase <b>
    String snippetFinal = snippet.replace("<B>", "<b>").replace("</B>", "</b>");
    r.setSnippet(snippetFinal);
    finalRes.add(r);
  }
  reader.close();

  String suggestedQuery = null;
  // guard against an empty result set before reading the top hit's score
  if (numberOfResults < minimumHits
      || (hits.scoreDocs.length > 0 && hits.scoreDocs[0].score < minimumScore)) {
    suggestedQuery = DidYouMean.suggest(queryString, indexDir);
  }
  Search searchRes = new Search(finalRes, time, queryString, suggestedQuery);
  return searchRes;
}
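// Sketch of the fallback that TokenSources.getAnyTokenStream performs in Lucene 4.x
// (assumed behavior: prefer a term-vector-backed stream when offsets were stored,
// otherwise re-analyze the stored field). The reader/analyzer parameters and the
// helper name stand in for the ones used above.
private static TokenStream anyTokenStreamSketch(IndexReader reader, int docId, Analyzer analyzer)
    throws IOException {
  Fields vectors = reader.getTermVectors(docId);
  Terms vector = (vectors == null) ? null : vectors.terms("words");
  if (vector != null && vector.hasOffsets()) {
    return TokenSources.getTokenStream(vector); // rebuilt from stored term vectors
  }
  String stored = reader.document(docId).get("words");
  return analyzer.tokenStream("words", stored); // re-analyze the stored text
}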