@Override
public String getSnippet(
        Query query, String field, String s, int maxNumFragments,
        int fragmentLength, String fragmentSuffix, Formatter formatter)
    throws IOException {

    QueryScorer queryScorer = new QueryScorer(query, field);

    Highlighter highlighter = new Highlighter(formatter, queryScorer);

    highlighter.setTextFragmenter(new SimpleFragmenter(fragmentLength));

    TokenStream tokenStream = getAnalyzer().tokenStream(field, new UnsyncStringReader(s));

    try {
        String snippet = highlighter.getBestFragments(
            tokenStream, s, maxNumFragments, fragmentSuffix);

        // Append the suffix only when the snippet is a true truncation of the source text.
        if (Validator.isNotNull(snippet) && !StringUtil.endsWith(snippet, fragmentSuffix)
                && !s.equals(snippet)) {
            snippet = snippet.concat(fragmentSuffix);
        }

        return snippet;
    }
    catch (InvalidTokenOffsetsException itoe) {
        throw new IOException(itoe);
    }
}
/**
 * Highlights the search keywords in the given entities.
 *
 * @param query     the query object
 * @param list      the list of entities to highlight
 * @param subLength the fragment length to extract
 * @param fields    the field names to inspect
 */
public List<T> keywordsHighlight(BooleanQuery query, List<T> list, int subLength, String... fields) {
    Analyzer analyzer = new IKAnalyzer();
    Formatter formatter = new SimpleHTMLFormatter("<span class=\"highlight\">", "</span>");
    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(subLength));

    for (T entity : list) {
        try {
            for (String field : fields) {
                String text = StringUtils.replaceHtml((String) Reflections.invokeGetter(entity, field));
                // Build the highlighted fragment for this field.
                String description = highlighter.getBestFragment(analyzer, field, text);
                if (description != null) {
                    Reflections.invokeSetter(entity, fields[0], description);
                    break;
                }
                Reflections.invokeSetter(entity, fields[0], StringUtils.abbr(text, subLength * 2));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InvalidTokenOffsetsException e) {
            e.printStackTrace();
        }
    }
    return list;
}
/**
 * Return a {@link org.apache.lucene.search.highlight.Highlighter} appropriate for this field.
 *
 * @param query The current Query
 * @param fieldName The name of the field
 * @param request The current SolrQueryRequest
 */
protected Highlighter getHighlighter(Query query, String fieldName, SolrQueryRequest request) {
    SolrParams params = request.getParams();
    Highlighter highlighter = new Highlighter(
        getFormatter(fieldName, params),
        getEncoder(fieldName, params),
        getQueryScorer(query, fieldName, request));
    highlighter.setTextFragmenter(getFragmenter(fieldName, params));
    return highlighter;
}
/**
 * Announcements adapter.
 *
 * @param hg the highlighter to apply
 * @param a the analyzer used to tokenize the stored text
 * @param ann the announcement to render
 * @return the highlighted search result
 * @throws Exception
 */
public SearchResultWH makeHW(Highlighter hg, Analyzer a, Announce ann) throws Exception {
    String s = "";
    {
        String text = ann.getITopDescription() + "";
        TokenStream tokenStream = a.tokenStream("topdescription", new StringReader(text));
        s += cP(
            "Совпадения в заголовке объявления",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));
    }
    {
        String text = ann.getIDescription() + "";
        TokenStream tokenStream = a.tokenStream("description", new StringReader(text));
        s += cP(
            "Совпадения в тексте объявления",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));
    }

    String metatexts = "";
    {
        String text = ann.getMeta_keywords() + "";
        TokenStream tokenStream = a.tokenStream("meta_keywords", new StringReader(text));
        metatexts += cPmeta(
            "Совпадения в keywords",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

        text = ann.getMeta_description() + "";
        tokenStream = a.tokenStream("meta_description", new StringReader(text));
        metatexts += cPmeta(
            "Совпадения в description",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

        text = ann.getMeta_subject() + "";
        tokenStream = a.tokenStream("meta_subject", new StringReader(text));
        metatexts += cPmeta(
            "Совпадения в subject",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));
    }

    return new SearchResultWH(ann, "Announce", s, metatexts);
}
/**
 * Creates a highlighter.
 *
 * @param query the index query object
 * @param prefix the prefix string emitted before each highlighted term
 * @param suffix the suffix string emitted after each highlighted term
 * @param fragmenterLength the maximum summary fragment length
 * @return the configured highlighter
 */
public static Highlighter createHighlighter(
        Query query, String prefix, String suffix, int fragmenterLength) {
    Formatter formatter = new SimpleHTMLFormatter(
        (prefix == null || prefix.trim().length() == 0) ? "<font color=\"red\">" : prefix,
        (suffix == null || suffix.trim().length() == 0) ? "</font>" : suffix);
    Scorer fragmentScorer = new QueryScorer(query);
    Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
    Fragmenter fragmenter = new SimpleFragmenter(fragmenterLength <= 0 ? 50 : fragmenterLength);
    highlighter.setTextFragmenter(fragmenter);
    return highlighter;
}
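Since several snippets in this collection construct a Highlighter the same way, a minimal caller sketch may help tie the pieces together. This is not from the original source: the searcher, analyzer, and the stored field name "content" are illustrative assumptions.

// Hedged usage sketch for createHighlighter above; all names here are illustrative.
public static void printHighlights(IndexSearcher searcher, Analyzer analyzer, Query query)
    throws IOException, InvalidTokenOffsetsException {
    Highlighter highlighter = createHighlighter(query, "<em>", "</em>", 100);
    TopDocs topDocs = searcher.search(query, 10);
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        Document doc = searcher.doc(scoreDoc.doc);
        String content = doc.get("content"); // assumed stored field
        // getBestFragment returns null when no query term occurs in the text.
        String fragment = highlighter.getBestFragment(analyzer, "content", content);
        System.out.println(fragment != null ? fragment : content);
    }
}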
public static String highlightField(
        Highlighter highlighter, Analyzer analyzer, Document doc, String field) {
    String docContent = doc.get(field);
    try {
        String hc = highlighter.getBestFragment(analyzer, field, docContent);
        // Fall back to a 50-character prefix when no query term occurs in the field.
        if (hc == null) {
            if (docContent.length() >= 50) {
                hc = docContent.substring(0, 50);
            } else {
                hc = docContent;
            }
        }
        doc.getField(field).setValue(hc);
        return hc;
    } catch (InvalidTokenOffsetsException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return docContent;
}
/**
 * Return a phrase {@link org.apache.lucene.search.highlight.Highlighter} appropriate for this
 * field.
 *
 * @param query The current Query
 * @param fieldName The name of the field
 * @param request The current SolrQueryRequest
 * @param tokenStream document text CachingTokenStream
 * @throws IOException
 */
protected Highlighter getPhraseHighlighter(
        Query query, String fieldName, SolrQueryRequest request, CachingTokenFilter tokenStream)
    throws IOException {
    SolrParams params = request.getParams();
    Highlighter highlighter = new Highlighter(
        getFormatter(fieldName, params),
        getEncoder(fieldName, params),
        getSpanQueryScorer(query, fieldName, tokenStream, request));
    highlighter.setTextFragmenter(getFragmenter(fieldName, params));
    return highlighter;
}
public void highlight(String field, String searchText) {
    String text =
        "In this section we'll show you how to make the simplest "
            + "programmatic query, searching for a single term, and then "
            + "we'll see how to use QueryParser to accept textual queries. "
            + "In the sections that follow, we'll take this simple example "
            + "further by detailing all the query types built into Lucene. "
            + "We begin with the simplest search of all: searching for all "
            + "documents that contain a single term.";
    Analyzer analyzer = new StandardAnalyzer();
    QueryParser queryParser = new QueryParser(field, analyzer);
    try {
        Query query = queryParser.parse(searchText);
        SimpleHTMLFormatter formatter =
            new SimpleHTMLFormatter("<span class=\"highlight\">", "</span>");
        TokenStream tokens = analyzer.tokenStream("f", new StringReader(text));
        QueryScorer scorer = new QueryScorer(query, "f");
        Highlighter highlighter = new Highlighter(formatter, scorer);
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
        String result = highlighter.getBestFragments(tokens, text, 3, "...");

        // Write the highlighted fragments into a standalone HTML page.
        FileWriter writer = new FileWriter("/home/venugopal/Downloads/Lucene/lia/highter.html");
        writer.write("<html>");
        writer.write(
            "<style>\n"
                + ".highlight {\n"
                + " background: yellow;\n"
                + "}\n"
                + "</style>");
        writer.write("<body>");
        writer.write(result);
        writer.write("</body></html>");
        writer.close();
    } catch (ParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (InvalidTokenOffsetsException e) {
        e.printStackTrace();
    }
}
/**
 * @param query the index query object
 * @param prefix the highlight prefix string
 * @param suffix the highlight suffix string
 * @param fragmenterLength the maximum summary fragment length
 * @param ifCustom whether to use the custom Highlighter implementation
 * @return the configured highlighter
 * @author huangzhiqian
 * @date 2015-12-11
 */
public static Highlighter createHighlighter(
        Query query, String prefix, String suffix, int fragmenterLength, boolean ifCustom) {
    Formatter formatter = new SimpleHTMLFormatter(
        (prefix == null || prefix.trim().length() == 0) ? "<font color=\"blue\">" : prefix,
        (suffix == null || suffix.trim().length() == 0) ? "</font>" : suffix);
    Scorer fragmentScorer = new QueryScorer(query);
    Highlighter highlighter;
    if (ifCustom) {
        highlighter = new CusHzqHighlighter(formatter, fragmentScorer);
    } else {
        highlighter = new Highlighter(formatter, fragmentScorer);
    }
    Fragmenter fragmenter = new SimpleFragmenter(fragmenterLength <= 0 ? 50 : fragmenterLength);
    highlighter.setTextFragmenter(fragmenter);
    return highlighter;
}
/**
 * Applies highlighting to the "text" field of a document.
 *
 * @param analyzer the analyzer used to tokenize the stored text
 * @param query the query whose terms are highlighted
 * @param doc the matching document
 * @return the highlighted text, or the raw field value when nothing matches
 */
private String toHighlighter(Analyzer analyzer, Query query, Document doc) {
    String field = "text";
    try {
        SimpleHTMLFormatter simpleHtmlFormatter =
            new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
        Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
        TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(doc.get(field)));
        String highlighterStr = highlighter.getBestFragment(tokenStream, doc.get(field));
        return highlighterStr == null ? doc.get(field) : highlighterStr;
    } catch (IOException e) {
        e.printStackTrace();
    } catch (InvalidTokenOffsetsException e) {
        e.printStackTrace();
    }
    return null;
}
private String doHighlightQuery(
        Query query, String fieldName, String text, String openMark, String closeMark) {
    try {
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(openMark, closeMark);
        Highlighter highlighter =
            new Highlighter(htmlFormatter, new QueryScorer(query, fieldName, this.env.defaultField));
        // NullFragmenter keeps the text in one piece instead of splitting it into fragments.
        highlighter.setTextFragmenter(new NullFragmenter());
        String str = highlighter.getBestFragment(
            this.env.indexAnalyzer,
            "".equals(fieldName) ? this.env.defaultField : fieldName,
            text);
        return null != str ? str : text;
    } catch (Exception x) {
        logger.error("Caught an exception:", x);
    }
    return text;
}
/**
 * Visa adapter.
 *
 * @param hg the highlighter to apply
 * @param a the analyzer used to tokenize the stored text
 * @param v the visa to render
 * @return the highlighted search result
 * @throws Exception
 */
public SearchResultWH makeHW(Highlighter hg, Analyzer a, Visa v) throws Exception {
    String s = "";
    String text = v.getIDescription() + "";
    TokenStream tokenStream = a.tokenStream("description", new StringReader(text));
    s += cP(
        "Совпадения в описании",
        hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

    String metatexts = "";
    {
        text = v.getMeta_keywords() + "";
        tokenStream = a.tokenStream("meta_keywords", new StringReader(text));
        metatexts += cPmeta(
            "Совпадения в keywords",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

        text = v.getMeta_description() + "";
        tokenStream = a.tokenStream("meta_description", new StringReader(text));
        metatexts += cPmeta(
            "Совпадения в description",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

        text = v.getMeta_subject() + "";
        tokenStream = a.tokenStream("meta_subject", new StringReader(text));
        metatexts += cPmeta(
            "Совпадения в subject",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));
    }

    return new SearchResultWH(v, "Visa", s, metatexts);
}
/**
 * Public image adapter.
 *
 * @param hg the highlighter to apply
 * @param a the analyzer used to tokenize the stored text
 * @param pi the public image to render
 * @return the highlighted search result
 * @throws Exception
 */
public SearchResultWH makeHW(Highlighter hg, Analyzer a, PublicImage pi) throws Exception {
    String text = pi.getDescription() + "";
    TokenStream tokenStream = a.tokenStream("description", new StringReader(text));
    String s = cP(
        "Совпадения в описании",
        hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));
    return new SearchResultWH(pi, "PublicImage", s);
}
/**
 * Any text adapter.
 *
 * @param hg the highlighter to apply
 * @param a the analyzer used to tokenize the stored text
 * @param t the text entity to render
 * @return the highlighted search result
 * @throws Exception
 */
public SearchResultWH makeHW(Highlighter hg, Analyzer a, AnyText t) throws Exception {
    String s = "";
    String text = t.getIDescription() + "";
    TokenStream tokenStream = a.tokenStream("anytext", new StringReader(text));
    s += cP(
        "Совпадения в тексте",
        hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));
    return new SearchResultWH(t, "AnyText", s);
}
protected String retrieveResultSummary(
        Document document, Highlighter highlighter, StandardAnalyzer analyzer)
    throws InvalidTokenOffsetsException, IOException {
    String content = document.get(FIELD_TOPIC_CONTENT);
    TokenStream tokenStream = analyzer.tokenStream(FIELD_TOPIC_CONTENT, new StringReader(content));
    String summary = highlighter.getBestFragments(tokenStream, content, 3, "...");
    // When nothing could be highlighted, fall back to the first 200 characters of the content.
    if (StringUtils.isBlank(summary) && !StringUtils.isBlank(content)) {
        summary = StringEscapeUtils.escapeHtml(content.substring(0, Math.min(200, content.length())));
        if (Math.min(200, content.length()) == 200) {
            summary += "...";
        }
    }
    return summary;
}
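A hedged sketch of how a summary method like the one above might be driven from a search loop; the formatter tags, hit limit, and searcher are assumptions rather than part of the original class.

// Hypothetical driver for retrieveResultSummary; tags and limits are assumptions.
protected List<String> retrieveSummaries(IndexSearcher searcher, Query query) throws Exception {
    Highlighter highlighter =
        new Highlighter(new SimpleHTMLFormatter("<b>", "</b>"), new QueryScorer(query));
    StandardAnalyzer analyzer = new StandardAnalyzer();
    List<String> summaries = new ArrayList<String>();
    for (ScoreDoc hit : searcher.search(query, 20).scoreDocs) {
        summaries.add(retrieveResultSummary(searcher.doc(hit.doc), highlighter, analyzer));
    }
    return summaries;
}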
public String getSnippet(int index) throws Exception {
    StandardAnalyzer analyzer = new StandardAnalyzer();
    QueryParser qp = new QueryParser(FieldType.TEXT.toString(), analyzer);
    SchemaObjectInfo info = getResult(index);
    String text = info.getTextDescription();
    String label = info.getLabel().toLowerCase();
    // Remove redundant leading labels.
    if (!label.equals("")) {
        while (text.indexOf(label) == 0) {
            text = text.substring(label.length()).trim();
        }
    }
    if (keyword != null) {
        try {
            Highlighter hl = new Highlighter(new QueryScorer(qp.parse(keyword)));
            // Pass the field name (not the text itself) as the first tokenStream argument.
            TokenStream ts = analyzer.tokenStream(FieldType.TEXT.toString(), new StringReader(text));
            String snippet = hl.getBestFragment(ts, text);
            if (snippet != null) {
                return snippet;
            }
        } catch (Exception e) {
            // Fall through and return the unhighlighted text.
        }
    }
    return text;
}
/**
 * @param key the field name to search in
 * @param value the content to search for
 * @throws CorruptIndexException
 * @throws IOException
 * @throws ParseException
 */
public static void search(String key, String value)
    throws CorruptIndexException, IOException, ParseException {
    IndexSearcher searcher;
    // A QueryParser could be used instead: the first argument is the Lucene version,
    // the second the field to search, the third the analyzer.
    // Analyzer analyzer = new IKAnalyzer(); // new StandardAnalyzer(Version.LUCENE_36)
    // QueryParser qp = new QueryParser(Version.LUCENE_36, key, analyzer);
    searcher = new IndexSearcher(IndexReader.open(FSDirectory.open(new File(PATH))));
    searcher.setSimilarity(new IKSimilarity());
    Query query = IKQueryParser.parse(key, value);
    // Query tq = qp.parse(value);
    TopDocs topDocs = searcher.search(query, 10000);
    ScoreDoc[] scoreDocs = topDocs.scoreDocs;
    System.out.println("Hits: " + topDocs.totalHits);

    Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
    Scorer fragmentScorer = new QueryScorer(query);
    Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
    Fragmenter fragmenter = new SimpleFragmenter(50);
    highlighter.setTextFragmenter(fragmenter);
    Analyzer analyzer = new IKAnalyzer();
    for (int i = 0; i < scoreDocs.length; i++) {
        ScoreDoc scoreDoc = topDocs.scoreDocs[i];
        int docSn = scoreDoc.doc; // internal document id
        Document doc = searcher.doc(docSn); // fetch the document by its id
        highlightField(highlighter, analyzer, doc, "name");
        highlightField(highlighter, analyzer, doc, "address");
        highlightField(highlighter, analyzer, doc, "datatype");
        System.out.println("name:" + doc.get("name"));
        System.out.println("address:" + doc.get("address"));
        System.out.println("datatype:" + doc.get("datatype"));
        System.out.println("geom:" + doc.get("geom"));
    }
}
/**
 * Generates highlighted text for a field.
 *
 * @param document the index document object
 * @param highlighter the highlighter
 * @param analyzer the index analyzer
 * @param field the field to highlight
 * @return the highlighted value, or the raw value when nothing matches
 * @throws IOException
 */
public static String highlight(
        Document document, Highlighter highlighter, Analyzer analyzer, String field)
    throws IOException {
    List<IndexableField> list = document.getFields();
    for (IndexableField fieldable : list) {
        String fieldValue = fieldable.stringValue();
        if (fieldable.name().equals(field)) {
            try {
                fieldValue = highlighter.getBestFragment(analyzer, field, fieldValue);
            } catch (InvalidTokenOffsetsException e) {
                fieldValue = fieldable.stringValue();
            }
            return (fieldValue == null || fieldValue.trim().length() == 0)
                ? fieldable.stringValue()
                : fieldValue;
        }
    }
    return null;
}
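As a quick illustration of how two of the utilities in this collection might compose, here is a hedged sketch that pairs the createHighlighter factory from earlier with the highlight method above; the field name and tag strings are assumptions.

// Hypothetical composition of createHighlighter and highlight; names are illustrative.
public static String snippetFor(Document document, Query query, Analyzer analyzer)
    throws IOException {
    Highlighter highlighter = createHighlighter(query, "<em>", "</em>", 100);
    // Falls back to the raw stored value when highlighting yields nothing.
    return highlight(document, highlighter, analyzer, "content");
}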
@Override
protected BenchmarkHighlighter getBenchmarkHighlighter(Query q) {
    highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
    highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    return new BenchmarkHighlighter() {
        @Override
        public int doHighlight(
                IndexReader reader, int doc, String field, StoredDocument document,
                Analyzer analyzer, String text) throws Exception {
            TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer);
            TextFragment[] frag =
                highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
            return frag != null ? frag.length : 0;
        }
    };
}
public Search search(String queryString)
    throws ParseException, IOException, InvalidTokenOffsetsException {
    List<Result> finalRes = new ArrayList<Result>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    Directory indexDir = FSDirectory.open(new File(pathOfIndex));
    Date inizio = new Date();
    IndexReader reader = DirectoryReader.open(indexDir);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    QueryParser queryParser = new QueryParser(Version.LUCENE_47, "words", analyzer);
    Query query = queryParser.parse(queryString);
    TopDocs hits = indexSearcher.search(query, reader.maxDoc());
    // TopDocs hitsSuggested = new TopDocs();
    int numberOfResults = hits.totalHits;
    Date fine = new Date();
    long time = fine.getTime() - inizio.getTime();
    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
    Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));

    // Extract the snippets.
    for (int i = 0; i < hits.totalHits; i++) {
        int id = hits.scoreDocs[i].doc;
        Document doc = indexSearcher.doc(id);
        String text = doc.get("words");
        // MoreLikeThis mlt = new MoreLikeThis(reader);
        // Query querySuggested = mlt.like(hits.scoreDocs[i].doc);
        // hitsSuggested = indexSearcher.search(querySuggested, reader.maxDoc());
        TokenStream tokenStream =
            TokenSources.getAnyTokenStream(indexSearcher.getIndexReader(), id, "words", analyzer);
        TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 4);
        Result r = new Result();
        r.setTitle(doc.get("title"));
        String path = doc.get("path");
        r.setPath(path);
        r.setScore(hits.scoreDocs[i].score);
        String snippet = "";
        for (int j = 0; j < frag.length; j++) {
            if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                snippet += frag[j].toString();
            }
        }
        // Normalize the default <B> tags emitted by SimpleHTMLFormatter to lowercase.
        String snippetFinal = snippet.replace("<B>", "<b>").replace("</B>", "</b>");
        r.setSnippet(snippetFinal);
        finalRes.add(r);
    }
    reader.close();
    String suggestedQuery = null;
    if (numberOfResults < minimumHits || hits.scoreDocs[0].score < minimumScore) {
        suggestedQuery = DidYouMean.suggest(queryString, indexDir);
    }
    return new Search(finalRes, time, queryString, suggestedQuery);
}
/**
 * Searches pages using a particular combination of flags.
 *
 * @param query The query to perform in Lucene query language
 * @param flags A set of flags
 * @return A Collection of SearchResult instances
 * @throws ProviderException if there is a problem with the backend
 */
public Collection findPages(String query, int flags) throws ProviderException {
    IndexSearcher searcher = null;
    ArrayList<SearchResult> list = null;
    Highlighter highlighter = null;
    try {
        String[] queryfields = {
            LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME, LUCENE_AUTHOR, LUCENE_ATTACHMENTS
        };
        QueryParser qp =
            new MultiFieldQueryParser(Version.LUCENE_36, queryfields, getLuceneAnalyzer());
        // QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() );
        Query luceneQuery = qp.parse(query);

        if ((flags & FLAG_CONTEXTS) != 0) {
            highlighter = new Highlighter(
                new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"),
                new SimpleHTMLEncoder(),
                new QueryScorer(luceneQuery));
        }

        try {
            File dir = new File(m_luceneDirectory);
            Directory luceneDir = new SimpleFSDirectory(dir, null);
            IndexReader reader = IndexReader.open(luceneDir);
            searcher = new IndexSearcher(reader);
        } catch (Exception ex) {
            log.info("Lucene not yet ready; indexing not started", ex);
            return null;
        }

        ScoreDoc[] hits = searcher.search(luceneQuery, MAX_SEARCH_HITS).scoreDocs;

        list = new ArrayList<SearchResult>(hits.length);
        for (int curr = 0; curr < hits.length; curr++) {
            int docID = hits[curr].doc;
            Document doc = searcher.doc(docID);
            String pageName = doc.get(LUCENE_ID);
            WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION);

            if (page != null) {
                if (page instanceof Attachment) {
                    // Currently attachments don't look nice on the search-results page.
                    // When the search-results are cleaned up this can be enabled again.
                }

                int score = (int) (hits[curr].score * 100);

                // Get highlighted search contexts.
                String text = doc.get(LUCENE_PAGE_CONTENTS);

                String[] fragments = new String[0];
                if (text != null && highlighter != null) {
                    TokenStream tokenStream =
                        getLuceneAnalyzer().tokenStream(LUCENE_PAGE_CONTENTS, new StringReader(text));
                    fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS);
                }

                SearchResult result = new SearchResultImpl(page, score, fragments);
                list.add(result);
            } else {
                log.error(
                    "Lucene found a result page '" + pageName
                        + "' that could not be loaded, removing from Lucene cache");
                pageRemoved(new WikiPage(m_engine, pageName));
            }
        }
    } catch (IOException e) {
        log.error("Failed during lucene search", e);
    } catch (ParseException e) {
        log.info("Broken query; cannot parse query ", e);
        throw new ProviderException(
            "You have entered a query Lucene cannot process: " + e.getMessage());
    } catch (InvalidTokenOffsetsException e) {
        log.error("Tokens are incompatible with provided text ", e);
    } finally {
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                log.error(e);
            }
        }
    }
    return list;
}
private void doHighlightingByHighlighter(
        Query query, SolrQueryRequest req, NamedList docSummaries, int docId, Document doc,
        String fieldName) throws IOException {
    final SolrIndexSearcher searcher = req.getSearcher();
    final IndexSchema schema = searcher.getSchema();

    // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) -
    // so we disable them until fixed (see LUCENE-3080)!
    // BEGIN: Hack
    final SchemaField schemaField = schema.getFieldOrNull(fieldName);
    if (schemaField != null
        && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField)
            || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField))) {
        return;
    }
    // END: Hack

    SolrParams params = req.getParams();
    IndexableField[] docFields = doc.getFields(fieldName);
    List<String> listFields = new ArrayList<String>();
    for (IndexableField field : docFields) {
        listFields.add(field.stringValue());
    }

    // Preserve order of values in a multiValued list.
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);

    String[] docTexts = (String[]) listFields.toArray(new String[listFields.size()]);

    // According to the Document javadoc, doc.getValues() never returns null;
    // check for empty instead of null.
    if (docTexts.length == 0) return;

    TokenStream tstream = null;
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    String[] summaries = null;
    List<TextFragment> frags = new ArrayList<TextFragment>();

    TermOffsetsTokenStream tots = null; // non-null iff we're using the TermOffsets optimization
    TokenStream tvStream =
        TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
    if (tvStream != null) {
        tots = new TermOffsetsTokenStream(tvStream);
    }

    for (int j = 0; j < docTexts.length; j++) {
        if (tots != null) {
            // If we're using the TermOffsets optimization, get the next field value's
            // TokenStream (i.e. field j's TokenStream) from tots:
            tstream = tots.getMultiValuedTokenStream(docTexts[j].length());
        } else {
            // Fall back to the analyzer.
            tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
        }

        int maxCharsToAnalyze = params.getFieldInt(
            fieldName, HighlightParams.MAX_CHARS, Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

        Highlighter highlighter;
        if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
            if (maxCharsToAnalyze < 0) {
                tstream = new CachingTokenFilter(tstream);
            } else {
                tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
            }

            // Get the highlighter.
            highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

            // After highlighter initialization, reset tstream since construction of the
            // highlighter already used it.
            tstream.reset();
        } else {
            // Use "the old way".
            highlighter = getHighlighter(query, fieldName, req);
        }

        if (maxCharsToAnalyze < 0) {
            highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
        } else {
            highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        }

        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(
                tstream, docTexts[j], mergeContiguousFragments, numFragments);
            for (int k = 0; k < bestTextFragments.length; k++) {
                if (preserveMulti) {
                    if (bestTextFragments[k] != null) {
                        frags.add(bestTextFragments[k]);
                    }
                } else {
                    if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
                        frags.add(bestTextFragments[k]);
                    }
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    }

    // Sort such that the fragments with the highest score come first.
    if (!preserveMulti) {
        Collections.sort(frags, new Comparator<TextFragment>() {
            @Override
            public int compare(TextFragment arg0, TextFragment arg1) {
                return Math.round(arg1.getScore() - arg0.getScore());
            }
        });
    }

    // Convert fragments back into text.
    // TODO: we can include score and position information in output as snippet attributes.
    if (frags.size() > 0) {
        ArrayList<String> fragTexts = new ArrayList<String>();
        for (TextFragment fragment : frags) {
            if (preserveMulti) {
                if (fragment != null) {
                    fragTexts.add(fragment.toString());
                }
            } else {
                if ((fragment != null) && (fragment.getScore() > 0)) {
                    fragTexts.add(fragment.toString());
                }
            }
            if (fragTexts.size() >= numFragments && !preserveMulti) break;
        }
        summaries = fragTexts.toArray(new String[0]);
        if (summaries.length > 0) docSummaries.add(fieldName, summaries);
    }

    // No summaries made; copy text from the alternate field.
    if (summaries == null || summaries.length == 0) {
        alternateField(docSummaries, params, doc, fieldName);
    }
}
private void doHighlightingByHighlighter(
        Query query, SolrQueryRequest req, NamedList docSummaries, int docId, Document doc,
        String fieldName) throws IOException {
    SolrParams params = req.getParams();
    String[] docTexts = doc.getValues(fieldName);
    // According to the Document javadoc, doc.getValues() never returns null;
    // check for empty instead of null.
    if (docTexts.length == 0) return;

    SolrIndexSearcher searcher = req.getSearcher();
    IndexSchema schema = searcher.getSchema();
    TokenStream tstream = null;
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    String[] summaries = null;
    List<TextFragment> frags = new ArrayList<TextFragment>();

    TermOffsetsTokenStream tots = null; // non-null iff we're using the TermOffsets optimization
    try {
        TokenStream tvStream =
            TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName);
        if (tvStream != null) {
            tots = new TermOffsetsTokenStream(tvStream);
        }
    } catch (IllegalArgumentException e) {
        // No problem, but we can't use the TermOffsets optimization.
    }

    for (int j = 0; j < docTexts.length; j++) {
        if (tots != null) {
            // If we're using the TermOffsets optimization, get the next field value's
            // TokenStream (i.e. field j's TokenStream) from tots:
            tstream = tots.getMultiValuedTokenStream(docTexts[j].length());
        } else {
            // Fall back to the analyzer.
            tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
        }

        Highlighter highlighter;
        if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
            // TODO: this is not always necessary - eventually we would like to avoid this wrap
            // when it is not needed.
            tstream = new CachingTokenFilter(tstream);

            // Get the highlighter.
            highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

            // After highlighter initialization, reset tstream since construction of the
            // highlighter already used it.
            tstream.reset();
        } else {
            // Use "the old way".
            highlighter = getHighlighter(query, fieldName, req);
        }

        int maxCharsToAnalyze = params.getFieldInt(
            fieldName, HighlightParams.MAX_CHARS, Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
        if (maxCharsToAnalyze < 0) {
            highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
        } else {
            highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        }

        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(
                tstream, docTexts[j], mergeContiguousFragments, numFragments);
            for (int k = 0; k < bestTextFragments.length; k++) {
                if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
                    frags.add(bestTextFragments[k]);
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    }

    // Sort such that the fragments with the highest score come first.
    Collections.sort(frags, new Comparator<TextFragment>() {
        public int compare(TextFragment arg0, TextFragment arg1) {
            return Math.round(arg1.getScore() - arg0.getScore());
        }
    });

    // Convert fragments back into text.
    // TODO: we can include score and position information in output as snippet attributes.
    if (frags.size() > 0) {
        ArrayList<String> fragTexts = new ArrayList<String>();
        for (TextFragment fragment : frags) {
            if ((fragment != null) && (fragment.getScore() > 0)) {
                fragTexts.add(fragment.toString());
            }
            if (fragTexts.size() >= numFragments) break;
        }
        summaries = fragTexts.toArray(new String[0]);
        if (summaries.length > 0) docSummaries.add(fieldName, summaries);
    }

    // No summaries made; copy text from the alternate field.
    if (summaries == null || summaries.length == 0) {
        alternateField(docSummaries, params, doc, fieldName);
    }
}
public Vector<Hashtable<String, String>> query(String s, String subject, String from, String to)
    throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException {
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
    QueryParser parser = new QueryParser(Version.LUCENE_36, "segmented", analyzer);
    // s = Lucene.client.autoSegment(s);
    // System.out.println(s);
    String[] sprite = s.split(" ");
    ScoreDoc[] docs;
    String q = "";
    // Constrain the post_date range when "from" and/or "to" timestamps are given.
    if (from.length() == 14 && to.length() == 14) {
        q += "post_date:[" + from + " TO " + to + "] AND ";
    } else if (from.length() == 14) {
        q += "post_date:[" + from + " TO " + "21000000000000" + "] AND ";
    } else if (to.length() == 14) {
        q += "post_date:[" + "20000000000000" + " TO " + to + "] AND ";
    }
    if (subject.length() == 0) {
        q += s;
    } else {
        if (!subject.equals("all")) {
            q += s + " AND subject:" + subject;
        } else {
            q += s;
        }
    }
    if (subject.length() == 0) {
        docs = this.search(searcher, parser.parse(q));
    } else {
        docs = this.search(searcher, parser.parse(q + " AND sub_score:[0070 TO 1000]"));
    }
    Vector<Hashtable<String, String>> result = new Vector<>();
    System.out.println(q);
    System.out.println(docs.length);
    SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
    Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(parser.parse(s)));

    // Walk the top 100 hits and build the result entries.
    for (int i = 0; i < 100 && i < docs.length; ++i) {
        Document doc = searcher.doc(docs[i].doc);
        String docResult = "";
        Hashtable<String, String> ret = new Hashtable<>();
        // docResult += "doc=" + docs[i].doc + " score=" + docs[i].score + "\n";
        TokenStream stream = TokenSources.getAnyTokenStream(
            searcher.getIndexReader(), docs[i].doc, "post_content", analyzer);
        TextFragment[] frag;
        String content = doc.get("post_content");
        // Pad each query term with spaces so the whitespace analyzer can match it.
        for (String tmp : sprite) {
            content = content.replaceAll(tmp, " " + tmp + " ");
        }
        docResult += highlighter.getBestFragment(analyzer, "", content);
        /*
        frag = highlighter.getBestTextFragments(stream, content, false, 100);
        for (int j = 0; j < frag.length; j++) {
            if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                docResult += (frag[j].toString());
            }
        }
        */
        // Undo the padding around highlighted terms.
        docResult = docResult.replaceAll(" (<B>.*?</B>) ", "$1");
        try {
            // ret.put("id", safe_get(doc, "id"));
            ret.put("content", docResult);
            ret.put("url", safe_get(doc, "url"));
            ret.put("date", safe_get(doc, "post_date"));
            ret.put("id", safe_get(doc, "post_title"));
            ret.put("sub_score", safe_get(doc, "sub_score"));
            ret.put("subject", safe_get(doc, "subject"));
            ret.put("score", Float.toString(docs[i].score));
            result.add(ret);
        } catch (Exception err) {
            result.add(ret);
            continue;
        }
    }
    return result;
}
/**
 * @param currentPageNum the 1-based page number to fetch
 * @return the documents of the requested result page
 * @throws Exception
 */
public List<DocumentEntity> getResult(int currentPageNum) throws Exception {
    List<DocumentEntity> list = new ArrayList<DocumentEntity>();
    directory = FSDirectory.open(new File(SystemConstant.indexDir));
    reader = DirectoryReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);

    // Highlighting setup.
    Highlighter highlighter = null;
    SimpleHTMLFormatter simpleHTMLFormatter =
        new SimpleHTMLFormatter("<font color='red'><b>", "</b></font>");
    Highlighter highlighterTitle = null;
    SimpleHTMLFormatter formatTitle = new SimpleHTMLFormatter("<FONT color=#c60a00>", "</FONT>");

    ScoreDoc[] hits = this.getScoreDocs();
    Query query = this.getQuery();
    highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
    highlighterTitle = new Highlighter(formatTitle, new QueryScorer(query));
    // The 200 sets the length of context returned around the keyword; adjust as needed,
    // since the full body text cannot be returned.
    highlighter.setTextFragmenter(new SimpleFragmenter(200));

    Document doc;
    String fileName = "";
    int totalNumber = currentPageNum * eachePageNum;
    if (totalNumber > hits.length) totalNumber = hits.length;

    for (int i = (currentPageNum - 1) * eachePageNum; i < totalNumber; i++) {
        // Fetch the document and highlight its content.
        doc = searcher.doc(hits[i].doc);
        DocumentEntity docEntity = new DocumentEntity();
        TokenStream tokenStream =
            new PaodingAnalyzer().tokenStream("contents", new StringReader(doc.get("contents")));
        docEntity.setContents(highlighter.getBestFragment(tokenStream, doc.get("contents")));

        fileName = doc.get("fileName");
        tokenStream = new PaodingAnalyzer().tokenStream("fileName", new StringReader(fileName));
        // Note: getBestFragment returns null when the text contains no matching keyword.
        String forMatt = highlighterTitle.getBestFragment(tokenStream, fileName);
        if (forMatt == null) {
            docEntity.setFilename(fileName);
        } else {
            docEntity.setFilename(forMatt);
        }

        String type1 = doc.get("type");
        docEntity.setType(type1);
        docEntity.setId(doc.get("id"));
        if ("pdf".equalsIgnoreCase(type1)) {
            fileName = SystemConstant.CONTEXT + SystemConstant.PDFdir + fileName + "." + type1;
            docEntity.setOriginalFileName(fileName);
        } else if ("doc".equalsIgnoreCase(type1)) {
            fileName = SystemConstant.CONTEXT + SystemConstant.Docdir + fileName + "." + type1;
            docEntity.setOriginalFileName(fileName);
        }
        list.add(docEntity);
    }
    return list;
}
public String getContext(String sample) throws IOException {
    String result = "";
    try {
        String highlight_query_str = cfg.searchField + ":" + cfg.quotes + sample + cfg.quotes;
        String query_str = "padded_length:[" + String.format("%09d", cfg.minDocLen) + " TO *]";
        if (cfg.enableTitleSearch) {
            query_str += " AND (title:" + cfg.quotes + sample + cfg.quotes + " OR "
                + cfg.searchField + ":" + cfg.quotes + sample + cfg.quotes + ")";
        } else {
            query_str += " AND (" + cfg.searchField + ":" + cfg.quotes + sample + cfg.quotes + ")";
        }

        Query query = parser.parse(query_str);
        Query highlight_query = parser.parse(highlight_query_str);

        if (cfg.debug == true) System.out.println("Searching (" + query + ").....");
        TopDocs topDocs = searcher.search(query, cfg.maxHits != 0 ? cfg.maxHits : Integer.MAX_VALUE);
        if (topDocs.totalHits > 0) {
            ScoreDoc[] hits = topDocs.scoreDocs;
            if (cfg.debug == true) System.out.println("Results (" + hits.length + ") :)");
            String data;
            int indx;
            SimpleHTMLFormatter htmlFormatter = null;
            Highlighter highlighter = null;
            if (cfg.displayHighlights) {
                htmlFormatter = new SimpleHTMLFormatter();
                highlighter = new Highlighter(htmlFormatter, new QueryScorer(highlight_query));
            }
            for (int i = 0; i < hits.length; i++) {
                if (cfg.displayDID) {
                    result += String.format("\t%d", hits[i].doc);
                }
                if (cfg.displayScore) {
                    result += String.format("\t%f", hits[i].score);
                }
                if (cfg.displayLen) {
                    result += "\t" + indexReader.document(hits[i].doc).getField("length").stringValue();
                }
                if (cfg.displayTitle) {
                    data = indexReader.document(hits[i].doc).getField("title").stringValue();
                    // Optionally strip a trailing parenthetical from the title.
                    if (cfg.removeParen && (indx = data.indexOf(" (")) != -1) {
                        data = indexReader.document(hits[i].doc).getField("title").stringValue()
                            .substring(0, indx);
                    }
                    result += "\t" + data;
                }
                if (cfg.displayTxt || cfg.displayHighlights) {
                    String text = indexReader.document(hits[i].doc).getField("text").stringValue();
                    if (cfg.displayTxt) result += "\t" + text;
                    if (cfg.displayHighlights) {
                        TokenStream tokenStream = TokenSources.getAnyTokenStream(
                            searcher.getIndexReader(), hits[i].doc, "text", stdAnalyzer);
                        TextFragment[] frag;
                        try {
                            frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
                            for (int j = 0; j < frag.length; j++) {
                                if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                                    result += "\t" + (frag[j].toString());
                                }
                            }
                        } catch (InvalidTokenOffsetsException e) {
                            e.printStackTrace();
                        }
                        // highlighter.getBestFragments(tokenStream, text, 3, "...");
                    }
                }
                if (cfg.displayCategories) {
                    IndexableField categories[] = indexReader.document(hits[i].doc).getFields("category");
                    for (int j = 0;
                        j < categories.length && (cfg.numCategories == 0 || j < cfg.numCategories);
                        j++) {
                        result += "\t" + categories[j].stringValue();
                    }
                }
                result += System.lineSeparator() + System.lineSeparator() + System.lineSeparator();
            }
        } else if (cfg.debug == true) {
            System.out.println("No results found :(");
        }
    } catch (ParseException e) {
        e.printStackTrace();
    }
    return result;
}
@SuppressWarnings("resource") public static String getTextAbstract(String title, String content) { try { content = getTextFromHtml(content); org.apache.lucene.search.Query q = IKQueryParser.parse("CONTENT", title); SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("", ""); Highlighter highlighter = new Highlighter(formatter, new QueryScorer(q)); highlighter.setTextFragmenter(new SimpleFragmenter(200)); org.apache.lucene.analysis.TokenStream tokenStream = (new IKAnalyzer()).tokenStream("CONTENT", new StringReader(content)); String tmp = highlighter.getBestFragment(tokenStream, content); if (HtmlStringUtil.isNotEmpty(tmp)) content = tmp.trim(); } catch (Exception e) { e.printStackTrace(); } int start = 0; int end = 0; boolean startFlag = true; for (int i = 0; i < content.length(); i++) { char c = content.charAt(i); if (startFlag) { if (Character.isWhitespace(c) || Character.isISOControl(c) || c == ',' || c == ',' || c == '”' || c == '’' || c == '.' || c == '。' || c == '>' || c == '?' || c == '?' || c == ' ' || c == ' ' || c == ' ' || c == '!' || c == '!' || c == ';' || c == ';' || c == ':' || c == ':' || c == ']' || c == ']') continue; start = i; startFlag = false; } if (!startFlag) if (c == '.' || c == '。' || c == '?' || c == '?' || c == '!' || c == '!' || c == ' ' || c == ' ' || c == ' ') { if (i < 8) start = i + 1; end = i; if (i != content.length() - 1 && (content.charAt(i + 1) == '”' || content.charAt(i + 1) == '’')) end = i + 1; } else { if ((c == ',' || c == ',' || c == '>' || c == '》' || c == '、') && i < 2) start = i + 1; if (c == '’' || c == '”') if (i != content.length() - 1) { char next = content.charAt(i + 1); if (next != ',' && next == ',' && next == '、' && next == ';' && next == ';') end = i + 1; } else { end = i; } } } if (end != 0 && end > start) { content = content.substring(start, end + 1).trim(); start = 0; for (int i = 0; i < content.length(); i++) { char c = content.charAt(i); if ((c == '.' || c == '。' || c == '?' || c == '?' || c == '!' || c == '!' || c == ' ' || c == ' ' || c == ' ') && i < 8) start = i + 1; } if (start != 0) content = content.substring(start); end = 0; if (HtmlStringUtil.isNotEmpty(content)) { char c = content.charAt(content.length() - 1); if (c != '.' && c != '。' && c != '?' && c != '?' && c != '!' && c != '!') { for (int i = content.length() - 2; i > 0; i--) { c = content.charAt(i); if (c != ';' && c != ';' && c != ',' && c != ',' && c != '>' && c != '》') continue; end = i; break; } } } if (end != 0) content = content.substring(0, end); } return content; }
/**
 * Search method.
 *
 * @param f the field names to search in
 * @param luquery the Lucene query string
 * @param entities the entity classes to search over
 * @return the highlighted search results
 * @throws Exception
 */
@Transactional(propagation = Propagation.REQUIRED)
public List<SearchResultWH> searchWH(
        String[] f, String luquery, String hgQuery, SortField[] sortFields, int firstResult,
        int maxResults, Class<?>... entities) throws Exception {
    // Create the FullTextEntityManager.
    FullTextEntityManager fullTextEntityManager =
        org.hibernate.search.jpa.Search.getFullTextEntityManager(entityManager);
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);

    QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_31, f, analyzer);
    org.apache.lucene.search.Query query = parser.parse(luquery.trim());
    System.out.println("QUERY: " + query + " entities size:" + entities.length);

    // Wrap the Lucene query in a javax.persistence.Query.
    // javax.persistence.Query persistenceQuery = fullTextEntityManager
    //     .createFullTextQuery(query, entities);
    org.hibernate.search.jpa.FullTextQuery persistenceQuery =
        fullTextEntityManager.createFullTextQuery(query, entities);

    // org.apache.lucene.search.Sort sort = new Sort(
    //     new SortField("title", SortField.STRING));
    if (sortFields != null && sortFields.length > 0) {
        persistenceQuery.setSort(new Sort(sortFields));
        System.out.println("Sort set");
    }
    if (firstResult >= 0) {
        persistenceQuery.setFirstResult(firstResult);
        persistenceQuery.setMaxResults(maxResults);
    }

    // Execute the search.
    @SuppressWarnings("unchecked")
    List<Object> result = persistenceQuery.getResultList();

    QueryScorer qs = null;
    if (hgQuery != null && hgQuery.trim().length() > 0) {
        // A specific highlight query was supplied.
        qs = new QueryScorer(parser.parse(hgQuery.trim()));
    } else {
        qs = new QueryScorer(query);
    }
    Highlighter highlighter = new Highlighter(
        new Formatter() {
            String preTag = "<font color=\"red\"><b>";
            String postTag = "</b></font>";

            @Override
            public String highlightTerm(String originalText, TokenGroup tokenGroup) {
                StringBuffer returnBuffer;
                if (tokenGroup.getTotalScore() > 0) {
                    returnBuffer = new StringBuffer();
                    returnBuffer.append(preTag);
                    returnBuffer.append(originalText);
                    returnBuffer.append(postTag);
                    return returnBuffer.toString();
                }
                return originalText;
            }
        },
        qs);
    highlighter.setTextFragmenter(new SimpleFragmenter(150));

    List<SearchResultWH> listWH = new ArrayList<SearchResultWH>();
    for (Object o : result) {
        if (o instanceof Hotel) {
            Hotel h = (Hotel) o;
            h.getResortTypes().size(); // force the LAZY collection to load
            listWH.add(makeHW(highlighter, analyzer, h));
        } else if (o instanceof ResortType) {
            listWH.add(makeHW(highlighter, analyzer, (ResortType) o));
        } else if (o instanceof PublicImage) {
            listWH.add(makeHW(highlighter, analyzer, (PublicImage) o));
        } else if (o instanceof FileResourse) {
            listWH.add(makeHW(highlighter, analyzer, (FileResourse) o));
        } else if (o instanceof News) {
            listWH.add(makeHW(highlighter, analyzer, (News) o));
        } else if (o instanceof Announce) {
            listWH.add(makeHW(highlighter, analyzer, (Announce) o));
        } else if (o instanceof Location) {
            listWH.add(makeHW(highlighter, analyzer, (Location) o));
        } else if (o instanceof LType) {
            listWH.add(makeHW(highlighter, analyzer, (LType) o));
        } else if (o instanceof AnyText) {
            listWH.add(makeHW(highlighter, analyzer, (AnyText) o));
        } else if (o instanceof Visa) {
            listWH.add(makeHW(highlighter, analyzer, (Visa) o));
        }
    }
    return listWH;
}