Java Highlighter.getBestTextFragments Examples

Programming Language: Java

Namespace/Package Name: org.apache.lucene.search.highlight

Class/Type: Highlighter

Method/Function: getBestTextFragments

Examples at hotexamples.com: 4

Java Highlighter.getBestTextFragments - 4 examples found. These are the top-rated real-world Java examples of org.apache.lucene.search.highlight.Highlighter.getBestTextFragments, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

setTextFragmenter(12)

getBestFragment(9)

getBestFragments(8)

getBestTextFragments(4)

setMaxDocCharsToAnalyze(3)

Example #1

Show file

File: DefaultSolrHighlighter.java Project: ieure/lucene-solr-snapshot

  /**
   * Highlights every stored value of {@code fieldName} in {@code doc} and adds the best-scoring
   * snippets to {@code docSummaries} under {@code fieldName}.
   *
   * <p>Each value is tokenized either from the term-vector based stream (when one could be
   * obtained for the document) or by re-analyzing the stored text. The fragments collected from
   * all values are ordered by descending score and at most {@code getMaxSnippets(fieldName,
   * params)} of them are kept. When no snippet is produced, {@code alternateField} is invoked
   * instead.
   *
   * @param query the query whose terms are highlighted
   * @param req the current request; supplies the parameters and the searcher
   * @param docSummaries receives the snippets, keyed by {@code fieldName}
   * @param docId id of the document, used to look up its term-vector token stream
   * @param doc the stored document the field values are read from
   * @param fieldName name of the field to highlight
   * @throws IOException if reading the index or analyzing the text fails
   * @throws SolrException (SERVER_ERROR) if the highlighter reports invalid token offsets
   */
  private void doHighlightingByHighlighter(
      Query query,
      SolrQueryRequest req,
      NamedList docSummaries,
      int docId,
      Document doc,
      String fieldName)
      throws IOException {
    SolrParams params = req.getParams();
    String[] docTexts = doc.getValues(fieldName);
    // according to Document javadoc, doc.getValues() never returns null. check empty instead of
    // null
    if (docTexts.length == 0) return;

    SolrIndexSearcher searcher = req.getSearcher();
    IndexSchema schema = searcher.getSchema();
    TokenStream tstream = null;
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    // These settings do not depend on the individual field value, so read them once.
    boolean usePhraseHighlighter =
        Boolean.parseBoolean(params.get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"));
    int maxCharsToAnalyze =
        params.getFieldInt(
            fieldName, HighlightParams.MAX_CHARS, Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

    String[] summaries = null;
    List<TextFragment> frags = new ArrayList<TextFragment>();

    TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
    try {
      TokenStream tvStream =
          TokenSources.getTokenStream(searcher.getIndexReader(), docId, fieldName);
      if (tvStream != null) {
        tots = new TermOffsetsTokenStream(tvStream);
      }
    } catch (IllegalArgumentException e) {
      // No problem. But we can't use TermOffsets optimization.
    }

    for (int j = 0; j < docTexts.length; j++) {
      if (tots != null) {
        // if we're using TermOffsets optimization, then get the next
        // field value's TokenStream (i.e. get field j's TokenStream) from tots:
        tstream = tots.getMultiValuedTokenStream(docTexts[j].length());
      } else {
        // fall back to analyzer
        tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
      }

      Highlighter highlighter;
      if (usePhraseHighlighter) {
        // TODO: this is not always necessary - eventually we would like to avoid this wrap
        //       when it is not needed.
        tstream = new CachingTokenFilter(tstream);

        // get highlighter
        highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

        // after highlighter initialization, reset tstream since construction of highlighter already
        // used it
        tstream.reset();
      } else {
        // use "the old way"
        highlighter = getHighlighter(query, fieldName, req);
      }

      // a negative limit means "analyze the whole value"
      if (maxCharsToAnalyze < 0) {
        highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
      } else {
        highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
      }

      try {
        TextFragment[] bestTextFragments =
            highlighter.getBestTextFragments(
                tstream, docTexts[j], mergeContiguousFragments, numFragments);
        for (int k = 0; k < bestTextFragments.length; k++) {
          if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
            frags.add(bestTextFragments[k]);
          }
        }
      } catch (InvalidTokenOffsetsException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      }
    }
    // sort such that the fragments with the highest score come first
    Collections.sort(
        frags,
        new Comparator<TextFragment>() {
          public int compare(TextFragment arg0, TextFragment arg1) {
            // Compare the scores directly: rounding the difference would report scores that
            // differ by less than 0.5 as equal and leave such fragments unordered.
            return Float.compare(arg1.getScore(), arg0.getScore());
          }
        });

    // convert fragments back into text
    // TODO: we can include score and position information in output as snippet attributes
    if (frags.size() > 0) {
      ArrayList<String> fragTexts = new ArrayList<String>();
      for (TextFragment fragment : frags) {
        if ((fragment != null) && (fragment.getScore() > 0)) {
          fragTexts.add(fragment.toString());
        }
        if (fragTexts.size() >= numFragments) break;
      }
      summaries = fragTexts.toArray(new String[0]);
      if (summaries.length > 0) docSummaries.add(fieldName, summaries);
    }
    // no summaries made, copy text from alternate field
    if (summaries == null || summaries.length == 0) {
      alternateField(docSummaries, params, doc, fieldName);
    }
  }

Example #2

Show file

File: DefaultSolrHighlighter.java Project: netboynb/search-core

  /**
   * Highlights every stored value of {@code fieldName} in {@code doc} and adds the resulting
   * snippets to {@code docSummaries} under {@code fieldName}.
   *
   * <p>Fields whose type is {@code TrieField} or {@code TrieDateField} are skipped entirely (see
   * the hack note below). Each value is tokenized either from the term-vector based stream (when
   * one is available for the document) or by re-analyzing the stored text.
   *
   * <p>When {@code HighlightParams.PRESERVE_MULTI} is false (the default), only fragments with a
   * positive score are kept, they are ordered by descending score, and at most {@code
   * getMaxSnippets(fieldName, params)} are emitted. When it is true, every non-null fragment is
   * emitted in the order the values were processed, without sorting or truncation. When no
   * snippet is produced, {@code alternateField} is invoked instead.
   *
   * @param query the query whose terms are highlighted
   * @param req the current request; supplies the parameters and the searcher
   * @param docSummaries receives the snippets, keyed by {@code fieldName}
   * @param docId id of the document, used to look up its term-vector token stream
   * @param doc the stored document the field values are read from
   * @param fieldName name of the field to highlight
   * @throws IOException if reading the index or analyzing the text fails
   * @throws SolrException (SERVER_ERROR) if the highlighter reports invalid token offsets
   */
  private void doHighlightingByHighlighter(
      Query query,
      SolrQueryRequest req,
      NamedList docSummaries,
      int docId,
      Document doc,
      String fieldName)
      throws IOException {
    final SolrIndexSearcher searcher = req.getSearcher();
    final IndexSchema schema = searcher.getSchema();

    // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) -
    // so we disable them until fixed (see LUCENE-3080)!
    // BEGIN: Hack
    final SchemaField schemaField = schema.getFieldOrNull(fieldName);
    if (schemaField != null
        && ((schemaField.getType() instanceof org.apache.solr.schema.TrieField)
            || (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField))) return;
    // END: Hack

    SolrParams params = req.getParams();
    IndexableField[] docFields = doc.getFields(fieldName);
    List<String> listFields = new ArrayList<String>();
    for (IndexableField field : docFields) {
      listFields.add(field.stringValue());
    }

    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);

    String[] docTexts = listFields.toArray(new String[listFields.size()]);

    // the array is built from the field list above, so it is never null; an empty array means
    // the document has no value for this field
    if (docTexts.length == 0) return;

    TokenStream tstream = null;
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    // These settings do not depend on the individual field value, so read them once.
    boolean usePhraseHighlighter =
        Boolean.parseBoolean(params.get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"));
    int maxCharsToAnalyze =
        params.getFieldInt(
            fieldName, HighlightParams.MAX_CHARS, Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);

    String[] summaries = null;
    List<TextFragment> frags = new ArrayList<TextFragment>();

    TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
    TokenStream tvStream =
        TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
    if (tvStream != null) {
      tots = new TermOffsetsTokenStream(tvStream);
    }

    for (int j = 0; j < docTexts.length; j++) {
      if (tots != null) {
        // if we're using TermOffsets optimization, then get the next
        // field value's TokenStream (i.e. get field j's TokenStream) from tots:
        tstream = tots.getMultiValuedTokenStream(docTexts[j].length());
      } else {
        // fall back to analyzer
        tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
      }

      Highlighter highlighter;
      if (usePhraseHighlighter) {
        // a negative limit means "analyze the whole value", so only cap the stream otherwise
        if (maxCharsToAnalyze < 0) {
          tstream = new CachingTokenFilter(tstream);
        } else {
          tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
        }

        // get highlighter
        highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);

        // after highlighter initialization, reset tstream since construction of highlighter already
        // used it
        tstream.reset();
      } else {
        // use "the old way"
        highlighter = getHighlighter(query, fieldName, req);
      }

      if (maxCharsToAnalyze < 0) {
        highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
      } else {
        highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
      }

      try {
        TextFragment[] bestTextFragments =
            highlighter.getBestTextFragments(
                tstream, docTexts[j], mergeContiguousFragments, numFragments);
        for (TextFragment candidate : bestTextFragments) {
          // preserveMulti keeps zero-score fragments too; otherwise only real matches are kept
          if (candidate != null && (preserveMulti || candidate.getScore() > 0)) {
            frags.add(candidate);
          }
        }
      } catch (InvalidTokenOffsetsException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      }
    }
    // sort such that the fragments with the highest score come first
    if (!preserveMulti) {
      Collections.sort(
          frags,
          new Comparator<TextFragment>() {
            @Override
            public int compare(TextFragment arg0, TextFragment arg1) {
              // Compare the scores directly: rounding the difference would report scores that
              // differ by less than 0.5 as equal and leave such fragments unordered.
              return Float.compare(arg1.getScore(), arg0.getScore());
            }
          });
    }

    // convert fragments back into text
    // TODO: we can include score and position information in output as snippet attributes
    if (frags.size() > 0) {
      ArrayList<String> fragTexts = new ArrayList<String>();
      for (TextFragment fragment : frags) {
        if (fragment != null && (preserveMulti || fragment.getScore() > 0)) {
          fragTexts.add(fragment.toString());
        }

        // the snippet limit only applies when the multi-valued order is not being preserved
        if (fragTexts.size() >= numFragments && !preserveMulti) break;
      }
      summaries = fragTexts.toArray(new String[0]);
      if (summaries.length > 0) docSummaries.add(fieldName, summaries);
    }
    // no summaries made, copy text from alternate field
    if (summaries == null || summaries.length == 0) {
      alternateField(docSummaries, params, doc, fieldName);
    }
  }

Example #3

Show file

File: ContextGenerator.java Project: cbdr/WikiToolbox

  /**
   * Searches the index for {@code sample} and renders the hits as tab-separated text.
   *
   * <p>The search query always restricts {@code padded_length} to at least {@code cfg.minDocLen}
   * and requires {@code sample} (wrapped in {@code cfg.quotes}) in {@code cfg.searchField}, or
   * alternatively in {@code title} when {@code cfg.enableTitleSearch} is set. For every hit the
   * columns enabled in {@code cfg} (doc id, score, length, title, text, highlighted fragments,
   * categories) are appended, each preceded by a tab, and the hit is terminated by three line
   * separators.
   *
   * <p>A query that cannot be parsed and invalid token offsets during highlighting are reported
   * on standard error and otherwise ignored; in the former case the returned string is empty.
   *
   * @param sample the text to search for; inserted into the query string as-is
   * @return the rendered hits, or an empty string if there are none
   * @throws IOException if reading from the index fails
   */
  public String getContext(String sample) throws IOException {
    // Accumulate in a builder: the output grows with every hit and every column.
    StringBuilder result = new StringBuilder();
    try {
      String quotedSample = cfg.quotes + sample + cfg.quotes;
      String highlightQueryStr = cfg.searchField + ":" + quotedSample;
      String queryStr = "padded_length:[" + String.format("%09d", cfg.minDocLen) + " TO *]";
      if (cfg.enableTitleSearch) {
        queryStr +=
            " AND (title:" + quotedSample + " OR " + cfg.searchField + ":" + quotedSample + ")";
      } else {
        queryStr += " AND (" + cfg.searchField + ":" + quotedSample + ")";
      }

      Query query = parser.parse(queryStr);
      Query highlightQuery = parser.parse(highlightQueryStr);

      if (cfg.debug) System.out.println("Searching (" + query + ").....");
      // maxHits == 0 means "no limit"
      TopDocs topDocs = searcher.search(query, cfg.maxHits != 0 ? cfg.maxHits : Integer.MAX_VALUE);
      if (topDocs.totalHits > 0) {
        ScoreDoc[] hits = topDocs.scoreDocs;
        if (cfg.debug) System.out.println("Results (" + hits.length + ") :)");
        // The highlighter is only needed (and only created) when highlights are displayed.
        Highlighter highlighter = null;
        if (cfg.displayHighlights) {
          SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
          highlighter = new Highlighter(htmlFormatter, new QueryScorer(highlightQuery));
        }
        for (int i = 0; i < hits.length; i++) {
          if (cfg.displayDID) {
            result.append(String.format("\t%d", hits[i].doc));
          }
          if (cfg.displayScore) {
            result.append(String.format("\t%f", hits[i].score));
          }
          if (cfg.displayLen) {
            result
                .append("\t")
                .append(indexReader.document(hits[i].doc).getField("length").stringValue());
          }
          if (cfg.displayTitle) {
            String data = indexReader.document(hits[i].doc).getField("title").stringValue();
            if (cfg.removeParen) {
              // drop a trailing parenthesized qualifier, e.g. "Title (disambiguation)"
              int indx = data.indexOf(" (");
              if (indx != -1) {
                data = data.substring(0, indx);
              }
            }
            result.append("\t").append(data);
          }
          if (cfg.displayTxt || cfg.displayHighlights) {
            String text = indexReader.document(hits[i].doc).getField("text").stringValue();
            if (cfg.displayTxt) result.append("\t").append(text);
            if (cfg.displayHighlights) {
              TokenStream tokenStream =
                  TokenSources.getAnyTokenStream(
                      searcher.getIndexReader(), hits[i].doc, "text", stdAnalyzer);
              try {
                TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
                for (int j = 0; j < frag.length; j++) {
                  if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                    result.append("\t").append(frag[j].toString());
                  }
                }
              } catch (InvalidTokenOffsetsException e) {
                // Best effort: a hit that cannot be highlighted is emitted without fragments.
                e.printStackTrace();
              }
            }
          }
          if (cfg.displayCategories) {
            IndexableField[] categories = indexReader.document(hits[i].doc).getFields("category");
            // numCategories == 0 means "no limit"
            for (int j = 0;
                j < categories.length && (cfg.numCategories == 0 || j < cfg.numCategories);
                j++) {
              result.append("\t").append(categories[j].stringValue());
            }
          }

          result
              .append(System.lineSeparator())
              .append(System.lineSeparator())
              .append(System.lineSeparator());
        }
      } else if (cfg.debug) System.out.println("No results found :(");
    } catch (ParseException e) {
      // Best effort: an unparsable query yields an empty result instead of failing the caller.
      e.printStackTrace();
    }

    return result.toString();
  }

Example #4

Show file

File: Searcher.java Project: PCoronas90/AGIW

  /**
   * Runs {@code queryString} against the {@code words} field of the index at {@code pathOfIndex}
   * and returns all hits together with a highlighted snippet for each.
   *
   * <p>Each {@code Result} carries the stored {@code title} and {@code path}, the hit score and
   * a snippet made of up to four highlighted fragments, with the formatter's upper-case
   * {@code <B>} tags rewritten to lower case. A suggested query is computed via {@code
   * DidYouMean.suggest} when there are fewer than {@code minimumHits} hits, when there are no
   * hits at all, or when the best hit scores below {@code minimumScore}.
   *
   * @param queryString the user query, in query-parser syntax
   * @return the hits, the time in milliseconds spent opening the reader and searching, the
   *     original query and the suggested query (or {@code null})
   * @throws ParseException if {@code queryString} cannot be parsed
   * @throws IOException if the index cannot be opened or read
   * @throws InvalidTokenOffsetsException if highlighting encounters invalid token offsets
   */
  public Search search(String queryString)
      throws ParseException, IOException, InvalidTokenOffsetsException {

    List<Result> finalRes = new ArrayList<Result>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    Directory indexDir = FSDirectory.open(new File(pathOfIndex));
    try {
      long start = System.currentTimeMillis();

      int numberOfResults;
      boolean topScoreTooLow;
      long time;

      IndexReader reader = DirectoryReader.open(indexDir);
      try {
        IndexSearcher indexSearcher = new IndexSearcher(reader);
        QueryParser queryParser = new QueryParser(Version.LUCENE_47, "words", analyzer);
        Query query = queryParser.parse(queryString);

        // Ask for every document; keep the requested count positive even for an empty index,
        // since the searcher is expected to reject a hit count of zero.
        TopDocs hits = indexSearcher.search(query, Math.max(1, reader.maxDoc()));
        ScoreDoc[] scoreDocs = hits.scoreDocs;

        numberOfResults = hits.totalHits;
        time = System.currentTimeMillis() - start;

        // With no hits there is no top score to inspect; treat that as "too low" rather than
        // indexing into an empty array.
        topScoreTooLow = scoreDocs.length == 0 || scoreDocs[0].score < minimumScore;

        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
        Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));

        // Extract the snippets
        for (int i = 0; i < scoreDocs.length; i++) {
          int id = scoreDocs[i].doc;
          Document doc = indexSearcher.doc(id);
          String text = doc.get("words");

          TokenStream tokenStream =
              TokenSources.getAnyTokenStream(indexSearcher.getIndexReader(), id, "words", analyzer);
          TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 4);

          Result r = new Result();
          r.setTitle(doc.get("title"));
          r.setPath(doc.get("path"));
          r.setScore(scoreDocs[i].score);

          StringBuilder snippet = new StringBuilder();
          for (int j = 0; j < frag.length; j++) {
            if ((frag[j] != null) && (frag[j].getScore() > 0)) {
              snippet.append(frag[j].toString());
            }
          }

          // normalize the highlight tags to lower case
          r.setSnippet(snippet.toString().replace("<B>", "<b>").replace("</B>", "</b>"));
          finalRes.add(r);
        }
      } finally {
        // release the reader even when parsing, searching or highlighting fails
        reader.close();
      }

      String suggestedQuery = null;
      if (numberOfResults < minimumHits || topScoreTooLow) {
        suggestedQuery = DidYouMean.suggest(queryString, indexDir);
      }

      return new Search(finalRes, time, queryString, suggestedQuery);
    } finally {
      // the directory was opened by this method and nothing returned from it refers to it
      indexDir.close();
    }
  }