@Override
  public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) throws IOException {
    if (!fieldType().indexed()) {
      return null;
    }

    final NumericType numericType = fieldType().numericType();
    if (numericType != null) {
      if (!(reuse instanceof NumericTokenStream
          && ((NumericTokenStream) reuse).getPrecisionStep() == type.numericPrecisionStep())) {
        // lazy init the TokenStream as it is heavy to instantiate
        // (attributes,...) if not needed (stored field loading)
        reuse = new NumericTokenStream(type.numericPrecisionStep());
      }
      final NumericTokenStream nts = (NumericTokenStream) reuse;
      // initialize value in TokenStream
      final Number val = (Number) fieldsData;
      switch (numericType) {
        case INT:
          nts.setIntValue(val.intValue());
          break;
        case LONG:
          nts.setLongValue(val.longValue());
          break;
        case FLOAT:
          nts.setFloatValue(val.floatValue());
          break;
        case DOUBLE:
          nts.setDoubleValue(val.doubleValue());
          break;
        default:
          throw new AssertionError("Should never get here");
      }
      return reuse;
    }

    if (!fieldType().tokenized()) {
      if (stringValue() == null) {
        throw new IllegalArgumentException("Non-Tokenized Fields must have a String value");
      }
      if (!(reuse instanceof StringTokenStream)) {
        // lazy init the TokenStream as it is heavy to instantiate
        // (attributes,...) if not needed (stored field loading)
        reuse = new StringTokenStream();
      }
      ((StringTokenStream) reuse).setValue(stringValue());
      return reuse;
    }

    if (tokenStream != null) {
      return tokenStream;
    } else if (readerValue() != null) {
      return analyzer.tokenStream(name(), readerValue());
    } else if (stringValue() != null) {
      return analyzer.tokenStream(name(), stringValue());
    }

    throw new IllegalArgumentException(
        "Field must have either TokenStream, String, Reader or Number value; got " + this);
  }
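For orientation, here is a minimal, hedged caller-side sketch of how the method above is typically consumed (it assumes Lucene 5.x-style Field/Analyzer APIs; the class name, field name, and text are illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;

public class FieldTokenStreamDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    Field field = new TextField("body", "Hello token stream world", Field.Store.NO);

    // The Field decides how the stream is produced: numeric, untokenized, or analyzed text.
    try (TokenStream ts = field.tokenStream(analyzer, null)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString());
      }
      ts.end();
    }
    analyzer.close();
  }
}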
Example #2
  /**
   * Announcements adapter.
   *
   * @param hg the highlighter used to extract matching fragments
   * @param a the analyzer used to tokenize the text being highlighted
   * @param ann the announcement to build a search result for
   * @return a SearchResultWH containing the highlighted fragments
   * @throws Exception if highlighting fails
   */
  public SearchResultWH makeHW(Highlighter hg, Analyzer a, Announce ann) throws Exception {
    String s = "";

    {
      String text = ann.getITopDescription() + "";

      TokenStream tokenStream = a.tokenStream("topdescription", new StringReader(text));
      s +=
          cP(
              "Совпадения в заголовке объявления",
              hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));
    }

    {
      String text = ann.getIDescription() + "";

      TokenStream tokenStream = a.tokenStream("description", new StringReader(text));
      s +=
          cP(
              "Совпадения в тексте объявления",
              hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));
    }

    String metatexts = "";

    {
      String text = ann.getMeta_keywords() + "";

      TokenStream tokenStream = a.tokenStream("meta_keywords", new StringReader(text));
      metatexts +=
          cPmeta(
              "Совпадения в keywords",
              hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

      text = ann.getMeta_description() + "";

      tokenStream = a.tokenStream("meta_description", new StringReader(text));
      metatexts +=
          cPmeta(
              "Совпадения в description",
              hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

      text = ann.getMeta_subject() + "";

      tokenStream = a.tokenStream("meta_subject", new StringReader(text));
      metatexts +=
          cPmeta(
              "Совпадения в subject",
              hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));
    }

    SearchResultWH swh = new SearchResultWH(ann, "Announce", s, metatexts);
    return swh;
  }
  static Query parseQueryString(
      ExtendedCommonTermsQuery query,
      Object queryString,
      String field,
      Analyzer analyzer,
      String lowFreqMinimumShouldMatch,
      String highFreqMinimumShouldMatch)
      throws IOException {
    // Logic similar to QueryParser#getFieldQuery
    int count = 0;
    try (TokenStream source = analyzer.tokenStream(field, queryString.toString())) {
      source.reset();
      CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
      BytesRefBuilder builder = new BytesRefBuilder();
      while (source.incrementToken()) {
        // copy the term's chars into the builder (encoded as UTF-8 bytes)
        builder.copyChars(termAtt);
        query.add(new Term(field, builder.toBytesRef()));
        count++;
      }
    }

    if (count == 0) {
      return null;
    }
    query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
    query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
    return query;
  }
Example #4
  public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null) return null;

    try (TokenStream source = analyzerIn.tokenStream(field, part)) {
      source.reset();

      TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
      BytesRef bytes = termAtt.getBytesRef();

      if (!source.incrementToken())
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            "analyzer returned no terms for multiTerm term: " + part);
      termAtt.fillBytesRef();
      if (source.incrementToken())
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            "analyzer returned too many terms for multiTerm term: " + part);

      source.end();
      return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    }
  }
  public Query getQuery(Element e) throws ParserException {
    String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
    String text = DOMUtils.getNonBlankTextOrFail(e);

    BooleanQuery bq = new BooleanQuery(DOMUtils.getAttribute(e, "disableCoord", false));
    bq.setMinimumNumberShouldMatch(DOMUtils.getAttribute(e, "minimumNumberShouldMatch", 0));
    try {
      TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
      TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      Term term = null;
      BytesRef bytes = termAtt.getBytesRef();
      ts.reset();
      while (ts.incrementToken()) {
        termAtt.fillBytesRef();
        term = new Term(fieldName, BytesRef.deepCopyOf(bytes));
        bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
      }
      ts.end();
      ts.close();
    } catch (IOException ioe) {
      throw new RuntimeException("Error constructing terms from index: " + ioe, ioe);
    }

    bq.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));
    return bq;
  }
    @Override
    public final TokenStream tokenStream(String fieldName, Reader reader) {
      int dotIndex = fieldName.indexOf('.');
      if (dotIndex != -1) {
        String possibleType = fieldName.substring(0, dotIndex);
        DocumentMapper possibleDocMapper = mappers.get(possibleType);
        if (possibleDocMapper != null) {
          return possibleDocMapper.mappers().searchAnalyzer().tokenStream(fieldName, reader);
        }
      }
      FieldMappers mappers = fullNameFieldMappers.get(fieldName);
      if (mappers != null
          && mappers.mapper() != null
          && mappers.mapper().searchAnalyzer() != null) {
        return mappers.mapper().searchAnalyzer().tokenStream(fieldName, reader);
      }

      mappers = indexNameFieldMappers.get(fieldName);
      if (mappers != null
          && mappers.mapper() != null
          && mappers.mapper().searchAnalyzer() != null) {
        return mappers.mapper().searchAnalyzer().tokenStream(fieldName, reader);
      }
      return defaultAnalyzer.tokenStream(fieldName, reader);
    }
  public List<Document> searchDocuments(String text) {
    List<Document> documents = new ArrayList<Document>();
    try {
      TokenStream tokenStream = analyzer.tokenStream("text", text);
      CharTermAttribute charTermAtt = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      BooleanQuery bQuery = new BooleanQuery();
      while (tokenStream.incrementToken()) {
        String token = charTermAtt.toString();
        TermQuery tq = new TermQuery(new Term("text", token));
        tq.setBoost(2f);

        bQuery.add(tq, Occur.MUST);
      }
      tokenStream.end();
      tokenStream.close();

      TopDocs results = searcher.search(bQuery, 100000);
      ScoreDoc[] hits = results.scoreDocs;
      for (ScoreDoc hit : hits) {
        Document doc = searcher.doc(hit.doc);
        doc.add(new FloatField("score", hit.score, FloatField.TYPE_STORED));
        documents.add(doc);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return documents;
  }
 public String tokens(String field) {
   try {
     Field f = doc.getField(field);
     if (f == null) fail("No such field " + field);
     if (!f.isTokenized()) {
       String val = value(field);
       Token t = new Token(val, 0, val.length());
       return t.getPositionIncrement() + " [" + t.termText() + "]";
     }
     TokenStream ts = f.tokenStreamValue();
     if (ts == null && f.stringValue() != null) ts = analyzer.tokenStream(field, f.stringValue());
     if (ts == null && f.readerValue() != null) ts = analyzer.tokenStream(field, f.readerValue());
     if (ts == null) fail("No token stream for field " + field);
     Token t = null;
     StringBuilder sb = new StringBuilder();
     while ((t = ts.next()) != null) {
       sb.append(t.getPositionIncrement() + " [" + t.termText() + "] ");
     }
     return sb.toString().trim();
   } catch (Exception e) {
     e.printStackTrace();
     fail(e.getMessage());
     return null;
   }
 }
 /**
  * Sugar: analyzes the text with the analyzer and separates by {@link
  * SynonymMap#WORD_SEPARATOR}. reuse and its chars must not be null.
  */
 public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
   try (TokenStream ts = analyzer.tokenStream("", text)) {
     CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
     PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
     ts.reset();
     reuse.clear();
     while (ts.incrementToken()) {
       int length = termAtt.length();
       if (length == 0) {
         throw new IllegalArgumentException(
             "term: " + text + " analyzed to a zero-length token");
       }
       if (posIncAtt.getPositionIncrement() != 1) {
         throw new IllegalArgumentException(
             "term: " + text + " analyzed to a token with posinc != 1");
       }
       reuse.grow(reuse.length() + length + 1); /* current + word + separator */
       int end = reuse.length();
       if (reuse.length() > 0) {
         reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR);
         reuse.setLength(reuse.length() + 1);
       }
       System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length);
       reuse.setLength(reuse.length() + length);
     }
     ts.end();
   }
   if (reuse.length() == 0) {
     throw new IllegalArgumentException(
         "term: " + text + " was completely eliminated by analyzer");
   }
   return reuse.get();
 }
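A hedged, caller-side sketch of how analyze() might be used (it assumes the method lives on a SynonymMap.Builder-style class with an analyzer field, as the code above implies; the input string is illustrative):

CharsRefBuilder scratch = new CharsRefBuilder();
CharsRef analyzed = analyze("wi fi network", scratch);
// Tokens come back joined by SynonymMap.WORD_SEPARATOR; split them to inspect the analysis.
for (String part : analyzed.toString().split(String.valueOf(SynonymMap.WORD_SEPARATOR))) {
  System.out.println(part);
}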
  /**
   * Tokenizes the input with the given analyzer and accumulates "term|weight" pairs,
   * adding a document to the index every throftdocnum tokens.
   */
  private static void cutWordsForSingle(Analyzer analyzer, String inputs, Random random) {
    try {
      TokenStream ts = analyzer.tokenStream("text", new StringReader(inputs));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();

      while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
          word += termAtt.toString();
          word += "|" + (random.nextFloat() + 10) + " ";
          // builds strings like: word|23.3 word1|43.4

          i++;
          // add to the index
          if (i >= throftdocnum) { // one document per throftdocnum tokens
            i = 0;
            docnum++;
            addDoc(docnum + patch, word);
            word = "";
          }
          j++;
        }
      }
      ts.end();
      ts.close();

    } catch (IOException e) {
      e.printStackTrace();
    }
  }
  public Map<String, Double> search(String text) {
    Map<String, Double> similar = new HashMap<String, Double>();
    try {
      TokenStream tokenStream = analyzer.tokenStream("text", text);
      CharTermAttribute charTermAtt = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      BooleanQuery bQuery = new BooleanQuery();
      while (tokenStream.incrementToken()) {
        String token = charTermAtt.toString();
        TermQuery tq = new TermQuery(new Term("text", token));
        tq.setBoost(2f);

        bQuery.add(tq, Occur.MUST);
      }
      tokenStream.end();
      tokenStream.close();

      TopDocs results = searcher.search(bQuery, 100000);
      ScoreDoc[] hits = results.scoreDocs;
      for (ScoreDoc hit : hits) {
        Document doc = searcher.doc(hit.doc);
        similar.put(doc.get("id"), new Double(hit.score));
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return similar;
  }
  @Test
  public void simpleTest() throws IOException {
    Analyzer analyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new WhitespaceTokenizer(Lucene.VERSION, reader);
            return new TokenStreamComponents(t, new TruncateTokenFilter(t, 3));
          }
        };

    TokenStream test = analyzer.tokenStream("test", "a bb ccc dddd eeeee");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("a"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("bb"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("ccc"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("ddd"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("eee"));

    assertThat(test.incrementToken(), equalTo(false));
  }
  /**
   * Adds term frequencies found by tokenizing text from reader into the Map words
   *
   * @param r a source of text to be tokenized
   * @param termFreqMap a Map of terms and their frequencies
   * @param fieldName Used by analyzer for any special per-field analysis
   */
  private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
      throws IOException {
    if (analyzer == null) {
      throw new UnsupportedOperationException(
          "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer");
    }
    try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
      int tokenCount = 0;
      // for every token
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        String word = termAtt.toString();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
          break;
        }
        if (isNoiseWord(word)) {
          continue;
        }

        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
          termFreqMap.put(word, new Int());
        } else {
          cnt.x++;
        }
      }
      ts.end();
    }
  }
 public void testReliability() throws IOException {
   for (int i = 0; i < 10000; i++) {
     String s = _TestUtil.randomUnicodeString(random(), 100);
     TokenStream ts = analyzerDefault.tokenStream("foo", new StringReader(s));
     ts.reset();
     while (ts.incrementToken()) ;
   }
 }
Example #15
 /**
  * Prints the tokens produced by the analyzer, terms only (no other attribute details).
  *
  * @param analyzer the analyzer to tokenize with
  * @param text the text to analyze
  * @throws IOException if analysis fails
  */
 public static void displaySimpleTokens(Analyzer analyzer, String text) throws IOException {
   TokenStream tokenStream = analyzer.tokenStream("default", new StringReader(text));
   TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
   while (tokenStream.incrementToken()) {
     System.out.print(termAttribute.term() + ",");
   }
   System.out.println();
 }
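TermAttribute was deprecated and later removed in favor of CharTermAttribute; a minimal sketch of the same helper against the newer attribute API, under that version assumption:

public static void displaySimpleTokens(Analyzer analyzer, String text) throws IOException {
  try (TokenStream tokenStream = analyzer.tokenStream("default", new StringReader(text))) {
    CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      System.out.print(termAttribute.toString() + ",");
    }
    tokenStream.end();
  }
  System.out.println();
}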
  public void cleanText(String... inboundTexts) {
    try {
      final List<String> fields = Lists.newArrayList();
      for (String raw : inboundTexts) {
        // (An HTML-tidying / Tika auto-detect parsing pass over each raw text was here,
        // collecting cleaned strings into `fields`; it is disabled, and the raw inbound
        // texts are joined directly below.)
      }

      Analyzer analyzer = new StandardAnalyzer();
      //		    String joinedFields = Joiner.on(" ").join(fields).replaceAll("\\s+", " ");
      String joinedFields = Joiner.on(" ").join(inboundTexts).replaceAll("\\s+", " ");
      logger.debug("{}", joinedFields);
      StringReader in = new StringReader(joinedFields);
      TokenStream ts = analyzer.tokenStream("content", in);
      ts.reset();
      ts = new LowerCaseFilter(ts);

      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      List<String> words = Lists.newArrayList();
      while (ts.incrementToken()) {
        char[] termBuffer = termAtt.buffer();
        int termLen = termAtt.length();
        String w = new String(termBuffer, 0, termLen);
        words.add(w);
      }
      ts.end();
      ts.close();
      analyzer.close();
      scrubbedWords = new ArrayList<String>();
      for (String word : words) {
        if (word.length() >= MINWORDLEN && !stopwords.contains(word)) {
          scrubbedWords.add(word);
        } else {
          logger.debug("Ignoring word: {}", word);
        }
      }
      //		    this.scrubbedWords = words;
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
Example #17
  private static String[] groupTokens(Analyzer analyzer, String input) throws IOException {
    if (Resources.debug) {
      Resources.LOGGER.debug("TokenParser:" + input);
      Resources.LOGGER.debug("Analyzer:" + analyzer.getClass());
    }
    TokenStream tokenStream = analyzer.tokenStream("input", input);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute =
        tokenStream.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
    tokenStream.reset();
    int position = 0;

    List<TermInfo> infos = new ArrayList<TermInfo>();
    while (tokenStream.incrementToken()) {
      int increment = positionIncrementAttribute.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
        if (Resources.debug) {
          Resources.LOGGER.debug(position + ":");
        }
      }

      int startOffset = offsetAttribute.startOffset();
      int endOffset = offsetAttribute.endOffset();
      String term = charTermAttribute.toString();
      TermInfo info = new TermInfo();
      info.setStart(startOffset);
      info.setEnd(endOffset);
      infos.add(info);
      if (Resources.debug) {
        Resources.LOGGER.debug(
            "["
                + term
                + "]"
                + ":("
                + startOffset
                + "-->"
                + endOffset
                + "):"
                + typeAttribute.type());
      }
    }
    tokenStream.end();
    tokenStream.close();

    Stack<TermInfo> tiStack = groupTokenInfos(infos);
    List<String> terms = new ArrayList<String>();
    while (!tiStack.isEmpty()) {
      TermInfo termInfo = tiStack.pop();
      if (termInfo.getEnd() <= input.length() && termInfo.getStart() >= 1) {
        String term = input.substring(termInfo.getStart(), termInfo.getEnd());
        terms.add(term);
      }
    }
    return terms.toArray(new String[] {});
  }
 public static void displayPositionIncrements(Analyzer analyzer, String text) throws IOException {
   TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
   stream.reset();
   PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
   while (stream.incrementToken()) {
     System.out.println("posIncr=" + posIncr.getPositionIncrement());
   }
   stream.close();
 }
  @Test
  public void testPresearcherComponent() throws IOException {

    PresearcherComponent comp =
        new WildcardNGramPresearcherComponent("FOO", 10, "__wibble__", Sets.newHashSet("field1"));

    Analyzer input = new WhitespaceAnalyzer();

    // field1 is in the excluded set, so nothing should happen
    TokenStreamAssert.assertThat(
            comp.filterDocumentTokens("field1", input.tokenStream("field1", "hello world")))
        .nextEquals("hello")
        .nextEquals("world")
        .isExhausted();

    // "field" is not in the excluded set, so the ngram expansion applies
    TokenStreamAssert.assertThat(
            comp.filterDocumentTokens(
                "field", input.tokenStream("field", "harm alarm asdasasdasdasd")))
        .nextEquals("harm")
        .nextEquals("harmFOO")
        .nextEquals("harFOO")
        .nextEquals("haFOO")
        .nextEquals("hFOO")
        .nextEquals("armFOO")
        .nextEquals("arFOO")
        .nextEquals("aFOO")
        .nextEquals("rmFOO")
        .nextEquals("rFOO")
        .nextEquals("mFOO")
        .nextEquals("FOO")
        .nextEquals("alarm")
        .nextEquals("alarmFOO")
        .nextEquals("alarFOO")
        .nextEquals("alaFOO")
        .nextEquals("alFOO")
        .nextEquals("larmFOO")
        .nextEquals("larFOO")
        .nextEquals("laFOO")
        .nextEquals("lFOO")
        .nextEquals("asdasasdasdasd")
        .nextEquals("__wibble__")
        .isExhausted();
  }
  /**
   * Vectorizes a piece of text into term-hash frequencies.
   *
   * @param analyzer - the analyzer used to tokenize the content
   * @param field - the Lucene field name
   * @param content - the text content
   */
  public Map<Long, Integer> vectorize(Analyzer analyzer, String field, String content) {

    Map<Long, Integer> map = new TreeMap<Long, Integer>();
    DocWordHashMap wordHash = DocWordHashMap.getInstance();
    TokenStream ts = null;
    try {
      ts = analyzer.tokenStream(field, content);

      // iterate over the analysis results

      // reset the TokenStream
      ts.reset();

      while (ts.incrementToken()) {

        String word = ts.addAttribute(CharTermAttribute.class).toString();
        // MurmurHash each token
        long hash = MurmurHash.hash64(word);
        if (!wordHash.isContainKey(hash)) {
          wordHash.setWordStringHash(hash, word);
        }
        if (!map.containsKey(hash)) {
          map.put(hash, 1);
        } else {
          map.put(hash, map.get(hash) + 1);
        }
      }

      // finish the TokenStream
      ts.end(); // perform end-of-stream operations, e.g. set the final offset
    } catch (CorruptIndexException e) {
      e.printStackTrace();
      map.clear();
    } catch (LockObtainFailedException e) {
      e.printStackTrace();
      map.clear();
    } catch (IOException e) {
      e.printStackTrace();
      map.clear();
    } finally {
      // release all of the TokenStream's resources
      if (ts != null) {
        try {
          ts.close();
        } catch (IOException e) {
          e.printStackTrace();
          map.clear();
        }
      }
    }
    return map;
  }
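A hedged usage sketch of vectorize(), called from within the same class (the analyzer choice and sample text are illustrative assumptions; any Lucene Analyzer works):

Map<Long, Integer> vector = vectorize(new StandardAnalyzer(), "text", "the quick brown fox jumps over the lazy dog");
for (Map.Entry<Long, Integer> entry : vector.entrySet()) {
  System.out.println(entry.getKey() + " -> " + entry.getValue()); // term hash -> frequency
}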
  /* (non-Javadoc)
   * @see org.apache.lucene.xmlparser.QueryObjectBuilder#process(org.w3c.dom.Element)
   */
  @Override
  public Query getQuery(Element e) throws ParserException {
    String fieldsList = e.getAttribute("fieldNames"); // a comma-delimited list of fields
    String fields[] = defaultFieldNames;
    if ((fieldsList != null) && (fieldsList.trim().length() > 0)) {
      fields = fieldsList.trim().split(",");
      // trim the fieldnames
      for (int i = 0; i < fields.length; i++) {
        fields[i] = fields[i].trim();
      }
    }

    // Parse any "stopWords" attribute
    // TODO MoreLikeThis needs to ideally have per-field stopWords lists - until then
    // I use all analyzers/fields to generate multi-field compatible stop list
    String stopWords = e.getAttribute("stopWords");
    Set<String> stopWordsSet = null;
    if ((stopWords != null) && (fields != null)) {
      stopWordsSet = new HashSet<String>();
      for (String field : fields) {
        try (TokenStream ts = analyzer.tokenStream(field, stopWords)) {
          CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            stopWordsSet.add(termAtt.toString());
          }
          ts.end();
        } catch (IOException ioe) {
          throw new ParserException(
              "IoException parsing stop words list in "
                  + getClass().getName()
                  + ":"
                  + ioe.getLocalizedMessage());
        }
      }
    }

    MoreLikeThisQuery mlt = new MoreLikeThisQuery(DOMUtils.getText(e), fields, analyzer, fields[0]);
    mlt.setMaxQueryTerms(DOMUtils.getAttribute(e, "maxQueryTerms", DEFAULT_MAX_QUERY_TERMS));
    mlt.setMinTermFrequency(
        DOMUtils.getAttribute(e, "minTermFrequency", DEFAULT_MIN_TERM_FREQUENCY));
    mlt.setPercentTermsToMatch(
        DOMUtils.getAttribute(e, "percentTermsToMatch", DEFAULT_PERCENT_TERMS_TO_MATCH) / 100);
    mlt.setStopWords(stopWordsSet);
    int minDocFreq = DOMUtils.getAttribute(e, "minDocFreq", -1);
    if (minDocFreq >= 0) {
      mlt.setMinDocFreq(minDocFreq);
    }

    mlt.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));

    return mlt;
  }
  public TermSubQueryFactory termToFactory(String fieldname, Term sourceTerm, FieldBoost boost)
      throws IOException {

    CacheKey cacheKey = null;

    if (termQueryCache != null) {

      cacheKey = new CacheKey(fieldname, sourceTerm);

      TermQueryCacheValue cacheValue = termQueryCache.get(cacheKey);
      if (cacheValue != null) {
        // The cache references factories with pre-analyzed terms, or cache entries without a
        // query factory if the term does not exist in the index. cacheValue.hasQuery() returns
        // true/false correspondingly.
        // Cache entries don't have a boost factor, it is only added later via the queryFactory.
        return (cacheValue.hasQuery()) ? new TermSubQueryFactory(cacheValue, boost) : null;
      }
    }

    LuceneQueryFactoryAndPRMSQuery root = null;
    TokenStream ts = null;
    try {

      ts = analyzer.tokenStream(fieldname, new CharSequenceReader(sourceTerm));
      CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncAttr = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();

      PositionSequence<org.apache.lucene.index.Term> sequence = new PositionSequence<>();
      while (ts.incrementToken()) {

        int inc = posIncAttr.getPositionIncrement();
        if (inc > 0 || sequence.isEmpty()) {
          sequence.nextPosition();
        }

        sequence.addElement(new org.apache.lucene.index.Term(fieldname, new BytesRef(termAttr)));
      }

      root = positionSequenceToQueryFactoryAndPRMS(sequence);

    } finally {
      if (ts != null) {
        try {
          ts.close();
        } catch (IOException e) {
          // ignore failures while closing the TokenStream
        }
      }
    }

    putQueryFactoryAndPRMSQueryIntoCache(cacheKey, root);

    return root == null ? null : new TermSubQueryFactory(root, boost);
  }
Example #23
  /**
   * Visa adapter.
   *
   * @param hg the highlighter used to extract matching fragments
   * @param a the analyzer used to tokenize the text being highlighted
   * @param v the visa to build a search result for
   * @return a SearchResultWH containing the highlighted fragments
   * @throws Exception if highlighting fails
   */
  public SearchResultWH makeHW(Highlighter hg, Analyzer a, Visa v) throws Exception {
    String s = "";
    String text = v.getIDescription() + "";

    TokenStream tokenStream = a.tokenStream("description", new StringReader(text));
    s +=
        cP(
            "Совпадения в описании",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

    String metatexts = "";

    {
      text = v.getMeta_keywords() + "";

      tokenStream = a.tokenStream("meta_keywords", new StringReader(text));
      metatexts +=
          cPmeta(
              "Совпадения в keywords",
              hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

      text = v.getMeta_description() + "";

      tokenStream = a.tokenStream("meta_description", new StringReader(text));
      metatexts +=
          cPmeta(
              "Совпадения в description",
              hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

      text = v.getMeta_subject() + "";

      tokenStream = a.tokenStream("meta_subject", new StringReader(text));
      metatexts +=
          cPmeta(
              "Совпадения в subject",
              hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));
    }

    SearchResultWH swh = new SearchResultWH(v, "Visa", s, metatexts);
    return swh;
  }
 public static void main(String[] args) throws IOException {
   Analyzer analyzer = new JavaSrcAnalyzer();
   TokenStream stream =
       analyzer.tokenStream("test", new StringReader("package java.util.ArrayList"));
   stream.reset(); // required before incrementToken() on recent Lucene versions
   while (stream.incrementToken()) {
     String[] parts = stream.reflectAsString(false).split("#");
     for (String s : parts) {
       System.out.println(s);
     }
     System.out.println();
   }
 }
  @Test
  public void test02() throws Exception {
    TokenStream ts = analyzer.tokenStream("content", "good");
    CharTermAttribute cta = ts.addAttribute(CharTermAttribute.class);

    ts.reset();
    while (ts.incrementToken()) {
      System.out.println("token:" + ts.reflectAsString(true));
    }
    ts.end();
    ts.close();
  }
Example #26
  /**
   * Public image adapter.
   *
   * @param hg the highlighter used to extract matching fragments
   * @param a the analyzer used to tokenize the text being highlighted
   * @param pi the public image to build a search result for
   * @return a SearchResultWH containing the highlighted fragments
   * @throws Exception if highlighting fails
   */
  public SearchResultWH makeHW(Highlighter hg, Analyzer a, PublicImage pi) throws Exception {
    String text = pi.getDescription() + "";

    TokenStream tokenStream = a.tokenStream("description", new StringReader(text));
    String s =
        cP(
            "Совпадения в описании",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

    SearchResultWH swh = new SearchResultWH(pi, "PublicImage", s);
    return swh;
  }
 public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output)
     throws Exception {
   TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
   stream.reset();
   CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
   for (String expected : output) {
     Assert.assertTrue(stream.incrementToken());
     Assert.assertEquals(expected, termAttr.toString());
   }
   Assert.assertFalse(stream.incrementToken());
   stream.close();
 }
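A hedged usage example for the assertion helper above (assumes a Lucene version where WhitespaceAnalyzer has a no-arg constructor; the input is illustrative):

assertAnalyzesTo(new WhitespaceAnalyzer(), "quick brown fox", new String[] {"quick", "brown", "fox"});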
  public static void displayTokensWithFullDetails(Analyzer analyzer, String text)
      throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

    stream.reset();
    int position = 0;
    while (stream.incrementToken()) {

      int increment = posIncr.getPositionIncrement();
      if (increment > 0) {
        position = position + increment;
        System.out.println();
        System.out.print(position + ":");
      }

      BytesRef pl = payload.getPayload();

      if (pl != null) {
        System.out.print(
            "["
                + term.toString()
                + ":"
                + offset.startOffset()
                + "->"
                + offset.endOffset()
                + ":"
                + type.type()
                + ":"
                + new String(pl.bytes, pl.offset, pl.length)
                + "] ");

      } else {
        System.out.print(
            "["
                + term.toString()
                + ":"
                + offset.startOffset()
                + "->"
                + offset.endOffset()
                + ":"
                + type.type()
                + "] ");
      }
    }
    stream.end();
    stream.close();
    System.out.println();
  }
 protected Term getAnalyzedTerm(TokenType tokenType, String termString) throws IOException {
   Term term = getTerm(termString, tokenType); // first ensure that we've stripped any prefixes
   TokenStream tokenStream = analyzer.tokenStream(term.field(), new StringReader(term.text()));
   tokenStream.reset();
   CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
   StringBuilder sb = new StringBuilder();
   while (tokenStream.incrementToken()) {
     sb.append(termAtt.toString());
   }
   tokenStream.end();
   tokenStream.close();
   return new Term(term.field(), sb.toString());
 }
Example #30
  /**
   * Any-text adapter.
   *
   * @param hg the highlighter used to extract matching fragments
   * @param a the analyzer used to tokenize the text being highlighted
   * @param t the text entry to build a search result for
   * @return a SearchResultWH containing the highlighted fragments
   * @throws Exception if highlighting fails
   */
  public SearchResultWH makeHW(Highlighter hg, Analyzer a, AnyText t) throws Exception {
    String s = "";
    String text = t.getIDescription() + "";

    TokenStream tokenStream = a.tokenStream("anytext", new StringReader(text));
    s +=
        cP(
            "Совпадения в тексте",
            hg.getBestFragments(tokenStream, text, MAX_NUM_FRAGMENTS_REQUIRED, "... "));

    SearchResultWH swh = new SearchResultWH(t, "AnyText", s);
    return swh;
  }