Code example #1
  public Query getQuery(Element e) throws ParserException {
    String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
    String text = DOMUtils.getNonBlankTextOrFail(e);

    BooleanQuery bq = new BooleanQuery(DOMUtils.getAttribute(e, "disableCoord", false));
    bq.setMinimumNumberShouldMatch(DOMUtils.getAttribute(e, "minimumNumberShouldMatch", 0));
    try {
      TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
      TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      Term term = null;
      BytesRef bytes = termAtt.getBytesRef();
      ts.reset();
      while (ts.incrementToken()) {
        termAtt.fillBytesRef();
        term = new Term(fieldName, BytesRef.deepCopyOf(bytes));
        bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
      }
      ts.end();
      ts.close();
    } catch (IOException ioe) {
      throw new RuntimeException("Error constructing terms from index:" + ioe);
    }

    bq.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));
    return bq;
  }
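
For reference, the builder above simply analyzes the element text and turns every produced term into an optional clause. A hand-built equivalent of its output, assuming an illustrative fieldName of "contents", the text "quick brown fox", a whitespace-style analyzer, and all optional attributes left at the defaults read by DOMUtils.getAttribute (these concrete values are assumptions, not from the source), would be:

  BooleanQuery bq = new BooleanQuery(false);            // disableCoord defaults to false
  bq.setMinimumNumberShouldMatch(0);                    // minimumNumberShouldMatch defaults to 0
  bq.add(new TermQuery(new Term("contents", "quick")), BooleanClause.Occur.SHOULD);
  bq.add(new TermQuery(new Term("contents", "brown")), BooleanClause.Occur.SHOULD);
  bq.add(new TermQuery(new Term("contents", "fox")), BooleanClause.Occur.SHOULD);
  bq.setBoost(1.0f);                                     // boost defaults to 1.0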
Code example #2
 public void testIntStream() throws Exception {
   final NumericTokenStream stream = new NumericTokenStream().setIntValue(ivalue);
   // use getAttribute to test if attributes really exist; if not, an IAE will be thrown
   final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
   final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
   final NumericTokenStream.NumericTermAttribute numericAtt =
       stream.getAttribute(NumericTokenStream.NumericTermAttribute.class);
   final BytesRef bytes = bytesAtt.getBytesRef();
   stream.reset();
   assertEquals(32, numericAtt.getValueSize());
   for (int shift = 0; shift < 32; shift += NumericUtils.PRECISION_STEP_DEFAULT) {
     assertTrue("New token is available", stream.incrementToken());
     assertEquals("Shift value wrong", shift, numericAtt.getShift());
     final int hash = bytesAtt.fillBytesRef();
     assertEquals("Hash incorrect", bytes.hashCode(), hash);
     assertEquals(
         "Term is incorrectly encoded",
         ivalue & ~((1 << shift) - 1),
         NumericUtils.prefixCodedToInt(bytes));
     assertEquals(
         "Term raw value is incorrectly encoded",
         ((long) ivalue) & ~((1L << shift) - 1L),
         numericAtt.getRawValue());
     assertEquals(
         "Type incorrect",
         (shift == 0)
             ? NumericTokenStream.TOKEN_TYPE_FULL_PREC
             : NumericTokenStream.TOKEN_TYPE_LOWER_PREC,
         typeAtt.type());
   }
   assertFalse("More tokens available", stream.incrementToken());
   stream.end();
   stream.close();
 }
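
For context, application code rarely drives NumericTokenStream by hand the way this test does; numeric values are normally indexed through the higher-level field types, which create the trie-encoded token stream internally. A minimal sketch, assuming a Lucene 4.x IntField and an already-open IndexWriter named writer (both assumptions, not from the source):

  Document doc = new Document();
  // in Lucene 4.x, IntField produces its index tokens via NumericTokenStream under the hood
  doc.add(new IntField("price", 42, Field.Store.NO));
  writer.addDocument(doc);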
Code example #3
File: TextField.java / Project: jibaro/lucene_solr
  public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
    if (part == null || analyzerIn == null) return null;

    try (TokenStream source = analyzerIn.tokenStream(field, part)) {
      source.reset();

      TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
      BytesRef bytes = termAtt.getBytesRef();

      if (!source.incrementToken())
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            "analyzer returned no terms for multiTerm term: " + part);
      termAtt.fillBytesRef();
      if (source.incrementToken())
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            "analyzer returned too many terms for multiTerm term: " + part);

      source.end();
      return BytesRef.deepCopyOf(bytes);
    } catch (IOException e) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
    }
  }
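
A hedged usage sketch (the field name, input text, and analyzer are illustrative assumptions, not from the source): callers typically pass a field's query analyzer when a wildcard or range endpoint has to be reduced to exactly one indexed term.

  // Throws a BAD_REQUEST SolrException if the analyzer yields zero tokens or more than one.
  Analyzer queryAnalyzer = new KeywordAnalyzer();
  BytesRef lowerBound = TextField.analyzeMultiTerm("title", "apache", queryAnalyzer);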
Code example #4
 private BytesRef bytesFromTokenStream(TokenStream stream) throws IOException {
   TermToBytesRefAttribute termAttr = stream.getAttribute(TermToBytesRefAttribute.class);
   BytesRef bytesRef = termAttr.getBytesRef();
   stream.reset();
   while (stream.incrementToken()) {
     termAttr.fillBytesRef();
   }
   // after the loop, bytesRef holds the bytes of the last token the stream produced
   stream.end();
   stream.close();
   BytesRef copy = new BytesRef();
   copy.copyBytes(bytesRef);
   return copy;
 }
Code example #5
 /**
  * Analyzes the given text using the given analyzer and returns the produced tokens.
  *
  * @param query The query to analyze.
  * @param analyzer The analyzer to use.
  */
 protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
   final Set<BytesRef> tokens = new HashSet<BytesRef>();
   final TokenStream tokenStream = analyzer.tokenStream("", new StringReader(query));
   final TermToBytesRefAttribute bytesAtt =
       tokenStream.getAttribute(TermToBytesRefAttribute.class);
   final BytesRef bytes = bytesAtt.getBytesRef();
   try {
     tokenStream.reset();
     while (tokenStream.incrementToken()) {
       bytesAtt.fillBytesRef();
       tokens.add(new BytesRef(bytes));
     }
   } catch (IOException ioe) {
     throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
   }
   return tokens;
 }
Code example #6
  protected List<BytesRef> analyze(String text, String field, Analyzer analyzer)
      throws IOException {
    List<BytesRef> bytesRefs = new ArrayList<>();

    try (TokenStream tokenStream = analyzer.tokenStream(field, text)) {
      TermToBytesRefAttribute termAttribute =
          tokenStream.getAttribute(TermToBytesRefAttribute.class);

      BytesRef bytesRef = termAttribute.getBytesRef();

      tokenStream.reset();

      while (tokenStream.incrementToken()) {
        termAttribute.fillBytesRef();
        bytesRefs.add(BytesRef.deepCopyOf(bytesRef));
      }

      tokenStream.end();
    }

    return bytesRefs;
  }
Code example #7
  // Produces a realistic unicode random string that
  // survives MockAnalyzer unchanged:
  private String getRandomTerm(String other) throws IOException {
    Analyzer a = new MockAnalyzer(random());
    while (true) {
      String s = _TestUtil.randomRealisticUnicodeString(random());
      if (other != null && s.equals(other)) {
        continue;
      }
      IOException priorException = null;
      TokenStream ts = a.tokenStream("foo", s);
      try {
        final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
        final BytesRef termBytes = termAtt.getBytesRef();
        ts.reset();

        int count = 0;
        boolean changed = false;

        while (ts.incrementToken()) {
          termAtt.fillBytesRef();
          if (count == 0 && !termBytes.utf8ToString().equals(s)) {
            // The value was changed during analysis.  Keep iterating so the
            // tokenStream is exhausted.
            changed = true;
          }
          count++;
        }

        ts.end();
        // Did we iterate just once and the value was unchanged?
        if (!changed && count == 1) {
          return s;
        }
      } catch (IOException e) {
        priorException = e;
      } finally {
        IOUtils.closeWhileHandlingException(priorException, ts);
      }
    }
  }
Code example #8
File: MemoryIndex.java / Project: sail-umkc/Examples
  /**
   * Iterates over the given token stream and adds the resulting terms to the index; Equivalent to
   * adding a tokenized, indexed, termVectorStored, unstored, Lucene {@link
   * org.apache.lucene.document.Field}. Finally closes the token stream. Note that untokenized
   * keywords can be added with this method via {@link #keywordTokenStream(Collection)}, the Lucene
   * <code>KeywordTokenizer</code> or similar utilities.
   *
   * @param fieldName a name to be associated with the text
   * @param tokenStream the token stream to retrieve tokens from. It's guaranteed to be closed no
   *     matter what.
   * @param boost the boost factor for hits for this field
   * @param positionIncrementGap the position increment gap if fields with the same name are added
   *     more than once
   * @param offsetGap the offset gap if fields with the same name are added more than once
   * @see org.apache.lucene.document.Field#setBoost(float)
   */
  public void addField(
      String fieldName,
      TokenStream tokenStream,
      float boost,
      int positionIncrementGap,
      int offsetGap) {
    try (TokenStream stream = tokenStream) {
      if (frozen)
        throw new IllegalArgumentException("Cannot call addField() when MemoryIndex is frozen");
      if (fieldName == null) throw new IllegalArgumentException("fieldName must not be null");
      if (stream == null) throw new IllegalArgumentException("token stream must not be null");
      if (boost <= 0.0f)
        throw new IllegalArgumentException("boost factor must be greater than 0.0");
      int numTokens = 0;
      int numOverlapTokens = 0;
      int pos = -1;
      final BytesRefHash terms;
      final SliceByteStartArray sliceArray;
      Info info;
      long sumTotalTermFreq = 0;
      int offset = 0;
      FieldInfo fieldInfo;
      if ((info = fields.get(fieldName)) != null) {
        fieldInfo = info.fieldInfo;
        numTokens = info.numTokens;
        numOverlapTokens = info.numOverlapTokens;
        pos = info.lastPosition + positionIncrementGap;
        offset = info.lastOffset + offsetGap;
        terms = info.terms;
        boost *= info.boost;
        sliceArray = info.sliceArray;
        sumTotalTermFreq = info.sumTotalTermFreq;
      } else {
        fieldInfo =
            new FieldInfo(
                fieldName,
                fields.size(),
                true,
                false,
                this.storePayloads,
                this.storeOffsets
                    ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
                    : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                DocValuesType.NONE,
                -1,
                Collections.<String, String>emptyMap());
        sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
        terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
      }

      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncrAttribute =
          stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      PayloadAttribute payloadAtt =
          storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
      BytesRef ref = termAtt.getBytesRef();
      stream.reset();

      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
        //        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        if (posIncr == 0) numOverlapTokens++;
        pos += posIncr;
        int ord = terms.add(ref);
        if (ord < 0) {
          ord = (-ord) - 1;
          postingsWriter.reset(sliceArray.end[ord]);
        } else {
          sliceArray.start[ord] = postingsWriter.startNewSlice();
        }
        sliceArray.freq[ord]++;
        sumTotalTermFreq++;
        postingsWriter.writeInt(pos);
        if (storeOffsets) {
          postingsWriter.writeInt(offsetAtt.startOffset() + offset);
          postingsWriter.writeInt(offsetAtt.endOffset() + offset);
        }
        if (storePayloads) {
          final BytesRef payload = payloadAtt.getPayload();
          final int pIndex;
          if (payload == null || payload.length == 0) {
            pIndex = -1;
          } else {
            pIndex = payloadsBytesRefs.append(payload);
          }
          postingsWriter.writeInt(pIndex);
        }
        sliceArray.end[ord] = postingsWriter.getCurrentOffset();
      }
      stream.end();

      // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
      if (numTokens > 0) {
        fields.put(
            fieldName,
            new Info(
                fieldInfo,
                terms,
                sliceArray,
                numTokens,
                numOverlapTokens,
                boost,
                pos,
                offsetAtt.endOffset() + offset,
                sumTotalTermFreq));
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
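
A hedged usage sketch of the class this method belongs to (document text, field name, and query are illustrative, and it assumes a Lucene version in which StandardAnalyzer has a no-argument constructor): MemoryIndex also exposes an addField(String, String, Analyzer) convenience that builds the token stream before handing off to an overload like the one above.

  // Index one in-memory document and score a query against it.
  MemoryIndex index = new MemoryIndex();
  index.addField("content", "readings about salmons and other stuff", new StandardAnalyzer());
  float score = index.search(new TermQuery(new Term("content", "salmons")));
  if (score > 0.0f) {
    // the in-memory document matches the query
  }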
Code example #9
  /**
   * Converts the list of Tokens to a list of NamedLists representing the tokens.
   *
   * @param tokens Tokens to convert
   * @param context The analysis context
   * @return List of NamedLists containing the relevant information taken from the tokens
   */
  private List<NamedList> convertTokensToNamedLists(
      final List<AttributeSource> tokens, AnalysisContext context) {
    final List<NamedList> tokensNamedLists = new ArrayList<NamedList>();

    final int[] positions = new int[tokens.size()];
    int position = 0;
    for (int i = 0, c = tokens.size(); i < c; i++) {
      AttributeSource token = tokens.get(i);
      position += token.addAttribute(PositionIncrementAttribute.class).getPositionIncrement();
      positions[i] = position;
    }

    // sort the tokens by absolute position
    new SorterTemplate() {
      @Override
      protected void swap(int i, int j) {
        final int p = positions[i];
        positions[i] = positions[j];
        positions[j] = p;
        Collections.swap(tokens, i, j);
      }

      @Override
      protected int compare(int i, int j) {
        return positions[i] - positions[j];
      }

      @Override
      protected void setPivot(int i) {
        pivot = positions[i];
      }

      @Override
      protected int comparePivot(int j) {
        return pivot - positions[j];
      }

      private int pivot;
    }.mergeSort(0, tokens.size() - 1);

    FieldType fieldType = context.getFieldType();

    final CharArr textBuf = new CharArr();
    for (int i = 0, c = tokens.size(); i < c; i++) {
      AttributeSource token = tokens.get(i);
      final NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>();
      final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
      BytesRef rawBytes = termAtt.getBytesRef();
      termAtt.fillBytesRef();

      textBuf.reset();
      fieldType.indexedToReadable(rawBytes, textBuf);
      final String text = textBuf.toString();

      tokenNamedList.add("text", text);

      if (token.hasAttribute(CharTermAttribute.class)) {
        final String rawText = token.getAttribute(CharTermAttribute.class).toString();
        if (!rawText.equals(text)) {
          tokenNamedList.add("raw_text", rawText);
        }
      }

      tokenNamedList.add("raw_bytes", rawBytes.toString());

      if (context.getTermsToMatch().contains(rawBytes)) {
        tokenNamedList.add("match", true);
      }

      tokenNamedList.add("position", positions[i]);

      token.reflectWith(
          new AttributeReflector() {
            public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
              // leave out position and bytes term
              if (TermToBytesRefAttribute.class.isAssignableFrom(attClass)) return;
              if (CharTermAttribute.class.isAssignableFrom(attClass)) return;
              if (PositionIncrementAttribute.class.isAssignableFrom(attClass)) return;

              String k = attClass.getName() + '#' + key;

              // map keys for "standard attributes":
              if (ATTRIBUTE_MAPPING.containsKey(k)) {
                k = ATTRIBUTE_MAPPING.get(k);
              }

              if (value instanceof Payload) {
                final Payload p = (Payload) value;
                value = new BytesRef(p.getData()).toString();
              }

              tokenNamedList.add(k, value);
            }
          });

      tokensNamedLists.add(tokenNamedList);
    }

    return tokensNamedLists;
  }