public Query getQuery(Element e) throws ParserException {
  String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
  String text = DOMUtils.getNonBlankTextOrFail(e);

  BooleanQuery bq = new BooleanQuery(DOMUtils.getAttribute(e, "disableCoord", false));
  bq.setMinimumNumberShouldMatch(DOMUtils.getAttribute(e, "minimumNumberShouldMatch", 0));
  try {
    TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
    TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
    Term term = null;
    BytesRef bytes = termAtt.getBytesRef();
    ts.reset();
    while (ts.incrementToken()) {
      termAtt.fillBytesRef();
      term = new Term(fieldName, BytesRef.deepCopyOf(bytes));
      bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
    }
    ts.end();
    ts.close();
  } catch (IOException ioe) {
    throw new RuntimeException("Error constructing terms from index:" + ioe);
  }

  bq.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));
  return bq;
}
public void testIntStream() throws Exception {
  final NumericTokenStream stream = new NumericTokenStream().setIntValue(ivalue);
  // use getAttribute to test if attributes really exist, if not an IAE will be thrown
  final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
  final NumericTokenStream.NumericTermAttribute numericAtt =
      stream.getAttribute(NumericTokenStream.NumericTermAttribute.class);
  final BytesRef bytes = bytesAtt.getBytesRef();
  stream.reset();
  assertEquals(32, numericAtt.getValueSize());
  for (int shift = 0; shift < 32; shift += NumericUtils.PRECISION_STEP_DEFAULT) {
    assertTrue("New token is available", stream.incrementToken());
    assertEquals("Shift value wrong", shift, numericAtt.getShift());
    final int hash = bytesAtt.fillBytesRef();
    assertEquals("Hash incorrect", bytes.hashCode(), hash);
    assertEquals(
        "Term is incorrectly encoded",
        ivalue & ~((1 << shift) - 1),
        NumericUtils.prefixCodedToInt(bytes));
    assertEquals(
        "Term raw value is incorrectly encoded",
        ((long) ivalue) & ~((1L << shift) - 1L),
        numericAtt.getRawValue());
    assertEquals(
        "Type incorrect",
        (shift == 0)
            ? NumericTokenStream.TOKEN_TYPE_FULL_PREC
            : NumericTokenStream.TOKEN_TYPE_LOWER_PREC,
        typeAtt.type());
  }
  assertFalse("More tokens available", stream.incrementToken());
  stream.end();
  stream.close();
}
public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
  if (part == null || analyzerIn == null) return null;

  try (TokenStream source = analyzerIn.tokenStream(field, part)) {
    source.reset();

    TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
    BytesRef bytes = termAtt.getBytesRef();

    if (!source.incrementToken())
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "analyzer returned no terms for multiTerm term: " + part);
    termAtt.fillBytesRef();
    if (source.incrementToken())
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "analyzer returned too many terms for multiTerm term: " + part);

    source.end();
    return BytesRef.deepCopyOf(bytes);
  } catch (IOException e) {
    throw new SolrException(
        SolrException.ErrorCode.BAD_REQUEST, "error analyzing range part: " + part, e);
  }
}
private BytesRef bytesFromTokenStream(TokenStream stream) throws IOException {
  TermToBytesRefAttribute termAttr = stream.getAttribute(TermToBytesRefAttribute.class);
  BytesRef bytesRef = termAttr.getBytesRef();
  stream.reset();
  while (stream.incrementToken()) {
    // fillBytesRef() updates the shared BytesRef in place; after the loop it holds the last token
    termAttr.fillBytesRef();
  }
  stream.end();
  stream.close();
  // copy the shared buffer, since the token stream reuses it
  BytesRef copy = new BytesRef();
  copy.copyBytes(bytesRef);
  return copy;
}
/**
 * Analyzes the given text using the given analyzer and returns the produced tokens.
 *
 * @param query The query to analyze.
 * @param analyzer The analyzer to use.
 */
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
  final Set<BytesRef> tokens = new HashSet<BytesRef>();
  final TokenStream tokenStream = analyzer.tokenStream("", new StringReader(query));
  final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
  final BytesRef bytes = bytesAtt.getBytesRef();
  try {
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      bytesAtt.fillBytesRef();
      tokens.add(new BytesRef(bytes));
    }
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  }
  return tokens;
}
protected List<BytesRef> analyze(String text, String field, Analyzer analyzer) throws IOException {
  List<BytesRef> bytesRefs = new ArrayList<>();

  try (TokenStream tokenStream = analyzer.tokenStream(field, text)) {
    TermToBytesRefAttribute termAttribute = tokenStream.getAttribute(TermToBytesRefAttribute.class);

    BytesRef bytesRef = termAttribute.getBytesRef();

    tokenStream.reset();

    while (tokenStream.incrementToken()) {
      termAttribute.fillBytesRef();
      bytesRefs.add(BytesRef.deepCopyOf(bytesRef));
    }

    tokenStream.end();
  }

  return bytesRefs;
}
// Produces a realistic unicode random string that
// survives MockAnalyzer unchanged:
private String getRandomTerm(String other) throws IOException {
  Analyzer a = new MockAnalyzer(random());
  while (true) {
    String s = _TestUtil.randomRealisticUnicodeString(random());
    if (other != null && s.equals(other)) {
      continue;
    }
    IOException priorException = null;
    TokenStream ts = a.tokenStream("foo", s);
    try {
      final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
      final BytesRef termBytes = termAtt.getBytesRef();
      ts.reset();

      int count = 0;
      boolean changed = false;

      while (ts.incrementToken()) {
        termAtt.fillBytesRef();
        if (count == 0 && !termBytes.utf8ToString().equals(s)) {
          // The value was changed during analysis. Keep iterating so the
          // tokenStream is exhausted.
          changed = true;
        }
        count++;
      }

      ts.end();
      // Did we iterate just once and the value was unchanged?
      if (!changed && count == 1) {
        return s;
      }
    } catch (IOException e) {
      priorException = e;
    } finally {
      IOUtils.closeWhileHandlingException(priorException, ts);
    }
  }
}
/**
 * Iterates over the given token stream and adds the resulting terms to the index; Equivalent to
 * adding a tokenized, indexed, termVectorStored, unstored, Lucene {@link
 * org.apache.lucene.document.Field}. Finally closes the token stream. Note that untokenized
 * keywords can be added with this method via {@link #keywordTokenStream(Collection)}, the Lucene
 * <code>KeywordTokenizer</code> or similar utilities.
 *
 * @param fieldName a name to be associated with the text
 * @param tokenStream the token stream to retrieve tokens from. It's guaranteed to be closed no
 *     matter what.
 * @param boost the boost factor for hits for this field
 * @param positionIncrementGap the position increment gap if fields with the same name are added
 *     more than once
 * @param offsetGap the offset gap if fields with the same name are added more than once
 * @see org.apache.lucene.document.Field#setBoost(float)
 */
public void addField(
    String fieldName,
    TokenStream tokenStream,
    float boost,
    int positionIncrementGap,
    int offsetGap) {
  try (TokenStream stream = tokenStream) {
    if (frozen)
      throw new IllegalArgumentException("Cannot call addField() when MemoryIndex is frozen");
    if (fieldName == null) throw new IllegalArgumentException("fieldName must not be null");
    if (stream == null) throw new IllegalArgumentException("token stream must not be null");
    if (boost <= 0.0f) throw new IllegalArgumentException("boost factor must be greater than 0.0");

    int numTokens = 0;
    int numOverlapTokens = 0;
    int pos = -1;
    final BytesRefHash terms;
    final SliceByteStartArray sliceArray;
    Info info;
    long sumTotalTermFreq = 0;
    int offset = 0;
    FieldInfo fieldInfo;
    if ((info = fields.get(fieldName)) != null) {
      fieldInfo = info.fieldInfo;
      numTokens = info.numTokens;
      numOverlapTokens = info.numOverlapTokens;
      pos = info.lastPosition + positionIncrementGap;
      offset = info.lastOffset + offsetGap;
      terms = info.terms;
      boost *= info.boost;
      sliceArray = info.sliceArray;
      sumTotalTermFreq = info.sumTotalTermFreq;
    } else {
      fieldInfo =
          new FieldInfo(
              fieldName,
              fields.size(),
              true,
              false,
              this.storePayloads,
              this.storeOffsets
                  ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
                  : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
              DocValuesType.NONE,
              -1,
              Collections.<String, String>emptyMap());
      sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
      terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
    }

    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAttribute =
        stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    PayloadAttribute payloadAtt =
        storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
    BytesRef ref = termAtt.getBytesRef();
    stream.reset();

    while (stream.incrementToken()) {
      termAtt.fillBytesRef();
      // if (DEBUG) System.err.println("token='" + term + "'");
      numTokens++;
      final int posIncr = posIncrAttribute.getPositionIncrement();
      if (posIncr == 0) numOverlapTokens++;
      pos += posIncr;
      int ord = terms.add(ref);
      if (ord < 0) {
        ord = (-ord) - 1;
        postingsWriter.reset(sliceArray.end[ord]);
      } else {
        sliceArray.start[ord] = postingsWriter.startNewSlice();
      }
      sliceArray.freq[ord]++;
      sumTotalTermFreq++;
      postingsWriter.writeInt(pos);
      if (storeOffsets) {
        postingsWriter.writeInt(offsetAtt.startOffset() + offset);
        postingsWriter.writeInt(offsetAtt.endOffset() + offset);
      }
      if (storePayloads) {
        final BytesRef payload = payloadAtt.getPayload();
        final int pIndex;
        if (payload == null || payload.length == 0) {
          pIndex = -1;
        } else {
          pIndex = payloadsBytesRefs.append(payload);
        }
        postingsWriter.writeInt(pIndex);
      }
      sliceArray.end[ord] = postingsWriter.getCurrentOffset();
    }
    stream.end();

    // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
    if (numTokens > 0) {
      fields.put(
          fieldName,
          new Info(
              fieldInfo,
              terms,
              sliceArray,
              numTokens,
              numOverlapTokens,
              boost,
              pos,
              offsetAtt.endOffset() + offset,
              sumTotalTermFreq));
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
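/*
 * A minimal usage sketch, not part of the original snippets: the class and method names below are
 * assumed from the public MemoryIndex API, where addField(String, String, Analyzer) analyzes the
 * text and feeds the resulting token stream into the addField(...) overload shown above, and
 * search(Query) scores the single in-memory document. The analyzer is assumed to be supplied by
 * the caller.
 */
static float scoreInMemory(Analyzer analyzer) {
  MemoryIndex index = new MemoryIndex();
  // analyze and index one pseudo-field of the single in-memory document
  index.addField("content", "the quick brown fox", analyzer);
  // returns a score > 0 if the query matches the in-memory document, 0 otherwise
  return index.search(new TermQuery(new Term("content", "fox")));
}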
/**
 * Converts the list of Tokens to a list of NamedLists representing the tokens.
 *
 * @param tokens Tokens to convert
 * @param context The analysis context
 * @return List of NamedLists containing the relevant information taken from the tokens
 */
private List<NamedList> convertTokensToNamedLists(
    final List<AttributeSource> tokens, AnalysisContext context) {
  final List<NamedList> tokensNamedLists = new ArrayList<NamedList>();

  final int[] positions = new int[tokens.size()];
  int position = 0;
  for (int i = 0, c = tokens.size(); i < c; i++) {
    AttributeSource token = tokens.get(i);
    position += token.addAttribute(PositionIncrementAttribute.class).getPositionIncrement();
    positions[i] = position;
  }

  // sort the tokens by absolute position
  new SorterTemplate() {
    @Override
    protected void swap(int i, int j) {
      final int p = positions[i];
      positions[i] = positions[j];
      positions[j] = p;
      Collections.swap(tokens, i, j);
    }

    @Override
    protected int compare(int i, int j) {
      return positions[i] - positions[j];
    }

    @Override
    protected void setPivot(int i) {
      pivot = positions[i];
    }

    @Override
    protected int comparePivot(int j) {
      return pivot - positions[j];
    }

    private int pivot;
  }.mergeSort(0, tokens.size() - 1);

  FieldType fieldType = context.getFieldType();

  final CharArr textBuf = new CharArr();
  for (int i = 0, c = tokens.size(); i < c; i++) {
    AttributeSource token = tokens.get(i);
    final NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>();
    final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
    BytesRef rawBytes = termAtt.getBytesRef();
    termAtt.fillBytesRef();

    textBuf.reset();
    fieldType.indexedToReadable(rawBytes, textBuf);
    final String text = textBuf.toString();

    tokenNamedList.add("text", text);

    if (token.hasAttribute(CharTermAttribute.class)) {
      final String rawText = token.getAttribute(CharTermAttribute.class).toString();
      if (!rawText.equals(text)) {
        tokenNamedList.add("raw_text", rawText);
      }
    }

    tokenNamedList.add("raw_bytes", rawBytes.toString());

    if (context.getTermsToMatch().contains(rawBytes)) {
      tokenNamedList.add("match", true);
    }

    tokenNamedList.add("position", positions[i]);

    token.reflectWith(
        new AttributeReflector() {
          public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
            // leave out position and bytes term
            if (TermToBytesRefAttribute.class.isAssignableFrom(attClass)) return;
            if (CharTermAttribute.class.isAssignableFrom(attClass)) return;
            if (PositionIncrementAttribute.class.isAssignableFrom(attClass)) return;

            String k = attClass.getName() + '#' + key;

            // map keys for "standard attributes":
            if (ATTRIBUTE_MAPPING.containsKey(k)) {
              k = ATTRIBUTE_MAPPING.get(k);
            }

            if (value instanceof Payload) {
              final Payload p = (Payload) value;
              value = new BytesRef(p.getData()).toString();
            }

            tokenNamedList.add(k, value);
          }
        });

    tokensNamedLists.add(tokenNamedList);
  }

  return tokensNamedLists;
}