@Override public int nextDoc() throws IOException { if (docID == NO_MORE_DOCS) { return docID; } boolean first = true; int termFreq = 0; while (true) { final long lineStart = in.getFilePointer(); SimpleTextUtil.readLine(in, scratch); if (StringHelper.startsWith(scratch, DOC)) { if (!first && (liveDocs == null || liveDocs.get(docID))) { in.seek(lineStart); if (!omitTF) { tf = termFreq; } return docID; } UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16); docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); termFreq = 0; first = false; } else if (StringHelper.startsWith(scratch, FREQ)) { UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16); termFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); } else if (StringHelper.startsWith(scratch, POS)) { // skip termFreq++; } else if (StringHelper.startsWith(scratch, START_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, END_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, PAYLOAD)) { // skip } else { assert StringHelper.startsWith(scratch, TERM) || StringHelper.startsWith(scratch, FIELD) || StringHelper.startsWith(scratch, END) : "scratch=" + scratch.utf8ToString(); if (!first && (liveDocs == null || liveDocs.get(docID))) { in.seek(lineStart); if (!omitTF) { tf = termFreq; } return docID; } return docID = NO_MORE_DOCS; } } }
@Override public int nextPosition() throws IOException { final int pos; if (readPositions) { SimpleTextUtil.readLine(in, scratch); assert StringHelper.startsWith(scratch, POS) : "got line=" + scratch.utf8ToString(); UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + POS.length, scratch.length - POS.length, scratchUTF16_2); pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length); } else { pos = -1; } if (readOffsets) { SimpleTextUtil.readLine(in, scratch); assert StringHelper.startsWith(scratch, START_OFFSET) : "got line=" + scratch.utf8ToString(); UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + START_OFFSET.length, scratch.length - START_OFFSET.length, scratchUTF16_2); startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length); SimpleTextUtil.readLine(in, scratch); assert StringHelper.startsWith(scratch, END_OFFSET) : "got line=" + scratch.utf8ToString(); UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + END_OFFSET.length, scratch.length - END_OFFSET.length, scratchUTF16_2); endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length); } final long fp = in.getFilePointer(); SimpleTextUtil.readLine(in, scratch); if (StringHelper.startsWith(scratch, PAYLOAD)) { final int len = scratch.length - PAYLOAD.length; if (scratch2.bytes.length < len) { scratch2.grow(len); } System.arraycopy(scratch.bytes, PAYLOAD.length, scratch2.bytes, 0, len); scratch2.length = len; payload = scratch2; } else { payload = null; in.seek(fp); } return pos; }
@Override public int nextDoc() throws IOException { boolean first = true; in.seek(nextDocStart); long posStart = 0; while (true) { final long lineStart = in.getFilePointer(); SimpleTextUtil.readLine(in, scratch); // System.out.println("NEXT DOC: " + scratch.utf8ToString()); if (StringHelper.startsWith(scratch, DOC)) { if (!first && (liveDocs == null || liveDocs.get(docID))) { nextDocStart = lineStart; in.seek(posStart); return docID; } UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16); docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); tf = 0; first = false; } else if (StringHelper.startsWith(scratch, FREQ)) { UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16); tf = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); posStart = in.getFilePointer(); } else if (StringHelper.startsWith(scratch, POS)) { // skip } else if (StringHelper.startsWith(scratch, START_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, END_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, PAYLOAD)) { // skip } else { assert StringHelper.startsWith(scratch, TERM) || StringHelper.startsWith(scratch, FIELD) || StringHelper.startsWith(scratch, END); if (!first && (liveDocs == null || liveDocs.get(docID))) { nextDocStart = lineStart; in.seek(posStart); return docID; } return docID = NO_MORE_DOCS; } } }
@Override public List<LookupResult> lookup(CharSequence key, boolean onlyMorePopular, int num) { assert num > 0; BytesRef scratch = new BytesRef(key); int prefixLength = scratch.length; Arc<Long> arc = new Arc<Long>(); // match the prefix portion exactly Long prefixOutput = null; try { prefixOutput = lookupPrefix(scratch, arc); } catch (IOException bogus) { throw new RuntimeException(bogus); } if (prefixOutput == null) { return Collections.<LookupResult>emptyList(); } List<LookupResult> results = new ArrayList<LookupResult>(num); CharsRef spare = new CharsRef(); if (exactFirst && arc.isFinal()) { spare.grow(scratch.length); UnicodeUtil.UTF8toUTF16(scratch, spare); results.add( new LookupResult(spare.toString(), decodeWeight(prefixOutput + arc.nextFinalOutput))); if (--num == 0) { return results; // that was quick } } // complete top-N MinResult<Long> completions[] = null; try { completions = Util.shortestPaths(fst, arc, weightComparator, num); } catch (IOException bogus) { throw new RuntimeException(bogus); } BytesRef suffix = new BytesRef(8); for (MinResult<Long> completion : completions) { scratch.length = prefixLength; // append suffix Util.toBytesRef(completion.input, suffix); scratch.append(suffix); spare.grow(scratch.length); UnicodeUtil.UTF8toUTF16(scratch, spare); results.add( new LookupResult(spare.toString(), decodeWeight(prefixOutput + completion.output))); } return results; }
@Override public String strVal(int doc) { int ord = termsIndex.getOrd(doc); if (ord == 0) return null; termsIndex.lookup(ord, spare); UnicodeUtil.UTF8toUTF16(spare, spareChars); return spareChars.toString(); }
@Override public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) { // TODO: this could be more efficient, but the sortable types should be deprecated instead UnicodeUtil.UTF8toUTF16(input, charsRef); final char[] indexedToReadable = indexedToReadable(charsRef.toString()).toCharArray(); charsRef.copyChars(indexedToReadable, 0, indexedToReadable.length); return charsRef; }
@Override public String toUtf8() { if (length() == 0) { return ""; } byte[] bytes = toBytes(); final CharsRef ref = new CharsRef(length); UnicodeUtil.UTF8toUTF16(bytes, offset, length, ref); return ref.toString(); }
@Override public List<LookupResult> lookup(CharSequence key, boolean higherWeightsFirst, int num) { final List<Completion> completions; if (higherWeightsFirst) { completions = higherWeightsCompletion.lookup(key, num); } else { completions = normalCompletion.lookup(key, num); } final ArrayList<LookupResult> results = new ArrayList<LookupResult>(completions.size()); CharsRef spare = new CharsRef(); for (Completion c : completions) { spare.grow(c.utf8.length); UnicodeUtil.UTF8toUTF16(c.utf8, spare); results.add(new LookupResult(spare.toString(), c.bucket)); } return results; }
@Override public void build(InputIterator tfit) throws IOException { if (tfit.hasPayloads()) { throw new IllegalArgumentException("this suggester doesn't support payloads"); } root = new TernaryTreeNode(); // buffer first if (tfit.getComparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) { // make sure it's sorted and the comparator uses UTF16 sort order tfit = new SortedInputIterator(tfit, BytesRef.getUTF8SortedAsUTF16Comparator()); } ArrayList<String> tokens = new ArrayList<String>(); ArrayList<Number> vals = new ArrayList<Number>(); BytesRef spare; CharsRef charsSpare = new CharsRef(); while ((spare = tfit.next()) != null) { charsSpare.grow(spare.length); UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare); tokens.add(charsSpare.toString()); vals.add(Long.valueOf(tfit.weight())); } autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root); }
@Override protected AcceptStatus accept(BytesRef term) throws IOException { UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); return runAutomaton.run(utf16.result, 0, utf16.length) ? AcceptStatus.YES : AcceptStatus.NO; }
@Override public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) { UnicodeUtil.UTF8toUTF16(input, charsRef); charsRef.append(Z_ARRAY, 0, 1); return charsRef; }
private void loadTerms() throws IOException { PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b; final PairOutputs<Long, Long> outputsInner = new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs); final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs = new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner); b = new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>( FST.INPUT_TYPE.BYTE1, outputs); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRef lastTerm = new BytesRef(10); long lastDocsStart = -1; int docFreq = 0; long totalTermFreq = 0; OpenBitSet visitedDocs = new OpenBitSet(); final IntsRef scratchIntsRef = new IntsRef(); while (true) { SimpleTextUtil.readLine(in, scratch); if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) { if (lastDocsStart != -1) { b.add( Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair( lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); sumTotalTermFreq += totalTermFreq; } break; } else if (StringHelper.startsWith(scratch, DOC)) { docFreq++; sumDocFreq++; UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16); int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); visitedDocs.set(docID); } else if (StringHelper.startsWith(scratch, FREQ)) { UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16); totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); } else if (StringHelper.startsWith(scratch, TERM)) { if (lastDocsStart != -1) { b.add( Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair( lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); } lastDocsStart = in.getFilePointer(); final int len = scratch.length - TERM.length; if (len > lastTerm.length) { lastTerm.grow(len); } System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len); lastTerm.length = len; docFreq = 0; sumTotalTermFreq += totalTermFreq; totalTermFreq = 0; termCount++; } } docCount = (int) visitedDocs.cardinality(); fst = b.finish(); /* PrintStream ps = new PrintStream("out.dot"); fst.toDot(ps); ps.close(); System.out.println("SAVED out.dot"); */ // System.out.println("FST " + fst.sizeInBytes()); }
@Override protected Suggest.Suggestion< ? extends Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option>> innerExecute( String name, CompletionSuggestionContext suggestionContext, IndexReader indexReader, CharsRef spare) throws IOException { if (suggestionContext.mapper() == null || !(suggestionContext.mapper() instanceof CompletionFieldMapper)) { throw new ElasticsearchException( "Field [" + suggestionContext.getField() + "] is not a completion suggest field"); } CompletionSuggestion completionSuggestion = new CompletionSuggestion(name, suggestionContext.getSize()); UnicodeUtil.UTF8toUTF16(suggestionContext.getText(), spare); CompletionSuggestion.Entry completionSuggestEntry = new CompletionSuggestion.Entry(new StringText(spare.toString()), 0, spare.length()); completionSuggestion.addTerm(completionSuggestEntry); String fieldName = suggestionContext.getField(); Map<String, CompletionSuggestion.Entry.Option> results = Maps.newHashMapWithExpectedSize(indexReader.leaves().size() * suggestionContext.getSize()); for (AtomicReaderContext atomicReaderContext : indexReader.leaves()) { AtomicReader atomicReader = atomicReaderContext.reader(); Terms terms = atomicReader.fields().terms(fieldName); if (terms instanceof Completion090PostingsFormat.CompletionTerms) { final Completion090PostingsFormat.CompletionTerms lookupTerms = (Completion090PostingsFormat.CompletionTerms) terms; final Lookup lookup = lookupTerms.getLookup(suggestionContext.mapper(), suggestionContext); if (lookup == null) { // we don't have a lookup for this segment.. this might be possible if a merge dropped all // docs from the segment that had a value in this segment. continue; } List<Lookup.LookupResult> lookupResults = lookup.lookup(spare, false, suggestionContext.getSize()); for (Lookup.LookupResult res : lookupResults) { final String key = res.key.toString(); final float score = res.value; final Option value = results.get(key); if (value == null) { final Option option = new CompletionSuggestion.Entry.Option( new StringText(key), score, res.payload == null ? null : new BytesArray(res.payload)); results.put(key, option); } else if (value.getScore() < score) { value.setScore(score); value.setPayload(res.payload == null ? null : new BytesArray(res.payload)); } } } } final List<CompletionSuggestion.Entry.Option> options = new ArrayList<CompletionSuggestion.Entry.Option>(results.values()); CollectionUtil.introSort(options, scoreComparator); int optionCount = Math.min(suggestionContext.getSize(), options.size()); for (int i = 0; i < optionCount; i++) { completionSuggestEntry.addOption(options.get(i)); } return completionSuggestion; }