/**
 * Replaces this instance's FST with one deserialized from {@code input}.
 *
 * <p>The stream is handed to {@code IOUtils.close} in a {@code finally} clause, so it is closed
 * whether or not the FST could be read.
 *
 * @param input stream positioned at the start of a serialized FST
 * @return {@code true} whenever the FST was read without an exception
 * @throws IOException if reading the FST or closing the stream fails
 */
@Override
public boolean load(InputStream input) throws IOException {
  try {
    final InputStreamDataInput serialized = new InputStreamDataInput(input);
    final PositiveIntOutputs weightOutputs = PositiveIntOutputs.getSingleton(true);
    this.fst = new FST<Long>(serialized, weightOutputs);
  } finally {
    IOUtils.close(input);
  }
  return true;
}
public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException { this.fieldInfo = fieldInfo; fstOutputs = PositiveIntOutputs.getSingleton(true); fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, fstOutputs); indexStart = out.getFilePointer(); //// System.out.println("VGW: field=" + fieldInfo.name); // Always put empty string in fstBuilder.add(new IntsRef(), termsFilePointer); startTermsFilePointer = termsFilePointer; }
@Override public void build(TermFreqIterator iterator) throws IOException { BytesRef scratch = new BytesRef(); TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator, BytesRef.getUTF8SortedAsUnicodeComparator()); IntsRef scratchInts = new IntsRef(); BytesRef previous = null; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); while ((scratch = iter.next()) != null) { long cost = iter.weight(); if (previous == null) { previous = new BytesRef(); } else if (scratch.equals(previous)) { continue; // for duplicate suggestions, the best weight is actually // added } Util.toIntsRef(scratch, scratchInts); builder.add(scratchInts, cost); previous.copyBytes(scratch); } fst = builder.finish(); }
private void loadTerms() throws IOException { PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b; final PairOutputs<Long, Long> outputsInner = new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs); final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs = new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner); b = new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>( FST.INPUT_TYPE.BYTE1, outputs); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRef lastTerm = new BytesRef(10); long lastDocsStart = -1; int docFreq = 0; long totalTermFreq = 0; OpenBitSet visitedDocs = new OpenBitSet(); final IntsRef scratchIntsRef = new IntsRef(); while (true) { SimpleTextUtil.readLine(in, scratch); if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) { if (lastDocsStart != -1) { b.add( Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair( lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); sumTotalTermFreq += totalTermFreq; } break; } else if (StringHelper.startsWith(scratch, DOC)) { docFreq++; sumDocFreq++; UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16); int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); visitedDocs.set(docID); } else if (StringHelper.startsWith(scratch, FREQ)) { UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16); totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); } else if (StringHelper.startsWith(scratch, TERM)) { if (lastDocsStart != -1) { b.add( Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair( lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); } lastDocsStart = in.getFilePointer(); final int len = scratch.length - 
TERM.length; if (len > lastTerm.length) { lastTerm.grow(len); } System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len); lastTerm.length = len; docFreq = 0; sumTotalTermFreq += totalTermFreq; totalTermFreq = 0; termCount++; } } docCount = (int) visitedDocs.cardinality(); fst = b.finish(); /* PrintStream ps = new PrintStream("out.dot"); fst.toDot(ps); ps.close(); System.out.println("SAVED out.dot"); */ // System.out.println("FST " + fst.sizeInBytes()); }
private UserDictionary(List<String[]> featureEntries) throws IOException { int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET; // TODO: should we allow multiple segmentations per input 'phrase'? // the old treemap didn't support this either, and i'm not sure if it's needed/useful? Collections.sort( featureEntries, new Comparator<String[]>() { @Override public int compare(String[] left, String[] right) { return left[0].compareTo(right[0]); } }); List<String> data = new ArrayList<>(featureEntries.size()); List<int[]> segmentations = new ArrayList<>(featureEntries.size()); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput); IntsRefBuilder scratch = new IntsRefBuilder(); long ord = 0; for (String[] values : featureEntries) { String[] segmentation = values[1].replaceAll(" *", " ").split(" "); String[] readings = values[2].replaceAll(" *", " ").split(" "); String pos = values[3]; if (segmentation.length != readings.length) { throw new RuntimeException( "Illegal user dictionary entry " + values[0] + " - the number of segmentations (" + segmentation.length + ")" + " does not the match number of readings (" + readings.length + ")"); } int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length.... wordIdAndLength[0] = wordId; for (int i = 0; i < segmentation.length; i++) { wordIdAndLength[i + 1] = segmentation[i].length(); data.add(readings[i] + INTERNAL_SEPARATOR + pos); wordId++; } // add mapping to FST String token = values[0]; scratch.grow(token.length()); scratch.setLength(token.length()); for (int i = 0; i < token.length(); i++) { scratch.setIntAt(i, (int) token.charAt(i)); } fstBuilder.add(scratch.get(), ord); segmentations.add(wordIdAndLength); ord++; } this.fst = new TokenInfoFST(fstBuilder.finish(), false); this.data = data.toArray(new String[data.size()]); this.segmentations = segmentations.toArray(new int[segmentations.size()][]); }
@Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { final FSTEntry entry = fsts.get(field.name); if (entry.numOrds == 0) { return DocValues.emptySortedSet(); // empty FST! } FST<Long> instance; synchronized (this) { instance = fstInstances.get(field.name); if (instance == null) { data.seek(entry.offset); instance = new FST<>(data, PositiveIntOutputs.getSingleton()); if (!merging) { ramBytesUsed.addAndGet(instance.ramBytesUsed()); fstInstances.put(field.name, instance); } } } final BinaryDocValues docToOrds = getBinary(field); final FST<Long> fst = instance; // per-thread resources final BytesReader in = fst.getBytesReader(); final Arc<Long> firstArc = new Arc<>(); final Arc<Long> scratchArc = new Arc<>(); final IntsRefBuilder scratchInts = new IntsRefBuilder(); final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst); final ByteArrayDataInput input = new ByteArrayDataInput(); return new SortedSetDocValues() { final BytesRefBuilder term = new BytesRefBuilder(); BytesRef ordsRef; long currentOrd; @Override public long nextOrd() { if (input.eof()) { return NO_MORE_ORDS; } else { currentOrd += input.readVLong(); return currentOrd; } } @Override public void setDocument(int docID) { ordsRef = docToOrds.get(docID); input.reset(ordsRef.bytes, ordsRef.offset, ordsRef.length); currentOrd = 0; } @Override public BytesRef lookupOrd(long ord) { try { in.setPosition(0); fst.getFirstArc(firstArc); IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts); term.grow(output.length); term.clear(); return Util.toBytesRef(output, term); } catch (IOException bogus) { throw new RuntimeException(bogus); } } @Override public long lookupTerm(BytesRef key) { try { InputOutput<Long> o = fstEnum.seekCeil(key); if (o == null) { return -getValueCount() - 1; } else if (o.input.equals(key)) { return o.output.intValue(); } else { return -o.output - 1; } } catch (IOException bogus) { throw new RuntimeException(bogus); } } @Override 
public long getValueCount() { return entry.numOrds; } @Override public TermsEnum termsEnum() { return new FSTTermsEnum(fst); } }; }
@Override public SortedDocValues getSorted(FieldInfo field) throws IOException { final FSTEntry entry = fsts.get(field.name); FST<Long> instance; synchronized (this) { instance = fstInstances.get(field.name); if (instance == null) { data.seek(entry.offset); instance = new FST<>(data, PositiveIntOutputs.getSingleton()); if (!merging) { ramBytesUsed.addAndGet(instance.ramBytesUsed()); fstInstances.put(field.name, instance); } } } final NumericDocValues docToOrd = getNumeric(field); final FST<Long> fst = instance; // per-thread resources final BytesReader in = fst.getBytesReader(); final Arc<Long> firstArc = new Arc<>(); final Arc<Long> scratchArc = new Arc<>(); final IntsRefBuilder scratchInts = new IntsRefBuilder(); final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst); return new SortedDocValues() { final BytesRefBuilder term = new BytesRefBuilder(); @Override public int getOrd(int docID) { return (int) docToOrd.get(docID); } @Override public BytesRef lookupOrd(int ord) { try { in.setPosition(0); fst.getFirstArc(firstArc); IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts); term.grow(output.length); term.clear(); return Util.toBytesRef(output, term); } catch (IOException bogus) { throw new RuntimeException(bogus); } } @Override public int lookupTerm(BytesRef key) { try { InputOutput<Long> o = fstEnum.seekCeil(key); if (o == null) { return -getValueCount() - 1; } else if (o.input.equals(key)) { return o.output.intValue(); } else { return (int) -o.output - 1; } } catch (IOException bogus) { throw new RuntimeException(bogus); } } @Override public int getValueCount() { return (int) entry.numOrds; } @Override public TermsEnum termsEnum() { return new FSTTermsEnum(fst); } }; }