コード例 #1
0
  @Override
  public List<LookupResult> lookup(CharSequence key, boolean onlyMorePopular, int num) {
    assert num > 0;
    BytesRef scratch = new BytesRef(key);
    int prefixLength = scratch.length;
    Arc<Long> arc = new Arc<Long>();

    // match the prefix portion exactly
    Long prefixOutput = null;
    try {
      prefixOutput = lookupPrefix(scratch, arc);
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }

    if (prefixOutput == null) {
      return Collections.<LookupResult>emptyList();
    }

    List<LookupResult> results = new ArrayList<LookupResult>(num);
    CharsRef spare = new CharsRef();
    if (exactFirst && arc.isFinal()) {
      spare.grow(scratch.length);
      UnicodeUtil.UTF8toUTF16(scratch, spare);
      results.add(
          new LookupResult(spare.toString(), decodeWeight(prefixOutput + arc.nextFinalOutput)));
      if (--num == 0) {
        return results; // that was quick
      }
    }

    // complete top-N
    MinResult<Long> completions[] = null;
    try {
      completions = Util.shortestPaths(fst, arc, weightComparator, num);
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }

    BytesRef suffix = new BytesRef(8);
    for (MinResult<Long> completion : completions) {
      scratch.length = prefixLength;
      // append suffix
      Util.toBytesRef(completion.input, suffix);
      scratch.append(suffix);
      spare.grow(scratch.length);
      UnicodeUtil.UTF8toUTF16(scratch, spare);
      results.add(
          new LookupResult(spare.toString(), decodeWeight(prefixOutput + completion.output)));
    }
    return results;
  }
コード例 #2
0
 @Override
 public void seekExact(long ord) throws IOException {
   // TODO: would be better to make this simpler and faster.
   // but we dont want to introduce a bug that corrupts our enum state!
   bytesReader.setPosition(0);
   fst.getFirstArc(firstArc);
   IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts);
   BytesRefBuilder scratchBytes = new BytesRefBuilder();
   scratchBytes.clear();
   Util.toBytesRef(output, scratchBytes);
   // TODO: we could do this lazily, better to try to push into FSTEnum though?
   in.seekExact(scratchBytes.get());
 }
コード例 #3
0
 // TODO: maybe we could add bulk-add method to
 // Builder?  Takes FST and unions it w/ current
 // FST.
 private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex) throws IOException {
   final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<BytesRef>(subIndex);
   BytesRefFSTEnum.InputOutput<BytesRef> indexEnt;
   while ((indexEnt = subIndexEnum.next()) != null) {
     // if (DEBUG) {
     //  System.out.println("      add sub=" + indexEnt.input + " " + indexEnt.input + " output="
     // + indexEnt.output);
     // }
     builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
   }
 }
コード例 #4
0
    @Override
    public void finishTerm(BytesRef text, TermStats stats) throws IOException {

      assert stats.docFreq > 0;
      // if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" +
      // toString(text) + " seg=" + segment + " df=" + stats.docFreq);

      blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput());
      pending.add(new PendingTerm(BytesRef.deepCopyOf(text), stats));
      postingsWriter.finishTerm(stats);
      numTerms++;
    }
コード例 #5
0
 /**
  * Load frame for target arc(node) on fst, so that arc.label >= label and
  * !fsa.reject(arc.label)
  */
 Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException {
   FST.Arc<FSTTermOutputs.TermData> arc = frame.fstArc;
   arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader);
   if (arc == null) {
     return null;
   }
   frame.fsaState = fsa.step(top.fsaState, arc.label);
   // if (TEST) System.out.println(" loadCeil frame="+frame);
   if (frame.fsaState == -1) {
     return loadNextFrame(top, frame);
   }
   return frame;
 }
コード例 #6
0
 @Override
 public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
   if (text.length == 0) {
     // We already added empty string in ctor
     assert termsFilePointer == startTermsFilePointer;
     return;
   }
   final int lengthSave = text.length;
   text.length = indexedTermPrefixLength(lastTerm, text);
   try {
     fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
   } finally {
     text.length = lengthSave;
   }
   lastTerm.copyBytes(text);
 }
コード例 #7
0
  @Override
  public void build(TermFreqIterator iterator) throws IOException {
    BytesRef scratch = new BytesRef();
    TermFreqIterator iter =
        new WFSTTermFreqIteratorWrapper(iterator, BytesRef.getUTF8SortedAsUnicodeComparator());
    IntsRef scratchInts = new IntsRef();
    BytesRef previous = null;
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
    while ((scratch = iter.next()) != null) {
      long cost = iter.weight();

      if (previous == null) {
        previous = new BytesRef();
      } else if (scratch.equals(previous)) {
        continue; // for duplicate suggestions, the best weight is actually
        // added
      }
      Util.toIntsRef(scratch, scratchInts);
      builder.add(scratchInts, cost);
      previous.copyBytes(scratch);
    }
    fst = builder.finish();
  }
コード例 #8
0
 private void loadTerms() throws IOException {
   PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
   final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b;
   final PairOutputs<Long, Long> outputsInner =
       new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs);
   final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs =
       new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner);
   b =
       new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>(
           FST.INPUT_TYPE.BYTE1, outputs);
   IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
   in.seek(termsStart);
   final BytesRef lastTerm = new BytesRef(10);
   long lastDocsStart = -1;
   int docFreq = 0;
   long totalTermFreq = 0;
   OpenBitSet visitedDocs = new OpenBitSet();
   final IntsRef scratchIntsRef = new IntsRef();
   while (true) {
     SimpleTextUtil.readLine(in, scratch);
     if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
       if (lastDocsStart != -1) {
         b.add(
             Util.toIntsRef(lastTerm, scratchIntsRef),
             outputs.newPair(
                 lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
         sumTotalTermFreq += totalTermFreq;
       }
       break;
     } else if (StringHelper.startsWith(scratch, DOC)) {
       docFreq++;
       sumDocFreq++;
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + DOC.length,
           scratch.length - DOC.length,
           scratchUTF16);
       int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
       visitedDocs.set(docID);
     } else if (StringHelper.startsWith(scratch, FREQ)) {
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + FREQ.length,
           scratch.length - FREQ.length,
           scratchUTF16);
       totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
     } else if (StringHelper.startsWith(scratch, TERM)) {
       if (lastDocsStart != -1) {
         b.add(
             Util.toIntsRef(lastTerm, scratchIntsRef),
             outputs.newPair(
                 lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
       }
       lastDocsStart = in.getFilePointer();
       final int len = scratch.length - TERM.length;
       if (len > lastTerm.length) {
         lastTerm.grow(len);
       }
       System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
       lastTerm.length = len;
       docFreq = 0;
       sumTotalTermFreq += totalTermFreq;
       totalTermFreq = 0;
       termCount++;
     }
   }
   docCount = (int) visitedDocs.cardinality();
   fst = b.finish();
   /*
   PrintStream ps = new PrintStream("out.dot");
   fst.toDot(ps);
   ps.close();
   System.out.println("SAVED out.dot");
   */
   // System.out.println("FST " + fst.sizeInBytes());
 }
コード例 #9
0
 @SuppressWarnings("unused")
 private void printSeekState(PrintStream out) throws IOException {
   if (currentFrame == staticFrame) {
     out.println("  no prior seek");
   } else {
     out.println("  prior seek state:");
     int ord = 0;
     boolean isSeekFrame = true;
     while (true) {
       IDVersionSegmentTermsEnumFrame f = getFrame(ord);
       assert f != null;
       final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix);
       if (f.nextEnt == -1) {
         out.println(
             "    frame "
                 + (isSeekFrame ? "(seek)" : "(next)")
                 + " ord="
                 + ord
                 + " fp="
                 + f.fp
                 + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "")
                 + " prefixLen="
                 + f.prefix
                 + " prefix="
                 + brToString(prefix)
                 + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")"))
                 + " hasTerms="
                 + f.hasTerms
                 + " isFloor="
                 + f.isFloor
                 + " code="
                 + ((f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS)
                     + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0)
                     + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0))
                 + " isLastInFloor="
                 + f.isLastInFloor
                 + " mdUpto="
                 + f.metaDataUpto
                 + " tbOrd="
                 + f.getTermBlockOrd());
       } else {
         out.println(
             "    frame "
                 + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)")
                 + " ord="
                 + ord
                 + " fp="
                 + f.fp
                 + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "")
                 + " prefixLen="
                 + f.prefix
                 + " prefix="
                 + brToString(prefix)
                 + " nextEnt="
                 + f.nextEnt
                 + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")"))
                 + " hasTerms="
                 + f.hasTerms
                 + " isFloor="
                 + f.isFloor
                 + " code="
                 + ((f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS)
                     + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0)
                     + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0))
                 + " lastSubFP="
                 + f.lastSubFP
                 + " isLastInFloor="
                 + f.isLastInFloor
                 + " mdUpto="
                 + f.metaDataUpto
                 + " tbOrd="
                 + f.getTermBlockOrd());
       }
       if (fr.index != null) {
         assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
         if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix - 1) & 0xFF)) {
           out.println(
               "      broken seek state: arc.label="
                   + (char) f.arc.label
                   + " vs term byte="
                   + (char) (term.byteAt(f.prefix - 1) & 0xFF));
           throw new RuntimeException("seek state is broken");
         }
         Pair<BytesRef, Long> output = Util.get(fr.index, prefix);
         if (output == null) {
           out.println("      broken seek state: prefix is not final in index");
           throw new RuntimeException("seek state is broken");
         } else if (isSeekFrame && !f.isFloor) {
           final ByteArrayDataInput reader =
               new ByteArrayDataInput(
                   output.output1.bytes, output.output1.offset, output.output1.length);
           final long codeOrig = reader.readVLong();
           final long code =
               (f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS)
                   | (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0)
                   | (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0);
           if (codeOrig != code) {
             out.println(
                 "      broken seek state: output code="
                     + codeOrig
                     + " doesn't match frame code="
                     + code);
             throw new RuntimeException("seek state is broken");
           }
         }
       }
       if (f == currentFrame) {
         break;
       }
       if (f.prefix == validIndexPrefix) {
         isSeekFrame = false;
       }
       ord++;
     }
   }
 }
コード例 #10
0
    /** Builds an {@link SynonymMap} and returns it. */
    public SynonymMap build() throws IOException {
      ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
      // TODO: are we using the best sharing options?
      org.apache.lucene.util.fst.Builder<BytesRef> builder =
          new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);

      BytesRefBuilder scratch = new BytesRefBuilder();
      ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

      final Set<Integer> dedupSet;

      if (dedup) {
        dedupSet = new HashSet<>();
      } else {
        dedupSet = null;
      }

      final byte[] spare = new byte[5];

      Set<CharsRef> keys = workingSet.keySet();
      CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
      Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());

      final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();

      // System.out.println("fmap.build");
      for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet.get(input);

        int numEntries = output.ords.size();
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

        scratch.grow(estimatedSize);
        scratchOutput.reset(scratch.bytes());

        // now write our output data:
        int count = 0;
        for (int i = 0; i < numEntries; i++) {
          if (dedupSet != null) {
            // box once
            final Integer ent = output.ords.get(i);
            if (dedupSet.contains(ent)) {
              continue;
            }
            dedupSet.add(ent);
          }
          scratchOutput.writeVInt(output.ords.get(i));
          count++;
        }

        final int pos = scratchOutput.getPosition();
        scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
        final int pos2 = scratchOutput.getPosition();
        final int vIntLen = pos2 - pos;

        // Move the count + includeOrig to the front of the byte[]:
        System.arraycopy(scratch.bytes(), pos, spare, 0, vIntLen);
        System.arraycopy(scratch.bytes(), 0, scratch.bytes(), vIntLen, pos);
        System.arraycopy(spare, 0, scratch.bytes(), 0, vIntLen);

        if (dedupSet != null) {
          dedupSet.clear();
        }

        scratch.setLength(scratchOutput.getPosition());
        // System.out.println("  add input=" + input + " output=" + scratch + " offset=" +
        // scratch.offset + " length=" + scratch.length + " count=" + count);
        builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
      }

      FST<BytesRef> fst = builder.finish();
      return new SynonymMap(fst, words, maxHorizontalContext);
    }
コード例 #11
0
    public void compileIndex(List<PendingBlock> floorBlocks, RAMOutputStream scratchBytes)
        throws IOException {

      assert (isFloor && floorBlocks != null && floorBlocks.size() != 0)
              || (!isFloor && floorBlocks == null)
          : "isFloor=" + isFloor + " floorBlocks=" + floorBlocks;

      assert scratchBytes.getFilePointer() == 0;

      // TODO: try writing the leading vLong in MSB order
      // (opposite of what Lucene does today), for better
      // outputs sharing in the FST
      scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
      if (isFloor) {
        scratchBytes.writeVInt(floorBlocks.size());
        for (PendingBlock sub : floorBlocks) {
          assert sub.floorLeadByte != -1;
          // if (DEBUG) {
          //  System.out.println("    write floorLeadByte=" +
          // Integer.toHexString(sub.floorLeadByte&0xff));
          // }
          scratchBytes.writeByte((byte) sub.floorLeadByte);
          assert sub.fp > fp;
          scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0));
        }
      }

      final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
      final Builder<BytesRef> indexBuilder =
          new Builder<BytesRef>(
              FST.INPUT_TYPE.BYTE1,
              0,
              0,
              true,
              false,
              Integer.MAX_VALUE,
              outputs,
              null,
              false,
              PackedInts.COMPACT,
              true,
              15);
      // if (DEBUG) {
      //  System.out.println("  compile index for prefix=" + prefix);
      // }
      // indexBuilder.DEBUG = false;
      final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()];
      assert bytes.length > 0;
      scratchBytes.writeTo(bytes, 0);
      indexBuilder.add(
          Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
      scratchBytes.reset();

      // Copy over index for all sub-blocks

      if (subIndices != null) {
        for (FST<BytesRef> subIndex : subIndices) {
          append(indexBuilder, subIndex);
        }
      }

      if (floorBlocks != null) {
        for (PendingBlock sub : floorBlocks) {
          if (sub.subIndices != null) {
            for (FST<BytesRef> subIndex : sub.subIndices) {
              append(indexBuilder, subIndex);
            }
          }
          sub.subIndices = null;
        }
      }

      index = indexBuilder.finish();
      subIndices = null;

      /*
      Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
      Util.toDot(index, w, false, false);
      System.out.println("SAVED to out.dot");
      w.close();
      */
    }