Ejemplo n.º 1
0
 /**
  * Copies every input/output pair of {@code subIndex} into {@code builder}.
  *
  * <p>Pairs are added in the order the enumeration returns them. Each input is converted with
  * {@code Util.toIntsRef} using the shared {@code scratchIntsRef} before it is handed to the
  * builder.
  *
  * @param builder the builder that receives the entries
  * @param subIndex the FST whose entries are enumerated
  * @throws IOException propagated from the FST enumeration or from the builder
  */
 private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex) throws IOException {
   // TODO: maybe we could add bulk-add method to
   // Builder?  Takes FST and unions it w/ current
   // FST.
   final BytesRefFSTEnum<BytesRef> entries = new BytesRefFSTEnum<BytesRef>(subIndex);
   for (BytesRefFSTEnum.InputOutput<BytesRef> entry = entries.next();
       entry != null;
       entry = entries.next()) {
     builder.add(Util.toIntsRef(entry.input, scratchIntsRef), entry.output);
   }
 }
    /**
     * Creates the writer for one field and seeds its FST with the empty term.
     *
     * <p>The current file pointer of {@code out} is recorded as {@code indexStart}, and the empty
     * input is immediately added to the FST builder with {@code termsFilePointer} as its output.
     *
     * @param fieldInfo the field this writer belongs to
     * @param termsFilePointer output stored for the empty term; also remembered as {@code
     *     startTermsFilePointer}
     * @throws IOException propagated from {@code out} or from the FST builder
     */
    public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
      this.fieldInfo = fieldInfo;
      startTermsFilePointer = termsFilePointer;
      indexStart = out.getFilePointer();

      fstOutputs = PositiveIntOutputs.getSingleton(true);
      fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, fstOutputs);

      // The empty string is always present in the index.
      fstBuilder.add(new IntsRef(), termsFilePointer);
    }
Ejemplo n.º 3
0
    /**
     * Records a finished term.
     *
     * <p>The term is added to {@code blockBuilder} with the "no output" value, a deep copy of its
     * bytes is queued in {@code pending} together with its stats, the postings writer is notified,
     * and the term counter is incremented.
     *
     * @param text the term bytes; copied before being stored in {@code pending}
     * @param stats the term statistics; {@code docFreq} is asserted to be positive
     * @throws IOException propagated from the block builder or the postings writer
     */
    @Override
    public void finishTerm(BytesRef text, TermStats stats) throws IOException {
      assert stats.docFreq > 0;

      blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput());

      // Queue a private copy of the term bytes rather than the caller's instance.
      final BytesRef termCopy = BytesRef.deepCopyOf(text);
      final PendingTerm pendingTerm = new PendingTerm(termCopy, stats);
      pending.add(pendingTerm);

      postingsWriter.finishTerm(stats);
      numTerms++;
    }
 /**
  * Adds an index entry for {@code text}.
  *
  * <p>Only the leading {@code indexedTermPrefixLength(lastTerm, text)} bytes of the term are
  * added to the FST, mapped to {@code termsFilePointer}. The empty term is skipped because the
  * constructor has already added it. After the call, {@code lastTerm} holds a copy of the full
  * (untruncated) term.
  *
  * @param text the term; its {@code length} is shortened temporarily and always restored
  * @param stats not read by this method
  * @param termsFilePointer the output stored for the indexed prefix
  * @throws IOException propagated from the FST builder
  */
 @Override
 public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
   if (text.length == 0) {
     // We already added empty string in ctor
     assert termsFilePointer == startTermsFilePointer;
     return;
   }

   final int fullLength = text.length;
   final int prefixLength = indexedTermPrefixLength(lastTerm, text);

   // Shrink the term in place so only the prefix is converted and added,
   // then put the real length back even if the builder throws.
   text.length = prefixLength;
   try {
     fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
   } finally {
     text.length = fullLength;
   }

   lastTerm.copyBytes(text);
 }
  /**
   * Builds the suggester FST from the given terms and weights and stores it in {@code fst}.
   *
   * <p>The input is wrapped in a {@code WFSTTermFreqIteratorWrapper} configured with the
   * UTF8-sorted-as-Unicode comparator. Each term returned by the wrapper is added to the FST with
   * the wrapper's weight as its output. A term equal to the one added immediately before it is
   * skipped.
   *
   * @param iterator source of terms and weights
   * @throws IOException propagated from the iterator or the FST builder
   */
  @Override
  public void build(TermFreqIterator iterator) throws IOException {
    final TermFreqIterator iter =
        new WFSTTermFreqIteratorWrapper(iterator, BytesRef.getUTF8SortedAsUnicodeComparator());
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
    final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
    final IntsRef scratchInts = new IntsRef();

    // Copy of the most recently added term; allocated lazily on the first term.
    BytesRef previous = null;

    for (BytesRef term = iter.next(); term != null; term = iter.next()) {
      final long cost = iter.weight();

      if (previous == null) {
        previous = new BytesRef();
      } else if (term.equals(previous)) {
        // for duplicate suggestions, the best weight is actually added
        continue;
      }

      Util.toIntsRef(term, scratchInts);
      builder.add(scratchInts, cost);
      previous.copyBytes(term);
    }

    fst = builder.finish();
  }
    /**
     * Builds an {@link SynonymMap} and returns it.
     *
     * <p>Every key of {@code workingSet} becomes one FST input (converted to UTF-32). Its output
     * is a byte sequence made of a vInt header followed by one vInt per ord. The header is {@code
     * (count << 1) | flag}, where the low bit is 0 when the entry's {@code includeOrig} is set and
     * 1 otherwise. When {@code dedup} is enabled, repeated ords within one entry are written only
     * once.
     *
     * @return the synonym map wrapping the finished FST
     * @throws IOException propagated from the FST builder
     */
    public SynonymMap build() throws IOException {
      final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
      // TODO: are we using the best sharing options?
      final org.apache.lucene.util.fst.Builder<BytesRef> fstBuilder =
          new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);

      final BytesRefBuilder scratch = new BytesRefBuilder();
      final ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
      final IntsRefBuilder utf32Scratch = new IntsRefBuilder();

      // Holds the header vInt while the ords are shifted to make room for it.
      final byte[] spare = new byte[5];

      // Only allocated when de-duplication was requested.
      final Set<Integer> dedupSet;
      if (dedup) {
        dedupSet = new HashSet<>();
      } else {
        dedupSet = null;
      }

      // Sort the inputs with the UTF16-sorted-as-UTF8 comparator before adding them.
      final Set<CharsRef> keys = workingSet.keySet();
      final CharsRef[] sortedKeys = keys.toArray(new CharsRef[keys.size()]);
      Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());

      for (CharsRef input : sortedKeys) {
        final MapEntry output = workingSet.get(input);
        final int numEntries = output.ords.size();

        // output size, assume the worst case: 5 bytes for the header and 5 for each ord
        scratch.grow(5 + numEntries * 5);
        final byte[] buffer = scratch.bytes();
        scratchOutput.reset(buffer);

        // Write the ords first; the header depends on how many survive de-duplication.
        int count = 0;
        for (int i = 0; i < numEntries; i++) {
          final int ord = output.ords.get(i);
          // Set.add returns false when the ord was already written for this entry.
          if (dedupSet != null && !dedupSet.add(ord)) {
            continue;
          }
          scratchOutput.writeVInt(ord);
          count++;
        }
        if (dedupSet != null) {
          dedupSet.clear();
        }

        final int ordsLength = scratchOutput.getPosition();
        final int header = (count << 1) | (output.includeOrig ? 0 : 1);
        scratchOutput.writeVInt(header);
        final int totalLength = scratchOutput.getPosition();
        final int headerLength = totalLength - ordsLength;

        // Move the count + includeOrig to the front of the byte[]:
        // stash the header, slide the ords right, then drop the header in at offset 0.
        System.arraycopy(buffer, ordsLength, spare, 0, headerLength);
        System.arraycopy(buffer, 0, buffer, headerLength, ordsLength);
        System.arraycopy(spare, 0, buffer, 0, headerLength);

        scratch.setLength(totalLength);
        fstBuilder.add(Util.toUTF32(input, utf32Scratch), scratch.toBytesRef());
      }

      final FST<BytesRef> fst = fstBuilder.finish();
      return new SynonymMap(fst, words, maxHorizontalContext);
    }
Ejemplo n.º 7
0
    /**
     * Builds the FST index for this block and stores it in {@code index}.
     *
     * <p>Steps, in order:
     *
     * <ol>
     *   <li>Serialize this block's header into {@code scratchBytes}: a vLong produced by {@code
     *       encodeOutput(fp, hasTerms, isFloor)}, and, for a floor block, the number of floor
     *       blocks followed by one (lead byte, vLong) pair per floor block.
     *   <li>Add a single entry to a new FST builder that maps {@code prefix} to those bytes.
     *   <li>Append the entries of this block's {@code subIndices} and of every floor block's
     *       {@code subIndices}.
     *   <li>Finish the builder into {@code index}.
     * </ol>
     *
     * <p>Side effects: {@code scratchBytes} is reset after its content is copied out, and {@code
     * subIndices} is set to {@code null} on this block and on every floor block.
     *
     * @param floorBlocks the floor blocks; asserted to be non-null and non-empty when {@code
     *     isFloor} is true, and {@code null} otherwise
     * @param scratchBytes scratch buffer; asserted to be empty (file pointer 0) on entry
     * @throws IOException propagated from the scratch stream or the FST builder
     */
    public void compileIndex(List<PendingBlock> floorBlocks, RAMOutputStream scratchBytes)
        throws IOException {

      assert (isFloor && floorBlocks != null && floorBlocks.size() != 0)
              || (!isFloor && floorBlocks == null)
          : "isFloor=" + isFloor + " floorBlocks=" + floorBlocks;

      assert scratchBytes.getFilePointer() == 0;

      // TODO: try writing the leading vLong in MSB order
      // (opposite of what Lucene does today), for better
      // outputs sharing in the FST
      scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
      if (isFloor) {
        scratchBytes.writeVInt(floorBlocks.size());
        for (PendingBlock sub : floorBlocks) {
          assert sub.floorLeadByte != -1;
          // if (DEBUG) {
          //  System.out.println("    write floorLeadByte=" +
          // Integer.toHexString(sub.floorLeadByte&0xff));
          // }
          scratchBytes.writeByte((byte) sub.floorLeadByte);
          // The sub-block's file pointer is stored as a delta from this block's fp
          // (asserted positive), shifted left one bit so the low bit can carry hasTerms.
          assert sub.fp > fp;
          scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0));
        }
      }

      final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
      // NOTE(review): the arguments below are positional; confirm each one's meaning against the
      // Builder constructor before changing any of them.
      final Builder<BytesRef> indexBuilder =
          new Builder<BytesRef>(
              FST.INPUT_TYPE.BYTE1,
              0,
              0,
              true,
              false,
              Integer.MAX_VALUE,
              outputs,
              null,
              false,
              PackedInts.COMPACT,
              true,
              15);
      // if (DEBUG) {
      //  System.out.println("  compile index for prefix=" + prefix);
      // }
      // indexBuilder.DEBUG = false;

      // Copy the serialized header out of the scratch stream; it becomes the FST output for
      // this block's prefix. The scratch stream is then reset.
      final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()];
      assert bytes.length > 0;
      scratchBytes.writeTo(bytes, 0);
      indexBuilder.add(
          Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
      scratchBytes.reset();

      // Copy over index for all sub-blocks

      if (subIndices != null) {
        for (FST<BytesRef> subIndex : subIndices) {
          append(indexBuilder, subIndex);
        }
      }

      if (floorBlocks != null) {
        for (PendingBlock sub : floorBlocks) {
          if (sub.subIndices != null) {
            for (FST<BytesRef> subIndex : sub.subIndices) {
              append(indexBuilder, subIndex);
            }
          }
          // The floor block's sub-indices have been copied into indexBuilder; drop the reference.
          sub.subIndices = null;
        }
      }

      index = indexBuilder.finish();
      // Likewise drop this block's own sub-indices now that they are part of index.
      subIndices = null;

      /*
      Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
      Util.toDot(index, w, false, false);
      System.out.println("SAVED to out.dot");
      w.close();
      */
    }
Ejemplo n.º 8
0
  /**
   * Builds the dictionary from already-parsed feature entries.
   *
   * <p>Each entry is read as: {@code [0]} the token used as FST key, {@code [1]} its segmentation
   * and {@code [2]} its readings (both split on runs of spaces), and {@code [3]} the
   * part-of-speech string. Entries are sorted in place by {@code [0]} before being added to the
   * FST, so the caller's list is reordered. The FST maps each token to its ordinal in that sorted
   * order; {@code segmentations[ordinal]} holds the first word id of the entry followed by the
   * length of every segment.
   *
   * @param featureEntries the parsed entries; sorted in place
   * @throws IOException propagated from the FST builder
   * @throws RuntimeException if an entry has a different number of segments and readings
   */
  private UserDictionary(List<String[]> featureEntries) throws IOException {
    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

    Collections.sort(
        featureEntries,
        new Comparator<String[]>() {
          @Override
          public int compare(String[] a, String[] b) {
            return a[0].compareTo(b[0]);
          }
        });

    final int entryCount = featureEntries.size();
    final List<String> featureData = new ArrayList<>(entryCount);
    final List<int[]> segmentationList = new ArrayList<>(entryCount);

    final PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    final Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
    final IntsRefBuilder scratch = new IntsRefBuilder();

    int nextWordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    long ord = 0;

    for (String[] values : featureEntries) {
      final String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
      final String[] readings = values[2].replaceAll("  *", " ").split(" ");
      final String pos = values[3];

      final int segmentCount = segmentation.length;
      if (segmentCount != readings.length) {
        throw new RuntimeException(
            "Illegal user dictionary entry "
                + values[0]
                + " - the number of segmentations ("
                + segmentCount
                + ")"
                + " does not the match number of readings ("
                + readings.length
                + ")");
      }

      // Layout: [first word id, length of segment 0, length of segment 1, ...]
      final int[] wordIdAndLength = new int[segmentCount + 1];
      wordIdAndLength[0] = nextWordId;
      for (int i = 0; i < segmentCount; i++) {
        wordIdAndLength[i + 1] = segmentation[i].length();
        featureData.add(readings[i] + INTERNAL_SEPARATOR + pos);
      }
      // Every segment consumes one word id.
      nextWordId += segmentCount;

      // add mapping to FST: one int per UTF-16 char of the token
      final String token = values[0];
      final int tokenLength = token.length();
      scratch.grow(tokenLength);
      scratch.setLength(tokenLength);
      for (int i = 0; i < tokenLength; i++) {
        scratch.setIntAt(i, (int) token.charAt(i));
      }
      fstBuilder.add(scratch.get(), ord);
      segmentationList.add(wordIdAndLength);
      ord++;
    }

    this.fst = new TokenInfoFST(fstBuilder.finish(), false);
    this.data = featureData.toArray(new String[featureData.size()]);
    this.segmentations = segmentationList.toArray(new int[segmentationList.size()][]);
  }