/**
 * Reads an FST from {@code input} and stores it in {@code this.fst}.
 *
 * <p>The stream is handed to {@code IOUtils.close} in a {@code finally} block, so it is closed
 * whether or not constructing the FST succeeds.
 *
 * @param input stream positioned at the serialized FST; closed before this method returns
 * @return always {@code true}
 * @throws IOException if the FST cannot be read from the stream
 */
@Override
 public boolean load(InputStream input) throws IOException {
   try {
     final InputStreamDataInput dataIn = new InputStreamDataInput(input);
     final PositiveIntOutputs weightOutputs = PositiveIntOutputs.getSingleton(true);
     this.fst = new FST<Long>(dataIn, weightOutputs);
   } finally {
     // Close the caller's stream on both the success and the failure path.
     IOUtils.close(input);
   }
   return true;
 }
    public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
      this.fieldInfo = fieldInfo;
      fstOutputs = PositiveIntOutputs.getSingleton(true);
      fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, fstOutputs);
      indexStart = out.getFilePointer();
      //// System.out.println("VGW: field=" + fieldInfo.name);

      // Always put empty string in
      fstBuilder.add(new IntsRef(), termsFilePointer);
      startTermsFilePointer = termsFilePointer;
    }
  /**
   * Builds {@code fst} from the terms and weights supplied by {@code iterator}.
   *
   * <p>The iterator is wrapped in a {@code WFSTTermFreqIteratorWrapper} using the
   * UTF8-sorted-as-unicode comparator. A term equal to the one added immediately before it is
   * skipped, so each distinct term is added once, with the weight of its first occurrence.
   *
   * @param iterator source of terms and weights
   * @throws IOException if iterating or adding to the FST builder fails
   */
  @Override
  public void build(TermFreqIterator iterator) throws IOException {
    final TermFreqIterator sorted =
        new WFSTTermFreqIteratorWrapper(iterator, BytesRef.getUTF8SortedAsUnicodeComparator());
    final IntsRef termAsInts = new IntsRef();
    final Builder<Long> fstBuilder =
        new Builder<Long>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton(true));

    // Copy of the most recently added term; null until the first term has been seen.
    BytesRef lastAdded = null;

    for (BytesRef term = sorted.next(); term != null; term = sorted.next()) {
      final long weight = sorted.weight();

      final boolean duplicate = lastAdded != null && term.equals(lastAdded);
      if (duplicate) {
        // For duplicate suggestions, the best weight is actually added.
        continue;
      }
      if (lastAdded == null) {
        lastAdded = new BytesRef();
      }

      Util.toIntsRef(term, termAsInts);
      fstBuilder.add(termAsInts, weight);
      lastAdded.copyBytes(term);
    }

    fst = fstBuilder.finish();
  }
 /**
  * Scans this field's section of the terms file, starting at {@code termsStart}, and builds
  * the in-memory term FST stored in {@code fst}.
  *
  * <p>Lines are read one at a time into {@code scratch} and dispatched on their prefix:
  *
  * <ul>
  *   <li>{@code TERM}: flushes the previous term (if any) into the FST, then starts a new term
  *       whose docs begin at the current file pointer;
  *   <li>{@code DOC}: increments the doc frequency counters and marks the doc ID as visited;
  *   <li>{@code FREQ}: adds the parsed value to the current term's total term frequency;
  *   <li>{@code END} or {@code FIELD}: flushes the last term and stops.
  * </ul>
  *
  * <p>Each FST entry maps the term bytes to the pair
  * {@code (docsStart, (docFreq, totalTermFreq))}. As a side effect this method updates
  * {@code sumTotalTermFreq}, {@code sumDocFreq}, {@code termCount} and {@code docCount}.
  *
  * @throws IOException if reading from the cloned input or building the FST fails
  */
 private void loadTerms() throws IOException {
   PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
   final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b;
   final PairOutputs<Long, Long> outputsInner =
       new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs);
   final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs =
       new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner);
   b =
       new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>(
           FST.INPUT_TYPE.BYTE1, outputs);
   // Work on a private clone so seeking here does not disturb the shared input.
   IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
   in.seek(termsStart);
   final BytesRef lastTerm = new BytesRef(10);
   // -1 means "no term seen yet"; otherwise the file pointer right after the TERM line.
   long lastDocsStart = -1;
   int docFreq = 0;
   long totalTermFreq = 0;
   OpenBitSet visitedDocs = new OpenBitSet();
   final IntsRef scratchIntsRef = new IntsRef();
   while (true) {
     SimpleTextUtil.readLine(in, scratch);
     if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
       // End of this field's terms: flush the pending term, if there is one.
       if (lastDocsStart != -1) {
         b.add(
             Util.toIntsRef(lastTerm, scratchIntsRef),
             outputs.newPair(
                 lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
         sumTotalTermFreq += totalTermFreq;
       }
       break;
     } else if (StringHelper.startsWith(scratch, DOC)) {
       docFreq++;
       sumDocFreq++;
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + DOC.length,
           scratch.length - DOC.length,
           scratchUTF16);
       int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
       visitedDocs.set(docID);
     } else if (StringHelper.startsWith(scratch, FREQ)) {
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + FREQ.length,
           scratch.length - FREQ.length,
           scratchUTF16);
       totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
     } else if (StringHelper.startsWith(scratch, TERM)) {
       // A new term begins: flush the previous one before overwriting lastTerm.
       if (lastDocsStart != -1) {
         b.add(
             Util.toIntsRef(lastTerm, scratchIntsRef),
             outputs.newPair(
                 lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
       }
       lastDocsStart = in.getFilePointer();
       final int len = scratch.length - TERM.length;
       if (len > lastTerm.length) {
         lastTerm.grow(len);
       }
       // Honor scratch.offset, consistent with the DOC and FREQ branches above.
       System.arraycopy(scratch.bytes, scratch.offset + TERM.length, lastTerm.bytes, 0, len);
       lastTerm.length = len;
       docFreq = 0;
       // Account for the term just flushed (0 for the very first TERM line).
       sumTotalTermFreq += totalTermFreq;
       totalTermFreq = 0;
       termCount++;
     }
   }
   docCount = (int) visitedDocs.cardinality();
   fst = b.finish();
   /*
   PrintStream ps = new PrintStream("out.dot");
   fst.toDot(ps);
   ps.close();
   System.out.println("SAVED out.dot");
   */
   // System.out.println("FST " + fst.sizeInBytes());
 }
Beispiel #5
0
  /**
   * Builds the user dictionary from parsed feature rows.
   *
   * <p>Each row is read as {@code [surface form, segmentation, readings, part of speech]}, where
   * segmentation and readings are space-separated lists that must have the same number of
   * elements. Rows are sorted by surface form (this sorts {@code featureEntries} in place) and
   * added to an FST that maps each surface form to its ordinal in sorted order.
   *
   * @param featureEntries parsed dictionary rows; reordered by this constructor
   * @throws IOException if building the FST fails
   * @throws RuntimeException if a row's segmentation count differs from its reading count
   */
  private UserDictionary(List<String[]> featureEntries) throws IOException {

    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

    // Rows are sorted by surface form before being added to the FST builder.
    Collections.sort(
        featureEntries,
        new Comparator<String[]>() {
          @Override
          public int compare(String[] left, String[] right) {
            return left[0].compareTo(right[0]);
          }
        });

    List<String> data = new ArrayList<>(featureEntries.size());
    List<int[]> segmentations = new ArrayList<>(featureEntries.size());

    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;

    for (String[] values : featureEntries) {
      // Collapse runs of spaces so split(" ") yields no empty elements between tokens.
      String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
      String[] readings = values[2].replaceAll("  *", " ").split(" ");
      String pos = values[3];

      if (segmentation.length != readings.length) {
        throw new RuntimeException(
            "Illegal user dictionary entry "
                + values[0]
                + " - the number of segmentations ("
                + segmentation.length
                + ")"
                + " does not match the number of readings ("
                + readings.length
                + ")");
      }

      int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
      wordIdAndLength[0] = wordId;
      for (int i = 0; i < segmentation.length; i++) {
        wordIdAndLength[i + 1] = segmentation[i].length();
        data.add(readings[i] + INTERNAL_SEPARATOR + pos);
        wordId++;
      }
      // add mapping to FST: one int per UTF-16 char of the surface form
      String token = values[0];
      scratch.grow(token.length());
      scratch.setLength(token.length());
      for (int i = 0; i < token.length(); i++) {
        scratch.setIntAt(i, (int) token.charAt(i));
      }
      fstBuilder.add(scratch.get(), ord);
      segmentations.add(wordIdAndLength);
      ord++;
    }
    this.fst = new TokenInfoFST(fstBuilder.finish(), false);
    this.data = data.toArray(new String[data.size()]);
    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
  }
  /**
   * Returns a {@link SortedSetDocValues} view for {@code field}, backed by an FST that maps
   * terms to ordinals and by the field's binary doc values, which hold each document's ordinals
   * as a sequence of delta-encoded vLongs.
   *
   * <p>The FST is loaded from {@code data} on first use. Unless {@code merging} is set, it is
   * cached in {@code fstInstances} and its size is added to {@code ramBytesUsed}. Loading and
   * caching happen under {@code synchronized (this)}.
   *
   * @param field the field to look up
   * @return an empty instance when the entry has no ordinals, otherwise an FST-backed view
   * @throws IOException if seeking in or reading from {@code data} fails
   */
  @Override
  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
    final FSTEntry entry = fsts.get(field.name);
    if (entry.numOrds == 0) {
      return DocValues.emptySortedSet(); // empty FST!
    }
    FST<Long> instance;
    synchronized (this) {
      instance = fstInstances.get(field.name);
      if (instance == null) {
        data.seek(entry.offset);
        instance = new FST<>(data, PositiveIntOutputs.getSingleton());
        if (!merging) {
          ramBytesUsed.addAndGet(instance.ramBytesUsed());
          fstInstances.put(field.name, instance);
        }
      }
    }
    final BinaryDocValues docToOrds = getBinary(field);
    final FST<Long> fst = instance;

    // per-thread resources
    final BytesReader in = fst.getBytesReader();
    final Arc<Long> firstArc = new Arc<>();
    final Arc<Long> scratchArc = new Arc<>();
    final IntsRefBuilder scratchInts = new IntsRefBuilder();
    final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
    final ByteArrayDataInput input = new ByteArrayDataInput();
    return new SortedSetDocValues() {
      final BytesRefBuilder term = new BytesRefBuilder();
      BytesRef ordsRef;
      long currentOrd;

      @Override
      public long nextOrd() {
        if (input.eof()) {
          return NO_MORE_ORDS;
        } else {
          // Ordinals are stored as deltas from the previous ordinal of the same document.
          currentOrd += input.readVLong();
          return currentOrd;
        }
      }

      @Override
      public void setDocument(int docID) {
        ordsRef = docToOrds.get(docID);
        input.reset(ordsRef.bytes, ordsRef.offset, ordsRef.length);
        currentOrd = 0;
      }

      @Override
      public BytesRef lookupOrd(long ord) {
        try {
          in.setPosition(0);
          fst.getFirstArc(firstArc);
          IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts);
          term.grow(output.length);
          term.clear();
          return Util.toBytesRef(output, term);
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
      }

      @Override
      public long lookupTerm(BytesRef key) {
        try {
          InputOutput<Long> o = fstEnum.seekCeil(key);
          if (o == null) {
            // key sorts after every term: insertion point is getValueCount()
            return -getValueCount() - 1;
          } else if (o.input.equals(key)) {
            // Return the full long ordinal; narrowing to int would truncate large ordinals.
            return o.output;
          } else {
            // not found: encode the insertion point as (-insertionPoint - 1)
            return -o.output - 1;
          }
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
      }

      @Override
      public long getValueCount() {
        return entry.numOrds;
      }

      @Override
      public TermsEnum termsEnum() {
        return new FSTTermsEnum(fst);
      }
    };
  }
  /**
   * Returns a {@link SortedDocValues} view for {@code field}: per-document ordinals come from
   * the field's numeric doc values, and ordinal/term lookups go through an FST.
   *
   * <p>The FST is read from {@code data} the first time it is needed. Unless {@code merging} is
   * set, it is then cached in {@code fstInstances} and accounted for in {@code ramBytesUsed}.
   * Loading and caching happen while holding the monitor of {@code this}.
   *
   * <p>NOTE(review): unlike {@code getSortedSet}, there is no short-circuit for
   * {@code entry.numOrds == 0} here; confirm whether that is intentional.
   *
   * @param field the field to look up
   * @return an FST-backed sorted doc values instance
   * @throws IOException if seeking in or reading from {@code data} fails
   */
  @Override
  public SortedDocValues getSorted(FieldInfo field) throws IOException {
    final FSTEntry entry = fsts.get(field.name);

    final FST<Long> termsFst;
    synchronized (this) {
      FST<Long> cached = fstInstances.get(field.name);
      if (cached == null) {
        data.seek(entry.offset);
        cached = new FST<>(data, PositiveIntOutputs.getSingleton());
        if (!merging) {
          ramBytesUsed.addAndGet(cached.ramBytesUsed());
          fstInstances.put(field.name, cached);
        }
      }
      termsFst = cached;
    }

    final NumericDocValues docToOrd = getNumeric(field);

    // Scratch state owned by the returned instance (not shared with other callers).
    final BytesReader fstReader = termsFst.getBytesReader();
    final Arc<Long> rootArc = new Arc<>();
    final Arc<Long> workArc = new Arc<>();
    final IntsRefBuilder pathInts = new IntsRefBuilder();
    final BytesRefFSTEnum<Long> termSeeker = new BytesRefFSTEnum<>(termsFst);

    return new SortedDocValues() {

      final BytesRefBuilder termBuffer = new BytesRefBuilder();

      @Override
      public int getOrd(int docID) {
        final long storedOrd = docToOrd.get(docID);
        return (int) storedOrd;
      }

      @Override
      public BytesRef lookupOrd(int ord) {
        try {
          fstReader.setPosition(0);
          termsFst.getFirstArc(rootArc);
          final IntsRef path =
              Util.getByOutput(termsFst, ord, fstReader, rootArc, workArc, pathInts);
          termBuffer.grow(path.length);
          termBuffer.clear();
          return Util.toBytesRef(path, termBuffer);
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
      }

      @Override
      public int lookupTerm(BytesRef key) {
        try {
          final InputOutput<Long> ceiling = termSeeker.seekCeil(key);
          if (ceiling == null) {
            // key sorts after every term
            return -getValueCount() - 1;
          }
          if (ceiling.input.equals(key)) {
            return ceiling.output.intValue();
          }
          // not found: encode the insertion point as (-insertionPoint - 1)
          return ((int) -ceiling.output) - 1;
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
      }

      @Override
      public int getValueCount() {
        return (int) entry.numOrds;
      }

      @Override
      public TermsEnum termsEnum() {
        return new FSTTermsEnum(termsFst);
      }
    };
  }