예제 #1
0
 /**
  * Serializes a single entry and hands the resulting bytes to {@code writer}.
  *
  * <p>Write order: the term bytes; then, when contexts are enabled, each context's bytes
  * followed by its length as a short, and after all of them the number of contexts as a short;
  * then, when payloads are enabled, the payload bytes followed by their length as a short; and
  * finally the weight as a long.
  *
  * @param writer destination receiving the encoded entry
  * @param output scratch output that is re-pointed at the working buffer on every call
  * @param buffer working buffer; replaced locally through {@code ArrayUtil.grow} when it is not
  *     strictly larger than the computed entry size
  * @param spare the term bytes
  * @param payload the payload bytes; only read when payloads are enabled
  * @param contexts the contexts; only read when contexts are enabled
  * @param weight the entry weight
  * @throws IOException declared for the underlying write calls
  */
 protected void encode(
     ByteSequencesWriter writer,
     ByteArrayDataOutput output,
     byte[] buffer,
     BytesRef spare,
     BytesRef payload,
     Set<BytesRef> contexts,
     long weight)
     throws IOException {
   // Term bytes plus 8 for the trailing weight.
   int needed = spare.length + 8;
   if (hasPayloads) {
     // Payload bytes plus 2 for the payload length.
     needed += 2 + payload.length;
   }
   if (hasContexts) {
     // 2 for the number of contexts ...
     needed += 2;
     for (BytesRef context : contexts) {
       // ... and, per context, its bytes plus 2 for its length.
       needed += 2 + context.length;
     }
   }

   byte[] target = buffer;
   if (target.length <= needed) {
     target = ArrayUtil.grow(target, needed);
   }

   output.reset(target);
   output.writeBytes(spare.bytes, spare.offset, spare.length);

   if (hasContexts) {
     // Each length is written after the bytes it describes.
     for (BytesRef context : contexts) {
       output.writeBytes(context.bytes, context.offset, context.length);
       output.writeShort((short) context.length);
     }
     output.writeShort((short) contexts.size());
   }

   if (hasPayloads) {
     output.writeBytes(payload.bytes, payload.offset, payload.length);
     output.writeShort((short) payload.length);
   }

   output.writeLong(weight);
   writer.write(target, 0, output.getPosition());
 }
 /**
  * Serializes a term as: the term bytes, a single zero byte, then the int returned by
  * {@code encodeWeight(weight)}; the result is handed to {@code writer}.
  *
  * @param writer destination receiving the encoded entry
  * @param output scratch output that is re-pointed at the working buffer on every call
  * @param buffer working buffer; replaced locally through {@code ArrayUtil.grow} when it is not
  *     strictly larger than the term length plus 5
  * @param spare the term bytes
  * @param weight the weight, passed through {@code encodeWeight} before being written
  * @throws IOException declared for the underlying write calls
  */
 @Override
 protected void encode(
     ByteSequencesWriter writer,
     ByteArrayDataOutput output,
     byte[] buffer,
     BytesRef spare,
     long weight)
     throws IOException {
   // Term bytes + 1 separator byte + 4 bytes for the encoded weight.
   final int needed = spare.length + 5;

   byte[] target = buffer;
   if (target.length <= needed) {
     target = ArrayUtil.grow(target, needed);
   }

   output.reset(target);
   output.writeBytes(spare.bytes, spare.offset, spare.length);
   // The separator carries no information; it only influences sort order.
   output.writeByte((byte) 0);
   output.writeInt(encodeWeight(weight));

   writer.write(target, 0, output.getPosition());
 }
    /**
     * Compiles the accumulated working set into an FST and wraps it in a {@link SynonymMap}.
     *
     * <p>Inputs are processed in the order given by
     * {@code CharsRef.getUTF16SortedAsUTF8Comparator()}. The output stored for each input is a
     * header vInt holding {@code count << 1 | flag} (flag is 0 when {@code includeOrig} is set,
     * 1 otherwise) followed by one vInt per emitted ord. When {@code dedup} is enabled, an ord
     * that repeats within one input is emitted only once.
     *
     * @return the newly built synonym map
     * @throws IOException declared for the FST builder and the output writes
     */
    public SynonymMap build() throws IOException {
      // TODO: are we using the best sharing options?
      ByteSequenceOutputs fstOutputs = ByteSequenceOutputs.getSingleton();
      org.apache.lucene.util.fst.Builder<BytesRef> fstBuilder =
          new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, fstOutputs);

      BytesRefBuilder encoded = new BytesRefBuilder();
      ByteArrayDataOutput encoder = new ByteArrayDataOutput();
      final IntsRefBuilder utf32Scratch = new IntsRefBuilder();

      // Only allocated when duplicates must be dropped; emptied again after every input.
      final Set<Integer> seenOrds = dedup ? new HashSet<Integer>() : null;

      // Parks the header vInt while the ords are shifted to make room for it.
      final byte[] header = new byte[5];

      Set<CharsRef> inputs = workingSet.keySet();
      CharsRef[] orderedInputs = inputs.toArray(new CharsRef[inputs.size()]);
      Arrays.sort(orderedInputs, CharsRef.getUTF16SortedAsUTF8Comparator());

      for (CharsRef input : orderedInputs) {
        MapEntry entry = workingSet.get(input);
        final int numEntries = entry.ords.size();

        // Worst-case size: 5 bytes for the header plus 5 bytes for every ord.
        encoded.grow(5 + numEntries * 5);
        final byte[] data = encoded.bytes();
        encoder.reset(data);

        // The ords are written first; the header can only be produced once the count is known.
        int written = 0;
        for (int i = 0; i < numEntries; i++) {
          if (seenOrds != null) {
            // Box once.
            final Integer ord = entry.ords.get(i);
            if (!seenOrds.add(ord)) {
              continue;
            }
          }
          encoder.writeVInt(entry.ords.get(i));
          written++;
        }

        final int ordsEnd = encoder.getPosition();
        encoder.writeVInt(written << 1 | (entry.includeOrig ? 0 : 1));
        final int totalLength = encoder.getPosition();
        final int headerLength = totalLength - ordsEnd;

        // Rotate the header from the tail to the front: save it, shift the ords right, put it
        // back at offset 0.
        System.arraycopy(data, ordsEnd, header, 0, headerLength);
        System.arraycopy(data, 0, data, headerLength, ordsEnd);
        System.arraycopy(header, 0, data, 0, headerLength);

        if (seenOrds != null) {
          seenOrds.clear();
        }

        encoded.setLength(totalLength);
        fstBuilder.add(Util.toUTF32(input, utf32Scratch), encoded.toBytesRef());
      }

      FST<BytesRef> compiled = fstBuilder.finish();
      return new SynonymMap(compiled, words, maxHorizontalContext);
    }
  /**
   * Builds the two completion automata from the terms and weights supplied by {@code tfit}.
   *
   * <p>Every term is written to a temporary file prefixed with its encoded weight, the file is
   * sorted, and the sorted records are then assigned to buckets by line position. Records whose
   * encoded weight equals that of the preceding record reuse the preceding record's bucket. Both
   * temporary files are deleted before the method returns.
   *
   * @param tfit source of terms and weights
   * @throws IllegalArgumentException if {@code tfit} is a {@code TermFreqPayloadIterator}
   * @throws IOException declared for the temporary-file, sort and reader/writer operations
   */
  @Override
  public void build(TermFreqIterator tfit) throws IOException {
    if (tfit instanceof TermFreqPayloadIterator) {
      throw new IllegalArgumentException("this suggester doesn't support payloads");
    }

    final String tempPrefix = FSTCompletionLookup.class.getSimpleName();
    File tempInput = File.createTempFile(tempPrefix, ".input", Sort.defaultTempDir());
    File tempSorted = File.createTempFile(tempPrefix, ".sorted", Sort.defaultTempDir());

    Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput);
    Sort.ByteSequencesReader reader = null;
    ExternalRefSorter sorter = null;

    boolean success = false;
    try {
      // The weight goes in front of the term bytes so that sorting the records orders them by
      // weight. Weights are assumed to be non-negative for now; negative values would need
      // extra work to get a usable byte order.
      byte[] buffer = new byte[0];
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      for (BytesRef term = tfit.next(); term != null; term = tfit.next()) {
        final int needed = term.length + 4;
        if (buffer.length <= needed) {
          buffer = ArrayUtil.grow(buffer, needed);
        }

        output.reset(buffer);
        output.writeInt(encodeWeight(tfit.weight()));
        output.writeBytes(term.bytes, term.offset, term.length);
        writer.write(buffer, 0, output.getPosition());
      }
      writer.close();

      // The score distribution is unknown and the scores have to be bucketed, so sort the
      // records and split them into equally sized buckets.
      SortInfo info = new Sort().sort(tempInput, tempSorted);
      tempInput.delete();

      sorter = new ExternalRefSorter(new Sort());
      FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, sharedTailLength);

      final int inputLines = info.lines;
      reader = new Sort.ByteSequencesReader(tempSorted);

      ByteArrayDataInput input = new ByteArrayDataInput();
      BytesRef record = new BytesRef();
      BytesRef termOnly = new BytesRef();
      int previousScore = 0;
      int previousBucket = 0;
      long line = 0;
      while (reader.read(record)) {
        input.reset(record.bytes);
        final int currentScore = input.readInt();

        // Equal consecutive scores stay together in one bucket.
        final int bucket =
            (line > 0 && currentScore == previousScore)
                ? previousBucket
                : (int) (line * buckets / inputLines);
        previousScore = currentScore;
        previousBucket = bucket;

        // Strip the weight prefix: only the term bytes go into the automaton.
        final int termStart = input.getPosition();
        termOnly.bytes = record.bytes;
        termOnly.offset = termStart;
        termOnly.length = record.length - termStart;
        builder.add(termOnly, bucket);

        line++;
      }

      // Both FSTCompletion instances are backed by the same automaton.
      this.higherWeightsCompletion = builder.build();
      this.normalCompletion =
          new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);

      success = true;
    } finally {
      if (success) {
        IOUtils.close(reader, writer, sorter);
      } else {
        IOUtils.closeWhileHandlingException(reader, writer, sorter);
      }

      tempInput.delete();
      tempSorted.delete();
    }
  }
예제 #5
0
  /**
   * Adds {@code num} documents, with ids {@code offset} up to (but excluding)
   * {@code offset + num}, to {@code writer}. Every document carries a stored "id" field and a
   * "promote" doc-values field of the requested type.
   *
   * <p>The value generated for document {@code i} is recorded in {@code values[i]}. Floating
   * point values are recorded as {@code Double.doubleToRawLongBits} of the generated number;
   * byte-array values are recorded as the int or long that was written into the array. After
   * each document the writer is committed when {@code random().nextInt(10)} returns 0.
   *
   * @param writer index writer receiving the documents
   * @param valueType doc-values type used for the "promote" field
   * @param values receives the generated value for each document id
   * @param offset first document id
   * @param num number of documents to add
   * @throws IllegalStateException if {@code valueType} is not handled when creating the field
   */
  public void index(IndexWriter writer, Type valueType, long[] values, int offset, int num)
      throws CorruptIndexException, IOException {
    final int end = offset + num;

    if (VERBOSE) {
      System.out.println("TEST: add docs " + offset + "-" + end + " valType=" + valueType);
    }

    // One field instance is created up front and re-used, with a fresh value, for every document.
    final Field valField;
    switch (valueType) {
      case VAR_INTS:
        valField = new PackedLongDocValuesField("promote", 0L);
        break;
      case FIXED_INTS_8:
        valField = new ByteDocValuesField("promote", (byte) 0);
        break;
      case FIXED_INTS_16:
        valField = new ShortDocValuesField("promote", (short) 0);
        break;
      case FIXED_INTS_32:
        valField = new IntDocValuesField("promote", 0);
        break;
      case FIXED_INTS_64:
        valField = new LongDocValuesField("promote", (byte) 0);
        break;
      case FLOAT_32:
        valField = new FloatDocValuesField("promote", 0f);
        break;
      case FLOAT_64:
        valField = new DoubleDocValuesField("promote", 0d);
        break;
      case BYTES_FIXED_STRAIGHT:
        valField = new StraightBytesDocValuesField("promote", new BytesRef(), true);
        break;
      case BYTES_VAR_STRAIGHT:
        valField = new StraightBytesDocValuesField("promote", new BytesRef(), false);
        break;
      case BYTES_FIXED_DEREF:
        valField = new DerefBytesDocValuesField("promote", new BytesRef(), true);
        break;
      case BYTES_VAR_DEREF:
        valField = new DerefBytesDocValuesField("promote", new BytesRef(), false);
        break;
      case BYTES_FIXED_SORTED:
        valField = new SortedBytesDocValuesField("promote", new BytesRef(), true);
        break;
      case BYTES_VAR_SORTED:
        valField = new SortedBytesDocValuesField("promote", new BytesRef(), false);
        break;
      default:
        throw new IllegalStateException("unknown Type: " + valueType);
    }

    for (int docId = offset; docId < end; docId++) {
      Document doc = new Document();
      doc.add(new Field("id", Integer.toString(docId), TextField.TYPE_STORED));

      switch (valueType) {
        case VAR_INTS:
          // TODO: can we do nextLong()?
          values[docId] = random().nextInt();
          valField.setLongValue(values[docId]);
          break;
        case FIXED_INTS_8:
          values[docId] = (byte) docId;
          valField.setByteValue((byte) values[docId]);
          break;
        case FIXED_INTS_16:
          // TODO: negatives too?
          values[docId] = random().nextInt(Short.MAX_VALUE);
          valField.setShortValue((short) values[docId]);
          break;
        case FIXED_INTS_32:
          values[docId] = random().nextInt();
          valField.setIntValue((int) values[docId]);
          break;
        case FIXED_INTS_64:
          values[docId] = random().nextLong();
          valField.setLongValue(values[docId]);
          break;
        case FLOAT_32:
          // The float is widened to double before its raw bits are recorded.
          final float floatValue = random().nextFloat();
          values[docId] = Double.doubleToRawLongBits(floatValue);
          valField.setFloatValue(floatValue);
          break;
        case FLOAT_64:
          final double doubleValue = random().nextDouble();
          values[docId] = Double.doubleToRawLongBits(doubleValue);
          valField.setDoubleValue(doubleValue);
          break;
        case BYTES_FIXED_STRAIGHT:
        case BYTES_FIXED_DEREF:
        case BYTES_FIXED_SORTED:
          // Fixed-length variants always carry all 8 bytes of a long.
          values[docId] = random().nextLong();
          byte[] fixedBytes = new byte[8];
          ByteArrayDataOutput fixedOut = new ByteArrayDataOutput(fixedBytes, 0, 8);
          fixedOut.writeLong(values[docId]);
          valField.setBytesValue(new BytesRef(fixedBytes));
          break;
        case BYTES_VAR_STRAIGHT:
        case BYTES_VAR_DEREF:
        case BYTES_VAR_SORTED:
          // Variable-length variants randomly carry either an int (4 bytes) or a long (8 bytes).
          byte[] varBytes = new byte[8];
          ByteArrayDataOutput varOut = new ByteArrayDataOutput(varBytes, 0, 8);
          final int usedLength;
          if (random().nextBoolean()) {
            values[docId] = random().nextInt();
            varOut.writeInt((int) values[docId]);
            usedLength = 4;
          } else {
            values[docId] = random().nextLong();
            varOut.writeLong(values[docId]);
            usedLength = 8;
          }
          valField.setBytesValue(new BytesRef(varBytes, 0, usedLength));
          break;

        default:
          fail("unexpected value " + valueType);
      }

      if (VERBOSE) {
        System.out.println("  doc " + docId + " has val=" + valField);
      }
      doc.add(valField);
      writer.addDocument(doc);

      // Commit at random points between documents.
      if (random().nextInt(10) == 0) {
        writer.commit();
      }
    }
  }