/** encodes an entry (bytes+(contexts)+(payload)+weight) to the provided writer */ protected void encode( ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, BytesRef payload, Set<BytesRef> contexts, long weight) throws IOException { int requiredLength = spare.length + 8 + ((hasPayloads) ? 2 + payload.length : 0); if (hasContexts) { for (BytesRef ctx : contexts) { requiredLength += 2 + ctx.length; } requiredLength += 2; // for length of contexts } if (requiredLength >= buffer.length) { buffer = ArrayUtil.grow(buffer, requiredLength); } output.reset(buffer); output.writeBytes(spare.bytes, spare.offset, spare.length); if (hasContexts) { for (BytesRef ctx : contexts) { output.writeBytes(ctx.bytes, ctx.offset, ctx.length); output.writeShort((short) ctx.length); } output.writeShort((short) contexts.size()); } if (hasPayloads) { output.writeBytes(payload.bytes, payload.offset, payload.length); output.writeShort((short) payload.length); } output.writeLong(weight); writer.write(buffer, 0, output.getPosition()); }
@Override protected void encode( ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, long weight) throws IOException { if (spare.length + 5 >= buffer.length) { buffer = ArrayUtil.grow(buffer, spare.length + 5); } output.reset(buffer); output.writeBytes(spare.bytes, spare.offset, spare.length); output.writeByte((byte) 0); // separator: not used, just for sort order output.writeInt(encodeWeight(weight)); writer.write(buffer, 0, output.getPosition()); }
/** Builds an {@link SynonymMap} and returns it. */ public SynonymMap build() throws IOException { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); // TODO: are we using the best sharing options? org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs); BytesRefBuilder scratch = new BytesRefBuilder(); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); final Set<Integer> dedupSet; if (dedup) { dedupSet = new HashSet<>(); } else { dedupSet = null; } final byte[] spare = new byte[5]; Set<CharsRef> keys = workingSet.keySet(); CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]); Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator()); final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); // System.out.println("fmap.build"); for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) { CharsRef input = sortedKeys[keyIdx]; MapEntry output = workingSet.get(input); int numEntries = output.ords.size(); // output size, assume the worst case int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry scratch.grow(estimatedSize); scratchOutput.reset(scratch.bytes()); // now write our output data: int count = 0; for (int i = 0; i < numEntries; i++) { if (dedupSet != null) { // box once final Integer ent = output.ords.get(i); if (dedupSet.contains(ent)) { continue; } dedupSet.add(ent); } scratchOutput.writeVInt(output.ords.get(i)); count++; } final int pos = scratchOutput.getPosition(); scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 
0 : 1)); final int pos2 = scratchOutput.getPosition(); final int vIntLen = pos2 - pos; // Move the count + includeOrig to the front of the byte[]: System.arraycopy(scratch.bytes(), pos, spare, 0, vIntLen); System.arraycopy(scratch.bytes(), 0, scratch.bytes(), vIntLen, pos); System.arraycopy(spare, 0, scratch.bytes(), 0, vIntLen); if (dedupSet != null) { dedupSet.clear(); } scratch.setLength(scratchOutput.getPosition()); // System.out.println(" add input=" + input + " output=" + scratch + " offset=" + // scratch.offset + " length=" + scratch.length + " count=" + count); builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef()); } FST<BytesRef> fst = builder.finish(); return new SynonymMap(fst, words, maxHorizontalContext); }
@Override public void build(TermFreqIterator tfit) throws IOException { if (tfit instanceof TermFreqPayloadIterator) { throw new IllegalArgumentException("this suggester doesn't support payloads"); } File tempInput = File.createTempFile( FSTCompletionLookup.class.getSimpleName(), ".input", Sort.defaultTempDir()); File tempSorted = File.createTempFile( FSTCompletionLookup.class.getSimpleName(), ".sorted", Sort.defaultTempDir()); Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); Sort.ByteSequencesReader reader = null; ExternalRefSorter sorter = null; // Push floats up front before sequences to sort them. For now, assume they are non-negative. // If negative floats are allowed some trickery needs to be done to find their byte order. boolean success = false; try { byte[] buffer = new byte[0]; ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); BytesRef spare; while ((spare = tfit.next()) != null) { if (spare.length + 4 >= buffer.length) { buffer = ArrayUtil.grow(buffer, spare.length + 4); } output.reset(buffer); output.writeInt(encodeWeight(tfit.weight())); output.writeBytes(spare.bytes, spare.offset, spare.length); writer.write(buffer, 0, output.getPosition()); } writer.close(); // We don't know the distribution of scores and we need to bucket them, so we'll sort // and divide into equal buckets. 
SortInfo info = new Sort().sort(tempInput, tempSorted); tempInput.delete(); FSTCompletionBuilder builder = new FSTCompletionBuilder( buckets, sorter = new ExternalRefSorter(new Sort()), sharedTailLength); final int inputLines = info.lines; reader = new Sort.ByteSequencesReader(tempSorted); long line = 0; int previousBucket = 0; int previousScore = 0; ByteArrayDataInput input = new ByteArrayDataInput(); BytesRef tmp1 = new BytesRef(); BytesRef tmp2 = new BytesRef(); while (reader.read(tmp1)) { input.reset(tmp1.bytes); int currentScore = input.readInt(); int bucket; if (line > 0 && currentScore == previousScore) { bucket = previousBucket; } else { bucket = (int) (line * buckets / inputLines); } previousScore = currentScore; previousBucket = bucket; // Only append the input, discard the weight. tmp2.bytes = tmp1.bytes; tmp2.offset = input.getPosition(); tmp2.length = tmp1.length - input.getPosition(); builder.add(tmp2, bucket); line++; } // The two FSTCompletions share the same automaton. this.higherWeightsCompletion = builder.build(); this.normalCompletion = new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst); success = true; } finally { if (success) IOUtils.close(reader, writer, sorter); else IOUtils.closeWhileHandlingException(reader, writer, sorter); tempInput.delete(); tempSorted.delete(); } }
/**
 * Adds {@code num} documents, numbered {@code offset} to {@code offset + num - 1}, to
 * {@code writer}.
 *
 * <p>Each document gets an "id" field holding its number and a "promote" doc-values field of
 * the requested type. The value generated for document {@code i} is recorded as a long in
 * {@code values[i]}; for the floating-point types this is the raw bit pattern of the value as a
 * double. After each document the writer is committed when {@code random().nextInt(10)}
 * returns 0.
 */
public void index(IndexWriter writer, Type valueType, long[] values, int offset, int num)
    throws CorruptIndexException, IOException {
  if (VERBOSE) {
    System.out.println(
        "TEST: add docs " + offset + "-" + (offset + num) + " valType=" + valueType);
  }

  // One field instance is created up front and re-populated for every document.
  final Field valField;
  switch (valueType) {
    // integer types
    case VAR_INTS:
      valField = new PackedLongDocValuesField("promote", (long) 0);
      break;
    case FIXED_INTS_8:
      valField = new ByteDocValuesField("promote", (byte) 0);
      break;
    case FIXED_INTS_16:
      valField = new ShortDocValuesField("promote", (short) 0);
      break;
    case FIXED_INTS_32:
      valField = new IntDocValuesField("promote", 0);
      break;
    case FIXED_INTS_64:
      valField = new LongDocValuesField("promote", (byte) 0);
      break;

    // floating-point types
    case FLOAT_32:
      valField = new FloatDocValuesField("promote", 0f);
      break;
    case FLOAT_64:
      valField = new DoubleDocValuesField("promote", 0d);
      break;

    // byte[] types: the boolean argument is true for the FIXED variants
    case BYTES_FIXED_STRAIGHT:
      valField = new StraightBytesDocValuesField("promote", new BytesRef(), true);
      break;
    case BYTES_VAR_STRAIGHT:
      valField = new StraightBytesDocValuesField("promote", new BytesRef(), false);
      break;
    case BYTES_FIXED_DEREF:
      valField = new DerefBytesDocValuesField("promote", new BytesRef(), true);
      break;
    case BYTES_VAR_DEREF:
      valField = new DerefBytesDocValuesField("promote", new BytesRef(), false);
      break;
    case BYTES_FIXED_SORTED:
      valField = new SortedBytesDocValuesField("promote", new BytesRef(), true);
      break;
    case BYTES_VAR_SORTED:
      valField = new SortedBytesDocValuesField("promote", new BytesRef(), false);
      break;

    default:
      throw new IllegalStateException("unknown Type: " + valueType);
  }

  final int end = offset + num;
  for (int id = offset; id < end; id++) {
    final Document doc = new Document();
    doc.add(new Field("id", Integer.toString(id), TextField.TYPE_STORED));

    switch (valueType) {
      case FIXED_INTS_8:
        values[id] = (byte) id;
        valField.setByteValue((byte) values[id]);
        break;
      case FIXED_INTS_16:
        // TODO: negatives too?
        values[id] = random().nextInt(Short.MAX_VALUE);
        valField.setShortValue((short) values[id]);
        break;
      case FIXED_INTS_32:
        values[id] = random().nextInt();
        valField.setIntValue((int) values[id]);
        break;
      case FIXED_INTS_64:
        values[id] = random().nextLong();
        valField.setLongValue(values[id]);
        break;
      case VAR_INTS:
        // TODO: can we do nextLong()?
        values[id] = random().nextInt();
        valField.setLongValue(values[id]);
        break;

      case FLOAT_32:
        final float floatValue = random().nextFloat();
        // The float is widened to double before its bits are recorded.
        values[id] = Double.doubleToRawLongBits(floatValue);
        valField.setFloatValue(floatValue);
        break;
      case FLOAT_64:
        final double doubleValue = random().nextDouble();
        values[id] = Double.doubleToRawLongBits(doubleValue);
        valField.setDoubleValue(doubleValue);
        break;

      case BYTES_FIXED_STRAIGHT:
      case BYTES_FIXED_DEREF:
      case BYTES_FIXED_SORTED:
        // Fixed-length variants always store the full 8-byte long.
        values[id] = random().nextLong();
        final byte[] fixedBytes = new byte[8];
        final ByteArrayDataOutput fixedOut = new ByteArrayDataOutput(fixedBytes, 0, 8);
        fixedOut.writeLong(values[id]);
        valField.setBytesValue(new BytesRef(fixedBytes));
        break;

      case BYTES_VAR_STRAIGHT:
      case BYTES_VAR_DEREF:
      case BYTES_VAR_SORTED:
        // Variable-length variants randomly store either a 4-byte int or an 8-byte long.
        final byte[] varBytes = new byte[8];
        final ByteArrayDataOutput varOut = new ByteArrayDataOutput(varBytes, 0, 8);
        final boolean asInt = random().nextBoolean();
        if (asInt) {
          values[id] = random().nextInt();
          varOut.writeInt((int) values[id]);
        } else {
          values[id] = random().nextLong();
          varOut.writeLong(values[id]);
        }
        valField.setBytesValue(new BytesRef(varBytes, 0, asInt ? 4 : 8));
        break;

      default:
        fail("unexpected value " + valueType);
    }

    if (VERBOSE) {
      System.out.println(" doc " + id + " has val=" + valField);
    }

    doc.add(valField);
    writer.addDocument(doc);
    if (random().nextInt(10) == 0) {
      writer.commit();
    }
  }
}