Пример #1
0
 /**
  * Advances to the next document of the current term that is accepted by {@code liveDocs}.
  *
  * <p>Scanning starts at {@code nextDocStart}. A document is complete once the following DOC
  * line, or a TERM/FIELD/END line, is read. At that point {@code nextDocStart} is set to the
  * start of that line and {@code in} is repositioned to the pointer recorded right after the most
  * recent FREQ line read during this call (0 if none was read).
  *
  * @return the doc id, or {@code NO_MORE_DOCS} when the term's postings are exhausted
  */
 @Override
 public int nextDoc() throws IOException {
   boolean sawDoc = false;
   long positionsStart = 0;
   in.seek(nextDocStart);
   while (true) {
     final long lineStart = in.getFilePointer();
     SimpleTextUtil.readLine(in, scratch);

     final boolean isDocLine = StringHelper.startsWith(scratch, DOC);
     if (!isDocLine) {
       if (StringHelper.startsWith(scratch, FREQ)) {
         UnicodeUtil.UTF8toUTF16(
             scratch.bytes,
             scratch.offset + FREQ.length,
             scratch.length - FREQ.length,
             scratchUTF16);
         tf = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
         // Whatever follows the FREQ line belongs to this document.
         positionsStart = in.getFilePointer();
         continue;
       }
       if (StringHelper.startsWith(scratch, POS)
           || StringHelper.startsWith(scratch, START_OFFSET)
           || StringHelper.startsWith(scratch, END_OFFSET)
           || StringHelper.startsWith(scratch, PAYLOAD)) {
         // Per-position detail lines are not consumed here.
         continue;
       }
       assert StringHelper.startsWith(scratch, TERM)
           || StringHelper.startsWith(scratch, FIELD)
           || StringHelper.startsWith(scratch, END);
     }

     // Either a new DOC line or a terminator: the document parsed so far (if any) is complete.
     if (sawDoc && (liveDocs == null || liveDocs.get(docID))) {
       nextDocStart = lineStart;
       in.seek(positionsStart);
       return docID;
     }
     if (!isDocLine) {
       return docID = NO_MORE_DOCS;
     }

     UnicodeUtil.UTF8toUTF16(
         scratch.bytes,
         scratch.offset + DOC.length,
         scratch.length - DOC.length,
         scratchUTF16);
     docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
     tf = 0;
     sawDoc = true;
   }
 }
  /**
   * Writes a file with repeatable content, then reads single bytes at randomly advancing offsets
   * and checks each one against the byte the generator produces for that offset.
   */
  @Test
  public void testReadRandomSampleFile() throws IOException {
    final int bufferSize = 64;
    final int fileSize = 1000;
    final String fileName = "RandomSampleFile.txt";

    Cache cache = cacheManager.getCache();
    InfinispanDirectory dir = new InfinispanDirectory(cache, cache, cache, INDEXNAME, bufferSize);

    // The file must span several buffers.
    assert bufferSize < fileSize;
    createFileWithRepeatableContent(dir, fileName, fileSize);

    IndexInput input = dir.openInput(fileName);
    assert input.length() == fileSize;
    RepeatableLongByteSequence generator = new RepeatableLongByteSequence();

    Random random = new Random();
    long nextReadAt = 0;
    for (int offset = 0; offset < fileSize; offset++) {
      // The generator is advanced for every offset so it stays aligned with the file content.
      final byte expected = generator.nextByte();
      if (nextReadAt != offset) {
        continue;
      }
      final byte actual = input.readByte();
      assert expected == actual;
      // Jump forward by 0..9 bytes beyond the byte just read.
      nextReadAt = input.getFilePointer() + random.nextInt(10);
      input.seek(nextReadAt);
    }

    input.close();
    dir.close();
    DirectoryIntegrityCheck.verifyDirectoryStructure(cache, INDEXNAME);
  }
    /**
     * Returns the payload of the current position, reading it from {@code payloadIn} on first
     * access.
     *
     * @return {@code null} when no payload is pending; otherwise the shared {@code payload}
     *     instance, which is reused between calls
     */
    @Override
    public BytesRef getPayload() throws IOException {
      if (!payloadPending) {
        return null;
      }

      if (pendingPayloadBytes != 0) {
        assert pendingPayloadBytes >= payloadLength;

        // Bytes ahead of the current payload are stepped over before reading.
        if (pendingPayloadBytes > payloadLength) {
          payloadIn.seek(payloadIn.getFilePointer() + (pendingPayloadBytes - payloadLength));
        }

        // Make sure the reusable buffer can hold the payload.
        if (payload == null) {
          payload = new BytesRef();
          payload.bytes = new byte[payloadLength];
        } else if (payload.bytes.length < payloadLength) {
          payload.grow(payloadLength);
        }

        payloadIn.readBytes(payload.bytes, 0, payloadLength);
        payload.length = payloadLength;
        // Zero marks the payload as loaded, so later calls return it without re-reading.
        pendingPayloadBytes = 0;
      }

      return payload;
    }
Пример #4
0
 /**
  * Advances to the next document of the current term that is accepted by {@code liveDocs}.
  *
  * <p>Reads lines from the current position of {@code in}. A document is complete once the
  * following DOC line, or a TERM/FIELD/END line, is read; {@code in} is then rewound to the start
  * of that line so the next call re-reads it. Unless {@code omitTF} is set, {@code tf} receives
  * the value of the document's FREQ line (0 if it had none).
  *
  * @return the doc id, or {@code NO_MORE_DOCS} when the term's postings are exhausted
  */
 @Override
 public int nextDoc() throws IOException {
   if (docID == NO_MORE_DOCS) {
     return docID;
   }
   boolean sawDoc = false;
   int pendingFreq = 0;
   while (true) {
     final long lineStart = in.getFilePointer();
     SimpleTextUtil.readLine(in, scratch);

     final boolean isDocLine = StringHelper.startsWith(scratch, DOC);
     if (!isDocLine) {
       if (StringHelper.startsWith(scratch, FREQ)) {
         UnicodeUtil.UTF8toUTF16(
             scratch.bytes,
             scratch.offset + FREQ.length,
             scratch.length - FREQ.length,
             scratchUTF16);
         pendingFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
         continue;
       }
       if (StringHelper.startsWith(scratch, POS)
           || StringHelper.startsWith(scratch, START_OFFSET)
           || StringHelper.startsWith(scratch, END_OFFSET)
           || StringHelper.startsWith(scratch, PAYLOAD)) {
         // Per-position detail lines are ignored by this enum.
         continue;
       }
       assert StringHelper.startsWith(scratch, TERM)
               || StringHelper.startsWith(scratch, FIELD)
               || StringHelper.startsWith(scratch, END)
           : "scratch=" + scratch.utf8ToString();
     }

     // Either a new DOC line or a terminator: the document parsed so far (if any) is complete.
     if (sawDoc && (liveDocs == null || liveDocs.get(docID))) {
       in.seek(lineStart);
       if (!omitTF) {
         tf = pendingFreq;
       }
       return docID;
     }
     if (!isDocLine) {
       return docID = NO_MORE_DOCS;
     }

     UnicodeUtil.UTF8toUTF16(
         scratch.bytes,
         scratch.offset + DOC.length,
         scratch.length - DOC.length,
         scratchUTF16);
     docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
     pendingFreq = 0;
     sawDoc = true;
   }
 }
Пример #5
0
 /**
  * Moves {@code fieldsStream} past a stored field value without loading it.
  *
  * @param binary whether the value is stored as binary
  * @param compressed whether the value is stored compressed
  * @param toRead the stored length; used as a byte count for binary or compressed values and as
  *     a char count otherwise
  */
 private void skipField(boolean binary, boolean compressed, int toRead) throws IOException {
   if (!binary && !compressed) {
     // Plain text: the length counts chars, so the stream has to step over them as chars
     // rather than doing a direct seek.
     fieldsStream.skipChars(toRead);
     return;
   }
   // Binary or compressed: the length counts bytes, so a direct seek is enough.
   fieldsStream.seek(fieldsStream.getFilePointer() + toRead);
 }
  /**
   * Round-trips a random number of blocks through {@code ForUtil}: every block is written, then
   * each block is either skipped or read back and compared with the source values. Finally the
   * reader's file pointer must match the writer's end pointer.
   */
  public void testEncodeDecode() throws IOException {
    final int iterations = RandomInts.randomIntBetween(random(), 1, 1000);
    final float acceptableOverheadRatio = random().nextFloat();
    final int[] values = new int[(iterations - 1) * BLOCK_SIZE + ForUtil.MAX_DATA_SIZE];
    for (int block = 0; block < iterations; ++block) {
      final int base = block * BLOCK_SIZE;
      final int bpv = random().nextInt(32);
      if (bpv == 0) {
        // Zero bits per value: the whole block repeats a single value.
        final int constant = RandomInts.randomIntBetween(random(), 0, Integer.MAX_VALUE);
        Arrays.fill(values, base, base + BLOCK_SIZE, constant);
      } else {
        final int max = (int) PackedInts.maxValue(bpv);
        for (int j = 0; j < BLOCK_SIZE; ++j) {
          values[base + j] = RandomInts.randomIntBetween(random(), 0, max);
        }
      }
    }

    final Directory d = new RAMDirectory();

    // Encode every block.
    final IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT);
    final ForUtil encoder = new ForUtil(acceptableOverheadRatio, out);
    for (int block = 0; block < iterations; ++block) {
      final int[] blockValues = Arrays.copyOfRange(values, block * BLOCK_SIZE, values.length);
      encoder.writeBlock(blockValues, new byte[MAX_ENCODED_SIZE], out);
    }
    final long endPointer = out.getFilePointer();
    out.close();

    // Decode, randomly skipping some blocks.
    final IndexInput in = d.openInput("test.bin", IOContext.READONCE);
    final ForUtil decoder = new ForUtil(in);
    for (int block = 0; block < iterations; ++block) {
      if (random().nextBoolean()) {
        decoder.skipBlock(in);
      } else {
        final int[] restored = new int[MAX_DATA_SIZE];
        decoder.readBlock(in, new byte[MAX_ENCODED_SIZE], restored);
        assertArrayEquals(
            Arrays.copyOfRange(values, block * BLOCK_SIZE, (block + 1) * BLOCK_SIZE),
            Arrays.copyOf(restored, BLOCK_SIZE));
      }
    }
    assertEquals(endPointer, in.getFilePointer());
    in.close();
  }
  /**
   * Asserts that two inputs have the same length and current position, and that they yield the
   * same bytes from that position to the end of {@code expected}.
   *
   * @param msg prefix for every assertion message
   * @param expected the reference input
   * @param test the input under test
   */
  private void assertSameStreams(String msg, IndexInput expected, IndexInput test)
      throws IOException {
    assertNotNull(msg + " null expected", expected);
    assertNotNull(msg + " null test", test);
    assertEquals(msg + " length", expected.length(), test.length());
    assertEquals(msg + " position", expected.getFilePointer(), test.getFilePointer());

    final int chunkSize = 512;
    final byte[] expectedChunk = new byte[chunkSize];
    final byte[] actualChunk = new byte[chunkSize];

    // Compare in fixed-size chunks until nothing is left.
    for (long left = expected.length() - expected.getFilePointer(); left > 0; ) {
      final int count = (int) Math.min(left, chunkSize);
      expected.readBytes(expectedChunk, 0, count);
      test.readBytes(actualChunk, 0, count);
      assertEqualArrays(msg + ", remainder " + left, expectedChunk, actualChunk, 0, count);
      left -= count;
    }
  }
Пример #8
0
      /* Loads the next block of terms from `in`.  Only the
      three raw sections (term suffixes, docFreq/totalTermFreq
      bytes, and the remaining per-term metadata bytes) are
      copied into byte[] buffers and handed to their readers;
      none of them is decoded here.  Decoding happens later,
      on demand, so terms-only consumers (eg certain MTQs,
      respelling) do not pay for metadata they never use.
      Returns false when the block's term count is 0. */
      private boolean nextBlock() throws IOException {

        // TODO: we still lazy-decode the byte[] for each
        // term (the suffix), but, if we decoded
        // all N terms up front then seeking could do a fast
        // bsearch w/in the block...

        state.blockFilePointer = in.getFilePointer();
        blockTermCount = in.readVInt();
        if (blockTermCount == 0) {
          return false;
        }
        termBlockPrefix = in.readVInt();

        // Section 1: term suffixes.
        final int suffixLen = in.readVInt();
        if (termSuffixes.length < suffixLen) {
          termSuffixes = new byte[ArrayUtil.oversize(suffixLen, 1)];
        }
        in.readBytes(termSuffixes, 0, suffixLen);
        termSuffixesReader.reset(termSuffixes, 0, suffixLen);

        // Section 2: docFreq, totalTermFreq.
        final int freqLen = in.readVInt();
        if (docFreqBytes.length < freqLen) {
          docFreqBytes = new byte[ArrayUtil.oversize(freqLen, 1)];
        }
        in.readBytes(docFreqBytes, 0, freqLen);
        freqReader.reset(docFreqBytes, 0, freqLen);

        // Section 3: metadata.  Its buffer and reader are created on first use.
        final int metaLen = in.readVInt();
        final boolean firstMetadataLoad = bytes == null;
        if (firstMetadataLoad || bytes.length < metaLen) {
          bytes = new byte[ArrayUtil.oversize(metaLen, 1)];
        }
        if (firstMetadataLoad) {
          bytesReader = new ByteArrayDataInput();
        }
        in.readBytes(bytes, 0, metaLen);
        bytesReader.reset(bytes, 0, metaLen);

        // Nothing in the new block has been decoded yet.
        metaDataUpto = 0;
        state.termBlockOrd = 0;

        indexIsCurrent = false;

        return true;
      }
Пример #9
0
  /**
   * Adds a {@code LazyField} for the stored field at the current position of {@code fieldsStream}
   * and moves the stream past the field's value without loading it.
   *
   * @param doc the document receiving the field
   * @param fi field metadata; supplies the name and the omit-norms flag
   * @param binary whether the stored value is binary
   * @param compressed whether the stored value is compressed
   * @param tokenize passed to {@code getIndexType} for text fields
   */
  private void addFieldLazy(
      Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize)
      throws IOException {
    if (binary) {
      final int byteCount = fieldsStream.readVInt();
      final long valueStart = fieldsStream.getFilePointer();
      final Field.Store binaryStore = compressed ? Field.Store.COMPRESS : Field.Store.YES;
      doc.add(new LazyField(fi.name, binaryStore, byteCount, valueStart));
      // Step over the value; it is not loaded here.
      fieldsStream.seek(valueStart + byteCount);
      return;
    }

    final Field.Index index = getIndexType(fi, tokenize);
    final Field.TermVector termVector = getTermVectorType(fi);

    final int length = fieldsStream.readVInt();
    final long valueStart = fieldsStream.getFilePointer();
    final Fieldable f;
    if (compressed) {
      f = new LazyField(fi.name, Field.Store.COMPRESS, length, valueStart);
      // Compressed text is skipped with a direct seek.
      fieldsStream.seek(valueStart + length);
    } else {
      // Uncompressed text is skipped char by char.
      fieldsStream.skipChars(length);
      f = new LazyField(fi.name, Field.Store.YES, index, termVector, length, valueStart);
    }
    f.setOmitNorms(fi.omitNorms);
    doc.add(f);
  }
Пример #10
0
    /**
     * Reads the next position entry of the current document from {@code in}.
     *
     * <p>Depending on {@code readPositions} and {@code readOffsets}, this consumes a POS line and
     * a START_OFFSET/END_OFFSET line pair. It then peeks at the following line: a PAYLOAD line is
     * copied into {@code scratch2} and exposed through {@code payload}; any other line is pushed
     * back by seeking to where it started, and {@code payload} is set to {@code null}.
     *
     * @return the parsed position, or -1 when positions are not being read
     */
    @Override
    public int nextPosition() throws IOException {
      final int pos;
      if (readPositions) {
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, POS) : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + POS.length,
            scratch.length - POS.length,
            scratchUTF16_2);
        pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
      } else {
        pos = -1;
      }

      if (readOffsets) {
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, START_OFFSET)
            : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + START_OFFSET.length,
            scratch.length - START_OFFSET.length,
            scratchUTF16_2);
        startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, END_OFFSET) : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + END_OFFSET.length,
            scratch.length - END_OFFSET.length,
            scratchUTF16_2);
        endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
      }

      // Remember where the next line starts so it can be un-read if it is not a payload.
      final long fp = in.getFilePointer();
      SimpleTextUtil.readLine(in, scratch);
      if (StringHelper.startsWith(scratch, PAYLOAD)) {
        final int len = scratch.length - PAYLOAD.length;
        if (scratch2.bytes.length < len) {
          scratch2.grow(len);
        }
        // Honor scratch.offset here, as every other read of scratch in this method does.
        System.arraycopy(
            scratch.bytes, scratch.offset + PAYLOAD.length, scratch2.bytes, 0, len);
        scratch2.length = len;
        payload = scratch2;
      } else {
        payload = null;
        in.seek(fp);
      }
      return pos;
    }
 /**
  * Advances {@code in} by {@code count} bytes.
  *
  * <p>An {@code IndexInput} is repositioned with a seek; any other input is drained through the
  * lazily allocated {@code blocks} scratch buffer.
  *
  * @param count number of bytes to skip
  */
 private void skipBytes(long count) throws IOException {
   if (in instanceof IndexInput) {
     final IndexInput seekable = (IndexInput) in;
     seekable.seek(seekable.getFilePointer() + count);
     return;
   }

   if (blocks == null) {
     blocks = new byte[blockSize];
   }
   // No seek available: read and discard one buffer-full at a time.
   long remaining = count;
   while (remaining > 0) {
     final int chunk = (int) Math.min(blocks.length, remaining);
     in.readBytes(blocks, 0, chunk);
     remaining -= chunk;
   }
 }
Пример #12
0
 /**
  * Consumes {@code indexInput} up to its end while mixing random seeks, single-byte reads and
  * bulk reads, asserting before every step that the file pointer matches the locally tracked
  * position.
  *
  * @param indexInput the input to exercise
  */
 private void readIndexInputFullyWithRandomSeeks(IndexInput indexInput) throws IOException {
   BytesRef ref = new BytesRef(scaledRandomIntBetween(1, 1024));
   long pos = 0;
   while (pos < indexInput.length()) {
     assertEquals(pos, indexInput.getFilePointer());
     switch (random().nextInt(5)) {
       case 0: {
         // Seek up to 100 bytes backwards or forwards, clamped to [0, length - 1].
         final int shift = 100 - randomIntBetween(0, 200);
         pos = Math.min(indexInput.length() - 1, Math.max(0, pos + shift));
         indexInput.seek(pos);
         break;
       }
       case 1:
         indexInput.readByte();
         pos++;
         break;
       default: {
         // Bulk read: as much as fits in the buffer, capped by the bytes remaining.
         final int chunk = (int) Math.min(indexInput.length() - pos, ref.bytes.length);
         indexInput.readBytes(ref.bytes, ref.offset, chunk);
         pos += chunk;
         break;
       }
     }
   }
 }
  /**
   * Closes the temporary output, writes its content to {@code output} in chunks via
   * {@code writeChunk}, and removes the temporary file.
   *
   * <p>The temporary input is closed even when reading or writing a chunk fails, so the file
   * handle does not leak on error.
   *
   * @throws IOException if reading the temporary file or writing the output fails
   */
  @Override
  public void close() throws IOException {
    byte[] buffer = new byte[chunkSize];
    tempOut.close();

    final IndexInput in = tempDirectory.openInput(tmpName);
    try {
      long len = closeLength = in.length();
      // A -1 is written where the comment in the original code says the file length goes.
      // NOTE(review): presumably a placeholder patched later using closeLength -- confirm.
      output.writeLong(-1);

      // write configuration
      writeConfig();

      // Read all data and hand it to writeChunk one buffer-full at a time.
      while (len > 0) {
        final int toRead = (int) Math.min(len, buffer.length);

        // Position of this chunk in the original (temporary) stream.
        final long bufferPos = in.getFilePointer();
        in.readBytes(buffer, 0, toRead);

        writeChunk(buffer, bufferPos, toRead);

        len -= toRead;
      }
    } finally {
      in.close();
    }

    if (tempDirectory.fileExists(tmpName)) {
      tempDirectory.deleteFile(tmpName);
    }
    super.close();
  }
Пример #14
0
 /** Returns {@code true} while the file pointer of {@code input} is before its length. */
 public boolean hasNext() {
   final long position = input.getFilePointer();
   return position < input.length();
 }
 /** Returns the file pointer reported by the wrapped {@code delegate}. */
 @Override
 public long getFilePointer() {
   final long position = delegate.getFilePointer();
   return position;
 }
Пример #16
0
 /**
  * Scans this field's postings once, starting at {@code termsStart}, and builds the terms FST.
  *
  * <p>Each term maps to the file pointer right after its TERM line, its doc frequency and its
  * total term frequency. The scan also accumulates {@code sumDocFreq},
  * {@code sumTotalTermFreq}, {@code termCount} and {@code docCount}. It stops at the END line or
  * at the next FIELD line.
  */
 private void loadTerms() throws IOException {
   PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
   final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b;
   final PairOutputs<Long, Long> outputsInner =
       new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs);
   final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs =
       new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner);
   b =
       new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>(
           FST.INPUT_TYPE.BYTE1, outputs);
   // Work on a clone so the shared reader's position is left alone.
   IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
   in.seek(termsStart);
   final BytesRef lastTerm = new BytesRef(10);
   // -1 means no TERM line has been seen yet.
   long lastDocsStart = -1;
   int docFreq = 0;
   long totalTermFreq = 0;
   OpenBitSet visitedDocs = new OpenBitSet();
   final IntsRef scratchIntsRef = new IntsRef();
   while (true) {
     SimpleTextUtil.readLine(in, scratch);
     if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
       // End of this field: flush the last pending term, if any.
       if (lastDocsStart != -1) {
         b.add(
             Util.toIntsRef(lastTerm, scratchIntsRef),
             outputs.newPair(
                 lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
         sumTotalTermFreq += totalTermFreq;
       }
       break;
     } else if (StringHelper.startsWith(scratch, DOC)) {
       docFreq++;
       sumDocFreq++;
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + DOC.length,
           scratch.length - DOC.length,
           scratchUTF16);
       int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
       visitedDocs.set(docID);
     } else if (StringHelper.startsWith(scratch, FREQ)) {
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + FREQ.length,
           scratch.length - FREQ.length,
           scratchUTF16);
       totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
     } else if (StringHelper.startsWith(scratch, TERM)) {
       // A new term starts: flush the previous one before resetting the counters.
       if (lastDocsStart != -1) {
         b.add(
             Util.toIntsRef(lastTerm, scratchIntsRef),
             outputs.newPair(
                 lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
       }
       lastDocsStart = in.getFilePointer();
       final int len = scratch.length - TERM.length;
       if (len > lastTerm.length) {
         lastTerm.grow(len);
       }
       // Honor scratch.offset here, as the DOC and FREQ parsing above does.
       System.arraycopy(scratch.bytes, scratch.offset + TERM.length, lastTerm.bytes, 0, len);
       lastTerm.length = len;
       docFreq = 0;
       sumTotalTermFreq += totalTermFreq;
       totalTermFreq = 0;
       termCount++;
     }
   }
   docCount = (int) visitedDocs.cardinality();
   fst = b.finish();
 }
Пример #17
0
  /**
   * Opens two files from a compound stream, clones each input, and checks that seeking and
   * reading on one original/clone pair never moves the file pointers of the other pair.
   */
  public void testRandomAccessClones() throws IOException {
    setUp_2();
    CompoundFileReader reader = new CompoundFileReader(dir, "f.comp");

    // Two independent files, each with a clone.
    IndexInput first = reader.openInput("f11");
    IndexInput second = reader.openInput("f3");

    IndexInput firstClone = (IndexInput) first.clone();
    IndexInput secondClone = (IndexInput) second.clone();

    // Position the first pair and read one byte from each.
    first.seek(100);
    firstClone.seek(100);
    assertEquals(100, first.getFilePointer());
    assertEquals(100, firstClone.getFilePointer());
    assertEquals(first.readByte(), firstClone.readByte());

    // Position the second pair and read one byte from each.
    second.seek(1027);
    secondClone.seek(1027);
    assertEquals(1027, second.getFilePointer());
    assertEquals(1027, secondClone.getFilePointer());
    assertEquals(second.readByte(), secondClone.readByte());

    // The first pair must be exactly one byte past its seek target.
    assertEquals(101, first.getFilePointer());
    assertEquals(101, firstClone.getFilePointer());
    assertEquals(first.readByte(), firstClone.readByte());

    // Move the first pair again, past the buffer length.
    first.seek(1910);
    firstClone.seek(1910);
    assertEquals(1910, first.getFilePointer());
    assertEquals(1910, firstClone.getFilePointer());
    assertEquals(first.readByte(), firstClone.readByte());

    // The second pair must not have moved in the meantime.
    assertEquals(1028, second.getFilePointer());
    assertEquals(1028, secondClone.getFilePointer());
    assertEquals(second.readByte(), secondClone.readByte());

    // Move the second pair backwards, again crossing the buffer size.
    second.seek(17);
    secondClone.seek(17);
    assertEquals(17, second.getFilePointer());
    assertEquals(17, secondClone.getFilePointer());
    assertEquals(second.readByte(), secondClone.readByte());

    // Finally, the first pair must still be where its last read left it.
    assertEquals(1911, first.getFilePointer());
    assertEquals(1911, firstClone.getFilePointer());
    assertEquals(first.readByte(), firstClone.readByte());

    first.close();
    second.close();
    firstClone.close();
    secondClone.close();
    reader.close();
  }