@Override public int nextDoc() throws IOException { boolean first = true; in.seek(nextDocStart); long posStart = 0; while (true) { final long lineStart = in.getFilePointer(); SimpleTextUtil.readLine(in, scratch); // System.out.println("NEXT DOC: " + scratch.utf8ToString()); if (StringHelper.startsWith(scratch, DOC)) { if (!first && (liveDocs == null || liveDocs.get(docID))) { nextDocStart = lineStart; in.seek(posStart); return docID; } UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16); docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); tf = 0; first = false; } else if (StringHelper.startsWith(scratch, FREQ)) { UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16); tf = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); posStart = in.getFilePointer(); } else if (StringHelper.startsWith(scratch, POS)) { // skip } else if (StringHelper.startsWith(scratch, START_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, END_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, PAYLOAD)) { // skip } else { assert StringHelper.startsWith(scratch, TERM) || StringHelper.startsWith(scratch, FIELD) || StringHelper.startsWith(scratch, END); if (!first && (liveDocs == null || liveDocs.get(docID))) { nextDocStart = lineStart; in.seek(posStart); return docID; } return docID = NO_MORE_DOCS; } } }
@Test public void testReadRandomSampleFile() throws IOException { final int BUFFER_SIZE = 64; Cache cache = cacheManager.getCache(); InfinispanDirectory dir = new InfinispanDirectory(cache, cache, cache, INDEXNAME, BUFFER_SIZE); final int FILE_SIZE = 1000; assert BUFFER_SIZE < FILE_SIZE; createFileWithRepeatableContent(dir, "RandomSampleFile.txt", FILE_SIZE); IndexInput indexInput = dir.openInput("RandomSampleFile.txt"); assert indexInput.length() == FILE_SIZE; RepeatableLongByteSequence bytesGenerator = new RepeatableLongByteSequence(); Random r = new Random(); long seekPoint = 0; // Now it reads some random byte and it compares to the expected byte for (int i = 0; i < FILE_SIZE; i++) { if (seekPoint == i) { byte expectedByte = bytesGenerator.nextByte(); byte actualByte = indexInput.readByte(); assert expectedByte == actualByte; seekPoint = indexInput.getFilePointer() + r.nextInt(10); indexInput.seek(seekPoint); } else { bytesGenerator.nextByte(); } } indexInput.close(); dir.close(); DirectoryIntegrityCheck.verifyDirectoryStructure(cache, INDEXNAME); }
/**
 * Returns the payload of the current position, reading it from {@code payloadIn} on first
 * access.
 *
 * @return {@code null} when no payload is pending; otherwise the shared {@code payload}
 *     instance, filled with {@code payloadLength} bytes
 * @throws IOException if seeking or reading {@code payloadIn} fails
 */
@Override
public BytesRef getPayload() throws IOException {
  if (!payloadPending) {
    return null;
  }
  if (pendingPayloadBytes == 0) {
    // Already loaded by an earlier call.
    return payload;
  }

  assert pendingPayloadBytes >= payloadLength;
  if (pendingPayloadBytes > payloadLength) {
    // Jump over the pending bytes that precede the current payload.
    final long bytesToSkip = pendingPayloadBytes - payloadLength;
    payloadIn.seek(payloadIn.getFilePointer() + bytesToSkip);
  }

  if (payload == null) {
    payload = new BytesRef();
    payload.bytes = new byte[payloadLength];
  } else if (payload.bytes.length < payloadLength) {
    payload.grow(payloadLength);
  }

  payloadIn.readBytes(payload.bytes, 0, payloadLength);
  payload.length = payloadLength;
  pendingPayloadBytes = 0;
  return payload;
}
@Override public int nextDoc() throws IOException { if (docID == NO_MORE_DOCS) { return docID; } boolean first = true; int termFreq = 0; while (true) { final long lineStart = in.getFilePointer(); SimpleTextUtil.readLine(in, scratch); if (StringHelper.startsWith(scratch, DOC)) { if (!first && (liveDocs == null || liveDocs.get(docID))) { in.seek(lineStart); if (!omitTF) { tf = termFreq; } return docID; } UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16); docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); termFreq = 0; first = false; } else if (StringHelper.startsWith(scratch, FREQ)) { UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16); termFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); } else if (StringHelper.startsWith(scratch, POS)) { // skip termFreq++; } else if (StringHelper.startsWith(scratch, START_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, END_OFFSET)) { // skip } else if (StringHelper.startsWith(scratch, PAYLOAD)) { // skip } else { assert StringHelper.startsWith(scratch, TERM) || StringHelper.startsWith(scratch, FIELD) || StringHelper.startsWith(scratch, END) : "scratch=" + scratch.utf8ToString(); if (!first && (liveDocs == null || liveDocs.get(docID))) { in.seek(lineStart); if (!omitTF) { tf = termFreq; } return docID; } return docID = NO_MORE_DOCS; } } }
private void skipField(boolean binary, boolean compressed, int toRead) throws IOException { if (binary || compressed) { long pointer = fieldsStream.getFilePointer(); fieldsStream.seek(pointer + toRead); } else { // We need to skip chars. This will slow us down, but still better fieldsStream.skipChars(toRead); } }
public void testEncodeDecode() throws IOException { final int iterations = RandomInts.randomIntBetween(random(), 1, 1000); final float acceptableOverheadRatio = random().nextFloat(); final int[] values = new int[(iterations - 1) * BLOCK_SIZE + ForUtil.MAX_DATA_SIZE]; for (int i = 0; i < iterations; ++i) { final int bpv = random().nextInt(32); if (bpv == 0) { final int value = RandomInts.randomIntBetween(random(), 0, Integer.MAX_VALUE); for (int j = 0; j < BLOCK_SIZE; ++j) { values[i * BLOCK_SIZE + j] = value; } } else { for (int j = 0; j < BLOCK_SIZE; ++j) { values[i * BLOCK_SIZE + j] = RandomInts.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv)); } } } final Directory d = new RAMDirectory(); final long endPointer; { // encode IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT); final ForUtil forUtil = new ForUtil(acceptableOverheadRatio, out); for (int i = 0; i < iterations; ++i) { forUtil.writeBlock( Arrays.copyOfRange(values, i * BLOCK_SIZE, values.length), new byte[MAX_ENCODED_SIZE], out); } endPointer = out.getFilePointer(); out.close(); } { // decode IndexInput in = d.openInput("test.bin", IOContext.READONCE); final ForUtil forUtil = new ForUtil(in); for (int i = 0; i < iterations; ++i) { if (random().nextBoolean()) { forUtil.skipBlock(in); continue; } final int[] restored = new int[MAX_DATA_SIZE]; forUtil.readBlock(in, new byte[MAX_ENCODED_SIZE], restored); assertArrayEquals( Arrays.copyOfRange(values, i * BLOCK_SIZE, (i + 1) * BLOCK_SIZE), Arrays.copyOf(restored, BLOCK_SIZE)); } assertEquals(endPointer, in.getFilePointer()); in.close(); } }
/**
 * Asserts that two inputs have the same length, the same current position and identical content
 * from that position to the end. The content is compared in chunks of up to 512 bytes.
 *
 * @param msg prefix for assertion failure messages
 * @param expected the reference input
 * @param test the input under test
 * @throws IOException if either input cannot be read
 */
private void assertSameStreams(String msg, IndexInput expected, IndexInput test)
    throws IOException {
  assertNotNull(msg + " null expected", expected);
  assertNotNull(msg + " null test", test);
  assertEquals(msg + " length", expected.length(), test.length());
  assertEquals(msg + " position", expected.getFilePointer(), test.getFilePointer());

  final byte[] expectedChunk = new byte[512];
  final byte[] actualChunk = new byte[expectedChunk.length];
  for (long left = expected.length() - expected.getFilePointer(); left > 0; ) {
    final int chunkLen = (int) Math.min(left, expectedChunk.length);
    expected.readBytes(expectedChunk, 0, chunkLen);
    test.readBytes(actualChunk, 0, chunkLen);
    assertEqualArrays(msg + ", remainder " + left, expectedChunk, actualChunk, 0, chunkLen);
    left -= chunkLen;
  }
}
/* Does initial decode of next block of terms; this doesn't actually decode the docFreq, totalTermFreq, postings details (frq/prx offset, etc.) metadata; it just loads them as byte[] blobs which are then decoded on-demand if the metadata is ever requested for any term in this block. This enables terms-only intensive consumes (eg certain MTQs, respelling) to not pay the price of decoding metadata they won't use. */ private boolean nextBlock() throws IOException { // TODO: we still lazy-decode the byte[] for each // term (the suffix), but, if we decoded // all N terms up front then seeking could do a fast // bsearch w/in the block... // System.out.println("BTR.nextBlock() fp=" + in.getFilePointer() + " this=" + this); state.blockFilePointer = in.getFilePointer(); blockTermCount = in.readVInt(); // System.out.println(" blockTermCount=" + blockTermCount); if (blockTermCount == 0) { return false; } termBlockPrefix = in.readVInt(); // term suffixes: int len = in.readVInt(); if (termSuffixes.length < len) { termSuffixes = new byte[ArrayUtil.oversize(len, 1)]; } // System.out.println(" termSuffixes len=" + len); in.readBytes(termSuffixes, 0, len); termSuffixesReader.reset(termSuffixes, 0, len); // docFreq, totalTermFreq len = in.readVInt(); if (docFreqBytes.length < len) { docFreqBytes = new byte[ArrayUtil.oversize(len, 1)]; } // System.out.println(" freq bytes len=" + len); in.readBytes(docFreqBytes, 0, len); freqReader.reset(docFreqBytes, 0, len); // metadata len = in.readVInt(); if (bytes == null) { bytes = new byte[ArrayUtil.oversize(len, 1)]; bytesReader = new ByteArrayDataInput(); } else if (bytes.length < len) { bytes = new byte[ArrayUtil.oversize(len, 1)]; } in.readBytes(bytes, 0, len); bytesReader.reset(bytes, 0, len); metaDataUpto = 0; state.termBlockOrd = 0; indexIsCurrent = false; // System.out.println(" indexIsCurrent=" + indexIsCurrent); return true; }
private void addFieldLazy( Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException { if (binary == true) { int toRead = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); if (compressed) { // was: doc.add(new Fieldable(fi.name, uncompress(b), Fieldable.Store.COMPRESS)); doc.add(new LazyField(fi.name, Field.Store.COMPRESS, toRead, pointer)); } else { // was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES)); doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer)); } // Need to move the pointer ahead by toRead positions fieldsStream.seek(pointer + toRead); } else { Field.Store store = Field.Store.YES; Field.Index index = getIndexType(fi, tokenize); Field.TermVector termVector = getTermVectorType(fi); Fieldable f; if (compressed) { store = Field.Store.COMPRESS; int toRead = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); f = new LazyField(fi.name, store, toRead, pointer); // skip over the part that we aren't loading fieldsStream.seek(pointer + toRead); f.setOmitNorms(fi.omitNorms); } else { int length = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); // Skip ahead of where we are by the length of what is stored fieldsStream.skipChars(length); f = new LazyField(fi.name, store, index, termVector, length, pointer); f.setOmitNorms(fi.omitNorms); } doc.add(f); } }
/**
 * Reads the next position entry of the current document.
 *
 * <p>Depending on {@code readPositions} and {@code readOffsets}, the POS line and the
 * START_OFFSET/END_OFFSET lines are parsed. The following line is then inspected: if it is a
 * PAYLOAD line its bytes are copied into {@code scratch2} and exposed through {@code payload};
 * otherwise {@code payload} is cleared and the input is rewound to the start of that line.
 *
 * @return the parsed position, or -1 when positions are not being read
 * @throws IOException if reading from or seeking the input fails
 */
@Override
public int nextPosition() throws IOException {
  final int pos;
  if (readPositions) {
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, POS) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes, scratch.offset + POS.length, scratch.length - POS.length, scratchUTF16_2);
    pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
  } else {
    pos = -1;
  }

  if (readOffsets) {
    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, START_OFFSET) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes,
        scratch.offset + START_OFFSET.length,
        scratch.length - START_OFFSET.length,
        scratchUTF16_2);
    startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);

    SimpleTextUtil.readLine(in, scratch);
    assert StringHelper.startsWith(scratch, END_OFFSET) : "got line=" + scratch.utf8ToString();
    UnicodeUtil.UTF8toUTF16(
        scratch.bytes,
        scratch.offset + END_OFFSET.length,
        scratch.length - END_OFFSET.length,
        scratchUTF16_2);
    endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
  }

  // Peek at the next line: it belongs to this position only if it is a PAYLOAD line.
  final long fp = in.getFilePointer();
  SimpleTextUtil.readLine(in, scratch);
  if (StringHelper.startsWith(scratch, PAYLOAD)) {
    final int len = scratch.length - PAYLOAD.length;
    if (scratch2.bytes.length < len) {
      scratch2.grow(len);
    }
    // Honor scratch.offset, as every other read of scratch in this method does.
    System.arraycopy(scratch.bytes, scratch.offset + PAYLOAD.length, scratch2.bytes, 0, len);
    scratch2.length = len;
    payload = scratch2;
  } else {
    payload = null;
    // Not a payload: put the line back for the next reader.
    in.seek(fp);
  }
  return pos;
}
/**
 * Advances {@code in} by {@code count} bytes. An {@code IndexInput} is repositioned with a
 * seek; any other input is drained into the scratch array {@code blocks}, which is allocated
 * with {@code blockSize} bytes on first use.
 *
 * @param count number of bytes to skip
 * @throws IOException if seeking or reading fails
 */
private void skipBytes(long count) throws IOException {
  if (in instanceof IndexInput) {
    final IndexInput seekable = (IndexInput) in;
    seekable.seek(seekable.getFilePointer() + count);
    return;
  }

  if (blocks == null) {
    blocks = new byte[blockSize];
  }
  long remaining = count;
  while (remaining > 0) {
    final int chunk = (int) Math.min(blocks.length, remaining);
    in.readBytes(blocks, 0, chunk);
    remaining -= chunk;
  }
}
/**
 * Reads {@code indexInput} up to its end while randomly mixing relative seeks, single-byte reads
 * and bulk reads, asserting before every step that the input's file pointer matches the
 * position tracked by this method.
 *
 * @param indexInput the input to exercise
 * @throws IOException if seeking or reading fails
 */
private void readIndexInputFullyWithRandomSeeks(IndexInput indexInput) throws IOException {
  final BytesRef buffer = new BytesRef(scaledRandomIntBetween(1, 1024));
  long expectedPos = 0;
  while (expectedPos < indexInput.length()) {
    assertEquals(expectedPos, indexInput.getFilePointer());
    switch (random().nextInt(5)) {
      case 0: {
        // Seek by a shift in [-100, 100], clamped to the valid range of the input.
        final int shift = 100 - randomIntBetween(0, 200);
        expectedPos = Math.min(indexInput.length() - 1, Math.max(0, expectedPos + shift));
        indexInput.seek(expectedPos);
        break;
      }
      case 1:
        indexInput.readByte();
        expectedPos++;
        break;
      default: {
        // Bulk read: as much as fits in the buffer, capped at what is left.
        final int chunk = (int) Math.min(indexInput.length() - expectedPos, buffer.bytes.length);
        indexInput.readBytes(buffer.bytes, buffer.offset, chunk);
        expectedPos += chunk;
        break;
      }
    }
  }
}
/** * closes temporary file, compresses data and removes temporary file. * * @throws IOException */ @Override public void close() throws IOException { byte[] buffer = new byte[chunkSize]; tempOut.close(); // directory with offsets offsets of compressed chunks with // real position in decompressed stream IndexInput in = tempDirectory.openInput(tmpName); long len = closeLength = in.length(); // write length of the file at the begining for easier retreval output.writeLong(-1); // write configuration writeConfig(); int toRead; // read all data and compresse it in variable block chunks while (len > 0) { if (len > buffer.length) { toRead = buffer.length; } else { toRead = (int) len; } // just for safety --- can be improoved long bufferPos = in.getFilePointer(); // read original data in.readBytes(buffer, 0, toRead); writeChunk(buffer, bufferPos, toRead); len -= toRead; } // now let's crate directory entry of all chunks and their's original // position in inflated stream in.close(); if (tempDirectory.fileExists(tmpName)) { tempDirectory.deleteFile(tmpName); } super.close(); }
/**
 * Tells whether the input's file pointer is still before the end of the input.
 *
 * @return {@code true} if the current position is less than the input length
 */
public boolean hasNext() {
  final long position = input.getFilePointer();
  final long end = input.length();
  return position < end;
}
/**
 * Reports the file pointer of the wrapped {@code delegate}.
 *
 * @return the value returned by {@code delegate.getFilePointer()}
 */
@Override
public long getFilePointer() {
  final long delegatePosition = delegate.getFilePointer();
  return delegatePosition;
}
private void loadTerms() throws IOException { PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b; final PairOutputs<Long, Long> outputsInner = new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs); final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs = new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner); b = new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>( FST.INPUT_TYPE.BYTE1, outputs); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRef lastTerm = new BytesRef(10); long lastDocsStart = -1; int docFreq = 0; long totalTermFreq = 0; OpenBitSet visitedDocs = new OpenBitSet(); final IntsRef scratchIntsRef = new IntsRef(); while (true) { SimpleTextUtil.readLine(in, scratch); if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) { if (lastDocsStart != -1) { b.add( Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair( lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); sumTotalTermFreq += totalTermFreq; } break; } else if (StringHelper.startsWith(scratch, DOC)) { docFreq++; sumDocFreq++; UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + DOC.length, scratch.length - DOC.length, scratchUTF16); int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); visitedDocs.set(docID); } else if (StringHelper.startsWith(scratch, FREQ)) { UnicodeUtil.UTF8toUTF16( scratch.bytes, scratch.offset + FREQ.length, scratch.length - FREQ.length, scratchUTF16); totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); } else if (StringHelper.startsWith(scratch, TERM)) { if (lastDocsStart != -1) { b.add( Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair( lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); } lastDocsStart = in.getFilePointer(); final int len = scratch.length - 
TERM.length; if (len > lastTerm.length) { lastTerm.grow(len); } System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len); lastTerm.length = len; docFreq = 0; sumTotalTermFreq += totalTermFreq; totalTermFreq = 0; termCount++; } } docCount = (int) visitedDocs.cardinality(); fst = b.finish(); /* PrintStream ps = new PrintStream("out.dot"); fst.toDot(ps); ps.close(); System.out.println("SAVED out.dot"); */ // System.out.println("FST " + fst.sizeInBytes()); }
/** * This test opens two files from a compound stream and verifies that their file positions are * independent of each other. */ public void testRandomAccessClones() throws IOException { setUp_2(); CompoundFileReader cr = new CompoundFileReader(dir, "f.comp"); // Open two files IndexInput e1 = cr.openInput("f11"); IndexInput e2 = cr.openInput("f3"); IndexInput a1 = (IndexInput) e1.clone(); IndexInput a2 = (IndexInput) e2.clone(); // Seek the first pair e1.seek(100); a1.seek(100); assertEquals(100, e1.getFilePointer()); assertEquals(100, a1.getFilePointer()); byte be1 = e1.readByte(); byte ba1 = a1.readByte(); assertEquals(be1, ba1); // Now seek the second pair e2.seek(1027); a2.seek(1027); assertEquals(1027, e2.getFilePointer()); assertEquals(1027, a2.getFilePointer()); byte be2 = e2.readByte(); byte ba2 = a2.readByte(); assertEquals(be2, ba2); // Now make sure the first one didn't move assertEquals(101, e1.getFilePointer()); assertEquals(101, a1.getFilePointer()); be1 = e1.readByte(); ba1 = a1.readByte(); assertEquals(be1, ba1); // Now more the first one again, past the buffer length e1.seek(1910); a1.seek(1910); assertEquals(1910, e1.getFilePointer()); assertEquals(1910, a1.getFilePointer()); be1 = e1.readByte(); ba1 = a1.readByte(); assertEquals(be1, ba1); // Now make sure the second set didn't move assertEquals(1028, e2.getFilePointer()); assertEquals(1028, a2.getFilePointer()); be2 = e2.readByte(); ba2 = a2.readByte(); assertEquals(be2, ba2); // Move the second set back, again cross the buffer size e2.seek(17); a2.seek(17); assertEquals(17, e2.getFilePointer()); assertEquals(17, a2.getFilePointer()); be2 = e2.readByte(); ba2 = a2.readByte(); assertEquals(be2, ba2); // Finally, make sure the first set didn't move // Now make sure the first one didn't move assertEquals(1911, e1.getFilePointer()); assertEquals(1911, a1.getFilePointer()); be1 = e1.readByte(); ba1 = a1.readByte(); assertEquals(be1, ba1); e1.close(); e2.close(); a1.close(); a2.close(); cr.close(); }