/** * Experimental parser built to leverage multiple cores and keep up with the speed of modern SSDs * * @param fileChannel * @throws IOException */ public void extract(FileChannel fileChannel) throws IOException { MappedByteBuffer mappedBuffer; long fileSize = fileChannel.size(); long position = 0; int tailPadding = 8; // needed to cover the transition long blockSize = 1 << 25; TypeExtractor typeExtractor = new TypeExtractor(true /* force ASCII */); RecordFieldExtractor rfe = new RecordFieldExtractor(); mappedBuffer = fileChannel.map( FileChannel.MapMode.READ_ONLY, position, Math.min(blockSize, fileSize - position)); int padding = tailPadding; do { if (mappedBuffer.limit() + position == fileSize) { padding = 0; } int pos = 0; Pipe.setValue( rb.structuredLayoutRingBuffer, rb.mask, Pipe.getWorkingHeadPositionObject(rb).value++, pos); int tokenCount = 0; int c = 0; int j = mappedBuffer.remaining() - padding; do { // walk over the data while we have this section mapped. c++; byte b = (byte) mappedBuffer.get(); // RecordFieldExtractor.appendContent(rfe, b); //TOO much work here must do on reading // thread. // TODO: check the field type sums // TODO: zero copy but we need to discover tokens // splits on returns, commas, dots and many other punctuation if (b < 48) { // System.err.println("char :"+b); // what mask can be built to combine the byte we are after. // allTheBits++; //do something pos = mappedBuffer.position(); Pipe.setValue( rb.structuredLayoutRingBuffer, rb.mask, Pipe.getWorkingHeadPositionObject(rb).value++, pos); if ((++tokenCount & 0xF) == 0) { Pipe.publishWrites(rb); } // rb.reset(); } } while (--j > 0); // this tokenizer assumes that the file ends with a field delimiter so the last record gets // flushed. // TODO: need to wait for threads to finish before swapping to new page or have multiple pages // to swap in/out // only increment by exactly how many bytes were read assuming we started at zero // can only cut at the last known record start position += c; System.out.println("bytes read so far:" + position); mappedBuffer = fileChannel.map( FileChannel.MapMode.READ_ONLY, position, Math.min(blockSize, fileSize - position)); } while (position < fileSize); }