/** * Picks a block with samplingRate probability Used in the first phase of up-front partitioning. * * @param filename * @param samplingRate */ public void scanWithBlockSampling(String filename, double samplingRate, OutputStream out) { initScan(blockSampleSize); FileChannel ch = IOUtils.openFileChannel(filename); try { for (long position = 0; ; position += blockSampleSize) { while (sampleSucceed(samplingRate) == false) { position += blockSampleSize; } ch.position(position); if ((nRead = ch.read(bb)) == -1) { break; } byteArrayIdx = previous = 0; if (position != 0) { // skip the first tuple if not starting of file. while (byteArrayIdx < nRead && byteArray[byteArrayIdx] != newLine) { byteArrayIdx++; } previous = ++byteArrayIdx; } processByteBuffer(null, out); bb.clear(); out.flush(); // It only helps get an exact profiling? } } catch (IOException e) { e.printStackTrace(); } IOUtils.closeFileChannel(ch); firstPass = false; }
/** * Does a full scan over entire data. Used in second phase of up-front partitioning. * * @param filename * @param writer */ public void scan(String filename, PartitionWriter writer) { initScan(bufferSize); long sStartTime = System.nanoTime(), temp1; long readTime = 0, processTime = 0; FileChannel ch = IOUtils.openFileChannel(filename); int counter = 0; try { while (true) { temp1 = System.nanoTime(); boolean allGood = ((nRead = ch.read(bb)) != -1); readTime += System.nanoTime() - temp1; if (!allGood) break; if (nRead == 0) continue; counter++; byteArrayIdx = previous = 0; temp1 = System.nanoTime(); processByteBuffer(writer, null); processTime += System.nanoTime() - temp1; long startTime = System.nanoTime(); if (previous < nRead) { // is there a broken line in the end? brokenLine = BinaryUtils.getBytes(byteArray, previous, nRead - previous); hasLeftover = true; } brokenTime += System.nanoTime() - startTime; startTime = System.nanoTime(); bb.clear(); clearTime += System.nanoTime() - startTime; } } catch (IOException e) { e.printStackTrace(); } IOUtils.closeFileChannel(ch); firstPass = false; System.out.println("counter:" + counter); System.out.println( "SCAN: Total Time taken = " + (System.nanoTime() - sStartTime) / 1E9 + " sec"); System.out.println("Line count = " + lineCount); System.out.println("Average line size = " + (double) totalLineSize / lineCount); System.out.println("SCAN: Read into buffer time = " + readTime / 1E9); System.out.println("SCAN: Process buffer time = " + processTime / 1E9); System.out.println("SCAN: Array copy time = " + arrayCopyTime / 1E9); System.out.println("SCAN: Get bucket ID time = " + bucketIdTime / 1E9); System.out.println("SCAN: Broken line fix time = " + brokenTime / 1E9); System.out.println("SCAN: Buffer clear time = " + clearTime / 1E9); }