示例#1
0
  /**
   * Scans a random sample of fixed-size blocks of a file. Used in the first phase of up-front
   * partitioning.
   *
   * <p>The file is walked in steps of {@code blockSampleSize} bytes. A candidate block is kept when
   * {@code sampleSucceed(samplingRate)} returns true and skipped otherwise. Each kept block is read
   * into {@code bb} and handed to {@code processByteBuffer(null, out)}. For every block that does
   * not start at offset 0, the bytes up to and including the first newline are skipped, because
   * they belong to a tuple that started in the preceding block.
   *
   * <p>I/O errors are printed to standard error and end the scan; they are not rethrown.
   *
   * @param filename path of the file to sample
   * @param samplingRate value passed to {@code sampleSucceed} for every candidate block; the
   *     probability with which a block is picked
   * @param out stream passed to {@code processByteBuffer}; flushed after each sampled block
   */
  public void scanWithBlockSampling(String filename, double samplingRate, OutputStream out) {
    initScan(blockSampleSize);
    FileChannel ch = IOUtils.openFileChannel(filename);
    try {
      // Offsets at or past the end of the file can never yield data. Bounding the skip loop by
      // the file size guarantees termination even when sampling never succeeds (for example a
      // sampling rate of 0), instead of advancing the offset until it overflows.
      final long fileSize = ch.size();
      for (long position = 0; ; position += blockSampleSize) {
        while (position < fileSize && !sampleSucceed(samplingRate)) {
          position += blockSampleSize;
        }
        ch.position(position);
        nRead = ch.read(bb);
        if (nRead == -1) {
          // End of file reached (also the exit path once position has run past fileSize).
          break;
        }

        byteArrayIdx = previous = 0;
        if (position != 0) {
          // Skip the first (partial) tuple when the block does not start at the beginning of
          // the file.
          while (byteArrayIdx < nRead && byteArray[byteArrayIdx] != newLine) {
            byteArrayIdx++;
          }
          previous = ++byteArrayIdx;
        }

        processByteBuffer(null, out);
        bb.clear();
        out.flush(); // It only helps get an exact profiling?
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // Release the channel even if an unchecked exception escapes the loop.
      IOUtils.closeFileChannel(ch);
    }
    firstPass = false;
  }
示例#2
0
  /**
   * Does a full sequential scan over the entire file. Used in the second phase of up-front
   * partitioning.
   *
   * <p>Repeatedly fills {@code bb} from the file and passes each chunk to
   * {@code processByteBuffer(writer, null)}. Bytes remaining after {@code previous} once a chunk
   * has been processed are copied into {@code brokenLine} and {@code hasLeftover} is set
   * (presumably so the next chunk can complete that line; confirm in processByteBuffer). When the
   * scan ends, timing and line statistics are printed to standard output.
   *
   * <p>I/O errors are printed to standard error and end the scan; they are not rethrown.
   *
   * @param filename path of the file to scan
   * @param writer partition writer handed to {@code processByteBuffer} for every chunk
   */
  public void scan(String filename, PartitionWriter writer) {
    initScan(bufferSize);
    long sStartTime = System.nanoTime(), temp1;
    long readTime = 0, processTime = 0;
    FileChannel ch = IOUtils.openFileChannel(filename);
    int counter = 0; // number of non-empty chunks processed
    try {
      while (true) {
        temp1 = System.nanoTime();
        nRead = ch.read(bb);
        readTime += System.nanoTime() - temp1;

        if (nRead == -1) {
          break; // end of file
        }
        if (nRead == 0) {
          continue; // nothing was read this round; try again
        }

        counter++;

        byteArrayIdx = previous = 0;
        temp1 = System.nanoTime();
        processByteBuffer(writer, null);
        processTime += System.nanoTime() - temp1;

        long startTime = System.nanoTime();
        if (previous < nRead) { // is there a broken line in the end?
          brokenLine = BinaryUtils.getBytes(byteArray, previous, nRead - previous);
          hasLeftover = true;
        }
        brokenTime += System.nanoTime() - startTime;

        startTime = System.nanoTime();
        bb.clear();
        clearTime += System.nanoTime() - startTime;
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // Release the channel even if an unchecked exception escapes the loop.
      IOUtils.closeFileChannel(ch);
    }
    firstPass = false;

    System.out.println("counter:" + counter);
    System.out.println(
        "SCAN: Total Time taken = " + (System.nanoTime() - sStartTime) / 1E9 + " sec");
    System.out.println("Line count = " + lineCount);
    System.out.println("Average line size = " + (double) totalLineSize / lineCount);
    System.out.println("SCAN: Read into buffer time = " + readTime / 1E9);
    System.out.println("SCAN: Process buffer time = " + processTime / 1E9);

    System.out.println("SCAN: Array copy time = " + arrayCopyTime / 1E9);
    System.out.println("SCAN: Get bucket ID time = " + bucketIdTime / 1E9);
    System.out.println("SCAN: Broken line fix time = " + brokenTime / 1E9);
    System.out.println("SCAN: Buffer clear time = " + clearTime / 1E9);
  }