public abstract class AbstractRunGeneratorTest {
  static TestUtils testUtils = new TestUtils();
  static ISerializerDeserializer[] SerDers =
      new ISerializerDeserializer[] {
        IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE
      };
  static RecordDescriptor RecordDesc = new RecordDescriptor(SerDers);
  static Random GRandom = new Random(System.currentTimeMillis());
  static int[] SortFields = new int[] {0, 1};
  static IBinaryComparatorFactory[] ComparatorFactories =
      new IBinaryComparatorFactory[] {
        PointableBinaryComparatorFactory.of(IntegerPointable.FACTORY),
        PointableBinaryComparatorFactory.of(UTF8StringPointable.FACTORY)
      };

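  // Checks that every generated run reports exactly the expected max frame size.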
  static void assertMaxFrameSizesAreAllEqualsTo(
      List<RunAndMaxFrameSizePair> runs, int pageSize) {
    for (RunAndMaxFrameSizePair run : runs) {
      assertTrue(run.maxFrameSize == pageSize);
    }
  }

  abstract AbstractSortRunGenerator getSortRunGenerator(
      IHyracksTaskContext ctx, int frameLimit, int numOfInputRecord) throws HyracksDataException;

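  // Driver shared by all tests: generates roughly pageSize * frameLimit * numRuns
  // bytes of (int, string) records, pushes them through the run generator under
  // test, and verifies the resulting runs are sorted and complete.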
  protected List<RunAndMaxFrameSizePair> testSortRecords(
      int pageSize,
      int frameLimit,
      int numRuns,
      int minRecordSize,
      int maxRecordSize,
      HashMap<Integer, String> specialData)
      throws HyracksDataException {
    IHyracksTaskContext ctx = testUtils.create(pageSize);

    HashMap<Integer, String> keyValuePair = new HashMap<>();
    List<IFrame> frameList = new ArrayList<>();
    prepareData(
        ctx,
        frameList,
        pageSize * frameLimit * numRuns,
        minRecordSize,
        maxRecordSize,
        specialData,
        keyValuePair);
    AbstractSortRunGenerator runGenerator =
        getSortRunGenerator(ctx, frameLimit, keyValuePair.size());
    runGenerator.open();
    for (IFrame frame : frameList) {
      runGenerator.nextFrame(frame.getBuffer());
    }
    runGenerator.close();
    matchResult(ctx, runGenerator.getRuns(), keyValuePair);
    return runGenerator.getRuns();
  }

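  // Re-reads every run with a frame sized to the largest run frame and checks
  // that the data comes back sorted and that no record was lost.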
  static void matchResult(
      IHyracksTaskContext ctx, List<RunAndMaxFrameSizePair> runs, Map<Integer, String> keyValuePair)
      throws HyracksDataException {
    HashMap<Integer, String> copyMap2 = new HashMap<>(keyValuePair);
    int maxFrameSizes = 0;
    for (RunAndMaxFrameSizePair run : runs) {
      maxFrameSizes = Math.max(maxFrameSizes, run.maxFrameSize);
    }
    GroupVSizeFrame gframe = new GroupVSizeFrame(ctx, maxFrameSizes);
    GroupFrameAccessor gfta = new GroupFrameAccessor(ctx.getInitialFrameSize(), RecordDesc);
    assertReadSorted(runs, gfta, gframe, copyMap2);
  }

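  // Deserializes each tuple in the frame, asserts keys are non-decreasing
  // (continuing from preKey) and values match expectations, and removes each
  // verified key; returns the last key seen so the caller can chain frames.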
  static int assertFTADataIsSorted(
      IFrameTupleAccessor fta, Map<Integer, String> keyValuePair, int preKey)
      throws HyracksDataException {

    ByteBufferInputStream bbis = new ByteBufferInputStream();
    DataInputStream di = new DataInputStream(bbis);
    for (int i = 0; i < fta.getTupleCount(); i++) {
      bbis.setByteBuffer(
          fta.getBuffer(),
          fta.getTupleStartOffset(i) + fta.getFieldStartOffset(i, 0) + fta.getFieldSlotsLength());
      int key = (int) RecordDesc.getFields()[0].deserialize(di);
      bbis.setByteBuffer(
          fta.getBuffer(),
          fta.getTupleStartOffset(i) + fta.getFieldStartOffset(i, 1) + fta.getFieldSlotsLength());
      String value = (String) RecordDesc.getFields()[1].deserialize(di);

      assertTrue(keyValuePair.get(key).equals(value));
      keyValuePair.remove(key);
      assertTrue(key >= preKey);
      preKey = key;
    }
    return preKey;
  }

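  // Consumes every run; once all runs are drained, every expected key/value
  // pair must have been removed by assertFTADataIsSorted.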
  static void assertReadSorted(
      List<RunAndMaxFrameSizePair> runs,
      IFrameTupleAccessor fta,
      IFrame frame,
      Map<Integer, String> keyValuePair)
      throws HyracksDataException {

    assertTrue(runs.size() > 0);
    for (RunAndMaxFrameSizePair run : runs) {
      run.run.open();
      int preKey = Integer.MIN_VALUE;
      while (run.run.nextFrame(frame)) {
        fta.reset(frame.getBuffer());
        preKey = assertFTADataIsSorted(fta, keyValuePair, preKey);
      }
      run.run.close();
    }
    assertTrue(keyValuePair.isEmpty());
  }

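  // Fills frameList with serialized records: first any caller-supplied special
  // (oversized) records, each in its own appropriately sized frame, then random
  // records until at least minDataSize bytes of frames have been produced.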
  static void prepareData(
      IHyracksTaskContext ctx,
      List<IFrame> frameList,
      int minDataSize,
      int minRecordSize,
      int maxRecordSize,
      Map<Integer, String> specialData,
      Map<Integer, String> keyValuePair)
      throws HyracksDataException {

    ArrayTupleBuilder tb = new ArrayTupleBuilder(RecordDesc.getFieldCount());
    FrameTupleAppender appender = new FrameTupleAppender();

    int datasize = 0;
    if (specialData != null) {
      for (Map.Entry<Integer, String> entry : specialData.entrySet()) {
        tb.reset();
        tb.addField(IntegerSerializerDeserializer.INSTANCE, entry.getKey());
        tb.addField(UTF8StringSerializerDeserializer.INSTANCE, entry.getValue());

        VSizeFrame frame =
            new VSizeFrame(
                ctx,
                FrameHelper.calcAlignedFrameSizeToStore(
                    tb.getFieldEndOffsets().length, tb.getSize(), ctx.getInitialFrameSize()));
        appender.reset(frame, true);
        assertTrue(appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize()));
        frameList.add(frame);
        datasize += frame.getFrameSize();
      }
      keyValuePair.putAll(specialData);
    }

    VSizeFrame frame = new VSizeFrame(ctx, ctx.getInitialFrameSize());
    appender.reset(frame, true);
    while (datasize < minDataSize) {
      tb.reset();
      int key = GRandom.nextInt(minDataSize + 1);
      if (!keyValuePair.containsKey(key)) {
        String value = generateRandomRecord(minRecordSize, maxRecordSize);
        tb.addField(IntegerSerializerDeserializer.INSTANCE, key);
        tb.addField(UTF8StringSerializerDeserializer.INSTANCE, value);

        if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
          frameList.add(frame);
          datasize += frame.getFrameSize();
          frame =
              new VSizeFrame(
                  ctx,
                  FrameHelper.calcAlignedFrameSizeToStore(
                      tb.getFieldEndOffsets().length, tb.getSize(), ctx.getInitialFrameSize()));
          appender.reset(frame, true);
          assertTrue(appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize()));
        }

        keyValuePair.put(key, value);
      }
    }
    if (appender.getTupleCount() > 0) {
      frameList.add(frame);
    }
  }

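  // Returns a random lowercase string with a length drawn uniformly from
  // [minRecordSize, maxRecordSize].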
  static String generateRandomRecord(int minRecordSize, int maxRecordSize)
      throws HyracksDataException {
    int size = GRandom.nextInt(maxRecordSize - minRecordSize + 1) + minRecordSize;
    return generateRandomFixSizedString(size);
  }

  static String generateRandomFixSizedString(int size) {
    StringBuilder sb = new StringBuilder(size);
    // Original loop (size >= 0) produced size + 1 characters; a fixed-size
    // string should contain exactly 'size' of them.
    for (int i = 0; i < size; i++) {
      char ch = (char) (GRandom.nextInt(26) + 'a'); // random lowercase letter
      sb.append(ch);
    }
    return sb.toString();
  }

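  // Builds (times - 1) oversized values of pageSize, 2 * pageSize, ... bytes
  // under random keys; these force the run generator to allocate bigger frames.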
  static HashMap<Integer, String> generateBigObject(int pageSize, int times) {
    HashMap<Integer, String> map = new HashMap<>(1);
    for (int i = 1; i < times; i++) {
      map.put(GRandom.nextInt(), generateRandomFixSizedString(pageSize * i));
    }
    return map;
  }

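  // Records of pageSize / 8 always fit into a default frame, so every run
  // frame should stay at exactly one page.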
  @Test
  public void testAllSmallRecords() throws HyracksDataException {
    int pageSize = 512;
    int frameLimit = 4;
    int numRuns = 2;
    int minRecordSize = pageSize / 8;
    int maxRecordSize = pageSize / 8;
    List<RunAndMaxFrameSizePair> runs =
        testSortRecords(pageSize, frameLimit, numRuns, minRecordSize, maxRecordSize, null);
    assertMaxFrameSizesAreAllEqualsTo(runs, pageSize);
  }

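  // Each record spans 1.0 to 1.8 pages, so every tuple needs a doubled frame
  // and all run frames should grow to pageSize * 2.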
  @Test
  public void testAllLargeRecords() throws HyracksDataException {
    int pageSize = 2048;
    int frameLimit = 4;
    int numRuns = 2;
    int minRecordSize = pageSize;
    int maxRecordSize = (int) (pageSize * 1.8);
    List<RunAndMaxFrameSizePair> runs =
        testSortRecords(pageSize, frameLimit, numRuns, minRecordSize, maxRecordSize, null);
    assertMaxFrameSizesAreAllEqualsTo(runs, pageSize * 2);
  }

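  // Mixes small records with big objects of one and two pages; the largest
  // tuple should push the biggest run frame to pageSize * (frameLimit - 1).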
  @Test
  public void testMixedLargeRecords() throws HyracksDataException {
    int pageSize = 128;
    int frameLimit = 4;
    int numRuns = 4;
    int minRecordSize = 20;
    int maxRecordSize = pageSize / 2;
    HashMap<Integer, String> specialPair = generateBigObject(pageSize, frameLimit - 1);
    List<RunAndMaxFrameSizePair> runs =
        testSortRecords(pageSize, frameLimit, numRuns, minRecordSize, maxRecordSize, specialPair);

    int max = 0;
    for (RunAndMaxFrameSizePair run : runs) {
      max = Math.max(max, run.maxFrameSize);
    }
    assertTrue(max == pageSize * (frameLimit - 1));
  }

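  // The biggest generated record needs a frame at least as large as the whole
  // frameLimit budget, so run generation is expected to throw.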
  @Test(expected = HyracksDataException.class)
  public void testTooBigRecordWillThrowException() throws HyracksDataException {
    int pageSize = 1024;
    int frameLimit = 8;
    int numRuns = 8;
    HashMap<Integer, String> specialPair = generateBigObject(pageSize, frameLimit);
    int minRecordSize = 10;
    int maxRecordSize = pageSize / 2;
    testSortRecords(pageSize, frameLimit, numRuns, minRecordSize, maxRecordSize, specialPair);
  }
}
Example #2
public class Sort {
  private static class Options {
    @Option(name = "-host", usage = "Hyracks Cluster Controller Host name", required = true)
    public String host;

    @Option(
        name = "-port",
        usage = "Hyracks Cluster Controller Port (default: 1098)",
        required = false)
    public int port = 1098;

    @Option(name = "-frame-size", usage = "Hyracks frame size (default: 32768)", required = false)
    public int frameSize = 32768;

    @Option(
        name = "-frame-limit",
        usage = "memory limit for sorting (default: 4)",
        required = false)
    public int frameLimit = 4;

    @Option(
        name = "-infile-splits",
        usage =
            "Comma separated list of file-splits for the ORDER input. A file-split is <node-name>:<path>",
        required = true)
    public String inFileOrderSplits;

    @Option(
        name = "-outfile-splits",
        usage = "Comma separated list of file-splits for the output",
        required = true)
    public String outFileSplits;

    @Option(
        name = "-membuffer-alg",
        usage = "bestfit, biggestfit, or lastfit (default: lastfit)",
        required = false)
    public String memBufferAlg = "lastfit";

    @Option(name = "-profile", usage = "Enable/Disable profiling. (default: enabled)")
    public boolean profile = true;

    @Option(name = "-topK", usage = "only output topK for each node. (default: not set)")
    public int topK = Integer.MAX_VALUE;

    @Option(name = "-heapSort", usage = "using heap sort for topK result. (default: false)")
    public boolean usingHeapSorter = false;
  }

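  // Sort on the second field first, then the first; both keys are compared
  // (and hashed) as UTF-8 strings.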
  static int[] SortFields = new int[] {1, 0};
  static IBinaryComparatorFactory[] SortFieldsComparatorFactories =
      new IBinaryComparatorFactory[] {
        PointableBinaryComparatorFactory.of(UTF8StringPointable.FACTORY),
        PointableBinaryComparatorFactory.of(UTF8StringPointable.FACTORY)
      };

  static IBinaryHashFunctionFactory[] orderBinaryHashFunctionFactories =
      new IBinaryHashFunctionFactory[] {
        PointableBinaryHashFunctionFactory.of(UTF8StringPointable.FACTORY),
        PointableBinaryHashFunctionFactory.of(UTF8StringPointable.FACTORY)
      };

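  // Parses the command line, connects to the cluster controller, submits the
  // sort job, and reports wall-clock time. A hypothetical invocation (node
  // names and paths are placeholders, not from the original source):
  //   Sort -host cc-host -infile-splits NC1:/data/orders.tbl \
  //        -outfile-splits NC1:/data/orders_sorted -membuffer-alg bestfit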
  public static void main(String[] args) throws Exception {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);
    if (args.length == 0) {
      parser.printUsage(System.err);
      return;
    }
    parser.parseArgument(args);

    IHyracksClientConnection hcc = new HyracksConnection(options.host, options.port);

    JobSpecification job =
        createJob(
            parseFileSplits(options.inFileOrderSplits),
            parseFileSplits(options.outFileSplits),
            options.memBufferAlg,
            options.frameLimit,
            options.frameSize,
            options.topK,
            options.usingHeapSorter);

    long start = System.currentTimeMillis();
    JobId jobId =
        hcc.startJob(
            job,
            options.profile ? EnumSet.of(JobFlag.PROFILE_RUNTIME) : EnumSet.noneOf(JobFlag.class));
    hcc.waitForCompletion(jobId);
    long end = System.currentTimeMillis();
    System.err.println("finished in " + (end - start) + " ms");
  }

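  // Builds the dataflow: file scan -> per-partition sort (top-K heap sort or
  // external merge sort) -> M:N merging connector -> plain file writer.
  // orderParserFactories, ordersDesc, and createPartitionConstraint are shared
  // helpers of the example suite and are not shown in this listing.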
  private static JobSpecification createJob(
      FileSplit[] ordersSplits,
      FileSplit[] outputSplit,
      String memBufferAlg,
      int frameLimit,
      int frameSize,
      int limit,
      boolean usingHeapSorter) {
    JobSpecification spec = new JobSpecification();

    spec.setFrameSize(frameSize);
    IFileSplitProvider ordersSplitProvider = new ConstantFileSplitProvider(ordersSplits);
    FileScanOperatorDescriptor ordScanner =
        new FileScanOperatorDescriptor(
            spec,
            ordersSplitProvider,
            new DelimitedDataTupleParserFactory(orderParserFactories, '|'),
            ordersDesc);
    createPartitionConstraint(spec, ordScanner, ordersSplits);
    AbstractSorterOperatorDescriptor sorter;
    if (usingHeapSorter && limit < Integer.MAX_VALUE) {
      sorter =
          new TopKSorterOperatorDescriptor(
              spec, frameLimit, limit, SortFields, null, SortFieldsComparatorFactories, ordersDesc);
    } else {
      // Map the CLI algorithm name to a free-slot policy; anything other than
      // "bestfit" or "biggestfit" falls back to the default "lastfit".
      EnumFreeSlotPolicy policy;
      if (memBufferAlg.equalsIgnoreCase("bestfit")) {
        policy = EnumFreeSlotPolicy.SMALLEST_FIT;
      } else if (memBufferAlg.equalsIgnoreCase("biggestfit")) {
        policy = EnumFreeSlotPolicy.BIGGEST_FIT;
      } else {
        policy = EnumFreeSlotPolicy.LAST_FIT;
      }
      sorter =
          new ExternalSortOperatorDescriptor(
              spec,
              frameLimit,
              SortFields,
              null,
              SortFieldsComparatorFactories,
              ordersDesc,
              Algorithm.MERGE_SORT,
              policy,
              limit);
    }
    createPartitionConstraint(spec, sorter, ordersSplits);
    IFileSplitProvider outputSplitProvider = new ConstantFileSplitProvider(outputSplit);
    IOperatorDescriptor printer =
        new PlainFileWriterOperatorDescriptor(spec, outputSplitProvider, "|");
    createPartitionConstraint(spec, printer, outputSplit);

    spec.connect(new OneToOneConnectorDescriptor(spec), ordScanner, 0, sorter, 0);

    spec.connect(
        new MToNPartitioningMergingConnectorDescriptor(
            spec,
            new FieldHashPartitionComputerFactory(SortFields, orderBinaryHashFunctionFactories),
            SortFields,
            SortFieldsComparatorFactories,
            new UTF8StringNormalizedKeyComputerFactory()),
        sorter,
        0,
        printer,
        0);

    spec.addRoot(printer);
    return spec;
  }
}