/**
 * Base test for sort run generators: fills frames with random (int key, string value)
 * records, pushes them through the generator under test, and checks that every
 * generated run is sorted and that no record was lost or corrupted.
 */
public abstract class AbstractRunGeneratorTest {
    static TestUtils testUtils = new TestUtils();
    static ISerializerDeserializer[] SerDers = new ISerializerDeserializer[] {
            IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE };
    static RecordDescriptor RecordDesc = new RecordDescriptor(SerDers);
    static Random GRandom = new Random(System.currentTimeMillis());
    static int[] SortFields = new int[] { 0, 1 };
    static IBinaryComparatorFactory[] ComparatorFactories = new IBinaryComparatorFactory[] {
            PointableBinaryComparatorFactory.of(IntegerPointable.FACTORY),
            PointableBinaryComparatorFactory.of(UTF8StringPointable.FACTORY) };

    static void assertMaxFrameSizesAreAllEqualsTo(List<RunAndMaxFrameSizePair> maxSize, int pageSize) {
        for (int i = 0; i < maxSize.size(); i++) {
            assertTrue(maxSize.get(i).maxFrameSize == pageSize);
        }
    }

    abstract AbstractSortRunGenerator getSortRunGenerator(IHyracksTaskContext ctx, int frameLimit,
            int numOfInputRecord) throws HyracksDataException;

    protected List<RunAndMaxFrameSizePair> testSortRecords(int pageSize, int frameLimit, int numRuns,
            int minRecordSize, int maxRecordSize, HashMap<Integer, String> specialData)
            throws HyracksDataException {
        IHyracksTaskContext ctx = testUtils.create(pageSize);

        // Generate enough input frames to force roughly numRuns runs.
        HashMap<Integer, String> keyValuePair = new HashMap<>();
        List<IFrame> frameList = new ArrayList<>();
        prepareData(ctx, frameList, pageSize * frameLimit * numRuns, minRecordSize, maxRecordSize,
                specialData, keyValuePair);

        AbstractSortRunGenerator runGenerator = getSortRunGenerator(ctx, frameLimit, keyValuePair.size());
        runGenerator.open();
        for (IFrame frame : frameList) {
            runGenerator.nextFrame(frame.getBuffer());
        }
        runGenerator.close();

        matchResult(ctx, runGenerator.getRuns(), keyValuePair);
        return runGenerator.getRuns();
    }

    static void matchResult(IHyracksTaskContext ctx, List<RunAndMaxFrameSizePair> runs,
            Map<Integer, String> keyValuePair) throws HyracksDataException {
        HashMap<Integer, String> copyMap2 = new HashMap<>(keyValuePair);
        // Size the reader frame to the largest frame seen across all runs.
        int maxFrameSizes = 0;
        for (RunAndMaxFrameSizePair run : runs) {
            maxFrameSizes = Math.max(maxFrameSizes, run.maxFrameSize);
        }
        GroupVSizeFrame gframe = new GroupVSizeFrame(ctx, maxFrameSizes);
        GroupFrameAccessor gfta = new GroupFrameAccessor(ctx.getInitialFrameSize(), RecordDesc);
        assertReadSorted(runs, gfta, gframe, copyMap2);
    }

    static int assertFTADataIsSorted(IFrameTupleAccessor fta, Map<Integer, String> keyValuePair, int preKey)
            throws HyracksDataException {
        ByteBufferInputStream bbis = new ByteBufferInputStream();
        DataInputStream di = new DataInputStream(bbis);
        for (int i = 0; i < fta.getTupleCount(); i++) {
            bbis.setByteBuffer(fta.getBuffer(),
                    fta.getTupleStartOffset(i) + fta.getFieldStartOffset(i, 0) + fta.getFieldSlotsLength());
            int key = (int) RecordDesc.getFields()[0].deserialize(di);
            bbis.setByteBuffer(fta.getBuffer(),
                    fta.getTupleStartOffset(i) + fta.getFieldStartOffset(i, 1) + fta.getFieldSlotsLength());
            String value = (String) RecordDesc.getFields()[1].deserialize(di);

            assertEquals(keyValuePair.get(key), value);
            keyValuePair.remove(key);
            assertTrue(key >= preKey);
            preKey = key;
        }
        return preKey;
    }

    static void assertReadSorted(List<RunAndMaxFrameSizePair> runs, IFrameTupleAccessor fta, IFrame frame,
            Map<Integer, String> keyValuePair) throws HyracksDataException {
        assertTrue(runs.size() > 0);
        for (RunAndMaxFrameSizePair run : runs) {
            run.run.open();
            int preKey = Integer.MIN_VALUE;
            while (run.run.nextFrame(frame)) {
                fta.reset(frame.getBuffer());
                preKey = assertFTADataIsSorted(fta, keyValuePair, preKey);
            }
            run.run.close();
        }
        // Every generated record must appear in exactly one run.
        assertTrue(keyValuePair.isEmpty());
    }

    static void prepareData(IHyracksTaskContext ctx, List<IFrame> frameList, int minDataSize, int minRecordSize,
            int maxRecordSize, Map<Integer, String> specialData, Map<Integer, String> keyValuePair)
            throws HyracksDataException {
        ArrayTupleBuilder tb = new ArrayTupleBuilder(RecordDesc.getFieldCount());
        FrameTupleAppender appender = new FrameTupleAppender();

        int datasize = 0;
        if (specialData != null) {
            // Each special (typically oversized) record gets its own frame, sized to fit it.
            for (Map.Entry<Integer, String> entry : specialData.entrySet()) {
                tb.reset();
                tb.addField(IntegerSerializerDeserializer.INSTANCE, entry.getKey());
                tb.addField(UTF8StringSerializerDeserializer.INSTANCE, entry.getValue());

                VSizeFrame frame = new VSizeFrame(ctx, FrameHelper.calcAlignedFrameSizeToStore(
                        tb.getFieldEndOffsets().length, tb.getSize(), ctx.getInitialFrameSize()));
                appender.reset(frame, true);
                assertTrue(appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize()));
                frameList.add(frame);
                datasize += frame.getFrameSize();
            }
            keyValuePair.putAll(specialData);
        }

        // Append random records with unique keys until at least minDataSize bytes are queued.
        VSizeFrame frame = new VSizeFrame(ctx, ctx.getInitialFrameSize());
        appender.reset(frame, true);
        while (datasize < minDataSize) {
            tb.reset();
            int key = GRandom.nextInt(minDataSize + 1);
            if (!keyValuePair.containsKey(key)) {
                String value = generateRandomRecord(minRecordSize, maxRecordSize);
                tb.addField(IntegerSerializerDeserializer.INSTANCE, key);
                tb.addField(UTF8StringSerializerDeserializer.INSTANCE, value);
                if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                    // Current frame is full: queue it and start a new frame big enough for this record.
                    frameList.add(frame);
                    datasize += frame.getFrameSize();
                    frame = new VSizeFrame(ctx, FrameHelper.calcAlignedFrameSizeToStore(
                            tb.getFieldEndOffsets().length, tb.getSize(), ctx.getInitialFrameSize()));
                    appender.reset(frame, true);
                    assertTrue(appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize()));
                }
                keyValuePair.put(key, value);
            }
        }
        if (appender.getTupleCount() > 0) {
            frameList.add(frame);
        }
    }

    static String generateRandomRecord(int minRecordSize, int maxRecordSize) throws HyracksDataException {
        int size = GRandom.nextInt(maxRecordSize - minRecordSize + 1) + minRecordSize;
        return generateRandomFixSizedString(size);
    }

    static String generateRandomFixSizedString(int size) {
        StringBuilder sb = new StringBuilder(size);
        for (; size > 0; --size) { // 'size > 0', not 'size >= 0': produce exactly 'size' characters
            char ch = (char) (GRandom.nextInt(26) + 97); // random lower-case letter
            sb.append(ch);
        }
        return sb.toString();
    }

    static HashMap<Integer, String> generateBigObject(int pageSize, int times) {
        // Produce (times - 1) records whose sizes are pageSize, 2 * pageSize, ..., (times - 1) * pageSize.
        HashMap<Integer, String> map = new HashMap<>(1);
        for (int i = 1; i < times; i++) {
            map.put(GRandom.nextInt(), generateRandomFixSizedString(pageSize * i));
        }
        return map;
    }

    @Test
    public void testAllSmallRecords() throws HyracksDataException {
        int pageSize = 512;
        int frameLimit = 4;
        int numRuns = 2;
        int minRecordSize = pageSize / 8;
        int maxRecordSize = pageSize / 8;
        List<RunAndMaxFrameSizePair> maxSize =
                testSortRecords(pageSize, frameLimit, numRuns, minRecordSize, maxRecordSize, null);
        assertMaxFrameSizesAreAllEqualsTo(maxSize, pageSize);
    }

    @Test
    public void testAllLargeRecords() throws HyracksDataException {
        int pageSize = 2048;
        int frameLimit = 4;
        int numRuns = 2;
        int minRecordSize = pageSize;
        int maxRecordSize = (int) (pageSize * 1.8);
        List<RunAndMaxFrameSizePair> size =
                testSortRecords(pageSize, frameLimit, numRuns, minRecordSize, maxRecordSize, null);
        assertMaxFrameSizesAreAllEqualsTo(size, pageSize * 2);
    }

    @Test
    public void testMixedLargeRecords() throws HyracksDataException {
        int pageSize = 128;
        int frameLimit = 4;
        int numRuns = 4;
        int minRecordSize = 20;
        int maxRecordSize = pageSize / 2;
        HashMap<Integer, String> specialPair = generateBigObject(pageSize, frameLimit - 1);
        List<RunAndMaxFrameSizePair> size =
                testSortRecords(pageSize, frameLimit, numRuns, minRecordSize, maxRecordSize, specialPair);

        int max = 0;
        for (RunAndMaxFrameSizePair run : size) {
            max = Math.max(max, run.maxFrameSize);
        }
        assertTrue(max == pageSize * (frameLimit - 1));
    }

    @Test(expected = HyracksDataException.class)
    public void testTooBigRecordWillThrowException() throws HyracksDataException {
        int pageSize = 1024;
        int frameLimit = 8;
        int numRuns = 8;
        // The biggest generated record cannot fit in the sorter's frame budget, so this must throw.
        HashMap<Integer, String> specialPair = generateBigObject(pageSize, frameLimit);
        int minRecordSize = 10;
        int maxRecordSize = pageSize / 2;
        testSortRecords(pageSize, frameLimit, numRuns, minRecordSize, maxRecordSize, specialPair);
    }
}
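// A minimal sketch of a concrete subclass, for illustration only. It assumes an
// ExternalSortRunGenerator constructor of the form (ctx, sortFields,
// firstKeyNormalizerFactory, comparatorFactories, recordDescriptor, algorithm,
// frameLimit); verify the signature against your Hyracks version. It reuses the
// static SortFields/ComparatorFactories/RecordDesc fields defined above.
class ExternalSortRunGeneratorTest extends AbstractRunGeneratorTest {
    @Override
    AbstractSortRunGenerator getSortRunGenerator(IHyracksTaskContext ctx, int frameLimit, int numOfInputRecord)
            throws HyracksDataException {
        // Plain external merge sort; 'null' means no normalized-key accelerator.
        return new ExternalSortRunGenerator(ctx, SortFields, null, ComparatorFactories, RecordDesc,
                Algorithm.MERGE_SORT, frameLimit);
    }
}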
/**
 * Standalone Hyracks client that scans delimited input file splits, sorts the records
 * with an external (or top-K) sort, and writes the globally merged result to the
 * output file splits.
 */
public class Sort {
    private static class Options {
        @Option(name = "-host", usage = "Hyracks Cluster Controller Host name", required = true)
        public String host;

        @Option(name = "-port", usage = "Hyracks Cluster Controller Port (default: 1098)", required = false)
        public int port = 1098;

        @Option(name = "-frame-size", usage = "Hyracks frame size (default: 32768)", required = false)
        public int frameSize = 32768;

        @Option(name = "-frame-limit", usage = "memory limit for sorting (default: 4)", required = false)
        public int frameLimit = 4;

        @Option(name = "-infile-splits", usage = "Comma separated list of file-splits for the ORDER input. A file-split is <node-name>:<path>", required = true)
        public String inFileOrderSplits;

        @Option(name = "-outfile-splits", usage = "Comma separated list of file-splits for the output", required = true)
        public String outFileSplits;

        @Option(name = "-membuffer-alg", usage = "bestfit, biggestfit or lastfit (default: lastfit)", required = false)
        public String memBufferAlg = "lastfit";

        @Option(name = "-profile", usage = "Enable/Disable profiling. (default: enabled)")
        public boolean profile = true;

        @Option(name = "-topK", usage = "only output topK for each node. (default: not set)")
        public int topK = Integer.MAX_VALUE;

        @Option(name = "-heapSort", usage = "use heap sort for the topK result. (default: false)")
        public boolean usingHeapSorter = false;
    }

    static int[] SortFields = new int[] { 1, 0 };
    static IBinaryComparatorFactory[] SortFieldsComparatorFactories = new IBinaryComparatorFactory[] {
            PointableBinaryComparatorFactory.of(UTF8StringPointable.FACTORY),
            PointableBinaryComparatorFactory.of(UTF8StringPointable.FACTORY) };
    static IBinaryHashFunctionFactory[] orderBinaryHashFunctionFactories = new IBinaryHashFunctionFactory[] {
            PointableBinaryHashFunctionFactory.of(UTF8StringPointable.FACTORY),
            PointableBinaryHashFunctionFactory.of(UTF8StringPointable.FACTORY) };

    public static void main(String[] args) throws Exception {
        Options options = new Options();
        CmdLineParser parser = new CmdLineParser(options);
        if (args.length == 0) {
            parser.printUsage(System.err);
            return;
        }
        parser.parseArgument(args);

        IHyracksClientConnection hcc = new HyracksConnection(options.host, options.port);
        JobSpecification job = createJob(parseFileSplits(options.inFileOrderSplits),
                parseFileSplits(options.outFileSplits), options.memBufferAlg, options.frameLimit,
                options.frameSize, options.topK, options.usingHeapSorter);

        long start = System.currentTimeMillis();
        JobId jobId = hcc.startJob(job,
                options.profile ? EnumSet.of(JobFlag.PROFILE_RUNTIME) : EnumSet.noneOf(JobFlag.class));
        hcc.waitForCompletion(jobId);
        long end = System.currentTimeMillis();
        System.err.println("finished in: " + (end - start) + " ms");
    }

    private static JobSpecification createJob(FileSplit[] ordersSplits, FileSplit[] outputSplit,
            String memBufferAlg, int frameLimit, int frameSize, int limit, boolean usingHeapSorter) {
        JobSpecification spec = new JobSpecification();
        spec.setFrameSize(frameSize);

        // Scan the input splits. (parseFileSplits, createPartitionConstraint, orderParserFactories
        // and ordersDesc are defined with the other members of this class; they are not shown here.)
        IFileSplitProvider ordersSplitProvider = new ConstantFileSplitProvider(ordersSplits);
        FileScanOperatorDescriptor ordScanner = new FileScanOperatorDescriptor(spec, ordersSplitProvider,
                new DelimitedDataTupleParserFactory(orderParserFactories, '|'), ordersDesc);
        createPartitionConstraint(spec, ordScanner, ordersSplits);

        // Pick the sorter: a dedicated top-K heap sorter when requested, otherwise an
        // external sort whose in-memory free-slot policy is chosen by -membuffer-alg.
        AbstractSorterOperatorDescriptor sorter;
        if (usingHeapSorter && limit < Integer.MAX_VALUE) {
            sorter = new TopKSorterOperatorDescriptor(spec, frameLimit, limit, SortFields, null,
                    SortFieldsComparatorFactories, ordersDesc);
        } else {
            if (memBufferAlg.equalsIgnoreCase("bestfit")) {
                sorter = new ExternalSortOperatorDescriptor(spec, frameLimit, SortFields, null,
                        SortFieldsComparatorFactories, ordersDesc, Algorithm.MERGE_SORT,
                        EnumFreeSlotPolicy.SMALLEST_FIT, limit);
            } else if (memBufferAlg.equalsIgnoreCase("biggestfit")) {
                sorter = new ExternalSortOperatorDescriptor(spec, frameLimit, SortFields, null,
                        SortFieldsComparatorFactories, ordersDesc, Algorithm.MERGE_SORT,
                        EnumFreeSlotPolicy.BIGGEST_FIT, limit);
            } else {
                sorter = new ExternalSortOperatorDescriptor(spec, frameLimit, SortFields, null,
                        SortFieldsComparatorFactories, ordersDesc, Algorithm.MERGE_SORT,
                        EnumFreeSlotPolicy.LAST_FIT, limit);
            }
        }
        createPartitionConstraint(spec, sorter, ordersSplits);

        // Write the sorted output.
        IFileSplitProvider outputSplitProvider = new ConstantFileSplitProvider(outputSplit);
        IOperatorDescriptor printer = new PlainFileWriterOperatorDescriptor(spec, outputSplitProvider, "|");
        createPartitionConstraint(spec, printer, outputSplit);

        // Scanner feeds the sorter locally; the sorter feeds the printer through a
        // partitioning-merging connector so the sorted order is preserved globally.
        spec.connect(new OneToOneConnectorDescriptor(spec), ordScanner, 0, sorter, 0);
        spec.connect(
                new MToNPartitioningMergingConnectorDescriptor(spec,
                        new FieldHashPartitionComputerFactory(SortFields, orderBinaryHashFunctionFactories),
                        SortFields, SortFieldsComparatorFactories,
                        new UTF8StringNormalizedKeyComputerFactory()),
                sorter, 0, printer, 0);

        spec.addRoot(printer);
        return spec;
    }
}
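// Example invocation (hypothetical host, node name, and paths, shown for illustration
// only; the flags come from the Options class above):
//
//   java -cp <hyracks-example-jars> Sort -host 127.0.0.1 -port 1098 \
//       -frame-size 32768 -frame-limit 4 \
//       -infile-splits NC1:/data/orders.tbl -outfile-splits NC1:/data/orders_sorted \
//       -membuffer-alg bestfit -topK 100 -heapSort
//
// With both -topK and -heapSort set, the job uses the TopKSorterOperatorDescriptor;
// otherwise it falls back to an ExternalSortOperatorDescriptor with the free-slot
// policy selected by -membuffer-alg (bestfit, biggestfit, or the lastfit default).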