/**
 * Emits an updated record when one of the candidates carries a smaller component id than the
 * one currently stored in field 1 of the {@code current} record.
 *
 * @param candidates records whose field 1 holds a candidate component id
 * @param current the existing record for this key; must contain at least one element
 * @param out collector that receives the updated record, if any
 * @throws Exception if {@code current} is empty
 */
@Override
public void coGroup(
        Iterator<Record> candidates, Iterator<Record> current, Collector<Record> out)
        throws Exception {
    if (!current.hasNext()) {
        throw new Exception("Error: Id not encountered before.");
    }

    final Record existing = current.next();
    final long existingId = existing.getField(1, LongValue.class).getValue();

    // Find the smallest candidate id; stays at Long.MAX_VALUE when there are no candidates.
    long smallest = Long.MAX_VALUE;
    while (candidates.hasNext()) {
        final Record candidate = candidates.next();
        smallest = Math.min(smallest, candidate.getField(1, LongValue.class).getValue());
    }

    // Nothing to emit unless a strictly smaller id was found.
    if (smallest >= existingId) {
        return;
    }

    newComponentId.setValue(smallest);
    existing.setField(1, newComponentId);
    out.collect(existing);
}
/**
 * Projects a "supplier" record.
 *
 * <p>Output Schema: Key: nationkey Value: suppkey
 *
 * @param record input record; field 0 is read as the supplier key and field 1 as the tuple
 * @param out collector that receives the re-keyed record
 */
@Override
public void map(Record record, Collector<Record> out) throws Exception {
    suppKey = record.getField(0, suppKey);
    inputTuple = record.getField(1, inputTuple);

    // Input layout: (suppkey | name, address, nationkey, phone, acctbal, comment),
    // so the nation key is read from tuple position 3.
    final String nationKeyText = inputTuple.getStringValueAt(3);
    final int nationKeyValue = Integer.parseInt(nationKeyText);
    final IntValue nationKey = new IntValue(nationKeyValue);

    record.setField(0, nationKey);
    record.setField(1, suppKey);
    out.collect(record);
}
/**
 * Writes a record as a single text line of the form {@code <key>_<value>} followed by a
 * newline, where key and value are the integer fields 0 and 1 of the record.
 *
 * @param rec the record to write
 * @throws IOException if writing to the underlying stream fails
 */
@Override
public void writeRecord(Record rec) throws IOException {
    final int keyNumber = rec.getField(0, IntValue.class).getValue();
    final int valueNumber = rec.getField(1, IntValue.class).getValue();

    // Reuse the shared builder: clear it, then assemble the line in one chain.
    this.bld.setLength(0);
    this.bld.append(keyNumber).append('_').append(valueNumber).append('\n');

    final String line = this.bld.toString();
    this.stream.write(line.getBytes());
}
/** * Tests the resettable iterator with too little memory, so that the data has to be written to * disk. */ @Test public void testResettableIterator() { try { final AbstractInvokable memOwner = new DummyInvokable(); // create the resettable Iterator SpillingResettableMutableObjectIterator<Record> iterator = new SpillingResettableMutableObjectIterator<Record>( this.reader, this.serializer, this.memman, this.ioman, 2, memOwner); // open the iterator iterator.open(); // now test walking through the iterator int count = 0; Record target = new Record(); while ((target = iterator.next(target)) != null) { Assert.assertEquals( "In initial run, element " + count + " does not match expected value!", count++, target.getField(0, IntValue.class).getValue()); } Assert.assertEquals( "Too few elements were deserialzied in initial run!", NUM_TESTRECORDS, count); // test resetting the iterator a few times for (int j = 0; j < 10; ++j) { count = 0; iterator.reset(); target = new Record(); // now we should get the same results while ((target = iterator.next(target)) != null) { Assert.assertEquals( "After reset nr. " + j + 1 + " element " + count + " does not match expected value!", count++, target.getField(0, IntValue.class).getValue()); } Assert.assertEquals( "Too few elements were deserialzied after reset nr. " + j + 1 + "!", NUM_TESTRECORDS, count); } // close the iterator iterator.close(); } catch (Exception ex) { ex.printStackTrace(); Assert.fail("Test encountered an exception."); } }
/**
 * Parses a text line of the form {@code <key>_<value>} into the given record.
 *
 * @param target the record to fill; field 0 receives the key, field 1 the value
 * @param record buffer holding the raw bytes of the line
 * @param offset start of the line within {@code record}
 * @param numBytes length of the line in bytes
 * @return {@code target}, or {@code null} if the line cannot be parsed
 */
@Override
public Record readRecord(Record target, byte[] record, int offset, int numBytes) {
    final String text = new String(record, offset, numBytes);
    final int separator = text.indexOf("_");

    try {
        // A missing separator (-1) makes substring throw, which is handled below.
        final String keyPart = text.substring(0, separator);
        this.key.setValue(Integer.parseInt(keyPart));

        final String valuePart = text.substring(separator + 1);
        this.value.setValue(Integer.parseInt(valuePart));
    } catch (RuntimeException re) {
        // Malformed line: signal it by returning null.
        return null;
    }

    target.setField(0, this.key);
    target.setField(1, this.value);
    return target;
}
/**
 * Serializes a record as {@code <string>|<tuple>} into the given buffer, using the string
 * form of field 0 and of the tuple in field 1.
 *
 * @param rec the record to serialize
 * @param target the destination buffer
 * @return the number of bytes written, or the negated required length if {@code target} is
 *     too small (nothing is written in that case)
 */
@Override
public int serializeRecord(Record rec, byte[] target) throws Exception {
    final byte[] head = rec.getField(0, StringValue.class).toString().getBytes();
    final byte[] tail = rec.getField(1, Tuple.class).toString().getBytes();

    // One extra byte for the '|' separator between the two parts.
    final int required = head.length + 1 + tail.length;
    if (target.length < required) {
        return -required;
    }

    System.arraycopy(head, 0, target, 0, head.length);
    target[head.length] = '|';
    System.arraycopy(tail, 0, target, head.length + 1, tail.length);
    return required;
}
/**
 * Pulls {@code num} records from the generator and groups the values (field 1) by key
 * (field 0).
 *
 * @param iter the generator to read from
 * @param num how many records to read
 * @return a map from key to all values generated for that key
 */
private Map<TestData.Key, Collection<TestData.Value>> collectData(Generator iter, int num)
        throws Exception {
    final Map<TestData.Key, Collection<TestData.Value>> grouped =
            new HashMap<TestData.Key, Collection<TestData.Value>>();
    final Record reusable = new Record();

    int remaining = num;
    while (remaining-- > 0) {
        iter.next(reusable);

        final TestData.Key currentKey = reusable.getField(0, TestData.Key.class);
        Collection<TestData.Value> bucket = grouped.get(currentKey);
        if (bucket == null) {
            bucket = new ArrayList<TestData.Value>();
            // Store a copy of the key rather than the instance obtained from the record.
            grouped.put(new TestData.Key(currentKey.getKey()), bucket);
        }

        final TestData.Value currentValue = reusable.getField(1, TestData.Value.class);
        bucket.add(new TestData.Value(currentValue.getValue()));
    }
    return grouped;
}
/**
 * Writes all records from the iterator to a text file, one {@code <key>_<value>} line per
 * record, taking key and value from the integer fields 0 and 1.
 *
 * <p>The writer is managed by try-with-resources so that the file handle is released even
 * when reading from the iterator or writing to the file fails part-way through.
 *
 * @param inIt source of the records to write
 * @param inputFilePath path of the file to create
 * @param insertInvalidData if {@code true}, a line that does not follow the record format is
 *     written before the first and after the last record
 * @throws IOException if the file cannot be written or the iterator fails
 */
public static void prepareInputFile(
        MutableObjectIterator<Record> inIt, String inputFilePath, boolean insertInvalidData)
        throws IOException {

    try (BufferedWriter bw = new BufferedWriter(new FileWriter(inputFilePath))) {
        if (insertInvalidData) {
            bw.write("####_I_AM_INVALID_########\n");
        }

        Record rec = new Record();
        while ((rec = inIt.next(rec)) != null) {
            IntValue key = rec.getField(0, IntValue.class);
            IntValue value = rec.getField(1, IntValue.class);

            bw.write(key.getValue() + "_" + value.getValue() + "\n");
        }

        if (insertInvalidData) {
            bw.write("####_I_AM_INVALID_########\n");
        }

        // Flush explicitly so write errors surface here; close() follows automatically.
        bw.flush();
    }
}
@Test public void testSpillingSortWithIntermediateMerge() { try { // amount of pairs final int PAIRS = 10000000; // comparator final Comparator<TestData.Key> keyComparator = new TestData.KeyComparator(); final TestData.Generator generator = new TestData.Generator(SEED, KEY_MAX, VALUE_LENGTH, KeyMode.RANDOM, ValueMode.FIX_LENGTH); final MutableObjectIterator<Record> source = new TestData.GeneratorIterator(generator, PAIRS); // merge iterator LOG.debug("Initializing sortmerger..."); Sorter<Record> merger = new UnilateralSortMerger<Record>( this.memoryManager, this.ioManager, source, this.parentTask, this.pactRecordSerializer, this.pactRecordComparator, (double) 64 / 78, 16, 0.7f); // emit data LOG.debug("Emitting data..."); // check order MutableObjectIterator<Record> iterator = merger.getIterator(); LOG.debug("Checking results..."); int pairsRead = 1; int nextStep = PAIRS / 20; Record rec1 = new Record(); Record rec2 = new Record(); Assert.assertTrue((rec1 = iterator.next(rec1)) != null); while ((rec2 = iterator.next(rec2)) != null) { final Key k1 = rec1.getField(0, TestData.Key.class); final Key k2 = rec2.getField(0, TestData.Key.class); pairsRead++; Assert.assertTrue(keyComparator.compare(k1, k2) <= 0); Record tmp = rec1; rec1 = rec2; k1.setKey(k2.getKey()); rec2 = tmp; // log if (pairsRead == nextStep) { nextStep += PAIRS / 20; } } Assert.assertEquals("Not all pairs were read back in.", PAIRS, pairsRead); merger.close(); testSuccess = true; } catch (Exception e) { e.printStackTrace(); Assert.fail(e.getMessage()); } }
@Test public void testSpillingSort() { try { // comparator final Comparator<TestData.Key> keyComparator = new TestData.KeyComparator(); final TestData.Generator generator = new TestData.Generator( SEED, KEY_MAX, VALUE_LENGTH, KeyMode.RANDOM, ValueMode.CONSTANT, VAL); final MutableObjectIterator<Record> source = new TestData.GeneratorIterator(generator, NUM_PAIRS); // merge iterator LOG.debug("Initializing sortmerger..."); Sorter<Record> merger = new UnilateralSortMerger<Record>( this.memoryManager, this.ioManager, source, this.parentTask, this.pactRecordSerializer, this.pactRecordComparator, (double) 16 / 78, 64, 0.7f); // emit data LOG.debug("Reading and sorting data..."); // check order MutableObjectIterator<Record> iterator = merger.getIterator(); LOG.debug("Checking results..."); int pairsEmitted = 1; Record rec1 = new Record(); Record rec2 = new Record(); Assert.assertTrue((rec1 = iterator.next(rec1)) != null); while ((rec2 = iterator.next(rec2)) != null) { final Key k1 = rec1.getField(0, TestData.Key.class); final Key k2 = rec2.getField(0, TestData.Key.class); pairsEmitted++; Assert.assertTrue(keyComparator.compare(k1, k2) <= 0); Record tmp = rec1; rec1 = rec2; k1.setKey(k2.getKey()); rec2 = tmp; } Assert.assertTrue(NUM_PAIRS == pairsEmitted); merger.close(); testSuccess = true; } catch (Exception e) { e.printStackTrace(); Assert.fail(e.getMessage()); } }
/**
 * Runs a {@code DataSourceTask} over a generated input file that also contains invalid
 * lines, and checks that exactly {@code keyCnt * valCnt} records come out, covering
 * {@code keyCnt} distinct keys with {@code valCnt} distinct values each.
 */
@Test
public void testDataSourceTask() {
    int keyCnt = 100;
    int valCnt = 20;

    this.outList = new ArrayList<Record>();

    // Write the input file; the trailing 'true' requests invalid lines around the data.
    try {
        InputFilePreparator.prepareInputFile(
                new UniformRecordGenerator(keyCnt, valCnt, false), this.tempTestPath, true);
    } catch (IOException e1) {
        Assert.fail("Unable to set-up test input file");
    }

    super.initEnvironment(MEMORY_MANAGER_SIZE, NETWORK_BUFFER_SIZE);
    super.addOutput(this.outList);

    DataSourceTask<Record> testTask = new DataSourceTask<>();
    final String inputUri = new File(tempTestPath).toURI().toString();
    super.registerFileInputTask(testTask, MockInputFormat.class, inputUri, "\n");

    try {
        testTask.invoke();
    } catch (Exception e) {
        System.err.println(e);
        Assert.fail("Invoke method caused exception.");
    }

    Assert.assertTrue(
            "Invalid output size. Expected: "
                    + (keyCnt * valCnt)
                    + " Actual: "
                    + this.outList.size(),
            this.outList.size() == keyCnt * valCnt);

    // Group the distinct values seen for every key.
    HashMap<Integer, HashSet<Integer>> valuesByKey = new HashMap<>(keyCnt);
    for (Record emitted : this.outList) {
        int emittedKey = emitted.getField(0, IntValue.class).getValue();
        int emittedValue = emitted.getField(1, IntValue.class).getValue();

        HashSet<Integer> seen = valuesByKey.get(emittedKey);
        if (seen == null) {
            seen = new HashSet<Integer>();
            valuesByKey.put(emittedKey, seen);
        }
        seen.add(emittedValue);
    }

    Assert.assertTrue(
            "Invalid key count in out file. Expected: "
                    + keyCnt
                    + " Actual: "
                    + valuesByKey.keySet().size(),
            valuesByKey.keySet().size() == keyCnt);

    for (java.util.Map.Entry<Integer, HashSet<Integer>> group : valuesByKey.entrySet()) {
        Assert.assertTrue(
                "Invalid value count for key: "
                        + group.getKey()
                        + ". Expected: "
                        + valCnt
                        + " Actual: "
                        + group.getValue().size(),
                group.getValue().size() == valCnt);
    }
}
/**
 * Stores a copy of the given record in the output collection.
 *
 * @param record the record to collect; the instance itself is not retained
 */
@Override
public void collect(Record record) {
    final Record snapshot = record.createCopy();
    this.output.add(snapshot);
}
@Test public void testMerge() { try { generator1 = new Generator(SEED1, 500, 4096, KeyMode.SORTED, ValueMode.RANDOM_LENGTH); generator2 = new Generator(SEED2, 500, 2048, KeyMode.SORTED, ValueMode.RANDOM_LENGTH); reader1 = new TestData.GeneratorIterator(generator1, INPUT_1_SIZE); reader2 = new TestData.GeneratorIterator(generator2, INPUT_2_SIZE); // collect expected data Map<TestData.Key, Collection<TestData.Value>> expectedValuesMap1 = collectData(generator1, INPUT_1_SIZE); Map<TestData.Key, Collection<TestData.Value>> expectedValuesMap2 = collectData(generator2, INPUT_2_SIZE); Map<TestData.Key, List<Collection<TestData.Value>>> expectedCoGroupsMap = coGroupValues(expectedValuesMap1, expectedValuesMap2); // reset the generators generator1.reset(); generator2.reset(); // compare with iterator values SortMergeCoGroupIterator<Record, Record> iterator = new SortMergeCoGroupIterator<Record, Record>( this.reader1, this.reader2, this.serializer1, this.comparator1, this.serializer2, this.comparator2, this.pairComparator); iterator.open(); final TestData.Key key = new TestData.Key(); while (iterator.next()) { Iterator<Record> iter1 = iterator.getValues1().iterator(); Iterator<Record> iter2 = iterator.getValues2().iterator(); TestData.Value v1 = null; TestData.Value v2 = null; if (iter1.hasNext()) { Record rec = iter1.next(); rec.getFieldInto(0, key); v1 = rec.getField(1, TestData.Value.class); } else if (iter2.hasNext()) { Record rec = iter2.next(); rec.getFieldInto(0, key); v2 = rec.getField(1, TestData.Value.class); } else { Assert.fail("No input on both sides."); } // assert that matches for this key exist Assert.assertTrue("No matches for key " + key, expectedCoGroupsMap.containsKey(key)); Collection<TestData.Value> expValues1 = expectedCoGroupsMap.get(key).get(0); Collection<TestData.Value> expValues2 = expectedCoGroupsMap.get(key).get(1); if (v1 != null) { expValues1.remove(v1); } else { expValues2.remove(v2); } while (iter1.hasNext()) { Record rec = iter1.next(); 
Assert.assertTrue( "Value not in expected set of first input", expValues1.remove(rec.getField(1, TestData.Value.class))); } Assert.assertTrue("Expected set of first input not empty", expValues1.isEmpty()); while (iter2.hasNext()) { Record rec = iter2.next(); Assert.assertTrue( "Value not in expected set of second input", expValues2.remove(rec.getField(1, TestData.Value.class))); } Assert.assertTrue("Expected set of second input not empty", expValues2.isEmpty()); expectedCoGroupsMap.remove(key); } iterator.close(); Assert.assertTrue("Expected key set not empty", expectedCoGroupsMap.isEmpty()); } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } }