// See PIG-1434 @Test public void testScalarAliasesFilterClause() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20", "4\t12", "5\t8"}; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesFilterClause"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0, a1);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate AVG(A.$1) as average;"); pigServer.registerQuery("Y = filter A by a1 > C.average;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); // Average is 11 Tuple t = iter.next(); assertTrue(t.toString().equals("(3,20)")); t = iter.next(); assertTrue(t.toString().equals("(4,12)")); assertFalse(iter.hasNext()); }
// See PIG-1434 @Test public void testScalarWithNoSchemaDollarProj() throws Exception { String[] scalarInput = {"1\t5"}; String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProj"; TestScalarAliases.createLocalInputFile(inputPath, input); String inputPathScalar = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProjScalar"; TestScalarAliases.createLocalInputFile(inputPathScalar, scalarInput); // Load A as a scalar pigServer.registerQuery("A = LOAD '" + inputPath + "';"); pigServer.registerQuery("scalar = LOAD '" + inputPathScalar + "';"); pigServer.registerQuery("B = foreach A generate 5 / scalar.$1;"); Iterator<Tuple> iter = pigServer.openIterator("B"); Tuple t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); assertFalse(iter.hasNext()); }
public void testRecursiveFileListing() throws IOException {
    String LOCAL_INPUT_FILE = "test.dat";
    String INPUT_FILE = "foo/bar/test.dat";
    // Write the local fixture; close the writer on all paths so the file
    // handle is never leaked if a println throws.
    PrintWriter w = new PrintWriter(new FileWriter(LOCAL_INPUT_FILE));
    try {
        w.println("1");
        w.println("2");
        w.println("3");
        w.println("5");
    } finally {
        w.close();
    }
    try {
        Util.copyFromLocalToCluster(cluster, LOCAL_INPUT_FILE, INPUT_FILE);
        // Loading the directory 'foo' should recursively pick up foo/bar/test.dat.
        pigServer.registerQuery("a = load 'foo' as (nums:chararray);");
        pigServer.registerQuery("b = load 'foo' as (nums:chararray);");
        pigServer.registerQuery("d = join a by nums, b by nums USING 'skewed';");
        Iterator<Tuple> iter = pigServer.openIterator("d");
        int count = 0;
        while (iter.hasNext()) {
            iter.next();
            count++;
        }
        // 4 distinct keys self-joined -> 4 result rows.
        Assert.assertEquals(4, count);
    } finally {
        // Clean up local and cluster copies even when the assertion fails.
        new File(LOCAL_INPUT_FILE).delete();
        Util.deleteFile(cluster, INPUT_FILE);
    }
}
/**
 * Tests that '\n', '\r' and '\r\n' are treated as record delims when using bzip just like they
 * are when using uncompressed text
 */
@Test
public void testRecordDelims() throws Exception {
    String[] inputData = new String[] {
        "1\t2\r3\t4", // '\r' case - this will be split into two tuples
        "5\t6\r", // '\r\n' case
        "7\t8", // '\n' case
        "9\t10\r" // '\r\n' at the end of file
    };

    // bzip compressed input
    File in = File.createTempFile("junit", ".bz2");
    String compressedInputFileName = in.getAbsolutePath();
    in.deleteOnExit();
    String clusterCompressedFilePath = Util.removeColon(compressedInputFileName);

    // The same records are also written uncompressed; the test compares the
    // two reads tuple-by-tuple below.
    String unCompressedInputFileName = "testRecordDelims-uncomp.txt";
    Util.createInputFile(cluster, unCompressedInputFileName, inputData);

    try {
        // Write each record followed by '\n' into the local bzip file, then
        // push it to the cluster.
        CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
        for (int i = 0; i < inputData.length; i++) {
            StringBuffer sb = new StringBuffer();
            sb.append(inputData[i]).append("\n");
            byte bytes[] = sb.toString().getBytes();
            cos.write(bytes);
        }
        cos.close();
        Util.copyFromLocalToCluster(cluster, compressedInputFileName, clusterCompressedFilePath);

        // pig script to read uncompressed input
        String script = "a = load '" + unCompressedInputFileName + "';";
        PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        pig.registerQuery(script);
        Iterator<Tuple> it1 = pig.openIterator("a");

        // pig script to read compressed input
        script = "a = load '" + Util.encodeEscape(clusterCompressedFilePath) + "';";
        pig.registerQuery(script);
        Iterator<Tuple> it2 = pig.openIterator("a");

        // Compressed and uncompressed reads must yield identical tuples in
        // identical order, and neither may have extras.
        while (it1.hasNext()) {
            Tuple t1 = it1.next();
            Tuple t2 = it2.next();
            Assert.assertEquals(t1, t2);
        }
        Assert.assertFalse(it2.hasNext());
    } finally {
        // Remove local and cluster artifacts regardless of outcome.
        in.delete();
        Util.deleteFile(cluster, unCompressedInputFileName);
        Util.deleteFile(cluster, clusterCompressedFilePath);
    }
}
public void testShouldWorkWithWKT() throws Exception {
    // Input rows: (geom_id, point_id, point_pos, WKT point). Points are
    // ordered by point_pos within each geometry before segment building.
    ArrayList<String[]> data = new ArrayList<String[]>();
    data.add(new String[] {"1", "1", "0", "POINT (0.0 0.0)"});
    data.add(new String[] {"1", "2", "1", "POINT (0.0 3.0)"});
    data.add(new String[] {"1", "3", "2", "POINT (4.0 5.0)"});
    data.add(new String[] {"1", "4", "3", "POINT (10.0 0.0)"});
    data.add(new String[] {"2", "5", "0", "POINT (5.0 6.0)"});
    data.add(new String[] {"2", "6", "1", "POINT (10.0 3.0)"});
    data.add(new String[] {"2", "7", "2", "POINT (7.0 13.0)"});
    data.add(new String[] {"3", "1", "0", "POINT (0.0 0.0)"});
    data.add(new String[] {"3", "8", "1", "POINT (10.0 10.0)"});
    data.add(new String[] {"3", "9", "2", "POINT (18.0 5.0)"});
    data.add(new String[] {"3", "1", "3", "POINT (0.0 0.0)"});
    String datafile = TestHelper.createTempFile(data, "\t");
    // Escape backslashes so Windows-style paths survive Pig Latin parsing.
    datafile = datafile.replace("\\", "\\\\");

    PigServer pig = new PigServer(LOCAL);
    String query = "A = LOAD 'file:" + datafile
        + "' as (geom_id: int, point_id: int, point_pos: int, point);\n"
        + "B = ORDER A BY point_pos;"
        + "C = GROUP B BY geom_id;"
        + "D = FOREACH C GENERATE group, FLATTEN("
        + MakeSegments.class.getName()
        + "(B.point_id, B.point));";
    pig.registerQuery(query);
    Iterator<?> it = pig.openIterator("D");

    // Expected rows: (geom_id, segment index, start point_id, start x, start y,
    //                 end point_id, end x, end y) — one segment per consecutive
    // point pair within a geometry.
    ArrayList<String[]> expectedResults = new ArrayList<String[]>();
    expectedResults.add(new String[] {"1", "0", "1", "0.0", "0.0", "2", "0.0", "3.0"});
    expectedResults.add(new String[] {"1", "1", "2", "0.0", "3.0", "3", "4.0", "5.0"});
    expectedResults.add(new String[] {"1", "2", "3", "4.0", "5.0", "4", "10.0", "0.0"});
    expectedResults.add(new String[] {"2", "0", "5", "5.0", "6.0", "6", "10.0", "3.0"});
    expectedResults.add(new String[] {"2", "1", "6", "10.0", "3.0", "7", "7.0", "13.0"});
    expectedResults.add(new String[] {"3", "0", "1", "0.0", "0.0", "8", "10.0", "10.0"});
    expectedResults.add(new String[] {"3", "1", "8", "10.0", "10.0", "9", "18.0", "5.0"});
    expectedResults.add(new String[] {"3", "2", "9", "18.0", "5.0", "1", "0.0", "0.0"});

    Iterator<String[]> expectedResultIter = expectedResults.iterator();
    int count = 0;
    while (it.hasNext() && expectedResultIter.hasNext()) {
        Tuple tuple = (Tuple) it.next();
        String[] expectedResult = expectedResultIter.next();
        if (tuple == null) break;
        // Field types alternate: int ids/indices, long point_ids, double coords.
        assertEquals(Integer.parseInt(expectedResult[0]), tuple.get(0));
        assertEquals(Integer.parseInt(expectedResult[1]), tuple.get(1));
        assertEquals(Long.parseLong(expectedResult[2]), tuple.get(2));
        assertEquals(Double.parseDouble(expectedResult[3]), tuple.get(3));
        assertEquals(Double.parseDouble(expectedResult[4]), tuple.get(4));
        assertEquals(Long.parseLong(expectedResult[5]), tuple.get(5));
        assertEquals(Double.parseDouble(expectedResult[6]), tuple.get(6));
        assertEquals(Double.parseDouble(expectedResult[7]), tuple.get(7));
        count++;
    }
    // Every expected segment must have been produced.
    assertEquals(expectedResults.size(), count);
}
@Test
public void testErrorMessageUndefinedAliasInGroupByStatement() throws Exception {
    // Referencing the not-yet-defined alias B inside its own GROUP statement
    // must surface a FrontendException mentioning "Undefined alias:".
    String loadQuery = "A = load 'nosuchfile' using PigStorage() as (f1:chararray,f2:chararray);";
    String groupQuery = "B = GROUP B by f1;";
    PigServer server = new PigServer(ExecType.LOCAL);
    server.registerQuery(loadQuery);
    try {
        server.registerQuery(groupQuery);
    } catch (FrontendException e) {
        Assert.assertTrue(e.getMessage().contains("Undefined alias:"));
        return;
    }
    Assert.fail();
}
@Test
public void readWriteNullValuesRead() throws IOException {
    registerLoadQuery();
    tempFilename = tempFilename + "-2";
    // Store the pairs using a NullWritable value converter, then reload them
    // the same way and verify the index columns.
    String storeQuery =
        String.format(
            "STORE A INTO 'file:%s' USING %s('-c %s', '-c %s');",
            tempFilename,
            SequenceFileStorage.class.getName(),
            IntWritableConverter.class.getName(),
            NullWritableConverter.class.getName());
    pigServer.registerQuery(storeQuery);
    registerLoadQuery(IntWritableConverter.class, NullWritableConverter.class, null);
    validateIndex(pigServer.openIterator("A"), 2, 0, 0);
}
@Test
public void testBzipStoreInMultiQuery2() throws Exception {
    // One small input; the interesting part is that a single batch stores the
    // same relation into both a .bz2 target and a plain target while
    // compression is globally enabled.
    String[] inputData = new String[] {"1\t2\r3\t4"};
    String inputFileName = "input2.txt";
    Util.createInputFile(cluster, inputFileName, inputData);

    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    PigContext context = pig.getPigContext();
    context.getProperties().setProperty("output.compression.enabled", "true");
    context
        .getProperties()
        .setProperty("output.compression.codec", "org.apache.hadoop.io.compress.BZip2Codec");

    pig.setBatchOn();
    pig.registerQuery("a = load '" + inputFileName + "';");
    pig.registerQuery("store a into 'output2.bz2';");
    pig.registerQuery("store a into 'output2';");
    pig.executeBatch();

    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    // Both outputs must exist as non-empty bzip part files.
    FileStatus status = fs.getFileStatus(new Path("output2/part-m-00000.bz2"));
    assertTrue(status.getLen() > 0);
    status = fs.getFileStatus(new Path("output2.bz2/part-m-00000.bz2"));
    assertTrue(status.getLen() > 0);
}
@Test(expected = IOException.class) public void writeUnsupportedConversion() throws IOException { registerLoadQuery(); // swap ordering of key and value pigServer.registerQuery("A = FOREACH A GENERATE TOTUPLE(key), value;"); // the following should die because IntWritableConverter doesn't support conversion of Tuple to // IntWritable pigServer.registerQuery( String.format( "STORE A INTO 'file:%s-2' USING %s('-c %s', '-c %s');", tempFilename, SequenceFileStorage.class.getName(), IntWritableConverter.class.getName(), TextConverter.class.getName())); }
@Test(expected = Exception.class)
public void readByteArraysWriteByteArraysWithoutTypeRead() throws IOException {
    // Round-trip raw byte arrays, then try to read them back with the default
    // (typed) load; that final read is expected to fail.
    registerLoadQuery(
        GenericWritableConverter.class, TextConverter.class, "key:bytearray, value:bytearray");
    tempFilename = tempFilename + "-2";
    String storeQuery =
        String.format(
            "STORE A INTO 'file:%s' USING %s('-c %s', '-c %s');",
            tempFilename,
            SequenceFileStorage.class.getName(),
            GenericWritableConverter.class.getName(),
            TextConverter.class.getName());
    pigServer.registerQuery(storeQuery);
    registerLoadQuery();
    validate(pigServer.openIterator("A"));
}
@Test public void writeTextConversion() throws IOException { registerLoadQuery(); tempFilename = tempFilename + "-2"; // rely on TextConverter for conversion of int to Text pigServer.registerQuery( String.format( "STORE A INTO 'file:%s' USING %s('-c %s', '-c %s');", tempFilename, SequenceFileStorage.class.getName(), TextConverter.class.getName(), TextConverter.class.getName())); registerLoadQuery(TextConverter.class, TextConverter.class, "key:chararray, value:chararray"); validate(pigServer.openIterator("A")); }
private void testCount( String inputFileName, Long expectedCount, int splitSize, String loadFuncSpec) throws IOException { String outputFile = "/tmp/bz-output"; // simple load-store script to verify that the bzip input is getting // split String scriptToTestSplitting = "a = load '" + inputFileName + "' using " + loadFuncSpec + "; store a into '" + outputFile + "';"; String script = "a = load '" + inputFileName + "';" + "b = group a all;" + "c = foreach b generate COUNT_STAR(a);"; Properties props = new Properties(); for (Entry<Object, Object> entry : cluster.getProperties().entrySet()) { props.put(entry.getKey(), entry.getValue()); } props.setProperty("mapred.max.split.size", Integer.toString(splitSize)); PigContext pigContext = new PigContext(ExecType.MAPREDUCE, props); PigServer pig = new PigServer(pigContext); FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(props)); fs.delete(new Path(outputFile), true); Util.registerMultiLineQuery(pig, scriptToTestSplitting); // verify that > 1 maps were launched due to splitting of the bzip input FileStatus[] files = fs.listStatus(new Path(outputFile)); int numPartFiles = 0; for (FileStatus fileStatus : files) { if (fileStatus.getPath().getName().startsWith("part")) { numPartFiles++; } } assertEquals(true, numPartFiles > 0); // verify record count to verify we read bzip data correctly Util.registerMultiLineQuery(pig, script); Iterator<Tuple> it = pig.openIterator("c"); Long result = (Long) it.next().get(0); assertEquals(expectedCount, result); }
// See PIG-1434 @Test public void testScalarAliasesBatchNobatch() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20"}; String output = BUILD_TEST_TMP + "table_testScalarAliasesDir"; TestScalarAliases.deleteDirectory(new File(output)); // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesBatch"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.setBatchOn(); pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;"); pigServer.registerQuery("Y = foreach A generate (a0 * C.count), (a1 / C.max);"); pigServer.registerQuery("Store Y into '" + output + "';"); pigServer.executeBatch(); // Check output pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: int, a1: double);"); Iterator<Tuple> iter; Tuple t; iter = pigServer.openIterator("Z"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); iter = pigServer.openIterator("Y"); t = iter.next(); assertTrue(t.toString().equals("(3,0.25)")); t = iter.next(); assertTrue(t.toString().equals("(6,0.5)")); t = iter.next(); assertTrue(t.toString().equals("(9,1.0)")); assertFalse(iter.hasNext()); }
public void testSkewedJoinReducers() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    try {
        // A skewed join constrained to a single reducer must still execute.
        DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
        pigServer.registerQuery("C = join A by id, B by id using \"skewed\" parallel 1;");
        Iterator<Tuple> iter = pigServer.openIterator("C");
        while (iter.hasNext()) {
            dbfrj.add(iter.next());
        }
    } catch (Exception e) {
        // Include the underlying cause so a failure here is diagnosable.
        fail("Should not throw exception, should continue execution: " + e);
    }
}
@Test(expected = Exception.class)
public void readWithMissingWritableConverterArguments() throws IOException {
    // FixedArgsConstructorIntWritableConverter needs constructor arguments
    // that are not supplied here, so the load must fail.
    String schema = "key: int, value: chararray";
    registerLoadQuery(FixedArgsConstructorIntWritableConverter.class, TextConverter.class, schema);
    validate(pigServer.openIterator("A"));
}
/** Skewed join on grouped relations must match the regular join row-for-row. */
public void testSkewedJoinWithGroup() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = GROUP A by id;");
    pigServer.registerQuery("D = GROUP B by id;");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
    DataBag dbshj = BagFactory.getInstance().newDefaultBag();
    {
        pigServer.registerQuery("E = join C by group, D by group using \"skewed\" parallel 5;");
        Iterator<Tuple> iter = pigServer.openIterator("E");
        while (iter.hasNext()) {
            dbfrj.add(iter.next());
        }
    }
    {
        pigServer.registerQuery("E = join C by group, D by group;");
        Iterator<Tuple> iter = pigServer.openIterator("E");
        while (iter.hasNext()) {
            dbshj.add(iter.next());
        }
    }
    Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
    // assertTrue instead of assertEquals(true, ...) — same check, clearer intent.
    Assert.assertTrue(TestHelper.compareBags(dbfrj, dbshj));
}
// See PIG-1434 @Test public void testScalarAliasesJoinClause() throws Exception { String[] inputA = {"1\t5", "2\t10", "3\t20"}; String[] inputB = {"Total3\tthree", "Total2\ttwo", "Total1\tone"}; // Test the use of scalars in expressions String inputPathA = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseA"; TestScalarAliases.createLocalInputFile(inputPathA, inputA); String inputPathB = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseB"; TestScalarAliases.createLocalInputFile(inputPathB, inputB); // Test in script mode pigServer.registerQuery("A = LOAD '" + inputPathA + "' as (a0, a1);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate COUNT(A) as count;"); pigServer.registerQuery("B = LOAD '" + inputPathB + "' as (b0:chararray, b1:chararray);"); pigServer.registerQuery("Y = join A by CONCAT('Total', (chararray)C.count), B by $0;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); String[] expected = new String[] {"(1,5,Total3,three)", "(2,10,Total3,three)", "(3,20,Total3,three)"}; Util.checkQueryOutputsAfterSortRecursive( iter, expected, org.apache.pig.newplan.logical.Util.translateSchema(pigServer.dumpSchema("Y"))); }
@Test public void testSplitWithNotEvalCondition() throws Exception { String defineQ = "define minelogs org.apache.pig.test.RegexGroupCount('www\\\\.xyz\\\\.com/sports');"; String defineL = "a = load 'nosuchfile' " + " using PigStorage() as (source : chararray);"; String defineSplit = "SPLIT a INTO a1 IF (minelogs(source) > 0 ), a2 IF (NOT (minelogs(source)>0));"; // (NOT // ( // minelogs(source) ) > 0) ;"; PigServer ps = new PigServer(ExecType.LOCAL); ps.registerQuery(defineQ); ps.registerQuery(defineL); try { ps.registerQuery(defineSplit); } catch (FrontendException e) { Assert.fail(e.getMessage()); } }
/** Tests the end-to-end writing and reading of a BZip file. */ @Test public void testBzipInPig() throws Exception { PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); File in = File.createTempFile("junit", ".bz2"); in.deleteOnExit(); File out = File.createTempFile("junit", ".bz2"); out.delete(); String clusterOutput = Util.removeColon(out.getAbsolutePath()); CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in)); for (int i = 1; i < 100; i++) { StringBuffer sb = new StringBuffer(); sb.append(i).append("\n").append(-i).append("\n"); byte bytes[] = sb.toString().getBytes(); cos.write(bytes); } cos.close(); pig.registerQuery( "AA = load '" + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext()) + "';"); pig.registerQuery("A = foreach (group (filter AA by $0 > 0) all) generate flatten($1);"); pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutput) + "';"); FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties())); FSDataInputStream is = fs.open(new Path(clusterOutput + "/part-r-00000.bz2")); CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length()); // Just a sanity check, to make sure it was a bzip file; we // will do the value verification later assertEquals(100, cis.read(new byte[100])); cis.close(); pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutput) + "';"); Iterator<Tuple> i = pig.openIterator("B"); HashMap<Integer, Integer> map = new HashMap<Integer, Integer>(); while (i.hasNext()) { Integer val = DataType.toInteger(i.next().get(0)); map.put(val, val); } assertEquals(new Integer(99), new Integer(map.keySet().size())); for (int j = 1; j < 100; j++) { assertEquals(new Integer(j), map.get(j)); } in.delete(); Util.deleteFile(cluster, clusterOutput); }
@Test public void readWriteUnexpectedNullValuesRead() throws IOException { registerLoadQuery(); tempFilename = tempFilename + "-2"; // swap last value with null; this pair should not be stored pigServer.registerQuery( String.format("A = FOREACH A GENERATE key, (key == 2 ? null : value) AS value;")); pigServer.registerQuery( String.format( "STORE A INTO 'file:%s' USING %s('-c %s', '-c %s');", tempFilename, SequenceFileStorage.class.getName(), IntWritableConverter.class.getName(), TextConverter.class.getName())); registerLoadQuery(); // validation against expected pairs will succeed, with expected number of pairs one less than // usual (the last pair wasn't stored due to null value) validate(pigServer.openIterator("A"), DATA.length - 1); }
@Test
public void readWithoutSchemaTestSchema() throws IOException {
    // With no user-supplied schema, the converters' default field names and
    // types must be reported.
    registerLoadQuery(IntWritableConverter.class, TextConverter.class, null);
    Schema schema = pigServer.dumpSchema("A");
    Assert.assertNotNull(schema);
    String[] aliases = {"key", "value"};
    byte[] types = {DataType.INTEGER, DataType.CHARARRAY};
    for (int i = 0; i < aliases.length; i++) {
        Assert.assertEquals(aliases[i], schema.getField(i).alias);
        Assert.assertEquals(types[i], schema.getField(i).type);
    }
}
/** Verifies a UDF sees the serialized input schema; the mini cluster is always shut down. */
public void testSchemaSerialization() throws IOException {
    MiniCluster cluster = MiniCluster.buildCluster();
    try {
        PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        String inputFileName = "testSchemaSerialization-input.txt";
        String[] inputData = new String[] {"foo\t1", "hello\t2"};
        Util.createInputFile(cluster, inputFileName, inputData);
        String script =
            "a = load '"
                + inputFileName
                + "' as (f1:chararray, f2:int);"
                + " b = group a all; c = foreach b generate org.apache.pig.test.InputSchemaUDF(a);";
        Util.registerMultiLineQuery(pigServer, script);
        Iterator<Tuple> it = pigServer.openIterator("c");
        while (it.hasNext()) {
            Tuple t = it.next();
            Assert.assertEquals("{a: {(f1: chararray,f2: int)}}", t.get(0));
        }
    } finally {
        // Shut the mini cluster down even when an assertion above fails.
        cluster.shutDown();
    }
}
public void testSkewedJoinKeyPartition() throws IOException { try { Util.deleteFile(cluster, "skewedjoin"); } catch (Exception e) { // it is ok if directory not exist } pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 7;"); pigServer.store("E", "skewedjoin"); int[][] lineCount = new int[3][7]; new File("skewedjoin").mkdir(); // check how many times a key appear in each part- file for (int i = 0; i < 7; i++) { Util.copyFromClusterToLocal( cluster, "skewedjoin/part-r-0000" + i, "skewedjoin/part-r-0000" + i); BufferedReader reader = new BufferedReader(new FileReader("skewedjoin/part-r-0000" + i)); String line = null; while ((line = reader.readLine()) != null) { String[] cols = line.split("\t"); int key = Integer.parseInt(cols[0]) / 100 - 1; lineCount[key][i]++; } } int fc = 0; for (int i = 0; i < 3; i++) { for (int j = 0; j < 7; j++) { if (lineCount[i][j] > 0) { fc++; } } } // atleast one key should be a skewed key // check atleast one key should appear in more than 1 part- file assertTrue(fc > 3); }
// See PIG-1434 @Test public void testScalarAliasesGrammarNegative() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesGrammar"; TestScalarAliases.createLocalInputFile(inputPath, input); try { pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A);"); // Only projections of C are supported pigServer.registerQuery("Y = foreach A generate C;"); pigServer.openIterator("Y"); // Control should not reach here fail("Scalar projections are only supported"); } catch (IOException pe) { assertTrue(pe.getMessage().contains("Invalid scalar projection: C")); } }
public void testSkewedJoin3Way() throws IOException {
    // A three-way skewed join is unsupported and must raise an error.
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);");
    try {
        pigServer.registerQuery("D = join A by id, B by id, C by id using \"skewed\" parallel 5;");
        Iterator<Tuple> rows = pigServer.openIterator("D");
        DataBag results = BagFactory.getInstance().newDefaultBag();
        while (rows.hasNext()) {
            results.add(rows.next());
        }
    } catch (Exception e) {
        // Expected path: the join is rejected.
        return;
    }
    fail("Should throw exception, do not support 3 way join");
}
/** Skewed join must tolerate null join keys without throwing. */
public void testSkewedJoinNullKeys() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    try {
        DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
        pigServer.registerQuery("C = join A by id, B by id using \"skewed\";");
        Iterator<Tuple> iter = pigServer.openIterator("C");
        while (iter.hasNext()) {
            dbfrj.add(iter.next());
        }
    } catch (Exception e) {
        // Fail with the cause in the message instead of dumping the stack
        // trace to stdout (printStackTrace) and losing it in the test report.
        fail("Should support null keys in skewed join: " + e);
    }
}
/** With a tiny per-reducer tuple cap and parallel 300, the skewed join must match a plain join. */
public void testSkewedJoinManyReducers() throws IOException {
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "2");
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE6 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE7 + "' as (id,name);");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
    DataBag dbrj = BagFactory.getInstance().newDefaultBag();
    {
        pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 300;");
        Iterator<Tuple> iter = pigServer.openIterator("E");
        while (iter.hasNext()) {
            dbfrj.add(iter.next());
        }
    }
    {
        pigServer.registerQuery("E = join A by id, B by id;");
        Iterator<Tuple> iter = pigServer.openIterator("E");
        while (iter.hasNext()) {
            dbrj.add(iter.next());
        }
    }
    Assert.assertEquals(dbfrj.size(), dbrj.size());
    // assertTrue instead of assertEquals(true, ...) — same check, clearer intent.
    Assert.assertTrue(TestHelper.compareBags(dbfrj, dbrj));
}
// See PIG-1636 @Test public void testScalarAliasesLimit() throws Exception { String[] input = {"a\t1", "b\t2", "c\t3", "a\t4", "c\t5"}; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesLimit"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0:chararray, a1: int);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate SUM(A.$1) as total;"); pigServer.registerQuery("C1 = limit C 1;"); pigServer.registerQuery("Y = foreach A generate a0, a1 * (double)C1.total;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); // Average is 11 Tuple t = iter.next(); assertTrue(t.toString().equals("(a,15.0)")); t = iter.next(); assertTrue(t.toString().equals("(b,30.0)")); t = iter.next(); assertTrue(t.toString().equals("(c,45.0)")); t = iter.next(); assertTrue(t.toString().equals("(a,60.0)")); t = iter.next(); assertTrue(t.toString().equals("(c,75.0)")); assertFalse(iter.hasNext()); }
/** Skewed join on a compound key, with default properties, must match a plain join. */
public void testSkewedJoinWithNoProperties() throws IOException {
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    try {
        DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
        DataBag dbshj = BagFactory.getInstance().newDefaultBag();
        {
            pigServer.registerQuery(
                "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;");
            Iterator<Tuple> iter = pigServer.openIterator("C");
            while (iter.hasNext()) {
                dbfrj.add(iter.next());
            }
        }
        {
            pigServer.registerQuery("E = join A by(id, name), B by (id, name);");
            Iterator<Tuple> iter = pigServer.openIterator("E");
            while (iter.hasNext()) {
                dbshj.add(iter.next());
            }
        }
        Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
        // assertTrue instead of assertEquals(true, ...) — same check, clearer intent.
        Assert.assertTrue(TestHelper.compareBags(dbfrj, dbshj));
    } catch (Exception e) {
        // Keep context about what failed instead of a bare e.getMessage().
        fail("Skewed join with no properties should succeed: " + e);
    }
}
// pig 1048 public void testSkewedJoinOneValue() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE3 + "' as (id,name);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE3 + "' as (id,name);"); // Filter key with a single value pigServer.registerQuery("C = FILTER A by id == 400;"); pigServer.registerQuery("D = FILTER B by id == 400;"); DataBag dbfrj = BagFactory.getInstance().newDefaultBag(), dbrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("E = join C by id, D by id using \"skewed\";"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join C by id, D by id;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbrj.add(iter.next()); } } Assert.assertEquals(dbfrj.size(), dbrj.size()); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj)); }