/**
 * Verifies that a multi-query batch honors the bzip2 output-compression properties:
 * the same alias is stored twice and both outputs must contain a non-empty .bz2 part file.
 */
@Test
public void testBzipStoreInMultiQuery2() throws Exception {
    String inputFileName = "input2.txt";
    Util.createInputFile(cluster, inputFileName, new String[] {"1\t2\r3\t4"});

    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    PigContext pigContext = pig.getPigContext();
    pigContext.getProperties().setProperty("output.compression.enabled", "true");
    pigContext
        .getProperties()
        .setProperty("output.compression.codec", "org.apache.hadoop.io.compress.BZip2Codec");

    pig.setBatchOn();
    pig.registerQuery("a = load '" + inputFileName + "';");
    pig.registerQuery("store a into 'output2.bz2';");
    pig.registerQuery("store a into 'output2';");
    pig.executeBatch();

    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    // Both store targets must have produced a compressed, non-empty part file.
    for (String partFile :
        new String[] {"output2/part-m-00000.bz2", "output2.bz2/part-m-00000.bz2"}) {
        FileStatus stat = fs.getFileStatus(new Path(partFile));
        assertTrue(stat.getLen() > 0);
    }
}
// See PIG-1636 @Test public void testScalarAliasesLimit() throws Exception { String[] input = {"a\t1", "b\t2", "c\t3", "a\t4", "c\t5"}; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesLimit"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0:chararray, a1: int);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate SUM(A.$1) as total;"); pigServer.registerQuery("C1 = limit C 1;"); pigServer.registerQuery("Y = foreach A generate a0, a1 * (double)C1.total;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); // Average is 11 Tuple t = iter.next(); assertTrue(t.toString().equals("(a,15.0)")); t = iter.next(); assertTrue(t.toString().equals("(b,30.0)")); t = iter.next(); assertTrue(t.toString().equals("(c,45.0)")); t = iter.next(); assertTrue(t.toString().equals("(a,60.0)")); t = iter.next(); assertTrue(t.toString().equals("(c,75.0)")); assertFalse(iter.hasNext()); }
/**
 * Skewed join where both sides are GROUP results; the bag produced by the skewed join
 * must equal the bag produced by a regular join of the same relations.
 */
public void testSkewedJoinWithGroup() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = GROUP A by id;");
    pigServer.registerQuery("D = GROUP B by id;");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
    DataBag dbshj = BagFactory.getInstance().newDefaultBag();
    {
        pigServer.registerQuery("E = join C by group, D by group using \"skewed\" parallel 5;");
        Iterator<Tuple> iter = pigServer.openIterator("E");
        while (iter.hasNext()) {
            dbfrj.add(iter.next());
        }
    }
    {
        pigServer.registerQuery("E = join C by group, D by group;");
        Iterator<Tuple> iter = pigServer.openIterator("E");
        while (iter.hasNext()) {
            dbshj.add(iter.next());
        }
    }
    Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
    // assertTrue is the idiomatic form of assertEquals(true, ...)
    Assert.assertTrue(TestHelper.compareBags(dbfrj, dbshj));
}
/**
 * Skewed join with a tiny per-reducer tuple cap and parallel 300, forcing each skewed key
 * across many reducers; the result must still match a regular join.
 */
public void testSkewedJoinManyReducers() throws IOException {
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "2");
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE6 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE7 + "' as (id,name);");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
    DataBag dbrj = BagFactory.getInstance().newDefaultBag();
    {
        pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 300;");
        Iterator<Tuple> iter = pigServer.openIterator("E");
        while (iter.hasNext()) {
            dbfrj.add(iter.next());
        }
    }
    {
        pigServer.registerQuery("E = join A by id, B by id;");
        Iterator<Tuple> iter = pigServer.openIterator("E");
        while (iter.hasNext()) {
            dbrj.add(iter.next());
        }
    }
    Assert.assertEquals(dbfrj.size(), dbrj.size());
    // assertTrue is the idiomatic form of assertEquals(true, ...)
    Assert.assertTrue(TestHelper.compareBags(dbfrj, dbrj));
}
// See PIG-1434 @Test public void testScalarAliasesJoinClause() throws Exception { String[] inputA = {"1\t5", "2\t10", "3\t20"}; String[] inputB = {"Total3\tthree", "Total2\ttwo", "Total1\tone"}; // Test the use of scalars in expressions String inputPathA = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseA"; TestScalarAliases.createLocalInputFile(inputPathA, inputA); String inputPathB = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseB"; TestScalarAliases.createLocalInputFile(inputPathB, inputB); // Test in script mode pigServer.registerQuery("A = LOAD '" + inputPathA + "' as (a0, a1);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate COUNT(A) as count;"); pigServer.registerQuery("B = LOAD '" + inputPathB + "' as (b0:chararray, b1:chararray);"); pigServer.registerQuery("Y = join A by CONCAT('Total', (chararray)C.count), B by $0;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); String[] expected = new String[] {"(1,5,Total3,three)", "(2,10,Total3,three)", "(3,20,Total3,three)"}; Util.checkQueryOutputsAfterSortRecursive( iter, expected, org.apache.pig.newplan.logical.Util.translateSchema(pigServer.dumpSchema("Y"))); }
// pig 1048 public void testSkewedJoinOneValue() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE3 + "' as (id,name);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE3 + "' as (id,name);"); // Filter key with a single value pigServer.registerQuery("C = FILTER A by id == 400;"); pigServer.registerQuery("D = FILTER B by id == 400;"); DataBag dbfrj = BagFactory.getInstance().newDefaultBag(), dbrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("E = join C by id, D by id using \"skewed\";"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join C by id, D by id;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbrj.add(iter.next()); } } Assert.assertEquals(dbfrj.size(), dbrj.size()); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj)); }
// See PIG-1434 @Test public void testScalarWithNoSchemaDollarProj() throws Exception { String[] scalarInput = {"1\t5"}; String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProj"; TestScalarAliases.createLocalInputFile(inputPath, input); String inputPathScalar = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProjScalar"; TestScalarAliases.createLocalInputFile(inputPathScalar, scalarInput); // Load A as a scalar pigServer.registerQuery("A = LOAD '" + inputPath + "';"); pigServer.registerQuery("scalar = LOAD '" + inputPathScalar + "';"); pigServer.registerQuery("B = foreach A generate 5 / scalar.$1;"); Iterator<Tuple> iter = pigServer.openIterator("B"); Tuple t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); t = iter.next(); assertTrue(t.get(0).toString().equals("1")); assertFalse(iter.hasNext()); }
/**
 * Skewed join on a compound key with default (unset) skewed-join properties;
 * the result must match a regular join on the same key.
 */
public void testSkewedJoinWithNoProperties() throws IOException {
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    try {
        DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
        DataBag dbshj = BagFactory.getInstance().newDefaultBag();
        {
            pigServer.registerQuery(
                "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;");
            Iterator<Tuple> iter = pigServer.openIterator("C");
            while (iter.hasNext()) {
                dbfrj.add(iter.next());
            }
        }
        {
            pigServer.registerQuery("E = join A by(id, name), B by (id, name);");
            Iterator<Tuple> iter = pigServer.openIterator("E");
            while (iter.hasNext()) {
                dbshj.add(iter.next());
            }
        }
        Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
        // assertTrue is the idiomatic form of assertEquals(true, ...)
        Assert.assertTrue(TestHelper.compareBags(dbfrj, dbshj));
    } catch (Exception e) {
        // Preserve the full stack trace; fail(e.getMessage()) discarded the cause.
        throw new AssertionError("skewed join with no properties failed", e);
    }
}
/** Skewed join should pick up input files recursively under a directory (foo/bar/test.dat). */
public void testRecursiveFileListing() throws IOException {
    String LOCAL_INPUT_FILE = "test.dat";
    String INPUT_FILE = "foo/bar/test.dat";
    // try-with-resources guarantees the writer is closed even if a println fails
    try (PrintWriter w = new PrintWriter(new FileWriter(LOCAL_INPUT_FILE))) {
        w.println("1");
        w.println("2");
        w.println("3");
        w.println("5");
    }
    try {
        Util.copyFromLocalToCluster(cluster, LOCAL_INPUT_FILE, INPUT_FILE);
        pigServer.registerQuery("a = load 'foo' as (nums:chararray);");
        pigServer.registerQuery("b = load 'foo' as (nums:chararray);");
        pigServer.registerQuery("d = join a by nums, b by nums USING 'skewed';");

        Iterator<Tuple> iter = pigServer.openIterator("d");
        int count = 0;
        while (iter.hasNext()) {
            iter.next();
            count++;
        }
        Assert.assertEquals(4, count);
    } finally {
        // clean up local and cluster files even when the assertion fails
        new File(LOCAL_INPUT_FILE).delete();
        Util.deleteFile(cluster, INPUT_FILE);
    }
}
// See PIG-1434 @Test public void testScalarAliasesFilterClause() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20", "4\t12", "5\t8"}; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesFilterClause"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0, a1);"); pigServer.registerQuery("G = group A all;"); pigServer.registerQuery("C = foreach G generate AVG(A.$1) as average;"); pigServer.registerQuery("Y = filter A by a1 > C.average;"); Iterator<Tuple> iter = pigServer.openIterator("Y"); // Average is 11 Tuple t = iter.next(); assertTrue(t.toString().equals("(3,20)")); t = iter.next(); assertTrue(t.toString().equals("(4,12)")); assertFalse(iter.hasNext()); }
// See PIG-1434 @Test public void testFilteredScalarDollarProj() throws Exception { String output = BUILD_TEST_TMP + "table_testFilteredScalarDollarProjDir"; TestScalarAliases.deleteDirectory(new File(output)); String[] input = { "1\t5\t[state#maine,city#portland]\t{(a),(b)}\t(a,b)", "2\t10\t\t\t", "3\t20\t\t\t" }; // Test the use of scalars in expressions String inputPath = BUILD_TEST_TMP + "table_testFilteredScalarDollarProj"; TestScalarAliases.createLocalInputFile(inputPath, input); // Test in script mode pigServer.setBatchOn(); pigServer.registerQuery( "A = LOAD '" + inputPath + "'" + " as (a0: long, a1: double, a2 : bytearray, " + "a3: bag{ t : tuple(tc : chararray)}, " + "a4: tuple(c1 : chararray, c2 : chararray) );"); pigServer.registerQuery("B = filter A by $1 < 8;"); pigServer.registerQuery( "Y = foreach A generate (a0 * B.$0), (a1 / B.$1), B.$2, B.$2#'state', B.$3, B.a4;"); pigServer.registerQuery("Store Y into '" + output + "';"); pigServer.explain("Y", System.err); pigServer.executeBatch(); // Check output pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: int, a1: double);"); pigServer.explain("Z", System.err); Iterator<Tuple> iter = pigServer.openIterator("Z"); Tuple t = iter.next(); assertTrue(t.toString().equals("(1,1.0)")); t = iter.next(); assertTrue(t.toString().equals("(2,2.0)")); t = iter.next(); assertTrue(t.toString().equals("(3,4.0)")); assertFalse(iter.hasNext()); // Check in non-batch mode iter = pigServer.openIterator("Y"); t = iter.next(); assertEquals(t.toString(), "(1,1.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))"); t = iter.next(); assertEquals(t.toString(), "(2,2.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))"); t = iter.next(); assertEquals(t.toString(), "(3,4.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))"); assertFalse(iter.hasNext()); }
/** * Tests that '\n', '\r' and '\r\n' are treated as record delims when using bzip just like they * are when using uncompressed text */ @Test public void testRecordDelims() throws Exception { String[] inputData = new String[] { "1\t2\r3\t4", // '\r' case - this will be split into two tuples "5\t6\r", // '\r\n' case "7\t8", // '\n' case "9\t10\r" // '\r\n' at the end of file }; // bzip compressed input File in = File.createTempFile("junit", ".bz2"); String compressedInputFileName = in.getAbsolutePath(); in.deleteOnExit(); String clusterCompressedFilePath = Util.removeColon(compressedInputFileName); String unCompressedInputFileName = "testRecordDelims-uncomp.txt"; Util.createInputFile(cluster, unCompressedInputFileName, inputData); try { CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in)); for (int i = 0; i < inputData.length; i++) { StringBuffer sb = new StringBuffer(); sb.append(inputData[i]).append("\n"); byte bytes[] = sb.toString().getBytes(); cos.write(bytes); } cos.close(); Util.copyFromLocalToCluster(cluster, compressedInputFileName, clusterCompressedFilePath); // pig script to read uncompressed input String script = "a = load '" + unCompressedInputFileName + "';"; PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); pig.registerQuery(script); Iterator<Tuple> it1 = pig.openIterator("a"); // pig script to read compressed input script = "a = load '" + Util.encodeEscape(clusterCompressedFilePath) + "';"; pig.registerQuery(script); Iterator<Tuple> it2 = pig.openIterator("a"); while (it1.hasNext()) { Tuple t1 = it1.next(); Tuple t2 = it2.next(); Assert.assertEquals(t1, t2); } Assert.assertFalse(it2.hasNext()); } finally { in.delete(); Util.deleteFile(cluster, unCompressedInputFileName); Util.deleteFile(cluster, clusterCompressedFilePath); } }
/** Tests the end-to-end writing and reading of a BZip file. */ @Test public void testBzipInPig() throws Exception { PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); File in = File.createTempFile("junit", ".bz2"); in.deleteOnExit(); File out = File.createTempFile("junit", ".bz2"); out.delete(); String clusterOutput = Util.removeColon(out.getAbsolutePath()); CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in)); for (int i = 1; i < 100; i++) { StringBuffer sb = new StringBuffer(); sb.append(i).append("\n").append(-i).append("\n"); byte bytes[] = sb.toString().getBytes(); cos.write(bytes); } cos.close(); pig.registerQuery( "AA = load '" + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext()) + "';"); pig.registerQuery("A = foreach (group (filter AA by $0 > 0) all) generate flatten($1);"); pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutput) + "';"); FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties())); FSDataInputStream is = fs.open(new Path(clusterOutput + "/part-r-00000.bz2")); CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length()); // Just a sanity check, to make sure it was a bzip file; we // will do the value verification later assertEquals(100, cis.read(new byte[100])); cis.close(); pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutput) + "';"); Iterator<Tuple> i = pig.openIterator("B"); HashMap<Integer, Integer> map = new HashMap<Integer, Integer>(); while (i.hasNext()) { Integer val = DataType.toInteger(i.next().get(0)); map.put(val, val); } assertEquals(new Integer(99), new Integer(map.keySet().size())); for (int j = 1; j < 100; j++) { assertEquals(new Integer(j), map.get(j)); } in.delete(); Util.deleteFile(cluster, clusterOutput); }
/** A GROUP BY over an alias that was never defined must raise an "Undefined alias" error. */
@Test
public void testErrorMessageUndefinedAliasInGroupByStatement() throws Exception {
    String queryA = "A = load 'nosuchfile' using PigStorage() as (f1:chararray,f2:chararray);";
    // deliberately groups the undefined alias B
    String queryB = "B = GROUP B by f1;";
    PigServer ps = new PigServer(ExecType.LOCAL);
    ps.registerQuery(queryA);
    try {
        ps.registerQuery(queryB);
    } catch (FrontendException e) {
        Assert.assertTrue(e.getMessage().contains("Undefined alias:"));
        return;
    }
    // a bare Assert.fail() gave no hint about what went wrong
    Assert.fail("Expected FrontendException for undefined alias B");
}
@Test(expected = IOException.class) public void writeUnsupportedConversion() throws IOException { registerLoadQuery(); // swap ordering of key and value pigServer.registerQuery("A = FOREACH A GENERATE TOTUPLE(key), value;"); // the following should die because IntWritableConverter doesn't support conversion of Tuple to // IntWritable pigServer.registerQuery( String.format( "STORE A INTO 'file:%s-2' USING %s('-c %s', '-c %s');", tempFilename, SequenceFileStorage.class.getName(), IntWritableConverter.class.getName(), TextConverter.class.getName())); }
/** Skewed join with parallel 1 must complete without error even with skewed keys. */
public void testSkewedJoinReducers() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    try {
        DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
        pigServer.registerQuery("C = join A by id, B by id using \"skewed\" parallel 1;");
        Iterator<Tuple> iter = pigServer.openIterator("C");
        // draining the iterator is the point: the join must run to completion
        while (iter.hasNext()) {
            dbfrj.add(iter.next());
        }
    } catch (Exception e) {
        // preserve the cause; fail("...") would discard the stack trace
        throw new AssertionError("Should not throw exception, should continue execution", e);
    }
}
@Test public void testSplitWithNotEvalCondition() throws Exception { String defineQ = "define minelogs org.apache.pig.test.RegexGroupCount('www\\\\.xyz\\\\.com/sports');"; String defineL = "a = load 'nosuchfile' " + " using PigStorage() as (source : chararray);"; String defineSplit = "SPLIT a INTO a1 IF (minelogs(source) > 0 ), a2 IF (NOT (minelogs(source)>0));"; // (NOT // ( // minelogs(source) ) > 0) ;"; PigServer ps = new PigServer(ExecType.LOCAL); ps.registerQuery(defineQ); ps.registerQuery(defineL); try { ps.registerQuery(defineSplit); } catch (FrontendException e) { Assert.fail(e.getMessage()); } }
/** A scalar taken from an empty relation is null, so b + B.b must be null for every row. */
@Test
public void testScalarNullValue() throws Exception {
    Storage.Data data = Storage.resetData(pigServer);
    data.set("input", Storage.tuple("a", 1), Storage.tuple("b", 2));

    pigServer.setBatchOn();
    pigServer.registerQuery("A = load 'input' using mock.Storage() as (a:chararray, b:int);");
    // B matches nothing, so the scalar B.b is null
    pigServer.registerQuery("B = FILTER A by a == 'c';");
    pigServer.registerQuery("C = FOREACH A generate a, b + B.b;");
    pigServer.registerQuery("store C into 'output' using mock.Storage();");
    pigServer.executeBatch();

    List<Tuple> expectedResults =
        Util.getTuplesFromConstantTupleStrings(new String[] {"('a', null)", "('b', null)"});
    Util.checkQueryOutputsAfterSort(data.get("output").iterator(), expectedResults);
}
@Test public void readWriteUnexpectedNullValuesRead() throws IOException { registerLoadQuery(); tempFilename = tempFilename + "-2"; // swap last value with null; this pair should not be stored pigServer.registerQuery( String.format("A = FOREACH A GENERATE key, (key == 2 ? null : value) AS value;")); pigServer.registerQuery( String.format( "STORE A INTO 'file:%s' USING %s('-c %s', '-c %s');", tempFilename, SequenceFileStorage.class.getName(), IntWritableConverter.class.getName(), TextConverter.class.getName())); registerLoadQuery(); // validation against expected pairs will succeed, with expected number of pairs one less than // usual (the last pair wasn't stored due to null value) validate(pigServer.openIterator("A"), DATA.length - 1); }
/** MakeSegments over WKT point input: for each geometry the points are ordered and paired into segments. */
public void testShouldWorkWithWKT() throws Exception {
    ArrayList<String[]> data = new ArrayList<String[]>();
    data.add(new String[] {"1", "1", "0", "POINT (0.0 0.0)"});
    data.add(new String[] {"1", "2", "1", "POINT (0.0 3.0)"});
    data.add(new String[] {"1", "3", "2", "POINT (4.0 5.0)"});
    data.add(new String[] {"1", "4", "3", "POINT (10.0 0.0)"});
    data.add(new String[] {"2", "5", "0", "POINT (5.0 6.0)"});
    data.add(new String[] {"2", "6", "1", "POINT (10.0 3.0)"});
    data.add(new String[] {"2", "7", "2", "POINT (7.0 13.0)"});
    data.add(new String[] {"3", "1", "0", "POINT (0.0 0.0)"});
    data.add(new String[] {"3", "8", "1", "POINT (10.0 10.0)"});
    data.add(new String[] {"3", "9", "2", "POINT (18.0 5.0)"});
    data.add(new String[] {"3", "1", "3", "POINT (0.0 0.0)"});
    String datafile = TestHelper.createTempFile(data, "\t");
    // escape backslashes so Windows-style paths survive the Pig string literal
    datafile = datafile.replace("\\", "\\\\");

    PigServer pig = new PigServer(LOCAL);
    String query =
        "A = LOAD 'file:"
            + datafile
            + "' as (geom_id: int, point_id: int, point_pos: int, point);\n"
            + "B = ORDER A BY point_pos;"
            + "C = GROUP B BY geom_id;"
            + "D = FOREACH C GENERATE group, FLATTEN("
            + MakeSegments.class.getName()
            + "(B.point_id, B.point));";
    pig.registerQuery(query);
    Iterator<?> it = pig.openIterator("D");

    ArrayList<String[]> expectedResults = new ArrayList<String[]>();
    expectedResults.add(new String[] {"1", "0", "1", "0.0", "0.0", "2", "0.0", "3.0"});
    expectedResults.add(new String[] {"1", "1", "2", "0.0", "3.0", "3", "4.0", "5.0"});
    expectedResults.add(new String[] {"1", "2", "3", "4.0", "5.0", "4", "10.0", "0.0"});
    expectedResults.add(new String[] {"2", "0", "5", "5.0", "6.0", "6", "10.0", "3.0"});
    expectedResults.add(new String[] {"2", "1", "6", "10.0", "3.0", "7", "7.0", "13.0"});
    expectedResults.add(new String[] {"3", "0", "1", "0.0", "0.0", "8", "10.0", "10.0"});
    expectedResults.add(new String[] {"3", "1", "8", "10.0", "10.0", "9", "18.0", "5.0"});
    expectedResults.add(new String[] {"3", "2", "9", "18.0", "5.0", "1", "0.0", "0.0"});

    Iterator<String[]> expectedIter = expectedResults.iterator();
    int matched = 0;
    while (it.hasNext() && expectedIter.hasNext()) {
        Tuple actual = (Tuple) it.next();
        String[] row = expectedIter.next();
        if (actual == null) break;
        // compare each expected column against the tuple, with the parsed numeric type
        assertEquals(Integer.parseInt(row[0]), actual.get(0));
        assertEquals(Integer.parseInt(row[1]), actual.get(1));
        assertEquals(Long.parseLong(row[2]), actual.get(2));
        assertEquals(Double.parseDouble(row[3]), actual.get(3));
        assertEquals(Double.parseDouble(row[4]), actual.get(4));
        assertEquals(Long.parseLong(row[5]), actual.get(5));
        assertEquals(Double.parseDouble(row[6]), actual.get(6));
        assertEquals(Double.parseDouble(row[7]), actual.get(7));
        matched++;
    }
    assertEquals(expectedResults.size(), matched);
}
public void testSkewedJoinKeyPartition() throws IOException { try { Util.deleteFile(cluster, "skewedjoin"); } catch (Exception e) { // it is ok if directory not exist } pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);"); pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 7;"); pigServer.store("E", "skewedjoin"); int[][] lineCount = new int[3][7]; new File("skewedjoin").mkdir(); // check how many times a key appear in each part- file for (int i = 0; i < 7; i++) { Util.copyFromClusterToLocal( cluster, "skewedjoin/part-r-0000" + i, "skewedjoin/part-r-0000" + i); BufferedReader reader = new BufferedReader(new FileReader("skewedjoin/part-r-0000" + i)); String line = null; while ((line = reader.readLine()) != null) { String[] cols = line.split("\t"); int key = Integer.parseInt(cols[0]) / 100 - 1; lineCount[key][i]++; } } int fc = 0; for (int i = 0; i < 3; i++) { for (int j = 0; j < 7; j++) { if (lineCount[i][j] > 0) { fc++; } } } // atleast one key should be a skewed key // check atleast one key should appear in more than 1 part- file assertTrue(fc > 3); }
// See PIG-1434 @Test public void testScalarAliasesGrammarNegative() throws Exception { String[] input = {"1\t5", "2\t10", "3\t20"}; String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesGrammar"; TestScalarAliases.createLocalInputFile(inputPath, input); try { pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A);"); // Only projections of C are supported pigServer.registerQuery("Y = foreach A generate C;"); pigServer.openIterator("Y"); // Control should not reach here fail("Scalar projections are only supported"); } catch (IOException pe) { assertTrue(pe.getMessage().contains("Invalid scalar projection: C")); } }
/** A three-way skewed join is unsupported and must raise an error. */
public void testSkewedJoin3Way() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);");
    try {
        DataBag results = BagFactory.getInstance().newDefaultBag();
        pigServer.registerQuery("D = join A by id, B by id, C by id using \"skewed\" parallel 5;");
        Iterator<Tuple> iter = pigServer.openIterator("D");
        while (iter.hasNext()) {
            results.add(iter.next());
        }
    } catch (Exception e) {
        // expected: Pig rejects a 3-way skewed join
        return;
    }
    fail("Should throw exception, do not support 3 way join");
}
/** Skewed join must handle null join keys without failing. */
public void testSkewedJoinNullKeys() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    try {
        DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
        pigServer.registerQuery("C = join A by id, B by id using \"skewed\";");
        Iterator<Tuple> iter = pigServer.openIterator("C");
        while (iter.hasNext()) {
            dbfrj.add(iter.next());
        }
    } catch (Exception e) {
        // AssertionError keeps the cause; println/printStackTrace + fail() lost it
        throw new AssertionError("Should support null keys in skewed join", e);
    }
}
/** Skewed join must accept map lookups and cast expressions as join keys. */
public void testSkewedJoinMapKey() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE4 + "' as (m:[]);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE4 + "' as (n:[]);");
    try {
        DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
        pigServer.registerQuery(
            "C = join A by (chararray)m#'a100', B by (chararray)n#'a100' using \"skewed\" parallel 20;");
        Iterator<Tuple> iter = pigServer.openIterator("C");
        while (iter.hasNext()) {
            dbfrj.add(iter.next());
        }
    } catch (Exception e) {
        // AssertionError keeps the cause; println/printStackTrace + fail() lost it
        throw new AssertionError("Should support maps and expression operators as keys", e);
    }
}
// See PIG-1434 @Test public void testScalarWithTwoBranches() throws Exception { String[] inputA = {"1\t5", "2\t10", "3\t20"}; String[] inputX = {"pig", "hadoop", "rocks"}; String output = BUILD_TEST_TMP + "testScalarWithTwoBranchesDir"; TestScalarAliases.deleteDirectory(new File(output)); // Test the use of scalars in expressions String inputPathA = BUILD_TEST_TMP + "testScalarWithTwoBranchesA"; TestScalarAliases.createLocalInputFile(inputPathA, inputA); String inputPathX = BUILD_TEST_TMP + "testScalarWithTwoBranchesX"; TestScalarAliases.createLocalInputFile(inputPathX, inputX); // Test in script mode pigServer.setBatchOn(); pigServer.registerQuery("A = LOAD '" + inputPathA + "' as (a0: long, a1: double);"); pigServer.registerQuery("B = group A all;"); pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;"); pigServer.registerQuery("X = LOAD '" + inputPathX + "' as (names: chararray);"); pigServer.registerQuery("Y = foreach X generate names, C.max;"); pigServer.registerQuery("Store Y into '" + output + "';"); pigServer.executeBatch(); // Check output pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: chararray, a1: double);"); Iterator<Tuple> iter = pigServer.openIterator("Z"); Tuple t = iter.next(); assertTrue(t.toString().equals("(pig,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(hadoop,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(rocks,20.0)")); assertFalse(iter.hasNext()); // Check in non-batch mode iter = pigServer.openIterator("Y"); t = iter.next(); assertTrue(t.toString().equals("(pig,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(hadoop,20.0)")); t = iter.next(); assertTrue(t.toString().equals("(rocks,20.0)")); assertFalse(iter.hasNext()); pigServer.getPigContext().getProperties().remove("tez.am.inline.task.execution.max-tasks"); }
/** Tests the end-to-end writing and reading of an empty BZip file. */ @Test public void testEmptyBzipInPig() throws Exception { PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); File in = File.createTempFile("junit", ".tmp"); in.deleteOnExit(); File out = File.createTempFile("junit", ".bz2"); out.delete(); String clusterOutputFilePath = Util.removeColon(out.getAbsolutePath()); FileOutputStream fos = new FileOutputStream(in); fos.write("55\n".getBytes()); fos.close(); System.out.println(in.getAbsolutePath()); pig.registerQuery( "AA = load '" + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext()) + "';"); pig.registerQuery("A=foreach (group (filter AA by $0 < '0') all) generate flatten($1);"); pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutputFilePath) + "';"); FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties())); FSDataInputStream is = fs.open(new Path(clusterOutputFilePath + "/part-r-00000.bz2")); CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length()); // Just a sanity check, to make sure it was a bzip file; we // will do the value verification later assertEquals(-1, cis.read(new byte[100])); cis.close(); pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutputFilePath) + "';"); pig.openIterator("B"); in.delete(); Util.deleteFile(cluster, clusterOutputFilePath); }
/** MongoUpdateStorage should apply $set updates keyed by f1 to the existing documents. */
@Test
public void testUpdate() throws Exception {
    BasicDBObject obj1 = new BasicDBObject().append("f1", "a").append("f2", "value1");
    BasicDBObject obj2 = new BasicDBObject().append("f1", "b").append("f2", "value2");
    insertData("testUpdate", obj1, obj2);
    String[] input = {"a\tnewValue1\t1", "b\tnewValue2\t2"};
    Util.createLocalInputFile("simple_input", input);
    pigServerLocal = new PigServer(ExecType.LOCAL);
    pigServerLocal.registerQuery(
        "A = LOAD 'simple_input' as (f1:chararray, f2:chararray, f3:int);");
    pigServerLocal.registerQuery(
        String.format(
            "STORE A INTO 'mongodb://localhost:27017/%s.%s' USING com.mongodb.hadoop.pig.MongoUpdateStorage("
                + " '{f1:\"\\\\$f1\"}',"
                + " '{\\\\$set:{f2:\"\\\\$f2\", f3:\"\\\\$f3\"}}',"
                + " 'f1:chararray, f2:chararray, f3:int'"
                + ");",
            dbName, "update_simple"));
    // NOTE(review): setBatchOn() after registerQuery looks out of order — it usually precedes
    // the queries; confirm the store actually runs in batch mode as intended.
    pigServerLocal.setBatchOn();
    pigServerLocal.executeBatch();
    // close the client in a finally block — the original leaked the connection
    MongoClient mc = new MongoClient();
    try {
        DBCollection col = mc.getDB(dbName).getCollection("update_simple");
        DBCursor cursor = col.find();
        assertEquals(2, cursor.size());
        DBObject result1 = cursor.next();
        assertEquals("a", result1.get("f1"));
        assertEquals("newValue1", result1.get("f2"));
        assertEquals(1, result1.get("f3"));
        DBObject result2 = cursor.next();
        assertEquals("b", result2.get("f1"));
        assertEquals("newValue2", result2.get("f2"));
        assertEquals(2, result2.get("f3"));
    } finally {
        mc.close();
    }
}
/** Two filters over the same load force an implicit split; the union must return all 20 rows. */
@Test
public void testImplicitSplit() throws Exception {
    int LOOP_SIZE = 20;
    String[] input = new String[LOOP_SIZE];
    for (int i = 1; i <= LOOP_SIZE; i++) {
        input[i - 1] = Integer.toString(i);
    }
    String inputFileName = "testImplicitSplit-input.txt";
    Util.createInputFile(cluster, inputFileName, input);
    pigServer.registerQuery("A = LOAD '" + inputFileName + "';");
    // B and C partition A, so their union covers every row exactly once
    pigServer.registerQuery("B = filter A by $0<=10;");
    pigServer.registerQuery("C = filter A by $0>10;");
    pigServer.registerQuery("D = union B,C;");
    Iterator<Tuple> iter = pigServer.openIterator("D");
    if (!iter.hasNext()) fail("No Output received");
    int cnt = 0;
    while (iter.hasNext()) {
        // the tuple itself is not inspected; the unused local was removed
        iter.next();
        ++cnt;
    }
    assertEquals(20, cnt);
    Util.deleteFile(cluster, inputFileName);
}
/** Round-trips pairs through NullWritableConverter for the value side, then validates the index. */
@Test
public void readWriteNullValuesRead() throws IOException {
    registerLoadQuery();
    tempFilename = tempFilename + "-2";
    String storeQuery =
        String.format(
            "STORE A INTO 'file:%s' USING %s('-c %s', '-c %s');",
            tempFilename,
            SequenceFileStorage.class.getName(),
            IntWritableConverter.class.getName(),
            NullWritableConverter.class.getName());
    pigServer.registerQuery(storeQuery);
    registerLoadQuery(IntWritableConverter.class, NullWritableConverter.class, null);
    validateIndex(pigServer.openIterator("A"), 2, 0, 0);
}