@BeforeClass
public static void setUpBeforeClass() throws Exception {
    cluster = MiniCluster.buildCluster();
    pc = new PigContext(ExecType.LOCAL, new Properties());
    pcMR = new PigContext(ExecType.MAPREDUCE, cluster.getProperties());
    pc.connect();
}
public void testSchemaSerialization() throws IOException {
    MiniCluster cluster = MiniCluster.buildCluster();
    PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    String inputFileName = "testSchemaSerialization-input.txt";
    String[] inputData = new String[] { "foo\t1", "hello\t2" };
    Util.createInputFile(cluster, inputFileName, inputData);
    String script = "a = load '" + inputFileName + "' as (f1:chararray, f2:int);"
            + " b = group a all; c = foreach b generate org.apache.pig.test.InputSchemaUDF(a);";
    Util.registerMultiLineQuery(pigServer, script);
    Iterator<Tuple> it = pigServer.openIterator("c");
    while (it.hasNext()) {
        Tuple t = it.next();
        Assert.assertEquals("{a: {(f1: chararray,f2: int)}}", t.get(0));
    }
    cluster.shutDown();
}
public class TestSkewedJoin extends TestCase {
    private static final String INPUT_FILE1 = "SkewedJoinInput1.txt";
    private static final String INPUT_FILE2 = "SkewedJoinInput2.txt";
    private static final String INPUT_FILE3 = "SkewedJoinInput3.txt";
    private static final String INPUT_FILE4 = "SkewedJoinInput4.txt";
    private static final String INPUT_FILE5 = "SkewedJoinInput5.txt";
    private static final String INPUT_FILE6 = "SkewedJoinInput6.txt";
    private static final String INPUT_FILE7 = "SkewedJoinInput7.txt";

    private PigServer pigServer;
    private MiniCluster cluster = MiniCluster.buildCluster();

    public TestSkewedJoin() throws ExecException, IOException {
        pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        // pigServer = new PigServer(ExecType.LOCAL);
        pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "5");
        pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.memusage", "0.01");
    }

    @Before
    public void setUp() throws Exception {
        createFiles();
    }

    private void createFiles() throws IOException {
        PrintWriter w = new PrintWriter(new FileWriter(INPUT_FILE1));
        int k = 0;
        for (int j = 0; j < 120; j++) {
            w.println("100\tapple1\taaa" + k);
            k++;
            w.println("200\torange1\tbbb" + k);
            k++;
            w.println("300\tstrawberry\tccc" + k);
            k++;
        }
        w.close();

        PrintWriter w2 = new PrintWriter(new FileWriter(INPUT_FILE2));
        w2.println("100\tapple1");
        w2.println("100\tapple2");
        w2.println("100\tapple2");
        w2.println("200\torange1");
        w2.println("200\torange2");
        w2.println("300\tstrawberry");
        w2.println("400\tpear");
        w2.close();

        PrintWriter w3 = new PrintWriter(new FileWriter(INPUT_FILE3));
        w3.println("100\tapple1");
        w3.println("100\tapple2");
        w3.println("200\torange1");
        w3.println("200\torange2");
        w3.println("300\tstrawberry");
        w3.println("300\tstrawberry2");
        w3.println("400\tpear");
        w3.close();

        PrintWriter w4 = new PrintWriter(new FileWriter(INPUT_FILE4));
        for (int i = 0; i < 100; i++) {
            w4.println("[a100#apple1,a100#apple2,a200#orange1,a200#orange2,a300#strawberry,a300#strawberry2,a400#pear]");
        }
        w4.close();

        // Create a file with null keys
        PrintWriter w5 = new PrintWriter(new FileWriter(INPUT_FILE5));
        for (int i = 0; i < 10; i++) {
            w5.println("\tapple1");
        }
        w5.println("100\tapple2");
        for (int i = 0; i < 10; i++) {
            w5.println("\torange1");
        }
        w5.println("\t");
        w5.println("100\t");
        w5.close();

        PrintWriter w6 = new PrintWriter(new FileWriter(INPUT_FILE6));
        for (int i = 0; i < 300; i++) {
            for (int j = 0; j < 5; j++) {
                w6.println("" + i + "\t" + j);
            }
        }
        w6.close();

        PrintWriter w7 = new PrintWriter(new FileWriter(INPUT_FILE7));
        for (int i = 0; i < 300; i = i + 3) {
            for (int j = 0; j < 2; j++) {
                w7.println("" + i + "\t" + j);
            }
        }
        w7.close();

        Util.copyFromLocalToCluster(cluster, INPUT_FILE1, INPUT_FILE1);
        Util.copyFromLocalToCluster(cluster, INPUT_FILE2, INPUT_FILE2);
        Util.copyFromLocalToCluster(cluster, INPUT_FILE3, INPUT_FILE3);
        Util.copyFromLocalToCluster(cluster, INPUT_FILE4, INPUT_FILE4);
        Util.copyFromLocalToCluster(cluster, INPUT_FILE5, INPUT_FILE5);
        Util.copyFromLocalToCluster(cluster, INPUT_FILE6, INPUT_FILE6);
        Util.copyFromLocalToCluster(cluster, INPUT_FILE7, INPUT_FILE7);
    }

    @After
    public void tearDown() throws Exception {
        new File(INPUT_FILE1).delete();
        new File(INPUT_FILE2).delete();
        new File(INPUT_FILE3).delete();
        new File(INPUT_FILE4).delete();
        new File(INPUT_FILE5).delete();
        new File(INPUT_FILE6).delete();
        new File(INPUT_FILE7).delete();
        Util.deleteDirectory(new File("skewedjoin"));

        Util.deleteFile(cluster, INPUT_FILE1);
        Util.deleteFile(cluster, INPUT_FILE2);
        Util.deleteFile(cluster, INPUT_FILE3);
        Util.deleteFile(cluster, INPUT_FILE4);
        Util.deleteFile(cluster, INPUT_FILE5);
        Util.deleteFile(cluster, INPUT_FILE6);
        Util.deleteFile(cluster, INPUT_FILE7);
    }

    public void testSkewedJoinWithGroup() throws IOException {
        pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
        pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
        pigServer.registerQuery("C = GROUP A by id;");
        pigServer.registerQuery("D = GROUP B by id;");

        DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
                dbshj = BagFactory.getInstance().newDefaultBag();
        {
            pigServer.registerQuery("E = join C by group, D by group using \"skewed\" parallel 5;");
            Iterator<Tuple> iter = pigServer.openIterator("E");
            while (iter.hasNext()) {
                dbfrj.add(iter.next());
            }
        }
        {
            pigServer.registerQuery("E = join C by group, D by group;");
            Iterator<Tuple> iter = pigServer.openIterator("E");
            while (iter.hasNext()) {
                dbshj.add(iter.next());
            }
        }
        Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
        Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj));
    }

    public void testSkewedJoinWithNoProperties() throws IOException {
        pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

        pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
        pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
        try {
            DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
            DataBag dbshj = BagFactory.getInstance().newDefaultBag();
            {
                pigServer.registerQuery(
                        "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;");
                Iterator<Tuple> iter = pigServer.openIterator("C");
                while (iter.hasNext()) {
                    dbfrj.add(iter.next());
                }
            }
            {
                pigServer.registerQuery("E = join A by (id, name), B by (id, name);");
                Iterator<Tuple> iter = pigServer.openIterator("E");
                while (iter.hasNext()) {
                    dbshj.add(iter.next());
                }
            }
            Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
            Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj));
        } catch (Exception e) {
            fail(e.getMessage());
        }
    }

    public void testSkewedJoinReducers() throws IOException {
        pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
        pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
        try {
            DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
            {
                pigServer.registerQuery("C = join A by id, B by id using \"skewed\" parallel 1;");
                Iterator<Tuple> iter = pigServer.openIterator("C");
                while (iter.hasNext()) {
                    dbfrj.add(iter.next());
                }
            }
        } catch (Exception e) {
            fail("Should not throw exception, should continue execution");
        }
    }

    public void testSkewedJoin3Way() throws IOException {
        pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
        pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
        pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);");
        try {
            DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
            {
                pigServer.registerQuery("D = join A by id, B by id, C by id using \"skewed\" parallel 5;");
                Iterator<Tuple> iter = pigServer.openIterator("D");
                while (iter.hasNext()) {
                    dbfrj.add(iter.next());
                }
            }
        } catch (Exception e) {
            return;
        }
        fail("Should throw exception; 3-way skewed join is not supported");
    }

    public void testSkewedJoinMapKey() throws IOException {
        pigServer.registerQuery("A = LOAD '" + INPUT_FILE4 + "' as (m:[]);");
        pigServer.registerQuery("B = LOAD '" + INPUT_FILE4 + "' as (n:[]);");
        try {
            DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
            {
                pigServer.registerQuery(
                        "C = join A by (chararray)m#'a100', B by (chararray)n#'a100' using \"skewed\" parallel 20;");
                Iterator<Tuple> iter = pigServer.openIterator("C");
                while (iter.hasNext()) {
                    dbfrj.add(iter.next());
                }
            }
        } catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
            fail("Should support maps and expression operators as keys");
        }
    }

    public void testSkewedJoinKeyPartition() throws IOException {
        try {
            Util.deleteFile(cluster, "skewedjoin");
        } catch (Exception e) {
            // it is ok if the directory does not exist
        }
        pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
        pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
        pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 7;");
        pigServer.store("E", "skewedjoin");

        int[][] lineCount = new int[3][7];
        new File("skewedjoin").mkdir();
        // count how many times each key appears in each part file
        for (int i = 0; i < 7; i++) {
            Util.copyFromClusterToLocal(cluster, "skewedjoin/part-r-0000" + i, "skewedjoin/part-r-0000" + i);
            BufferedReader reader = new BufferedReader(new FileReader("skewedjoin/part-r-0000" + i));
            String line = null;
            while ((line = reader.readLine()) != null) {
                String[] cols = line.split("\t");
                int key = Integer.parseInt(cols[0]) / 100 - 1;
                lineCount[key][i]++;
            }
            reader.close();
        }

        int fc = 0;
        for (int i = 0; i < 3; i++) {
            for (int j = 0; j < 7; j++) {
                if (lineCount[i][j] > 0) {
                    fc++;
                }
            }
        }
        // at least one key should be a skewed key, i.e.
        // at least one key should appear in more than one part file
        assertTrue(fc > 3);
    }

    public void testSkewedJoinNullKeys() throws IOException {
        pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id, name);");
        pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id, name);");
        try {
            DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
            {
                pigServer.registerQuery("C = join A by id, B by id using \"skewed\";");
                Iterator<Tuple> iter = pigServer.openIterator("C");
                while (iter.hasNext()) {
                    dbfrj.add(iter.next());
                }
            }
        } catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
            fail("Should support null keys in skewed join");
        }
    }

    public void testSkewedJoinOuter() throws IOException {
        pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id, name);");
        pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id, name);");
        try {
            DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
            {
                pigServer.registerQuery("C = join A by id left, B by id using \"skewed\";");
                Iterator<Tuple> iter = pigServer.openIterator("C");
                while (iter.hasNext()) {
                    dbfrj.add(iter.next());
                }
            }
            {
                pigServer.registerQuery("C = join A by id right, B by id using \"skewed\";");
                Iterator<Tuple> iter = pigServer.openIterator("C");
                while (iter.hasNext()) {
                    dbfrj.add(iter.next());
                }
            }
            {
                pigServer.registerQuery("C = join A by id full, B by id using \"skewed\";");
                Iterator<Tuple> iter = pigServer.openIterator("C");
                while (iter.hasNext()) {
                    dbfrj.add(iter.next());
                }
            }
        } catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
            fail("Should support outer join in skewed join");
        }
    }

    // see PIG-1048
    public void testSkewedJoinOneValue() throws IOException {
        pigServer.registerQuery("A = LOAD '" + INPUT_FILE3 + "' as (id, name);");
        pigServer.registerQuery("B = LOAD '" + INPUT_FILE3 + "' as (id, name);");
        // Filter key with a single value
        pigServer.registerQuery("C = FILTER A by id == 400;");
        pigServer.registerQuery("D = FILTER B by id == 400;");

        DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
                dbrj = BagFactory.getInstance().newDefaultBag();
        {
            pigServer.registerQuery("E = join C by id, D by id using \"skewed\";");
            Iterator<Tuple> iter = pigServer.openIterator("E");
            while (iter.hasNext()) {
                dbfrj.add(iter.next());
            }
        }
        {
            pigServer.registerQuery("E = join C by id, D by id;");
            Iterator<Tuple> iter = pigServer.openIterator("E");
            while (iter.hasNext()) {
                dbrj.add(iter.next());
            }
        }
        Assert.assertEquals(dbfrj.size(), dbrj.size());
        Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj));
    }

    public void testSkewedJoinManyReducers() throws IOException {
        pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "2");
        pigServer.registerQuery("A = LOAD '" + INPUT_FILE6 + "' as (id, name);");
        pigServer.registerQuery("B = LOAD '" + INPUT_FILE7 + "' as (id, name);");

        DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
                dbrj = BagFactory.getInstance().newDefaultBag();
        {
            pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 300;");
            Iterator<Tuple> iter = pigServer.openIterator("E");
            while (iter.hasNext()) {
                dbfrj.add(iter.next());
            }
        }
        {
            pigServer.registerQuery("E = join A by id, B by id;");
            Iterator<Tuple> iter = pigServer.openIterator("E");
            while (iter.hasNext()) {
                dbrj.add(iter.next());
            }
        }
        Assert.assertEquals(dbfrj.size(), dbrj.size());
        Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj));
    }

    public void testSkewedJoinEmptyInput() throws IOException {
        String LEFT_INPUT_FILE = "left.dat";
        String RIGHT_INPUT_FILE = "right.dat";

        PrintWriter w = new PrintWriter(new FileWriter(LEFT_INPUT_FILE));
        w.println("1");
        w.println("2");
        w.println("3");
        w.println("5");
        w.close();
        Util.copyFromLocalToCluster(cluster, LEFT_INPUT_FILE, LEFT_INPUT_FILE);

        PrintWriter w2 = new PrintWriter(new FileWriter(RIGHT_INPUT_FILE));
        w2.println("1\tone");
        w2.println("2\ttwo");
        w2.println("3\tthree");
        w2.close();
        Util.copyFromLocalToCluster(cluster, RIGHT_INPUT_FILE, RIGHT_INPUT_FILE);

        pigServer.registerQuery("a = load 'left.dat' as (nums:chararray);");
        pigServer.registerQuery("b = load 'right.dat' as (number:chararray,text:chararray);");
        pigServer.registerQuery("c = filter a by nums == '7';");
        pigServer.registerQuery("d = join c by nums LEFT OUTER, b by number USING 'skewed';");

        Iterator<Tuple> iter = pigServer.openIterator("d");
        Assert.assertFalse(iter.hasNext());

        new File(LEFT_INPUT_FILE).delete();
        Util.deleteFile(cluster, LEFT_INPUT_FILE);
        new File(RIGHT_INPUT_FILE).delete();
        Util.deleteFile(cluster, RIGHT_INPUT_FILE);
    }

    public void testRecursiveFileListing() throws IOException {
        String LOCAL_INPUT_FILE = "test.dat";
        String INPUT_FILE = "foo/bar/test.dat";

        PrintWriter w = new PrintWriter(new FileWriter(LOCAL_INPUT_FILE));
        w.println("1");
        w.println("2");
        w.println("3");
        w.println("5");
        w.close();
        Util.copyFromLocalToCluster(cluster, LOCAL_INPUT_FILE, INPUT_FILE);

        pigServer.registerQuery("a = load 'foo' as (nums:chararray);");
        pigServer.registerQuery("b = load 'foo' as (nums:chararray);");
        pigServer.registerQuery("d = join a by nums, b by nums USING 'skewed';");

        Iterator<Tuple> iter = pigServer.openIterator("d");
        int count = 0;
        while (iter.hasNext()) {
            iter.next();
            count++;
        }
        Assert.assertEquals(4, count);

        new File(LOCAL_INPUT_FILE).delete();
        Util.deleteFile(cluster, INPUT_FILE);
    }
}
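For reference, the plans these tests drive reduce to a short Pig Latin script. A minimal sketch of the skewed-join syntax under test (the file names here are hypothetical placeholders, not files the tests create):

    big   = LOAD 'big.txt'   AS (id, name);
    small = LOAD 'small.txt' AS (id, name);
    j     = JOIN big BY id, small BY id USING 'skewed' PARALLEL 5;

Most of the tests above collect the output of such a plan into a DataBag and compare it against the bag produced by the same join without USING 'skewed'; the two must contain the same tuples.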
public class TestImplicitSplit extends TestCase {
    private PigServer pigServer;
    MiniCluster cluster = MiniCluster.buildCluster();

    @Before
    public void setUp() throws Exception {
        pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    }

    @After
    public void tearDown() throws Exception {}

    @Test
    public void testImplicitSplit() throws Exception {
        int LOOP_SIZE = 20;
        String[] input = new String[LOOP_SIZE];
        for (int i = 1; i <= LOOP_SIZE; i++) {
            input[i - 1] = Integer.toString(i);
        }
        String inputFileName = "testImplicitSplit-input.txt";
        Util.createInputFile(cluster, inputFileName, input);
        pigServer.registerQuery("A = LOAD '" + inputFileName + "';");
        pigServer.registerQuery("B = filter A by $0<=10;");
        pigServer.registerQuery("C = filter A by $0>10;");
        pigServer.registerQuery("D = union B,C;");
        Iterator<Tuple> iter = pigServer.openIterator("D");
        if (!iter.hasNext()) {
            fail("No Output received");
        }
        int cnt = 0;
        while (iter.hasNext()) {
            iter.next();
            ++cnt;
        }
        assertEquals(20, cnt);
        Util.deleteFile(cluster, inputFileName);
    }

    @Test
    public void testImplicitSplitInCoGroup() throws Exception {
        // this query is similar to the one reported in JIRA - PIG-537
        // Create input files
        String input1 = "testImplicitSplitInCoGroup-input1.txt";
        String input2 = "testImplicitSplitInCoGroup-input2.txt";
        Util.createInputFile(cluster, input1, new String[] { "a:1", "b:2", "b:20", "c:3", "c:30" });
        Util.createInputFile(cluster, input2, new String[] { "a:first", "b:second", "c:third" });
        pigServer.registerQuery("a = load '" + input1 + "' using PigStorage(':') as (name:chararray, marks:int);");
        pigServer.registerQuery("b = load '" + input2 + "' using PigStorage(':') as (name:chararray, rank:chararray);");
        pigServer.registerQuery("c = cogroup a by name, b by name;");
        pigServer.registerQuery("d = foreach c generate group, FLATTEN(a.marks) as newmarks;");
        pigServer.registerQuery("e = cogroup a by marks, d by newmarks;");
        pigServer.registerQuery("f = foreach e generate group, flatten(a), flatten(d);");

        HashMap<Integer, Object[]> results = new HashMap<Integer, Object[]>();
        results.put(1, new Object[] { "a", 1, "a", 1 });
        results.put(2, new Object[] { "b", 2, "b", 2 });
        results.put(3, new Object[] { "c", 3, "c", 3 });
        results.put(20, new Object[] { "b", 20, "b", 20 });
        results.put(30, new Object[] { "c", 30, "c", 30 });

        Iterator<Tuple> it = pigServer.openIterator("f");
        while (it.hasNext()) {
            Tuple t = it.next();
            System.err.println("Tuple:" + t);
            Integer group = (Integer) t.get(0);
            Object[] groupValues = results.get(group);
            for (int i = 0; i < 4; i++) {
                assertEquals(groupValues[i], t.get(i + 1));
            }
        }
        Util.deleteFile(cluster, input1);
        Util.deleteFile(cluster, input2);
    }

    @Test
    public void testImplicitSplitInCoGroup2() throws Exception {
        // this query is similar to the one reported in JIRA - PIG-537
        LogicalPlanTester planTester = new LogicalPlanTester();
        planTester.buildPlan("a = load 'file1' using PigStorage(':') as (name:chararray, marks:int);");
        planTester.buildPlan("b = load 'file2' using PigStorage(':') as (name:chararray, rank:chararray);");
        planTester.buildPlan("c = cogroup a by name, b by name;");
        planTester.buildPlan("d = foreach c generate group, FLATTEN(a.marks) as newmarks;");
        planTester.buildPlan("e = cogroup a by marks, d by newmarks;");
        LogicalPlan plan = planTester.buildPlan("f = foreach e generate group, flatten(a), flatten(d);");

        // Set the logical plan values correctly in all the operators
        PlanSetter ps = new PlanSetter(plan);
        ps.visit();

        // run through validator
        CompilationMessageCollector collector = new CompilationMessageCollector();
        TypeCheckingValidator typeValidator = new TypeCheckingValidator();
        typeValidator.validate(plan, collector);
        printMessageCollector(collector);
        printTypeGraph(plan);
        if (collector.hasError()) {
            throw new Exception("Error during type checking");
        }

        // this will run ImplicitSplitInserter
        TestLogicalOptimizer.optimizePlan(plan);

        // get Schema of leaf and compare:
        Schema expectedSchema = Util.getSchemaFromString(
                "grp: int,A::username: chararray,A::marks: int,AB::group: chararray,AB::newmarks: int");
        assertTrue(Schema.equals(expectedSchema, plan.getLeaves().get(0).getSchema(), false, true));
    }
}
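The implicit split being tested arises when one alias feeds two downstream operators, as in testImplicitSplit where A feeds both filters. A sketch of the roughly equivalent explicit form that ImplicitSplitInserter effectively produces (assuming the same input file as the test):

    A = LOAD 'testImplicitSplit-input.txt';
    SPLIT A INTO B IF $0 <= 10, C IF $0 > 10;
    D = UNION B, C;

so D again contains all 20 input rows.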
@RunWith(JUnit4.class)
public class TestBZip {
    static MiniCluster cluster = MiniCluster.buildCluster();

    @AfterClass
    public static void oneTimeTearDown() throws Exception {
        cluster.shutDown();
    }

    /** Tests the end-to-end writing and reading of a BZip file. */
    @Test
    public void testBzipInPig() throws Exception {
        PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

        File in = File.createTempFile("junit", ".bz2");
        in.deleteOnExit();

        File out = File.createTempFile("junit", ".bz2");
        out.delete();
        String clusterOutput = Util.removeColon(out.getAbsolutePath());

        CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
        for (int i = 1; i < 100; i++) {
            StringBuffer sb = new StringBuffer();
            sb.append(i).append("\n").append(-i).append("\n");
            byte[] bytes = sb.toString().getBytes();
            cos.write(bytes);
        }
        cos.close();

        pig.registerQuery("AA = load '"
                + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext()) + "';");
        pig.registerQuery("A = foreach (group (filter AA by $0 > 0) all) generate flatten($1);");
        pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutput) + "';");

        FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
        FSDataInputStream is = fs.open(new Path(clusterOutput + "/part-r-00000.bz2"));
        CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

        // Just a sanity check, to make sure it was a bzip file; we
        // will do the value verification later
        assertEquals(100, cis.read(new byte[100]));
        cis.close();

        pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutput) + "';");

        Iterator<Tuple> i = pig.openIterator("B");
        HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
        while (i.hasNext()) {
            Integer val = DataType.toInteger(i.next().get(0));
            map.put(val, val);
        }

        assertEquals(new Integer(99), new Integer(map.keySet().size()));
        for (int j = 1; j < 100; j++) {
            assertEquals(new Integer(j), map.get(j));
        }

        in.delete();
        Util.deleteFile(cluster, clusterOutput);
    }

    /**
     * Tests the end-to-end writing and reading of a BZip file using an absolute path with a
     * trailing /.
     */
    @Test
    public void testBzipInPig2() throws Exception {
        PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

        File in = File.createTempFile("junit", ".bz2");
        in.deleteOnExit();

        File out = File.createTempFile("junit", ".bz2");
        out.delete();
        String clusterOutput = Util.removeColon(out.getAbsolutePath());

        CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
        for (int i = 1; i < 100; i++) {
            StringBuffer sb = new StringBuffer();
            sb.append(i).append("\n").append(-i).append("\n");
            byte[] bytes = sb.toString().getBytes();
            cos.write(bytes);
        }
        cos.close();

        pig.registerQuery("AA = load '"
                + Util.generateURI(in.getAbsolutePath(), pig.getPigContext()) + "';");
        pig.registerQuery("A = foreach (group (filter AA by $0 > 0) all) generate flatten($1);");
        pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutput) + "/';");

        FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
        FSDataInputStream is = fs.open(new Path(clusterOutput + "/part-r-00000.bz2"));
        CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

        // Just a sanity check, to make sure it was a bzip file; we
        // will do the value verification later
        assertEquals(100, cis.read(new byte[100]));
        cis.close();

        pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutput) + "';");

        Iterator<Tuple> i = pig.openIterator("B");
        HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
        while (i.hasNext()) {
            Integer val = DataType.toInteger(i.next().get(0));
            map.put(val, val);
        }

        assertEquals(new Integer(99), new Integer(map.keySet().size()));
        for (int j = 1; j < 100; j++) {
            assertEquals(new Integer(j), map.get(j));
        }

        in.delete();
        out.delete();
    }

    // see PIG-2391
    @Test
    public void testBz2() throws Exception {
        String[] inputData = new String[] {
            "1\t2\r3\t4", // '\r' case - this will be split into two tuples
            "5\t6\r",     // '\r\n' case
            "7\t8",       // '\n' case
            "9\t10\r"     // '\r\n' at the end of file
        };

        // bzip compressed input
        File in = File.createTempFile("junit", ".bz2");
        String compressedInputFileName = in.getAbsolutePath();
        String clusterCompressedFilePath = Util.removeColon(compressedInputFileName);
        in.deleteOnExit();

        try {
            CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
            for (int i = 0; i < inputData.length; i++) {
                StringBuffer sb = new StringBuffer();
                sb.append(inputData[i]).append("\n");
                byte[] bytes = sb.toString().getBytes();
                cos.write(bytes);
            }
            cos.close();

            Util.copyFromLocalToCluster(cluster, compressedInputFileName, clusterCompressedFilePath);

            // pig script to read compressed input
            PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
            String script = "a = load '" + Util.encodeEscape(clusterCompressedFilePath) + "';";
            pig.registerQuery(script);
            pig.registerQuery("store a into 'intermediate.bz';");
            pig.registerQuery("b = load 'intermediate.bz';");
            Iterator<Tuple> it2 = pig.openIterator("b");
            while (it2.hasNext()) {
                it2.next();
            }
        } finally {
            in.delete();
            Util.deleteFile(cluster, "intermediate.bz");
            Util.deleteFile(cluster, "final.bz");
        }
    }

    /**
     * Tests that '\n', '\r' and '\r\n' are treated as record delims when using bzip just like
     * they are when using uncompressed text.
     */
    @Test
    public void testRecordDelims() throws Exception {
        String[] inputData = new String[] {
            "1\t2\r3\t4", // '\r' case - this will be split into two tuples
            "5\t6\r",     // '\r\n' case
            "7\t8",       // '\n' case
            "9\t10\r"     // '\r\n' at the end of file
        };

        // bzip compressed input
        File in = File.createTempFile("junit", ".bz2");
        String compressedInputFileName = in.getAbsolutePath();
        in.deleteOnExit();
        String clusterCompressedFilePath = Util.removeColon(compressedInputFileName);

        String unCompressedInputFileName = "testRecordDelims-uncomp.txt";
        Util.createInputFile(cluster, unCompressedInputFileName, inputData);

        try {
            CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
            for (int i = 0; i < inputData.length; i++) {
                StringBuffer sb = new StringBuffer();
                sb.append(inputData[i]).append("\n");
                byte[] bytes = sb.toString().getBytes();
                cos.write(bytes);
            }
            cos.close();

            Util.copyFromLocalToCluster(cluster, compressedInputFileName, clusterCompressedFilePath);

            // pig script to read uncompressed input
            String script = "a = load '" + unCompressedInputFileName + "';";
            PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
            pig.registerQuery(script);
            Iterator<Tuple> it1 = pig.openIterator("a");

            // pig script to read compressed input
            script = "a = load '" + Util.encodeEscape(clusterCompressedFilePath) + "';";
            pig.registerQuery(script);
            Iterator<Tuple> it2 = pig.openIterator("a");

            while (it1.hasNext()) {
                Tuple t1 = it1.next();
                Tuple t2 = it2.next();
                Assert.assertEquals(t1, t2);
            }
            Assert.assertFalse(it2.hasNext());
        } finally {
            in.delete();
            Util.deleteFile(cluster, unCompressedInputFileName);
            Util.deleteFile(cluster, clusterCompressedFilePath);
        }
    }

    /** Tests the end-to-end writing and reading of an empty BZip file. */
    @Test
    public void testEmptyBzipInPig() throws Exception {
        PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

        File in = File.createTempFile("junit", ".tmp");
        in.deleteOnExit();

        File out = File.createTempFile("junit", ".bz2");
        out.delete();
        String clusterOutputFilePath = Util.removeColon(out.getAbsolutePath());

        FileOutputStream fos = new FileOutputStream(in);
        fos.write("55\n".getBytes());
        fos.close();
        System.out.println(in.getAbsolutePath());

        pig.registerQuery("AA = load '"
                + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext()) + "';");
        pig.registerQuery("A = foreach (group (filter AA by $0 < '0') all) generate flatten($1);");
        pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutputFilePath) + "';");

        FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
        FSDataInputStream is = fs.open(new Path(clusterOutputFilePath + "/part-r-00000.bz2"));
        CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

        // Just a sanity check, to make sure it was a bzip file; the
        // output is empty so there is nothing to read
        assertEquals(-1, cis.read(new byte[100]));
        cis.close();

        pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutputFilePath) + "';");
        pig.openIterator("B");

        in.delete();
        Util.deleteFile(cluster, clusterOutputFilePath);
    }

    /** Tests the writing and reading of an empty BZip file. */
    @Test
    public void testEmptyBzip() throws Exception {
        File tmp = File.createTempFile("junit", ".tmp");
        tmp.deleteOnExit();
        CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(tmp));
        cos.close();
        assertNotSame(0, tmp.length());
        FileSystem fs = FileSystem.getLocal(new Configuration(false));
        CBZip2InputStream cis = new CBZip2InputStream(fs.open(new Path(tmp.getAbsolutePath())), -1, tmp.length());
        assertEquals(-1, cis.read(new byte[100]));
        cis.close();
        tmp.delete();
    }

    /**
     * Tests the case where a bzip block ends exactly at the end of the {@link InputSplit}, with
     * the block header ending a few bits into the last byte of the current InputSplit. This case
     * resulted in dropped records in the Pig 0.6 release. This test also checks that bzip files a
     * couple of dirs deep can be read by specifying the top-level dir.
     */
    @Test
    public void testBlockHeaderEndingAtSplitNotByteAligned() throws IOException {
        // the actual input file is at
        // test/org/apache/pig/test/data/bzipdir1.bz2/bzipdir2.bz2/recordLossblockHeaderEndsAt136500.txt.bz2
        // In this test we will load test/org/apache/pig/test/data/bzipdir1.bz2 to also
        // test that the BZip2TextInputFormat can read subdirs recursively
        String inputFileName = "test/org/apache/pig/test/data/bzipdir1.bz2";
        Long expectedCount = 74999L; // number of lines in the above file
        // the first block in the above file ends exactly a few bits into the
        // byte at position 136500
        int splitSize = 136500;
        try {
            Util.copyFromLocalToCluster(cluster, inputFileName, inputFileName);
            testCount(inputFileName, expectedCount, splitSize, "PigStorage()");
            testCount(inputFileName, expectedCount, splitSize, "TextLoader()");
        } finally {
            Util.deleteFile(cluster, inputFileName);
        }
    }

    /**
     * Tests the case where a bzip block ends exactly at the end of the input split (byte aligned
     * with the last byte) and the last byte is a carriage return.
     */
    @Test
    public void testBlockHeaderEndingWithCR() throws IOException {
        String inputFileName = "test/org/apache/pig/test/data/blockEndingInCR.txt.bz2";
        // number of lines in the above file (the value is 1 more than bzcat | wc -l
        // since there is a '\r' which is also treated as a record delim)
        Long expectedCount = 82094L;
        // the first block in the above file ends exactly at the byte at
        // position 136498 and the last byte is a carriage return ('\r')
        try {
            int splitSize = 136498;
            Util.copyFromLocalToCluster(cluster, inputFileName, inputFileName);
            testCount(inputFileName, expectedCount, splitSize, "PigStorage()");
        } finally {
            Util.deleteFile(cluster, inputFileName);
        }
    }

    /**
     * Tests the case where a bzip block ends exactly at the end of the input split and has more
     * data, which resulted in overcounting (record duplication) in Pig 0.6.
     */
    @Test
    public void testBlockHeaderEndingAtSplitOverCounting() throws IOException {
        String inputFileName = "test/org/apache/pig/test/data/blockHeaderEndsAt136500.txt.bz2";
        Long expectedCount = 1041046L; // number of lines in the above file
        // the first block in the above file ends exactly a few bits into the
        // byte at position 136500
        int splitSize = 136500;
        try {
            Util.copyFromLocalToCluster(cluster, inputFileName, inputFileName);
            testCount(inputFileName, expectedCount, splitSize, "PigStorage()");
        } finally {
            Util.deleteFile(cluster, inputFileName);
        }
    }

    private void testCount(String inputFileName, Long expectedCount, int splitSize, String loadFuncSpec)
            throws IOException {
        String outputFile = "/tmp/bz-output";
        // simple load-store script to verify that the bzip input is getting split
        String scriptToTestSplitting = "a = load '" + inputFileName + "' using " + loadFuncSpec
                + "; store a into '" + outputFile + "';";

        String script = "a = load '" + inputFileName + "';"
                + "b = group a all;"
                + "c = foreach b generate COUNT_STAR(a);";

        Properties props = new Properties();
        for (Entry<Object, Object> entry : cluster.getProperties().entrySet()) {
            props.put(entry.getKey(), entry.getValue());
        }
        props.setProperty("mapred.max.split.size", Integer.toString(splitSize));
        PigContext pigContext = new PigContext(ExecType.MAPREDUCE, props);
        PigServer pig = new PigServer(pigContext);
        FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(props));
        fs.delete(new Path(outputFile), true);
        Util.registerMultiLineQuery(pig, scriptToTestSplitting);

        // verify that > 1 maps were launched due to splitting of the bzip input
        FileStatus[] files = fs.listStatus(new Path(outputFile));
        int numPartFiles = 0;
        for (FileStatus fileStatus : files) {
            if (fileStatus.getPath().getName().startsWith("part")) {
                numPartFiles++;
            }
        }
        assertEquals(true, numPartFiles > 0);

        // verify the record count to check that we read the bzip data correctly
        Util.registerMultiLineQuery(pig, script);
        Iterator<Tuple> it = pig.openIterator("c");
        Long result = (Long) it.next().get(0);
        assertEquals(expectedCount, result);
    }

    @Test
    public void testBzipStoreInMultiQuery() throws Exception {
        String[] inputData = new String[] { "1\t2\r3\t4" };
        String inputFileName = "input.txt";
        Util.createInputFile(cluster, inputFileName, inputData);

        PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        pig.setBatchOn();
        pig.registerQuery("a = load '" + inputFileName + "';");
        pig.registerQuery("store a into 'output.bz2';");
        pig.registerQuery("store a into 'output';");
        pig.executeBatch();

        FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
        FileStatus stat = fs.getFileStatus(new Path("output/part-m-00000"));
        assertTrue(stat.getLen() > 0);

        stat = fs.getFileStatus(new Path("output.bz2/part-m-00000.bz2"));
        assertTrue(stat.getLen() > 0);
    }

    @Test
    public void testBzipStoreInMultiQuery2() throws Exception {
        String[] inputData = new String[] { "1\t2\r3\t4" };
        String inputFileName = "input2.txt";
        Util.createInputFile(cluster, inputFileName, inputData);

        PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        PigContext pigContext = pig.getPigContext();
        pigContext.getProperties().setProperty("output.compression.enabled", "true");
        pigContext.getProperties().setProperty("output.compression.codec",
                "org.apache.hadoop.io.compress.BZip2Codec");

        pig.setBatchOn();
        pig.registerQuery("a = load '" + inputFileName + "';");
        pig.registerQuery("store a into 'output2.bz2';");
        pig.registerQuery("store a into 'output2';");
        pig.executeBatch();

        FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
        FileStatus stat = fs.getFileStatus(new Path("output2/part-m-00000.bz2"));
        assertTrue(stat.getLen() > 0);

        stat = fs.getFileStatus(new Path("output2.bz2/part-m-00000.bz2"));
        assertTrue(stat.getLen() > 0);
    }

    /**
     * Tests that Pig throws an Exception when the input files to be loaded are actually a result
     * of concatenating 2 or more bz2 files. Pig should not silently ignore part of the input data.
     */
    @Test(expected = IOException.class)
    public void testBZ2Concatenation() throws Exception {
        String[] inputData1 = new String[] { "1\ta", "2\taa" };
        String[] inputData2 = new String[] { "1\tb", "2\tbb" };
        String[] inputDataMerged = new String[] { "1\ta", "2\taa", "1\tb", "2\tbb" };

        // bzip compressed input file1
        File in1 = File.createTempFile("junit", ".bz2");
        String compressedInputFileName1 = in1.getAbsolutePath();
        in1.deleteOnExit();

        // file2
        File in2 = File.createTempFile("junit", ".bz2");
        String compressedInputFileName2 = in2.getAbsolutePath();
        in2.deleteOnExit();

        String unCompressedInputFileName = "testRecordDelims-uncomp.txt";
        Util.createInputFile(cluster, unCompressedInputFileName, inputDataMerged);

        try {
            CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in1));
            for (int i = 0; i < inputData1.length; i++) {
                StringBuffer sb = new StringBuffer();
                sb.append(inputData1[i]).append("\n");
                byte[] bytes = sb.toString().getBytes();
                cos.write(bytes);
            }
            cos.close();

            CBZip2OutputStream cos2 = new CBZip2OutputStream(new FileOutputStream(in2));
            for (int i = 0; i < inputData2.length; i++) {
                StringBuffer sb = new StringBuffer();
                sb.append(inputData2[i]).append("\n");
                byte[] bytes = sb.toString().getBytes();
                cos2.write(bytes);
            }
            cos2.close();

            // cat
            catInto(compressedInputFileName2, compressedInputFileName1);
            Util.copyFromLocalToCluster(cluster, compressedInputFileName1, compressedInputFileName1);

            // pig script to read uncompressed input
            String script = "a = load '" + Util.encodeEscape(unCompressedInputFileName) + "';";
            PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
            pig.registerQuery(script);
            Iterator<Tuple> it1 = pig.openIterator("a");

            // pig script to read compressed concatenated input
            script = "a = load '" + Util.encodeEscape(compressedInputFileName1) + "';";
            pig.registerQuery(script);
            Iterator<Tuple> it2 = pig.openIterator("a");

            while (it1.hasNext()) {
                Tuple t1 = it1.next();
                Tuple t2 = it2.next();
                Assert.assertEquals(t1, t2);
            }
            Assert.assertFalse(it2.hasNext());
        } finally {
            in1.delete();
            in2.delete();
            Util.deleteFile(cluster, unCompressedInputFileName);
        }
    }

    /*
     * Concatenate the contents of the src file to the contents of the dest file
     */
    private void catInto(String src, String dest) throws IOException {
        BufferedWriter out = new BufferedWriter(new FileWriter(dest, true));
        BufferedReader in = new BufferedReader(new FileReader(src));
        String str;
        while ((str = in.readLine()) != null) {
            out.write(str);
        }
        in.close();
        out.close();
    }

    // See PIG-1714
    @Test
    public void testBzipStoreInMultiQuery3() throws Exception {
        String[] inputData = new String[] { "1\t2\r3\t4" };
        String inputFileName = "input3.txt";
        Util.createInputFile(cluster, inputFileName, inputData);

        String inputScript = "set mapred.output.compress true\n"
                + "set mapreduce.output.fileoutputformat.compress true\n"
                + "set mapred.output.compression.codec org.apache.hadoop.io.compress.BZip2Codec\n"
                + "set mapreduce.output.fileoutputformat.compress.codec org.apache.hadoop.io.compress.BZip2Codec\n"
                + "a = load '" + inputFileName + "';\n"
                + "store a into 'output3.bz2';\n"
                + "store a into 'output3';";

        String inputScriptName = "script3.txt";
        PrintWriter pw = new PrintWriter(new FileWriter(inputScriptName));
        pw.println(inputScript);
        pw.close();

        PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        FileInputStream fis = new FileInputStream(inputScriptName);
        pig.registerScript(fis);

        FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
        FileStatus stat = fs.getFileStatus(new Path("output3/part-m-00000.bz2"));
        assertTrue(stat.getLen() > 0);

        stat = fs.getFileStatus(new Path("output3.bz2/part-m-00000.bz2"));
        assertTrue(stat.getLen() > 0);
    }
}
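testBzipStoreInMultiQuery2 and testBzipStoreInMultiQuery3 enable bzip2 output compression through job properties rather than through a .bz2 output path. In script form, the same effect comes from the set statements used in testBzipStoreInMultiQuery3; a minimal sketch with placeholder load/store paths:

    set mapred.output.compress true
    set mapred.output.compression.codec org.apache.hadoop.io.compress.BZip2Codec
    a = load 'input.txt';
    store a into 'output';

As testBzipStoreInMultiQuery shows, storing into a directory whose name ends in .bz2 also produces bzip2-compressed part files, even with no compression properties set.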
@AfterClass
public static void tearDownAfterClass() throws Exception {
    cluster.shutDown();
}