@Test
public void testBzipStoreInMultiQuery2() throws Exception {
    String[] inputData = new String[] { "1\t2\r3\t4" };

    String inputFileName = "input2.txt";
    Util.createInputFile(cluster, inputFileName, inputData);

    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    PigContext pigContext = pig.getPigContext();
    pigContext.getProperties().setProperty("output.compression.enabled", "true");
    pigContext.getProperties().setProperty("output.compression.codec",
            "org.apache.hadoop.io.compress.BZip2Codec");

    pig.setBatchOn();
    pig.registerQuery("a = load '" + inputFileName + "';");
    pig.registerQuery("store a into 'output2.bz2';");
    pig.registerQuery("store a into 'output2';");
    pig.executeBatch();

    FileSystem fs = FileSystem.get(
            ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FileStatus stat = fs.getFileStatus(new Path("output2/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);

    stat = fs.getFileStatus(new Path("output2.bz2/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);
}
/**
 * Tests that '\n', '\r' and '\r\n' are treated as record delimiters when using
 * bzip, just as they are when using uncompressed text.
 */
@Test
public void testRecordDelims() throws Exception {
    String[] inputData = new String[] {
        "1\t2\r3\t4", // '\r' case - this line is split into two tuples
        "5\t6\r",     // '\r\n' case ('\n' is appended when writing below)
        "7\t8",       // '\n' case
        "9\t10\r"     // '\r\n' at the end of file
    };

    // bzip compressed input
    File in = File.createTempFile("junit", ".bz2");
    String compressedInputFileName = in.getAbsolutePath();
    in.deleteOnExit();
    String clusterCompressedFilePath = Util.removeColon(compressedInputFileName);

    String unCompressedInputFileName = "testRecordDelims-uncomp.txt";
    Util.createInputFile(cluster, unCompressedInputFileName, inputData);

    try {
        CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
        for (int i = 0; i < inputData.length; i++) {
            StringBuffer sb = new StringBuffer();
            sb.append(inputData[i]).append("\n");
            byte[] bytes = sb.toString().getBytes();
            cos.write(bytes);
        }
        cos.close();

        Util.copyFromLocalToCluster(cluster, compressedInputFileName,
                clusterCompressedFilePath);

        // pig script to read uncompressed input
        String script = "a = load '" + unCompressedInputFileName + "';";
        PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        pig.registerQuery(script);
        Iterator<Tuple> it1 = pig.openIterator("a");

        // pig script to read compressed input
        script = "a = load '" + Util.encodeEscape(clusterCompressedFilePath) + "';";
        pig.registerQuery(script);
        Iterator<Tuple> it2 = pig.openIterator("a");

        // both readers must produce identical tuples, in the same order
        while (it1.hasNext()) {
            Tuple t1 = it1.next();
            Tuple t2 = it2.next();
            Assert.assertEquals(t1, t2);
        }
        Assert.assertFalse(it2.hasNext());
    } finally {
        in.delete();
        Util.deleteFile(cluster, unCompressedInputFileName);
        Util.deleteFile(cluster, clusterCompressedFilePath);
    }
}
/** Tests the end-to-end writing and reading of a BZip file. */
@Test
public void testBzipInPig() throws Exception {
    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    File in = File.createTempFile("junit", ".bz2");
    in.deleteOnExit();

    File out = File.createTempFile("junit", ".bz2");
    out.delete();
    String clusterOutput = Util.removeColon(out.getAbsolutePath());

    CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
    for (int i = 1; i < 100; i++) {
        StringBuffer sb = new StringBuffer();
        sb.append(i).append("\n").append(-i).append("\n");
        byte[] bytes = sb.toString().getBytes();
        cos.write(bytes);
    }
    cos.close();

    pig.registerQuery("AA = load '"
            + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext())
            + "';");
    pig.registerQuery("A = foreach (group (filter AA by $0 > 0) all) generate flatten($1);");
    pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutput) + "';");

    FileSystem fs = FileSystem.get(
            ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FSDataInputStream is = fs.open(new Path(clusterOutput + "/part-r-00000.bz2"));
    CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

    // Just a sanity check, to make sure it was a bzip file; we
    // will do the value verification later
    assertEquals(100, cis.read(new byte[100]));
    cis.close();

    pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutput) + "';");
    Iterator<Tuple> i = pig.openIterator("B");
    HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
    while (i.hasNext()) {
        Integer val = DataType.toInteger(i.next().get(0));
        map.put(val, val);
    }

    // only the positive values 1..99 survive the filter
    assertEquals(99, map.keySet().size());
    for (int j = 1; j < 100; j++) {
        assertEquals(Integer.valueOf(j), map.get(j));
    }

    in.delete();
    Util.deleteFile(cluster, clusterOutput);
}
private void testCount(String inputFileName, Long expectedCount, int splitSize,
        String loadFuncSpec) throws IOException {
    String outputFile = "/tmp/bz-output";
    // simple load-store script to verify that the bzip input is getting split
    String scriptToTestSplitting = "a = load '" + inputFileName + "' using " + loadFuncSpec
            + "; store a into '" + outputFile + "';";

    String script = "a = load '" + inputFileName + "';"
            + "b = group a all;"
            + "c = foreach b generate COUNT_STAR(a);";

    Properties props = new Properties();
    for (Entry<Object, Object> entry : cluster.getProperties().entrySet()) {
        props.put(entry.getKey(), entry.getValue());
    }
    props.setProperty("mapred.max.split.size", Integer.toString(splitSize));
    PigContext pigContext = new PigContext(ExecType.MAPREDUCE, props);
    PigServer pig = new PigServer(pigContext);
    FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(props));
    fs.delete(new Path(outputFile), true);
    Util.registerMultiLineQuery(pig, scriptToTestSplitting);

    // verify that map output part files were produced; the small
    // mapred.max.split.size should cause the bzip input to be split
    FileStatus[] files = fs.listStatus(new Path(outputFile));
    int numPartFiles = 0;
    for (FileStatus fileStatus : files) {
        if (fileStatus.getPath().getName().startsWith("part")) {
            numPartFiles++;
        }
    }
    assertTrue(numPartFiles > 0);

    // verify the record count to confirm we read the bzip data correctly
    Util.registerMultiLineQuery(pig, script);
    Iterator<Tuple> it = pig.openIterator("c");
    Long result = (Long) it.next().get(0);
    assertEquals(expectedCount, result);
}
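// A hedged usage sketch for the testCount helper above: force several input
// splits with a small max split size, then check that COUNT_STAR still sees
// every record. The input file name, expected count, and split size below are
// hypothetical, not values from this test suite; real callers pass inputs they
// have created on the cluster beforehand.
@Test
public void testBzipCountSketch() throws IOException {
    // assumes 'sketch-input.bz2' exists on the cluster, holds 100 records,
    // and is larger than 76,800 bytes so it spans multiple input splits
    testCount("sketch-input.bz2", 100L, 76800, "PigStorage()");
}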
/** Tests the end-to-end writing and reading of an empty BZip file. */
@Test
public void testEmptyBzipInPig() throws Exception {
    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    File in = File.createTempFile("junit", ".tmp");
    in.deleteOnExit();

    File out = File.createTempFile("junit", ".bz2");
    out.delete();
    String clusterOutputFilePath = Util.removeColon(out.getAbsolutePath());

    FileOutputStream fos = new FileOutputStream(in);
    fos.write("55\n".getBytes());
    fos.close();

    pig.registerQuery("AA = load '"
            + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext())
            + "';");
    // the filter matches no records, so the stored bzip output is empty
    pig.registerQuery("A = foreach (group (filter AA by $0 < '0') all) generate flatten($1);");
    pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutputFilePath) + "';");

    FileSystem fs = FileSystem.get(
            ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FSDataInputStream is = fs.open(new Path(clusterOutputFilePath + "/part-r-00000.bz2"));
    CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

    // Just a sanity check that it is a valid bzip file; an empty stream
    // returns EOF immediately
    assertEquals(-1, cis.read(new byte[100]));
    cis.close();

    pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutputFilePath) + "';");
    pig.openIterator("B");

    in.delete();
    Util.deleteFile(cluster, clusterOutputFilePath);
}
// See PIG-1714
@Test
public void testBzipStoreInMultiQuery3() throws Exception {
    String[] inputData = new String[] { "1\t2\r3\t4" };

    String inputFileName = "input3.txt";
    Util.createInputFile(cluster, inputFileName, inputData);

    // set both the old (mapred.*) and new (mapreduce.*) property names so the
    // script works across Hadoop versions
    String inputScript = "set mapred.output.compress true\n"
            + "set mapreduce.output.fileoutputformat.compress true\n"
            + "set mapred.output.compression.codec org.apache.hadoop.io.compress.BZip2Codec\n"
            + "set mapreduce.output.fileoutputformat.compress.codec org.apache.hadoop.io.compress.BZip2Codec\n"
            + "a = load '" + inputFileName + "';\n"
            + "store a into 'output3.bz2';\n"
            + "store a into 'output3';";

    String inputScriptName = "script3.txt";
    PrintWriter pw = new PrintWriter(new FileWriter(inputScriptName));
    pw.println(inputScript);
    pw.close();

    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    FileInputStream fis = new FileInputStream(inputScriptName);
    pig.registerScript(fis);

    FileSystem fs = FileSystem.get(
            ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FileStatus stat = fs.getFileStatus(new Path("output3/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);

    stat = fs.getFileStatus(new Path("output3.bz2/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);
}
/**
 * Tests that Pig throws an Exception when the input files to be loaded are actually
 * the result of concatenating 2 or more bz2 files. Pig should not silently ignore
 * part of the input data.
 */
@Test(expected = IOException.class)
public void testBZ2Concatenation() throws Exception {
    String[] inputData1 = new String[] { "1\ta", "2\taa" };
    String[] inputData2 = new String[] { "1\tb", "2\tbb" };
    String[] inputDataMerged = new String[] { "1\ta", "2\taa", "1\tb", "2\tbb" };

    // bzip compressed input file1
    File in1 = File.createTempFile("junit", ".bz2");
    String compressedInputFileName1 = in1.getAbsolutePath();
    in1.deleteOnExit();

    // file2
    File in2 = File.createTempFile("junit", ".bz2");
    String compressedInputFileName2 = in2.getAbsolutePath();
    in2.deleteOnExit();

    String unCompressedInputFileName = "testBZ2Concatenation-uncomp.txt";
    Util.createInputFile(cluster, unCompressedInputFileName, inputDataMerged);

    try {
        CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in1));
        for (int i = 0; i < inputData1.length; i++) {
            StringBuffer sb = new StringBuffer();
            sb.append(inputData1[i]).append("\n");
            byte[] bytes = sb.toString().getBytes();
            cos.write(bytes);
        }
        cos.close();

        CBZip2OutputStream cos2 = new CBZip2OutputStream(new FileOutputStream(in2));
        for (int i = 0; i < inputData2.length; i++) {
            StringBuffer sb = new StringBuffer();
            sb.append(inputData2[i]).append("\n");
            byte[] bytes = sb.toString().getBytes();
            cos2.write(bytes);
        }
        cos2.close();

        // concatenate file2 onto the end of file1
        catInto(compressedInputFileName2, compressedInputFileName1);

        Util.copyFromLocalToCluster(cluster, compressedInputFileName1,
                compressedInputFileName1);

        // pig script to read uncompressed input
        String script = "a = load '" + Util.encodeEscape(unCompressedInputFileName) + "';";
        PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        pig.registerQuery(script);
        Iterator<Tuple> it1 = pig.openIterator("a");

        // pig script to read the compressed, concatenated input
        script = "a = load '" + Util.encodeEscape(compressedInputFileName1) + "';";
        pig.registerQuery(script);
        Iterator<Tuple> it2 = pig.openIterator("a");

        while (it1.hasNext()) {
            Tuple t1 = it1.next();
            Tuple t2 = it2.next();
            Assert.assertEquals(t1, t2);
        }
        Assert.assertFalse(it2.hasNext());
    } finally {
        in1.delete();
        in2.delete();
        Util.deleteFile(cluster, unCompressedInputFileName);
    }
}
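// The catInto helper used by testBZ2Concatenation is not shown in this section.
// A minimal sketch of what it needs to do, assuming it appends the raw bytes of
// the source file onto the destination file (the signature and the stream-based
// implementation here are assumptions, not the suite's actual code):
private void catInto(String src, String dest) throws IOException {
    // open dest in append mode so the two bz2 streams end up back to back
    FileOutputStream out = new FileOutputStream(dest, true);
    FileInputStream in = new FileInputStream(src);
    try {
        byte[] buf = new byte[4096];
        int n;
        while ((n = in.read(buf)) != -1) {
            out.write(buf, 0, n);
        }
    } finally {
        in.close();
        out.close();
    }
}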