// Example 1
  /**
   * Verifies that enabling output compression via PigContext properties makes a
   * multi-store batch produce bzip2-compressed part files for both a target whose
   * name ends in '.bz2' and one that does not.
   */
  @Test
  public void testBzipStoreInMultiQuery2() throws Exception {
    // Single record containing an embedded '\r'.
    String[] input = new String[] {"1\t2\r3\t4"};

    String inputFile = "input2.txt";
    Util.createInputFile(cluster, inputFile, input);

    PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    Properties properties = pigServer.getPigContext().getProperties();
    properties.setProperty("output.compression.enabled", "true");
    properties.setProperty(
        "output.compression.codec", "org.apache.hadoop.io.compress.BZip2Codec");

    // Run both stores as one batch.
    pigServer.setBatchOn();
    pigServer.registerQuery("a = load '" + inputFile + "';");
    pigServer.registerQuery("store a into 'output2.bz2';");
    pigServer.registerQuery("store a into 'output2';");
    pigServer.executeBatch();

    FileSystem fs =
        FileSystem.get(
            ConfigurationUtil.toConfiguration(pigServer.getPigContext().getProperties()));

    // Both store targets must contain a non-empty bzip2 part file.
    FileStatus uncompressedName = fs.getFileStatus(new Path("output2/part-m-00000.bz2"));
    assertTrue(uncompressedName.getLen() > 0);

    FileStatus compressedName = fs.getFileStatus(new Path("output2.bz2/part-m-00000.bz2"));
    assertTrue(compressedName.getLen() > 0);
  }
// Example 2
  /**
   * Tests that '\n', '\r' and '\r\n' are treated as record delims when using bzip just like they
   * are when using uncompressed text.
   *
   * <p>The same logical records are written twice — once uncompressed on the cluster and once
   * bzip2-compressed locally — and the tuples produced by loading each are compared pairwise.
   */
  @Test
  public void testRecordDelims() throws Exception {
    String[] inputData =
        new String[] {
          "1\t2\r3\t4", // '\r' case - this will be split into two tuples
          "5\t6\r", // '\r\n' case
          "7\t8", // '\n' case
          "9\t10\r" // '\r\n' at the end of file
        };

    // bzip compressed input
    File in = File.createTempFile("junit", ".bz2");
    String compressedInputFileName = in.getAbsolutePath();
    in.deleteOnExit();
    String clusterCompressedFilePath = Util.removeColon(compressedInputFileName);

    String unCompressedInputFileName = "testRecordDelims-uncomp.txt";
    Util.createInputFile(cluster, unCompressedInputFileName, inputData);

    try {
      // Write the records bzip2-compressed; each record gets a trailing '\n'
      // on top of whatever '\r' the record text already carries.
      // close() in a finally so the stream cannot leak if a write throws.
      CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
      try {
        for (int i = 0; i < inputData.length; i++) {
          // StringBuilder: unsynchronized local buffer is all that is needed.
          StringBuilder sb = new StringBuilder();
          sb.append(inputData[i]).append("\n");
          cos.write(sb.toString().getBytes());
        }
      } finally {
        cos.close();
      }

      Util.copyFromLocalToCluster(cluster, compressedInputFileName, clusterCompressedFilePath);

      // pig script to read uncompressed input
      String script = "a = load '" + unCompressedInputFileName + "';";
      PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
      pig.registerQuery(script);
      Iterator<Tuple> it1 = pig.openIterator("a");

      // pig script to read compressed input
      script = "a = load '" + Util.encodeEscape(clusterCompressedFilePath) + "';";
      pig.registerQuery(script);
      Iterator<Tuple> it2 = pig.openIterator("a");

      while (it1.hasNext()) {
        Tuple t1 = it1.next();
        // Fail with a clear assertion (not NoSuchElementException) if the
        // compressed relation yields fewer tuples than the uncompressed one.
        Assert.assertTrue("compressed input yielded fewer tuples", it2.hasNext());
        Tuple t2 = it2.next();
        Assert.assertEquals(t1, t2);
      }

      // ... and the compressed relation must not have extra tuples either.
      Assert.assertFalse(it2.hasNext());

    } finally {
      in.delete();
      Util.deleteFile(cluster, unCompressedInputFileName);
      Util.deleteFile(cluster, clusterCompressedFilePath);
    }
  }
// Example 3
  /**
   * Tests the end-to-end writing and reading of a BZip file.
   *
   * <p>Writes 1..99 and their negations to a local bzip2 file, filters to the positive values
   * through a group/flatten pipeline, stores the result (bzip2-compressed), sanity-checks the
   * stored part file is valid bzip2, then reloads it and verifies exactly the values 1..99
   * come back.
   */
  @Test
  public void testBzipInPig() throws Exception {
    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    File in = File.createTempFile("junit", ".bz2");
    in.deleteOnExit();

    File out = File.createTempFile("junit", ".bz2");
    out.delete();
    String clusterOutput = Util.removeColon(out.getAbsolutePath());

    // Write i and -i, one per line, for i in 1..99, bzip2-compressed.
    // close() in a finally so the stream cannot leak if a write throws.
    CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
    try {
      for (int i = 1; i < 100; i++) {
        // StringBuilder: unsynchronized local buffer is all that is needed.
        StringBuilder sb = new StringBuilder();
        sb.append(i).append("\n").append(-i).append("\n");
        cos.write(sb.toString().getBytes());
      }
    } finally {
      cos.close();
    }

    pig.registerQuery(
        "AA = load '"
            + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext())
            + "';");
    pig.registerQuery("A = foreach (group (filter AA by $0 > 0) all) generate flatten($1);");
    pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutput) + "';");
    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FSDataInputStream is = fs.open(new Path(clusterOutput + "/part-r-00000.bz2"));
    CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

    // Just a sanity check, to make sure it was a bzip file; we
    // will do the value verification later
    assertEquals(100, cis.read(new byte[100]));
    cis.close();

    pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutput) + "';");

    Iterator<Tuple> i = pig.openIterator("B");
    HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
    while (i.hasNext()) {
      Integer val = DataType.toInteger(i.next().get(0));
      map.put(val, val);
    }

    // Exactly the 99 positive values must have come back.
    assertEquals(99, map.keySet().size());

    for (int j = 1; j < 100; j++) {
      // Integer.valueOf instead of the deprecated new Integer(...) constructor.
      assertEquals(Integer.valueOf(j), map.get(j));
    }

    in.delete();
    Util.deleteFile(cluster, clusterOutput);
  }
// Example 4
  /**
   * Loads {@code inputFileName} with {@code loadFuncSpec} under a small
   * {@code mapred.max.split.size}, stores it back out, then verifies (a) that at least one
   * part file was produced and (b) that a COUNT_STAR over the input matches
   * {@code expectedCount} — i.e. that splitting the bzip input did not drop or duplicate
   * records.
   *
   * @param inputFileName bzip input file path on the cluster
   * @param expectedCount expected number of records in the input
   * @param splitSize value for mapred.max.split.size, forcing the input to be split
   * @param loadFuncSpec load function spec used by the load-store script
   * @throws IOException on Pig or filesystem failure
   */
  private void testCount(
      String inputFileName, Long expectedCount, int splitSize, String loadFuncSpec)
      throws IOException {
    String outputFile = "/tmp/bz-output";
    // simple load-store script to verify that the bzip input is getting
    // split
    String scriptToTestSplitting =
        "a = load '"
            + inputFileName
            + "' using "
            + loadFuncSpec
            + "; store a into '"
            + outputFile
            + "';";

    String script =
        "a = load '"
            + inputFileName
            + "';"
            + "b = group a all;"
            + "c = foreach b generate COUNT_STAR(a);";

    // Copy the cluster configuration so the split-size override stays local to this run.
    Properties props = new Properties();
    props.putAll(cluster.getProperties());
    props.setProperty("mapred.max.split.size", Integer.toString(splitSize));
    PigContext pigContext = new PigContext(ExecType.MAPREDUCE, props);
    PigServer pig = new PigServer(pigContext);
    FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(props));
    fs.delete(new Path(outputFile), true);
    Util.registerMultiLineQuery(pig, scriptToTestSplitting);

    // verify that the store produced part files
    // NOTE(review): only numPartFiles > 0 is asserted, so "more than one map
    // due to splitting" is not strictly checked here — confirm intent.
    FileStatus[] files = fs.listStatus(new Path(outputFile));
    int numPartFiles = 0;
    for (FileStatus fileStatus : files) {
      if (fileStatus.getPath().getName().startsWith("part")) {
        numPartFiles++;
      }
    }
    assertTrue(numPartFiles > 0);

    // verify record count to verify we read bzip data correctly
    Util.registerMultiLineQuery(pig, script);
    Iterator<Tuple> it = pig.openIterator("c");
    Long result = (Long) it.next().get(0);
    assertEquals(expectedCount, result);
  }
// Example 5
  /**
   * Tests the end-to-end writing and reading of an empty BZip file.
   *
   * <p>The filter ($0 &lt; '0') matches nothing, so the stored relation is empty; the test
   * verifies that the resulting part file is still a readable bzip2 stream (read returns EOF)
   * and that the empty relation can be reloaded without error.
   */
  @Test
  public void testEmptyBzipInPig() throws Exception {
    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    File in = File.createTempFile("junit", ".tmp");
    in.deleteOnExit();

    File out = File.createTempFile("junit", ".bz2");
    out.delete();
    String clusterOutputFilePath = Util.removeColon(out.getAbsolutePath());

    // A single record that the filter below will reject ('55' is not < '0').
    // close() in a finally so the stream cannot leak if the write throws.
    FileOutputStream fos = new FileOutputStream(in);
    try {
      fos.write("55\n".getBytes());
    } finally {
      fos.close();
    }

    pig.registerQuery(
        "AA = load '"
            + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext())
            + "';");
    pig.registerQuery("A=foreach (group (filter AA by $0 < '0') all) generate flatten($1);");
    pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutputFilePath) + "';");
    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FSDataInputStream is = fs.open(new Path(clusterOutputFilePath + "/part-r-00000.bz2"));
    CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

    // The stream must be a valid but empty bzip2 file: read() hits EOF immediately.
    assertEquals(-1, cis.read(new byte[100]));
    cis.close();

    // Reloading the empty output must not fail.
    pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutputFilePath) + "';");
    pig.openIterator("B");

    in.delete();
    Util.deleteFile(cluster, clusterOutputFilePath);
  }
// Example 6
  // See PIG-1714: bzip2 output compression enabled via 'set' commands inside a
  // registered script must apply to both store targets, whether or not the
  // target name ends in '.bz2'.
  @Test
  public void testBzipStoreInMultiQuery3() throws Exception {
    String[] inputData = new String[] {"1\t2\r3\t4"};

    String inputFileName = "input3.txt";
    Util.createInputFile(cluster, inputFileName, inputData);

    String inputScript =
        "set mapred.output.compress true\n"
            + "set mapreduce.output.fileoutputformat.compress true\n"
            + "set mapred.output.compression.codec org.apache.hadoop.io.compress.BZip2Codec\n"
            + "set mapreduce.output.fileoutputformat.compress.codec org.apache.hadoop.io.compress.BZip2Codec\n"
            + "a = load '"
            + inputFileName
            + "';\n"
            + "store a into 'output3.bz2';\n"
            + "store a into 'output3';";

    String inputScriptName = "script3.txt";
    // close() in a finally so the writer cannot leak if println throws.
    PrintWriter pw = new PrintWriter(new FileWriter(inputScriptName));
    try {
      pw.println(inputScript);
    } finally {
      pw.close();
    }

    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    // Close the script stream after registration (the original leaked this
    // FileInputStream).
    FileInputStream fis = new FileInputStream(inputScriptName);
    try {
      pig.registerScript(fis);
    } finally {
      fis.close();
    }

    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FileStatus stat = fs.getFileStatus(new Path("output3/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);

    stat = fs.getFileStatus(new Path("output3.bz2/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);
  }
// Example 7
  /**
   * Tests that Pig throws an Exception when the input files to be loaded are actually a result of
   * concatenating 2 or more bz2 files. Pig should not silently ignore part of the input data.
   */
  @Test(expected = IOException.class)
  public void testBZ2Concatenation() throws Exception {
    String[] inputData1 = new String[] {"1\ta", "2\taa"};
    String[] inputData2 = new String[] {"1\tb", "2\tbb"};
    String[] inputDataMerged = new String[] {"1\ta", "2\taa", "1\tb", "2\tbb"};

    // bzip compressed input file1
    File in1 = File.createTempFile("junit", ".bz2");
    String compressedInputFileName1 = in1.getAbsolutePath();
    in1.deleteOnExit();

    // file2
    File in2 = File.createTempFile("junit", ".bz2");
    String compressedInputFileName2 = in2.getAbsolutePath();
    // Fixed copy-paste bug: the original registered in1 for delete-on-exit
    // twice and never registered in2.
    in2.deleteOnExit();

    String unCompressedInputFileName = "testRecordDelims-uncomp.txt";
    Util.createInputFile(cluster, unCompressedInputFileName, inputDataMerged);

    try {
      // Write the first half of the data bzip2-compressed into in1.
      // close() in a finally so the stream cannot leak if a write throws.
      CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in1));
      try {
        for (int i = 0; i < inputData1.length; i++) {
          // StringBuilder: unsynchronized local buffer is all that is needed.
          StringBuilder sb = new StringBuilder();
          sb.append(inputData1[i]).append("\n");
          cos.write(sb.toString().getBytes());
        }
      } finally {
        cos.close();
      }

      // Write the second half bzip2-compressed into in2.
      CBZip2OutputStream cos2 = new CBZip2OutputStream(new FileOutputStream(in2));
      try {
        for (int i = 0; i < inputData2.length; i++) {
          StringBuilder sb = new StringBuilder();
          sb.append(inputData2[i]).append("\n");
          cos2.write(sb.toString().getBytes());
        }
      } finally {
        cos2.close();
      }

      // Concatenate file2 onto file1, producing a multi-stream bz2 file.
      catInto(compressedInputFileName2, compressedInputFileName1);
      Util.copyFromLocalToCluster(cluster, compressedInputFileName1, compressedInputFileName1);

      // pig script to read uncompressed input
      String script = "a = load '" + Util.encodeEscape(unCompressedInputFileName) + "';";
      PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
      pig.registerQuery(script);
      Iterator<Tuple> it1 = pig.openIterator("a");

      // pig script to read compressed concatenated input; this is expected to
      // raise the IOException the test annotation requires.
      script = "a = load '" + Util.encodeEscape(compressedInputFileName1) + "';";
      pig.registerQuery(script);
      Iterator<Tuple> it2 = pig.openIterator("a");

      while (it1.hasNext()) {
        Tuple t1 = it1.next();
        Tuple t2 = it2.next();
        Assert.assertEquals(t1, t2);
      }

      Assert.assertFalse(it2.hasNext());

    } finally {
      in1.delete();
      in2.delete();
      Util.deleteFile(cluster, unCompressedInputFileName);
    }
  }