Example #1
 @BeforeClass
 public static void setUpBeforeClass() throws Exception {
   cluster = MiniCluster.buildCluster();
   pc = new PigContext(ExecType.LOCAL, new Properties());
   pcMR = new PigContext(ExecType.MAPREDUCE, cluster.getProperties());
   pc.connect();
 }
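A class-level teardown typically mirrors this setup by shutting the MiniCluster down, as Examples #16 and #17 below do; a minimal sketch, assuming the same static cluster field:

 @AfterClass
 public static void tearDownAfterClass() throws Exception {
   // release the MiniCluster started in setUpBeforeClass()
   cluster.shutDown();
 }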
Example #2
  public void testSkewedJoinWithNoProperties() throws IOException {
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      DataBag dbshj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery(
            "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
      {
        pigServer.registerQuery("E = join A by(id, name), B by (id, name);");
        Iterator<Tuple> iter = pigServer.openIterator("E");

        while (iter.hasNext()) {
          dbshj.add(iter.next());
        }
      }
      Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
      Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj));

    } catch (Exception e) {
      fail(e.getMessage());
    }
  }
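This test intentionally runs with a fresh PigServer and default settings. When the skewed-join thresholds do need tuning, they are set through PigContext properties, as in Example #10 below; a minimal sketch, assuming the same pigServer field:

    // Sketch (property names as in Example #10): lower the per-reducer tuple
    // limit and memory fraction so skew handling kicks in sooner.
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "5");
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.memusage", "0.01");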
Example #3
  @Test
  public void testBzipStoreInMultiQuery2() throws Exception {
    String[] inputData = new String[] {"1\t2\r3\t4"};

    String inputFileName = "input2.txt";
    Util.createInputFile(cluster, inputFileName, inputData);

    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    PigContext pigContext = pig.getPigContext();
    pigContext.getProperties().setProperty("output.compression.enabled", "true");
    pigContext
        .getProperties()
        .setProperty("output.compression.codec", "org.apache.hadoop.io.compress.BZip2Codec");

    pig.setBatchOn();
    pig.registerQuery("a = load '" + inputFileName + "';");
    pig.registerQuery("store a into 'output2.bz2';");
    pig.registerQuery("store a into 'output2';");
    pig.executeBatch();

    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FileStatus stat = fs.getFileStatus(new Path("output2/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);

    stat = fs.getFileStatus(new Path("output2.bz2/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);
  }
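The assertions above only check that the compressed part files are non-empty. To verify the data as well, the stored output can be loaded back, following the pattern of Example #6; a minimal sketch, assuming the same pig server:

    // Sketch: read the bzip output back and confirm records survive the round trip
    pig.registerQuery("b = load 'output2.bz2';");
    Iterator<Tuple> it = pig.openIterator("b");
    int records = 0;
    while (it.hasNext()) {
      it.next();
      records++;
    }
    assertTrue(records > 0);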
Example #4
  /**
   * Tests that '\n', '\r' and '\r\n' are treated as record delims when using bzip just like they
   * are when using uncompressed text.
   */
  @Test
  public void testRecordDelims() throws Exception {
    String[] inputData =
        new String[] {
          "1\t2\r3\t4", // '\r' case - this will be split into two tuples
          "5\t6\r", // '\r\n' case
          "7\t8", // '\n' case
          "9\t10\r" // '\r\n' at the end of file
        };

    // bzip compressed input
    File in = File.createTempFile("junit", ".bz2");
    String compressedInputFileName = in.getAbsolutePath();
    in.deleteOnExit();
    String clusterCompressedFilePath = Util.removeColon(compressedInputFileName);

    String unCompressedInputFileName = "testRecordDelims-uncomp.txt";
    Util.createInputFile(cluster, unCompressedInputFileName, inputData);

    try {
      CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
      for (int i = 0; i < inputData.length; i++) {
        StringBuffer sb = new StringBuffer();
        sb.append(inputData[i]).append("\n");
        byte bytes[] = sb.toString().getBytes();
        cos.write(bytes);
      }
      cos.close();

      Util.copyFromLocalToCluster(cluster, compressedInputFileName, clusterCompressedFilePath);

      // pig script to read uncompressed input
      String script = "a = load '" + unCompressedInputFileName + "';";
      PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
      pig.registerQuery(script);
      Iterator<Tuple> it1 = pig.openIterator("a");

      // pig script to read compressed input
      script = "a = load '" + Util.encodeEscape(clusterCompressedFilePath) + "';";
      pig.registerQuery(script);
      Iterator<Tuple> it2 = pig.openIterator("a");

      while (it1.hasNext()) {
        Tuple t1 = it1.next();
        Tuple t2 = it2.next();
        Assert.assertEquals(t1, t2);
      }

      Assert.assertFalse(it2.hasNext());

    } finally {
      in.delete();
      Util.deleteFile(cluster, unCompressedInputFileName);
      Util.deleteFile(cluster, clusterCompressedFilePath);
    }
  }
Example #5
 public void testSchemaSerialization() throws IOException {
   MiniCluster cluster = MiniCluster.buildCluster();
   PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
   String inputFileName = "testSchemaSerialization-input.txt";
   String[] inputData = new String[] {"foo\t1", "hello\t2"};
   Util.createInputFile(cluster, inputFileName, inputData);
   String script =
       "a = load '"
           + inputFileName
           + "' as (f1:chararray, f2:int);"
           + " b = group a all; c = foreach b generate org.apache.pig.test.InputSchemaUDF(a);";
   Util.registerMultiLineQuery(pigServer, script);
   Iterator<Tuple> it = pigServer.openIterator("c");
   while (it.hasNext()) {
     Tuple t = it.next();
     Assert.assertEquals("{a: {(f1: chararray,f2: int)}}", t.get(0));
   }
   cluster.shutDown();
 }
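InputSchemaUDF itself is not shown in this listing; the assertion implies it returns the schema of its input rendered as a string. A hypothetical sketch of such a UDF (the class body and the getInputSchema() call are assumptions, not the actual Pig test class):

// Hypothetical sketch: an EvalFunc that reports the schema of its input bag
// as a string, which is what the assertion above expects.
public class InputSchemaUDF extends EvalFunc<String> {
  @Override
  public String exec(Tuple input) throws IOException {
    return getInputSchema().toString();
  }
}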
Example #6
  /** Tests the end-to-end writing and reading of a BZip file. */
  @Test
  public void testBzipInPig() throws Exception {
    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    File in = File.createTempFile("junit", ".bz2");
    in.deleteOnExit();

    File out = File.createTempFile("junit", ".bz2");
    out.delete();
    String clusterOutput = Util.removeColon(out.getAbsolutePath());

    CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
    for (int i = 1; i < 100; i++) {
      StringBuffer sb = new StringBuffer();
      sb.append(i).append("\n").append(-i).append("\n");
      byte bytes[] = sb.toString().getBytes();
      cos.write(bytes);
    }
    cos.close();

    pig.registerQuery(
        "AA = load '"
            + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext())
            + "';");
    pig.registerQuery("A = foreach (group (filter AA by $0 > 0) all) generate flatten($1);");
    pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutput) + "';");
    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FSDataInputStream is = fs.open(new Path(clusterOutput + "/part-r-00000.bz2"));
    CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

    // Just a sanity check, to make sure it was a bzip file; we
    // will do the value verification later
    assertEquals(100, cis.read(new byte[100]));
    cis.close();

    pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutput) + "';");

    Iterator<Tuple> i = pig.openIterator("B");
    HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
    while (i.hasNext()) {
      Integer val = DataType.toInteger(i.next().get(0));
      map.put(val, val);
    }

    assertEquals(new Integer(99), new Integer(map.keySet().size()));

    for (int j = 1; j < 100; j++) {
      assertEquals(new Integer(j), map.get(j));
    }

    in.delete();
    Util.deleteFile(cluster, clusterOutput);
  }
Example #7
  private void testCount(
      String inputFileName, Long expectedCount, int splitSize, String loadFuncSpec)
      throws IOException {
    String outputFile = "/tmp/bz-output";
    // simple load-store script to verify that the bzip input is getting
    // split
    String scriptToTestSplitting =
        "a = load '"
            + inputFileName
            + "' using "
            + loadFuncSpec
            + "; store a into '"
            + outputFile
            + "';";

    String script =
        "a = load '"
            + inputFileName
            + "';"
            + "b = group a all;"
            + "c = foreach b generate COUNT_STAR(a);";
    Properties props = new Properties();
    for (Entry<Object, Object> entry : cluster.getProperties().entrySet()) {
      props.put(entry.getKey(), entry.getValue());
    }
    props.setProperty("mapred.max.split.size", Integer.toString(splitSize));
    PigContext pigContext = new PigContext(ExecType.MAPREDUCE, props);
    PigServer pig = new PigServer(pigContext);
    FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(props));
    fs.delete(new Path(outputFile), true);
    Util.registerMultiLineQuery(pig, scriptToTestSplitting);

    // verify that maps were launched due to splitting of the bzip input
    // (each map produces a part file)
    FileStatus[] files = fs.listStatus(new Path(outputFile));
    int numPartFiles = 0;
    for (FileStatus fileStatus : files) {
      if (fileStatus.getPath().getName().startsWith("part")) {
        numPartFiles++;
      }
    }
    assertEquals(true, numPartFiles > 0);

    // verify record count to verify we read bzip data correctly
    Util.registerMultiLineQuery(pig, script);
    Iterator<Tuple> it = pig.openIterator("c");
    Long result = (Long) it.next().get(0);
    assertEquals(expectedCount, result);
  }
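Example #14 below shows how this helper is driven; a typical call copies the bzip file to the cluster and passes the expected record count and split size:

    // From testBlockHeaderEndingAtSplitNotByteAligned (Example #14):
    Util.copyFromLocalToCluster(cluster, inputFileName, inputFileName);
    testCount(inputFileName, 74999L, 136500, "PigStorage()");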
Example #8
  /** Tests the end-to-end writing and reading of an empty BZip file. */
  @Test
  public void testEmptyBzipInPig() throws Exception {
    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    File in = File.createTempFile("junit", ".tmp");
    in.deleteOnExit();

    File out = File.createTempFile("junit", ".bz2");
    out.delete();
    String clusterOutputFilePath = Util.removeColon(out.getAbsolutePath());

    FileOutputStream fos = new FileOutputStream(in);
    fos.write("55\n".getBytes());
    fos.close();
    System.out.println(in.getAbsolutePath());

    pig.registerQuery(
        "AA = load '"
            + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext())
            + "';");
    pig.registerQuery("A=foreach (group (filter AA by $0 < '0') all) generate flatten($1);");
    pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutputFilePath) + "';");
    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FSDataInputStream is = fs.open(new Path(clusterOutputFilePath + "/part-r-00000.bz2"));
    CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

    // Just a sanity check, to make sure it was a bzip file; we
    // will do the value verification later
    assertEquals(-1, cis.read(new byte[100]));
    cis.close();

    pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutputFilePath) + "';");
    pig.openIterator("B");

    in.delete();
    Util.deleteFile(cluster, clusterOutputFilePath);
  }
Example #9
  // See PIG-1714
  @Test
  public void testBzipStoreInMultiQuery3() throws Exception {
    String[] inputData = new String[] {"1\t2\r3\t4"};

    String inputFileName = "input3.txt";
    Util.createInputFile(cluster, inputFileName, inputData);

    String inputScript =
        "set mapred.output.compress true\n"
            + "set mapreduce.output.fileoutputformat.compress true\n"
            + "set mapred.output.compression.codec org.apache.hadoop.io.compress.BZip2Codec\n"
            + "set mapreduce.output.fileoutputformat.compress.codec org.apache.hadoop.io.compress.BZip2Codec\n"
            + "a = load '"
            + inputFileName
            + "';\n"
            + "store a into 'output3.bz2';\n"
            + "store a into 'output3';";

    String inputScriptName = "script3.txt";
    PrintWriter pw = new PrintWriter(new FileWriter(inputScriptName));
    pw.println(inputScript);
    pw.close();

    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    FileInputStream fis = new FileInputStream(inputScriptName);
    pig.registerScript(fis);

    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FileStatus stat = fs.getFileStatus(new Path("output3/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);

    stat = fs.getFileStatus(new Path("output3.bz2/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);
  }
Example #10
 public TestSkewedJoin() throws ExecException, IOException {
   pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
   // pigServer = new PigServer(ExecType.LOCAL);
   pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "5");
   pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.memusage", "0.01");
 }
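This constructor assumes pigServer and cluster fields on the enclosing class; Example #11 below shows the full declarations:

  private PigServer pigServer;
  private MiniCluster cluster = MiniCluster.buildCluster();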
Example #11
public class TestSkewedJoin extends TestCase {
  private static final String INPUT_FILE1 = "SkewedJoinInput1.txt";
  private static final String INPUT_FILE2 = "SkewedJoinInput2.txt";
  private static final String INPUT_FILE3 = "SkewedJoinInput3.txt";
  private static final String INPUT_FILE4 = "SkewedJoinInput4.txt";
  private static final String INPUT_FILE5 = "SkewedJoinInput5.txt";
  private static final String INPUT_FILE6 = "SkewedJoinInput6.txt";
  private static final String INPUT_FILE7 = "SkewedJoinInput7.txt";

  private PigServer pigServer;
  private MiniCluster cluster = MiniCluster.buildCluster();

  public TestSkewedJoin() throws ExecException, IOException {
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    // pigServer = new PigServer(ExecType.LOCAL);
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "5");
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.memusage", "0.01");
  }

  @Before
  public void setUp() throws Exception {
    createFiles();
  }

  private void createFiles() throws IOException {
    PrintWriter w = new PrintWriter(new FileWriter(INPUT_FILE1));

    int k = 0;
    for (int j = 0; j < 120; j++) {
      w.println("100\tapple1\taaa" + k);
      k++;
      w.println("200\torange1\tbbb" + k);
      k++;
      w.println("300\tstrawberry\tccc" + k);
      k++;
    }

    w.close();

    PrintWriter w2 = new PrintWriter(new FileWriter(INPUT_FILE2));
    w2.println("100\tapple1");
    w2.println("100\tapple2");
    w2.println("100\tapple2");
    w2.println("200\torange1");
    w2.println("200\torange2");
    w2.println("300\tstrawberry");
    w2.println("400\tpear");

    w2.close();

    PrintWriter w3 = new PrintWriter(new FileWriter(INPUT_FILE3));
    w3.println("100\tapple1");
    w3.println("100\tapple2");
    w3.println("200\torange1");
    w3.println("200\torange2");
    w3.println("300\tstrawberry");
    w3.println("300\tstrawberry2");
    w3.println("400\tpear");

    w3.close();

    PrintWriter w4 = new PrintWriter(new FileWriter(INPUT_FILE4));
    for (int i = 0; i < 100; i++) {
      w4.println(
          "[a100#apple1,a100#apple2,a200#orange1,a200#orange2,a300#strawberry,a300#strawberry2,a400#pear]");
    }
    w4.close();

    // Create a file with null keys
    PrintWriter w5 = new PrintWriter(new FileWriter(INPUT_FILE5));
    for (int i = 0; i < 10; i++) {
      w5.println("\tapple1");
    }
    w5.println("100\tapple2");
    for (int i = 0; i < 10; i++) {
      w5.println("\torange1");
    }
    w5.println("\t");
    w5.println("100\t");
    w5.close();

    PrintWriter w6 = new PrintWriter(new FileWriter(INPUT_FILE6));

    for (int i = 0; i < 300; i++) {
      for (int j = 0; j < 5; j++) {
        w6.println("" + i + "\t" + j);
      }
    }
    w6.close();

    PrintWriter w7 = new PrintWriter(new FileWriter(INPUT_FILE7));

    for (int i = 0; i < 300; i = i + 3) {
      for (int j = 0; j < 2; j++) {
        w7.println("" + i + "\t" + j);
      }
    }
    w7.close();

    Util.copyFromLocalToCluster(cluster, INPUT_FILE1, INPUT_FILE1);
    Util.copyFromLocalToCluster(cluster, INPUT_FILE2, INPUT_FILE2);
    Util.copyFromLocalToCluster(cluster, INPUT_FILE3, INPUT_FILE3);
    Util.copyFromLocalToCluster(cluster, INPUT_FILE4, INPUT_FILE4);
    Util.copyFromLocalToCluster(cluster, INPUT_FILE5, INPUT_FILE5);
    Util.copyFromLocalToCluster(cluster, INPUT_FILE6, INPUT_FILE6);
    Util.copyFromLocalToCluster(cluster, INPUT_FILE7, INPUT_FILE7);
  }

  @After
  public void tearDown() throws Exception {
    new File(INPUT_FILE1).delete();
    new File(INPUT_FILE2).delete();
    new File(INPUT_FILE3).delete();
    new File(INPUT_FILE4).delete();
    new File(INPUT_FILE5).delete();
    new File(INPUT_FILE6).delete();
    new File(INPUT_FILE7).delete();
    Util.deleteDirectory(new File("skewedjoin"));

    Util.deleteFile(cluster, INPUT_FILE1);
    Util.deleteFile(cluster, INPUT_FILE2);
    Util.deleteFile(cluster, INPUT_FILE3);
    Util.deleteFile(cluster, INPUT_FILE4);
    Util.deleteFile(cluster, INPUT_FILE5);
    Util.deleteFile(cluster, INPUT_FILE6);
    Util.deleteFile(cluster, INPUT_FILE7);
  }

  public void testSkewedJoinWithGroup() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = GROUP A by id;");
    pigServer.registerQuery("D = GROUP B by id;");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
        dbshj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("E = join C by group, D by group using \"skewed\" parallel 5;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("E = join C by group, D by group;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbshj.add(iter.next());
      }
    }
    Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
    Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj));
  }

  public void testSkewedJoinWithNoProperties() throws IOException {
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      DataBag dbshj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery(
            "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
      {
        pigServer.registerQuery("E = join A by(id, name), B by (id, name);");
        Iterator<Tuple> iter = pigServer.openIterator("E");

        while (iter.hasNext()) {
          dbshj.add(iter.next());
        }
      }
      Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
      Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj));

    } catch (Exception e) {
      fail(e.getMessage());
    }
  }

  public void testSkewedJoinReducers() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery("C = join A by id, B by id using \"skewed\" parallel 1;");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
    } catch (Exception e) {
      fail("Should not throw exception, should continue execution");
    }
  }

  public void testSkewedJoin3Way() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery("D = join A by id, B by id, C by id using \"skewed\" parallel 5;");
        Iterator<Tuple> iter = pigServer.openIterator("D");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
    } catch (Exception e) {
      return;
    }

    fail("Should throw exception, do not support 3 way join");
  }

  public void testSkewedJoinMapKey() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE4 + "' as (m:[]);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE4 + "' as (n:[]);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery(
            "C = join A by (chararray)m#'a100', B by (chararray)n#'a100' using \"skewed\" parallel 20;");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
    } catch (Exception e) {
      System.out.println(e.getMessage());
      e.printStackTrace();
      fail("Should support maps and expression operators as keys");
    }

    return;
  }

  public void testSkewedJoinKeyPartition() throws IOException {
    try {
      Util.deleteFile(cluster, "skewedjoin");
    } catch (Exception e) {
      // it is ok if the directory does not exist
    }

    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");

    pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 7;");
    pigServer.store("E", "skewedjoin");

    int[][] lineCount = new int[3][7];

    new File("skewedjoin").mkdir();
    // check how many times each key appears in each part file
    for (int i = 0; i < 7; i++) {
      Util.copyFromClusterToLocal(
          cluster, "skewedjoin/part-r-0000" + i, "skewedjoin/part-r-0000" + i);

      BufferedReader reader = new BufferedReader(new FileReader("skewedjoin/part-r-0000" + i));
      String line = null;
      while ((line = reader.readLine()) != null) {
        String[] cols = line.split("\t");
        int key = Integer.parseInt(cols[0]) / 100 - 1;
        lineCount[key][i]++;
      }
    }

    int fc = 0;
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 7; j++) {
        if (lineCount[i][j] > 0) {
          fc++;
        }
      }
    }
    // at least one key should be a skewed key;
    // check that at least one key appears in more than one part file
    assertTrue(fc > 3);
  }

  public void testSkewedJoinNullKeys() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery("C = join A by id, B by id using \"skewed\";");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
    } catch (Exception e) {
      System.out.println(e.getMessage());
      e.printStackTrace();
      fail("Should support null keys in skewed join");
    }
    return;
  }

  public void testSkewedJoinOuter() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery("C = join A by id left, B by id using \"skewed\";");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
      {
        pigServer.registerQuery("C = join A by id right, B by id using \"skewed\";");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
      {
        pigServer.registerQuery("C = join A by id full, B by id using \"skewed\";");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
    } catch (Exception e) {
      System.out.println(e.getMessage());
      e.printStackTrace();
      fail("Should support outer join in skewed join");
    }
    return;
  }

  // See PIG-1048
  public void testSkewedJoinOneValue() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE3 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE3 + "' as (id,name);");
    // Filter key with a single value

    pigServer.registerQuery("C = FILTER A by id == 400;");
    pigServer.registerQuery("D = FILTER B by id == 400;");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
        dbrj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("E = join C by id, D by id using \"skewed\";");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("E = join C by id, D by id;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbrj.add(iter.next());
      }
    }
    Assert.assertEquals(dbfrj.size(), dbrj.size());
    Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj));
  }

  public void testSkewedJoinManyReducers() throws IOException {
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "2");
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE6 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE7 + "' as (id,name);");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
        dbrj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 300;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("E = join A by id, B by id;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbrj.add(iter.next());
      }
    }
    Assert.assertEquals(dbfrj.size(), dbrj.size());
    Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj));
  }

  public void testSkewedJoinEmptyInput() throws IOException {
    String LEFT_INPUT_FILE = "left.dat";
    String RIGHT_INPUT_FILE = "right.dat";

    PrintWriter w = new PrintWriter(new FileWriter(LEFT_INPUT_FILE));
    w.println("1");
    w.println("2");
    w.println("3");
    w.println("5");
    w.close();

    Util.copyFromLocalToCluster(cluster, LEFT_INPUT_FILE, LEFT_INPUT_FILE);

    PrintWriter w2 = new PrintWriter(new FileWriter(RIGHT_INPUT_FILE));
    w2.println("1\tone");
    w2.println("2\ttwo");
    w2.println("3\tthree");

    w2.close();

    Util.copyFromLocalToCluster(cluster, RIGHT_INPUT_FILE, RIGHT_INPUT_FILE);

    pigServer.registerQuery("a = load 'left.dat' as (nums:chararray);");
    pigServer.registerQuery("b = load 'right.dat' as (number:chararray,text:chararray);");
    pigServer.registerQuery("c = filter a by nums == '7';");
    pigServer.registerQuery("d = join c by nums LEFT OUTER, b by number USING 'skewed';");

    Iterator<Tuple> iter = pigServer.openIterator("d");

    Assert.assertFalse(iter.hasNext());

    new File(LEFT_INPUT_FILE).delete();
    Util.deleteFile(cluster, LEFT_INPUT_FILE);
    new File(RIGHT_INPUT_FILE).delete();
    Util.deleteFile(cluster, RIGHT_INPUT_FILE);
  }

  public void testRecursiveFileListing() throws IOException {
    String LOCAL_INPUT_FILE = "test.dat";
    String INPUT_FILE = "foo/bar/test.dat";

    PrintWriter w = new PrintWriter(new FileWriter(LOCAL_INPUT_FILE));
    w.println("1");
    w.println("2");
    w.println("3");
    w.println("5");
    w.close();

    Util.copyFromLocalToCluster(cluster, LOCAL_INPUT_FILE, INPUT_FILE);

    pigServer.registerQuery("a = load 'foo' as (nums:chararray);");
    pigServer.registerQuery("b = load 'foo' as (nums:chararray);");
    pigServer.registerQuery("d = join a by nums, b by nums USING 'skewed';");

    Iterator<Tuple> iter = pigServer.openIterator("d");
    int count = 0;
    while (iter.hasNext()) {
      iter.next();
      count++;
    }
    Assert.assertEquals(4, count);

    new File(LOCAL_INPUT_FILE).delete();
    Util.deleteFile(cluster, INPUT_FILE);
  }
}
Example #12
 @Before
 public void setUp() throws Exception {
   pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
 }
Example #13
public class TestImplicitSplit extends TestCase {
  private PigServer pigServer;
  MiniCluster cluster = MiniCluster.buildCluster();

  @Before
  public void setUp() throws Exception {
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
  }

  @After
  public void tearDown() throws Exception {}

  @Test
  public void testImplicitSplit() throws Exception {
    int LOOP_SIZE = 20;
    String[] input = new String[LOOP_SIZE];
    for (int i = 1; i <= LOOP_SIZE; i++) {
      input[i - 1] = Integer.toString(i);
    }
    String inputFileName = "testImplicitSplit-input.txt";
    Util.createInputFile(cluster, inputFileName, input);
    pigServer.registerQuery("A = LOAD '" + inputFileName + "';");
    pigServer.registerQuery("B = filter A by $0<=10;");
    pigServer.registerQuery("C = filter A by $0>10;");
    pigServer.registerQuery("D = union B,C;");
    Iterator<Tuple> iter = pigServer.openIterator("D");
    if (!iter.hasNext()) fail("No Output received");
    int cnt = 0;
    while (iter.hasNext()) {
      iter.next();
      ++cnt;
    }
    assertEquals(20, cnt);
    Util.deleteFile(cluster, inputFileName);
  }

  @Test
  public void testImplicitSplitInCoGroup() throws Exception {
    // this query is similar to the one reported in JIRA - PIG-537
    // Create input file
    String input1 = "testImplicitSplitInCoGroup-input1.txt";
    String input2 = "testImplicitSplitInCoGroup-input2.txt";
    Util.createInputFile(cluster, input1, new String[] {"a:1", "b:2", "b:20", "c:3", "c:30"});
    Util.createInputFile(cluster, input2, new String[] {"a:first", "b:second", "c:third"});
    pigServer.registerQuery(
        "a = load '" + input1 + "' using PigStorage(':') as (name:chararray, marks:int);");
    pigServer.registerQuery(
        "b = load '" + input2 + "' using PigStorage(':') as (name:chararray, rank:chararray);");
    pigServer.registerQuery("c = cogroup a by name, b by name;");
    pigServer.registerQuery("d = foreach c generate group, FLATTEN(a.marks) as newmarks;");
    pigServer.registerQuery("e = cogroup a by marks, d by newmarks;");
    pigServer.registerQuery("f = foreach e generate group, flatten(a), flatten(d);");
    HashMap<Integer, Object[]> results = new HashMap<Integer, Object[]>();
    results.put(1, new Object[] {"a", 1, "a", 1});
    results.put(2, new Object[] {"b", 2, "b", 2});
    results.put(3, new Object[] {"c", 3, "c", 3});
    results.put(20, new Object[] {"b", 20, "b", 20});
    results.put(30, new Object[] {"c", 30, "c", 30});

    Iterator<Tuple> it = pigServer.openIterator("f");
    while (it.hasNext()) {
      Tuple t = it.next();
      System.err.println("Tuple:" + t);
      Integer group = (Integer) t.get(0);
      Object[] groupValues = results.get(group);
      for (int i = 0; i < 4; i++) {
        assertEquals(groupValues[i], t.get(i + 1));
      }
    }
    Util.deleteFile(cluster, input1);
    Util.deleteFile(cluster, input2);
  }

  @Test
  public void testImplicitSplitInCoGroup2() throws Exception {
    // this query is similar to the one reported in JIRA - PIG-537
    LogicalPlanTester planTester = new LogicalPlanTester();
    planTester.buildPlan("a = load 'file1' using PigStorage(':') as (name:chararray, marks:int);");
    planTester.buildPlan(
        "b = load 'file2' using PigStorage(':') as (name:chararray, rank:chararray);");
    planTester.buildPlan("c = cogroup a by name, b by name;");
    planTester.buildPlan("d = foreach c generate group, FLATTEN(a.marks) as newmarks;");
    planTester.buildPlan("e = cogroup a by marks, d by newmarks;");
    LogicalPlan plan =
        planTester.buildPlan("f = foreach e generate group, flatten(a), flatten(d);");

    // Set the logical plan values correctly in all the operators
    PlanSetter ps = new PlanSetter(plan);
    ps.visit();

    // run through validator
    CompilationMessageCollector collector = new CompilationMessageCollector();
    TypeCheckingValidator typeValidator = new TypeCheckingValidator();
    typeValidator.validate(plan, collector);
    printMessageCollector(collector);
    printTypeGraph(plan);

    if (collector.hasError()) {
      throw new Exception("Error during type checking");
    }

    // this will run ImplicitSplitInserter
    TestLogicalOptimizer.optimizePlan(plan);

    // get Schema of leaf and compare:
    Schema expectedSchema =
        Util.getSchemaFromString(
            "grp: int,A::username: chararray,A::marks: int,AB::group: chararray,AB::newmarks: int");
    assertTrue(Schema.equals(expectedSchema, plan.getLeaves().get(0).getSchema(), false, true));
  }
}
Example #14
@RunWith(JUnit4.class)
public class TestBZip {
  static MiniCluster cluster = MiniCluster.buildCluster();

  @AfterClass
  public static void oneTimeTearDown() throws Exception {
    cluster.shutDown();
  }

  /** Tests the end-to-end writing and reading of a BZip file. */
  @Test
  public void testBzipInPig() throws Exception {
    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    File in = File.createTempFile("junit", ".bz2");
    in.deleteOnExit();

    File out = File.createTempFile("junit", ".bz2");
    out.delete();
    String clusterOutput = Util.removeColon(out.getAbsolutePath());

    CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
    for (int i = 1; i < 100; i++) {
      StringBuffer sb = new StringBuffer();
      sb.append(i).append("\n").append(-i).append("\n");
      byte bytes[] = sb.toString().getBytes();
      cos.write(bytes);
    }
    cos.close();

    pig.registerQuery(
        "AA = load '"
            + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext())
            + "';");
    pig.registerQuery("A = foreach (group (filter AA by $0 > 0) all) generate flatten($1);");
    pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutput) + "';");
    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FSDataInputStream is = fs.open(new Path(clusterOutput + "/part-r-00000.bz2"));
    CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

    // Just a sanity check, to make sure it was a bzip file; we
    // will do the value verification later
    assertEquals(100, cis.read(new byte[100]));
    cis.close();

    pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutput) + "';");

    Iterator<Tuple> i = pig.openIterator("B");
    HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
    while (i.hasNext()) {
      Integer val = DataType.toInteger(i.next().get(0));
      map.put(val, val);
    }

    assertEquals(new Integer(99), new Integer(map.keySet().size()));

    for (int j = 1; j < 100; j++) {
      assertEquals(new Integer(j), map.get(j));
    }

    in.delete();
    Util.deleteFile(cluster, clusterOutput);
  }

  /**
   * Tests the end-to-end writing and reading of a BZip file using absolute path with a trailing /.
   */
  @Test
  public void testBzipInPig2() throws Exception {
    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    File in = File.createTempFile("junit", ".bz2");
    in.deleteOnExit();

    File out = File.createTempFile("junit", ".bz2");
    out.delete();
    String clusterOutput = Util.removeColon(out.getAbsolutePath());

    CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
    for (int i = 1; i < 100; i++) {
      StringBuffer sb = new StringBuffer();
      sb.append(i).append("\n").append(-i).append("\n");
      byte bytes[] = sb.toString().getBytes();
      cos.write(bytes);
    }
    cos.close();

    pig.registerQuery(
        "AA = load '" + Util.generateURI(in.getAbsolutePath(), pig.getPigContext()) + "';");
    pig.registerQuery("A = foreach (group (filter AA by $0 > 0) all) generate flatten($1);");
    pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutput) + "/';");
    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FSDataInputStream is = fs.open(new Path(clusterOutput + "/part-r-00000.bz2"));
    CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

    // Just a sanity check, to make sure it was a bzip file; we
    // will do the value verification later
    assertEquals(100, cis.read(new byte[100]));
    cis.close();

    pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutput) + "';");

    Iterator<Tuple> i = pig.openIterator("B");
    HashMap<Integer, Integer> map = new HashMap<Integer, Integer>();
    while (i.hasNext()) {
      Integer val = DataType.toInteger(i.next().get(0));
      map.put(val, val);
    }

    assertEquals(new Integer(99), new Integer(map.keySet().size()));

    for (int j = 1; j < 100; j++) {
      assertEquals(new Integer(j), map.get(j));
    }

    in.delete();
    out.delete();
  }

  // See PIG-2391
  @Test
  public void testBz2() throws Exception {
    String[] inputData =
        new String[] {
          "1\t2\r3\t4", // '\r' case - this will be split into two tuples
          "5\t6\r", // '\r\n' case
          "7\t8", // '\n' case
          "9\t10\r" // '\r\n' at the end of file
        };

    // bzip compressed input
    File in = File.createTempFile("junit", ".bz2");
    String compressedInputFileName = in.getAbsolutePath();
    String clusterCompressedFilePath = Util.removeColon(compressedInputFileName);
    in.deleteOnExit();

    try {
      CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
      for (int i = 0; i < inputData.length; i++) {
        StringBuffer sb = new StringBuffer();
        sb.append(inputData[i]).append("\n");
        byte bytes[] = sb.toString().getBytes();
        cos.write(bytes);
      }
      cos.close();

      Util.copyFromLocalToCluster(cluster, compressedInputFileName, clusterCompressedFilePath);

      PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

      // pig script to read compressed input
      String script = "a = load '" + Util.encodeEscape(clusterCompressedFilePath) + "';";
      pig.registerQuery(script);

      pig.registerQuery("store a into 'intermediate.bz';");
      pig.registerQuery("b = load 'intermediate.bz';");
      Iterator<Tuple> it2 = pig.openIterator("b");
      while (it2.hasNext()) {
        it2.next();
      }
    } finally {
      in.delete();
      Util.deleteFile(cluster, "intermediate.bz");
      Util.deleteFile(cluster, "final.bz");
    }
  }

  /**
   * Tests that '\n', '\r' and '\r\n' are treated as record delims when using bzip just like they
   * are when using uncompressed text.
   */
  @Test
  public void testRecordDelims() throws Exception {
    String[] inputData =
        new String[] {
          "1\t2\r3\t4", // '\r' case - this will be split into two tuples
          "5\t6\r", // '\r\n' case
          "7\t8", // '\n' case
          "9\t10\r" // '\r\n' at the end of file
        };

    // bzip compressed input
    File in = File.createTempFile("junit", ".bz2");
    String compressedInputFileName = in.getAbsolutePath();
    in.deleteOnExit();
    String clusterCompressedFilePath = Util.removeColon(compressedInputFileName);

    String unCompressedInputFileName = "testRecordDelims-uncomp.txt";
    Util.createInputFile(cluster, unCompressedInputFileName, inputData);

    try {
      CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
      for (int i = 0; i < inputData.length; i++) {
        StringBuffer sb = new StringBuffer();
        sb.append(inputData[i]).append("\n");
        byte bytes[] = sb.toString().getBytes();
        cos.write(bytes);
      }
      cos.close();

      Util.copyFromLocalToCluster(cluster, compressedInputFileName, clusterCompressedFilePath);

      // pig script to read uncompressed input
      String script = "a = load '" + unCompressedInputFileName + "';";
      PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
      pig.registerQuery(script);
      Iterator<Tuple> it1 = pig.openIterator("a");

      // pig script to read compressed input
      script = "a = load '" + Util.encodeEscape(clusterCompressedFilePath) + "';";
      pig.registerQuery(script);
      Iterator<Tuple> it2 = pig.openIterator("a");

      while (it1.hasNext()) {
        Tuple t1 = it1.next();
        Tuple t2 = it2.next();
        Assert.assertEquals(t1, t2);
      }

      Assert.assertFalse(it2.hasNext());

    } finally {
      in.delete();
      Util.deleteFile(cluster, unCompressedInputFileName);
      Util.deleteFile(cluster, clusterCompressedFilePath);
    }
  }

  /** Tests the end-to-end writing and reading of an empty BZip file. */
  @Test
  public void testEmptyBzipInPig() throws Exception {
    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    File in = File.createTempFile("junit", ".tmp");
    in.deleteOnExit();

    File out = File.createTempFile("junit", ".bz2");
    out.delete();
    String clusterOutputFilePath = Util.removeColon(out.getAbsolutePath());

    FileOutputStream fos = new FileOutputStream(in);
    fos.write("55\n".getBytes());
    fos.close();
    System.out.println(in.getAbsolutePath());

    pig.registerQuery(
        "AA = load '"
            + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext())
            + "';");
    pig.registerQuery("A=foreach (group (filter AA by $0 < '0') all) generate flatten($1);");
    pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutputFilePath) + "';");
    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FSDataInputStream is = fs.open(new Path(clusterOutputFilePath + "/part-r-00000.bz2"));
    CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

    // Just a sanity check, to make sure it was a bzip file; we
    // will do the value verification later
    assertEquals(-1, cis.read(new byte[100]));
    cis.close();

    pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutputFilePath) + "';");
    pig.openIterator("B");

    in.delete();
    Util.deleteFile(cluster, clusterOutputFilePath);
  }

  /** Tests the writing and reading of an empty BZip file. */
  @Test
  public void testEmptyBzip() throws Exception {
    File tmp = File.createTempFile("junit", ".tmp");
    tmp.deleteOnExit();
    CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(tmp));
    cos.close();
    assertNotSame(0, tmp.length());
    FileSystem fs = FileSystem.getLocal(new Configuration(false));
    CBZip2InputStream cis =
        new CBZip2InputStream(fs.open(new Path(tmp.getAbsolutePath())), -1, tmp.length());
    assertEquals(-1, cis.read(new byte[100]));
    cis.close();
    tmp.delete();
  }

  /**
   * Tests the case where a bzip block ends exactly at the end of the {@link InputSplit}, with the
   * block header ending a few bits into the last byte of the current InputSplit. This case resulted
   * in dropped records in the Pig 0.6 release. This test also verifies that bzip files a couple of
   * dirs deep can be read by specifying the top-level dir.
   */
  @Test
  public void testBlockHeaderEndingAtSplitNotByteAligned() throws IOException {
    // the actual input file is at
    // test/org/apache/pig/test/data/bzipdir1.bz2/bzipdir2.bz2/recordLossblockHeaderEndsAt136500.txt.bz2
    // In this test we will load test/org/apache/pig/test/data/bzipdir1.bz2 to also
    // test that the BZip2TextInputFormat can read subdirs recursively
    String inputFileName = "test/org/apache/pig/test/data/bzipdir1.bz2";
    Long expectedCount = 74999L; // number of lines in above file
    // the first block in the above file exactly ends a few bits into the
    // byte at position 136500
    int splitSize = 136500;
    try {
      Util.copyFromLocalToCluster(cluster, inputFileName, inputFileName);
      testCount(inputFileName, expectedCount, splitSize, "PigStorage()");
      testCount(inputFileName, expectedCount, splitSize, "TextLoader()");
    } finally {
      Util.deleteFile(cluster, inputFileName);
    }
  }

  /**
   * Tests the case where a bzip block ends exactly at the end of the input split (byte aligned with
   * the last byte) and the last byte is a carriage return.
   */
  @Test
  public void testBlockHeaderEndingWithCR() throws IOException {
    String inputFileName = "test/org/apache/pig/test/data/blockEndingInCR.txt.bz2";
    // number of lines in the above file (the value is 1 more than bzcat | wc -l
    // since there is a '\r' which is also treated as a record delim)
    Long expectedCount = 82094L;
    // the first block in the above file exactly ends at the byte at
    // position 136498 and the last byte is a carriage return ('\r')
    try {
      int splitSize = 136498;
      Util.copyFromLocalToCluster(cluster, inputFileName, inputFileName);
      testCount(inputFileName, expectedCount, splitSize, "PigStorage()");
    } finally {
      Util.deleteFile(cluster, inputFileName);
    }
  }

  /**
   * Tests the case where a bzip block ends exactly at the end of the input split and has more data
   * which results in overcounting (record duplication) in Pig 0.6
   */
  @Test
  public void testBlockHeaderEndingAtSplitOverCounting() throws IOException {

    String inputFileName = "test/org/apache/pig/test/data/blockHeaderEndsAt136500.txt.bz2";
    Long expectedCount = 1041046L; // number of lines in above file
    // the first block in the above file exactly ends a few bits into the
    // byte at position 136500
    int splitSize = 136500;
    try {
      Util.copyFromLocalToCluster(cluster, inputFileName, inputFileName);
      testCount(inputFileName, expectedCount, splitSize, "PigStorage()");
    } finally {
      Util.deleteFile(cluster, inputFileName);
    }
  }

  private void testCount(
      String inputFileName, Long expectedCount, int splitSize, String loadFuncSpec)
      throws IOException {
    String outputFile = "/tmp/bz-output";
    // simple load-store script to verify that the bzip input is getting
    // split
    String scriptToTestSplitting =
        "a = load '"
            + inputFileName
            + "' using "
            + loadFuncSpec
            + "; store a into '"
            + outputFile
            + "';";

    String script =
        "a = load '"
            + inputFileName
            + "';"
            + "b = group a all;"
            + "c = foreach b generate COUNT_STAR(a);";
    Properties props = new Properties();
    for (Entry<Object, Object> entry : cluster.getProperties().entrySet()) {
      props.put(entry.getKey(), entry.getValue());
    }
    props.setProperty("mapred.max.split.size", Integer.toString(splitSize));
    PigContext pigContext = new PigContext(ExecType.MAPREDUCE, props);
    PigServer pig = new PigServer(pigContext);
    FileSystem fs = FileSystem.get(ConfigurationUtil.toConfiguration(props));
    fs.delete(new Path(outputFile), true);
    Util.registerMultiLineQuery(pig, scriptToTestSplitting);

    // verify that maps were launched due to splitting of the bzip input
    // (each map produces a part file)
    FileStatus[] files = fs.listStatus(new Path(outputFile));
    int numPartFiles = 0;
    for (FileStatus fileStatus : files) {
      if (fileStatus.getPath().getName().startsWith("part")) {
        numPartFiles++;
      }
    }
    assertEquals(true, numPartFiles > 0);

    // verify record count to verify we read bzip data correctly
    Util.registerMultiLineQuery(pig, script);
    Iterator<Tuple> it = pig.openIterator("c");
    Long result = (Long) it.next().get(0);
    assertEquals(expectedCount, result);
  }

  @Test
  public void testBzipStoreInMultiQuery() throws Exception {
    String[] inputData = new String[] {"1\t2\r3\t4"};

    String inputFileName = "input.txt";
    Util.createInputFile(cluster, inputFileName, inputData);

    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    pig.setBatchOn();
    pig.registerQuery("a = load '" + inputFileName + "';");
    pig.registerQuery("store a into 'output.bz2';");
    pig.registerQuery("store a into 'output';");
    pig.executeBatch();

    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FileStatus stat = fs.getFileStatus(new Path("output/part-m-00000"));
    assertTrue(stat.getLen() > 0);

    stat = fs.getFileStatus(new Path("output.bz2/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);
  }

  @Test
  public void testBzipStoreInMultiQuery2() throws Exception {
    String[] inputData = new String[] {"1\t2\r3\t4"};

    String inputFileName = "input2.txt";
    Util.createInputFile(cluster, inputFileName, inputData);

    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    PigContext pigContext = pig.getPigContext();
    pigContext.getProperties().setProperty("output.compression.enabled", "true");
    pigContext
        .getProperties()
        .setProperty("output.compression.codec", "org.apache.hadoop.io.compress.BZip2Codec");

    pig.setBatchOn();
    pig.registerQuery("a = load '" + inputFileName + "';");
    pig.registerQuery("store a into 'output2.bz2';");
    pig.registerQuery("store a into 'output2';");
    pig.executeBatch();

    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FileStatus stat = fs.getFileStatus(new Path("output2/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);

    stat = fs.getFileStatus(new Path("output2.bz2/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);
  }

  /**
   * Tests that Pig throws an Exception when the input files to be loaded are actually a result of
   * concatenating 2 or more bz2 files. Pig should not silently ignore part of the input data.
   */
  @Test(expected = IOException.class)
  public void testBZ2Concatenation() throws Exception {
    String[] inputData1 = new String[] {"1\ta", "2\taa"};
    String[] inputData2 = new String[] {"1\tb", "2\tbb"};
    String[] inputDataMerged = new String[] {"1\ta", "2\taa", "1\tb", "2\tbb"};

    // bzip compressed input file1
    File in1 = File.createTempFile("junit", ".bz2");
    String compressedInputFileName1 = in1.getAbsolutePath();
    in1.deleteOnExit();

    // file2
    File in2 = File.createTempFile("junit", ".bz2");
    String compressedInputFileName2 = in2.getAbsolutePath();
    in2.deleteOnExit();

    String unCompressedInputFileName = "testRecordDelims-uncomp.txt";
    Util.createInputFile(cluster, unCompressedInputFileName, inputDataMerged);

    try {
      CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in1));
      for (int i = 0; i < inputData1.length; i++) {
        StringBuffer sb = new StringBuffer();
        sb.append(inputData1[i]).append("\n");
        byte bytes[] = sb.toString().getBytes();
        cos.write(bytes);
      }
      cos.close();

      CBZip2OutputStream cos2 = new CBZip2OutputStream(new FileOutputStream(in2));
      for (int i = 0; i < inputData2.length; i++) {
        StringBuffer sb = new StringBuffer();
        sb.append(inputData2[i]).append("\n");
        byte bytes[] = sb.toString().getBytes();
        cos2.write(bytes);
      }
      cos2.close();

      // concatenate file2 onto file1
      catInto(compressedInputFileName2, compressedInputFileName1);
      Util.copyFromLocalToCluster(cluster, compressedInputFileName1, compressedInputFileName1);

      // pig script to read uncompressed input
      String script = "a = load '" + Util.encodeEscape(unCompressedInputFileName) + "';";
      PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
      pig.registerQuery(script);
      Iterator<Tuple> it1 = pig.openIterator("a");

      // pig script to read compressed concatenated input
      script = "a = load '" + Util.encodeEscape(compressedInputFileName1) + "';";
      pig.registerQuery(script);
      Iterator<Tuple> it2 = pig.openIterator("a");

      while (it1.hasNext()) {
        Tuple t1 = it1.next();
        Tuple t2 = it2.next();
        Assert.assertEquals(t1, t2);
      }

      Assert.assertFalse(it2.hasNext());

    } finally {
      in1.delete();
      in2.delete();
      Util.deleteFile(cluster, unCompressedInputFileName);
    }
  }

  /*
   * Concatenate the contents of src file to the contents of dest file
   */
  private void catInto(String src, String dest) throws IOException {
    BufferedWriter out = new BufferedWriter(new FileWriter(dest, true));
    BufferedReader in = new BufferedReader(new FileReader(src));
    String str;
    while ((str = in.readLine()) != null) {
      out.write(str);
    }
    in.close();
    out.close();
  }

  // See PIG-1714
  @Test
  public void testBzipStoreInMultiQuery3() throws Exception {
    String[] inputData = new String[] {"1\t2\r3\t4"};

    String inputFileName = "input3.txt";
    Util.createInputFile(cluster, inputFileName, inputData);

    String inputScript =
        "set mapred.output.compress true\n"
            + "set mapreduce.output.fileoutputformat.compress true\n"
            + "set mapred.output.compression.codec org.apache.hadoop.io.compress.BZip2Codec\n"
            + "set mapreduce.output.fileoutputformat.compress.codec org.apache.hadoop.io.compress.BZip2Codec\n"
            + "a = load '"
            + inputFileName
            + "';\n"
            + "store a into 'output3.bz2';\n"
            + "store a into 'output3';";

    String inputScriptName = "script3.txt";
    PrintWriter pw = new PrintWriter(new FileWriter(inputScriptName));
    pw.println(inputScript);
    pw.close();

    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    FileInputStream fis = new FileInputStream(inputScriptName);
    pig.registerScript(fis);

    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FileStatus stat = fs.getFileStatus(new Path("output3/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);

    stat = fs.getFileStatus(new Path("output3.bz2/part-m-00000.bz2"));
    assertTrue(stat.getLen() > 0);
  }
}
Example #15
  /**
   * Tests that Pig throws an Exception when the input files to be loaded are actually a result of
   * concatenating 2 or more bz2 files. Pig should not silently ignore part of the input data.
   */
  @Test(expected = IOException.class)
  public void testBZ2Concatenation() throws Exception {
    String[] inputData1 = new String[] {"1\ta", "2\taa"};
    String[] inputData2 = new String[] {"1\tb", "2\tbb"};
    String[] inputDataMerged = new String[] {"1\ta", "2\taa", "1\tb", "2\tbb"};

    // bzip compressed input file1
    File in1 = File.createTempFile("junit", ".bz2");
    String compressedInputFileName1 = in1.getAbsolutePath();
    in1.deleteOnExit();

    // file2
    File in2 = File.createTempFile("junit", ".bz2");
    String compressedInputFileName2 = in2.getAbsolutePath();
    in2.deleteOnExit();

    String unCompressedInputFileName = "testRecordDelims-uncomp.txt";
    Util.createInputFile(cluster, unCompressedInputFileName, inputDataMerged);

    try {
      CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in1));
      for (int i = 0; i < inputData1.length; i++) {
        StringBuffer sb = new StringBuffer();
        sb.append(inputData1[i]).append("\n");
        byte bytes[] = sb.toString().getBytes();
        cos.write(bytes);
      }
      cos.close();

      CBZip2OutputStream cos2 = new CBZip2OutputStream(new FileOutputStream(in2));
      for (int i = 0; i < inputData2.length; i++) {
        StringBuffer sb = new StringBuffer();
        sb.append(inputData2[i]).append("\n");
        byte bytes[] = sb.toString().getBytes();
        cos2.write(bytes);
      }
      cos2.close();

      // concatenate file2 onto file1
      catInto(compressedInputFileName2, compressedInputFileName1);
      Util.copyFromLocalToCluster(cluster, compressedInputFileName1, compressedInputFileName1);

      // pig script to read uncompressed input
      String script = "a = load '" + Util.encodeEscape(unCompressedInputFileName) + "';";
      PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
      pig.registerQuery(script);
      Iterator<Tuple> it1 = pig.openIterator("a");

      // pig script to read compressed concatenated input
      script = "a = load '" + Util.encodeEscape(compressedInputFileName1) + "';";
      pig.registerQuery(script);
      Iterator<Tuple> it2 = pig.openIterator("a");

      while (it1.hasNext()) {
        Tuple t1 = it1.next();
        Tuple t2 = it2.next();
        Assert.assertEquals(t1, t2);
      }

      Assert.assertFalse(it2.hasNext());

    } finally {
      in1.delete();
      in2.delete();
      Util.deleteFile(cluster, unCompressedInputFileName);
    }
  }
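This snippet depends on the catInto helper defined alongside it in Example #14; it is reproduced here so the example is self-contained:

  /*
   * Concatenate the contents of src file to the contents of dest file
   */
  private void catInto(String src, String dest) throws IOException {
    BufferedWriter out = new BufferedWriter(new FileWriter(dest, true));
    BufferedReader in = new BufferedReader(new FileReader(src));
    String str;
    while ((str = in.readLine()) != null) {
      out.write(str);
    }
    in.close();
    out.close();
  }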
Example #16
 @AfterClass
 public static void tearDownAfterClass() throws Exception {
   cluster.shutDown();
 }
Example #17
 @AfterClass
 public static void oneTimeTearDown() throws Exception {
   cluster.shutDown();
 }