예제 #1
0
  /**
   * Runs the same two-column join once with {@code using "skewed"} and once with the default join,
   * on a {@link PigServer} that is created here without any {@code pig.skewedjoin.*} overrides, and
   * checks that both produce the same non-empty result.
   *
   * <p>Exceptions are deliberately not caught: letting them propagate keeps the original exception
   * type and stack trace in the test report, whereas {@code fail(e.getMessage())} reduced them to a
   * (possibly null) message.
   *
   * @throws IOException if registering or executing any of the Pig queries fails
   */
  public void testSkewedJoinWithNoProperties() throws IOException {
    // Replace the shared server with one that only carries the cluster properties.
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");

    // Result of the skewed join.
    DataBag skewedRows = BagFactory.getInstance().newDefaultBag();
    pigServer.registerQuery(
        "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;");
    Iterator<Tuple> skewedIter = pigServer.openIterator("C");
    while (skewedIter.hasNext()) {
      skewedRows.add(skewedIter.next());
    }

    // Result of the default join, used as the reference.
    DataBag defaultRows = BagFactory.getInstance().newDefaultBag();
    pigServer.registerQuery("E = join A by(id, name), B by (id, name);");
    Iterator<Tuple> defaultIter = pigServer.openIterator("E");
    while (defaultIter.hasNext()) {
      defaultRows.add(defaultIter.next());
    }

    Assert.assertTrue(skewedRows.size() > 0 && defaultRows.size() > 0);
    Assert.assertEquals(true, TestHelper.compareBags(skewedRows, defaultRows));
  }
예제 #2
0
 public TestSkewedJoin() throws ExecException, IOException {
   pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
   // pigServer = new PigServer(ExecType.LOCAL);
   pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "5");
   pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.memusage", "0.01");
 }
예제 #3
0
/**
 * Integration tests for Pig joins declared with {@code using "skewed"}, executed in MapReduce mode
 * against a {@link MiniCluster}.
 *
 * <p>Input fixtures are written to the local working directory and copied to the cluster in {@link
 * #setUp()}, and removed again in {@link #tearDown()}. Most tests run the same join twice, once
 * skewed and once with the default join, and compare the two results.
 *
 * <p>Unexpected exceptions are left to propagate out of the test methods so that the test report
 * contains the original exception and stack trace.
 *
 * <p>NOTE(review): the class extends {@code TestCase}, so {@code setUp}/{@code tearDown} are
 * presumably invoked by name and the {@code @Before}/{@code @After} annotations are redundant —
 * confirm against the runner in use.
 */
public class TestSkewedJoin extends TestCase {
  private static final String INPUT_FILE1 = "SkewedJoinInput1.txt";
  private static final String INPUT_FILE2 = "SkewedJoinInput2.txt";
  private static final String INPUT_FILE3 = "SkewedJoinInput3.txt";
  private static final String INPUT_FILE4 = "SkewedJoinInput4.txt";
  private static final String INPUT_FILE5 = "SkewedJoinInput5.txt";
  private static final String INPUT_FILE6 = "SkewedJoinInput6.txt";
  private static final String INPUT_FILE7 = "SkewedJoinInput7.txt";

  /** Every fixture created by {@link #createFiles()}, in creation order. */
  private static final String[] INPUT_FILES = {
    INPUT_FILE1, INPUT_FILE2, INPUT_FILE3, INPUT_FILE4, INPUT_FILE5, INPUT_FILE6, INPUT_FILE7
  };

  private PigServer pigServer;
  private final MiniCluster cluster = MiniCluster.buildCluster();

  /**
   * Creates the MapReduce-mode {@link PigServer} shared by the tests and sets the two {@code
   * pig.skewedjoin.reduce.*} properties on its Pig context.
   *
   * @throws ExecException if the Pig server cannot be created
   * @throws IOException if the Pig server cannot be created
   */
  public TestSkewedJoin() throws ExecException, IOException {
    // To run without the mini cluster, construct the server with ExecType.LOCAL instead.
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "5");
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.memusage", "0.01");
  }

  /** Creates all input fixtures before each test. */
  @Before
  public void setUp() throws Exception {
    createFiles();
  }

  /**
   * Writes the seven input fixtures locally and copies each of them to the cluster under the same
   * name.
   *
   * @throws IOException if a fixture cannot be written or copied
   */
  private void createFiles() throws IOException {
    // File 1: 120 repetitions of three keys (100, 200, 300), each row with a unique third column.
    String[] repeatedKeys = new String[3 * 120];
    for (int k = 0; k < repeatedKeys.length; k += 3) {
      repeatedKeys[k] = "100\tapple1\taaa" + k;
      repeatedKeys[k + 1] = "200\torange1\tbbb" + (k + 1);
      repeatedKeys[k + 2] = "300\tstrawberry\tccc" + (k + 2);
    }
    writeLines(INPUT_FILE1, repeatedKeys);

    writeLines(
        INPUT_FILE2,
        "100\tapple1",
        "100\tapple2",
        "100\tapple2",
        "200\torange1",
        "200\torange2",
        "300\tstrawberry",
        "400\tpear");

    writeLines(
        INPUT_FILE3,
        "100\tapple1",
        "100\tapple2",
        "200\torange1",
        "200\torange2",
        "300\tstrawberry",
        "300\tstrawberry2",
        "400\tpear");

    // File 4: 100 identical map-literal rows.
    String[] mapRows = new String[100];
    for (int i = 0; i < mapRows.length; i++) {
      mapRows[i] =
          "[a100#apple1,a100#apple2,a200#orange1,a200#orange2,a300#strawberry,a300#strawberry2,a400#pear]";
    }
    writeLines(INPUT_FILE4, mapRows);

    // File 5: rows with null (empty) keys mixed with a few rows for key 100.
    String[] nullKeyRows = new String[10 + 1 + 10 + 2];
    int next = 0;
    for (int i = 0; i < 10; i++) {
      nullKeyRows[next++] = "\tapple1";
    }
    nullKeyRows[next++] = "100\tapple2";
    for (int i = 0; i < 10; i++) {
      nullKeyRows[next++] = "\torange1";
    }
    nullKeyRows[next++] = "\t";
    nullKeyRows[next++] = "100\t";
    writeLines(INPUT_FILE5, nullKeyRows);

    // File 6: keys 0..299, five rows per key.
    String[] denseRows = new String[300 * 5];
    next = 0;
    for (int i = 0; i < 300; i++) {
      for (int j = 0; j < 5; j++) {
        denseRows[next++] = "" + i + "\t" + j;
      }
    }
    writeLines(INPUT_FILE6, denseRows);

    // File 7: every third key in 0..299, two rows per key.
    String[] sparseRows = new String[100 * 2];
    next = 0;
    for (int i = 0; i < 300; i = i + 3) {
      for (int j = 0; j < 2; j++) {
        sparseRows[next++] = "" + i + "\t" + j;
      }
    }
    writeLines(INPUT_FILE7, sparseRows);

    for (String inputFile : INPUT_FILES) {
      Util.copyFromLocalToCluster(cluster, inputFile, inputFile);
    }
  }

  /**
   * Writes the given lines to a local file, one per line, and always closes the writer.
   *
   * @param fileName path of the local file to (over)write
   * @param lines the lines to write, in order
   * @throws IOException if the file cannot be opened for writing
   */
  private static void writeLines(String fileName, String... lines) throws IOException {
    PrintWriter writer = new PrintWriter(new FileWriter(fileName));
    try {
      for (String line : lines) {
        writer.println(line);
      }
    } finally {
      writer.close();
    }
  }

  /**
   * Drains the result of the given alias into a new bag.
   *
   * @param alias the Pig alias to execute and read
   * @return a bag holding every tuple produced for {@code alias}
   * @throws IOException if the alias cannot be executed or read
   */
  private DataBag collect(String alias) throws IOException {
    DataBag rows = BagFactory.getInstance().newDefaultBag();
    Iterator<Tuple> iter = pigServer.openIterator(alias);
    while (iter.hasNext()) {
      rows.add(iter.next());
    }
    return rows;
  }

  /** Removes the local fixtures, the local {@code skewedjoin} directory and the cluster copies. */
  @After
  public void tearDown() throws Exception {
    for (String inputFile : INPUT_FILES) {
      new File(inputFile).delete();
    }
    Util.deleteDirectory(new File("skewedjoin"));

    for (String inputFile : INPUT_FILES) {
      Util.deleteFile(cluster, inputFile);
    }
  }

  /** A skewed join over two grouped relations must match the default join. */
  public void testSkewedJoinWithGroup() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = GROUP A by id;");
    pigServer.registerQuery("D = GROUP B by id;");

    pigServer.registerQuery("E = join C by group, D by group using \"skewed\" parallel 5;");
    DataBag skewedResult = collect("E");

    pigServer.registerQuery("E = join C by group, D by group;");
    DataBag defaultResult = collect("E");

    Assert.assertTrue(skewedResult.size() > 0 && defaultResult.size() > 0);
    Assert.assertTrue(TestHelper.compareBags(skewedResult, defaultResult));
  }

  /**
   * A skewed join on a server created without the {@code pig.skewedjoin.*} overrides must match the
   * default join.
   */
  public void testSkewedJoinWithNoProperties() throws IOException {
    // Replace the shared server with one that only carries the cluster properties.
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");

    pigServer.registerQuery(
        "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;");
    DataBag skewedResult = collect("C");

    pigServer.registerQuery("E = join A by(id, name), B by (id, name);");
    DataBag defaultResult = collect("E");

    Assert.assertTrue(skewedResult.size() > 0 && defaultResult.size() > 0);
    Assert.assertTrue(TestHelper.compareBags(skewedResult, defaultResult));
  }

  /** A skewed join with {@code parallel 1} should not throw; execution should continue. */
  public void testSkewedJoinReducers() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");

    pigServer.registerQuery("C = join A by id, B by id using \"skewed\" parallel 1;");
    collect("C");
  }

  /** A three-way skewed join is not supported and is expected to raise an exception. */
  public void testSkewedJoin3Way() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);");
    try {
      pigServer.registerQuery("D = join A by id, B by id, C by id using \"skewed\" parallel 5;");
      collect("D");
    } catch (Exception expected) {
      // Any exception counts as the expected rejection of the 3-way join.
      return;
    }

    fail("Should throw exception, do not support 3 way join");
  }

  /** Map lookups and cast expressions should be usable as skewed-join keys. */
  public void testSkewedJoinMapKey() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE4 + "' as (m:[]);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE4 + "' as (n:[]);");

    pigServer.registerQuery(
        "C = join A by (chararray)m#'a100', B by (chararray)n#'a100' using \"skewed\" parallel 20;");
    collect("C");
  }

  /**
   * Stores a skewed join with seven reducers and checks, from the part files, that the three keys
   * together show up in more than three (key, part file) combinations.
   */
  public void testSkewedJoinKeyPartition() throws IOException {
    try {
      Util.deleteFile(cluster, "skewedjoin");
    } catch (Exception ignored) {
      // it is ok if directory not exist
    }

    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");

    pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 7;");
    pigServer.store("E", "skewedjoin");

    // lineCount[key][part]: rows of key (100, 200, 300 -> 0, 1, 2) found in each part file.
    int[][] lineCount = new int[3][7];

    new File("skewedjoin").mkdir();
    for (int i = 0; i < 7; i++) {
      String partFile = "skewedjoin/part-r-0000" + i;
      Util.copyFromClusterToLocal(cluster, partFile, partFile);

      BufferedReader reader = new BufferedReader(new FileReader(partFile));
      try {
        String line;
        while ((line = reader.readLine()) != null) {
          String[] cols = line.split("\t");
          int key = Integer.parseInt(cols[0]) / 100 - 1;
          lineCount[key][i]++;
        }
      } finally {
        // The reader was previously left open for every part file.
        reader.close();
      }
    }

    int fc = 0;
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 7; j++) {
        if (lineCount[i][j] > 0) {
          fc++;
        }
      }
    }
    // At least one key should be a skewed key, i.e. appear in more than one part file.
    assertTrue(fc > 3);
  }

  /** Null keys should be supported in a skewed join. */
  public void testSkewedJoinNullKeys() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);");

    pigServer.registerQuery("C = join A by id, B by id using \"skewed\";");
    collect("C");
  }

  /** Left, right and full outer joins should be supported in a skewed join. */
  public void testSkewedJoinOuter() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);");

    pigServer.registerQuery("C = join A by id left, B by id using \"skewed\";");
    collect("C");

    pigServer.registerQuery("C = join A by id right, B by id using \"skewed\";");
    collect("C");

    pigServer.registerQuery("C = join A by id full, B by id using \"skewed\";");
    collect("C");
  }

  /** PIG-1048: a skewed join on inputs filtered down to a single key must match the default join. */
  public void testSkewedJoinOneValue() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE3 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE3 + "' as (id,name);");

    // Keep only the key that has a single value.
    pigServer.registerQuery("C = FILTER A by id == 400;");
    pigServer.registerQuery("D = FILTER B by id == 400;");

    pigServer.registerQuery("E = join C by id, D by id using \"skewed\";");
    DataBag skewedResult = collect("E");

    pigServer.registerQuery("E = join C by id, D by id;");
    DataBag defaultResult = collect("E");

    Assert.assertEquals(skewedResult.size(), defaultResult.size());
    Assert.assertTrue(TestHelper.compareBags(skewedResult, defaultResult));
  }

  /** A skewed join requesting 300 reducers must match the default join. */
  public void testSkewedJoinManyReducers() throws IOException {
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "2");
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE6 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE7 + "' as (id,name);");

    pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 300;");
    DataBag skewedResult = collect("E");

    pigServer.registerQuery("E = join A by id, B by id;");
    DataBag defaultResult = collect("E");

    Assert.assertEquals(skewedResult.size(), defaultResult.size());
    Assert.assertTrue(TestHelper.compareBags(skewedResult, defaultResult));
  }

  /**
   * A left outer skewed join whose left input is filtered down to nothing must produce no rows.
   * The temporary inputs are removed even when the query or the assertion fails.
   */
  public void testSkewedJoinEmptyInput() throws IOException {
    String leftInputFile = "left.dat";
    String rightInputFile = "right.dat";

    writeLines(leftInputFile, "1", "2", "3", "5");
    Util.copyFromLocalToCluster(cluster, leftInputFile, leftInputFile);

    writeLines(rightInputFile, "1\tone", "2\ttwo", "3\tthree");
    Util.copyFromLocalToCluster(cluster, rightInputFile, rightInputFile);

    try {
      pigServer.registerQuery("a = load '" + leftInputFile + "' as (nums:chararray);");
      pigServer.registerQuery(
          "b = load '" + rightInputFile + "' as (number:chararray,text:chararray);");
      // No row has nums == '7', so the left side of the join is empty.
      pigServer.registerQuery("c = filter a by nums == '7';");
      pigServer.registerQuery("d = join c by nums LEFT OUTER, b by number USING 'skewed';");

      Iterator<Tuple> iter = pigServer.openIterator("d");

      Assert.assertFalse(iter.hasNext());
    } finally {
      new File(leftInputFile).delete();
      Util.deleteFile(cluster, leftInputFile);
      new File(rightInputFile).delete();
      Util.deleteFile(cluster, rightInputFile);
    }
  }

  /**
   * Loading the directory {@code foo} must pick up the input stored at {@code foo/bar/test.dat}:
   * the self-join of its four distinct rows is expected to yield four tuples. The temporary inputs
   * are removed even when the query or the assertion fails.
   */
  public void testRecursiveFileListing() throws IOException {
    String localInputFile = "test.dat";
    String clusterInputFile = "foo/bar/test.dat";

    writeLines(localInputFile, "1", "2", "3", "5");
    try {
      Util.copyFromLocalToCluster(cluster, localInputFile, clusterInputFile);
      try {
        pigServer.registerQuery("a = load 'foo' as (nums:chararray);");
        pigServer.registerQuery("b = load 'foo' as (nums:chararray);");
        pigServer.registerQuery("d = join a by nums, b by nums USING 'skewed';");

        Iterator<Tuple> iter = pigServer.openIterator("d");
        int count = 0;
        while (iter.hasNext()) {
          iter.next();
          count++;
        }
        Assert.assertEquals(4, count);
      } finally {
        Util.deleteFile(cluster, clusterInputFile);
      }
    } finally {
      new File(localInputFile).delete();
    }
  }
}