public void testSkewedJoinManyReducers() throws IOException {
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "2");
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE6 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE7 + "' as (id,name);");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
        dbrj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 300;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("E = join A by id, B by id;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbrj.add(iter.next());
      }
    }
    Assert.assertEquals(dbfrj.size(), dbrj.size());
    Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj));
  }
  // pig 1048
  public void testSkewedJoinOneValue() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE3 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE3 + "' as (id,name);");
    // Filter key with a single value

    pigServer.registerQuery("C = FILTER A by id == 400;");
    pigServer.registerQuery("D = FILTER B by id == 400;");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
        dbrj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("E = join C by id, D by id using \"skewed\";");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("E = join C by id, D by id;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbrj.add(iter.next());
      }
    }
    Assert.assertEquals(dbfrj.size(), dbrj.size());
    Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj));
  }
  public void testSkewedJoinWithNoProperties() throws IOException {
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      DataBag dbshj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery(
            "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
      {
        pigServer.registerQuery("E = join A by(id, name), B by (id, name);");
        Iterator<Tuple> iter = pigServer.openIterator("E");

        while (iter.hasNext()) {
          dbshj.add(iter.next());
        }
      }
      Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
      Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj));

    } catch (Exception e) {
      fail(e.getMessage());
    }
  }
  public void testSkewedJoinWithGroup() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = GROUP A by id;");
    pigServer.registerQuery("D = GROUP B by id;");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
        dbshj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("E = join C by group, D by group using \"skewed\" parallel 5;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("E = join C by group, D by group;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbshj.add(iter.next());
      }
    }
    Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
    Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj));
  }
Beispiel #5
0
  private void run(PhysicalPlan pp, String expectedFile) throws Exception {
    String compiledPlan, goldenPlan = null;
    int MAX_SIZE = 100000;
    MRCompiler comp = new MRCompiler(pp, pc);
    comp.compile();

    MROperPlan mrp = comp.getMRPlan();
    PlanPrinter ppp = new PlanPrinter(mrp);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    ppp.print(baos);
    compiledPlan = baos.toString();

    if (generate) {
      FileOutputStream fos = new FileOutputStream(expectedFile);
      fos.write(baos.toByteArray());
      return;
    }
    FileInputStream fis = new FileInputStream(expectedFile);
    byte[] b = new byte[MAX_SIZE];
    int len = fis.read(b);
    goldenPlan = new String(b, 0, len);
    if (goldenPlan.charAt(len - 1) == '\n') goldenPlan = goldenPlan.substring(0, len - 1);

    pp.explain(System.out);
    System.out.println();
    System.out.println("<<<" + compiledPlan + ">>>");
    System.out.println("-------------");
    System.out.println("Golden");
    System.out.println("<<<" + goldenPlan + ">>>");
    System.out.println("-------------");

    String goldenPlanClean = Util.standardizeNewline(goldenPlan);
    String compiledPlanClean = Util.standardizeNewline(compiledPlan);
    assertEquals(
        TestHelper.sortUDFs(Util.removeSignature(goldenPlanClean)),
        TestHelper.sortUDFs(Util.removeSignature(compiledPlanClean)));
  }
  private void runTest(Object key, boolean inner[], byte keyType)
      throws ExecException, IOException {
    Random r = new Random();
    DataBag db1 = GenRandomData.genRandSmallTupDataBag(r, 10, 100);
    DataBag db2 = GenRandomData.genRandSmallTupDataBag(r, 10, 100);
    List<NullableTuple> db = new ArrayList<NullableTuple>(200);
    Iterator<Tuple> db1Iter = db1.iterator();
    if (!inner[0]) {
      while (db1Iter.hasNext()) {
        NullableTuple it = new NullableTuple(db1Iter.next());
        it.setIndex((byte) 0);
        db.add(it);
      }
    }
    Iterator<Tuple> db2Iter = db2.iterator();
    while (db2Iter.hasNext()) {
      NullableTuple it = new NullableTuple(db2Iter.next());
      it.setIndex((byte) 1);
      db.add(it);
    }
    // ITIterator iti = new TestPackage.ITIterator(db.iterator());
    POPackage pop = new POPackage(new OperatorKey("", r.nextLong()));
    pop.setNumInps(2);
    pop.getPkgr().setInner(inner);
    PigNullableWritable k = HDataType.getWritableComparableTypes(key, keyType);
    pop.attachInput(k, db.iterator());
    if (keyType != DataType.BAG) {
      // test serialization
      NullablePartitionWritable wr;
      if (keyType == DataType.TUPLE) {
        BinSedesTuple tup =
            (BinSedesTuple) binfactory.newTupleNoCopy(((Tuple) k.getValueAsPigType()).getAll());
        wr = new NullablePartitionWritable(new NullableTuple(tup));
      } else {
        wr = new NullablePartitionWritable(k);
      }
      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      DataOutputStream out = new DataOutputStream(baos);
      wr.write(out);
      byte[] arr = baos.toByteArray();
      ByteArrayInputStream bais = new ByteArrayInputStream(arr);
      DataInputStream in = new DataInputStream(bais);
      NullablePartitionWritable re = new NullablePartitionWritable();
      re.readFields(in);
      assertEquals(re, wr);
    }

    // we are not doing any optimization to remove
    // parts of the "value" which are present in the "key" in this
    // unit test - so set up the "keyInfo" accordingly in
    // the POPackage
    Map<Integer, Pair<Boolean, Map<Integer, Integer>>> keyInfo =
        new HashMap<Integer, Pair<Boolean, Map<Integer, Integer>>>();
    Pair<Boolean, Map<Integer, Integer>> p =
        new Pair<Boolean, Map<Integer, Integer>>(false, new HashMap<Integer, Integer>());
    keyInfo.put(0, p);
    keyInfo.put(1, p);
    pop.getPkgr().setKeyInfo(keyInfo);
    Tuple t = null;
    Result res = null;
    res = pop.getNextTuple();
    if (res.returnStatus == POStatus.STATUS_NULL && inner[0]) return;
    assertEquals(POStatus.STATUS_OK, res.returnStatus);

    t = (Tuple) res.result;
    Object outKey = t.get(0);
    DataBag outDb1 = (DataBag) t.get(1);
    DataBag outDb2 = (DataBag) t.get(2);

    assertEquals(key, outKey);
    assertTrue(TestHelper.compareBags(db1, outDb1));
    assertTrue(TestHelper.compareBags(db2, outDb2));
  }