Ejemplo n.º 1
0
  /**
   * Stores one relation into two destinations within a single batch while bzip2 output
   * compression is switched on, then checks that each destination holds a non-empty {@code
   * part-m-00000.bz2} file (including 'output2', whose name carries no .bz2 suffix).
   */
  @Test
  public void testBzipStoreInMultiQuery2() throws Exception {
    String inputFileName = "input2.txt";
    Util.createInputFile(cluster, inputFileName, new String[] {"1\t2\r3\t4"});

    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    // Enable compressed output through the context properties rather than the file name.
    PigContext pigContext = pig.getPigContext();
    pigContext.getProperties().setProperty("output.compression.enabled", "true");
    pigContext
        .getProperties()
        .setProperty("output.compression.codec", "org.apache.hadoop.io.compress.BZip2Codec");

    // Both stores are queued and run together by executeBatch().
    pig.setBatchOn();
    pig.registerQuery("a = load '" + inputFileName + "';");
    pig.registerQuery("store a into 'output2.bz2';");
    pig.registerQuery("store a into 'output2';");
    pig.executeBatch();

    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));

    String[] partFiles = {"output2/part-m-00000.bz2", "output2.bz2/part-m-00000.bz2"};
    for (String partFile : partFiles) {
      FileStatus stat = fs.getFileStatus(new Path(partFile));
      assertTrue(stat.getLen() > 0);
    }
  }
Ejemplo n.º 2
0
  /**
   * Uses a scalar taken from a LIMITed relation inside an expression. See PIG-1636.
   *
   * <p>The a1 column sums to 15, so every output row is expected to be (a0, a1 * 15.0).
   */
  @Test
  public void testScalarAliasesLimit() throws Exception {
    String[] input = {"a\t1", "b\t2", "c\t3", "a\t4", "c\t5"};

    String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesLimit";
    TestScalarAliases.createLocalInputFile(inputPath, input);

    // Script-mode queries: C holds the single total, C1 is the limited relation used as scalar.
    pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0:chararray, a1: int);");
    pigServer.registerQuery("G = group A all;");
    pigServer.registerQuery("C = foreach G generate SUM(A.$1) as total;");
    pigServer.registerQuery("C1 = limit C 1;");
    pigServer.registerQuery("Y = foreach A generate a0, a1 * (double)C1.total;");

    Iterator<Tuple> rows = pigServer.openIterator("Y");

    // Total is 1 + 2 + 3 + 4 + 5 = 15
    String[] expectedRows = {"(a,15.0)", "(b,30.0)", "(c,45.0)", "(a,60.0)", "(c,75.0)"};
    for (String expectedRow : expectedRows) {
      Tuple t = rows.next();
      assertTrue(t.toString().equals(expectedRow));
    }

    assertFalse(rows.hasNext());
  }
Ejemplo n.º 3
0
  /**
   * Joins two grouped relations once with the skewed join and once with the default join, and
   * expects both runs to produce the same non-empty bag.
   */
  public void testSkewedJoinWithGroup() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = GROUP A by id;");
    pigServer.registerQuery("D = GROUP B by id;");

    DataBag skewedResult = BagFactory.getInstance().newDefaultBag();
    DataBag defaultResult = BagFactory.getInstance().newDefaultBag();

    // Skewed join
    pigServer.registerQuery("E = join C by group, D by group using \"skewed\" parallel 5;");
    for (Iterator<Tuple> it = pigServer.openIterator("E"); it.hasNext(); ) {
      skewedResult.add(it.next());
    }

    // Default join over the same inputs, used as the reference result
    pigServer.registerQuery("E = join C by group, D by group;");
    for (Iterator<Tuple> it = pigServer.openIterator("E"); it.hasNext(); ) {
      defaultResult.add(it.next());
    }

    Assert.assertTrue(skewedResult.size() > 0 && defaultResult.size() > 0);
    Assert.assertEquals(true, TestHelper.compareBags(skewedResult, defaultResult));
  }
Ejemplo n.º 4
0
  /**
   * Sets {@code pig.skewedjoin.reduce.maxtuple} to 2 and runs a skewed join with 300 reducers,
   * expecting the same bag as the default join over the same inputs.
   */
  public void testSkewedJoinManyReducers() throws IOException {
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "2");
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE6 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE7 + "' as (id,name);");

    DataBag skewedResult = BagFactory.getInstance().newDefaultBag();
    DataBag defaultResult = BagFactory.getInstance().newDefaultBag();

    // Skewed join
    pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 300;");
    for (Iterator<Tuple> it = pigServer.openIterator("E"); it.hasNext(); ) {
      skewedResult.add(it.next());
    }

    // Default join used as the reference result
    pigServer.registerQuery("E = join A by id, B by id;");
    for (Iterator<Tuple> it = pigServer.openIterator("E"); it.hasNext(); ) {
      defaultResult.add(it.next());
    }

    Assert.assertEquals(skewedResult.size(), defaultResult.size());
    Assert.assertEquals(true, TestHelper.compareBags(skewedResult, defaultResult));
  }
Ejemplo n.º 5
0
  /**
   * Uses a scalar inside a join key expression. See PIG-1434.
   *
   * <p>A has three rows, so the key CONCAT('Total', count) is 'Total3' for every row of A and
   * each of them is expected to match the 'Total3' row of B.
   */
  @Test
  public void testScalarAliasesJoinClause() throws Exception {
    String inputPathA = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseA";
    TestScalarAliases.createLocalInputFile(inputPathA, new String[] {"1\t5", "2\t10", "3\t20"});

    String inputPathB = BUILD_TEST_TMP + "table_testScalarAliasesJoinClauseB";
    TestScalarAliases.createLocalInputFile(
        inputPathB, new String[] {"Total3\tthree", "Total2\ttwo", "Total1\tone"});

    // Script-mode queries: C is the one-row relation whose count is used as a scalar.
    pigServer.registerQuery("A = LOAD '" + inputPathA + "' as (a0, a1);");
    pigServer.registerQuery("G = group A all;");
    pigServer.registerQuery("C = foreach G generate COUNT(A) as count;");

    pigServer.registerQuery("B = LOAD '" + inputPathB + "' as (b0:chararray, b1:chararray);");
    pigServer.registerQuery("Y = join A by CONCAT('Total', (chararray)C.count), B by $0;");

    Iterator<Tuple> joined = pigServer.openIterator("Y");

    String[] expectedRows = {"(1,5,Total3,three)", "(2,10,Total3,three)", "(3,20,Total3,three)"};

    // Row order is not asserted; the helper sorts before comparing.
    Util.checkQueryOutputsAfterSortRecursive(
        joined,
        expectedRows,
        org.apache.pig.newplan.logical.Util.translateSchema(pigServer.dumpSchema("Y")));
  }
Ejemplo n.º 6
0
  /**
   * Skewed join where both inputs are filtered down to the single key 400; the result must match
   * the default join over the same filtered inputs. See PIG-1048.
   */
  public void testSkewedJoinOneValue() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE3 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE3 + "' as (id,name);");

    // Keep only one key value on each side of the join.
    pigServer.registerQuery("C = FILTER A by id == 400;");
    pigServer.registerQuery("D = FILTER B by id == 400;");

    DataBag skewedResult = BagFactory.getInstance().newDefaultBag();
    DataBag defaultResult = BagFactory.getInstance().newDefaultBag();

    // Skewed join
    pigServer.registerQuery("E = join C by id, D by id using \"skewed\";");
    for (Iterator<Tuple> it = pigServer.openIterator("E"); it.hasNext(); ) {
      skewedResult.add(it.next());
    }

    // Default join used as the reference result
    pigServer.registerQuery("E = join C by id, D by id;");
    for (Iterator<Tuple> it = pigServer.openIterator("E"); it.hasNext(); ) {
      defaultResult.add(it.next());
    }

    Assert.assertEquals(skewedResult.size(), defaultResult.size());
    Assert.assertEquals(true, TestHelper.compareBags(skewedResult, defaultResult));
  }
Ejemplo n.º 7
0
  /**
   * Projects a positional column ($1) of a schema-less relation as a scalar. See PIG-1434.
   *
   * <p>The scalar relation has the single row (1, 5), so 5 / scalar.$1 is expected to print as
   * "1" for each of the three rows of A.
   */
  @Test
  public void testScalarWithNoSchemaDollarProj() throws Exception {
    String inputPath = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProj";
    TestScalarAliases.createLocalInputFile(inputPath, new String[] {"1\t5", "2\t10", "3\t20"});

    String inputPathScalar = BUILD_TEST_TMP + "table_testScalarWithNoSchemaDollarProjScalar";
    TestScalarAliases.createLocalInputFile(inputPathScalar, new String[] {"1\t5"});

    // Neither load declares a schema; 'scalar' is the relation referenced as a scalar.
    pigServer.registerQuery("A = LOAD '" + inputPath + "';");
    pigServer.registerQuery("scalar = LOAD '" + inputPathScalar + "';");
    pigServer.registerQuery("B = foreach A generate 5 / scalar.$1;");

    Iterator<Tuple> rows = pigServer.openIterator("B");

    // One output row per row of A, all with the same value.
    for (int row = 0; row < 3; row++) {
      Tuple t = rows.next();
      assertTrue(t.get(0).toString().equals("1"));
    }

    assertFalse(rows.hasNext());
  }
Ejemplo n.º 8
0
  /**
   * Runs a two-column skewed join on a PigServer created directly from the cluster properties and
   * expects the same non-empty bag as the default join.
   *
   * <p>NOTE(review): the fresh PigServer presumably drops skewed-join tuning properties set
   * elsewhere in the test class - confirm against the fixture's setUp.
   *
   * @throws IOException if registering or executing a query fails; the exception is propagated
   *     rather than converted with {@code fail(e.getMessage())}, so the test report keeps the
   *     full stack trace and never ends up with a null failure message
   */
  public void testSkewedJoinWithNoProperties() throws IOException {
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");

    DataBag skewedResult = BagFactory.getInstance().newDefaultBag();
    DataBag defaultResult = BagFactory.getInstance().newDefaultBag();

    // Skewed join
    pigServer.registerQuery(
        "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;");
    Iterator<Tuple> skewedIter = pigServer.openIterator("C");
    while (skewedIter.hasNext()) {
      skewedResult.add(skewedIter.next());
    }

    // Default join used as the reference result
    pigServer.registerQuery("E = join A by(id, name), B by (id, name);");
    Iterator<Tuple> defaultIter = pigServer.openIterator("E");
    while (defaultIter.hasNext()) {
      defaultResult.add(defaultIter.next());
    }

    Assert.assertTrue(skewedResult.size() > 0 && defaultResult.size() > 0);
    Assert.assertEquals(true, TestHelper.compareBags(skewedResult, defaultResult));
  }
Ejemplo n.º 9
0
  /**
   * Loads the directory 'foo', whose only data file sits two levels down at foo/bar/test.dat, and
   * self-joins it with the skewed join. The four distinct input values are expected to yield
   * four joined rows.
   *
   * <p>The local and cluster copies of the input are removed in {@code finally} blocks so that a
   * failing query or assertion does not leave them behind.
   *
   * @throws IOException if writing, copying, querying or deleting the input fails
   */
  public void testRecursiveFileListing() throws IOException {
    String localInputFile = "test.dat";
    String clusterInputFile = "foo/bar/test.dat";

    try {
      PrintWriter w = new PrintWriter(new FileWriter(localInputFile));
      w.println("1");
      w.println("2");
      w.println("3");
      w.println("5");
      w.close();

      Util.copyFromLocalToCluster(cluster, localInputFile, clusterInputFile);

      // Only attempt the cluster delete once the copy has succeeded.
      try {
        pigServer.registerQuery("a = load 'foo' as (nums:chararray);");
        pigServer.registerQuery("b = load 'foo' as (nums:chararray);");
        pigServer.registerQuery("d = join a by nums, b by nums USING 'skewed';");

        Iterator<Tuple> iter = pigServer.openIterator("d");
        int count = 0;
        while (iter.hasNext()) {
          iter.next();
          count++;
        }
        Assert.assertEquals(4, count);
      } finally {
        Util.deleteFile(cluster, clusterInputFile);
      }
    } finally {
      new File(localInputFile).delete();
    }
  }
Ejemplo n.º 10
0
  /**
   * Uses a scalar in a FILTER condition. See PIG-1434.
   *
   * <p>The a1 column averages (5 + 10 + 20 + 12 + 8) / 5 = 11, so only the rows with a1 of 20 and
   * 12 are expected to pass the filter.
   */
  @Test
  public void testScalarAliasesFilterClause() throws Exception {
    String[] input = {"1\t5", "2\t10", "3\t20", "4\t12", "5\t8"};

    String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesFilterClause";
    TestScalarAliases.createLocalInputFile(inputPath, input);

    // Script-mode queries: C is the one-row relation whose average is used as a scalar.
    pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0, a1);");
    pigServer.registerQuery("G = group A all;");
    pigServer.registerQuery("C = foreach G generate AVG(A.$1) as average;");

    pigServer.registerQuery("Y = filter A by a1 > C.average;");

    Iterator<Tuple> rows = pigServer.openIterator("Y");

    // Average is 11
    String[] expectedRows = {"(3,20)", "(4,12)"};
    for (String expectedRow : expectedRows) {
      Tuple t = rows.next();
      assertTrue(t.toString().equals(expectedRow));
    }

    assertFalse(rows.hasNext());
  }
Ejemplo n.º 11
0
  /**
   * Projects columns of a filtered relation as scalars, covering long, double, map, bag and tuple
   * columns as well as a map lookup. See PIG-1434.
   *
   * <p>Only the first input row has a1 &lt; 8, so B is the single row (1, 5.0, ...). Each row of
   * Y is therefore (a0 * 1, a1 / 5.0) followed by that row's map, map value, bag and tuple.
   *
   * <p>The result is checked twice: through the stored output reloaded as Z (first two columns
   * only) and by opening Y directly.
   */
  @Test
  public void testFilteredScalarDollarProj() throws Exception {
    String output = BUILD_TEST_TMP + "table_testFilteredScalarDollarProjDir";
    TestScalarAliases.deleteDirectory(new File(output));
    String[] input = {
      "1\t5\t[state#maine,city#portland]\t{(a),(b)}\t(a,b)", "2\t10\t\t\t", "3\t20\t\t\t"
    };

    // Test the use of scalars in expressions
    String inputPath = BUILD_TEST_TMP + "table_testFilteredScalarDollarProj";
    TestScalarAliases.createLocalInputFile(inputPath, input);
    // Test in script mode
    pigServer.setBatchOn();
    pigServer.registerQuery(
        "A = LOAD '"
            + inputPath
            + "'"
            + " as (a0: long, a1: double, a2 : bytearray, "
            + "a3: bag{ t : tuple(tc : chararray)}, "
            + "a4: tuple(c1 : chararray, c2 : chararray) );");
    pigServer.registerQuery("B = filter A by $1 < 8;");
    pigServer.registerQuery(
        "Y = foreach A generate (a0 * B.$0), (a1 / B.$1), B.$2, B.$2#'state', B.$3, B.a4;");
    pigServer.registerQuery("Store Y into '" + output + "';");
    pigServer.explain("Y", System.err);
    pigServer.executeBatch();
    // Check output
    pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: int, a1: double);");
    pigServer.explain("Z", System.err);

    Iterator<Tuple> iter = pigServer.openIterator("Z");

    Tuple t = iter.next();
    assertTrue(t.toString().equals("(1,1.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(2,2.0)"));

    t = iter.next();
    assertTrue(t.toString().equals("(3,4.0)"));

    assertFalse(iter.hasNext());

    // Check in non-batch mode
    iter = pigServer.openIterator("Y");

    // assertEquals takes (expected, actual); keeping that order makes failure messages accurate.
    t = iter.next();
    assertEquals("(1,1.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))", t.toString());

    t = iter.next();
    assertEquals("(2,2.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))", t.toString());

    t = iter.next();
    assertEquals("(3,4.0,[state#maine,city#portland],maine,{(a),(b)},(a,b))", t.toString());

    assertFalse(iter.hasNext());
  }
Ejemplo n.º 12
0
  /**
   * Verifies that '\n', '\r' and '\r\n' all end a record in bzip2-compressed input just as they
   * do in uncompressed text, by loading the same data both ways and comparing tuple by tuple.
   */
  @Test
  public void testRecordDelims() throws Exception {
    // The compressed copy is written with '\n' appended to every element (see the loop below).
    String[] inputData = {
      "1\t2\r3\t4", // '\r' case - this will be split into two tuples
      "5\t6\r", // '\r\n' case
      "7\t8", // '\n' case
      "9\t10\r" // '\r\n' at the end of file
    };

    // bzip compressed input
    File in = File.createTempFile("junit", ".bz2");
    String compressedInputFileName = in.getAbsolutePath();
    in.deleteOnExit();
    String clusterCompressedFilePath = Util.removeColon(compressedInputFileName);

    // uncompressed input holding the same data
    String unCompressedInputFileName = "testRecordDelims-uncomp.txt";
    Util.createInputFile(cluster, unCompressedInputFileName, inputData);

    try {
      CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
      for (String line : inputData) {
        cos.write((line + "\n").getBytes());
      }
      cos.close();

      Util.copyFromLocalToCluster(cluster, compressedInputFileName, clusterCompressedFilePath);

      PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

      // pig script to read uncompressed input
      pig.registerQuery("a = load '" + unCompressedInputFileName + "';");
      Iterator<Tuple> plainTuples = pig.openIterator("a");

      // pig script to read compressed input
      pig.registerQuery("a = load '" + Util.encodeEscape(clusterCompressedFilePath) + "';");
      Iterator<Tuple> bzipTuples = pig.openIterator("a");

      // Walk both results in lock step; every pair must be equal.
      while (plainTuples.hasNext()) {
        Tuple expected = plainTuples.next();
        Tuple actual = bzipTuples.next();
        Assert.assertEquals(expected, actual);
      }

      // The compressed input must not yield extra tuples.
      Assert.assertFalse(bzipTuples.hasNext());

    } finally {
      in.delete();
      Util.deleteFile(cluster, unCompressedInputFileName);
      Util.deleteFile(cluster, clusterCompressedFilePath);
    }
  }
Ejemplo n.º 13
0
  /**
   * Tests the end-to-end writing and reading of a BZip file: a bzip2 input holding 1..99 and
   * their negatives is filtered down to the positives, stored under a .bz2 path, and read back.
   */
  @Test
  public void testBzipInPig() throws Exception {
    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    File in = File.createTempFile("junit", ".bz2");
    in.deleteOnExit();

    // The temp file is deleted straight away; only its path is reused as the output location.
    File out = File.createTempFile("junit", ".bz2");
    out.delete();
    String clusterOutput = Util.removeColon(out.getAbsolutePath());

    // One line per value: i followed by -i, for i in 1..99.
    CBZip2OutputStream cos = new CBZip2OutputStream(new FileOutputStream(in));
    for (int i = 1; i < 100; i++) {
      String lines = i + "\n" + (-i) + "\n";
      cos.write(lines.getBytes());
    }
    cos.close();

    pig.registerQuery(
        "AA = load '"
            + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext())
            + "';");
    pig.registerQuery("A = foreach (group (filter AA by $0 > 0) all) generate flatten($1);");
    pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutput) + "';");
    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FSDataInputStream is = fs.open(new Path(clusterOutput + "/part-r-00000.bz2"));
    CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

    // Just a sanity check, to make sure it was a bzip file; the
    // values themselves are verified further down
    assertEquals(100, cis.read(new byte[100]));
    cis.close();

    pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutput) + "';");

    // Collect the stored values keyed by themselves.
    HashMap<Integer, Integer> seen = new HashMap<Integer, Integer>();
    for (Iterator<Tuple> rows = pig.openIterator("B"); rows.hasNext(); ) {
      Integer val = DataType.toInteger(rows.next().get(0));
      seen.put(val, val);
    }

    // Exactly the 99 positive values must have survived the filter.
    assertEquals(99, seen.keySet().size());

    for (int j = 1; j < 100; j++) {
      assertEquals(Integer.valueOf(j), seen.get(j));
    }

    in.delete();
    Util.deleteFile(cluster, clusterOutput);
  }
Ejemplo n.º 14
0
 /**
  * Registers a GROUP statement that refers to the alias it is itself defining (B) and expects a
  * FrontendException whose message contains "Undefined alias:".
  */
 @Test
 public void testErrorMessageUndefinedAliasInGroupByStatement() throws Exception {
   PigServer ps = new PigServer(ExecType.LOCAL);
   ps.registerQuery("A = load 'nosuchfile'  using PigStorage() as (f1:chararray,f2:chararray);");
   try {
     ps.registerQuery("B = GROUP B by f1;");
     // Reaching this line means the undefined alias was accepted.
     Assert.fail();
   } catch (FrontendException e) {
     Assert.assertTrue(e.getMessage().contains("Undefined alias:"));
   }
 }
 /**
  * Expects an IOException when the key handed to the store is a tuple but the key converter is
  * IntWritableConverter.
  */
 @Test(expected = IOException.class)
 public void writeUnsupportedConversion() throws IOException {
   registerLoadQuery();
   // wrap the key in a tuple so it no longer matches the key converter used by the store
   pigServer.registerQuery("A = FOREACH A GENERATE TOTUPLE(key), value;");
   // the following should die because IntWritableConverter doesn't support conversion of Tuple to
   // IntWritable
   String storeQuery =
       String.format(
           "STORE A INTO 'file:%s-2' USING %s('-c %s', '-c %s');",
           tempFilename,
           SequenceFileStorage.class.getName(),
           IntWritableConverter.class.getName(),
           TextConverter.class.getName());
   pigServer.registerQuery(storeQuery);
 }
Ejemplo n.º 16
0
  /**
   * Runs a skewed join with a single reducer ("parallel 1") and expects it to complete without
   * throwing. The joined tuples are drained but their content is not checked.
   *
   * <p>If the join does throw, the test fails with the original message and the caught exception
   * attached as the cause, so the report shows why the join failed.
   *
   * @throws IOException if loading the inputs fails
   */
  public void testSkewedJoinReducers() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      pigServer.registerQuery("C = join A by id, B by id using \"skewed\" parallel 1;");
      Iterator<Tuple> iter = pigServer.openIterator("C");

      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    } catch (Exception e) {
      // Chain the exception instead of dropping it; initCause is used to stay compatible with
      // older JDKs that lack the AssertionError(String, Throwable) constructor.
      AssertionError failure =
          new AssertionError("Should not throw exception, should continue execution");
      failure.initCause(e);
      throw failure;
    }
  }
Ejemplo n.º 17
0
 /**
  * Registers a SPLIT whose second branch negates a comparison on a UDF result, and fails if the
  * front end rejects the statement.
  */
 @Test
 public void testSplitWithNotEvalCondition() throws Exception {
   PigServer ps = new PigServer(ExecType.LOCAL);
   ps.registerQuery(
       "define minelogs org.apache.pig.test.RegexGroupCount('www\\\\.xyz\\\\.com/sports');");
   ps.registerQuery("a = load 'nosuchfile'  using PigStorage() as (source : chararray);");

   // The NOT wraps the whole comparison, i.e. NOT (minelogs(source) > 0).
   String splitQuery =
       "SPLIT a INTO a1 IF (minelogs(source) > 0 ), a2 IF (NOT (minelogs(source)>0));";
   try {
     ps.registerQuery(splitQuery);
   } catch (FrontendException e) {
     Assert.fail(e.getMessage());
   }
 }
Ejemplo n.º 18
0
  /**
   * Uses a scalar drawn from an empty relation: no row of A has a == 'c', so B is empty and
   * {@code b + B.b} is expected to be null for every row of A.
   */
  @Test
  public void testScalarNullValue() throws Exception {
    Storage.Data data = Storage.resetData(pigServer);
    data.set("input", Storage.tuple("a", 1), Storage.tuple("b", 2));

    pigServer.setBatchOn();
    pigServer.registerQuery("A = load 'input' using mock.Storage() as (a:chararray, b:int);");
    // Filter that matches nothing in the input.
    pigServer.registerQuery("B = FILTER A by a == 'c';");
    pigServer.registerQuery("C = FOREACH A generate a, b + B.b;");
    pigServer.registerQuery("store C into 'output' using mock.Storage();");

    pigServer.executeBatch();

    List<Tuple> expected =
        Util.getTuplesFromConstantTupleStrings(new String[] {"('a', null)", "('b', null)"});
    List<Tuple> stored = data.get("output");
    Util.checkQueryOutputsAfterSort(stored.iterator(), expected);
  }
 /**
  * Replaces the value of the pair with key 2 by null, stores the relation, reloads it and
  * validates against one pair fewer than {@code DATA} holds.
  *
  * <p>Side effect: appends "-2" to the {@code tempFilename} field before storing.
  *
  * @throws IOException if registering or executing a query fails
  */
 @Test
 public void readWriteUnexpectedNullValuesRead() throws IOException {
   registerLoadQuery();
   tempFilename = tempFilename + "-2";
   // swap the value of the pair with key 2 for null; this pair should not be stored.
   // The query is a plain literal: routing it through String.format with no arguments would
   // only add a risk of a stray '%' being interpreted as a format specifier.
   pigServer.registerQuery("A = FOREACH A GENERATE key, (key == 2 ? null : value) AS value;");
   pigServer.registerQuery(
       String.format(
           "STORE A INTO 'file:%s' USING %s('-c %s', '-c %s');",
           tempFilename,
           SequenceFileStorage.class.getName(),
           IntWritableConverter.class.getName(),
           TextConverter.class.getName()));
   registerLoadQuery();
   // validation against expected pairs will succeed, with expected number of pairs one less than
   // usual (the pair with the null value wasn't stored)
   validate(pigServer.openIterator("A"), DATA.length - 1);
 }
Ejemplo n.º 20
0
  /**
   * Feeds WKT points, ordered by position and grouped by geometry id, to MakeSegments and checks
   * the flattened output rows one by one against a fixed list.
   */
  public void testShouldWorkWithWKT() throws Exception {
    // Input columns: geom_id, point_id, point_pos, point (WKT text)
    ArrayList<String[]> data = new ArrayList<String[]>();
    data.add(new String[] {"1", "1", "0", "POINT (0.0 0.0)"});
    data.add(new String[] {"1", "2", "1", "POINT (0.0 3.0)"});
    data.add(new String[] {"1", "3", "2", "POINT (4.0 5.0)"});
    data.add(new String[] {"1", "4", "3", "POINT (10.0 0.0)"});
    data.add(new String[] {"2", "5", "0", "POINT (5.0 6.0)"});
    data.add(new String[] {"2", "6", "1", "POINT (10.0 3.0)"});
    data.add(new String[] {"2", "7", "2", "POINT (7.0 13.0)"});
    data.add(new String[] {"3", "1", "0", "POINT (0.0 0.0)"});
    data.add(new String[] {"3", "8", "1", "POINT (10.0 10.0)"});
    data.add(new String[] {"3", "9", "2", "POINT (18.0 5.0)"});
    data.add(new String[] {"3", "1", "3", "POINT (0.0 0.0)"});

    // Backslashes in the temp file path are doubled before it is embedded in the query text.
    String datafile = TestHelper.createTempFile(data, "\t").replace("\\", "\\\\");

    PigServer pig = new PigServer(LOCAL);
    String query =
        "A = LOAD 'file:"
            + datafile
            + "' as (geom_id: int, point_id: int, point_pos: int, point);\n"
            + "B = ORDER A BY point_pos;"
            + "C = GROUP B BY geom_id;"
            + "D = FOREACH C GENERATE group, FLATTEN("
            + MakeSegments.class.getName()
            + "(B.point_id, B.point));";
    pig.registerQuery(query);
    Iterator<?> actualRows = pig.openIterator("D");

    // Expected rows, compared below as Integer, Integer, Long, Double, Double, Long, Double,
    // Double.
    ArrayList<String[]> expectedResults = new ArrayList<String[]>();
    expectedResults.add(new String[] {"1", "0", "1", "0.0", "0.0", "2", "0.0", "3.0"});
    expectedResults.add(new String[] {"1", "1", "2", "0.0", "3.0", "3", "4.0", "5.0"});
    expectedResults.add(new String[] {"1", "2", "3", "4.0", "5.0", "4", "10.0", "0.0"});
    expectedResults.add(new String[] {"2", "0", "5", "5.0", "6.0", "6", "10.0", "3.0"});
    expectedResults.add(new String[] {"2", "1", "6", "10.0", "3.0", "7", "7.0", "13.0"});
    expectedResults.add(new String[] {"3", "0", "1", "0.0", "0.0", "8", "10.0", "10.0"});
    expectedResults.add(new String[] {"3", "1", "8", "10.0", "10.0", "9", "18.0", "5.0"});
    expectedResults.add(new String[] {"3", "2", "9", "18.0", "5.0", "1", "0.0", "0.0"});

    // 'count' doubles as the index of the next expected row: it grows by one per verified row.
    int count = 0;
    while (actualRows.hasNext() && count < expectedResults.size()) {
      Tuple tuple = (Tuple) actualRows.next();
      String[] expectedResult = expectedResults.get(count);
      if (tuple == null) {
        break;
      }
      assertEquals(Integer.parseInt(expectedResult[0]), tuple.get(0));
      assertEquals(Integer.parseInt(expectedResult[1]), tuple.get(1));
      assertEquals(Long.parseLong(expectedResult[2]), tuple.get(2));
      assertEquals(Double.parseDouble(expectedResult[3]), tuple.get(3));
      assertEquals(Double.parseDouble(expectedResult[4]), tuple.get(4));
      assertEquals(Long.parseLong(expectedResult[5]), tuple.get(5));
      assertEquals(Double.parseDouble(expectedResult[6]), tuple.get(6));
      assertEquals(Double.parseDouble(expectedResult[7]), tuple.get(7));
      count++;
    }
    // Every expected row must have been matched.
    assertEquals(expectedResults.size(), count);
  }
Ejemplo n.º 21
0
  /**
   * Stores a 7-reducer skewed join and counts, per key, how many part files contain it. With
   * three keys, a total of more than three (key, part file) pairs means at least one key was
   * spread over several reducers.
   *
   * <p>Each part file is copied to a local 'skewedjoin' directory and read line by line; the
   * reader is closed in a {@code finally} block so no file handle leaks across the seven
   * iterations.
   *
   * @throws IOException if a query, copy or file read fails
   */
  public void testSkewedJoinKeyPartition() throws IOException {
    try {
      Util.deleteFile(cluster, "skewedjoin");
    } catch (Exception e) {
      // it is ok if directory not exist
    }

    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");

    pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 7;");
    pigServer.store("E", "skewedjoin");

    // lineCount[key][part]: number of lines with that key in that part file
    int[][] lineCount = new int[3][7];

    new File("skewedjoin").mkdir();
    // check how many times a key appear in each part- file
    for (int i = 0; i < 7; i++) {
      String partFile = "skewedjoin/part-r-0000" + i;
      Util.copyFromClusterToLocal(cluster, partFile, partFile);

      BufferedReader reader = new BufferedReader(new FileReader(partFile));
      try {
        String line = null;
        while ((line = reader.readLine()) != null) {
          String[] cols = line.split("\t");
          // The first column divided by 100, minus one, is used as the row index
          // (presumably the ids are 100, 200 and 300 - confirm against the input fixture).
          int key = Integer.parseInt(cols[0]) / 100 - 1;
          lineCount[key][i]++;
        }
      } finally {
        reader.close();
      }
    }

    int fc = 0;
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 7; j++) {
        if (lineCount[i][j] > 0) {
          fc++;
        }
      }
    }
    // atleast one key should be a skewed key
    // check atleast one key should appear in more than 1 part- file
    assertTrue(fc > 3);
  }
Ejemplo n.º 22
0
  /**
   * Negative case: referencing a whole relation (C) where a scalar is expected, instead of
   * projecting one of its columns, must fail with "Invalid scalar projection: C". See PIG-1434.
   */
  @Test
  public void testScalarAliasesGrammarNegative() throws Exception {
    String inputPath = BUILD_TEST_TMP + "table_testScalarAliasesGrammar";
    TestScalarAliases.createLocalInputFile(inputPath, new String[] {"1\t5", "2\t10", "3\t20"});

    try {
      pigServer.registerQuery("A = LOAD '" + inputPath + "' as (a0: long, a1: double);");
      pigServer.registerQuery("B = group A all;");
      pigServer.registerQuery("C = foreach B generate COUNT(A);");
      // C is used bare here; only projections such as C.$0 are supported
      pigServer.registerQuery("Y = foreach A generate C;");
      pigServer.openIterator("Y");
      // Getting this far means the invalid reference was accepted
      fail("Scalar projections are only supported");
    } catch (IOException expected) {
      assertTrue(expected.getMessage().contains("Invalid scalar projection: C"));
    }
  }
Ejemplo n.º 23
0
  /**
   * Negative case: a three-way join using "skewed" is expected to throw; the test fails if the
   * join runs to completion.
   */
  public void testSkewedJoin3Way() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);");

    boolean rejected = false;
    try {
      DataBag joined = BagFactory.getInstance().newDefaultBag();
      pigServer.registerQuery("D = join A by id, B by id, C by id using \"skewed\" parallel 5;");
      for (Iterator<Tuple> it = pigServer.openIterator("D"); it.hasNext(); ) {
        joined.add(it.next());
      }
    } catch (Exception e) {
      // Any exception counts as the expected rejection.
      rejected = true;
    }

    if (!rejected) {
      fail("Should throw exception, do not support 3 way join");
    }
  }
Ejemplo n.º 24
0
  /**
   * Runs a skewed self-join over INPUT_FILE5 and expects it to complete without throwing. The
   * joined tuples are drained but their content is not checked.
   */
  public void testSkewedJoinNullKeys() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    try {
      DataBag joined = BagFactory.getInstance().newDefaultBag();
      pigServer.registerQuery("C = join A by id, B by id using \"skewed\";");
      for (Iterator<Tuple> it = pigServer.openIterator("C"); it.hasNext(); ) {
        joined.add(it.next());
      }
    } catch (Exception e) {
      // Dump the details before failing, since fail() only carries the fixed message.
      System.out.println(e.getMessage());
      e.printStackTrace();
      fail("Should support null keys in skewed join");
    }
  }
Ejemplo n.º 25
0
  /**
   * Runs a skewed join whose keys are cast map lookups ({@code (chararray)m#'a100'}) and expects
   * it to complete without throwing. The joined tuples are drained but not checked.
   */
  public void testSkewedJoinMapKey() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE4 + "' as (m:[]);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE4 + "' as (n:[]);");
    try {
      DataBag joined = BagFactory.getInstance().newDefaultBag();
      pigServer.registerQuery(
          "C = join A by (chararray)m#'a100', B by (chararray)n#'a100' using \"skewed\" parallel 20;");
      for (Iterator<Tuple> it = pigServer.openIterator("C"); it.hasNext(); ) {
        joined.add(it.next());
      }
    } catch (Exception e) {
      // Dump the details before failing, since fail() only carries the fixed message.
      System.out.println(e.getMessage());
      e.printStackTrace();
      fail("Should support maps and expression operators as keys");
    }
  }
Ejemplo n.º 26
0
  /**
   * Uses a scalar computed on one branch (MAX over A) while iterating over an unrelated branch
   * (X). See PIG-1434.
   *
   * <p>MAX(a1) is 20, so each name in X is expected to be paired with 20.0. The result is checked
   * both through the stored output reloaded as Z and by opening Y directly.
   */
  @Test
  public void testScalarWithTwoBranches() throws Exception {
    String[] inputA = {"1\t5", "2\t10", "3\t20"};
    String[] inputX = {"pig", "hadoop", "rocks"};

    // Rows expected from both Z and Y, in input order.
    String[] expectedRows = {"(pig,20.0)", "(hadoop,20.0)", "(rocks,20.0)"};

    String output = BUILD_TEST_TMP + "testScalarWithTwoBranchesDir";
    TestScalarAliases.deleteDirectory(new File(output));

    String inputPathA = BUILD_TEST_TMP + "testScalarWithTwoBranchesA";
    TestScalarAliases.createLocalInputFile(inputPathA, inputA);
    String inputPathX = BUILD_TEST_TMP + "testScalarWithTwoBranchesX";
    TestScalarAliases.createLocalInputFile(inputPathX, inputX);

    // Script mode: queue everything and run it as one batch
    pigServer.setBatchOn();
    pigServer.registerQuery("A = LOAD '" + inputPathA + "' as (a0: long, a1: double);");
    pigServer.registerQuery("B = group A all;");
    pigServer.registerQuery("C = foreach B generate COUNT(A) as count, MAX(A.$1) as max;");
    pigServer.registerQuery("X = LOAD '" + inputPathX + "' as (names: chararray);");
    pigServer.registerQuery("Y = foreach X generate names, C.max;");
    pigServer.registerQuery("Store Y into '" + output + "';");
    pigServer.executeBatch();

    // Check the stored output
    pigServer.registerQuery("Z = LOAD '" + output + "' as (a0: chararray, a1: double);");

    Iterator<Tuple> rows = pigServer.openIterator("Z");
    for (String expectedRow : expectedRows) {
      Tuple t = rows.next();
      assertTrue(t.toString().equals(expectedRow));
    }
    assertFalse(rows.hasNext());

    // Check in non-batch mode
    rows = pigServer.openIterator("Y");
    for (String expectedRow : expectedRows) {
      Tuple t = rows.next();
      assertTrue(t.toString().equals(expectedRow));
    }
    assertFalse(rows.hasNext());

    pigServer.getPigContext().getProperties().remove("tez.am.inline.task.execution.max-tasks");
  }
Ejemplo n.º 27
0
  /**
   * Tests the end-to-end writing and reading of an empty BZip file: the filter rejects the only
   * input line, so the stored .bz2 part file is expected to decompress to nothing.
   */
  @Test
  public void testEmptyBzipInPig() throws Exception {
    PigServer pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    File in = File.createTempFile("junit", ".tmp");
    in.deleteOnExit();

    // The temp file is deleted straight away; only its path is reused as the output location.
    File out = File.createTempFile("junit", ".bz2");
    out.delete();
    String clusterOutputFilePath = Util.removeColon(out.getAbsolutePath());

    // Plain-text input with a single line.
    FileOutputStream inputStream = new FileOutputStream(in);
    inputStream.write("55\n".getBytes());
    inputStream.close();
    System.out.println(in.getAbsolutePath());

    pig.registerQuery(
        "AA = load '"
            + Util.generateURI(Util.encodeEscape(in.getAbsolutePath()), pig.getPigContext())
            + "';");
    pig.registerQuery("A=foreach (group (filter AA by $0 < '0') all) generate flatten($1);");
    pig.registerQuery("store A into '" + Util.encodeEscape(clusterOutputFilePath) + "';");
    FileSystem fs =
        FileSystem.get(ConfigurationUtil.toConfiguration(pig.getPigContext().getProperties()));
    FSDataInputStream is = fs.open(new Path(clusterOutputFilePath + "/part-r-00000.bz2"));
    CBZip2InputStream cis = new CBZip2InputStream(is, -1, out.length());

    // Sanity check that the part file is a valid bzip stream with no content:
    // the very first read must report end of stream
    assertEquals(-1, cis.read(new byte[100]));
    cis.close();

    // Loading the empty output back must not throw.
    pig.registerQuery("B = load '" + Util.encodeEscape(clusterOutputFilePath) + "';");
    pig.openIterator("B");

    in.delete();
    Util.deleteFile(cluster, clusterOutputFilePath);
  }
  /**
   * Stores two rows through MongoUpdateStorage into the 'update_simple' collection and checks
   * that the collection then holds exactly those two documents with the new f2 and f3 values.
   *
   * <p>The Mongo client and cursor opened for verification are closed in {@code finally} blocks
   * so that a failing assertion does not leak connections.
   *
   * <p>NOTE(review): the seed documents go to collection "testUpdate" while the store and the
   * assertions use "update_simple" - confirm this is intended.
   */
  @Test
  public void testUpdate() throws Exception {
    BasicDBObject obj1 = new BasicDBObject().append("f1", "a").append("f2", "value1");
    BasicDBObject obj2 = new BasicDBObject().append("f1", "b").append("f2", "value2");
    insertData("testUpdate", obj1, obj2);

    String[] input = {"a\tnewValue1\t1", "b\tnewValue2\t2"};
    Util.createLocalInputFile("simple_input", input);

    pigServerLocal = new PigServer(ExecType.LOCAL);
    pigServerLocal.registerQuery(
        "A = LOAD 'simple_input' as (f1:chararray, f2:chararray, f3:int);");
    pigServerLocal.registerQuery(
        String.format(
            "STORE A INTO 'mongodb://localhost:27017/%s.%s' USING com.mongodb.hadoop.pig.MongoUpdateStorage("
                + "  '{f1:\"\\\\$f1\"}',"
                + "  '{\\\\$set:{f2:\"\\\\$f2\", f3:\"\\\\$f3\"}}',"
                + "  'f1:chararray, f2:chararray, f3:int'"
                + ");",
            dbName, "update_simple"));
    pigServerLocal.setBatchOn();
    pigServerLocal.executeBatch();

    MongoClient mc = new MongoClient();
    try {
      DBCollection col = mc.getDB(dbName).getCollection("update_simple");

      DBCursor cursor = col.find();
      try {
        assertEquals(2, cursor.size());
        DBObject result1 = cursor.next();
        assertEquals("a", result1.get("f1"));
        assertEquals("newValue1", result1.get("f2"));
        assertEquals(1, result1.get("f3"));
        DBObject result2 = cursor.next();
        assertEquals("b", result2.get("f1"));
        assertEquals("newValue2", result2.get("f2"));
        assertEquals(2, result2.get("f3"));
      } finally {
        cursor.close();
      }
    } finally {
      mc.close();
    }
  }
Ejemplo n.º 29
0
 /**
  * Splits the values 1..20 into two filters (&lt;= 10 and &gt; 10) over the same relation,
  * unions them, and expects all 20 rows back.
  *
  * <p>The cluster input file is deleted in a {@code finally} block so a failing query or
  * assertion does not leave it behind.
  */
 @Test
 public void testImplicitSplit() throws Exception {
   final int loopSize = 20;
   String[] input = new String[loopSize];
   for (int i = 1; i <= loopSize; i++) {
     input[i - 1] = Integer.toString(i);
   }
   String inputFileName = "testImplicitSplit-input.txt";
   Util.createInputFile(cluster, inputFileName, input);
   try {
     pigServer.registerQuery("A = LOAD '" + inputFileName + "';");
     pigServer.registerQuery("B = filter A by $0<=10;");
     pigServer.registerQuery("C = filter A by $0>10;");
     pigServer.registerQuery("D = union B,C;");
     Iterator<Tuple> iter = pigServer.openIterator("D");
     if (!iter.hasNext()) {
       fail("No Output received");
     }
     int cnt = 0;
     while (iter.hasNext()) {
       // Only the number of rows matters; the tuple content is not inspected.
       iter.next();
       ++cnt;
     }
     assertEquals(loopSize, cnt);
   } finally {
     Util.deleteFile(cluster, inputFileName);
   }
 }
 /**
  * Stores the loaded relation with IntWritableConverter for keys and NullWritableConverter for
  * values, reloads it with the same converters, and validates the result via validateIndex.
  *
  * <p>Side effect: appends "-2" to the {@code tempFilename} field before storing.
  */
 @Test
 public void readWriteNullValuesRead() throws IOException {
   registerLoadQuery();
   tempFilename += "-2";

   String storeQuery =
       String.format(
           "STORE A INTO 'file:%s' USING %s('-c %s', '-c %s');",
           tempFilename,
           SequenceFileStorage.class.getName(),
           IntWritableConverter.class.getName(),
           NullWritableConverter.class.getName());
   pigServer.registerQuery(storeQuery);

   // Reload from the new location using the converters the data was written with.
   registerLoadQuery(IntWritableConverter.class, NullWritableConverter.class, null);
   validateIndex(pigServer.openIterator("A"), 2, 0, 0);
 }