public void testSkewedJoinWithNoProperties() throws IOException {
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      DataBag dbshj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery(
            "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
      {
        pigServer.registerQuery("E = join A by(id, name), B by (id, name);");
        Iterator<Tuple> iter = pigServer.openIterator("E");

        while (iter.hasNext()) {
          dbshj.add(iter.next());
        }
      }
      Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
      Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj));

    } catch (Exception e) {
      fail(e.getMessage());
    }
  }
Beispiel #2
0
  @Override
  public DataBag exec(Tuple input) throws IOException {
    try {

      DataBag bag = DefaultBagFactory.getInstance().newDefaultBag();

      if (input == null || input.size() == 0) {
        return bag; // an empty bag
      }
      if (this.fieldType == DataType.MAP) {

        Tuple t = DefaultTupleFactory.getInstance().newTuple(1);
        t.set(0, createMap(input));

        bag.add(t);

      } else {
        bag.add(input);
      }

      return bag;

    } catch (Exception e) {
      throw new RuntimeException(
          "Error while computing size in " + this.getClass().getSimpleName());
    }
  }
    @Override
    public Tuple exec(Tuple input) throws IOException {
      Tuple output = tupleFactory.newTuple();
      DataBag selected = bagFactory.newDefaultBag();
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());

      DataBag items = (DataBag) input.get(0);

      if (items != null) {
        long n = items.size();

        double q1 = getQ1(n, _samplingProbability);
        double q2 = getQ2(n, _samplingProbability);

        for (Tuple item : items) {
          double key = _rdg.nextUniform(0.0d, 1.0d);

          if (key < q1) {
            selected.add(item);
          } else if (key < q2) {
            waiting.add(new ScoredTuple(key, item).getIntermediateTuple(tupleFactory));
          }
        }

        output.append(n);
        output.append(selected);
        output.append(waiting);
      }

      return output;
    }
  // pig 1048
  public void testSkewedJoinOneValue() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE3 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE3 + "' as (id,name);");
    // Filter key with a single value

    pigServer.registerQuery("C = FILTER A by id == 400;");
    pigServer.registerQuery("D = FILTER B by id == 400;");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
        dbrj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("E = join C by id, D by id using \"skewed\";");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("E = join C by id, D by id;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbrj.add(iter.next());
      }
    }
    Assert.assertEquals(dbfrj.size(), dbrj.size());
    Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj));
  }
Beispiel #5
0
    @Override
    public Tuple exec(Tuple input) throws IOException {
      DataBag output = BagFactory.getInstance().newDefaultBag();

      DataBag samples = (DataBag) input.get(0);
      if (samples == null) {
        // do nothing
      } else if (samples.size() <= numSamples) {
        // no need to construct a reservoir, so just emit intermediate tuples
        for (Tuple sample : samples) {
          // add the score on to the intermediate tuple
          output.add(new ScoredTuple(Math.random(), sample).getIntermediateTuple(tupleFactory));
        }
      } else {
        for (Tuple sample : samples) {
          getReservoir().consider(new ScoredTuple(Math.random(), sample));
        }

        for (ScoredTuple scoredTuple : getReservoir()) {
          // add the score on to the intermediate tuple
          output.add(scoredTuple.getIntermediateTuple(tupleFactory));
        }
      }

      return tupleFactory.newTuple(output);
    }
  public void testSkewedJoinWithGroup() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = GROUP A by id;");
    pigServer.registerQuery("D = GROUP B by id;");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
        dbshj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("E = join C by group, D by group using \"skewed\" parallel 5;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("E = join C by group, D by group;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbshj.add(iter.next());
      }
    }
    Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
    Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj));
  }
  public void testSkewedJoinManyReducers() throws IOException {
    pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "2");
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE6 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE7 + "' as (id,name);");

    DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
        dbrj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 300;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("E = join A by id, B by id;");
      Iterator<Tuple> iter = pigServer.openIterator("E");

      while (iter.hasNext()) {
        dbrj.add(iter.next());
      }
    }
    Assert.assertEquals(dbfrj.size(), dbrj.size());
    Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj));
  }
  public DataBag exec(Tuple input) throws IOException {
    try {
      if (!input.isNull()) {
        // Create the output like a databag {(res1,res2),(res3,res4)..}
        DataBag output_databag = mBagFactory.newDefaultBag();
        // Unpack tuple in order to get the bag {(1,2),(3,4),...}
        String input_time = (String) input.get(0);
        try {

          DateFormat formatter = new SimpleDateFormat("MM/dd/yyyy kk:mm:ss");
          Date date =
              formatter.parse(
                  String.format(
                      "%s/%s/%s %s:%s:%s",
                      input_time.substring(5, 7),
                      input_time.substring(8, 10),
                      input_time.substring(0, 4),
                      input_time.substring(11, 13),
                      input_time.substring(14, 16),
                      input_time.substring(17, 18)));
          Calendar calendar = Calendar.getInstance();
          calendar.setTime(date);
          int dayOfWeek = calendar.get(Calendar.DAY_OF_WEEK);
          int dayOfMonth = calendar.get(Calendar.DAY_OF_MONTH);
          int hour = calendar.get(Calendar.HOUR_OF_DAY);

          // Add items to output
          Tuple items = TupleFactory.getInstance().newTuple(1);
          items.set(0, String.format("%d:%d:%d", dayOfWeek, dayOfMonth, hour));
          output_databag.add(items);

        } catch (Exception e) {
          Tuple items = TupleFactory.getInstance().newTuple(1);
          items.set(0, "petting #1" + e.getMessage());
          output_databag.add(items);
          return output_databag;
        }
        return output_databag;
      } else {
        DataBag output_databag = mBagFactory.newDefaultBag();
        Tuple items = TupleFactory.getInstance().newTuple(1);
        items.set(0, "petting #2");
        output_databag.add(items);
        return output_databag;
      }
    } catch (Exception e) {
      System.err.println("Error with ?? ..");
      DataBag output_databag = mBagFactory.newDefaultBag();
      Tuple items = TupleFactory.getInstance().newTuple(1);
      items.set(0, "petting #3" + e.getMessage());
      output_databag.add(items);
      return output_databag;
    }
  }
 private DataBag consumeBag(PushbackInputStream in, ResourceFieldSchema fieldSchema)
     throws IOException {
   if (fieldSchema == null) {
     throw new IOException("Schema is null");
   }
   ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields();
   Tuple t;
   int buf;
   while ((buf = in.read()) != '{') {
     if (buf == -1) {
       throw new IOException("Unexpect end of bag");
     }
   }
   if (fss.length != 1) throw new IOException("Only tuple is allowed inside bag schema");
   ResourceFieldSchema fs = fss[0];
   DataBag db = DefaultBagFactory.getInstance().newDefaultBag();
   while (true) {
     t = consumeTuple(in, fs);
     if (t != null) db.add(t);
     while ((buf = in.read()) != '}' && buf != ',') {
       if (buf == -1) {
         throw new IOException("Unexpect end of bag");
       }
     }
     if (buf == '}') break;
   }
   return db;
 }
Beispiel #10
0
  @Override
  public DataBag exec(Tuple input) throws IOException {
    DataBag bag = (DataBag) input.get(0);
    HashMap<String, Double> clsCnt = new HashMap<String, Double>();
    Iterator<Tuple> it = bag.iterator();
    Double sum = new Double(0.0);
    while (it.hasNext()) {
      Tuple item = (Tuple) it.next();
      String cls = (String) item.get(3);
      if (cls != null && cls.length() > 0) {
        Double cur = clsCnt.get(cls);
        Double inc = (Double) item.get(2);
        if (cur != null) {
          clsCnt.put(cls, cur + inc);
        } else {
          clsCnt.put(cls, inc);
        }
        sum += inc;
      }
    }

    Set<Entry<String, Double>> clses = clsCnt.entrySet();
    Iterator<Entry<String, Double>> cit = clses.iterator();
    DataBag result = bagFactory.newDefaultBag();
    while (cit.hasNext()) {
      Entry<String, Double> cls = cit.next();
      Tuple tpl = tupleFactory.newTuple(2);
      tpl.set(0, cls.getKey());
      tpl.set(1, cls.getValue() / sum);
      result.add(tpl);
    }

    return result;
  }
Beispiel #11
0
  public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
      return null;
    }

    String normStr = ((String) input.get(0));
    if (normStr == null) {
      return null;
    }

    // Remove punctuation except when it's a version number
    normStr = punctPattern.matcher(normStr.trim().toLowerCase()).replaceAll(" ");
    normStr = spacePattern.matcher(normStr).replaceAll(" ");

    DataBag output = bagFactory.newDefaultBag();
    for (String s : spacePattern.split(normStr.trim())) {
      if (s.length() <= 30) {
        Tuple t = tupleFactory.newTuple(1);
        t.set(0, s);
        output.add(t);
      }
    }

    return output;
  }
    @Override
    public DataBag exec(Tuple input) throws IOException {
      DataBag bag = (DataBag) input.get(0);
      long n = 0L;
      DataBag selected = bagFactory.newDefaultBag();
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());

      for (Tuple innerTuple : bag) {
        n += (Long) innerTuple.get(0);
        selected.addAll((DataBag) innerTuple.get(1));
        waiting.addAll((DataBag) innerTuple.get(2));
      }

      long sampleSize = (long) Math.ceil(_samplingProbability * n);
      long nNeeded = sampleSize - selected.size();

      for (Tuple scored : waiting) {
        if (nNeeded <= 0) {
          break;
        }
        selected.add(ScoredTuple.fromIntermediateTuple(scored).getTuple());
        nNeeded--;
      }

      return selected;
    }
  @Test
  public void testCase1() throws IOException {

    Tuple input = TupleFactory.getInstance().newTuple(2);
    Tuple groupInfo = TupleFactory.getInstance().newTuple(2);

    groupInfo.set(0, "column_3");
    groupInfo.set(1, Integer.valueOf(1));

    DataBag dataBag = new DefaultDataBag();
    // {(PSIColumn: int, columnId: int, value: chararray, tag: boolean , rand: int)}
    for (int i = 0; i < 10; i++) {
      Tuple t = TupleFactory.getInstance().newTuple(4);
      t.set(0, Integer.valueOf(1));
      t.set(1, Integer.valueOf(1));
      t.set(2, array[i]);
      dataBag.add(t);
    }

    input.set(0, groupInfo);
    input.set(1, dataBag);

    Tuple output = inst.exec(input);

    Assert.assertEquals(output.get(0), 1);

    String[] outputArray =
        output.get(1).toString().split(String.valueOf(CalculateStatsUDF.CATEGORY_VAL_SEPARATOR));

    Assert.assertEquals(outputArray[0], "1");
    Assert.assertEquals(outputArray[1], "2");
    Assert.assertEquals(outputArray[2], "0");
  }
Beispiel #14
0
  @Override
  public DataBag extract(Object o, String lang) {

    DocumentMetadata dm = (DocumentMetadata) o;
    DataBag db = new DefaultDataBag();
    DiacriticsRemover DR = new DiacriticsRemover();

    for (TextWithLanguage title : dm.getBasicMetadata().getTitleList()) {
      if (lang != null && !lang.equalsIgnoreCase(title.getLanguage())) {
        continue;
      }
      String sTitle = title.getText();
      String normalized_title = (String) DR.normalize(sTitle);
      if (normalized_title == null) {
        continue;
      }
      String[] normals = normalized_title.split("[\\W]+");
      for (String s : normals) {
        if (s.isEmpty()) {
          continue;
        }
        Object normalized = normalizeExtracted(s);
        if (normalized == null) {
          continue;
        }
        Tuple t = TupleFactory.getInstance().newTuple(normalized);
        db.add(t);
      }
    }

    return db;
  }
Beispiel #15
0
 @Override
 public DataBag getValue() {
   DataBag output = BagFactory.getInstance().newDefaultBag();
   for (ScoredTuple sample : getReservoir()) {
     output.add(sample.getTuple());
   }
   return output;
 }
 /**
  * create bag having given number of tuples
  *
  * @param size
  * @return
  */
 private DataBag createBag(int size) {
   Tuple innerTuple = TupleFactory.getInstance().newTuple();
   innerTuple.append(Integer.valueOf(1));
   DataBag bag = BagFactory.getInstance().newDefaultBag();
   for (int i = 0; i < size; i++) {
     bag.add(innerTuple);
   }
   return bag;
 }
  @SuppressWarnings("unchecked")
  @Override
  public DataBag exec(Tuple input) throws IOException {
    if (input.size() < 2) {
      throw new RuntimeException("Expected at least two inputs, but found " + input.size());
    }

    for (Object o : input) {
      if (o != null && !(o instanceof DataBag)) {
        throw new RuntimeException("Inputs must be bags");
      }
    }

    DataBag outputBag = bagFactory.newDefaultBag();

    DataBag bag1 = (DataBag) input.get(0);
    DataBag bag2 = (DataBag) input.get(1);

    if (bag1 == null || bag1.size() == 0) {
      return outputBag;
    }
    // optimization
    else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) {
      return bag1;
    }

    PriorityQueue<Pair> pq = loadBags(input);

    Tuple lastData = null;

    while (true) {
      Pair nextPair = pq.peek();

      // ignore data we've already encountered
      if (nextPair.data.compareTo(lastData) != 0) {
        // Only take data from the first bag, where there are no other
        // bags that have the same data.
        if (nextPair.index.equals(0) && countMatches(pq) == 0) {
          outputBag.add(nextPair.data);
          lastData = nextPair.data;
        }
      }

      Pair p = pq.poll();

      // only put the bag back into the queue if it still has data
      if (p.hasNext()) {
        p.next();
        pq.offer(p);
      } else if (p.index.equals(0)) {
        // stop when we exhaust all elements from the first bag
        break;
      }
    }

    return outputBag;
  }
  @Override
  public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
      return null;
    }

    Object obj = null;
    Integer limnum = null;
    try {
      obj = (DataByteArray) input.get(1);

    } catch (ExecException e) {
      logger.error("Error in reading field proto:", e);
      throw e;
    }

    try {
      limnum = (Integer) input.get(2);
    } catch (ExecException e) {
      logger.error("Error in reading baglimit:", e);
      throw e;
    }

    DataByteArray dba = null;
    try {
      dba = (DataByteArray) obj;
    } catch (ClassCastException e) {
      logger.error("Error in casting Object (" + input.getType(1) + ") to DataByteArray:", e);
      throw e;
    }

    DocumentMetadata dm = null;
    try {
      dm = DocumentMetadata.parseFrom(dba.get());
    } catch (InvalidProtocolBufferException e) {
      logger.error("Error in reading ByteArray to DocumentMetadata:", e);
      throw e;
    }

    String key = dm.getKey();
    DataBag db = new DefaultDataBag();
    int bagsize = 0;
    for (ClassifCode code : dm.getBasicMetadata().getClassifCodeList()) {
      for (String co_str : code.getValueList()) {
        bagsize++;
        db.add(TupleFactory.getInstance().newTuple(co_str));
      }
    }
    if (bagsize > limnum) {
      Object[] to = new Object[] {key, db, bagsize};
      return TupleFactory.getInstance().newTuple(Arrays.asList(to));
    }
    return null;
  }
Beispiel #19
0
  @Override
  public void setInput(Map<String, Block> input, JsonNode operatorJson, BlockProperties props)
      throws IOException, InterruptedException {
    inputBlock = input.values().iterator().next();

    init(operatorJson, inputBlock.getProperties().getSchema());

    nullBag = BagFactory.getInstance().newDefaultBag();

    nullBag.add(TupleFactory.getInstance().newTuple(0));
  }
  @Override
  public DataBag exec(Tuple input) throws IOException {
    retrieveContextValues();

    ArrayList<String> joinKeyNames = new ArrayList<String>();
    for (int i = 1; i < input.size(); i += 2) {
      joinKeyNames.add((String) input.get(i));
    }

    JoinCollector collector = new JoinCollector();
    // the first bag is the outer bag
    String leftBagName = bagNames.get(0);
    DataBag leftBag = getBag(input, leftBagName);
    String leftBagJoinKeyName =
        getPrefixedAliasName(bagNameToJoinKeyPrefix.get(leftBagName), joinKeyNames.get(0));
    collector.setJoinData(collector.groupTuples(leftBag, leftBagJoinKeyName));
    // now, for each additional bag, group up the tuples by the join key, then join them in
    if (bagNames.size() > 1) {
      for (int i = 1; i < bagNames.size(); i++) {
        String bagName = bagNames.get(i);
        DataBag bag = getBag(input, bagName);
        String joinKeyName =
            getPrefixedAliasName(bagNameToJoinKeyPrefix.get(bagName), joinKeyNames.get(i));
        int tupleSize = bagNameToSize.get(bagName);
        if (bag == null)
          throw new IOException(
              "Error in instance: "
                  + getInstanceName()
                  + " with properties: "
                  + getInstanceProperties()
                  + " and tuple: "
                  + input.toDelimitedString(", ")
                  + " -- Expected bag, got null");
        HashMap<Object, List<Tuple>> groupedData = collector.groupTuples(bag, joinKeyName);
        // outer join, so go back in and add nulls;
        groupedData = collector.insertNullTuples(groupedData, tupleSize);
        for (Map.Entry<Object, List<Tuple>> entry : groupedData.entrySet()) {
          collector.joinTuples(entry.getKey(), entry.getValue());
        }
      }
    }

    // assemble output bag
    DataBag outputBag = BagFactory.getInstance().newDefaultBag();
    for (List<Tuple> tuples : collector.getJoinData().values()) {
      for (Tuple tuple : tuples) {
        outputBag.add(tuple);
      }
    }

    return outputBag;
  }
  public void testSkewedJoinOuter() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery("C = join A by id left, B by id using \"skewed\";");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
      {
        pigServer.registerQuery("C = join A by id right, B by id using \"skewed\";");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
      {
        pigServer.registerQuery("C = join A by id full, B by id using \"skewed\";");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
    } catch (Exception e) {
      System.out.println(e.getMessage());
      e.printStackTrace();
      fail("Should support outer join in skewed join");
    }
    return;
  }
  public DataBag exec(Tuple input) throws IOException {
    try {
      DataBag bag = BagFactory.getInstance().newDefaultBag();

      for (int i = 0; i < input.size(); i++) {
        final Object object = input.get(i);
        if (object instanceof Tuple) {
          for (int j = 0; j < ((Tuple) object).size(); j++) {
            Tuple tp2 = TupleFactory.getInstance().newTuple(1);
            tp2.set(0, ((Tuple) object).get(j));
            bag.add(tp2);
          }
        } else {
          Tuple tp2 = TupleFactory.getInstance().newTuple(1);
          tp2.set(0, object);
          bag.add(tp2);
        }
      }

      return bag;
    } catch (Exception ee) {
      throw new RuntimeException("Error while creating a bag", ee);
    }
  }
  public void testSkewedJoinReducers() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery("C = join A by id, B by id using \"skewed\" parallel 1;");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
    } catch (Exception e) {
      fail("Should not throw exception, should continue execution");
    }
  }
  @Override
  public DataBag exec(Tuple input) throws IOException {

    if (input == null || input.size() == 0) {
      return null;
    }

    try {
      DataByteArray dba = null;
      try {
        dba = (DataByteArray) input.get(0);
      } catch (ExecException e) {
        logger.error("Error in reading field:", e);
        throw e;
      }

      DocumentWrapper dm = null;
      try {
        dm = DocumentWrapper.parseFrom(dba.get());
      } catch (Exception e) {
        logger.error("Error in reading ByteArray to DocumentMetadata:", e);
        throw e;
      }

      DataBag ret = new DefaultDataBag();
      DataByteArray metadata = new DataByteArray(dm.getDocumentMetadata().toByteArray());

      List<Author> authors = dm.getDocumentMetadata().getBasicMetadata().getAuthorList();

      for (int i = 0; i < authors.size(); i++) {
        String sname = authors.get(i).getSurname();
        Object[] to = new Object[] {sname, metadata, i};
        Tuple t = TupleFactory.getInstance().newTuple(Arrays.asList(to));
        ret.add(t);
      }

      return ret;

    } catch (Exception e) {
      logger.error("Error in processing input row:", e);
      throw new IOException(
          "Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e));
    }
  }
Beispiel #25
0
  public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) return null;

    // Output bag
    DataBag bagOfTokens = bagFactory.newDefaultBag();

    StringReader textInput = new StringReader(input.get(0).toString());
    PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), "");

    for (CoreLabel label; ptbt.hasNext(); ) {
      label = (CoreLabel) ptbt.next();
      if (label.value().length() > 2) {
        System.err.println(label.toString());
        Tuple termText = tupleFactory.newTuple(label.word());
        bagOfTokens.add(termText);
      }
    }
    return bagOfTokens;
  }
Beispiel #26
0
    @Override
    public DataBag exec(Tuple input) throws IOException {
      DataBag bagOfSamples = (DataBag) input.get(0);
      for (Tuple innerTuple : bagOfSamples) {
        DataBag samples = (DataBag) innerTuple.get(0);

        for (Tuple sample : samples) {
          // use the same score as previously generated
          getReservoir().consider(ScoredTuple.fromIntermediateTuple(sample));
        }
      }

      DataBag output = BagFactory.getInstance().newDefaultBag();
      for (ScoredTuple scoredTuple : getReservoir()) {
        // output the original tuple
        output.add(scoredTuple.getTuple());
      }

      return output;
    }
  public void testSkewedJoin3Way() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
    pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery("D = join A by id, B by id, C by id using \"skewed\" parallel 5;");
        Iterator<Tuple> iter = pigServer.openIterator("D");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
    } catch (Exception e) {
      return;
    }

    fail("Should throw exception, do not support 3 way join");
  }
 @SuppressWarnings("unchecked")
 private void accumulateData() throws ExecException {
   int count = 0;
   int length = inputs.size() - 1;
   inputBags = new DataBag[length];
   its = new Iterator[length];
   for (int i = 0; i < length; ++i) {
     PhysicalOperator op = inputs.get(i);
     DataBag bag = BagFactory.getInstance().newDefaultBag();
     inputBags[count] = bag;
     for (Result res = op.getNextTuple();
         res.returnStatus != POStatus.STATUS_EOP;
         res = op.getNextTuple()) {
       if (res.returnStatus == POStatus.STATUS_NULL) continue;
       if (res.returnStatus == POStatus.STATUS_ERR)
         throw new ExecException("Error accumulating data in the local Cross operator");
       if (res.returnStatus == POStatus.STATUS_OK) bag.add((Tuple) res.result);
     }
     its[count++] = bag.iterator();
   }
 }
  public void testSkewedJoinMapKey() throws IOException {
    pigServer.registerQuery("A = LOAD '" + INPUT_FILE4 + "' as (m:[]);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE4 + "' as (n:[]);");
    try {
      DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
      {
        pigServer.registerQuery(
            "C = join A by (chararray)m#'a100', B by (chararray)n#'a100' using \"skewed\" parallel 20;");
        Iterator<Tuple> iter = pigServer.openIterator("C");

        while (iter.hasNext()) {
          dbfrj.add(iter.next());
        }
      }
    } catch (Exception e) {
      System.out.println(e.getMessage());
      e.printStackTrace();
      fail("Should support maps and expression operators as keys");
    }

    return;
  }
  private Pair<String, DataBag> extractLangKeywords(DocumentMetadata dm) {
    List<String> kws = new ArrayList<String>();
    Set<String> ctgs = new HashSet<String>();

    for (KeywordsList kwl : dm.getKeywordsList()) {
      if (language.equalsIgnoreCase(kwl.getLanguage())) {
        for (String str : kwl.getKeywordsList()) {

          if (isClassifCode(str)) {
            ctgs.add(str);
            continue;
          }

          if (action == Action.TRANSLATE) {
            str = translateNonAlphaNumeric(str);
          } else if (action == Action.REMOVE_KEYCHARACTERS) {
            str = removeAllKeyPunctations(str);
          } else {
            str = removeAllNonAlphaNumeric(str);
          }

          kws.add(str);
        }
      }
    }

    for (ClassifCode cc : dm.getBasicMetadata().getClassifCodeList()) {
      for (String s : cc.getValueList()) {
        ctgs.add(s);
      }
    }

    DataBag db = new DefaultDataBag();
    for (String s : ctgs) {
      db.add(TupleFactory.getInstance().newTuple(s));
    }

    return new Pair<String, DataBag>(Joiner.on(" ").join(kws), db);
  }