    @Override
    public Tuple exec(Tuple input) throws IOException {
      Tuple output = tupleFactory.newTuple();
      DataBag selected = bagFactory.newDefaultBag();
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());

      DataBag items = (DataBag) input.get(0);

      if (items != null) {
        long n = items.size();

        double q1 = getQ1(n, _samplingProbability);
        double q2 = getQ2(n, _samplingProbability);

        for (Tuple item : items) {
          double key = _rdg.nextUniform(0.0d, 1.0d);

          if (key < q1) {
            selected.add(item);
          } else if (key < q2) {
            waiting.add(new ScoredTuple(key, item).getIntermediateTuple(tupleFactory));
          }
        }

        output.append(n);
        output.append(selected);
        output.append(waiting);
      }

      return output;
    }
  protected Tuple getValueTuple(NullableTuple ntup, int index) throws ExecException {
    // Need to make a copy of the value, as hadoop uses the same ntup
    // to represent each value.
    Tuple val = (Tuple) ntup.getValueAsPigType();

    Tuple copy = null;
    // The "value (val)" that we just got may not
    // be the complete "value". It may have some portions
    // in the "key" (look in POLocalRearrange for more comments)
    // If this is the case we need to stitch
    // the "value" together.
    Pair<Boolean, Map<Integer, Integer>> lrKeyInfo = keyInfo.get(index);
    boolean isProjectStar = lrKeyInfo.first;
    Map<Integer, Integer> keyLookup = lrKeyInfo.second;
    int keyLookupSize = keyLookup.size();

    if (keyLookupSize > 0) {

      // we have some fields of the "value" in the
      // "key".
      copy = mTupleFactory.newTuple();
      int finalValueSize = keyLookupSize + val.size();
      int valIndex = 0; // an index for accessing elements from
      // the value (val) that we have currently
      for (int i = 0; i < finalValueSize; i++) {
        Integer keyIndex = keyLookup.get(i);
        if (keyIndex == null) {
          // the field for this index is not in the
          // key - so just take it from the "value"
          // we were handed
          copy.append(val.get(valIndex));
          valIndex++;
        } else {
          // the field for this index is in the key
          if (isKeyTuple) {
            // the key is a tuple, extract the
            // field out of the tuple
            copy.append(keyAsTuple.get(keyIndex));
          } else {
            copy.append(key);
          }
        }
      }

    } else if (isProjectStar) {

      // the whole "value" is present in the "key"
      copy = mTupleFactory.newTuple(keyAsTuple.getAll());

    } else {

      // there is no field of the "value" in the
      // "key" - so just make a copy of what we got
      // as the "value"
      copy = mTupleFactory.newTuple(val.getAll());
    }
    return copy;
  }
Example #3
 @Before
 public void setUp() throws Exception {
   // New args and tuple to fuse for each test:
   arg = new DefaultTuple();
   arg.append(null);
   arg.append(null);
   arg.append(null);
   toFuse = new DefaultTuple();
 }
    @Override
    public void map(
        LongWritable key,
        Text value,
        OutputCollector<BytesWritable, Tuple> output,
        Reporter reporter)
        throws IOException {
      // value should contain "word count"
      String[] wdct = value.toString().split(" ");
      if (wdct.length != 2) {
        // LOG the error
        return;
      }

      byte[] word = wdct[0].getBytes();
      bytesKey.set(word, 0, word.length);
      System.out.println("word: " + new String(word));
      tupleRow.set(0, new String(word));
      tupleRow.set(1, Integer.parseInt(wdct[1]));
      System.out.println("count:  " + Integer.parseInt(wdct[1]));

      // This key has to be created by user
      /*
       * Tuple userKey = new DefaultTuple(); userKey.append(new String(word));
       * userKey.append(Integer.parseInt(wdct[1]));
       */
      System.out.println("in map, sortkey: " + sortKey);
      Tuple userKey = new ZebraTuple();
      if (sortKey.equalsIgnoreCase("word,count")) {
        userKey.append(new String(word));
        userKey.append(Integer.parseInt(wdct[1]));
      }

      if (sortKey.equalsIgnoreCase("count")) {
        userKey.append(Integer.parseInt(wdct[1]));
      }

      if (sortKey.equalsIgnoreCase("word")) {
        userKey.append(new String(word));
      }

      try {

        /* New M/R Interface */
        /* Converts user key to zebra BytesWritable key */
        /* using sort key expr tree */
        /* Returns a java base object */
        /* Done for each user key */

        bytesKey = BasicTableOutputFormat.getSortKey(javaObj, userKey);
      } catch (Exception e) {
        // conversion to the zebra sort key failed; LOG the error (currently ignored)
      }

      output.collect(bytesKey, tupleRow);
    }
 private void setCrossTaskDownstreamTaint(Tuple keys, Set<String> tags) {
   for (String neighbor : crossTaskDownstreamNeighbors) {
     Tuple body = new DefaultTuple();
     body.append("cross");
     body.append(keys);
     for (String tag : tags) {
       body.append(tag);
     }
     senderReceiver.sendAsync(
         new Message(Message.Type.TAINT, location, new LogicalLocation(neighbor), body));
   }
 }
  /**
   * test sedes of int of diff sizes
   *
   * @throws IOException
   */
  @Test
  public void testTupleWriteReadIntDiffSizes() throws IOException {
    // create a tuple with integer columns of different sizes
    Tuple tuple = TupleFactory.getInstance().newTuple();
    tuple.append(new Integer(0)); // boolean rep
    tuple.append(new Integer(1)); // boolean rep
    tuple.append(new Integer(125)); // fits into byte
    tuple.append(new Integer(1024)); // fits into short
    tuple.append(new Integer(1024 * 1024 * 1024)); // fits into int (=~ 2 ^30)

    testTupleSedes(tuple);
  }
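The testTupleSedes helper used throughout these tests is not shown in this listing. A plausible sketch (assuming Pig's InterSedes/BinInterSedes round-trip serializer; the comparison on the string form mirrors the other assertions here):

  private void testTupleSedes(Tuple tuple) throws IOException {
    InterSedes sedes = InterSedesFactory.getInterSedesInstance();
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    sedes.writeDatum(new DataOutputStream(bos), tuple);
    Object copy =
        sedes.readDatum(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
    assertEquals(tuple.toString(), copy.toString());
  }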
Example #7
  @Test
  public void testGoodArgs() throws IOException {
    toFuse.append("notIncluded");
    toFuse.append("foo");
    toFuse.append("bar");
    // Fuse cols 1 to end of tuple:
    arg.set(ConcatColumns.SLICE_SPEC_POS, "1:4");
    arg.set(ConcatColumns.TUPLE_TO_FUSE_POS, toFuse);
    String fusedStr = new ConcatColumns().exec(arg);
    assertEquals("foobar", fusedStr);

    arg.set(ConcatColumns.SLICE_SPEC_POS, "1:1");
    fusedStr = new ConcatColumns().exec(arg);
    assertEquals("foo", fusedStr);

    arg.set(ConcatColumns.SLICE_SPEC_POS, ":1");
    fusedStr = new ConcatColumns().exec(arg);
    assertEquals("notIncluded", fusedStr);

    arg.set(ConcatColumns.SLICE_SPEC_POS, "0:");
    fusedStr = new ConcatColumns().exec(arg);
    assertEquals("notIncludedfoobar", fusedStr);

    arg.set(ConcatColumns.SLICE_SPEC_POS, ":");
    fusedStr = new ConcatColumns().exec(arg);
    assertEquals("notIncludedfoobar", fusedStr);

    arg.set(ConcatColumns.SLICE_SPEC_POS, "1:-1");
    fusedStr = new ConcatColumns().exec(arg);
    assertEquals("foo", fusedStr);

    arg.set(ConcatColumns.SLICE_SPEC_POS, ":-1");
    fusedStr = new ConcatColumns().exec(arg);
    assertEquals("notIncludedfoo", fusedStr);

    arg.set(ConcatColumns.SLICE_SPEC_POS, ":-2");
    fusedStr = new ConcatColumns().exec(arg);
    assertEquals("notIncluded", fusedStr);

    arg.set(ConcatColumns.SLICE_SPEC_POS, ":-3");
    fusedStr = new ConcatColumns().exec(arg);
    assertEquals("notIncluded", fusedStr);

    arg.set(ConcatColumns.SLICE_SPEC_POS, "1:2");
    fusedStr = new ConcatColumns().exec(arg);
    assertEquals("foo", fusedStr);

    arg.set(ConcatColumns.SLICE_SPEC_POS, "1:3");
    arg.set(ConcatColumns.CONCAT_SEPARATOR_POS, "|");
    fusedStr = new ConcatColumns().exec(arg);
    assertEquals("foo|bar", fusedStr);
  }
  /**
   * test sedes with maps of diff sizes
   *
   * @throws IOException
   */
  @Test
  public void testTupleWriteReadMapDiffSizes() throws IOException {
    // tuple with maps of different sizes
    Tuple tuple = TupleFactory.getInstance().newTuple();
    Map<String, Object> tinyMap = createMap(10);
    Map<String, Object> smallMap = createMap(1000);
    Map<String, Object> largeMap = createMap(100 * 1024);
    tuple.append(tinyMap);
    tuple.append(smallMap);
    tuple.append(largeMap);

    testTupleSedes(tuple);
  }
  /**
   * test sedes with bags of diff sizes
   *
   * @throws IOException
   */
  @Test
  public void testTupleWriteReadBagDiffSizes() throws IOException {
    // tuple with bags of different sizes
    Tuple tuple = TupleFactory.getInstance().newTuple();
    DataBag tinyBag = createBag(10);
    DataBag smallBag = createBag(1000);
    DataBag largeBag = createBag(100 * 1024);

    tuple.append(tinyBag);
    tuple.append(smallBag);
    tuple.append(largeBag);

    testTupleSedes(tuple);
  }
 private void setWithinTaskDownstreamTaint(Set<String> tags) {
   if (!withinTaskDownstreamTaint.equals(tags)) {
     Tuple body = new DefaultTuple();
     body.append("within");
     for (String tag : tags) {
       body.append(tag);
     }
     for (String neighbor : withinTaskDownstreamNeighbors) {
       sendWithinTaskMessage(
           neighbor,
           new Message(Message.Type.TAINT, location, new LogicalLocation(neighbor), body));
     }
     withinTaskDownstreamTaint = tags;
   }
 }
  @Override
  public Tuple exec(Tuple input) throws IOException {
    TupleFactory tFactory = TupleFactory.getInstance();
    Tuple oTuple = tFactory.newTuple();
    Tuple sketchTupleA = (Tuple) input.get(0);
    Tuple sketchTupleB = (Tuple) input.get(1);

    List<Object> fieldsA = sketchTupleA.getAll();
    List<Object> fieldsB = sketchTupleB.getAll();

    int count = fieldsA.size() * 2;
    int match = 0;

    for (int i = 0; i < fieldsA.size(); i++) {
      int a = (Integer) fieldsA.get(i);
      for (int j = 0; j < fieldsB.size(); j++) {
        int b = (Integer) fieldsB.get(j);
        if (a == b) {
          match += 2;
        }
      }
    }

    double sim = (double) match / (double) count;

    oTuple.append(sim);
    return oTuple;
  }
 private static Tuple extractKeys(Tuple t, List<Integer> keyFields) throws ExecException {
   Tuple keys = new DefaultTuple();
   for (int keyField : keyFields) {
     keys.append(t.get(keyField));
   }
   return keys;
 }
 private Tuple createTupleWithManyCols(int size) {
   Tuple t = TupleFactory.getInstance().newTuple(); // start empty so append() below yields exactly 'size' columns
   Integer col = Integer.valueOf(1);
   for (int i = 0; i < size; i++) {
     t.append(col);
   }
   return t;
 }
Example #14
  // Construct a tuple that represents this json:
  //   {"stacks":[[[4,3],[2,1]], [[1,2],[3,4]]]}
  public Tuple getTestTuple() {
    TupleFactory tupleFactory = TupleFactory.getInstance();
    Tuple tAll = tupleFactory.newTuple();
    Tuple t1 = tupleFactory.newTuple();
    Tuple t1a = tupleFactory.newTuple();
    t1a.append(4);
    t1a.append(3);
    Tuple t1b = tupleFactory.newTuple();
    t1b.append(2);
    t1b.append(1);
    t1.append(t1a);
    t1.append(t1b);
    Tuple t2 = tupleFactory.newTuple();
    Tuple t2a = tupleFactory.newTuple();
    t2a.append(1);
    t2a.append(2);
    Tuple t2b = tupleFactory.newTuple();
    t2b.append(3);
    t2b.append(4);
    t2.append(t2a);
    t2.append(t2b);
    tAll.append(t1);
    tAll.append(t2);

    return tAll;
  }
 /**
  * create bag having given number of tuples
  *
  * @param size number of tuples to add to the bag
  * @return bag containing the same single-column tuple {@code size} times
  */
 private DataBag createBag(int size) {
   Tuple innerTuple = TupleFactory.getInstance().newTuple();
   innerTuple.append(Integer.valueOf(1));
   DataBag bag = BagFactory.getInstance().newDefaultBag();
   for (int i = 0; i < size; i++) {
     bag.add(innerTuple);
   }
   return bag;
 }
Example #16
 @Test(expected = IOException.class)
 public void testBadSliceDefs() throws IOException {
   toFuse.append("foo");
   arg.set(ConcatColumns.SLICE_SPEC_POS, "3:1");
   arg.set(ConcatColumns.CONCAT_SEPARATOR_POS, "");
   arg.set(ConcatColumns.TUPLE_TO_FUSE_POS, toFuse);
   // Start > end:
   new ConcatColumns().exec(arg);
 }
 public void joinTuples(Object key, List<Tuple> tuples) throws ExecException {
   List<Tuple> currentTuples = joinData.get(key);
   if (currentTuples != null) {
     List<Tuple> newTuples = new LinkedList<Tuple>();
     if (tuples != null) {
       for (Tuple t1 : currentTuples) {
         for (Tuple t2 : tuples) {
           Tuple t = TupleFactory.getInstance().newTuple();
           for (Object o : t1.getAll()) {
             t.append(o);
           }
           for (Object o : t2.getAll()) {
             t.append(o);
           }
           newTuples.add(t);
         }
       }
     }
     joinData.put(key, newTuples);
   }
 }
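A hypothetical usage sketch for joinTuples (the key and field values are illustrative, and it assumes the enclosing class exposes the joinData map seeded below): one stored tuple (1, "a") crossed with incoming tuples ("x") and ("y") is replaced by the two concatenations.

 @Test
 public void testJoinTuplesBuildsCrossProduct() throws ExecException {
   TupleFactory tf = TupleFactory.getInstance();

   Tuple stored = tf.newTuple();
   stored.append(1);
   stored.append("a");
   joinData.put("k", new LinkedList<Tuple>(Arrays.asList(stored)));

   Tuple x = tf.newTuple();
   x.append("x");
   Tuple y = tf.newTuple();
   y.append("y");

   joinTuples("k", Arrays.asList(x, y));

   List<Tuple> joined = joinData.get("k");
   assertEquals(2, joined.size());
   assertEquals("(1,a,x)", joined.get(0).toString());
   assertEquals("(1,a,y)", joined.get(1).toString());
 }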
  /**
   * test sedes of bytearray, string of diff sizes
   *
   * @throws IOException
   */
  @Test
  public void testTupleWriteReadByteArrStringDiffSizes() throws IOException {
    // tuple with ByteArray and strings of different sizes
    Tuple tuple = TupleFactory.getInstance().newTuple();
    byte[] tinyBA = new byte[10];
    byte[] smallBA = new byte[1000];
    byte[] largeBytearray = new byte[80000];
    // init large bytearray with non-0 values; it's going to be used as
    // a string as well
    for (int i = 0; i < largeBytearray.length; i++) {
      largeBytearray[i] = '1';
    }
    tuple.append(new DataByteArray(tinyBA));
    tuple.append(new DataByteArray(smallBA));
    tuple.append(new DataByteArray(largeBytearray));

    testTupleSedes(tuple);

    // add strings of different sizes
    tuple = TupleFactory.getInstance().newTuple();
    tuple.append(new String(""));
    tuple.append(new String("x"));
    // string larger than 32k
    tuple.append(new String(largeBytearray));

    testTupleSedes(tuple);
  }
  private Tuple createTuple(Tuple[] data) throws ExecException {
    Tuple out = TupleFactory.getInstance().newTuple();

    for (int i = 0; i < data.length; ++i) {
      Tuple t = data[i];
      int size = t.size();
      for (int j = 0; j < size; ++j) {
        out.append(t.get(j));
      }
    }

    return illustratorMarkup(out, out, 0);
  }
  @Test
  public void test() throws IOException, URISyntaxException {
    // Prepare Resource File
    URL metricRes = EndpointGroupsTest.class.getResource("/avro/poem_sync_v2.avro");
    File metricAvro = new File(metricRes.toURI());

    UnwindServiceMetrics uw = new UnwindServiceMetrics("", "test");

    uw.mpsMgr.loadAvro(metricAvro);

    TupleFactory tf = TupleFactory.getInstance();

    Tuple inTuple = tf.newTuple();

    inTuple.append("SRMv2");
    inTuple.append("se01.afroditi.hellasgrid.gr");

    String jsonStr =
        IOUtils.toString(this.getClass().getResourceAsStream("/ar/missing_endpoint.json"), "UTF-8");
    Tuple expTuple = JsonToPig.jsonToTuple(jsonStr);
    Tuple outTuple = uw.exec(inTuple);

    assertEquals(expTuple.toString(), outTuple.toString());
  }
  /**
   * test sedes of long of diff sizes
   *
   * @throws IOException
   */
  @Test
  public void testTupleWriteReadLongDiffSizes() throws IOException {
    Random r = new Random(100L);

    Tuple tuple = TupleFactory.getInstance().newTuple();

    tuple.append(new Long(0));
    tuple.append(new Long(1));
    tuple.append(new Long(-1));
    tuple.append(new Long(300));
    tuple.append(new Long(600));
    tuple.append(new Long(10000));
    tuple.append(new Long(-10000));
    tuple.append(new Long(5000000000000000000L));
    tuple.append(new Long(-5000000000000000000L));

    for (int i = 0; i < 100000; i++) {
      tuple.append(new Long(r.nextLong()));
    }

    testTupleSedes(tuple);
  }
Example #22
  @Test
  public void testExecNestedTuple() throws IOException {
    Tuple input = tupleFactory.newTuple();
    input.append("{\"stacks\":[[[4,3],[2,1]], [[1,2],[3,4]]]}");
    Map<String, Object> myMap = jsonMap.exec(input);
    Tuple stacks = (Tuple) myMap.get("stacks");

    System.out.println(stacks);

    Tuple reference = getTestTuple();

    assertEquals(reference.toString(), stacks.toString());

    assertEquals(reference.size(), stacks.size());
    for (int i = 0; i < reference.size(); i++) {
      Tuple r = (Tuple) reference.get(i);
      Tuple s = (Tuple) stacks.get(i);
      assertEquals(r.size(), s.size());
      for (int j = 0; j < r.size(); j++) {
        System.out.println("Checking if " + r.get(j) + " == " + s.get(j));
        assertEquals(r.get(j), s.get(j));
      }
    }
  }
  @Override
  public Tuple exec(Tuple input) throws IOException {

    myreporter = PigStatusReporter.getInstance();

    if (input == null || input.size() == 0) {
      return null;
    }

    try {
      DataByteArray dba = null;
      DocumentMetadata dm = null;
      String title = null;
      String doi = null;
      String year = null;

      try {
        dba = (DataByteArray) input.get(0);
      } catch (Exception e) {
        myreporter.getCounter("extraction problems", "DataByteArray from tuple").increment(1);
        return null;
      }

      try {
        dm = DocumentWrapper.parseFrom(dba.get()).getDocumentMetadata();
      } catch (Exception e) {
        myreporter.getCounter("extraction problems", "document metadata").increment(1);
        return null;
      }

      try {
        for (TextWithLanguage twl : dm.getBasicMetadata().getTitleList()) {
          if (twl.getLanguage().toLowerCase().startsWith("en")) {
            title = twl.getText();

            break;
          }
        }
        if (title == null) {
          title = dm.getBasicMetadata().getTitle(0).getText();
        }
        if (title != null && !title.trim().isEmpty()) {
          title = DiacriticsRemover.removeDiacritics(title);
          title = title.replaceAll("[^A-Za-z0-9\\-_]", " ").replaceAll("\\s++", " ").trim();
        }
      } catch (Exception e) {
        // ignore; a missing title is handled in the finally block below
      } finally {
        if (title == null || title.trim().isEmpty()) {
          myreporter.getCounter("extraction problems", "title extraction").increment(1);
          return null;
        }
      }

      try {
        doi = dm.getBasicMetadata().getDoi().replaceAll("\\s++", " ").trim();
      } catch (Exception e) {
        // ignore; a missing DOI is handled in the finally block below
      } finally {
        if (doi == null || doi.trim().isEmpty()) {
          myreporter.getCounter("extraction problems", "doi extraction").increment(1);
          return null;
        }
      }

      try {
        year = dm.getBasicMetadata().getYear().replaceAll("\\s++", " ").trim();
      } catch (Exception e) {
        // ignore; a missing year is handled in the finally block below
      } finally {
        if (year == null || year.trim().isEmpty()) {
          myreporter.getCounter("extraction problems", "year extraction").increment(1);
          return null;
        }
      }

      Tuple t = TupleFactory.getInstance().newTuple();
      t.append(doi);
      t.append(year);
      t.append(title);

      return t;
    } catch (Exception e) {
      logger.debug(StackTraceExtractor.getStackTrace(e));
      throw new IOException(e);
    }
  }
  private Tuple consumeTuple(PushbackInputStream in, ResourceFieldSchema fieldSchema)
      throws IOException {
    if (fieldSchema == null) {
      throw new IOException("Schema is null");
    }
    int buf;
    ByteArrayOutputStream mOut;

    while ((buf = in.read()) != '(' || buf == '}') {
      if (buf == -1) {
        throw new IOException("Unexpect end of tuple");
      }
      if (buf == '}') {
        in.unread(buf);
        return null;
      }
    }
    Tuple t = TupleFactory.getInstance().newTuple();
    if (fieldSchema.getSchema() != null && fieldSchema.getSchema().getFields().length != 0) {
      ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields();
      // Interpret item inside tuple one by one based on the inner schema
      for (int i = 0; i < fss.length; i++) {
        Object field;
        ResourceFieldSchema fs = fss[i];
        int delimit = ',';
        if (i == fss.length - 1) delimit = ')';

        if (DataType.isComplex(fs.getType())) {
          field = consumeComplexType(in, fs);
          while ((buf = in.read()) != delimit) {
            if (buf == -1) {
              throw new IOException("Unexpect end of tuple");
            }
          }
        } else {
          mOut = new ByteArrayOutputStream(BUFFER_SIZE);
          while ((buf = in.read()) != delimit) {
            if (buf == -1) {
              throw new IOException("Unexpect end of tuple");
            }
            if (buf == delimit) break;
            mOut.write(buf);
          }
          field = parseSimpleType(mOut.toByteArray(), fs);
        }
        t.append(field);
      }
    } else {
      // No inner schema, treat everything inside tuple as bytearray
      // Keep track of nested tuple/bag/map delimiters; we do not interpret nested
      // structures, we save them as bytearray.
      Deque<Character> level = new LinkedList<Character>();
      mOut = new ByteArrayOutputStream(BUFFER_SIZE);
      while (true) {
        buf = in.read();
        if (buf == -1) {
          throw new IOException("Unexpect end of tuple");
        }
        if (buf == '[' || buf == '{' || buf == '(') {
          level.push((char) buf);
          mOut.write(buf);
        } else if (buf == ')' && level.isEmpty()) // End of tuple
        {
          DataByteArray value = new DataByteArray(mOut.toByteArray());
          t.append(value);
          break;
        } else if (buf == ',' && level.isEmpty()) {
          DataByteArray value = new DataByteArray(mOut.toByteArray());
          t.append(value);
          mOut.reset();
        } else if (buf == ']' || buf == '}' || buf == ')') {
          if (level.peek() == findStartChar((char) buf)) level.pop();
          else throw new IOException("Malformed tuple");
          mOut.write(buf);
        } else mOut.write(buf);
      }
    }
    return t;
  }
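A hypothetical round-trip for the schema-less branch above (illustrative only; consumeTuple is private here, so a real test would exercise it through the owning class): every top-level field, including the nested tuple, comes back verbatim as a DataByteArray.

  @Test
  public void testConsumeUntypedTupleAsByteArrays() throws IOException {
    PushbackInputStream in =
        new PushbackInputStream(new ByteArrayInputStream("(1,(2,3),4)".getBytes()));
    ResourceFieldSchema fs = new ResourceFieldSchema(); // non-null, but no inner schema
    Tuple t = consumeTuple(in, fs);
    assertEquals(3, t.size());
    assertEquals("1", t.get(0).toString());
    assertEquals("(2,3)", t.get(1).toString()); // nested tuple kept as a single bytearray
    assertEquals("4", t.get(2).toString());
  }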
    @Override
    public Tuple exec(Tuple input) throws IOException {
      DataBag bag = (DataBag) input.get(0);
      DataBag selected = bagFactory.newDefaultBag();
      DataBag aggWaiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      DataBag waiting = bagFactory.newSortedBag(new ScoredTupleComparator());
      Tuple output = tupleFactory.newTuple();

      long n = 0L;

      for (Tuple innerTuple : bag) {
        n += (Long) innerTuple.get(0);

        selected.addAll((DataBag) innerTuple.get(1));

        double q1 = getQ1(n, _samplingProbability);
        double q2 = getQ2(n, _samplingProbability);

        for (Tuple t : (DataBag) innerTuple.get(2)) {
          ScoredTuple scored = ScoredTuple.fromIntermediateTuple(t);

          if (scored.getScore() < q1) {
            selected.add(scored.getTuple());
          } else if (scored.getScore() < q2) {
            aggWaiting.add(t);
          } else {
            break;
          }
        }
      }

      double q1 = getQ1(n, _samplingProbability);
      double q2 = getQ2(n, _samplingProbability);

      for (Tuple t : aggWaiting) {
        ScoredTuple scored = ScoredTuple.fromIntermediateTuple(t);

        if (scored.getScore() < q1) {
          selected.add(scored.getTuple());
        } else if (scored.getScore() < q2) {
          waiting.add(t);
        } else {
          break;
        }
      }

      output.append(n);
      output.append(selected);
      output.append(waiting);

      System.err.println(
          "Read "
              + n
              + " items, selected "
              + selected.size()
              + ", and wait-listed "
              + aggWaiting.size()
              + ".");

      return output;
    }
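This exec, like the per-record pass earlier in this listing, only emits (n, selected, waiting) and never makes the final cut. A plausible sketch of that last step (not the actual DataFu implementation; the method name is hypothetical) would top up the sample from the score-sorted waiting list until the target size ceil(p * n) is reached:

    private void topUpFromWaitingList(DataBag selected, DataBag waiting, long n) throws IOException {
      long targetSize = (long) Math.ceil(_samplingProbability * n);
      for (Tuple t : waiting) {
        if (selected.size() >= targetSize) {
          break;
        }
        selected.add(ScoredTuple.fromIntermediateTuple(t).getTuple());
      }
    }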
 @Override
 public void append(Object val) {
   t.append(val);
 }