Exemple #1
0
  public void cat(StringBuilder sb, Object input, String delim) throws IOException {
    if (input == null) return;

    if (input instanceof Tuple) {
      Tuple tuple = (Tuple) input;

      for (Object o : tuple.getAll()) {
        cat(sb, o, delim);
      }
    } else if (input instanceof DataBag) {
      DataBag bag = (DataBag) input;

      for (Tuple t : bag) {
        for (Object o : t.getAll()) {
          cat(sb, o, delim);
        }
      }
    } else {
      String s = input.toString();

      s = s.trim();

      if (s.length() > 0) {
        sb.append(s).append(delim);
      }
    }
  }
  @Override
  public Tuple exec(Tuple input) throws IOException {
    TupleFactory tFactory = TupleFactory.getInstance();
    Tuple oTuple = tFactory.newTuple();
    Tuple sketchTupleA = (DefaultTuple) input.get(0);
    Tuple sketchTupleB = (DefaultTuple) input.get(1);

    List<Object> fieldsA = sketchTupleA.getAll();
    List<Object> fieldsB = sketchTupleB.getAll();
    List<Integer> iFields1 = new ArrayList<Integer>();
    List<Integer> iFields2 = new ArrayList<Integer>();

    int count = fieldsA.size() * 2;
    int match = 0;

    for (int i = 0; i < fieldsA.size(); i++) {
      int a = (Integer) fieldsA.get(i);
      for (int j = 0; j < fieldsB.size(); j++) {
        int b = (Integer) fieldsB.get(j);
        if (a == b) {
          match += 2;
        }
      }
    }

    double sim = (double) match / (double) count;

    oTuple.append(sim);
    return oTuple;
  }
  protected Tuple getValueTuple(NullableTuple ntup, int index) throws ExecException {
    // Need to make a copy of the value, as hadoop uses the same ntup
    // to represent each value.
    Tuple val = (Tuple) ntup.getValueAsPigType();

    Tuple copy = null;
    // The "value (val)" that we just got may not
    // be the complete "value". It may have some portions
    // in the "key" (look in POLocalRearrange for more comments)
    // If this is the case we need to stitch
    // the "value" together.
    Pair<Boolean, Map<Integer, Integer>> lrKeyInfo = keyInfo.get(index);
    boolean isProjectStar = lrKeyInfo.first;
    Map<Integer, Integer> keyLookup = lrKeyInfo.second;
    int keyLookupSize = keyLookup.size();

    if (keyLookupSize > 0) {

      // we have some fields of the "value" in the
      // "key".
      copy = mTupleFactory.newTuple();
      int finalValueSize = keyLookupSize + val.size();
      int valIndex = 0; // an index for accessing elements from
      // the value (val) that we have currently
      for (int i = 0; i < finalValueSize; i++) {
        Integer keyIndex = keyLookup.get(i);
        if (keyIndex == null) {
          // the field for this index is not in the
          // key - so just take it from the "value"
          // we were handed
          copy.append(val.get(valIndex));
          valIndex++;
        } else {
          // the field for this index is in the key
          if (isKeyTuple) {
            // the key is a tuple, extract the
            // field out of the tuple
            copy.append(keyAsTuple.get(keyIndex));
          } else {
            copy.append(key);
          }
        }
      }

    } else if (isProjectStar) {

      // the whole "value" is present in the "key"
      copy = mTupleFactory.newTuple(keyAsTuple.getAll());

    } else {

      // there is no field of the "value" in the
      // "key" - so just make a copy of what we got
      // as the "value"
      copy = mTupleFactory.newTuple(val.getAll());
    }
    return copy;
  }
 public void joinTuples(Object key, List<Tuple> tuples) throws ExecException {
   List<Tuple> currentTuples = joinData.get(key);
   if (currentTuples != null) {
     List<Tuple> newTuples = new LinkedList<Tuple>();
     if (tuples != null) {
       for (Tuple t1 : currentTuples) {
         for (Tuple t2 : tuples) {
           Tuple t = TupleFactory.getInstance().newTuple();
           for (Object o : t1.getAll()) {
             t.append(o);
           }
           for (Object o : t2.getAll()) {
             t.append(o);
           }
           newTuples.add(t);
         }
       }
     }
     joinData.put(key, newTuples);
   }
 }
Exemple #5
0
  /* (non-Javadoc)
   * @see org.apache.pig.builtin.PigStorage#putNext(org.apache.pig.data.Tuple)
   *
   * Given a tuple that corresponds to one record, write
   * it out as CSV, converting among Unix/Windows line
   * breaks as requested in the instantiation. Also take
   * care of escaping field delimiters, double quotes,
   * and linebreaks embedded within fields,
   *
   */
  @Override
  public void putNext(Tuple tupleToWrite) throws IOException {
    // If WRITE_OUTPUT_HEADER, store a header record with the names of each field
    if (storingFirstRecord && headerTreatment == Headers.WRITE_OUTPUT_HEADER && schema != null) {
      ArrayList<Object> headerProtoTuple = new ArrayList<Object>();
      ResourceFieldSchema[] fields = schema.getFields();
      for (ResourceFieldSchema field : fields) {
        headerProtoTuple.add(field.getName());
      }
      super.putNext(tupleMaker.newTuple(headerProtoTuple));
    }
    storingFirstRecord = false;

    ArrayList<Object> mProtoTuple = new ArrayList<Object>();
    int embeddedNewlineIndex = -1;
    int embeddedCarriageReturn = -1;
    String fieldStr = null;
    // For good debug messages:
    int fieldCounter = -1;

    // Do the escaping:
    for (Object field : tupleToWrite.getAll()) {
      fieldCounter++;

      // Substitute a null value with an empty string. See PIG-2470.
      if (field == null) {
        fieldStr = null;
        mProtoTuple.add("");
        continue;
      }

      fieldStr = field.toString();

      // Embedded double quotes are replaced by two double quotes:
      fieldStr = fieldStr.replaceAll("[\"]", "\"\"");

      // If any field delimiters are in the field, or if we did replace
      // any double quotes with a pair of double quotes above,
      // or if the string includes a newline character (LF:\n:0x0A)
      //               or includes a carriage return (CR:\r:0x0D)
      // and we are to allow newlines in fields,
      // then the entire field must be enclosed in double quotes:
      embeddedNewlineIndex = fieldStr.indexOf(LINEFEED);
      embeddedCarriageReturn = fieldStr.indexOf(CARRIAGE_RETURN);

      if ((fieldStr.indexOf(fieldDelimiter) != -1)
          || (fieldStr.indexOf(DOUBLE_QUOTE) != -1)
          || (multilineTreatment == Multiline.YES)
              && (embeddedNewlineIndex != -1 || embeddedCarriageReturn != -1)) {
        fieldStr = "\"" + fieldStr + "\"";
      }

      // If requested: replace any Linefeed-only (^J), with LF-Newline (^M^J),
      // This is needed for Excel to recognize a field-internal
      // new line:

      if ((eolTreatment != Linebreaks.NOCHANGE) && (embeddedNewlineIndex != -1)) {
        if (eolTreatment == Linebreaks.WINDOWS) {
          loneLFDetector.reset(fieldStr);
          loneLFDetector.matches();
          fieldStr = loneLFDetector.replaceAll("$1\r\n");
        } else if (eolTreatment == Linebreaks.UNIX) {
          CRLFDetector.reset(fieldStr);
          fieldStr = CRLFDetector.replaceAll("\n");
        }
      }

      mProtoTuple.add(fieldStr);
    }
    // If Windows line breaks are requested, append
    // a newline (0x0D a.k.a. ^M) to the last field
    // so that the row termination will end up being
    // \r\n, once the superclass' putNext() method
    // is done below:

    if ((eolTreatment == Linebreaks.WINDOWS) && (fieldStr != null))
      mProtoTuple.set(mProtoTuple.size() - 1, fieldStr + "\r");

    Tuple resTuple = tupleMaker.newTuple(mProtoTuple);
    super.putNext(resTuple);
  }
 @Override
 public List<Object> getAll() {
   return t.getAll();
 }