/**
 * Recursively flattens {@code input} into {@code sb}.
 *
 * <p>Tuples are expanded field by field, bags are expanded tuple by tuple and then field by
 * field, and any other non-null object is rendered with {@code toString()} and trimmed. Each
 * non-empty rendered value is appended followed by {@code delim}, so the builder ends with a
 * trailing delimiter whenever at least one value was written.
 *
 * @param sb    builder receiving the concatenated values
 * @param input value to flatten; {@code null} is ignored
 * @param delim text appended after every value that is written
 * @throws IOException declared for callers; nothing in this body throws it directly
 */
public void cat(StringBuilder sb, Object input, String delim) throws IOException {
    if (input == null) {
        return;
    }

    if (input instanceof Tuple) {
        for (Object field : ((Tuple) input).getAll()) {
            cat(sb, field, delim);
        }
        return;
    }

    if (input instanceof DataBag) {
        for (Tuple member : (DataBag) input) {
            for (Object field : member.getAll()) {
                cat(sb, field, delim);
            }
        }
        return;
    }

    // Leaf value: blank strings are skipped entirely (no delimiter is written for them).
    String text = input.toString().trim();
    if (!text.isEmpty()) {
        sb.append(text);
        sb.append(delim);
    }
}
/**
 * Scores how many entries of two integer sketches coincide.
 *
 * <p>Field 0 and field 1 of {@code input} are each expected to be a tuple whose fields are all
 * {@code Integer}. Every (i, j) pair with {@code A[i] == B[j]} adds 2 to the match total, and
 * the total is divided by {@code 2 * size(A)}; the score is therefore the number of matching
 * pairs divided by the size of the first sketch. Duplicate values can push the score above 1,
 * and an empty first sketch yields {@code NaN} (0.0 / 0.0).
 *
 * @param input tuple holding the two sketch tuples at positions 0 and 1
 * @return a one-field tuple containing the score as a {@code Double}
 * @throws IOException if a field of {@code input} cannot be read
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    // Cast to the Tuple interface rather than DefaultTuple: only getAll() is needed, and other
    // Tuple implementations would otherwise fail here with a ClassCastException.
    Tuple sketchTupleA = (Tuple) input.get(0);
    Tuple sketchTupleB = (Tuple) input.get(1);
    List<Object> fieldsA = sketchTupleA.getAll();
    List<Object> fieldsB = sketchTupleB.getAll();

    int count = fieldsA.size() * 2;
    int match = 0;
    for (Object fieldA : fieldsA) {
        int a = (Integer) fieldA;
        for (Object fieldB : fieldsB) {
            int b = (Integer) fieldB;
            if (a == b) {
                match += 2;
            }
        }
    }

    double sim = (double) match / (double) count;

    Tuple oTuple = TupleFactory.getInstance().newTuple();
    oTuple.append(sim);
    return oTuple;
}
protected Tuple getValueTuple(NullableTuple ntup, int index) throws ExecException { // Need to make a copy of the value, as hadoop uses the same ntup // to represent each value. Tuple val = (Tuple) ntup.getValueAsPigType(); Tuple copy = null; // The "value (val)" that we just got may not // be the complete "value". It may have some portions // in the "key" (look in POLocalRearrange for more comments) // If this is the case we need to stitch // the "value" together. Pair<Boolean, Map<Integer, Integer>> lrKeyInfo = keyInfo.get(index); boolean isProjectStar = lrKeyInfo.first; Map<Integer, Integer> keyLookup = lrKeyInfo.second; int keyLookupSize = keyLookup.size(); if (keyLookupSize > 0) { // we have some fields of the "value" in the // "key". copy = mTupleFactory.newTuple(); int finalValueSize = keyLookupSize + val.size(); int valIndex = 0; // an index for accessing elements from // the value (val) that we have currently for (int i = 0; i < finalValueSize; i++) { Integer keyIndex = keyLookup.get(i); if (keyIndex == null) { // the field for this index is not in the // key - so just take it from the "value" // we were handed copy.append(val.get(valIndex)); valIndex++; } else { // the field for this index is in the key if (isKeyTuple) { // the key is a tuple, extract the // field out of the tuple copy.append(keyAsTuple.get(keyIndex)); } else { copy.append(key); } } } } else if (isProjectStar) { // the whole "value" is present in the "key" copy = mTupleFactory.newTuple(keyAsTuple.getAll()); } else { // there is no field of the "value" in the // "key" - so just make a copy of what we got // as the "value" copy = mTupleFactory.newTuple(val.getAll()); } return copy; }
/**
 * Crosses the rows already stored in {@code joinData} under {@code key} with {@code tuples}.
 *
 * <p>If nothing is stored for {@code key} the call does nothing. Otherwise the stored list is
 * replaced by a new list holding, for every (stored, incoming) pair, one tuple made of the
 * stored tuple's fields followed by the incoming tuple's fields. A {@code null}
 * {@code tuples} argument replaces the stored list with an empty one.
 *
 * @param key    join key to look up in {@code joinData}
 * @param tuples incoming tuples to combine with the stored ones; may be {@code null}
 * @throws ExecException declared for callers of this join step
 */
public void joinTuples(Object key, List<Tuple> tuples) throws ExecException {
    List<Tuple> existing = joinData.get(key);
    if (existing == null) {
        return;
    }

    List<Tuple> joined = new LinkedList<Tuple>();
    if (tuples != null) {
        for (Tuple left : existing) {
            for (Tuple right : tuples) {
                Tuple combined = TupleFactory.getInstance().newTuple();
                // Left fields first, then right fields.
                for (Tuple side : new Tuple[] {left, right}) {
                    for (Object field : side.getAll()) {
                        combined.append(field);
                    }
                }
                joined.add(combined);
            }
        }
    }
    joinData.put(key, joined);
}
/* (non-Javadoc) * @see org.apache.pig.builtin.PigStorage#putNext(org.apache.pig.data.Tuple) * * Given a tuple that corresponds to one record, write * it out as CSV, converting among Unix/Windows line * breaks as requested in the instantiation. Also take * care of escaping field delimiters, double quotes, * and linebreaks embedded within fields, * */ @Override public void putNext(Tuple tupleToWrite) throws IOException { // If WRITE_OUTPUT_HEADER, store a header record with the names of each field if (storingFirstRecord && headerTreatment == Headers.WRITE_OUTPUT_HEADER && schema != null) { ArrayList<Object> headerProtoTuple = new ArrayList<Object>(); ResourceFieldSchema[] fields = schema.getFields(); for (ResourceFieldSchema field : fields) { headerProtoTuple.add(field.getName()); } super.putNext(tupleMaker.newTuple(headerProtoTuple)); } storingFirstRecord = false; ArrayList<Object> mProtoTuple = new ArrayList<Object>(); int embeddedNewlineIndex = -1; int embeddedCarriageReturn = -1; String fieldStr = null; // For good debug messages: int fieldCounter = -1; // Do the escaping: for (Object field : tupleToWrite.getAll()) { fieldCounter++; // Substitute a null value with an empty string. See PIG-2470. 
if (field == null) { fieldStr = null; mProtoTuple.add(""); continue; } fieldStr = field.toString(); // Embedded double quotes are replaced by two double quotes: fieldStr = fieldStr.replaceAll("[\"]", "\"\""); // If any field delimiters are in the field, or if we did replace // any double quotes with a pair of double quotes above, // or if the string includes a newline character (LF:\n:0x0A) // or includes a carriage return (CR:\r:0x0D) // and we are to allow newlines in fields, // then the entire field must be enclosed in double quotes: embeddedNewlineIndex = fieldStr.indexOf(LINEFEED); embeddedCarriageReturn = fieldStr.indexOf(CARRIAGE_RETURN); if ((fieldStr.indexOf(fieldDelimiter) != -1) || (fieldStr.indexOf(DOUBLE_QUOTE) != -1) || (multilineTreatment == Multiline.YES) && (embeddedNewlineIndex != -1 || embeddedCarriageReturn != -1)) { fieldStr = "\"" + fieldStr + "\""; } // If requested: replace any Linefeed-only (^J), with LF-Newline (^M^J), // This is needed for Excel to recognize a field-internal // new line: if ((eolTreatment != Linebreaks.NOCHANGE) && (embeddedNewlineIndex != -1)) { if (eolTreatment == Linebreaks.WINDOWS) { loneLFDetector.reset(fieldStr); loneLFDetector.matches(); fieldStr = loneLFDetector.replaceAll("$1\r\n"); } else if (eolTreatment == Linebreaks.UNIX) { CRLFDetector.reset(fieldStr); fieldStr = CRLFDetector.replaceAll("\n"); } } mProtoTuple.add(fieldStr); } // If Windows line breaks are requested, append // a newline (0x0D a.k.a. ^M) to the last field // so that the row termination will end up being // \r\n, once the superclass' putNext() method // is done below: if ((eolTreatment == Linebreaks.WINDOWS) && (fieldStr != null)) mProtoTuple.set(mProtoTuple.size() - 1, fieldStr + "\r"); Tuple resTuple = tupleMaker.newTuple(mProtoTuple); super.putNext(resTuple); }
/**
 * Returns all fields by delegating to {@code t}.
 *
 * @return the list produced by {@code t.getAll()}; this method makes no copy of it
 */
@Override
public List<Object> getAll() {
    final List<Object> delegateFields = t.getAll();
    return delegateFields;
}