/**
 * Checks whether the given field schema is just a wrapped tuple, i.e. a
 * tuple-typed field whose name equals
 * {@code AvroStorageUtils.PIG_TUPLE_WRAPPER}.
 *
 * @param pigSchema the Pig field schema to inspect; must not be null
 * @return {@code true} if the field is a tuple carrying the wrapper name,
 *         {@code false} otherwise (including when the field has no name)
 */
public static boolean isTupleWrapper(ResourceFieldSchema pigSchema) {
    // Anything that is not a tuple cannot be a tuple wrapper.
    if (pigSchema.getType() != DataType.TUPLE) {
        return false;
    }
    final String fieldName = pigSchema.getName();
    return fieldName != null && fieldName.equals(AvroStorageUtils.PIG_TUPLE_WRAPPER);
}
/* (non-Javadoc) * @see org.apache.pig.builtin.PigStorage#putNext(org.apache.pig.data.Tuple) * * Given a tuple that corresponds to one record, write * it out as CSV, converting among Unix/Windows line * breaks as requested in the instantiation. Also take * care of escaping field delimiters, double quotes, * and linebreaks embedded within fields, * */ @Override public void putNext(Tuple tupleToWrite) throws IOException { // If WRITE_OUTPUT_HEADER, store a header record with the names of each field if (storingFirstRecord && headerTreatment == Headers.WRITE_OUTPUT_HEADER && schema != null) { ArrayList<Object> headerProtoTuple = new ArrayList<Object>(); ResourceFieldSchema[] fields = schema.getFields(); for (ResourceFieldSchema field : fields) { headerProtoTuple.add(field.getName()); } super.putNext(tupleMaker.newTuple(headerProtoTuple)); } storingFirstRecord = false; ArrayList<Object> mProtoTuple = new ArrayList<Object>(); int embeddedNewlineIndex = -1; int embeddedCarriageReturn = -1; String fieldStr = null; // For good debug messages: int fieldCounter = -1; // Do the escaping: for (Object field : tupleToWrite.getAll()) { fieldCounter++; // Substitute a null value with an empty string. See PIG-2470. 
if (field == null) { fieldStr = null; mProtoTuple.add(""); continue; } fieldStr = field.toString(); // Embedded double quotes are replaced by two double quotes: fieldStr = fieldStr.replaceAll("[\"]", "\"\""); // If any field delimiters are in the field, or if we did replace // any double quotes with a pair of double quotes above, // or if the string includes a newline character (LF:\n:0x0A) // or includes a carriage return (CR:\r:0x0D) // and we are to allow newlines in fields, // then the entire field must be enclosed in double quotes: embeddedNewlineIndex = fieldStr.indexOf(LINEFEED); embeddedCarriageReturn = fieldStr.indexOf(CARRIAGE_RETURN); if ((fieldStr.indexOf(fieldDelimiter) != -1) || (fieldStr.indexOf(DOUBLE_QUOTE) != -1) || (multilineTreatment == Multiline.YES) && (embeddedNewlineIndex != -1 || embeddedCarriageReturn != -1)) { fieldStr = "\"" + fieldStr + "\""; } // If requested: replace any Linefeed-only (^J), with LF-Newline (^M^J), // This is needed for Excel to recognize a field-internal // new line: if ((eolTreatment != Linebreaks.NOCHANGE) && (embeddedNewlineIndex != -1)) { if (eolTreatment == Linebreaks.WINDOWS) { loneLFDetector.reset(fieldStr); loneLFDetector.matches(); fieldStr = loneLFDetector.replaceAll("$1\r\n"); } else if (eolTreatment == Linebreaks.UNIX) { CRLFDetector.reset(fieldStr); fieldStr = CRLFDetector.replaceAll("\n"); } } mProtoTuple.add(fieldStr); } // If Windows line breaks are requested, append // a newline (0x0D a.k.a. ^M) to the last field // so that the row termination will end up being // \r\n, once the superclass' putNext() method // is done below: if ((eolTreatment == Linebreaks.WINDOWS) && (fieldStr != null)) mProtoTuple.set(mProtoTuple.size() - 1, fieldStr + "\r"); Tuple resTuple = tupleMaker.newTuple(mProtoTuple); super.putNext(resTuple); }
public boolean write( Object object, ResourceFieldSchema field, Generator generator, boolean writeFieldName) { byte type = (field != null ? field.getType() : DataType.findType(object)); if (writeFieldName) { generator.writeFieldName(alias.toES(field.getName())); } if (object == null) { generator.writeNull(); return true; } switch (type) { case DataType.ERROR: case DataType.UNKNOWN: return handleUnknown(object, field, generator); case DataType.NULL: generator.writeNull(); break; case DataType.BOOLEAN: generator.writeBoolean((Boolean) object); break; case DataType.INTEGER: generator.writeNumber(((Number) object).intValue()); break; case DataType.LONG: generator.writeNumber(((Number) object).longValue()); break; case DataType.FLOAT: generator.writeNumber(((Number) object).floatValue()); break; case DataType.DOUBLE: generator.writeNumber(((Number) object).doubleValue()); break; case DataType.BYTE: generator.writeNumber((Byte) object); break; case DataType.CHARARRAY: generator.writeString(object.toString()); break; case DataType.BYTEARRAY: generator.writeBinary(((DataByteArray) object).get()); break; // DateTime introduced in Pig 11 case 30: // DataType.DATETIME generator.writeString(PigUtils.convertDateToES(object)); break; // DateTime introduced in Pig 12 case 65: // DataType.BIGINTEGER throw new SerializationException( "Big integers are not supported by Elasticsearch - consider using a different type (such as string)"); // DateTime introduced in Pig 12 case 70: // DataType.BIGDECIMAL throw new SerializationException( "Big decimals are not supported by Elasticsearch - consider using a different type (such as string)"); case DataType.MAP: ResourceSchema nestedSchema = field.getSchema(); // empty tuple shortcut if (nestedSchema == null) { generator.writeBeginObject(); generator.writeEndObject(); break; } ResourceFieldSchema[] nestedFields = nestedSchema.getFields(); generator.writeBeginObject(); // Pig maps are actually String -> Object association so we can save the 
key right away for (Map.Entry<?, ?> entry : ((Map<?, ?>) object).entrySet()) { generator.writeFieldName(alias.toES(entry.getKey().toString())); write(entry.getValue(), nestedFields[0], generator, false); } generator.writeEndObject(); break; case DataType.TUPLE: nestedSchema = field.getSchema(); // empty tuple shortcut if (nestedSchema == null) { generator.writeBeginObject(); generator.writeEndObject(); break; } nestedFields = nestedSchema.getFields(); // use getAll instead of get(int) to avoid having to handle Exception... List<Object> tuples = ((Tuple) object).getAll(); generator.writeBeginObject(); for (int i = 0; i < nestedFields.length; i++) { String name = nestedFields[i].getName(); // handle schemas without names name = (StringUtils.hasText(name) ? alias.toES(name) : Integer.toString(i)); generator.writeFieldName(name); write(tuples.get(i), nestedFields[i], generator, false); } generator.writeEndObject(); break; case DataType.BAG: nestedSchema = field.getSchema(); // empty tuple shortcut if (nestedSchema == null) { generator.writeBeginArray(); generator.writeEndArray(); break; } ResourceFieldSchema bagType = nestedSchema.getFields()[0]; generator.writeBeginArray(); for (Tuple tuple : (DataBag) object) { write(tuple, bagType, generator, false); } generator.writeEndArray(); break; default: if (writeUnknownTypes) { return handleUnknown(object, field, generator); } return false; } return true; }