/**
 * Checks whether the given Pig field schema is just a wrapped tuple, i.e. a
 * TUPLE-typed field whose name is the {@code AvroStorageUtils.PIG_TUPLE_WRAPPER}
 * marker (the counterpart of {@code wrapAsTuple}).
 *
 * <p>Improvement over the original: replaces a boxed {@code Boolean} local and
 * three nested brace-less {@code if}s with a single short-circuit expression;
 * behavior is unchanged (null names never matched before either).
 *
 * @param pigSchema the field schema to inspect
 * @return true iff the schema is a tuple wrapper produced by this storage layer
 */
public static boolean isTupleWrapper(ResourceFieldSchema pigSchema) {
  return pigSchema.getType() == DataType.TUPLE
      && pigSchema.getName() != null
      && pigSchema.getName().equals(AvroStorageUtils.PIG_TUPLE_WRAPPER);
}
/**
 * Reads one Pig map literal (e.g. {@code [k1#v1,k2#v2]}) from the stream and
 * materializes it as a {@code Map<String, Object>}.
 *
 * <p>Keys are read up to the {@code '#'} separator and converted via
 * {@code bytesToCharArray}. Values are buffered as raw bytes; nested
 * tuple/bag/map delimiters are tracked on a stack but NOT interpreted here.
 * If the field schema declares a value type, the buffered bytes are converted
 * with {@code bytesToObject}; otherwise (untyped map) they become a
 * {@code DataByteArray}. An empty value is stored as {@code null}.
 *
 * @param in          stream positioned at (or before) the opening '['
 * @param fieldSchema map field schema; may be null or schema-less (untyped map)
 * @return the populated map
 * @throws IOException on premature end of input, empty key, or unbalanced
 *                     nesting delimiters
 */
private Map<String, Object> consumeMap(PushbackInputStream in, ResourceFieldSchema fieldSchema) throws IOException {
    int buf;
    // Skip any leading bytes until the opening '[' of the map literal.
    while ((buf = in.read()) != '[') {
        if (buf == -1) {
            throw new IOException("Unexpect end of map");
        }
    }
    HashMap<String, Object> m = new HashMap<String, Object>();
    ByteArrayOutputStream mOut = new ByteArrayOutputStream(BUFFER_SIZE);
    while (true) {
        // Read key (assume key can not contains special character such as #, (, [, {, }, ], )
        while ((buf = in.read()) != '#') {
            if (buf == -1) {
                throw new IOException("Unexpect end of map");
            }
            mOut.write(buf);
        }
        String key = bytesToCharArray(mOut.toByteArray());
        if (key.length() == 0)
            throw new IOException("Map key can not be null");
        // Read value
        mOut.reset();
        // keep track of nested tuple/bag/map. We do not interpret, save them as bytearray
        Deque<Character> level = new LinkedList<Character>();
        while (true) {
            buf = in.read();
            if (buf == -1) {
                throw new IOException("Unexpect end of map");
            }
            if (buf == '[' || buf == '{' || buf == '(') {
                // Opening a nested structure: push so closers below can match it.
                level.push((char) buf);
            } else if (buf == ']' && level.isEmpty()) // End of map
                break;
            else if (buf == ']' || buf == '}' || buf == ')') {
                // Closer inside the value: must match the most recent opener.
                if (level.isEmpty())
                    throw new IOException("Malformed map");
                if (level.peek() == findStartChar((char) buf))
                    level.pop();
            } else if (buf == ',' && level.isEmpty()) {
                // Current map item complete
                // (break before mOut.write, so the ',' is not part of the value)
                break;
            }
            mOut.write(buf);
        }
        Object value = null;
        if (fieldSchema != null && fieldSchema.getSchema() != null && mOut.size() > 0) {
            // Typed map: interpret the raw value bytes with the declared value schema.
            value = bytesToObject(mOut.toByteArray(), fieldSchema.getSchema().getFields()[0]);
        } else if (mOut.size() > 0) { // untyped map
            value = new DataByteArray(mOut.toByteArray());
        }
        m.put(key, value);
        mOut.reset();
        // buf is still the delimiter that ended the value: ']' finishes the map,
        // ',' loops for the next key#value pair.
        if (buf == ']')
            break;
    }
    return m;
}
/**
 * Wraps the given Pig field schema inside a single-field tuple schema.
 * The wrapper tuple carries the {@code PIG_TUPLE_WRAPPER} marker name so
 * that {@code isTupleWrapper} can later recognize and unwrap it.
 *
 * @param subFieldSchema the field to wrap
 * @return a TUPLE-typed field schema containing exactly {@code subFieldSchema}
 * @throws IOException declared for schema-construction consistency
 */
public static ResourceFieldSchema wrapAsTuple(ResourceFieldSchema subFieldSchema) throws IOException {
  ResourceFieldSchema wrapper = new ResourceFieldSchema();
  wrapper.setType(DataType.TUPLE);
  wrapper.setName(PIG_TUPLE_WRAPPER);

  // The inner schema holds the wrapped field as the tuple's only member.
  ResourceSchema innerSchema = new ResourceSchema();
  innerSchema.setFields(new ResourceFieldSchema[] {subFieldSchema});
  wrapper.setSchema(innerSchema);

  return wrapper;
}
/**
 * Reads one Pig bag literal (e.g. <code>{(..),(..)}</code>) from the stream
 * and materializes it as a {@code DataBag} of tuples.
 *
 * <p>The bag schema must contain exactly one field, and that field must be a
 * tuple; each tuple is parsed by {@code consumeTuple} (which returns null and
 * pushes back when it sees the bag's closing '}' instead of a tuple).
 *
 * @param in          stream positioned at (or before) the opening '{'
 * @param fieldSchema bag field schema; must be non-null with a tuple inner field
 * @return the populated bag
 * @throws IOException on null schema, non-tuple inner schema, or premature EOF
 */
private DataBag consumeBag(PushbackInputStream in, ResourceFieldSchema fieldSchema) throws IOException {
    if (fieldSchema == null) {
        throw new IOException("Schema is null");
    }
    ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields();
    Tuple t;
    int buf;
    // Skip any leading bytes until the opening '{' of the bag literal.
    while ((buf = in.read()) != '{') {
        if (buf == -1) {
            throw new IOException("Unexpect end of bag");
        }
    }
    if (fss.length != 1)
        throw new IOException("Only tuple is allowed inside bag schema");
    ResourceFieldSchema fs = fss[0];
    DataBag db = DefaultBagFactory.getInstance().newDefaultBag();
    while (true) {
        // consumeTuple returns null (after unreading '}') for an empty/finished bag.
        t = consumeTuple(in, fs);
        if (t != null)
            db.add(t);
        // Skip to the next tuple separator (',') or the bag terminator ('}').
        while ((buf = in.read()) != '}' && buf != ',') {
            if (buf == -1) {
                throw new IOException("Unexpect end of bag");
            }
        }
        if (buf == '}')
            break;
    }
    return db;
}
/**
 * Converts a field's raw bytes into a Java object according to its schema:
 * complex types (tuple/bag/map) are parsed by {@code consumeComplexType}
 * from a pushback stream over the bytes; simple types go straight to
 * {@code parseSimpleType}.
 *
 * @param b  raw field bytes
 * @param fs field schema governing the conversion
 * @return the converted value
 * @throws IOException if the bytes cannot be parsed per the schema
 */
private Object bytesToObject(byte[] b, ResourceFieldSchema fs) throws IOException {
  // Simple types need no stream machinery — convert directly.
  if (!DataType.isComplex(fs.getType())) {
    return parseSimpleType(b, fs);
  }
  // Complex types are re-parsed through the same pushback-stream reader
  // used for top-level input.
  PushbackInputStream in = new PushbackInputStream(new ByteArrayInputStream(b));
  return consumeComplexType(in, fs);
}
/**
 * Dispatches parsing of one complex-typed field (bag, tuple, or map) to the
 * matching consume* reader.
 *
 * @param complexFieldSchema schema whose type selects the reader
 * @param in                 stream positioned at the value to read
 * @return the parsed bag, tuple, or map
 * @throws IOException if the schema's type is not a complex type
 */
private Object consumeComplexType(PushbackInputStream in, ResourceFieldSchema complexFieldSchema) throws IOException {
  byte complexType = complexFieldSchema.getType();
  if (complexType == DataType.BAG) {
    return consumeBag(in, complexFieldSchema);
  }
  if (complexType == DataType.TUPLE) {
    return consumeTuple(in, complexFieldSchema);
  }
  if (complexType == DataType.MAP) {
    return consumeMap(in, complexFieldSchema);
  }
  throw new IOException("Unknown complex data type");
}
/**
 * Converts a simple-typed field's raw bytes into the corresponding Java
 * object, dispatching on the Pig type in the field schema.
 *
 * @param b                 raw field bytes
 * @param simpleFieldSchema schema whose type selects the converter
 * @return the converted value
 * @throws IOException if the schema's type is not a known simple type
 */
private Object parseSimpleType(byte[] b, ResourceFieldSchema simpleFieldSchema) throws IOException {
    Object field;
    switch (simpleFieldSchema.getType()) {
    case DataType.INTEGER:
        field = bytesToInteger(b);
        break;
    case DataType.LONG:
        field = bytesToLong(b);
        break;
    case DataType.FLOAT:
        field = bytesToFloat(b);
        break;
    case DataType.DOUBLE:
        field = bytesToDouble(b);
        break;
    case DataType.CHARARRAY:
        field = bytesToCharArray(b);
        break;
    case DataType.BYTEARRAY:
        field = new DataByteArray(b);
        break;
    case DataType.BOOLEAN:
        field = bytesToBoolean(b);
        break;
    case DataType.BIGINTEGER:
        field = bytesToBigInteger(b);
        break;
    case DataType.BIGDECIMAL:
        field = bytesToBigDecimal(b);
        // BUG FIX: this 'break' was missing, so BIGDECIMAL fell through to
        // the DATETIME case and the parsed BigDecimal was silently replaced
        // by bytesToDateTime(b).
        break;
    case DataType.DATETIME:
        field = bytesToDateTime(b);
        break;
    default:
        throw new IOException("Unknown simple data type");
    }
    return field;
}
/* (non-Javadoc)
 * @see org.apache.pig.builtin.PigStorage#putNext(org.apache.pig.data.Tuple)
 *
 * Given a tuple that corresponds to one record, write
 * it out as CSV, converting among Unix/Windows line
 * breaks as requested in the instantiation. Also take
 * care of escaping field delimiters, double quotes,
 * and linebreaks embedded within fields,
 *
 */
@Override
public void putNext(Tuple tupleToWrite) throws IOException {
    // If WRITE_OUTPUT_HEADER, store a header record with the names of each field
    if (storingFirstRecord && headerTreatment == Headers.WRITE_OUTPUT_HEADER && schema != null) {
        ArrayList<Object> headerProtoTuple = new ArrayList<Object>();
        ResourceFieldSchema[] fields = schema.getFields();
        for (ResourceFieldSchema field : fields) {
            headerProtoTuple.add(field.getName());
        }
        super.putNext(tupleMaker.newTuple(headerProtoTuple));
    }
    storingFirstRecord = false;
    ArrayList<Object> mProtoTuple = new ArrayList<Object>();
    int embeddedNewlineIndex = -1;
    int embeddedCarriageReturn = -1;
    String fieldStr = null;
    // For good debug messages:
    int fieldCounter = -1;
    // Do the escaping:
    for (Object field : tupleToWrite.getAll()) {
        fieldCounter++;
        // Substitute a null value with an empty string. See PIG-2470.
        if (field == null) {
            // fieldStr stays null so a trailing null field skips the
            // Windows '\r' append below.
            fieldStr = null;
            mProtoTuple.add("");
            continue;
        }
        fieldStr = field.toString();
        // Embedded double quotes are replaced by two double quotes:
        fieldStr = fieldStr.replaceAll("[\"]", "\"\"");
        // If any field delimiters are in the field, or if we did replace
        // any double quotes with a pair of double quotes above,
        // or if the string includes a newline character (LF:\n:0x0A)
        // or includes a carriage return (CR:\r:0x0D)
        // and we are to allow newlines in fields,
        // then the entire field must be enclosed in double quotes:
        embeddedNewlineIndex = fieldStr.indexOf(LINEFEED);
        embeddedCarriageReturn = fieldStr.indexOf(CARRIAGE_RETURN);
        // NOTE: && binds tighter than ||, so the multiline clause only adds
        // quoting when Multiline.YES AND an embedded CR/LF is present.
        if ((fieldStr.indexOf(fieldDelimiter) != -1)
                || (fieldStr.indexOf(DOUBLE_QUOTE) != -1)
                || (multilineTreatment == Multiline.YES)
                        && (embeddedNewlineIndex != -1 || embeddedCarriageReturn != -1)) {
            fieldStr = "\"" + fieldStr + "\"";
        }
        // If requested: replace any Linefeed-only (^J), with LF-Newline (^M^J),
        // This is needed for Excel to recognize a field-internal
        // new line:
        if ((eolTreatment != Linebreaks.NOCHANGE) && (embeddedNewlineIndex != -1)) {
            if (eolTreatment == Linebreaks.WINDOWS) {
                // loneLFDetector/CRLFDetector are stateful Matchers reused
                // across calls; reset() rebinds them to this field's text.
                loneLFDetector.reset(fieldStr);
                loneLFDetector.matches();
                fieldStr = loneLFDetector.replaceAll("$1\r\n");
            } else if (eolTreatment == Linebreaks.UNIX) {
                CRLFDetector.reset(fieldStr);
                fieldStr = CRLFDetector.replaceAll("\n");
            }
        }
        mProtoTuple.add(fieldStr);
    }
    // If Windows line breaks are requested, append
    // a newline (0x0D a.k.a. ^M) to the last field
    // so that the row termination will end up being
    // \r\n, once the superclass' putNext() method
    // is done below:
    if ((eolTreatment == Linebreaks.WINDOWS) && (fieldStr != null))
        mProtoTuple.set(mProtoTuple.size() - 1, fieldStr + "\r");
    Tuple resTuple = tupleMaker.newTuple(mProtoTuple);
    super.putNext(resTuple);
}
/**
 * Reads one Pig tuple literal (e.g. {@code (a,b,c)}) from the stream.
 *
 * <p>With an inner schema, each item is parsed per its field schema (complex
 * items recurse through {@code consumeComplexType}). Without one, every item
 * is buffered as a raw {@code DataByteArray}, tracking nested delimiters on a
 * stack without interpreting them.
 *
 * <p>Returns {@code null} (after unreading the byte) when a '}' is seen before
 * '(' — that is how {@code consumeBag} detects the end of a bag.
 *
 * @param in          stream positioned at (or before) the opening '('
 * @param fieldSchema tuple field schema; must be non-null (its inner schema may
 *                    be absent)
 * @return the parsed tuple, or null at end-of-bag
 * @throws IOException on null schema, premature EOF, or unbalanced delimiters
 */
private Tuple consumeTuple(PushbackInputStream in, ResourceFieldSchema fieldSchema) throws IOException {
    if (fieldSchema == null) {
        throw new IOException("Schema is null");
    }
    int buf;
    ByteArrayOutputStream mOut;
    // Skip until '('; a '}' means the enclosing bag just closed, so unread it
    // and signal end-of-bag with null. (The '|| buf == '}'' clause is redundant
    // with '!= (': '}' already satisfies the first test.)
    while ((buf = in.read()) != '(' || buf == '}') {
        if (buf == -1) {
            throw new IOException("Unexpect end of tuple");
        }
        if (buf == '}') {
            in.unread(buf);
            return null;
        }
    }
    Tuple t = TupleFactory.getInstance().newTuple();
    if (fieldSchema.getSchema() != null && fieldSchema.getSchema().getFields().length != 0) {
        ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields();
        // Interpret item inside tuple one by one based on the inner schema
        for (int i = 0; i < fss.length; i++) {
            Object field;
            ResourceFieldSchema fs = fss[i];
            // Each item ends at ','; the last item ends at the closing ')'.
            int delimit = ',';
            if (i == fss.length - 1)
                delimit = ')';
            if (DataType.isComplex(fs.getType())) {
                field = consumeComplexType(in, fs);
                // Consume trailing bytes up to this item's delimiter.
                while ((buf = in.read()) != delimit) {
                    if (buf == -1) {
                        throw new IOException("Unexpect end of tuple");
                    }
                }
            } else {
                mOut = new ByteArrayOutputStream(BUFFER_SIZE);
                while ((buf = in.read()) != delimit) {
                    if (buf == -1) {
                        throw new IOException("Unexpect end of tuple");
                    }
                    // Redundant: the loop condition already excluded delimit.
                    if (buf == delimit)
                        break;
                    mOut.write(buf);
                }
                field = parseSimpleType(mOut.toByteArray(), fs);
            }
            t.append(field);
        }
    } else {
        // No inner schema, treat everything inside tuple as bytearray
        // keep track of nested tuple/bag/map. We do not interpret, save them as bytearray
        Deque<Character> level = new LinkedList<Character>();
        mOut = new ByteArrayOutputStream(BUFFER_SIZE);
        while (true) {
            buf = in.read();
            if (buf == -1) {
                throw new IOException("Unexpect end of tuple");
            }
            if (buf == '[' || buf == '{' || buf == '(') {
                level.push((char) buf);
                mOut.write(buf);
            } else if (buf == ')' && level.isEmpty()) // End of tuple
            {
                DataByteArray value = new DataByteArray(mOut.toByteArray());
                t.append(value);
                break;
            } else if (buf == ',' && level.isEmpty()) {
                // Top-level item boundary: flush the buffered bytes as one field.
                DataByteArray value = new DataByteArray(mOut.toByteArray());
                t.append(value);
                mOut.reset();
            } else if (buf == ']' || buf == '}' || buf == ')') {
                // Closer inside a nested structure: must match the last opener.
                if (level.peek() == findStartChar((char) buf))
                    level.pop();
                else
                    throw new IOException("Malformed tuple");
                mOut.write(buf);
            } else
                mOut.write(buf);
        }
    }
    return t;
}
/**
 * Serializes one Pig value to the JSON generator, dispatching on its Pig
 * {@code DataType}. Field names are mapped through {@code alias.toES(...)}.
 *
 * <p>Numeric DATETIME/BIGINTEGER/BIGDECIMAL type codes are hard-coded (30, 65,
 * 70) so the class still compiles against pre-Pig-11/12 {@code DataType}
 * classes that lack those constants; big integer/decimal values are rejected.
 *
 * @param object         the value to write; null writes a JSON null
 * @param field          the value's schema (may be null — type is then inferred
 *                       via {@code DataType.findType})
 * @param generator      JSON output sink
 * @param writeFieldName whether to emit the field's name before the value
 * @return true if the value was written; false for an unhandled type when
 *         {@code writeUnknownTypes} is off
 */
public boolean write(
        Object object, ResourceFieldSchema field, Generator generator, boolean writeFieldName) {
    byte type = (field != null ? field.getType() : DataType.findType(object));
    if (writeFieldName) {
        generator.writeFieldName(alias.toES(field.getName()));
    }
    if (object == null) {
        generator.writeNull();
        return true;
    }
    switch (type) {
    case DataType.ERROR:
    case DataType.UNKNOWN:
        return handleUnknown(object, field, generator);
    case DataType.NULL:
        generator.writeNull();
        break;
    case DataType.BOOLEAN:
        generator.writeBoolean((Boolean) object);
        break;
    case DataType.INTEGER:
        generator.writeNumber(((Number) object).intValue());
        break;
    case DataType.LONG:
        generator.writeNumber(((Number) object).longValue());
        break;
    case DataType.FLOAT:
        generator.writeNumber(((Number) object).floatValue());
        break;
    case DataType.DOUBLE:
        generator.writeNumber(((Number) object).doubleValue());
        break;
    case DataType.BYTE:
        generator.writeNumber((Byte) object);
        break;
    case DataType.CHARARRAY:
        generator.writeString(object.toString());
        break;
    case DataType.BYTEARRAY:
        generator.writeBinary(((DataByteArray) object).get());
        break;
    // DateTime introduced in Pig 11
    case 30: // DataType.DATETIME
        generator.writeString(PigUtils.convertDateToES(object));
        break;
    // BigInteger introduced in Pig 12
    case 65: // DataType.BIGINTEGER
        throw new SerializationException(
                "Big integers are not supported by Elasticsearch - consider using a different type (such as string)");
    // BigDecimal introduced in Pig 12
    case 70: // DataType.BIGDECIMAL
        throw new SerializationException(
                "Big decimals are not supported by Elasticsearch - consider using a different type (such as string)");
    case DataType.MAP:
        ResourceSchema nestedSchema = field.getSchema();
        // empty tuple shortcut
        if (nestedSchema == null) {
            generator.writeBeginObject();
            generator.writeEndObject();
            break;
        }
        ResourceFieldSchema[] nestedFields = nestedSchema.getFields();
        generator.writeBeginObject();
        // Pig maps are actually String -> Object association so we can save the key right away
        // NOTE(review): every value is written with nestedFields[0] — this
        // assumes all map values share one schema; verify against callers.
        for (Map.Entry<?, ?> entry : ((Map<?, ?>) object).entrySet()) {
            generator.writeFieldName(alias.toES(entry.getKey().toString()));
            write(entry.getValue(), nestedFields[0], generator, false);
        }
        generator.writeEndObject();
        break;
    case DataType.TUPLE:
        nestedSchema = field.getSchema();
        // empty tuple shortcut
        if (nestedSchema == null) {
            generator.writeBeginObject();
            generator.writeEndObject();
            break;
        }
        nestedFields = nestedSchema.getFields();
        // use getAll instead of get(int) to avoid having to handle Exception...
        List<Object> tuples = ((Tuple) object).getAll();
        generator.writeBeginObject();
        // NOTE(review): iterates the schema's field count, assuming the tuple
        // has at least that many values — confirm upstream guarantees this.
        for (int i = 0; i < nestedFields.length; i++) {
            String name = nestedFields[i].getName();
            // handle schemas without names
            name = (StringUtils.hasText(name) ? alias.toES(name) : Integer.toString(i));
            generator.writeFieldName(name);
            write(tuples.get(i), nestedFields[i], generator, false);
        }
        generator.writeEndObject();
        break;
    case DataType.BAG:
        nestedSchema = field.getSchema();
        // empty tuple shortcut
        if (nestedSchema == null) {
            generator.writeBeginArray();
            generator.writeEndArray();
            break;
        }
        // A bag's schema has a single tuple field describing each element.
        ResourceFieldSchema bagType = nestedSchema.getFields()[0];
        generator.writeBeginArray();
        for (Tuple tuple : (DataBag) object) {
            write(tuple, bagType, generator, false);
        }
        generator.writeEndArray();
        break;
    default:
        if (writeUnknownTypes) {
            return handleUnknown(object, field, generator);
        }
        return false;
    }
    return true;
}