@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
  if (!partitionKeysSet) {
    Set<String> keys = getPartitionColumns(location, job);

    if (!(keys == null || keys.size() == 0)) {
      // re-edit the pigSchema to contain the new partition keys.
      ResourceFieldSchema[] fields = pigSchema.getFields();
      LOG.debug("Schema: " + Arrays.toString(fields));

      ResourceFieldSchema[] newFields = Arrays.copyOf(fields, fields.length + keys.size());
      int index = fields.length;

      for (String key : keys) {
        newFields[index++] = new ResourceFieldSchema(new FieldSchema(key, DataType.CHARARRAY));
      }

      pigSchema.setFields(newFields);

      LOG.debug("Added partition fields: " + keys + " to loader schema");
      LOG.debug("Schema is: " + Arrays.toString(newFields));
    }

    partitionKeysSet = true;
  }

  return pigSchema;
}
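For illustration, a minimal sketch of the effect of the method above, using hypothetical field and partition-key names:

// Sketch (hypothetical names): partition keys are appended as chararray fields.
// Before: pigSchema = (user:chararray, clicks:int)
// Keys:   ["year", "month"]
// After:  getSchema(location, job) returns
//         (user:chararray, clicks:int, year:chararray, month:chararray)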
@Test
public void testGetBagSubSchema() throws Exception {
  // Define the expected schema.
  ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
  bagSubFieldSchemas[0] =
      new ResourceFieldSchema()
          .setName("innertuple")
          .setDescription("The tuple in the bag")
          .setType(DataType.TUPLE);

  ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
  innerTupleFieldSchemas[0] =
      new ResourceFieldSchema().setName("innerfield").setType(DataType.CHARARRAY);

  bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
  ResourceSchema expected = new ResourceSchema().setFields(bagSubFieldSchemas);

  // Get the actual converted schema.
  HCatSchema hCatSchema =
      new HCatSchema(
          Lists.newArrayList(
              new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
  HCatFieldSchema hCatFieldSchema =
      new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, hCatSchema, null);
  ResourceSchema actual = PigHCatUtil.getBagSubSchema(hCatFieldSchema);

  Assert.assertEquals(expected.toString(), actual.toString());
}
/** Wraps a Pig field schema as a tuple. */
public static ResourceFieldSchema wrapAsTuple(ResourceFieldSchema subFieldSchema)
    throws IOException {
  ResourceSchema listSchema = new ResourceSchema();
  listSchema.setFields(new ResourceFieldSchema[] {subFieldSchema});

  ResourceFieldSchema tupleWrapper = new ResourceFieldSchema();
  tupleWrapper.setType(DataType.TUPLE);
  tupleWrapper.setName(PIG_TUPLE_WRAPPER);
  tupleWrapper.setSchema(listSchema);
  return tupleWrapper;
}
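A usage sketch for wrapAsTuple (the field name is hypothetical); it shows that the original field becomes the single inner field of a tuple-typed wrapper:

// Hypothetical usage of wrapAsTuple: wrap a plain chararray field.
ResourceFieldSchema inner =
    new ResourceFieldSchema().setName("innerfield").setType(DataType.CHARARRAY);
ResourceFieldSchema wrapped = wrapAsTuple(inner);
// wrapped.getType() == DataType.TUPLE
// wrapped.getName() is PIG_TUPLE_WRAPPER
// wrapped.getSchema().getFields()[0] == inner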
public void checkSchema(ResourceSchema s) throws IOException {
  // Not actually checking the schema here; just storing it in the UDFContext
  // so it can be used on the backend.
  UDFContext udfc = UDFContext.getUDFContext();
  Properties p = udfc.getUDFProperties(this.getClass(), new String[] {udfContextSignature});
  p.setProperty(SCHEMA_SIGNATURE, s.toString());
}
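A minimal sketch of the matching backend read, assuming the same SCHEMA_SIGNATURE key and udfContextSignature as above, and assuming the stored schema string can be parsed back with org.apache.pig.impl.util.Utils.getSchemaFromString (some StoreFuncs serialize the ResourceSchema object instead):

// Sketch only: recover the schema stored by checkSchema above.
private ResourceSchema readSchemaFromUDFContext() throws IOException {
  Properties p =
      UDFContext.getUDFContext()
          .getUDFProperties(this.getClass(), new String[] {udfContextSignature});
  String schemaStr = p.getProperty(SCHEMA_SIGNATURE);
  if (schemaStr == null) {
    return null; // no schema was stored on the front end
  }
  try {
    // Assumption: ResourceSchema.toString() round-trips through Utils.getSchemaFromString.
    return new ResourceSchema(Utils.getSchemaFromString(schemaStr));
  } catch (ParserException e) {
    throw new IOException("Unable to parse schema stored in UDFContext", e);
  }
}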
@Test
public void testGetBagSubSchemaConfigured() throws Exception {
  // NOTE: pig-0.8 sets client system properties by actually getting the client
  // system properties. Starting in pig-0.9 you must pass the properties in.
  // When updating our Pig dependency this will need to be updated.
  System.setProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME, "t");
  System.setProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME, "FIELDNAME_tuple");
  UDFContext.getUDFContext().setClientSystemProps(System.getProperties());

  // Define the expected schema.
  ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
  bagSubFieldSchemas[0] =
      new ResourceFieldSchema()
          .setName("t")
          .setDescription("The tuple in the bag")
          .setType(DataType.TUPLE);

  ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
  innerTupleFieldSchemas[0] =
      new ResourceFieldSchema().setName("llama_tuple").setType(DataType.CHARARRAY);

  bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
  ResourceSchema expected = new ResourceSchema().setFields(bagSubFieldSchemas);

  // Get the actual converted schema.
  HCatSchema actualHCatSchema =
      new HCatSchema(
          Lists.newArrayList(
              new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
  HCatFieldSchema actualHCatFieldSchema =
      new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, actualHCatSchema, null);
  ResourceSchema actual = PigHCatUtil.getBagSubSchema(actualHCatFieldSchema);

  Assert.assertEquals(expected.toString(), actual.toString());
}
/**
 * Only read the columns that were requested in the constructor.<br>
 *
 * @param struct ColumnarStruct
 * @param path Path
 * @return Tuple
 * @throws IOException
 */
private Tuple readColumnarTuple(ColumnarStruct struct, Path path) throws IOException {
  int[] columnIndexes = getRequiredColumns();

  // the partition keys, if any, will already be in the UDFContext here.
  String[] partitionKeys = getPartitionKeys(null, null);

  // only re-resolve the partition key values if the path has changed.
  if (currentPath == null || !currentPath.equals(path)) {
    currentPathPartitionKeyMap =
        (partitionKeys == null)
            ? null
            : pathPartitionerHelper.getPathPartitionKeyValues(path.toString());
    currentPath = path;
  }

  // if partitionColumns is null, this value stops the for loop below from
  // trying to add any partition columns that do not exist.
  int partitionColumnStartIndex = Integer.MAX_VALUE;

  if (!(partitionColumns == null || partitionColumns.size() == 0)) {
    // partition columns are always appended to the schema fields.
    partitionColumnStartIndex = pigSchema.getFields().length;
  }

  // create a tuple with the size determined above
  Tuple t = tupleFactory.newTuple(columnIndexes.length);

  // read in all columns
  for (int i = 0; i < columnIndexes.length; i++) {
    int columnIndex = columnIndexes[i];

    if (columnIndex < partitionColumnStartIndex) {
      Object obj = struct.getField(columnIndex);
      Object pigType = HiveRCSchemaUtil.extractPigTypeFromHiveType(obj);
      t.set(i, pigType);
    } else {
      // read the partition columns; only executed if partitionColumns is not null
      String key = partitionKeys[columnIndex - partitionColumnStartIndex];
      Object value = currentPathPartitionKeyMap.get(key);
      t.set(i, value);
    }
  }

  return t;
}
@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
  if (schema != null) {
    return schema;
  }
  final Configuration configuration = job.getConfiguration();
  this.initializePhoenixPigConfiguration(location, configuration);
  this.schema = PhoenixPigSchemaUtil.getResourceSchema(this.config);
  if (LOG.isDebugEnabled()) {
    LOG.debug(
        String.format(
            "Resource Schema generated for location [%s] is [%s]", location, schema.toString()));
  }
  this.storeInUDFContext(
      this.contextSignature, RESOURCE_SCHEMA_SIGNATURE, ObjectSerializer.serialize(schema));
  return schema;
}
@Override
public Tuple getNext() throws IOException {
  try {
    if (!reader.nextKeyValue()) {
      return null;
    }
    final PhoenixRecord phoenixRecord = reader.getCurrentValue();
    if (phoenixRecord == null) {
      return null;
    }
    final Tuple tuple = TypeUtil.transformToTuple(phoenixRecord, schema.getFields());
    return tuple;
  } catch (InterruptedException e) {
    int errCode = 6018;
    final String errMsg = "Error while reading input";
    throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
  }
}
@Override
public void prepareToRead(@SuppressWarnings("rawtypes") RecordReader reader, PigSplit split)
    throws IOException {
  this.reader = (HiveRCRecordReader) reader;

  // check that the required indexes actually exist, i.e. the columns that should be read.
  // assuming this is always defined simplifies the readColumnarTuple logic.
  int[] requiredIndexes = getRequiredColumns();
  if (requiredIndexes == null) {
    int fieldLen = pigSchema.getFields().length;

    // the partition keys, if any, should already exist
    String[] partitionKeys = getPartitionKeys(null, null);
    if (partitionKeys != null) {
      fieldLen = partitionKeys.length;
    }

    requiredIndexes = new int[fieldLen];
    for (int i = 0; i < fieldLen; i++) {
      requiredIndexes[i] = i;
    }

    this.requiredColumns = requiredIndexes;
  }

  try {
    serde = new ColumnarSerDe();
    serde.initialize(hiveConf, props);
  } catch (SerDeException e) {
    LOG.error(e.toString(), e);
    throw new IOException(e);
  }
}
@Override
public void storeSchema(ResourceSchema schema, String location, Job job) throws IOException {
  Configuration conf = job.getConfiguration();
  DataStorage storage =
      new HDataStorage(new Path(location).toUri(), ConfigurationUtil.toProperties(conf));
  ElementDescriptor schemaFilePath = storage.asElement(location, schemaFileName);
  if (!schemaFilePath.exists() && schema != null) {
    try {
      new ObjectMapper().writeValue(schemaFilePath.create(), schema);
    } catch (JsonGenerationException e) {
      log.warn("Unable to write Resource Schema for " + location);
      e.printStackTrace();
    } catch (JsonMappingException e) {
      log.warn("Unable to write Resource Schema for " + location);
      e.printStackTrace();
    }
  }

  if (printHeaders) {
    ElementDescriptor headerFilePath = storage.asElement(location, headerFileName);
    if (!headerFilePath.exists()) {
      OutputStream os = headerFilePath.create();
      try {
        String[] names = schema.fieldNames();
        String fn;
        for (int i = 0; i < names.length; i++) {
          fn = ((names[i] == null) ? ("$" + i) : names[i]);
          os.write(fn.getBytes("UTF-8"));
          if (i < names.length - 1) {
            os.write(fieldDel);
          } else {
            os.write(recordDel);
          }
        }
      } finally {
        os.close();
      }
    }
  }
}
/* (non-Javadoc)
 * @see org.apache.pig.builtin.PigStorage#putNext(org.apache.pig.data.Tuple)
 *
 * Given a tuple that corresponds to one record, write it out as CSV, converting
 * between Unix/Windows line breaks as requested in the instantiation. Also take
 * care of escaping field delimiters, double quotes, and line breaks embedded
 * within fields.
 */
@Override
public void putNext(Tuple tupleToWrite) throws IOException {
  // If WRITE_OUTPUT_HEADER, store a header record with the names of each field
  if (storingFirstRecord && headerTreatment == Headers.WRITE_OUTPUT_HEADER && schema != null) {
    ArrayList<Object> headerProtoTuple = new ArrayList<Object>();
    ResourceFieldSchema[] fields = schema.getFields();
    for (ResourceFieldSchema field : fields) {
      headerProtoTuple.add(field.getName());
    }
    super.putNext(tupleMaker.newTuple(headerProtoTuple));
  }
  storingFirstRecord = false;

  ArrayList<Object> mProtoTuple = new ArrayList<Object>();
  int embeddedNewlineIndex = -1;
  int embeddedCarriageReturn = -1;
  String fieldStr = null;
  // For good debug messages:
  int fieldCounter = -1;

  // Do the escaping:
  for (Object field : tupleToWrite.getAll()) {
    fieldCounter++;

    // Substitute a null value with an empty string. See PIG-2470.
    if (field == null) {
      fieldStr = null;
      mProtoTuple.add("");
      continue;
    }

    fieldStr = field.toString();

    // Embedded double quotes are replaced by two double quotes:
    fieldStr = fieldStr.replaceAll("[\"]", "\"\"");

    // If any field delimiters are in the field, or if we did replace
    // any double quotes with a pair of double quotes above,
    // or if the string includes a newline character (LF:\n:0x0A)
    // or includes a carriage return (CR:\r:0x0D)
    // and we are to allow newlines in fields,
    // then the entire field must be enclosed in double quotes:
    embeddedNewlineIndex = fieldStr.indexOf(LINEFEED);
    embeddedCarriageReturn = fieldStr.indexOf(CARRIAGE_RETURN);

    if ((fieldStr.indexOf(fieldDelimiter) != -1)
        || (fieldStr.indexOf(DOUBLE_QUOTE) != -1)
        || (multilineTreatment == Multiline.YES)
            && (embeddedNewlineIndex != -1 || embeddedCarriageReturn != -1)) {
      fieldStr = "\"" + fieldStr + "\"";
    }

    // If requested: replace any lone linefeed (^J) with CRLF (^M^J).
    // This is needed for Excel to recognize a field-internal new line:
    if ((eolTreatment != Linebreaks.NOCHANGE) && (embeddedNewlineIndex != -1)) {
      if (eolTreatment == Linebreaks.WINDOWS) {
        loneLFDetector.reset(fieldStr);
        loneLFDetector.matches();
        fieldStr = loneLFDetector.replaceAll("$1\r\n");
      } else if (eolTreatment == Linebreaks.UNIX) {
        CRLFDetector.reset(fieldStr);
        fieldStr = CRLFDetector.replaceAll("\n");
      }
    }

    mProtoTuple.add(fieldStr);
  }

  // If Windows line breaks are requested, append a carriage return (0x0D, a.k.a. ^M)
  // to the last field so that the row termination ends up being \r\n once the
  // superclass' putNext() method is done below:
  if ((eolTreatment == Linebreaks.WINDOWS) && (fieldStr != null)) {
    mProtoTuple.set(mProtoTuple.size() - 1, fieldStr + "\r");
  }

  Tuple resTuple = tupleMaker.newTuple(mProtoTuple);
  super.putNext(resTuple);
}
public boolean write(
    Object object, ResourceFieldSchema field, Generator generator, boolean writeFieldName) {
  byte type = (field != null ? field.getType() : DataType.findType(object));

  if (writeFieldName) {
    generator.writeFieldName(alias.toES(field.getName()));
  }

  if (object == null) {
    generator.writeNull();
    return true;
  }

  switch (type) {
    case DataType.ERROR:
    case DataType.UNKNOWN:
      return handleUnknown(object, field, generator);
    case DataType.NULL:
      generator.writeNull();
      break;
    case DataType.BOOLEAN:
      generator.writeBoolean((Boolean) object);
      break;
    case DataType.INTEGER:
      generator.writeNumber(((Number) object).intValue());
      break;
    case DataType.LONG:
      generator.writeNumber(((Number) object).longValue());
      break;
    case DataType.FLOAT:
      generator.writeNumber(((Number) object).floatValue());
      break;
    case DataType.DOUBLE:
      generator.writeNumber(((Number) object).doubleValue());
      break;
    case DataType.BYTE:
      generator.writeNumber((Byte) object);
      break;
    case DataType.CHARARRAY:
      generator.writeString(object.toString());
      break;
    case DataType.BYTEARRAY:
      generator.writeBinary(((DataByteArray) object).get());
      break;
    // DateTime introduced in Pig 11
    case 30: // DataType.DATETIME
      generator.writeString(PigUtils.convertDateToES(object));
      break;
    // BigInteger introduced in Pig 12
    case 65: // DataType.BIGINTEGER
      throw new SerializationException(
          "Big integers are not supported by Elasticsearch - consider using a different type (such as string)");
    // BigDecimal introduced in Pig 12
    case 70: // DataType.BIGDECIMAL
      throw new SerializationException(
          "Big decimals are not supported by Elasticsearch - consider using a different type (such as string)");
    case DataType.MAP:
      ResourceSchema nestedSchema = field.getSchema();

      // empty map shortcut
      if (nestedSchema == null) {
        generator.writeBeginObject();
        generator.writeEndObject();
        break;
      }

      ResourceFieldSchema[] nestedFields = nestedSchema.getFields();

      generator.writeBeginObject();
      // Pig maps are actually String -> Object associations so we can write the key right away
      for (Map.Entry<?, ?> entry : ((Map<?, ?>) object).entrySet()) {
        generator.writeFieldName(alias.toES(entry.getKey().toString()));
        write(entry.getValue(), nestedFields[0], generator, false);
      }
      generator.writeEndObject();
      break;

    case DataType.TUPLE:
      nestedSchema = field.getSchema();

      // empty tuple shortcut
      if (nestedSchema == null) {
        generator.writeBeginObject();
        generator.writeEndObject();
        break;
      }

      nestedFields = nestedSchema.getFields();

      // use getAll instead of get(int) to avoid having to handle Exception...
      List<Object> tuples = ((Tuple) object).getAll();

      generator.writeBeginObject();
      for (int i = 0; i < nestedFields.length; i++) {
        String name = nestedFields[i].getName();
        // handle schemas without names
        name = (StringUtils.hasText(name) ? alias.toES(name) : Integer.toString(i));
        generator.writeFieldName(name);
        write(tuples.get(i), nestedFields[i], generator, false);
      }
      generator.writeEndObject();
      break;

    case DataType.BAG:
      nestedSchema = field.getSchema();

      // empty bag shortcut
      if (nestedSchema == null) {
        generator.writeBeginArray();
        generator.writeEndArray();
        break;
      }

      ResourceFieldSchema bagType = nestedSchema.getFields()[0];

      generator.writeBeginArray();
      for (Tuple tuple : (DataBag) object) {
        write(tuple, bagType, generator, false);
      }
      generator.writeEndArray();
      break;

    default:
      if (writeUnknownTypes) {
        return handleUnknown(object, field, generator);
      }
      return false;
  }
  return true;
}