public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList)
        throws FrontendException {
    if (requiredFieldList == null) {
        return null;
    }
    if (requiredFieldList.getFields() != null) {
        int lastColumn = -1;
        for (RequiredField rf : requiredFieldList.getFields()) {
            if (rf.getIndex() > lastColumn) {
                lastColumn = rf.getIndex();
            }
        }
        requiredFields = new boolean[lastColumn + 1];
        for (RequiredField rf : requiredFieldList.getFields()) {
            if (rf.getIndex() != -1) {
                requiredFields[rf.getIndex()] = true;
            }
        }
        Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
        try {
            p.setProperty(REQUIRED_FIELDS_SIGNATURE, ObjectSerializer.serialize(requiredFields));
        } catch (Exception e) {
            throw new RuntimeException("Cannot serialize requiredFields", e);
        }
    }
    return new RequiredFieldResponse(true);
}
public void checkSchema(ResourceSchema s) throws IOException {
    // Not actually checking the schema here; just storing it in the UDFContext
    // so it is available on the backend.
    UDFContext udfc = UDFContext.getUDFContext();
    Properties p = udfc.getUDFProperties(this.getClass(), new String[] {udfContextSignature});
    p.setProperty(SCHEMA_SIGNATURE, s.toString());
}
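The udfContextSignature key used above is handed to the storer by Pig. A minimal sketch of capturing it, assuming the field name from the snippet (the method itself is part of the StoreFunc contract):

@Override
public void setStoreFuncUDFContextSignature(String signature) {
    // Pig calls this on both the front end and the back end with the same value,
    // so the Properties bucket written in checkSchema() can be located again later.
    udfContextSignature = signature;
}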
@Override
public void prepareToRead(RecordReader recordReader, PigSplit pigSplit) throws IOException {
    reader = recordReader;
    if (!requiredFieldsInitialized) {
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p = udfc.getUDFProperties(this.getClass(), new String[] {udfContextSignature});
        requiredFields =
            (boolean[]) ObjectSerializer.deserialize(p.getProperty(REQUIRED_FIELDS_SIGNATURE));
        requiredFieldsInitialized = true;
    }
}
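On the read side, the deserialized mask is typically consulted while assembling each tuple. A minimal sketch of that step (a hypothetical helper, not part of the original class):

private Tuple applyProjection(List<Object> parsedFields) {
    // Keep only the columns flagged in requiredFields; a null mask means
    // no projection was pushed and every column is kept.
    List<Object> kept = new ArrayList<Object>();
    for (int i = 0; i < parsedFields.size(); i++) {
        if (requiredFields == null
                || (i < requiredFields.length && requiredFields[i])) {
            kept.add(parsedFields.get(i));
        }
    }
    return TupleFactory.getInstance().newTupleNoCopy(kept);
}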
private void write(Object part, int id, Vector vector) throws IOException {
    SequenceFile.Writer writer = writers.get(part);
    if (writer == null) {
        // Lazily open one SequenceFile writer per output partition.
        Configuration conf = UDFContext.getUDFContext().getJobConf();
        Path file = PathUtils.enter(getStorePath(), String.valueOf(part), "part-" + Env.getPartID());
        writer = IOUtils.forSequenceWrite(conf, file, IntWritable.class, VectorWritable.class);
        writers.put(part, writer);
    }
    keyWritable.set(id);
    valueWritable.set(vector);
    writer.append(keyWritable, valueWritable);
}
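Writers opened this way should be closed when the storer finishes. A minimal sketch, assuming the writers map above (the helper name is hypothetical):

private void closeWriters() throws IOException {
    // Flush and close every per-partition SequenceFile writer.
    for (SequenceFile.Writer writer : writers.values()) {
        writer.close();
    }
    writers.clear();
}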
public void prepareToWrite(RecordWriter writer) {
    // Get the schema string from the UDFContext object.
    UDFContext udfc = UDFContext.getUDFContext();
    Properties p = udfc.getUDFProperties(this.getClass(), new String[] {udfContextSignature});
    String strSchema = p.getProperty(SCHEMA_SIGNATURE);
    if (strSchema != null) {
        // Parse the schema from the string stored in the properties object.
        try {
            schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));
        } catch (ParserException pex) {
            logger.warn("Could not parse schema for storing.");
        }
    }

    if (headerTreatment == Headers.DEFAULT) {
        headerTreatment = Headers.SKIP_OUTPUT_HEADER;
    }

    // PigStorage's prepareToWrite()
    super.prepareToWrite(writer);
}
public Schema outputSchema(Schema input) {
    try {
        Properties prop = UDFContext.getUDFContext().getUDFProperties(this.getClass());
        String outputAlias = null;
        if (input.size() == 1) {
            Schema.FieldSchema onlyField = input.getField(0);
            outputAlias = onlyField.alias;
            if (onlyField.type == DataType.TUPLE) {
                prop.setProperty(INPUT_TYPE_SIGNATURE, Byte.toString(INPUT_TUPLE_FIELD));
                determineArrayCollectionType(onlyField.schema, prop);
            } else if (onlyField.type == DataType.BAG) {
                prop.setProperty(INPUT_TYPE_SIGNATURE, Byte.toString(INPUT_BAG_FIELD));
                Schema tupleSchema = onlyField.schema.getField(0).schema;
                if (tupleSchema.size() == 1) {
                    determineSetCollectionType(tupleSchema, prop);
                } else if (tupleSchema.size() == 2) {
                    determineMapCollectionType(tupleSchema, prop);
                } else {
                    throw new RuntimeException(
                        "Bag must have either single-element tuples (set) "
                            + "or two-element tuples (key, value) to be encoded as a PigArray.");
                }
            }
        } else {
            prop.setProperty(INPUT_TYPE_SIGNATURE, Byte.toString(INPUT_SEVERAL_FIELDS));
            determineArrayCollectionType(input, prop);
        }
        return new Schema(
            new Schema.FieldSchema(
                outputAlias == null ? "pig_collection" : outputAlias, DataType.BYTEARRAY));
    } catch (FrontendException e) {
        throw new RuntimeException(e);
    }
}
@Test
public void testGetBagSubSchemaConfigured() throws Exception {
    // NOTE: pig-0.8 sets client system properties by actually getting the client
    // system properties. Starting in pig-0.9 you must pass the properties in.
    // When updating our pig dependency this will need to be updated.
    System.setProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME, "t");
    System.setProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME, "FIELDNAME_tuple");
    UDFContext.getUDFContext().setClientSystemProps(System.getProperties());

    // Define the expected schema.
    ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
    bagSubFieldSchemas[0] = new ResourceFieldSchema()
        .setName("t")
        .setDescription("The tuple in the bag")
        .setType(DataType.TUPLE);

    ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
    innerTupleFieldSchemas[0] =
        new ResourceFieldSchema().setName("llama_tuple").setType(DataType.CHARARRAY);

    bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
    ResourceSchema expected = new ResourceSchema().setFields(bagSubFieldSchemas);

    // Get the actual converted schema.
    HCatSchema actualHCatSchema = new HCatSchema(
        Lists.newArrayList(
            new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
    HCatFieldSchema actualHCatFieldSchema =
        new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, actualHCatSchema, null);
    ResourceSchema actual = PigHCatUtil.getBagSubSchema(actualHCatFieldSchema);

    Assert.assertEquals(expected.toString(), actual.toString());
}
private Properties getUDFContext() {
    return UDFContext.getUDFContext()
        .getUDFProperties(this.getClass(), new String[] {signature});
}
public DataByteArray exec(Tuple input) throws IOException {
    try {
        Properties prop = UDFContext.getUDFContext().getUDFProperties(this.getClass());
        byte inputType = Byte.parseByte(prop.getProperty(INPUT_TYPE_SIGNATURE));
        byte arrayType = Byte.parseByte(prop.getProperty(COLLECTION_TYPE_SIGNATURE));

        if (arrayType == PigCollection.INT_ARRAY) {
            Tuple t = getTupleToEncode(input, inputType);
            int[] arr = new int[t.size()];
            for (int i = 0; i < t.size(); i++) {
                arr[i] = (Integer) t.get(i);
            }
            return PigCollection.serialize(arr);
        } else if (arrayType == PigCollection.FLOAT_ARRAY) {
            Tuple t = getTupleToEncode(input, inputType);
            float[] arr = new float[t.size()];
            for (int i = 0; i < t.size(); i++) {
                arr[i] = (Float) t.get(i);
            }
            return PigCollection.serialize(arr);
        } else if (arrayType == PigCollection.INT_INT_MAP) {
            DataBag bag = (DataBag) input.get(0);
            TIntIntHashMap map = new TIntIntHashMap((int) bag.size());
            for (Tuple t : bag) {
                map.put((Integer) t.get(0), (Integer) t.get(1));
            }
            return PigCollection.serialize(map);
        } else if (arrayType == PigCollection.INT_FLOAT_MAP) {
            DataBag bag = (DataBag) input.get(0);
            TIntFloatHashMap map = new TIntFloatHashMap((int) bag.size());
            for (Tuple t : bag) {
                map.put((Integer) t.get(0), (Float) t.get(1));
            }
            return PigCollection.serialize(map);
        } else if (arrayType == PigCollection.STRING_INT_MAP) {
            DataBag bag = (DataBag) input.get(0);
            TObjectIntHashMap map = new TObjectIntHashMap((int) bag.size());
            for (Tuple t : bag) {
                map.put((String) t.get(0), (Integer) t.get(1));
            }
            return PigCollection.serialize(map);
        } else if (arrayType == PigCollection.STRING_FLOAT_MAP) {
            DataBag bag = (DataBag) input.get(0);
            TObjectFloatHashMap map = new TObjectFloatHashMap((int) bag.size());
            for (Tuple t : bag) {
                map.put((String) t.get(0), (Float) t.get(1));
            }
            return PigCollection.serialize(map);
        } else if (arrayType == PigCollection.INT_SET) {
            DataBag bag = (DataBag) input.get(0);
            TIntHashSet set = new TIntHashSet((int) bag.size());
            for (Tuple t : bag) {
                set.add((Integer) t.get(0));
            }
            return PigCollection.serialize(set);
        } else if (arrayType == PigCollection.STRING_SET) {
            DataBag bag = (DataBag) input.get(0);
            Set<String> set = new HashSet<String>();
            for (Tuple t : bag) {
                set.add((String) t.get(0));
            }
            return PigCollection.serialize(set);
        } else {
            throw new RuntimeException("Invalid PigCollection type requested");
        }
    } catch (ExecException e) {
        throw new RuntimeException(e);
    }
}
/* (non-Javadoc)
 * @see org.apache.pig.builtin.PigStorage#getNext()
 */
@Override
public Tuple getNext() throws IOException {
    // If SKIP_INPUT_HEADER is set and this is the first input split, skip the header
    // record. We store its value as a string so we can compare further records to it.
    // If they are the same (this would happen if multiple small files each with a
    // header were combined into one split), we know to skip the duplicate header
    // record as well.
    if (loadingFirstRecord
            && headerTreatment == Headers.SKIP_INPUT_HEADER
            && (splitIndex == 0 || splitIndex == -1)) {
        try {
            if (!in.nextKeyValue())
                return null;
            header = ((Text) in.getCurrentValue()).toString();
        } catch (InterruptedException e) {
            int errCode = 6018;
            String errMsg = "Error while reading input";
            throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
        }
    }
    loadingFirstRecord = false;

    mProtoTuple = new ArrayList<Object>();
    getNextInQuotedField = false;
    boolean evenQuotesSeen = true;
    boolean sawEmbeddedRecordDelimiter = false;
    byte[] buf = null;

    if (!mRequiredColumnsInitialized) {
        if (udfContextSignature != null) {
            Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
            mRequiredColumns =
                (boolean[]) ObjectSerializer.deserialize(p.getProperty(udfContextSignature));
        }
        mRequiredColumnsInitialized = true;
    }

    // Note: we cannot factor out the check for nextKeyValue() being null,
    // because that call overwrites buf with the new line, which is
    // bad if we have a field with a newline.
    try {
        int recordLen = 0;
        getNextFieldID = 0;

        while (sawEmbeddedRecordDelimiter || getNextFieldID == 0) {
            Text value = null;
            if (sawEmbeddedRecordDelimiter) {
                // Deal with pulling more records from the input, because
                // a double-quoted embedded newline was encountered in a field.
                // Save the length of the record so far, plus one byte for the
                // record delimiter (usually newline) that's embedded in the field
                // we were working on before falling into this branch:
                int prevLineLen = recordLen + 1;

                // Save the previous line (the one with the field that has the newline)
                // in a new array. The last byte will be random; we'll fill in the
                // embedded record delimiter (usually newline) below:
                byte[] prevLineSaved = Arrays.copyOf(buf, prevLineLen);
                prevLineSaved[prevLineLen - 1] = RECORD_DEL;

                // Read the continuation of the record, unless EOF:
                if (!in.nextKeyValue()) {
                    return null;
                }
                value = (Text) in.getCurrentValue();
                recordLen = value.getLength();
                // Grab the continuation's bytes:
                buf = value.getBytes();

                // Combine the previous line and the continuation into a new array.
                // The following copyOf() does half the job: it allocates all the
                // space, and also copies the previous line into that space:
                byte[] prevLineAndContinuation = Arrays.copyOf(prevLineSaved, prevLineLen + recordLen);

                // Now append the continuation.
                // Parms: fromBuf, fromStartPos, toBuf, toStartPos, lengthToCopy:
                System.arraycopy(buf, 0, prevLineAndContinuation, prevLineLen, recordLen);

                // We'll work with the combination now:
                buf = prevLineAndContinuation;

                // Do the whole record over from the start:
                mProtoTuple.clear();
                getNextInQuotedField = false;
                evenQuotesSeen = true;
                getNextFieldID = 0;
                recordLen = prevLineAndContinuation.length;
            } else {
                // Previous record finished cleanly: start with the next record,
                // unless EOF:
                if (!in.nextKeyValue()) {
                    return null;
                }
                value = (Text) in.getCurrentValue();

                // If the line is a duplicate header and 'SKIP_INPUT_HEADER' is set, ignore it
                // (this might happen if multiple files each with a header are combined into
                // a single split):
                if (headerTreatment == Headers.SKIP_INPUT_HEADER && value.toString().equals(header)) {
                    if (!in.nextKeyValue())
                        return null;
                    value = (Text) in.getCurrentValue();
                }

                buf = value.getBytes();
                getNextFieldID = 0;
                recordLen = value.getLength();
            }

            nextTupleSkipChar = false;

            ByteBuffer fieldBuffer = ByteBuffer.allocate(recordLen);

            sawEmbeddedRecordDelimiter =
                processOneInRecord(evenQuotesSeen, buf, recordLen, fieldBuffer);

            // The last field is never delimited by a FIELD_DEL, but by
            // the end of the record. So we need to add that last field.
            // The '!sawEmbeddedRecordDelimiter' handles the case of
            // embedded newlines; we are amidst a field, not at
            // the final record:
            if (!sawEmbeddedRecordDelimiter)
                readField(fieldBuffer, getNextFieldID++);
        } // end while
    } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }

    Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
    return t;
}
private String getValueFromUDFContext(final String signature, final String key) {
    final UDFContext udfContext = UDFContext.getUDFContext();
    final Properties props =
        udfContext.getUDFProperties(this.getClass(), new String[] {signature});
    return props.getProperty(key);
}
private void storeInUDFContext(final String signature, final String key, final String value) {
    final UDFContext udfContext = UDFContext.getUDFContext();
    final Properties props =
        udfContext.getUDFProperties(this.getClass(), new String[] {signature});
    props.put(key, value);
}
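A minimal usage sketch for these two helpers; the property key, the field name contextSignature, and the call sites named in the comments are assumptions for illustration only:

// Front end (e.g. in setLocation / setStoreLocation): stash a value under this
// UDF instance's signature.
storeInUDFContext(contextSignature, "example.table.name", "my_table");

// Back end (e.g. in prepareToRead / prepareToWrite): the same signature selects
// the same Properties bucket, so the value stored on the front end is visible here.
String tableName = getValueFromUDFContext(contextSignature, "example.table.name");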
@Override
protected void execute(LogicalExpression op) throws FrontendException {
    if (op instanceof UserFuncExpression) {
        UserFuncExpression udf = (UserFuncExpression) op;
        if (!udf.getEvalFunc().allowCompileTimeCalculation()) {
            return;
        }
    }
    boolean valSet = false;
    Object val = null;
    if (currentWalker.getPlan().getSuccessors(op) != null) {
        // If the expression has successors, fold it only when every successor is constant.
        for (Operator succ : currentWalker.getPlan().getSuccessors(op)) {
            if (!(succ instanceof ConstantExpression)) {
                return;
            }
        }
        // All successors are constant: calculate the value.
        OperatorPlan expLogicalPlan = new LogicalExpressionPlan();
        ((BaseOperatorPlan) currentWalker.getPlan()).moveTree(op, (BaseOperatorPlan) expLogicalPlan);
        PhysicalPlan expPhysicalPlan = new PhysicalPlan();
        Map<Operator, PhysicalOperator> logToPhyMap = new HashMap<Operator, PhysicalOperator>();
        PlanWalker childWalker = new ReverseDependencyOrderWalkerWOSeenChk(expLogicalPlan);

        // Save the old walker and use childWalker as the current walker.
        pushWalker(childWalker);
        ExpToPhyTranslationVisitor expTranslationVisitor =
            new ExpToPhyTranslationVisitor(
                expLogicalPlan, childWalker, currentOp, expPhysicalPlan, logToPhyMap);
        expTranslationVisitor.visit();
        popWalker();

        PhysicalOperator root = expPhysicalPlan.getLeaves().get(0);
        try {
            UDFContext.getUDFContext()
                .addJobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), true));
            PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
            PhysicalOperator.setPigLogger(pigHadoopLogger);
            setDefaultTimeZone();
            val = root.getNext(root.getResultType()).result;
            restoreDefaultTimeZone();
            UDFContext.getUDFContext().addJobConf(null);
        } catch (ExecException e) {
            throw new FrontendException(e);
        }
        valSet = true;
    } else if (op instanceof UserFuncExpression) {
        // A standalone UDF with no successors: evaluate it directly.
        UserFuncExpression udf = (UserFuncExpression) op;
        try {
            UDFContext.getUDFContext()
                .addJobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), true));
            setDefaultTimeZone();
            val = udf.getEvalFunc().exec(null);
            restoreDefaultTimeZone();
            UDFContext.getUDFContext().addJobConf(null);
        } catch (IOException e) {
            throw new FrontendException(e);
        }
        valSet = true;
    }
    if (valSet) {
        ConstantExpression constantExpr;
        constantExpr = new ConstantExpression(currentWalker.getPlan(), val);
        constantExpr.inheritSchema(op);
        currentWalker.getPlan().replace(op, constantExpr);
    }
}