Example #1
  public void checkSchema(ResourceSchema s) throws IOException {
    // Not actually checking the schema here; just storing it in the
    // UDFContext so the back end can use it.

    UDFContext udfc = UDFContext.getUDFContext();
    Properties p = udfc.getUDFProperties(this.getClass(), new String[] {udfContextSignature});
    p.setProperty(SCHEMA_SIGNATURE, s.toString());
  }
 @Override
 public void prepareToRead(RecordReader recordReader, PigSplit pigSplit) throws IOException {
   reader = recordReader;
   if (!requiredFieldsInitialized) {
     UDFContext udfc = UDFContext.getUDFContext();
     Properties p = udfc.getUDFProperties(this.getClass(), new String[] {udfContextSignature});
     requiredFields =
         (boolean[]) ObjectSerializer.deserialize(p.getProperty(REQUIRED_FIELDS_SIGNATURE));
     requiredFieldsInitialized = true;
   }
 }
  public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList)
      throws FrontendException {
    if (requiredFieldList == null) {
      return null;
    }
    if (requiredFieldList.getFields() != null) {
      int lastColumn = -1;
      for (RequiredField rf : requiredFieldList.getFields()) {
        if (rf.getIndex() > lastColumn) {
          lastColumn = rf.getIndex();
        }
      }
      requiredFields = new boolean[lastColumn + 1];
      for (RequiredField rf : requiredFieldList.getFields()) {
        if (rf.getIndex() != -1) requiredFields[rf.getIndex()] = true;
      }
      Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
      try {
        p.setProperty(REQUIRED_FIELDS_SIGNATURE, ObjectSerializer.serialize(requiredFields));
      } catch (Exception e) {
        throw new RuntimeException("Cannot serialize mRequiredColumns");
      }
    }

    return new RequiredFieldResponse(true);
  }
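
The methods above key their UDFContext properties off a udfContextSignature field. Pig hands that signature string to the UDF itself; for a loader this happens through LoadFunc.setUDFContextSignature (a StoreFunc receives it via setStoreFuncUDFContextSignature). A minimal sketch of the loader-side override, assuming the enclosing class extends org.apache.pig.LoadFunc and declares the String udfContextSignature field used above:

  // Sketch (assumption: enclosing class extends org.apache.pig.LoadFunc).
  // Pig calls this on both the front end and the back end with the same string,
  // so properties written in pushProjection() can be read back in prepareToRead().
  @Override
  public void setUDFContextSignature(String signature) {
    this.udfContextSignature = signature;
  }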
Example #4
 private void write(Object part, int id, Vector vector) throws IOException {
   SequenceFile.Writer writer = writers.get(part);
   if (writer == null) {
     Configuration conf = UDFContext.getUDFContext().getJobConf();
     Path file = PathUtils.enter(getStorePath(), String.valueOf(part), "part-" + Env.getPartID());
     writer = IOUtils.forSequenceWrite(conf, file, IntWritable.class, VectorWritable.class);
     writers.put(part, writer);
   }
   keyWritable.set(id);
   valueWritable.set(vector);
   writer.append(keyWritable, valueWritable);
 }
Example #5
  public void prepareToWrite(RecordWriter writer) {
    // Get the schema string from the UDFContext object.
    UDFContext udfc = UDFContext.getUDFContext();
    Properties p = udfc.getUDFProperties(this.getClass(), new String[] {udfContextSignature});

    String strSchema = p.getProperty(SCHEMA_SIGNATURE);
    if (strSchema != null) {
      // Parse the schema from the string stored in the properties object.
      try {
        schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));
      } catch (ParserException pex) {
        logger.warn("Could not parse schema for storing.");
      }
    }

    if (headerTreatment == Headers.DEFAULT) {
      headerTreatment = Headers.SKIP_OUTPUT_HEADER;
    }

    // PigStorage's prepareToWrite()
    super.prepareToWrite(writer);
  }
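
This looks like the back-end counterpart of the checkSchema() snippet in Example #1: the schema string saved under SCHEMA_SIGNATURE on the front end is parsed back into a ResourceSchema before any records are written.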
  public Schema outputSchema(Schema input) {
    try {
      Properties prop = UDFContext.getUDFContext().getUDFProperties(this.getClass());
      String outputAlias = null;

      if (input.size() == 1) {
        Schema.FieldSchema onlyField = input.getField(0);
        outputAlias = onlyField.alias;
        if (onlyField.type == DataType.TUPLE) {
          prop.setProperty(INPUT_TYPE_SIGNATURE, Byte.toString(INPUT_TUPLE_FIELD));
          determineArrayCollectionType(onlyField.schema, prop);
        } else if (onlyField.type == DataType.BAG) {
          prop.setProperty(INPUT_TYPE_SIGNATURE, Byte.toString(INPUT_BAG_FIELD));

          Schema tupleSchema = onlyField.schema.getField(0).schema;
          if (tupleSchema.size() == 1) {
            determineSetCollectionType(tupleSchema, prop);
          } else if (tupleSchema.size() == 2) {
            determineMapCollectionType(tupleSchema, prop);
          } else {
            throw new RuntimeException(
                "Bag must have either single-element tuples (set) "
                    + "or two-element tuples (key, value) to be encoded as a PigArray.");
          }
        }
      } else {
        prop.setProperty(INPUT_TYPE_SIGNATURE, Byte.toString(INPUT_SEVERAL_FIELDS));
        determineArrayCollectionType(input, prop);
      }

      return new Schema(
          new Schema.FieldSchema(
              outputAlias == null ? "pig_collection" : outputAlias, DataType.BYTEARRAY));
    } catch (FrontendException e) {
      throw new RuntimeException(e);
    }
  }
Example #7
  @Test
  public void testGetBagSubSchemaConfigured() throws Exception {

    // NOTE: pig-0.8 sets client system properties by actually getting the client
    // system properties. Starting in pig-0.9 you must pass the properties in.
    // When updating our pig dependency this will need to be updated.
    System.setProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME, "t");
    System.setProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME, "FIELDNAME_tuple");
    UDFContext.getUDFContext().setClientSystemProps(System.getProperties());

    // Define the expected schema.
    ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
    bagSubFieldSchemas[0] =
        new ResourceFieldSchema()
            .setName("t")
            .setDescription("The tuple in the bag")
            .setType(DataType.TUPLE);

    ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
    innerTupleFieldSchemas[0] =
        new ResourceFieldSchema().setName("llama_tuple").setType(DataType.CHARARRAY);

    bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
    ResourceSchema expected = new ResourceSchema().setFields(bagSubFieldSchemas);

    // Get the actual converted schema.
    HCatSchema actualHCatSchema =
        new HCatSchema(
            Lists.newArrayList(
                new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
    HCatFieldSchema actualHCatFieldSchema =
        new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, actualHCatSchema, null);
    ResourceSchema actual = PigHCatUtil.getBagSubSchema(actualHCatFieldSchema);

    Assert.assertEquals(expected.toString(), actual.toString());
  }
 private Properties getUDFContext() {
   return UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[] {signature});
 }
  public DataByteArray exec(Tuple input) throws IOException {
    try {
      Properties prop = UDFContext.getUDFContext().getUDFProperties(this.getClass());
      byte inputType = Byte.parseByte(prop.getProperty(INPUT_TYPE_SIGNATURE));
      byte arrayType = Byte.parseByte(prop.getProperty(COLLECTION_TYPE_SIGNATURE));

      if (arrayType == PigCollection.INT_ARRAY) {
        Tuple t = getTupleToEncode(input, inputType);
        int[] arr = new int[t.size()];
        for (int i = 0; i < t.size(); i++) {
          arr[i] = (Integer) t.get(i);
        }
        return PigCollection.serialize(arr);
      } else if (arrayType == PigCollection.FLOAT_ARRAY) {
        Tuple t = getTupleToEncode(input, inputType);
        float[] arr = new float[t.size()];
        for (int i = 0; i < t.size(); i++) {
          arr[i] = (Float) t.get(i);
        }
        return PigCollection.serialize(arr);
      } else if (arrayType == PigCollection.INT_INT_MAP) {
        DataBag bag = (DataBag) input.get(0);
        TIntIntHashMap map = new TIntIntHashMap((int) bag.size());
        for (Tuple t : bag) {
          map.put((Integer) t.get(0), (Integer) t.get(1));
        }
        return PigCollection.serialize(map);
      } else if (arrayType == PigCollection.INT_FLOAT_MAP) {
        DataBag bag = (DataBag) input.get(0);
        TIntFloatHashMap map = new TIntFloatHashMap((int) bag.size());
        for (Tuple t : bag) {
          map.put((Integer) t.get(0), (Float) t.get(1));
        }
        return PigCollection.serialize(map);
      } else if (arrayType == PigCollection.STRING_INT_MAP) {
        DataBag bag = (DataBag) input.get(0);
        TObjectIntHashMap map = new TObjectIntHashMap((int) bag.size());
        for (Tuple t : bag) {
          map.put((String) t.get(0), (Integer) t.get(1));
        }
        return PigCollection.serialize(map);
      } else if (arrayType == PigCollection.STRING_FLOAT_MAP) {
        DataBag bag = (DataBag) input.get(0);
        TObjectFloatHashMap map = new TObjectFloatHashMap((int) bag.size());
        for (Tuple t : bag) {
          map.put((String) t.get(0), (Float) t.get(1));
        }
        return PigCollection.serialize(map);
      } else if (arrayType == PigCollection.INT_SET) {
        DataBag bag = (DataBag) input.get(0);
        TIntHashSet set = new TIntHashSet((int) bag.size());
        for (Tuple t : bag) {
          set.add((Integer) t.get(0));
        }
        return PigCollection.serialize(set);
      } else if (arrayType == PigCollection.STRING_SET) {
        DataBag bag = (DataBag) input.get(0);
        Set<String> set = new HashSet<String>();
        for (Tuple t : bag) {
          set.add((String) t.get(0));
        }
        return PigCollection.serialize(set);
      } else {
        throw new RuntimeException("Invalid PigCollection type requested");
      }
    } catch (ExecException e) {
      throw new RuntimeException(e);
    }
  }
Example #10
  /* (non-Javadoc)
   * @see org.apache.pig.builtin.PigStorage#getNext()
   */
  @Override
  public Tuple getNext() throws IOException {
    // If SKIP_INPUT_HEADER and this is the first input split, skip header record
    // We store its value as a string though, so we can compare
    // further records to it. If they are the same (this would
    // happen if multiple small files each with a header were combined
    // into one split), we know to skip the duplicate header record as well.
    if (loadingFirstRecord
        && headerTreatment == Headers.SKIP_INPUT_HEADER
        && (splitIndex == 0 || splitIndex == -1)) {
      try {
        if (!in.nextKeyValue()) return null;
        header = ((Text) in.getCurrentValue()).toString();
      } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
      }
    }
    loadingFirstRecord = false;

    mProtoTuple = new ArrayList<Object>();

    getNextInQuotedField = false;
    boolean evenQuotesSeen = true;
    boolean sawEmbeddedRecordDelimiter = false;
    byte[] buf = null;

    if (!mRequiredColumnsInitialized) {
      if (udfContextSignature != null) {
        Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
        mRequiredColumns =
            (boolean[]) ObjectSerializer.deserialize(p.getProperty(udfContextSignature));
      }
      mRequiredColumnsInitialized = true;
    }
    // Note: we cannot factor out the check for nextKeyValue() being null,
    // because that call overwrites buf with the new line, which is
    // bad if we have a field with a newline.

    try {
      int recordLen = 0;
      getNextFieldID = 0;

      while (sawEmbeddedRecordDelimiter || getNextFieldID == 0) {
        Text value = null;
        if (sawEmbeddedRecordDelimiter) {

          // Deal with pulling more records from the input, because
          // a double quoted embedded newline was encountered in a field.
          // Save the length of the record so far, plus one byte for the
          // record delimiter (usually newline) that's embedded in the field
          // we were working on before falling into this branch:
          int prevLineLen = recordLen + 1;

          // Save previous line (the one with the field that has the newline) in a new array.
          // The last byte will be random; we'll fill in the embedded
          // record delimiter (usually newline) below:
          byte[] prevLineSaved = Arrays.copyOf(buf, prevLineLen);
          prevLineSaved[prevLineLen - 1] = RECORD_DEL;

          // Read the continuation of the record, unless EOF:
          if (!in.nextKeyValue()) {
            return null;
          }
          value = (Text) in.getCurrentValue();
          recordLen = value.getLength();
          // Grab the continuation's bytes:
          buf = value.getBytes();

          // Combine the previous line and the continuation into a new array.
          // The following copyOf() does half the job: it allocates all the
          // space, and also copies the previous line into that space:
          byte[] prevLineAndContinuation = Arrays.copyOf(prevLineSaved, prevLineLen + recordLen);

          // Now append the continuation. Params: fromBuf, fromStartPos, toBuf, toStartPos,
          // lengthToCopy:
          System.arraycopy(buf, 0, prevLineAndContinuation, prevLineLen, recordLen);

          // We'll work with the combination now:
          buf = prevLineAndContinuation;

          // Do the whole record over from the start:
          mProtoTuple.clear();
          getNextInQuotedField = false;
          evenQuotesSeen = true;
          getNextFieldID = 0;
          recordLen = prevLineAndContinuation.length;

        } else {
          // Previous record finished cleanly: start with the next record,
          // unless EOF:
          if (!in.nextKeyValue()) {
            return null;
          }
          value = (Text) in.getCurrentValue();

          // if the line is a duplicate header and 'SKIP_INPUT_HEADER' is set, ignore it
          // (this might happen if multiple files each with a header are combined into a single
          // split)
          if (headerTreatment == Headers.SKIP_INPUT_HEADER && value.toString().equals(header)) {
            if (!in.nextKeyValue()) return null;
            value = (Text) in.getCurrentValue();
          }

          buf = value.getBytes();
          getNextFieldID = 0;
          recordLen = value.getLength();
        }

        nextTupleSkipChar = false;

        ByteBuffer fieldBuffer = ByteBuffer.allocate(recordLen);

        sawEmbeddedRecordDelimiter =
            processOneInRecord(evenQuotesSeen, buf, recordLen, fieldBuffer);

        // The last field is never delimited by a FIELD_DEL, but by
        // the end of the record. So we need to add that last field.
        // The '!sawEmbeddedRecordDelimiter' handles the case of
        // embedded newlines; we are amidst a field, not at
        // the final record:
        if (!sawEmbeddedRecordDelimiter) readField(fieldBuffer, getNextFieldID++);
      } // end while

    } catch (InterruptedException e) {
      int errCode = 6018;
      String errMsg = "Error while reading input";
      throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }

    Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
    return t;
  }
 private String getValueFromUDFContext(final String signature, final String key) {
   final UDFContext udfContext = UDFContext.getUDFContext();
   final Properties props = udfContext.getUDFProperties(this.getClass(), new String[] {signature});
   return props.getProperty(key);
 }
 private void storeInUDFContext(final String signature, final String key, final String value) {
   final UDFContext udfContext = UDFContext.getUDFContext();
   final Properties props = udfContext.getUDFProperties(this.getClass(), new String[] {signature});
   props.put(key, value);
 }
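
These two helpers wrap the common pattern of handing a value computed on the front end to the back end through UDFContext. A minimal usage sketch follows; the key name and the surrounding methods are assumptions for illustration, not part of the original class, and a String signature field populated via setUDFContextSignature is assumed:

  // Sketch only -- TABLE_NAME_KEY and these overrides are hypothetical.
  private static final String TABLE_NAME_KEY = "table.name";

  @Override
  public void setLocation(String location, Job job) throws IOException {
    // Front end: remember the location under the signature Pig assigned us.
    storeInUDFContext(signature, TABLE_NAME_KEY, location);
  }

  @Override
  public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
    // Back end: read the value back through the same signature.
    String tableName = getValueFromUDFContext(signature, TABLE_NAME_KEY);
    // ... use tableName to configure the reader ...
  }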
      @Override
      protected void execute(LogicalExpression op) throws FrontendException {
        if (op instanceof UserFuncExpression) {
          UserFuncExpression udf = (UserFuncExpression) op;
          if (!udf.getEvalFunc().allowCompileTimeCalculation()) {
            return;
          }
        }
        boolean valSet = false;
        Object val = null;
        if (currentWalker.getPlan().getSuccessors(op) != null) {
          // If has successors and all successors are constant, calculate the constant
          for (Operator succ : currentWalker.getPlan().getSuccessors(op)) {
            if (!(succ instanceof ConstantExpression)) {
              return;
            }
          }
          // All successors are constant, calculate the value
          OperatorPlan expLogicalPlan = new LogicalExpressionPlan();
          ((BaseOperatorPlan) currentWalker.getPlan())
              .moveTree(op, (BaseOperatorPlan) expLogicalPlan);
          PhysicalPlan expPhysicalPlan = new PhysicalPlan();
          Map<Operator, PhysicalOperator> logToPhyMap = new HashMap<Operator, PhysicalOperator>();
          PlanWalker childWalker = new ReverseDependencyOrderWalkerWOSeenChk(expLogicalPlan);

          // Save the old walker and use childWalker as current Walker
          pushWalker(childWalker);
          ExpToPhyTranslationVisitor expTranslationVisitor =
              new ExpToPhyTranslationVisitor(
                  expLogicalPlan, childWalker, currentOp, expPhysicalPlan, logToPhyMap);
          expTranslationVisitor.visit();
          popWalker();
          PhysicalOperator root = expPhysicalPlan.getLeaves().get(0);
          try {
            UDFContext.getUDFContext()
                .addJobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), true));
            PigHadoopLogger pigHadoopLogger = PigHadoopLogger.getInstance();
            PhysicalOperator.setPigLogger(pigHadoopLogger);
            setDefaultTimeZone();
            val = root.getNext(root.getResultType()).result;
            restoreDefaultTimeZone();
            UDFContext.getUDFContext().addJobConf(null);
          } catch (ExecException e) {
            throw new FrontendException(e);
          }
          valSet = true;
        } else if (op instanceof UserFuncExpression) {
          // If solo UDF, calculate UDF
          UserFuncExpression udf = (UserFuncExpression) op;
          try {
            UDFContext.getUDFContext()
                .addJobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), true));
            setDefaultTimeZone();
            val = udf.getEvalFunc().exec(null);
            restoreDefaultTimeZone();
            UDFContext.getUDFContext().addJobConf(null);
          } catch (IOException e) {
            throw new FrontendException(e);
          }
          valSet = true;
        }
        if (valSet) {
          ConstantExpression constantExpr;
          constantExpr = new ConstantExpression(currentWalker.getPlan(), val);
          constantExpr.inheritSchema(op);
          currentWalker.getPlan().replace(op, constantExpr);
        }
      }