@Override
  public ResourceSchema getSchema(String location, Job job) throws IOException {

    if (!partitionKeysSet) {
      Set<String> keys = getPartitionColumns(location, job);

    if (keys != null && !keys.isEmpty()) {

        // rebuild the pigSchema to include the new partition keys.
        ResourceFieldSchema[] fields = pigSchema.getFields();

        LOG.debug("Schema: " + Arrays.toString(fields));

        ResourceFieldSchema[] newFields = Arrays.copyOf(fields, fields.length + keys.size());

        int index = fields.length;

        for (String key : keys) {
          newFields[index++] = new ResourceFieldSchema(new FieldSchema(key, DataType.CHARARRAY));
        }

        pigSchema.setFields(newFields);

        LOG.debug("Added partition fields: " + keys + " to loader schema");
        LOG.debug("Schema is: " + Arrays.toString(newFields));
      }

      partitionKeysSet = true;
    }

    return pigSchema;
  }
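The append pattern above can be exercised in isolation. A minimal sketch, assuming a one-column starting schema and two hypothetical partition keys (the class name and field names below are illustrative, not part of the loader):

import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;

public class PartitionKeyAppendSketch {
  public static void main(String[] args) throws Exception {
    // Start from a schema holding a single data column.
    ResourceSchema schema = new ResourceSchema();
    schema.setFields(new ResourceFieldSchema[] {
        new ResourceFieldSchema(new FieldSchema("value", DataType.CHARARRAY))});

    // Hypothetical partition keys, e.g. parsed from a path like year=2024/month=01.
    Set<String> keys = new LinkedHashSet<>(Arrays.asList("year", "month"));

    // Same pattern as getSchema(): grow the field array and fill the tail
    // with one CHARARRAY field per partition key.
    ResourceFieldSchema[] fields = schema.getFields();
    ResourceFieldSchema[] newFields = Arrays.copyOf(fields, fields.length + keys.size());
    int index = fields.length;
    for (String key : keys) {
      newFields[index++] = new ResourceFieldSchema(new FieldSchema(key, DataType.CHARARRAY));
    }
    schema.setFields(newFields);

    // Prints the three fields: value, year, month (all chararray).
    System.out.println(schema);
  }
}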
Example #2
  @Test
  public void testGetBagSubSchema() throws Exception {

    // Define the expected schema.
    ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
    bagSubFieldSchemas[0] =
        new ResourceFieldSchema()
            .setName("innertuple")
            .setDescription("The tuple in the bag")
            .setType(DataType.TUPLE);

    ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
    innerTupleFieldSchemas[0] =
        new ResourceFieldSchema().setName("innerfield").setType(DataType.CHARARRAY);

    bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
    ResourceSchema expected = new ResourceSchema().setFields(bagSubFieldSchemas);

    // Get the actual converted schema.
    HCatSchema hCatSchema =
        new HCatSchema(
            Lists.newArrayList(
                new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
    HCatFieldSchema hCatFieldSchema =
        new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, hCatSchema, null);
    ResourceSchema actual = PigHCatUtil.getBagSubSchema(hCatFieldSchema);

    Assert.assertEquals(expected.toString(), actual.toString());
  }
  /** Wraps a Pig field schema in a single-field tuple schema. */
  public static ResourceFieldSchema wrapAsTuple(ResourceFieldSchema subFieldSchema)
      throws IOException {
    ResourceSchema listSchema = new ResourceSchema();
    listSchema.setFields(new ResourceFieldSchema[] {subFieldSchema});

    ResourceFieldSchema tupleWrapper = new ResourceFieldSchema();
    tupleWrapper.setType(DataType.TUPLE);
    tupleWrapper.setName(PIG_TUPLE_WRAPPER);
    tupleWrapper.setSchema(listSchema);

    return tupleWrapper;
  }
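A short usage sketch in the same JUnit style as the tests above (a hypothetical test, not from the original suite); PIG_TUPLE_WRAPPER is assumed to be a String constant on the enclosing class:

  @Test
  public void testWrapAsTuple() throws Exception {
    ResourceFieldSchema inner =
        new ResourceFieldSchema().setName("innerfield").setType(DataType.CHARARRAY);
    ResourceFieldSchema wrapped = wrapAsTuple(inner);

    // The wrapper is a TUPLE-typed field whose nested schema holds exactly the input field.
    Assert.assertEquals(DataType.TUPLE, wrapped.getType());
    Assert.assertEquals(1, wrapped.getSchema().getFields().length);
    Assert.assertSame(inner, wrapped.getSchema().getFields()[0]);
  }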
Example #4
  public void checkSchema(ResourceSchema s) throws IOException {
    // Not actually checking the schema here; just storing it for use in the backend.

    UDFContext udfc = UDFContext.getUDFContext();
    Properties p = udfc.getUDFProperties(this.getClass(), new String[] {udfContextSignature});
    p.setProperty(SCHEMA_SIGNATURE, s.toString());
  }
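The backend counterpart is not shown above; a minimal sketch of retrieving the stored schema string via the same UDFContext property (the method name is hypothetical, and parsing the string back into a schema is omitted):

  private String getStoredSchemaString() {
    UDFContext udfc = UDFContext.getUDFContext();
    Properties p = udfc.getUDFProperties(this.getClass(), new String[] {udfContextSignature});
    // Returns the value stored by checkSchema(), or null if it was never set.
    return p.getProperty(SCHEMA_SIGNATURE);
  }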
Example #5
  @Test
  public void testGetBagSubSchemaConfigured() throws Exception {

    // NOTE: pig-0.8 populates the client system properties automatically from the
    // actual system properties. Starting in pig-0.9 you must pass the properties in.
    // When updating our Pig dependency this will need to be updated.
    System.setProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME, "t");
    System.setProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME, "FIELDNAME_tuple");
    UDFContext.getUDFContext().setClientSystemProps(System.getProperties());

    // Define the expected schema.
    ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
    bagSubFieldSchemas[0] =
        new ResourceFieldSchema()
            .setName("t")
            .setDescription("The tuple in the bag")
            .setType(DataType.TUPLE);

    ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
    innerTupleFieldSchemas[0] =
        new ResourceFieldSchema().setName("llama_tuple").setType(DataType.CHARARRAY);

    bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
    ResourceSchema expected = new ResourceSchema().setFields(bagSubFieldSchemas);

    // Get the actual converted schema.
    HCatSchema actualHCatSchema =
        new HCatSchema(
            Lists.newArrayList(
                new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
    HCatFieldSchema actualHCatFieldSchema =
        new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, actualHCatSchema, null);
    ResourceSchema actual = PigHCatUtil.getBagSubSchema(actualHCatFieldSchema);

    Assert.assertEquals(expected.toString(), actual.toString());
  }
  /**
   * Only read the columns that were requested in the constructor.<br>
   *
   * @param struct ColumnarStruct
   * @param path Path
   * @return Tuple
   * @throws IOException
   */
  private Tuple readColumnarTuple(ColumnarStruct struct, Path path) throws IOException {

    int[] columnIndexes = getRequiredColumns();
    // the partition keys, if any, will already be in the UDFContext here.
    String[] partitionKeys = getPartitionKeys(null, null);
    // only re-run the path partition key lookup if the path has changed
    if (currentPath == null || !currentPath.equals(path)) {
      currentPathPartitionKeyMap =
          (partitionKeys == null)
              ? null
              : pathPartitionerHelper.getPathPartitionKeyValues(path.toString());
      currentPath = path;
    }

    // if partitionColumns is null, this value stops the for loop below
    // from trying to add partition columns that do not exist
    int partitionColumnStartIndex = Integer.MAX_VALUE;

    if (partitionColumns != null && !partitionColumns.isEmpty()) {
      // partition columns are always appended to the schema fields.
      partitionColumnStartIndex = pigSchema.getFields().length;
    }

    // create a tuple with the previously determined size
    Tuple t = tupleFactory.newTuple(columnIndexes.length);

    // read in all columns
    for (int i = 0; i < columnIndexes.length; i++) {
      int columnIndex = columnIndexes[i];

      if (columnIndex < partitionColumnStartIndex) {
        Object obj = struct.getField(columnIndex);
        Object pigType = HiveRCSchemaUtil.extractPigTypeFromHiveType(obj);

        t.set(i, pigType);

      } else {
        // read the partition columns
        // will only be executed if partitionColumns is not null
        String key = partitionKeys[columnIndex - partitionColumnStartIndex];
        Object value = currentPathPartitionKeyMap.get(key);
        t.set(i, value);
      }
    }

    return t;
  }
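The pathPartitionerHelper itself is not shown. A minimal sketch of the kind of key/value extraction it performs on Hive-style partition paths, assuming segments of the form key=value (the method name is hypothetical; uses java.util.LinkedHashMap):

  private static Map<String, String> parsePartitionKeyValues(String path) {
    // e.g. "/data/logs/daydate=2024-01-01/hour=00/part-00000"
    //   -> {daydate=2024-01-01, hour=00}
    Map<String, String> keyValues = new LinkedHashMap<String, String>();
    for (String segment : path.split("/")) {
      int eq = segment.indexOf('=');
      if (eq > 0) {
        keyValues.put(segment.substring(0, eq), segment.substring(eq + 1));
      }
    }
    return keyValues;
  }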
 @Override
 public ResourceSchema getSchema(String location, Job job) throws IOException {
   if (schema != null) {
     return schema;
   }
   final Configuration configuration = job.getConfiguration();
   this.initializePhoenixPigConfiguration(location, configuration);
   this.schema = PhoenixPigSchemaUtil.getResourceSchema(this.config);
   if (LOG.isDebugEnabled()) {
     LOG.debug(
         String.format(
             "Resource Schema generated for location [%s] is [%s]", location, schema.toString()));
   }
   this.storeInUDFContext(
       this.contextSignature, RESOURCE_SCHEMA_SIGNATURE, ObjectSerializer.serialize(schema));
   return schema;
 }
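A hedged sketch of the matching backend read, assuming storeInUDFContext keys the property by RESOURCE_SCHEMA_SIGNATURE under the same contextSignature; ObjectSerializer.deserialize reverses the serialize call above:

   Properties props =
       UDFContext.getUDFContext()
           .getUDFProperties(this.getClass(), new String[] {contextSignature});
   String serialized = props.getProperty(RESOURCE_SCHEMA_SIGNATURE);
   if (serialized != null) {
     // Restore the ResourceSchema that getSchema() serialized on the frontend.
     schema = (ResourceSchema) ObjectSerializer.deserialize(serialized);
   }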
 @Override
 public Tuple getNext() throws IOException {
   try {
     if (!reader.nextKeyValue()) {
       return null;
     }
     final PhoenixRecord phoenixRecord = reader.getCurrentValue();
     if (phoenixRecord == null) {
       return null;
     }
     final Tuple tuple = TypeUtil.transformToTuple(phoenixRecord, schema.getFields());
     return tuple;
   } catch (InterruptedException e) {
     int errCode = 6018;
     final String errMsg = "Error while reading input";
     throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
   }
 }
  @Override
  public void prepareToRead(@SuppressWarnings("rawtypes") RecordReader reader, PigSplit split)
      throws IOException {

    this.reader = (HiveRCRecordReader) reader;

    // Check that the required indexes (i.e. the columns that should be read)
    // are defined; assuming they are always defined simplifies the
    // readColumnarTuple logic.

    int[] requiredIndexes = getRequiredColumns();
    if (requiredIndexes == null) {

      int fieldLen = pigSchema.getFields().length;

      // the partition keys, if any, should already exist in the UDFContext
      String[] partitionKeys = getPartitionKeys(null, null);
      if (partitionKeys != null) {
        fieldLen = partitionKeys.length;
      }

      requiredIndexes = new int[fieldLen];

      for (int i = 0; i < fieldLen; i++) {
        requiredIndexes[i] = i;
      }

      this.requiredColumns = requiredIndexes;
    }

    try {
      serde = new ColumnarSerDe();
      serde.initialize(hiveConf, props);
    } catch (SerDeException e) {
      LOG.error(e.toString(), e);
      throw new IOException(e);
    }
  }
 @Override
 public void storeSchema(ResourceSchema schema, String location, Job job) throws IOException {
   Configuration conf = job.getConfiguration();
   DataStorage storage =
       new HDataStorage(new Path(location).toUri(), ConfigurationUtil.toProperties(conf));
   ElementDescriptor schemaFilePath = storage.asElement(location, schemaFileName);
   if (!schemaFilePath.exists() && schema != null) {
     try {
       new ObjectMapper().writeValue(schemaFilePath.create(), schema);
      } catch (JsonGenerationException e) {
        log.warn("Unable to write Resource Schema for " + location, e);
      } catch (JsonMappingException e) {
        log.warn("Unable to write Resource Schema for " + location, e);
      }
   }
   if (printHeaders) {
     ElementDescriptor headerFilePath = storage.asElement(location, headerFileName);
     if (!headerFilePath.exists()) {
       OutputStream os = headerFilePath.create();
       try {
         String[] names = schema.fieldNames();
         String fn;
         for (int i = 0; i < names.length; i++) {
           fn = ((names[i] == null) ? ("$" + i) : names[i]);
           os.write(fn.getBytes("UTF-8"));
           if (i < names.length - 1) {
             os.write(fieldDel);
           } else {
             os.write(recordDel);
           }
         }
       } finally {
         os.close();
       }
     }
   }
 }
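The matching load side is not included above; a minimal sketch assuming the same storage and schemaFileName fields, reading the JSON schema file back with the same Jackson mapper:

   ElementDescriptor schemaFilePath = storage.asElement(location, schemaFileName);
   if (schemaFilePath.exists()) {
     InputStream is = schemaFilePath.open();
     try {
       // Deserialize the ResourceSchema written by storeSchema() above.
       ResourceSchema schema = new ObjectMapper().readValue(is, ResourceSchema.class);
     } finally {
       is.close();
     }
   }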
Example #11
  /* (non-Javadoc)
   * @see org.apache.pig.builtin.PigStorage#putNext(org.apache.pig.data.Tuple)
   *
   * Given a tuple that corresponds to one record, write
   * it out as CSV, converting among Unix/Windows line
   * breaks as requested in the instantiation. Also take
   * care of escaping field delimiters, double quotes,
   * and line breaks embedded within fields.
   *
   */
  @Override
  public void putNext(Tuple tupleToWrite) throws IOException {
    // If WRITE_OUTPUT_HEADER, store a header record with the names of each field
    if (storingFirstRecord && headerTreatment == Headers.WRITE_OUTPUT_HEADER && schema != null) {
      ArrayList<Object> headerProtoTuple = new ArrayList<Object>();
      ResourceFieldSchema[] fields = schema.getFields();
      for (ResourceFieldSchema field : fields) {
        headerProtoTuple.add(field.getName());
      }
      super.putNext(tupleMaker.newTuple(headerProtoTuple));
    }
    storingFirstRecord = false;

    ArrayList<Object> mProtoTuple = new ArrayList<Object>();
    int embeddedNewlineIndex = -1;
    int embeddedCarriageReturn = -1;
    String fieldStr = null;
    // For good debug messages:
    int fieldCounter = -1;

    // Do the escaping:
    for (Object field : tupleToWrite.getAll()) {
      fieldCounter++;

      // Substitute a null value with an empty string. See PIG-2470.
      if (field == null) {
        fieldStr = null;
        mProtoTuple.add("");
        continue;
      }

      fieldStr = field.toString();

      // Embedded double quotes are replaced by two double quotes:
      fieldStr = fieldStr.replaceAll("[\"]", "\"\"");

      // If any field delimiters are in the field, or if we did replace
      // any double quotes with a pair of double quotes above,
      // or if the string includes a newline character (LF:\n:0x0A)
      //               or includes a carriage return (CR:\r:0x0D)
      // and we are to allow newlines in fields,
      // then the entire field must be enclosed in double quotes:
      embeddedNewlineIndex = fieldStr.indexOf(LINEFEED);
      embeddedCarriageReturn = fieldStr.indexOf(CARRIAGE_RETURN);

      if ((fieldStr.indexOf(fieldDelimiter) != -1)
          || (fieldStr.indexOf(DOUBLE_QUOTE) != -1)
          || ((multilineTreatment == Multiline.YES)
              && (embeddedNewlineIndex != -1 || embeddedCarriageReturn != -1))) {
        fieldStr = "\"" + fieldStr + "\"";
      }

      // If requested, replace any lone linefeed (^J) with CRLF (^M^J).
      // This is needed for Excel to recognize a field-internal
      // newline:

      if ((eolTreatment != Linebreaks.NOCHANGE) && (embeddedNewlineIndex != -1)) {
        if (eolTreatment == Linebreaks.WINDOWS) {
          loneLFDetector.reset(fieldStr);
          loneLFDetector.matches();
          fieldStr = loneLFDetector.replaceAll("$1\r\n");
        } else if (eolTreatment == Linebreaks.UNIX) {
          CRLFDetector.reset(fieldStr);
          fieldStr = CRLFDetector.replaceAll("\n");
        }
      }

      mProtoTuple.add(fieldStr);
    }
    // If Windows line breaks are requested, append
    // a carriage return (0x0D, a.k.a. ^M) to the last field
    // so that the row termination will end up being
    // \r\n once the superclass' putNext() method
    // is done below:

    if ((eolTreatment == Linebreaks.WINDOWS) && (fieldStr != null))
      mProtoTuple.set(mProtoTuple.size() - 1, fieldStr + "\r");

    Tuple resTuple = tupleMaker.newTuple(mProtoTuple);
    super.putNext(resTuple);
  }
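The quoting rules above can be condensed into a standalone helper for illustration (a hypothetical method, not part of the storer; the multiline setting is folded into the newline checks):

  public static String escapeCsvField(String field, char fieldDelimiter) {
    // Double any embedded double quotes first...
    String escaped = field.replaceAll("[\"]", "\"\"");
    // ...then quote the whole field if it contains the delimiter, a double
    // quote, or an embedded line break.
    if (escaped.indexOf(fieldDelimiter) != -1
        || escaped.indexOf('"') != -1
        || escaped.indexOf('\n') != -1
        || escaped.indexOf('\r') != -1) {
      escaped = "\"" + escaped + "\"";
    }
    return escaped;
  }

  // escapeCsvField("say \"hi\", bye", ',') returns: "say ""hi"", bye" (quoted)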
  public boolean write(
      Object object, ResourceFieldSchema field, Generator generator, boolean writeFieldName) {
    byte type = (field != null ? field.getType() : DataType.findType(object));

    if (writeFieldName) {
      generator.writeFieldName(alias.toES(field.getName()));
    }

    if (object == null) {
      generator.writeNull();
      return true;
    }

    switch (type) {
      case DataType.ERROR:
      case DataType.UNKNOWN:
        return handleUnknown(object, field, generator);
      case DataType.NULL:
        generator.writeNull();
        break;
      case DataType.BOOLEAN:
        generator.writeBoolean((Boolean) object);
        break;
      case DataType.INTEGER:
        generator.writeNumber(((Number) object).intValue());
        break;
      case DataType.LONG:
        generator.writeNumber(((Number) object).longValue());
        break;
      case DataType.FLOAT:
        generator.writeNumber(((Number) object).floatValue());
        break;
      case DataType.DOUBLE:
        generator.writeNumber(((Number) object).doubleValue());
        break;
      case DataType.BYTE:
        generator.writeNumber((Byte) object);
        break;
      case DataType.CHARARRAY:
        generator.writeString(object.toString());
        break;
      case DataType.BYTEARRAY:
        generator.writeBinary(((DataByteArray) object).get());
        break;
        // DateTime introduced in Pig 11
      case 30: // DataType.DATETIME
        generator.writeString(PigUtils.convertDateToES(object));
        break;
        // BigInteger introduced in Pig 12
      case 65: // DataType.BIGINTEGER
        throw new SerializationException(
            "Big integers are not supported by Elasticsearch - consider using a different type (such as string)");
        // BigDecimal introduced in Pig 12
      case 70: // DataType.BIGDECIMAL
        throw new SerializationException(
            "Big decimals are not supported by Elasticsearch - consider using a different type (such as string)");
      case DataType.MAP:
        ResourceSchema nestedSchema = field.getSchema();

        // empty map shortcut
        if (nestedSchema == null) {
          generator.writeBeginObject();
          generator.writeEndObject();
          break;
        }

        ResourceFieldSchema[] nestedFields = nestedSchema.getFields();

        generator.writeBeginObject();
        // Pig maps are actually String -> Object associations, so we can write the key right away
        for (Map.Entry<?, ?> entry : ((Map<?, ?>) object).entrySet()) {
          generator.writeFieldName(alias.toES(entry.getKey().toString()));
          write(entry.getValue(), nestedFields[0], generator, false);
        }
        generator.writeEndObject();
        break;

      case DataType.TUPLE:
        nestedSchema = field.getSchema();

        // empty tuple shortcut
        if (nestedSchema == null) {
          generator.writeBeginObject();
          generator.writeEndObject();
          break;
        }

        nestedFields = nestedSchema.getFields();

        // use getAll instead of get(int) to avoid having to handle Exception...
        List<Object> tuples = ((Tuple) object).getAll();

        generator.writeBeginObject();
        for (int i = 0; i < nestedFields.length; i++) {
          String name = nestedFields[i].getName();
          // handle schemas without names
          name = (StringUtils.hasText(name) ? alias.toES(name) : Integer.toString(i));
          generator.writeFieldName(name);
          write(tuples.get(i), nestedFields[i], generator, false);
        }
        generator.writeEndObject();
        break;

      case DataType.BAG:
        nestedSchema = field.getSchema();

        // empty bag shortcut
        if (nestedSchema == null) {
          generator.writeBeginArray();
          generator.writeEndArray();
          break;
        }

        ResourceFieldSchema bagType = nestedSchema.getFields()[0];

        generator.writeBeginArray();
        for (Tuple tuple : (DataBag) object) {
          write(tuple, bagType, generator, false);
        }
        generator.writeEndArray();
        break;
      default:
        if (writeUnknownTypes) {
          return handleUnknown(object, field, generator);
        }
        return false;
    }
    return true;
  }
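The fallback branch of the dispatch above relies on DataType.findType mapping a runtime object to a Pig type byte. A small standalone demo (the class name is illustrative):

import org.apache.pig.data.DataType;

public class FindTypeDemo {
  public static void main(String[] args) {
    // DataType.findType inspects the runtime class of its argument.
    System.out.println(DataType.findType("text") == DataType.CHARARRAY); // true
    System.out.println(DataType.findType(42) == DataType.INTEGER);       // true
    System.out.println(DataType.findType(42L) == DataType.LONG);         // true
    System.out.println(DataType.findType(null) == DataType.NULL);        // true
  }
}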