Esempio n. 1
0
 /** check whether it is just a wrapped tuple */
 public static boolean isTupleWrapper(ResourceFieldSchema pigSchema) {
   Boolean status = false;
   if (pigSchema.getType() == DataType.TUPLE)
     if (pigSchema.getName() != null)
       if (pigSchema.getName().equals(AvroStorageUtils.PIG_TUPLE_WRAPPER)) status = true;
   return status;
 }
 private Object bytesToObject(byte[] b, ResourceFieldSchema fs) throws IOException {
   Object field;
   if (DataType.isComplex(fs.getType())) {
     ByteArrayInputStream bis = new ByteArrayInputStream(b);
     PushbackInputStream in = new PushbackInputStream(bis);
     field = consumeComplexType(in, fs);
   } else {
     field = parseSimpleType(b, fs);
   }
   return field;
 }
 private Object consumeComplexType(PushbackInputStream in, ResourceFieldSchema complexFieldSchema)
     throws IOException {
   Object field;
   switch (complexFieldSchema.getType()) {
     case DataType.BAG:
       field = consumeBag(in, complexFieldSchema);
       break;
     case DataType.TUPLE:
       field = consumeTuple(in, complexFieldSchema);
       break;
     case DataType.MAP:
       field = consumeMap(in, complexFieldSchema);
       break;
     default:
       throw new IOException("Unknown complex data type");
   }
   return field;
 }
 private Object parseSimpleType(byte[] b, ResourceFieldSchema simpleFieldSchema)
     throws IOException {
   Object field;
   switch (simpleFieldSchema.getType()) {
     case DataType.INTEGER:
       field = bytesToInteger(b);
       break;
     case DataType.LONG:
       field = bytesToLong(b);
       break;
     case DataType.FLOAT:
       field = bytesToFloat(b);
       break;
     case DataType.DOUBLE:
       field = bytesToDouble(b);
       break;
     case DataType.CHARARRAY:
       field = bytesToCharArray(b);
       break;
     case DataType.BYTEARRAY:
       field = new DataByteArray(b);
       break;
     case DataType.BOOLEAN:
       field = bytesToBoolean(b);
       break;
     case DataType.BIGINTEGER:
       field = bytesToBigInteger(b);
       break;
     case DataType.BIGDECIMAL:
       field = bytesToBigDecimal(b);
     case DataType.DATETIME:
       field = bytesToDateTime(b);
       break;
     default:
       throw new IOException("Unknown simple data type");
   }
   return field;
 }
  private Tuple consumeTuple(PushbackInputStream in, ResourceFieldSchema fieldSchema)
      throws IOException {
    if (fieldSchema == null) {
      throw new IOException("Schema is null");
    }
    int buf;
    ByteArrayOutputStream mOut;

    while ((buf = in.read()) != '(' || buf == '}') {
      if (buf == -1) {
        throw new IOException("Unexpect end of tuple");
      }
      if (buf == '}') {
        in.unread(buf);
        return null;
      }
    }
    Tuple t = TupleFactory.getInstance().newTuple();
    if (fieldSchema.getSchema() != null && fieldSchema.getSchema().getFields().length != 0) {
      ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields();
      // Interpret item inside tuple one by one based on the inner schema
      for (int i = 0; i < fss.length; i++) {
        Object field;
        ResourceFieldSchema fs = fss[i];
        int delimit = ',';
        if (i == fss.length - 1) delimit = ')';

        if (DataType.isComplex(fs.getType())) {
          field = consumeComplexType(in, fs);
          while ((buf = in.read()) != delimit) {
            if (buf == -1) {
              throw new IOException("Unexpect end of tuple");
            }
          }
        } else {
          mOut = new ByteArrayOutputStream(BUFFER_SIZE);
          while ((buf = in.read()) != delimit) {
            if (buf == -1) {
              throw new IOException("Unexpect end of tuple");
            }
            if (buf == delimit) break;
            mOut.write(buf);
          }
          field = parseSimpleType(mOut.toByteArray(), fs);
        }
        t.append(field);
      }
    } else {
      // No inner schema, treat everything inside tuple as bytearray
      Deque<Character> level =
          new LinkedList<
              Character>(); // keep track of nested tuple/bag/map. We do not interpret, save them as
      // bytearray
      mOut = new ByteArrayOutputStream(BUFFER_SIZE);
      while (true) {
        buf = in.read();
        if (buf == -1) {
          throw new IOException("Unexpect end of tuple");
        }
        if (buf == '[' || buf == '{' || buf == '(') {
          level.push((char) buf);
          mOut.write(buf);
        } else if (buf == ')' && level.isEmpty()) // End of tuple
        {
          DataByteArray value = new DataByteArray(mOut.toByteArray());
          t.append(value);
          break;
        } else if (buf == ',' && level.isEmpty()) {
          DataByteArray value = new DataByteArray(mOut.toByteArray());
          t.append(value);
          mOut.reset();
        } else if (buf == ']' || buf == '}' || buf == ')') {
          if (level.peek() == findStartChar((char) buf)) level.pop();
          else throw new IOException("Malformed tuple");
          mOut.write(buf);
        } else mOut.write(buf);
      }
    }
    return t;
  }
  public boolean write(
      Object object, ResourceFieldSchema field, Generator generator, boolean writeFieldName) {
    byte type = (field != null ? field.getType() : DataType.findType(object));

    if (writeFieldName) {
      generator.writeFieldName(alias.toES(field.getName()));
    }

    if (object == null) {
      generator.writeNull();
      return true;
    }

    switch (type) {
      case DataType.ERROR:
      case DataType.UNKNOWN:
        return handleUnknown(object, field, generator);
      case DataType.NULL:
        generator.writeNull();
        break;
      case DataType.BOOLEAN:
        generator.writeBoolean((Boolean) object);
        break;
      case DataType.INTEGER:
        generator.writeNumber(((Number) object).intValue());
        break;
      case DataType.LONG:
        generator.writeNumber(((Number) object).longValue());
        break;
      case DataType.FLOAT:
        generator.writeNumber(((Number) object).floatValue());
        break;
      case DataType.DOUBLE:
        generator.writeNumber(((Number) object).doubleValue());
        break;
      case DataType.BYTE:
        generator.writeNumber((Byte) object);
        break;
      case DataType.CHARARRAY:
        generator.writeString(object.toString());
        break;
      case DataType.BYTEARRAY:
        generator.writeBinary(((DataByteArray) object).get());
        break;
        // DateTime introduced in Pig 11
      case 30: // DataType.DATETIME
        generator.writeString(PigUtils.convertDateToES(object));
        break;
        // DateTime introduced in Pig 12
      case 65: // DataType.BIGINTEGER
        throw new SerializationException(
            "Big integers are not supported by Elasticsearch - consider using a different type (such as string)");
        // DateTime introduced in Pig 12
      case 70: // DataType.BIGDECIMAL
        throw new SerializationException(
            "Big decimals are not supported by Elasticsearch - consider using a different type (such as string)");
      case DataType.MAP:
        ResourceSchema nestedSchema = field.getSchema();

        // empty tuple shortcut
        if (nestedSchema == null) {
          generator.writeBeginObject();
          generator.writeEndObject();
          break;
        }

        ResourceFieldSchema[] nestedFields = nestedSchema.getFields();

        generator.writeBeginObject();
        // Pig maps are actually String -> Object association so we can save the key right away
        for (Map.Entry<?, ?> entry : ((Map<?, ?>) object).entrySet()) {
          generator.writeFieldName(alias.toES(entry.getKey().toString()));
          write(entry.getValue(), nestedFields[0], generator, false);
        }
        generator.writeEndObject();
        break;

      case DataType.TUPLE:
        nestedSchema = field.getSchema();

        // empty tuple shortcut
        if (nestedSchema == null) {
          generator.writeBeginObject();
          generator.writeEndObject();
          break;
        }

        nestedFields = nestedSchema.getFields();

        // use getAll instead of get(int) to avoid having to handle Exception...
        List<Object> tuples = ((Tuple) object).getAll();

        generator.writeBeginObject();
        for (int i = 0; i < nestedFields.length; i++) {
          String name = nestedFields[i].getName();
          // handle schemas without names
          name = (StringUtils.hasText(name) ? alias.toES(name) : Integer.toString(i));
          generator.writeFieldName(name);
          write(tuples.get(i), nestedFields[i], generator, false);
        }
        generator.writeEndObject();
        break;

      case DataType.BAG:
        nestedSchema = field.getSchema();

        // empty tuple shortcut
        if (nestedSchema == null) {
          generator.writeBeginArray();
          generator.writeEndArray();
          break;
        }

        ResourceFieldSchema bagType = nestedSchema.getFields()[0];

        generator.writeBeginArray();
        for (Tuple tuple : (DataBag) object) {
          write(tuple, bagType, generator, false);
        }
        generator.writeEndArray();
        break;
      default:
        if (writeUnknownTypes) {
          return handleUnknown(object, field, generator);
        }
        return false;
    }
    return true;
  }