예제 #1
0
 /**
  * Tells whether the given field schema is merely a wrapped tuple, i.e. a TUPLE-typed field whose
  * name equals {@code AvroStorageUtils.PIG_TUPLE_WRAPPER}.
  *
  * @param pigSchema the field schema to inspect
  * @return {@code true} only when the field is a tuple and carries the wrapper name
  */
 public static boolean isTupleWrapper(ResourceFieldSchema pigSchema) {
   if (pigSchema.getType() != DataType.TUPLE) {
     return false;
   }
   final String fieldName = pigSchema.getName();
   return fieldName != null && fieldName.equals(AvroStorageUtils.PIG_TUPLE_WRAPPER);
 }
  /**
   * Parses a textual map of the form {@code [key#value,key#value,...]} from the stream.
   *
   * <p>Everything before the opening {@code '['} is skipped. Keys are read verbatim up to the
   * next {@code '#'}. A value runs until a top-level {@code ','} or the closing {@code ']'};
   * nested {@code (...)}, {@code {...}} and {@code [...]} groups are tracked so that delimiters
   * inside them do not end the value. An empty map ({@code []}) yields an empty result.
   *
   * @param in stream positioned at or before the opening bracket of the map
   * @param fieldSchema schema of the map field; when it (or its nested schema) is {@code null} the
   *     map is treated as untyped and values are kept as {@link DataByteArray}, otherwise values
   *     are converted according to the first field of the nested schema
   * @return the parsed entries; a zero-length value is stored as {@code null}
   * @throws IOException if the stream ends prematurely, a key is empty, or a closing bracket
   *     appears inside a value without any open group
   */
  private Map<String, Object> consumeMap(PushbackInputStream in, ResourceFieldSchema fieldSchema)
      throws IOException {
    int buf;

    // Skip ahead to the opening bracket.
    while ((buf = in.read()) != '[') {
      if (buf == -1) {
        throw new IOException("Unexpect end of map");
      }
    }
    Map<String, Object> m = new HashMap<String, Object>();
    ByteArrayOutputStream mOut = new ByteArrayOutputStream(BUFFER_SIZE);
    // Stays true until the first byte after '[' has been consumed as part of a key; this is what
    // lets "[]" be recognised as an empty map instead of running into end-of-stream.
    boolean emptyMap = true;
    while (true) {
      // Read key (assume key can not contains special character such as #, (, [, {, }, ], )
      while ((buf = in.read()) != '#') {
        if (emptyMap && buf == ']') {
          return m;
        }
        if (buf == -1) {
          throw new IOException("Unexpect end of map");
        }
        emptyMap = false;
        mOut.write(buf);
      }
      String key = bytesToCharArray(mOut.toByteArray());
      if (key.length() == 0) {
        throw new IOException("Map key can not be null");
      }

      // Read value
      mOut.reset();
      // Keeps track of nested tuple/bag/map openers. Nested content is not interpreted here; it
      // is copied into the value bytes as-is.
      Deque<Character> level = new LinkedList<Character>();
      while (true) {
        buf = in.read();
        if (buf == -1) {
          throw new IOException("Unexpect end of map");
        }
        if (buf == '[' || buf == '{' || buf == '(') {
          level.push((char) buf);
        } else if (buf == ']' && level.isEmpty()) {
          // End of map
          break;
        } else if (buf == ']' || buf == '}' || buf == ')') {
          if (level.isEmpty()) {
            throw new IOException("Malformed map");
          }
          // A closer that does not match the innermost opener is left on the stack untouched
          // (lenient, as before); the byte is still copied into the value below.
          if (level.peek() == findStartChar((char) buf)) {
            level.pop();
          }
        } else if (buf == ',' && level.isEmpty()) {
          // Current map item complete
          break;
        }
        mOut.write(buf);
      }
      Object value = null;
      if (fieldSchema != null && fieldSchema.getSchema() != null && mOut.size() > 0) {
        value = bytesToObject(mOut.toByteArray(), fieldSchema.getSchema().getFields()[0]);
      } else if (mOut.size() > 0) {
        // untyped map
        value = new DataByteArray(mOut.toByteArray());
      }
      m.put(key, value);
      mOut.reset();
      if (buf == ']') {
        return m;
      }
    }
  }
예제 #3
0
  /**
   * Wraps a Pig field schema in a tuple: the result is a TUPLE-typed field named
   * {@code PIG_TUPLE_WRAPPER} whose nested schema has the given field as its only member.
   *
   * @param subFieldSchema the field schema to enclose
   * @return a newly created tuple field schema containing {@code subFieldSchema}
   * @throws IOException propagated from the schema-building calls, if any of them throws
   */
  public static ResourceFieldSchema wrapAsTuple(ResourceFieldSchema subFieldSchema)
      throws IOException {
    final ResourceFieldSchema[] singleField = {subFieldSchema};
    final ResourceSchema innerSchema = new ResourceSchema();
    innerSchema.setFields(singleField);

    final ResourceFieldSchema wrapper = new ResourceFieldSchema();
    wrapper.setType(DataType.TUPLE);
    wrapper.setName(PIG_TUPLE_WRAPPER);
    wrapper.setSchema(innerSchema);
    return wrapper;
  }
 /**
  * Reads a textual bag ({@code {tuple,tuple,...}}) from the stream into a default bag.
  *
  * <p>Input before the opening brace is discarded. Each element is parsed with
  * {@code consumeTuple} using the single inner field schema; a {@code null} tuple is not added.
  *
  * @param in stream positioned at or before the bag's opening brace
  * @param fieldSchema schema of the bag field; its nested schema must hold exactly one field
  * @return the bag holding every tuple that was parsed
  * @throws IOException if the schema is null, the stream ends early, or the nested schema does
  *     not have exactly one field
  */
 private DataBag consumeBag(PushbackInputStream in, ResourceFieldSchema fieldSchema)
     throws IOException {
   if (fieldSchema == null) {
     throw new IOException("Schema is null");
   }
   final ResourceFieldSchema[] innerFields = fieldSchema.getSchema().getFields();

   // Discard input up to and including the opening brace.
   for (int c = in.read(); c != '{'; c = in.read()) {
     if (c == -1) {
       throw new IOException("Unexpect end of bag");
     }
   }
   if (innerFields.length != 1) {
     throw new IOException("Only tuple is allowed inside bag schema");
   }
   final ResourceFieldSchema tupleSchema = innerFields[0];
   final DataBag bag = DefaultBagFactory.getInstance().newDefaultBag();

   int separator;
   do {
     final Tuple element = consumeTuple(in, tupleSchema);
     if (element != null) {
       bag.add(element);
     }
     // Advance to the next element separator or to the end of the bag.
     separator = in.read();
     while (separator != '}' && separator != ',') {
       if (separator == -1) {
         throw new IOException("Unexpect end of bag");
       }
       separator = in.read();
     }
   } while (separator != '}');
   return bag;
 }
 /**
  * Converts raw bytes to an object according to the given field schema: complex types are parsed
  * from a stream over the bytes, everything else goes through {@code parseSimpleType}.
  *
  * @param b the raw bytes of the value
  * @param fs schema describing the value's type
  * @return the converted value
  * @throws IOException if the underlying parser fails
  */
 private Object bytesToObject(byte[] b, ResourceFieldSchema fs) throws IOException {
   if (!DataType.isComplex(fs.getType())) {
     return parseSimpleType(b, fs);
   }
   final PushbackInputStream stream = new PushbackInputStream(new ByteArrayInputStream(b));
   return consumeComplexType(stream, fs);
 }
 /**
  * Dispatches parsing of a complex value (bag, tuple or map) to the matching consumer.
  *
  * @param in stream to read the value from
  * @param complexFieldSchema schema whose type selects the parser
  * @return the parsed bag, tuple or map
  * @throws IOException if the type is not BAG, TUPLE or MAP, or if the selected parser fails
  */
 private Object consumeComplexType(PushbackInputStream in, ResourceFieldSchema complexFieldSchema)
     throws IOException {
   final byte kind = complexFieldSchema.getType();
   if (kind == DataType.BAG) {
     return consumeBag(in, complexFieldSchema);
   }
   if (kind == DataType.TUPLE) {
     return consumeTuple(in, complexFieldSchema);
   }
   if (kind == DataType.MAP) {
     return consumeMap(in, complexFieldSchema);
   }
   throw new IOException("Unknown complex data type");
 }
 /**
  * Converts raw bytes to a scalar value according to the type of the given field schema.
  *
  * <p>Each case returns directly, so no case can fall through into the next one. (Previously the
  * BIGDECIMAL case lacked a {@code break} and its result was overwritten by the DATETIME
  * conversion.)
  *
  * @param b the raw bytes of the value
  * @param simpleFieldSchema schema whose type selects the conversion
  * @return the converted value; BYTEARRAY fields are wrapped in a {@link DataByteArray}
  * @throws IOException if the type is not one of the handled simple types, or if the selected
  *     conversion fails
  */
 private Object parseSimpleType(byte[] b, ResourceFieldSchema simpleFieldSchema)
     throws IOException {
   switch (simpleFieldSchema.getType()) {
     case DataType.INTEGER:
       return bytesToInteger(b);
     case DataType.LONG:
       return bytesToLong(b);
     case DataType.FLOAT:
       return bytesToFloat(b);
     case DataType.DOUBLE:
       return bytesToDouble(b);
     case DataType.CHARARRAY:
       return bytesToCharArray(b);
     case DataType.BYTEARRAY:
       return new DataByteArray(b);
     case DataType.BOOLEAN:
       return bytesToBoolean(b);
     case DataType.BIGINTEGER:
       return bytesToBigInteger(b);
     case DataType.BIGDECIMAL:
       return bytesToBigDecimal(b);
     case DataType.DATETIME:
       return bytesToDateTime(b);
     default:
       throw new IOException("Unknown simple data type");
   }
 }
예제 #8
0
  /**
   * Writes one tuple as a CSV record.
   *
   * <p>Before the first record, when header output is requested and a schema is available, a
   * header row holding the schema's field names is written. Each field of the tuple is then
   * rendered with {@code toString()} and escaped: embedded double quotes are doubled, and the
   * field is enclosed in double quotes when it contains the field delimiter, a double quote, or
   * (with multiline fields enabled) a line feed or carriage return. Line breaks inside fields are
   * converted as requested by the end-of-line treatment. {@code null} fields are written as empty
   * strings (see PIG-2470).
   *
   * @param tupleToWrite the record to store
   * @throws IOException if the superclass fails to write a row
   * @see org.apache.pig.builtin.PigStorage#putNext(org.apache.pig.data.Tuple)
   */
  @Override
  public void putNext(Tuple tupleToWrite) throws IOException {
    // If WRITE_OUTPUT_HEADER, store a header record with the names of each field
    if (storingFirstRecord && headerTreatment == Headers.WRITE_OUTPUT_HEADER && schema != null) {
      ArrayList<Object> headerProtoTuple = new ArrayList<Object>();
      for (ResourceFieldSchema field : schema.getFields()) {
        headerProtoTuple.add(field.getName());
      }
      super.putNext(tupleMaker.newTuple(headerProtoTuple));
    }
    storingFirstRecord = false;

    ArrayList<Object> mProtoTuple = new ArrayList<Object>();
    // Escaped text of the most recently processed field; null when that field was null.
    String fieldStr = null;

    // Do the escaping:
    for (Object field : tupleToWrite.getAll()) {
      // Substitute a null value with an empty string. See PIG-2470.
      if (field == null) {
        fieldStr = null;
        mProtoTuple.add("");
        continue;
      }

      // Embedded double quotes are replaced by two double quotes (literal replacement, no regex
      // needed):
      fieldStr = field.toString().replace("\"", "\"\"");

      int embeddedNewlineIndex = fieldStr.indexOf(LINEFEED);
      int embeddedCarriageReturn = fieldStr.indexOf(CARRIAGE_RETURN);
      boolean hasLineBreak = embeddedNewlineIndex != -1 || embeddedCarriageReturn != -1;

      // If any field delimiters are in the field, or if we did replace
      // any double quotes with a pair of double quotes above,
      // or if the string includes a newline character (LF:\n:0x0A)
      //               or includes a carriage return (CR:\r:0x0D)
      // and we are to allow newlines in fields,
      // then the entire field must be enclosed in double quotes:
      if (fieldStr.indexOf(fieldDelimiter) != -1
          || fieldStr.indexOf(DOUBLE_QUOTE) != -1
          || (multilineTreatment == Multiline.YES && hasLineBreak)) {
        fieldStr = "\"" + fieldStr + "\"";
      }

      // If requested: replace any linefeed-only (^J) with CR-LF (^M^J).
      // This is needed for Excel to recognize a field-internal
      // new line. For UNIX treatment, CR-LF sequences become a plain linefeed.
      if (eolTreatment != Linebreaks.NOCHANGE && embeddedNewlineIndex != -1) {
        if (eolTreatment == Linebreaks.WINDOWS) {
          // replaceAll() resets the matcher before scanning, so no matches() call is needed
          // beforehand.
          loneLFDetector.reset(fieldStr);
          fieldStr = loneLFDetector.replaceAll("$1\r\n");
        } else if (eolTreatment == Linebreaks.UNIX) {
          CRLFDetector.reset(fieldStr);
          fieldStr = CRLFDetector.replaceAll("\n");
        }
      }

      mProtoTuple.add(fieldStr);
    }

    // If Windows line breaks are requested, append
    // a carriage return (0x0D a.k.a. ^M) to the last field
    // so that the row termination will end up being
    // \r\n, once the superclass' putNext() method
    // is done below.
    // NOTE(review): when the last field is null (or the tuple is empty) fieldStr is null here and
    // no "\r" is appended, so such rows keep a bare "\n" terminator -- confirm this is intended.
    if (eolTreatment == Linebreaks.WINDOWS && fieldStr != null) {
      mProtoTuple.set(mProtoTuple.size() - 1, fieldStr + "\r");
    }

    Tuple resTuple = tupleMaker.newTuple(mProtoTuple);
    super.putNext(resTuple);
  }
  /**
   * Parses a textual tuple ({@code (field,field,...)}) from the stream.
   *
   * <p>Input before the opening parenthesis is skipped. If a {@code '}'} is met first, it is
   * pushed back onto the stream and {@code null} is returned, which lets a caller that parses a
   * bag detect the end of the bag. With an inner schema, fields are converted one by one
   * according to it; without one, every top-level field is stored as a {@link DataByteArray} and
   * nested tuples/bags/maps are copied uninterpreted.
   *
   * @param in stream positioned at or before the tuple's opening parenthesis
   * @param fieldSchema schema of the tuple field; its nested schema may be null or empty
   * @return the parsed tuple, or {@code null} when a {@code '}'} is found before any tuple
   * @throws IOException if the schema is null, the stream ends early, or (schema-less mode) a
   *     closing bracket does not match the innermost open group
   */
  private Tuple consumeTuple(PushbackInputStream in, ResourceFieldSchema fieldSchema)
      throws IOException {
    if (fieldSchema == null) {
      throw new IOException("Schema is null");
    }
    int buf;
    ByteArrayOutputStream mOut;

    // Skip ahead to the opening parenthesis, stopping early at the end of an enclosing bag.
    while ((buf = in.read()) != '(') {
      if (buf == -1) {
        throw new IOException("Unexpect end of tuple");
      }
      if (buf == '}') {
        in.unread(buf);
        return null;
      }
    }
    Tuple t = TupleFactory.getInstance().newTuple();
    if (fieldSchema.getSchema() != null && fieldSchema.getSchema().getFields().length != 0) {
      ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields();
      // Interpret item inside tuple one by one based on the inner schema
      for (int i = 0; i < fss.length; i++) {
        Object field;
        ResourceFieldSchema fs = fss[i];
        // Fields are separated by ','; the last one is terminated by the closing ')'.
        int delimit = (i == fss.length - 1) ? ')' : ',';

        if (DataType.isComplex(fs.getType())) {
          field = consumeComplexType(in, fs);
          // Discard whatever follows the complex value up to the field delimiter.
          while ((buf = in.read()) != delimit) {
            if (buf == -1) {
              throw new IOException("Unexpect end of tuple");
            }
          }
        } else {
          mOut = new ByteArrayOutputStream(BUFFER_SIZE);
          while ((buf = in.read()) != delimit) {
            if (buf == -1) {
              throw new IOException("Unexpect end of tuple");
            }
            mOut.write(buf);
          }
          field = parseSimpleType(mOut.toByteArray(), fs);
        }
        t.append(field);
      }
    } else {
      // No inner schema, treat everything inside tuple as bytearray.
      // The stack keeps track of nested tuple/bag/map openers; nested content is not
      // interpreted, only copied.
      Deque<Character> level = new LinkedList<Character>();
      mOut = new ByteArrayOutputStream(BUFFER_SIZE);
      while (true) {
        buf = in.read();
        if (buf == -1) {
          throw new IOException("Unexpect end of tuple");
        }
        if (buf == '[' || buf == '{' || buf == '(') {
          level.push((char) buf);
          mOut.write(buf);
        } else if (buf == ')' && level.isEmpty()) {
          // End of tuple
          t.append(new DataByteArray(mOut.toByteArray()));
          break;
        } else if (buf == ',' && level.isEmpty()) {
          // End of a top-level field
          t.append(new DataByteArray(mOut.toByteArray()));
          mOut.reset();
        } else if (buf == ']' || buf == '}' || buf == ')') {
          // A closer with nothing open, or one that does not match the innermost opener, is
          // malformed input. Checking isEmpty() first avoids dereferencing a null peek().
          if (level.isEmpty() || level.peek().charValue() != findStartChar((char) buf)) {
            throw new IOException("Malformed tuple");
          }
          level.pop();
          mOut.write(buf);
        } else {
          mOut.write(buf);
        }
      }
    }
    return t;
  }
  /**
   * Serializes a Pig value through the given generator, dispatching on the Pig data type.
   *
   * <p>The type is taken from {@code field} when it is non-null, otherwise it is derived from the
   * object itself. Maps, tuples and bags are written recursively; the boolean results of those
   * nested calls are not inspected.
   *
   * @param object the value to write; {@code null} is written as a null value
   * @param field schema of the value; may be {@code null} for the type lookup, but it is
   *     dereferenced when {@code writeFieldName} is set and in the MAP/TUPLE/BAG cases
   *     (NOTE(review): a null {@code field} on those paths would raise a NullPointerException --
   *     confirm callers never do that)
   * @param generator the output generator
   * @param writeFieldName whether to emit the field's (alias-translated) name before the value
   * @return {@code true} when the value was written; the result of {@code handleUnknown} for
   *     ERROR/UNKNOWN types and for unlisted types when {@code writeUnknownTypes} is set;
   *     {@code false} for an unlisted type otherwise
   */
  public boolean write(
      Object object, ResourceFieldSchema field, Generator generator, boolean writeFieldName) {
    byte type = (field != null ? field.getType() : DataType.findType(object));

    if (writeFieldName) {
      generator.writeFieldName(alias.toES(field.getName()));
    }

    // A null value is written as such regardless of its declared type.
    if (object == null) {
      generator.writeNull();
      return true;
    }

    switch (type) {
      case DataType.ERROR:
      case DataType.UNKNOWN:
        return handleUnknown(object, field, generator);
      case DataType.NULL:
        generator.writeNull();
        break;
      case DataType.BOOLEAN:
        generator.writeBoolean((Boolean) object);
        break;
      case DataType.INTEGER:
        generator.writeNumber(((Number) object).intValue());
        break;
      case DataType.LONG:
        generator.writeNumber(((Number) object).longValue());
        break;
      case DataType.FLOAT:
        generator.writeNumber(((Number) object).floatValue());
        break;
      case DataType.DOUBLE:
        generator.writeNumber(((Number) object).doubleValue());
        break;
      case DataType.BYTE:
        generator.writeNumber((Byte) object);
        break;
      case DataType.CHARARRAY:
        generator.writeString(object.toString());
        break;
      case DataType.BYTEARRAY:
        generator.writeBinary(((DataByteArray) object).get());
        break;
        // DateTime introduced in Pig 11; the numeric constant is used instead of the symbolic
        // name (presumably so the class still compiles against older Pig versions -- TODO confirm)
      case 30: // DataType.DATETIME
        generator.writeString(PigUtils.convertDateToES(object));
        break;
        // BigInteger introduced in Pig 12
      case 65: // DataType.BIGINTEGER
        throw new SerializationException(
            "Big integers are not supported by Elasticsearch - consider using a different type (such as string)");
        // BigDecimal introduced in Pig 12
      case 70: // DataType.BIGDECIMAL
        throw new SerializationException(
            "Big decimals are not supported by Elasticsearch - consider using a different type (such as string)");
      case DataType.MAP:
        // nestedSchema/nestedFields are declared here and reused by the TUPLE and BAG cases below
        ResourceSchema nestedSchema = field.getSchema();

        // empty map shortcut: without a nested schema an empty object is written
        if (nestedSchema == null) {
          generator.writeBeginObject();
          generator.writeEndObject();
          break;
        }

        ResourceFieldSchema[] nestedFields = nestedSchema.getFields();

        generator.writeBeginObject();
        // Pig maps are actually String -> Object association so we can save the key right away
        for (Map.Entry<?, ?> entry : ((Map<?, ?>) object).entrySet()) {
          generator.writeFieldName(alias.toES(entry.getKey().toString()));
          // every value is written using the first field of the nested schema
          write(entry.getValue(), nestedFields[0], generator, false);
        }
        generator.writeEndObject();
        break;

      case DataType.TUPLE:
        nestedSchema = field.getSchema();

        // empty tuple shortcut: without a nested schema an empty object is written
        if (nestedSchema == null) {
          generator.writeBeginObject();
          generator.writeEndObject();
          break;
        }

        nestedFields = nestedSchema.getFields();

        // use getAll instead of get(int) to avoid having to handle Exception...
        List<Object> tuples = ((Tuple) object).getAll();

        // a tuple is written as an object with one entry per schema field
        generator.writeBeginObject();
        for (int i = 0; i < nestedFields.length; i++) {
          String name = nestedFields[i].getName();
          // handle schemas without names: fall back to the field's position
          name = (StringUtils.hasText(name) ? alias.toES(name) : Integer.toString(i));
          generator.writeFieldName(name);
          write(tuples.get(i), nestedFields[i], generator, false);
        }
        generator.writeEndObject();
        break;

      case DataType.BAG:
        nestedSchema = field.getSchema();

        // empty bag shortcut: without a nested schema an empty array is written
        if (nestedSchema == null) {
          generator.writeBeginArray();
          generator.writeEndArray();
          break;
        }

        // the first field of the nested schema describes every tuple in the bag
        ResourceFieldSchema bagType = nestedSchema.getFields()[0];

        generator.writeBeginArray();
        for (Tuple tuple : (DataBag) object) {
          write(tuple, bagType, generator, false);
        }
        generator.writeEndArray();
        break;
      default:
        if (writeUnknownTypes) {
          return handleUnknown(object, field, generator);
        }
        return false;
    }
    return true;
  }