/**
 * Checks whether the given field schema is just a wrapped tuple.
 *
 * <p>A field counts as a tuple wrapper when its type is {@code DataType.TUPLE} and its
 * name is non-null and equal to {@code AvroStorageUtils.PIG_TUPLE_WRAPPER}.
 *
 * @param pigSchema the Pig field schema to inspect; must not be null
 * @return {@code true} if the field is a tuple carrying the wrapper name, {@code false} otherwise
 */
 public static boolean isTupleWrapper(ResourceFieldSchema pigSchema) {
   // Anything that is not a tuple cannot be a wrapper.
   if (pigSchema.getType() != DataType.TUPLE) {
     return false;
   }
   final String fieldName = pigSchema.getName();
   return fieldName != null && fieldName.equals(AvroStorageUtils.PIG_TUPLE_WRAPPER);
 }
Example #2
0
  /* (non-Javadoc)
   * @see org.apache.pig.builtin.PigStorage#putNext(org.apache.pig.data.Tuple)
   *
   * Given a tuple that corresponds to one record, write
   * it out as CSV, converting among Unix/Windows line
   * breaks as requested in the instantiation. Also take
   * care of escaping field delimiters, double quotes,
   * and linebreaks embedded within fields.
   *
   * Processing steps:
   *   1. On the first record only, and only if WRITE_OUTPUT_HEADER was
   *      requested and a schema is known, emit a header row made of the
   *      schema's field names.
   *   2. Turn every field into its escaped CSV string form; null fields
   *      become empty strings (see PIG-2470).
   *   3. If Windows line breaks are requested, append a carriage return
   *      to the last field of the row.
   *   4. Hand the resulting tuple to the superclass' putNext().
   *
   * NOTE(review): the header row is passed to the superclass as-is; the
   * field names are neither escaped nor given a trailing carriage return
   * in Windows mode. Confirm whether that is intended.
   */
  @Override
  public void putNext(Tuple tupleToWrite) throws IOException {
    // If WRITE_OUTPUT_HEADER, store a header record with the names of each field
    if (storingFirstRecord && headerTreatment == Headers.WRITE_OUTPUT_HEADER && schema != null) {
      ArrayList<Object> headerProtoTuple = new ArrayList<Object>();
      for (ResourceFieldSchema field : schema.getFields()) {
        headerProtoTuple.add(field.getName());
      }
      super.putNext(tupleMaker.newTuple(headerProtoTuple));
    }
    storingFirstRecord = false;

    ArrayList<Object> mProtoTuple = new ArrayList<Object>();

    // Do the escaping:
    for (Object field : tupleToWrite.getAll()) {

      // Substitute a null value with an empty string. See PIG-2470.
      if (field == null) {
        mProtoTuple.add("");
        continue;
      }

      // Embedded double quotes are replaced by two double quotes.
      // This is a literal replacement, so no regular expression is needed.
      String fieldStr = field.toString().replace("\"", "\"\"");

      // If any field delimiters are in the field, or if we did replace
      // any double quotes with a pair of double quotes above,
      // or if the string includes a newline character (LF:\n:0x0A)
      //               or includes a carriage return (CR:\r:0x0D)
      // and we are to allow newlines in fields,
      // then the entire field must be enclosed in double quotes:
      int embeddedNewlineIndex = fieldStr.indexOf(LINEFEED);
      int embeddedCarriageReturn = fieldStr.indexOf(CARRIAGE_RETURN);

      boolean hasDelimiter = fieldStr.indexOf(fieldDelimiter) != -1;
      boolean hasQuote = fieldStr.indexOf(DOUBLE_QUOTE) != -1;
      boolean hasLinebreak = (embeddedNewlineIndex != -1) || (embeddedCarriageReturn != -1);

      // The multiline condition is grouped explicitly; it only applies to
      // embedded line breaks, not to delimiters or quotes.
      if (hasDelimiter || hasQuote || ((multilineTreatment == Multiline.YES) && hasLinebreak)) {
        fieldStr = "\"" + fieldStr + "\"";
      }

      // If requested: replace any Linefeed-only (^J), with LF-Newline (^M^J),
      // This is needed for Excel to recognize a field-internal
      // new line:
      if ((eolTreatment != Linebreaks.NOCHANGE) && (embeddedNewlineIndex != -1)) {
        if (eolTreatment == Linebreaks.WINDOWS) {
          // replaceAll() resets the matcher itself, so no prior match attempt is needed.
          loneLFDetector.reset(fieldStr);
          fieldStr = loneLFDetector.replaceAll("$1\r\n");
        } else if (eolTreatment == Linebreaks.UNIX) {
          CRLFDetector.reset(fieldStr);
          fieldStr = CRLFDetector.replaceAll("\n");
        }
      }

      mProtoTuple.add(fieldStr);
    }

    // If Windows line breaks are requested, append
    // a carriage return (0x0D a.k.a. ^M) to the last field
    // so that the row termination will end up being
    // \r\n, once the superclass' putNext() method
    // is done below.
    //
    // The last stored element is used rather than the last loop value so
    // that a row whose final field was null (stored as "") still gets its
    // carriage return. An empty row has no last field and is left untouched.
    if ((eolTreatment == Linebreaks.WINDOWS) && !mProtoTuple.isEmpty()) {
      int lastIndex = mProtoTuple.size() - 1;
      mProtoTuple.set(lastIndex, mProtoTuple.get(lastIndex) + "\r");
    }

    Tuple resTuple = tupleMaker.newTuple(mProtoTuple);
    super.putNext(resTuple);
  }
  /**
   * Writes a single Pig value to the given generator, recursing into maps, tuples and bags.
   *
   * <p>The Pig type is taken from {@code field} when it is non-null, otherwise it is derived
   * from the object itself via {@code DataType.findType}. Note that {@code field} is
   * dereferenced without a null check when {@code writeFieldName} is {@code true} and when
   * the resolved type is MAP, TUPLE or BAG, so callers must supply a schema in those cases.
   *
   * @param object the value to write; {@code null} is written as a null value
   * @param field the Pig schema describing {@code object}, may be null for simple values
   * @param generator the sink receiving the serialized output
   * @param writeFieldName whether to emit the (alias-translated) name of {@code field} first
   * @return {@code true} if the value was written; for ERROR/UNKNOWN types, and for
   *     unrecognized types when {@code writeUnknownTypes} is set, the result of
   *     {@code handleUnknown}; {@code false} for unrecognized types otherwise
   * @throws SerializationException for big integer and big decimal values (type codes 65, 70)
   */
  public boolean write(
      Object object, ResourceFieldSchema field, Generator generator, boolean writeFieldName) {
    final byte pigType;
    if (field != null) {
      pigType = field.getType();
    } else {
      pigType = DataType.findType(object);
    }

    if (writeFieldName) {
      generator.writeFieldName(alias.toES(field.getName()));
    }

    // A null value is written as such regardless of its declared type.
    if (object == null) {
      generator.writeNull();
      return true;
    }

    switch (pigType) {
      case DataType.ERROR:
      case DataType.UNKNOWN:
        return handleUnknown(object, field, generator);

      case DataType.NULL:
        generator.writeNull();
        return true;

      case DataType.BOOLEAN:
        generator.writeBoolean((Boolean) object);
        return true;

      case DataType.INTEGER:
        generator.writeNumber(((Number) object).intValue());
        return true;

      case DataType.LONG:
        generator.writeNumber(((Number) object).longValue());
        return true;

      case DataType.FLOAT:
        generator.writeNumber(((Number) object).floatValue());
        return true;

      case DataType.DOUBLE:
        generator.writeNumber(((Number) object).doubleValue());
        return true;

      case DataType.BYTE:
        generator.writeNumber((Byte) object);
        return true;

      case DataType.CHARARRAY:
        generator.writeString(object.toString());
        return true;

      case DataType.BYTEARRAY:
        generator.writeBinary(((DataByteArray) object).get());
        return true;

      // The numeric codes below stand in for constants introduced in newer Pig releases.
      case 30: // DataType.DATETIME (Pig 11)
        generator.writeString(PigUtils.convertDateToES(object));
        return true;

      case 65: // DataType.BIGINTEGER (Pig 12)
        throw new SerializationException(
            "Big integers are not supported by Elasticsearch - consider using a different type (such as string)");

      case 70: // DataType.BIGDECIMAL (Pig 12)
        throw new SerializationException(
            "Big decimals are not supported by Elasticsearch - consider using a different type (such as string)");

      case DataType.MAP: {
        ResourceSchema valueSchema = field.getSchema();

        // Without a nested schema the map is emitted as an empty object.
        if (valueSchema == null) {
          generator.writeBeginObject();
          generator.writeEndObject();
          return true;
        }

        ResourceFieldSchema[] valueFields = valueSchema.getFields();

        generator.writeBeginObject();
        // Pig maps are String -> Object associations, so the key can be written right away;
        // every value is described by the first nested field.
        for (Map.Entry<?, ?> mapEntry : ((Map<?, ?>) object).entrySet()) {
          String key = mapEntry.getKey().toString();
          generator.writeFieldName(alias.toES(key));
          write(mapEntry.getValue(), valueFields[0], generator, false);
        }
        generator.writeEndObject();
        return true;
      }

      case DataType.TUPLE: {
        ResourceSchema tupleSchema = field.getSchema();

        // Without a nested schema the tuple is emitted as an empty object.
        if (tupleSchema == null) {
          generator.writeBeginObject();
          generator.writeEndObject();
          return true;
        }

        ResourceFieldSchema[] memberFields = tupleSchema.getFields();

        // getAll() is used instead of get(int) to avoid having to handle an exception.
        List<Object> members = ((Tuple) object).getAll();

        generator.writeBeginObject();
        int position = 0;
        for (ResourceFieldSchema memberField : memberFields) {
          String memberName = memberField.getName();
          // Unnamed members fall back to their position within the tuple.
          if (StringUtils.hasText(memberName)) {
            memberName = alias.toES(memberName);
          } else {
            memberName = Integer.toString(position);
          }
          generator.writeFieldName(memberName);
          write(members.get(position), memberField, generator, false);
          position++;
        }
        generator.writeEndObject();
        return true;
      }

      case DataType.BAG: {
        ResourceSchema bagSchema = field.getSchema();

        // Without a nested schema the bag is emitted as an empty array.
        if (bagSchema == null) {
          generator.writeBeginArray();
          generator.writeEndArray();
          return true;
        }

        // Every tuple in the bag is described by the first nested field.
        ResourceFieldSchema elementField = bagSchema.getFields()[0];

        generator.writeBeginArray();
        for (Tuple element : (DataBag) object) {
          write(element, elementField, generator, false);
        }
        generator.writeEndArray();
        return true;
      }

      default:
        return writeUnknownTypes ? handleUnknown(object, field, generator) : false;
    }
  }