Exemplo n.º 1
0
 /**
  * Feeds one input record to every entry of {@code currentCtxs}.
  *
  * <p>If an execution context is available and reports that the input file changed,
  * {@code cleanUpInputFileChanged()} runs first. Each context is then asked to read a row from
  * {@code value} and forward it; contexts whose {@code forward} call returns {@code false} are
  * counted and that count is handed to {@code rowsForwarded(count, 1)}.
  *
  * @param value the raw record passed to each context's {@code readRow}
  * @throws HiveException wrapping any exception raised while reading or forwarding; when no row
  *     object had been obtained yet, {@code deserialize_error_count} is incremented as well
  */
 public void process(Writable value) throws HiveException {
   final ExecMapperContext execContext = getExecContext();

   // A mapper can span multiple files/partitions, so downstream state has to be
   // cleaned up whenever the input file switches.
   final boolean fileSwitched = execContext != null && execContext.inputFileChanged();
   if (fileSwitched) {
     cleanUpInputFileChanged();
   }

   int finishedChildren = 0;
   for (MapOpCtx opCtx : currentCtxs) {
     Object deserialized = null;
     try {
       deserialized = opCtx.readRow(value, execContext);
       final boolean accepted = opCtx.forward(deserialized);
       if (!accepted) {
         finishedChildren += 1;
       }
     } catch (Exception failure) {
       // TODO: policy on deserialization errors
       // The message is built before the null check, exactly as the failure is classified below.
       final String detail = toErrorMessage(value, deserialized, opCtx.rowObjectInspector);
       final String prefix;
       if (deserialized == null) {
         // No row object was available when the failure happened: count it.
         deserialize_error_count.set(deserialize_error_count.get() + 1);
         prefix = "Hive Runtime Error while processing writable ";
       } else {
         prefix = "Hive Runtime Error while processing row ";
       }
       throw new HiveException(prefix + detail, failure);
     }
   }
   rowsForwarded(finishedChildren, 1);
 }
Exemplo n.º 2
0
 /**
  * Tells whether the local work attached to {@code mapContext} is flagged as sensitive to input
  * file changes.
  *
  * @param mapContext execution context to inspect; may be {@code null}
  * @return {@code false} when the context or its local work is {@code null}; otherwise the value
  *     of the local work's {@code getInputFileChangeSensitive()}
  */
 protected boolean isInputFileChangeSensitive(ExecMapperContext mapContext) {
   if (mapContext == null) {
     return false;
   }
   if (mapContext.getLocalWork() == null) {
     return false;
   }
   return mapContext.getLocalWork().getInputFileChangeSensitive();
 }
  /**
   * Prepares this record handler for reduce-side processing.
   *
   * <p>After delegating to {@code super.init}, the method reads the {@code ReduceWork} for
   * {@code job}, instantiates and initializes the key deserializer and one value deserializer per
   * tag, and builds a row object inspector for each tag. In vectorized mode it additionally
   * creates a {@code VectorizedRowBatch} and a list of expression writers for each tag. It then
   * passes a fresh {@code ExecMapperContext} to the reducer, hands the reducer and {@code output}
   * to {@code OperatorUtils.setChildrenCollector}, and initializes the reducer plus any dummy
   * parent operators of the local work.
   *
   * <p>The work is bracketed by {@code PerfLogBegin}/{@code PerfLogEnd} for
   * {@code SPARK_INIT_OPERATORS}; {@code PerfLogEnd} is only reached when no exception is thrown.
   *
   * @param job job configuration used to look up the {@code ReduceWork} and to build the
   *     execution context
   * @param output collector passed, together with the reducer, to
   *     {@code OperatorUtils.setChildrenCollector}
   * @param reporter forwarded to {@code super.init}
   * @throws Exception declared by the signature; NOTE(review): presumably whatever
   *     {@code super.init} propagates - confirm
   * @throws RuntimeException if deserializer/object-inspector setup fails, or if operator
   *     initialization fails with anything other than an {@code OutOfMemoryError}
   * @throws OutOfMemoryError rethrown unchanged when raised during operator initialization
   */
  @Override
  @SuppressWarnings("unchecked")
  public void init(JobConf job, OutputCollector output, Reporter reporter) throws Exception {
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
    super.init(job, output, reporter);

    // Both arrays have Byte.MAX_VALUE slots and are indexed by tag below.
    // NOTE(review): presumably tags always fit in a byte - confirm.
    rowObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
    ObjectInspector[] valueObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
    ObjectInspector keyObjectInspector;

    ReduceWork gWork = Utilities.getReduceWork(job);

    reducer = gWork.getReducer();
    vectorized = gWork.getVectorMode();
    reducer.setParentOperators(null); // clear out any parents as reducer is the
    // root
    isTagged = gWork.getNeedsTagging();
    try {
      // Key side: instantiate the deserializer class named by the key descriptor and
      // initialize it with the descriptor's properties.
      keyTableDesc = gWork.getKeyDesc();
      inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc.getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
      keyObjectInspector = inputKeyDeserializer.getObjectInspector();
      valueTableDesc = new TableDesc[gWork.getTagToValueDesc().size()];

      if (vectorized) {
        // Per-tag state that only the vectorized path uses; one slot per value descriptor.
        final int maxTags = gWork.getTagToValueDesc().size();
        keyStructInspector = (StructObjectInspector) keyObjectInspector;
        batches = new VectorizedRowBatch[maxTags];
        valueStructInspectors = new StructObjectInspector[maxTags];
        valueStringWriters = new List[maxTags];
        // Number of key fields; used below when sizing the per-tag writer lists.
        keysColumnOffset = keyStructInspector.getAllStructFieldRefs().size();
        buffer = new DataOutputBuffer();
      }

      for (int tag = 0; tag < gWork.getTagToValueDesc().size(); tag++) {
        // We should initialize the SerDe with the TypeInfo when available.
        valueTableDesc[tag] = gWork.getTagToValueDesc().get(tag);
        // NOTE(review): inputValueDeserializer is not allocated in this method; assumes it is
        // created elsewhere with room for every tag - confirm.
        inputValueDeserializer[tag] =
            ReflectionUtils.newInstance(valueTableDesc[tag].getDeserializerClass(), null);
        SerDeUtils.initializeSerDe(
            inputValueDeserializer[tag], null, valueTableDesc[tag].getProperties(), null);
        valueObjectInspector[tag] = inputValueDeserializer[tag].getObjectInspector();

        // Only filled on the non-vectorized path below.
        ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>();

        if (vectorized) {
          /* vectorization only works with struct object inspectors */
          valueStructInspectors[tag] = (StructObjectInspector) valueObjectInspector[tag];

          // The helper returns both the row batch and the struct inspector for this tag.
          ObjectPair<VectorizedRowBatch, StandardStructObjectInspector> pair =
              VectorizedBatchUtil.constructVectorizedRowBatch(
                  keyStructInspector,
                  valueStructInspectors[tag],
                  gWork.getVectorScratchColumnTypeMap());
          batches[tag] = pair.getFirst();
          final int totalColumns =
              keysColumnOffset + valueStructInspectors[tag].getAllStructFieldRefs().size();
          // Writers for the key fields are added first, followed by the value fields.
          valueStringWriters[tag] = new ArrayList<VectorExpressionWriter>(totalColumns);
          valueStringWriters[tag].addAll(
              Arrays.asList(
                  VectorExpressionWriterFactory.genVectorStructExpressionWritables(
                      keyStructInspector)));
          valueStringWriters[tag].addAll(
              Arrays.asList(
                  VectorExpressionWriterFactory.genVectorStructExpressionWritables(
                      valueStructInspectors[tag])));

          rowObjectInspector[tag] = pair.getSecond();
        } else {
          // Row inspector is a standard struct over the key inspector and this tag's value
          // inspector, named by Utilities.reduceFieldNameList.
          ois.add(keyObjectInspector);
          ois.add(valueObjectInspector[tag]);
          // reducer.setGroupKeyObjectInspector(keyObjectInspector);
          rowObjectInspector[tag] =
              ObjectInspectorFactory.getStandardStructObjectInspector(
                  Utilities.reduceFieldNameList, ois);
        }
      }
    } catch (Exception e) {
      // Any setup failure is rethrown unchecked with the original exception as cause.
      throw new RuntimeException(e);
    }

    // NOTE(review): jc and rp are not assigned in this method; presumably they are set by
    // super.init - confirm.
    ExecMapperContext execContext = new ExecMapperContext(job);
    localWork = gWork.getMapRedLocalWork();
    execContext.setJc(jc);
    execContext.setLocalWork(localWork);
    reducer.passExecContext(execContext);

    reducer.setReporter(rp);
    OperatorUtils.setChildrenCollector(
        Arrays.<Operator<? extends OperatorDesc>>asList(reducer), output);

    // initialize reduce operator tree
    try {
      LOG.info(reducer.dump(0));
      reducer.initialize(jc, rowObjectInspector);

      if (localWork != null) {
        // Dummy parent operators of the local work share the same execution context.
        for (Operator<? extends OperatorDesc> dummyOp : localWork.getDummyParentOp()) {
          dummyOp.setExecContext(execContext);
          dummyOp.initialize(jc, null);
        }
      }

    } catch (Throwable e) {
      // Record the failure in the abort flag before propagating.
      abort = true;
      if (e instanceof OutOfMemoryError) {
        // Don't create a new object if we are already out of memory
        throw (OutOfMemoryError) e;
      } else {
        throw new RuntimeException("Reduce operator initialization failed", e);
      }
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_INIT_OPERATORS);
  }
Exemplo n.º 4
0
 /**
  * Fills or refreshes the values of the requested virtual columns for the current record.
  *
  * <p>Slot {@code i} of the returned array corresponds to {@code vcs.get(i)}:
  * <ul>
  *   <li>{@code FILENAME}: replaced with a new {@code Text} of the current input path, but only
  *       when {@code ctx.inputFileChanged()} is true;</li>
  *   <li>{@code BLOCKOFFSET}, {@code ROWOFFSET}, {@code RAWDATASIZE}: held in a
  *       {@code LongWritable} that is created on first use and updated in place afterwards
  *       ({@code RAWDATASIZE} is 0 when the deserializer reports no stats);</li>
  *   <li>{@code ROWID}: {@code null} when the IO context has no record identifier; otherwise an
  *       {@code Object[]} filled via {@code RecordIdentifier.StructInfo.toArray}, after which
  *       the identifier on the IO context is reset to {@code null}.</li>
  * </ul>
  * Slots for any other column are left untouched.
  *
  * @param ctx execution context supplying input path and IO state
  * @param vcs virtual columns to populate; when {@code null} nothing is done
  * @param vcValues array to reuse, or {@code null} to have one of size {@code vcs.size()} created
  * @param deserializer source of {@code SerDeStats} for {@code RAWDATASIZE}
  * @return {@code vcValues} as passed in when {@code vcs} is {@code null}; otherwise the
  *     (possibly newly created) populated array
  */
 public static Object[] populateVirtualColumnValues(
     ExecMapperContext ctx,
     List<VirtualColumn> vcs,
     Object[] vcValues,
     Deserializer deserializer) {
   if (vcs == null) {
     return vcValues;
   }

   final int columnCount = vcs.size();
   final Object[] slots = (vcValues != null) ? vcValues : new Object[columnCount];

   for (int idx = 0; idx < columnCount; idx++) {
     final VirtualColumn column = vcs.get(idx);

     if (column.equals(VirtualColumn.FILENAME)) {
       if (ctx.inputFileChanged()) {
         slots[idx] = new Text(ctx.getCurrentInputPath().toString());
       }
       continue;
     }

     if (column.equals(VirtualColumn.BLOCKOFFSET)) {
       refreshLongSlot(slots, idx, ctx.getIoCxt().getCurrentBlockStart());
       continue;
     }

     if (column.equals(VirtualColumn.ROWOFFSET)) {
       refreshLongSlot(slots, idx, ctx.getIoCxt().getCurrentRow());
       continue;
     }

     if (column.equals(VirtualColumn.RAWDATASIZE)) {
       final SerDeStats stats = deserializer.getSerDeStats();
       final long rawSize = (stats != null) ? stats.getRawDataSize() : 0L;
       refreshLongSlot(slots, idx, rawSize);
       continue;
     }

     if (column.equals(VirtualColumn.ROWID)) {
       if (ctx.getIoCxt().getRecordIdentifier() == null) {
         slots[idx] = null;
         continue;
       }
       if (slots[idx] == null) {
         slots[idx] = new Object[RecordIdentifier.Field.values().length];
       }
       RecordIdentifier.StructInfo.toArray(
           ctx.getIoCxt().getRecordIdentifier(), (Object[]) slots[idx]);
       // Reset so the value is not accidentally cached; it should not happen since the IO layer
       // either knows how to produce ROW__ID or not - but to be safe.
       ctx.getIoCxt().setRecordIdentifier(null);
     }
   }
   return slots;
 }

 /** Stores {@code latest} in the LongWritable at {@code slots[idx]}, creating it when absent. */
 private static void refreshLongSlot(Object[] slots, int idx, long latest) {
   final LongWritable holder = (LongWritable) slots[idx];
   if (holder == null) {
     slots[idx] = new LongWritable(latest);
   } else if (latest != holder.get()) {
     holder.set(latest);
   }
 }