示例#1
0
 /**
  * Derives the output windowing strategy from {@code inputStrategy}.
  *
  * <p>If the input strategy's window fn is an {@code InvalidWindows} instance, the returned
  * strategy is the input strategy with its window fn replaced by the wrapped original window fn.
  * Any other input strategy is returned as is.
  *
  * @param inputStrategy the windowing strategy of the input
  * @return the strategy to use for the output
  */
 private <W extends BoundedWindow> WindowingStrategy<?, W> getOutputWindowing(
     WindowingStrategy<?, W> inputStrategy) {
   // Guard: nothing to unwrap unless the window fn is the InvalidWindows wrapper.
   if (!(inputStrategy.getWindowFn() instanceof InvalidWindows)) {
     return inputStrategy;
   }

   @SuppressWarnings("unchecked")
   InvalidWindows<W> wrapper = (InvalidWindows<W>) inputStrategy.getWindowFn();
   return inputStrategy.withWindowFn(wrapper.getOriginalWindowFn());
 }
示例#2
0
    /**
     * Checks that the output windowing strategy computed for {@code input} is completely specified.
     *
     * <p>No check applies unless a trigger has been specified and it is not a
     * {@code DefaultTrigger}. When such a trigger is present, the allowed lateness must be specified
     * (unless the window fn is {@code GlobalWindows}) and the accumulation mode must be specified.
     *
     * @param input the collection whose windowing strategy is fed into the output strategy
     * @throws IllegalArgumentException if allowed lateness or accumulation mode is required but
     *     has not been specified
     */
    @Override
    public void validate(PCollection<T> input) {
      WindowingStrategy<?, ?> strategy = getOutputStrategyInternal(input.getWindowingStrategy());

      // Only an explicitly specified, non-default trigger brings extra requirements with it.
      boolean hasCustomTrigger =
          strategy.isTriggerSpecified() && !(strategy.getTrigger() instanceof DefaultTrigger);
      if (!hasCustomTrigger) {
        return;
      }

      // GlobalWindows is the one window fn exempt from the allowed-lateness requirement.
      boolean latenessRequired = !(strategy.getWindowFn() instanceof GlobalWindows);
      if (latenessRequired && !strategy.isAllowedLatenessSpecified()) {
        throw new IllegalArgumentException(
            "Except when using GlobalWindows,"
                + " calling .triggering() to specify a trigger requires that the allowed lateness be"
                + " specified using .withAllowedLateness() to set the upper bound on how late data"
                + " can arrive before being dropped. See Javadoc for more details.");
      }

      if (!strategy.isModeSpecified()) {
        throw new IllegalArgumentException(
            "Calling .triggering() to specify a trigger requires that the accumulation mode be"
                + " specified using .discardingFiredPanes() or .accumulatingFiredPanes()."
                + " See Javadoc for more details.");
      }
    }
    /**
     * Translates a {@code Window.Bound} transform by flat-mapping the input data set through a
     * {@code FlinkAssignWindows} function built from the window fn of the output's windowing
     * strategy, and registering the result as the transform's output data set.
     *
     * @param transform the window transform being translated
     * @param context translation context supplying inputs, outputs and type information
     */
    @Override
    public void translateNode(Window.Bound<T> transform, FlinkBatchTranslationContext context) {
      TypeInformation<WindowedValue<T>> outputTypeInfo =
          context.getTypeInfo(context.getOutput(transform));

      DataSet<WindowedValue<T>> source = context.getInputDataSet(context.getInput(transform));

      // The window fn is read from the strategy attached to the transform's output.
      @SuppressWarnings("unchecked")
      final WindowingStrategy<T, ? extends BoundedWindow> outputStrategy =
          (WindowingStrategy<T, ? extends BoundedWindow>)
              context.getOutput(transform).getWindowingStrategy();

      WindowFn<T, ? extends BoundedWindow> assigner = outputStrategy.getWindowFn();

      FlinkAssignWindows<T, ? extends BoundedWindow> assignFn =
          new FlinkAssignWindows<>(assigner);

      // Name the Flink operator after the output and pin its result type explicitly.
      DataSet<WindowedValue<T>> windowed =
          source
              .flatMap(assignFn)
              .name(context.getOutput(transform).getName())
              .returns(outputTypeInfo);

      context.setOutputDataSet(context.getOutput(transform), windowed);
    }
    /**
     * Translates a multi-output {@code ParDo} into a single Flink map-partition operator whose
     * elements are tagged union values, then calls {@code pruneOutput} once per declared output
     * with that output's union index.
     *
     * @param transform the multi-output ParDo being translated
     * @param context translation context supplying inputs, outputs and pipeline options
     * @throws IllegalStateException if an output is not a {@code PCollection} or if the transform
     *     has no outputs
     */
    @Override
    public void translateNode(
        ParDo.BoundMulti<InputT, OutputT> transform, FlinkBatchTranslationContext context) {
      DoFn<InputT, OutputT> fn = transform.getFn();
      rejectStateAndTimers(fn);
      DataSet<WindowedValue<InputT>> mainInput =
          context.getInputDataSet(context.getInput(transform));

      List<TaggedPValue> taggedOutputs = context.getOutputs(transform);

      // Give every output tag a union index. The main output is pinned to index 0 because
      // FlinkMultiOutputDoFnFunction expects it there; the others are numbered in encounter order.
      Map<TupleTag<?>, Integer> tagToIndex = Maps.newHashMap();
      tagToIndex.put(transform.getMainOutputTag(), 0);
      int nextIndex = 1;
      for (TaggedPValue tagged : taggedOutputs) {
        TupleTag<?> tag = tagged.getTag();
        if (tagToIndex.containsKey(tag)) {
          continue;
        }
        tagToIndex.put(tag, nextIndex);
        nextIndex++;
      }

      // Gather the coder of every output for the union coder. The windowing strategy of the last
      // output is kept, on the assumption that all outputs share the same strategy.
      List<Coder<?>> coders = Lists.newArrayList();
      WindowingStrategy<?, ?> outputStrategy = null;
      for (TaggedPValue tagged : taggedOutputs) {
        checkState(
            tagged.getValue() instanceof PCollection,
            "Within ParDo, got a non-PCollection output %s of type %s",
            tagged.getValue(),
            tagged.getValue().getClass().getSimpleName());
        PCollection<?> collection = (PCollection<?>) tagged.getValue();
        coders.add(collection.getCoder());
        outputStrategy = collection.getWindowingStrategy();
      }

      // Still null only when the loop above never ran, i.e. there were no outputs at all.
      checkState(outputStrategy != null, "No outputs defined.");

      TypeInformation<WindowedValue<RawUnionValue>> unionTypeInfo =
          new CoderTypeInformation<>(
              WindowedValue.getFullCoder(
                  UnionCoder.of(coders), outputStrategy.getWindowFn().windowCoder()));

      List<PCollectionView<?>> views = transform.getSideInputs();

      // Pair each side input with its own windowing strategy; the map is handed to the wrapper
      // function so main-input windows can be mapped to side-input windows.
      Map<PCollectionView<?>, WindowingStrategy<?, ?>> viewStrategies = new HashMap<>();
      for (PCollectionView<?> view : views) {
        viewStrategies.put(view, view.getWindowingStrategyInternal());
      }

      @SuppressWarnings("unchecked")
      FlinkMultiOutputDoFnFunction<InputT, OutputT> wrappedFn =
          new FlinkMultiOutputDoFnFunction(
              fn,
              outputStrategy,
              viewStrategies,
              context.getPipelineOptions(),
              tagToIndex);

      MapPartitionOperator<WindowedValue<InputT>, WindowedValue<RawUnionValue>> unionDataSet =
          new MapPartitionOperator<>(mainInput, unionTypeInfo, wrappedFn, transform.getName());

      transformSideInputs(views, unionDataSet, context);

      // Split the union data set back into one data set per declared output.
      for (TaggedPValue tagged : taggedOutputs) {
        pruneOutput(
            unionDataSet,
            context,
            tagToIndex.get(tagged.getTag()),
            (PCollection) tagged.getValue());
      }
    }
    /**
     * Translates a {@code Combine.PerKey} into Flink batch operators.
     *
     * <p>For non-merging window fns the combine runs in two stages: a group-combine that folds
     * inputs into accumulators, followed by a group-reduce that produces the output values. For
     * merging window fns only the {@code IntervalWindow} coder is accepted, and a single
     * group-reduce is applied directly to the grouped input.
     *
     * @param transform the combine being translated
     * @param context translation context supplying inputs, outputs, type information and options
     * @throws RuntimeException wrapping a {@code CannotProvideCoderException} if the combine fn
     *     cannot provide an accumulator coder
     * @throws UnsupportedOperationException if the window fn is merging and its window coder is
     *     not the {@code IntervalWindow} coder
     */
    @Override
    @SuppressWarnings("unchecked")
    public void translateNode(
        Combine.PerKey<K, InputT, OutputT> transform, FlinkBatchTranslationContext context) {
      DataSet<WindowedValue<KV<K, InputT>>> inputDataSet =
          context.getInputDataSet(context.getInput(transform));

      CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, OutputT> combineFn =
          (CombineFnBase.PerKeyCombineFn<K, InputT, AccumT, OutputT>) transform.getFn();

      KvCoder<K, InputT> inputCoder = (KvCoder<K, InputT>) context.getInput(transform).getCoder();

      Coder<AccumT> accumulatorCoder;

      try {
        accumulatorCoder =
            combineFn.getAccumulatorCoder(
                context.getInput(transform).getPipeline().getCoderRegistry(),
                inputCoder.getKeyCoder(),
                inputCoder.getValueCoder());
      } catch (CannotProvideCoderException e) {
        throw new RuntimeException(e);
      }

      WindowingStrategy<?, ?> windowingStrategy =
          context.getInput(transform).getWindowingStrategy();

      TypeInformation<WindowedValue<KV<K, AccumT>>> partialReduceTypeInfo =
          context.getTypeInfo(
              KvCoder.of(inputCoder.getKeyCoder(), accumulatorCoder), windowingStrategy);

      // Input grouped by key; this single grouping feeds whichever branch runs below.
      Grouping<WindowedValue<KV<K, InputT>>> inputGrouping =
          inputDataSet.groupBy(new KvKeySelector<InputT, K>(inputCoder.getKeyCoder()));

      // construct a map from side input to WindowingStrategy so that
      // the reduce functions can map main-input windows to side input windows
      Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
      for (PCollectionView<?> sideInput : transform.getSideInputs()) {
        sideInputStrategies.put(sideInput, sideInput.getWindowingStrategyInternal());
      }

      if (windowingStrategy.getWindowFn().isNonMerging()) {
        WindowingStrategy<?, BoundedWindow> boundedStrategy =
            (WindowingStrategy<?, BoundedWindow>) windowingStrategy;

        FlinkPartialReduceFunction<K, InputT, AccumT, ?> partialReduceFunction =
            new FlinkPartialReduceFunction<>(
                combineFn, boundedStrategy, sideInputStrategies, context.getPipelineOptions());

        FlinkReduceFunction<K, AccumT, OutputT, ?> reduceFunction =
            new FlinkReduceFunction<>(
                combineFn, boundedStrategy, sideInputStrategies, context.getPipelineOptions());

        // Partially GroupReduce the values into the intermediate format AccumT (combine)
        GroupCombineOperator<WindowedValue<KV<K, InputT>>, WindowedValue<KV<K, AccumT>>>
            groupCombine =
                new GroupCombineOperator<>(
                    inputGrouping,
                    partialReduceTypeInfo,
                    partialReduceFunction,
                    "GroupCombine: " + transform.getName());

        transformSideInputs(transform.getSideInputs(), groupCombine, context);

        TypeInformation<WindowedValue<KV<K, OutputT>>> reduceTypeInfo =
            context.getTypeInfo(context.getOutput(transform));

        Grouping<WindowedValue<KV<K, AccumT>>> intermediateGrouping =
            groupCombine.groupBy(new KvKeySelector<AccumT, K>(inputCoder.getKeyCoder()));

        // Fully reduce the values and create output format OutputT
        GroupReduceOperator<WindowedValue<KV<K, AccumT>>, WindowedValue<KV<K, OutputT>>>
            outputDataSet =
                new GroupReduceOperator<>(
                    intermediateGrouping, reduceTypeInfo, reduceFunction, transform.getName());

        transformSideInputs(transform.getSideInputs(), outputDataSet, context);

        context.setOutputDataSet(context.getOutput(transform), outputDataSet);

      } else {
        if (!windowingStrategy.getWindowFn().windowCoder().equals(IntervalWindow.getCoder())) {
          throw new UnsupportedOperationException(
              "Merging WindowFn with windows other than IntervalWindow are not supported.");
        }

        // for merging windows we can't do a pre-shuffle combine step since
        // elements would not be in their correct windows for side-input access

        WindowingStrategy<?, IntervalWindow> intervalStrategy =
            (WindowingStrategy<?, IntervalWindow>) windowingStrategy;

        FlinkMergingNonShuffleReduceFunction<K, InputT, AccumT, OutputT, ?> reduceFunction =
            new FlinkMergingNonShuffleReduceFunction<>(
                combineFn, intervalStrategy, sideInputStrategies, context.getPipelineOptions());

        TypeInformation<WindowedValue<KV<K, OutputT>>> reduceTypeInfo =
            context.getTypeInfo(context.getOutput(transform));

        // Fully reduce the values and create output format OutputT. The grouping built above is
        // reused here instead of grouping the same input by the same key a second time.
        GroupReduceOperator<WindowedValue<KV<K, InputT>>, WindowedValue<KV<K, OutputT>>>
            outputDataSet =
                new GroupReduceOperator<>(
                    inputGrouping, reduceTypeInfo, reduceFunction, transform.getName());

        transformSideInputs(transform.getSideInputs(), outputDataSet, context);

        context.setOutputDataSet(context.getOutput(transform), outputDataSet);
      }
    }
    /**
     * Translates a {@code GroupByKey} as a per-key combine that uses the {@code Concatenate}
     * combine fn: a group-combine step that builds per-key lists, followed by a group-reduce over
     * those lists.
     *
     * <p>Merging window fns are only accepted when their window coder is the
     * {@code IntervalWindow} coder.
     *
     * @param transform the GroupByKey being translated
     * @param context translation context supplying inputs, outputs and pipeline options
     * @throws RuntimeException wrapping a {@code CannotProvideCoderException} if no accumulator
     *     coder can be provided
     * @throws UnsupportedOperationException if the window fn is merging and its window coder is
     *     not the {@code IntervalWindow} coder
     */
    @Override
    public void translateNode(
        GroupByKey<K, InputT> transform, FlinkBatchTranslationContext context) {

      // For now this mirrors the Combine.PerKey translator. Once the new runner API is available,
      // GroupByKey can be replaced by a Combine.PerKey with the Concatenate CombineFn.

      DataSet<WindowedValue<KV<K, InputT>>> source =
          context.getInputDataSet(context.getInput(transform));

      Combine.KeyedCombineFn<K, InputT, List<InputT>, List<InputT>> concatFn =
          new Concatenate<InputT>().asKeyedFn();

      KvCoder<K, InputT> kvCoder = (KvCoder<K, InputT>) context.getInput(transform).getCoder();
      Coder<K> keyCoder = kvCoder.getKeyCoder();

      Coder<List<InputT>> listCoder;
      try {
        listCoder =
            concatFn.getAccumulatorCoder(
                context.getInput(transform).getPipeline().getCoderRegistry(),
                keyCoder,
                kvCoder.getValueCoder());
      } catch (CannotProvideCoderException e) {
        throw new RuntimeException(e);
      }

      WindowingStrategy<?, ?> strategy = context.getInput(transform).getWindowingStrategy();
      WindowFn<?, ?> windowFn = strategy.getWindowFn();

      // Accumulator and output are both List<InputT>, so one type information serves both stages.
      TypeInformation<WindowedValue<KV<K, List<InputT>>>> listTypeInfo =
          new CoderTypeInformation<>(
              WindowedValue.getFullCoder(KvCoder.of(keyCoder, listCoder), windowFn.windowCoder()));

      Grouping<WindowedValue<KV<K, InputT>>> groupedInput =
          source.groupBy(new KvKeySelector<InputT, K>(keyCoder));

      // No side inputs are passed to the reduce functions: they all receive this empty map.
      Map<PCollectionView<?>, WindowingStrategy<?, ?>> noSideInputs = Collections.emptyMap();

      FlinkPartialReduceFunction<K, InputT, List<InputT>, ?> partialFn;
      FlinkReduceFunction<K, List<InputT>, List<InputT>, ?> finalFn;

      if (windowFn.isNonMerging()) {
        @SuppressWarnings("unchecked")
        WindowingStrategy<?, BoundedWindow> boundedStrategy =
            (WindowingStrategy<?, BoundedWindow>) strategy;

        partialFn =
            new FlinkPartialReduceFunction<>(
                concatFn, boundedStrategy, noSideInputs, context.getPipelineOptions());

        finalFn =
            new FlinkReduceFunction<>(
                concatFn, boundedStrategy, noSideInputs, context.getPipelineOptions());

      } else {
        // Merging window fns are only handled when the windows are IntervalWindows.
        if (!windowFn.windowCoder().equals(IntervalWindow.getCoder())) {
          throw new UnsupportedOperationException(
              "Merging WindowFn with windows other than IntervalWindow are not supported.");
        }

        @SuppressWarnings("unchecked")
        WindowingStrategy<?, IntervalWindow> intervalStrategy =
            (WindowingStrategy<?, IntervalWindow>) strategy;

        partialFn =
            new FlinkMergingPartialReduceFunction<>(
                concatFn, intervalStrategy, noSideInputs, context.getPipelineOptions());

        finalFn =
            new FlinkMergingReduceFunction<>(
                concatFn, intervalStrategy, noSideInputs, context.getPipelineOptions());
      }

      // Stage 1: partially combine the values of each key into lists.
      GroupCombineOperator<WindowedValue<KV<K, InputT>>, WindowedValue<KV<K, List<InputT>>>>
          partialCombine =
              new GroupCombineOperator<>(
                  groupedInput, listTypeInfo, partialFn, "GroupCombine: " + transform.getName());

      Grouping<WindowedValue<KV<K, List<InputT>>>> groupedPartials =
          partialCombine.groupBy(new KvKeySelector<List<InputT>, K>(keyCoder));

      // Stage 2: fully reduce the partial lists into the final per-key list.
      GroupReduceOperator<WindowedValue<KV<K, List<InputT>>>, WindowedValue<KV<K, List<InputT>>>>
          result =
              new GroupReduceOperator<>(
                  groupedPartials, listTypeInfo, finalFn, transform.getName());

      context.setOutputDataSet(context.getOutput(transform), result);
    }