Esempio n. 1
0
    /**
     * Applies one {@code TextIO.Read} per entry in {@code uris}, keys every element read from a
     * document with that document's URI, and flattens the per-document outputs into a single
     * collection.
     *
     * @param input supplies the pipeline that the per-document reads are applied to
     * @return the flattened collection of (document URI, element read from that document) pairs
     */
    @Override
    public PCollection<KV<URI, String>> apply(PInput input) {
      Pipeline pipeline = input.getPipeline();

      // Create one TextIO.Read transform for each document
      // and add its output to a PCollectionList
      PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline);

      // TextIO.Read supports:
      //  - file: URIs and paths locally
      //  - gs: URIs on the service
      for (final URI uri : uris) {
        // URI.getScheme() returns null for relative URIs, and schemes are case-insensitive,
        // so compare from the literal side and ignore case. A scheme-less URI is passed
        // through as-is instead of failing with a NullPointerException.
        String uriString;
        if ("file".equalsIgnoreCase(uri.getScheme())) {
          // Local files are handed to TextIO.Read as a plain filesystem path.
          uriString = new File(uri).getPath();
        } else {
          uriString = uri.toString();
        }

        PCollection<KV<URI, String>> oneUriToLines =
            pipeline
                .apply(TextIO.Read.from(uriString).named("TextIO.Read(" + uriString + ")"))
                .apply(WithKeys.<URI, String>of(uri).setName("WithKeys(" + uriString + ")"));

        urisToLines = urisToLines.and(oneUriToLines);
      }

      return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
    }
Esempio n. 2
0
      /**
       * Expands this transform into a bounded read over an {@code AvroSource} built from the
       * configured file pattern, and sets the default output coder on the result.
       *
       * <p>When {@code type} is {@code GenericRecord.class} the source is configured with
       * {@code schema}; otherwise it is configured with {@code type}.
       *
       * @param input supplies the pipeline the read is applied to
       * @return the collection produced by the read, with its coder set to
       *     {@code getDefaultOutputCoder()}
       * @throws IllegalStateException if the file pattern or schema is unset, or, when
       *     validation is enabled, if no file matches the pattern or matching fails with an
       *     {@code IOException}
       */
      @Override
      public PCollection<T> apply(PInput input) {
        if (filepattern == null) {
          throw new IllegalStateException(
              "need to set the filepattern of an AvroIO.Read transform");
        }
        if (schema == null) {
          throw new IllegalStateException("need to set the schema of an AvroIO.Read transform");
        }

        if (validate) {
          // Probe for matching files first; only an I/O failure is translated here.
          boolean anyFileMatches;
          try {
            anyFileMatches = !IOChannelUtils.getFactory(filepattern).match(filepattern).isEmpty();
          } catch (IOException e) {
            throw new IllegalStateException(String.format("Failed to validate %s", filepattern), e);
          }
          checkState(anyFileMatches, "Unable to find any files matching %s", filepattern);
        }

        final Bounded<T> avroRead;
        if (type == GenericRecord.class) {
          // The schema-based source is cast to Bounded<T>; the cast is unchecked.
          @SuppressWarnings("unchecked")
          Bounded<T> genericRead =
              (Bounded<T>)
                  com.google.cloud.dataflow.sdk.io.Read.from(
                      AvroSource.from(filepattern).withSchema(schema));
          avroRead = genericRead;
        } else {
          avroRead =
              com.google.cloud.dataflow.sdk.io.Read.from(
                  AvroSource.from(filepattern).withSchema(type));
        }

        // Honor the default output coder that would have been used by this PTransform.
        return input.getPipeline().apply("Read", avroRead).setCoder(getDefaultOutputCoder());
      }
Esempio n. 3
0
 /**
  * Checks that exactly one of {@code topic} and {@code subscription} is set, then creates the
  * unbounded primitive output for this read with the configured coder.
  *
  * @param input supplies the pipeline the output collection belongs to
  * @return an unbounded collection using the global default windowing strategy, with its
  *     coder set to {@code coder}
  * @throws IllegalStateException if neither or both of the topic and subscription are set
  */
 @Override
 public PCollection<T> apply(PInput input) {
   final boolean hasTopic = topic != null;
   final boolean hasSubscription = subscription != null;

   if (!hasTopic && !hasSubscription) {
     throw new IllegalStateException(
         "need to set either the topic or the subscription for "
             + "a PubsubIO.Read transform");
   }
   if (hasTopic && hasSubscription) {
     throw new IllegalStateException(
         "Can't set both the topic and the subscription for a " + "PubsubIO.Read transform");
   }

   PCollection<T> messages =
       PCollection.<T>createPrimitiveOutputInternal(
           input.getPipeline(), WindowingStrategy.globalDefault(), IsBounded.UNBOUNDED);
   return messages.setCoder(coder);
 }
Esempio n. 4
0
 /**
  * Creates the primitive output collection for this transform.
  *
  * @param input supplies the pipeline the output collection belongs to
  * @return a bounded collection using the global default windowing strategy
  */
 @Override
 public PCollection<KV<K, V>> apply(PInput input) {
   PCollection<KV<K, V>> output =
       PCollection.<KV<K, V>>createPrimitiveOutputInternal(
           input.getPipeline(), WindowingStrategy.globalDefault(), PCollection.IsBounded.BOUNDED);
   return output;
 }