@Override
public void run(SourceContext<OUT> ctx) throws Exception {
    if (iteratorToRead == null) {
        throw new IllegalStateException("Kafka iterator not initialized properly.");
    }

    final Object checkpointLock = ctx.getCheckpointLock();

    while (running && iteratorToRead.hasNext()) {
        MessageAndMetadata<byte[], byte[]> message = iteratorToRead.next();

        // this offset was already consumed before a restore; skip it to avoid
        // emitting duplicates after recovery
        if (lastOffsets.getState()[message.partition()] >= message.offset()) {
            LOG.info("Skipping message with offset {} from partition {}",
                    message.offset(), message.partition());
            continue;
        }

        OUT next = deserializationSchema.deserialize(message.message());

        if (deserializationSchema.isEndOfStream(next)) {
            LOG.info("DeserializationSchema signaled end of stream for this source");
            break;
        }

        // make the state update and the element emission atomic
        synchronized (checkpointLock) {
            lastOffsets.getState()[message.partition()] = message.offset();
            ctx.collect(next);
        }

        if (LOG.isTraceEnabled()) {
            LOG.trace("Processed record with offset {} from partition {}",
                    message.offset(), message.partition());
        }
    }
}
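// ------------------------------------------------------------------------
// Example (not in the original source): a minimal DeserializationSchema
// sketch illustrating the deserialize()/isEndOfStream() contract that run()
// relies on. The "EOS" sentinel and the class name are made up for
// illustration; getProducedType() is assumed to be required by this Flink
// version's DeserializationSchema interface.
private static class StringWithEndMarkerSchema implements DeserializationSchema<String> {

    @Override
    public String deserialize(byte[] message) {
        // assumes messages are UTF-8 encoded text
        return new String(message, java.nio.charset.StandardCharsets.UTF_8);
    }

    @Override
    public boolean isEndOfStream(String nextElement) {
        // returning true makes run() break out of its read loop
        return "EOS".equals(nextElement);
    }

    @Override
    public org.apache.flink.api.common.typeinfo.TypeInformation<String> getProducedType() {
        return org.apache.flink.api.common.typeinfo.BasicTypeInfo.STRING_TYPE_INFO;
    }
}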
@Override
public void open(Configuration parameters) throws Exception {
    super.open(parameters);
    ConsumerConnector consumer = Consumer.createJavaConsumerConnector(this.consumerConfig);

    // we request only one stream per consumer instance. Kafka will make sure
    // that each consumer group will see each message only once.
    Map<String, Integer> topicCountMap = Collections.singletonMap(topicName, 1);
    Map<String, List<KafkaStream<byte[], byte[]>>> streams = consumer.createMessageStreams(topicCountMap);

    if (streams.size() != 1) {
        throw new RuntimeException("Expected only one message stream but got " + streams.size());
    }
    List<KafkaStream<byte[], byte[]>> kafkaStreams = streams.get(topicName);
    if (kafkaStreams == null) {
        throw new RuntimeException("Requested stream not available. Available streams: " + streams.toString());
    }
    if (kafkaStreams.size() != 1) {
        throw new RuntimeException("Requested 1 stream from Kafka, but got " + kafkaStreams.size() + " streams");
    }
    LOG.info("Opening Consumer instance for topic '{}' on group '{}'", topicName, consumerConfig.groupId());
    this.iteratorToRead = kafkaStreams.get(0).iterator();
    this.consumer = consumer;

    zkClient = new ZkClient(consumerConfig.zkConnect(),
            consumerConfig.zkSessionTimeoutMs(),
            consumerConfig.zkConnectionTimeoutMs(),
            new KafkaZKStringSerializer());

    // most likely the number of offsets we're going to store here will be
    // lower than the number of partitions.
    int numPartitions = getNumberOfPartitions();
    LOG.debug("The topic {} has {} partitions", topicName, numPartitions);
    this.lastOffsets = getRuntimeContext().getOperatorState("offset", new long[numPartitions], false);
    this.commitedOffsets = new long[numPartitions];

    // check if there are offsets to restore
    if (!Arrays.equals(lastOffsets.getState(), new long[numPartitions])) {
        if (lastOffsets.getState().length != numPartitions) {
            throw new IllegalStateException("There are " + lastOffsets.getState().length
                    + " offsets to restore for topic " + topicName
                    + " but the topic has only " + numPartitions + " partitions");
        }
        LOG.info("Setting restored offsets {} in ZooKeeper", Arrays.toString(lastOffsets.getState()));
        setOffsetsInZooKeeper(lastOffsets.getState());
    } else {
        // no restored state: initialize with empty offsets
        Arrays.fill(this.lastOffsets.getState(), -1);
    }
    Arrays.fill(this.commitedOffsets, 0); // just to make it explicit

    running = true;
}
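// ------------------------------------------------------------------------
// Example (not in the original source): one plausible way to assemble the
// consumerConfig that open() hands to Consumer.createJavaConsumerConnector().
// A sketch using the Kafka 0.8 high-level consumer property keys; the
// ZooKeeper address and group id are placeholder values.
private static ConsumerConfig createConsumerConfig() {
    java.util.Properties props = new java.util.Properties();
    props.setProperty("zookeeper.connect", "localhost:2181");      // read back via zkConnect()
    props.setProperty("group.id", "flink-kafka-source");           // read back via groupId()
    props.setProperty("zookeeper.session.timeout.ms", "6000");     // zkSessionTimeoutMs()
    props.setProperty("zookeeper.connection.timeout.ms", "6000");  // zkConnectionTimeoutMs()
    // this source tracks offsets itself (as checkpointed operator state),
    // so Kafka's automatic offset commits are disabled
    props.setProperty("auto.commit.enable", "false");
    return new ConsumerConfig(props);
}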