@Override protected void map(Object key, Object value, Context context) throws IOException, InterruptedException { try { final InputRow inputRow; try { inputRow = parseInputRow(value, parser); } catch (Exception e) { if (config.isIgnoreInvalidRows()) { log.debug(e, "Ignoring invalid row [%s] due to parsing error", value.toString()); context .getCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER) .increment(1); return; // we're ignoring this invalid row } else { throw e; } } if (!granularitySpec.bucketIntervals().isPresent() || granularitySpec .bucketInterval(new DateTime(inputRow.getTimestampFromEpoch())) .isPresent()) { innerMap(inputRow, value, context); } } catch (RuntimeException e) { throw new RE(e, "Failure on row[%s]", value); } }
@Override protected void innerMap(InputRow inputRow, Object value, Context context) throws IOException, InterruptedException { final Map<String, Iterable<String>> dims = Maps.newHashMap(); for (final String dim : inputRow.getDimensions()) { dims.put(dim, inputRow.getDimension(dim)); } helper.emitDimValueCounts(context, new DateTime(inputRow.getTimestampFromEpoch()), dims); }
@Test public void simpleFirehoseReadingTest() throws IOException { Assert.assertEquals(MAX_SHARD_NUMBER.longValue(), segmentSet.size()); Integer rowcount = 0; try (final IngestSegmentFirehose firehose = (IngestSegmentFirehose) factory.connect(rowParser)) { while (firehose.hasMore()) { InputRow row = firehose.nextRow(); Assert.assertArrayEquals(new String[] {DIM_NAME}, row.getDimensions().toArray()); Assert.assertArrayEquals(new String[] {DIM_VALUE}, row.getDimension(DIM_NAME).toArray()); Assert.assertEquals(METRIC_LONG_VALUE.longValue(), row.getLongMetric(METRIC_LONG_NAME)); Assert.assertEquals( METRIC_FLOAT_VALUE, row.getFloatMetric(METRIC_FLOAT_NAME), METRIC_FLOAT_VALUE * 0.0001); ++rowcount; } } Assert.assertEquals((int) MAX_SHARD_NUMBER * MAX_ROWS, (int) rowcount); }
@Override protected void innerMap(InputRow inputRow, Object value, Context context) throws IOException, InterruptedException { final List<Object> groupKey = Rows.toGroupKey(rollupGranularity.truncate(inputRow.getTimestampFromEpoch()), inputRow); context.write( new BytesWritable(HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsBytes(groupKey)), NullWritable.get()); }
@Test(timeout = 60000) public void testPersistFails() throws Exception { final MutableBoolean committed = new MutableBoolean(false); plumber .getSinks() .put( 0L, new Sink( new Interval(0, TimeUnit.HOURS.toMillis(1)), schema, tuningConfig, new DateTime("2014-12-01T12:34:56.789").toString())); plumber.startJob(); final InputRow row = EasyMock.createNiceMock(InputRow.class); EasyMock.expect(row.getTimestampFromEpoch()).andReturn(0L); EasyMock.expect(row.getDimensions()).andReturn(new ArrayList<String>()); EasyMock.replay(row); plumber.add(row, Committers.supplierOf(Committers.nil())); plumber.persist( Committers.supplierFromRunnable( new Runnable() { @Override public void run() { committed.setValue(true); throw new RuntimeException(); } }) .get()); while (!committed.booleanValue()) { Thread.sleep(100); } // Exception may need time to propagate while (metrics.failedPersists() < 1) { Thread.sleep(100); } Assert.assertEquals(1, metrics.failedPersists()); }
private void testPersist(final Object commitMetadata) throws Exception { final MutableBoolean committed = new MutableBoolean(false); plumber .getSinks() .put( 0L, new Sink( new Interval(0, TimeUnit.HOURS.toMillis(1)), schema, tuningConfig, new DateTime("2014-12-01T12:34:56.789").toString())); Assert.assertNull(plumber.startJob()); final InputRow row = EasyMock.createNiceMock(InputRow.class); EasyMock.expect(row.getTimestampFromEpoch()).andReturn(0L); EasyMock.expect(row.getDimensions()).andReturn(new ArrayList<String>()); EasyMock.replay(row); final Committer committer = new Committer() { @Override public Object getMetadata() { return commitMetadata; } @Override public void run() { committed.setValue(true); } }; plumber.add(row, Suppliers.ofInstance(committer)); plumber.persist(committer); while (!committed.booleanValue()) { Thread.sleep(100); } plumber.getSinks().clear(); plumber.finishJob(); }
@Override public void run() { plumber = initPlumber(); final Period intermediatePersistPeriod = config.getIntermediatePersistPeriod(); try { plumber.startJob(); // Delay firehose connection to avoid claiming input resources while the plumber is starting // up. firehose = initFirehose(); long nextFlush = new DateTime().plus(intermediatePersistPeriod).getMillis(); while (firehose.hasMore()) { InputRow inputRow = null; try { try { inputRow = firehose.nextRow(); } catch (Exception e) { log.debug(e, "thrown away line due to exception, considering unparseable"); metrics.incrementUnparseable(); continue; } boolean lateEvent = false; boolean indexLimitExceeded = false; try { lateEvent = plumber.add(inputRow) == -1; } catch (IndexSizeExceededException e) { log.info("Index limit exceeded: %s", e.getMessage()); indexLimitExceeded = true; } if (indexLimitExceeded || lateEvent) { metrics.incrementThrownAway(); log.debug("Throwing away event[%s]", inputRow); if (indexLimitExceeded || System.currentTimeMillis() > nextFlush) { plumber.persist(firehose.commit()); nextFlush = new DateTime().plus(intermediatePersistPeriod).getMillis(); } continue; } final Sink sink = plumber.getSink(inputRow.getTimestampFromEpoch()); if ((sink != null && !sink.canAppendRow()) || System.currentTimeMillis() > nextFlush) { plumber.persist(firehose.commit()); nextFlush = new DateTime().plus(intermediatePersistPeriod).getMillis(); } metrics.incrementProcessed(); } catch (ParseException e) { if (inputRow != null) { log.error(e, "unparseable line: %s", inputRow); } metrics.incrementUnparseable(); } } } catch (RuntimeException e) { log.makeAlert( e, "RuntimeException aborted realtime processing[%s]", fireDepartment.getDataSchema().getDataSource()) .emit(); normalExit = false; throw e; } catch (Error e) { log.makeAlert( e, "Exception aborted realtime processing[%s]", fireDepartment.getDataSchema().getDataSource()) .emit(); normalExit = false; throw e; } finally { CloseQuietly.close(firehose); if (normalExit) { plumber.finishJob(); plumber = null; firehose = null; } } }
@Override public TaskStatus run(final TaskToolbox toolbox) throws Exception { if (this.plumber != null) { throw new IllegalStateException("WTF?!? run with non-null plumber??!"); } // Shed any locks we might have (e.g. if we were uncleanly killed and restarted) since we'll // reacquire // them if we actually need them for (final TaskLock taskLock : getTaskLocks(toolbox)) { toolbox.getTaskActionClient().submit(new LockReleaseAction(taskLock.getInterval())); } boolean normalExit = true; // Set up firehose final Period intermediatePersistPeriod = fireDepartmentConfig.getIntermediatePersistPeriod(); final Firehose firehose = firehoseFactory.connect(); // It would be nice to get the PlumberSchool in the constructor. Although that will need // jackson injectables for // stuff like the ServerView, which seems kind of odd? Perhaps revisit this when Guice has been // introduced. final RealtimePlumberSchool realtimePlumberSchool = new RealtimePlumberSchool( windowPeriod, new File(toolbox.getTaskWorkDir(), "persist"), segmentGranularity); realtimePlumberSchool.setDefaultMaxPendingPersists(maxPendingPersists); final SegmentPublisher segmentPublisher = new TaskActionSegmentPublisher(this, toolbox); // NOTE: We talk to the coordinator in various places in the plumber and we could be more robust // to issues // with the coordinator. Right now, we'll block/throw in whatever thread triggered the // coordinator behavior, // which will typically be either the main data processing loop or the persist thread. // Wrap default DataSegmentAnnouncer such that we unlock intervals as we unannounce segments final DataSegmentAnnouncer lockingSegmentAnnouncer = new DataSegmentAnnouncer() { @Override public void announceSegment(final DataSegment segment) throws IOException { // Side effect: Calling announceSegment causes a lock to be acquired toolbox.getTaskActionClient().submit(new LockAcquireAction(segment.getInterval())); toolbox.getSegmentAnnouncer().announceSegment(segment); } @Override public void unannounceSegment(final DataSegment segment) throws IOException { try { toolbox.getSegmentAnnouncer().unannounceSegment(segment); } finally { toolbox.getTaskActionClient().submit(new LockReleaseAction(segment.getInterval())); } } @Override public void announceSegments(Iterable<DataSegment> segments) throws IOException { // Side effect: Calling announceSegments causes locks to be acquired for (DataSegment segment : segments) { toolbox.getTaskActionClient().submit(new LockAcquireAction(segment.getInterval())); } toolbox.getSegmentAnnouncer().announceSegments(segments); } @Override public void unannounceSegments(Iterable<DataSegment> segments) throws IOException { try { toolbox.getSegmentAnnouncer().unannounceSegments(segments); } finally { for (DataSegment segment : segments) { toolbox.getTaskActionClient().submit(new LockReleaseAction(segment.getInterval())); } } } }; // NOTE: getVersion will block if there is lock contention, which will block plumber.getSink // NOTE: (and thus the firehose) // Shouldn't usually happen, since we don't expect people to submit tasks that intersect with // the // realtime window, but if they do it can be problematic. If we decide to care, we can use more // threads in // the plumber such that waiting for the coordinator doesn't block data processing. final VersioningPolicy versioningPolicy = new VersioningPolicy() { @Override public String getVersion(final Interval interval) { try { // Side effect: Calling getVersion causes a lock to be acquired final TaskLock myLock = toolbox.getTaskActionClient().submit(new LockAcquireAction(interval)); return myLock.getVersion(); } catch (IOException e) { throw Throwables.propagate(e); } } }; // NOTE: This pusher selects path based purely on global configuration and the DataSegment, // which means // NOTE: that redundant realtime tasks will upload to the same location. This can cause // index.zip and // NOTE: descriptor.json to mismatch, or it can cause historical nodes to load different // instances of the // NOTE: "same" segment. realtimePlumberSchool.setDataSegmentPusher(toolbox.getSegmentPusher()); realtimePlumberSchool.setConglomerate(toolbox.getQueryRunnerFactoryConglomerate()); realtimePlumberSchool.setQueryExecutorService(toolbox.getQueryExecutorService()); realtimePlumberSchool.setVersioningPolicy(versioningPolicy); realtimePlumberSchool.setSegmentAnnouncer(lockingSegmentAnnouncer); realtimePlumberSchool.setSegmentPublisher(segmentPublisher); realtimePlumberSchool.setServerView(toolbox.getNewSegmentServerView()); realtimePlumberSchool.setEmitter(toolbox.getEmitter()); if (this.rejectionPolicyFactory != null) { realtimePlumberSchool.setRejectionPolicyFactory(rejectionPolicyFactory); } final FireDepartment fireDepartment = new FireDepartment(schema, fireDepartmentConfig, null, null); final RealtimeMetricsMonitor metricsMonitor = new RealtimeMetricsMonitor(ImmutableList.of(fireDepartment)); this.queryRunnerFactoryConglomerate = toolbox.getQueryRunnerFactoryConglomerate(); this.plumber = realtimePlumberSchool.findPlumber(schema, fireDepartment.getMetrics()); try { plumber.startJob(); // Set up metrics emission toolbox.getMonitorScheduler().addMonitor(metricsMonitor); // Time to read data! long nextFlush = new DateTime().plus(intermediatePersistPeriod).getMillis(); while (firehose.hasMore()) { final InputRow inputRow; try { inputRow = firehose.nextRow(); if (inputRow == null) { continue; } final Sink sink = plumber.getSink(inputRow.getTimestampFromEpoch()); if (sink == null) { fireDepartment.getMetrics().incrementThrownAway(); log.debug("Throwing away event[%s]", inputRow); if (System.currentTimeMillis() > nextFlush) { plumber.persist(firehose.commit()); nextFlush = new DateTime().plus(intermediatePersistPeriod).getMillis(); } continue; } if (sink.isEmpty()) { log.info("Task %s: New sink: %s", getId(), sink); } int currCount = sink.add(inputRow); fireDepartment.getMetrics().incrementProcessed(); if (currCount >= fireDepartmentConfig.getMaxRowsInMemory() || System.currentTimeMillis() > nextFlush) { plumber.persist(firehose.commit()); nextFlush = new DateTime().plus(intermediatePersistPeriod).getMillis(); } } catch (FormattedException e) { log.warn(e, "unparseable line"); fireDepartment.getMetrics().incrementUnparseable(); } } } catch (Throwable e) { normalExit = false; log.makeAlert(e, "Exception aborted realtime processing[%s]", schema.getDataSource()).emit(); throw e; } finally { if (normalExit) { try { plumber.persist(firehose.commit()); plumber.finishJob(); } catch (Exception e) { log.makeAlert(e, "Failed to finish realtime task").emit(); } finally { Closeables.closeQuietly(firehose); toolbox.getMonitorScheduler().removeMonitor(metricsMonitor); } } } return TaskStatus.success(getId()); }