private void snapshotTranslog(Translog.Snapshot snapshot, CommitPoint.FileInfo fileInfo) throws IOException {
    blobContainer.writeBlob(fileInfo.name(), snapshot.stream(), snapshot.lengthInBytes());

    // long chunkBytes = Long.MAX_VALUE;
    // if (chunkSize != null) {
    //     chunkBytes = chunkSize.bytes();
    // }
    //
    // long totalLength = fileInfo.length();
    // long numberOfChunks = totalLength / chunkBytes;
    // if (totalLength % chunkBytes > 0) {
    //     numberOfChunks++;
    // }
    // if (numberOfChunks == 0) {
    //     numberOfChunks++;
    // }
    //
    // if (numberOfChunks == 1) {
    //     blobContainer.writeBlob(fileInfo.name(), snapshot.stream(), snapshot.lengthInBytes());
    // } else {
    //     InputStream translogStream = snapshot.stream();
    //     long totalLengthLeftToWrite = totalLength;
    //     for (int i = 0; i < numberOfChunks; i++) {
    //         long lengthToWrite = chunkBytes;
    //         if (totalLengthLeftToWrite < chunkBytes) {
    //             lengthToWrite = totalLengthLeftToWrite;
    //         }
    //         blobContainer.writeBlob(fileInfo.name() + ".part" + i,
    //                 new LimitInputStream(translogStream, lengthToWrite), lengthToWrite);
    //         totalLengthLeftToWrite -= lengthToWrite;
    //     }
    // }
}
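// Illustrative sketch, not part of the original class: the chunk-count arithmetic that the
// commented-out chunked-upload path above relies on, isolated as a pure helper.
static long numberOfChunks(long totalLength, long chunkBytes) {
    long chunks = totalLength / chunkBytes;
    if (totalLength % chunkBytes > 0) {
        chunks++; // a trailing partial chunk still needs its own ".part" blob
    }
    if (chunks == 0) {
        chunks++; // an empty translog is still written as a single (empty) blob
    }
    return chunks;
}
// e.g. numberOfChunks(10, 4) == 3 and numberOfChunks(0, 4) == 1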
public void testV0LegacyTranslogVersion() throws Exception {
    Path translogFile = getDataPath("/org/elasticsearch/index/translog/translog-v0.binary");
    assertThat("test file should exist", Files.exists(translogFile), equalTo(true));
    try (ImmutableTranslogReader reader = openReader(translogFile, 0)) {
        assertThat("a version0 stream is returned", reader instanceof LegacyTranslogReader, equalTo(true));
        try (final Translog.Snapshot snapshot = reader.newSnapshot()) {
            final Translog.Operation operation = snapshot.next();
            assertThat("operation is the correct type", operation.opType() == Translog.Operation.Type.INDEX, equalTo(true));
            Translog.Index op = (Translog.Index) operation;
            assertThat(op.id(), equalTo("1"));
            assertThat(op.type(), equalTo("doc"));
            assertThat(op.source().toUtf8(), equalTo("{\"body\": \"worda wordb wordc wordd \\\"worde\\\" wordf\"}"));
            assertThat(op.routing(), equalTo(null));
            assertThat(op.parent(), equalTo(null));
            assertThat(op.version(), equalTo(1L));
            assertThat(op.timestamp(), equalTo(1407312091791L));
            assertThat(op.ttl(), equalTo(-1L));
            assertThat(op.versionType(), equalTo(VersionType.INTERNAL));
            assertNull(snapshot.next());
        }
    }
}
/**
 * Indicates that the same translog exists, but new operations have been appended to it.
 * Throws {@link ElasticSearchIllegalStateException} if {@link #newTranslogCreated()} is
 * <tt>true</tt>, so always check that first.
 */
public boolean sameTranslogNewOperations() {
    if (newTranslogCreated()) {
        throw new ElasticSearchIllegalStateException("Should not be called when there is a new translog");
    }
    return translogSnapshot.length() > lastTranslogLength;
}
public void testCorruptedTranslogs() throws Exception {
    try {
        Path translogFile = getDataPath("/org/elasticsearch/index/translog/translog-v1-corrupted-magic.binary");
        assertThat("test file should exist", Files.exists(translogFile), equalTo(true));
        openReader(translogFile, 0);
        fail("should have thrown an exception about the header being corrupt");
    } catch (TranslogCorruptedException e) {
        assertThat("translog corruption from header: " + e.getMessage(),
                e.getMessage().contains("translog looks like version 1 or later, but has corrupted header"),
                equalTo(true));
    }

    try {
        Path translogFile = getDataPath("/org/elasticsearch/index/translog/translog-invalid-first-byte.binary");
        assertThat("test file should exist", Files.exists(translogFile), equalTo(true));
        openReader(translogFile, 0);
        fail("should have thrown an exception about the header being corrupt");
    } catch (TranslogCorruptedException e) {
        assertThat("translog corruption from header: " + e.getMessage(),
                e.getMessage().contains("Invalid first byte in translog file, got: 1, expected 0x00 or 0x3f"),
                equalTo(true));
    }

    try {
        Path translogFile = getDataPath("/org/elasticsearch/index/translog/translog-v1-corrupted-body.binary");
        assertThat("test file should exist", Files.exists(translogFile), equalTo(true));
        try (ImmutableTranslogReader reader = openReader(translogFile, 0)) {
            try (final Translog.Snapshot snapshot = reader.newSnapshot()) {
                while (snapshot.next() != null) {}
            }
        }
        fail("should have thrown an exception about the body being corrupted");
    } catch (TranslogCorruptedException e) {
        assertThat("translog corruption from body: " + e.getMessage(),
                e.getMessage().contains("translog corruption while reading from stream"),
                equalTo(true));
    }
}
public void testTruncatedTranslog() throws Exception {
    try {
        Path translogFile = getDataPath("/org/elasticsearch/index/translog/translog-v1-truncated.binary");
        assertThat("test file should exist", Files.exists(translogFile), equalTo(true));
        try (ImmutableTranslogReader reader = openReader(translogFile, 0)) {
            try (final Translog.Snapshot snapshot = reader.newSnapshot()) {
                while (snapshot.next() != null) {}
            }
        }
        fail("should have thrown an exception about the body being truncated");
    } catch (TranslogCorruptedException e) {
        assertThat("translog truncated: " + e.getMessage(),
                e.getMessage().contains("operation size is corrupted must be"),
                equalTo(true));
    }
}
@Override
public <T> T snapshot(SnapshotHandler<T> snapshotHandler) throws EngineException {
    SnapshotIndexCommit snapshotIndexCommit = null;
    Translog.Snapshot translogSnapshot = null;
    rwl.readLock().lock();
    try {
        snapshotIndexCommit = deletionPolicy.snapshot();
        translogSnapshot = translog.snapshot();
    } catch (Exception e) {
        if (snapshotIndexCommit != null) {
            snapshotIndexCommit.release();
        }
        throw new SnapshotFailedEngineException(shardId, e);
    } finally {
        rwl.readLock().unlock();
    }

    try {
        return snapshotHandler.snapshot(snapshotIndexCommit, translogSnapshot);
    } finally {
        snapshotIndexCommit.release();
        translogSnapshot.release();
    }
}
public void testV1ChecksummedTranslogVersion() throws Exception {
    Path translogFile = getDataPath("/org/elasticsearch/index/translog/translog-v1.binary");
    assertThat("test file should exist", Files.exists(translogFile), equalTo(true));
    try (ImmutableTranslogReader reader = openReader(translogFile, 0)) {
        try (final Translog.Snapshot snapshot = reader.newSnapshot()) {
            assertThat("a version1 stream is returned", reader instanceof ImmutableTranslogReader, equalTo(true));
            Translog.Operation operation = snapshot.next();
            assertThat("operation is the correct type", operation.opType() == Translog.Operation.Type.INDEX, equalTo(true));
            Translog.Index op = (Translog.Index) operation;
            assertThat(op.id(), equalTo("Bwiq98KFSb6YjJQGeSpeiw"));
            assertThat(op.type(), equalTo("doc"));
            assertThat(op.source().toUtf8(), equalTo("{\"body\": \"foo\"}"));
            assertThat(op.routing(), equalTo(null));
            assertThat(op.parent(), equalTo(null));
            assertThat(op.version(), equalTo(1L));
            assertThat(op.timestamp(), equalTo(1408627184844L));
            assertThat(op.ttl(), equalTo(-1L));
            assertThat(op.versionType(), equalTo(VersionType.INTERNAL));

            // There are more operations
            int opNum = 1;
            while (snapshot.next() != null) {
                opNum++;
            }
            assertThat("there should be 5 translog operations", opNum, equalTo(5));
        }
    }
}
@Override
public void recover(RecoveryHandler recoveryHandler) throws EngineException {
    // take a write lock here so it won't happen while a flush is in progress
    // this means that next commits will not be allowed once the lock is released
    rwl.writeLock().lock();
    try {
        disableFlushCounter++;
    } finally {
        rwl.writeLock().unlock();
    }

    SnapshotIndexCommit phase1Snapshot;
    try {
        phase1Snapshot = deletionPolicy.snapshot();
    } catch (IOException e) {
        --disableFlushCounter;
        throw new RecoveryEngineException(shardId, 1, "Snapshot failed", e);
    }

    try {
        recoveryHandler.phase1(phase1Snapshot);
    } catch (Exception e) {
        --disableFlushCounter;
        phase1Snapshot.release();
        throw new RecoveryEngineException(shardId, 1, "Execution failed", e);
    }

    Translog.Snapshot phase2Snapshot;
    try {
        phase2Snapshot = translog.snapshot();
    } catch (Exception e) {
        --disableFlushCounter;
        phase1Snapshot.release();
        throw new RecoveryEngineException(shardId, 2, "Snapshot failed", e);
    }

    try {
        recoveryHandler.phase2(phase2Snapshot);
    } catch (Exception e) {
        --disableFlushCounter;
        phase1Snapshot.release();
        phase2Snapshot.release();
        throw new RecoveryEngineException(shardId, 2, "Execution failed", e);
    }

    rwl.writeLock().lock();
    Translog.Snapshot phase3Snapshot;
    try {
        phase3Snapshot = translog.snapshot(phase2Snapshot);
    } catch (Exception e) {
        --disableFlushCounter;
        rwl.writeLock().unlock();
        phase1Snapshot.release();
        phase2Snapshot.release();
        throw new RecoveryEngineException(shardId, 3, "Snapshot failed", e);
    }

    try {
        recoveryHandler.phase3(phase3Snapshot);
    } catch (Exception e) {
        throw new RecoveryEngineException(shardId, 3, "Execution failed", e);
    } finally {
        --disableFlushCounter;
        rwl.writeLock().unlock();
        phase1Snapshot.release();
        phase2Snapshot.release();
        phase3Snapshot.release();
    }
}
/**
 * Indicates that a new transaction log has been created. Note, check this <b>before</b> you
 * check {@link #sameTranslogNewOperations()}.
 */
public boolean newTranslogCreated() {
    return translogSnapshot.translogId() != lastTranslogId;
}
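// Illustrative usage sketch (not from the original source): the two checks above are
// order-sensitive, so a caller is expected to test newTranslogCreated() first and only fall
// back to sameTranslogNewOperations() when it returns false. The caller below is hypothetical.
void exampleSnapshotDecision(Snapshot snapshot) {
    if (snapshot.newTranslogCreated()) {
        // a brand new translog: the whole translog has to be snapshotted
    } else if (snapshot.sameTranslogNewOperations()) {
        // same translog, but it grew: only the appended operations are needed
    } else {
        // nothing changed since the last snapshot
    }
}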
/**
 * Send the given snapshot's operations to this handler's target node.
 *
 * <p>Operations are bulked into a single request depending on an operation count limit or
 * size-in-bytes limit
 *
 * @return the total number of translog operations that were sent
 */
protected int sendSnapshot(final Translog.Snapshot snapshot) {
    int ops = 0;
    long size = 0;
    int totalOperations = 0;
    final List<Translog.Operation> operations = new ArrayList<>();
    Translog.Operation operation;
    try {
        operation = snapshot.next(); // this ex should bubble up
    } catch (IOException ex) {
        throw new ElasticsearchException("failed to get next operation from translog", ex);
    }

    if (operation == null) {
        logger.trace("[{}][{}] no translog operations to send to {}",
                indexName, shardId, request.targetNode());
    }
    while (operation != null) {
        if (shard.state() == IndexShardState.CLOSED) {
            throw new IndexShardClosedException(request.shardId());
        }
        cancellableThreads.checkForCancel();
        operations.add(operation);
        ops += 1;
        size += operation.estimateSize();
        totalOperations++;

        // Check if this request is past bytes threshold, and
        // if so, send it off
        if (size >= chunkSizeInBytes) {
            // don't throttle translog, since we lock for phase3 indexing,
            // so we need to move it as fast as possible. Note, since we
            // index docs to replicas while the index files are recovered
            // the lock can potentially be removed, in which case, it might
            // make sense to re-enable throttling in this phase
            cancellableThreads.execute(
                    () -> recoveryTarget.indexTranslogOperations(operations, snapshot.totalOperations()));
            if (logger.isTraceEnabled()) {
                logger.trace("[{}][{}] sent batch of [{}][{}] (total: [{}]) translog operations to {}",
                        indexName, shardId, ops, new ByteSizeValue(size),
                        snapshot.totalOperations(), request.targetNode());
            }
            ops = 0;
            size = 0;
            operations.clear();
        }

        try {
            operation = snapshot.next(); // this ex should bubble up
        } catch (IOException ex) {
            throw new ElasticsearchException("failed to get next operation from translog", ex);
        }
    }

    // send the leftover
    if (!operations.isEmpty()) {
        cancellableThreads.execute(
                () -> recoveryTarget.indexTranslogOperations(operations, snapshot.totalOperations()));
    }

    if (logger.isTraceEnabled()) {
        logger.trace("[{}][{}] sent final batch of [{}][{}] (total: [{}]) translog operations to {}",
                indexName, shardId, ops, new ByteSizeValue(size),
                snapshot.totalOperations(), request.targetNode());
    }
    return totalOperations;
}
private void doSnapshot(final Snapshot snapshot) throws IndexShardGatewaySnapshotFailedException {
    ImmutableMap<String, BlobMetaData> blobs;
    try {
        blobs = blobContainer.listBlobs();
    } catch (IOException e) {
        throw new IndexShardGatewaySnapshotFailedException(shardId, "failed to list blobs", e);
    }

    long generation = findLatestFileNameGeneration(blobs);
    CommitPoints commitPoints = buildCommitPoints(blobs);

    currentSnapshotStatus.index().startTime(System.currentTimeMillis());
    currentSnapshotStatus.updateStage(SnapshotStatus.Stage.INDEX);

    final SnapshotIndexCommit snapshotIndexCommit = snapshot.indexCommit();
    final Translog.Snapshot translogSnapshot = snapshot.translogSnapshot();

    final CountDownLatch indexLatch = new CountDownLatch(snapshotIndexCommit.getFiles().length);
    final CopyOnWriteArrayList<Throwable> failures = new CopyOnWriteArrayList<Throwable>();
    final List<CommitPoint.FileInfo> indexCommitPointFiles = Lists.newArrayList();

    int indexNumberOfFiles = 0;
    long indexTotalFilesSize = 0;
    for (final String fileName : snapshotIndexCommit.getFiles()) {
        StoreFileMetaData md;
        try {
            md = store.metaData(fileName);
        } catch (IOException e) {
            throw new IndexShardGatewaySnapshotFailedException(shardId, "Failed to get store file metadata", e);
        }

        boolean snapshotRequired = false;
        if (snapshot.indexChanged() && fileName.equals(snapshotIndexCommit.getSegmentsFileName())) {
            snapshotRequired = true; // we want to always snapshot the segments file if the index changed
        }

        CommitPoint.FileInfo fileInfo = commitPoints.findPhysicalIndexFile(fileName);
        if (fileInfo == null || !fileInfo.isSame(md) || !commitPointFileExistsInBlobs(fileInfo, blobs)) {
            // the file does not exist in any commit point, or has a different length, or does
            // not fully exist in the listed blobs
            snapshotRequired = true;
        }

        if (snapshotRequired) {
            indexNumberOfFiles++;
            indexTotalFilesSize += md.length();
            // create a new FileInfo
            try {
                CommitPoint.FileInfo snapshotFileInfo = new CommitPoint.FileInfo(
                        fileNameFromGeneration(++generation), fileName, md.length(), md.checksum());
                indexCommitPointFiles.add(snapshotFileInfo);
                snapshotFile(snapshotIndexCommit.getDirectory(), snapshotFileInfo, indexLatch, failures);
            } catch (IOException e) {
                failures.add(e);
                indexLatch.countDown();
            }
        } else {
            indexCommitPointFiles.add(fileInfo);
            indexLatch.countDown();
        }
    }
    currentSnapshotStatus.index().files(indexNumberOfFiles, indexTotalFilesSize);

    try {
        indexLatch.await();
    } catch (InterruptedException e) {
        failures.add(e);
    }
    if (!failures.isEmpty()) {
        throw new IndexShardGatewaySnapshotFailedException(shardId(),
                "Failed to perform snapshot (index files)", failures.get(failures.size() - 1));
    }

    currentSnapshotStatus.index().time(System.currentTimeMillis() - currentSnapshotStatus.index().startTime());

    currentSnapshotStatus.updateStage(SnapshotStatus.Stage.TRANSLOG);
    currentSnapshotStatus.translog().startTime(System.currentTimeMillis());

    // Note, we assume the snapshot always starts from "base 0". We need to seek forward to
    // lastTranslogPosition if we only want the delta.
    List<CommitPoint.FileInfo> translogCommitPointFiles = Lists.newArrayList();
    int expectedNumberOfOperations = 0;
    boolean snapshotRequired = false;
    if (snapshot.newTranslogCreated()) {
        if (translogSnapshot.lengthInBytes() > 0) {
            snapshotRequired = true;
            expectedNumberOfOperations = translogSnapshot.estimatedTotalOperations();
        }
    } else {
        // if we have a commit point, check that we have all the files listed in it in the blob store
        if (!commitPoints.commits().isEmpty()) {
            CommitPoint commitPoint = commitPoints.commits().get(0);
            boolean allTranslogFilesExists = true;
            for (CommitPoint.FileInfo fileInfo : commitPoint.translogFiles()) {
                if (!commitPointFileExistsInBlobs(fileInfo, blobs)) {
                    allTranslogFilesExists = false;
                    break;
                }
            }
            // if everything exists, we can seek forward in case there are new operations,
            // otherwise, we copy over everything again...
            if (allTranslogFilesExists) {
                translogCommitPointFiles.addAll(commitPoint.translogFiles());
                if (snapshot.sameTranslogNewOperations()) {
                    translogSnapshot.seekForward(snapshot.lastTranslogLength());
                    if (translogSnapshot.lengthInBytes() > 0) {
                        snapshotRequired = true;
                        expectedNumberOfOperations =
                                translogSnapshot.estimatedTotalOperations() - snapshot.lastTotalTranslogOperations();
                    }
                }
                // else (no new operations, nothing to snapshot)
            } else {
                // a full translog snapshot is required
                if (translogSnapshot.lengthInBytes() > 0) {
                    expectedNumberOfOperations = translogSnapshot.estimatedTotalOperations();
                    snapshotRequired = true;
                }
            }
        } else {
            // no commit point, snapshot all the translog
            if (translogSnapshot.lengthInBytes() > 0) {
                expectedNumberOfOperations = translogSnapshot.estimatedTotalOperations();
                snapshotRequired = true;
            }
        }
    }
    currentSnapshotStatus.translog().expectedNumberOfOperations(expectedNumberOfOperations);

    if (snapshotRequired) {
        CommitPoint.FileInfo addedTranslogFileInfo = new CommitPoint.FileInfo(
                fileNameFromGeneration(++generation),
                "translog-" + translogSnapshot.translogId(),
                translogSnapshot.lengthInBytes(),
                null /* no need for checksum in translog */);
        translogCommitPointFiles.add(addedTranslogFileInfo);
        try {
            snapshotTranslog(translogSnapshot, addedTranslogFileInfo);
        } catch (Exception e) {
            throw new IndexShardGatewaySnapshotFailedException(shardId, "Failed to snapshot translog", e);
        }
    }
    currentSnapshotStatus.translog().time(System.currentTimeMillis() - currentSnapshotStatus.translog().startTime());

    // now create and write the commit point
    currentSnapshotStatus.updateStage(SnapshotStatus.Stage.FINALIZE);
    long version = 0;
    if (!commitPoints.commits().isEmpty()) {
        version = commitPoints.commits().iterator().next().version() + 1;
    }
    String commitPointName = "commit-" + Long.toString(version, Character.MAX_RADIX);
    CommitPoint commitPoint = new CommitPoint(version, commitPointName, CommitPoint.Type.GENERATED,
            indexCommitPointFiles, translogCommitPointFiles);
    try {
        byte[] commitPointData = CommitPoints.toXContent(commitPoint);
        blobContainer.writeBlob(commitPointName, new FastByteArrayInputStream(commitPointData), commitPointData.length);
    } catch (Exception e) {
        throw new IndexShardGatewaySnapshotFailedException(shardId, "Failed to write commit point", e);
    }

    // delete all files that are not referenced by any commit point
    // build a new CommitPoints, that includes this one and all the saved ones
    List<CommitPoint> newCommitPointsList = Lists.newArrayList();
    newCommitPointsList.add(commitPoint);
    for (CommitPoint point : commitPoints) {
        if (point.type() == CommitPoint.Type.SAVED) {
            newCommitPointsList.add(point);
        }
    }
    CommitPoints newCommitPoints = new CommitPoints(newCommitPointsList);

    // first, go over and delete all the commit points that are no longer referenced
    for (String blobName : blobs.keySet()) {
        if (!blobName.startsWith("commit-")) {
            continue;
        }
        long checkedVersion = Long.parseLong(blobName.substring("commit-".length()), Character.MAX_RADIX);
        if (!newCommitPoints.hasVersion(checkedVersion)) {
            try {
                blobContainer.deleteBlob(blobName);
            } catch (IOException e) {
                // ignore
            }
        }
    }

    // now go over all the blobs, and if they don't exist in a commit point, delete them
    for (String blobName : blobs.keySet()) {
        String name = blobName;
        if (!name.startsWith("__")) {
            continue;
        }
        if (blobName.contains(".part")) {
            name = blobName.substring(0, blobName.indexOf(".part"));
        }
        if (newCommitPoints.findNameFile(name) == null) {
            try {
                blobContainer.deleteBlob(blobName);
            } catch (IOException e) {
                // ignore, will delete it later
            }
        }
    }
}
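// Illustrative sketch (not from the original source): commit point blobs are named
// "commit-" + Long.toString(version, Character.MAX_RADIX), and the cleanup loop above parses
// the version back out with Long.parseLong in the same radix. Isolated as a tiny helper:
static long commitPointVersionFromBlobName(String blobName) {
    assert blobName.startsWith("commit-");
    return Long.parseLong(blobName.substring("commit-".length()), Character.MAX_RADIX);
}
// e.g. "commit-" + Long.toString(41, Character.MAX_RADIX) yields "commit-15",
// and commitPointVersionFromBlobName("commit-15") == 41.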
/**
 * Send the given snapshot's operations to this handler's target node.
 *
 * <p>Operations are bulked into a single request depending on an operation count limit or
 * size-in-bytes limit
 *
 * @return the total number of translog operations that were sent
 */
protected int sendSnapshot(Translog.Snapshot snapshot) throws ElasticsearchException {
    int ops = 0;
    long size = 0;
    int totalOperations = 0;
    final List<Translog.Operation> operations = Lists.newArrayList();
    Translog.Operation operation = snapshot.next();

    final TransportRequestOptions recoveryOptions = TransportRequestOptions.options()
            .withCompress(recoverySettings.compress())
            .withType(TransportRequestOptions.Type.RECOVERY)
            .withTimeout(recoverySettings.internalActionLongTimeout());

    if (operation == null) {
        logger.trace("[{}][{}] no translog operations (id: [{}]) to send to {}",
                indexName, shardId, snapshot.translogId(), request.targetNode());
    }
    while (operation != null) {
        if (shard.state() == IndexShardState.CLOSED) {
            throw new IndexShardClosedException(request.shardId());
        }
        cancellableThreads.checkForCancel();
        operations.add(operation);
        ops += 1;
        size += operation.estimateSize();
        totalOperations++;

        // Check if this request is past the operation count or size-in-bytes threshold, and
        // if so, send it off
        if (ops >= recoverySettings.translogOps() || size >= recoverySettings.translogSize().bytes()) {
            // don't throttle translog, since we lock for phase3 indexing,
            // so we need to move it as fast as possible. Note, since we
            // index docs to replicas while the index files are recovered
            // the lock can potentially be removed, in which case, it might
            // make sense to re-enable throttling in this phase
            // if (recoverySettings.rateLimiter() != null) {
            //     recoverySettings.rateLimiter().pause(size);
            // }

            if (logger.isTraceEnabled()) {
                logger.trace("[{}][{}] sending batch of [{}][{}] (total: [{}], id: [{}]) translog operations to {}",
                        indexName, shardId, ops, new ByteSizeValue(size),
                        shard.translog().estimatedNumberOfOperations(), snapshot.translogId(), request.targetNode());
            }

            cancellableThreads.execute(new Interruptable() {
                @Override
                public void run() throws InterruptedException {
                    final RecoveryTranslogOperationsRequest translogOperationsRequest =
                            new RecoveryTranslogOperationsRequest(request.recoveryId(), request.shardId(),
                                    operations, shard.translog().estimatedNumberOfOperations());
                    transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.TRANSLOG_OPS,
                            translogOperationsRequest, recoveryOptions,
                            EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
                }
            });

            ops = 0;
            size = 0;
            operations.clear();
        }
        operation = snapshot.next();
    }

    // send the leftover
    if (logger.isTraceEnabled()) {
        logger.trace("[{}][{}] sending final batch of [{}][{}] (total: [{}], id: [{}]) translog operations to {}",
                indexName, shardId, ops, new ByteSizeValue(size),
                shard.translog().estimatedNumberOfOperations(), snapshot.translogId(), request.targetNode());
    }
    if (!operations.isEmpty()) {
        cancellableThreads.execute(new Interruptable() {
            @Override
            public void run() throws InterruptedException {
                RecoveryTranslogOperationsRequest translogOperationsRequest =
                        new RecoveryTranslogOperationsRequest(request.recoveryId(), request.shardId(),
                                operations, shard.translog().estimatedNumberOfOperations());
                transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.TRANSLOG_OPS,
                        translogOperationsRequest, recoveryOptions,
                        EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
            }
        });
    }
    return totalOperations;
}
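// Illustrative sketch (not part of the original class): the batch-flush decision used in the
// sendSnapshot loops above, isolated as a pure helper. The earlier variant checks only the
// byte-size limit; this variant checks both. maxOps and maxSizeInBytes stand in for
// recoverySettings.translogOps() and recoverySettings.translogSize().bytes().
static boolean shouldFlushBatch(int ops, long sizeInBytes, int maxOps, long maxSizeInBytes) {
    // a batch is sent as soon as either limit is reached; whichever trips first wins
    return ops >= maxOps || sizeInBytes >= maxSizeInBytes;
}
// e.g. shouldFlushBatch(1000, 512, 1000, 1 << 20) == true      (operation count limit hit)
//      shouldFlushBatch(10, 2_000_000, 1000, 1 << 20) == true  (size-in-bytes limit hit)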