/** * Builds alert file with all required information - type of event, attachments and sends to * ConnectEMC */ protected void buildAlertFile(EmaApiConnectHome alertFile, EmaApiLogType log) throws Exception { _log.info("Start SendEvent::buildEventType"); alertFile.eventAdd(getEventType(), log); // Create event file to attach String eventFilename = CONNECT_EMC_HOME + EmaApiUtils.emaGenerateFilename(_fileId); _log.info("Event filename: {}", eventFilename); ArrayList<String> fileList = genAttachFiles(); BadRequestException badRequestException = null; if (fileList != null && !fileList.isEmpty()) { boolean attachLogs = true; try { validateAttachmentSize(fileList); } catch (BadRequestException e) { if (forceAttachLogs) { throw e; } badRequestException = e; attachLogs = false; } ArrayList<EmaApiFilenameType> attachFiles = new ArrayList<EmaApiFilenameType>(); if (attachLogs) { for (String file : fileList) { EmaApiFilenameType filename = new EmaApiFilenameType(); filename.setQualifiedFileName(file); filename.setTargetFileName(getTargetFileName(file)); attachFiles.add(filename); } } else { // log size too big, not to attach logs for (String file : fileList) { if (file.equals(SYSTEM_LOGS_FILE_PATH) || file.equals(SYSTEM_EVENT_FILE_PATH)) { continue; } EmaApiFilenameType filename = new EmaApiFilenameType(); filename.setQualifiedFileName(file); filename.setTargetFileName(getTargetFileName(file)); attachFiles.add(filename); } AlertsLogger.getAlertsLogger() .warn( "ConnectEMC alert will be sent without logs attached due to logs have exceeded max allowed size (" + this.getAttachmentsMaxSizeMB() + " MB)"); } alertFile.addFileRawData(eventFilename, attachFiles, log); } alertFile.write(eventFilename, log); alertFile.emaCreateDotEndFile(eventFilename, log); _log.info("Finish SendEvent::buildEventType"); if (badRequestException != null) { throw badRequestException; } }
/** Check offline event info to see if dbsvc/geodbsvc on this node could get started */
private void checkDBOfflineInfo() {
    // Downtime tracker config in ZK holds per-node last-active / offline timestamps.
    Configuration config =
        _coordinator.queryConfiguration(
            _coordinator.getSiteId(), Constants.DB_DOWNTIME_TRACKER_CONFIG, _serviceInfo.getName());
    DbOfflineEventInfo dbOfflineEventInfo = new DbOfflineEventInfo(config);
    String localNodeId = _coordinator.getInetAddessLookupMap().getNodeId();

    // If ZK has no record for this node, fall back to "now" so diffTime below is 0 (no outage).
    Long lastActiveTimestamp = dbOfflineEventInfo.geLastActiveTimestamp(localNodeId);
    long zkTimeStamp = (lastActiveTimestamp == null) ? TimeUtils.getCurrentTime() : lastActiveTimestamp;

    // Local timestamp comes from the newest file under the db directory; an empty/missing
    // directory likewise falls back to "now" (fresh node, nothing stale to protect).
    File localDbDir = new File(dbDir);
    Date lastModified = getLastModified(localDbDir);
    boolean isDirEmpty = lastModified == null || localDbDir.list().length == 0;
    long localTimeStamp = (isDirEmpty) ? TimeUtils.getCurrentTime() : lastModified.getTime();

    _log.info("Service timestamp in ZK is {}, local file is: {}", zkTimeStamp, localTimeStamp);
    // Only a local copy OLDER than the cluster's last sighting counts as drift.
    long diffTime = (zkTimeStamp > localTimeStamp) ? (zkTimeStamp - localTimeStamp) : 0;
    if (diffTime >= MAX_SERVICE_OUTAGE_TIME) {
        // Likely a VM snapshot rollback: refuse to boot rather than resurrect stale data.
        String errMsg =
            String.format(
                "We detect database files on local disk are more than %s days older "
                    + "than last time it was seen in the cluster. It may bring stale data into the database, "
                    + "so the service cannot continue to boot. It may be the result of a VM snapshot rollback. "
                    + "Please contact with EMC support engineer for solution.",
                diffTime / TimeUtils.DAYS);
        alertLog.error(errMsg);
        throw new IllegalStateException(errMsg);
    }

    // Separate check: node itself was recorded offline for too long (needs node recovery).
    Long offlineTime = dbOfflineEventInfo.getOfflineTimeInMS(localNodeId);
    if (!isDirEmpty && offlineTime != null && offlineTime >= MAX_SERVICE_OUTAGE_TIME) {
        String errMsg =
            String.format(
                "This node is offline for more than %s days. It may bring stale data into "
                    + "database, so the service cannot continue to boot. Please poweroff this node and follow our "
                    + "node recovery procedure to recover this node",
                offlineTime / TimeUtils.DAYS);
        alertLog.error(errMsg);
        throw new IllegalStateException(errMsg);
    }
}
/** Default database service implementation */ public class DbServiceImpl implements DbService { private static final Logger _log = LoggerFactory.getLogger(DbServiceImpl.class); private static final String DB_INITIALIZED_FLAG_FILE = "/var/run/storageos/dbsvc_initialized"; public static DbServiceImpl instance = null; // run failure detector every 5 min by default private static final int DEFAULT_DETECTOR_RUN_INTERVAL_MIN = 5; private int _detectorInterval = DEFAULT_DETECTOR_RUN_INTERVAL_MIN; // Service outage time should be less than 5 days, or else service will not be allowed to get // started any more. // As we checked the downtime every 15 mins, to avoid actual downtime undervalued, setting the max // value as 4 days. private static final long MAX_SERVICE_OUTAGE_TIME = 4 * TimeUtils.DAYS; private AlertsLogger alertLog = AlertsLogger.getAlertsLogger(); private String _config; private CoordinatorClient _coordinator; private CassandraDaemon _service; private SchemaUtil _schemaUtil; private MigrationHandler _handler; private GarbageCollectionExecutor _gcExecutor; private TaskScrubberExecutor _taskScrubber; private ActionableEventScrubberExecutor _eventScrubber; // 3 threads two threads for node repair, one is for failure detector private static final String POOL_NAME = "DBBackgroundPool"; private ScheduledExecutorService _exe = new NamedScheduledThreadPoolExecutor(POOL_NAME, 3); protected Service _serviceInfo; private JmxServerWrapper _jmxServer; private DbClientImpl _dbClient; private ServiceBeacon _svcBeacon; private DbServiceStatusChecker _statusChecker; // db directory private String dbDir; private String keystorePath; private String truststorePath; private boolean cassandraInitialized = false; private boolean disableScheduledDbRepair = false; private Boolean backCompatPreYoda = false; @Autowired private DbCompactWorker compactWorker; @Autowired private DbManager dbMgr; public void setDbMgr(DbManager dbMgr) { this.dbMgr = dbMgr; } /** Set db client */ public void 
setDbClient(DbClientImpl dbClient) { _dbClient = dbClient; } /** Set coordinator client */ public void setCoordinator(CoordinatorClient coordinator) { _coordinator = coordinator; } public CoordinatorClient getCoordinator() { return _coordinator; } /** * Set DB schema utility * * @param schemaUtil */ public void setSchemaUtil(SchemaUtil schemaUtil) { _schemaUtil = schemaUtil; } public void setMigrationHandler(MigrationHandler handler) { _handler = handler; } /** * Service setter * * @param service service info */ public void setService(final Service service) { _serviceInfo = service; } /** * Set database config file. It must be in URI form or file must be be in classpath * * @param config database config file */ public void setConfig(String config) { _config = config; } /** JMX server wrapper */ public void setJmxServerWrapper(JmxServerWrapper jmxServer) { _jmxServer = jmxServer; } public void setGarbageCollector(GarbageCollectionExecutor gcExecutor) { _gcExecutor = gcExecutor; } public ActionableEventScrubberExecutor getEventScrubber() { return _eventScrubber; } public void setEventScrubber(ActionableEventScrubberExecutor eventScrubber) { this._eventScrubber = eventScrubber; } public TaskScrubberExecutor getTaskScrubber() { return _taskScrubber; } public void setTaskScrubber(TaskScrubberExecutor taskScrubber) { this._taskScrubber = taskScrubber; } public void setBeacon(ServiceBeacon beacon) { _svcBeacon = beacon; } @Autowired public void setStatusChecker(DbServiceStatusChecker statusChecker) { _statusChecker = statusChecker; } public void setDbDir(String dbDir) { this.dbDir = dbDir; } public String getDbDir() { return this.dbDir; } public void setDisableScheduledDbRepair(boolean disableScheduledDbRepair) { this.disableScheduledDbRepair = disableScheduledDbRepair; } public void setBackCompatPreYoda(Boolean backCompatPreYoda) { this.backCompatPreYoda = backCompatPreYoda; } /** * Check if it is GeoDbSvc * * @return */ private boolean isGeoDbsvc() { return 
_schemaUtil.isGeoDbsvc(); } /** * Get schema lock name using by current service. * * @return */ private String getSchemaLockName() { return isGeoDbsvc() ? DbConfigConstants.GEODB_SCHEMA_LOCK : DbConfigConstants.DB_SCHEMA_LOCK; } public String getConfigValue(String key) { String configKind = _coordinator.getDbConfigPath(_serviceInfo.getName()); Configuration config = _coordinator.queryConfiguration(_coordinator.getSiteId(), configKind, _serviceInfo.getId()); if (config != null) { return config.getConfig(key); } return null; } public void setConfigValue(String key, String value) { String configKind = _coordinator.getDbConfigPath(_serviceInfo.getName()); Configuration config = _coordinator.queryConfiguration(_coordinator.getSiteId(), configKind, _serviceInfo.getId()); if (config != null) { config.setConfig(key, value); _coordinator.persistServiceConfiguration(_coordinator.getSiteId(), config); } } /** * Checks and registers db configuration information, this is one time when cluster is coming up * for the first time */ private Configuration checkConfiguration() { String configKind = _coordinator.getDbConfigPath(_serviceInfo.getName()); Configuration config = _coordinator.queryConfiguration(_coordinator.getSiteId(), configKind, _serviceInfo.getId()); if (config == null) { // check if it is upgraded from previous version to yoda - configuration may be stored in // zk global area /config. Since SeedProvider still need access that, so we remove the config // from global in migration callback after migration is done. config = _coordinator.queryConfiguration(configKind, _serviceInfo.getId()); if (config != null) { _log.info("Upgrade from pre-yoda release, move dbconfig to new location"); _coordinator.persistServiceConfiguration(_coordinator.getSiteId(), config); return config; } // this is a new node // 1. register its configuration with coordinator // 2. 
assume autobootstrap configuration // this means that when a node is added, it take 1/2 of biggest token rage and // copies its data over ConfigurationImpl cfg = new ConfigurationImpl(); cfg.setId(_serviceInfo.getId()); cfg.setKind(configKind); cfg.setConfig(DbConfigConstants.NODE_ID, _coordinator.getInetAddessLookupMap().getNodeId()); cfg.setConfig(DbConfigConstants.AUTOBOOT, Boolean.TRUE.toString()); // check other existing db nodes List<Configuration> configs = _coordinator.queryAllConfiguration(_coordinator.getSiteId(), configKind); if (configs.isEmpty()) { // we are the first node - turn off autobootstrap cfg.setConfig(DbConfigConstants.AUTOBOOT, Boolean.FALSE.toString()); } // persist configuration _coordinator.persistServiceConfiguration(_coordinator.getSiteId(), cfg); config = cfg; } return config; } private void removeStaleConfiguration() { removeStaleServiceConfiguration(); removeStaleVersionedDbConfiguration(); } private void removeStaleVersionedDbConfiguration() { String configKind = _coordinator.getVersionedDbConfigPath(_serviceInfo.getName(), _serviceInfo.getVersion()); List<Configuration> configs = _coordinator.queryAllConfiguration(_coordinator.getSiteId(), configKind); for (Configuration config : configs) { if (isStaleConfiguration(config)) { _coordinator.removeServiceConfiguration(_coordinator.getSiteId(), config); _log.info("Remove stale version db config, id: {}", config.getId()); } } } private void removeStaleServiceConfiguration() { boolean isGeoDBSvc = isGeoDbsvc(); boolean resetAutoBootFlag = false; String configKind = _coordinator.getDbConfigPath(_serviceInfo.getName()); List<Configuration> configs = _coordinator.queryAllConfiguration(_coordinator.getSiteId(), configKind); for (Configuration config : configs) { if (isStaleConfiguration(config)) { boolean autoboot = Boolean.parseBoolean(config.getConfig(DbConfigConstants.AUTOBOOT)); String configId = config.getId(); if (isGeoDBSvc && !autoboot && (configId.equals("geodb-4") || 
configId.equals("geodb-5"))) { // for geodbsvc, if restore with the backup of 5 nodes to 3 nodes and the backup is made // on the cluster that the 'autoboot=false' is set on vipr4 or vipr5 // we should set the autoboot=false on the current node or no node with autoboot=false // TODO:This is a temporary/safest solution in Yoda, we'll provide a better soltuion post // Yoda resetAutoBootFlag = true; } if (isStaleConfiguration(config)) { _coordinator.removeServiceConfiguration(_coordinator.getSiteId(), config); _log.info("Remove stale db config, id: {}", config.getId()); } } } if (resetAutoBootFlag) { _log.info("set autoboot flag to false on {}", _serviceInfo.getId()); Configuration config = _coordinator.queryConfiguration( _coordinator.getSiteId(), configKind, _serviceInfo.getId()); config.setConfig(DbConfigConstants.AUTOBOOT, Boolean.FALSE.toString()); _coordinator.persistServiceConfiguration(_coordinator.getSiteId(), config); } } private boolean isStaleConfiguration(Configuration config) { String delimiter = "-"; String configId = config.getId(); // Bypasses item of "global" and folders of "version", just check db configurations. 
if (configId == null || configId.equals(Constants.GLOBAL_ID) || !configId.contains(delimiter)) { return false; } if (_serviceInfo.getId().endsWith(Constants.STANDALONE_ID)) { if (!configId.equals(_serviceInfo.getId())) { return true; } } else { CoordinatorClientInetAddressMap nodeMap = _coordinator.getInetAddessLookupMap(); int nodeCount = nodeMap.getControllerNodeIPLookupMap().size(); String nodeIndex = configId.split(delimiter)[1]; if (Constants.STANDALONE_ID.equalsIgnoreCase(nodeIndex) || Integer.parseInt(nodeIndex) > nodeCount) { return true; } } return false; } // check and initialize global configuration private Configuration checkGlobalConfiguration() { String configKind = _coordinator.getDbConfigPath(_serviceInfo.getName()); Configuration config = _coordinator.queryConfiguration(_coordinator.getSiteId(), configKind, Constants.GLOBAL_ID); if (config == null) { // check if it is upgraded from previous version to yoda - configuration may be stored in // znode /config. Since SeedProvider still need access that, so we remove the config // from global in migration callback after migration is done. 
config = _coordinator.queryConfiguration(configKind, Constants.GLOBAL_ID); if (config != null) { _log.info("Upgrade from pre-yoda release, move global config to new location"); _coordinator.persistServiceConfiguration(_coordinator.getSiteId(), config); return config; } ConfigurationImpl cfg = new ConfigurationImpl(); cfg.setId(Constants.GLOBAL_ID); cfg.setKind(configKind); cfg.setConfig(Constants.SCHEMA_VERSION, this._serviceInfo.getVersion()); // persist configuration _coordinator.persistServiceConfiguration(_coordinator.getSiteId(), cfg); config = cfg; } return config; } // check and initialize versioned configuration private Configuration checkVersionedConfiguration() { String serviceVersion = _serviceInfo.getVersion(); String dbSchemaVersion = _dbClient.getSchemaVersion(); if (!serviceVersion.equals(dbSchemaVersion)) { _log.warn( "The db service version {} doesn't equals Db schema version {}, " + "set db service version to Db schema version", serviceVersion, dbSchemaVersion); _serviceInfo.setVersion(dbSchemaVersion); } String kind = _coordinator.getVersionedDbConfigPath(_serviceInfo.getName(), _serviceInfo.getVersion()); Configuration config = _coordinator.queryConfiguration(_coordinator.getSiteId(), kind, _serviceInfo.getId()); if (config == null) { // check if it is upgraded from previous version to yoda - configuration may be stored in // znode /config config = _coordinator.queryConfiguration(kind, _serviceInfo.getId()); if (config != null) { _log.info("Upgrade from pre-2.5 release, move versioned dbconfig to new location"); _coordinator.persistServiceConfiguration(_coordinator.getSiteId(), config); return config; } ConfigurationImpl cfg = new ConfigurationImpl(); cfg.setId(_serviceInfo.getId()); cfg.setKind(kind); // persist configuration _coordinator.persistServiceConfiguration(_coordinator.getSiteId(), cfg); config = cfg; } return config; } /** Check offline event info to see if dbsvc/geodbsvc on this node could get started */ private void 
checkDBOfflineInfo() { Configuration config = _coordinator.queryConfiguration( _coordinator.getSiteId(), Constants.DB_DOWNTIME_TRACKER_CONFIG, _serviceInfo.getName()); DbOfflineEventInfo dbOfflineEventInfo = new DbOfflineEventInfo(config); String localNodeId = _coordinator.getInetAddessLookupMap().getNodeId(); Long lastActiveTimestamp = dbOfflineEventInfo.geLastActiveTimestamp(localNodeId); long zkTimeStamp = (lastActiveTimestamp == null) ? TimeUtils.getCurrentTime() : lastActiveTimestamp; File localDbDir = new File(dbDir); Date lastModified = getLastModified(localDbDir); boolean isDirEmpty = lastModified == null || localDbDir.list().length == 0; long localTimeStamp = (isDirEmpty) ? TimeUtils.getCurrentTime() : lastModified.getTime(); _log.info("Service timestamp in ZK is {}, local file is: {}", zkTimeStamp, localTimeStamp); long diffTime = (zkTimeStamp > localTimeStamp) ? (zkTimeStamp - localTimeStamp) : 0; if (diffTime >= MAX_SERVICE_OUTAGE_TIME) { String errMsg = String.format( "We detect database files on local disk are more than %s days older " + "than last time it was seen in the cluster. It may bring stale data into the database, " + "so the service cannot continue to boot. It may be the result of a VM snapshot rollback. " + "Please contact with EMC support engineer for solution.", diffTime / TimeUtils.DAYS); alertLog.error(errMsg); throw new IllegalStateException(errMsg); } Long offlineTime = dbOfflineEventInfo.getOfflineTimeInMS(localNodeId); if (!isDirEmpty && offlineTime != null && offlineTime >= MAX_SERVICE_OUTAGE_TIME) { String errMsg = String.format( "This node is offline for more than %s days. It may bring stale data into " + "database, so the service cannot continue to boot. 
Please poweroff this node and follow our " + "node recovery procedure to recover this node", offlineTime / TimeUtils.DAYS); alertLog.error(errMsg); throw new IllegalStateException(errMsg); } } /** * Checks and sets INIT_DONE state this means we are done with the actual cf changes on the * cassandra side for the target version */ private void setDbConfigInitDone() { String configKind = _coordinator.getVersionedDbConfigPath(_serviceInfo.getName(), _serviceInfo.getVersion()); Configuration config = _coordinator.queryConfiguration(_coordinator.getSiteId(), configKind, _serviceInfo.getId()); if (config != null) { if (config.getConfig(DbConfigConstants.INIT_DONE) == null) { config.setConfig(DbConfigConstants.INIT_DONE, Boolean.TRUE.toString()); _coordinator.persistServiceConfiguration(_coordinator.getSiteId(), config); } } else { // we are expecting this to exist, because its initialized from checkVersionedConfiguration throw new IllegalStateException("unexpected error, db versioned configuration is null"); } } /** Initializes the keystore/truststore if the paths have been provided. 
*/ private void initKeystoreAndTruststore() { try { DbClientContext ctx = _dbClient.getLocalContext(); if (isGeoDbsvc()) { ctx = _dbClient.getGeoContext(); } String keystorePath = ctx.getKeyStoreFile(); String truststorePath = ctx.getTrustStoreFile(); if (keystorePath == null && truststorePath == null) { _log.info("Skipping keystore/truststore initialization, no paths provided"); return; } String password = ctx.getTrustStorePassword(); CassandraKeystoreHandler keystoreHandler = new CassandraKeystoreHandler(_coordinator, keystorePath, truststorePath, password); if (keystorePath != null) { _log.info("Initializing keystore for current node: {}", keystorePath); keystoreHandler.saveKeyStore(); } else { _log.info("Skipping keystore initialization, no path provided"); } if (truststorePath != null) { _log.info("Initializing truststore for current node: {}", truststorePath); keystoreHandler.saveTrustStore(); } else { _log.info("Skipping truststore initialization, no path provided"); } } catch (Exception e) { _log.error("Unexpected exception during initializing cassandra keystore", e); throw new IllegalStateException(e); } } /** * Use a db initialized flag file to block the peripheral services from starting. This gurantees * CPU cyles for the core services during boot up. 
*/ protected void setDbInitializedFlag() { // set the flag file only for dbsvc (not for geodbsvc) since it always uses more time to // complete comparing to the other if (isGeoDbsvc()) return; File dbInitializedFlag = new File(DB_INITIALIZED_FLAG_FILE); try { if (!dbInitializedFlag.exists()) { new FileOutputStream(dbInitializedFlag).close(); } } catch (Exception e) { _log.error("Failed to create file {} e=", dbInitializedFlag.getName(), e); } } @Override public void start() throws IOException { if (_log.isInfoEnabled()) { _log.info("Starting DB service..."); } // Suppress Sonar violation of Lazy initialization of static fields should be synchronized // start() method will be only called one time when startup dbsvc, so it's safe to ignore sonar // violation instance = this; // NOSONAR ("squid:S2444") if (backCompatPreYoda) { _log.info( "Pre-yoda back compatible flag detected. Initialize local keystore/truststore for Cassandra native encryption"); initKeystoreAndTruststore(); _schemaUtil.setBackCompatPreYoda(true); } System.setProperty("cassandra.config", _config); System.setProperty("cassandra.config.loader", CassandraConfigLoader.class.getName()); // Set to false to clear all gossip state for the node on restart. // // We encounter a weird Cassandra grossip issue(COP-19246) - some nodes are missing from gossip // when rebooting the entire cluster simultaneously. Critical Gossip // fields(ApplicationState.STATUS, ApplicationState.TOKENS) // are not synchronized during handshaking. It looks like some problem caused by incorrect // gossip version/generation // at system local table. So add this option to cleanup local gossip state during reboot // // Make sure add-vdc/add-standby passed when you would remove this option in the future. // // We need make sure majority local nodes are added as seed nodes. 
Otherwise cassandra may not // see other nodes if it loses // connection to other sites System.setProperty("cassandra.load_ring_state", "false"); // Nodes in new data center should not auto-bootstrap. // See // https://docs.datastax.com/en/cassandra/2.0/cassandra/operations/ops_add_dc_to_cluster_t.html if (_schemaUtil.isStandby()) { System.setProperty("cassandra.auto_bootstrap", "false"); } InterProcessLock lock = null; Configuration config = null; StartupMode mode = null; try { // we use this lock to discourage more than one node bootstrapping / joining at the same time // Cassandra can handle this but it's generally not recommended to make changes to schema // concurrently lock = getLock(getSchemaLockName()); config = checkConfiguration(); checkGlobalConfiguration(); checkVersionedConfiguration(); removeStaleConfiguration(); mode = checkStartupMode(config); _log.info("Current startup mode is {}", mode); // Check if service is allowed to get started by querying db offline info to avoid bringing // back stale data. // Skipping hibernate mode for node recovery procedure to recover the overdue node. 
int nodeCount = ((CoordinatorClientImpl) _coordinator).getNodeCount(); if (nodeCount != 1 && mode.type != StartupMode.StartupModeType.HIBERNATE_MODE) { checkDBOfflineInfo(); } // this call causes instantiation of a seed provider instance, so the check*Configuration // calls must be preceed it removeCassandraSavedCaches(); mode.onPreStart(); if (_jmxServer != null) { _jmxServer.start(); System.setProperty( "com.sun.management.jmxremote.port", Integer.toString(_jmxServer.getPort())); } _service = new CassandraDaemon(); _service.init(null); _service.start(); cassandraInitialized = true; mode.onPostStart(); } catch (Exception e) { if (mode != null && mode.type == StartupMode.StartupModeType.HIBERNATE_MODE) { printRecoveryWorkAround(e); } _log.error("e=", e); throw new IllegalStateException(e); } finally { if (lock != null) { try { lock.release(); } catch (Exception ignore) { _log.debug("lock release failed"); } } } if (config.getConfig(DbConfigConstants.JOINED) == null) { config.setConfig(DbConfigConstants.JOINED, Boolean.TRUE.toString()); _coordinator.persistServiceConfiguration(_coordinator.getSiteId(), config); } _statusChecker.waitForAllNodesJoined(); _svcBeacon.start(); if (backCompatPreYoda) { _log.info("Enable duplicated beacon in global area during pre-yoda upgrade"); startDupBeacon(); } setDbInitializedFlag(); setDbConfigInitDone(); _dbClient.start(); if (_schemaUtil.isStandby()) { String localDataRevision = getLocalDataRevision(); if (localDataRevision != null) { _schemaUtil.checkDataRevision(localDataRevision); } } // Setup the vdc information, so that login enabled before migration if (!isGeoDbsvc()) { _schemaUtil.checkAndSetupBootStrapInfo(_dbClient); } dbMgr.init(); if (_handler.run()) { // Setup the bootstrap info root tenant, if root tenant migrated from local db, then skip it if (isGeoDbsvc()) { _schemaUtil.checkAndSetupBootStrapInfo(_dbClient); } else { _schemaUtil.checkAndInitStorageSystemTypes(_dbClient); } startBackgroundTasks(); _log.info("DB 
service started"); } else { _log.error("DB migration failed. Skipping starting background tasks."); } } private InterProcessLock getLock(String name) throws Exception { InterProcessLock lock = null; while (true) { try { lock = _coordinator.getSiteLocalLock(name); lock.acquire(); break; // got lock } catch (Exception e) { if (_coordinator.isConnected()) { throw e; } } } return lock; } private void startDupBeacon() { ServiceBeaconImpl dupBeacon = new ServiceBeaconImpl(); dupBeacon.setService(((ServiceBeaconImpl) _svcBeacon).getService()); dupBeacon.setZkConnection(((ServiceBeaconImpl) _svcBeacon).getZkConnection()); dupBeacon.setSiteSpecific(false); dupBeacon.start(); } /** * Check startup mode on disk. Startup mode is specified by a property file on disk * ${dbdir}/startupmode * * @param config The Confiugration instance * @return BootMode instance if detected, null for no on-disk startup mode */ private StartupMode checkStartupModeOnDisk(Configuration config) throws IOException { String modeType = readStartupModeFromDisk(); if (modeType != null) { if (Constants.STARTUPMODE_HIBERNATE.equalsIgnoreCase(modeType)) { HibernateMode mode = new HibernateMode(config); mode.setCoordinator(_coordinator); mode.setSchemaUtil(_schemaUtil); mode.setDbDir(dbDir); return mode; } else if (Constants.STARTUPMODE_RESTORE_REINIT.equalsIgnoreCase(modeType)) { _log.info( "GeodbRestore startup mode found. 
Current vdc list {}", _schemaUtil.getVdcList().size()); if (isGeoDbsvc() && _schemaUtil.getVdcList().size() > 1) { GeodbRestoreMode mode = new GeodbRestoreMode(config); mode.setCoordinator(_coordinator); mode.setSchemaUtil(_schemaUtil); mode.setDbDir(dbDir); return mode; } } else { throw new IllegalStateException("Unexpected startup mode " + modeType); } } return null; } public String readStartupModeFromDisk() throws IOException { File startupModeFile = new File(dbDir, Constants.STARTUPMODE); String modeType = readValueFromFile(startupModeFile, Constants.STARTUPMODE); _log.info("On disk startup mode found {}", modeType); return modeType; } /** Remove startup mode flag on disk */ protected void removeStartupModeOnDisk() { _log.info("Remove bootmode file"); File bootModeFile = new File(dbDir, Constants.STARTUPMODE); bootModeFile.delete(); } /** * Read bool value from given db config * * @param config * @param name * @return */ private boolean checkConfigBool(Configuration config, String name) { String value = config.getConfig(name); return value != null && Boolean.parseBoolean(value); } /** * Read a string list(connected by ',') from given db config * * @param config * @return */ private List<String> checkConfigList(Configuration config, String name) { String peerIPs = config.getConfig(name); ArrayList<String> peers = new ArrayList<String>(); if (peerIPs != null) { for (String ip : StringUtils.split(peerIPs, ",")) { peers.add(ip); } } return peers; } /** * Determine current startup mode. See BootMode for detailed explanation of each mode. 
* * @param config * @return */ private StartupMode checkStartupMode(Configuration config) throws IOException { // Check on disk mode first StartupMode bootMode = checkStartupModeOnDisk(config); if (bootMode != null) { return bootMode; } // Check geodb restore flag in zk if (checkConfigBool(config, Constants.STARTUPMODE_RESTORE_REINIT)) { _log.info("Found geodbrestore config: {}", Constants.STARTUPMODE_RESTORE_REINIT); GeodbRestoreMode mode = new GeodbRestoreMode(config); mode.setCoordinator(_coordinator); mode.setSchemaUtil(_schemaUtil); mode.setDbDir(dbDir); return mode; } // Check geodb reinit ZK flag for add-vdc if (checkConfigBool(config, Constants.REINIT_DB)) { _log.info("Found reinit config: {}", Constants.REINIT_DB); // reinit both system table and StorageOS tables DbReinitMode mode = new DbReinitMode(config); mode.setCoordinator(_coordinator); mode.setSchemaUtil(_schemaUtil); mode.setDbDir(dbDir); return mode; } // check geodb cleanup mode for remove-vdc List<String> obsoletePeers = checkConfigList(config, Constants.OBSOLETE_CASSANDRA_PEERS); if (!obsoletePeers.isEmpty()) { // drop peers ip/tokens from system table ObsoletePeersCleanupMode mode = new ObsoletePeersCleanupMode(config); mode.setCoordinator(_coordinator); mode.setSchemaUtil(_schemaUtil); mode.setObsoletePeers(obsoletePeers); return mode; } else { NormalMode mode = new NormalMode(config); mode.setCoordinator(_coordinator); mode.setSchemaUtil(_schemaUtil); return mode; } } /** Kick off background jobs */ private void startBackgroundTasks() { if (!_schemaUtil.isStandby()) { if (!disableScheduledDbRepair) { startBackgroundNodeRepairTask(); } if (_gcExecutor != null) { _gcExecutor.setDbServiceId(_serviceInfo.getId()); _gcExecutor.start(); } if (_taskScrubber != null) { _taskScrubber.start(); } if (_eventScrubber != null) { _eventScrubber.start(); } } startBackgroundDetectorTask(); startBackgroundCompactTask(); } private void startBackgroundCompactTask() { if (this.compactWorker != null) { // 
compactWorker is null in JUnit environment this.compactWorker.start(); } } /** Start the node repair task in background */ private void startBackgroundNodeRepairTask() { this.dbMgr.start(); } /** * Start the detector task to monitor Cassandra events. When Cassandra encounter internal * exception or FS error, it will stop Gossip and RPC, watch such events so that dbsvc could * recover from Cassandra internal exception via restart TODO: include other meaningful stats into * consideration like memory usage, etc.. */ private void startBackgroundDetectorTask() { /* start after _detectorInterval 5 mins by default */ _exe.scheduleWithFixedDelay( new Runnable() { @Override public void run() { try { _log.debug("Starting failure detector"); StorageServiceMBean svc = null; svc = StorageService.instance; boolean isRPCRunning = svc.isRPCServerRunning(); boolean isGossipEnabled = svc.isInitialized(); _log.debug( "Thrift status = " + isRPCRunning + ", gossip status = " + isGossipEnabled); if (!isRPCRunning && !isGossipEnabled) { _log.info("Thrift RPC and Gossip both stopped on this node"); _log.error("Cassandra service stopped unexpectedly, stopping dbsvc forcely ..."); /* * As Gossip and RPC stopped, we are not able to flush table out before exit */ System.exit(1); } _log.debug("End failure detector"); } catch (Exception e) { _log.warn("Unexpected exception during cassandra failure detect", e); } } }, _detectorInterval, _detectorInterval, TimeUnit.MINUTES); } /* * Cassandra saved caches would occasionally get corrupted after the reboot, and then * dbsvc will fail to start due to the error of OOM. Delete these files before the start * of dbsvc to avoid this issue, and these files could be rebuilt afterwards. * we should elminate this trick update after Cassandra solve this issue in future. 
*/ private void removeCassandraSavedCaches() { _log.info("Try to remove cassandra saved caches"); String savedCachesLocation = DatabaseDescriptor.getSavedCachesLocation(); File savedCachesDir = new File(savedCachesLocation); if (savedCachesDir != null && savedCachesDir.exists()) { for (File file : savedCachesDir.listFiles()) { FileUtils.deleteQuietly(file); } _log.info("Delete cassandra saved caches({}) successfully", savedCachesLocation); } } @Override public void stop() { if (_log.isInfoEnabled()) { _log.info("Stopping DB service..."); } if (_gcExecutor != null) { _gcExecutor.stop(); } _exe.shutdownNow(); if (_jmxServer != null) { _jmxServer.stop(); } if (_log.isInfoEnabled()) { _log.info("DB service stopped..."); } } /** * Output more clear message in the log when a node down during node recovery introduced by * CASSANDRA-2434 in cassandra 2.1. */ private void printRecoveryWorkAround(Exception e) { if (e.getMessage().startsWith("A node required to move the data consistently is down (")) { String sourceIp = e.getMessage().split("\\(")[1].split("\\)")[0]; _log.error( "{} of node {} is unavailable during node recovery, please double check the node status.", isGeoDbsvc() ? "geodbsvc" : "dbsvc", sourceIp); _log.error("Node recovery will fail in 30 minutes if {} not back to normal state.", sourceIp); } } /** * Read local data revision number. Db data directory is a symbol link to a data revision * directory as the following /data/db/1 -> /data/db/1459567039514.0 Here data version number is * 1459567039514 and 0 is incremental snapshot number. 
It is always 0 for db revisions * * @return * @throws IOException */ private String getLocalDataRevision() { Path dbDataDir = Paths.get(dbDir, "1"); try { if (Files.isSymbolicLink(dbDataDir)) { Path symDir = Files.readSymbolicLink(dbDataDir); String versionName = symDir.toFile().getName(); int i = versionName.lastIndexOf("."); return versionName.substring(0, i); } } catch (Exception ex) { _log.error("Retrieve local data revision error", ex); } return null; } }