Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,13 @@ public enum WorkType {
HA; // Restart a VM.
}

/**
 * Reason for which an HA work item was scheduled.
 *
 * HaWorkVO persists this value by name (string-enumerated "reason" column),
 * so the constant names are part of the stored data.
 */
enum ReasonType {
    /** No specific reason is known. */
    Unknown,
    /** NOTE(review): presumably work triggered by putting a host into maintenance - confirm with callers. */
    HostMaintenance,
    /** Used when restarts are scheduled for VMs on a host whose status is Down. */
    HostDown,
    /** NOTE(review): presumably work triggered by a degraded host - confirm with callers. */
    HostDegraded
}

/**
 * Step an HA work item is currently in.
 */
enum Step {
    Scheduled,
    Investigating,
    Fencing,
    Stopping,
    Restarting,
    Migrating,
    Cancelled,
    Done,
    Error
}
Expand All @@ -92,7 +99,7 @@ enum Step {
* Investigate why a host has disconnected and migrate the VMs on it
* if necessary.
*
* @param host - the host that has disconnected.
* @param hostId - the id of the host that has disconnected.
*/
Status investigate(long hostId);

Expand All @@ -109,17 +116,19 @@ enum Step {
* @param investigate must be investigated before we do anything with this vm.
*/
void scheduleRestart(VMInstanceVO vm, boolean investigate);
void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType);

void cancelDestroy(VMInstanceVO vm, Long hostId);

boolean scheduleDestroy(VMInstanceVO vm, long hostId);
boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType);

/**
* Schedule restarts for all vms running on the host.
* @param host host.
* @param investigate TODO
* @param investigate whether to investigate
* @param reasonType reason for HA work
*/
void scheduleRestartForVmsOnHost(HostVO host, boolean investigate);
void scheduleRestartForVmsOnHost(HostVO host, boolean investigate, ReasonType reasonType);

/**
* Schedule the vm for migration.
Expand All @@ -128,6 +137,7 @@ enum Step {
* @return true if schedule worked.
*/
boolean scheduleMigration(VMInstanceVO vm);
boolean scheduleMigration(VMInstanceVO vm, ReasonType reasonType);

List<VMInstanceVO> findTakenMigrationWork();

Expand All @@ -140,10 +150,11 @@ enum Step {
* 3. Check if a VM has been stopped: WorkType.CheckStop
*
* @param vm virtual machine to stop.
* @param host host the virtual machine is on.
* @param hostId the id of the host the virtual machine is on.
* @param type which type of stop is requested.
*/
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type);
boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType);

void cancelScheduledMigrations(HostVO host);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -993,7 +993,7 @@ protected boolean handleDisconnectWithInvestigation(final AgentAttache attache,
handleDisconnectWithoutInvestigation(attache, event, true, true);
host = _hostDao.findById(hostId); // Maybe the host magically reappeared?
if (host != null && host.getStatus() == Status.Down) {
_haMgr.scheduleRestartForVmsOnHost(host, true);
_haMgr.scheduleRestartForVmsOnHost(host, true, HighAvailabilityManager.ReasonType.HostDown);
}
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,6 @@ CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.volumes', 'last_id', 'bigint(20) uns

-- Add used_iops column to support IOPS data in storage stats
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.storage_pool', 'used_iops', 'bigint unsigned DEFAULT NULL COMMENT "IOPS currently in use for this storage pool" ');

-- Add reason column for op_ha_work
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.op_ha_work', 'reason', 'varchar(32) DEFAULT NULL COMMENT "Reason for the HA work"');
15 changes: 14 additions & 1 deletion server/src/main/java/com/cloud/ha/HaWorkVO.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ public class HaWorkVO implements InternalIdentity {
@Column(name = "tried")
int timesTried;

@Column(name = "reason")
@Enumerated(value = EnumType.STRING)
private HighAvailabilityManager.ReasonType reasonType;

protected HaWorkVO() {
}

Expand Down Expand Up @@ -179,7 +183,7 @@ public void setPreviousState(State state) {
}

public HaWorkVO(final long instanceId, final VirtualMachine.Type type, final WorkType workType, final Step step, final long hostId, final State previousState,
final int timesTried, final long updated) {
final int timesTried, final long updated, HighAvailabilityManager.ReasonType reasonType) {
this.workType = workType;
this.type = type;
this.instanceId = instanceId;
Expand All @@ -191,6 +195,7 @@ public HaWorkVO(final long instanceId, final VirtualMachine.Type type, final Wor
this.step = step;
this.timeToTry = System.currentTimeMillis() >> 10;
this.updateTime = updated;
this.reasonType = reasonType;
}

@Override
Expand All @@ -207,4 +212,12 @@ public String toString() {
.append("]")
.toString();
}

/**
 * Returns the reason recorded for this HA work.
 *
 * @return the reason type; may be {@code null}, since the backing "reason" column
 *         defaults to NULL and a reason is not mandatory when work is created.
 */
public HighAvailabilityManager.ReasonType getReasonType() {
return reasonType;
}

/**
 * Sets the reason recorded for this HA work.
 *
 * @param reasonType the reason to store; the value is assigned as-is, without validation.
 */
public void setReasonType(HighAvailabilityManager.ReasonType reasonType) {
this.reasonType = reasonType;
}
}
74 changes: 62 additions & 12 deletions server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Zone;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
Expand All @@ -43,6 +44,7 @@
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
import org.apache.logging.log4j.ThreadContext;

import com.cloud.agent.AgentManager;
import com.cloud.alert.AlertManager;
Expand Down Expand Up @@ -90,7 +92,6 @@
import com.cloud.vm.VirtualMachineManager;
import com.cloud.vm.VirtualMachineProfile;
import com.cloud.vm.dao.VMInstanceDao;
import org.apache.logging.log4j.ThreadContext;

/**
* HighAvailabilityManagerImpl coordinates the HA process. VMs are registered with the HA Manager for HA. The request is stored
Expand Down Expand Up @@ -133,6 +134,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
protected static ConfigKey<Boolean> VmHaAlertsEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.alerts.enabled", "true",
"Enable/Disable alerts for the VM HA operations, it is enabled by default.", true, Zone);

// Reasons for which checkAndCancelWorkIfNeeded may cancel work that is still in the
// Investigating step once the host is reported Up.
// Work scheduled through the overloads without a ReasonType carries a null reason.
// contains(null) on this Arrays.asList-backed list returns false, so that work is never
// cancelled. Do not replace this with List.of(...), whose contains(null) throws
// NullPointerException.
protected static final List<ReasonType> CancellableWorkReasonTypes =
Arrays.asList(ReasonType.HostMaintenance, ReasonType.HostDown, ReasonType.HostDegraded);

WorkerThread[] _workers;
boolean _stopped;
long _timeToSleep;
Expand Down Expand Up @@ -269,8 +273,7 @@ public Status investigate(final long hostId) {
}

@Override
public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate) {

public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate, ReasonType reasonType) {
if (host.getType() != Host.Type.Routing) {
return;
}
Expand Down Expand Up @@ -337,12 +340,12 @@ public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate)
logger.debug("VM {} is not on down host {} it is on other host {} VM HA is done", vm, host, hostId);
continue;
}
scheduleRestart(vm, investigate);
scheduleRestart(vm, investigate, reasonType);
}
}

@Override
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType) {
assert (type == WorkType.CheckStop || type == WorkType.ForceStop || type == WorkType.Stop);

if (_haDao.hasBeenScheduled(vm.getId(), type)) {
Expand All @@ -359,7 +362,7 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
return false;
}

HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
_haDao.persist(work);
if (logger.isDebugEnabled()) {
logger.debug("Scheduled " + work);
Expand All @@ -368,6 +371,11 @@ public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
return true;
}

/**
 * Schedules a stop for the VM without recording a reason.
 * Delegates to the overload that takes a ReasonType, passing {@code null}.
 *
 * @param vm virtual machine to stop.
 * @param hostId the id of the host the virtual machine is on.
 * @param type which type of stop is requested.
 * @return the result of the delegated call.
 */
@Override
public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
return scheduleStop(vm, hostId, type, null);
}

protected void wakeupWorkers() {
logger.debug("Wakeup workers HA");
for (WorkerThread worker : _workers) {
Expand All @@ -376,7 +384,7 @@ protected void wakeupWorkers() {
}

@Override
public boolean scheduleMigration(final VMInstanceVO vm) {
public boolean scheduleMigration(final VMInstanceVO vm, ReasonType reasonType) {
if (vm.getHostId() == null) {
return false;
}
Expand All @@ -390,15 +398,20 @@ public boolean scheduleMigration(final VMInstanceVO vm) {
return false;
}

final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated(), reasonType);
_haDao.persist(work);
logger.info("Scheduled migration work of VM {} from host {} with HAWork {}", vm, _hostDao.findById(vm.getHostId()), work);
wakeupWorkers();
return true;
}

@Override
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
/**
 * Schedules a migration for the VM without recording a reason.
 * Delegates to the overload that takes a ReasonType, passing {@code null}.
 *
 * @param vm virtual machine to migrate.
 * @return the result of the delegated call.
 */
public boolean scheduleMigration(final VMInstanceVO vm) {
return scheduleMigration(vm, null);
}

@Override
public void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType) {
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
String message = String.format("Unable to schedule restart for the VM %s (%d), VM high availability manager is disabled.", vm.getName(), vm.getId());
if (logger.isDebugEnabled()) {
Expand Down Expand Up @@ -490,7 +503,7 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
}

HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled,
hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated());
hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated(), reasonType);
_haDao.persist(work);

if (logger.isInfoEnabled()) {
Expand All @@ -500,6 +513,11 @@ public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
wakeupWorkers();
}

/**
 * Schedules a restart for the VM without recording a reason.
 * Delegates to the overload that takes a ReasonType, passing {@code null}.
 *
 * @param vm virtual machine to restart.
 * @param investigate forwarded unchanged to the delegated overload.
 */
@Override
public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
scheduleRestart(vm, investigate, null);
}

private void startVm(VirtualMachine vm, Map<VirtualMachineProfile.Param, Object> params,
DeploymentPlanner planner) throws InsufficientCapacityException, ResourceUnavailableException,
ConcurrentOperationException, OperationTimedoutException {
Expand Down Expand Up @@ -561,6 +579,9 @@ protected Long restart(final HaWorkVO work) {
logger.info("Unable to find vm: " + vmId);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
return null;
}

logger.info("HA on " + vm);
if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) {
Expand Down Expand Up @@ -762,6 +783,22 @@ protected Long restart(final HaWorkVO work) {
return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
}

/**
 * Cancels the given HA work when it is no longer needed.
 *
 * Work is cancelled only when all of the following hold, checked in this order:
 * it is still in the Investigating step, its reason is one of
 * CancellableWorkReasonTypes, and investigating its host reports the host as Up.
 * The host is investigated only after the first two checks pass.
 *
 * On cancellation this method only sets the step of the passed work object to
 * Cancelled; it performs no DAO call itself.
 *
 * @param work the HA work to examine.
 * @return true if the work was marked Cancelled, false otherwise.
 */
protected boolean checkAndCancelWorkIfNeeded(final HaWorkVO work) {
    final boolean eligibleForCancel = Step.Investigating.equals(work.getStep())
            && CancellableWorkReasonTypes.contains(work.getReasonType());
    // Short-circuit keeps the host investigation from running for ineligible work.
    if (!eligibleForCancel || !Status.Up.equals(investigate(work.getHostId()))) {
        return false;
    }
    logger.debug("Cancelling {} as it is not needed anymore", () -> work);
    work.setStep(Step.Cancelled);
    return true;
}

public Long migrate(final HaWorkVO work) {
long vmId = work.getInstanceId();
long srcHostId = work.getHostId();
Expand All @@ -772,6 +809,9 @@ public Long migrate(final HaWorkVO work) {
logger.info("Unable to find vm: " + vmId + ", skipping migrate.");
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
return null;
}
logger.info("Migration attempt: for VM {}from host {}. Starting attempt: {}/{} times.", vm, srcHost, 1 + work.getTimesTried(), _maxRetries);
try {
work.setStep(Step.Migrating);
Expand All @@ -791,7 +831,7 @@ public Long migrate(final HaWorkVO work) {
}

@Override
public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
public boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType) {
if (!VmHaEnabled.valueIn(vm.getDataCenterId())) {
String message = String.format("Unable to schedule destroy for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId);
if (logger.isDebugEnabled()) {
Expand All @@ -801,7 +841,7 @@ public boolean scheduleDestroy(VMInstanceVO vm, long hostId) {
return false;
}

final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated());
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType);
_haDao.persist(work);
if (logger.isDebugEnabled()) {
logger.debug("Scheduled " + work.toString());
Expand Down Expand Up @@ -838,6 +878,9 @@ protected Long destroyVM(final HaWorkVO work) {
logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
return null;
}
boolean expunge = VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())
|| VirtualMachine.Type.ConsoleProxy.equals(vm.getType());
if (!expunge && VirtualMachine.State.Destroyed.equals(work.getPreviousState())) {
Expand Down Expand Up @@ -872,6 +915,9 @@ protected Long stopVM(final HaWorkVO work) throws ConcurrentOperationException {
work.setStep(Step.Done);
return null;
}
if (checkAndCancelWorkIfNeeded(work)) {
return null;
}
logger.info("Stopping " + vm);
try {
if (work.getWorkType() == WorkType.Stop) {
Expand Down Expand Up @@ -1057,6 +1103,8 @@ public boolean configure(final String name, final Map<String, Object> xmlParams)
public boolean start() {
_stopped = false;

_haDao.markPendingWorksAsInvestigating();

for (final WorkerThread thread : _workers) {
thread.start();
}
Expand All @@ -1074,6 +1122,8 @@ public boolean stop() {

_executor.shutdown();

_haDao.markServerPendingWorksAsInvestigating(_msServer.getId());

return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,6 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {

List<HaWorkVO> listPendingMigrationsForVm(long vmId);
int expungeByVmList(List<Long> vmIds, Long batchSize);
void markPendingWorksAsInvestigating();
void markServerPendingWorksAsInvestigating(long managementServerId);
}
Loading