Skip to content

Commit

Permalink
Snat health check generates unnecessary api server updates
Browse files Browse the repository at this point in the history
On vrouter down, the detach info is not sent correctly to the API server. This causes a stream of
redundant messages/updates to the API server at a regular interval. The fix now correctly updates the
virtual-machine and vrouter references so that a new vrouter can be attached to the VM.

In addition, the service check interval is now configurable in contrail-svc-monitor.conf:
check_service_interval=<seconds>

Change-Id: I352ec821128e801b24cd227ce35824906140fcab
Closes-Bug: #1502300
  • Loading branch information
Hampapur Ajay authored and rrugge committed Nov 4, 2015
1 parent b08912f commit 03272b6
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 22 deletions.
2 changes: 1 addition & 1 deletion src/config/common/svc_info.py
Expand Up @@ -16,7 +16,7 @@
_VN_SNAT_PREFIX_NAME = 'snat-si-left'
_VN_SNAT_SUBNET_CIDR = '100.64.0.0/29'

_CHECK_SVC_VM_HEALTH_INTERVAL = 30
_CHECK_SVC_VM_HEALTH_INTERVAL = 60

_VM_INSTANCE_TYPE = 'virtual-machine'
_NETNS_INSTANCE_TYPE = 'network-namespace'
Expand Down
1 change: 1 addition & 0 deletions src/config/svc-monitor/svc_monitor/config_db.py
Expand Up @@ -318,6 +318,7 @@ def __init__(self, uuid, obj_dict=None):
self.params = None
self.state = 'init'
self.launch_count = 0
self.back_off = -1
self.image = None
self.flavor = None
self.max_instances = 0
Expand Down
23 changes: 7 additions & 16 deletions src/config/svc-monitor/svc_monitor/instance_manager.py
Expand Up @@ -518,6 +518,7 @@ def _associate_vrouter(self, si, vm):
vrouter_name = chosen_vr_fq_name[-1]
self.logger.log_info("VRouter %s updated with VM %s" %
(':'.join(chosen_vr_fq_name), vm.name))
vm.update()
else:
vr = VirtualRouterSM.get(vm.virtual_router)
vrouter_name = vr.name
Expand Down Expand Up @@ -555,12 +556,8 @@ def delete_service(self, vm):
self.cleanup_svc_vm_ports(vmi_list)

if vm.virtual_router:
vm_obj = VirtualMachine()
vm_obj.uuid = vm.uuid
vm_obj.fq_name = vm.fq_name
vr_obj = self._vnc_lib.virtual_router_read(id=vm.virtual_router)
vr_obj.del_virtual_machine(vm_obj)
self._vnc_lib.virtual_router_update(vr_obj)
self._vnc_lib.ref_update('virtual-router', vm.virtual_router,
'virtual-machine', vm.uuid, None, 'DELETE')
self.logger.log_info("vm %s deleted from vr %s" %
(vm.fq_name, vm.virtual_router))

Expand All @@ -581,16 +578,10 @@ def check_service(self, si):
vr = VirtualRouterSM.get(vm.virtual_router)
if self.vrouter_scheduler.vrouter_running(vr.name):
continue
vr_obj = VirtualRouter()
vr_obj.uuid = vr.uuid
vr_obj.fq_name = vr.fq_name
vm_obj = VirtualMachine()
vm_obj.uuid = vm.uuid
vm_obj.fq_name = vm.fq_name
vr_obj.del_virtual_machine(vm_obj)
self._vnc_lib.virtual_router_update(vr_obj)
self._update_local_preference(si, vm)
self.logger.log_error("vrouter down for vm %s" % vm.uuid)
self._vnc_lib.ref_update('virtual-router', vr.uuid,
'virtual-machine', vm.uuid, None, 'DELETE')
vr.update()
self.logger.log_error("vrouter %s down for vm %s" % (vr.name, vm.uuid))
service_up = False

return service_up
Expand Down
36 changes: 31 additions & 5 deletions src/config/svc-monitor/svc_monitor/svc_monitor.py
Expand Up @@ -841,6 +841,24 @@ def reset():
cls.reset()


def skip_check_service(si):
    """Tell the periodic timer whether to skip the health check for ``si``.

    Reads ``si.launch_count`` and reads/updates ``si.back_off``:

    * ``launch_count`` falsy: the instance has not been launched yet, skip.
    * ``back_off`` > 0: a back-off is in progress; consume one tick, skip.
    * ``back_off`` == 0: the back-off just ran out; reset it to -1 and
      let the check run.
    * otherwise (``back_off`` negative): if ``launch_count`` is a multiple
      of 10, arm a 10-tick back-off and skip; else let the check run.

    Returns True when the check must be skipped for this tick, False when
    it should be performed.
    """
    launches = si.launch_count
    if not launches:
        # nothing has been launched yet, so there is nothing to check
        return True

    remaining = si.back_off
    if remaining > 0:
        # still backing off: burn one tick
        si.back_off = remaining - 1
        return True
    if remaining == 0:
        # back-off finished: disarm it and allow the check
        si.back_off = -1
        return False

    # no back-off armed; start one on every 10th launch
    if launches % 10 == 0:
        si.back_off = 10
        return True
    return False

def timer_callback(monitor):
# delete vms without si
vm_delete_list = []
Expand All @@ -852,10 +870,9 @@ def timer_callback(monitor):
monitor._delete_service_instance(vm)

# check status of service
si_id_list = list(ServiceInstanceSM._dict.keys())
for si_id in si_id_list:
si = ServiceInstanceSM.get(si_id)
if not si or not si.launch_count:
si_list = list(ServiceInstanceSM.values())
for si in si_list:
if skip_check_service(si):
continue
if not monitor._check_service_running(si):
monitor._relaunch_service_instance(si)
Expand All @@ -878,8 +895,15 @@ def timer_callback(monitor):
monitor._delete_shared_vn(vn.uuid)

def launch_timer(monitor):
if not monitor._args.check_service_interval.isdigit():
monitor.logger.log_emergency("set seconds for check_service_interval "
"in contrail-svc-monitor.conf. example: check_service_interval=60")
sys.exit()
monitor.logger.log_notice("check_service_interval set to %s seconds" %
monitor._args.check_service_interval)

while True:
gevent.sleep(svc_info.get_vm_health_interval())
gevent.sleep(int(monitor._args.check_service_interval))
try:
timer_callback(monitor)
except Exception:
Expand Down Expand Up @@ -918,6 +942,7 @@ def parse_args(args_str):
--use_syslog
--syslog_facility LOG_USER
--cluster_id <testbed-name>
--check_service_interval 60
[--region_name <name>]
[--reset_config]
'''
Expand Down Expand Up @@ -960,6 +985,7 @@ def parse_args(args_str):
'syslog_facility': Sandesh._DEFAULT_SYSLOG_FACILITY,
'region_name': None,
'cluster_id': '',
'check_service_interval': '60',
}
secopts = {
'use_certs': False,
Expand Down

0 comments on commit 03272b6

Please sign in to comment.