From 9bea272c121d25235bc3939b9a53d3f0dbc4bcaf Mon Sep 17 00:00:00 2001 From: Hampapur Ajay Date: Fri, 2 Oct 2015 13:20:15 -0700 Subject: [PATCH] Snat health check generates unnecessary api server updates On vrouter down, detach info is not sent correctly to the api server. This causes a bunch of messages/updates to the API server on a regular interval. The fix now correctly updates the virtual-machine and vrouter references so that a new vrouter can be attached to the vm. In addition, made the service check interval configurable in contrail-svc-monitor.conf check_service_interval=<seconds> Change-Id: I352ec821128e801b24cd227ce35824906140fcab Closes-Bug: #1502300 --- src/config/common/svc_info.py | 2 +- .../svc-monitor/svc_monitor/config_db.py | 1 + .../svc-monitor/svc_monitor/instance_manager.py | 22 ++++-------- .../svc-monitor/svc_monitor/svc_monitor.py | 36 ++++++++++++++++--- 4 files changed, 39 insertions(+), 22 deletions(-) diff --git a/src/config/common/svc_info.py b/src/config/common/svc_info.py index 9f9eba9b1eb..c13ca08c415 100644 --- a/src/config/common/svc_info.py +++ b/src/config/common/svc_info.py @@ -16,7 +16,7 @@ _VN_SNAT_PREFIX_NAME = 'snat-si-left' _VN_SNAT_SUBNET_CIDR = '100.64.0.0/29' -_CHECK_SVC_VM_HEALTH_INTERVAL = 30 +_CHECK_SVC_VM_HEALTH_INTERVAL = 60 _VM_INSTANCE_TYPE = 'virtual-machine' _NETNS_INSTANCE_TYPE = 'network-namespace' diff --git a/src/config/svc-monitor/svc_monitor/config_db.py b/src/config/svc-monitor/svc_monitor/config_db.py index 0724465f3f7..efc539455b3 100644 --- a/src/config/svc-monitor/svc_monitor/config_db.py +++ b/src/config/svc-monitor/svc_monitor/config_db.py @@ -318,6 +318,7 @@ def __init__(self, uuid, obj_dict=None): self.params = None self.state = 'init' self.launch_count = 0 + self.back_off = -1 self.image = None self.flavor = None self.max_instances = 0 diff --git a/src/config/svc-monitor/svc_monitor/instance_manager.py b/src/config/svc-monitor/svc_monitor/instance_manager.py index 60707307b8a..cbe709be5fd 100644 ---
a/src/config/svc-monitor/svc_monitor/instance_manager.py +++ b/src/config/svc-monitor/svc_monitor/instance_manager.py @@ -507,12 +507,8 @@ def delete_service(self, vm): self.cleanup_svc_vm_ports(vmi_list) if vm.virtual_router: - vm_obj = VirtualMachine() - vm_obj.uuid = vm.uuid - vm_obj.fq_name = vm.fq_name - vr_obj = self._vnc_lib.virtual_router_read(id=vm.virtual_router) - vr_obj.del_virtual_machine(vm_obj) - self._vnc_lib.virtual_router_update(vr_obj) + self._vnc_lib.ref_update('virtual-router', vm.virtual_router, + 'virtual-machine', vm.uuid, None, 'DELETE') self.logger.log_info("vm %s deleted from vr %s" % (vm_obj.get_fq_name_str(), vr_obj.get_fq_name_str())) @@ -532,16 +528,10 @@ def check_service(self, si): vr = VirtualRouterSM.get(vm.virtual_router) if self.vrouter_scheduler.vrouter_running(vr.name): continue - vr_obj = VirtualRouter() - vr_obj.uuid = vr.uuid - vr_obj.fq_name = vr.fq_name - vm_obj = VirtualMachine() - vm_obj.uuid = vm.uuid - vm_obj.fq_name = vm.fq_name - vr_obj.del_virtual_machine(vm_obj) - self._vnc_lib.virtual_router_update(vr_obj) - self._update_local_preference(si, vm) - self.logger.log_error("vrouter down for vm %s" % vm.uuid) + self._vnc_lib.ref_update('virtual-router', vr.uuid, + 'virtual-machine', vm.uuid, None, 'DELETE') + vr.update() + self.logger.log_error("vrouter %s down for vm %s" % (vr.name, vm.uuid)) service_up = False return service_up diff --git a/src/config/svc-monitor/svc_monitor/svc_monitor.py b/src/config/svc-monitor/svc_monitor/svc_monitor.py index e0d5e9397c1..f1c4a04c50a 100644 --- a/src/config/svc-monitor/svc_monitor/svc_monitor.py +++ b/src/config/svc-monitor/svc_monitor/svc_monitor.py @@ -838,6 +838,24 @@ def reset(): cls.reset() +def skip_check_service(si): + # wait for first launch + if not si.launch_count: + return True + # back off going on + if si.back_off > 0: + si.back_off -= 1 + return True + # back off done + if si.back_off == 0: + si.back_off = -1 + return False + # set back off + if not si.launch_count 
% 10: + si.back_off = 10 + return True + return False + def timer_callback(monitor): # delete vms without si vm_delete_list = [] @@ -849,10 +867,9 @@ def timer_callback(monitor): monitor._delete_service_instance(vm) # check status of service - si_id_list = list(ServiceInstanceSM._dict.keys()) - for si_id in si_id_list: - si = ServiceInstanceSM.get(si_id) - if not si or not si.launch_count: + si_list = list(ServiceInstanceSM.values()) + for si in si_list: + if skip_check_service(si): continue if not monitor._check_service_running(si): monitor._relaunch_service_instance(si) @@ -875,8 +892,15 @@ def timer_callback(monitor): monitor._delete_shared_vn(vn.uuid) def launch_timer(monitor): + if not monitor._args.check_service_interval.isdigit(): + monitor.logger.log_emergency("set seconds for check_service_interval " + "in contrail-svc-monitor.conf. example: check_service_interval=60") + sys.exit() + monitor.logger.log_notice("check_service_interval set to %s seconds" % + monitor._args.check_service_interval) + while True: - gevent.sleep(svc_info.get_vm_health_interval()) + gevent.sleep(int(monitor._args.check_service_interval)) try: timer_callback(monitor) except Exception: @@ -914,6 +938,7 @@ def parse_args(args_str): --use_syslog --syslog_facility LOG_USER --cluster_id + --check_service_interval 60 [--region_name ] [--reset_config] ''' @@ -955,6 +980,7 @@ def parse_args(args_str): 'syslog_facility': Sandesh._DEFAULT_SYSLOG_FACILITY, 'region_name': None, 'cluster_id': '', + 'check_service_interval': '60', } secopts = { 'use_certs': False,