diff --git a/src/config/common/svc_info.py b/src/config/common/svc_info.py index 9f9eba9b1eb..c13ca08c415 100644 --- a/src/config/common/svc_info.py +++ b/src/config/common/svc_info.py @@ -16,7 +16,7 @@ _VN_SNAT_PREFIX_NAME = 'snat-si-left' _VN_SNAT_SUBNET_CIDR = '100.64.0.0/29' -_CHECK_SVC_VM_HEALTH_INTERVAL = 30 +_CHECK_SVC_VM_HEALTH_INTERVAL = 60 _VM_INSTANCE_TYPE = 'virtual-machine' _NETNS_INSTANCE_TYPE = 'network-namespace' diff --git a/src/config/svc-monitor/svc_monitor/config_db.py b/src/config/svc-monitor/svc_monitor/config_db.py index 3520a1323e8..212ee179b16 100644 --- a/src/config/svc-monitor/svc_monitor/config_db.py +++ b/src/config/svc-monitor/svc_monitor/config_db.py @@ -329,6 +329,7 @@ def __init__(self, uuid, obj_dict=None): self.params = None self.state = 'init' self.launch_count = 0 + self.back_off = -1 self.image = None self.flavor = None self.max_instances = 0 diff --git a/src/config/svc-monitor/svc_monitor/instance_manager.py b/src/config/svc-monitor/svc_monitor/instance_manager.py index 95b018f700a..7ca7c362f42 100644 --- a/src/config/svc-monitor/svc_monitor/instance_manager.py +++ b/src/config/svc-monitor/svc_monitor/instance_manager.py @@ -515,12 +515,8 @@ def delete_service(self, vm): self.cleanup_svc_vm_ports(vmi_list) if vm.virtual_router: - vm_obj = VirtualMachine() - vm_obj.uuid = vm.uuid - vm_obj.fq_name = vm.fq_name - vr_obj = self._vnc_lib.virtual_router_read(id=vm.virtual_router) - vr_obj.del_virtual_machine(vm_obj) - self._vnc_lib.virtual_router_update(vr_obj) + self._vnc_lib.ref_update('virtual-router', vm.virtual_router, + 'virtual-machine', vm.uuid, None, 'DELETE') self.logger.log_info("vm %s deleted from vr %s" % (vm_obj.get_fq_name_str(), vr_obj.get_fq_name_str())) @@ -541,16 +537,10 @@ def check_service(self, si): vr = VirtualRouterSM.get(vm.virtual_router) if self.vrouter_scheduler.vrouter_running(vr.name): continue - vr_obj = VirtualRouter() - vr_obj.uuid = vr.uuid - vr_obj.fq_name = vr.fq_name - vm_obj = VirtualMachine() - vm_obj.uuid = vm.uuid - vm_obj.fq_name = vm.fq_name - vr_obj.del_virtual_machine(vm_obj) - self._vnc_lib.virtual_router_update(vr_obj) - self._update_local_preference(si, vm) - self.logger.log_error("vrouter down for vm %s" % vm.uuid) + self._vnc_lib.ref_update('virtual-router', vr.uuid, + 'virtual-machine', vm.uuid, None, 'DELETE') + vr.update() + self.logger.log_error("vrouter %s down for vm %s" % (vr.name, vm.uuid)) service_up = False return service_up diff --git a/src/config/svc-monitor/svc_monitor/svc_monitor.py b/src/config/svc-monitor/svc_monitor/svc_monitor.py index dd8b83f2a54..753dcc2d6ac 100644 --- a/src/config/svc-monitor/svc_monitor/svc_monitor.py +++ b/src/config/svc-monitor/svc_monitor/svc_monitor.py @@ -626,6 +626,24 @@ def reset(): cls.reset() +def skip_check_service(si): + # wait for first launch + if not si.launch_count: + return True + # back off going on + if si.back_off > 0: + si.back_off -= 1 + return True + # back off done + if si.back_off == 0: + si.back_off = -1 + return False + # set back off + if not si.launch_count % 10: + si.back_off = 10 + return True + return False + def timer_callback(monitor): # delete vms without si vm_delete_list = [] @@ -646,10 +664,9 @@ def timer_callback(monitor): monitor.vm_manager.cleanup_svc_vm_ports(vmi_delete_list) # check status of service - si_id_list = list(ServiceInstanceSM._dict.keys()) - for si_id in si_id_list: - si = ServiceInstanceSM.get(si_id) - if not si or not si.launch_count: + si_list = list(ServiceInstanceSM.values()) + for si in si_list: + if skip_check_service(si): continue if not monitor._check_service_running(si): monitor._relaunch_service_instance(si) @@ -672,8 +689,15 @@ def timer_callback(monitor): monitor._delete_shared_vn(vn.uuid) def launch_timer(monitor): + if not monitor._args.check_service_interval.isdigit(): + monitor.logger.log_emergency("set seconds for check_service_interval " + "in contrail-svc-monitor.conf. example: check_service_interval=60") + sys.exit() + monitor.logger.log_notice("check_service_interval set to %s seconds" % + monitor._args.check_service_interval) + while True: - gevent.sleep(svc_info.get_vm_health_interval()) + gevent.sleep(int(monitor._args.check_service_interval)) try: timer_callback(monitor) except Exception: @@ -712,6 +736,7 @@ def parse_args(args_str): --use_syslog --syslog_facility LOG_USER --cluster_id + --check_service_interval 60 [--region_name ] [--reset_config] ''' @@ -757,6 +782,7 @@ def parse_args(args_str): 'logging_conf': '', 'logger_class': None, 'sandesh_send_rate_limit' : SandeshSystem.get_sandesh_send_rate_limit(), + 'check_service_interval': '60', } secopts = { 'use_certs': False,