From 06bdd00f3f46177e49fc1f7bea9b5f1d1e7c9263 Mon Sep 17 00:00:00 2001 From: Ananth Suryanarayana Date: Mon, 13 Oct 2014 13:24:20 -0700 Subject: [PATCH] commit cb097ef4c40af67ebe726d5da5c25d5bccb5658b Author: Raja Sivaramakrishnan Date: Mon Oct 6 15:47:48 2014 -0700 Ensure "scons vrouter" builds uvrouter and utils too (to avoid build breaks) Fixes bug 1374188. commit 9c97927bff3f50f462570a83b7d45a570c58c9be Author: Divakar Date: Sun Oct 5 02:28:37 2014 -0700 Bug:#1377581. Formatting Bridge table display commit ff17558202d5c4b288b0443756025224e656c584 Author: Divakar Date: Mon Sep 29 09:37:07 2014 -0700 Fix for https://bugs.launchpad.net/juniperopenstack/+bug/1372262 a separate port list of link local ser commit e457cab426f370476bc24c14e63d49ae30d1cbd3 Author: Wojciech Zmuda Date: Mon Sep 29 16:30:11 2014 +0200 Fix "symbol vrouter_dbg undefined" and "symbol vr_inet_vrf_stats undefined" errors. Errors occurred while loading vrouter.ko module on FreeBSD. commit 78306f449e0de38f572d50044c8b98d49cda9a0b Author: Wojciech Zmuda Date: Wed Sep 24 10:49:11 2014 +0200 Fix coding-style and formatting of latest commit. Author: Wojciech Zmuda commit f6f8f7930918808d3be33d6df9033ea2b73ab0da Author: Ganesan Prabhakaran Date: Thu Sep 25 16:21:10 2014 -0700 Fix build issue in mainline commit 23a2aac703cc2b781ee94e15fe740dead85b72a9 Author: ashoksingh Date: Thu Sep 25 17:32:56 2014 +0530 IPv6 changes (on behalf of Prabhakaran) commit c6bab0328707c3fe4d9ce8d18a73e9e40c1a0bc5 Author: anandhk-juniper Date: Thu Sep 25 00:49:21 2014 +0530 Fix compilation failure for ubuntu 3.13.0-36-generic (and above) From 3.13.0-36, ubuntu did while keeping the kernel version the same as 3.13.0-32. 
The only option that was left then was to check for definition of skb_get_rxhash Closes Bug:1373574 Change-Id: I55ed2fa036d4a0a68de9386ff7c9cf5d91bb1ef5 --- Makefile | 2 +- SConscript | 3 + dp-core/vr_datapath.c | 49 ++- dp-core/vr_flow.c | 304 +++++++++++++- dp-core/{vr_ip4_mtrie.c => vr_ip_mtrie.c} | 457 +++++++++++++--------- dp-core/vr_mcast.c | 65 ++- dp-core/vr_nexthop.c | 42 +- dp-core/vr_proto_ip.c | 195 ++++++++- dp-core/vr_route.c | 145 ++++++- freebsd/Makefile | 2 +- freebsd/vrouter_mod.c | 1 + host/vrouter_host_mod.c | 2 +- include/genetlink.h | 20 +- include/netlink.h | 64 +-- include/vr_compat.h | 13 +- include/vr_flow.h | 10 + include/vr_ip4_mtrie.h | 101 ----- include/vr_ip_mtrie.h | 78 ++++ include/vr_linux.h | 2 +- include/vr_os.h | 2 +- include/vr_packet.h | 74 +++- include/vr_route.h | 3 + include/vrouter.h | 5 + linux/vhost_dev.c | 2 + linux/vr_host_interface.c | 18 +- linux/vrouter_mod.c | 69 ++-- sandesh/vr.sandesh | 6 +- utils/flow.c | 3 + utils/rt.c | 171 ++++++-- 29 files changed, 1410 insertions(+), 498 deletions(-) rename dp-core/{vr_ip4_mtrie.c => vr_ip_mtrie.c} (66%) delete mode 100644 include/vr_ip4_mtrie.h create mode 100644 include/vr_ip_mtrie.h diff --git a/Makefile b/Makefile index f1bd30e5c..f1a9c5866 100644 --- a/Makefile +++ b/Makefile @@ -61,7 +61,7 @@ ifneq ($(KERNELRELEASE), ) vrouter-y += dp-core/vr_nexthop.o dp-core/vr_vif_bridge.o vrouter-y += dp-core/vr_datapath.o dp-core/vr_interface.o vrouter-y += dp-core/vr_packet.o dp-core/vr_proto_ip.o - vrouter-y += dp-core/vr_mpls.o dp-core/vr_ip4_mtrie.o + vrouter-y += dp-core/vr_mpls.o dp-core/vr_ip_mtrie.o vrouter-y += dp-core/vr_response.o dp-core/vr_flow.o vrouter-y += dp-core/vr_mirror.o dp-core/vr_vrf_assign.o vrouter-y += dp-core/vr_index_table.o dp-core/vr_mcast.o diff --git a/SConscript b/SConscript index c13dd388a..7ac2ed639 100644 --- a/SConscript +++ b/SConscript @@ -62,6 +62,9 @@ if sys.platform != 'darwin': make_cmd += ' SANDESH_HEADER_PATH=' + Dir(env['TOP'] + 
'/vrouter/').abspath make_cmd += ' SANDESH_SRC_ROOT=' + '../build/kbuild/' make_cmd += ' SANDESH_EXTRA_HEADER_PATH=' + Dir('#tools/').abspath + if 'vrouter' in COMMAND_LINE_TARGETS: + BUILD_TARGETS.append('vrouter/uvrouter') + BUILD_TARGETS.append('vrouter/utils') kern = env.Command('vrouter.ko', makefile, make_cmd, chdir=dp_dir) env.Default(kern) diff --git a/dp-core/vr_datapath.c b/dp-core/vr_datapath.c index 5943f0130..022366acb 100644 --- a/dp-core/vr_datapath.c +++ b/dp-core/vr_datapath.c @@ -37,6 +37,7 @@ vr_arp_request_treatment(struct vr_interface *vif, struct vr_arp *arp, { struct vr_route_req rt; struct vr_nexthop *nh; + uint32_t rt_prefix; /* * Packet from VM : @@ -87,10 +88,13 @@ vr_arp_request_treatment(struct vr_interface *vif, struct vr_arp *arp, } rt.rtr_req.rtr_vrf_id = vif->vif_vrf; - rt.rtr_req.rtr_prefix = ntohl(arp->arp_dpa); + rt.rtr_req.rtr_prefix = (uint8_t*)&rt_prefix; + *(uint32_t*)rt.rtr_req.rtr_prefix = (arp->arp_dpa); + rt.rtr_req.rtr_prefix_size = 4; rt.rtr_req.rtr_prefix_len = 32; rt.rtr_req.rtr_nh_id = 0; rt.rtr_req.rtr_label_flags = 0; + rt.rtr_req.rtr_src_size = rt.rtr_req.rtr_marker_size = 0; nh = vr_inet_route_lookup(vif->vif_vrf, &rt, NULL); if (!nh || nh->nh_type == NH_DISCARD) @@ -251,6 +255,8 @@ vr_pkt_type(struct vr_packet *pkt) pkt_set_network_header(pkt, pkt->vp_data + pull_len); if (eth_proto == VR_ETH_PROTO_IP) pkt->vp_type = VP_TYPE_IP; + else if (eth_proto == VR_ETH_PROTO_IP6) + pkt->vp_type = VP_TYPE_IP6; else if (eth_proto == VR_ETH_PROTO_ARP) pkt->vp_type = VP_TYPE_ARP; else @@ -408,6 +414,17 @@ vr_l3_input(unsigned short vrf, struct vr_packet *pkt, } vr_flow_inet_input(vif->vif_router, vrf, pkt, VR_ETH_PROTO_IP, fmd); return 1; + } else if (pkt->vp_type == VP_TYPE_IP6) { + pkt_set_inner_network_header(pkt, pkt->vp_data); + if (vr_from_vm_mss_adj && vr_pkt_from_vm_tcp_mss_adj && + (vif->vif_type == VIF_TYPE_VIRTUAL)) { + if ((reason = vr_pkt_from_vm_tcp_mss_adj(pkt, VROUTER_OVERLAY_LEN))) { + vr_pfree(pkt, reason); 
+ return 1; + } + } + vr_flow_inet6_input(vif->vif_router, vrf, pkt, VR_ETH_PROTO_IP6, fmd); + return 1; } else if (pkt->vp_type == VP_TYPE_ARP) { return vr_arp_input(vrf, pkt, fmd); } @@ -418,7 +435,7 @@ unsigned int vr_l2_input(unsigned short vrf, struct vr_packet *pkt, struct vr_forwarding_md *fmd) { - int pull_len; + int pull_len = 0; int reason; struct vr_interface *vif = pkt->vp_if; @@ -454,13 +471,13 @@ vr_l2_input(unsigned short vrf, struct vr_packet *pkt, return 1; } } - /* Restore back the L2 headers */ - if (!pkt_push(pkt, pull_len)) { - vr_pfree(pkt, VP_DROP_PULL); - return 1; - } } - + + /* Restore back the L2 headers */ + if (!pkt_push(pkt, pull_len)) { + vr_pfree(pkt, VP_DROP_PULL); + return 1; + } /* Mark the packet as L2 */ pkt->vp_type = VP_TYPE_L2; @@ -473,19 +490,27 @@ vr_l3_well_known_packet(unsigned short vrf, struct vr_packet *pkt) { unsigned char *data = pkt_data(pkt); struct vr_ip *iph; + struct vr_ip6 *ip6; struct vr_udp *udph; unsigned char *l3_hdr; l3_hdr = pkt_network_header(pkt); if (pkt->vp_if->vif_type == VIF_TYPE_VIRTUAL && IS_MAC_BMCAST(data)) { iph = (struct vr_ip *)l3_hdr; - if ((iph->ip_proto == VR_IP_PROTO_UDP) && + if (!vr_ip_is_ip6(iph)) { + if ((iph->ip_proto == VR_IP_PROTO_UDP) && vr_ip_transport_header_valid(iph)) { - udph = (struct vr_udp *)(l3_hdr + iph->ip_hl * 4); - if (udph->udp_sport == htons(68)) { + udph = (struct vr_udp *)(l3_hdr + iph->ip_hl * 4); + if (udph->udp_sport == htons(68)) { + return true; + } + } + } else { //IPv6 + ip6 = (struct vr_ip6 *)l3_hdr; + // 0xFF02 is the multicast address used for NDP, DHCPv6 etc + if (ip6->ip6_dst[0] == 0xFF && ip6->ip6_dst[1] == 0x02) { return true; } - } } diff --git a/dp-core/vr_flow.c b/dp-core/vr_flow.c index 3ef10e257..c26bd0ce1 100644 --- a/dp-core/vr_flow.c +++ b/dp-core/vr_flow.c @@ -14,6 +14,7 @@ #include "vr_fragment.h" #include "vr_datapath.h" #include "vr_hash.h" +#include "vr_ip_mtrie.h" #define VR_NUM_FLOW_TABLES 1 #define VR_DEF_FLOW_ENTRIES (512 * 1024) @@ 
-47,6 +48,7 @@ extern int vr_ip_input(struct vrouter *, unsigned short, struct vr_packet *, struct vr_forwarding_md *); extern void vr_ip_update_csum(struct vr_packet *, unsigned int, unsigned int); +extern uint16_t vr_icmp6_checksum(void * buffer, int bytes); static void vr_flush_entry(struct vrouter *, struct vr_flow_entry *, struct vr_flow_md *, struct vr_forwarding_md *); @@ -77,6 +79,87 @@ jhash(void *key, uint32_t length, uint32_t interval) } #endif + +bool +vr_valid_link_local_port(struct vrouter *router, int family, + int proto, int port) +{ + unsigned char data; + unsigned int tmp; + + if (!router->vr_link_local_ports) + return false; + + if ((family != AF_INET) || + ((proto != VR_IP_PROTO_TCP) && (proto != VR_IP_PROTO_UDP))) + return false; + + if ((port < VR_DYNAMIC_PORT_START) || (port > VR_DYNAMIC_PORT_END)) + return false; + + tmp = port - VR_DYNAMIC_PORT_START; + if (proto == VR_IP_PROTO_UDP) + tmp += (router->vr_link_local_ports_size * 8 / 2); + + data = router->vr_link_local_ports[(tmp /8)]; + if (data & (1 << (tmp % 8))) + return true; + + return false; +} + +static void +vr_clear_link_local_port(struct vrouter *router, int family, + int proto, int port) +{ + + unsigned char *data; + unsigned int tmp; + + if (!router->vr_link_local_ports) + return; + + if ((family != AF_INET) || + ((proto != VR_IP_PROTO_TCP) && (proto != VR_IP_PROTO_UDP))) + return; + + if ((port < VR_DYNAMIC_PORT_START) || (port > VR_DYNAMIC_PORT_END)) + return; + + tmp = port - VR_DYNAMIC_PORT_START; + if (proto == VR_IP_PROTO_UDP) + tmp += (router->vr_link_local_ports_size * 8 / 2); + + data = &router->vr_link_local_ports[(tmp /8)]; + *data &= (~(1 << (tmp % 8))); +} + +static void +vr_set_link_local_port(struct vrouter *router, int family, + int proto, int port) +{ + + unsigned char *data; + unsigned int tmp; + + if (!router->vr_link_local_ports) + return; + + if ((family != AF_INET) || + ((proto != VR_IP_PROTO_TCP) && (proto != VR_IP_PROTO_UDP))) + return; + + if ((port < 
VR_DYNAMIC_PORT_START) || (port > VR_DYNAMIC_PORT_END)) + return; + + tmp = port - VR_DYNAMIC_PORT_START; + if (proto == VR_IP_PROTO_UDP) + tmp += (router->vr_link_local_ports_size * 8 / 2); + + data = &router->vr_link_local_ports[tmp /8]; + *data |= (1 << (tmp % 8)); +} + static void vr_flow_reset_mirror(struct vrouter *router, struct vr_flow_entry *fe, unsigned int index) @@ -224,6 +307,7 @@ vr_get_flow_key(struct vr_flow_key *key, uint16_t vlan, struct vr_packet *pkt, case VR_IP_PROTO_TCP: case VR_IP_PROTO_UDP: case VR_IP_PROTO_ICMP: + case VR_IP_PROTO_ICMP6: key->key_src_port = sport; key->key_dst_port = dport; break; @@ -400,14 +484,14 @@ vr_enqueue_flow(struct vr_flow_entry *fe, struct vr_packet *pkt, return 0; } -static int +int vr_flow_forward(unsigned short vrf, struct vr_packet *pkt, unsigned short proto, struct vr_forwarding_md *fmd) { struct vr_interface *vif = pkt->vp_if; struct vrouter *router = vif->vif_router; - if (proto != VR_ETH_PROTO_IP) { + if ((proto != VR_ETH_PROTO_IP) && (proto != VR_ETH_PROTO_IP6)) { vr_pfree(pkt, VP_DROP_FLOW_INVALID_PROTOCOL); return 0; } @@ -434,12 +518,18 @@ vr_flow_nat(unsigned short vrf, struct vr_flow_entry *fe, struct vr_packet *pkt, if (fe->fe_rflow < 0) goto drop; + ip = (struct vr_ip *)pkt_data(pkt); + + if (vr_ip_is_ip6(ip)) { + /* No NAT support for IPv6 yet */ + vr_pfree(pkt, VP_DROP_FLOW_ACTION_INVALID); + return 0; + } + rfe = vr_get_flow_entry(router, fe->fe_rflow); if (!rfe) goto drop; - ip = (struct vr_ip *)pkt_data(pkt); - if (ip->ip_proto == VR_IP_PROTO_ICMP) { icmph = (struct vr_icmp *)((unsigned char *)ip + (ip->ip_hl * 4)); if (vr_icmp_error(icmph)) { @@ -823,6 +913,151 @@ vr_flow_parse(struct vrouter *router, struct vr_flow_key *key, return res; } +extern struct vr_nexthop *(*vr_inet_route_lookup)(unsigned int, + struct vr_route_req *, struct vr_packet *); +unsigned int +vr_flow_inet6_input(struct vrouter *router, unsigned short vrf, + struct vr_packet *pkt, unsigned short proto, + struct 
vr_forwarding_md *fmd) +{ + struct vr_ip6 *ip6; + struct vr_eth *eth; + unsigned int trap_res = 0; + unsigned short *t_hdr, sport, dport, eth_off; + struct vr_icmp *icmph; + unsigned char *icmp_opt_ptr; + int proxy = 0; + struct vr_route_req rt; + struct vr_nexthop *nh; + struct vr_interface *vif = pkt->vp_if; + uint32_t rt_prefix[4]; + + pkt->vp_type = VP_TYPE_IP6; + ip6 = (struct vr_ip6 *)pkt_network_header(pkt); + /* TODO: Handle options headers */ + t_hdr = (unsigned short *)((char *)ip6 + sizeof(struct vr_ip6)); + switch (ip6->ip6_nxt) { + case VR_IP_PROTO_ICMP6: + /* First word on ICMP and ICMPv6 are same */ + icmph = (struct vr_icmp *)t_hdr; + switch (icmph->icmp_type) { + case VR_ICMP6_TYPE_ECHO_REQ: + case VR_ICMP6_TYPE_ECHO_REPLY: + /* ICMPv6 Echo format is same as ICMP */ + sport = icmph->icmp_eid; + dport = VR_ICMP6_TYPE_ECHO_REPLY; + break; + case VR_ICMP6_TYPE_NEIGH_SOL: //Neighbor Solicit, respond with VRRP MAC + + /* For L2-only networks, bridge the packets */ + if (vif_is_virtual(vif) && !(vif->vif_flags & VIF_FLAG_L3_ENABLED)) { + return 0; + } + + rt.rtr_req.rtr_vrf_id = vrf; + rt.rtr_req.rtr_family = AF_INET6; + rt.rtr_req.rtr_prefix = (uint8_t*)&rt_prefix; + memcpy(rt.rtr_req.rtr_prefix, &icmph->icmp_data, 16); + rt.rtr_req.rtr_prefix_size = 16; + rt.rtr_req.rtr_prefix_len = IP6_PREFIX_LEN; + rt.rtr_req.rtr_nh_id = 0; + rt.rtr_req.rtr_label_flags = 0; + rt.rtr_req.rtr_src_size = rt.rtr_req.rtr_marker_size = 0; + + nh = vr_inet_route_lookup(vrf, &rt, NULL); + if (!nh || nh->nh_type == NH_DISCARD) { + vr_pfree(pkt, VP_DROP_ARP_NOT_ME); + return 1; + } + + if (rt.rtr_req.rtr_label_flags & VR_RT_HOSTED_FLAG) { + proxy = 1; + } + + /* + * If an L3VPN route is learnt, we need to proxy + */ + if (nh->nh_type == NH_TUNNEL) { + proxy = 1; + } + + /* + * If not l3 vpn route, we default to flooding + */ + if ((nh->nh_type == NH_COMPOSITE) && + ((nh->nh_flags & NH_FLAG_COMPOSITE_EVPN) || + (nh->nh_flags & NH_FLAG_COMPOSITE_L2))) { + nh_output(vrf, pkt, nh, 
fmd); + return 1; + } else if (!proxy) { + vr_pfree(pkt, VP_DROP_ARP_NOT_ME); + return 1; + } + + /* + * Update IPv6 header + * Copy the IP6 src to IP6 dst + * Copy the target IP n ICMPv6 header as src IP of packet + * Do IP lookup to confirm if we can respond with Neighbor advertisement + */ + memcpy(ip6->ip6_dst, ip6->ip6_src, 16); + memcpy(ip6->ip6_src, &icmph->icmp_data, 16); + ip6->ip6_src[15] = 0xFF; // Mimic a different src IP + + /* Update ICMP header and options */ + icmph->icmp_type = VR_ICMP6_TYPE_NEIGH_AD; + icmp_opt_ptr = ((char*)&icmph->icmp_data[0]) + 16; + *icmp_opt_ptr = 0x02; //Target-link-layer-address + memcpy(icmp_opt_ptr+2, vif->vif_mac, VR_ETHER_ALEN); + + /* + * Update icmp6 checksum + * ICMPv6 option size is 24 bytes (IPv6 + MAC + 2 bytes) + */ + icmph->icmp_csum = + ~(vr_icmp6_checksum(ip6, sizeof(struct vr_ip6) + + sizeof(struct vr_icmp) + 24)); + + /* Update Ethernet headr */ + eth = (struct vr_eth*) ((char*)ip6 - sizeof(struct vr_eth)); + memcpy(eth->eth_dmac, eth->eth_smac, VR_ETHER_ALEN); + memcpy(eth->eth_smac, vif->vif_mac, VR_ETHER_ALEN); + + eth_off = pkt_get_network_header_off(pkt) - sizeof(struct vr_eth); + + pkt_set_data(pkt,eth_off); + + /* Respond back directly*/ + vif->vif_tx(vif, pkt); + + return 1; + case VR_ICMP6_TYPE_ROUTER_SOL: //Router solicit, trap to agent + return vr_trap(pkt, vrf, AGENT_TRAP_L3_PROTOCOLS, NULL); + default: + sport = 0; + dport = icmph->icmp_type; + } + break; + case VR_IP_PROTO_UDP: + sport = *t_hdr; + dport = *(t_hdr + 1); + + if (vif_is_virtual(pkt->vp_if)) { + if ((sport == VR_DHCP6_SPORT) && (dport == VR_DHCP6_DPORT)) { + trap_res = AGENT_TRAP_L3_PROTOCOLS; + return vr_trap(pkt, vrf, trap_res, NULL); + } + } + break; + case VR_IP_PROTO_TCP: + sport = *t_hdr; + dport = *(t_hdr + 1); + default: + break; + } + + return vr_flow_forward(vrf, pkt, proto, fmd); +} unsigned int vr_flow_inet_input(struct vrouter *router, unsigned short vrf, @@ -1183,6 +1418,10 @@ static int vr_flow_delete(struct vrouter 
*router, vr_flow_req *req, struct vr_flow_entry *fe) { + if (fe->fe_flags & VR_FLOW_FLAG_LINK_LOCAL) + vr_clear_link_local_port(router, AF_INET, fe->fe_key.key_proto, + ntohs(fe->fe_key.key_dst_port)); + fe->fe_action = VR_FLOW_ACTION_DROP; vr_flow_reset_mirror(router, fe, req->fr_index); @@ -1279,6 +1518,16 @@ vr_flow_set(struct vrouter *router, vr_flow_req *req) if (req->fr_flags & VR_FLOW_FLAG_VRFT) fe->fe_dvrf = req->fr_flow_dvrf; + if (req->fr_flags & VR_FLOW_FLAG_LINK_LOCAL) { + if (!(fe->fe_flags & VR_FLOW_FLAG_LINK_LOCAL)) + vr_set_link_local_port(router, AF_INET, fe->fe_key.key_proto, + ntohs(fe->fe_key.key_dst_port)); + } else { + if (fe->fe_flags & VR_FLOW_FLAG_LINK_LOCAL) + vr_clear_link_local_port(router, AF_INET, fe->fe_key.key_proto, + ntohs(fe->fe_key.key_dst_port)); + } + fe->fe_ecmp_nh_index = req->fr_ecmp_nh_index; fe->fe_src_nh_index = req->fr_src_nh_index; fe->fe_action = req->fr_action; @@ -1449,16 +1698,60 @@ vr_flow_table_init(struct vrouter *router) return vr_flow_table_info_init(router); } +void +vr_link_local_ports_reset(struct vrouter *router) +{ + if (router->vr_link_local_ports) { + memset(router->vr_link_local_ports, + 0, router->vr_link_local_ports_size); + } +} + + + +void +vr_link_local_ports_exit(struct vrouter *router) +{ + if (router->vr_link_local_ports) { + vr_free(router->vr_link_local_ports); + router->vr_link_local_ports = NULL; + router->vr_link_local_ports_size = 0; + } +} + +int +vr_link_local_ports_init(struct vrouter *router) +{ + unsigned int port_range, bytes; + + if (router->vr_link_local_ports) + return 0; + + /* Udp and TCP inclusive of low and high limits*/ + port_range = 2 * ((VR_DYNAMIC_PORT_END - VR_DYNAMIC_PORT_START) + 1); + /* Make it 16 bit boundary */ + bytes = (port_range + 15) & ~15; + /* Bits to Bytes */ + bytes /= 8; + router->vr_link_local_ports = vr_zalloc(bytes); + if (!router->vr_link_local_ports) + return -1; + router->vr_link_local_ports_size = bytes; + + return 0; +} /* flow module exit and 
init */ void vr_flow_exit(struct vrouter *router, bool soft_reset) { vr_flow_table_reset(router); + vr_link_local_ports_reset(router); if (!soft_reset) { vr_flow_table_destroy(router); vr_fragment_table_exit(router); + vr_link_local_ports_exit(router); } return; @@ -1475,5 +1768,8 @@ vr_flow_init(struct vrouter *router) if ((ret = vr_flow_table_init(router))) return ret; + if ((ret = vr_link_local_ports_init(router))) + return ret; + return 0; } diff --git a/dp-core/vr_ip4_mtrie.c b/dp-core/vr_ip_mtrie.c similarity index 66% rename from dp-core/vr_ip4_mtrie.c rename to dp-core/vr_ip_mtrie.c index e2571b753..c5a668c7e 100644 --- a/dp-core/vr_ip4_mtrie.c +++ b/dp-core/vr_ip_mtrie.c @@ -1,14 +1,14 @@ /* - * vr_ip4_mtrie.c -- VRF mtrie management + * vr_ip_mtrie.c -- VRF mtrie management * - * Copyright (c) 2013 Juniper Networks, Inc. All rights reserved. + * Copyright (c) 2014 Juniper Networks, Inc. All rights reserved. */ #include -#include -#include #include "vr_sandesh.h" #include "vr_message.h" -#include "vr_ip4_mtrie.h" +#include "vr_packet.h" +#include "vr_route.h" +#include "vr_ip_mtrie.h" extern struct vr_nexthop *ip4_default_nh; @@ -19,79 +19,96 @@ struct vr_nexthop *(*vr_inet_route_lookup)(unsigned int, struct vr_route_req *, struct vr_packet *); struct vr_vrf_stats *(*vr_inet_vrf_stats)(unsigned short, unsigned int); -static struct ip4_mtrie *mtrie_alloc_vrf(unsigned int); - -int mtrie4_algo_init(struct vr_rtable *, struct rtable_fspec *); -void mtrie4_algo_deinit(struct vr_rtable *, struct rtable_fspec *, bool); - -/* mtrie specific */ -#define IP4_BKT_LEVELS 4 /* 8/8/8/8 */ -struct mtrie_bkt_info ip4_bkt_info[IP4_BKT_LEVELS] = { - { - .bi_size = IP4BUCKET_LEVEL0_SIZE, - .bi_shift = IP4BUCKET_LEVEL0_SHIFT, - .bi_pfx_len = IP4BUCKET_LEVEL0_PFX_LEN, - .bi_mask = IP4BUCKET_LEVEL0_MASK, - .bi_bits = IP4BUCKET_LEVEL0_BITS, - }, - { - .bi_size = IP4BUCKET_LEVEL1_SIZE, - .bi_shift = IP4BUCKET_LEVEL1_SHIFT, - .bi_pfx_len = IP4BUCKET_LEVEL1_PFX_LEN, - .bi_mask = 
IP4BUCKET_LEVEL1_MASK, - .bi_bits = IP4BUCKET_LEVEL1_BITS, - }, - { - .bi_size = IP4BUCKET_LEVEL2_SIZE, - .bi_shift = IP4BUCKET_LEVEL2_SHIFT, - .bi_pfx_len = IP4BUCKET_LEVEL2_PFX_LEN, - .bi_mask = IP4BUCKET_LEVEL2_MASK, - .bi_bits = IP4BUCKET_LEVEL2_BITS, - }, - { - .bi_size = IP4BUCKET_LEVEL3_SIZE, - .bi_shift = IP4BUCKET_LEVEL3_SHIFT, - .bi_pfx_len = IP4BUCKET_LEVEL3_PFX_LEN, - .bi_mask = IP4BUCKET_LEVEL3_MASK, - .bi_bits = IP4BUCKET_LEVEL3_BITS, - }, -}; - -struct ip4_mtrie **vn_rtable; +static struct ip_mtrie *mtrie_alloc_vrf(unsigned int, unsigned int); + +/* mtrie specific, bucket_info for v4 and v6 */ +#define IP4_BKT_LEVELS (IP4_PREFIX_LEN / IPBUCKET_LEVEL_BITS) +#define IP6_BKT_LEVELS (IP6_PREFIX_LEN / IPBUCKET_LEVEL_BITS) + +struct mtrie_bkt_info ip4_bkt_info[IP4_BKT_LEVELS]; +struct mtrie_bkt_info ip6_bkt_info[IP6_BKT_LEVELS]; + +struct ip_mtrie **vn_rtable[2]; +static int algo_init_done = 0; +static vr_route_req dump_resp; + +static void +mtrie_ip_bkt_info_init(struct mtrie_bkt_info *ip_bkt_info, int pfx_len) +{ + int level; + + ip_bkt_info[0].bi_bits = IPBUCKET_LEVEL_BITS; + ip_bkt_info[0].bi_pfx_len = IPBUCKET_LEVEL_BITS; + ip_bkt_info[0].bi_shift = pfx_len - IPBUCKET_LEVEL_BITS; + ip_bkt_info[0].bi_size = IPBUCKET_LEVEL_SIZE; + ip_bkt_info[0].bi_mask = IPBUCKET_LEVEL_MASK; + + for (level = 1; level < (pfx_len/IPBUCKET_LEVEL_BITS); level++) { + ip_bkt_info[level].bi_bits = IPBUCKET_LEVEL_BITS; + ip_bkt_info[level].bi_pfx_len = ip_bkt_info[level-1].bi_pfx_len + + IPBUCKET_LEVEL_BITS; + ip_bkt_info[level].bi_shift = ip_bkt_info[level-1].bi_shift - IPBUCKET_LEVEL_BITS; + ip_bkt_info[level].bi_size = IPBUCKET_LEVEL_SIZE; + ip_bkt_info[level].bi_mask = IPBUCKET_LEVEL_MASK; + } +} /* * given a vrf id, get the routing table corresponding to the id */ -static inline struct ip4_mtrie * -vrfid_to_mtrie(unsigned int vrf_id) +static inline struct ip_mtrie * +vrfid_to_mtrie(unsigned int vrf_id, unsigned int family) { + int index = 0; + struct ip_mtrie 
**mtrie_table; if (vrf_id >= VR_MAX_VRFS) return NULL; - return vn_rtable[vrf_id]; + if (family == AF_INET6) + index = 1; + + mtrie_table = vn_rtable[index]; + return mtrie_table[vrf_id]; +} + +#define PREFIX_TO_INDEX(prefix, level) (prefix[level]) + +static inline unsigned int +ip_bkt_get_max_level(int family) +{ + if (family == AF_INET6) + return(IP6_BKT_LEVELS); + else + return(IP4_BKT_LEVELS); } -#define PREFIX_TO_INDEX(prefix, level) \ - ((prefix >> ip4_bkt_info[level].bi_shift) & \ - ip4_bkt_info[level].bi_mask) +static struct mtrie_bkt_info * +ip_bkt_info_get(unsigned int family) +{ + if (family == AF_INET6) + return ip6_bkt_info; + else + return ip4_bkt_info; +} + /* * we have to be careful about 'level' here. assumption is that level * will be passed sane from whomever is calling */ -static inline int +static inline unsigned char rt_to_index(struct vr_route_req *rt, unsigned int level) { return PREFIX_TO_INDEX(rt->rtr_req.rtr_prefix, level); } -static inline struct ip4_bucket_entry * -index_to_entry(struct ip4_bucket *bkt, int index) +static inline struct ip_bucket_entry * +index_to_entry(struct ip_bucket *bkt, int index) { return &bkt->bkt_data[index]; } static void -set_entry_to_bucket(struct ip4_bucket_entry *ent, struct ip4_bucket *bkt) +set_entry_to_bucket(struct ip_bucket_entry *ent, struct ip_bucket *bkt) { struct vr_nexthop *tmp_nh; @@ -111,7 +128,7 @@ set_entry_to_bucket(struct ip4_bucket_entry *ent, struct ip4_bucket *bkt) * with an nh * that you are willing to forget about in your function */ static void -set_entry_to_nh(struct ip4_bucket_entry *entry, struct vr_nexthop *nh) +set_entry_to_nh(struct ip_bucket_entry *entry, struct vr_nexthop *nh) { struct vr_nexthop *tmp_nh; @@ -144,13 +161,13 @@ set_entry_to_nh(struct ip4_bucket_entry *entry, struct vr_nexthop *nh) return; } -static inline struct ip4_bucket * -entry_to_bucket(struct ip4_bucket_entry *ent) +static inline struct ip_bucket * +entry_to_bucket(struct ip_bucket_entry *ent) { unsigned 
long long_i = ent->entry_long_i; if (PTR_IS_BUCKET(long_i)) - return (struct ip4_bucket *)(long_i & ~0x1UL); + return (struct ip_bucket *)(long_i & ~0x1UL); return NULL; } @@ -158,17 +175,17 @@ entry_to_bucket(struct ip4_bucket_entry *ent) /* * alloc a mtrie bucket */ -static struct ip4_bucket * -mtrie_alloc_bucket(unsigned char level, struct ip4_bucket_entry *parent) +static struct ip_bucket * +mtrie_alloc_bucket(struct mtrie_bkt_info *ip_bkt_info, unsigned char level, struct ip_bucket_entry *parent) { unsigned int bkt_size; unsigned int i; - struct ip4_bucket *bkt; - struct ip4_bucket_entry *ent; + struct ip_bucket *bkt; + struct ip_bucket_entry *ent; - bkt_size = ip4_bkt_info[level].bi_size; - bkt = vr_zalloc(sizeof(struct ip4_bucket) - + sizeof(struct ip4_bucket_entry) * bkt_size); + bkt_size = ip_bkt_info[level].bi_size; + bkt = vr_zalloc(sizeof(struct ip_bucket) + + sizeof(struct ip_bucket_entry) * bkt_size); if (!bkt) return NULL; @@ -184,20 +201,23 @@ mtrie_alloc_bucket(unsigned char level, struct ip4_bucket_entry *parent) } static void -add_to_tree(struct ip4_bucket_entry *ent, int level, struct vr_route_req *rt) +add_to_tree(struct ip_bucket_entry *ent, int level, struct vr_route_req *rt) { unsigned int i; - struct ip4_bucket *bkt; + struct ip_bucket *bkt; + struct mtrie_bkt_info *ip_bkt_info; - if (level >= IP4_BKT_LEVELS - 1) + if (level >= (ip_bkt_get_max_level(rt->rtr_req.rtr_family) - 1)) /* assert here ? 
*/ return; + ip_bkt_info = ip_bkt_info_get(rt->rtr_req.rtr_family); + /* assured that the first one is a bucket */ bkt = entry_to_bucket(ent); level++; - for (i = 0; i < ip4_bkt_info[level].bi_size; i++) { + for (i = 0; i < ip_bkt_info[level].bi_size; i++) { ent = index_to_entry(bkt, i); if (!ENTRY_IS_NEXTHOP(ent)) add_to_tree(ent, level, rt); @@ -214,10 +234,10 @@ add_to_tree(struct ip4_bucket_entry *ent, int level, struct vr_route_req *rt) } static void -mtrie_free_entry(struct ip4_bucket_entry *entry, unsigned int level) +mtrie_free_entry(struct ip_bucket_entry *entry, unsigned int level) { unsigned int i; - struct ip4_bucket *bkt; + struct ip_bucket *bkt; if (ENTRY_IS_NEXTHOP(entry)) { vrouter_put_nexthop(entry->entry_nh_p); @@ -228,7 +248,7 @@ mtrie_free_entry(struct ip4_bucket_entry *entry, unsigned int level) if (!bkt) return; - for (i = 0; i < ip4_bkt_info[level].bi_size; i++) + for (i = 0; i < IPBUCKET_LEVEL_SIZE; i++) if (ENTRY_IS_BUCKET(&bkt->bkt_data[i])) { mtrie_free_entry(&bkt->bkt_data[i], level + 1); } else { @@ -244,10 +264,10 @@ mtrie_free_entry(struct ip4_bucket_entry *entry, unsigned int level) } static void -mtrie_reset_entry(struct ip4_bucket_entry *ent, int level, +mtrie_reset_entry(struct ip_bucket_entry *ent, int level, struct vr_nexthop *nh) { - struct ip4_bucket_entry cp_ent; + struct ip_bucket_entry cp_ent; memcpy(&cp_ent, ent, sizeof(cp_ent)); @@ -277,19 +297,21 @@ mtrie_reset_entry(struct ip4_bucket_entry *ent, int level, * covers them. 
*/ static int -__mtrie_add(struct ip4_mtrie *mtrie, struct vr_route_req *rt) +__mtrie_add(struct ip_mtrie *mtrie, struct vr_route_req *rt) { - int ret, index, level, err_level = 0; - unsigned int i, fin; - struct ip4_bucket *bkt; - struct ip4_bucket_entry *ent, *err_ent = NULL; + int ret, index = 0, level, err_level = 0; + unsigned char i, fin = 0; + struct ip_bucket *bkt; + struct ip_bucket_entry *ent, *err_ent = NULL; struct vr_nexthop *nh, *err_nh = NULL; + struct mtrie_bkt_info *ip_bkt_info = ip_bkt_info_get(rt->rtr_req.rtr_family); ent = &mtrie->root; + nh = ent->entry_nh_p; - for (level = 0; level < IP4_BKT_LEVELS; level++) { + for (level = 0; level < ip_bkt_get_max_level(rt->rtr_req.rtr_family); level++) { if (!ENTRY_IS_BUCKET(ent)) { - bkt = mtrie_alloc_bucket(level, ent); + bkt = mtrie_alloc_bucket(ip_bkt_info, level, ent); set_entry_to_bucket(ent, bkt); if (!err_ent) { err_ent = ent; @@ -307,7 +329,7 @@ __mtrie_add(struct ip4_mtrie *mtrie, struct vr_route_req *rt) index = rt_to_index(rt, level); ent = index_to_entry(bkt, index); - if (rt->rtr_req.rtr_prefix_len > ip4_bkt_info[level].bi_pfx_len) { + if (rt->rtr_req.rtr_prefix_len > ip_bkt_info[level].bi_pfx_len) { if (ENTRY_IS_NEXTHOP(ent)) { nh = ent->entry_nh_p; } @@ -320,18 +342,18 @@ __mtrie_add(struct ip4_mtrie *mtrie, struct vr_route_req *rt) * prefix match */ if ((rt->rtr_req.rtr_prefix_len > - (ip4_bkt_info[level].bi_pfx_len - ip4_bkt_info[level].bi_bits)) && - (rt->rtr_req.rtr_prefix_len <= ip4_bkt_info[level].bi_pfx_len)) { - fin = 1 << (ip4_bkt_info[level].bi_pfx_len - rt->rtr_req.rtr_prefix_len); - } else { - fin = ip4_bkt_info[level].bi_size; + (ip_bkt_info[level].bi_pfx_len - ip_bkt_info[level].bi_bits)) && + (rt->rtr_req.rtr_prefix_len <= ip_bkt_info[level].bi_pfx_len)) { + fin = 1 << (ip_bkt_info[level].bi_pfx_len - rt->rtr_req.rtr_prefix_len); } - fin += index; - if (fin > ip4_bkt_info[level].bi_size) - fin = ip4_bkt_info[level].bi_size; - for (i = index; i < fin; i++) { + /* + * Run 
through the loop 'fin' times only + * If fin is 0, it actually means 256 ('char' overflow), so run the + * loop 256 times + */ + for (i = index; i <= (ip_bkt_info[level].bi_size-1); i++) { ent = index_to_entry(bkt, i); if (ENTRY_IS_BUCKET(ent)) add_to_tree(ent, level, rt); @@ -342,6 +364,18 @@ __mtrie_add(struct ip4_mtrie *mtrie, struct vr_route_req *rt) ent->entry_label_flags = rt->rtr_req.rtr_label_flags; ent->entry_label = rt->rtr_req.rtr_label; } + if (fin) { + /* Repeat the loop 'fin' times only */ + fin--; + if (fin == 0) + break; + } + /* + * Bailout at the last index, + * the below check takes care of overflow + */ + if (i == (ip_bkt_info[level].bi_size-1)) + break; } break; @@ -359,15 +393,15 @@ __mtrie_add(struct ip4_mtrie *mtrie, struct vr_route_req *rt) static void -ip4_bucket_sched_for_free(struct ip4_bucket *bkt, int level) +ip_bucket_sched_for_free(struct ip_bucket *bkt, int level) { unsigned int i; - struct ip4_bucket_entry *tmp_ent; + struct ip_bucket_entry *tmp_ent; if (!vr_not_ready) vr_delay_op(); - for (i = 0; i < ip4_bkt_info[level].bi_size; i++) { + for (i = 0; i < IPBUCKET_LEVEL_SIZE; i++) { tmp_ent = &bkt->bkt_data[i]; if (tmp_ent->entry_nh_p) { vrouter_put_nexthop(tmp_ent->entry_nh_p); @@ -377,9 +411,9 @@ ip4_bucket_sched_for_free(struct ip4_bucket *bkt, int level) } static void -free_bucket(struct ip4_bucket_entry *ent, int level, struct vr_route_req *rt) +free_bucket(struct ip_bucket_entry *ent, int level, struct vr_route_req *rt) { - struct ip4_bucket *bkt; + struct ip_bucket *bkt; if (ENTRY_IS_NEXTHOP(ent)) { return; @@ -390,16 +424,17 @@ free_bucket(struct ip4_bucket_entry *ent, int level, struct vr_route_req *rt) ent->entry_label_flags = rt->rtr_req.rtr_label_flags; ent->entry_label = rt->rtr_req.rtr_label; - ip4_bucket_sched_for_free(bkt, level); + ip_bucket_sched_for_free(bkt, level); } static int -__mtrie_delete(struct vr_route_req *rt, struct ip4_bucket_entry *ent, +__mtrie_delete(struct vr_route_req *rt, struct ip_bucket_entry 
*ent, unsigned char level) { unsigned int index, i, fin; - struct ip4_bucket *bkt; - struct ip4_bucket_entry *tmp_ent; + struct ip_bucket *bkt; + struct ip_bucket_entry *tmp_ent; + struct mtrie_bkt_info *ip_bkt_info = ip_bkt_info_get(rt->rtr_req.rtr_family); if (ENTRY_IS_NEXTHOP(ent)) return -ENOENT; @@ -407,21 +442,21 @@ __mtrie_delete(struct vr_route_req *rt, struct ip4_bucket_entry *ent, bkt = entry_to_bucket(ent); index = rt_to_index(rt, level); - if (rt->rtr_req.rtr_prefix_len > ip4_bkt_info[level].bi_pfx_len) { + if (rt->rtr_req.rtr_prefix_len > ip_bkt_info[level].bi_pfx_len) { tmp_ent = index_to_entry(bkt, index); __mtrie_delete(rt, tmp_ent, level + 1); } else { if ((rt->rtr_req.rtr_prefix_len > - (ip4_bkt_info[level].bi_pfx_len - ip4_bkt_info[level].bi_bits)) && - (rt->rtr_req.rtr_prefix_len <= ip4_bkt_info[level].bi_pfx_len)) { - fin = 1 << (ip4_bkt_info[level].bi_pfx_len - rt->rtr_req.rtr_prefix_len); + (ip_bkt_info[level].bi_pfx_len - ip_bkt_info[level].bi_bits)) && + (rt->rtr_req.rtr_prefix_len <= ip_bkt_info[level].bi_pfx_len)) { + fin = 1 << (ip_bkt_info[level].bi_pfx_len - rt->rtr_req.rtr_prefix_len); } else { - fin = ip4_bkt_info[level].bi_size; + fin = ip_bkt_info[level].bi_size; } fin += index; - if (fin > ip4_bkt_info[level].bi_size) - fin = ip4_bkt_info[level].bi_size; + if (fin > ip_bkt_info[level].bi_size) + fin = ip_bkt_info[level].bi_size; for (i = index; i < fin; i++) { tmp_ent = index_to_entry(bkt, i); @@ -437,7 +472,7 @@ __mtrie_delete(struct vr_route_req *rt, struct ip4_bucket_entry *ent, } /* check if current bucket neds to be deleted */ - for (i = 1; i < ip4_bkt_info[level].bi_size; i++) { + for (i = 1; i < ip_bkt_info[level].bi_size; i++) { if ((bkt->bkt_data[i].entry_long_i == bkt->bkt_data[0].entry_long_i) && (bkt->bkt_data[i].entry_label_flags == bkt->bkt_data[0].entry_label_flags) && @@ -468,13 +503,16 @@ mtrie_dumper_route_encode(struct vr_message_dumper *dumper, vr_route_req *resp) static void mtrie_dumper_make_response(struct 
vr_message_dumper *dumper, vr_route_req *resp, - struct ip4_bucket_entry *ent, unsigned int prefix, unsigned int prefix_len) + struct ip_bucket_entry *ent, int8_t *prefix, unsigned int prefix_len) { vr_route_req *req = (vr_route_req *)dumper->dump_req; resp->rtr_vrf_id = req->rtr_vrf_id; resp->rtr_family = req->rtr_family; - resp->rtr_prefix = prefix; + memcpy(resp->rtr_prefix, prefix, RT_IP_ADDR_SIZE(req->rtr_family)); + resp->rtr_prefix_size = req->rtr_prefix_size; + resp->rtr_marker_size = resp->rtr_src_size = 0; + resp->rtr_marker = resp->rtr_src = NULL; resp->rtr_prefix_len = prefix_len; resp->rtr_rid = req->rtr_rid; resp->rtr_label_flags = ent->entry_label_flags; @@ -489,28 +527,37 @@ mtrie_dumper_make_response(struct vr_message_dumper *dumper, vr_route_req *resp, } static int -mtrie_dump_entry(struct vr_message_dumper *dumper, struct ip4_bucket_entry *ent, - unsigned int byte, int level) +mtrie_dump_entry(struct vr_message_dumper *dumper, struct ip_bucket_entry *ent, + int8_t *prefix, int level) { -#ifdef VR_ROUTE_DEBUG - unsigned char *addr; -#endif - unsigned int i = 0, prefix; + unsigned char i = 0; + unsigned int j; int ret; - struct ip4_bucket *bkt; - struct ip4_bucket_entry *ent_p = ent; - vr_route_req *req, resp; + struct ip_bucket *bkt; + struct ip_bucket_entry *ent_p = ent; + struct mtrie_bkt_info *ip_bkt_info; + vr_route_req *req; + int done = 0; + uint32_t rt_prefix[4]; req = dumper->dump_req; + + ip_bkt_info = ip_bkt_info_get(req->rtr_family); if (!dumper->dump_been_to_marker) { i = PREFIX_TO_INDEX(req->rtr_marker, level); bkt = entry_to_bucket(ent); ent = index_to_entry(bkt, i); - prefix = byte | (i << ip4_bkt_info[level].bi_shift); - if ((prefix == (unsigned int)req->rtr_marker && - ip4_bkt_info[level].bi_pfx_len == req->rtr_marker_plen)) + prefix[level] = i; + + if ((!memcmp(prefix, req->rtr_marker, ip_bkt_info[level].bi_pfx_len/8)) && + (ip_bkt_info[level].bi_pfx_len == req->rtr_marker_plen)) { dumper->dump_been_to_marker = 1; + } + + /* take 
care of overflow */ + if (i == (ip_bkt_info[level].bi_size - 1)) + done = 1; if (ENTRY_IS_BUCKET(ent) && !dumper->dump_been_to_marker) { if (mtrie_dump_entry(dumper, ent, prefix, level + 1)) @@ -524,55 +571,52 @@ mtrie_dump_entry(struct vr_message_dumper *dumper, struct ip4_bucket_entry *ent, } if (ENTRY_IS_BUCKET(ent_p)) { + if (done) + return 0; + j = ip_bkt_info[level].bi_size - i; bkt = entry_to_bucket(ent_p); - for (; i < ip4_bkt_info[level].bi_size; i++) { + for (; j > 0; j--, i++) { ent = &bkt->bkt_data[i]; - prefix = byte | (i << ip4_bkt_info[level].bi_shift); + prefix[level] = i; if (mtrie_dump_entry(dumper, ent, prefix, level + 1) < 0) return -1; } } else if (ent_p->entry_nh_p) { - mtrie_dumper_make_response(dumper, &resp, ent_p, byte, - ip4_bkt_info[level - 1].bi_pfx_len); - -#ifdef VR_ROUTE_DEBUG - addr = (unsigned char *)&byte; - vr_printf("%u.%u.%u.%u/%u\t\t", addr[3], addr[2], addr[1], addr[0], - ip4_bkt_info[level - 1].bi_pfx_len); - if (ent_p->entry_label_flags) { - vr_printf("%d\t", ent_p->entry_label); - } else { - vr_printf("N/A\t"); - } - vr_printf("%d\n", ent_p->entry_nh_p->nh_id); -#endif + dump_resp.rtr_prefix = (uint8_t*)&rt_prefix; + mtrie_dumper_make_response(dumper, &dump_resp, ent_p, prefix, + ip_bkt_info[level - 1].bi_pfx_len); + + ret = mtrie_dumper_route_encode(dumper, &dump_resp); - ret = mtrie_dumper_route_encode(dumper, &resp); + dump_resp.rtr_prefix = NULL; if (ret <= 0) - return -1; + return -1; } return 0; } static int -mtrie_walk(struct vr_message_dumper *dumper) +mtrie_walk(struct vr_message_dumper *dumper, unsigned int family) { vr_route_req *req; - struct ip4_mtrie *mtrie; - struct ip4_bucket_entry *ent; + struct ip_mtrie *mtrie; + struct ip_bucket_entry *ent; + int ret = 0; + uint32_t rt_prefix[4]; req = (vr_route_req *)dumper->dump_req; - mtrie = vrfid_to_mtrie(req->rtr_vrf_id); + mtrie = vrfid_to_mtrie(req->rtr_vrf_id, family); if (!mtrie) return -EINVAL; ent = &mtrie->root; + if (ENTRY_IS_BUCKET(ent)) { - return 
mtrie_dump_entry(dumper, ent, 0, 0); + ret = mtrie_dump_entry(dumper, ent, (uint8_t*)&rt_prefix, 0); } - return 0; + return ret; } static int @@ -587,10 +631,10 @@ mtrie_dump(struct vr_rtable * __unsued, struct vr_route_req *rt) goto generate_response; } - if (!((vr_route_req *)(dumper->dump_req))->rtr_marker) + if (((vr_route_req *)(dumper->dump_req))->rtr_marker_size == 0) dumper->dump_been_to_marker = 1; - ret = mtrie_walk(dumper); + ret = mtrie_walk(dumper, rt->rtr_req.rtr_family); generate_response: vr_message_dump_exit(dumper, ret); @@ -614,9 +658,9 @@ static int mtrie_delete(struct vr_rtable * _unused, struct vr_route_req *rt) { int vrf_id = rt->rtr_req.rtr_vrf_id; - struct ip4_mtrie *rtable; + struct ip_mtrie *rtable; - rtable = vrfid_to_mtrie(vrf_id); + rtable = vrfid_to_mtrie(vrf_id, rt->rtr_req.rtr_family); if (!rtable) return -ENOENT; @@ -636,7 +680,10 @@ mtrie_stats(unsigned short vrf, unsigned int cpu) if (vrf >= VR_MAX_VRFS) return &invalid_vrf_stats[cpu]; - return &((mtrie_vrf_stats[vrf])[cpu]); + if (mtrie_vrf_stats) + return &((mtrie_vrf_stats[vrf])[cpu]); + + return NULL; } static int @@ -745,35 +792,46 @@ mtrie_lookup(unsigned int vrf_id, struct vr_route_req *rt, { unsigned int level, index; unsigned long ptr; - struct ip4_mtrie *table; - struct ip4_bucket *bkt; - struct ip4_bucket_entry *ent; + struct ip_mtrie *table; + struct ip_bucket *bkt; + struct ip_bucket_entry *ent; + struct vr_nexthop *default_nh, *ret_nh; + + default_nh = ip4_default_nh; /* we do not support any thing other than /32 route lookup */ - if (rt->rtr_req.rtr_prefix_len != IP4_PREFIX_LEN) - return ip4_default_nh; + if ((rt->rtr_req.rtr_family == AF_INET) && + (rt->rtr_req.rtr_prefix_len != IP4_PREFIX_LEN)) + return default_nh; + + if ((rt->rtr_req.rtr_family == AF_INET6) && + (rt->rtr_req.rtr_prefix_len != IP6_PREFIX_LEN)) + return default_nh; - table = vrfid_to_mtrie(vrf_id); + table = vrfid_to_mtrie(vrf_id, rt->rtr_req.rtr_family); if (!table) - return ip4_default_nh; + 
return default_nh; ent = &table->root; + ptr = ent->entry_long_i; if (!ptr) - return ip4_default_nh; + return default_nh; if (PTR_IS_NEXTHOP(ptr)) { rt->rtr_req.rtr_label_flags = ent->entry_label_flags; rt->rtr_req.rtr_label = ent->entry_label; rt->rtr_req.rtr_prefix_len = ent->entry_prefix_len; - return PTR_TO_NEXTHOP(ptr); + ret_nh = PTR_TO_NEXTHOP(ptr); + + return ret_nh; } bkt = PTR_TO_BUCKET(ptr); if (!bkt) - return ip4_default_nh; + return default_nh; - for (level = 0; level < IP4_BKT_LEVELS; level++) { + for (level = 0; level < ip_bkt_get_max_level(rt->rtr_req.rtr_family); level++) { index = rt_to_index(rt, level); ent = index_to_entry(bkt, index); ptr = ent->entry_long_i; @@ -781,7 +839,8 @@ mtrie_lookup(unsigned int vrf_id, struct vr_route_req *rt, rt->rtr_req.rtr_label_flags = ent->entry_label_flags; rt->rtr_req.rtr_label = ent->entry_label; rt->rtr_req.rtr_prefix_len = ent->entry_prefix_len; - return PTR_TO_NEXTHOP(ptr); + ret_nh = PTR_TO_NEXTHOP(ptr); + return ret_nh; } bkt = PTR_TO_BUCKET(ptr); @@ -801,10 +860,10 @@ static int mtrie_add(struct vr_rtable * _unused, struct vr_route_req *rt) { unsigned int vrf_id = rt->rtr_req.rtr_vrf_id; - struct ip4_mtrie *mtrie = vrfid_to_mtrie(vrf_id); + struct ip_mtrie *mtrie = vrfid_to_mtrie(vrf_id, rt->rtr_req.rtr_family); int ret; - mtrie = (mtrie ? : mtrie_alloc_vrf(vrf_id)); + mtrie = (mtrie ? 
: mtrie_alloc_vrf(vrf_id, rt->rtr_req.rtr_family)); if (!mtrie) return -ENOMEM; @@ -812,12 +871,12 @@ mtrie_add(struct vr_rtable * _unused, struct vr_route_req *rt) if (!rt->rtr_nh) return -ENOENT; - if ((!(rt->rtr_req.rtr_label_flags & VR_RT_LABEL_VALID_FLAG)) && (rt->rtr_nh->nh_type == NH_TUNNEL)) { vrouter_put_nexthop(rt->rtr_nh); return -EINVAL; } + ret = __mtrie_add(mtrie, rt); vrouter_put_nexthop(rt->rtr_nh); return ret; @@ -840,15 +899,21 @@ mtrie_get(unsigned int vrf_id, struct vr_route_req *rt) return 0; } -static struct ip4_mtrie * -mtrie_alloc_vrf(unsigned int vrf_id) +static struct ip_mtrie * +mtrie_alloc_vrf(unsigned int vrf_id, unsigned int family) { - struct ip4_mtrie *mtrie; + struct ip_mtrie *mtrie; + struct ip_mtrie **mtrie_table; + int index = 0; + + if (family == AF_INET6) + index = 1; - mtrie = vr_zalloc(sizeof(struct ip4_mtrie)); + mtrie = vr_zalloc(sizeof(struct ip_mtrie)); if (mtrie) { mtrie->root.entry_nh_p = vrouter_get_nexthop(0, NH_DISCARD_ID); - vn_rtable[vrf_id] = mtrie; + mtrie_table = vn_rtable[index]; + mtrie_table[vrf_id] = mtrie; } return mtrie; @@ -857,17 +922,21 @@ mtrie_alloc_vrf(unsigned int vrf_id) static void mtrie_free_vrf(struct vr_rtable *rtable, unsigned int vrf_id) { - struct ip4_mtrie *mtrie; - struct ip4_mtrie **vrf_tables; - - vrf_tables = (struct ip4_mtrie **)rtable->algo_data; - mtrie = vrf_tables[vrf_id]; - if (!mtrie) - return; - - mtrie_free_entry(&mtrie->root, 0); - vrf_tables[vrf_id] = NULL; - vr_free(mtrie); + struct ip_mtrie *mtrie; + struct ip_mtrie **vrf_tables; + int i; + + /* Free V4 and V6 tables */ + for (i=0; i<2; i++) { + vrf_tables = vn_rtable[i]; + mtrie = vrf_tables[vrf_id]; + if (!mtrie) + continue; + + mtrie_free_entry(&mtrie->root, 0); + vrf_tables[vrf_id] = NULL; + vr_free(mtrie); + } return; } @@ -896,22 +965,25 @@ mtrie_stats_cleanup(struct vr_rtable *rtable) } void -mtrie4_algo_deinit(struct vr_rtable *rtable, struct rtable_fspec *fs, bool soft_reset) +mtrie_algo_deinit(struct vr_rtable 
*rtable, struct rtable_fspec *fs, bool soft_reset) { unsigned int i; - if (!vn_rtable) + if (!vn_rtable[0]) return; mtrie_stats_cleanup(rtable); - vn_rtable = NULL; for (i = 0; i < fs->rtb_max_vrfs; i++) mtrie_free_vrf(rtable, i); + vn_rtable[0] = vn_rtable[1] = NULL; /* reset the cached table pointers, not their first slots, so a repeated deinit bails out early */ + vr_free(rtable->algo_data); rtable->algo_data = NULL; + algo_init_done = 0; + return; } @@ -967,12 +1039,15 @@ mtrie_stats_init(struct vr_rtable *rtable) } int -mtrie4_algo_init(struct vr_rtable *rtable, struct rtable_fspec *fs) +mtrie_algo_init(struct vr_rtable *rtable, struct rtable_fspec *fs) { int ret = 0; unsigned int table_memory; - table_memory = sizeof(void *) * fs->rtb_max_vrfs; + if (algo_init_done) + return 0; + + table_memory = 2 * sizeof(void *) * fs->rtb_max_vrfs; rtable->algo_data = vr_zalloc(table_memory); if (!rtable->algo_data) return vr_module_error(-ENOMEM, __FUNCTION__, __LINE__, table_memory); @@ -994,8 +1069,14 @@ mtrie4_algo_init(struct vr_rtable *rtable, struct rtable_fspec *fs) vr_inet_route_lookup = mtrie_lookup; vr_inet_vrf_stats = mtrie_stats; /* local cache */ - vn_rtable = (struct ip4_mtrie **)rtable->algo_data; + vn_rtable[0] = (struct ip_mtrie **)rtable->algo_data; // V4 table + vn_rtable[1] = (struct ip_mtrie **)((char*)rtable->algo_data + + (sizeof(void *) * fs->rtb_max_vrfs)); // V6 table, placed after the rtb_max_vrfs V4 pointer slots + + mtrie_ip_bkt_info_init(ip4_bkt_info, IP4_PREFIX_LEN); + mtrie_ip_bkt_info_init(ip6_bkt_info, IP6_PREFIX_LEN); + algo_init_done = 1; return 0; init_fail: diff --git a/dp-core/vr_mcast.c b/dp-core/vr_mcast.c index 72d794488..b4c0a7997 100644 --- a/dp-core/vr_mcast.c +++ b/dp-core/vr_mcast.c @@ -86,8 +86,15 @@ mcast_lookup(unsigned int vrf_id, struct vr_route_req *rt, struct vr_mcast_entry_key key; key.vrf_id = rt->rtr_req.rtr_vrf_id; - key.src_ip = rt->rtr_req.rtr_src; - key.dst_ip = rt->rtr_req.rtr_prefix; + if (rt->rtr_req.rtr_src_size) + key.src_ip = ntohl(*(uint32_t*)rt->rtr_req.rtr_src); + else + key.src_ip = 0; + + if (rt->rtr_req.rtr_prefix_size) + key.dst_ip =
ntohl(*(uint32_t*)rt->rtr_req.rtr_prefix); + else + key.dst_ip = 0; ent = vr_find_mcast_entry(&key); if (ent) { @@ -137,9 +144,16 @@ mcast_delete(struct vr_rtable * _unused, struct vr_route_req *rt) struct vr_mcast_entry_key key; key.vrf_id = rt->rtr_req.rtr_vrf_id; - key.src_ip = rt->rtr_req.rtr_src; - key.dst_ip = rt->rtr_req.rtr_prefix; - + if (rt->rtr_req.rtr_src_size) + key.src_ip = ntohl(*(uint32_t*)rt->rtr_req.rtr_src); + else + key.src_ip = 0; + + if (rt->rtr_req.rtr_prefix_size) + key.dst_ip = ntohl(*(uint32_t*)rt->rtr_req.rtr_prefix); + else + key.dst_ip = 0; + ent = vr_find_mcast_entry(&key); if (!ent) return -ENOENT; @@ -156,8 +170,15 @@ __mcast_add(struct vr_route_req *rt) struct vr_mcast_entry_key key; key.vrf_id = rt->rtr_req.rtr_vrf_id; - key.src_ip = rt->rtr_req.rtr_src; - key.dst_ip = rt->rtr_req.rtr_prefix; + if (rt->rtr_req.rtr_src_size) + key.src_ip = ntohl(*(uint32_t*)rt->rtr_req.rtr_src); + else + key.src_ip = 0; + + if (rt->rtr_req.rtr_prefix_size) + key.dst_ip = ntohl(*(uint32_t*)rt->rtr_req.rtr_prefix); + else + key.dst_ip = 0; ent = vr_find_mcast_entry(&key); if (!ent) { @@ -200,9 +221,9 @@ mcast_add(struct vr_rtable * _unused, struct vr_route_req *rt) static void mcast_make_req(struct vr_route_req *resp, struct vr_mcast_entry *ent) { - memset(resp, 0, sizeof(struct vr_route_req)); - resp->rtr_req.rtr_prefix = ent->key.dst_ip; - resp->rtr_req.rtr_src = ent->key.src_ip; + *(uint32_t*)resp->rtr_req.rtr_prefix = ntohl(ent->key.dst_ip); + *(uint32_t*)resp->rtr_req.rtr_src = ntohl(ent->key.src_ip); + resp->rtr_req.rtr_prefix_size = resp->rtr_req.rtr_src_size = 4; resp->rtr_req.rtr_vrf_id = ent->key.vrf_id; if (ent->nh) resp->rtr_req.rtr_nh_id = ent->nh->nh_id; @@ -220,6 +241,7 @@ __mcast_dump(struct vr_message_dumper *dumper) int ret; struct vr_mcast_entry *ent; unsigned int i; + uint32_t rt_prefix, rt_src; for(i = 0; i < (vr_mcast_entries + vr_mcast_oentries); i++) { ent = (struct vr_mcast_entry *) vr_get_hentry_by_index(vn_rtable, i); @@ 
-229,12 +251,16 @@ __mcast_dump(struct vr_message_dumper *dumper) if (ent->key.vrf_id != req->rtr_req.rtr_vrf_id) continue; if (dumper->dump_been_to_marker == 0) { - if ((ent->key.src_ip == (unsigned int)req->rtr_req.rtr_src) && - (ent->key.dst_ip == (unsigned int)req->rtr_req.rtr_prefix) && + if ((ent->key.src_ip == ntohl(*(unsigned int*)req->rtr_req.rtr_src)) && + (ent->key.dst_ip == ntohl(*(unsigned int*)req->rtr_req.rtr_prefix)) && (ent->key.vrf_id == req->rtr_req.rtr_vrf_id)) { dumper->dump_been_to_marker = 1; } } else { + memset(&resp, 0, sizeof(struct vr_route_req)); + resp.rtr_req.rtr_src = (uint8_t*)&rt_src; + resp.rtr_req.rtr_prefix = (uint8_t*)&rt_prefix; + mcast_make_req(&resp, ent); ret = vr_message_dump_object(dumper, VR_ROUTE_OBJECT_ID, &resp); if (ret <= 0) @@ -340,18 +366,25 @@ vr_mcast_forward(struct vrouter *router, unsigned short vrf, struct vr_route_req rt; struct vr_nexthop *nh; struct vr_ip *ip; + uint32_t rt_prefix, rt_src; pkt->vp_type = VP_TYPE_IP; ip = (struct vr_ip *)pkt_data(pkt); rt.rtr_req.rtr_vrf_id = vrf; rt.rtr_req.rtr_prefix_len = 32; + + rt.rtr_req.rtr_src = (uint8_t*)&rt_src; + rt.rtr_req.rtr_prefix = (uint8_t*)&rt_prefix; + rt.rtr_req.rtr_src_size = rt.rtr_req.rtr_prefix_size = 4; + rt.rtr_req.rtr_marker_size = 0; + if (IS_MCAST_LINK_LOCAL(ip->ip_daddr) || IS_BCAST_IP(ip->ip_daddr)) { - rt.rtr_req.rtr_src = 0; - rt.rtr_req.rtr_prefix = 0xFFFFFFFF; + memset(rt.rtr_req.rtr_src, 0, 4); + *(uint32_t*)rt.rtr_req.rtr_prefix = 0xFFFFFFFF; } else { - rt.rtr_req.rtr_src = ip->ip_saddr; - rt.rtr_req.rtr_prefix = ip->ip_daddr; + *(uint32_t*)rt.rtr_req.rtr_prefix = ntohl(ip->ip_daddr); + *(uint32_t*)rt.rtr_req.rtr_src = ntohl(ip->ip_saddr); } nh = mcast_lookup(vrf, &rt, pkt); diff --git a/dp-core/vr_nexthop.c b/dp-core/vr_nexthop.c index 6ec5e9d84..8ce882aaf 100644 --- a/dp-core/vr_nexthop.c +++ b/dp-core/vr_nexthop.c @@ -25,6 +25,7 @@ struct vr_nexthop *vr_inet_src_lookup(unsigned short, struct vr_ip *, struct vr_packet *); extern struct 
vr_vrf_stats *(*vr_inet_vrf_stats)(unsigned short, unsigned int); struct vr_nexthop *ip4_default_nh; +struct vr_nexthop *ip6_default_nh; struct vr_nexthop * __vrouter_get_nexthop(struct vrouter *router, unsigned int index) @@ -1357,11 +1358,16 @@ nh_output(unsigned short vrf, struct vr_packet *pkt, * Typical example for this situation is when the packet reaches the * target VM's server from an ECMP-ed service chain. */ + ip = (struct vr_ip *)pkt_network_header(pkt); + if (vr_ip_is_ip6(ip)) { + pkt->vp_type = VP_TYPE_IP6; + vr_flow_inet6_input(nh->nh_router, vrf, pkt, VR_ETH_PROTO_IP6, fmd); + return 1; + } if (!(pkt->vp_flags & VP_FLAG_FLOW_SET)) { if (nh->nh_flags & NH_FLAG_POLICY_ENABLED) { need_flow_lookup = true; } else { - ip = (struct vr_ip *)pkt_network_header(pkt); src_nh = vr_inet_src_lookup(vrf, ip, pkt); if (src_nh && src_nh->nh_type == NH_COMPOSITE && src_nh->nh_flags & NH_FLAG_COMPOSITE_ECMP) { @@ -1371,8 +1377,9 @@ nh_output(unsigned short vrf, struct vr_packet *pkt, if (need_flow_lookup) { pkt->vp_flags |= VP_FLAG_FLOW_GET; - return vr_flow_inet_input(nh->nh_router, vrf, + vr_flow_inet_input(nh->nh_router, vrf, pkt, VR_ETH_PROTO_IP, fmd); + return 1; } } } @@ -1463,17 +1470,25 @@ nh_encap_l3_unicast(unsigned short vrf, struct vr_packet *pkt, { struct vr_interface *vif; struct vr_vrf_stats *stats; -#ifdef VROUTER_CONFIG_DIAG struct vr_ip *ip; -#endif stats = vr_inet_vrf_stats(vrf, pkt->vp_cpu); vif = nh->nh_dev; - pkt->vp_type = VP_TYPE_IP; -#ifdef VROUTER_CONFIG_DIAG ip = (struct vr_ip *)pkt_network_header(pkt); - + if (vr_ip_is_ip6(ip)) { + pkt->vp_type = VP_TYPE_IP6; + if (stats) { + if ((pkt->vp_flags & VP_FLAG_GRO) && + (vif->vif_type == VIF_TYPE_VIRTUAL)) { + stats->vrf_gros++; + } else { + stats->vrf_encaps++; + } + } + } else { + pkt->vp_type = VP_TYPE_IP; +#ifdef VROUTER_CONFIG_DIAG if (ip->ip_csum == VR_DIAG_IP_CSUM) { pkt->vp_flags &= ~VP_FLAG_GRO; if (stats) @@ -1492,6 +1507,7 @@ nh_encap_l3_unicast(unsigned short vrf, struct vr_packet 
*pkt, #ifdef VROUTER_CONFIG_DIAG } #endif + } /* * For packets being sent up a tap interface, retain the MPLS label @@ -1517,6 +1533,18 @@ nh_encap_l3_unicast(unsigned short vrf, struct vr_packet *pkt, pkt_pull(pkt, VR_MPLS_HDR_LEN); } } else { + + /* + * Same NH for both V4 and V6, update the rewrite data with correct ethtype + */ + if (pkt->vp_type == VP_TYPE_IP6) { + nh->nh_data[nh->nh_encap_len-2] = 0x86; + nh->nh_data[nh->nh_encap_len-1] = 0xDD; + } else { + nh->nh_data[nh->nh_encap_len-2] = 0x08; + nh->nh_data[nh->nh_encap_len-1] = 0x00; + } + if (!vif->vif_set_rewrite(vif, pkt, nh->nh_data, nh->nh_encap_len)) { vr_pfree(pkt, VP_DROP_REWRITE_FAIL); diff --git a/dp-core/vr_proto_ip.c b/dp-core/vr_proto_ip.c index 9116838cf..b207f961f 100644 --- a/dp-core/vr_proto_ip.c +++ b/dp-core/vr_proto_ip.c @@ -9,6 +9,7 @@ #include "vr_mpls.h" #include "vr_vxlan.h" #include "vr_mcast.h" +#include "vr_ip_mtrie.h" extern struct vr_nexthop *(*vr_inet_route_lookup)(unsigned int, struct vr_route_req *, struct vr_packet *); @@ -38,17 +39,64 @@ vr_generate_unique_ip_id() return vr_ip_id; } +/* + * Calculates ICMP6 checksum + * buffer is pointer to ip6 header, all values other than src, dst and plen are ZERO + * bytes is total length of ip6 header, icmp header and icmp option + */ +uint16_t +vr_icmp6_checksum (void * buffer, int bytes) { + uint32_t total; + uint16_t * ptr; + int num_words; + + total = 0; + ptr = (uint16_t *) buffer; + num_words = (bytes + 1) / 2; + + while (num_words--) total += *ptr++; + + /* + * Fold in any carries + * - the addition may cause another carry so we loop + */ + while (total & 0xffff0000) total = (total >> 16) + (total & 0xffff); + + return (uint16_t) total; +} + struct vr_nexthop * vr_inet_src_lookup(unsigned short vrf, struct vr_ip *ip, struct vr_packet *pkt) { struct vr_route_req rt; + struct vr_nexthop *nh; + struct vr_ip6 *ip6; + uint32_t rt_prefix[4]; + + if (!ip || !pkt) + return NULL; rt.rtr_req.rtr_vrf_id = vrf; - rt.rtr_req.rtr_prefix = 
ntohl(ip->ip_saddr); - rt.rtr_req.rtr_prefix_len = 32; + if (!vr_ip_is_ip6(ip)) { + pkt->vp_type = VP_TYPE_IP; + rt.rtr_req.rtr_prefix = (uint8_t*)&rt_prefix; + *(uint32_t*)rt.rtr_req.rtr_prefix = (ip->ip_saddr); + rt.rtr_req.rtr_prefix_size = 4; + rt.rtr_req.rtr_prefix_len = IP4_PREFIX_LEN; + } else { + ip6 = (struct vr_ip6 *)pkt_data(pkt); + pkt->vp_type = VP_TYPE_IP6; + rt.rtr_req.rtr_prefix = (uint8_t*)&rt_prefix; + rt.rtr_req.rtr_prefix_size = 16; + memcpy(rt.rtr_req.rtr_prefix, ip6->ip6_src, 16); + rt.rtr_req.rtr_prefix_len = IP6_PREFIX_LEN; + } + rt.rtr_req.rtr_src_size = rt.rtr_req.rtr_marker_size = 0; rt.rtr_req.rtr_nh_id = 0; - return vr_inet_route_lookup(vrf, &rt, pkt); + nh = vr_inet_route_lookup(vrf, &rt, pkt); + + return nh; } int @@ -58,19 +106,42 @@ vr_forward(struct vrouter *router, unsigned short vrf, struct vr_route_req rt; struct vr_nexthop *nh; struct vr_ip *ip; + struct vr_ip6 *ip6, *outer_ip6; + struct vr_icmp *icmph; struct vr_forwarding_md rt_fmd; + struct vr_interface *vif; + int family = AF_INET, status, encap_len = 0; + short plen; + uint32_t rt_prefix[4]; if (pkt->vp_flags & VP_FLAG_MULTICAST) { return vr_mcast_forward(router, vrf, pkt, fmd); } - pkt->vp_type = VP_TYPE_IP; ip = (struct vr_ip *)pkt_data(pkt); - + if (vr_ip_is_ip6(ip)) { + family = AF_INET6; + ip6 = (struct vr_ip6 *)pkt_data(pkt); + pkt->vp_type = VP_TYPE_IP6; + } else { + pkt->vp_type = VP_TYPE_IP; + } + rt.rtr_req.rtr_vrf_id = vrf; - rt.rtr_req.rtr_prefix = ntohl(ip->ip_daddr); - rt.rtr_req.rtr_prefix_len = 32; + rt.rtr_req.rtr_family = family; + if (family == AF_INET) { + rt.rtr_req.rtr_prefix = (uint8_t*)&rt_prefix; + *(uint32_t*)rt.rtr_req.rtr_prefix = (ip->ip_daddr); + rt.rtr_req.rtr_prefix_size = 4; + rt.rtr_req.rtr_prefix_len = IP4_PREFIX_LEN; + } else { + rt.rtr_req.rtr_prefix = (uint8_t*)&rt_prefix; + rt.rtr_req.rtr_prefix_size = 16; + memcpy(rt.rtr_req.rtr_prefix, ip6->ip6_dst, 16); + rt.rtr_req.rtr_prefix_len = IP6_PREFIX_LEN; + } rt.rtr_req.rtr_nh_id = 0; + 
rt.rtr_req.rtr_src_size = rt.rtr_req.rtr_marker_size = 0; nh = vr_inet_route_lookup(vrf, &rt, pkt); if (rt.rtr_req.rtr_label_flags & VR_RT_LABEL_VALID_FLAG) { @@ -81,7 +152,65 @@ vr_forward(struct vrouter *router, unsigned short vrf, fmd->fmd_label = rt.rtr_req.rtr_label; } - return nh_output(vrf, pkt, nh, fmd); + vif = nh->nh_dev; + + if (vif) { + if (vif->vif_type == VIF_TYPE_PHYSICAL) { + encap_len = sizeof(struct vr_eth) + sizeof(struct vr_ip)+ sizeof(struct vr_udp) +sizeof(unsigned int); + } + + if (family == AF_INET) { + if ((ip->ip_frag_off & VR_IP_DF) && + (vif->vif_mtu < (sizeof(struct vr_ip)+ip->ip_len+encap_len))) { + } + } else if (family == AF_INET6) { + plen = ntohs(ip6->ip6_plen); + /* Handle PMTU for inet6 */ + if (vif->vif_mtu < (sizeof(struct vr_ip6)+plen+encap_len)) { + /*Send ICMP too big message */ + if (pkt->vp_data < (sizeof(struct vr_ip6) + sizeof(struct vr_icmp))) { + /* Not enough head room to add ip6/icmpv6 headers*/ + vr_pfree(pkt, VP_DROP_PUSH); + return 0; + } + icmph = (struct vr_icmp*) pkt_push(pkt, sizeof(struct vr_icmp)); + icmph->icmp_type = VR_ICMP6_TYPE_PKT_TOO_BIG; + icmph->icmp_code = 0; + icmph->icmp_csum = 0; + icmph->icmp_eid = 0; + icmph->icmp_eseq = htons(vif->vif_mtu - encap_len); /*set MTU in lower bytes of second word*/ + + /* Build the outer header */ + outer_ip6 = (struct vr_ip6*) pkt_push(pkt, sizeof(struct vr_ip6)); + memset(outer_ip6, 0, sizeof(struct vr_ip6)); + memcpy(outer_ip6, ip6, sizeof(struct vr_ip6)); + memcpy(outer_ip6->ip6_dst, ip6->ip6_src, 16); + memcpy(outer_ip6->ip6_src, ip6->ip6_dst, 16); + outer_ip6->ip6_src[15] = 0xff; //Mimic the GW IP as the src IP + + if (pkt->vp_if->vif_mtu >= (plen + 2*sizeof(struct vr_ip6) + + sizeof(struct vr_icmp))) { + outer_ip6->ip6_plen = htons(plen + sizeof(struct vr_ip6) + sizeof(struct vr_icmp)); + } else { + /* TODO: Chop the packet at the tail for the added header*/ + } + + /* Calculate ICMP6 checksum */ + icmph->icmp_csum = ~(vr_icmp6_checksum(outer_ip6, + 
sizeof(struct vr_ip6) + sizeof(struct vr_icmp))); + + /* Update packet pointers, perform route lookup and forward */ + pkt_set_network_header(pkt, pkt->vp_data); + + memcpy(rt.rtr_req.rtr_prefix, outer_ip6->ip6_dst, 16); + nh = vr_inet_route_lookup(vrf, &rt, pkt); + } + } + } + + status = nh_output(vrf, pkt, nh, fmd); + + return status; } /* @@ -275,20 +404,40 @@ vr_ip_rcv(struct vrouter *router, struct vr_packet *pkt, * lets subject it to flow processing. */ if (pkt->vp_nh->nh_flags & NH_FLAG_RELAXED_POLICY) { - if (!(pkt->vp_flags & VP_FLAG_FLOW_SET) && - !(pkt->vp_flags & (VP_FLAG_TO_ME | VP_FLAG_FROM_DP))) { - /* Force the flow lookup */ - pkt->vp_flags |= VP_FLAG_FLOW_GET; + unsigned short l4_size = 0; + unsigned char ip_proto = ip->ip_proto; + if (ip_proto == VR_IP_PROTO_UDP) { + l4_size = sizeof(struct vr_udp); + } else if (ip_proto == VR_IP_PROTO_TCP) { + l4_size = sizeof(struct vr_tcp); + } - /* Get back the IP header */ - if (!pkt_push(pkt, hlen)) { + if (l4_size) { + unsigned short l4_port = 0; + if (vr_pkt_may_pull(pkt, l4_size)) { drop_reason = VP_DROP_PUSH; goto drop_pkt; } - /* Subject it to flow for Linklocal */ - return vr_flow_inet_input(pkt->vp_nh->nh_router, + l4_port = *(unsigned short *) (pkt_data(pkt) + 2); + if (vr_valid_link_local_port(router, AF_INET, + ip_proto, ntohs(l4_port))) { + if (!(pkt->vp_flags & VP_FLAG_FLOW_SET) && + !(pkt->vp_flags & (VP_FLAG_TO_ME | + VP_FLAG_FROM_DP))) { + /* Force the flow lookup */ + pkt->vp_flags |= VP_FLAG_FLOW_GET; + + /* Get back the IP header */ + if (!pkt_push(pkt, hlen)) { + drop_reason = VP_DROP_PUSH; + goto drop_pkt; + } + /* Subject it to flow for Linklocal */ + return vr_flow_inet_input(pkt->vp_nh->nh_router, pkt->vp_nh->nh_vrf, pkt, VR_ETH_PROTO_IP, fmd); + } + } } } vif = pkt->vp_nh->nh_dev; @@ -337,7 +486,7 @@ vr_ip_input(struct vrouter *router, unsigned short vrf, struct vr_ip *ip; ip = (struct vr_ip *)pkt_data(pkt); - if (ip->ip_version != 4 || ip->ip_hl < 5) + if (ip->ip_version == 4 && 
ip->ip_hl < 5) goto corrupt_pkt; return vr_forward(router, vrf, pkt, fmd); @@ -448,7 +597,6 @@ vr_ip_partial_csum(struct vr_ip *ip) return csum; } - bool vr_has_to_fragment(struct vr_interface *vif, struct vr_packet *pkt, unsigned int tun_len) @@ -488,16 +636,23 @@ vr_myip(struct vr_interface *vif, unsigned int ip) { struct vr_route_req rt; struct vr_nexthop *nh; + uint32_t rt_prefix; if (vif->vif_type != VIF_TYPE_PHYSICAL) return 1; + rt.rtr_req.rtr_vrf_id = vif->vif_vrf; - rt.rtr_req.rtr_prefix = ntohl(ip); - rt.rtr_req.rtr_prefix_len = 32; + rt.rtr_req.rtr_prefix = (uint8_t*)&rt_prefix; + + *(uint32_t*)rt.rtr_req.rtr_prefix = (ip); + rt.rtr_req.rtr_prefix_len = IP4_PREFIX_LEN; + rt.rtr_req.rtr_prefix_size = 4; + rt.rtr_req.rtr_src_size = rt.rtr_req.rtr_marker_size = 0; rt.rtr_req.rtr_nh_id = 0; nh = vr_inet_route_lookup(vif->vif_vrf, &rt, NULL); + if (!nh || nh->nh_type != NH_RCV) return 0; diff --git a/dp-core/vr_route.c b/dp-core/vr_route.c index 0b92a68fb..4e826e0e7 100644 --- a/dp-core/vr_route.c +++ b/dp-core/vr_route.c @@ -11,8 +11,8 @@ #include "vr_sandesh.h" static struct rtable_fspec rtable_families[]; -extern int mtrie4_algo_init(struct vr_rtable *, struct rtable_fspec *); -extern void mtrie4_algo_deinit(struct vr_rtable *, struct rtable_fspec *, bool); +extern int mtrie_algo_init(struct vr_rtable *, struct rtable_fspec *); +extern void mtrie_algo_deinit(struct vr_rtable *, struct rtable_fspec *, bool); extern int mcast_algo_init(struct vr_rtable *, struct rtable_fspec *); extern void mcast_algo_deinit(struct vr_rtable *, struct rtable_fspec *, bool); extern int bridge_table_init(struct vr_rtable *, struct rtable_fspec *); @@ -35,6 +35,8 @@ vr_get_family(unsigned int family) return &rtable_families[0]; case AF_BRIDGE: return &rtable_families[1]; + case AF_INET6: + return &rtable_families[2]; default: return NULL; @@ -49,15 +51,34 @@ vr_route_delete(vr_route_req *req) struct rtable_fspec *fs; struct vr_route_req vr_req; int ret; + uint32_t rt_prefix[4]; 
fs = vr_get_family(req->rtr_family); if (!fs) ret = -ENOENT; else { vr_req.rtr_req = *req; + + if (req->rtr_family != AF_BRIDGE && !req->rtr_prefix_size) { + ret = -EINVAL; + goto error; + } + + if (req ->rtr_family == AF_BRIDGE && + (!req->rtr_mac_size || !req->rtr_mac)) { + ret = -EINVAL; + goto error; + } + + if (req ->rtr_family != AF_BRIDGE) { + vr_req.rtr_req.rtr_prefix = (uint8_t*)&rt_prefix; + memcpy(vr_req.rtr_req.rtr_prefix, req->rtr_prefix, RT_IP_ADDR_SIZE(req->rtr_family)); + } + vr_req.rtr_req.rtr_src_size = vr_req.rtr_req.rtr_marker_size = 0; ret = fs->route_del(fs, &vr_req); } +error: vr_send_response(ret); return ret; @@ -69,12 +90,22 @@ vr_route_add(vr_route_req *req) struct rtable_fspec *fs; struct vr_route_req vr_req; int ret; + uint32_t rt_prefix[4]; fs = vr_get_family(req->rtr_family); if (!fs) { ret = -ENOENT; } else { vr_req.rtr_req = *req; + if (req->rtr_prefix_size) { + vr_req.rtr_req.rtr_prefix = (uint8_t*)&rt_prefix; + memcpy(vr_req.rtr_req.rtr_prefix, req->rtr_prefix, RT_IP_ADDR_SIZE(req->rtr_family)); + vr_req.rtr_req.rtr_src_size = vr_req.rtr_req.rtr_marker_size = 0; + vr_req.rtr_req.rtr_prefix_size = req->rtr_prefix_size; + } else { + vr_req.rtr_req.rtr_prefix = NULL; + } + ret = fs->route_add(fs, &vr_req); } @@ -119,8 +150,25 @@ vr_route_get(vr_route_req *req) struct vrouter *router; struct vr_rtable *rtable; int ret = 0; + uint32_t rt_prefix[4], rt_src[4]; vr_req.rtr_req = *req; + + vr_req.rtr_req.rtr_marker_size = 0; + vr_req.rtr_req.rtr_prefix_size = req->rtr_prefix_size; + if (req->rtr_prefix_size) { + vr_req.rtr_req.rtr_prefix = (uint8_t*)&rt_prefix; + memcpy(vr_req.rtr_req.rtr_prefix, req->rtr_prefix, RT_IP_ADDR_SIZE(req->rtr_family)); + } else + vr_req.rtr_req.rtr_prefix = NULL; + + vr_req.rtr_req.rtr_src_size = req->rtr_src_size; + if (req->rtr_src_size) { + vr_req.rtr_req.rtr_src = (uint8_t*)&rt_src; + memcpy(vr_req.rtr_req.rtr_src, req->rtr_src, RT_IP_ADDR_SIZE(req->rtr_family)); + } else + vr_req.rtr_req.rtr_src = NULL; + 
router = vrouter_get(req->rtr_rid); if (!router) { ret = -ENOENT; @@ -148,18 +196,43 @@ vr_route_dump(vr_route_req *req) struct vrouter *router; struct vr_rtable *rtable = NULL; int ret; + uint32_t rt_prefix[4], rt_src[4], rt_marker[4]; vr_req.rtr_req = *req; + vr_req.rtr_req.rtr_prefix_size = req->rtr_prefix_size; + if (req->rtr_prefix_size) { + vr_req.rtr_req.rtr_prefix = (uint8_t*)&rt_prefix; + memcpy(vr_req.rtr_req.rtr_prefix, req->rtr_prefix, RT_IP_ADDR_SIZE(req->rtr_family)); + } else { + vr_req.rtr_req.rtr_prefix = NULL; + } + + vr_req.rtr_req.rtr_marker_size = req->rtr_marker_size; + vr_req.rtr_req.rtr_marker_plen = req->rtr_prefix_len; + if (req->rtr_marker_size) { + vr_req.rtr_req.rtr_marker = (uint8_t*)&rt_marker; + memcpy(vr_req.rtr_req.rtr_marker, req->rtr_marker, RT_IP_ADDR_SIZE(req->rtr_family)); + } else { + vr_req.rtr_req.rtr_marker = NULL; + } + + vr_req.rtr_req.rtr_src_size = req->rtr_src_size; + if (req->rtr_src_size) { + vr_req.rtr_req.rtr_src = (uint8_t*)&rt_src; + memcpy(vr_req.rtr_req.rtr_src, req->rtr_src, RT_IP_ADDR_SIZE(req->rtr_family)); + } else + vr_req.rtr_req.rtr_src = NULL; + router = vrouter_get(req->rtr_rid); if (!router) { ret = -ENOENT; goto generate_error; } else { - if (req->rtr_family == AF_INET) { - rtable = vr_get_inet_table(router, req->rtr_rt_type); - } else if (req->rtr_family == AF_BRIDGE) { + if (req->rtr_family == AF_BRIDGE) { rtable = router->vr_bridge_rtable; + } else { + rtable = vr_get_inet_table(router, req->rtr_rt_type); } if (!rtable) { @@ -169,7 +242,6 @@ vr_route_dump(vr_route_req *req) ret = rtable->algo_dump(NULL, &vr_req); } - return ret; generate_error: @@ -315,15 +387,13 @@ vr_vrf_stats_req_process(void *s_req) return; } - -#define VR_INET_MAX_PLEN 32 - int inet_route_add(struct rtable_fspec *fs, struct vr_route_req *req) { + int i; struct vr_rtable *rtable; struct vrouter *router; - unsigned int pmask; + unsigned int pmask, pmask_byte; router = vrouter_get(req->rtr_req.rtr_rid); if (!router) @@ -332,16 
+402,36 @@ inet_route_add(struct rtable_fspec *fs, struct vr_route_req *req) rtable = vr_get_inet_table(router, req->rtr_req.rtr_rt_type); if (!rtable || ((unsigned int)req->rtr_req.rtr_vrf_id > fs->rtb_max_vrfs) || - ((unsigned int)(req->rtr_req.rtr_prefix_len) > VR_INET_MAX_PLEN)) + ((unsigned int)(req->rtr_req.rtr_prefix_len) > + (RT_IP_ADDR_SIZE(req->rtr_req.rtr_family)*8))) return -EINVAL; - if (req->rtr_req.rtr_prefix_len) { - pmask = ~((1 << (32 - req->rtr_req.rtr_prefix_len)) - 1); - req->rtr_req.rtr_prefix &= pmask; - } else - req->rtr_req.rtr_prefix = 0; - - return rtable->algo_add(rtable, req); + if (req->rtr_req.rtr_prefix) { + + if (req->rtr_req.rtr_family == AF_INET) + pmask = ~((1 << (32 - req->rtr_req.rtr_prefix_len)) - 1); + else + pmask = 0; //TBD: Assume V6 prefix length will be multiple of 8 + + pmask_byte = req->rtr_req.rtr_prefix_len/8; + if (pmask_byte < (RT_IP_ADDR_SIZE(req->rtr_req.rtr_family)-1)) { + for (i=pmask_byte+1; i<RT_IP_ADDR_SIZE(req->rtr_req.rtr_family); i++) { + req->rtr_req.rtr_prefix[i] = 0; + pmask = pmask >> 8; + } + req->rtr_req.rtr_prefix[pmask_byte] = + req->rtr_req.rtr_prefix[pmask_byte] & (pmask & 0xff); + } + } + + if (rtable) { + if (rtable->algo_add) + return rtable->algo_add(rtable, req); + else + return -1; + } else { + return -1; + } } int @@ -350,7 +440,8 @@ inet_route_del(struct rtable_fspec *fs, struct vr_route_req *req) struct vr_rtable *rtable; struct vrouter *router; - if ((unsigned int)(req->rtr_req.rtr_prefix_len) > VR_INET_MAX_PLEN || + if (((unsigned int)(req->rtr_req.rtr_prefix_len) > + (RT_IP_ADDR_SIZE(req->rtr_req.rtr_family)*8)) || (unsigned int)(req->rtr_req.rtr_vrf_id) >= VR_MAX_VRFS) return -EINVAL; @@ -505,8 +596,8 @@ static struct rtable_fspec rtable_families[] = { .rtb_family_deinit = inet_rtb_family_deinit, .route_add = inet_route_add, .route_del = inet_route_del, - .algo_init[RT_UCAST] = mtrie4_algo_init, - .algo_deinit[RT_UCAST] = mtrie4_algo_deinit, + .algo_init[RT_UCAST] = mtrie_algo_init, +
.algo_deinit[RT_UCAST] = mtrie_algo_deinit, .algo_init[RT_MCAST] = mcast_algo_init, .algo_deinit[RT_MCAST] = mcast_algo_deinit, }, @@ -519,6 +610,18 @@ static struct rtable_fspec rtable_families[] = { .route_del = bridge_entry_del, .algo_init[RT_UCAST] = bridge_table_init, .algo_deinit[RT_UCAST] = bridge_table_deinit, + }, + { + .rtb_family = AF_INET6, + .rtb_max_vrfs = VR_MAX_VRFS, + .rtb_family_init = inet_rtb_family_init, + .rtb_family_deinit = inet_rtb_family_deinit, + .route_add = inet_route_add, + .route_del = inet_route_del, + .algo_init[RT_UCAST] = mtrie_algo_init, + .algo_deinit[RT_UCAST] = mtrie_algo_deinit, + .algo_init[RT_MCAST] = mcast_algo_init, + .algo_deinit[RT_MCAST] = mcast_algo_deinit, } }; diff --git a/freebsd/Makefile b/freebsd/Makefile index 6b16c7ee4..0d7d2ccb8 100644 --- a/freebsd/Makefile +++ b/freebsd/Makefile @@ -35,6 +35,7 @@ SRCS += vr_message.c \ vr_sandesh.c \ vr_queue.c \ vr_index_table.c \ + vr_ip_mtrie.c \ vrouter.c \ vr_route.c \ vr_nexthop.c \ @@ -43,7 +44,6 @@ SRCS += vr_message.c \ vr_packet.c \ vr_proto_ip.c \ vr_mpls.c \ - vr_ip4_mtrie.c \ vr_response.c \ vr_flow.c \ vr_mirror.c \ diff --git a/freebsd/vrouter_mod.c b/freebsd/vrouter_mod.c index 489472591..d3b24e165 100644 --- a/freebsd/vrouter_mod.c +++ b/freebsd/vrouter_mod.c @@ -94,6 +94,7 @@ struct host_os *vrouter_get_host(void); unsigned int vr_num_cpus = 1; int vr_log_level = 0; +int vrouter_dbg = 0; /* Create malloc type for vrouter */ MALLOC_DECLARE(M_VROUTER); diff --git a/host/vrouter_host_mod.c b/host/vrouter_host_mod.c index ed77244e2..69b566662 100644 --- a/host/vrouter_host_mod.c +++ b/host/vrouter_host_mod.c @@ -13,7 +13,7 @@ #include "host/vr_host_packet.h" #include "ulinux.h" -#define PAGE_SIZE 4096 +#define PAGE_SIZE 4096 unsigned int vr_num_cpus = 1; static bool vr_host_inited = false; diff --git a/include/genetlink.h b/include/genetlink.h index 75f8b0250..1756068b0 100644 --- a/include/genetlink.h +++ b/include/genetlink.h @@ -12,12 +12,12 @@ * Generic 
Netlink Message Header */ struct genlmsghdr { - uint8_t cmd; - uint8_t version; - uint16_t reserved; + uint8_t cmd; + uint8_t version; + uint16_t reserved; }; -#define GENL_HDRLEN NLMSG_ALIGN(sizeof(struct genlmsghdr)) +#define GENL_HDRLEN NLMSG_ALIGN(sizeof(struct genlmsghdr)) /* * Below identifiers have similar meaning as in Linux and the same @@ -25,12 +25,12 @@ struct genlmsghdr { * code has been defined. */ enum { - GENL_ID_GENERATE = 0, - GENL_ID_CTRL = NLMSG_MIN_TYPE, - CTRL_CMD_NEWFAMILY = 1, - CTRL_CMD_GETFAMILY = 3, - CTRL_ATTR_FAMILY_ID = 1, - CTRL_ATTR_FAMILY_NAME, + GENL_ID_GENERATE = 0, + GENL_ID_CTRL = NLMSG_MIN_TYPE, + CTRL_CMD_NEWFAMILY = 1, + CTRL_CMD_GETFAMILY = 3, + CTRL_ATTR_FAMILY_ID = 1, + CTRL_ATTR_FAMILY_NAME, }; #endif /* FAKE_GENERIC_NETLINK_H */ diff --git a/include/netlink.h b/include/netlink.h index 3bd0fb904..686eefce1 100644 --- a/include/netlink.h +++ b/include/netlink.h @@ -9,7 +9,7 @@ #include /* Only supported via "fake implementation" */ -#define NETLINK_GENERIC 0 +#define NETLINK_GENERIC 0 /* @@ -22,42 +22,42 @@ * Netlink Message Header */ struct nlmsghdr { - uint32_t nlmsg_len; /* Length of message including header; - * header needs to be padded to - * NLMSG_ALIGNTO */ - uint16_t nlmsg_type; /* Message content */ - uint16_t nlmsg_flags; /* Additional flags */ - uint32_t nlmsg_seq; /* Sequence number */ - uint32_t nlmsg_pid; /* Sending process port ID */ + uint32_t nlmsg_len; /* Length of message including header; + * header needs to be padded to + * NLMSG_ALIGNTO */ + uint16_t nlmsg_type; /* Message content */ + uint16_t nlmsg_flags; /* Additional flags */ + uint32_t nlmsg_seq; /* Sequence number */ + uint32_t nlmsg_pid; /* Sending process port ID */ }; /* Original alignment, from Linux, has been preserved */ -#define NLMSG_ALIGNTO 4U +#define NLMSG_ALIGNTO 4U #define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) ) -#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#define NLMSG_HDRLEN ((int) 
NLMSG_ALIGN(sizeof(struct nlmsghdr))) #define NLMSG_LENGTH(len) ((len) + NLMSG_HDRLEN) #define NLMSG_DATA(nlhp) ((void*)(((char*)nlhp) + NLMSG_HDRLEN) -#define NLMSG_NEXT(nlhp,len) ((len) -= NLMSG_ALIGN((nlhp)->nlmsg_len), \ - (struct nlmsghdr*)(((char*)(nlhp)) + \ - NLMSG_ALIGN((nlhp)->nlmsg_len))) +#define NLMSG_NEXT(nlhp,len) ((len) -= NLMSG_ALIGN((nlhp)->nlmsg_len), \ + (struct nlmsghdr*)(((char*)(nlhp)) + \ + NLMSG_ALIGN((nlhp)->nlmsg_len))) -#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \ - (nlhp)->nlmsg_len >= sizeof(struct nlmsghdr) && \ - (nlhp)->nlmsg_len <= (len)) +#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \ + (nlhp)->nlmsg_len >= sizeof(struct nlmsghdr) && \ + (nlhp)->nlmsg_len <= (len)) -#define NLM_F_REQUEST 1 /* Request type of message */ -#define NLM_F_MULTI 2 /* Part of multiple message sequence */ -#define NLM_F_ACK 4 /* Ack reply */ +#define NLM_F_REQUEST 1 /* Request type of message */ +#define NLM_F_MULTI 2 /* Part of multiple message sequence */ +#define NLM_F_ACK 4 /* Ack reply */ -#define NLM_F_DUMP 0x300 -#define NLM_F_CREATE 0x400 +#define NLM_F_DUMP 0x300 +#define NLM_F_CREATE 0x400 -#define NLMSG_ERROR 0x2 -#define NLMSG_DONE 0x3 /* End of multi-message stream */ +#define NLMSG_ERROR 0x2 +#define NLMSG_DONE 0x3 /* End of multi-message stream */ -#define NLMSG_MIN_TYPE 0x10 /* Linux reserves below values for control - * messages */ +#define NLMSG_MIN_TYPE 0x10 /* Linux reserves below values for control + * messages */ /* @@ -69,14 +69,14 @@ struct nlmsghdr { * header does state. */ struct nlattr { - uint16_t nla_len; /* Length of attribute header - * aligned to NLA_ALIGNTO + payload - * length. */ - uint16_t nla_type; + uint16_t nla_len; /* Length of attribute header + * aligned to NLA_ALIGNTO + payload + * length. 
*/ + uint16_t nla_type; }; -#define NLA_ALIGNTO 4 -#define NLA_ALIGN(len) (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1)) -#define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr))) +#define NLA_ALIGNTO 4 +#define NLA_ALIGN(len) (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1)) +#define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr))) #endif /* FAKE_NETLINK_H */ diff --git a/include/vr_compat.h b/include/vr_compat.h index 6f52f010e..366a98e5f 100644 --- a/include/vr_compat.h +++ b/include/vr_compat.h @@ -18,9 +18,14 @@ typedef u64 netdev_features_t; * As per lxr, skb_get_rxhash exists in 3.13 versions and disappeared in * 3.14. We do not know of in between versions. However, the ubuntu * sources for 3.13.0-32 does not have it (for which the LINUX_VERSION - * CODE is 199947, which corresponds to 3.13.11) and hence the following + * CODE is 199947, which corresponds to 3.13.11) and hence the following. + * + * But then in 3.13.0-36, ubuntu did + * + * #define skb_get_rxhash skb_get_hash */ -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,11)) +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,11)) && \ + !(defined(skb_get_rxhash)) static inline __u32 skb_get_rxhash(struct sk_buff *skb) { @@ -115,9 +120,9 @@ enum rx_handler_result { typedef enum rx_handler_result rx_handler_result_t; -#define VLAN_CFI_MASK 0x1000 +#define VLAN_CFI_MASK 0x1000 #define VLAN_TAG_PRESENT VLAN_CFI_MASK -#define ARPHRD_VOID 0xFFFF +#define ARPHRD_VOID 0xFFFF #if (RHEL_MAJOR != 6) && (RHEL_MINOR != 4) diff --git a/include/vr_flow.h b/include/vr_flow.h index 7e872cab7..5d7469621 100644 --- a/include/vr_flow.h +++ b/include/vr_flow.h @@ -21,6 +21,7 @@ #define VR_RFLOW_VALID 0x1000 #define VR_FLOW_FLAG_MIRROR 0x2000 #define VR_FLOW_FLAG_VRFT 0x4000 +#define VR_FLOW_FLAG_LINK_LOCAL 0x8000 /* rest of the flags are action specific */ @@ -166,6 +167,9 @@ struct vr_flow_entry { #define VR_UDP_DNS_SPORT (17 << 16 | htons(53)) #define VR_TCP_DNS_SPORT (6 << 16 | htons(53)) +#define VR_DHCP6_SPORT 
htons(546) +#define VR_DHCP6_DPORT htons(547) + #define VR_DNS_SERVER_PORT htons(53) struct vr_flow_md { @@ -187,11 +191,17 @@ extern int vr_flow_init(struct vrouter *); extern void vr_flow_exit(struct vrouter *, bool); extern unsigned int vr_flow_inet_input(struct vrouter *, unsigned short, struct vr_packet *, unsigned short, struct vr_forwarding_md *); +extern unsigned int vr_flow_inet6_input(struct vrouter *, unsigned short, + struct vr_packet *, unsigned short, struct vr_forwarding_md *); + +extern int vr_flow_forward(unsigned short vrf, struct vr_packet *pkt, + unsigned short proto, struct vr_forwarding_md *fmd); extern inline unsigned int vr_flow_bypass(struct vrouter *, struct vr_flow_key *, struct vr_packet *, unsigned int *); void *vr_flow_get_va(struct vrouter *, uint64_t); unsigned int vr_flow_table_size(struct vrouter *); unsigned int vr_oflow_table_size(struct vrouter *); struct vr_flow_entry * vr_get_flow_entry(struct vrouter *, int ); +bool vr_valid_link_local_port(struct vrouter *, int , int , int ); #endif /* __VR_FLOW_H__ */ diff --git a/include/vr_ip4_mtrie.h b/include/vr_ip4_mtrie.h deleted file mode 100644 index 5c0647980..000000000 --- a/include/vr_ip4_mtrie.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * vr_ip4_mtrie.h -- - * - * Copyright (c) 2013 Juniper Networks, Inc. All rights reserved. - */ -#ifndef __VR_IP4_MTRIE_H__ -#define __VR_IP4_MTRIE_H__ - -#ifdef __cplusplus -extern "C" { -#endif -struct ip4_bucket; - -/* - * Override the least significant bit of a pointer to indicate whether it - * points to a bucket or nexthop. 
- */ -#define ENTRY_IS_BUCKET(EPtr) (((EPtr)->entry_long_i) & 0x1ul) -#define ENTRY_IS_NEXTHOP(EPtr) !ENTRY_IS_BUCKET(EPtr) - -#define PTR_IS_BUCKET(ptr) ((ptr) & 0x1ul) -#define PTR_IS_NEXTHOP(ptr) !PTR_IS_BUCKET(ptr) -#define PTR_TO_BUCKET(ptr) ((struct ip4_bucket *)((ptr) ^ 0x1ul)) -#define PTR_TO_NEXTHOP(ptr) ((struct vr_nexthop *)(ptr)) - -struct ip4_bucket_entry { - union { - struct vr_nexthop *nexthop_p; - struct ip4_bucket *bucket_p; - unsigned long long_i; - } entry_data; - - unsigned int entry_prefix_len:8; - unsigned int entry_label_flags:4; - unsigned int entry_label:20; -}; - -#define entry_nh_p entry_data.nexthop_p -#define entry_bkt_p entry_data.bucket_p -#define entry_long_i entry_data.long_i - -struct ip4_bucket { - struct ip4_bucket_entry bkt_data[0]; -}; - -/* - * Ip4Mtrie - * - * Ip4Mtrie ensures that an IP lookup can be performed in 3 data fetches. It - * organizes the lookup buckets in a (16 + 8 + 8) structure. - */ -struct ip4_mtrie { - struct ip4_bucket_entry root; -}; - -#define IP4_PREFIX_LEN 32 - -#define IP4BUCKET_LEVEL0 0 -#define IP4BUCKET_LEVEL0_BITS 8 -#define IP4BUCKET_LEVEL0_PFX_LEN IP4BUCKET_LEVEL0_BITS -#define IP4BUCKET_LEVEL0_SHIFT (IP4_PREFIX_LEN - IP4BUCKET_LEVEL0_BITS) -#define IP4BUCKET_LEVEL0_SIZE (1 << IP4BUCKET_LEVEL0_BITS) -#define IP4BUCKET_LEVEL0_MASK (IP4BUCKET_LEVEL0_SIZE - 1) - -#define IP4BUCKET_LEVEL1 1 -#define IP4BUCKET_LEVEL1_BITS 8 -#define IP4BUCKET_LEVEL1_PFX_LEN (IP4BUCKET_LEVEL0_PFX_LEN + \ - IP4BUCKET_LEVEL1_BITS) -#define IP4BUCKET_LEVEL1_SHIFT (IP4BUCKET_LEVEL0_SHIFT - IP4BUCKET_LEVEL1_BITS) -#define IP4BUCKET_LEVEL1_SIZE (1 << IP4BUCKET_LEVEL1_BITS) -#define IP4BUCKET_LEVEL1_MASK (IP4BUCKET_LEVEL1_SIZE - 1) - -#define IP4BUCKET_LEVEL2 2 -#define IP4BUCKET_LEVEL2_BITS 8 -#define IP4BUCKET_LEVEL2_PFX_LEN (IP4BUCKET_LEVEL1_PFX_LEN + \ - IP4BUCKET_LEVEL2_BITS) -#define IP4BUCKET_LEVEL2_SHIFT (IP4BUCKET_LEVEL1_SHIFT - IP4BUCKET_LEVEL2_BITS) -#define IP4BUCKET_LEVEL2_SIZE (1 << IP4BUCKET_LEVEL2_BITS) 
-#define IP4BUCKET_LEVEL2_MASK (IP4BUCKET_LEVEL2_SIZE - 1) - -#define IP4BUCKET_LEVEL3 3 -#define IP4BUCKET_LEVEL3_BITS 8 -#define IP4BUCKET_LEVEL3_PFX_LEN (IP4BUCKET_LEVEL2_PFX_LEN + \ - IP4BUCKET_LEVEL3_BITS) -#define IP4BUCKET_LEVEL3_SHIFT (IP4BUCKET_LEVEL2_SHIFT - IP4BUCKET_LEVEL3_BITS) -#define IP4BUCKET_LEVEL3_SIZE (1 << IP4BUCKET_LEVEL3_BITS) -#define IP4BUCKET_LEVEL3_MASK (IP4BUCKET_LEVEL3_SIZE - 1) - -struct mtrie_bkt_info { - unsigned char bi_bits; - unsigned char bi_shift; - unsigned char bi_pfx_len; - unsigned int bi_mask; - unsigned int bi_size; -}; - - -#ifdef __cplusplus -} -#endif -#endif /* __VR_IP4_MTRIE_H__ */ diff --git a/include/vr_ip_mtrie.h b/include/vr_ip_mtrie.h new file mode 100644 index 000000000..5edbfbe85 --- /dev/null +++ b/include/vr_ip_mtrie.h @@ -0,0 +1,78 @@ +/* + * vr_ip_mtrie.h -- + * + * Copyright (c) 2014 Juniper Networks, Inc. All rights reserved. + */ +#ifndef __VR_IP_MTRIE_H__ +#define __VR_IP_MTRIE_H__ + +#ifdef __cplusplus +extern "C" { +#endif +struct ip_bucket; + +/* + * Override the least significant bit of a pointer to indicate whether it + * points to a bucket or nexthop. 
+ */ +#define ENTRY_IS_BUCKET(EPtr) (((EPtr)->entry_long_i) & 0x1ul) +#define ENTRY_IS_NEXTHOP(EPtr) !ENTRY_IS_BUCKET(EPtr) + +#define PTR_IS_BUCKET(ptr) ((ptr) & 0x1ul) +#define PTR_IS_NEXTHOP(ptr) !PTR_IS_BUCKET(ptr) +#define PTR_TO_BUCKET(ptr) ((struct ip_bucket *)((ptr) ^ 0x1ul)) +#define PTR_TO_NEXTHOP(ptr) ((struct vr_nexthop *)(ptr)) + +struct ip_bucket_entry { + union { + struct vr_nexthop *nexthop_p; + struct ip_bucket *bucket_p; + unsigned long long_i; + } entry_data; + + unsigned int entry_prefix_len:8; + unsigned int entry_label_flags:4; + unsigned int entry_label:20; +}; + +#define entry_nh_p entry_data.nexthop_p +#define entry_bkt_p entry_data.bucket_p +#define entry_long_i entry_data.long_i + +struct ip_bucket { + struct ip_bucket_entry bkt_data[0]; +}; + +/* + * IpMtrie + * + * IpMtrie ensures that an IPv4 lookup can be performed in 3 data fetches. + * IPv6 lookup will require 15 data fetches. + * + */ +struct ip_mtrie { + struct ip_bucket_entry root; +}; + +#define IP4_PREFIX_LEN 32 +#define IP6_PREFIX_LEN 128 + +#define IPBUCKET_LEVEL_BITS 8 +#define IPBUCKET_LEVEL_PFX_LEN IPBUCKET_LEVEL_BITS +#define IPBUCKET_LEVEL_SIZE (1 << IPBUCKET_LEVEL_BITS) +#define IPBUCKET_LEVEL_MASK (IPBUCKET_LEVEL_SIZE - 1) + + +struct mtrie_bkt_info { + unsigned int bi_bits; + unsigned char bi_shift; + unsigned char bi_pfx_len; + unsigned int bi_mask; + unsigned int bi_size; +}; + + +#ifdef __cplusplus +} +#endif +#endif /* __VR_IP_MTRIE_H__ */ diff --git a/include/vr_linux.h b/include/vr_linux.h index 144937f4b..0232169e8 100644 --- a/include/vr_linux.h +++ b/include/vr_linux.h @@ -12,6 +12,6 @@ vp_os_packet(struct vr_packet *pkt) return CONTAINER_OF(cb, struct sk_buff, pkt); } -#define VROUTER_VERSIONID "1.0" +#define VROUTER_VERSIONID "1.0" #endif /* __VR_LINUX_H__ */ diff --git a/include/vr_os.h b/include/vr_os.h index 2035365c8..c2f96fcef 100644 --- a/include/vr_os.h +++ b/include/vr_os.h @@ -81,7 +81,7 @@ typedef unsigned int __u32; * BSD has no family AF_BRIDGE 
so to avoid to many ifdef in ksync and * vrouter code it is defined here in the same way as in LINUX */ -#define AF_BRIDGE 7 +#define AF_BRIDGE 7 #if defined(_KERNEL) #define vr_printf(format, arg...) printf(format, ##arg) diff --git a/include/vr_packet.h b/include/vr_packet.h index a526fb862..fb07ec9b2 100644 --- a/include/vr_packet.h +++ b/include/vr_packet.h @@ -18,12 +18,12 @@ #define VR_ETHER_PROTO_MAC_OFF 1 #define VR_ETHER_PROTO_MAC_LEN 2 - #define VR_IP_PROTO_ICMP 1 #define VR_IP_PROTO_IGMP 2 #define VR_IP_PROTO_TCP 6 #define VR_IP_PROTO_UDP 17 #define VR_IP_PROTO_GRE 47 +#define VR_IP_PROTO_ICMP6 58 #define VR_GRE_FLAG_CSUM (ntohs(0x8000)) #define VR_GRE_FLAG_KEY (ntohs(0x2000)) @@ -36,6 +36,8 @@ /* Size of GRE header with key */ #define VR_GRE_KEY_HDR_LEN 8 +#define VR_DYNAMIC_PORT_START 32768 +#define VR_DYNAMIC_PORT_END 65535 /* * Overlay length used for TCP MSS adjust. For UDP outer header, overlay @@ -86,9 +88,9 @@ /* * Values to define how to proceed with handling a packet. 
*/ -#define PKT_RET_FAST_PATH 1 -#define PKT_RET_SLOW_PATH 2 -#define PKT_RET_ERROR 3 +#define PKT_RET_FAST_PATH 1 +#define PKT_RET_SLOW_PATH 2 +#define PKT_RET_ERROR 3 /* * Values to define the MPLS tunnel type @@ -260,6 +262,7 @@ struct vr_vlan_hdr { #define VR_ETH_PROTO_ARP 0x806 #define VR_ETH_PROTO_IP 0x800 +#define VR_ETH_PROTO_IP6 0x86DD #define VR_ETH_PROTO_VLAN 0x8100 #define VR_DIAG_IP_CSUM 0xffff @@ -317,6 +320,47 @@ struct vr_ip { unsigned int ip_daddr; } __attribute__((packed)); +struct vr_ip6 { +#ifdef __KERNEL__ +#if defined(__LITTLE_ENDIAN_BITFIELD) + unsigned int + ip6_flow:20, + ip6_priority:8, + ip6_version:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + unsigned int + ip6_version:4, + ip6_priority:8, + ip6_flow:20; +#endif +#else +#if (__BYTE_ORDER == __LITTLE_ENDIAN) + unsigned int + ip6_flow:20, + ip6_priority:8, + ip6_version:4; +#elif (__BYTE_ORDER == __BIG_ENDIAN) + unsigned int + ip6_version:4, + ip6_priority:8, + ip6_flow:20; +#endif +#endif + unsigned short ip6_plen; /* payload length */ + unsigned char ip6_nxt; /* next header */ + unsigned char ip6_hlim; /* hop limit */ + unsigned char ip6_src[16]; /* source address */ + unsigned char ip6_dst[16]; /* destination address */ +} __attribute__((packed)); + +static inline bool +vr_ip_is_ip6(struct vr_ip *iph) +{ + if ((iph->ip_version & 0xf) == 0x4) + return false; + else + return true; +} static inline unsigned char *pkt_network_header(struct vr_packet *); static inline bool @@ -337,6 +381,9 @@ vr_ip_fragment_tail(struct vr_ip *iph) bool more = (frag & VR_IP_MF) ? true : false; unsigned short offset = frag & VR_IP_FRAG_OFFSET_MASK; + if (vr_ip_is_ip6(iph)) + return false; + if (!more && offset) return true; @@ -361,6 +408,9 @@ vr_ip_fragment_head(struct vr_ip *iph) bool more = (frag & VR_IP_MF) ? 
true : false; unsigned short offset = frag & VR_IP_FRAG_OFFSET_MASK; + if (vr_ip_is_ip6(iph)) + return false; + if (more && !offset) return true; @@ -374,6 +424,9 @@ vr_ip_fragment(struct vr_ip *iph) bool more = (frag & VR_IP_MF) ? true : false; unsigned short offset = frag & VR_IP_FRAG_OFFSET_MASK; + if (vr_ip_is_ip6(iph)) + return false; + if (offset || more) return true; @@ -386,6 +439,9 @@ vr_ip_transport_header_valid(struct vr_ip *iph) unsigned short frag = ntohs(iph->ip_frag_off); unsigned short offset = frag & VR_IP_FRAG_OFFSET_MASK; + if (vr_ip_is_ip6(iph)) + return true; + if (offset) return false; @@ -434,6 +490,13 @@ struct vr_udp { #define VR_ICMP_TYPE_ECHO 8 #define VR_ICMP_TYPE_TIME_EXCEEDED 11 +#define VR_ICMP6_TYPE_PKT_TOO_BIG 2 +#define VR_ICMP6_TYPE_ECHO_REQ 128 +#define VR_ICMP6_TYPE_ECHO_REPLY 129 +#define VR_ICMP6_TYPE_ROUTER_SOL 133 +#define VR_ICMP6_TYPE_NEIGH_SOL 135 +#define VR_ICMP6_TYPE_NEIGH_AD 136 + struct vr_icmp { uint8_t icmp_type; uint8_t icmp_code; @@ -441,6 +504,7 @@ struct vr_icmp { /* now only for icmp echo */ uint16_t icmp_eid; uint16_t icmp_eseq; + uint8_t icmp_data[0]; // Compatibility with ICMPv6 } __attribute__((packed)); static inline bool @@ -654,7 +718,7 @@ static inline unsigned char * pkt_push(struct vr_packet *pkt, unsigned int len) { if (len > pkt->vp_data) - return NULL; + return NULL; pkt->vp_data -= len; pkt->vp_len += len; diff --git a/include/vr_route.h b/include/vr_route.h index 213a1b9b7..52d927b6f 100644 --- a/include/vr_route.h +++ b/include/vr_route.h @@ -19,6 +19,9 @@ extern "C" { #define IS_LINK_LOCAL_IP(ip) \ ((ntohl(ip) & METADATA_IP_MASK) == METADATA_IP_SUBNET) +#define RT_IP_ADDR_SIZE(family) \ + ((family == AF_INET6)?16:4) + struct vrouter; struct rtable_fspec; diff --git a/include/vrouter.h b/include/vrouter.h index 043118f5b..ffbb52695 100644 --- a/include/vrouter.h +++ b/include/vrouter.h @@ -105,6 +105,7 @@ struct host_os { (*is_label_l2)(unsigned int, unsigned int, unsigned short *), int *, int 
*); + int (*hos_pkt_may_pull)(struct vr_packet *, unsigned int); }; #define vr_malloc vrouter_host->hos_malloc @@ -143,6 +144,7 @@ struct host_os { #define vr_pull_inner_headers_fast vrouter_host->hos_pull_inner_headers_fast #define vr_get_udp_src_port vrouter_host->hos_get_udp_src_port #define vr_pkt_from_vm_tcp_mss_adj vrouter_host->hos_pkt_from_vm_tcp_mss_adj +#define vr_pkt_may_pull vrouter_host->hos_pkt_may_pull struct vrouter { unsigned int vr_num_if; @@ -179,6 +181,9 @@ struct vrouter { uint64_t **vr_pdrop_stats; + uint16_t vr_link_local_ports_size; + unsigned char *vr_link_local_ports; + struct vr_interface *vr_agent_if; struct vr_interface *vr_host_if; struct vr_interface *vr_eth_if; diff --git a/linux/vhost_dev.c b/linux/vhost_dev.c index 65ab3cf70..7c751a0ff 100644 --- a/linux/vhost_dev.c +++ b/linux/vhost_dev.c @@ -225,6 +225,8 @@ vhost_if_add(struct vr_interface *vif) vp->vp_vifp = vif; if (vif->vif_type == VIF_TYPE_HOST) { + dev->features |= (NETIF_F_GSO | NETIF_F_TSO | NETIF_F_SG | NETIF_F_IP_CSUM); + if (vif->vif_bridge) { vp->vp_phys_dev = (struct net_device *)vif->vif_bridge->vif_os; diff --git a/linux/vr_host_interface.c b/linux/vr_host_interface.c index 396415824..ffc218509 100644 --- a/linux/vr_host_interface.c +++ b/linux/vr_host_interface.c @@ -752,6 +752,8 @@ linux_if_tx(struct vr_interface *vif, struct vr_packet *pkt) struct sk_buff *skb = vp_os_packet(pkt); struct skb_shared_info *sinfo; struct vr_ip *ip; + struct vr_ip6 *ip6; + int proto; unsigned short network_off, transport_off, cksum_off; #if CONFIG_XEN && (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,32)) unsigned char *data; @@ -823,7 +825,14 @@ linux_if_tx(struct vr_interface *vif, struct vr_packet *pkt) if (network_off) { ip = (struct vr_ip *)(pkt_data_at_offset(pkt, network_off)); - transport_off = network_off + (ip->ip_hl * 4); + if (!vr_ip_is_ip6(ip)) { + transport_off = network_off + (ip->ip_hl * 4); + proto = ip->ip_proto; + } else { + ip6 = (struct vr_ip6 *)ip; + transport_off = 
network_off + sizeof(struct vr_ip6); + proto = ip6->ip6_nxt; + } skb_set_network_header(skb, (network_off - skb_headroom(skb))); skb_reset_mac_len(skb); @@ -837,9 +846,9 @@ linux_if_tx(struct vr_interface *vif, struct vr_packet *pkt) */ if (pkt->vp_flags & VP_FLAG_CSUM_PARTIAL) { cksum_off = skb->csum_offset; - if (ip->ip_proto == VR_IP_PROTO_TCP) + if (proto == VR_IP_PROTO_TCP) cksum_off = offsetof(struct vr_tcp, tcp_csum); - else if (ip->ip_proto == VR_IP_PROTO_UDP) + else if (proto == VR_IP_PROTO_UDP) cksum_off = offsetof(struct vr_udp, udp_csum); skb_partial_csum_set(skb, (transport_off - skb_headroom(skb)), cksum_off); @@ -856,7 +865,7 @@ linux_if_tx(struct vr_interface *vif, struct vr_packet *pkt) * packet, the value will be wrong, and that's where the following * check comes into picture */ - if (ip->ip_proto == VR_IP_PROTO_UDP) { + if (proto == VR_IP_PROTO_UDP) { sinfo = skb_shinfo(skb); if (!(sinfo->gso_type & SKB_GSO_UDP)) { sinfo->gso_type &= ~(SKB_GSO_TCPV4 | SKB_GSO_TCP_ECN | @@ -932,6 +941,7 @@ linux_to_vr(struct vr_interface *vif, struct sk_buff *skb) if (!pkt) return 0; + pkt->vp_flags |= VP_FLAG_GSO; vif->vif_rx(vif, pkt, VLAN_ID_INVALID); return 0; diff --git a/linux/vrouter_mod.c b/linux/vrouter_mod.c index e1c620eba..67f215d7f 100644 --- a/linux/vrouter_mod.c +++ b/linux/vrouter_mod.c @@ -691,7 +691,7 @@ lh_get_udp_src_port(struct vr_packet *pkt, struct vr_forwarding_md *fmd, * otherwise. 
*/ static void -lh_adjust_tcp_mss(struct tcphdr *tcph, struct sk_buff *skb, unsigned short overlay_len) +lh_adjust_tcp_mss(struct tcphdr *tcph, struct sk_buff *skb, unsigned short overlay_len, unsigned short hlen) { int opt_off = sizeof(struct tcphdr); u8 *opt_ptr = (u8 *) tcph; @@ -732,8 +732,7 @@ lh_adjust_tcp_mss(struct tcphdr *tcph, struct sk_buff *skb, unsigned short overl } max_mss = dev->mtu - - (overlay_len + sizeof(struct vr_ip) + - sizeof(struct tcphdr)); + (overlay_len + hlen + sizeof(struct tcphdr)); if (pkt_mss > max_mss) { opt_ptr[opt_off+2] = (max_mss & 0xff00) >> 8; @@ -773,43 +772,52 @@ static int lh_pkt_from_vm_tcp_mss_adj(struct vr_packet *pkt, unsigned short overlay_len) { struct sk_buff *skb = vp_os_packet(pkt); - int hlen, pull_len; + int hlen, pull_len, proto; struct vr_ip *iph; + struct vr_ip6 *ip6h; struct tcphdr *tcph; /* * Pull enough of the header into the linear part of the skb to be * able to inspect/modify the TCP header MSS value. */ + iph = (struct vr_ip *) (skb->head + pkt->vp_data); + pull_len = pkt->vp_data - (skb_headroom(skb)); - pull_len += sizeof(struct vr_ip); - if (!pskb_may_pull(skb, pull_len)) { - return VP_DROP_PULL; - } + if (vr_ip_is_ip6(iph)) { - iph = (struct vr_ip *) (skb->head + pkt->vp_data); + ip6h = (struct vr_ip6 *)iph; + pull_len += sizeof(struct vr_ip6); + proto = ip6h->ip6_nxt; + hlen = sizeof(struct vr_ip6); + } else { + /* + * If this is a fragment and not the first one, it can be ignored + */ + if (iph->ip_frag_off & htons(IP_OFFSET)) { + goto out; + } - if (iph->ip_proto != VR_IP_PROTO_TCP) { - goto out; + pull_len += sizeof(struct vr_ip); + proto = iph->ip_proto; + hlen = iph->ip_hl * 4; } - /* - * If this is a fragment and not the first one, it can be ignored - */ - if (iph->ip_frag_off & htons(IP_OFFSET)) { + if (!pskb_may_pull(skb, pull_len)) { + return VP_DROP_PULL; + } + + if (proto != VR_IP_PROTO_TCP) { goto out; } - hlen = iph->ip_hl * 4; - pull_len += (hlen - sizeof(struct vr_ip)); pull_len += 
sizeof(struct tcphdr); if (!pskb_may_pull(skb, pull_len)) { return VP_DROP_PULL; } - iph = (struct vr_ip *) (skb->head + pkt->vp_data); tcph = (struct tcphdr *) ((char *) iph + hlen); if ((tcph->doff << 2) <= (sizeof(struct tcphdr))) { @@ -825,10 +833,7 @@ lh_pkt_from_vm_tcp_mss_adj(struct vr_packet *pkt, unsigned short overlay_len) return VP_DROP_PULL; } - iph = (struct vr_ip *) (skb->head + pkt->vp_data); - tcph = (struct tcphdr *) ((char *) iph + hlen); - - lh_adjust_tcp_mss(tcph, skb, overlay_len); + lh_adjust_tcp_mss(tcph, skb, overlay_len, hlen); out: lh_reset_skb_fields(pkt); @@ -1618,6 +1623,21 @@ lh_pull_inner_headers_fast_gre(struct vr_packet *pkt, int return 1; } +static int +lh_pkt_may_pull(struct vr_packet *pkt, unsigned int len) +{ + struct sk_buff *skb = vp_os_packet(pkt); + unsigned int pull_len; + + pull_len = pkt->vp_data - skb_headroom(skb); + pull_len += len; + if (!pskb_may_pull(skb, pull_len)) + return -1; + + lh_reset_skb_fields(pkt); + return 0; +} + /* * lh_pull_inner_headers_fast - faster version of lh_pull_inner_headers that * avoids multiple calls to pskb_may_pull(). 
In the common case, this @@ -1906,7 +1926,7 @@ lh_pull_inner_headers(struct vr_packet *pkt, */ skb_push(skb, skb_pull_len); if (tcph && vr_to_vm_mss_adj) { - lh_adjust_tcp_mss(tcph, skb, vrouter_overlay_len); + lh_adjust_tcp_mss(tcph, skb, vrouter_overlay_len, sizeof(struct vr_ip)); } } else { if ((iph->ip_proto == VR_IP_PROTO_TCP) && @@ -1920,7 +1940,7 @@ lh_pull_inner_headers(struct vr_packet *pkt, } skb_push(skb, tcpoff); if (vr_to_vm_mss_adj) { - lh_adjust_tcp_mss(tcph, skb, vrouter_overlay_len); + lh_adjust_tcp_mss(tcph, skb, vrouter_overlay_len, sizeof(struct vr_ip)); } } } @@ -2082,6 +2102,7 @@ struct host_os linux_host = { .hos_pull_inner_headers_fast = lh_pull_inner_headers_fast, .hos_get_udp_src_port = lh_get_udp_src_port, .hos_pkt_from_vm_tcp_mss_adj = lh_pkt_from_vm_tcp_mss_adj, + .hos_pkt_may_pull = lh_pkt_may_pull, }; struct host_os * diff --git a/sandesh/vr.sandesh b/sandesh/vr.sandesh index dc23dc207..0558cd57e 100644 --- a/sandesh/vr.sandesh +++ b/sandesh/vr.sandesh @@ -92,14 +92,14 @@ buffer sandesh vr_route_req { 2: i32 rtr_vrf_id; 3: i32 rtr_family; 4: i32 rtr_rt_type; - 5: i32 rtr_prefix; - 6: i32 rtr_src; + 5: list rtr_prefix; + 6: list rtr_src; 7: i32 rtr_prefix_len; 8: i16 rtr_rid; 9: i16 rtr_label_flags; 10: i32 rtr_label; 11: i32 rtr_nh_id; - 12: i32 rtr_marker; + 12: list rtr_marker; 13: i32 rtr_marker_plen; 14: list rtr_mac; 15: i32 rtr_replace_plen; diff --git a/utils/flow.c b/utils/flow.c index 1b1066a8b..ec6548ccd 100644 --- a/utils/flow.c +++ b/utils/flow.c @@ -194,6 +194,9 @@ dump_table(struct flow_table *ft) case VR_FLOW_FLAG_DPAT: flag_string[fi++] = 'P'; flag_string[fi++] = 'd'; + break; + case VR_FLOW_FLAG_LINK_LOCAL: + flag_string[fi++] = 'L'; } break; diff --git a/utils/rt.c b/utils/rt.c index 663809a2b..2eca3cce2 100644 --- a/utils/rt.c +++ b/utils/rt.c @@ -42,6 +42,7 @@ static struct nl_client *cl; static int resp_code; static vr_route_req rt_req; +static uint8_t rt_prefix[16], rt_src[16], rt_marker[16]; static bool 
cmd_proxy_set = false; static int cmd_set, dump_set; @@ -55,17 +56,18 @@ static int cmd_vrf_id = -1, cmd_family_id; static int cmd_op = -1; static int cmd_nh_id = -1; -static uint32_t cmd_prefix = 0, cmd_plen = 0, cmd_src = 0xffffffff; +static uint8_t cmd_prefix[16], cmd_src[16]; +static uint32_t cmd_plen = 0; static int32_t cmd_label; static int cmd_rt_type = RT_UCAST; static uint32_t cmd_replace_plen = 100; static char cmd_dst_mac[6]; - static void Usage(void); static void usage_internal(void); #define INET_FAMILY_STRING "inet" #define BRIDGE_FAMILY_STRING "bridge" +#define INET6_FAMILY_STRING "inet6" #define UCST_TABLE_STRING "ucst" #define MCST_TABLE_STRING "mcst" @@ -86,6 +88,8 @@ family_string_to_id(char *fname) return AF_INET; else if (!strncmp(fname, BRIDGE_FAMILY_STRING, strlen(BRIDGE_FAMILY_STRING))) return AF_BRIDGE; + else if (!strncmp(fname, INET6_FAMILY_STRING, strlen(INET6_FAMILY_STRING))) + return AF_INET6; return -1; } @@ -93,16 +97,37 @@ family_string_to_id(char *fname) void vr_route_req_process(void *s_req) { - int ret, i; - struct in_addr addr; + int ret = 0, i; + int8_t addr[16]; char flags[4]; vr_route_req *rt = (vr_route_req *)s_req; - rt_req.rtr_marker = rt->rtr_prefix; - rt_req.rtr_marker_plen = rt->rtr_prefix_len; - rt_req.rtr_prefix = rt->rtr_prefix; + if (!rt_req.rtr_prefix) { + rt_req.rtr_prefix = rt_prefix; + rt_req.rtr_marker = rt_marker; + rt_req.rtr_src = rt_src; + + } + rt_req.rtr_prefix_size = rt_req.rtr_marker_size = rt_req.rtr_src_size = 0; + + + if (rt->rtr_prefix_size) { + memcpy(rt_req.rtr_prefix, rt->rtr_prefix, RT_IP_ADDR_SIZE(rt->rtr_family)); + memcpy(rt_req.rtr_marker, rt->rtr_prefix, RT_IP_ADDR_SIZE(rt->rtr_family)); + rt_req.rtr_prefix_size = rt_req.rtr_marker_size = RT_IP_ADDR_SIZE(rt->rtr_family); + } else { + memset(rt_req.rtr_prefix, 0, 16); + memset(rt_req.rtr_marker, 0, 16); + } + + if (rt->rtr_src_size) { + memcpy(rt_req.rtr_src, rt->rtr_src, RT_IP_ADDR_SIZE(rt->rtr_family)); + rt_req.rtr_src_size = 
RT_IP_ADDR_SIZE(rt->rtr_family); + } else { + memset(rt_req.rtr_src, 0, 16); + } + rt_req.rtr_prefix_len = rt->rtr_prefix_len; - rt_req.rtr_src = rt->rtr_src; rt_req.rtr_rt_type = rt->rtr_rt_type; if (rt->rtr_mac) { if (!rt_req.rtr_mac) { @@ -113,10 +138,14 @@ vr_route_req_process(void *s_req) } rt_req.rtr_vrf_id = rt->rtr_vrf_id; - if (rt->rtr_family == AF_INET) { + if ((rt->rtr_family == AF_INET) || + (rt->rtr_family == AF_INET6)) { if (rt->rtr_rt_type == RT_UCAST) { - addr.s_addr = htonl(rt->rtr_prefix); - ret = printf("%s/%-2d", inet_ntoa(addr), rt->rtr_prefix_len); + + if (rt->rtr_prefix_size) { + inet_ntop(rt->rtr_family, rt->rtr_prefix, addr, 16); + ret = printf("%s/%-2d", addr, rt->rtr_prefix_len); + } for (i = ret; i < 21; i++) printf(" "); @@ -147,11 +176,13 @@ vr_route_req_process(void *s_req) printf("%7d", rt->rtr_nh_id); printf("\n"); - } else { - addr.s_addr = htonl(rt->rtr_src); - ret = printf("%s,", inet_ntoa(addr)); - addr.s_addr = htonl(rt->rtr_prefix); - ret += printf("%s", inet_ntoa(addr)); + } else { + if (rt->rtr_src_size) + inet_ntop(rt->rtr_family, rt->rtr_src, addr, 16); + ret = printf("%s,", addr); + if (rt->rtr_prefix_size) + inet_ntop(rt->rtr_family, rt->rtr_prefix, addr, 16); + ret += printf("%s", addr); for (i = ret; i < 33; i++) printf(" "); printf(" "); @@ -159,12 +190,20 @@ vr_route_req_process(void *s_req) printf("\n"); } } else { - printf("%12s %5d", ether_ntoa((struct ether_addr *)(rt->rtr_mac)), rt->rtr_vrf_id); + ret = printf("%s", ether_ntoa((struct ether_addr *)(rt->rtr_mac))); + for(i = ret; i < 21; i++) + printf(" "); + + ret = printf("%5d", rt->rtr_vrf_id); + for(i = ret; i < 12; i++) + printf(" "); if (rt->rtr_label_flags & VR_RT_LABEL_VALID_FLAG) - printf("%5d ", rt->rtr_label); + ret = printf("%5d", rt->rtr_label); else - printf("%5c ", '-'); - printf("%3d \n",rt->rtr_nh_id); + ret = printf("%5c", '-'); + for(i = ret; i < 12; i++) + printf(" "); + printf("%7d\n",rt->rtr_nh_id); } return; @@ -186,20 +225,42 @@ 
vr_response_process(void *s) static vr_route_req * -vr_build_route_request(unsigned int op, int family, unsigned int prefix, unsigned int p_len, - unsigned int nh_id, unsigned int vrf, int label, - unsigned int rt_type, unsigned int src, char *eth, uint32_t replace_plen) +vr_build_route_request(unsigned int op, int family, int8_t *prefix, + unsigned int p_len, unsigned int nh_id, unsigned int vrf, int label, + unsigned int rt_type, int8_t *src, char *eth, uint32_t replace_plen) { + int i; + char buf[64]; rt_req.rtr_family = family; rt_req.rtr_vrf_id = vrf; rt_req.rtr_rid = 0; rt_req.h_op = op; + if (!rt_req.rtr_prefix) { + rt_req.rtr_prefix = rt_prefix; + rt_req.rtr_marker = rt_marker; + rt_req.rtr_src = rt_src; + } + rt_req.rtr_prefix_size = rt_req.rtr_marker_size = rt_req.rtr_src_size = 0; + switch (rt_req.h_op) { case SANDESH_OP_DUMP: - rt_req.rtr_marker = prefix; + if (prefix) { + memcpy(rt_req.rtr_prefix, prefix, RT_IP_ADDR_SIZE(family)); + memcpy(rt_req.rtr_marker, prefix, RT_IP_ADDR_SIZE(family)); + rt_req.rtr_marker_size = rt_req.rtr_prefix_size = RT_IP_ADDR_SIZE(family); + } else { + memset(rt_req.rtr_prefix, 0, 16); + memset(rt_req.rtr_marker, 0, 16); + } + if (src) { + memcpy(rt_req.rtr_src, src, RT_IP_ADDR_SIZE(family)); + rt_req.rtr_src_size = RT_IP_ADDR_SIZE(family); + } else { + memset(rt_req.rtr_src, 0, 16); + } + rt_req.rtr_marker_plen = p_len; - rt_req.rtr_src = src; rt_req.rtr_rt_type = rt_type; rt_req.rtr_vrf_id = vrf; if (!rt_req.rtr_mac) { @@ -211,7 +272,20 @@ vr_build_route_request(unsigned int op, int family, unsigned int prefix, unsigne default: rt_req.rtr_nh_id = nh_id; - rt_req.rtr_prefix = ntohl(prefix); + if (cmd_prefix_set) { + memcpy(rt_req.rtr_prefix, prefix, RT_IP_ADDR_SIZE(family)); + rt_req.rtr_prefix_size = RT_IP_ADDR_SIZE(family); + + inet_ntop(family, rt_req.rtr_prefix, buf, sizeof(buf)); + printf ("Adding prefix %s \n Prefix: ", buf); + for (i=0; i< RT_IP_ADDR_SIZE(family); i++) { + printf("%x:", prefix[i]); + } + printf 
("\n"); + } else { + rt_req.rtr_prefix = NULL; + } + rt_req.rtr_prefix_len = p_len; rt_req.rtr_label_flags = 0; rt_req.rtr_rt_type = rt_type; @@ -220,11 +294,13 @@ vr_build_route_request(unsigned int op, int family, unsigned int prefix, unsigne if (cmd_proxy_set) rt_req.rtr_label_flags |= VR_RT_HOSTED_FLAG; - if (family == AF_INET) { + if ((family == AF_INET) || + (family == AF_INET6)) { if (rt_type == RT_UCAST) { - rt_req.rtr_src = 0; + *rt_req.rtr_src = 0; } else { - rt_req.rtr_src = src; + memcpy(rt_req.rtr_src, src, RT_IP_ADDR_SIZE(family)); + rt_req.rtr_src_size = RT_IP_ADDR_SIZE(family); } if (label != -1) { rt_req.rtr_label = label; @@ -331,7 +407,7 @@ vr_route_op(void) return -errno; if (cmd_op == SANDESH_OP_DUMP) { - if (cmd_family_id == AF_INET) { + if ((cmd_family_id == AF_INET) || (cmd_family_id == AF_INET6)) { if (cmd_rt_type == RT_UCAST) { printf("Kernel IP routing table %d/%d/unicast\n", req->rtr_rid, cmd_vrf_id); printf("Destination PPL Flags Label Nexthop\n"); @@ -340,8 +416,8 @@ vr_route_op(void) printf("(Src,Group) Nexthop\n"); } } else { - printf("Kernel L2 Bridge table %d\n", req->rtr_rid); - printf("DestMac Vrf Label/VNID Nexthop\n"); + printf("Kernel L2 Bridge table %d/%d\n", req->rtr_rid, cmd_vrf_id); + printf("DestMac Vrf Label/VNID Nexthop\n"); } } @@ -365,7 +441,7 @@ usage_internal() " P \n" " l \n" " t