add new files for compat layer

Ryan Huber committed 1 month ago (commit 608904b9dd)

+ 102 - 0
overlay/wireguard_tun_linux.go

@@ -0,0 +1,102 @@
+//go:build linux && !android && !e2e_testing
+
+package overlay
+
+import (
+	"fmt"
+	"sync"
+
+	wgtun "github.com/slackhq/nebula/wgstack/tun"
+)
+
+type wireguardTunIO struct {
+	dev       wgtun.Device
+	mtu       int
+	batchSize int
+
+	readMu   sync.Mutex
+	readBufs [][]byte
+	readLens []int
+	pending  [][]byte
+	pendIdx  int
+
+	writeMu   sync.Mutex
+	writeBuf  []byte
+	writeWrap [][]byte
+}
+
+func newWireguardTunIO(dev wgtun.Device, mtu int) *wireguardTunIO {
+	batch := dev.BatchSize()
+	if batch <= 0 {
+		batch = 1
+	}
+	if mtu <= 0 {
+		mtu = DefaultMTU
+	}
+	bufs := make([][]byte, batch)
+	for i := range bufs {
+		bufs[i] = make([]byte, wgtun.VirtioNetHdrLen+mtu)
+	}
+	return &wireguardTunIO{
+		dev:       dev,
+		mtu:       mtu,
+		batchSize: batch,
+		readBufs:  bufs,
+		readLens:  make([]int, batch),
+		pending:   make([][]byte, 0, batch),
+		writeBuf:  make([]byte, wgtun.VirtioNetHdrLen+mtu),
+		writeWrap: make([][]byte, 1),
+	}
+}
+
+func (w *wireguardTunIO) Read(p []byte) (int, error) {
+	w.readMu.Lock()
+	defer w.readMu.Unlock()
+
+	for {
+		if w.pendIdx < len(w.pending) {
+			segment := w.pending[w.pendIdx]
+			w.pendIdx++
+			n := copy(p, segment)
+			return n, nil
+		}
+
+		n, err := w.dev.Read(w.readBufs, w.readLens, wgtun.VirtioNetHdrLen)
+		if err != nil {
+			return 0, err
+		}
+		w.pending = w.pending[:0]
+		w.pendIdx = 0
+		for i := 0; i < n; i++ {
+			length := w.readLens[i]
+			if length == 0 {
+				continue
+			}
+			segment := w.readBufs[i][wgtun.VirtioNetHdrLen : wgtun.VirtioNetHdrLen+length]
+			w.pending = append(w.pending, segment)
+		}
+	}
+}
+
+func (w *wireguardTunIO) Write(p []byte) (int, error) {
+	if len(p) > w.mtu {
+		return 0, fmt.Errorf("wireguard tun: payload exceeds MTU (%d > %d)", len(p), w.mtu)
+	}
+	w.writeMu.Lock()
+	defer w.writeMu.Unlock()
+	buf := w.writeBuf[:wgtun.VirtioNetHdrLen+len(p)]
+	for i := 0; i < wgtun.VirtioNetHdrLen; i++ {
+		buf[i] = 0
+	}
+	copy(buf[wgtun.VirtioNetHdrLen:], p)
+	w.writeWrap[0] = buf
+	if _, err := w.dev.Write(w.writeWrap, wgtun.VirtioNetHdrLen); err != nil {
+		// dev.Write reports packets written, not bytes, so surface zero bytes on error
+		return 0, err
+	}
+	return len(p), nil
+}
+
+func (w *wireguardTunIO) Close() error {
+	return nil
+}
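
The adapter above flattens batched device reads into the one-packet-per-call Read/Write shape the overlay expects. A minimal in-package sketch of how it might be driven (DefaultMTU comes from the surrounding overlay package; the Device is assumed to be created elsewhere from the vendored wgstack/tun package):

package overlay

import wgtun "github.com/slackhq/nebula/wgstack/tun"

func exampleTunIO(dev wgtun.Device) error {
	tio := newWireguardTunIO(dev, DefaultMTU)
	pkt := make([]byte, DefaultMTU)
	// Read hands back one packet per call; surplus packets from a batched
	// device read sit in tio.pending and drain on subsequent calls.
	n, err := tio.Read(pkt)
	if err != nil {
		return err
	}
	// Write prepends a zeroed virtio-net header; payloads above the MTU are rejected.
	_, err = tio.Write(pkt[:n])
	return err
}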

+ 132 - 0
udp/wireguard_conn_linux.go

@@ -0,0 +1,132 @@
+//go:build linux && !android && !e2e_testing
+
+package udp
+
+import (
+	"errors"
+	"net"
+	"net/netip"
+	"sync"
+	"sync/atomic"
+
+	"github.com/sirupsen/logrus"
+	"github.com/slackhq/nebula/config"
+	wgconn "github.com/slackhq/nebula/wgstack/conn"
+)
+
+// WGConn adapts WireGuard's batched UDP bind implementation to Nebula's udp.Conn interface.
+type WGConn struct {
+	l         *logrus.Logger
+	bind      *wgconn.StdNetBind
+	recvers   []wgconn.ReceiveFunc
+	batch     int
+	localIP   netip.Addr
+	localPort uint16
+	closed    atomic.Bool
+
+	closeOnce sync.Once
+}
+
+// NewWireguardListener creates a UDP listener backed by WireGuard's StdNetBind.
+func NewWireguardListener(l *logrus.Logger, ip netip.Addr, port int, multi bool, batch int) (Conn, error) {
+	bind := wgconn.NewStdNetBindForAddr(ip, multi)
+	recvers, actualPort, err := bind.Open(uint16(port))
+	if err != nil {
+		return nil, err
+	}
+	if batch <= 0 || batch > bind.BatchSize() {
+		batch = bind.BatchSize()
+	}
+	return &WGConn{
+		l:         l,
+		bind:      bind,
+		recvers:   recvers,
+		batch:     batch,
+		localIP:   ip,
+		localPort: actualPort,
+	}, nil
+}
+
+func (c *WGConn) Rebind() error {
+	// WireGuard's bind does not support rebinding in place.
+	return nil
+}
+
+func (c *WGConn) LocalAddr() (netip.AddrPort, error) {
+	if !c.localIP.IsValid() || c.localIP.IsUnspecified() {
+		// Fallback to wildcard IPv4 for display purposes.
+		return netip.AddrPortFrom(netip.IPv4Unspecified(), c.localPort), nil
+	}
+	return netip.AddrPortFrom(c.localIP, c.localPort), nil
+}
+
+func (c *WGConn) listen(fn wgconn.ReceiveFunc, r EncReader) {
+	batchSize := c.batch
+	packets := make([][]byte, batchSize)
+	for i := range packets {
+		packets[i] = make([]byte, MTU)
+	}
+	sizes := make([]int, batchSize)
+	endpoints := make([]wgconn.Endpoint, batchSize)
+
+	for {
+		if c.closed.Load() {
+			return
+		}
+		n, err := fn(packets, sizes, endpoints)
+		if err != nil {
+			if errors.Is(err, net.ErrClosed) {
+				return
+			}
+			if c.l != nil {
+				c.l.WithError(err).Debug("wireguard UDP listener receive error")
+			}
+			continue
+		}
+		for i := 0; i < n; i++ {
+			if sizes[i] == 0 {
+				continue
+			}
+			stdEp, ok := endpoints[i].(*wgconn.StdNetEndpoint)
+			if !ok {
+				if c.l != nil {
+					c.l.Warn("wireguard UDP listener received unexpected endpoint type")
+				}
+				continue
+			}
+			addr := stdEp.AddrPort
+			r(addr, packets[i][:sizes[i]])
+			endpoints[i] = nil
+		}
+	}
+}
+
+func (c *WGConn) ListenOut(r EncReader) {
+	for _, fn := range c.recvers {
+		go c.listen(fn, r)
+	}
+}
+
+func (c *WGConn) WriteTo(b []byte, addr netip.AddrPort) error {
+	if len(b) == 0 {
+		return nil
+	}
+	if c.closed.Load() {
+		return net.ErrClosed
+	}
+	ep := &wgconn.StdNetEndpoint{AddrPort: addr}
+	return c.bind.Send([][]byte{b}, ep)
+}
+
+func (c *WGConn) ReloadConfig(*config.C) {
+	// WireGuard bind currently does not expose runtime configuration knobs.
+}
+
+func (c *WGConn) Close() error {
+	var err error
+	c.closeOnce.Do(func() {
+		c.closed.Store(true)
+		err = c.bind.Close()
+	})
+	return err
+}
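
A hedged usage sketch for the listener (assuming Nebula's udp.EncReader has the (netip.AddrPort, []byte) shape used by the listen loop above; the port is arbitrary):

package main

import (
	"net/netip"

	"github.com/sirupsen/logrus"
	"github.com/slackhq/nebula/udp"
)

func main() {
	l := logrus.New()
	// batch <= 0 falls back to the bind's own batch size (128 on Linux)
	conn, err := udp.NewWireguardListener(l, netip.IPv4Unspecified(), 4242, false, 0)
	if err != nil {
		l.Fatal(err)
	}
	defer conn.Close()
	// ListenOut spawns one goroutine per underlying ReceiveFunc
	conn.ListenOut(func(addr netip.AddrPort, payload []byte) {
		l.Infof("got %d bytes from %s", len(payload), addr)
	})
	select {} // serve until killed
}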

+ 15 - 0
udp/wireguard_conn_unsupported.go

@@ -0,0 +1,15 @@
+//go:build !linux || android || e2e_testing
+
+package udp
+
+import (
+	"fmt"
+	"net/netip"
+
+	"github.com/sirupsen/logrus"
+)
+
+// NewWireguardListener is only available on Linux builds.
+func NewWireguardListener(*logrus.Logger, netip.Addr, int, bool, int) (Conn, error) {
+	return nil, fmt.Errorf("wireguard experimental UDP listener is only supported on Linux")
+}

+ 513 - 0
wgstack/conn/bind_std.go

@@ -0,0 +1,513 @@
+// SPDX-License-Identifier: MIT
+//
+// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
+
+package conn
+
+import (
+	"context"
+	"errors"
+	"net"
+	"net/netip"
+	"runtime"
+	"strconv"
+	"sync"
+	"syscall"
+
+	"golang.org/x/net/ipv4"
+	"golang.org/x/net/ipv6"
+	"golang.org/x/sys/unix"
+)
+
+var (
+	_ Bind = (*StdNetBind)(nil)
+)
+
+// StdNetBind implements Bind for all platforms. While Windows has its own Bind
+// (see bind_windows.go), it may fall back to StdNetBind.
+// TODO: Remove usage of ipv{4,6}.PacketConn when net.UDPConn has comparable
+// methods for sending and receiving multiple datagrams per-syscall. See the
+// proposal in https://github.com/golang/go/issues/45886#issuecomment-1218301564.
+type StdNetBind struct {
+	mu     sync.Mutex // protects all fields except as specified
+	ipv4   *net.UDPConn
+	ipv6   *net.UDPConn
+	ipv4PC *ipv4.PacketConn // will be nil on non-Linux
+	ipv6PC *ipv6.PacketConn // will be nil on non-Linux
+
+	// these three fields are not guarded by mu
+	udpAddrPool  sync.Pool
+	ipv4MsgsPool sync.Pool
+	ipv6MsgsPool sync.Pool
+
+	blackhole4 bool
+	blackhole6 bool
+
+	listenAddr4 string
+	listenAddr6 string
+	bindV4      bool
+	bindV6      bool
+	reusePort   bool
+}
+
+func newStdNetBind() *StdNetBind {
+	return &StdNetBind{
+		udpAddrPool: sync.Pool{
+			New: func() any {
+				return &net.UDPAddr{
+					IP: make([]byte, 16),
+				}
+			},
+		},
+
+		ipv4MsgsPool: sync.Pool{
+			New: func() any {
+				msgs := make([]ipv4.Message, IdealBatchSize)
+				for i := range msgs {
+					msgs[i].Buffers = make(net.Buffers, 1)
+					msgs[i].OOB = make([]byte, srcControlSize)
+				}
+				return &msgs
+			},
+		},
+
+		ipv6MsgsPool: sync.Pool{
+			New: func() any {
+				msgs := make([]ipv6.Message, IdealBatchSize)
+				for i := range msgs {
+					msgs[i].Buffers = make(net.Buffers, 1)
+					msgs[i].OOB = make([]byte, srcControlSize)
+				}
+				return &msgs
+			},
+		},
+		bindV4:    true,
+		bindV6:    true,
+		reusePort: false,
+	}
+}
+
+// NewStdNetBind creates a bind that listens on all interfaces.
+func NewStdNetBind() *StdNetBind {
+	return newStdNetBind()
+}
+
+// NewStdNetBindForAddr creates a bind that listens on a specific address.
+// If addr is IPv4, only the IPv4 socket will be created. For IPv6, only the
+// IPv6 socket will be created.
+func NewStdNetBindForAddr(addr netip.Addr, reusePort bool) *StdNetBind {
+	b := newStdNetBind()
+	if addr.IsValid() {
+		if addr.Is4() {
+			b.listenAddr4 = addr.Unmap().String()
+			b.bindV4 = true
+			b.bindV6 = false
+		} else {
+			b.listenAddr6 = addr.Unmap().String()
+			b.bindV6 = true
+			b.bindV4 = false
+		}
+	}
+	b.reusePort = reusePort
+	return b
+}
+
+type StdNetEndpoint struct {
+	// AddrPort is the endpoint destination.
+	netip.AddrPort
+	// src is the current sticky source address and interface index, if supported.
+	src struct {
+		netip.Addr
+		ifidx int32
+	}
+}
+
+var (
+	_ Bind     = (*StdNetBind)(nil)
+	_ Endpoint = &StdNetEndpoint{}
+)
+
+func (*StdNetBind) ParseEndpoint(s string) (Endpoint, error) {
+	e, err := netip.ParseAddrPort(s)
+	if err != nil {
+		return nil, err
+	}
+	return &StdNetEndpoint{
+		AddrPort: e,
+	}, nil
+}
+
+func (e *StdNetEndpoint) ClearSrc() {
+	e.src.ifidx = 0
+	e.src.Addr = netip.Addr{}
+}
+
+func (e *StdNetEndpoint) DstIP() netip.Addr {
+	return e.AddrPort.Addr()
+}
+
+func (e *StdNetEndpoint) SrcIP() netip.Addr {
+	return e.src.Addr
+}
+
+func (e *StdNetEndpoint) SrcIfidx() int32 {
+	return e.src.ifidx
+}
+
+func (e *StdNetEndpoint) DstToBytes() []byte {
+	b, _ := e.AddrPort.MarshalBinary()
+	return b
+}
+
+func (e *StdNetEndpoint) DstToString() string {
+	return e.AddrPort.String()
+}
+
+func (e *StdNetEndpoint) SrcToString() string {
+	return e.src.Addr.String()
+}
+
+func (s *StdNetBind) listenNet(network string, host string, port int) (*net.UDPConn, int, error) {
+	lc := listenConfig()
+	if s.reusePort {
+		base := lc.Control
+		lc.Control = func(network, address string, c syscall.RawConn) error {
+			if base != nil {
+				if err := base(network, address, c); err != nil {
+					return err
+				}
+			}
+			return c.Control(func(fd uintptr) {
+				_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_REUSEPORT, 1)
+			})
+		}
+	}
+
+	addr := ":" + strconv.Itoa(port)
+	if host != "" {
+		addr = net.JoinHostPort(host, strconv.Itoa(port))
+	}
+
+	conn, err := lc.ListenPacket(context.Background(), network, addr)
+	if err != nil {
+		return nil, 0, err
+	}
+
+	// Retrieve port.
+	laddr := conn.LocalAddr()
+	uaddr, err := net.ResolveUDPAddr(
+		laddr.Network(),
+		laddr.String(),
+	)
+	if err != nil {
+		return nil, 0, err
+	}
+	return conn.(*net.UDPConn), uaddr.Port, nil
+}
+
+func (s *StdNetBind) openIPv4(port int) (*net.UDPConn, *ipv4.PacketConn, int, error) {
+	if !s.bindV4 {
+		return nil, nil, port, nil
+	}
+	host := s.listenAddr4
+	conn, actualPort, err := s.listenNet("udp4", host, port)
+	if err != nil {
+		if errors.Is(err, syscall.EAFNOSUPPORT) {
+			return nil, nil, port, nil
+		}
+		return nil, nil, port, err
+	}
+	if runtime.GOOS != "linux" {
+		return conn, nil, actualPort, nil
+	}
+	pc := ipv4.NewPacketConn(conn)
+	return conn, pc, actualPort, nil
+}
+
+func (s *StdNetBind) openIPv6(port int) (*net.UDPConn, *ipv6.PacketConn, int, error) {
+	if !s.bindV6 {
+		return nil, nil, port, nil
+	}
+	host := s.listenAddr6
+	conn, actualPort, err := s.listenNet("udp6", host, port)
+	if err != nil {
+		if errors.Is(err, syscall.EAFNOSUPPORT) {
+			return nil, nil, port, nil
+		}
+		return nil, nil, port, err
+	}
+	if runtime.GOOS != "linux" {
+		return conn, nil, actualPort, nil
+	}
+	pc := ipv6.NewPacketConn(conn)
+	return conn, pc, actualPort, nil
+}
+
+func (s *StdNetBind) Open(uport uint16) ([]ReceiveFunc, uint16, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	var err error
+	var tries int
+
+	if s.ipv4 != nil || s.ipv6 != nil {
+		return nil, 0, ErrBindAlreadyOpen
+	}
+
+	// Attempt to open ipv4 and ipv6 listeners on the same port.
+	// If uport is 0, we can retry on failure.
+again:
+	port := int(uport)
+	var v4conn *net.UDPConn
+	var v6conn *net.UDPConn
+	var v4pc *ipv4.PacketConn
+	var v6pc *ipv6.PacketConn
+
+	v4conn, v4pc, port, err = s.openIPv4(port)
+	if err != nil {
+		return nil, 0, err
+	}
+
+	// Listen on the same port as we're using for ipv4.
+	v6conn, v6pc, port, err = s.openIPv6(port)
+	if uport == 0 && errors.Is(err, syscall.EADDRINUSE) && tries < 100 {
+		if v4conn != nil {
+			v4conn.Close()
+		}
+		tries++
+		goto again
+	}
+	if err != nil {
+		if v4conn != nil {
+			v4conn.Close()
+		}
+		return nil, 0, err
+	}
+
+	var fns []ReceiveFunc
+	if v4conn != nil {
+		s.ipv4 = v4conn
+		if v4pc != nil {
+			s.ipv4PC = v4pc
+		}
+		fns = append(fns, s.makeReceiveIPv4(v4pc, v4conn))
+	}
+	if v6conn != nil {
+		s.ipv6 = v6conn
+		if v6pc != nil {
+			s.ipv6PC = v6pc
+		}
+		fns = append(fns, s.makeReceiveIPv6(v6pc, v6conn))
+	}
+	if len(fns) == 0 {
+		return nil, 0, syscall.EAFNOSUPPORT
+	}
+
+	return fns, uint16(port), nil
+}
+
+func (s *StdNetBind) makeReceiveIPv4(pc *ipv4.PacketConn, conn *net.UDPConn) ReceiveFunc {
+	return func(bufs [][]byte, sizes []int, eps []Endpoint) (n int, err error) {
+		msgs := s.ipv4MsgsPool.Get().(*[]ipv4.Message)
+		defer s.ipv4MsgsPool.Put(msgs)
+		for i := range bufs {
+			(*msgs)[i].Buffers[0] = bufs[i]
+		}
+		var numMsgs int
+		if runtime.GOOS == "linux" && pc != nil {
+			numMsgs, err = pc.ReadBatch(*msgs, 0)
+			if err != nil {
+				return 0, err
+			}
+		} else {
+			msg := &(*msgs)[0]
+			msg.N, msg.NN, _, msg.Addr, err = conn.ReadMsgUDP(msg.Buffers[0], msg.OOB)
+			if err != nil {
+				return 0, err
+			}
+			numMsgs = 1
+		}
+		for i := 0; i < numMsgs; i++ {
+			msg := &(*msgs)[i]
+			sizes[i] = msg.N
+			addrPort := msg.Addr.(*net.UDPAddr).AddrPort()
+			ep := &StdNetEndpoint{AddrPort: addrPort} // TODO: remove allocation
+			getSrcFromControl(msg.OOB[:msg.NN], ep)
+			eps[i] = ep
+		}
+		return numMsgs, nil
+	}
+}
+
+func (s *StdNetBind) makeReceiveIPv6(pc *ipv6.PacketConn, conn *net.UDPConn) ReceiveFunc {
+	return func(bufs [][]byte, sizes []int, eps []Endpoint) (n int, err error) {
+		msgs := s.ipv6MsgsPool.Get().(*[]ipv6.Message)
+		defer s.ipv6MsgsPool.Put(msgs)
+		for i := range bufs {
+			(*msgs)[i].Buffers[0] = bufs[i]
+		}
+		var numMsgs int
+		if runtime.GOOS == "linux" && pc != nil {
+			numMsgs, err = pc.ReadBatch(*msgs, 0)
+			if err != nil {
+				return 0, err
+			}
+		} else {
+			msg := &(*msgs)[0]
+			msg.N, msg.NN, _, msg.Addr, err = conn.ReadMsgUDP(msg.Buffers[0], msg.OOB)
+			if err != nil {
+				return 0, err
+			}
+			numMsgs = 1
+		}
+		for i := 0; i < numMsgs; i++ {
+			msg := &(*msgs)[i]
+			sizes[i] = msg.N
+			addrPort := msg.Addr.(*net.UDPAddr).AddrPort()
+			ep := &StdNetEndpoint{AddrPort: addrPort} // TODO: remove allocation
+			getSrcFromControl(msg.OOB[:msg.NN], ep)
+			eps[i] = ep
+		}
+		return numMsgs, nil
+	}
+}
+
+// TODO: When all Binds handle IdealBatchSize, remove this dynamic function and
+// rename the IdealBatchSize constant to BatchSize.
+func (s *StdNetBind) BatchSize() int {
+	if runtime.GOOS == "linux" {
+		return IdealBatchSize
+	}
+	return 1
+}
+
+func (s *StdNetBind) Close() error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	var err1, err2 error
+	if s.ipv4 != nil {
+		err1 = s.ipv4.Close()
+		s.ipv4 = nil
+		s.ipv4PC = nil
+	}
+	if s.ipv6 != nil {
+		err2 = s.ipv6.Close()
+		s.ipv6 = nil
+		s.ipv6PC = nil
+	}
+	s.blackhole4 = false
+	s.blackhole6 = false
+	if err1 != nil {
+		return err1
+	}
+	return err2
+}
+
+func (s *StdNetBind) Send(bufs [][]byte, endpoint Endpoint) error {
+	s.mu.Lock()
+	blackhole := s.blackhole4
+	conn := s.ipv4
+	var (
+		pc4 *ipv4.PacketConn
+		pc6 *ipv6.PacketConn
+	)
+	is6 := false
+	if endpoint.DstIP().Is6() {
+		blackhole = s.blackhole6
+		conn = s.ipv6
+		pc6 = s.ipv6PC
+		is6 = true
+	} else {
+		pc4 = s.ipv4PC
+	}
+	s.mu.Unlock()
+
+	if blackhole {
+		return nil
+	}
+	if conn == nil {
+		return syscall.EAFNOSUPPORT
+	}
+	if is6 {
+		return s.send6(conn, pc6, endpoint, bufs)
+	} else {
+		return s.send4(conn, pc4, endpoint, bufs)
+	}
+}
+
+func (s *StdNetBind) send4(conn *net.UDPConn, pc *ipv4.PacketConn, ep Endpoint, bufs [][]byte) error {
+	ua := s.udpAddrPool.Get().(*net.UDPAddr)
+	as4 := ep.DstIP().As4()
+	copy(ua.IP, as4[:])
+	ua.IP = ua.IP[:4]
+	ua.Port = int(ep.(*StdNetEndpoint).Port())
+	msgs := s.ipv4MsgsPool.Get().(*[]ipv4.Message)
+	for i, buf := range bufs {
+		(*msgs)[i].Buffers[0] = buf
+		(*msgs)[i].Addr = ua
+		setSrcControl(&(*msgs)[i].OOB, ep.(*StdNetEndpoint))
+	}
+	var (
+		n     int
+		err   error
+		start int
+	)
+	if runtime.GOOS == "linux" && pc != nil {
+		for {
+			n, err = pc.WriteBatch((*msgs)[start:len(bufs)], 0)
+			if err != nil || n == len((*msgs)[start:len(bufs)]) {
+				break
+			}
+			start += n
+		}
+	} else {
+		for i, buf := range bufs {
+			_, _, err = conn.WriteMsgUDP(buf, (*msgs)[i].OOB, ua)
+			if err != nil {
+				break
+			}
+		}
+	}
+	s.udpAddrPool.Put(ua)
+	s.ipv4MsgsPool.Put(msgs)
+	return err
+}
+
+func (s *StdNetBind) send6(conn *net.UDPConn, pc *ipv6.PacketConn, ep Endpoint, bufs [][]byte) error {
+	ua := s.udpAddrPool.Get().(*net.UDPAddr)
+	as16 := ep.DstIP().As16()
+	copy(ua.IP, as16[:])
+	ua.IP = ua.IP[:16]
+	ua.Port = int(ep.(*StdNetEndpoint).Port())
+	msgs := s.ipv6MsgsPool.Get().(*[]ipv6.Message)
+	for i, buf := range bufs {
+		(*msgs)[i].Buffers[0] = buf
+		(*msgs)[i].Addr = ua
+		setSrcControl(&(*msgs)[i].OOB, ep.(*StdNetEndpoint))
+	}
+	var (
+		n     int
+		err   error
+		start int
+	)
+	if runtime.GOOS == "linux" && pc != nil {
+		for {
+			n, err = pc.WriteBatch((*msgs)[start:len(bufs)], 0)
+			if err != nil || n == len((*msgs)[start:len(bufs)]) {
+				break
+			}
+			start += n
+		}
+	} else {
+		for i, buf := range bufs {
+			_, _, err = conn.WriteMsgUDP(buf, (*msgs)[i].OOB, ua)
+			if err != nil {
+				break
+			}
+		}
+	}
+	s.udpAddrPool.Put(ua)
+	s.ipv6MsgsPool.Put(msgs)
+	return err
+}
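
A short sketch of the bind's lifecycle as exercised by the adapter above: Open with port 0 picks a kernel-assigned port, Send hands up to BatchSize() datagrams to WriteBatch in one call on Linux, and Close unblocks the receive functions (the destination address here is arbitrary):

package main

import (
	"log"

	wgconn "github.com/slackhq/nebula/wgstack/conn"
)

func main() {
	bind := wgconn.NewStdNetBind()
	fns, port, err := bind.Open(0) // 0 = random port selection
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("bound port %d, %d receive funcs", port, len(fns))

	ep, err := bind.ParseEndpoint("127.0.0.1:4242")
	if err != nil {
		log.Fatal(err)
	}
	// Up to BatchSize() packets may be passed per Send call.
	if err := bind.Send([][]byte{[]byte("one"), []byte("two")}, ep); err != nil {
		log.Fatal(err)
	}
	_ = bind.Close()
}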

+ 131 - 0
wgstack/conn/conn.go

@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: MIT
+//
+// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
+
+package conn
+
+import (
+	"errors"
+	"fmt"
+	"net/netip"
+	"reflect"
+	"runtime"
+	"strings"
+)
+
+const (
+	IdealBatchSize = 128 // maximum number of packets handled per read and write
+)
+
+// A ReceiveFunc receives one or more packets from the network and writes them
+// into packets. On a successful read it returns the number of elements of
+// sizes, packets, and endpoints that should be evaluated. Some elements of
+// sizes may be zero, and callers should ignore them. Callers must pass sizes
+// and eps slices at least as long as packets, and none of these lengths may
+// exceed the associated Bind's BatchSize().
+type ReceiveFunc func(packets [][]byte, sizes []int, eps []Endpoint) (n int, err error)
+
+// A Bind listens on a port for both IPv6 and IPv4 UDP traffic.
+//
+// A Bind interface may also be a PeekLookAtSocketFd or BindSocketToInterface,
+// depending on the platform-specific implementation.
+type Bind interface {
+	// Open puts the Bind into a listening state on a given port and reports the actual
+	// port that it bound to. Passing zero results in a random selection.
+	// fns is the set of functions that will be called to receive packets.
+	Open(port uint16) (fns []ReceiveFunc, actualPort uint16, err error)
+
+	// Close closes the Bind listener.
+	// All fns returned by Open must return net.ErrClosed after a call to Close.
+	Close() error
+
+	// SetMark sets the mark for each packet sent through this Bind.
+	// This mark is passed to the kernel as the socket option SO_MARK.
+	SetMark(mark uint32) error
+
+	// Send writes one or more packets in bufs to address ep. The length of
+	// bufs must not exceed BatchSize().
+	Send(bufs [][]byte, ep Endpoint) error
+
+	// ParseEndpoint creates a new endpoint from a string.
+	ParseEndpoint(s string) (Endpoint, error)
+
+	// BatchSize is the number of buffers expected to be passed to
+	// the ReceiveFuncs, and the maximum expected to be passed to Send.
+	BatchSize() int
+}
+
+// BindSocketToInterface is implemented by Bind objects that support being
+// tied to a single network interface. Used by wireguard-windows.
+type BindSocketToInterface interface {
+	BindSocketToInterface4(interfaceIndex uint32, blackhole bool) error
+	BindSocketToInterface6(interfaceIndex uint32, blackhole bool) error
+}
+
+// PeekLookAtSocketFd is implemented by Bind objects that support having their
+// file descriptor peeked at. Used by wireguard-android.
+type PeekLookAtSocketFd interface {
+	PeekLookAtSocketFd4() (fd int, err error)
+	PeekLookAtSocketFd6() (fd int, err error)
+}
+
+// An Endpoint maintains the source/destination caching for a peer.
+//
+//	dst: the remote address of a peer ("endpoint" in uapi terminology)
+//	src: the local address from which datagrams originate going to the peer
+type Endpoint interface {
+	ClearSrc()           // clears the source address
+	SrcToString() string // returns the local source address (ip:port)
+	DstToString() string // returns the destination address (ip:port)
+	DstToBytes() []byte  // used for mac2 cookie calculations
+	DstIP() netip.Addr
+	SrcIP() netip.Addr
+}
+
+var (
+	ErrBindAlreadyOpen   = errors.New("bind is already open")
+	ErrWrongEndpointType = errors.New("endpoint type does not correspond with bind type")
+)
+
+func (fn ReceiveFunc) PrettyName() string {
+	name := runtime.FuncForPC(reflect.ValueOf(fn).Pointer()).Name()
+	// 0. cheese/taco.beansIPv6.func12.func21218-fm
+	name = strings.TrimSuffix(name, "-fm")
+	// 1. cheese/taco.beansIPv6.func12.func21218
+	if idx := strings.LastIndexByte(name, '/'); idx != -1 {
+		name = name[idx+1:]
+		// 2. taco.beansIPv6.func12.func21218
+	}
+	for {
+		var idx int
+		for idx = len(name) - 1; idx >= 0; idx-- {
+			if name[idx] < '0' || name[idx] > '9' {
+				break
+			}
+		}
+		if idx == len(name)-1 {
+			break
+		}
+		const dotFunc = ".func"
+		if !strings.HasSuffix(name[:idx+1], dotFunc) {
+			break
+		}
+		name = name[:idx+1-len(dotFunc)]
+		// 3. taco.beansIPv6.func12
+		// 4. taco.beansIPv6
+	}
+	if idx := strings.LastIndexByte(name, '.'); idx != -1 {
+		name = name[idx+1:]
+		// 5. beansIPv6
+	}
+	if name == "" {
+		return fmt.Sprintf("%p", fn)
+	}
+	if strings.HasSuffix(name, "IPv4") {
+		return "v4"
+	}
+	if strings.HasSuffix(name, "IPv6") {
+		return "v6"
+	}
+	return name
+}
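
The ReceiveFunc contract above translates into a consumer loop like this hedged sketch (1500 is an arbitrary MTU-ish buffer size):

package main

import (
	"errors"
	"log"
	"net"

	wgconn "github.com/slackhq/nebula/wgstack/conn"
)

func main() {
	bind := wgconn.NewDefaultBind()
	fns, port, err := bind.Open(0)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("listening on %d", port)
	for _, fn := range fns {
		go func(fn wgconn.ReceiveFunc) {
			batch := bind.BatchSize()
			bufs := make([][]byte, batch)
			for i := range bufs {
				bufs[i] = make([]byte, 1500)
			}
			sizes := make([]int, batch)
			eps := make([]wgconn.Endpoint, batch)
			for {
				n, err := fn(bufs, sizes, eps)
				if errors.Is(err, net.ErrClosed) {
					return // Close() was called on the bind
				}
				if err != nil {
					continue
				}
				for i := 0; i < n; i++ {
					if sizes[i] == 0 {
						continue // the contract allows zero-length elements
					}
					log.Printf("%s: %d bytes from %s", fn.PrettyName(), sizes[i], eps[i].DstToString())
				}
			}
		}(fn)
	}
	select {}
}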

+ 42 - 0
wgstack/conn/controlfns.go

@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: MIT
+//
+// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
+
+package conn
+
+import (
+	"net"
+	"syscall"
+)
+
+// UDP socket read/write buffer size (7MB). The value of 7MB is chosen as it is
+// the max supported by a default configuration of macOS. Some platforms will
+// silently clamp the value to other maximums, such as linux clamping to
+// net.core.{r,w}mem_max (see _linux.go for additional implementation that works
+// around this limitation)
+const socketBufferSize = 7 << 20
+
+// controlFn is the callback function signature from net.ListenConfig.Control.
+// It is used to apply platform specific configuration to the socket prior to
+// bind.
+type controlFn func(network, address string, c syscall.RawConn) error
+
+// controlFns is a list of functions that are called from the listen config
+// that can apply socket options.
+var controlFns = []controlFn{}
+
+// listenConfig returns a net.ListenConfig that applies the controlFns to the
+// socket prior to bind. This is used to apply socket buffer sizing and packet
+// information OOB configuration for sticky sockets.
+func listenConfig() *net.ListenConfig {
+	return &net.ListenConfig{
+		Control: func(network, address string, c syscall.RawConn) error {
+			for _, fn := range controlFns {
+				if err := fn(network, address, c); err != nil {
+					return err
+				}
+			}
+			return nil
+		},
+	}
+}
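
Platform files extend controlFns from init (controlfns_linux.go below is the real example). A hedged in-package sketch of the hook:

package conn

import "syscall"

func init() {
	controlFns = append(controlFns, func(network, address string, c syscall.RawConn) error {
		// runs for every socket before bind; network is "udp4" or "udp6"
		return c.Control(func(fd uintptr) {
			_ = fd // apply setsockopt-style tweaks to the raw fd here
		})
	})
}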

+ 62 - 0
wgstack/conn/controlfns_linux.go

@@ -0,0 +1,62 @@
+//go:build linux
+
+// SPDX-License-Identifier: MIT
+//
+// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
+
+package conn
+
+import (
+	"fmt"
+	"runtime"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+)
+
+func init() {
+	controlFns = append(controlFns,
+
+		// Attempt to set the socket buffer size beyond net.core.{r,w}mem_max by
+		// using SO_*BUFFORCE. This requires CAP_NET_ADMIN, and is allowed here to
+		// fail silently - the result of failure is lower performance on very fast
+		// links or high latency links.
+		func(network, address string, c syscall.RawConn) error {
+			return c.Control(func(fd uintptr) {
+				// Set up to *mem_max
+				_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_RCVBUF, socketBufferSize)
+				_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_SNDBUF, socketBufferSize)
+				// Set beyond *mem_max if CAP_NET_ADMIN
+				_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_RCVBUFFORCE, socketBufferSize)
+				_ = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, unix.SO_SNDBUFFORCE, socketBufferSize)
+			})
+		},
+
+		// Enable receiving of the packet information (IP_PKTINFO for IPv4,
+		// IPV6_PKTINFO for IPv6) that is used to implement sticky socket support.
+		func(network, address string, c syscall.RawConn) error {
+			var err error
+			switch network {
+			case "udp4":
+				if runtime.GOOS != "android" {
+					c.Control(func(fd uintptr) {
+						err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_PKTINFO, 1)
+					})
+				}
+			case "udp6":
+				c.Control(func(fd uintptr) {
+					if runtime.GOOS != "android" {
+						err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_RECVPKTINFO, 1)
+						if err != nil {
+							return
+						}
+					}
+					err = unix.SetsockoptInt(int(fd), unix.IPPROTO_IPV6, unix.IPV6_V6ONLY, 1)
+				})
+			default:
+				err = fmt.Errorf("unhandled network: %s: %w", network, unix.EINVAL)
+			}
+			return err
+		},
+	)
+}

+ 9 - 0
wgstack/conn/default.go

@@ -0,0 +1,9 @@
+//go:build !windows
+
+// SPDX-License-Identifier: MIT
+//
+// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
+
+package conn
+
+func NewDefaultBind() Bind { return NewStdNetBind() }

+ 64 - 0
wgstack/conn/mark_unix.go

@@ -0,0 +1,64 @@
+//go:build linux || openbsd || freebsd
+
+// SPDX-License-Identifier: MIT
+//
+// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
+
+package conn
+
+import (
+	"runtime"
+
+	"golang.org/x/sys/unix"
+)
+
+var fwmarkIoctl int
+
+func init() {
+	switch runtime.GOOS {
+	case "linux", "android":
+		fwmarkIoctl = 36 /* unix.SO_MARK */
+	case "freebsd":
+		fwmarkIoctl = 0x1015 /* unix.SO_USER_COOKIE */
+	case "openbsd":
+		fwmarkIoctl = 0x1021 /* unix.SO_RTABLE */
+	}
+}
+
+func (s *StdNetBind) SetMark(mark uint32) error {
+	var operr error
+	if fwmarkIoctl == 0 {
+		return nil
+	}
+	if s.ipv4 != nil {
+		fd, err := s.ipv4.SyscallConn()
+		if err != nil {
+			return err
+		}
+		err = fd.Control(func(fd uintptr) {
+			operr = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, fwmarkIoctl, int(mark))
+		})
+		if err == nil {
+			err = operr
+		}
+		if err != nil {
+			return err
+		}
+	}
+	if s.ipv6 != nil {
+		fd, err := s.ipv6.SyscallConn()
+		if err != nil {
+			return err
+		}
+		err = fd.Control(func(fd uintptr) {
+			operr = unix.SetsockoptInt(int(fd), unix.SOL_SOCKET, fwmarkIoctl, int(mark))
+		})
+		if err == nil {
+			err = operr
+		}
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
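
A small hedged sketch of SetMark in use; the mark value is arbitrary, and setting SO_MARK generally requires CAP_NET_ADMIN on Linux:

package main

import (
	"log"

	wgconn "github.com/slackhq/nebula/wgstack/conn"
)

func main() {
	bind := wgconn.NewStdNetBind()
	if _, _, err := bind.Open(0); err != nil {
		log.Fatal(err)
	}
	// Pair with a policy rule (e.g. `ip rule add fwmark 0x80 ...`) so tunnel
	// traffic cannot be routed back into the tunnel itself.
	if err := bind.SetMark(0x80); err != nil {
		log.Fatal(err)
	}
	_ = bind.Close()
}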

+ 116 - 0
wgstack/conn/sticky_linux.go

@@ -0,0 +1,116 @@
+//go:build linux && !android
+
+// SPDX-License-Identifier: MIT
+//
+// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
+
+package conn
+
+import (
+	"net/netip"
+	"unsafe"
+
+	"golang.org/x/sys/unix"
+)
+
+// getSrcFromControl parses the control for PKTINFO and if found updates ep with
+// the source information found.
+func getSrcFromControl(control []byte, ep *StdNetEndpoint) {
+	ep.ClearSrc()
+
+	var (
+		hdr  unix.Cmsghdr
+		data []byte
+		rem  []byte = control
+		err  error
+	)
+
+	for len(rem) > unix.SizeofCmsghdr {
+		hdr, data, rem, err = unix.ParseOneSocketControlMessage(rem)
+		if err != nil {
+			return
+		}
+
+		if hdr.Level == unix.IPPROTO_IP &&
+			hdr.Type == unix.IP_PKTINFO {
+
+			info := pktInfoFromBuf[unix.Inet4Pktinfo](data)
+			ep.src.Addr = netip.AddrFrom4(info.Spec_dst)
+			ep.src.ifidx = info.Ifindex
+
+			return
+		}
+
+		if hdr.Level == unix.IPPROTO_IPV6 &&
+			hdr.Type == unix.IPV6_PKTINFO {
+
+			info := pktInfoFromBuf[unix.Inet6Pktinfo](data)
+			ep.src.Addr = netip.AddrFrom16(info.Addr)
+			ep.src.ifidx = int32(info.Ifindex)
+
+			return
+		}
+	}
+}
+
+// pktInfoFromBuf returns type T populated from the provided buf via copy(). It
+// panics if buf is of insufficient size.
+func pktInfoFromBuf[T unix.Inet4Pktinfo | unix.Inet6Pktinfo](buf []byte) (t T) {
+	size := int(unsafe.Sizeof(t))
+	if len(buf) < size {
+		panic("pktInfoFromBuf: buffer too small")
+	}
+	copy(unsafe.Slice((*byte)(unsafe.Pointer(&t)), size), buf)
+	return t
+}
+
+// setSrcControl sets an IP{V6}_PKTINFO in control based on the source address
+// and source ifindex found in ep. control's len will be set to 0 in the event
+// that ep is a default value.
+func setSrcControl(control *[]byte, ep *StdNetEndpoint) {
+	*control = (*control)[:cap(*control)]
+	if len(*control) < int(unsafe.Sizeof(unix.Cmsghdr{})) {
+		*control = (*control)[:0]
+		return
+	}
+
+	if ep.src.ifidx == 0 && !ep.SrcIP().IsValid() {
+		*control = (*control)[:0]
+		return
+	}
+
+	if len(*control) < srcControlSize {
+		*control = (*control)[:0]
+		return
+	}
+
+	hdr := (*unix.Cmsghdr)(unsafe.Pointer(&(*control)[0]))
+	if ep.SrcIP().Is4() {
+		hdr.Level = unix.IPPROTO_IP
+		hdr.Type = unix.IP_PKTINFO
+		hdr.SetLen(unix.CmsgLen(unix.SizeofInet4Pktinfo))
+
+		info := (*unix.Inet4Pktinfo)(unsafe.Pointer(&(*control)[unix.SizeofCmsghdr]))
+		info.Ifindex = ep.src.ifidx
+		if ep.SrcIP().IsValid() {
+			info.Spec_dst = ep.SrcIP().As4()
+		}
+		*control = (*control)[:unix.CmsgSpace(unix.SizeofInet4Pktinfo)]
+	} else {
+		hdr.Level = unix.IPPROTO_IPV6
+		hdr.Type = unix.IPV6_PKTINFO
+		hdr.SetLen(unix.CmsgLen(unix.SizeofInet6Pktinfo))
+
+		info := (*unix.Inet6Pktinfo)(unsafe.Pointer(&(*control)[unix.SizeofCmsghdr]))
+		info.Ifindex = uint32(ep.src.ifidx)
+		if ep.SrcIP().IsValid() {
+			info.Addr = ep.SrcIP().As16()
+		}
+		*control = (*control)[:unix.CmsgSpace(unix.SizeofInet6Pktinfo)]
+	}
+
+}
+
+var srcControlSize = unix.CmsgSpace(unix.SizeofInet6Pktinfo)
+
+const StdNetSupportsStickySockets = true
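
The two halves above are inverses. A hedged in-package sketch of the PKTINFO round trip (it would have to live beside this file, e.g. in a test, since the helpers are unexported):

package conn

import "net/netip"

func exampleStickyRoundTrip() bool {
	ep := &StdNetEndpoint{}
	ep.src.Addr = netip.MustParseAddr("192.0.2.1")
	ep.src.ifidx = 3

	control := make([]byte, srcControlSize)
	setSrcControl(&control, ep) // encodes an IP_PKTINFO cmsg for the v4 source

	parsed := &StdNetEndpoint{}
	getSrcFromControl(control, parsed)
	return parsed.SrcIP() == ep.SrcIP() && parsed.SrcIfidx() == 3
}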

+ 42 - 0
wgstack/tun/checksum.go

@@ -0,0 +1,42 @@
+package tun
+
+import "encoding/binary"
+
+// TODO: Explore SIMD and/or other assembly optimizations.
+func checksumNoFold(b []byte, initial uint64) uint64 {
+	ac := initial
+	i := 0
+	n := len(b)
+	for n >= 4 {
+		ac += uint64(binary.BigEndian.Uint32(b[i : i+4]))
+		n -= 4
+		i += 4
+	}
+	for n >= 2 {
+		ac += uint64(binary.BigEndian.Uint16(b[i : i+2]))
+		n -= 2
+		i += 2
+	}
+	if n == 1 {
+		ac += uint64(b[i]) << 8
+	}
+	return ac
+}
+
+func checksum(b []byte, initial uint64) uint16 {
+	ac := checksumNoFold(b, initial)
+	ac = (ac >> 16) + (ac & 0xffff)
+	ac = (ac >> 16) + (ac & 0xffff)
+	ac = (ac >> 16) + (ac & 0xffff)
+	ac = (ac >> 16) + (ac & 0xffff)
+	return uint16(ac)
+}
+
+func pseudoHeaderChecksumNoFold(protocol uint8, srcAddr, dstAddr []byte, totalLen uint16) uint64 {
+	sum := checksumNoFold(srcAddr, 0)
+	sum = checksumNoFold(dstAddr, sum)
+	sum = checksumNoFold([]byte{0, protocol}, sum)
+	tmp := make([]byte, 2)
+	binary.BigEndian.PutUint16(tmp, totalLen)
+	return checksumNoFold(tmp, sum)
+}
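
A worked example of the fold: summing the 16-bit words 0xffff and 0x0001 overflows, the carry wraps back around, and checksum yields 0x0001 (a real header stores the complement, ^0x0001 == 0xfffe). In-package sketch:

package tun

import "fmt"

func exampleChecksum() {
	fmt.Printf("%#06x\n", checksum([]byte{0xff, 0xff, 0x00, 0x01}, 0)) // prints 0x0001
}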

+ 3 - 0
wgstack/tun/export.go

@@ -0,0 +1,3 @@
+package tun
+
+const VirtioNetHdrLen = virtioNetHdrLen

+ 630 - 0
wgstack/tun/tcp_offload_linux.go

@@ -0,0 +1,630 @@
+//go:build linux
+
+// SPDX-License-Identifier: MIT
+//
+// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
+
+package tun
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"io"
+	"unsafe"
+
+	wgconn "github.com/slackhq/nebula/wgstack/conn"
+	"golang.org/x/sys/unix"
+)
+
+var ErrTooManySegments = errors.New("tun: too many segments for TSO")
+
+const tcpFlagsOffset = 13
+
+const (
+	tcpFlagFIN uint8 = 0x01
+	tcpFlagPSH uint8 = 0x08
+	tcpFlagACK uint8 = 0x10
+)
+
+// virtioNetHdr is defined in the kernel in include/uapi/linux/virtio_net.h. The
+// kernel symbol is virtio_net_hdr.
+type virtioNetHdr struct {
+	flags      uint8
+	gsoType    uint8
+	hdrLen     uint16
+	gsoSize    uint16
+	csumStart  uint16
+	csumOffset uint16
+}
+
+func (v *virtioNetHdr) decode(b []byte) error {
+	if len(b) < virtioNetHdrLen {
+		return io.ErrShortBuffer
+	}
+	copy(unsafe.Slice((*byte)(unsafe.Pointer(v)), virtioNetHdrLen), b[:virtioNetHdrLen])
+	return nil
+}
+
+func (v *virtioNetHdr) encode(b []byte) error {
+	if len(b) < virtioNetHdrLen {
+		return io.ErrShortBuffer
+	}
+	copy(b[:virtioNetHdrLen], unsafe.Slice((*byte)(unsafe.Pointer(v)), virtioNetHdrLen))
+	return nil
+}
+
+const (
+	// virtioNetHdrLen is the length in bytes of virtioNetHdr. This matches the
+	// shape of the C ABI for its kernel counterpart -- sizeof(virtio_net_hdr).
+	virtioNetHdrLen = int(unsafe.Sizeof(virtioNetHdr{}))
+)
+
+// flowKey represents the key for a flow.
+type flowKey struct {
+	srcAddr, dstAddr [16]byte
+	srcPort, dstPort uint16
+	rxAck            uint32 // varying ack values should not be coalesced. Treat them as separate flows.
+}
+
+// tcpGROTable holds flow and coalescing information for the purposes of GRO.
+type tcpGROTable struct {
+	itemsByFlow map[flowKey][]tcpGROItem
+	itemsPool   [][]tcpGROItem
+}
+
+func newTCPGROTable() *tcpGROTable {
+	t := &tcpGROTable{
+		itemsByFlow: make(map[flowKey][]tcpGROItem, wgconn.IdealBatchSize),
+		itemsPool:   make([][]tcpGROItem, wgconn.IdealBatchSize),
+	}
+	for i := range t.itemsPool {
+		t.itemsPool[i] = make([]tcpGROItem, 0, wgconn.IdealBatchSize)
+	}
+	return t
+}
+
+func newFlowKey(pkt []byte, srcAddr, dstAddr, tcphOffset int) flowKey {
+	key := flowKey{}
+	addrSize := dstAddr - srcAddr
+	copy(key.srcAddr[:], pkt[srcAddr:dstAddr])
+	copy(key.dstAddr[:], pkt[dstAddr:dstAddr+addrSize])
+	key.srcPort = binary.BigEndian.Uint16(pkt[tcphOffset:])
+	key.dstPort = binary.BigEndian.Uint16(pkt[tcphOffset+2:])
+	key.rxAck = binary.BigEndian.Uint32(pkt[tcphOffset+8:])
+	return key
+}
+
+// lookupOrInsert looks up a flow for the provided packet and metadata,
+// returning the packets found for the flow, or inserting a new one if none
+// is found.
+func (t *tcpGROTable) lookupOrInsert(pkt []byte, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex int) ([]tcpGROItem, bool) {
+	key := newFlowKey(pkt, srcAddrOffset, dstAddrOffset, tcphOffset)
+	items, ok := t.itemsByFlow[key]
+	if ok {
+		return items, ok
+	}
+	// TODO: insert() performs another map lookup. This could be rearranged to avoid.
+	t.insert(pkt, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex)
+	return nil, false
+}
+
+// insert an item in the table for the provided packet and packet metadata.
+func (t *tcpGROTable) insert(pkt []byte, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex int) {
+	key := newFlowKey(pkt, srcAddrOffset, dstAddrOffset, tcphOffset)
+	item := tcpGROItem{
+		key:       key,
+		bufsIndex: uint16(bufsIndex),
+		gsoSize:   uint16(len(pkt[tcphOffset+tcphLen:])),
+		iphLen:    uint8(tcphOffset),
+		tcphLen:   uint8(tcphLen),
+		sentSeq:   binary.BigEndian.Uint32(pkt[tcphOffset+4:]),
+		pshSet:    pkt[tcphOffset+tcpFlagsOffset]&tcpFlagPSH != 0,
+	}
+	items, ok := t.itemsByFlow[key]
+	if !ok {
+		items = t.newItems()
+	}
+	items = append(items, item)
+	t.itemsByFlow[key] = items
+}
+
+func (t *tcpGROTable) updateAt(item tcpGROItem, i int) {
+	items, _ := t.itemsByFlow[item.key]
+	items[i] = item
+}
+
+func (t *tcpGROTable) deleteAt(key flowKey, i int) {
+	items, _ := t.itemsByFlow[key]
+	items = append(items[:i], items[i+1:]...)
+	t.itemsByFlow[key] = items
+}
+
+// tcpGROItem represents bookkeeping data for a TCP packet during the lifetime
+// of a GRO evaluation across a vector of packets.
+type tcpGROItem struct {
+	key       flowKey
+	sentSeq   uint32 // the sequence number
+	bufsIndex uint16 // the index into the original bufs slice
+	numMerged uint16 // the number of packets merged into this item
+	gsoSize   uint16 // payload size
+	iphLen    uint8  // ip header len
+	tcphLen   uint8  // tcp header len
+	pshSet    bool   // psh flag is set
+}
+
+func (t *tcpGROTable) newItems() []tcpGROItem {
+	var items []tcpGROItem
+	items, t.itemsPool = t.itemsPool[len(t.itemsPool)-1], t.itemsPool[:len(t.itemsPool)-1]
+	return items
+}
+
+func (t *tcpGROTable) reset() {
+	for k, items := range t.itemsByFlow {
+		items = items[:0]
+		t.itemsPool = append(t.itemsPool, items)
+		delete(t.itemsByFlow, k)
+	}
+}
+
+// canCoalesce represents the outcome of checking if two TCP packets are
+// candidates for coalescing.
+type canCoalesce int
+
+const (
+	coalescePrepend     canCoalesce = -1
+	coalesceUnavailable canCoalesce = 0
+	coalesceAppend      canCoalesce = 1
+)
+
+// tcpPacketsCanCoalesce evaluates if pkt can be coalesced with the packet
+// described by item. This function makes considerations that match the kernel's
+// GRO self tests, which can be found in tools/testing/selftests/net/gro.c.
+func tcpPacketsCanCoalesce(pkt []byte, iphLen, tcphLen uint8, seq uint32, pshSet bool, gsoSize uint16, item tcpGROItem, bufs [][]byte, bufsOffset int) canCoalesce {
+	pktTarget := bufs[item.bufsIndex][bufsOffset:]
+	if tcphLen != item.tcphLen {
+		// cannot coalesce with unequal tcp options len
+		return coalesceUnavailable
+	}
+	if tcphLen > 20 {
+		if !bytes.Equal(pkt[iphLen+20:iphLen+tcphLen], pktTarget[item.iphLen+20:iphLen+tcphLen]) {
+			// cannot coalesce with unequal tcp options
+			return coalesceUnavailable
+		}
+	}
+	if pkt[0]>>4 == 6 {
+		if pkt[0] != pktTarget[0] || pkt[1]>>4 != pktTarget[1]>>4 {
+			// cannot coalesce with unequal Traffic class values
+			return coalesceUnavailable
+		}
+		if pkt[7] != pktTarget[7] {
+			// cannot coalesce with unequal Hop limit values
+			return coalesceUnavailable
+		}
+	} else {
+		if pkt[1] != pktTarget[1] {
+			// cannot coalesce with unequal ToS values
+			return coalesceUnavailable
+		}
+		if pkt[6]>>5 != pktTarget[6]>>5 {
+			// cannot coalesce with unequal DF or reserved bits. MF is checked
+			// further up the stack.
+			return coalesceUnavailable
+		}
+		if pkt[8] != pktTarget[8] {
+			// cannot coalesce with unequal TTL values
+			return coalesceUnavailable
+		}
+	}
+	// seq adjacency
+	lhsLen := item.gsoSize
+	lhsLen += item.numMerged * item.gsoSize
+	if seq == item.sentSeq+uint32(lhsLen) { // pkt aligns following item from a seq num perspective
+		if item.pshSet {
+			// We cannot append to a segment that has the PSH flag set; PSH
+			// can only be set on the final segment in a reassembled group.
+			return coalesceUnavailable
+		}
+		if len(pktTarget[iphLen+tcphLen:])%int(item.gsoSize) != 0 {
+			// A smaller than gsoSize packet has been appended previously.
+			// Nothing can come after a smaller packet on the end.
+			return coalesceUnavailable
+		}
+		if gsoSize > item.gsoSize {
+			// We cannot have a larger packet following a smaller one.
+			return coalesceUnavailable
+		}
+		return coalesceAppend
+	} else if seq+uint32(gsoSize) == item.sentSeq { // pkt aligns in front of item from a seq num perspective
+		if pshSet {
+			// We cannot prepend with a segment that has the PSH flag set; PSH
+			// can only be set on the final segment in a reassembled group.
+			return coalesceUnavailable
+		}
+		if gsoSize < item.gsoSize {
+			// We cannot have a larger packet following a smaller one.
+			return coalesceUnavailable
+		}
+		if gsoSize > item.gsoSize && item.numMerged > 0 {
+			// There's at least one previous merge, and we're larger than all
+			// previous. This would put multiple smaller packets on the end.
+			return coalesceUnavailable
+		}
+		return coalescePrepend
+	}
+	return coalesceUnavailable
+}
+
+func tcpChecksumValid(pkt []byte, iphLen uint8, isV6 bool) bool {
+	srcAddrAt := ipv4SrcAddrOffset
+	addrSize := 4
+	if isV6 {
+		srcAddrAt = ipv6SrcAddrOffset
+		addrSize = 16
+	}
+	tcpTotalLen := uint16(len(pkt) - int(iphLen))
+	tcpCSumNoFold := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, pkt[srcAddrAt:srcAddrAt+addrSize], pkt[srcAddrAt+addrSize:srcAddrAt+addrSize*2], tcpTotalLen)
+	return ^checksum(pkt[iphLen:], tcpCSumNoFold) == 0
+}
+
+// coalesceResult represents the result of attempting to coalesce two TCP
+// packets.
+type coalesceResult int
+
+const (
+	coalesceInsufficientCap coalesceResult = 0
+	coalescePSHEnding       coalesceResult = 1
+	coalesceItemInvalidCSum coalesceResult = 2
+	coalescePktInvalidCSum  coalesceResult = 3
+	coalesceSuccess         coalesceResult = 4
+)
+
+// coalesceTCPPackets attempts to coalesce pkt with the packet described by
+// item, returning the outcome. This function may swap bufs elements in the
+// event of a prepend as item's bufs index is already being tracked for writing
+// to a Device.
+func coalesceTCPPackets(mode canCoalesce, pkt []byte, pktBuffsIndex int, gsoSize uint16, seq uint32, pshSet bool, item *tcpGROItem, bufs [][]byte, bufsOffset int, isV6 bool) coalesceResult {
+	var pktHead []byte // the packet that will end up at the front
+	headersLen := item.iphLen + item.tcphLen
+	coalescedLen := len(bufs[item.bufsIndex][bufsOffset:]) + len(pkt) - int(headersLen)
+
+	// Copy data
+	if mode == coalescePrepend {
+		pktHead = pkt
+		if cap(pkt)-bufsOffset < coalescedLen {
+			// We don't want to allocate a new underlying array if capacity is
+			// too small.
+			return coalesceInsufficientCap
+		}
+		if pshSet {
+			return coalescePSHEnding
+		}
+		if item.numMerged == 0 {
+			if !tcpChecksumValid(bufs[item.bufsIndex][bufsOffset:], item.iphLen, isV6) {
+				return coalesceItemInvalidCSum
+			}
+		}
+		if !tcpChecksumValid(pkt, item.iphLen, isV6) {
+			return coalescePktInvalidCSum
+		}
+		item.sentSeq = seq
+		extendBy := coalescedLen - len(pktHead)
+		bufs[pktBuffsIndex] = append(bufs[pktBuffsIndex], make([]byte, extendBy)...)
+		copy(bufs[pktBuffsIndex][bufsOffset+len(pkt):], bufs[item.bufsIndex][bufsOffset+int(headersLen):])
+		// Flip the slice headers in bufs as part of prepend. The index of item
+		// is already being tracked for writing.
+		bufs[item.bufsIndex], bufs[pktBuffsIndex] = bufs[pktBuffsIndex], bufs[item.bufsIndex]
+	} else {
+		pktHead = bufs[item.bufsIndex][bufsOffset:]
+		if cap(pktHead)-bufsOffset < coalescedLen {
+			// We don't want to allocate a new underlying array if capacity is
+			// too small.
+			return coalesceInsufficientCap
+		}
+		if item.numMerged == 0 {
+			if !tcpChecksumValid(bufs[item.bufsIndex][bufsOffset:], item.iphLen, isV6) {
+				return coalesceItemInvalidCSum
+			}
+		}
+		if !tcpChecksumValid(pkt, item.iphLen, isV6) {
+			return coalescePktInvalidCSum
+		}
+		if pshSet {
+			// We are appending a segment with PSH set.
+			item.pshSet = pshSet
+			pktHead[item.iphLen+tcpFlagsOffset] |= tcpFlagPSH
+		}
+		extendBy := len(pkt) - int(headersLen)
+		bufs[item.bufsIndex] = append(bufs[item.bufsIndex], make([]byte, extendBy)...)
+		copy(bufs[item.bufsIndex][bufsOffset+len(pktHead):], pkt[headersLen:])
+	}
+
+	if gsoSize > item.gsoSize {
+		item.gsoSize = gsoSize
+	}
+	hdr := virtioNetHdr{
+		flags:      unix.VIRTIO_NET_HDR_F_NEEDS_CSUM, // this turns into CHECKSUM_PARTIAL in the skb
+		hdrLen:     uint16(headersLen),
+		gsoSize:    uint16(item.gsoSize),
+		csumStart:  uint16(item.iphLen),
+		csumOffset: 16,
+	}
+
+	// Recalculate the total len (IPv4) or payload len (IPv6). Recalculate the
+	// (IPv4) header checksum.
+	if isV6 {
+		hdr.gsoType = unix.VIRTIO_NET_HDR_GSO_TCPV6
+		binary.BigEndian.PutUint16(pktHead[4:], uint16(coalescedLen)-uint16(item.iphLen)) // set new payload len
+	} else {
+		hdr.gsoType = unix.VIRTIO_NET_HDR_GSO_TCPV4
+		pktHead[10], pktHead[11] = 0, 0                               // clear checksum field
+		binary.BigEndian.PutUint16(pktHead[2:], uint16(coalescedLen)) // set new total length
+		iphCSum := ^checksum(pktHead[:item.iphLen], 0)                // compute checksum
+		binary.BigEndian.PutUint16(pktHead[10:], iphCSum)             // set checksum field
+	}
+	hdr.encode(bufs[item.bufsIndex][bufsOffset-virtioNetHdrLen:])
+
+	// Calculate the pseudo header checksum and place it at the TCP checksum
+	// offset. Downstream checksum offloading will combine this with computation
+	// of the tcp header and payload checksum.
+	addrLen := 4
+	addrOffset := ipv4SrcAddrOffset
+	if isV6 {
+		addrLen = 16
+		addrOffset = ipv6SrcAddrOffset
+	}
+	srcAddrAt := bufsOffset + addrOffset
+	srcAddr := bufs[item.bufsIndex][srcAddrAt : srcAddrAt+addrLen]
+	dstAddr := bufs[item.bufsIndex][srcAddrAt+addrLen : srcAddrAt+addrLen*2]
+	psum := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, srcAddr, dstAddr, uint16(coalescedLen-int(item.iphLen)))
+	binary.BigEndian.PutUint16(pktHead[hdr.csumStart+hdr.csumOffset:], checksum([]byte{}, psum))
+
+	item.numMerged++
+	return coalesceSuccess
+}
+
+const (
+	ipv4FlagMoreFragments uint8 = 0x20
+)
+
+const (
+	ipv4SrcAddrOffset = 12
+	ipv6SrcAddrOffset = 8
+	maxUint16         = 1<<16 - 1
+)
+
+// tcpGRO evaluates the TCP packet at pktI in bufs for coalescing with
+// existing packets tracked in table. It will return false when pktI is not
+// coalesced, otherwise true. This indicates to the caller if bufs[pktI]
+// should be written to the Device.
+func tcpGRO(bufs [][]byte, offset int, pktI int, table *tcpGROTable, isV6 bool) (pktCoalesced bool) {
+	pkt := bufs[pktI][offset:]
+	if len(pkt) > maxUint16 {
+		// A valid IPv4 or IPv6 packet will never exceed this.
+		return false
+	}
+	iphLen := int((pkt[0] & 0x0F) * 4)
+	if isV6 {
+		iphLen = 40
+		ipv6HPayloadLen := int(binary.BigEndian.Uint16(pkt[4:]))
+		if ipv6HPayloadLen != len(pkt)-iphLen {
+			return false
+		}
+	} else {
+		totalLen := int(binary.BigEndian.Uint16(pkt[2:]))
+		if totalLen != len(pkt) {
+			return false
+		}
+	}
+	if len(pkt) < iphLen {
+		return false
+	}
+	tcphLen := int((pkt[iphLen+12] >> 4) * 4)
+	if tcphLen < 20 || tcphLen > 60 {
+		return false
+	}
+	if len(pkt) < iphLen+tcphLen {
+		return false
+	}
+	if !isV6 {
+		if pkt[6]&ipv4FlagMoreFragments != 0 || pkt[6]<<3 != 0 || pkt[7] != 0 {
+			// no GRO support for fragmented segments for now
+			return false
+		}
+	}
+	tcpFlags := pkt[iphLen+tcpFlagsOffset]
+	var pshSet bool
+	// not a candidate if any non-ACK flags (except PSH+ACK) are set
+	if tcpFlags != tcpFlagACK {
+		if pkt[iphLen+tcpFlagsOffset] != tcpFlagACK|tcpFlagPSH {
+			return false
+		}
+		pshSet = true
+	}
+	gsoSize := uint16(len(pkt) - tcphLen - iphLen)
+	// not a candidate if payload len is 0
+	if gsoSize < 1 {
+		return false
+	}
+	seq := binary.BigEndian.Uint32(pkt[iphLen+4:])
+	srcAddrOffset := ipv4SrcAddrOffset
+	addrLen := 4
+	if isV6 {
+		srcAddrOffset = ipv6SrcAddrOffset
+		addrLen = 16
+	}
+	items, existing := table.lookupOrInsert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, tcphLen, pktI)
+	if !existing {
+		return false
+	}
+	for i := len(items) - 1; i >= 0; i-- {
+		// In the best case of packets arriving in order iterating in reverse is
+		// more efficient if there are multiple items for a given flow. This
+		// also enables a natural table.deleteAt() in the
+		// coalesceItemInvalidCSum case without the need for index tracking.
+		// This algorithm makes a best effort to coalesce in the event of
+		// unordered packets, where pkt may land anywhere in items from a
+		// sequence number perspective, however once an item is inserted into
+		// the table it is never compared across other items later.
+		item := items[i]
+		can := tcpPacketsCanCoalesce(pkt, uint8(iphLen), uint8(tcphLen), seq, pshSet, gsoSize, item, bufs, offset)
+		if can != coalesceUnavailable {
+			result := coalesceTCPPackets(can, pkt, pktI, gsoSize, seq, pshSet, &item, bufs, offset, isV6)
+			switch result {
+			case coalesceSuccess:
+				table.updateAt(item, i)
+				return true
+			case coalesceItemInvalidCSum:
+				// delete the item with an invalid csum
+				table.deleteAt(item.key, i)
+			case coalescePktInvalidCSum:
+				// no point in inserting an item that we can't coalesce
+				return false
+			default:
+			}
+		}
+	}
+	// failed to coalesce with any other packets; store the item in the flow
+	table.insert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, tcphLen, pktI)
+	return false
+}
+
+func isTCP4NoIPOptions(b []byte) bool {
+	if len(b) < 40 {
+		return false
+	}
+	if b[0]>>4 != 4 {
+		return false
+	}
+	if b[0]&0x0F != 5 {
+		return false
+	}
+	if b[9] != unix.IPPROTO_TCP {
+		return false
+	}
+	return true
+}
+
+func isTCP6NoEH(b []byte) bool {
+	if len(b) < 60 {
+		return false
+	}
+	if b[0]>>4 != 6 {
+		return false
+	}
+	if b[6] != unix.IPPROTO_TCP {
+		return false
+	}
+	return true
+}
+
+// handleGRO evaluates bufs for GRO, and writes the indices of the resulting
+// packets into toWrite. toWrite, tcp4Table, and tcp6Table should initially be
+// empty (but non-nil), and are passed in to save allocs as the caller may reset
+// and recycle them across vectors of packets.
+func handleGRO(bufs [][]byte, offset int, tcp4Table, tcp6Table *tcpGROTable, toWrite *[]int) error {
+	for i := range bufs {
+		if offset < virtioNetHdrLen || offset > len(bufs[i])-1 {
+			return errors.New("invalid offset")
+		}
+		var coalesced bool
+		switch {
+		case isTCP4NoIPOptions(bufs[i][offset:]): // ipv4 packets w/IP options do not coalesce
+			coalesced = tcpGRO(bufs, offset, i, tcp4Table, false)
+		case isTCP6NoEH(bufs[i][offset:]): // ipv6 packets w/extension headers do not coalesce
+			coalesced = tcpGRO(bufs, offset, i, tcp6Table, true)
+		}
+		if !coalesced {
+			hdr := virtioNetHdr{}
+			err := hdr.encode(bufs[i][offset-virtioNetHdrLen:])
+			if err != nil {
+				return err
+			}
+			*toWrite = append(*toWrite, i)
+		}
+	}
+	return nil
+}
+
+// tcpTSO splits packets from in into outBuffs, writing the size of each
+// element into sizes. It returns the number of buffers populated, and/or an
+// error.
+func tcpTSO(in []byte, hdr virtioNetHdr, outBuffs [][]byte, sizes []int, outOffset int) (int, error) {
+	iphLen := int(hdr.csumStart)
+	srcAddrOffset := ipv6SrcAddrOffset
+	addrLen := 16
+	if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_TCPV4 {
+		in[10], in[11] = 0, 0 // clear ipv4 header checksum
+		srcAddrOffset = ipv4SrcAddrOffset
+		addrLen = 4
+	}
+	tcpCSumAt := int(hdr.csumStart + hdr.csumOffset)
+	in[tcpCSumAt], in[tcpCSumAt+1] = 0, 0 // clear tcp checksum
+	firstTCPSeqNum := binary.BigEndian.Uint32(in[hdr.csumStart+4:])
+	nextSegmentDataAt := int(hdr.hdrLen)
+	i := 0
+	for ; nextSegmentDataAt < len(in); i++ {
+		if i == len(outBuffs) {
+			return i - 1, ErrTooManySegments
+		}
+		nextSegmentEnd := nextSegmentDataAt + int(hdr.gsoSize)
+		if nextSegmentEnd > len(in) {
+			nextSegmentEnd = len(in)
+		}
+		segmentDataLen := nextSegmentEnd - nextSegmentDataAt
+		totalLen := int(hdr.hdrLen) + segmentDataLen
+		sizes[i] = totalLen
+		out := outBuffs[i][outOffset:]
+
+		copy(out, in[:iphLen])
+		if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_TCPV4 {
+			// For IPv4 we are responsible for incrementing the ID field,
+			// updating the total len field, and recalculating the header
+			// checksum.
+			if i > 0 {
+				id := binary.BigEndian.Uint16(out[4:])
+				id += uint16(i)
+				binary.BigEndian.PutUint16(out[4:], id)
+			}
+			binary.BigEndian.PutUint16(out[2:], uint16(totalLen))
+			ipv4CSum := ^checksum(out[:iphLen], 0)
+			binary.BigEndian.PutUint16(out[10:], ipv4CSum)
+		} else {
+			// For IPv6 we are responsible for updating the payload length field.
+			binary.BigEndian.PutUint16(out[4:], uint16(totalLen-iphLen))
+		}
+
+		// TCP header
+		copy(out[hdr.csumStart:hdr.hdrLen], in[hdr.csumStart:hdr.hdrLen])
+		tcpSeq := firstTCPSeqNum + uint32(hdr.gsoSize*uint16(i))
+		binary.BigEndian.PutUint32(out[hdr.csumStart+4:], tcpSeq)
+		if nextSegmentEnd != len(in) {
+			// FIN and PSH should only be set on last segment
+			clearFlags := tcpFlagFIN | tcpFlagPSH
+			out[hdr.csumStart+tcpFlagsOffset] &^= clearFlags
+		}
+
+		// payload
+		copy(out[hdr.hdrLen:], in[nextSegmentDataAt:nextSegmentEnd])
+
+		// TCP checksum
+		tcpHLen := int(hdr.hdrLen - hdr.csumStart)
+		tcpLenForPseudo := uint16(tcpHLen + segmentDataLen)
+		tcpCSumNoFold := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, in[srcAddrOffset:srcAddrOffset+addrLen], in[srcAddrOffset+addrLen:srcAddrOffset+addrLen*2], tcpLenForPseudo)
+		tcpCSum := ^checksum(out[hdr.csumStart:totalLen], tcpCSumNoFold)
+		binary.BigEndian.PutUint16(out[hdr.csumStart+hdr.csumOffset:], tcpCSum)
+
+		nextSegmentDataAt += int(hdr.gsoSize)
+	}
+	return i, nil
+}
+
+func gsoNoneChecksum(in []byte, cSumStart, cSumOffset uint16) error {
+	cSumAt := cSumStart + cSumOffset
+	// The initial value at the checksum offset should be summed with the
+	// checksum we compute. This is typically the pseudo-header checksum.
+	initial := binary.BigEndian.Uint16(in[cSumAt:])
+	in[cSumAt], in[cSumAt+1] = 0, 0
+	binary.BigEndian.PutUint16(in[cSumAt:], ^checksum(in[cSumStart:], uint64(initial)))
+	return nil
+}
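
How a batched write path would drive handleGRO, as a hedged in-package sketch (the NativeTun struct in tun_linux.go below keeps these tables across batches for the same purpose):

package tun

func exampleGRO(bufs [][]byte, offset int) ([]int, error) {
	tcp4 := newTCPGROTable()
	tcp6 := newTCPGROTable()
	toWrite := make([]int, 0, len(bufs))
	// offset must leave virtioNetHdrLen bytes of headroom in every packet;
	// handleGRO encodes the virtio-net header into that gap.
	if err := handleGRO(bufs, offset, tcp4, tcp6, &toWrite); err != nil {
		return nil, err
	}
	// Only the surviving indices (coalesced heads plus anything that did not
	// coalesce) still need to be written to the device.
	tcp4.reset()
	tcp6.reset()
	return toWrite, nil
}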

+ 52 - 0
wgstack/tun/tun.go

@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: MIT
+//
+// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
+
+package tun
+
+import (
+	"os"
+)
+
+type Event int
+
+const (
+	EventUp = 1 << iota
+	EventDown
+	EventMTUUpdate
+)
+
+type Device interface {
+	// File returns the file descriptor of the device.
+	File() *os.File
+
+	// Read one or more packets from the Device (without any additional headers).
+	// On a successful read it returns the number of packets read, and sets
+	// packet lengths within the sizes slice. len(sizes) must be >= len(bufs).
+	// A nonzero offset can be used to instruct the Device on where to begin
+	// reading into each element of the bufs slice.
+	Read(bufs [][]byte, sizes []int, offset int) (n int, err error)
+
+	// Write one or more packets to the device (without any additional headers).
+	// On a successful write it returns the number of packets written. A nonzero
+	// offset can be used to instruct the Device on where to begin writing from
+	// each packet contained within the bufs slice.
+	Write(bufs [][]byte, offset int) (int, error)
+
+	// MTU returns the MTU of the Device.
+	MTU() (int, error)
+
+	// Name returns the current name of the Device.
+	Name() (string, error)
+
+	// Events returns a channel of type Event, which is fed Device events.
+	Events() <-chan Event
+
+	// Close stops the Device and closes the Event channel.
+	Close() error
+
+	// BatchSize returns the preferred/max number of packets that can be read or
+	// written in a single read/write call. BatchSize must not change over the
+	// lifetime of a Device.
+	BatchSize() int
+}
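
A hedged sketch of a reader honouring the Device contract (package overlay is where Nebula consumes this, per the compat shim above; 1500 is an arbitrary MTU):

package overlay

import (
	"log"

	wgtun "github.com/slackhq/nebula/wgstack/tun"
)

func drainDevice(dev wgtun.Device) {
	batch := dev.BatchSize()
	bufs := make([][]byte, batch)
	for i := range bufs {
		// headroom for the virtio-net header when vnetHdr is enabled
		bufs[i] = make([]byte, wgtun.VirtioNetHdrLen+1500)
	}
	sizes := make([]int, batch) // len(sizes) must be >= len(bufs)
	for {
		n, err := dev.Read(bufs, sizes, wgtun.VirtioNetHdrLen)
		if err != nil {
			log.Println(err)
			return
		}
		for i := 0; i < n; i++ {
			pkt := bufs[i][wgtun.VirtioNetHdrLen : wgtun.VirtioNetHdrLen+sizes[i]]
			_ = pkt // hand off to the datapath
		}
	}
}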

+ 652 - 0
wgstack/tun/tun_linux.go

@@ -0,0 +1,652 @@
+//go:build linux
+
+// SPDX-License-Identifier: MIT
+//
+// Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
+
+package tun
+
+/* Implementation of the TUN device interface for linux
+ */
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"sync"
+	"syscall"
+	"time"
+	"unsafe"
+
+	wgconn "github.com/slackhq/nebula/wgstack/conn"
+	"golang.org/x/sys/unix"
+	"golang.zx2c4.com/wireguard/rwcancel"
+)
+
+const (
+	cloneDevicePath = "/dev/net/tun"
+	ifReqSize       = unix.IFNAMSIZ + 64
+)
+
+type NativeTun struct {
+	tunFile                 *os.File
+	index                   int32      // if index
+	errors                  chan error // async error handling
+	events                  chan Event // device related events
+	netlinkSock             int
+	netlinkCancel           *rwcancel.RWCancel
+	hackListenerClosed      sync.Mutex
+	statusListenersShutdown chan struct{}
+	batchSize               int
+	vnetHdr                 bool
+
+	closeOnce sync.Once
+
+	nameOnce  sync.Once // guards calling initNameCache, which sets following fields
+	nameCache string    // name of interface
+	nameErr   error
+
+	readOpMu sync.Mutex                    // readOpMu guards readBuff
+	readBuff [virtioNetHdrLen + 65535]byte // if vnetHdr every read() is prefixed by virtioNetHdr
+
+	writeOpMu                  sync.Mutex // writeOpMu guards toWrite, tcp4GROTable, tcp6GROTable
+	toWrite                    []int
+	tcp4GROTable, tcp6GROTable *tcpGROTable
+}
+
+func (tun *NativeTun) File() *os.File {
+	return tun.tunFile
+}
+
+func (tun *NativeTun) routineHackListener() {
+	defer tun.hackListenerClosed.Unlock()
+	/* This is needed for the detection to work across network namespaces
+	 * If you are reading this and know a better method, please get in touch.
+	 */
+	last := 0
+	const (
+		up   = 1
+		down = 2
+	)
+	for {
+		sysconn, err := tun.tunFile.SyscallConn()
+		if err != nil {
+			return
+		}
+		err2 := sysconn.Control(func(fd uintptr) {
+			_, err = unix.Write(int(fd), nil)
+		})
+		if err2 != nil {
+			return
+		}
+		switch err {
+		case unix.EINVAL:
+			if last != up {
+				// If the tunnel is up, it reports that write() is
+				// allowed but we provided invalid data.
+				tun.events <- EventUp
+				last = up
+			}
+		case unix.EIO:
+			if last != down {
+				// If the tunnel is down, it reports that no I/O
+				// is possible, without checking our provided data.
+				tun.events <- EventDown
+				last = down
+			}
+		default:
+			return
+		}
+		select {
+		case <-time.After(time.Second):
+			// nothing
+		case <-tun.statusListenersShutdown:
+			return
+		}
+	}
+}
+
+func createNetlinkSocket() (int, error) {
+	sock, err := unix.Socket(unix.AF_NETLINK, unix.SOCK_RAW|unix.SOCK_CLOEXEC, unix.NETLINK_ROUTE)
+	if err != nil {
+		return -1, err
+	}
+	saddr := &unix.SockaddrNetlink{
+		Family: unix.AF_NETLINK,
+		Groups: unix.RTMGRP_LINK | unix.RTMGRP_IPV4_IFADDR | unix.RTMGRP_IPV6_IFADDR,
+	}
+	err = unix.Bind(sock, saddr)
+	if err != nil {
+		return -1, err
+	}
+	return sock, nil
+}
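+
+// Subscribing to RTMGRP_LINK above is what delivers the RTM_NEWLINK messages
+// that the listener below turns into EventUp/EventDown/EventMTUUpdate; the
+// address-group messages fall through the listener's default case.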
+
+func (tun *NativeTun) routineNetlinkListener() {
+	defer func() {
+		unix.Close(tun.netlinkSock)
+		tun.hackListenerClosed.Lock()
+		close(tun.events)
+		tun.netlinkCancel.Close()
+	}()
+
+	for msg := make([]byte, 1<<16); ; {
+		var err error
+		var msgn int
+		for {
+			msgn, _, _, _, err = unix.Recvmsg(tun.netlinkSock, msg[:], nil, 0)
+			if err == nil || !rwcancel.RetryAfterError(err) {
+				break
+			}
+			if !tun.netlinkCancel.ReadyRead() {
+				tun.errors <- fmt.Errorf("netlink socket closed: %w", err)
+				return
+			}
+		}
+		if err != nil {
+			tun.errors <- fmt.Errorf("failed to receive netlink message: %w", err)
+			return
+		}
+
+		select {
+		case <-tun.statusListenersShutdown:
+			return
+		default:
+		}
+
+		wasEverUp := false
+		for remain := msg[:msgn]; len(remain) >= unix.SizeofNlMsghdr; {
+
+			hdr := *(*unix.NlMsghdr)(unsafe.Pointer(&remain[0]))
+
+			if int(hdr.Len) > len(remain) {
+				break
+			}
+
+			switch hdr.Type {
+			case unix.NLMSG_DONE:
+				remain = []byte{}
+
+			case unix.RTM_NEWLINK:
+				info := *(*unix.IfInfomsg)(unsafe.Pointer(&remain[unix.SizeofNlMsghdr]))
+				remain = remain[hdr.Len:]
+
+				if info.Index != tun.index {
+					// not our interface
+					continue
+				}
+
+				if info.Flags&unix.IFF_RUNNING != 0 {
+					tun.events <- EventUp
+					wasEverUp = true
+				}
+
+				if info.Flags&unix.IFF_RUNNING == 0 {
+					// Don't emit EventDown before we've ever emitted EventUp.
+					// This avoids a startup race with HackListener, which
+					// might detect Up before we have finished reporting Down.
+					if wasEverUp {
+						tun.events <- EventDown
+					}
+				}
+
+				tun.events <- EventMTUUpdate
+
+			default:
+				remain = remain[hdr.Len:]
+			}
+		}
+	}
+}
+
+func getIFIndex(name string) (int32, error) {
+	fd, err := unix.Socket(
+		unix.AF_INET,
+		unix.SOCK_DGRAM|unix.SOCK_CLOEXEC,
+		0,
+	)
+	if err != nil {
+		return 0, err
+	}
+
+	defer unix.Close(fd)
+
+	var ifr [ifReqSize]byte
+	copy(ifr[:], name)
+	_, _, errno := unix.Syscall(
+		unix.SYS_IOCTL,
+		uintptr(fd),
+		uintptr(unix.SIOCGIFINDEX),
+		uintptr(unsafe.Pointer(&ifr[0])),
+	)
+
+	if errno != 0 {
+		return 0, errno
+	}
+
+	return *(*int32)(unsafe.Pointer(&ifr[unix.IFNAMSIZ])), nil
+}
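+
+// getIFIndex (like setMTU and MTU below) indexes the raw ifr buffer directly:
+// the first IFNAMSIZ bytes hold the interface name and the request union
+// starts immediately after it. Roughly, in C terms (for orientation only):
+//
+//	struct ifreq {
+//		char ifr_name[IFNAMSIZ];              /* 16 bytes on Linux */
+//		union { int ifr_ifindex; int ifr_mtu; /* ... */ };
+//	};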
+
+func (tun *NativeTun) setMTU(n int) error {
+	name, err := tun.Name()
+	if err != nil {
+		return err
+	}
+
+	// open datagram socket
+	fd, err := unix.Socket(
+		unix.AF_INET,
+		unix.SOCK_DGRAM|unix.SOCK_CLOEXEC,
+		0,
+	)
+	if err != nil {
+		return err
+	}
+	defer unix.Close(fd)
+
+	var ifr [ifReqSize]byte
+	copy(ifr[:], name)
+	*(*uint32)(unsafe.Pointer(&ifr[unix.IFNAMSIZ])) = uint32(n)
+
+	_, _, errno := unix.Syscall(
+		unix.SYS_IOCTL,
+		uintptr(fd),
+		uintptr(unix.SIOCSIFMTU),
+		uintptr(unsafe.Pointer(&ifr[0])),
+	)
+
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+func (tun *NativeTun) routineNetlink() {
+	var err error
+
+	tun.netlinkSock, err = createNetlinkSocket()
+	if err != nil {
+		tun.errors <- fmt.Errorf("failed to create netlink socket: %w", err)
+		return
+	}
+
+	tun.netlinkCancel, err = rwcancel.NewRWCancel(tun.netlinkSock)
+	if err != nil {
+		tun.errors <- fmt.Errorf("failed to create netlink cancel: %w", err)
+		return
+	}
+
+	go tun.routineNetlinkListener()
+}
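+
+// routineNetlink itself runs on a goroutine (see CreateTUNFromFile), so
+// netlinkSock and netlinkCancel are populated asynchronously; Close below
+// tolerates a nil netlinkCancel for exactly this reason.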
+
+func (tun *NativeTun) Close() error {
+	var err1, err2 error
+	tun.closeOnce.Do(func() {
+		if tun.statusListenersShutdown != nil {
+			close(tun.statusListenersShutdown)
+			if tun.netlinkCancel != nil {
+				err1 = tun.netlinkCancel.Cancel()
+			}
+		} else if tun.events != nil {
+			close(tun.events)
+		}
+		err2 = tun.tunFile.Close()
+	})
+	if err1 != nil {
+		return err1
+	}
+	return err2
+}
+
+func (tun *NativeTun) BatchSize() int {
+	return tun.batchSize
+}
+
+const (
+	// TODO: support TSO with ECN bits
+	tunOffloads = unix.TUN_F_CSUM | unix.TUN_F_TSO4 | unix.TUN_F_TSO6
+)
+
+func (tun *NativeTun) initFromFlags(name string) error {
+	sc, err := tun.tunFile.SyscallConn()
+	if err != nil {
+		return err
+	}
+	if e := sc.Control(func(fd uintptr) {
+		var ifr *unix.Ifreq
+		ifr, err = unix.NewIfreq(name)
+		if err != nil {
+			return
+		}
+		err = unix.IoctlIfreq(int(fd), unix.TUNGETIFF, ifr)
+		if err != nil {
+			return
+		}
+		got := ifr.Uint16()
+		if got&unix.IFF_VNET_HDR != 0 {
+			err = unix.IoctlSetInt(int(fd), unix.TUNSETOFFLOAD, tunOffloads)
+			if err != nil {
+				return
+			}
+			tun.vnetHdr = true
+			tun.batchSize = wgconn.IdealBatchSize
+		} else {
+			tun.batchSize = 1
+		}
+	}); e != nil {
+		return e
+	}
+	return err
+}
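+
+// When IFF_VNET_HDR is negotiated, the kernel prepends a virtio_net_hdr to
+// every read and expects one on every write; that header is what enables the
+// batched GRO/TSO paths in Read and Write below, hence the larger batch size.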
+
+// CreateTUN creates a Device with the provided name and MTU.
+func CreateTUN(name string, mtu int) (Device, error) {
+	nfd, err := unix.Open(cloneDevicePath, unix.O_RDWR|unix.O_CLOEXEC, 0)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, fmt.Errorf("CreateTUN(%q) failed; %s does not exist", name, cloneDevicePath)
+		}
+		return nil, fmt.Errorf("CreateTUN(%q) failed: %w", name, err)
+	}
+	fd := os.NewFile(uintptr(nfd), cloneDevicePath)
+	tun, err := CreateTUNFromFile(fd, mtu)
+	if err != nil {
+		return nil, err
+	}
+	if name != "tun" {
+		if err := tun.(*NativeTun).initFromFlags(name); err != nil {
+			tun.Close()
+			return nil, fmt.Errorf("CreateTUN(%q) failed to set flags: %w", name, err)
+		}
+	}
+	return tun, nil
+}
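+
+// Usage sketch (device name and MTU are illustrative):
+//
+//	dev, err := CreateTUN("nebula1", 1300)
+//	if err != nil {
+//		// handle error
+//	}
+//	defer dev.Close()
+//	batch := dev.BatchSize() // 1 without vnet_hdr support, larger with it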
+
+// CreateTUNFromFile creates a Device from an os.File with the provided MTU.
+func CreateTUNFromFile(file *os.File, mtu int) (Device, error) {
+	tun := &NativeTun{
+		tunFile: file,
+		errors:  make(chan error, 5),
+		events:  make(chan Event, 5),
+	}
+
+	name, err := tun.Name()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get TUN name: %w", err)
+	}
+	tun.index, err = getIFIndex(name)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get TUN index: %w", err)
+	}
+
+	if err = tun.setMTU(mtu); err != nil {
+		return nil, fmt.Errorf("failed to set MTU: %w", err)
+	}
+
+	tun.statusListenersShutdown = make(chan struct{})
+	go tun.routineNetlink()
+
+	if tun.batchSize == 0 {
+		tun.batchSize = 1
+	}
+
+	tun.tcp4GROTable = newTCPGROTable()
+	tun.tcp6GROTable = newTCPGROTable()
+
+	return tun, nil
+}
+
+func (tun *NativeTun) Name() (string, error) {
+	tun.nameOnce.Do(tun.initNameCache)
+	return tun.nameCache, tun.nameErr
+}
+
+func (tun *NativeTun) initNameCache() {
+	sysconn, err := tun.tunFile.SyscallConn()
+	if err != nil {
+		tun.nameErr = err
+		return
+	}
+	err = sysconn.Control(func(fd uintptr) {
+		var ifr [ifReqSize]byte
+		_, _, errno := unix.Syscall(
+			unix.SYS_IOCTL,
+			fd,
+			uintptr(unix.TUNGETIFF),
+			uintptr(unsafe.Pointer(&ifr[0])),
+		)
+		if errno != 0 {
+			tun.nameErr = errno
+			return
+		}
+		tun.nameCache = unix.ByteSliceToString(ifr[:])
+	})
+	if err != nil && tun.nameErr == nil {
+		tun.nameErr = err
+	}
+}
+
+func (tun *NativeTun) MTU() (int, error) {
+	name, err := tun.Name()
+	if err != nil {
+		return 0, err
+	}
+
+	// open datagram socket
+	fd, err := unix.Socket(
+		unix.AF_INET,
+		unix.SOCK_DGRAM|unix.SOCK_CLOEXEC,
+		0,
+	)
+	if err != nil {
+		return 0, err
+	}
+	defer unix.Close(fd)
+
+	var ifr [ifReqSize]byte
+	copy(ifr[:], name)
+
+	_, _, errno := unix.Syscall(
+		unix.SYS_IOCTL,
+		uintptr(fd),
+		uintptr(unix.SIOCGIFMTU),
+		uintptr(unsafe.Pointer(&ifr[0])),
+	)
+
+	if errno != 0 {
+		return 0, errno
+	}
+
+	return int(*(*uint32)(unsafe.Pointer(&ifr[unix.IFNAMSIZ]))), nil
+}
+
+func (tun *NativeTun) Events() <-chan Event {
+	return tun.events
+}
+
+func (tun *NativeTun) Write(bufs [][]byte, offset int) (int, error) {
+	tun.writeOpMu.Lock()
+	defer func() {
+		tun.tcp4GROTable.reset()
+		tun.tcp6GROTable.reset()
+		tun.writeOpMu.Unlock()
+	}()
+	var (
+		errs  error
+		total int
+	)
+	tun.toWrite = tun.toWrite[:0]
+	if tun.vnetHdr {
+		err := handleGRO(bufs, offset, tun.tcp4GROTable, tun.tcp6GROTable, &tun.toWrite)
+		if err != nil {
+			return 0, err
+		}
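+		// handleGRO encoded a virtioNetHdr into the offset region directly in
+		// front of each packet it kept, so widen the write to include it.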
+		offset -= virtioNetHdrLen
+	} else {
+		for i := range bufs {
+			tun.toWrite = append(tun.toWrite, i)
+		}
+	}
+	for _, bufsI := range tun.toWrite {
+		n, err := tun.tunFile.Write(bufs[bufsI][offset:])
+		if errors.Is(err, syscall.EBADFD) {
+			return total, os.ErrClosed
+		}
+		if err != nil {
+			errs = errors.Join(errs, err)
+		} else {
+			total += n
+		}
+	}
+	return total, errs
+}
+
+// handleVirtioRead splits in into bufs, leaving offset bytes at the front of
+// each buffer. It mutates sizes to reflect the size of each element of bufs,
+// and returns the number of packets read.
+func handleVirtioRead(in []byte, bufs [][]byte, sizes []int, offset int) (int, error) {
+	var hdr virtioNetHdr
+	if err := hdr.decode(in); err != nil {
+		return 0, err
+	}
+	in = in[virtioNetHdrLen:]
+	if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_NONE {
+		if hdr.flags&unix.VIRTIO_NET_HDR_F_NEEDS_CSUM != 0 {
+			if err := gsoNoneChecksum(in, hdr.csumStart, hdr.csumOffset); err != nil {
+				return 0, err
+			}
+		}
+		if len(in) > len(bufs[0][offset:]) {
+			return 0, fmt.Errorf("read len %d overflows bufs element len %d", len(in), len(bufs[0][offset:]))
+		}
+		n := copy(bufs[0][offset:], in)
+		sizes[0] = n
+		return 1, nil
+	}
+	if hdr.gsoType != unix.VIRTIO_NET_HDR_GSO_TCPV4 && hdr.gsoType != unix.VIRTIO_NET_HDR_GSO_TCPV6 {
+		return 0, fmt.Errorf("unsupported virtio GSO type: %d", hdr.gsoType)
+	}
+
+	ipVersion := in[0] >> 4
+	switch ipVersion {
+	case 4:
+		if hdr.gsoType != unix.VIRTIO_NET_HDR_GSO_TCPV4 {
+			return 0, fmt.Errorf("ip header version: %d, GSO type: %d", ipVersion, hdr.gsoType)
+		}
+	case 6:
+		if hdr.gsoType != unix.VIRTIO_NET_HDR_GSO_TCPV6 {
+			return 0, fmt.Errorf("ip header version: %d, GSO type: %d", ipVersion, hdr.gsoType)
+		}
+	default:
+		return 0, fmt.Errorf("invalid ip header version: %d", ipVersion)
+	}
+
+	if len(in) <= int(hdr.csumStart+12) {
+		return 0, errors.New("packet is too short")
+	}
+	tcpHLen := uint16(in[hdr.csumStart+12] >> 4 * 4)
+	if tcpHLen < 20 || tcpHLen > 60 {
+		return 0, fmt.Errorf("tcp header len is invalid: %d", tcpHLen)
+	}
+	hdr.hdrLen = hdr.csumStart + tcpHLen
+	if len(in) < int(hdr.hdrLen) {
+		return 0, fmt.Errorf("length of packet (%d) < virtioNetHdr.hdrLen (%d)", len(in), hdr.hdrLen)
+	}
+	if hdr.hdrLen < hdr.csumStart {
+		return 0, fmt.Errorf("virtioNetHdr.hdrLen (%d) < virtioNetHdr.csumStart (%d)", hdr.hdrLen, hdr.csumStart)
+	}
+	cSumAt := int(hdr.csumStart + hdr.csumOffset)
+	if cSumAt+1 >= len(in) {
+		return 0, fmt.Errorf("end of checksum offset (%d) exceeds packet length (%d)", cSumAt+1, len(in))
+	}
+
+	return tcpTSO(in, hdr, bufs, sizes, offset)
+}
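+
+// Worked example (illustrative numbers): a coalesced read of 13,652 bytes
+// with hdr.hdrLen = 52 (20-byte IPv4 + 32-byte TCP) and gsoSize = 1360
+// carries 13,600 payload bytes, which tcpTSO re-segments into
+// 13600/1360 = 10 packets, each prefixed with a copy of the 52-byte headers
+// (with lengths, sequence numbers and checksums adjusted per segment).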
+
+func (tun *NativeTun) Read(bufs [][]byte, sizes []int, offset int) (int, error) {
+	tun.readOpMu.Lock()
+	defer tun.readOpMu.Unlock()
+	select {
+	case err := <-tun.errors:
+		return 0, err
+	default:
+		readInto := bufs[0][offset:]
+		if tun.vnetHdr {
+			readInto = tun.readBuff[:]
+		}
+		n, err := tun.tunFile.Read(readInto)
+		if errors.Is(err, syscall.EBADFD) {
+			err = os.ErrClosed
+		}
+		if err != nil {
+			return 0, err
+		}
+		if tun.vnetHdr {
+			return handleVirtioRead(readInto[:n], bufs, sizes, offset)
+		}
+		sizes[0] = n
+		return 1, nil
+	}
+}
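+
+// Note that without vnetHdr each Read fills at most bufs[0] and returns 1;
+// reading more than one packet per call requires the virtio (GRO) path above.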