mirror of
https://github.com/slackhq/nebula.git
synced 2026-05-09 21:51:39 -07:00
tun/linux: coalesce WriteGSO into single write() to avoid 4.19 UAF
The scatter-gather writev path in WriteGSO triggered a kernel-side use-after-free in tun_chr_write_iter → sock_alloc_send_pskb → skb_set_owner_w on Linux 4.19 TUN when the virtio_net_hdr requested TSO segmentation. The skb write-memory refcount (sk_wmem_alloc) underflowed, producing paired traces of refcount_t: addition on 0 (in the write path) and refcount_t: underflow (in the paired recv socket), reliably rebooting UBIOS UXG-Pro routers under iperf3 -R. Match wireguard-go's design: coalesce the virtio_net_hdr, IP/TCP header, and all payload fragments into a single contiguous per-queue scratch buffer, then emit the superpacket with a single write() syscall. wireguard-go's offload path handles GRO-merged TSO superpackets this way and has no equivalent failure mode (see tun/tun_linux.go Write — it writes bufs[bufsI][offset:] with a single tunFile.Write call after coalesce). Cost: one extra memcpy per superpacket (bounded at ~64KiB by the virtio spec). Unit tests pass (go test ./overlay/tio/...). Field testing on UXG-Pro (4.19) pending.
This commit is contained in:
parent
c9d5a6e35a
commit
2bc200103f
1 changed files with 89 additions and 26 deletions
|
|
@ -32,6 +32,15 @@ const tunDrainCap = 64 //256
|
|||
// any reallocations.
|
||||
const gsoInitialPayIovs = 66

// gsoWriteBufCap is the initial per-queue coalesce scratch capacity used by
// WriteGSO to assemble [virtio_hdr || IP/TCP hdr || pays...] into a single
// contiguous buffer so we can emit the superpacket via a single write()
// instead of writev(). One worst-case TSO superpacket is bounded by the
// virtio spec at 64KiB; the initial cap aliases tunSegBufSize (presumably
// 128KiB — confirm against its declaration), leaving slack for the 10-byte
// virtio header, the IP/TCP header, and any future size bumps. Grown on
// demand if a superpacket exceeds this.
const gsoWriteBufCap = tunSegBufSize
|
||||
|
||||
// validVnetHdr is the 10-byte virtio_net_hdr we prepend to every non-GSO TUN
|
||||
// write. Only flag set is VIRTIO_NET_HDR_F_DATA_VALID, which marks the skb
|
||||
// CHECKSUM_UNNECESSARY so the receiving network stack skips L4 checksum
|
||||
|
|
@ -65,10 +74,20 @@ type Offload struct {
|
|||
// by WriteGSO. Separate from validVnetHdr so a concurrent non-GSO Write on
|
||||
// another queue never observes a half-written header.
|
||||
gsoHdrBuf [virtioNetHdrLen]byte
|
||||
// gsoIovs is the writev iovec scratch for WriteGSO. Sized to hold the
|
||||
// virtio header + IP/TCP header + up to gsoInitialPayIovs payload
|
||||
// fragments; grown on demand if a coalescer pushes more.
|
||||
// gsoIovs is a legacy writev iovec scratch. No longer used by the
|
||||
// WriteGSO path (which coalesces into gsoWriteBuf and uses a single
|
||||
// write()) but retained for any other iovec-based path that may use it.
|
||||
gsoIovs []unix.Iovec
|
||||
|
||||
// gsoWriteBuf is a per-queue scratch used by WriteGSO to coalesce the
|
||||
// virtio_net_hdr + IP/TCP header + payload fragments into a single
|
||||
// contiguous buffer, which is then written to the TUN fd with one
|
||||
// write() syscall. This mirrors wireguard-go's approach and avoids
|
||||
// triggering a kernel refcount use-after-free in skb_set_owner_w /
|
||||
// sock_wfree observed on Linux 4.19 TUN when scatter-gather writev is
|
||||
// combined with GSO-flagged virtio_net_hdr in the tun_chr_write_iter
|
||||
// path. Grown on demand if a superpacket exceeds the initial cap.
|
||||
gsoWriteBuf []byte
|
||||
}
|
||||
|
||||
func newOffload(fd int, shutdownFd int) (*Offload, error) {
|
||||
|
|
@ -90,8 +109,9 @@ func newOffload(fd int, shutdownFd int) (*Offload, error) {
|
|||
{Fd: int32(shutdownFd), Events: unix.POLLIN},
|
||||
},
|
||||
|
||||
segBuf: make([]byte, tunSegBufCap),
|
||||
gsoIovs: make([]unix.Iovec, 2, 2+gsoInitialPayIovs),
|
||||
segBuf: make([]byte, tunSegBufCap),
|
||||
gsoIovs: make([]unix.Iovec, 2, 2+gsoInitialPayIovs),
|
||||
gsoWriteBuf: make([]byte, 0, gsoWriteBufCap),
|
||||
}
|
||||
|
||||
out.writeIovs[0].Base = &validVnetHdr[0]
|
||||
|
|
@ -291,18 +311,59 @@ func (r *Offload) rawWrite(iovs []unix.Iovec) (int, error) {
|
|||
}
|
||||
}
|
||||
|
||||
// rawWriteSingle writes buf to the TUN fd with a single write() syscall.
|
||||
// Unlike rawWrite (which uses writev), this avoids the kernel
|
||||
// scatter-gather path that triggers a use-after-free in
|
||||
// tun_chr_write_iter → sock_alloc_send_pskb → skb_set_owner_w on Linux
|
||||
// 4.19 TUN when the virtio_net_hdr requests TSO segmentation. The caller
|
||||
// is responsible for including the virtio_net_hdr prefix in buf.
|
||||
func (r *Offload) rawWriteSingle(buf []byte) (int, error) {
|
||||
for {
|
||||
n, err := unix.Write(r.fd, buf)
|
||||
if err == nil {
|
||||
if n < virtioNetHdrLen {
|
||||
return 0, io.ErrShortWrite
|
||||
}
|
||||
return n - virtioNetHdrLen, nil
|
||||
}
|
||||
if err == unix.EAGAIN {
|
||||
if werr := r.blockOnWrite(); werr != nil {
|
||||
return 0, werr
|
||||
}
|
||||
continue
|
||||
}
|
||||
if err == unix.EINTR {
|
||||
continue
|
||||
}
|
||||
if err == unix.EBADF {
|
||||
return 0, os.ErrClosed
|
||||
}
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
// GSOSupported reports whether this queue was opened with IFF_VNET_HDR and
|
||||
// can accept WriteGSO. When false, callers should fall back to per-segment
|
||||
// Write calls.
|
||||
func (r *Offload) GSOSupported() bool { return true }
|
||||
|
||||
// WriteGSO emits a TCP TSO superpacket in a single writev. hdr is the
|
||||
// IPv4/IPv6 + TCP header prefix (already finalized — total length, IP csum,
|
||||
// and TCP pseudo-header partial set by the caller). pays are payload
|
||||
// fragments whose concatenation forms the full coalesced payload; each
|
||||
// slice is read-only and must stay valid until return. gsoSize is the MSS;
|
||||
// every segment except possibly the last is exactly gsoSize bytes.
|
||||
// csumStart is the byte offset where the TCP header begins within hdr.
|
||||
// WriteGSO emits a TCP TSO superpacket. hdr is the IPv4/IPv6 + TCP header
|
||||
// prefix (already finalized — total length, IP csum, and TCP pseudo-header
|
||||
// partial set by the caller). pays are payload fragments whose concatenation
|
||||
// forms the full coalesced payload. gsoSize is the MSS; every segment except
|
||||
// possibly the last is exactly gsoSize bytes. csumStart is the byte offset
|
||||
// where the TCP header begins within hdr.
|
||||
//
|
||||
// Implementation note: this path coalesces [virtio_hdr || hdr || pays...]
|
||||
// into a single contiguous scratch buffer (r.gsoWriteBuf) and emits it via
|
||||
// one write() syscall rather than writev() with a scatter-gather iovec.
|
||||
// The scatter-gather path triggered a kernel-side use-after-free on Linux
|
||||
// 4.19 TUN where tun_chr_write_iter → sock_alloc_send_pskb →
|
||||
// skb_set_owner_w could be invoked with a zero sk_wmem_alloc, crashing
|
||||
// the router. The single-write path mirrors wireguard-go's design (see
|
||||
// golang.zx2c4.com/wireguard/tun/tun_linux.go Write — it always coalesces
|
||||
// GRO-merged data into a single contiguous buffer before calling
|
||||
// tunFile.Write) and has no equivalent failure mode.
|
||||
func (r *Offload) WriteGSO(hdr []byte, pays [][]byte, gsoSize uint16, isV6 bool, csumStart uint16) error {
|
||||
if len(hdr) == 0 || len(pays) == 0 {
|
||||
return nil
|
||||
|
|
@ -334,24 +395,26 @@ func (r *Offload) WriteGSO(hdr []byte, pays [][]byte, gsoSize uint16, isV6 bool,
|
|||
}
|
||||
vhdr.encode(r.gsoHdrBuf[:])
|
||||
|
||||
// Build the iovec array: [virtio_hdr, hdr, pays...]. r.gsoIovs[0] is
|
||||
// wired to gsoHdrBuf at construction and never changes.
|
||||
need := 2 + len(pays)
|
||||
if cap(r.gsoIovs) < need {
|
||||
grown := make([]unix.Iovec, need)
|
||||
grown[0] = r.gsoIovs[0]
|
||||
r.gsoIovs = grown
|
||||
// Coalesce [virtio_hdr || hdr || pays...] into a single contiguous
|
||||
// buffer. This avoids the kernel scatter-gather write path entirely.
|
||||
need := virtioNetHdrLen + len(hdr) + totalPay
|
||||
if cap(r.gsoWriteBuf) < need {
|
||||
// Grow geometrically to amortize reallocs.
|
||||
newCap := cap(r.gsoWriteBuf) * 2
|
||||
if newCap < need {
|
||||
newCap = need
|
||||
}
|
||||
r.gsoWriteBuf = make([]byte, 0, newCap)
|
||||
} else {
|
||||
r.gsoIovs = r.gsoIovs[:need]
|
||||
r.gsoWriteBuf = r.gsoWriteBuf[:0]
|
||||
}
|
||||
r.gsoIovs[1].Base = &hdr[0]
|
||||
r.gsoIovs[1].SetLen(len(hdr))
|
||||
for i, p := range pays {
|
||||
r.gsoIovs[2+i].Base = &p[0]
|
||||
r.gsoIovs[2+i].SetLen(len(p))
|
||||
r.gsoWriteBuf = append(r.gsoWriteBuf, r.gsoHdrBuf[:]...)
|
||||
r.gsoWriteBuf = append(r.gsoWriteBuf, hdr...)
|
||||
for _, p := range pays {
|
||||
r.gsoWriteBuf = append(r.gsoWriteBuf, p...)
|
||||
}
|
||||
|
||||
_, err := r.rawWrite(r.gsoIovs)
|
||||
_, err := r.rawWriteSingle(r.gsoWriteBuf)
|
||||
return err
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue