tun_linux.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664
  1. //go:build linux
  2. // SPDX-License-Identifier: MIT
  3. //
  4. // Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
  5. package tun
  6. /* Implementation of the TUN device interface for linux
  7. */
  8. import (
  9. "errors"
  10. "fmt"
  11. "os"
  12. "sync"
  13. "syscall"
  14. "time"
  15. "unsafe"
  16. wgconn "github.com/slackhq/nebula/wgstack/conn"
  17. "golang.org/x/sys/unix"
  18. "golang.zx2c4.com/wireguard/rwcancel"
  19. )
const (
	// cloneDevicePath is the TUN/TAP clone device node opened by CreateTUN.
	cloneDevicePath = "/dev/net/tun"
	// ifReqSize is the size of the raw ifreq buffers handed to ioctl in this
	// file: the interface name occupies the first unix.IFNAMSIZ bytes and the
	// request-specific payload is read or written right after it.
	ifReqSize = unix.IFNAMSIZ + 64
)
// NativeTun is the Linux TUN device. It bundles the TUN file, the netlink
// state used to report link events, and the scratch buffers shared by Read
// and Write.
type NativeTun struct {
	// tunFile is the TUN device file; it is returned by File and used for
	// every read and write.
	tunFile *os.File
	// index is the interface index; netlink link messages are matched
	// against it.
	index int32
	// errors carries asynchronous errors from the netlink goroutines; Read
	// returns them to the caller.
	errors chan error
	// events carries device related events and is exposed through Events.
	events chan Event
	// netlinkSock is the netlink socket descriptor set by routineNetlink.
	netlinkSock int
	// netlinkCancel is used to interrupt blocking receives on netlinkSock.
	netlinkCancel *rwcancel.RWCancel
	// hackListenerClosed is unlocked when routineHackListener returns and is
	// locked by the netlink listeners before they close events.
	hackListenerClosed sync.Mutex
	// statusListenersShutdown is closed by Close to tell the listener
	// goroutines to stop.
	statusListenersShutdown chan struct{}
	// batchSize is the value reported by BatchSize; initFromFlags sets it.
	batchSize int
	// vnetHdr is true when the device was found to have IFF_VNET_HDR set.
	vnetHdr bool

	// closeOnce makes the body of Close run at most once.
	closeOnce sync.Once

	nameOnce  sync.Once // guards calling initNameCache, which sets following fields
	nameCache string    // name of interface
	nameErr   error

	readOpMu sync.Mutex                    // readOpMu guards readBuff
	readBuff [virtioNetHdrLen + 65535]byte // if vnetHdr every read() is prefixed by virtioNetHdr

	writeOpMu                  sync.Mutex // writeOpMu guards toWrite, tcp4GROTable, tcp6GROTable
	toWrite                    []int
	tcp4GROTable, tcp6GROTable *tcpGROTable
}
  45. func (tun *NativeTun) File() *os.File {
  46. return tun.tunFile
  47. }
  48. func (tun *NativeTun) routineHackListener() {
  49. defer tun.hackListenerClosed.Unlock()
  50. /* This is needed for the detection to work across network namespaces
  51. * If you are reading this and know a better method, please get in touch.
  52. */
  53. last := 0
  54. const (
  55. up = 1
  56. down = 2
  57. )
  58. for {
  59. sysconn, err := tun.tunFile.SyscallConn()
  60. if err != nil {
  61. return
  62. }
  63. err2 := sysconn.Control(func(fd uintptr) {
  64. _, err = unix.Write(int(fd), nil)
  65. })
  66. if err2 != nil {
  67. return
  68. }
  69. switch err {
  70. case unix.EINVAL:
  71. if last != up {
  72. // If the tunnel is up, it reports that write() is
  73. // allowed but we provided invalid data.
  74. tun.events <- EventUp
  75. last = up
  76. }
  77. case unix.EIO:
  78. if last != down {
  79. // If the tunnel is down, it reports that no I/O
  80. // is possible, without checking our provided data.
  81. tun.events <- EventDown
  82. last = down
  83. }
  84. default:
  85. return
  86. }
  87. select {
  88. case <-time.After(time.Second):
  89. // nothing
  90. case <-tun.statusListenersShutdown:
  91. return
  92. }
  93. }
  94. }
  95. func createNetlinkSocket() (int, error) {
  96. sock, err := unix.Socket(unix.AF_NETLINK, unix.SOCK_RAW|unix.SOCK_CLOEXEC, unix.NETLINK_ROUTE)
  97. if err != nil {
  98. return -1, err
  99. }
  100. saddr := &unix.SockaddrNetlink{
  101. Family: unix.AF_NETLINK,
  102. Groups: unix.RTMGRP_LINK | unix.RTMGRP_IPV4_IFADDR | unix.RTMGRP_IPV6_IFADDR,
  103. }
  104. err = unix.Bind(sock, saddr)
  105. if err != nil {
  106. return -1, err
  107. }
  108. return sock, nil
  109. }
  110. func (tun *NativeTun) routineNetlinkListener() {
  111. defer func() {
  112. unix.Close(tun.netlinkSock)
  113. tun.hackListenerClosed.Lock()
  114. close(tun.events)
  115. tun.netlinkCancel.Close()
  116. }()
  117. for msg := make([]byte, 1<<16); ; {
  118. var err error
  119. var msgn int
  120. for {
  121. msgn, _, _, _, err = unix.Recvmsg(tun.netlinkSock, msg[:], nil, 0)
  122. if err == nil || !rwcancel.RetryAfterError(err) {
  123. break
  124. }
  125. if !tun.netlinkCancel.ReadyRead() {
  126. tun.errors <- fmt.Errorf("netlink socket closed: %w", err)
  127. return
  128. }
  129. }
  130. if err != nil {
  131. tun.errors <- fmt.Errorf("failed to receive netlink message: %w", err)
  132. return
  133. }
  134. select {
  135. case <-tun.statusListenersShutdown:
  136. return
  137. default:
  138. }
  139. wasEverUp := false
  140. for remain := msg[:msgn]; len(remain) >= unix.SizeofNlMsghdr; {
  141. hdr := *(*unix.NlMsghdr)(unsafe.Pointer(&remain[0]))
  142. if int(hdr.Len) > len(remain) {
  143. break
  144. }
  145. switch hdr.Type {
  146. case unix.NLMSG_DONE:
  147. remain = []byte{}
  148. case unix.RTM_NEWLINK:
  149. info := *(*unix.IfInfomsg)(unsafe.Pointer(&remain[unix.SizeofNlMsghdr]))
  150. remain = remain[hdr.Len:]
  151. if info.Index != tun.index {
  152. // not our interface
  153. continue
  154. }
  155. if info.Flags&unix.IFF_RUNNING != 0 {
  156. tun.events <- EventUp
  157. wasEverUp = true
  158. }
  159. if info.Flags&unix.IFF_RUNNING == 0 {
  160. // Don't emit EventDown before we've ever emitted EventUp.
  161. // This avoids a startup race with HackListener, which
  162. // might detect Up before we have finished reporting Down.
  163. if wasEverUp {
  164. tun.events <- EventDown
  165. }
  166. }
  167. tun.events <- EventMTUUpdate
  168. default:
  169. remain = remain[hdr.Len:]
  170. }
  171. }
  172. }
  173. }
  174. func getIFIndex(name string) (int32, error) {
  175. fd, err := unix.Socket(
  176. unix.AF_INET,
  177. unix.SOCK_DGRAM|unix.SOCK_CLOEXEC,
  178. 0,
  179. )
  180. if err != nil {
  181. return 0, err
  182. }
  183. defer unix.Close(fd)
  184. var ifr [ifReqSize]byte
  185. copy(ifr[:], name)
  186. _, _, errno := unix.Syscall(
  187. unix.SYS_IOCTL,
  188. uintptr(fd),
  189. uintptr(unix.SIOCGIFINDEX),
  190. uintptr(unsafe.Pointer(&ifr[0])),
  191. )
  192. if errno != 0 {
  193. return 0, errno
  194. }
  195. return *(*int32)(unsafe.Pointer(&ifr[unix.IFNAMSIZ])), nil
  196. }
  197. func (tun *NativeTun) setMTU(n int) error {
  198. name, err := tun.Name()
  199. if err != nil {
  200. return err
  201. }
  202. // open datagram socket
  203. fd, err := unix.Socket(
  204. unix.AF_INET,
  205. unix.SOCK_DGRAM|unix.SOCK_CLOEXEC,
  206. 0,
  207. )
  208. if err != nil {
  209. return err
  210. }
  211. defer unix.Close(fd)
  212. var ifr [ifReqSize]byte
  213. copy(ifr[:], name)
  214. *(*uint32)(unsafe.Pointer(&ifr[unix.IFNAMSIZ])) = uint32(n)
  215. _, _, errno := unix.Syscall(
  216. unix.SYS_IOCTL,
  217. uintptr(fd),
  218. uintptr(unix.SIOCSIFMTU),
  219. uintptr(unsafe.Pointer(&ifr[0])),
  220. )
  221. if errno != 0 {
  222. return errno
  223. }
  224. return nil
  225. }
  226. func (tun *NativeTun) routineNetlinkRead() {
  227. defer func() {
  228. unix.Close(tun.netlinkSock)
  229. tun.hackListenerClosed.Lock()
  230. close(tun.events)
  231. tun.netlinkCancel.Close()
  232. }()
  233. for msg := make([]byte, 1<<16); ; {
  234. var err error
  235. var msgn int
  236. for {
  237. msgn, _, _, _, err = unix.Recvmsg(tun.netlinkSock, msg[:], nil, 0)
  238. if err == nil || !rwcancel.RetryAfterError(err) {
  239. break
  240. }
  241. if !tun.netlinkCancel.ReadyRead() {
  242. tun.errors <- fmt.Errorf("netlink socket closed: %w", err)
  243. return
  244. }
  245. }
  246. if err != nil {
  247. tun.errors <- fmt.Errorf("failed to receive netlink message: %w", err)
  248. return
  249. }
  250. wasEverUp := false
  251. for remain := msg[:msgn]; len(remain) >= unix.SizeofNlMsghdr; {
  252. hdr := *(*unix.NlMsghdr)(unsafe.Pointer(&remain[0]))
  253. if int(hdr.Len) > len(remain) {
  254. break
  255. }
  256. switch hdr.Type {
  257. case unix.NLMSG_DONE:
  258. remain = []byte{}
  259. case unix.RTM_NEWLINK:
  260. info := *(*unix.IfInfomsg)(unsafe.Pointer(&remain[unix.SizeofNlMsghdr]))
  261. remain = remain[hdr.Len:]
  262. if info.Index != tun.index {
  263. continue
  264. }
  265. if info.Flags&unix.IFF_RUNNING != 0 {
  266. tun.events <- EventUp
  267. wasEverUp = true
  268. }
  269. if info.Flags&unix.IFF_RUNNING == 0 {
  270. if wasEverUp {
  271. tun.events <- EventDown
  272. }
  273. }
  274. tun.events <- EventMTUUpdate
  275. default:
  276. remain = remain[hdr.Len:]
  277. }
  278. }
  279. }
  280. }
  281. func (tun *NativeTun) routineNetlink() {
  282. var err error
  283. tun.netlinkSock, err = createNetlinkSocket()
  284. if err != nil {
  285. tun.errors <- fmt.Errorf("failed to create netlink socket: %w", err)
  286. return
  287. }
  288. tun.netlinkCancel, err = rwcancel.NewRWCancel(tun.netlinkSock)
  289. if err != nil {
  290. tun.errors <- fmt.Errorf("failed to create netlink cancel: %w", err)
  291. return
  292. }
  293. go tun.routineNetlinkListener()
  294. }
  295. func (tun *NativeTun) Close() error {
  296. var err1, err2 error
  297. tun.closeOnce.Do(func() {
  298. if tun.statusListenersShutdown != nil {
  299. close(tun.statusListenersShutdown)
  300. if tun.netlinkCancel != nil {
  301. err1 = tun.netlinkCancel.Cancel()
  302. }
  303. } else if tun.events != nil {
  304. close(tun.events)
  305. }
  306. err2 = tun.tunFile.Close()
  307. })
  308. if err1 != nil {
  309. return err1
  310. }
  311. return err2
  312. }
  313. func (tun *NativeTun) BatchSize() int {
  314. return tun.batchSize
  315. }
const (
	// tunOffloads is the offload flag set that initFromFlags requests via
	// TUNSETOFFLOAD when the device has IFF_VNET_HDR set: checksum offload
	// plus TSO for IPv4 and IPv6.
	// TODO: support TSO with ECN bits
	tunOffloads = unix.TUN_F_CSUM | unix.TUN_F_TSO4 | unix.TUN_F_TSO6
)
  320. func (tun *NativeTun) initFromFlags(name string) error {
  321. sc, err := tun.tunFile.SyscallConn()
  322. if err != nil {
  323. return err
  324. }
  325. if e := sc.Control(func(fd uintptr) {
  326. var (
  327. ifr *unix.Ifreq
  328. )
  329. ifr, err = unix.NewIfreq(name)
  330. if err != nil {
  331. return
  332. }
  333. err = unix.IoctlIfreq(int(fd), unix.TUNGETIFF, ifr)
  334. if err != nil {
  335. return
  336. }
  337. got := ifr.Uint16()
  338. if got&unix.IFF_VNET_HDR != 0 {
  339. err = unix.IoctlSetInt(int(fd), unix.TUNSETOFFLOAD, tunOffloads)
  340. if err != nil {
  341. return
  342. }
  343. tun.vnetHdr = true
  344. tun.batchSize = wgconn.IdealBatchSize
  345. } else {
  346. tun.batchSize = 1
  347. }
  348. }); e != nil {
  349. return e
  350. }
  351. return err
  352. }
  353. // CreateTUN creates a Device with the provided name and MTU.
  354. func CreateTUN(name string, mtu int) (Device, error) {
  355. nfd, err := unix.Open(cloneDevicePath, unix.O_RDWR|unix.O_CLOEXEC, 0)
  356. if err != nil {
  357. return nil, fmt.Errorf("CreateTUN(%q) failed; %s does not exist", name, cloneDevicePath)
  358. }
  359. fd := os.NewFile(uintptr(nfd), cloneDevicePath)
  360. tun, err := CreateTUNFromFile(fd, mtu)
  361. if err != nil {
  362. return nil, err
  363. }
  364. if name != "tun" {
  365. if err := tun.(*NativeTun).initFromFlags(name); err != nil {
  366. tun.Close()
  367. return nil, fmt.Errorf("CreateTUN(%q) failed to set flags: %w", name, err)
  368. }
  369. }
  370. return tun, nil
  371. }
  372. // CreateTUNFromFile creates a Device from an os.File with the provided MTU.
  373. func CreateTUNFromFile(file *os.File, mtu int) (Device, error) {
  374. tun := &NativeTun{
  375. tunFile: file,
  376. errors: make(chan error, 5),
  377. events: make(chan Event, 5),
  378. }
  379. name, err := tun.Name()
  380. if err != nil {
  381. return nil, fmt.Errorf("failed to determine TUN name: %w", err)
  382. }
  383. if err := tun.initFromFlags(name); err != nil {
  384. return nil, fmt.Errorf("failed to query TUN flags: %w", err)
  385. }
  386. if tun.batchSize == 0 {
  387. tun.batchSize = 1
  388. }
  389. tun.index, err = getIFIndex(name)
  390. if err != nil {
  391. return nil, fmt.Errorf("failed to get TUN index: %w", err)
  392. }
  393. if err = tun.setMTU(mtu); err != nil {
  394. return nil, fmt.Errorf("failed to set MTU: %w", err)
  395. }
  396. tun.statusListenersShutdown = make(chan struct{})
  397. go tun.routineNetlink()
  398. if tun.batchSize == 0 {
  399. tun.batchSize = 1
  400. }
  401. tun.tcp4GROTable = newTCPGROTable()
  402. tun.tcp6GROTable = newTCPGROTable()
  403. return tun, nil
  404. }
  405. func (tun *NativeTun) Name() (string, error) {
  406. tun.nameOnce.Do(tun.initNameCache)
  407. return tun.nameCache, tun.nameErr
  408. }
  409. func (tun *NativeTun) initNameCache() {
  410. sysconn, err := tun.tunFile.SyscallConn()
  411. if err != nil {
  412. tun.nameErr = err
  413. return
  414. }
  415. err = sysconn.Control(func(fd uintptr) {
  416. var ifr [ifReqSize]byte
  417. _, _, errno := unix.Syscall(
  418. unix.SYS_IOCTL,
  419. fd,
  420. uintptr(unix.TUNGETIFF),
  421. uintptr(unsafe.Pointer(&ifr[0])),
  422. )
  423. if errno != 0 {
  424. tun.nameErr = errno
  425. return
  426. }
  427. tun.nameCache = unix.ByteSliceToString(ifr[:])
  428. })
  429. if err != nil && tun.nameErr == nil {
  430. tun.nameErr = err
  431. }
  432. }
  433. func (tun *NativeTun) MTU() (int, error) {
  434. name, err := tun.Name()
  435. if err != nil {
  436. return 0, err
  437. }
  438. // open datagram socket
  439. fd, err := unix.Socket(
  440. unix.AF_INET,
  441. unix.SOCK_DGRAM|unix.SOCK_CLOEXEC,
  442. 0,
  443. )
  444. if err != nil {
  445. return 0, err
  446. }
  447. defer unix.Close(fd)
  448. var ifr [ifReqSize]byte
  449. copy(ifr[:], name)
  450. _, _, errno := unix.Syscall(
  451. unix.SYS_IOCTL,
  452. uintptr(fd),
  453. uintptr(unix.SIOCGIFMTU),
  454. uintptr(unsafe.Pointer(&ifr[0])),
  455. )
  456. if errno != 0 {
  457. return 0, errno
  458. }
  459. return int(*(*uint32)(unsafe.Pointer(&ifr[unix.IFNAMSIZ]))), nil
  460. }
  461. func (tun *NativeTun) Events() <-chan Event {
  462. return tun.events
  463. }
  464. func (tun *NativeTun) Write(bufs [][]byte, offset int) (int, error) {
  465. tun.writeOpMu.Lock()
  466. defer func() {
  467. tun.tcp4GROTable.reset()
  468. tun.tcp6GROTable.reset()
  469. tun.writeOpMu.Unlock()
  470. }()
  471. var (
  472. errs error
  473. total int
  474. )
  475. tun.toWrite = tun.toWrite[:0]
  476. if tun.vnetHdr {
  477. err := handleGRO(bufs, offset, tun.tcp4GROTable, tun.tcp6GROTable, &tun.toWrite)
  478. if err != nil {
  479. return 0, err
  480. }
  481. offset -= virtioNetHdrLen
  482. } else {
  483. for i := range bufs {
  484. tun.toWrite = append(tun.toWrite, i)
  485. }
  486. }
  487. for _, bufsI := range tun.toWrite {
  488. n, err := tun.tunFile.Write(bufs[bufsI][offset:])
  489. if errors.Is(err, syscall.EBADFD) {
  490. return total, os.ErrClosed
  491. }
  492. if err != nil {
  493. errs = errors.Join(errs, err)
  494. } else {
  495. total += n
  496. }
  497. }
  498. return total, errs
  499. }
  500. // handleVirtioRead splits in into bufs, leaving offset bytes at the front of
  501. // each buffer. It mutates sizes to reflect the size of each element of bufs,
  502. // and returns the number of packets read.
  503. func handleVirtioRead(in []byte, bufs [][]byte, sizes []int, offset int) (int, error) {
  504. var hdr virtioNetHdr
  505. if err := hdr.decode(in); err != nil {
  506. return 0, err
  507. }
  508. in = in[virtioNetHdrLen:]
  509. if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_NONE {
  510. if hdr.flags&unix.VIRTIO_NET_HDR_F_NEEDS_CSUM != 0 {
  511. if err := gsoNoneChecksum(in, hdr.csumStart, hdr.csumOffset); err != nil {
  512. return 0, err
  513. }
  514. }
  515. if len(in) > len(bufs[0][offset:]) {
  516. return 0, fmt.Errorf("read len %d overflows bufs element len %d", len(in), len(bufs[0][offset:]))
  517. }
  518. n := copy(bufs[0][offset:], in)
  519. sizes[0] = n
  520. return 1, nil
  521. }
  522. if hdr.gsoType != unix.VIRTIO_NET_HDR_GSO_TCPV4 && hdr.gsoType != unix.VIRTIO_NET_HDR_GSO_TCPV6 {
  523. return 0, fmt.Errorf("unsupported virtio GSO type: %d", hdr.gsoType)
  524. }
  525. ipVersion := in[0] >> 4
  526. switch ipVersion {
  527. case 4:
  528. if hdr.gsoType != unix.VIRTIO_NET_HDR_GSO_TCPV4 {
  529. return 0, fmt.Errorf("ip header version: %d, GSO type: %d", ipVersion, hdr.gsoType)
  530. }
  531. case 6:
  532. if hdr.gsoType != unix.VIRTIO_NET_HDR_GSO_TCPV6 {
  533. return 0, fmt.Errorf("ip header version: %d, GSO type: %d", ipVersion, hdr.gsoType)
  534. }
  535. default:
  536. return 0, fmt.Errorf("invalid ip header version: %d", ipVersion)
  537. }
  538. if len(in) <= int(hdr.csumStart+12) {
  539. return 0, errors.New("packet is too short")
  540. }
  541. tcpHLen := uint16(in[hdr.csumStart+12] >> 4 * 4)
  542. if tcpHLen < 20 || tcpHLen > 60 {
  543. return 0, fmt.Errorf("tcp header len is invalid: %d", tcpHLen)
  544. }
  545. hdr.hdrLen = hdr.csumStart + tcpHLen
  546. if len(in) < int(hdr.hdrLen) {
  547. return 0, fmt.Errorf("length of packet (%d) < virtioNetHdr.hdrLen (%d)", len(in), hdr.hdrLen)
  548. }
  549. if hdr.hdrLen < hdr.csumStart {
  550. return 0, fmt.Errorf("virtioNetHdr.hdrLen (%d) < virtioNetHdr.csumStart (%d)", hdr.hdrLen, hdr.csumStart)
  551. }
  552. cSumAt := int(hdr.csumStart + hdr.csumOffset)
  553. if cSumAt+1 >= len(in) {
  554. return 0, fmt.Errorf("end of checksum offset (%d) exceeds packet length (%d)", cSumAt+1, len(in))
  555. }
  556. return tcpTSO(in, hdr, bufs, sizes, offset)
  557. }
  558. func (tun *NativeTun) Read(bufs [][]byte, sizes []int, offset int) (int, error) {
  559. tun.readOpMu.Lock()
  560. defer tun.readOpMu.Unlock()
  561. select {
  562. case err := <-tun.errors:
  563. return 0, err
  564. default:
  565. readInto := bufs[0][offset:]
  566. if tun.vnetHdr {
  567. readInto = tun.readBuff[:]
  568. }
  569. n, err := tun.tunFile.Read(readInto)
  570. if errors.Is(err, syscall.EBADFD) {
  571. err = os.ErrClosed
  572. }
  573. if err != nil {
  574. return 0, err
  575. }
  576. if tun.vnetHdr {
  577. return handleVirtioRead(readInto[:n], bufs, sizes, offset)
  578. }
  579. sizes[0] = n
  580. return 1, nil
  581. }
  582. }