tcp_offload_linux.go 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. //go:build linux
  2. // SPDX-License-Identifier: MIT
  3. //
  4. // Copyright (C) 2017-2023 WireGuard LLC. All Rights Reserved.
  5. package tun
  6. import (
  7. "bytes"
  8. "encoding/binary"
  9. "errors"
  10. "io"
  11. "unsafe"
  12. wgconn "github.com/slackhq/nebula/wgstack/conn"
  13. "golang.org/x/sys/unix"
  14. )
  15. var ErrTooManySegments = errors.New("tun: too many segments for TSO")
  16. const tcpFlagsOffset = 13
  17. const (
  18. tcpFlagFIN uint8 = 0x01
  19. tcpFlagPSH uint8 = 0x08
  20. tcpFlagACK uint8 = 0x10
  21. )
  22. // virtioNetHdr is defined in the kernel in include/uapi/linux/virtio_net.h. The
  23. // kernel symbol is virtio_net_hdr.
  24. type virtioNetHdr struct {
  25. flags uint8
  26. gsoType uint8
  27. hdrLen uint16
  28. gsoSize uint16
  29. csumStart uint16
  30. csumOffset uint16
  31. }
  32. func (v *virtioNetHdr) decode(b []byte) error {
  33. if len(b) < virtioNetHdrLen {
  34. return io.ErrShortBuffer
  35. }
  36. copy(unsafe.Slice((*byte)(unsafe.Pointer(v)), virtioNetHdrLen), b[:virtioNetHdrLen])
  37. return nil
  38. }
  39. func (v *virtioNetHdr) encode(b []byte) error {
  40. if len(b) < virtioNetHdrLen {
  41. return io.ErrShortBuffer
  42. }
  43. copy(b[:virtioNetHdrLen], unsafe.Slice((*byte)(unsafe.Pointer(v)), virtioNetHdrLen))
  44. return nil
  45. }
  46. const (
  47. // virtioNetHdrLen is the length in bytes of virtioNetHdr. This matches the
  48. // shape of the C ABI for its kernel counterpart -- sizeof(virtio_net_hdr).
  49. virtioNetHdrLen = int(unsafe.Sizeof(virtioNetHdr{}))
  50. )
  51. // flowKey represents the key for a flow.
  52. type flowKey struct {
  53. srcAddr, dstAddr [16]byte
  54. srcPort, dstPort uint16
  55. rxAck uint32 // varying ack values should not be coalesced. Treat them as separate flows.
  56. }
  57. // tcpGROTable holds flow and coalescing information for the purposes of GRO.
  58. type tcpGROTable struct {
  59. itemsByFlow map[flowKey][]tcpGROItem
  60. itemsPool [][]tcpGROItem
  61. }
  62. func newTCPGROTable() *tcpGROTable {
  63. t := &tcpGROTable{
  64. itemsByFlow: make(map[flowKey][]tcpGROItem, wgconn.IdealBatchSize),
  65. itemsPool: make([][]tcpGROItem, wgconn.IdealBatchSize),
  66. }
  67. for i := range t.itemsPool {
  68. t.itemsPool[i] = make([]tcpGROItem, 0, wgconn.IdealBatchSize)
  69. }
  70. return t
  71. }
  72. func newFlowKey(pkt []byte, srcAddr, dstAddr, tcphOffset int) flowKey {
  73. key := flowKey{}
  74. addrSize := dstAddr - srcAddr
  75. copy(key.srcAddr[:], pkt[srcAddr:dstAddr])
  76. copy(key.dstAddr[:], pkt[dstAddr:dstAddr+addrSize])
  77. key.srcPort = binary.BigEndian.Uint16(pkt[tcphOffset:])
  78. key.dstPort = binary.BigEndian.Uint16(pkt[tcphOffset+2:])
  79. key.rxAck = binary.BigEndian.Uint32(pkt[tcphOffset+8:])
  80. return key
  81. }
  82. // lookupOrInsert looks up a flow for the provided packet and metadata,
  83. // returning the packets found for the flow, or inserting a new one if none
  84. // is found.
  85. func (t *tcpGROTable) lookupOrInsert(pkt []byte, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex int) ([]tcpGROItem, bool) {
  86. key := newFlowKey(pkt, srcAddrOffset, dstAddrOffset, tcphOffset)
  87. items, ok := t.itemsByFlow[key]
  88. if ok {
  89. return items, ok
  90. }
  91. // TODO: insert() performs another map lookup. This could be rearranged to avoid.
  92. t.insert(pkt, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex)
  93. return nil, false
  94. }
  95. // insert an item in the table for the provided packet and packet metadata.
  96. func (t *tcpGROTable) insert(pkt []byte, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex int) {
  97. key := newFlowKey(pkt, srcAddrOffset, dstAddrOffset, tcphOffset)
  98. item := tcpGROItem{
  99. key: key,
  100. bufsIndex: uint16(bufsIndex),
  101. gsoSize: uint16(len(pkt[tcphOffset+tcphLen:])),
  102. iphLen: uint8(tcphOffset),
  103. tcphLen: uint8(tcphLen),
  104. sentSeq: binary.BigEndian.Uint32(pkt[tcphOffset+4:]),
  105. pshSet: pkt[tcphOffset+tcpFlagsOffset]&tcpFlagPSH != 0,
  106. }
  107. items, ok := t.itemsByFlow[key]
  108. if !ok {
  109. items = t.newItems()
  110. }
  111. items = append(items, item)
  112. t.itemsByFlow[key] = items
  113. }
  114. func (t *tcpGROTable) updateAt(item tcpGROItem, i int) {
  115. items, _ := t.itemsByFlow[item.key]
  116. items[i] = item
  117. }
  118. func (t *tcpGROTable) deleteAt(key flowKey, i int) {
  119. items, _ := t.itemsByFlow[key]
  120. items = append(items[:i], items[i+1:]...)
  121. t.itemsByFlow[key] = items
  122. }
  123. // tcpGROItem represents bookkeeping data for a TCP packet during the lifetime
  124. // of a GRO evaluation across a vector of packets.
  125. type tcpGROItem struct {
  126. key flowKey
  127. sentSeq uint32 // the sequence number
  128. bufsIndex uint16 // the index into the original bufs slice
  129. numMerged uint16 // the number of packets merged into this item
  130. gsoSize uint16 // payload size
  131. iphLen uint8 // ip header len
  132. tcphLen uint8 // tcp header len
  133. pshSet bool // psh flag is set
  134. }
  135. func (t *tcpGROTable) newItems() []tcpGROItem {
  136. var items []tcpGROItem
  137. items, t.itemsPool = t.itemsPool[len(t.itemsPool)-1], t.itemsPool[:len(t.itemsPool)-1]
  138. return items
  139. }
  140. func (t *tcpGROTable) reset() {
  141. for k, items := range t.itemsByFlow {
  142. items = items[:0]
  143. t.itemsPool = append(t.itemsPool, items)
  144. delete(t.itemsByFlow, k)
  145. }
  146. }
  147. // canCoalesce represents the outcome of checking if two TCP packets are
  148. // candidates for coalescing.
  149. type canCoalesce int
  150. const (
  151. coalescePrepend canCoalesce = -1
  152. coalesceUnavailable canCoalesce = 0
  153. coalesceAppend canCoalesce = 1
  154. )
  155. // tcpPacketsCanCoalesce evaluates if pkt can be coalesced with the packet
  156. // described by item. This function makes considerations that match the kernel's
  157. // GRO self tests, which can be found in tools/testing/selftests/net/gro.c.
  158. func tcpPacketsCanCoalesce(pkt []byte, iphLen, tcphLen uint8, seq uint32, pshSet bool, gsoSize uint16, item tcpGROItem, bufs [][]byte, bufsOffset int) canCoalesce {
  159. pktTarget := bufs[item.bufsIndex][bufsOffset:]
  160. if tcphLen != item.tcphLen {
  161. // cannot coalesce with unequal tcp options len
  162. return coalesceUnavailable
  163. }
  164. if tcphLen > 20 {
  165. if !bytes.Equal(pkt[iphLen+20:iphLen+tcphLen], pktTarget[item.iphLen+20:iphLen+tcphLen]) {
  166. // cannot coalesce with unequal tcp options
  167. return coalesceUnavailable
  168. }
  169. }
  170. if pkt[0]>>4 == 6 {
  171. if pkt[0] != pktTarget[0] || pkt[1]>>4 != pktTarget[1]>>4 {
  172. // cannot coalesce with unequal Traffic class values
  173. return coalesceUnavailable
  174. }
  175. if pkt[7] != pktTarget[7] {
  176. // cannot coalesce with unequal Hop limit values
  177. return coalesceUnavailable
  178. }
  179. } else {
  180. if pkt[1] != pktTarget[1] {
  181. // cannot coalesce with unequal ToS values
  182. return coalesceUnavailable
  183. }
  184. if pkt[6]>>5 != pktTarget[6]>>5 {
  185. // cannot coalesce with unequal DF or reserved bits. MF is checked
  186. // further up the stack.
  187. return coalesceUnavailable
  188. }
  189. if pkt[8] != pktTarget[8] {
  190. // cannot coalesce with unequal TTL values
  191. return coalesceUnavailable
  192. }
  193. }
  194. // seq adjacency
  195. lhsLen := item.gsoSize
  196. lhsLen += item.numMerged * item.gsoSize
  197. if seq == item.sentSeq+uint32(lhsLen) { // pkt aligns following item from a seq num perspective
  198. if item.pshSet {
  199. // We cannot append to a segment that has the PSH flag set, PSH
  200. // can only be set on the final segment in a reassembled group.
  201. return coalesceUnavailable
  202. }
  203. if len(pktTarget[iphLen+tcphLen:])%int(item.gsoSize) != 0 {
  204. // A smaller than gsoSize packet has been appended previously.
  205. // Nothing can come after a smaller packet on the end.
  206. return coalesceUnavailable
  207. }
  208. if gsoSize > item.gsoSize {
  209. // We cannot have a larger packet following a smaller one.
  210. return coalesceUnavailable
  211. }
  212. return coalesceAppend
  213. } else if seq+uint32(gsoSize) == item.sentSeq { // pkt aligns in front of item from a seq num perspective
  214. if pshSet {
  215. // We cannot prepend with a segment that has the PSH flag set, PSH
  216. // can only be set on the final segment in a reassembled group.
  217. return coalesceUnavailable
  218. }
  219. if gsoSize < item.gsoSize {
  220. // We cannot have a larger packet following a smaller one.
  221. return coalesceUnavailable
  222. }
  223. if gsoSize > item.gsoSize && item.numMerged > 0 {
  224. // There's at least one previous merge, and we're larger than all
  225. // previous. This would put multiple smaller packets on the end.
  226. return coalesceUnavailable
  227. }
  228. return coalescePrepend
  229. }
  230. return coalesceUnavailable
  231. }
  232. func tcpChecksumValid(pkt []byte, iphLen uint8, isV6 bool) bool {
  233. srcAddrAt := ipv4SrcAddrOffset
  234. addrSize := 4
  235. if isV6 {
  236. srcAddrAt = ipv6SrcAddrOffset
  237. addrSize = 16
  238. }
  239. tcpTotalLen := uint16(len(pkt) - int(iphLen))
  240. tcpCSumNoFold := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, pkt[srcAddrAt:srcAddrAt+addrSize], pkt[srcAddrAt+addrSize:srcAddrAt+addrSize*2], tcpTotalLen)
  241. return ^checksum(pkt[iphLen:], tcpCSumNoFold) == 0
  242. }
  243. // coalesceResult represents the result of attempting to coalesce two TCP
  244. // packets.
  245. type coalesceResult int
  246. const (
  247. coalesceInsufficientCap coalesceResult = 0
  248. coalescePSHEnding coalesceResult = 1
  249. coalesceItemInvalidCSum coalesceResult = 2
  250. coalescePktInvalidCSum coalesceResult = 3
  251. coalesceSuccess coalesceResult = 4
  252. )
  253. // coalesceTCPPackets attempts to coalesce pkt with the packet described by
  254. // item, returning the outcome. This function may swap bufs elements in the
  255. // event of a prepend as item's bufs index is already being tracked for writing
  256. // to a Device.
  257. func coalesceTCPPackets(mode canCoalesce, pkt []byte, pktBuffsIndex int, gsoSize uint16, seq uint32, pshSet bool, item *tcpGROItem, bufs [][]byte, bufsOffset int, isV6 bool) coalesceResult {
  258. var pktHead []byte // the packet that will end up at the front
  259. headersLen := item.iphLen + item.tcphLen
  260. coalescedLen := len(bufs[item.bufsIndex][bufsOffset:]) + len(pkt) - int(headersLen)
  261. // Copy data
  262. if mode == coalescePrepend {
  263. pktHead = pkt
  264. if cap(pkt)-bufsOffset < coalescedLen {
  265. // We don't want to allocate a new underlying array if capacity is
  266. // too small.
  267. return coalesceInsufficientCap
  268. }
  269. if pshSet {
  270. return coalescePSHEnding
  271. }
  272. if item.numMerged == 0 {
  273. if !tcpChecksumValid(bufs[item.bufsIndex][bufsOffset:], item.iphLen, isV6) {
  274. return coalesceItemInvalidCSum
  275. }
  276. }
  277. if !tcpChecksumValid(pkt, item.iphLen, isV6) {
  278. return coalescePktInvalidCSum
  279. }
  280. item.sentSeq = seq
  281. extendBy := coalescedLen - len(pktHead)
  282. bufs[pktBuffsIndex] = append(bufs[pktBuffsIndex], make([]byte, extendBy)...)
  283. copy(bufs[pktBuffsIndex][bufsOffset+len(pkt):], bufs[item.bufsIndex][bufsOffset+int(headersLen):])
  284. // Flip the slice headers in bufs as part of prepend. The index of item
  285. // is already being tracked for writing.
  286. bufs[item.bufsIndex], bufs[pktBuffsIndex] = bufs[pktBuffsIndex], bufs[item.bufsIndex]
  287. } else {
  288. pktHead = bufs[item.bufsIndex][bufsOffset:]
  289. if cap(pktHead)-bufsOffset < coalescedLen {
  290. // We don't want to allocate a new underlying array if capacity is
  291. // too small.
  292. return coalesceInsufficientCap
  293. }
  294. if item.numMerged == 0 {
  295. if !tcpChecksumValid(bufs[item.bufsIndex][bufsOffset:], item.iphLen, isV6) {
  296. return coalesceItemInvalidCSum
  297. }
  298. }
  299. if !tcpChecksumValid(pkt, item.iphLen, isV6) {
  300. return coalescePktInvalidCSum
  301. }
  302. if pshSet {
  303. // We are appending a segment with PSH set.
  304. item.pshSet = pshSet
  305. pktHead[item.iphLen+tcpFlagsOffset] |= tcpFlagPSH
  306. }
  307. extendBy := len(pkt) - int(headersLen)
  308. bufs[item.bufsIndex] = append(bufs[item.bufsIndex], make([]byte, extendBy)...)
  309. copy(bufs[item.bufsIndex][bufsOffset+len(pktHead):], pkt[headersLen:])
  310. }
  311. if gsoSize > item.gsoSize {
  312. item.gsoSize = gsoSize
  313. }
  314. hdr := virtioNetHdr{
  315. flags: unix.VIRTIO_NET_HDR_F_NEEDS_CSUM, // this turns into CHECKSUM_PARTIAL in the skb
  316. hdrLen: uint16(headersLen),
  317. gsoSize: uint16(item.gsoSize),
  318. csumStart: uint16(item.iphLen),
  319. csumOffset: 16,
  320. }
  321. // Recalculate the total len (IPv4) or payload len (IPv6). Recalculate the
  322. // (IPv4) header checksum.
  323. if isV6 {
  324. hdr.gsoType = unix.VIRTIO_NET_HDR_GSO_TCPV6
  325. binary.BigEndian.PutUint16(pktHead[4:], uint16(coalescedLen)-uint16(item.iphLen)) // set new payload len
  326. } else {
  327. hdr.gsoType = unix.VIRTIO_NET_HDR_GSO_TCPV4
  328. pktHead[10], pktHead[11] = 0, 0 // clear checksum field
  329. binary.BigEndian.PutUint16(pktHead[2:], uint16(coalescedLen)) // set new total length
  330. iphCSum := ^checksum(pktHead[:item.iphLen], 0) // compute checksum
  331. binary.BigEndian.PutUint16(pktHead[10:], iphCSum) // set checksum field
  332. }
  333. hdr.encode(bufs[item.bufsIndex][bufsOffset-virtioNetHdrLen:])
  334. // Calculate the pseudo header checksum and place it at the TCP checksum
  335. // offset. Downstream checksum offloading will combine this with computation
  336. // of the tcp header and payload checksum.
  337. addrLen := 4
  338. addrOffset := ipv4SrcAddrOffset
  339. if isV6 {
  340. addrLen = 16
  341. addrOffset = ipv6SrcAddrOffset
  342. }
  343. srcAddrAt := bufsOffset + addrOffset
  344. srcAddr := bufs[item.bufsIndex][srcAddrAt : srcAddrAt+addrLen]
  345. dstAddr := bufs[item.bufsIndex][srcAddrAt+addrLen : srcAddrAt+addrLen*2]
  346. psum := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, srcAddr, dstAddr, uint16(coalescedLen-int(item.iphLen)))
  347. binary.BigEndian.PutUint16(pktHead[hdr.csumStart+hdr.csumOffset:], checksum([]byte{}, psum))
  348. item.numMerged++
  349. return coalesceSuccess
  350. }
  351. const (
  352. ipv4FlagMoreFragments uint8 = 0x20
  353. )
  354. const (
  355. ipv4SrcAddrOffset = 12
  356. ipv6SrcAddrOffset = 8
  357. maxUint16 = 1<<16 - 1
  358. )
  359. // tcpGRO evaluates the TCP packet at pktI in bufs for coalescing with
  360. // existing packets tracked in table. It will return false when pktI is not
  361. // coalesced, otherwise true. This indicates to the caller if bufs[pktI]
  362. // should be written to the Device.
  363. func tcpGRO(bufs [][]byte, offset int, pktI int, table *tcpGROTable, isV6 bool) (pktCoalesced bool) {
  364. pkt := bufs[pktI][offset:]
  365. if len(pkt) > maxUint16 {
  366. // A valid IPv4 or IPv6 packet will never exceed this.
  367. return false
  368. }
  369. iphLen := int((pkt[0] & 0x0F) * 4)
  370. if isV6 {
  371. iphLen = 40
  372. ipv6HPayloadLen := int(binary.BigEndian.Uint16(pkt[4:]))
  373. if ipv6HPayloadLen != len(pkt)-iphLen {
  374. return false
  375. }
  376. } else {
  377. totalLen := int(binary.BigEndian.Uint16(pkt[2:]))
  378. if totalLen != len(pkt) {
  379. return false
  380. }
  381. }
  382. if len(pkt) < iphLen {
  383. return false
  384. }
  385. tcphLen := int((pkt[iphLen+12] >> 4) * 4)
  386. if tcphLen < 20 || tcphLen > 60 {
  387. return false
  388. }
  389. if len(pkt) < iphLen+tcphLen {
  390. return false
  391. }
  392. if !isV6 {
  393. if pkt[6]&ipv4FlagMoreFragments != 0 || pkt[6]<<3 != 0 || pkt[7] != 0 {
  394. // no GRO support for fragmented segments for now
  395. return false
  396. }
  397. }
  398. tcpFlags := pkt[iphLen+tcpFlagsOffset]
  399. var pshSet bool
  400. // not a candidate if any non-ACK flags (except PSH+ACK) are set
  401. if tcpFlags != tcpFlagACK {
  402. if pkt[iphLen+tcpFlagsOffset] != tcpFlagACK|tcpFlagPSH {
  403. return false
  404. }
  405. pshSet = true
  406. }
  407. gsoSize := uint16(len(pkt) - tcphLen - iphLen)
  408. // not a candidate if payload len is 0
  409. if gsoSize < 1 {
  410. return false
  411. }
  412. seq := binary.BigEndian.Uint32(pkt[iphLen+4:])
  413. srcAddrOffset := ipv4SrcAddrOffset
  414. addrLen := 4
  415. if isV6 {
  416. srcAddrOffset = ipv6SrcAddrOffset
  417. addrLen = 16
  418. }
  419. items, existing := table.lookupOrInsert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, tcphLen, pktI)
  420. if !existing {
  421. return false
  422. }
  423. for i := len(items) - 1; i >= 0; i-- {
  424. // In the best case of packets arriving in order iterating in reverse is
  425. // more efficient if there are multiple items for a given flow. This
  426. // also enables a natural table.deleteAt() in the
  427. // coalesceItemInvalidCSum case without the need for index tracking.
  428. // This algorithm makes a best effort to coalesce in the event of
  429. // unordered packets, where pkt may land anywhere in items from a
  430. // sequence number perspective, however once an item is inserted into
  431. // the table it is never compared across other items later.
  432. item := items[i]
  433. can := tcpPacketsCanCoalesce(pkt, uint8(iphLen), uint8(tcphLen), seq, pshSet, gsoSize, item, bufs, offset)
  434. if can != coalesceUnavailable {
  435. result := coalesceTCPPackets(can, pkt, pktI, gsoSize, seq, pshSet, &item, bufs, offset, isV6)
  436. switch result {
  437. case coalesceSuccess:
  438. table.updateAt(item, i)
  439. return true
  440. case coalesceItemInvalidCSum:
  441. // delete the item with an invalid csum
  442. table.deleteAt(item.key, i)
  443. case coalescePktInvalidCSum:
  444. // no point in inserting an item that we can't coalesce
  445. return false
  446. default:
  447. }
  448. }
  449. }
  450. // failed to coalesce with any other packets; store the item in the flow
  451. table.insert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, tcphLen, pktI)
  452. return false
  453. }
  454. func isTCP4NoIPOptions(b []byte) bool {
  455. if len(b) < 40 {
  456. return false
  457. }
  458. if b[0]>>4 != 4 {
  459. return false
  460. }
  461. if b[0]&0x0F != 5 {
  462. return false
  463. }
  464. if b[9] != unix.IPPROTO_TCP {
  465. return false
  466. }
  467. return true
  468. }
  469. func isTCP6NoEH(b []byte) bool {
  470. if len(b) < 60 {
  471. return false
  472. }
  473. if b[0]>>4 != 6 {
  474. return false
  475. }
  476. if b[6] != unix.IPPROTO_TCP {
  477. return false
  478. }
  479. return true
  480. }
  481. // handleGRO evaluates bufs for GRO, and writes the indices of the resulting
  482. // packets into toWrite. toWrite, tcp4Table, and tcp6Table should initially be
  483. // empty (but non-nil), and are passed in to save allocs as the caller may reset
  484. // and recycle them across vectors of packets.
  485. func handleGRO(bufs [][]byte, offset int, tcp4Table, tcp6Table *tcpGROTable, toWrite *[]int) error {
  486. for i := range bufs {
  487. if offset < virtioNetHdrLen || offset > len(bufs[i])-1 {
  488. return errors.New("invalid offset")
  489. }
  490. var coalesced bool
  491. switch {
  492. case isTCP4NoIPOptions(bufs[i][offset:]): // ipv4 packets w/IP options do not coalesce
  493. coalesced = tcpGRO(bufs, offset, i, tcp4Table, false)
  494. case isTCP6NoEH(bufs[i][offset:]): // ipv6 packets w/extension headers do not coalesce
  495. coalesced = tcpGRO(bufs, offset, i, tcp6Table, true)
  496. }
  497. if !coalesced {
  498. hdr := virtioNetHdr{}
  499. err := hdr.encode(bufs[i][offset-virtioNetHdrLen:])
  500. if err != nil {
  501. return err
  502. }
  503. *toWrite = append(*toWrite, i)
  504. }
  505. }
  506. return nil
  507. }
  508. // tcpTSO splits packets from in into outBuffs, writing the size of each
  509. // element into sizes. It returns the number of buffers populated, and/or an
  510. // error.
  511. func tcpTSO(in []byte, hdr virtioNetHdr, outBuffs [][]byte, sizes []int, outOffset int) (int, error) {
  512. iphLen := int(hdr.csumStart)
  513. srcAddrOffset := ipv6SrcAddrOffset
  514. addrLen := 16
  515. if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_TCPV4 {
  516. in[10], in[11] = 0, 0 // clear ipv4 header checksum
  517. srcAddrOffset = ipv4SrcAddrOffset
  518. addrLen = 4
  519. }
  520. tcpCSumAt := int(hdr.csumStart + hdr.csumOffset)
  521. in[tcpCSumAt], in[tcpCSumAt+1] = 0, 0 // clear tcp checksum
  522. firstTCPSeqNum := binary.BigEndian.Uint32(in[hdr.csumStart+4:])
  523. nextSegmentDataAt := int(hdr.hdrLen)
  524. i := 0
  525. for ; nextSegmentDataAt < len(in); i++ {
  526. if i == len(outBuffs) {
  527. return i - 1, ErrTooManySegments
  528. }
  529. nextSegmentEnd := nextSegmentDataAt + int(hdr.gsoSize)
  530. if nextSegmentEnd > len(in) {
  531. nextSegmentEnd = len(in)
  532. }
  533. segmentDataLen := nextSegmentEnd - nextSegmentDataAt
  534. totalLen := int(hdr.hdrLen) + segmentDataLen
  535. sizes[i] = totalLen
  536. out := outBuffs[i][outOffset:]
  537. copy(out, in[:iphLen])
  538. if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_TCPV4 {
  539. // For IPv4 we are responsible for incrementing the ID field,
  540. // updating the total len field, and recalculating the header
  541. // checksum.
  542. if i > 0 {
  543. id := binary.BigEndian.Uint16(out[4:])
  544. id += uint16(i)
  545. binary.BigEndian.PutUint16(out[4:], id)
  546. }
  547. binary.BigEndian.PutUint16(out[2:], uint16(totalLen))
  548. ipv4CSum := ^checksum(out[:iphLen], 0)
  549. binary.BigEndian.PutUint16(out[10:], ipv4CSum)
  550. } else {
  551. // For IPv6 we are responsible for updating the payload length field.
  552. binary.BigEndian.PutUint16(out[4:], uint16(totalLen-iphLen))
  553. }
  554. // TCP header
  555. copy(out[hdr.csumStart:hdr.hdrLen], in[hdr.csumStart:hdr.hdrLen])
  556. tcpSeq := firstTCPSeqNum + uint32(hdr.gsoSize*uint16(i))
  557. binary.BigEndian.PutUint32(out[hdr.csumStart+4:], tcpSeq)
  558. if nextSegmentEnd != len(in) {
  559. // FIN and PSH should only be set on last segment
  560. clearFlags := tcpFlagFIN | tcpFlagPSH
  561. out[hdr.csumStart+tcpFlagsOffset] &^= clearFlags
  562. }
  563. // payload
  564. copy(out[hdr.hdrLen:], in[nextSegmentDataAt:nextSegmentEnd])
  565. // TCP checksum
  566. tcpHLen := int(hdr.hdrLen - hdr.csumStart)
  567. tcpLenForPseudo := uint16(tcpHLen + segmentDataLen)
  568. tcpCSumNoFold := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, in[srcAddrOffset:srcAddrOffset+addrLen], in[srcAddrOffset+addrLen:srcAddrOffset+addrLen*2], tcpLenForPseudo)
  569. tcpCSum := ^checksum(out[hdr.csumStart:totalLen], tcpCSumNoFold)
  570. binary.BigEndian.PutUint16(out[hdr.csumStart+hdr.csumOffset:], tcpCSum)
  571. nextSegmentDataAt += int(hdr.gsoSize)
  572. }
  573. return i, nil
  574. }
  575. func gsoNoneChecksum(in []byte, cSumStart, cSumOffset uint16) error {
  576. cSumAt := cSumStart + cSumOffset
  577. // The initial value at the checksum offset should be summed with the
  578. // checksum we compute. This is typically the pseudo-header checksum.
  579. initial := binary.BigEndian.Uint16(in[cSumAt:])
  580. in[cSumAt], in[cSumAt+1] = 0, 0
  581. binary.BigEndian.PutUint16(in[cSumAt:], ^checksum(in[cSumStart:], uint64(initial)))
  582. return nil
  583. }