connection_manager.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510
  1. package nebula
import (
	"bytes"
	"context"
	"encoding/binary"
	"errors"
	"net/netip"
	"time"

	"github.com/rcrowley/go-metrics"
	"github.com/sirupsen/logrus"
	"github.com/slackhq/nebula/cert"
	"github.com/slackhq/nebula/header"
)
  13. type trafficDecision int
  14. const (
  15. doNothing trafficDecision = 0
  16. deleteTunnel trafficDecision = 1 // delete the hostinfo on our side, do not notify the remote
  17. closeTunnel trafficDecision = 2 // delete the hostinfo and notify the remote
  18. swapPrimary trafficDecision = 3
  19. migrateRelays trafficDecision = 4
  20. tryRehandshake trafficDecision = 5
  21. sendTestPacket trafficDecision = 6
  22. )
  23. type connectionManager struct {
  24. in map[uint32]struct{}
  25. inLock syncRWMutex
  26. out map[uint32]struct{}
  27. outLock syncRWMutex
  28. // relayUsed holds which relay localIndexs are in use
  29. relayUsed map[uint32]struct{}
  30. relayUsedLock syncRWMutex
  31. hostMap *HostMap
  32. trafficTimer *LockingTimerWheel[uint32]
  33. intf *Interface
  34. pendingDeletion map[uint32]struct{}
  35. punchy *Punchy
  36. checkInterval time.Duration
  37. pendingDeletionInterval time.Duration
  38. metricsTxPunchy metrics.Counter
  39. l *logrus.Logger
  40. }
  41. func newConnectionManager(ctx context.Context, l *logrus.Logger, intf *Interface, checkInterval, pendingDeletionInterval time.Duration, punchy *Punchy) *connectionManager {
  42. var max time.Duration
  43. if checkInterval < pendingDeletionInterval {
  44. max = pendingDeletionInterval
  45. } else {
  46. max = checkInterval
  47. }
  48. nc := &connectionManager{
  49. hostMap: intf.hostMap,
  50. in: make(map[uint32]struct{}),
  51. inLock: newSyncRWMutex("connection-manager-in"),
  52. out: make(map[uint32]struct{}),
  53. outLock: newSyncRWMutex("connection-manager-out"),
  54. relayUsed: make(map[uint32]struct{}),
  55. relayUsedLock: newSyncRWMutex("connection-manager-relay-used"),
  56. trafficTimer: NewLockingTimerWheel[uint32]("connection-manager-timer", time.Millisecond*500, max),
  57. intf: intf,
  58. pendingDeletion: make(map[uint32]struct{}),
  59. checkInterval: checkInterval,
  60. pendingDeletionInterval: pendingDeletionInterval,
  61. punchy: punchy,
  62. metricsTxPunchy: metrics.GetOrRegisterCounter("messages.tx.punchy", nil),
  63. l: l,
  64. }
  65. nc.Start(ctx)
  66. return nc
  67. }
  68. func (n *connectionManager) In(localIndex uint32) {
  69. n.inLock.RLock()
  70. // If this already exists, return
  71. if _, ok := n.in[localIndex]; ok {
  72. n.inLock.RUnlock()
  73. return
  74. }
  75. n.inLock.RUnlock()
  76. n.inLock.Lock()
  77. n.in[localIndex] = struct{}{}
  78. n.inLock.Unlock()
  79. }
  80. func (n *connectionManager) Out(localIndex uint32) {
  81. n.outLock.RLock()
  82. // If this already exists, return
  83. if _, ok := n.out[localIndex]; ok {
  84. n.outLock.RUnlock()
  85. return
  86. }
  87. n.outLock.RUnlock()
  88. n.outLock.Lock()
  89. n.out[localIndex] = struct{}{}
  90. n.outLock.Unlock()
  91. }
  92. func (n *connectionManager) RelayUsed(localIndex uint32) {
  93. n.relayUsedLock.RLock()
  94. // If this already exists, return
  95. if _, ok := n.relayUsed[localIndex]; ok {
  96. n.relayUsedLock.RUnlock()
  97. return
  98. }
  99. n.relayUsedLock.RUnlock()
  100. n.relayUsedLock.Lock()
  101. n.relayUsed[localIndex] = struct{}{}
  102. n.relayUsedLock.Unlock()
  103. }
  104. // getAndResetTrafficCheck returns if there was any inbound or outbound traffic within the last tick and
  105. // resets the state for this local index
  106. func (n *connectionManager) getAndResetTrafficCheck(localIndex uint32) (bool, bool) {
  107. n.inLock.Lock()
  108. n.outLock.Lock()
  109. _, in := n.in[localIndex]
  110. _, out := n.out[localIndex]
  111. delete(n.in, localIndex)
  112. delete(n.out, localIndex)
  113. n.inLock.Unlock()
  114. n.outLock.Unlock()
  115. return in, out
  116. }
  117. func (n *connectionManager) AddTrafficWatch(localIndex uint32) {
  118. // Use a write lock directly because it should be incredibly rare that we are ever already tracking this index
  119. n.outLock.Lock()
  120. if _, ok := n.out[localIndex]; ok {
  121. n.outLock.Unlock()
  122. return
  123. }
  124. n.out[localIndex] = struct{}{}
  125. n.trafficTimer.Add(localIndex, n.checkInterval)
  126. n.outLock.Unlock()
  127. }
  128. func (n *connectionManager) Start(ctx context.Context) {
  129. go n.Run(ctx)
  130. }
  131. func (n *connectionManager) Run(ctx context.Context) {
  132. //TODO: this tick should be based on the min wheel tick? Check firewall
  133. clockSource := time.NewTicker(500 * time.Millisecond)
  134. defer clockSource.Stop()
  135. p := []byte("")
  136. nb := make([]byte, 12, 12)
  137. out := make([]byte, mtu)
  138. for {
  139. select {
  140. case <-ctx.Done():
  141. return
  142. case now := <-clockSource.C:
  143. n.trafficTimer.Advance(now)
  144. for {
  145. localIndex, has := n.trafficTimer.Purge()
  146. if !has {
  147. break
  148. }
  149. n.doTrafficCheck(localIndex, p, nb, out, now)
  150. }
  151. }
  152. }
  153. }
  154. func (n *connectionManager) doTrafficCheck(localIndex uint32, p, nb, out []byte, now time.Time) {
  155. decision, hostinfo, primary := n.makeTrafficDecision(localIndex, now)
  156. switch decision {
  157. case deleteTunnel:
  158. if n.hostMap.DeleteHostInfo(hostinfo) {
  159. // Only clearing the lighthouse cache if this is the last hostinfo for this vpn ip in the hostmap
  160. n.intf.lightHouse.DeleteVpnAddrs(hostinfo.vpnAddrs)
  161. }
  162. case closeTunnel:
  163. n.intf.sendCloseTunnel(hostinfo)
  164. n.intf.closeTunnel(hostinfo)
  165. case swapPrimary:
  166. n.swapPrimary(hostinfo, primary)
  167. case migrateRelays:
  168. n.migrateRelayUsed(hostinfo, primary)
  169. case tryRehandshake:
  170. n.tryRehandshake(hostinfo)
  171. case sendTestPacket:
  172. n.intf.SendMessageToHostInfo(header.Test, header.TestRequest, hostinfo, p, nb, out)
  173. }
  174. n.resetRelayTrafficCheck(hostinfo)
  175. }
  176. func (n *connectionManager) resetRelayTrafficCheck(hostinfo *HostInfo) {
  177. if hostinfo != nil {
  178. n.relayUsedLock.Lock()
  179. defer n.relayUsedLock.Unlock()
  180. // No need to migrate any relays, delete usage info now.
  181. for _, idx := range hostinfo.relayState.CopyRelayForIdxs() {
  182. delete(n.relayUsed, idx)
  183. }
  184. }
  185. }
  186. func (n *connectionManager) migrateRelayUsed(oldhostinfo, newhostinfo *HostInfo) {
  187. relayFor := oldhostinfo.relayState.CopyAllRelayFor()
  188. for _, r := range relayFor {
  189. existing, ok := newhostinfo.relayState.QueryRelayForByIp(r.PeerAddr)
  190. var index uint32
  191. var relayFrom netip.Addr
  192. var relayTo netip.Addr
  193. switch {
  194. case ok && existing.State == Established:
  195. // This relay already exists in newhostinfo, then do nothing.
  196. continue
  197. case ok && existing.State == Requested:
  198. // The relay exists in a Requested state; re-send the request
  199. index = existing.LocalIndex
  200. switch r.Type {
  201. case TerminalType:
  202. relayFrom = n.intf.myVpnAddrs[0]
  203. relayTo = existing.PeerAddr
  204. case ForwardingType:
  205. relayFrom = existing.PeerAddr
  206. relayTo = newhostinfo.vpnAddrs[0]
  207. default:
  208. // should never happen
  209. }
  210. case !ok:
  211. n.relayUsedLock.RLock()
  212. if _, relayUsed := n.relayUsed[r.LocalIndex]; !relayUsed {
  213. // The relay hasn't been used; don't migrate it.
  214. n.relayUsedLock.RUnlock()
  215. continue
  216. }
  217. n.relayUsedLock.RUnlock()
  218. // The relay doesn't exist at all; create some relay state and send the request.
  219. var err error
  220. index, err = AddRelay(n.l, newhostinfo, n.hostMap, r.PeerAddr, nil, r.Type, Requested)
  221. if err != nil {
  222. n.l.WithError(err).Error("failed to migrate relay to new hostinfo")
  223. continue
  224. }
  225. switch r.Type {
  226. case TerminalType:
  227. relayFrom = n.intf.myVpnAddrs[0]
  228. relayTo = r.PeerAddr
  229. case ForwardingType:
  230. relayFrom = r.PeerAddr
  231. relayTo = newhostinfo.vpnAddrs[0]
  232. default:
  233. // should never happen
  234. }
  235. }
  236. // Send a CreateRelayRequest to the peer.
  237. req := NebulaControl{
  238. Type: NebulaControl_CreateRelayRequest,
  239. InitiatorRelayIndex: index,
  240. }
  241. switch newhostinfo.GetCert().Certificate.Version() {
  242. case cert.Version1:
  243. if !relayFrom.Is4() {
  244. n.l.Error("can not migrate v1 relay with a v6 network because the relay is not running a current nebula version")
  245. continue
  246. }
  247. if !relayTo.Is4() {
  248. n.l.Error("can not migrate v1 relay with a v6 remote network because the relay is not running a current nebula version")
  249. continue
  250. }
  251. b := relayFrom.As4()
  252. req.OldRelayFromAddr = binary.BigEndian.Uint32(b[:])
  253. b = relayTo.As4()
  254. req.OldRelayToAddr = binary.BigEndian.Uint32(b[:])
  255. case cert.Version2:
  256. req.RelayFromAddr = netAddrToProtoAddr(relayFrom)
  257. req.RelayToAddr = netAddrToProtoAddr(relayTo)
  258. default:
  259. newhostinfo.logger(n.l).Error("Unknown certificate version found while attempting to migrate relay")
  260. continue
  261. }
  262. msg, err := req.Marshal()
  263. if err != nil {
  264. n.l.WithError(err).Error("failed to marshal Control message to migrate relay")
  265. } else {
  266. n.intf.SendMessageToHostInfo(header.Control, 0, newhostinfo, msg, make([]byte, 12), make([]byte, mtu))
  267. n.l.WithFields(logrus.Fields{
  268. "relayFrom": req.RelayFromAddr,
  269. "relayTo": req.RelayToAddr,
  270. "initiatorRelayIndex": req.InitiatorRelayIndex,
  271. "responderRelayIndex": req.ResponderRelayIndex,
  272. "vpnAddrs": newhostinfo.vpnAddrs}).
  273. Info("send CreateRelayRequest")
  274. }
  275. }
  276. }
  277. func (n *connectionManager) makeTrafficDecision(localIndex uint32, now time.Time) (trafficDecision, *HostInfo, *HostInfo) {
  278. n.hostMap.RLock()
  279. defer n.hostMap.RUnlock()
  280. hostinfo := n.hostMap.Indexes[localIndex]
  281. if hostinfo == nil {
  282. n.l.WithField("localIndex", localIndex).Debugf("Not found in hostmap")
  283. delete(n.pendingDeletion, localIndex)
  284. return doNothing, nil, nil
  285. }
  286. if n.isInvalidCertificate(now, hostinfo) {
  287. delete(n.pendingDeletion, hostinfo.localIndexId)
  288. return closeTunnel, hostinfo, nil
  289. }
  290. primary := n.hostMap.Hosts[hostinfo.vpnAddrs[0]]
  291. mainHostInfo := true
  292. if primary != nil && primary != hostinfo {
  293. mainHostInfo = false
  294. }
  295. // Check for traffic on this hostinfo
  296. inTraffic, outTraffic := n.getAndResetTrafficCheck(localIndex)
  297. // A hostinfo is determined alive if there is incoming traffic
  298. if inTraffic {
  299. decision := doNothing
  300. if n.l.Level >= logrus.DebugLevel {
  301. hostinfo.logger(n.l).
  302. WithField("tunnelCheck", m{"state": "alive", "method": "passive"}).
  303. Debug("Tunnel status")
  304. }
  305. delete(n.pendingDeletion, hostinfo.localIndexId)
  306. if mainHostInfo {
  307. decision = tryRehandshake
  308. } else {
  309. if n.shouldSwapPrimary(hostinfo, primary) {
  310. decision = swapPrimary
  311. } else {
  312. // migrate the relays to the primary, if in use.
  313. decision = migrateRelays
  314. }
  315. }
  316. n.trafficTimer.Add(hostinfo.localIndexId, n.checkInterval)
  317. if !outTraffic {
  318. // Send a punch packet to keep the NAT state alive
  319. n.sendPunch(hostinfo)
  320. }
  321. return decision, hostinfo, primary
  322. }
  323. if _, ok := n.pendingDeletion[hostinfo.localIndexId]; ok {
  324. // We have already sent a test packet and nothing was returned, this hostinfo is dead
  325. hostinfo.logger(n.l).
  326. WithField("tunnelCheck", m{"state": "dead", "method": "active"}).
  327. Info("Tunnel status")
  328. delete(n.pendingDeletion, hostinfo.localIndexId)
  329. return deleteTunnel, hostinfo, nil
  330. }
  331. decision := doNothing
  332. if hostinfo != nil && hostinfo.ConnectionState != nil && mainHostInfo {
  333. if !outTraffic {
  334. // If we aren't sending or receiving traffic then its an unused tunnel and we don't to test the tunnel.
  335. // Just maintain NAT state if configured to do so.
  336. n.sendPunch(hostinfo)
  337. n.trafficTimer.Add(hostinfo.localIndexId, n.checkInterval)
  338. return doNothing, nil, nil
  339. }
  340. if n.punchy.GetTargetEverything() {
  341. // This is similar to the old punchy behavior with a slight optimization.
  342. // We aren't receiving traffic but we are sending it, punch on all known
  343. // ips in case we need to re-prime NAT state
  344. n.sendPunch(hostinfo)
  345. }
  346. if n.l.Level >= logrus.DebugLevel {
  347. hostinfo.logger(n.l).
  348. WithField("tunnelCheck", m{"state": "testing", "method": "active"}).
  349. Debug("Tunnel status")
  350. }
  351. // Send a test packet to trigger an authenticated tunnel test, this should suss out any lingering tunnel issues
  352. decision = sendTestPacket
  353. } else {
  354. if n.l.Level >= logrus.DebugLevel {
  355. hostinfo.logger(n.l).Debugf("Hostinfo sadness")
  356. }
  357. }
  358. n.pendingDeletion[hostinfo.localIndexId] = struct{}{}
  359. n.trafficTimer.Add(hostinfo.localIndexId, n.pendingDeletionInterval)
  360. return decision, hostinfo, nil
  361. }
  362. func (n *connectionManager) shouldSwapPrimary(current, primary *HostInfo) bool {
  363. // The primary tunnel is the most recent handshake to complete locally and should work entirely fine.
  364. // If we are here then we have multiple tunnels for a host pair and neither side believes the same tunnel is primary.
  365. // Let's sort this out.
  366. // Only one side should swap because if both swap then we may never resolve to a single tunnel.
  367. // vpn addr is static across all tunnels for this host pair so lets
  368. // use that to determine if we should consider swapping.
  369. if current.vpnAddrs[0].Compare(n.intf.myVpnAddrs[0]) < 0 {
  370. // Their primary vpn addr is less than mine. Do not swap.
  371. return false
  372. }
  373. crt := n.intf.pki.getCertState().getCertificate(current.ConnectionState.myCert.Version())
  374. // If this tunnel is using the latest certificate then we should swap it to primary for a bit and see if things
  375. // settle down.
  376. return bytes.Equal(current.ConnectionState.myCert.Signature(), crt.Signature())
  377. }
  378. func (n *connectionManager) swapPrimary(current, primary *HostInfo) {
  379. n.hostMap.Lock()
  380. // Make sure the primary is still the same after the write lock. This avoids a race with a rehandshake.
  381. if n.hostMap.Hosts[current.vpnAddrs[0]] == primary {
  382. n.hostMap.unlockedMakePrimary(current)
  383. }
  384. n.hostMap.Unlock()
  385. }
  386. // isInvalidCertificate will check if we should destroy a tunnel if pki.disconnect_invalid is true and
  387. // the certificate is no longer valid. Block listed certificates will skip the pki.disconnect_invalid
  388. // check and return true.
  389. func (n *connectionManager) isInvalidCertificate(now time.Time, hostinfo *HostInfo) bool {
  390. remoteCert := hostinfo.GetCert()
  391. if remoteCert == nil {
  392. return false
  393. }
  394. caPool := n.intf.pki.GetCAPool()
  395. err := caPool.VerifyCachedCertificate(now, remoteCert)
  396. if err == nil {
  397. return false
  398. }
  399. if !n.intf.disconnectInvalid.Load() && err != cert.ErrBlockListed {
  400. // Block listed certificates should always be disconnected
  401. return false
  402. }
  403. hostinfo.logger(n.l).WithError(err).
  404. WithField("fingerprint", remoteCert.Fingerprint).
  405. Info("Remote certificate is no longer valid, tearing down the tunnel")
  406. return true
  407. }
  408. func (n *connectionManager) sendPunch(hostinfo *HostInfo) {
  409. if !n.punchy.GetPunch() {
  410. // Punching is disabled
  411. return
  412. }
  413. if n.punchy.GetTargetEverything() {
  414. hostinfo.remotes.ForEach(n.hostMap.GetPreferredRanges(), func(addr netip.AddrPort, preferred bool) {
  415. n.metricsTxPunchy.Inc(1)
  416. n.intf.outside.WriteTo([]byte{1}, addr)
  417. })
  418. } else if hostinfo.remote.IsValid() {
  419. n.metricsTxPunchy.Inc(1)
  420. n.intf.outside.WriteTo([]byte{1}, hostinfo.remote)
  421. }
  422. }
  423. func (n *connectionManager) tryRehandshake(hostinfo *HostInfo) {
  424. cs := n.intf.pki.getCertState()
  425. curCrt := hostinfo.ConnectionState.myCert
  426. myCrt := cs.getCertificate(curCrt.Version())
  427. if curCrt.Version() >= cs.defaultVersion && bytes.Equal(curCrt.Signature(), myCrt.Signature()) == true {
  428. // The current tunnel is using the latest certificate and version, no need to rehandshake.
  429. return
  430. }
  431. n.l.WithField("vpnAddrs", hostinfo.vpnAddrs).
  432. WithField("reason", "local certificate is not current").
  433. Info("Re-handshaking with remote")
  434. n.intf.handshakeManager.StartHandshake(hostinfo.vpnAddrs[0], nil)
  435. }