// handshake_manager.go 18 KB
  1. package nebula
  2. import (
  3. "bytes"
  4. "context"
  5. "crypto/rand"
  6. "encoding/binary"
  7. "errors"
  8. "net"
  9. "sync"
  10. "time"
  11. "github.com/rcrowley/go-metrics"
  12. "github.com/sirupsen/logrus"
  13. "github.com/slackhq/nebula/header"
  14. "github.com/slackhq/nebula/iputil"
  15. "github.com/slackhq/nebula/udp"
  16. )
  // Default values for the fields of HandshakeConfig; see defaultHandshakeConfig.
const (
	// DefaultHandshakeTryInterval is the default HandshakeConfig.tryInterval.
	DefaultHandshakeTryInterval = 100 * time.Millisecond

	// DefaultHandshakeRetries is the default HandshakeConfig.retries.
	DefaultHandshakeRetries = 10

	// DefaultHandshakeTriggerBuffer is the default HandshakeConfig.triggerBuffer.
	DefaultHandshakeTriggerBuffer = 64

	// DefaultUseRelays is the default HandshakeConfig.useRelays.
	DefaultUseRelays = true
)
  23. var (
  24. defaultHandshakeConfig = HandshakeConfig{
  25. tryInterval: DefaultHandshakeTryInterval,
  26. retries: DefaultHandshakeRetries,
  27. triggerBuffer: DefaultHandshakeTriggerBuffer,
  28. useRelays: DefaultUseRelays,
  29. }
  30. )
  31. type HandshakeConfig struct {
  32. tryInterval time.Duration
  33. retries int
  34. triggerBuffer int
  35. useRelays bool
  36. messageMetrics *MessageMetrics
  37. }
  38. type HandshakeManager struct {
  39. // Mutex for interacting with the vpnIps and indexes maps
  40. sync.RWMutex
  41. vpnIps map[iputil.VpnIp]*HostInfo
  42. indexes map[uint32]*HostInfo
  43. mainHostMap *HostMap
  44. lightHouse *LightHouse
  45. outside udp.Conn
  46. config HandshakeConfig
  47. OutboundHandshakeTimer *LockingTimerWheel[iputil.VpnIp]
  48. messageMetrics *MessageMetrics
  49. metricInitiated metrics.Counter
  50. metricTimedOut metrics.Counter
  51. l *logrus.Logger
  52. // can be used to trigger outbound handshake for the given vpnIp
  53. trigger chan iputil.VpnIp
  54. }
  55. func NewHandshakeManager(l *logrus.Logger, tunCidr *net.IPNet, preferredRanges []*net.IPNet, mainHostMap *HostMap, lightHouse *LightHouse, outside udp.Conn, config HandshakeConfig) *HandshakeManager {
  56. return &HandshakeManager{
  57. vpnIps: map[iputil.VpnIp]*HostInfo{},
  58. indexes: map[uint32]*HostInfo{},
  59. mainHostMap: mainHostMap,
  60. lightHouse: lightHouse,
  61. outside: outside,
  62. config: config,
  63. trigger: make(chan iputil.VpnIp, config.triggerBuffer),
  64. OutboundHandshakeTimer: NewLockingTimerWheel[iputil.VpnIp](config.tryInterval, hsTimeout(config.retries, config.tryInterval)),
  65. messageMetrics: config.messageMetrics,
  66. metricInitiated: metrics.GetOrRegisterCounter("handshake_manager.initiated", nil),
  67. metricTimedOut: metrics.GetOrRegisterCounter("handshake_manager.timed_out", nil),
  68. l: l,
  69. }
  70. }
  71. func (c *HandshakeManager) Run(ctx context.Context, f EncWriter) {
  72. clockSource := time.NewTicker(c.config.tryInterval)
  73. defer clockSource.Stop()
  74. for {
  75. select {
  76. case <-ctx.Done():
  77. return
  78. case vpnIP := <-c.trigger:
  79. c.handleOutbound(vpnIP, f, true)
  80. case now := <-clockSource.C:
  81. c.NextOutboundHandshakeTimerTick(now, f)
  82. }
  83. }
  84. }
  85. func (c *HandshakeManager) NextOutboundHandshakeTimerTick(now time.Time, f EncWriter) {
  86. c.OutboundHandshakeTimer.Advance(now)
  87. for {
  88. vpnIp, has := c.OutboundHandshakeTimer.Purge()
  89. if !has {
  90. break
  91. }
  92. c.handleOutbound(vpnIp, f, false)
  93. }
  94. }
  95. func (c *HandshakeManager) handleOutbound(vpnIp iputil.VpnIp, f EncWriter, lighthouseTriggered bool) {
  96. hostinfo := c.QueryVpnIp(vpnIp)
  97. if hostinfo == nil {
  98. return
  99. }
  100. hostinfo.Lock()
  101. defer hostinfo.Unlock()
  102. // We may have raced to completion but now that we have a lock we should ensure we have not yet completed.
  103. if hostinfo.HandshakeComplete {
  104. // Ensure we don't exist in the pending hostmap anymore since we have completed
  105. c.DeleteHostInfo(hostinfo)
  106. return
  107. }
  108. // Check if we have a handshake packet to transmit yet
  109. if !hostinfo.HandshakeReady {
  110. // There is currently a slight race in getOrHandshake due to ConnectionState not being part of the HostInfo directly
  111. // Our hostinfo here was added to the pending map and the wheel may have ticked to us before we created ConnectionState
  112. c.OutboundHandshakeTimer.Add(vpnIp, c.config.tryInterval*time.Duration(hostinfo.HandshakeCounter))
  113. return
  114. }
  115. // If we are out of time, clean up
  116. if hostinfo.HandshakeCounter >= c.config.retries {
  117. hostinfo.logger(c.l).WithField("udpAddrs", hostinfo.remotes.CopyAddrs(c.mainHostMap.preferredRanges)).
  118. WithField("initiatorIndex", hostinfo.localIndexId).
  119. WithField("remoteIndex", hostinfo.remoteIndexId).
  120. WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
  121. WithField("durationNs", time.Since(hostinfo.handshakeStart).Nanoseconds()).
  122. Info("Handshake timed out")
  123. c.metricTimedOut.Inc(1)
  124. c.DeleteHostInfo(hostinfo)
  125. return
  126. }
  127. // Get a remotes object if we don't already have one.
  128. // This is mainly to protect us as this should never be the case
  129. // NB ^ This comment doesn't jive. It's how the thing gets initialized.
  130. // It's the common path. Should it update every time, in case a future LH query/queries give us more info?
  131. if hostinfo.remotes == nil {
  132. hostinfo.remotes = c.lightHouse.QueryCache(vpnIp)
  133. }
  134. remotes := hostinfo.remotes.CopyAddrs(c.mainHostMap.preferredRanges)
  135. remotesHaveChanged := !udp.AddrSlice(remotes).Equal(hostinfo.HandshakeLastRemotes)
  136. // We only care about a lighthouse trigger if we have new remotes to send to.
  137. // This is a very specific optimization for a fast lighthouse reply.
  138. if lighthouseTriggered && !remotesHaveChanged {
  139. // If we didn't return here a lighthouse could cause us to aggressively send handshakes
  140. return
  141. }
  142. hostinfo.HandshakeLastRemotes = remotes
  143. // TODO: this will generate a load of queries for hosts with only 1 ip
  144. // (such as ones registered to the lighthouse with only a private IP)
  145. // So we only do it one time after attempting 5 handshakes already.
  146. if len(remotes) <= 1 && hostinfo.HandshakeCounter == 5 {
  147. // If we only have 1 remote it is highly likely our query raced with the other host registered within the lighthouse
  148. // Our vpnIp here has a tunnel with a lighthouse but has yet to send a host update packet there so we only know about
  149. // the learned public ip for them. Query again to short circuit the promotion counter
  150. c.lightHouse.QueryServer(vpnIp, f)
  151. }
  152. // Send the handshake to all known ips, stage 2 takes care of assigning the hostinfo.remote based on the first to reply
  153. var sentTo []*udp.Addr
  154. hostinfo.remotes.ForEach(c.mainHostMap.preferredRanges, func(addr *udp.Addr, _ bool) {
  155. c.messageMetrics.Tx(header.Handshake, header.MessageSubType(hostinfo.HandshakePacket[0][1]), 1)
  156. err := c.outside.WriteTo(hostinfo.HandshakePacket[0], addr)
  157. if err != nil {
  158. hostinfo.logger(c.l).WithField("udpAddr", addr).
  159. WithField("initiatorIndex", hostinfo.localIndexId).
  160. WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
  161. WithError(err).Error("Failed to send handshake message")
  162. } else {
  163. sentTo = append(sentTo, addr)
  164. }
  165. })
  166. // Don't be too noisy or confusing if we fail to send a handshake - if we don't get through we'll eventually log a timeout,
  167. // so only log when the list of remotes has changed
  168. if remotesHaveChanged {
  169. hostinfo.logger(c.l).WithField("udpAddrs", sentTo).
  170. WithField("initiatorIndex", hostinfo.localIndexId).
  171. WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
  172. Info("Handshake message sent")
  173. } else if c.l.IsLevelEnabled(logrus.DebugLevel) {
  174. hostinfo.logger(c.l).WithField("udpAddrs", sentTo).
  175. WithField("initiatorIndex", hostinfo.localIndexId).
  176. WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
  177. Debug("Handshake message sent")
  178. }
  179. if c.config.useRelays && len(hostinfo.remotes.relays) > 0 {
  180. hostinfo.logger(c.l).WithField("relays", hostinfo.remotes.relays).Info("Attempt to relay through hosts")
  181. // Send a RelayRequest to all known Relay IP's
  182. for _, relay := range hostinfo.remotes.relays {
  183. // Don't relay to myself, and don't relay through the host I'm trying to connect to
  184. if *relay == vpnIp || *relay == c.lightHouse.myVpnIp {
  185. continue
  186. }
  187. relayHostInfo := c.mainHostMap.QueryVpnIp(*relay)
  188. if relayHostInfo == nil || relayHostInfo.remote == nil {
  189. hostinfo.logger(c.l).WithField("relay", relay.String()).Info("Establish tunnel to relay target")
  190. f.Handshake(*relay)
  191. continue
  192. }
  193. // Check the relay HostInfo to see if we already established a relay through it
  194. if existingRelay, ok := relayHostInfo.relayState.QueryRelayForByIp(vpnIp); ok {
  195. switch existingRelay.State {
  196. case Established:
  197. hostinfo.logger(c.l).WithField("relay", relay.String()).Info("Send handshake via relay")
  198. f.SendVia(relayHostInfo, existingRelay, hostinfo.HandshakePacket[0], make([]byte, 12), make([]byte, mtu), false)
  199. case Requested:
  200. hostinfo.logger(c.l).WithField("relay", relay.String()).Info("Re-send CreateRelay request")
  201. // Re-send the CreateRelay request, in case the previous one was lost.
  202. m := NebulaControl{
  203. Type: NebulaControl_CreateRelayRequest,
  204. InitiatorRelayIndex: existingRelay.LocalIndex,
  205. RelayFromIp: uint32(c.lightHouse.myVpnIp),
  206. RelayToIp: uint32(vpnIp),
  207. }
  208. msg, err := m.Marshal()
  209. if err != nil {
  210. hostinfo.logger(c.l).
  211. WithError(err).
  212. Error("Failed to marshal Control message to create relay")
  213. } else {
  214. // This must send over the hostinfo, not over hm.Hosts[ip]
  215. f.SendMessageToHostInfo(header.Control, 0, relayHostInfo, msg, make([]byte, 12), make([]byte, mtu))
  216. c.l.WithFields(logrus.Fields{
  217. "relayFrom": c.lightHouse.myVpnIp,
  218. "relayTo": vpnIp,
  219. "initiatorRelayIndex": existingRelay.LocalIndex,
  220. "relay": *relay}).
  221. Info("send CreateRelayRequest")
  222. }
  223. default:
  224. hostinfo.logger(c.l).
  225. WithField("vpnIp", vpnIp).
  226. WithField("state", existingRelay.State).
  227. WithField("relay", relayHostInfo.vpnIp).
  228. Errorf("Relay unexpected state")
  229. }
  230. } else {
  231. // No relays exist or requested yet.
  232. if relayHostInfo.remote != nil {
  233. idx, err := AddRelay(c.l, relayHostInfo, c.mainHostMap, vpnIp, nil, TerminalType, Requested)
  234. if err != nil {
  235. hostinfo.logger(c.l).WithField("relay", relay.String()).WithError(err).Info("Failed to add relay to hostmap")
  236. }
  237. m := NebulaControl{
  238. Type: NebulaControl_CreateRelayRequest,
  239. InitiatorRelayIndex: idx,
  240. RelayFromIp: uint32(c.lightHouse.myVpnIp),
  241. RelayToIp: uint32(vpnIp),
  242. }
  243. msg, err := m.Marshal()
  244. if err != nil {
  245. hostinfo.logger(c.l).
  246. WithError(err).
  247. Error("Failed to marshal Control message to create relay")
  248. } else {
  249. f.SendMessageToHostInfo(header.Control, 0, relayHostInfo, msg, make([]byte, 12), make([]byte, mtu))
  250. c.l.WithFields(logrus.Fields{
  251. "relayFrom": c.lightHouse.myVpnIp,
  252. "relayTo": vpnIp,
  253. "initiatorRelayIndex": idx,
  254. "relay": *relay}).
  255. Info("send CreateRelayRequest")
  256. }
  257. }
  258. }
  259. }
  260. }
  261. // Increment the counter to increase our delay, linear backoff
  262. hostinfo.HandshakeCounter++
  263. // If a lighthouse triggered this attempt then we are still in the timer wheel and do not need to re-add
  264. if !lighthouseTriggered {
  265. c.OutboundHandshakeTimer.Add(vpnIp, c.config.tryInterval*time.Duration(hostinfo.HandshakeCounter))
  266. }
  267. }
  268. // AddVpnIp will try to handshake with the provided vpn ip and return the hostinfo for it.
  269. func (c *HandshakeManager) AddVpnIp(vpnIp iputil.VpnIp) *HostInfo {
  270. // A write lock is used to avoid having to recheck the map and trading a read lock for a write lock
  271. c.Lock()
  272. defer c.Unlock()
  273. if hostinfo, ok := c.vpnIps[vpnIp]; ok {
  274. // We are already tracking this vpn ip
  275. return hostinfo
  276. }
  277. hostinfo := &HostInfo{
  278. vpnIp: vpnIp,
  279. HandshakePacket: make(map[uint8][]byte, 0),
  280. relayState: RelayState{
  281. relays: map[iputil.VpnIp]struct{}{},
  282. relayForByIp: map[iputil.VpnIp]*Relay{},
  283. relayForByIdx: map[uint32]*Relay{},
  284. },
  285. }
  286. c.vpnIps[vpnIp] = hostinfo
  287. c.metricInitiated.Inc(1)
  288. c.OutboundHandshakeTimer.Add(vpnIp, c.config.tryInterval)
  289. return hostinfo
  290. }
  // Sentinel errors returned by HandshakeManager.CheckAndComplete.
var (
	// ErrExistingHostInfo reports that the main hostmap already holds a
	// hostinfo for the vpn ip that the new handshake does not supersede.
	ErrExistingHostInfo = errors.New("existing hostinfo")

	// ErrAlreadySeen reports that the same handshake packet was seen before.
	ErrAlreadySeen = errors.New("already seen")

	// ErrLocalIndexCollision reports that the local index is already in use
	// by another hostinfo.
	ErrLocalIndexCollision = errors.New("local index collision")
)
  296. // CheckAndComplete checks for any conflicts in the main and pending hostmap
  297. // before adding hostinfo to main. If err is nil, it was added. Otherwise err will be:
  298. //
  299. // ErrAlreadySeen if we already have an entry in the hostmap that has seen the
  300. // exact same handshake packet
  301. //
  302. // ErrExistingHostInfo if we already have an entry in the hostmap for this
  303. // VpnIp and the new handshake was older than the one we currently have
  304. //
  305. // ErrLocalIndexCollision if we already have an entry in the main or pending
  306. // hostmap for the hostinfo.localIndexId.
  307. func (c *HandshakeManager) CheckAndComplete(hostinfo *HostInfo, handshakePacket uint8, f *Interface) (*HostInfo, error) {
  308. c.Lock()
  309. defer c.Unlock()
  310. c.mainHostMap.Lock()
  311. defer c.mainHostMap.Unlock()
  312. // Check if we already have a tunnel with this vpn ip
  313. existingHostInfo, found := c.mainHostMap.Hosts[hostinfo.vpnIp]
  314. if found && existingHostInfo != nil {
  315. testHostInfo := existingHostInfo
  316. for testHostInfo != nil {
  317. // Is it just a delayed handshake packet?
  318. if bytes.Equal(hostinfo.HandshakePacket[handshakePacket], testHostInfo.HandshakePacket[handshakePacket]) {
  319. return testHostInfo, ErrAlreadySeen
  320. }
  321. testHostInfo = testHostInfo.next
  322. }
  323. // Is this a newer handshake?
  324. if existingHostInfo.lastHandshakeTime >= hostinfo.lastHandshakeTime && !existingHostInfo.ConnectionState.initiator {
  325. return existingHostInfo, ErrExistingHostInfo
  326. }
  327. existingHostInfo.logger(c.l).Info("Taking new handshake")
  328. }
  329. existingIndex, found := c.mainHostMap.Indexes[hostinfo.localIndexId]
  330. if found {
  331. // We have a collision, but for a different hostinfo
  332. return existingIndex, ErrLocalIndexCollision
  333. }
  334. existingIndex, found = c.indexes[hostinfo.localIndexId]
  335. if found && existingIndex != hostinfo {
  336. // We have a collision, but for a different hostinfo
  337. return existingIndex, ErrLocalIndexCollision
  338. }
  339. existingRemoteIndex, found := c.mainHostMap.RemoteIndexes[hostinfo.remoteIndexId]
  340. if found && existingRemoteIndex != nil && existingRemoteIndex.vpnIp != hostinfo.vpnIp {
  341. // We have a collision, but this can happen since we can't control
  342. // the remote ID. Just log about the situation as a note.
  343. hostinfo.logger(c.l).
  344. WithField("remoteIndex", hostinfo.remoteIndexId).WithField("collision", existingRemoteIndex.vpnIp).
  345. Info("New host shadows existing host remoteIndex")
  346. }
  347. c.mainHostMap.unlockedAddHostInfo(hostinfo, f)
  348. return existingHostInfo, nil
  349. }
  350. // Complete is a simpler version of CheckAndComplete when we already know we
  351. // won't have a localIndexId collision because we already have an entry in the
  352. // pendingHostMap. An existing hostinfo is returned if there was one.
  353. func (c *HandshakeManager) Complete(hostinfo *HostInfo, f *Interface) {
  354. c.Lock()
  355. defer c.Unlock()
  356. c.mainHostMap.Lock()
  357. defer c.mainHostMap.Unlock()
  358. existingRemoteIndex, found := c.mainHostMap.RemoteIndexes[hostinfo.remoteIndexId]
  359. if found && existingRemoteIndex != nil {
  360. // We have a collision, but this can happen since we can't control
  361. // the remote ID. Just log about the situation as a note.
  362. hostinfo.logger(c.l).
  363. WithField("remoteIndex", hostinfo.remoteIndexId).WithField("collision", existingRemoteIndex.vpnIp).
  364. Info("New host shadows existing host remoteIndex")
  365. }
  366. // We need to remove from the pending hostmap first to avoid undoing work when after to the main hostmap.
  367. c.unlockedDeleteHostInfo(hostinfo)
  368. c.mainHostMap.unlockedAddHostInfo(hostinfo, f)
  369. }
  370. // AddIndexHostInfo generates a unique localIndexId for this HostInfo
  371. // and adds it to the pendingHostMap. Will error if we are unable to generate
  372. // a unique localIndexId
  373. func (c *HandshakeManager) AddIndexHostInfo(h *HostInfo) error {
  374. c.Lock()
  375. defer c.Unlock()
  376. c.mainHostMap.RLock()
  377. defer c.mainHostMap.RUnlock()
  378. for i := 0; i < 32; i++ {
  379. index, err := generateIndex(c.l)
  380. if err != nil {
  381. return err
  382. }
  383. _, inPending := c.indexes[index]
  384. _, inMain := c.mainHostMap.Indexes[index]
  385. if !inMain && !inPending {
  386. h.localIndexId = index
  387. c.indexes[index] = h
  388. return nil
  389. }
  390. }
  391. return errors.New("failed to generate unique localIndexId")
  392. }
  393. func (c *HandshakeManager) DeleteHostInfo(hostinfo *HostInfo) {
  394. c.Lock()
  395. defer c.Unlock()
  396. c.unlockedDeleteHostInfo(hostinfo)
  397. }
  398. func (c *HandshakeManager) unlockedDeleteHostInfo(hostinfo *HostInfo) {
  399. delete(c.vpnIps, hostinfo.vpnIp)
  400. if len(c.vpnIps) == 0 {
  401. c.vpnIps = map[iputil.VpnIp]*HostInfo{}
  402. }
  403. delete(c.indexes, hostinfo.localIndexId)
  404. if len(c.vpnIps) == 0 {
  405. c.indexes = map[uint32]*HostInfo{}
  406. }
  407. if c.l.Level >= logrus.DebugLevel {
  408. c.l.WithField("hostMap", m{"mapTotalSize": len(c.vpnIps),
  409. "vpnIp": hostinfo.vpnIp, "indexNumber": hostinfo.localIndexId, "remoteIndexNumber": hostinfo.remoteIndexId}).
  410. Debug("Pending hostmap hostInfo deleted")
  411. }
  412. }
  413. func (c *HandshakeManager) QueryVpnIp(vpnIp iputil.VpnIp) *HostInfo {
  414. c.RLock()
  415. defer c.RUnlock()
  416. return c.vpnIps[vpnIp]
  417. }
  418. func (c *HandshakeManager) QueryIndex(index uint32) *HostInfo {
  419. c.RLock()
  420. defer c.RUnlock()
  421. return c.indexes[index]
  422. }
  423. func (c *HandshakeManager) GetPreferredRanges() []*net.IPNet {
  424. return c.mainHostMap.preferredRanges
  425. }
  426. func (c *HandshakeManager) ForEachVpnIp(f controlEach) {
  427. c.RLock()
  428. defer c.RUnlock()
  429. for _, v := range c.vpnIps {
  430. f(v)
  431. }
  432. }
  433. func (c *HandshakeManager) ForEachIndex(f controlEach) {
  434. c.RLock()
  435. defer c.RUnlock()
  436. for _, v := range c.indexes {
  437. f(v)
  438. }
  439. }
  440. func (c *HandshakeManager) EmitStats() {
  441. c.RLock()
  442. hostLen := len(c.vpnIps)
  443. indexLen := len(c.indexes)
  444. c.RUnlock()
  445. metrics.GetOrRegisterGauge("hostmap.pending.hosts", nil).Update(int64(hostLen))
  446. metrics.GetOrRegisterGauge("hostmap.pending.indexes", nil).Update(int64(indexLen))
  447. c.mainHostMap.EmitStats()
  448. }
  449. // Utility functions below
  450. func generateIndex(l *logrus.Logger) (uint32, error) {
  451. b := make([]byte, 4)
  452. // Let zero mean we don't know the ID, so don't generate zero
  453. var index uint32
  454. for index == 0 {
  455. _, err := rand.Read(b)
  456. if err != nil {
  457. l.Errorln(err)
  458. return 0, err
  459. }
  460. index = binary.BigEndian.Uint32(b)
  461. }
  462. if l.Level >= logrus.DebugLevel {
  463. l.WithField("index", index).
  464. Debug("Generated index")
  465. }
  466. return index, nil
  467. }
  // hsTimeout returns the total time covered by tries handshake attempts when
// the wait before attempt k is k*interval (linear backoff): the arithmetic
// series interval + 2*interval + ... + tries*interval.
//
// tries*(tries+1) is always even, so dividing it by two before scaling by
// interval is exact for odd and even tries alike. The arithmetic is done in
// time.Duration (int64) so it does not depend on the platform's int size.
func hsTimeout(tries int, interval time.Duration) time.Duration {
	n := time.Duration(tries)
	return n * (n + 1) / 2 * interval
}