handshake_manager.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475
  1. package nebula
  2. import (
  3. "bytes"
  4. "context"
  5. "crypto/rand"
  6. "encoding/binary"
  7. "errors"
  8. "net"
  9. "time"
  10. "github.com/rcrowley/go-metrics"
  11. "github.com/sirupsen/logrus"
  12. "github.com/slackhq/nebula/header"
  13. "github.com/slackhq/nebula/iputil"
  14. "github.com/slackhq/nebula/udp"
  15. )
  16. const (
  17. DefaultHandshakeTryInterval = time.Millisecond * 100
  18. DefaultHandshakeRetries = 10
  19. DefaultHandshakeTriggerBuffer = 64
  20. DefaultUseRelays = true
  21. )
  22. var (
  23. defaultHandshakeConfig = HandshakeConfig{
  24. tryInterval: DefaultHandshakeTryInterval,
  25. retries: DefaultHandshakeRetries,
  26. triggerBuffer: DefaultHandshakeTriggerBuffer,
  27. useRelays: DefaultUseRelays,
  28. }
  29. )
  30. type HandshakeConfig struct {
  31. tryInterval time.Duration
  32. retries int
  33. triggerBuffer int
  34. useRelays bool
  35. messageMetrics *MessageMetrics
  36. }
  37. type HandshakeManager struct {
  38. pendingHostMap *HostMap
  39. mainHostMap *HostMap
  40. lightHouse *LightHouse
  41. outside *udp.Conn
  42. config HandshakeConfig
  43. OutboundHandshakeTimer *LockingTimerWheel[iputil.VpnIp]
  44. messageMetrics *MessageMetrics
  45. metricInitiated metrics.Counter
  46. metricTimedOut metrics.Counter
  47. l *logrus.Logger
  48. // can be used to trigger outbound handshake for the given vpnIp
  49. trigger chan iputil.VpnIp
  50. }
  51. func NewHandshakeManager(l *logrus.Logger, tunCidr *net.IPNet, preferredRanges []*net.IPNet, mainHostMap *HostMap, lightHouse *LightHouse, outside *udp.Conn, config HandshakeConfig) *HandshakeManager {
  52. return &HandshakeManager{
  53. pendingHostMap: NewHostMap(l, "pending", tunCidr, preferredRanges),
  54. mainHostMap: mainHostMap,
  55. lightHouse: lightHouse,
  56. outside: outside,
  57. config: config,
  58. trigger: make(chan iputil.VpnIp, config.triggerBuffer),
  59. OutboundHandshakeTimer: NewLockingTimerWheel[iputil.VpnIp](config.tryInterval, hsTimeout(config.retries, config.tryInterval)),
  60. messageMetrics: config.messageMetrics,
  61. metricInitiated: metrics.GetOrRegisterCounter("handshake_manager.initiated", nil),
  62. metricTimedOut: metrics.GetOrRegisterCounter("handshake_manager.timed_out", nil),
  63. l: l,
  64. }
  65. }
  66. func (c *HandshakeManager) Run(ctx context.Context, f udp.EncWriter) {
  67. clockSource := time.NewTicker(c.config.tryInterval)
  68. defer clockSource.Stop()
  69. for {
  70. select {
  71. case <-ctx.Done():
  72. return
  73. case vpnIP := <-c.trigger:
  74. c.handleOutbound(vpnIP, f, true)
  75. case now := <-clockSource.C:
  76. c.NextOutboundHandshakeTimerTick(now, f)
  77. }
  78. }
  79. }
  80. func (c *HandshakeManager) NextOutboundHandshakeTimerTick(now time.Time, f udp.EncWriter) {
  81. c.OutboundHandshakeTimer.Advance(now)
  82. for {
  83. vpnIp, has := c.OutboundHandshakeTimer.Purge()
  84. if !has {
  85. break
  86. }
  87. c.handleOutbound(vpnIp, f, false)
  88. }
  89. }
  90. func (c *HandshakeManager) handleOutbound(vpnIp iputil.VpnIp, f udp.EncWriter, lighthouseTriggered bool) {
  91. hostinfo, err := c.pendingHostMap.QueryVpnIp(vpnIp)
  92. if err != nil {
  93. return
  94. }
  95. hostinfo.Lock()
  96. defer hostinfo.Unlock()
  97. // We may have raced to completion but now that we have a lock we should ensure we have not yet completed.
  98. if hostinfo.HandshakeComplete {
  99. // Ensure we don't exist in the pending hostmap anymore since we have completed
  100. c.pendingHostMap.DeleteHostInfo(hostinfo)
  101. return
  102. }
  103. // Check if we have a handshake packet to transmit yet
  104. if !hostinfo.HandshakeReady {
  105. // There is currently a slight race in getOrHandshake due to ConnectionState not being part of the HostInfo directly
  106. // Our hostinfo here was added to the pending map and the wheel may have ticked to us before we created ConnectionState
  107. c.OutboundHandshakeTimer.Add(vpnIp, c.config.tryInterval*time.Duration(hostinfo.HandshakeCounter))
  108. return
  109. }
  110. // If we are out of time, clean up
  111. if hostinfo.HandshakeCounter >= c.config.retries {
  112. hostinfo.logger(c.l).WithField("udpAddrs", hostinfo.remotes.CopyAddrs(c.pendingHostMap.preferredRanges)).
  113. WithField("initiatorIndex", hostinfo.localIndexId).
  114. WithField("remoteIndex", hostinfo.remoteIndexId).
  115. WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
  116. WithField("durationNs", time.Since(hostinfo.handshakeStart).Nanoseconds()).
  117. Info("Handshake timed out")
  118. c.metricTimedOut.Inc(1)
  119. c.pendingHostMap.DeleteHostInfo(hostinfo)
  120. return
  121. }
  122. // We only care about a lighthouse trigger before the first handshake transmit attempt. This is a very specific
  123. // optimization for a fast lighthouse reply
  124. //TODO: it would feel better to do this once, anytime, as our delay increases over time
  125. if lighthouseTriggered && hostinfo.HandshakeCounter > 0 {
  126. // If we didn't return here a lighthouse could cause us to aggressively send handshakes
  127. return
  128. }
  129. // Get a remotes object if we don't already have one.
  130. // This is mainly to protect us as this should never be the case
  131. // NB ^ This comment doesn't jive. It's how the thing gets initialized.
  132. // It's the common path. Should it update every time, in case a future LH query/queries give us more info?
  133. if hostinfo.remotes == nil {
  134. hostinfo.remotes = c.lightHouse.QueryCache(vpnIp)
  135. }
  136. //TODO: this will generate a load of queries for hosts with only 1 ip (i'm not using a lighthouse, static mapped)
  137. if hostinfo.remotes.Len(c.pendingHostMap.preferredRanges) <= 1 {
  138. // If we only have 1 remote it is highly likely our query raced with the other host registered within the lighthouse
  139. // Our vpnIp here has a tunnel with a lighthouse but has yet to send a host update packet there so we only know about
  140. // the learned public ip for them. Query again to short circuit the promotion counter
  141. c.lightHouse.QueryServer(vpnIp, f)
  142. }
  143. // Send a the handshake to all known ips, stage 2 takes care of assigning the hostinfo.remote based on the first to reply
  144. var sentTo []*udp.Addr
  145. hostinfo.remotes.ForEach(c.pendingHostMap.preferredRanges, func(addr *udp.Addr, _ bool) {
  146. c.messageMetrics.Tx(header.Handshake, header.MessageSubType(hostinfo.HandshakePacket[0][1]), 1)
  147. err = c.outside.WriteTo(hostinfo.HandshakePacket[0], addr)
  148. if err != nil {
  149. hostinfo.logger(c.l).WithField("udpAddr", addr).
  150. WithField("initiatorIndex", hostinfo.localIndexId).
  151. WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
  152. WithError(err).Error("Failed to send handshake message")
  153. } else {
  154. sentTo = append(sentTo, addr)
  155. }
  156. })
  157. // Don't be too noisy or confusing if we fail to send a handshake - if we don't get through we'll eventually log a timeout
  158. if len(sentTo) > 0 {
  159. hostinfo.logger(c.l).WithField("udpAddrs", sentTo).
  160. WithField("initiatorIndex", hostinfo.localIndexId).
  161. WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
  162. Info("Handshake message sent")
  163. }
  164. if c.config.useRelays && len(hostinfo.remotes.relays) > 0 {
  165. hostinfo.logger(c.l).WithField("relayIps", hostinfo.remotes.relays).Info("Attempt to relay through hosts")
  166. // Send a RelayRequest to all known Relay IP's
  167. for _, relay := range hostinfo.remotes.relays {
  168. // Don't relay to myself, and don't relay through the host I'm trying to connect to
  169. if *relay == vpnIp || *relay == c.lightHouse.myVpnIp {
  170. continue
  171. }
  172. relayHostInfo, err := c.mainHostMap.QueryVpnIp(*relay)
  173. if err != nil || relayHostInfo.remote == nil {
  174. hostinfo.logger(c.l).WithError(err).WithField("relay", relay.String()).Info("Establish tunnel to relay target.")
  175. f.Handshake(*relay)
  176. continue
  177. }
  178. // Check the relay HostInfo to see if we already established a relay through it
  179. if existingRelay, ok := relayHostInfo.relayState.QueryRelayForByIp(vpnIp); ok {
  180. switch existingRelay.State {
  181. case Established:
  182. hostinfo.logger(c.l).WithField("relay", relay.String()).Info("Send handshake via relay")
  183. f.SendVia(relayHostInfo, existingRelay, hostinfo.HandshakePacket[0], make([]byte, 12), make([]byte, mtu), false)
  184. case Requested:
  185. hostinfo.logger(c.l).WithField("relay", relay.String()).Info("Re-send CreateRelay request")
  186. // Re-send the CreateRelay request, in case the previous one was lost.
  187. m := NebulaControl{
  188. Type: NebulaControl_CreateRelayRequest,
  189. InitiatorRelayIndex: existingRelay.LocalIndex,
  190. RelayFromIp: uint32(c.lightHouse.myVpnIp),
  191. RelayToIp: uint32(vpnIp),
  192. }
  193. msg, err := m.Marshal()
  194. if err != nil {
  195. hostinfo.logger(c.l).
  196. WithError(err).
  197. Error("Failed to marshal Control message to create relay")
  198. } else {
  199. f.SendMessageToVpnIp(header.Control, 0, *relay, msg, make([]byte, 12), make([]byte, mtu))
  200. }
  201. default:
  202. hostinfo.logger(c.l).
  203. WithField("vpnIp", vpnIp).
  204. WithField("state", existingRelay.State).
  205. WithField("relayVpnIp", relayHostInfo.vpnIp).
  206. Errorf("Relay unexpected state")
  207. }
  208. } else {
  209. // No relays exist or requested yet.
  210. if relayHostInfo.remote != nil {
  211. idx, err := AddRelay(c.l, relayHostInfo, c.mainHostMap, vpnIp, nil, TerminalType, Requested)
  212. if err != nil {
  213. hostinfo.logger(c.l).WithField("relay", relay.String()).WithError(err).Info("Failed to add relay to hostmap")
  214. }
  215. m := NebulaControl{
  216. Type: NebulaControl_CreateRelayRequest,
  217. InitiatorRelayIndex: idx,
  218. RelayFromIp: uint32(c.lightHouse.myVpnIp),
  219. RelayToIp: uint32(vpnIp),
  220. }
  221. msg, err := m.Marshal()
  222. if err != nil {
  223. hostinfo.logger(c.l).
  224. WithError(err).
  225. Error("Failed to marshal Control message to create relay")
  226. } else {
  227. f.SendMessageToVpnIp(header.Control, 0, *relay, msg, make([]byte, 12), make([]byte, mtu))
  228. }
  229. }
  230. }
  231. }
  232. }
  233. // Increment the counter to increase our delay, linear backoff
  234. hostinfo.HandshakeCounter++
  235. // If a lighthouse triggered this attempt then we are still in the timer wheel and do not need to re-add
  236. if !lighthouseTriggered {
  237. //TODO: feel like we dupe handshake real fast in a tight loop, why?
  238. c.OutboundHandshakeTimer.Add(vpnIp, c.config.tryInterval*time.Duration(hostinfo.HandshakeCounter))
  239. }
  240. }
  241. func (c *HandshakeManager) AddVpnIp(vpnIp iputil.VpnIp, init func(*HostInfo)) *HostInfo {
  242. hostinfo, created := c.pendingHostMap.AddVpnIp(vpnIp, init)
  243. if created {
  244. c.OutboundHandshakeTimer.Add(vpnIp, c.config.tryInterval)
  245. c.metricInitiated.Inc(1)
  246. }
  247. return hostinfo
  248. }
  249. var (
  250. ErrExistingHostInfo = errors.New("existing hostinfo")
  251. ErrAlreadySeen = errors.New("already seen")
  252. ErrLocalIndexCollision = errors.New("local index collision")
  253. ErrExistingHandshake = errors.New("existing handshake")
  254. )
  255. // CheckAndComplete checks for any conflicts in the main and pending hostmap
  256. // before adding hostinfo to main. If err is nil, it was added. Otherwise err will be:
  257. //
  258. // ErrAlreadySeen if we already have an entry in the hostmap that has seen the
  259. // exact same handshake packet
  260. //
  261. // ErrExistingHostInfo if we already have an entry in the hostmap for this
  262. // VpnIp and the new handshake was older than the one we currently have
  263. //
  264. // ErrLocalIndexCollision if we already have an entry in the main or pending
  265. // hostmap for the hostinfo.localIndexId.
  266. func (c *HandshakeManager) CheckAndComplete(hostinfo *HostInfo, handshakePacket uint8, overwrite bool, f *Interface) (*HostInfo, error) {
  267. c.pendingHostMap.Lock()
  268. defer c.pendingHostMap.Unlock()
  269. c.mainHostMap.Lock()
  270. defer c.mainHostMap.Unlock()
  271. // Check if we already have a tunnel with this vpn ip
  272. existingHostInfo, found := c.mainHostMap.Hosts[hostinfo.vpnIp]
  273. if found && existingHostInfo != nil {
  274. // Is it just a delayed handshake packet?
  275. if bytes.Equal(hostinfo.HandshakePacket[handshakePacket], existingHostInfo.HandshakePacket[handshakePacket]) {
  276. return existingHostInfo, ErrAlreadySeen
  277. }
  278. // Is this a newer handshake?
  279. if existingHostInfo.lastHandshakeTime >= hostinfo.lastHandshakeTime {
  280. return existingHostInfo, ErrExistingHostInfo
  281. }
  282. existingHostInfo.logger(c.l).Info("Taking new handshake")
  283. }
  284. existingIndex, found := c.mainHostMap.Indexes[hostinfo.localIndexId]
  285. if found {
  286. // We have a collision, but for a different hostinfo
  287. return existingIndex, ErrLocalIndexCollision
  288. }
  289. existingIndex, found = c.pendingHostMap.Indexes[hostinfo.localIndexId]
  290. if found && existingIndex != hostinfo {
  291. // We have a collision, but for a different hostinfo
  292. return existingIndex, ErrLocalIndexCollision
  293. }
  294. existingRemoteIndex, found := c.mainHostMap.RemoteIndexes[hostinfo.remoteIndexId]
  295. if found && existingRemoteIndex != nil && existingRemoteIndex.vpnIp != hostinfo.vpnIp {
  296. // We have a collision, but this can happen since we can't control
  297. // the remote ID. Just log about the situation as a note.
  298. hostinfo.logger(c.l).
  299. WithField("remoteIndex", hostinfo.remoteIndexId).WithField("collision", existingRemoteIndex.vpnIp).
  300. Info("New host shadows existing host remoteIndex")
  301. }
  302. // Check if we are also handshaking with this vpn ip
  303. pendingHostInfo, found := c.pendingHostMap.Hosts[hostinfo.vpnIp]
  304. if found && pendingHostInfo != nil {
  305. if !overwrite {
  306. // We won, let our pending handshake win
  307. return pendingHostInfo, ErrExistingHandshake
  308. }
  309. // We lost, take this handshake and move any cached packets over so they get sent
  310. pendingHostInfo.ConnectionState.queueLock.Lock()
  311. hostinfo.packetStore = append(hostinfo.packetStore, pendingHostInfo.packetStore...)
  312. c.pendingHostMap.unlockedDeleteHostInfo(pendingHostInfo)
  313. pendingHostInfo.ConnectionState.queueLock.Unlock()
  314. pendingHostInfo.logger(c.l).Info("Handshake race lost, replacing pending handshake with completed tunnel")
  315. }
  316. if existingHostInfo != nil {
  317. // We are going to overwrite this entry, so remove the old references
  318. delete(c.mainHostMap.Hosts, existingHostInfo.vpnIp)
  319. delete(c.mainHostMap.Indexes, existingHostInfo.localIndexId)
  320. delete(c.mainHostMap.RemoteIndexes, existingHostInfo.remoteIndexId)
  321. for _, relayIdx := range existingHostInfo.relayState.CopyRelayForIdxs() {
  322. delete(c.mainHostMap.Relays, relayIdx)
  323. }
  324. }
  325. c.mainHostMap.addHostInfo(hostinfo, f)
  326. return existingHostInfo, nil
  327. }
  328. // Complete is a simpler version of CheckAndComplete when we already know we
  329. // won't have a localIndexId collision because we already have an entry in the
  330. // pendingHostMap
  331. func (c *HandshakeManager) Complete(hostinfo *HostInfo, f *Interface) {
  332. c.pendingHostMap.Lock()
  333. defer c.pendingHostMap.Unlock()
  334. c.mainHostMap.Lock()
  335. defer c.mainHostMap.Unlock()
  336. existingHostInfo, found := c.mainHostMap.Hosts[hostinfo.vpnIp]
  337. if found && existingHostInfo != nil {
  338. // We are going to overwrite this entry, so remove the old references
  339. delete(c.mainHostMap.Hosts, existingHostInfo.vpnIp)
  340. delete(c.mainHostMap.Indexes, existingHostInfo.localIndexId)
  341. delete(c.mainHostMap.RemoteIndexes, existingHostInfo.remoteIndexId)
  342. for _, relayIdx := range existingHostInfo.relayState.CopyRelayForIdxs() {
  343. delete(c.mainHostMap.Relays, relayIdx)
  344. }
  345. }
  346. existingRemoteIndex, found := c.mainHostMap.RemoteIndexes[hostinfo.remoteIndexId]
  347. if found && existingRemoteIndex != nil {
  348. // We have a collision, but this can happen since we can't control
  349. // the remote ID. Just log about the situation as a note.
  350. hostinfo.logger(c.l).
  351. WithField("remoteIndex", hostinfo.remoteIndexId).WithField("collision", existingRemoteIndex.vpnIp).
  352. Info("New host shadows existing host remoteIndex")
  353. }
  354. c.mainHostMap.addHostInfo(hostinfo, f)
  355. c.pendingHostMap.unlockedDeleteHostInfo(hostinfo)
  356. }
  357. // AddIndexHostInfo generates a unique localIndexId for this HostInfo
  358. // and adds it to the pendingHostMap. Will error if we are unable to generate
  359. // a unique localIndexId
  360. func (c *HandshakeManager) AddIndexHostInfo(h *HostInfo) error {
  361. c.pendingHostMap.Lock()
  362. defer c.pendingHostMap.Unlock()
  363. c.mainHostMap.RLock()
  364. defer c.mainHostMap.RUnlock()
  365. for i := 0; i < 32; i++ {
  366. index, err := generateIndex(c.l)
  367. if err != nil {
  368. return err
  369. }
  370. _, inPending := c.pendingHostMap.Indexes[index]
  371. _, inMain := c.mainHostMap.Indexes[index]
  372. if !inMain && !inPending {
  373. h.localIndexId = index
  374. c.pendingHostMap.Indexes[index] = h
  375. return nil
  376. }
  377. }
  378. return errors.New("failed to generate unique localIndexId")
  379. }
  380. func (c *HandshakeManager) addRemoteIndexHostInfo(index uint32, h *HostInfo) {
  381. c.pendingHostMap.addRemoteIndexHostInfo(index, h)
  382. }
  383. func (c *HandshakeManager) DeleteHostInfo(hostinfo *HostInfo) {
  384. //l.Debugln("Deleting pending hostinfo :", hostinfo)
  385. c.pendingHostMap.DeleteHostInfo(hostinfo)
  386. }
  387. func (c *HandshakeManager) QueryIndex(index uint32) (*HostInfo, error) {
  388. return c.pendingHostMap.QueryIndex(index)
  389. }
  390. func (c *HandshakeManager) EmitStats() {
  391. c.pendingHostMap.EmitStats("pending")
  392. c.mainHostMap.EmitStats("main")
  393. }
  394. // Utility functions below
  395. func generateIndex(l *logrus.Logger) (uint32, error) {
  396. b := make([]byte, 4)
  397. // Let zero mean we don't know the ID, so don't generate zero
  398. var index uint32
  399. for index == 0 {
  400. _, err := rand.Read(b)
  401. if err != nil {
  402. l.Errorln(err)
  403. return 0, err
  404. }
  405. index = binary.BigEndian.Uint32(b)
  406. }
  407. if l.Level >= logrus.DebugLevel {
  408. l.WithField("index", index).
  409. Debug("Generated index")
  410. }
  411. return index, nil
  412. }
  413. func hsTimeout(tries int, interval time.Duration) time.Duration {
  414. return time.Duration(tries / 2 * ((2 * int(interval)) + (tries-1)*int(interval)))
  415. }