handshake_manager.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  1. package nebula
  2. import (
  3. "bytes"
  4. "context"
  5. "crypto/rand"
  6. "encoding/binary"
  7. "errors"
  8. "net"
  9. "time"
  10. "github.com/rcrowley/go-metrics"
  11. "github.com/sirupsen/logrus"
  12. "github.com/slackhq/nebula/header"
  13. "github.com/slackhq/nebula/iputil"
  14. "github.com/slackhq/nebula/udp"
  15. )
  16. const (
  17. DefaultHandshakeTryInterval = time.Millisecond * 100
  18. DefaultHandshakeRetries = 10
  19. DefaultHandshakeTriggerBuffer = 64
  20. )
  21. var (
  22. defaultHandshakeConfig = HandshakeConfig{
  23. tryInterval: DefaultHandshakeTryInterval,
  24. retries: DefaultHandshakeRetries,
  25. triggerBuffer: DefaultHandshakeTriggerBuffer,
  26. }
  27. )
  28. type HandshakeConfig struct {
  29. tryInterval time.Duration
  30. retries int
  31. triggerBuffer int
  32. messageMetrics *MessageMetrics
  33. }
  34. type HandshakeManager struct {
  35. pendingHostMap *HostMap
  36. mainHostMap *HostMap
  37. lightHouse *LightHouse
  38. outside *udp.Conn
  39. config HandshakeConfig
  40. OutboundHandshakeTimer *SystemTimerWheel
  41. messageMetrics *MessageMetrics
  42. metricInitiated metrics.Counter
  43. metricTimedOut metrics.Counter
  44. l *logrus.Logger
  45. // can be used to trigger outbound handshake for the given vpnIp
  46. trigger chan iputil.VpnIp
  47. }
  48. func NewHandshakeManager(l *logrus.Logger, tunCidr *net.IPNet, preferredRanges []*net.IPNet, mainHostMap *HostMap, lightHouse *LightHouse, outside *udp.Conn, config HandshakeConfig) *HandshakeManager {
  49. return &HandshakeManager{
  50. pendingHostMap: NewHostMap(l, "pending", tunCidr, preferredRanges),
  51. mainHostMap: mainHostMap,
  52. lightHouse: lightHouse,
  53. outside: outside,
  54. config: config,
  55. trigger: make(chan iputil.VpnIp, config.triggerBuffer),
  56. OutboundHandshakeTimer: NewSystemTimerWheel(config.tryInterval, hsTimeout(config.retries, config.tryInterval)),
  57. messageMetrics: config.messageMetrics,
  58. metricInitiated: metrics.GetOrRegisterCounter("handshake_manager.initiated", nil),
  59. metricTimedOut: metrics.GetOrRegisterCounter("handshake_manager.timed_out", nil),
  60. l: l,
  61. }
  62. }
  63. func (c *HandshakeManager) Run(ctx context.Context, f udp.EncWriter) {
  64. clockSource := time.NewTicker(c.config.tryInterval)
  65. defer clockSource.Stop()
  66. for {
  67. select {
  68. case <-ctx.Done():
  69. return
  70. case vpnIP := <-c.trigger:
  71. c.l.WithField("vpnIp", vpnIP).Debug("HandshakeManager: triggered")
  72. c.handleOutbound(vpnIP, f, true)
  73. case now := <-clockSource.C:
  74. c.NextOutboundHandshakeTimerTick(now, f)
  75. }
  76. }
  77. }
  78. func (c *HandshakeManager) NextOutboundHandshakeTimerTick(now time.Time, f udp.EncWriter) {
  79. c.OutboundHandshakeTimer.advance(now)
  80. for {
  81. ep := c.OutboundHandshakeTimer.Purge()
  82. if ep == nil {
  83. break
  84. }
  85. vpnIp := ep.(iputil.VpnIp)
  86. c.handleOutbound(vpnIp, f, false)
  87. }
  88. }
  89. func (c *HandshakeManager) handleOutbound(vpnIp iputil.VpnIp, f udp.EncWriter, lighthouseTriggered bool) {
  90. hostinfo, err := c.pendingHostMap.QueryVpnIp(vpnIp)
  91. if err != nil {
  92. return
  93. }
  94. hostinfo.Lock()
  95. defer hostinfo.Unlock()
  96. // We may have raced to completion but now that we have a lock we should ensure we have not yet completed.
  97. if hostinfo.HandshakeComplete {
  98. // Ensure we don't exist in the pending hostmap anymore since we have completed
  99. c.pendingHostMap.DeleteHostInfo(hostinfo)
  100. return
  101. }
  102. // Check if we have a handshake packet to transmit yet
  103. if !hostinfo.HandshakeReady {
  104. // There is currently a slight race in getOrHandshake due to ConnectionState not being part of the HostInfo directly
  105. // Our hostinfo here was added to the pending map and the wheel may have ticked to us before we created ConnectionState
  106. c.OutboundHandshakeTimer.Add(vpnIp, c.config.tryInterval*time.Duration(hostinfo.HandshakeCounter))
  107. return
  108. }
  109. // If we are out of time, clean up
  110. if hostinfo.HandshakeCounter >= c.config.retries {
  111. hostinfo.logger(c.l).WithField("udpAddrs", hostinfo.remotes.CopyAddrs(c.pendingHostMap.preferredRanges)).
  112. WithField("initiatorIndex", hostinfo.localIndexId).
  113. WithField("remoteIndex", hostinfo.remoteIndexId).
  114. WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
  115. WithField("durationNs", time.Since(hostinfo.handshakeStart).Nanoseconds()).
  116. Info("Handshake timed out")
  117. c.metricTimedOut.Inc(1)
  118. c.pendingHostMap.DeleteHostInfo(hostinfo)
  119. return
  120. }
  121. // We only care about a lighthouse trigger before the first handshake transmit attempt. This is a very specific
  122. // optimization for a fast lighthouse reply
  123. //TODO: it would feel better to do this once, anytime, as our delay increases over time
  124. if lighthouseTriggered && hostinfo.HandshakeCounter > 0 {
  125. // If we didn't return here a lighthouse could cause us to aggressively send handshakes
  126. return
  127. }
  128. // Get a remotes object if we don't already have one.
  129. // This is mainly to protect us as this should never be the case
  130. if hostinfo.remotes == nil {
  131. hostinfo.remotes = c.lightHouse.QueryCache(vpnIp)
  132. }
  133. //TODO: this will generate a load of queries for hosts with only 1 ip (i'm not using a lighthouse, static mapped)
  134. if hostinfo.remotes.Len(c.pendingHostMap.preferredRanges) <= 1 {
  135. // If we only have 1 remote it is highly likely our query raced with the other host registered within the lighthouse
  136. // Our vpnIp here has a tunnel with a lighthouse but has yet to send a host update packet there so we only know about
  137. // the learned public ip for them. Query again to short circuit the promotion counter
  138. c.lightHouse.QueryServer(vpnIp, f)
  139. }
  140. // Send a the handshake to all known ips, stage 2 takes care of assigning the hostinfo.remote based on the first to reply
  141. var sentTo []*udp.Addr
  142. hostinfo.remotes.ForEach(c.pendingHostMap.preferredRanges, func(addr *udp.Addr, _ bool) {
  143. c.messageMetrics.Tx(header.Handshake, header.MessageSubType(hostinfo.HandshakePacket[0][1]), 1)
  144. err = c.outside.WriteTo(hostinfo.HandshakePacket[0], addr)
  145. if err != nil {
  146. hostinfo.logger(c.l).WithField("udpAddr", addr).
  147. WithField("initiatorIndex", hostinfo.localIndexId).
  148. WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
  149. WithError(err).Error("Failed to send handshake message")
  150. } else {
  151. sentTo = append(sentTo, addr)
  152. }
  153. })
  154. // Don't be too noisy or confusing if we fail to send a handshake - if we don't get through we'll eventually log a timeout
  155. if len(sentTo) > 0 {
  156. hostinfo.logger(c.l).WithField("udpAddrs", sentTo).
  157. WithField("initiatorIndex", hostinfo.localIndexId).
  158. WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
  159. Info("Handshake message sent")
  160. }
  161. // Increment the counter to increase our delay, linear backoff
  162. hostinfo.HandshakeCounter++
  163. // If a lighthouse triggered this attempt then we are still in the timer wheel and do not need to re-add
  164. if !lighthouseTriggered {
  165. //TODO: feel like we dupe handshake real fast in a tight loop, why?
  166. c.OutboundHandshakeTimer.Add(vpnIp, c.config.tryInterval*time.Duration(hostinfo.HandshakeCounter))
  167. }
  168. }
  169. func (c *HandshakeManager) AddVpnIp(vpnIp iputil.VpnIp) *HostInfo {
  170. hostinfo := c.pendingHostMap.AddVpnIp(vpnIp)
  171. // We lock here and use an array to insert items to prevent locking the
  172. // main receive thread for very long by waiting to add items to the pending map
  173. //TODO: what lock?
  174. c.OutboundHandshakeTimer.Add(vpnIp, c.config.tryInterval)
  175. c.metricInitiated.Inc(1)
  176. return hostinfo
  177. }
  178. var (
  179. ErrExistingHostInfo = errors.New("existing hostinfo")
  180. ErrAlreadySeen = errors.New("already seen")
  181. ErrLocalIndexCollision = errors.New("local index collision")
  182. ErrExistingHandshake = errors.New("existing handshake")
  183. )
  184. // CheckAndComplete checks for any conflicts in the main and pending hostmap
  185. // before adding hostinfo to main. If err is nil, it was added. Otherwise err will be:
  186. //
  187. // ErrAlreadySeen if we already have an entry in the hostmap that has seen the
  188. // exact same handshake packet
  189. //
  190. // ErrExistingHostInfo if we already have an entry in the hostmap for this
  191. // VpnIp and the new handshake was older than the one we currently have
  192. //
  193. // ErrLocalIndexCollision if we already have an entry in the main or pending
  194. // hostmap for the hostinfo.localIndexId.
  195. func (c *HandshakeManager) CheckAndComplete(hostinfo *HostInfo, handshakePacket uint8, overwrite bool, f *Interface) (*HostInfo, error) {
  196. c.pendingHostMap.Lock()
  197. defer c.pendingHostMap.Unlock()
  198. c.mainHostMap.Lock()
  199. defer c.mainHostMap.Unlock()
  200. // Check if we already have a tunnel with this vpn ip
  201. existingHostInfo, found := c.mainHostMap.Hosts[hostinfo.vpnIp]
  202. if found && existingHostInfo != nil {
  203. // Is it just a delayed handshake packet?
  204. if bytes.Equal(hostinfo.HandshakePacket[handshakePacket], existingHostInfo.HandshakePacket[handshakePacket]) {
  205. return existingHostInfo, ErrAlreadySeen
  206. }
  207. // Is this a newer handshake?
  208. if existingHostInfo.lastHandshakeTime >= hostinfo.lastHandshakeTime {
  209. return existingHostInfo, ErrExistingHostInfo
  210. }
  211. existingHostInfo.logger(c.l).Info("Taking new handshake")
  212. }
  213. existingIndex, found := c.mainHostMap.Indexes[hostinfo.localIndexId]
  214. if found {
  215. // We have a collision, but for a different hostinfo
  216. return existingIndex, ErrLocalIndexCollision
  217. }
  218. existingIndex, found = c.pendingHostMap.Indexes[hostinfo.localIndexId]
  219. if found && existingIndex != hostinfo {
  220. // We have a collision, but for a different hostinfo
  221. return existingIndex, ErrLocalIndexCollision
  222. }
  223. existingRemoteIndex, found := c.mainHostMap.RemoteIndexes[hostinfo.remoteIndexId]
  224. if found && existingRemoteIndex != nil && existingRemoteIndex.vpnIp != hostinfo.vpnIp {
  225. // We have a collision, but this can happen since we can't control
  226. // the remote ID. Just log about the situation as a note.
  227. hostinfo.logger(c.l).
  228. WithField("remoteIndex", hostinfo.remoteIndexId).WithField("collision", existingRemoteIndex.vpnIp).
  229. Info("New host shadows existing host remoteIndex")
  230. }
  231. // Check if we are also handshaking with this vpn ip
  232. pendingHostInfo, found := c.pendingHostMap.Hosts[hostinfo.vpnIp]
  233. if found && pendingHostInfo != nil {
  234. if !overwrite {
  235. // We won, let our pending handshake win
  236. return pendingHostInfo, ErrExistingHandshake
  237. }
  238. // We lost, take this handshake and move any cached packets over so they get sent
  239. pendingHostInfo.ConnectionState.queueLock.Lock()
  240. hostinfo.packetStore = append(hostinfo.packetStore, pendingHostInfo.packetStore...)
  241. c.pendingHostMap.unlockedDeleteHostInfo(pendingHostInfo)
  242. pendingHostInfo.ConnectionState.queueLock.Unlock()
  243. pendingHostInfo.logger(c.l).Info("Handshake race lost, replacing pending handshake with completed tunnel")
  244. }
  245. if existingHostInfo != nil {
  246. // We are going to overwrite this entry, so remove the old references
  247. delete(c.mainHostMap.Hosts, existingHostInfo.vpnIp)
  248. delete(c.mainHostMap.Indexes, existingHostInfo.localIndexId)
  249. delete(c.mainHostMap.RemoteIndexes, existingHostInfo.remoteIndexId)
  250. }
  251. c.mainHostMap.addHostInfo(hostinfo, f)
  252. return existingHostInfo, nil
  253. }
  254. // Complete is a simpler version of CheckAndComplete when we already know we
  255. // won't have a localIndexId collision because we already have an entry in the
  256. // pendingHostMap
  257. func (c *HandshakeManager) Complete(hostinfo *HostInfo, f *Interface) {
  258. c.pendingHostMap.Lock()
  259. defer c.pendingHostMap.Unlock()
  260. c.mainHostMap.Lock()
  261. defer c.mainHostMap.Unlock()
  262. existingHostInfo, found := c.mainHostMap.Hosts[hostinfo.vpnIp]
  263. if found && existingHostInfo != nil {
  264. // We are going to overwrite this entry, so remove the old references
  265. delete(c.mainHostMap.Hosts, existingHostInfo.vpnIp)
  266. delete(c.mainHostMap.Indexes, existingHostInfo.localIndexId)
  267. delete(c.mainHostMap.RemoteIndexes, existingHostInfo.remoteIndexId)
  268. }
  269. existingRemoteIndex, found := c.mainHostMap.RemoteIndexes[hostinfo.remoteIndexId]
  270. if found && existingRemoteIndex != nil {
  271. // We have a collision, but this can happen since we can't control
  272. // the remote ID. Just log about the situation as a note.
  273. hostinfo.logger(c.l).
  274. WithField("remoteIndex", hostinfo.remoteIndexId).WithField("collision", existingRemoteIndex.vpnIp).
  275. Info("New host shadows existing host remoteIndex")
  276. }
  277. c.mainHostMap.addHostInfo(hostinfo, f)
  278. c.pendingHostMap.unlockedDeleteHostInfo(hostinfo)
  279. }
  280. // AddIndexHostInfo generates a unique localIndexId for this HostInfo
  281. // and adds it to the pendingHostMap. Will error if we are unable to generate
  282. // a unique localIndexId
  283. func (c *HandshakeManager) AddIndexHostInfo(h *HostInfo) error {
  284. c.pendingHostMap.Lock()
  285. defer c.pendingHostMap.Unlock()
  286. c.mainHostMap.RLock()
  287. defer c.mainHostMap.RUnlock()
  288. for i := 0; i < 32; i++ {
  289. index, err := generateIndex(c.l)
  290. if err != nil {
  291. return err
  292. }
  293. _, inPending := c.pendingHostMap.Indexes[index]
  294. _, inMain := c.mainHostMap.Indexes[index]
  295. if !inMain && !inPending {
  296. h.localIndexId = index
  297. c.pendingHostMap.Indexes[index] = h
  298. return nil
  299. }
  300. }
  301. return errors.New("failed to generate unique localIndexId")
  302. }
  303. func (c *HandshakeManager) addRemoteIndexHostInfo(index uint32, h *HostInfo) {
  304. c.pendingHostMap.addRemoteIndexHostInfo(index, h)
  305. }
  306. func (c *HandshakeManager) DeleteHostInfo(hostinfo *HostInfo) {
  307. //l.Debugln("Deleting pending hostinfo :", hostinfo)
  308. c.pendingHostMap.DeleteHostInfo(hostinfo)
  309. }
  310. func (c *HandshakeManager) QueryIndex(index uint32) (*HostInfo, error) {
  311. return c.pendingHostMap.QueryIndex(index)
  312. }
  313. func (c *HandshakeManager) EmitStats() {
  314. c.pendingHostMap.EmitStats("pending")
  315. c.mainHostMap.EmitStats("main")
  316. }
  317. // Utility functions below
  318. func generateIndex(l *logrus.Logger) (uint32, error) {
  319. b := make([]byte, 4)
  320. // Let zero mean we don't know the ID, so don't generate zero
  321. var index uint32
  322. for index == 0 {
  323. _, err := rand.Read(b)
  324. if err != nil {
  325. l.Errorln(err)
  326. return 0, err
  327. }
  328. index = binary.BigEndian.Uint32(b)
  329. }
  330. if l.Level >= logrus.DebugLevel {
  331. l.WithField("index", index).
  332. Debug("Generated index")
  333. }
  334. return index, nil
  335. }
  336. func hsTimeout(tries int, interval time.Duration) time.Duration {
  337. return time.Duration(tries / 2 * ((2 * int(interval)) + (tries-1)*int(interval)))
  338. }