handshake_manager.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387
  1. package nebula
  2. import (
  3. "bytes"
  4. "crypto/rand"
  5. "encoding/binary"
  6. "errors"
  7. "net"
  8. "time"
  9. "github.com/rcrowley/go-metrics"
  10. "github.com/sirupsen/logrus"
  11. )
  // Defaults applied to outbound handshakes when no explicit configuration is supplied.
  const (
  	// DefaultHandshakeTryInterval is the default timer tick period and the
  	// base unit of the delay between handshake attempts.
  	DefaultHandshakeTryInterval = 100 * time.Millisecond

  	// DefaultHandshakeRetries is the default number of attempts made before a
  	// pending handshake is considered timed out.
  	DefaultHandshakeRetries = 10

  	// DefaultHandshakeTriggerBuffer is the default capacity of the channel used
  	// to trigger outbound handshakes.
  	DefaultHandshakeTriggerBuffer = 64
  )
  17. var (
  18. defaultHandshakeConfig = HandshakeConfig{
  19. tryInterval: DefaultHandshakeTryInterval,
  20. retries: DefaultHandshakeRetries,
  21. triggerBuffer: DefaultHandshakeTriggerBuffer,
  22. }
  23. )
  24. type HandshakeConfig struct {
  25. tryInterval time.Duration
  26. retries int
  27. triggerBuffer int
  28. messageMetrics *MessageMetrics
  29. }
  30. type HandshakeManager struct {
  31. pendingHostMap *HostMap
  32. mainHostMap *HostMap
  33. lightHouse *LightHouse
  34. outside *udpConn
  35. config HandshakeConfig
  36. OutboundHandshakeTimer *SystemTimerWheel
  37. messageMetrics *MessageMetrics
  38. metricInitiated metrics.Counter
  39. metricTimedOut metrics.Counter
  40. l *logrus.Logger
  41. // can be used to trigger outbound handshake for the given vpnIP
  42. trigger chan uint32
  43. }
  44. func NewHandshakeManager(l *logrus.Logger, tunCidr *net.IPNet, preferredRanges []*net.IPNet, mainHostMap *HostMap, lightHouse *LightHouse, outside *udpConn, config HandshakeConfig) *HandshakeManager {
  45. return &HandshakeManager{
  46. pendingHostMap: NewHostMap(l, "pending", tunCidr, preferredRanges),
  47. mainHostMap: mainHostMap,
  48. lightHouse: lightHouse,
  49. outside: outside,
  50. config: config,
  51. trigger: make(chan uint32, config.triggerBuffer),
  52. OutboundHandshakeTimer: NewSystemTimerWheel(config.tryInterval, hsTimeout(config.retries, config.tryInterval)),
  53. messageMetrics: config.messageMetrics,
  54. metricInitiated: metrics.GetOrRegisterCounter("handshake_manager.initiated", nil),
  55. metricTimedOut: metrics.GetOrRegisterCounter("handshake_manager.timed_out", nil),
  56. l: l,
  57. }
  58. }
  59. func (c *HandshakeManager) Run(f EncWriter) {
  60. clockSource := time.Tick(c.config.tryInterval)
  61. for {
  62. select {
  63. case vpnIP := <-c.trigger:
  64. c.l.WithField("vpnIp", IntIp(vpnIP)).Debug("HandshakeManager: triggered")
  65. c.handleOutbound(vpnIP, f, true)
  66. case now := <-clockSource:
  67. c.NextOutboundHandshakeTimerTick(now, f)
  68. }
  69. }
  70. }
  71. func (c *HandshakeManager) NextOutboundHandshakeTimerTick(now time.Time, f EncWriter) {
  72. c.OutboundHandshakeTimer.advance(now)
  73. for {
  74. ep := c.OutboundHandshakeTimer.Purge()
  75. if ep == nil {
  76. break
  77. }
  78. vpnIP := ep.(uint32)
  79. c.handleOutbound(vpnIP, f, false)
  80. }
  81. }
  82. func (c *HandshakeManager) handleOutbound(vpnIP uint32, f EncWriter, lighthouseTriggered bool) {
  83. hostinfo, err := c.pendingHostMap.QueryVpnIP(vpnIP)
  84. if err != nil {
  85. return
  86. }
  87. hostinfo.Lock()
  88. defer hostinfo.Unlock()
  89. // We may have raced to completion but now that we have a lock we should ensure we have not yet completed.
  90. if hostinfo.HandshakeComplete {
  91. // Ensure we don't exist in the pending hostmap anymore since we have completed
  92. c.pendingHostMap.DeleteHostInfo(hostinfo)
  93. return
  94. }
  95. // Check if we have a handshake packet to transmit yet
  96. if !hostinfo.HandshakeReady {
  97. // There is currently a slight race in getOrHandshake due to ConnectionState not being part of the HostInfo directly
  98. // Our hostinfo here was added to the pending map and the wheel may have ticked to us before we created ConnectionState
  99. c.OutboundHandshakeTimer.Add(vpnIP, c.config.tryInterval*time.Duration(hostinfo.HandshakeCounter))
  100. return
  101. }
  102. // If we are out of time, clean up
  103. if hostinfo.HandshakeCounter >= c.config.retries {
  104. hostinfo.logger(c.l).WithField("udpAddrs", hostinfo.remotes.CopyAddrs(c.pendingHostMap.preferredRanges)).
  105. WithField("initiatorIndex", hostinfo.localIndexId).
  106. WithField("remoteIndex", hostinfo.remoteIndexId).
  107. WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
  108. WithField("durationNs", time.Since(hostinfo.handshakeStart).Nanoseconds()).
  109. Info("Handshake timed out")
  110. c.metricTimedOut.Inc(1)
  111. c.pendingHostMap.DeleteHostInfo(hostinfo)
  112. return
  113. }
  114. // We only care about a lighthouse trigger before the first handshake transmit attempt. This is a very specific
  115. // optimization for a fast lighthouse reply
  116. //TODO: it would feel better to do this once, anytime, as our delay increases over time
  117. if lighthouseTriggered && hostinfo.HandshakeCounter > 0 {
  118. // If we didn't return here a lighthouse could cause us to aggressively send handshakes
  119. return
  120. }
  121. // Get a remotes object if we don't already have one.
  122. // This is mainly to protect us as this should never be the case
  123. if hostinfo.remotes == nil {
  124. hostinfo.remotes = c.lightHouse.QueryCache(vpnIP)
  125. }
  126. //TODO: this will generate a load of queries for hosts with only 1 ip (i'm not using a lighthouse, static mapped)
  127. if hostinfo.remotes.Len(c.pendingHostMap.preferredRanges) <= 1 {
  128. // If we only have 1 remote it is highly likely our query raced with the other host registered within the lighthouse
  129. // Our vpnIP here has a tunnel with a lighthouse but has yet to send a host update packet there so we only know about
  130. // the learned public ip for them. Query again to short circuit the promotion counter
  131. c.lightHouse.QueryServer(vpnIP, f)
  132. }
  133. // Send a the handshake to all known ips, stage 2 takes care of assigning the hostinfo.remote based on the first to reply
  134. var sentTo []*udpAddr
  135. hostinfo.remotes.ForEach(c.pendingHostMap.preferredRanges, func(addr *udpAddr, _ bool) {
  136. c.messageMetrics.Tx(handshake, NebulaMessageSubType(hostinfo.HandshakePacket[0][1]), 1)
  137. err = c.outside.WriteTo(hostinfo.HandshakePacket[0], addr)
  138. if err != nil {
  139. hostinfo.logger(c.l).WithField("udpAddr", addr).
  140. WithField("initiatorIndex", hostinfo.localIndexId).
  141. WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
  142. WithError(err).Error("Failed to send handshake message")
  143. } else {
  144. sentTo = append(sentTo, addr)
  145. }
  146. })
  147. // Don't be too noisy or confusing if we fail to send a handshake - if we don't get through we'll eventually log a timeout
  148. if len(sentTo) > 0 {
  149. hostinfo.logger(c.l).WithField("udpAddrs", sentTo).
  150. WithField("initiatorIndex", hostinfo.localIndexId).
  151. WithField("handshake", m{"stage": 1, "style": "ix_psk0"}).
  152. Info("Handshake message sent")
  153. }
  154. // Increment the counter to increase our delay, linear backoff
  155. hostinfo.HandshakeCounter++
  156. // If a lighthouse triggered this attempt then we are still in the timer wheel and do not need to re-add
  157. if !lighthouseTriggered {
  158. //TODO: feel like we dupe handshake real fast in a tight loop, why?
  159. c.OutboundHandshakeTimer.Add(vpnIP, c.config.tryInterval*time.Duration(hostinfo.HandshakeCounter))
  160. }
  161. }
  162. func (c *HandshakeManager) AddVpnIP(vpnIP uint32) *HostInfo {
  163. hostinfo := c.pendingHostMap.AddVpnIP(vpnIP)
  164. // We lock here and use an array to insert items to prevent locking the
  165. // main receive thread for very long by waiting to add items to the pending map
  166. //TODO: what lock?
  167. c.OutboundHandshakeTimer.Add(vpnIP, c.config.tryInterval)
  168. c.metricInitiated.Inc(1)
  169. return hostinfo
  170. }
  // Sentinel errors returned by CheckAndComplete.

  // ErrExistingHostInfo reports that the main hostmap already holds a handshake
  // for this vpn ip that is at least as new as the offered one.
  var ErrExistingHostInfo = errors.New("existing hostinfo")

  // ErrAlreadySeen reports that the main hostmap entry has already seen this
  // exact handshake packet.
  var ErrAlreadySeen = errors.New("already seen")

  // ErrLocalIndexCollision reports that the localIndexId is already in use in
  // the main or pending hostmap.
  var ErrLocalIndexCollision = errors.New("local index collision")

  // ErrExistingHandshake reports that a pending handshake for this vpn ip
  // exists and was not allowed to be overwritten.
  var ErrExistingHandshake = errors.New("existing handshake")
  177. // CheckAndComplete checks for any conflicts in the main and pending hostmap
  178. // before adding hostinfo to main. If err is nil, it was added. Otherwise err will be:
  179. // ErrAlreadySeen if we already have an entry in the hostmap that has seen the
  180. // exact same handshake packet
  181. //
  182. // ErrExistingHostInfo if we already have an entry in the hostmap for this
  183. // VpnIP and the new handshake was older than the one we currently have
  184. //
  185. // ErrLocalIndexCollision if we already have an entry in the main or pending
  186. // hostmap for the hostinfo.localIndexId.
  187. func (c *HandshakeManager) CheckAndComplete(hostinfo *HostInfo, handshakePacket uint8, overwrite bool, f *Interface) (*HostInfo, error) {
  188. c.pendingHostMap.Lock()
  189. defer c.pendingHostMap.Unlock()
  190. c.mainHostMap.Lock()
  191. defer c.mainHostMap.Unlock()
  192. // Check if we already have a tunnel with this vpn ip
  193. existingHostInfo, found := c.mainHostMap.Hosts[hostinfo.hostId]
  194. if found && existingHostInfo != nil {
  195. // Is it just a delayed handshake packet?
  196. if bytes.Equal(hostinfo.HandshakePacket[handshakePacket], existingHostInfo.HandshakePacket[handshakePacket]) {
  197. return existingHostInfo, ErrAlreadySeen
  198. }
  199. // Is this a newer handshake?
  200. if existingHostInfo.lastHandshakeTime >= hostinfo.lastHandshakeTime {
  201. return existingHostInfo, ErrExistingHostInfo
  202. }
  203. existingHostInfo.logger(c.l).Info("Taking new handshake")
  204. }
  205. existingIndex, found := c.mainHostMap.Indexes[hostinfo.localIndexId]
  206. if found {
  207. // We have a collision, but for a different hostinfo
  208. return existingIndex, ErrLocalIndexCollision
  209. }
  210. existingIndex, found = c.pendingHostMap.Indexes[hostinfo.localIndexId]
  211. if found && existingIndex != hostinfo {
  212. // We have a collision, but for a different hostinfo
  213. return existingIndex, ErrLocalIndexCollision
  214. }
  215. existingRemoteIndex, found := c.mainHostMap.RemoteIndexes[hostinfo.remoteIndexId]
  216. if found && existingRemoteIndex != nil && existingRemoteIndex.hostId != hostinfo.hostId {
  217. // We have a collision, but this can happen since we can't control
  218. // the remote ID. Just log about the situation as a note.
  219. hostinfo.logger(c.l).
  220. WithField("remoteIndex", hostinfo.remoteIndexId).WithField("collision", IntIp(existingRemoteIndex.hostId)).
  221. Info("New host shadows existing host remoteIndex")
  222. }
  223. // Check if we are also handshaking with this vpn ip
  224. pendingHostInfo, found := c.pendingHostMap.Hosts[hostinfo.hostId]
  225. if found && pendingHostInfo != nil {
  226. if !overwrite {
  227. // We won, let our pending handshake win
  228. return pendingHostInfo, ErrExistingHandshake
  229. }
  230. // We lost, take this handshake and move any cached packets over so they get sent
  231. pendingHostInfo.ConnectionState.queueLock.Lock()
  232. hostinfo.packetStore = append(hostinfo.packetStore, pendingHostInfo.packetStore...)
  233. c.pendingHostMap.unlockedDeleteHostInfo(pendingHostInfo)
  234. pendingHostInfo.ConnectionState.queueLock.Unlock()
  235. pendingHostInfo.logger(c.l).Info("Handshake race lost, replacing pending handshake with completed tunnel")
  236. }
  237. if existingHostInfo != nil {
  238. // We are going to overwrite this entry, so remove the old references
  239. delete(c.mainHostMap.Hosts, existingHostInfo.hostId)
  240. delete(c.mainHostMap.Indexes, existingHostInfo.localIndexId)
  241. delete(c.mainHostMap.RemoteIndexes, existingHostInfo.remoteIndexId)
  242. }
  243. c.mainHostMap.addHostInfo(hostinfo, f)
  244. return existingHostInfo, nil
  245. }
  246. // Complete is a simpler version of CheckAndComplete when we already know we
  247. // won't have a localIndexId collision because we already have an entry in the
  248. // pendingHostMap
  249. func (c *HandshakeManager) Complete(hostinfo *HostInfo, f *Interface) {
  250. c.pendingHostMap.Lock()
  251. defer c.pendingHostMap.Unlock()
  252. c.mainHostMap.Lock()
  253. defer c.mainHostMap.Unlock()
  254. existingHostInfo, found := c.mainHostMap.Hosts[hostinfo.hostId]
  255. if found && existingHostInfo != nil {
  256. // We are going to overwrite this entry, so remove the old references
  257. delete(c.mainHostMap.Hosts, existingHostInfo.hostId)
  258. delete(c.mainHostMap.Indexes, existingHostInfo.localIndexId)
  259. delete(c.mainHostMap.RemoteIndexes, existingHostInfo.remoteIndexId)
  260. }
  261. existingRemoteIndex, found := c.mainHostMap.RemoteIndexes[hostinfo.remoteIndexId]
  262. if found && existingRemoteIndex != nil {
  263. // We have a collision, but this can happen since we can't control
  264. // the remote ID. Just log about the situation as a note.
  265. hostinfo.logger(c.l).
  266. WithField("remoteIndex", hostinfo.remoteIndexId).WithField("collision", IntIp(existingRemoteIndex.hostId)).
  267. Info("New host shadows existing host remoteIndex")
  268. }
  269. c.mainHostMap.addHostInfo(hostinfo, f)
  270. c.pendingHostMap.unlockedDeleteHostInfo(hostinfo)
  271. }
  272. // AddIndexHostInfo generates a unique localIndexId for this HostInfo
  273. // and adds it to the pendingHostMap. Will error if we are unable to generate
  274. // a unique localIndexId
  275. func (c *HandshakeManager) AddIndexHostInfo(h *HostInfo) error {
  276. c.pendingHostMap.Lock()
  277. defer c.pendingHostMap.Unlock()
  278. c.mainHostMap.RLock()
  279. defer c.mainHostMap.RUnlock()
  280. for i := 0; i < 32; i++ {
  281. index, err := generateIndex(c.l)
  282. if err != nil {
  283. return err
  284. }
  285. _, inPending := c.pendingHostMap.Indexes[index]
  286. _, inMain := c.mainHostMap.Indexes[index]
  287. if !inMain && !inPending {
  288. h.localIndexId = index
  289. c.pendingHostMap.Indexes[index] = h
  290. return nil
  291. }
  292. }
  293. return errors.New("failed to generate unique localIndexId")
  294. }
  295. func (c *HandshakeManager) addRemoteIndexHostInfo(index uint32, h *HostInfo) {
  296. c.pendingHostMap.addRemoteIndexHostInfo(index, h)
  297. }
  298. func (c *HandshakeManager) DeleteHostInfo(hostinfo *HostInfo) {
  299. //l.Debugln("Deleting pending hostinfo :", hostinfo)
  300. c.pendingHostMap.DeleteHostInfo(hostinfo)
  301. }
  302. func (c *HandshakeManager) QueryIndex(index uint32) (*HostInfo, error) {
  303. return c.pendingHostMap.QueryIndex(index)
  304. }
  305. func (c *HandshakeManager) EmitStats() {
  306. c.pendingHostMap.EmitStats("pending")
  307. c.mainHostMap.EmitStats("main")
  308. }
  309. // Utility functions below
  310. func generateIndex(l *logrus.Logger) (uint32, error) {
  311. b := make([]byte, 4)
  312. // Let zero mean we don't know the ID, so don't generate zero
  313. var index uint32
  314. for index == 0 {
  315. _, err := rand.Read(b)
  316. if err != nil {
  317. l.Errorln(err)
  318. return 0, err
  319. }
  320. index = binary.BigEndian.Uint32(b)
  321. }
  322. if l.Level >= logrus.DebugLevel {
  323. l.WithField("index", index).
  324. Debug("Generated index")
  325. }
  326. return index, nil
  327. }
  // hsTimeout returns the total time spanned by tries handshake attempts when
  // the delay before attempt k is k*interval (linear backoff), i.e. the
  // arithmetic series interval + 2*interval + ... + tries*interval, which is
  // tries*(tries+1)/2 intervals.
  //
  // tries is expected to be non-negative; zero tries yields a zero duration.
  //
  // The sum is computed as n*(n+1)/2 because one of n and n+1 is always even,
  // so the division is exact. Halving tries first would truncate for odd
  // values and under-report the timeout. Working in time.Duration (int64)
  // rather than int also keeps the nanosecond arithmetic from overflowing on
  // platforms where int is 32 bits wide.
  func hsTimeout(tries int, interval time.Duration) time.Duration {
  	n := time.Duration(tries)
  	return n * (n + 1) / 2 * interval
  }