metrics.go 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. package logic
  2. import (
  3. "encoding/json"
  4. mqtt "github.com/eclipse/paho.mqtt.golang"
  5. "github.com/gravitl/netmaker/database"
  6. "github.com/gravitl/netmaker/logic"
  7. "github.com/gravitl/netmaker/models"
  8. "github.com/gravitl/netmaker/mq"
  9. "github.com/gravitl/netmaker/netclient/ncutils"
  10. "github.com/gravitl/netmaker/servercfg"
  11. "golang.org/x/exp/slog"
  12. "math"
  13. "time"
  14. )
  15. // GetMetrics - gets the metrics
  16. func GetMetrics(nodeid string) (*models.Metrics, error) {
  17. var metrics models.Metrics
  18. record, err := database.FetchRecord(database.METRICS_TABLE_NAME, nodeid)
  19. if err != nil {
  20. if database.IsEmptyRecord(err) {
  21. return &metrics, nil
  22. }
  23. return &metrics, err
  24. }
  25. err = json.Unmarshal([]byte(record), &metrics)
  26. if err != nil {
  27. return &metrics, err
  28. }
  29. return &metrics, nil
  30. }
  31. // UpdateMetrics - updates the metrics of a given client
  32. func UpdateMetrics(nodeid string, metrics *models.Metrics) error {
  33. data, err := json.Marshal(metrics)
  34. if err != nil {
  35. return err
  36. }
  37. return database.Insert(nodeid, string(data), database.METRICS_TABLE_NAME)
  38. }
  39. // DeleteMetrics - deletes metrics of a given node
  40. func DeleteMetrics(nodeid string) error {
  41. return database.DeleteRecord(database.METRICS_TABLE_NAME, nodeid)
  42. }
  43. func MQUpdateMetrics(client mqtt.Client, msg mqtt.Message) {
  44. id, err := mq.GetID(msg.Topic())
  45. if err != nil {
  46. slog.Error("error getting ID sent on ", "topic", msg.Topic(), "error", err)
  47. return
  48. }
  49. currentNode, err := logic.GetNodeByID(id)
  50. if err != nil {
  51. slog.Error("error getting node", "id", id, "error", err)
  52. return
  53. }
  54. decrypted, decryptErr := mq.DecryptMsg(&currentNode, msg.Payload())
  55. if decryptErr != nil {
  56. slog.Error("failed to decrypt message for node", "id", id, "error", decryptErr)
  57. return
  58. }
  59. var newMetrics models.Metrics
  60. if err := json.Unmarshal(decrypted, &newMetrics); err != nil {
  61. slog.Error("error unmarshaling payload", "error", err)
  62. return
  63. }
  64. shouldUpdate := updateNodeMetrics(&currentNode, &newMetrics)
  65. if err = logic.UpdateMetrics(id, &newMetrics); err != nil {
  66. slog.Error("failed to update node metrics", "id", id, "error", err)
  67. return
  68. }
  69. if servercfg.IsMetricsExporter() {
  70. if err := mq.PushMetricsToExporter(newMetrics); err != nil {
  71. slog.Error("failed to push node metrics to exporter", "id", currentNode.ID, "error", err)
  72. }
  73. }
  74. if newMetrics.Connectivity != nil {
  75. err := logic.EnterpriseFailoverFunc(&currentNode)
  76. if err != nil {
  77. slog.Error("failed to failover for node", "id", currentNode.ID, "network", currentNode.Network, "error", err)
  78. }
  79. }
  80. if shouldUpdate {
  81. slog.Info("updating peers after node detected connectivity issues", "id", currentNode.ID, "network", currentNode.Network)
  82. host, err := logic.GetHost(currentNode.HostID.String())
  83. if err == nil {
  84. nodes, err := logic.GetAllNodes()
  85. if err != nil {
  86. return
  87. }
  88. if err = mq.PublishSingleHostPeerUpdate(host, nodes, nil, nil); err != nil {
  89. slog.Warn("failed to publish update after failover peer change for node", "id", currentNode.ID, "network", currentNode.Network, "error", err)
  90. }
  91. }
  92. }
  93. slog.Debug("updated node metrics", "id", id)
  94. }
  95. func updateNodeMetrics(currentNode *models.Node, newMetrics *models.Metrics) bool {
  96. if newMetrics.FailoverPeers == nil {
  97. newMetrics.FailoverPeers = make(map[string]string)
  98. }
  99. oldMetrics, err := logic.GetMetrics(currentNode.ID.String())
  100. if err != nil {
  101. slog.Error("error finding old metrics for node", "id", currentNode.ID, "error", err)
  102. return false
  103. }
  104. if oldMetrics.FailoverPeers == nil {
  105. oldMetrics.FailoverPeers = make(map[string]string)
  106. }
  107. var attachedClients []models.ExtClient
  108. if currentNode.IsIngressGateway {
  109. clients, err := logic.GetExtClientsByID(currentNode.ID.String(), currentNode.Network)
  110. if err == nil {
  111. attachedClients = clients
  112. }
  113. }
  114. if len(attachedClients) > 0 {
  115. // associate ext clients with IDs
  116. for i := range attachedClients {
  117. extMetric := newMetrics.Connectivity[attachedClients[i].PublicKey]
  118. if len(extMetric.NodeName) == 0 &&
  119. len(newMetrics.Connectivity[attachedClients[i].ClientID].NodeName) > 0 { // cover server clients
  120. extMetric = newMetrics.Connectivity[attachedClients[i].ClientID]
  121. if extMetric.TotalReceived > 0 && extMetric.TotalSent > 0 {
  122. extMetric.Connected = true
  123. }
  124. }
  125. extMetric.NodeName = attachedClients[i].ClientID
  126. delete(newMetrics.Connectivity, attachedClients[i].PublicKey)
  127. newMetrics.Connectivity[attachedClients[i].ClientID] = extMetric
  128. }
  129. }
  130. // run through metrics for each peer
  131. for k := range newMetrics.Connectivity {
  132. currMetric := newMetrics.Connectivity[k]
  133. oldMetric := oldMetrics.Connectivity[k]
  134. currMetric.TotalTime += oldMetric.TotalTime
  135. currMetric.Uptime += oldMetric.Uptime // get the total uptime for this connection
  136. if currMetric.TotalReceived < oldMetric.TotalReceived {
  137. currMetric.TotalReceived += oldMetric.TotalReceived
  138. } else {
  139. currMetric.TotalReceived += int64(math.Abs(float64(currMetric.TotalReceived) - float64(oldMetric.TotalReceived)))
  140. }
  141. if currMetric.TotalSent < oldMetric.TotalSent {
  142. currMetric.TotalSent += oldMetric.TotalSent
  143. } else {
  144. currMetric.TotalSent += int64(math.Abs(float64(currMetric.TotalSent) - float64(oldMetric.TotalSent)))
  145. }
  146. if currMetric.Uptime == 0 || currMetric.TotalTime == 0 {
  147. currMetric.PercentUp = 0
  148. } else {
  149. currMetric.PercentUp = 100.0 * (float64(currMetric.Uptime) / float64(currMetric.TotalTime))
  150. }
  151. totalUpMinutes := currMetric.Uptime * ncutils.CheckInInterval
  152. currMetric.ActualUptime = time.Duration(totalUpMinutes) * time.Minute
  153. delete(oldMetrics.Connectivity, k) // remove from old data
  154. newMetrics.Connectivity[k] = currMetric
  155. }
  156. // add nodes that need failover
  157. nodes, err := logic.GetNetworkNodes(currentNode.Network)
  158. if err != nil {
  159. slog.Error("failed to retrieve nodes while updating metrics", "error", err)
  160. return false
  161. }
  162. for _, node := range nodes {
  163. if !newMetrics.Connectivity[node.ID.String()].Connected &&
  164. len(newMetrics.Connectivity[node.ID.String()].NodeName) > 0 &&
  165. node.Connected &&
  166. len(node.FailoverNode) > 0 &&
  167. !node.Failover {
  168. newMetrics.FailoverPeers[node.ID.String()] = node.FailoverNode.String()
  169. }
  170. }
  171. shouldUpdate := len(oldMetrics.FailoverPeers) == 0 && len(newMetrics.FailoverPeers) > 0
  172. for k, v := range oldMetrics.FailoverPeers {
  173. if len(newMetrics.FailoverPeers[k]) > 0 && len(v) == 0 {
  174. shouldUpdate = true
  175. }
  176. if len(v) > 0 && len(newMetrics.FailoverPeers[k]) == 0 {
  177. newMetrics.FailoverPeers[k] = v
  178. }
  179. }
  180. for k := range oldMetrics.Connectivity { // cleanup any left over data, self healing
  181. delete(newMetrics.Connectivity, k)
  182. }
  183. return shouldUpdate
  184. }