2
0
Эх сурвалжийг харах

Controller Metrics & Network Config Request Fix (#2003)

* add new metrics for network config request queue size and SSO expirations
* move SSO expiration to its own thread in the controller
* fix potential undefined behavior when modifying a set
Grant Limberg 2 жил өмнө
parent
commit
adfbbc3fb0

+ 41 - 28
controller/EmbeddedNetworkController.cpp

@@ -468,6 +468,8 @@ EmbeddedNetworkController::EmbeddedNetworkController(Node *node,const char *ztPa
 	_path(dbPath),
 	_sender((NetworkController::Sender *)0),
 	_db(this),
+	_ssoExpiryRunning(true),
+	_ssoExpiry(std::thread(&EmbeddedNetworkController::_ssoExpiryThread, this)),
 	_rc(rc)
 {
 }
@@ -476,8 +478,11 @@ EmbeddedNetworkController::~EmbeddedNetworkController()
 {
 	std::lock_guard<std::mutex> l(_threads_l);
 	_queue.stop();
-	for(auto t=_threads.begin();t!=_threads.end();++t)
+	for(auto t=_threads.begin();t!=_threads.end();++t) { // Join each request worker (their loops break when _queue.get() returns STOP).
 		t->join();
+	}
+	_ssoExpiryRunning = false; // Clear the flag that the while-loop in _ssoExpiryThread tests.
+	_ssoExpiry.join(); // Blocks until that loop finishes its current pass and 500 ms sleep. NOTE(review): the flag is a plain bool written here and read on the other thread - confirm whether it should be std::atomic<bool>.
 }
 
 void EmbeddedNetworkController::setSSORedirectURL(const std::string &url) {
@@ -1543,7 +1548,7 @@ void EmbeddedNetworkController::_request(
 					*(reinterpret_cast<InetAddress *>(&(r->target))) = t;
 					if (v.ss_family == t.ss_family)
 						*(reinterpret_cast<InetAddress *>(&(r->via))) = v;
-					++nc->routeCount;
+		 			++nc->routeCount;
 				}
 			}
 		}
@@ -1765,10 +1770,9 @@ void EmbeddedNetworkController::_startThreads()
 	const long hwc = std::max((long)std::thread::hardware_concurrency(),(long)1);
 	for(long t=0;t<hwc;++t) {
 		_threads.emplace_back([this]() {
-			std::vector<_MemberStatusKey> expired;
-			nlohmann::json network, member;
 			for(;;) {
 				_RQEntry *qe = (_RQEntry *)0;
+				Metrics::network_config_request_queue_size = _queue.size();
 				auto timedWaitResult = _queue.get(qe, 1000);
 				if (timedWaitResult == BlockingQueue<_RQEntry *>::STOP) {
 					break;
@@ -1782,37 +1786,46 @@ void EmbeddedNetworkController::_startThreads()
 							fprintf(stderr,"ERROR: exception in controller request handling thread: unknown exception" ZT_EOL_S);
 						}
 						delete qe;
+						qe = nullptr;
 					}
 				}
+			}
+		});
+	}
+}
 
-				expired.clear();
-				int64_t now = OSUtils::now();
-				{
-					std::lock_guard<std::mutex> l(_expiringSoon_l);
-					for(auto s=_expiringSoon.begin();s!=_expiringSoon.end();) {
-						const int64_t when = s->first;
-						if (when <= now) {
-							// The user may have re-authorized, so we must actually look it up and check.
-							network.clear();
-							member.clear();
-							if (_db.get(s->second.networkId, network, s->second.nodeId, member)) {
-								int64_t authenticationExpiryTime = (int64_t)OSUtils::jsonInt(member["authenticationExpiryTime"], 0);
-								if (authenticationExpiryTime <= now) {
-									expired.push_back(s->second);
-								}
-							}
-							_expiringSoon.erase(s++);
-						} else {
-							// Don't bother going further into the future than necessary.
-							break;
+void EmbeddedNetworkController::_ssoExpiryThread() { // Body of the dedicated SSO expiry thread: scans _expiringSoon and deauthorizes members whose authentication has expired.
+	while(_ssoExpiryRunning) { // NOTE(review): plain bool cleared by the destructor on another thread - confirm whether this needs std::atomic<bool>.
+		std::vector<_MemberStatusKey> expired;
+		nlohmann::json network, member;
+		int64_t now = OSUtils::now(); // Single timestamp used for every comparison in this pass.
+		{
+			std::lock_guard<std::mutex> l(_expiringSoon_l); // Held for the whole scan, including the _db.get() lookups below.
+			for(auto s=_expiringSoon.begin();s!=_expiringSoon.end();) {
+				Metrics::sso_expiration_checks++; // Counted for every entry examined, including the not-yet-due entry that ends the scan.
+				const int64_t when = s->first; // The entry's key is the time it comes due.
+				if (when <= now) {
+					// The user may have re-authorized, so we must actually look it up and check.
+					network.clear();
+					member.clear();
+					if (_db.get(s->second.networkId, network, s->second.nodeId, member)) {
+						int64_t authenticationExpiryTime = (int64_t)OSUtils::jsonInt(member["authenticationExpiryTime"], 0);
+						if (authenticationExpiryTime <= now) { // Still expired according to the member record just fetched.
+							expired.push_back(s->second);
 						}
 					}
-				}
-				for(auto e=expired.begin();e!=expired.end();++e) {
-					onNetworkMemberDeauthorize(nullptr, e->networkId, e->nodeId);
+					s = _expiringSoon.erase(s); // Due entries are removed whether or not the member was queued for deauthorization; continue from the iterator erase() returns.
+				} else {
+					// Don't bother going further into the future than necessary.
+					break;
 				}
 			}
-		});
+		}
+		for(auto e=expired.begin();e!=expired.end();++e) { // Runs after _expiringSoon_l has been released.
+			Metrics::sso_member_deauth++;
+			onNetworkMemberDeauthorize(nullptr, e->networkId, e->nodeId);
+		}
+		std::this_thread::sleep_for(std::chrono::milliseconds(500)); // Fixed pause between passes; the running flag is only re-checked after this sleep.
 	}
 }
 

+ 4 - 0
controller/EmbeddedNetworkController.hpp

@@ -81,6 +81,7 @@ public:
 private:
 	void _request(uint64_t nwid,const InetAddress &fromAddr,uint64_t requestPacketId,const Identity &identity,const Dictionary<ZT_NETWORKCONFIG_METADATA_DICT_CAPACITY> &metaData);
 	void _startThreads();
+	void _ssoExpiryThread();
 
 	std::string networkUpdateFromPostData(uint64_t networkID, const std::string &body);
 
@@ -138,6 +139,9 @@ private:
 	std::vector<std::thread> _threads;
 	std::mutex _threads_l;
 
+	bool _ssoExpiryRunning; // Loop condition of _ssoExpiryThread; set true in the constructor and false in the destructor. NOTE(review): read and written from different threads without synchronization - consider std::atomic<bool>.
+	std::thread _ssoExpiry; // Runs _ssoExpiryThread; started from the constructor's initializer list. NOTE(review): members are constructed in declaration order - confirm everything that thread touches is declared before this member.
+
 	std::unordered_map< _MemberStatusKey,_MemberStatus,_MemberStatusHash > _memberStatus;
 	std::mutex _memberStatus_l;
 

+ 9 - 0
node/Metrics.cpp

@@ -206,6 +206,15 @@ namespace ZeroTier {
         prometheus::simpleapi::counter_metric_t member_deauths
         {"controller_member_deauth_count", "number of network member deauths"};
 
+        prometheus::simpleapi::gauge_metric_t network_config_request_queue_size
+        { "controller_network_config_request_queue", "number of entries in the request queue for network configurations" }; // Assigned _queue.size() by each controller request worker before it waits on the queue.
+        
+        prometheus::simpleapi::counter_metric_t sso_expiration_checks
+        { "controller_sso_expiration_checks", "number of sso expiration checks done" }; // Incremented once per _expiringSoon entry examined by the controller's _ssoExpiryThread.
+
+        prometheus::simpleapi::counter_metric_t sso_member_deauth
+        { "controller_sso_timeouts", "number of sso timeouts" }; // Incremented for each member the controller's _ssoExpiryThread deauthorizes.
+
 #ifdef ZT_CONTROLLER_USE_LIBPQ
         // Central Controller Metrics
         prometheus::simpleapi::counter_metric_t pgsql_mem_notification

+ 6 - 0
node/Metrics.hpp

@@ -123,6 +123,10 @@ namespace ZeroTier {
         extern prometheus::simpleapi::counter_metric_t member_auths;
         extern prometheus::simpleapi::counter_metric_t member_deauths;
 
+        extern prometheus::simpleapi::gauge_metric_t network_config_request_queue_size;
+        extern prometheus::simpleapi::counter_metric_t sso_expiration_checks;
+        extern prometheus::simpleapi::counter_metric_t sso_member_deauth;
+
 #ifdef ZT_CONTROLLER_USE_LIBPQ
         // Central Controller Metrics
         extern prometheus::simpleapi::counter_metric_t pgsql_mem_notification;
@@ -132,6 +136,8 @@ namespace ZeroTier {
         extern prometheus::simpleapi::counter_metric_t redis_net_notification;
         extern prometheus::simpleapi::counter_metric_t redis_node_checkin;
 
+        
+
         // Central DB Pool Metrics
         extern prometheus::simpleapi::counter_metric_t conn_counter;
         extern prometheus::simpleapi::counter_metric_t max_pool_size;

+ 5 - 0
osdep/BlockingQueue.hpp

@@ -116,6 +116,11 @@ public:
 		return OK;
 	}
 
+	inline size_t size() const { // Current number of queued items, read while holding the queue mutex.
+		const std::lock_guard<std::mutex> guard(m);
+		return q.size();
+	}
+
 private:
 	std::queue<T> q;
 	mutable std::mutex m;