test_machine_models.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583
"""
Unit tests for the machine module models: Machine, NetworkInterface, Binary, and Process.

Tests cover:
1. Machine model creation and the current() method
2. NetworkInterface model and network detection
3. Binary model lifecycle and state machine
4. Process model lifecycle, hierarchy, and state machine
5. JSONL serialization/deserialization
6. Manager methods
7. Process tracking methods (replacing pid_utils)
"""
  12. import os
  13. import sys
  14. from pathlib import Path
  15. from datetime import timedelta
  16. from unittest.mock import patch
  17. import pytest
  18. from django.test import TestCase
  19. from django.utils import timezone
  20. from archivebox.machine.models import (
  21. Machine,
  22. NetworkInterface,
  23. Binary,
  24. Process,
  25. BinaryMachine,
  26. ProcessMachine,
  27. MACHINE_RECHECK_INTERVAL,
  28. PROCESS_RECHECK_INTERVAL,
  29. PID_REUSE_WINDOW,
  30. )
  31. class TestMachineModel(TestCase):
  32. """Test the Machine model."""
  33. def setUp(self):
  34. """Reset cached machine between tests."""
  35. import archivebox.machine.models as models
  36. models._CURRENT_MACHINE = None
  37. def test_machine_current_creates_machine(self):
  38. """Machine.current() should create a machine if none exists."""
  39. machine = Machine.current()
  40. self.assertIsNotNone(machine)
  41. self.assertIsNotNone(machine.id)
  42. self.assertIsNotNone(machine.guid)
  43. self.assertEqual(machine.hostname, os.uname().nodename)
  44. self.assertIn(machine.os_family, ['linux', 'darwin', 'windows', 'freebsd'])
  45. def test_machine_current_returns_cached(self):
  46. """Machine.current() should return cached machine within recheck interval."""
  47. machine1 = Machine.current()
  48. machine2 = Machine.current()
  49. self.assertEqual(machine1.id, machine2.id)
  50. def test_machine_current_refreshes_after_interval(self):
  51. """Machine.current() should refresh after recheck interval."""
  52. import archivebox.machine.models as models
  53. machine1 = Machine.current()
  54. # Manually expire the cache by modifying modified_at
  55. machine1.modified_at = timezone.now() - timedelta(seconds=MACHINE_RECHECK_INTERVAL + 1)
  56. machine1.save()
  57. models._CURRENT_MACHINE = machine1
  58. machine2 = Machine.current()
  59. # Should have fetched/updated the machine (same GUID)
  60. self.assertEqual(machine1.guid, machine2.guid)
  61. def test_machine_from_jsonl_update(self):
  62. """Machine.from_json() should update machine config."""
  63. Machine.current() # Ensure machine exists
  64. record = {
  65. 'config': {
  66. 'WGET_BINARY': '/usr/bin/wget',
  67. },
  68. }
  69. result = Machine.from_json(record)
  70. self.assertIsNotNone(result)
  71. self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget')
  72. def test_machine_from_jsonl_invalid(self):
  73. """Machine.from_json() should return None for invalid records."""
  74. result = Machine.from_json({'invalid': 'record'})
  75. self.assertIsNone(result)
  76. def test_machine_manager_current(self):
  77. """Machine.objects.current() should return current machine."""
  78. machine = Machine.objects.current()
  79. self.assertIsNotNone(machine)
  80. self.assertEqual(machine.id, Machine.current().id)
  81. class TestNetworkInterfaceModel(TestCase):
  82. """Test the NetworkInterface model."""
  83. def setUp(self):
  84. """Reset cached interface between tests."""
  85. import archivebox.machine.models as models
  86. models._CURRENT_MACHINE = None
  87. models._CURRENT_INTERFACE = None
  88. def test_networkinterface_current_creates_interface(self):
  89. """NetworkInterface.current() should create an interface if none exists."""
  90. interface = NetworkInterface.current()
  91. self.assertIsNotNone(interface)
  92. self.assertIsNotNone(interface.id)
  93. self.assertIsNotNone(interface.machine)
  94. self.assertIsNotNone(interface.ip_local)
  95. def test_networkinterface_current_returns_cached(self):
  96. """NetworkInterface.current() should return cached interface within recheck interval."""
  97. interface1 = NetworkInterface.current()
  98. interface2 = NetworkInterface.current()
  99. self.assertEqual(interface1.id, interface2.id)
  100. def test_networkinterface_manager_current(self):
  101. """NetworkInterface.objects.current() should return current interface."""
  102. interface = NetworkInterface.objects.current()
  103. self.assertIsNotNone(interface)
  104. class TestBinaryModel(TestCase):
  105. """Test the Binary model."""
  106. def setUp(self):
  107. """Reset cached binaries and create a machine."""
  108. import archivebox.machine.models as models
  109. models._CURRENT_MACHINE = None
  110. models._CURRENT_BINARIES = {}
  111. self.machine = Machine.current()
  112. def test_binary_creation(self):
  113. """Binary should be created with default values."""
  114. binary = Binary.objects.create(
  115. machine=self.machine,
  116. name='wget',
  117. binproviders='apt,brew,env',
  118. )
  119. self.assertIsNotNone(binary.id)
  120. self.assertEqual(binary.name, 'wget')
  121. self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
  122. self.assertFalse(binary.is_valid)
  123. def test_binary_is_valid(self):
  124. """Binary.is_valid should be True when abspath and version are set."""
  125. binary = Binary.objects.create(
  126. machine=self.machine,
  127. name='wget',
  128. abspath='/usr/bin/wget',
  129. version='1.21',
  130. )
  131. self.assertTrue(binary.is_valid)
  132. def test_binary_manager_get_valid_binary(self):
  133. """BinaryManager.get_valid_binary() should find valid binaries."""
  134. # Create invalid binary (no abspath)
  135. Binary.objects.create(machine=self.machine, name='wget')
  136. # Create valid binary
  137. Binary.objects.create(
  138. machine=self.machine,
  139. name='wget',
  140. abspath='/usr/bin/wget',
  141. version='1.21',
  142. )
  143. result = Binary.objects.get_valid_binary('wget')
  144. self.assertIsNotNone(result)
  145. self.assertEqual(result.abspath, '/usr/bin/wget')
  146. def test_binary_update_and_requeue(self):
  147. """Binary.update_and_requeue() should update fields and save."""
  148. binary = Binary.objects.create(machine=self.machine, name='test')
  149. old_modified = binary.modified_at
  150. binary.update_and_requeue(
  151. status=Binary.StatusChoices.QUEUED,
  152. retry_at=timezone.now() + timedelta(seconds=60),
  153. )
  154. binary.refresh_from_db()
  155. self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
  156. self.assertGreater(binary.modified_at, old_modified)
  157. class TestBinaryStateMachine(TestCase):
  158. """Test the BinaryMachine state machine."""
  159. def setUp(self):
  160. """Create a machine and binary for state machine tests."""
  161. import archivebox.machine.models as models
  162. models._CURRENT_MACHINE = None
  163. self.machine = Machine.current()
  164. self.binary = Binary.objects.create(
  165. machine=self.machine,
  166. name='test-binary',
  167. binproviders='env',
  168. )
  169. def test_binary_state_machine_initial_state(self):
  170. """BinaryMachine should start in queued state."""
  171. sm = BinaryMachine(self.binary)
  172. self.assertEqual(sm.current_state.value, Binary.StatusChoices.QUEUED)
  173. def test_binary_state_machine_can_start(self):
  174. """BinaryMachine.can_start() should check name and binproviders."""
  175. sm = BinaryMachine(self.binary)
  176. self.assertTrue(sm.can_install())
  177. self.binary.binproviders = ''
  178. self.binary.save()
  179. sm = BinaryMachine(self.binary)
  180. self.assertFalse(sm.can_install())
  181. class TestProcessModel(TestCase):
  182. """Test the Process model."""
  183. def setUp(self):
  184. """Create a machine for process tests."""
  185. import archivebox.machine.models as models
  186. models._CURRENT_MACHINE = None
  187. models._CURRENT_PROCESS = None
  188. self.machine = Machine.current()
  189. def test_process_creation(self):
  190. """Process should be created with default values."""
  191. process = Process.objects.create(
  192. machine=self.machine,
  193. cmd=['echo', 'hello'],
  194. pwd='/tmp',
  195. )
  196. self.assertIsNotNone(process.id)
  197. self.assertEqual(process.cmd, ['echo', 'hello'])
  198. self.assertEqual(process.status, Process.StatusChoices.QUEUED)
  199. self.assertIsNone(process.pid)
  200. self.assertIsNone(process.exit_code)
  201. def test_process_to_jsonl(self):
  202. """Process.to_json() should serialize correctly."""
  203. process = Process.objects.create(
  204. machine=self.machine,
  205. cmd=['echo', 'hello'],
  206. pwd='/tmp',
  207. timeout=60,
  208. )
  209. json_data = process.to_json()
  210. self.assertEqual(json_data['type'], 'Process')
  211. self.assertEqual(json_data['cmd'], ['echo', 'hello'])
  212. self.assertEqual(json_data['pwd'], '/tmp')
  213. self.assertEqual(json_data['timeout'], 60)
  214. def test_process_update_and_requeue(self):
  215. """Process.update_and_requeue() should update fields and save."""
  216. process = Process.objects.create(machine=self.machine, cmd=['test'])
  217. old_modified = process.modified_at
  218. process.update_and_requeue(
  219. status=Process.StatusChoices.RUNNING,
  220. pid=12345,
  221. started_at=timezone.now(),
  222. )
  223. process.refresh_from_db()
  224. self.assertEqual(process.status, Process.StatusChoices.RUNNING)
  225. self.assertEqual(process.pid, 12345)
  226. self.assertIsNotNone(process.started_at)
  227. class TestProcessCurrent(TestCase):
  228. """Test Process.current() method."""
  229. def setUp(self):
  230. """Reset caches."""
  231. import archivebox.machine.models as models
  232. models._CURRENT_MACHINE = None
  233. models._CURRENT_PROCESS = None
  234. def test_process_current_creates_record(self):
  235. """Process.current() should create a Process for current PID."""
  236. proc = Process.current()
  237. self.assertIsNotNone(proc)
  238. self.assertEqual(proc.pid, os.getpid())
  239. self.assertEqual(proc.status, Process.StatusChoices.RUNNING)
  240. self.assertIsNotNone(proc.machine)
  241. self.assertIsNotNone(proc.started_at)
  242. def test_process_current_caches(self):
  243. """Process.current() should cache the result."""
  244. proc1 = Process.current()
  245. proc2 = Process.current()
  246. self.assertEqual(proc1.id, proc2.id)
  247. def test_process_detect_type_orchestrator(self):
  248. """_detect_process_type should detect orchestrator."""
  249. with patch('sys.argv', ['archivebox', 'manage', 'orchestrator']):
  250. result = Process._detect_process_type()
  251. self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)
  252. def test_process_detect_type_cli(self):
  253. """_detect_process_type should detect CLI commands."""
  254. with patch('sys.argv', ['archivebox', 'add', 'http://example.com']):
  255. result = Process._detect_process_type()
  256. self.assertEqual(result, Process.TypeChoices.CLI)
  257. def test_process_detect_type_worker(self):
  258. """_detect_process_type should detect workers."""
  259. with patch('sys.argv', ['python', '-m', 'crawl_worker']):
  260. result = Process._detect_process_type()
  261. self.assertEqual(result, Process.TypeChoices.WORKER)
  262. class TestProcessHierarchy(TestCase):
  263. """Test Process parent/child relationships."""
  264. def setUp(self):
  265. """Create machine."""
  266. import archivebox.machine.models as models
  267. models._CURRENT_MACHINE = None
  268. self.machine = Machine.current()
  269. def test_process_parent_child(self):
  270. """Process should track parent/child relationships."""
  271. parent = Process.objects.create(
  272. machine=self.machine,
  273. process_type=Process.TypeChoices.CLI,
  274. status=Process.StatusChoices.RUNNING,
  275. pid=1,
  276. started_at=timezone.now(),
  277. )
  278. child = Process.objects.create(
  279. machine=self.machine,
  280. parent=parent,
  281. process_type=Process.TypeChoices.WORKER,
  282. status=Process.StatusChoices.RUNNING,
  283. pid=2,
  284. started_at=timezone.now(),
  285. )
  286. self.assertEqual(child.parent, parent)
  287. self.assertIn(child, parent.children.all())
  288. def test_process_root(self):
  289. """Process.root should return the root of the hierarchy."""
  290. root = Process.objects.create(
  291. machine=self.machine,
  292. process_type=Process.TypeChoices.CLI,
  293. status=Process.StatusChoices.RUNNING,
  294. started_at=timezone.now(),
  295. )
  296. child = Process.objects.create(
  297. machine=self.machine,
  298. parent=root,
  299. status=Process.StatusChoices.RUNNING,
  300. started_at=timezone.now(),
  301. )
  302. grandchild = Process.objects.create(
  303. machine=self.machine,
  304. parent=child,
  305. status=Process.StatusChoices.RUNNING,
  306. started_at=timezone.now(),
  307. )
  308. self.assertEqual(grandchild.root, root)
  309. self.assertEqual(child.root, root)
  310. self.assertEqual(root.root, root)
  311. def test_process_depth(self):
  312. """Process.depth should return depth in tree."""
  313. root = Process.objects.create(
  314. machine=self.machine,
  315. status=Process.StatusChoices.RUNNING,
  316. started_at=timezone.now(),
  317. )
  318. child = Process.objects.create(
  319. machine=self.machine,
  320. parent=root,
  321. status=Process.StatusChoices.RUNNING,
  322. started_at=timezone.now(),
  323. )
  324. self.assertEqual(root.depth, 0)
  325. self.assertEqual(child.depth, 1)
  326. class TestProcessLifecycle(TestCase):
  327. """Test Process lifecycle methods."""
  328. def setUp(self):
  329. """Create machine."""
  330. import archivebox.machine.models as models
  331. models._CURRENT_MACHINE = None
  332. self.machine = Machine.current()
  333. def test_process_is_running_current_pid(self):
  334. """is_running should be True for current PID."""
  335. import psutil
  336. from datetime import datetime
  337. proc_start = datetime.fromtimestamp(psutil.Process(os.getpid()).create_time(), tz=timezone.get_current_timezone())
  338. proc = Process.objects.create(
  339. machine=self.machine,
  340. status=Process.StatusChoices.RUNNING,
  341. pid=os.getpid(),
  342. started_at=proc_start,
  343. )
  344. self.assertTrue(proc.is_running)
  345. def test_process_is_running_fake_pid(self):
  346. """is_running should be False for non-existent PID."""
  347. proc = Process.objects.create(
  348. machine=self.machine,
  349. status=Process.StatusChoices.RUNNING,
  350. pid=999999,
  351. started_at=timezone.now(),
  352. )
  353. self.assertFalse(proc.is_running)
  354. def test_process_poll_detects_exit(self):
  355. """poll() should detect exited process."""
  356. proc = Process.objects.create(
  357. machine=self.machine,
  358. status=Process.StatusChoices.RUNNING,
  359. pid=999999,
  360. started_at=timezone.now(),
  361. )
  362. exit_code = proc.poll()
  363. self.assertIsNotNone(exit_code)
  364. proc.refresh_from_db()
  365. self.assertEqual(proc.status, Process.StatusChoices.EXITED)
  366. def test_process_poll_normalizes_negative_exit_code(self):
  367. """poll() should normalize -1 exit codes to 137."""
  368. proc = Process.objects.create(
  369. machine=self.machine,
  370. status=Process.StatusChoices.EXITED,
  371. pid=999999,
  372. exit_code=-1,
  373. started_at=timezone.now(),
  374. )
  375. exit_code = proc.poll()
  376. self.assertEqual(exit_code, 137)
  377. proc.refresh_from_db()
  378. self.assertEqual(proc.exit_code, 137)
  379. def test_process_terminate_dead_process(self):
  380. """terminate() should handle already-dead process."""
  381. proc = Process.objects.create(
  382. machine=self.machine,
  383. status=Process.StatusChoices.RUNNING,
  384. pid=999999,
  385. started_at=timezone.now(),
  386. )
  387. result = proc.terminate()
  388. self.assertFalse(result)
  389. proc.refresh_from_db()
  390. self.assertEqual(proc.status, Process.StatusChoices.EXITED)
  391. class TestProcessClassMethods(TestCase):
  392. """Test Process class methods for querying."""
  393. def setUp(self):
  394. """Create machine."""
  395. import archivebox.machine.models as models
  396. models._CURRENT_MACHINE = None
  397. self.machine = Machine.current()
  398. def test_get_running(self):
  399. """get_running should return running processes."""
  400. proc = Process.objects.create(
  401. machine=self.machine,
  402. process_type=Process.TypeChoices.HOOK,
  403. status=Process.StatusChoices.RUNNING,
  404. pid=99999,
  405. started_at=timezone.now(),
  406. )
  407. running = Process.get_running(process_type=Process.TypeChoices.HOOK)
  408. self.assertIn(proc, running)
  409. def test_get_running_count(self):
  410. """get_running_count should count running processes."""
  411. for i in range(3):
  412. Process.objects.create(
  413. machine=self.machine,
  414. process_type=Process.TypeChoices.HOOK,
  415. status=Process.StatusChoices.RUNNING,
  416. pid=99900 + i,
  417. started_at=timezone.now(),
  418. )
  419. count = Process.get_running_count(process_type=Process.TypeChoices.HOOK)
  420. self.assertGreaterEqual(count, 3)
  421. def test_cleanup_stale_running(self):
  422. """cleanup_stale_running should mark stale processes as exited."""
  423. stale = Process.objects.create(
  424. machine=self.machine,
  425. status=Process.StatusChoices.RUNNING,
  426. pid=999999,
  427. started_at=timezone.now() - PID_REUSE_WINDOW - timedelta(hours=1),
  428. )
  429. cleaned = Process.cleanup_stale_running()
  430. self.assertGreaterEqual(cleaned, 1)
  431. stale.refresh_from_db()
  432. self.assertEqual(stale.status, Process.StatusChoices.EXITED)
  433. class TestProcessStateMachine(TestCase):
  434. """Test the ProcessMachine state machine."""
  435. def setUp(self):
  436. """Create a machine and process for state machine tests."""
  437. import archivebox.machine.models as models
  438. models._CURRENT_MACHINE = None
  439. self.machine = Machine.current()
  440. self.process = Process.objects.create(
  441. machine=self.machine,
  442. cmd=['echo', 'test'],
  443. pwd='/tmp',
  444. )
  445. def test_process_state_machine_initial_state(self):
  446. """ProcessMachine should start in queued state."""
  447. sm = ProcessMachine(self.process)
  448. self.assertEqual(sm.current_state.value, Process.StatusChoices.QUEUED)
  449. def test_process_state_machine_can_start(self):
  450. """ProcessMachine.can_start() should check cmd and machine."""
  451. sm = ProcessMachine(self.process)
  452. self.assertTrue(sm.can_start())
  453. self.process.cmd = []
  454. self.process.save()
  455. sm = ProcessMachine(self.process)
  456. self.assertFalse(sm.can_start())
  457. def test_process_state_machine_is_exited(self):
  458. """ProcessMachine.is_exited() should check exit_code."""
  459. sm = ProcessMachine(self.process)
  460. self.assertFalse(sm.is_exited())
  461. self.process.exit_code = 0
  462. self.process.save()
  463. sm = ProcessMachine(self.process)
  464. self.assertTrue(sm.is_exited())
  465. if __name__ == '__main__':
  466. pytest.main([__file__, '-v'])