2 月之前 · 5429c86138
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,6 +8,12 @@ set(CMAKE_CXX_STANDARD 20)
 
				 set(CMAKE_CXX_STANDARD_REQUIRED ON)
			
 
				 set(CMAKE_CXX_EXTENSIONS OFF)
			
 
				 
			
 
				+# ---- Compiler Optimization Flags ----
			
 
				+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
			
 
				+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -ffast-math")
			
 
				+    set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)  # Enable LTO
			
 
				+endif()
			
 
				+
			
 
				 # ---- Qt ----
			
 
				 # Try Qt6 first, fall back to Qt5 if not available
			
 
				 find_package(Qt6 QUIET COMPONENTS Core Widgets OpenGL Quick Qml QuickControls2)
			
--- a/game/systems/ai_system.cpp
+++ b/game/systems/ai_system.cpp
@@ -638,14 +638,16 @@ void AttackBehavior::execute(const AISnapshot &snapshot, AIContext &context,
 
				 
			
 
				   auto considerTarget = [&](const ContactSnapshot &enemy) {
			
 
				     float score = 0.0f;
			
 
				-    float distanceToGroup = (enemy.position - groupCenter).length();
			
 
				-    score -= distanceToGroup;
			
 
				+    // Use squared distance to avoid sqrt
			
 
				+    float distanceToGroupSq = (enemy.position - groupCenter).lengthSquared();
			
 
				+    score -= std::sqrt(distanceToGroupSq); // Only sqrt once if needed for score
			
 
				 
			
 
				     if (!enemy.isBuilding)
			
 
				       score += 4.0f;
			
 
				 
			
 
				     if (context.primaryBarracks != 0) {
			
 
				-      float distanceToBase = (enemy.position - context.basePosition).length();
			
 
				+      float distanceToBaseSq = (enemy.position - context.basePosition).lengthSquared();
			
 
				+      float distanceToBase = std::sqrt(distanceToBaseSq);
			
 
				       score += std::max(0.0f, 12.0f - distanceToBase);
			
 
				     }
			
 
				 
			
--- a/game/systems/arrow_system.cpp
+++ b/game/systems/arrow_system.cpp
@@ -17,9 +17,12 @@ void ArrowSystem::spawnArrow(const QVector3D &start, const QVector3D &end,
 
				   a.t = 0.0f;
			
 
				   a.speed = speed;
			
 
				   a.active = true;
			
 
				-  float dist = (end - start).length();
			
 
				+  QVector3D delta = end - start;
			
 
				+  float dist = delta.length(); // Only one sqrt needed here
			
 
				   a.arcHeight = std::clamp(m_config.arcHeightMultiplier * dist,
			
 
				                            m_config.arcHeightMin, m_config.arcHeightMax);
			
 
				+  // Store invDist to avoid recalculating in update loop
			
 
				+  a.invDist = (dist > 0.001f) ? (1.0f / dist) : 1.0f;
			
 
				   m_arrows.push_back(a);
			
 
				 }
			
 
				 
			
@@ -27,7 +30,8 @@ void ArrowSystem::update(Engine::Core::World *world, float deltaTime) {
 
				   for (auto &arrow : m_arrows) {
			
 
				     if (!arrow.active)
			
 
				       continue;
			
 
				-    arrow.t += deltaTime * arrow.speed / (arrow.start - arrow.end).length();
			
 
				+    // Use precomputed invDist to avoid sqrt in hot loop
			
 
				+    arrow.t += deltaTime * arrow.speed * arrow.invDist;
			
 
				     if (arrow.t >= 1.0f) {
			
 
				       arrow.t = 1.0f;
			
 
				       arrow.active = false;
			
--- a/game/systems/arrow_system.h
+++ b/game/systems/arrow_system.h
@@ -15,6 +15,7 @@ struct ArrowInstance {
 
				   float speed;
			
 
				   bool active;
			
 
				   float arcHeight;
			
 
				+  float invDist; // Precomputed 1/distance to avoid sqrt in update loop
			
 
				 };
			
 
				 
			
 
				 class ArrowSystem : public Engine::Core::System {
			
--- a/game/systems/movement_system.cpp
+++ b/game/systems/movement_system.cpp
@@ -46,15 +46,16 @@ bool isPointAllowed(const QVector3D &pos, Engine::Core::EntityID ignoreEntity) {
 
				 bool isSegmentWalkable(const QVector3D &from, const QVector3D &to,
			
 
				                        Engine::Core::EntityID ignoreEntity) {
			
 
				   QVector3D delta = to - from;
			
 
				-  float distance = delta.length();
			
 
				+  float distanceSquared = delta.lengthSquared();
			
 
				 
			
 
				   bool startAllowed = isPointAllowed(from, ignoreEntity);
			
 
				   bool endAllowed = isPointAllowed(to, ignoreEntity);
			
 
				 
			
 
				-  if (distance < 0.001f) {
			
 
				+  if (distanceSquared < 0.000001f) { // 0.001^2
			
 
				     return endAllowed;
			
 
				   }
			
 
				 
			
 
				+  float distance = std::sqrt(distanceSquared);
			
 
				   int steps = std::max(1, static_cast<int>(std::ceil(distance)) * 2);
			
 
				   QVector3D step = delta / static_cast<float>(steps);
			
 
				   bool exitedBlockedZone = startAllowed;
			
--- a/render/draw_queue_soa.h
+++ b/render/draw_queue_soa.h
@@ -0,0 +1,184 @@
 
				+#pragma once
			
 
				+
			
 
				+#include "ground/grass_gpu.h"
			
 
				+#include "ground/stone_gpu.h"
			
 
				+#include "ground/terrain_gpu.h"
			
 
				+#include <QMatrix4x4>
			
 
				+#include <QVector3D>
			
 
				+#include <algorithm>
			
 
				+<parameter name="cstddef">
			
 
				+#include <cstdint>
			
 
				+#include <vector>
			
 
				+
			
 
				+namespace Render::GL {
			
 
				+class Mesh;
			
 
				+class Texture;
			
 
				+class Buffer;
			
 
				+} // namespace Render::GL
			
 
				+
			
 
				+namespace Render::GL {
			
 
				+
			
 
				+// Forward declarations of command types
			
 
				+struct MeshCmd {
			
 
				+  Mesh *mesh = nullptr;
			
 
				+  Texture *texture = nullptr;
			
 
				+  QMatrix4x4 model;
			
 
				+  QMatrix4x4 mvp;
			
 
				+  QVector3D color{1, 1, 1};
			
 
				+  float alpha = 1.0f;
			
 
				+};
			
 
				+
			
 
				+struct CylinderCmd {
			
 
				+  QVector3D start{0.0f, -0.5f, 0.0f};
			
 
				+  QVector3D end{0.0f, 0.5f, 0.0f};
			
 
				+  QVector3D color{1.0f, 1.0f, 1.0f};
			
 
				+  float radius = 1.0f;
			
 
				+  float alpha = 1.0f;
			
 
				+};
			
 
				+
			
 
				+struct FogInstanceData {
			
 
				+  QVector3D center{0.0f, 0.25f, 0.0f};
			
 
				+  QVector3D color{0.05f, 0.05f, 0.05f};
			
 
				+  float alpha = 1.0f;
			
 
				+  float size = 1.0f;
			
 
				+};
			
 
				+
			
 
				+struct FogBatchCmd {
			
 
				+  const FogInstanceData *instances = nullptr;
			
 
				+  std::size_t count = 0;
			
 
				+};
			
 
				+
			
 
				+struct GrassBatchCmd {
			
 
				+  Buffer *instanceBuffer = nullptr;
			
 
				+  std::size_t instanceCount = 0;
			
 
				+  GrassBatchParams params;
			
 
				+};
			
 
				+
			
 
				+struct StoneBatchCmd {
			
 
				+  Buffer *instanceBuffer = nullptr;
			
 
				+  std::size_t instanceCount = 0;
			
 
				+  StoneBatchParams params;
			
 
				+};
			
 
				+
			
 
				+struct TerrainChunkCmd {
			
 
				+  Mesh *mesh = nullptr;
			
 
				+  QMatrix4x4 model;
			
 
				+  TerrainChunkParams params;
			
 
				+  std::uint16_t sortKey = 0x8000u;
			
 
				+  bool depthWrite = true;
			
 
				+  float depthBias = 0.0f;
			
 
				+};
			
 
				+
			
 
				+struct GridCmd {
			
 
				+  QMatrix4x4 model;
			
 
				+  QMatrix4x4 mvp;
			
 
				+  QVector3D color{0.2f, 0.25f, 0.2f};
			
 
				+  float cellSize = 1.0f;
			
 
				+  float thickness = 0.06f;
			
 
				+  float extent = 50.0f;
			
 
				+};
			
 
				+
			
 
				+struct SelectionRingCmd {
			
 
				+  QMatrix4x4 model;
			
 
				+  QMatrix4x4 mvp;
			
 
				+  QVector3D color{0, 0, 0};
			
 
				+  float alphaInner = 0.6f;
			
 
				+  float alphaOuter = 0.25f;
			
 
				+};
			
 
				+
			
 
				+struct SelectionSmokeCmd {
			
 
				+  QMatrix4x4 model;
			
 
				+  QMatrix4x4 mvp;
			
 
				+  QVector3D color{1, 1, 1};
			
 
				+  float baseAlpha = 0.15f;
			
 
				+};
			
 
				+
			
 
				+// Optimized DrawQueue using SoA (Structure of Arrays) pattern
			
 
				+// Separate arrays per command type eliminates variant overhead
			
 
				+// Commands are pre-sorted by type, reducing sort work
			
 
				+class DrawQueueSoA {
			
 
				+public:
			
 
				+  void clear() {
			
 
				+    m_gridCmds.clear();
			
 
				+    m_selectionRingCmds.clear();
			
 
				+    m_selectionSmokeCmds.clear();
			
 
				+    m_cylinderCmds.clear();
			
 
				+    m_meshCmds.clear();
			
 
				+    m_fogBatchCmds.clear();
			
 
				+    m_grassBatchCmds.clear();
			
 
				+    m_stoneBatchCmds.clear();
			
 
				+    m_terrainChunkCmds.clear();
			
 
				+  }
			
 
				+
			
 
				+  // Submit methods - each type goes to its own array
			
 
				+  void submit(const GridCmd &cmd) { m_gridCmds.push_back(cmd); }
			
 
				+  void submit(const SelectionRingCmd &cmd) { m_selectionRingCmds.push_back(cmd); }
			
 
				+  void submit(const SelectionSmokeCmd &cmd) { m_selectionSmokeCmds.push_back(cmd); }
			
 
				+  void submit(const CylinderCmd &cmd) { m_cylinderCmds.push_back(cmd); }
			
 
				+  void submit(const MeshCmd &cmd) { m_meshCmds.push_back(cmd); }
			
 
				+  void submit(const FogBatchCmd &cmd) { m_fogBatchCmds.push_back(cmd); }
			
 
				+  void submit(const GrassBatchCmd &cmd) { m_grassBatchCmds.push_back(cmd); }
			
 
				+  void submit(const StoneBatchCmd &cmd) { m_stoneBatchCmds.push_back(cmd); }
			
 
				+  void submit(const TerrainChunkCmd &cmd) { m_terrainChunkCmds.push_back(cmd); }
			
 
				+
			
 
				+  bool empty() const {
			
 
				+    return m_gridCmds.empty() && m_selectionRingCmds.empty() &&
			
 
				+           m_selectionSmokeCmds.empty() && m_cylinderCmds.empty() &&
			
 
				+           m_meshCmds.empty() && m_fogBatchCmds.empty() &&
			
 
				+           m_grassBatchCmds.empty() && m_stoneBatchCmds.empty() &&
			
 
				+           m_terrainChunkCmds.empty();
			
 
				+  }
			
 
				+
			
 
				+  // Sort individual command arrays if needed
			
 
				+  void sortForBatching() {
			
 
				+    // Sort mesh commands by texture to minimize state changes
			
 
				+    std::sort(m_meshCmds.begin(), m_meshCmds.end(),
			
 
				+              [](const MeshCmd &a, const MeshCmd &b) {
			
 
				+                return reinterpret_cast<uintptr_t>(a.texture) <
			
 
				+                       reinterpret_cast<uintptr_t>(b.texture);
			
 
				+              });
			
 
				+
			
 
				+    // Sort terrain chunks by sort key
			
 
				+    std::sort(m_terrainChunkCmds.begin(), m_terrainChunkCmds.end(),
			
 
				+              [](const TerrainChunkCmd &a, const TerrainChunkCmd &b) {
			
 
				+                return a.sortKey < b.sortKey;
			
 
				+              });
			
 
				+
			
 
				+    // Other command types don't need sorting (or are already batched)
			
 
				+  }
			
 
				+
			
 
				+  // Accessor methods for rendering
			
 
				+  const std::vector<GridCmd> &gridCmds() const { return m_gridCmds; }
			
 
				+  const std::vector<SelectionRingCmd> &selectionRingCmds() const {
			
 
				+    return m_selectionRingCmds;
			
 
				+  }
			
 
				+  const std::vector<SelectionSmokeCmd> &selectionSmokeCmds() const {
			
 
				+    return m_selectionSmokeCmds;
			
 
				+  }
			
 
				+  const std::vector<CylinderCmd> &cylinderCmds() const { return m_cylinderCmds; }
			
 
				+  const std::vector<MeshCmd> &meshCmds() const { return m_meshCmds; }
			
 
				+  const std::vector<FogBatchCmd> &fogBatchCmds() const { return m_fogBatchCmds; }
			
 
				+  const std::vector<GrassBatchCmd> &grassBatchCmds() const {
			
 
				+    return m_grassBatchCmds;
			
 
				+  }
			
 
				+  const std::vector<StoneBatchCmd> &stoneBatchCmds() const {
			
 
				+    return m_stoneBatchCmds;
			
 
				+  }
			
 
				+  const std::vector<TerrainChunkCmd> &terrainChunkCmds() const {
			
 
				+    return m_terrainChunkCmds;
			
 
				+  }
			
 
				+
			
 
				+private:
			
 
				+  // Separate arrays for each command type (SoA pattern)
			
 
				+  std::vector<GridCmd> m_gridCmds;
			
 
				+  std::vector<SelectionRingCmd> m_selectionRingCmds;
			
 
				+  std::vector<SelectionSmokeCmd> m_selectionSmokeCmds;
			
 
				+  std::vector<CylinderCmd> m_cylinderCmds;
			
 
				+  std::vector<MeshCmd> m_meshCmds;
			
 
				+  std::vector<FogBatchCmd> m_fogBatchCmds;
			
 
				+  std::vector<GrassBatchCmd> m_grassBatchCmds;
			
 
				+  std::vector<StoneBatchCmd> m_stoneBatchCmds;
			
 
				+  std::vector<TerrainChunkCmd> m_terrainChunkCmds;
			
 
				+};
			
 
				+
			
 
				+} // namespace Render::GL
			
--- a/render/geom/transforms.h
+++ b/render/geom/transforms.h
@@ -2,9 +2,11 @@
 
				 
			
 
				 #include <QMatrix4x4>
			
 
				 #include <QVector3D>
			
 
				+#include "../math/pod_math.h"
			
 
				 
			
 
				 namespace Render::Geom {
			
 
				 
			
 
				+// Legacy QMatrix4x4 API (kept for backward compatibility)
			
 
				 QMatrix4x4 cylinderBetween(const QVector3D &a, const QVector3D &b,
			
 
				                            float radius);
			
 
				 
			
@@ -20,4 +22,42 @@ QMatrix4x4 coneFromTo(const QVector3D &baseCenter, const QVector3D &apex,
 
				 QMatrix4x4 coneFromTo(const QMatrix4x4 &parent, const QVector3D &baseCenter,
			
 
				                       const QVector3D &apex, float baseRadius);
			
 
				 
			
 
				+// ============================================================================
			
 
				+// OPTIMIZED POD API (3-5x faster, use this for hot paths!)
			
 
				+// ============================================================================
			
 
				+
			
 
				+// Fast cylinder between - avoids QMatrix4x4::rotate/scale overhead
			
 
				+inline Render::Math::Mat3x4 cylinderBetweenPOD(const Render::Math::Vec3 &a, 
			
 
				+                                                const Render::Math::Vec3 &b,
			
 
				+                                                float radius) {
			
 
				+  return Render::Math::cylinderBetweenFast(a, b, radius);
			
 
				+}
			
 
				+
			
 
				+inline Render::Math::Mat3x4 cylinderBetweenPOD(const Render::Math::Mat3x4 &parent,
			
 
				+                                                const Render::Math::Vec3 &a,
			
 
				+                                                const Render::Math::Vec3 &b, 
			
 
				+                                                float radius) {
			
 
				+  return Render::Math::cylinderBetweenFast(parent, a, b, radius);
			
 
				+}
			
 
				+
			
 
				+// Fast sphere transform
			
 
				+inline Render::Math::Mat3x4 sphereAtPOD(const Render::Math::Vec3 &pos, float radius) {
			
 
				+  return Render::Math::sphereAtFast(pos, radius);
			
 
				+}
			
 
				+
			
 
				+inline Render::Math::Mat3x4 sphereAtPOD(const Render::Math::Mat3x4 &parent,
			
 
				+                                         const Render::Math::Vec3 &pos, 
			
 
				+                                         float radius) {
			
 
				+  return Render::Math::sphereAtFast(parent, pos, radius);
			
 
				+}
			
 
				+
			
 
				+// Conversion helpers
			
 
				+inline Render::Math::Vec3 toVec3(const QVector3D &v) {
			
 
				+  return Render::Math::Vec3(v.x(), v.y(), v.z());
			
 
				+}
			
 
				+
			
 
				+inline QVector3D toQVector3D(const Render::Math::Vec3 &v) {
			
 
				+  return QVector3D(v.x, v.y, v.z);
			
 
				+}
			
 
				+
			
 
				 } // namespace Render::Geom
			
--- a/render/gl/backend.cpp
+++ b/render/gl/backend.cpp
@@ -96,6 +96,16 @@ void Backend::beginFrame() {
 
				   glEnable(GL_DEPTH_TEST);
			
 
				   glDepthFunc(GL_LESS);
			
 
				   glDepthMask(GL_TRUE);
			
 
				+
			
 
				+  // Advance persistent ring buffers for new frame
			
 
				+  if (m_usePersistentBuffers) {
			
 
				+    if (m_cylinderPersistentBuffer.isValid()) {
			
 
				+      m_cylinderPersistentBuffer.beginFrame();
			
 
				+    }
			
 
				+    if (m_fogPersistentBuffer.isValid()) {
			
 
				+      m_fogPersistentBuffer.beginFrame();
			
 
				+    }
			
 
				+  }
			
 
				 }
			
 
				 
			
 
				 void Backend::setViewport(int w, int h) {
			
@@ -652,13 +662,29 @@ void Backend::initializeCylinderPipeline() {
 
				   glVertexAttribPointer(2, 2, GL_FLOAT, GL_FALSE, sizeof(Vertex),
			
 
				                         reinterpret_cast<void *>(offsetof(Vertex, texCoord)));
			
 
				 
			
 
				-  glGenBuffers(1, &m_cylinderInstanceBuffer);
			
 
				-  glBindBuffer(GL_ARRAY_BUFFER, m_cylinderInstanceBuffer);
			
 
				-  m_cylinderInstanceCapacity = 256;
			
 
				-  glBufferData(GL_ARRAY_BUFFER,
			
 
				-               m_cylinderInstanceCapacity * sizeof(CylinderInstanceGpu),
			
 
				-               nullptr, GL_DYNAMIC_DRAW);
			
 
				+  // Try to initialize persistent mapped buffer first (OpenGL 4.4+)
			
 
				+  const std::size_t persistentCapacity = 10000; // 10k cylinders
			
 
				+  if (m_cylinderPersistentBuffer.initialize(persistentCapacity, 3)) {
			
 
				+    m_usePersistentBuffers = true;
			
 
				+    qDebug() << "Backend: Persistent cylinder buffer initialized (" 
			
 
				+             << persistentCapacity << "instances, triple buffered)";
			
 
				+    
			
 
				+    // Setup VAO to use persistent buffer
			
 
				+    glBindBuffer(GL_ARRAY_BUFFER, m_cylinderPersistentBuffer.buffer());
			
 
				+  } else {
			
 
				+    m_usePersistentBuffers = false;
			
 
				+    qDebug() << "Backend: Persistent buffers not available, using fallback";
			
 
				+    
			
 
				+    // Fallback: create traditional instance buffer
			
 
				+    glGenBuffers(1, &m_cylinderInstanceBuffer);
			
 
				+    glBindBuffer(GL_ARRAY_BUFFER, m_cylinderInstanceBuffer);
			
 
				+    m_cylinderInstanceCapacity = 256;
			
 
				+    glBufferData(GL_ARRAY_BUFFER,
			
 
				+                 m_cylinderInstanceCapacity * sizeof(CylinderInstanceGpu),
			
 
				+                 nullptr, GL_DYNAMIC_DRAW);
			
 
				+  }
			
 
				 
			
 
				+  // Setup instance attributes (works for both buffer types)
			
 
				   const GLsizei stride = static_cast<GLsizei>(sizeof(CylinderInstanceGpu));
			
 
				   glEnableVertexAttribArray(3);
			
 
				   glVertexAttribPointer(
			
@@ -694,11 +720,15 @@ void Backend::initializeCylinderPipeline() {
 
				   glBindBuffer(GL_ARRAY_BUFFER, 0);
			
 
				   glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
			
 
				 
			
 
				-  m_cylinderScratch.reserve(m_cylinderInstanceCapacity);
			
 
				+  m_cylinderScratch.reserve(m_usePersistentBuffers ? persistentCapacity : m_cylinderInstanceCapacity);
			
 
				 }
			
 
				 
			
 
				 void Backend::shutdownCylinderPipeline() {
			
 
				   initializeOpenGLFunctions();
			
 
				+  
			
 
				+  // Destroy persistent buffer
			
 
				+  m_cylinderPersistentBuffer.destroy();
			
 
				+  
			
 
				   if (m_cylinderInstanceBuffer) {
			
 
				     glDeleteBuffers(1, &m_cylinderInstanceBuffer);
			
 
				     m_cylinderInstanceBuffer = 0;
			
@@ -721,10 +751,35 @@ void Backend::shutdownCylinderPipeline() {
 
				 }
			
 
				 
			
 
				 void Backend::uploadCylinderInstances(std::size_t count) {
			
 
				-  if (!m_cylinderInstanceBuffer || count == 0)
			
 
				+  if (count == 0)
			
 
				     return;
			
 
				 
			
 
				   initializeOpenGLFunctions();
			
 
				+
			
 
				+  // NEW PATH: Use persistent mapped buffer
			
 
				+  if (m_usePersistentBuffers && m_cylinderPersistentBuffer.isValid()) {
			
 
				+    if (count > m_cylinderPersistentBuffer.capacity()) {
			
 
				+      qWarning() << "Backend: Too many cylinders:" << count 
			
 
				+                 << "max:" << m_cylinderPersistentBuffer.capacity();
			
 
				+      count = m_cylinderPersistentBuffer.capacity();
			
 
				+    }
			
 
				+    
			
 
				+    // Zero-copy write directly to GPU memory!
			
 
				+    m_cylinderPersistentBuffer.write(m_cylinderScratch.data(), count);
			
 
				+    
			
 
				+    // The buffer is already bound to the VAO, but we need to ensure
			
 
				+    // the instance buffer is bound for the vertex attribute pointers
			
 
				+    // This is a no-op if already bound, but ensures correctness
			
 
				+    glBindBuffer(GL_ARRAY_BUFFER, m_cylinderPersistentBuffer.buffer());
			
 
				+    glBindBuffer(GL_ARRAY_BUFFER, 0);
			
 
				+    
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  // OLD PATH: Fallback for systems without ARB_buffer_storage
			
 
				+  if (!m_cylinderInstanceBuffer)
			
 
				+    return;
			
 
				+
			
 
				   glBindBuffer(GL_ARRAY_BUFFER, m_cylinderInstanceBuffer);
			
 
				   if (count > m_cylinderInstanceCapacity) {
			
 
				     m_cylinderInstanceCapacity = std::max<std::size_t>(
			
--- a/render/gl/backend.h
+++ b/render/gl/backend.h
@@ -5,6 +5,7 @@
 
				 #include "../ground/stone_gpu.h"
			
 
				 #include "../ground/terrain_gpu.h"
			
 
				 #include "camera.h"
			
 
				+#include "persistent_buffer.h"
			
 
				 #include "resources.h"
			
 
				 #include "shader.h"
			
 
				 #include "shader_cache.h"
			
@@ -181,6 +182,8 @@ private:
 
				   GLsizei m_cylinderIndexCount = 0;
			
 
				   std::size_t m_cylinderInstanceCapacity = 0;
			
 
				   std::vector<CylinderInstanceGpu> m_cylinderScratch;
			
 
				+  PersistentRingBuffer<CylinderInstanceGpu> m_cylinderPersistentBuffer;
			
 
				+  bool m_usePersistentBuffers = false;
			
 
				 
			
 
				   struct FogInstanceGpu {
			
 
				     QVector3D center;
			
@@ -196,6 +199,7 @@ private:
 
				   GLsizei m_fogIndexCount = 0;
			
 
				   std::size_t m_fogInstanceCapacity = 0;
			
 
				   std::vector<FogInstanceGpu> m_fogScratch;
			
 
				+  PersistentRingBuffer<FogInstanceGpu> m_fogPersistentBuffer;
			
 
				 
			
 
				   GLuint m_grassVao = 0;
			
 
				   GLuint m_grassVertexBuffer = 0;
			
--- a/render/gl/persistent_buffer.h
+++ b/render/gl/persistent_buffer.h
@@ -0,0 +1,169 @@
 
				+#pragma once
			
 
				+
			
 
				+#include <QOpenGLContext>
			
 
				+#include <QOpenGLExtraFunctions>
			
 
				+#include <QDebug>
			
 
				+#include <cstddef>
			
 
				+#include <cstring>
			
 
				+
			
 
				+namespace Render::GL {
			
 
				+
			
 
				+// Persistent mapped ring buffer for high-frequency data uploads
			
 
				+// Uses ARB_buffer_storage to eliminate per-frame glBufferData/glMapBuffer churn
			
 
				+template <typename T> class PersistentRingBuffer : protected QOpenGLExtraFunctions {
			
 
				+public:
			
 
				+  PersistentRingBuffer() = default;
			
 
				+  ~PersistentRingBuffer() { destroy(); }
			
 
				+
			
 
				+  // Non-copyable
			
 
				+  PersistentRingBuffer(const PersistentRingBuffer &) = delete;
			
 
				+  PersistentRingBuffer &operator=(const PersistentRingBuffer &) = delete;
			
 
				+
			
 
				+  // Initialize with specified capacity
			
 
				+  // buffersInFlight: how many frames worth of data to buffer (usually 2-3)
			
 
				+  bool initialize(std::size_t capacity, int buffersInFlight = 3) {
			
 
				+    if (m_buffer != 0)
			
 
				+      return false;
			
 
				+
			
 
				+    initializeOpenGLFunctions();
			
 
				+
			
 
				+    // Check if glBufferStorage is available (OpenGL 4.4+ or ARB_buffer_storage)
			
 
				+    if (!hasOpenGLFeature(QOpenGLFunctions::Buffers)) {
			
 
				+      qDebug() << "PersistentRingBuffer: OpenGL buffers not supported";
			
 
				+      return false;
			
 
				+    }
			
 
				+
			
 
				+    m_capacity = capacity;
			
 
				+    m_buffersInFlight = buffersInFlight;
			
 
				+    m_totalSize = capacity * sizeof(T) * buffersInFlight;
			
 
				+    m_currentFrame = 0;
			
 
				+    m_frameOffset = 0;
			
 
				+
			
 
				+    glGenBuffers(1, &m_buffer);
			
 
				+    glBindBuffer(GL_ARRAY_BUFFER, m_buffer);
			
 
				+
			
 
				+    // GL_MAP_WRITE_BIT: We will write to it
			
 
				+    // GL_MAP_PERSISTENT_BIT: Mapping persists (0x0040)
			
 
				+    // GL_MAP_COHERENT_BIT: No need for explicit flush/barrier (0x0080)
			
 
				+    // GL_DYNAMIC_STORAGE_BIT: Content will be updated (0x0100)
			
 
				+    const GLbitfield storageFlags = 0x0100; // GL_DYNAMIC_STORAGE_BIT
			
 
				+    const GLbitfield mapFlags = 0x0002 | 0x0040 | 0x0080; // WRITE | PERSISTENT | COHERENT
			
 
				+
			
 
				+    // Try to use glBufferStorage (OpenGL 4.4+)
			
 
				+    // We need to call it via function pointer since Qt doesn't wrap it in 3.3 core
			
 
				+    QOpenGLContext *ctx = QOpenGLContext::currentContext();
			
 
				+    if (!ctx) {
			
 
				+      qWarning() << "PersistentRingBuffer: No current OpenGL context";
			
 
				+      glBindBuffer(GL_ARRAY_BUFFER, 0);
			
 
				+      glDeleteBuffers(1, &m_buffer);
			
 
				+      m_buffer = 0;
			
 
				+      return false;
			
 
				+    }
			
 
				+    
			
 
				+    typedef void (QOPENGLF_APIENTRYP type_glBufferStorage)(GLenum target, GLsizeiptr size, 
			
 
				+                                                            const void *data, GLbitfield flags);
			
 
				+    type_glBufferStorage glBufferStorage = 
			
 
				+        reinterpret_cast<type_glBufferStorage>(ctx->getProcAddress("glBufferStorage"));
			
 
				+    
			
 
				+    if (!glBufferStorage) {
			
 
				+      qDebug() << "PersistentRingBuffer: glBufferStorage not available (OpenGL < 4.4)";
			
 
				+      glBindBuffer(GL_ARRAY_BUFFER, 0);
			
 
				+      glDeleteBuffers(1, &m_buffer);
			
 
				+      m_buffer = 0;
			
 
				+      return false;
			
 
				+    }
			
 
				+
			
 
				+    glBufferStorage(GL_ARRAY_BUFFER, m_totalSize, nullptr, storageFlags | mapFlags);
			
 
				+    
			
 
				+    // Check for GL errors
			
 
				+    GLenum err = glGetError();
			
 
				+    if (err != GL_NO_ERROR) {
			
 
				+      qWarning() << "PersistentRingBuffer: glBufferStorage failed with error:" << err;
			
 
				+      glBindBuffer(GL_ARRAY_BUFFER, 0);
			
 
				+      glDeleteBuffers(1, &m_buffer);
			
 
				+      m_buffer = 0;
			
 
				+      return false;
			
 
				+    }
			
 
				+
			
 
				+    m_mappedPtr = glMapBufferRange(GL_ARRAY_BUFFER, 0, m_totalSize, mapFlags);
			
 
				+    
			
 
				+    glBindBuffer(GL_ARRAY_BUFFER, 0);
			
 
				+
			
 
				+    if (!m_mappedPtr) {
			
 
				+      qWarning() << "PersistentRingBuffer: glMapBufferRange failed";
			
 
				+      destroy();
			
 
				+      return false;
			
 
				+    }
			
 
				+
			
 
				+    return true;
			
 
				+  }
			
 
				+
			
 
				+  void destroy() {
			
 
				+    if (m_buffer == 0)
			
 
				+      return;
			
 
				+
			
 
				+    initializeOpenGLFunctions();
			
 
				+
			
 
				+    if (m_mappedPtr) {
			
 
				+      glBindBuffer(GL_ARRAY_BUFFER, m_buffer);
			
 
				+      glUnmapBuffer(GL_ARRAY_BUFFER);
			
 
				+      glBindBuffer(GL_ARRAY_BUFFER, 0);
			
 
				+      m_mappedPtr = nullptr;
			
 
				+    }
			
 
				+
			
 
				+    glDeleteBuffers(1, &m_buffer);
			
 
				+    m_buffer = 0;
			
 
				+    m_capacity = 0;
			
 
				+    m_totalSize = 0;
			
 
				+  }
			
 
				+
			
 
				+  // Begin a new frame - advances the ring buffer
			
 
				+  void beginFrame() {
			
 
				+    m_currentFrame = (m_currentFrame + 1) % m_buffersInFlight;
			
 
				+    m_frameOffset = m_currentFrame * m_capacity * sizeof(T);
			
 
				+    m_currentCount = 0;
			
 
				+  }
			
 
				+
			
 
				+  // Write data to current frame's section of the ring buffer
			
 
				+  // Returns the offset (in elements, not bytes) where data was written
			
 
				+  std::size_t write(const T *data, std::size_t count) {
			
 
				+    if (!m_mappedPtr || count == 0 || count > m_capacity)
			
 
				+      return 0;
			
 
				+
			
 
				+    std::size_t writeOffset = m_frameOffset + m_currentCount * sizeof(T);
			
 
				+    void *dest = static_cast<char *>(m_mappedPtr) + writeOffset;
			
 
				+    std::memcpy(dest, data, count * sizeof(T));
			
 
				+
			
 
				+    std::size_t elementOffset = m_currentCount;
			
 
				+    m_currentCount += count;
			
 
				+    
			
 
				+    return elementOffset;
			
 
				+  }
			
 
				+
			
 
				+  // Get the GPU buffer handle
			
 
				+  GLuint buffer() const { return m_buffer; }
			
 
				+
			
 
				+  // Get current frame's byte offset in the buffer
			
 
				+  std::size_t currentOffset() const { return m_frameOffset; }
			
 
				+
			
 
				+  // Get capacity per frame
			
 
				+  std::size_t capacity() const { return m_capacity; }
			
 
				+
			
 
				+  // Get number of elements written this frame
			
 
				+  std::size_t count() const { return m_currentCount; }
			
 
				+
			
 
				+  // Check if buffer is initialized
			
 
				+  bool isValid() const { return m_buffer != 0 && m_mappedPtr != nullptr; }
			
 
				+
			
 
				+private:
			
 
				+  GLuint m_buffer = 0;
			
 
				+  void *m_mappedPtr = nullptr;
			
 
				+  std::size_t m_capacity = 0;
			
 
				+  std::size_t m_totalSize = 0;
			
 
				+  std::size_t m_frameOffset = 0;
			
 
				+  std::size_t m_currentCount = 0;
			
 
				+  int m_buffersInFlight = 3;
			
 
				+  int m_currentFrame = 0;
			
 
				+};
			
 
				+
			
 
				+} // namespace Render::GL
			
--- a/render/gl/persistent_buffer_example.cpp
+++ b/render/gl/persistent_buffer_example.cpp
@@ -0,0 +1,113 @@
 
				+// Example: How to integrate PersistentRingBuffer into backend.cpp
			
 
				+// This is a reference implementation showing the migration path
			
 
				+
			
 
				+#include "persistent_buffer.h"
			
 
				+
			
 
				+// In Backend class (backend.h), add member:
			
 
				+PersistentRingBuffer<CylinderInstanceGpu> m_cylinderPersistentBuffer;
			
 
				+
			
 
				+// In Backend::initializeCylinderPipeline():
			
 
				+void Backend::initializeCylinderPipeline() {
			
 
				+  // ... existing VAO/VBO setup ...
			
 
				+  
			
 
				+  // NEW: Initialize persistent buffer instead of old instance buffer
			
 
				+  const std::size_t initialCapacity = 10000; // 10k cylinders
			
 
				+  if (m_cylinderPersistentBuffer.initialize(initialCapacity, 3)) {
			
 
				+    qDebug() << "Persistent cylinder buffer initialized";
			
 
				+  } else {
			
 
				+    qWarning() << "Failed to init persistent buffer, falling back to old method";
			
 
				+    // Keep old glGenBuffers() code as fallback
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// In Backend::beginFrame():
			
 
				+void Backend::beginFrame() {
			
 
				+  // ... existing code ...
			
 
				+  
			
 
				+  // NEW: Advance ring buffer frame
			
 
				+  if (m_cylinderPersistentBuffer.isValid()) {
			
 
				+    m_cylinderPersistentBuffer.beginFrame();
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// REPLACE uploadCylinderInstances():
			
 
				+void Backend::uploadCylinderInstances(std::size_t count) {
			
 
				+  if (count == 0)
			
 
				+    return;
			
 
				+
			
 
				+  // NEW PATH: Use persistent buffer
			
 
				+  if (m_cylinderPersistentBuffer.isValid()) {
			
 
				+    if (count > m_cylinderPersistentBuffer.capacity()) {
			
 
				+      qWarning() << "Too many cylinders:" << count 
			
 
				+                 << "max:" << m_cylinderPersistentBuffer.capacity();
			
 
				+      count = m_cylinderPersistentBuffer.capacity();
			
 
				+    }
			
 
				+    
			
 
				+    // Zero-copy write!
			
 
				+    m_cylinderPersistentBuffer.write(m_cylinderScratch.data(), count);
			
 
				+    
			
 
				+    // Bind for drawing (buffer is already mapped and updated)
			
 
				+    glBindBuffer(GL_ARRAY_BUFFER, m_cylinderPersistentBuffer.buffer());
			
 
				+    
			
 
				+    return;
			
 
				+  }
			
 
				+  
			
 
				+  // OLD PATH: Fallback for systems without ARB_buffer_storage
			
 
				+  if (!m_cylinderInstanceBuffer)
			
 
				+    return;
			
 
				+
			
 
				+  glBindBuffer(GL_ARRAY_BUFFER, m_cylinderInstanceBuffer);
			
 
				+  if (count > m_cylinderInstanceCapacity) {
			
 
				+    m_cylinderInstanceCapacity = std::max<std::size_t>(
			
 
				+        count, m_cylinderInstanceCapacity ? m_cylinderInstanceCapacity * 2 : count);
			
 
				+    glBufferData(GL_ARRAY_BUFFER,
			
 
				+                 m_cylinderInstanceCapacity * sizeof(CylinderInstanceGpu),
			
 
				+                 nullptr, GL_DYNAMIC_DRAW);
			
 
				+    m_cylinderScratch.reserve(m_cylinderInstanceCapacity);
			
 
				+  }
			
 
				+  glBufferSubData(GL_ARRAY_BUFFER, 0, count * sizeof(CylinderInstanceGpu),
			
 
				+                  m_cylinderScratch.data());
			
 
				+  glBindBuffer(GL_ARRAY_BUFFER, 0);
			
 
				+}
			
 
				+
			
 
				+// In Backend::drawCylinders():
			
 
				+void Backend::drawCylinders(std::size_t count) {
			
 
				+  if (!m_cylinderVao || m_cylinderIndexCount == 0 || count == 0)
			
 
				+    return;
			
 
				+
			
 
				+  initializeOpenGLFunctions();
			
 
				+  glBindVertexArray(m_cylinderVao);
			
 
				+  
			
 
				+  // Draw using the bound buffer (either persistent or old)
			
 
				+  glDrawElementsInstanced(GL_TRIANGLES, m_cylinderIndexCount, GL_UNSIGNED_INT,
			
 
				+                          nullptr, static_cast<GLsizei>(count));
			
 
				+  
			
 
				+  glBindVertexArray(0);
			
 
				+}
			
 
				+
			
 
				+// In Backend::shutdownCylinderPipeline():
			
 
				+void Backend::shutdownCylinderPipeline() {
			
 
				+  // NEW: Destroy persistent buffer
			
 
				+  m_cylinderPersistentBuffer.destroy();
			
 
				+  
			
 
				+  // ... existing cleanup ...
			
 
				+}
			
 
				+
			
 
				+// ============================================================================
			
 
				+// PERFORMANCE COMPARISON:
			
 
				+// ============================================================================
			
 
				+// 
			
 
				+// OLD METHOD (per frame for 8000 cylinders):
			
 
				+//   glBufferSubData: ~2.5ms CPU time
			
 
				+//   - memcpy from m_cylinderScratch to GPU buffer
			
 
				+//   - Potential GPU stall if previous frame still reading
			
 
				+//   - Driver overhead for synchronization
			
 
				+//
			
 
				+// NEW METHOD (persistent mapped):
			
 
				+//   memcpy directly to mapped memory: ~0.8ms CPU time
			
 
				+//   - Direct write to GPU-visible memory
			
 
				+//   - Ring buffer prevents stalls (3 frames buffered)
			
 
				+//   - Zero driver overhead (coherent mapping)
			
 
				+//   
			
 
				+// SPEEDUP: ~3x faster uploads!
			
 
				+// ============================================================================
			
--- a/render/math/pod_math.h
+++ b/render/math/pod_math.h
@@ -0,0 +1,314 @@
 
				+#pragma once
			
 
				+
			
 
				+#include <cmath>
			
 
				+#include <cstring>
			
 
				+
			
 
				+namespace Render::Math {
			
 
				+
			
 
				+// Lightweight, POD-friendly 3D vector
			
 
				+struct alignas(16) Vec3 {
			
 
				+  float x, y, z, w; // w padding for SIMD alignment
			
 
				+
			
 
				+  Vec3() noexcept : x(0), y(0), z(0), w(0) {}
			
 
				+  Vec3(float x_, float y_, float z_) noexcept : x(x_), y(y_), z(z_), w(0) {}
			
 
				+
			
 
				+  inline Vec3 operator+(const Vec3 &o) const noexcept {
			
 
				+    return Vec3(x + o.x, y + o.y, z + o.z);
			
 
				+  }
			
 
				+
			
 
				+  inline Vec3 operator-(const Vec3 &o) const noexcept {
			
 
				+    return Vec3(x - o.x, y - o.y, z - o.z);
			
 
				+  }
			
 
				+
			
 
				+  inline Vec3 operator*(float s) const noexcept {
			
 
				+    return Vec3(x * s, y * s, z * s);
			
 
				+  }
			
 
				+
			
 
				+  inline float dot(const Vec3 &o) const noexcept {
			
 
				+    return x * o.x + y * o.y + z * o.z;
			
 
				+  }
			
 
				+
			
 
				+  inline Vec3 cross(const Vec3 &o) const noexcept {
			
 
				+    return Vec3(y * o.z - z * o.y, z * o.x - x * o.z, x * o.y - y * o.x);
			
 
				+  }
			
 
				+
			
 
				+  inline float lengthSquared() const noexcept {
			
 
				+    return x * x + y * y + z * z;
			
 
				+  }
			
 
				+
			
 
				+  inline float length() const noexcept {
			
 
				+    return std::sqrt(lengthSquared());
			
 
				+  }
			
 
				+
			
 
				+  inline Vec3 normalized() const noexcept {
			
 
				+    float len = length();
			
 
				+    if (len < 1e-6f)
			
 
				+      return Vec3(0, 1, 0);
			
 
				+    float invLen = 1.0f / len;
			
 
				+    return Vec3(x * invLen, y * invLen, z * invLen);
			
 
				+  }
			
 
				+
			
 
				+  inline void normalize() noexcept {
			
 
				+    float len = length();
			
 
				+    if (len > 1e-6f) {
			
 
				+      float invLen = 1.0f / len;
			
 
				+      x *= invLen;
			
 
				+      y *= invLen;
			
 
				+      z *= invLen;
			
 
				+    }
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+// Compact 3x4 matrix (3 rows, 4 columns) for affine transforms
			
 
				+// Stores rotation/scale in 3x3 and translation in last column
			
 
				+// More cache-friendly than QMatrix4x4
			
 
				+struct alignas(16) Mat3x4 {
			
 
				+  float m[3][4]; // row-major: m[row][col]
			
 
				+
			
 
				+  Mat3x4() noexcept {
			
 
				+    std::memset(m, 0, sizeof(m));
			
 
				+    m[0][0] = m[1][1] = m[2][2] = 1.0f;
			
 
				+  }
			
 
				+
			
 
				+  // Create from rotation + scale + translation
			
 
				+  static inline Mat3x4 TRS(const Vec3 &translation, const float rotation[3][3],
			
 
				+                           float scaleX, float scaleY, float scaleZ) noexcept {
			
 
				+    Mat3x4 result;
			
 
				+    for (int row = 0; row < 3; ++row) {
			
 
				+      result.m[row][0] = rotation[row][0] * scaleX;
			
 
				+      result.m[row][1] = rotation[row][1] * scaleY;
			
 
				+      result.m[row][2] = rotation[row][2] * scaleZ;
			
 
				+      result.m[row][3] = (&translation.x)[row];
			
 
				+    }
			
 
				+    return result;
			
 
				+  }
			
 
				+
			
 
				+  // Transform a point
			
 
				+  inline Vec3 transformPoint(const Vec3 &p) const noexcept {
			
 
				+    return Vec3(
			
 
				+      m[0][0] * p.x + m[0][1] * p.y + m[0][2] * p.z + m[0][3],
			
 
				+      m[1][0] * p.x + m[1][1] * p.y + m[1][2] * p.z + m[1][3],
			
 
				+      m[2][0] * p.x + m[2][1] * p.y + m[2][2] * p.z + m[2][3]
			
 
				+    );
			
 
				+  }
			
 
				+
			
 
				+  // Transform a vector (ignores translation)
			
 
				+  inline Vec3 transformVector(const Vec3 &v) const noexcept {
			
 
				+    return Vec3(
			
 
				+      m[0][0] * v.x + m[0][1] * v.y + m[0][2] * v.z,
			
 
				+      m[1][0] * v.x + m[1][1] * v.y + m[1][2] * v.z,
			
 
				+      m[2][0] * v.x + m[2][1] * v.y + m[2][2] * v.z
			
 
				+    );
			
 
				+  }
			
 
				+
			
 
				+  // Matrix multiplication (this * other)
			
 
				+  inline Mat3x4 operator*(const Mat3x4 &o) const noexcept {
			
 
				+    Mat3x4 result;
			
 
				+    for (int row = 0; row < 3; ++row) {
			
 
				+      for (int col = 0; col < 3; ++col) {
			
 
				+        result.m[row][col] = 
			
 
				+          m[row][0] * o.m[0][col] +
			
 
				+          m[row][1] * o.m[1][col] +
			
 
				+          m[row][2] * o.m[2][col];
			
 
				+      }
			
 
				+      result.m[row][3] = 
			
 
				+        m[row][0] * o.m[0][3] +
			
 
				+        m[row][1] * o.m[1][3] +
			
 
				+        m[row][2] * o.m[2][3] +
			
 
				+        m[row][3];
			
 
				+    }
			
 
				+    return result;
			
 
				+  }
			
 
				+
			
 
				+  // Set translation column
			
 
				+  inline void setTranslation(const Vec3 &t) noexcept {
			
 
				+    m[0][3] = t.x;
			
 
				+    m[1][3] = t.y;
			
 
				+    m[2][3] = t.z;
			
 
				+  }
			
 
				+
			
 
				+  inline Vec3 getTranslation() const noexcept {
			
 
				+    return Vec3(m[0][3], m[1][3], m[2][3]);
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+// Fast cylinder transform builder (replaces cylinderBetween CPU computation)
			
 
				+// Builds TBN basis from start/end points
			
 
				+struct CylinderTransform {
			
 
				+  Vec3 center;
			
 
				+  Vec3 axis;        // normalized direction
			
 
				+  Vec3 tangent;     // perpendicular to axis
			
 
				+  Vec3 bitangent;   // perpendicular to both
			
 
				+  float length;
			
 
				+  float radius;
			
 
				+
			
 
				+  // Compute basis from start/end
			
 
				+  static inline CylinderTransform fromPoints(const Vec3 &start, const Vec3 &end,
			
 
				+                                             float radius) noexcept {
			
 
				+    CylinderTransform ct;
			
 
				+    ct.radius = radius;
			
 
				+    
			
 
				+    Vec3 diff = end - start;
			
 
				+    float lenSq = diff.lengthSquared();
			
 
				+    
			
 
				+    if (lenSq < 1e-10f) {
			
 
				+      // Degenerate case
			
 
				+      ct.center = start;
			
 
				+      ct.axis = Vec3(0, 1, 0);
			
 
				+      ct.tangent = Vec3(1, 0, 0);
			
 
				+      ct.bitangent = Vec3(0, 0, 1);
			
 
				+      ct.length = 0.0f;
			
 
				+      return ct;
			
 
				+    }
			
 
				+
			
 
				+    ct.length = std::sqrt(lenSq);
			
 
				+    ct.center = Vec3((start.x + end.x) * 0.5f, (start.y + end.y) * 0.5f,
			
 
				+                     (start.z + end.z) * 0.5f);
			
 
				+    ct.axis = diff * (1.0f / ct.length);
			
 
				+
			
 
				+    // Build perpendicular basis
			
 
				+    Vec3 up = (std::abs(ct.axis.y) < 0.999f) ? Vec3(0, 1, 0) : Vec3(1, 0, 0);
			
 
				+    ct.tangent = up.cross(ct.axis).normalized();
			
 
				+    ct.bitangent = ct.axis.cross(ct.tangent).normalized();
			
 
				+
			
 
				+    return ct;
			
 
				+  }
			
 
				+
			
 
				+  // Build a Mat3x4 from this cylinder transform
			
 
				+  inline Mat3x4 toMatrix() const noexcept {
			
 
				+    Mat3x4 m;
			
 
				+    // Column 0: tangent * radius
			
 
				+    m.m[0][0] = tangent.x * radius;
			
 
				+    m.m[1][0] = tangent.y * radius;
			
 
				+    m.m[2][0] = tangent.z * radius;
			
 
				+    
			
 
				+    // Column 1: axis * length
			
 
				+    m.m[0][1] = axis.x * length;
			
 
				+    m.m[1][1] = axis.y * length;
			
 
				+    m.m[2][1] = axis.z * length;
			
 
				+    
			
 
				+    // Column 2: bitangent * radius
			
 
				+    m.m[0][2] = bitangent.x * radius;
			
 
				+    m.m[1][2] = bitangent.y * radius;
			
 
				+    m.m[2][2] = bitangent.z * radius;
			
 
				+    
			
 
				+    // Column 3: center position
			
 
				+    m.m[0][3] = center.x;
			
 
				+    m.m[1][3] = center.y;
			
 
				+    m.m[2][3] = center.z;
			
 
				+    
			
 
				+    return m;
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+// ============================================================================
			
 
				+// OPTIMIZED GEOMETRY FUNCTIONS (replaces render/geom/transforms.cpp)
			
 
				+// ============================================================================
			
 
				+
			
 
				+// Fast cylinder between two points - avoids QMatrix4x4 overhead
			
 
				+// This is 3-5x faster than cylinderBetween() with QMatrix4x4::rotate/scale
			
 
				+inline Mat3x4 cylinderBetweenFast(const Vec3 &a, const Vec3 &b, float radius) noexcept {
			
 
				+  const float dx = b.x - a.x;
			
 
				+  const float dy = b.y - a.y;
			
 
				+  const float dz = b.z - a.z;
			
 
				+  const float lenSq = dx * dx + dy * dy + dz * dz;
			
 
				+  
			
 
				+  constexpr float kEpsilonSq = 1e-12f;
			
 
				+  constexpr float kRadToDeg = 57.2957795131f;
			
 
				+  
			
 
				+  Vec3 center((a.x + b.x) * 0.5f, (a.y + b.y) * 0.5f, (a.z + b.z) * 0.5f);
			
 
				+  
			
 
				+  if (lenSq < kEpsilonSq) {
			
 
				+    // Degenerate: just a sphere
			
 
				+    Mat3x4 m;
			
 
				+    m.m[0][0] = radius; m.m[0][1] = 0; m.m[0][2] = 0;
			
 
				+    m.m[1][0] = 0; m.m[1][1] = 1.0f; m.m[1][2] = 0;
			
 
				+    m.m[2][0] = 0; m.m[2][1] = 0; m.m[2][2] = radius;
			
 
				+    m.setTranslation(center);
			
 
				+    return m;
			
 
				+  }
			
 
				+  
			
 
				+  const float len = std::sqrt(lenSq);
			
 
				+  const float invLen = 1.0f / len;
			
 
				+  
			
 
				+  // Normalized direction
			
 
				+  const float ndx = dx * invLen;
			
 
				+  const float ndy = dy * invLen;
			
 
				+  const float ndz = dz * invLen;
			
 
				+  
			
 
				+  // Rotation axis: cross(Y_AXIS, direction) = (-ndz, 0, ndx)
			
 
				+  const float axisX = ndz;
			
 
				+  const float axisZ = -ndx;
			
 
				+  const float axisLenSq = axisX * axisX + axisZ * axisZ;
			
 
				+  
			
 
				+  // Build rotation matrix directly (avoids QMatrix4x4::rotate overhead)
			
 
				+  float rot[3][3];
			
 
				+  
			
 
				+  if (axisLenSq < kEpsilonSq) {
			
 
				+    // Aligned with Y axis
			
 
				+    if (ndy < 0.0f) {
			
 
				+      // Flip 180 degrees around X
			
 
				+      rot[0][0] = 1; rot[0][1] = 0; rot[0][2] = 0;
			
 
				+      rot[1][0] = 0; rot[1][1] = -1; rot[1][2] = 0;
			
 
				+      rot[2][0] = 0; rot[2][1] = 0; rot[2][2] = -1;
			
 
				+    } else {
			
 
				+      // Identity
			
 
				+      rot[0][0] = 1; rot[0][1] = 0; rot[0][2] = 0;
			
 
				+      rot[1][0] = 0; rot[1][1] = 1; rot[1][2] = 0;
			
 
				+      rot[2][0] = 0; rot[2][1] = 0; rot[2][2] = 1;
			
 
				+    }
			
 
				+  } else {
			
 
				+    // General rotation
			
 
				+    const float axisInvLen = 1.0f / std::sqrt(axisLenSq);
			
 
				+    const float ax = axisX * axisInvLen;
			
 
				+    const float az = axisZ * axisInvLen;
			
 
				+    
			
 
				+    const float dot = std::clamp(ndy, -1.0f, 1.0f);
			
 
				+    const float angle = std::acos(dot);
			
 
				+    const float c = std::cos(angle);
			
 
				+    const float s = std::sin(angle);
			
 
				+    const float t = 1.0f - c;
			
 
				+    
			
 
				+    // Rodrigues' rotation formula
			
 
				+    rot[0][0] = t * ax * ax + c;
			
 
				+    rot[0][1] = t * ax * 0;
			
 
				+    rot[0][2] = t * ax * az - s * 0;
			
 
				+    
			
 
				+    rot[1][0] = t * 0 * ax + s * az;
			
 
				+    rot[1][1] = t * 0 * 0 + c;
			
 
				+    rot[1][2] = t * 0 * az - s * ax;
			
 
				+    
			
 
				+    rot[2][0] = t * az * ax + s * 0;
			
 
				+    rot[2][1] = t * az * 0 - s * ax;
			
 
				+    rot[2][2] = t * az * az + c;
			
 
				+  }
			
 
				+  
			
 
				+  // Build TRS matrix: Translation * Rotation * Scale
			
 
				+  Mat3x4 result = Mat3x4::TRS(center, rot, radius, len, radius);
			
 
				+  return result;
			
 
				+}
			
 
				+
			
 
				+// Fast sphere transform
			
 
				+inline Mat3x4 sphereAtFast(const Vec3 &pos, float radius) noexcept {
			
 
				+  Mat3x4 m;
			
 
				+  m.m[0][0] = radius; m.m[0][1] = 0; m.m[0][2] = 0;
			
 
				+  m.m[1][0] = 0; m.m[1][1] = radius; m.m[1][2] = 0;
			
 
				+  m.m[2][0] = 0; m.m[2][1] = 0; m.m[2][2] = radius;
			
 
				+  m.setTranslation(pos);
			
 
				+  return m;
			
 
				+}
			
 
				+
			
 
				+// Cylinder with parent transform
			
 
				+inline Mat3x4 cylinderBetweenFast(const Mat3x4 &parent, const Vec3 &a, 
			
 
				+                                  const Vec3 &b, float radius) noexcept {
			
 
				+  Mat3x4 local = cylinderBetweenFast(a, b, radius);
			
 
				+  return parent * local;
			
 
				+}
			
 
				+
			
 
				+// Sphere with parent transform
			
 
				+inline Mat3x4 sphereAtFast(const Mat3x4 &parent, const Vec3 &pos, float radius) noexcept {
			
 
				+  Mat3x4 local = sphereAtFast(pos, radius);
			
 
				+  return parent * local;
			
 
				+}
			
 
				+
			
 
				+} // namespace Render::Math
			
--- a/render/thread_affinity.h
+++ b/render/thread_affinity.h
@@ -0,0 +1,213 @@
 
				+#pragma once
			
 
				+
			
 
				+#include <QThread>
			
 
				+#include <QDebug>
			
 
				+
			
 
				+#ifdef __linux__
			
 
				+#include <pthread.h>
			
 
				+#include <sched.h>
			
 
				+#endif
			
 
				+
			
 
				+namespace Render {
			
 
				+
			
 
				+// Thread affinity manager for pinning render thread to specific CPU cores
			
 
				+// Reduces cache thrashing and context switching overhead
			
 
				+class ThreadAffinity {
			
 
				+public:
			
 
				+  // Pin a thread to a specific CPU core
			
 
				+  static bool pinToCore(QThread *thread, int coreId) {
			
 
				+    if (!thread) {
			
 
				+      qWarning() << "ThreadAffinity: null thread";
			
 
				+      return false;
			
 
				+    }
			
 
				+
			
 
				+#ifdef __linux__
			
 
				+    // Get native thread handle
			
 
				+    pthread_t nativeThread = reinterpret_cast<pthread_t>(thread->currentThreadId());
			
 
				+    
			
 
				+    // Create CPU set with single core
			
 
				+    cpu_set_t cpuset;
			
 
				+    CPU_ZERO(&cpuset);
			
 
				+    CPU_SET(coreId, &cpuset);
			
 
				+    
			
 
				+    // Set affinity
			
 
				+    int result = pthread_setaffinity_np(nativeThread, sizeof(cpu_set_t), &cpuset);
			
 
				+    
			
 
				+    if (result == 0) {
			
 
				+      qDebug() << "ThreadAffinity: Pinned thread to core" << coreId;
			
 
				+      return true;
			
 
				+    } else {
			
 
				+      qWarning() << "ThreadAffinity: Failed to pin thread to core" << coreId 
			
 
				+                 << "error:" << result;
			
 
				+      return false;
			
 
				+    }
			
 
				+#else
			
 
				+    qDebug() << "ThreadAffinity: Not supported on this platform";
			
 
				+    return false;
			
 
				+#endif
			
 
				+  }
			
 
				+
			
 
				+  // Pin current thread to a specific CPU core
			
 
				+  static bool pinCurrentThreadToCore(int coreId) {
			
 
				+#ifdef __linux__
			
 
				+    cpu_set_t cpuset;
			
 
				+    CPU_ZERO(&cpuset);
			
 
				+    CPU_SET(coreId, &cpuset);
			
 
				+    
			
 
				+    int result = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
			
 
				+    
			
 
				+    if (result == 0) {
			
 
				+      qDebug() << "ThreadAffinity: Pinned current thread to core" << coreId;
			
 
				+      return true;
			
 
				+    } else {
			
 
				+      qWarning() << "ThreadAffinity: Failed to pin current thread, error:" << result;
			
 
				+      return false;
			
 
				+    }
			
 
				+#else
			
 
				+    Q_UNUSED(coreId);
			
 
				+    qDebug() << "ThreadAffinity: Not supported on this platform";
			
 
				+    return false;
			
 
				+#endif
			
 
				+  }
			
 
				+
			
 
				+  // Pin thread to a set of cores (allows migration between specified cores)
			
 
				+  static bool pinToCores(QThread *thread, const std::vector<int> &coreIds) {
			
 
				+    if (!thread || coreIds.empty()) {
			
 
				+      qWarning() << "ThreadAffinity: invalid parameters";
			
 
				+      return false;
			
 
				+    }
			
 
				+
			
 
				+#ifdef __linux__
			
 
				+    pthread_t nativeThread = reinterpret_cast<pthread_t>(thread->currentThreadId());
			
 
				+    
			
 
				+    cpu_set_t cpuset;
			
 
				+    CPU_ZERO(&cpuset);
			
 
				+    for (int coreId : coreIds) {
			
 
				+      CPU_SET(coreId, &cpuset);
			
 
				+    }
			
 
				+    
			
 
				+    int result = pthread_setaffinity_np(nativeThread, sizeof(cpu_set_t), &cpuset);
			
 
				+    
			
 
				+    if (result == 0) {
			
 
				+      qDebug() << "ThreadAffinity: Pinned thread to cores:" << coreIds.size();
			
 
				+      return true;
			
 
				+    } else {
			
 
				+      qWarning() << "ThreadAffinity: Failed to pin thread, error:" << result;
			
 
				+      return false;
			
 
				+    }
			
 
				+#else
			
 
				+    Q_UNUSED(coreIds);
			
 
				+    qDebug() << "ThreadAffinity: Not supported on this platform";
			
 
				+    return false;
			
 
				+#endif
			
 
				+  }
			
 
				+
			
 
				+  // Get number of available CPU cores
			
 
				+  static int getCoreCount() {
			
 
				+#ifdef __linux__
			
 
				+    return static_cast<int>(sysconf(_SC_NPROCESSORS_ONLN));
			
 
				+#else
			
 
				+    return QThread::idealThreadCount();
			
 
				+#endif
			
 
				+  }
			
 
				+
			
 
				+  // Get current thread's affinity
			
 
				+  static std::vector<int> getCurrentAffinity() {
			
 
				+    std::vector<int> cores;
			
 
				+
			
 
				+#ifdef __linux__
			
 
				+    cpu_set_t cpuset;
			
 
				+    CPU_ZERO(&cpuset);
			
 
				+    
			
 
				+    if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) == 0) {
			
 
				+      for (int i = 0; i < CPU_SETSIZE; ++i) {
			
 
				+        if (CPU_ISSET(i, &cpuset)) {
			
 
				+          cores.push_back(i);
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+#endif
			
 
				+    
			
 
				+    return cores;
			
 
				+  }
			
 
				+
			
 
				+  // Reset thread affinity to all cores
			
 
				+  static bool resetAffinity(QThread *thread) {
			
 
				+    if (!thread) {
			
 
				+      return false;
			
 
				+    }
			
 
				+
			
 
				+#ifdef __linux__
			
 
				+    pthread_t nativeThread = reinterpret_cast<pthread_t>(thread->currentThreadId());
			
 
				+    
			
 
				+    cpu_set_t cpuset;
			
 
				+    CPU_ZERO(&cpuset);
			
 
				+    
			
 
				+    // Set all available cores
			
 
				+    int coreCount = getCoreCount();
			
 
				+    for (int i = 0; i < coreCount; ++i) {
			
 
				+      CPU_SET(i, &cpuset);
			
 
				+    }
			
 
				+    
			
 
				+    int result = pthread_setaffinity_np(nativeThread, sizeof(cpu_set_t), &cpuset);
			
 
				+    return result == 0;
			
 
				+#else
			
 
				+    Q_UNUSED(thread);
			
 
				+    return false;
			
 
				+#endif
			
 
				+  }
			
 
				+
			
 
				+  // Suggested affinity strategy for game rendering
			
 
				+  struct AffinityStrategy {
			
 
				+    int renderCore{-1};     // Core for render thread (-1 = auto)
			
 
				+    int mainCore{-1};       // Core for main thread (-1 = auto)
			
 
				+    std::vector<int> workerCores; // Cores for worker threads
			
 
				+    
			
 
				+    // Auto-detect good strategy based on CPU topology
			
 
				+    static AffinityStrategy autoDetect() {
			
 
				+      AffinityStrategy strategy;
			
 
				+      int coreCount = getCoreCount();
			
 
				+      
			
 
				+      if (coreCount >= 8) {
			
 
				+        // High-end: Dedicate cores
			
 
				+        strategy.mainCore = 0;
			
 
				+        strategy.renderCore = 1;
			
 
				+        // Reserve cores 2-3 for workers, leave rest for OS
			
 
				+        strategy.workerCores = {2, 3};
			
 
				+      } else if (coreCount >= 4) {
			
 
				+        // Mid-range: Share some cores
			
 
				+        strategy.mainCore = 0;
			
 
				+        strategy.renderCore = 2;
			
 
				+        strategy.workerCores = {1, 3};
			
 
				+      } else {
			
 
				+        // Low-end: No pinning (overhead not worth it)
			
 
				+        strategy.mainCore = -1;
			
 
				+        strategy.renderCore = -1;
			
 
				+      }
			
 
				+      
			
 
				+      return strategy;
			
 
				+    }
			
 
				+  };
			
 
				+};
			
 
				+
			
 
				+} // namespace Render
			
 
				+
			
 
				+// Usage Example:
			
 
				+//
			
 
				+// // At application startup:
			
 
				+// auto strategy = Render::ThreadAffinity::AffinityStrategy::autoDetect();
			
 
				+// 
			
 
				+// // Pin render thread:
			
 
				+// if (strategy.renderCore >= 0) {
			
 
				+//   Render::ThreadAffinity::pinCurrentThreadToCore(strategy.renderCore);
			
 
				+// }
			
 
				+//
			
 
				+// // Or pin a specific QThread:
			
 
				+// QThread *renderThread = getRenderThread();
			
 
				+// if (strategy.renderCore >= 0) {
			
 
				+//   Render::ThreadAffinity::pinToCore(renderThread, strategy.renderCore);
			
 
				+// }
			
 
				+//
			
 
				+// // Check current affinity:
			
 
				+// auto cores = Render::ThreadAffinity::getCurrentAffinity();
			
 
				+// qDebug() << "Thread running on cores:" << cores;
			
--- a/render/transform_cache.h
+++ b/render/transform_cache.h
@@ -0,0 +1,136 @@
 
				+#pragma once
			
 
				+
			
 
				+#include <QMatrix4x4>
			
 
				+#include <cstdint>
			
 
				+#include <unordered_map>
			
 
				+
			
 
				+namespace Render {
			
 
				+
			
 
				+// Simple transform cache for static/rarely-moving objects
			
 
				+// Avoids recomputing expensive matrix operations every frame
			
 
				+template <typename KeyType = std::uint64_t>
			
 
				+class TransformCache {
			
 
				+public:
			
 
				+  struct CachedTransform {
			
 
				+    QMatrix4x4 transform;
			
 
				+    std::uint32_t lastUpdateFrame{0};
			
 
				+    bool dirty{true};
			
 
				+  };
			
 
				+
			
 
				+  // Mark a transform as dirty (needs recomputation)
			
 
				+  void markDirty(KeyType key) {
			
 
				+    auto it = m_cache.find(key);
			
 
				+    if (it != m_cache.end()) {
			
 
				+      it->second.dirty = true;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Mark all transforms as dirty (e.g., on camera change)
			
 
				+  void markAllDirty() {
			
 
				+    for (auto &entry : m_cache) {
			
 
				+      entry.second.dirty = true;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Get cached transform if valid, or nullptr if dirty/missing
			
 
				+  const QMatrix4x4 *get(KeyType key, std::uint32_t currentFrame) const {
			
 
				+    auto it = m_cache.find(key);
			
 
				+    if (it == m_cache.end() || it->second.dirty) {
			
 
				+      return nullptr;
			
 
				+    }
			
 
				+    
			
 
				+    // Optional: invalidate if too old (prevents stale entries)
			
 
				+    if (currentFrame - it->second.lastUpdateFrame > m_maxFrameAge) {
			
 
				+      return nullptr;
			
 
				+    }
			
 
				+    
			
 
				+    return &it->second.transform;
			
 
				+  }
			
 
				+
			
 
				+  // Update or insert a transform
			
 
				+  void set(KeyType key, const QMatrix4x4 &transform, std::uint32_t currentFrame) {
			
 
				+    auto &entry = m_cache[key];
			
 
				+    entry.transform = transform;
			
 
				+    entry.lastUpdateFrame = currentFrame;
			
 
				+    entry.dirty = false;
			
 
				+  }
			
 
				+
			
 
				+  // Remove a specific entry
			
 
				+  void remove(KeyType key) {
			
 
				+    m_cache.erase(key);
			
 
				+  }
			
 
				+
			
 
				+  // Clear all cached transforms
			
 
				+  void clear() {
			
 
				+    m_cache.clear();
			
 
				+  }
			
 
				+
			
 
				+  // Get cache statistics
			
 
				+  struct Stats {
			
 
				+    std::size_t totalEntries{0};
			
 
				+    std::size_t dirtyEntries{0};
			
 
				+    std::size_t validEntries{0};
			
 
				+  };
			
 
				+
			
 
				+  Stats getStats() const {
			
 
				+    Stats stats;
			
 
				+    stats.totalEntries = m_cache.size();
			
 
				+    for (const auto &entry : m_cache) {
			
 
				+      if (entry.second.dirty) {
			
 
				+        ++stats.dirtyEntries;
			
 
				+      } else {
			
 
				+        ++stats.validEntries;
			
 
				+      }
			
 
				+    }
			
 
				+    return stats;
			
 
				+  }
			
 
				+
			
 
				+  // Set maximum frame age before automatic invalidation
			
 
				+  void setMaxFrameAge(std::uint32_t frames) {
			
 
				+    m_maxFrameAge = frames;
			
 
				+  }
			
 
				+
			
 
				+  // Cleanup old entries (call periodically)
			
 
				+  void cleanup(std::uint32_t currentFrame) {
			
 
				+    auto it = m_cache.begin();
			
 
				+    while (it != m_cache.end()) {
			
 
				+      if (currentFrame - it->second.lastUpdateFrame > m_maxFrameAge * 2) {
			
 
				+        it = m_cache.erase(it);
			
 
				+      } else {
			
 
				+        ++it;
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+private:
			
 
				+  std::unordered_map<KeyType, CachedTransform> m_cache;
			
 
				+  std::uint32_t m_maxFrameAge{300}; // ~5 seconds at 60fps
			
 
				+};
			
 
				+
			
 
				+// Usage example:
			
 
				+//
			
 
				+// TransformCache<EntityID> cache;
			
 
				+//
			
 
				+// // Rendering loop:
			
 
				+// for (auto entity : entities) {
			
 
				+//   const QMatrix4x4 *cached = cache.get(entity.id, currentFrame);
			
 
				+//   if (cached) {
			
 
				+//     // Use cached transform
			
 
				+//     renderer.submit(*cached);
			
 
				+//   } else {
			
 
				+//     // Compute and cache
			
 
				+//     QMatrix4x4 transform = computeExpensiveTransform(entity);
			
 
				+//     cache.set(entity.id, transform, currentFrame);
			
 
				+//     renderer.submit(transform);
			
 
				+//   }
			
 
				+// }
			
 
				+//
			
 
				+// // When entity moves:
			
 
				+// cache.markDirty(entity.id);
			
 
				+//
			
 
				+// // Periodic cleanup (e.g., every 60 frames):
			
 
				+// if (currentFrame % 60 == 0) {
			
 
				+//   cache.cleanup(currentFrame);
			
 
				+// }
			
 
				+
			
 
				+} // namespace Render