| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764 |
- // File: crn_clusterizer.h
- // See Copyright Notice and license at the end of inc/crnlib.h
- #pragma once
- #include "crn_matrix.h"
- namespace crnlib
- {
- template<typename VectorType>
- class clusterizer
- {
- public:
- clusterizer() :
- m_overall_variance(0.0f),
- m_split_index(0),
- m_heap_size(0),
- m_quick(false)
- {
- }
- void clear()
- {
- m_training_vecs.clear();
- m_codebook.clear();
- m_nodes.clear();
- m_overall_variance = 0.0f;
- m_split_index = 0;
- m_heap_size = 0;
- m_quick = false;
- }
- void reserve_training_vecs(uint num_expected)
- {
- m_training_vecs.reserve(num_expected);
- }
- void add_training_vec(const VectorType& v, uint weight)
- {
- m_training_vecs.push_back( std::make_pair(v, weight) );
- }
- typedef bool (*progress_callback_func_ptr)(uint percentage_completed, void* pData);
- bool generate_codebook(uint max_size, progress_callback_func_ptr pProgress_callback = NULL, void* pProgress_data = NULL, bool quick = false)
- {
- if (m_training_vecs.empty())
- return false;
- m_quick = quick;
- double ttsum = 0.0f;
- vq_node root;
- root.m_vectors.reserve(m_training_vecs.size());
- for (uint i = 0; i < m_training_vecs.size(); i++)
- {
- const VectorType& v = m_training_vecs[i].first;
- const uint weight = m_training_vecs[i].second;
- root.m_centroid += (v * (float)weight);
- root.m_total_weight += weight;
- root.m_vectors.push_back(i);
- ttsum += v.dot(v) * weight;
- }
- root.m_variance = (float)(ttsum - (root.m_centroid.dot(root.m_centroid) / root.m_total_weight));
- root.m_centroid *= (1.0f / root.m_total_weight);
- m_nodes.clear();
- m_nodes.reserve(max_size * 2 + 1);
- m_nodes.push_back(root);
- m_heap.resize(max_size + 1);
- m_heap[1] = 0;
- m_heap_size = 1;
- m_split_index = 0;
- uint total_leaves = 1;
- m_left_children.reserve(m_training_vecs.size() + 1);
- m_right_children.reserve(m_training_vecs.size() + 1);
- int prev_percentage = -1;
- while ((total_leaves < max_size) && (m_heap_size))
- {
- int worst_node_index = m_heap[1];
- m_heap[1] = m_heap[m_heap_size];
- m_heap_size--;
- if (m_heap_size)
- down_heap(1);
- split_node(worst_node_index);
- total_leaves++;
- if ((pProgress_callback) && ((total_leaves & 63) == 0) && (max_size))
- {
- int cur_percentage = (total_leaves * 100U + (max_size / 2U)) / max_size;
- if (cur_percentage != prev_percentage)
- {
- if (!(*pProgress_callback)(cur_percentage, pProgress_data))
- return false;
- prev_percentage = cur_percentage;
- }
- }
- }
- m_codebook.clear();
- m_overall_variance = 0.0f;
- for (uint i = 0; i < m_nodes.size(); i++)
- {
- vq_node& node = m_nodes[i];
- if (node.m_left != -1)
- {
- CRNLIB_ASSERT(node.m_right != -1);
- continue;
- }
- CRNLIB_ASSERT((node.m_left == -1) && (node.m_right == -1));
- node.m_codebook_index = m_codebook.size();
- m_codebook.push_back(node.m_centroid);
- m_overall_variance += node.m_variance;
- }
- m_heap.clear();
- m_left_children.clear();
- m_right_children.clear();
- return true;
- }
- inline uint get_num_training_vecs() const { return m_training_vecs.size(); }
- const VectorType& get_training_vec(uint index) const { return m_training_vecs[index].first; }
- uint get_training_vec_weight(uint index) const { return m_training_vecs[index].second; }
- typedef crnlib::vector< std::pair<VectorType, uint> > training_vec_array;
- const training_vec_array& get_training_vecs() const { return m_training_vecs; }
- training_vec_array& get_training_vecs() { return m_training_vecs; }
- inline float get_overall_variance() const { return m_overall_variance; }
- inline uint get_codebook_size() const
- {
- return m_codebook.size();
- }
- inline const VectorType& get_codebook_entry(uint index) const
- {
- return m_codebook[index];
- }
- VectorType& get_codebook_entry(uint index)
- {
- return m_codebook[index];
- }
- typedef crnlib::vector<VectorType> vector_vec_type;
- inline const vector_vec_type& get_codebook() const
- {
- return m_codebook;
- }
- uint find_best_codebook_entry(const VectorType& v) const
- {
- uint cur_node_index = 0;
- for ( ; ; )
- {
- const vq_node& cur_node = m_nodes[cur_node_index];
- if (cur_node.m_left == -1)
- return cur_node.m_codebook_index;
- const vq_node& left_node = m_nodes[cur_node.m_left];
- const vq_node& right_node = m_nodes[cur_node.m_right];
- float left_dist = left_node.m_centroid.squared_distance(v);
- float right_dist = right_node.m_centroid.squared_distance(v);
- if (left_dist < right_dist)
- cur_node_index = cur_node.m_left;
- else
- cur_node_index = cur_node.m_right;
- }
- }
- const VectorType& find_best_codebook_entry(const VectorType& v, uint max_codebook_size) const
- {
- uint cur_node_index = 0;
- for ( ; ; )
- {
- const vq_node& cur_node = m_nodes[cur_node_index];
- if ((cur_node.m_left == -1) || ((cur_node.m_codebook_index + 1) >= (int)max_codebook_size))
- return cur_node.m_centroid;
- const vq_node& left_node = m_nodes[cur_node.m_left];
- const vq_node& right_node = m_nodes[cur_node.m_right];
- float left_dist = left_node.m_centroid.squared_distance(v);
- float right_dist = right_node.m_centroid.squared_distance(v);
- if (left_dist < right_dist)
- cur_node_index = cur_node.m_left;
- else
- cur_node_index = cur_node.m_right;
- }
- }
- uint find_best_codebook_entry_fs(const VectorType& v) const
- {
- float best_dist = math::cNearlyInfinite;
- uint best_index = 0;
- for (uint i = 0; i < m_codebook.size(); i++)
- {
- float dist = m_codebook[i].squared_distance(v);
- if (dist < best_dist)
- {
- best_dist = dist;
- best_index = i;
- if (best_dist == 0.0f)
- break;
- }
- }
- return best_index;
- }
- void retrieve_clusters(uint max_clusters, crnlib::vector< crnlib::vector<uint> >& clusters) const
- {
- clusters.resize(0);
- clusters.reserve(max_clusters);
- crnlib::vector<uint> stack;
- stack.reserve(512);
- uint cur_node_index = 0;
- for ( ; ; )
- {
- const vq_node& cur_node = m_nodes[cur_node_index];
- if ( (cur_node.is_leaf()) || ((cur_node.m_codebook_index + 2) > (int)max_clusters) )
- {
- clusters.resize(clusters.size() + 1);
- clusters.back() = cur_node.m_vectors;
- if (stack.empty())
- break;
- cur_node_index = stack.back();
- stack.pop_back();
- continue;
- }
- cur_node_index = cur_node.m_left;
- stack.push_back(cur_node.m_right);
- }
- }
- private:
- training_vec_array m_training_vecs;
- struct vq_node
- {
- vq_node() : m_centroid(cClear), m_total_weight(0), m_left(-1), m_right(-1), m_codebook_index(-1), m_unsplittable(false) { }
- VectorType m_centroid;
- uint64 m_total_weight;
- float m_variance;
- crnlib::vector<uint> m_vectors;
- int m_left;
- int m_right;
- int m_codebook_index;
- bool m_unsplittable;
- bool is_leaf() const { return m_left < 0; }
- };
- typedef crnlib::vector<vq_node> node_vec_type;
- node_vec_type m_nodes;
- vector_vec_type m_codebook;
- float m_overall_variance;
- uint m_split_index;
- crnlib::vector<uint> m_heap;
- uint m_heap_size;
- bool m_quick;
- void insert_heap(uint node_index)
- {
- const float variance = m_nodes[node_index].m_variance;
- uint pos = ++m_heap_size;
- if (m_heap_size >= m_heap.size())
- m_heap.resize(m_heap_size + 1);
- for ( ; ; )
- {
- uint parent = pos >> 1;
- if (!parent)
- break;
- float parent_variance = m_nodes[m_heap[parent]].m_variance;
- if (parent_variance > variance)
- break;
- m_heap[pos] = m_heap[parent];
- pos = parent;
- }
- m_heap[pos] = node_index;
- }
- void down_heap(uint pos)
- {
- uint child;
- uint orig = m_heap[pos];
- const float orig_variance = m_nodes[orig].m_variance;
- while ((child = (pos << 1)) <= m_heap_size)
- {
- if (child < m_heap_size)
- {
- if (m_nodes[m_heap[child]].m_variance < m_nodes[m_heap[child + 1]].m_variance)
- child++;
- }
- if (orig_variance > m_nodes[m_heap[child]].m_variance)
- break;
- m_heap[pos] = m_heap[child];
- pos = child;
- }
- m_heap[pos] = orig;
- }
- void compute_split_estimate(VectorType& left_child_res, VectorType& right_child_res, const vq_node& parent_node)
- {
- VectorType furthest(0);
- double furthest_dist = -1.0f;
- for (uint i = 0; i < parent_node.m_vectors.size(); i++)
- {
- const VectorType& v = m_training_vecs[parent_node.m_vectors[i]].first;
- double dist = v.squared_distance(parent_node.m_centroid);
- if (dist > furthest_dist)
- {
- furthest_dist = dist;
- furthest = v;
- }
- }
- VectorType opposite(0);
- double opposite_dist = -1.0f;
- for (uint i = 0; i < parent_node.m_vectors.size(); i++)
- {
- const VectorType& v = m_training_vecs[parent_node.m_vectors[i]].first;
- double dist = v.squared_distance(furthest);
- if (dist > opposite_dist)
- {
- opposite_dist = dist;
- opposite = v;
- }
- }
- left_child_res = (furthest + parent_node.m_centroid) * .5f;
- right_child_res = (opposite + parent_node.m_centroid) * .5f;
- }
- void compute_split_pca(VectorType& left_child_res, VectorType& right_child_res, const vq_node& parent_node)
- {
- if (parent_node.m_vectors.size() == 2)
- {
- left_child_res = m_training_vecs[parent_node.m_vectors[0]].first;
- right_child_res = m_training_vecs[parent_node.m_vectors[1]].first;
- return;
- }
- const uint N = VectorType::num_elements;
- matrix<N, N, float> covar;
- covar.clear();
- for (uint i = 0; i < parent_node.m_vectors.size(); i++)
- {
- const VectorType v(m_training_vecs[parent_node.m_vectors[i]].first - parent_node.m_centroid);
- const VectorType w(v * (float)m_training_vecs[parent_node.m_vectors[i]].second);
- for (uint x = 0; x < N; x++)
- for (uint y = x; y < N; y++)
- covar[x][y] = covar[x][y] + v[x] * w[y];
- }
- float one_over_total_weight = 1.0f / parent_node.m_total_weight;
- for (uint x = 0; x < N; x++)
- for (uint y = x; y < N; y++)
- covar[x][y] *= one_over_total_weight;
- for (uint x = 0; x < (N - 1); x++)
- for (uint y = x + 1; y < N; y++)
- covar[y][x] = covar[x][y];
- VectorType axis;//(1.0f);
- if (N == 1)
- axis.set(1.0f);
- else
- {
- for (uint i = 0; i < N; i++)
- axis[i] = math::lerp(.75f, 1.25f, i * (1.0f / math::maximum<int>(N - 1, 1)));
- }
- VectorType prev_axis(axis);
- for (uint iter = 0; iter < 10; iter++)
- {
- VectorType x;
- double max_sum = 0;
- for (uint i = 0; i < N; i++)
- {
- double sum = 0;
- for (uint j = 0; j < N; j++)
- sum += axis[j] * covar[i][j];
- x[i] = static_cast<float>(sum);
- max_sum = math::maximum(max_sum, fabs(sum));
- }
- if (max_sum != 0.0f)
- x *= static_cast<float>(1.0f / max_sum);
- VectorType delta_axis(prev_axis - x);
- prev_axis = axis;
- axis = x;
- if (delta_axis.norm() < .0025f)
- break;
- }
- axis.normalize();
- VectorType left_child(0.0f);
- VectorType right_child(0.0f);
- double left_weight = 0.0f;
- double right_weight = 0.0f;
- for (uint i = 0; i < parent_node.m_vectors.size(); i++)
- {
- const float weight = (float)m_training_vecs[parent_node.m_vectors[i]].second;
- const VectorType& v = m_training_vecs[parent_node.m_vectors[i]].first;
- double t = (v - parent_node.m_centroid) * axis;
- if (t < 0.0f)
- {
- left_child += v * weight;
- left_weight += weight;
- }
- else
- {
- right_child += v * weight;
- right_weight += weight;
- }
- }
- if ((left_weight > 0.0f) && (right_weight > 0.0f))
- {
- left_child_res = left_child * (float)(1.0f / left_weight);
- right_child_res = right_child * (float)(1.0f / right_weight);
- }
- else
- {
- compute_split_estimate(left_child_res, right_child_res, parent_node);
- }
- }
- #if 0
- void compute_split_pca2(VectorType& left_child_res, VectorType& right_child_res, const vq_node& parent_node)
- {
- if (parent_node.m_vectors.size() == 2)
- {
- left_child_res = m_training_vecs[parent_node.m_vectors[0]].first;
- right_child_res = m_training_vecs[parent_node.m_vectors[1]].first;
- return;
- }
- const uint N = VectorType::num_elements;
- VectorType furthest;
- double furthest_dist = -1.0f;
- for (uint i = 0; i < parent_node.m_vectors.size(); i++)
- {
- const VectorType& v = m_training_vecs[parent_node.m_vectors[i]].first;
- double dist = v.squared_distance(parent_node.m_centroid);
- if (dist > furthest_dist)
- {
- furthest_dist = dist;
- furthest = v;
- }
- }
- VectorType opposite;
- double opposite_dist = -1.0f;
- for (uint i = 0; i < parent_node.m_vectors.size(); i++)
- {
- const VectorType& v = m_training_vecs[parent_node.m_vectors[i]].first;
- double dist = v.squared_distance(furthest);
- if (dist > opposite_dist)
- {
- opposite_dist = dist;
- opposite = v;
- }
- }
- VectorType axis(opposite - furthest);
- if (axis.normalize() < .000125f)
- {
- left_child_res = (furthest + parent_node.m_centroid) * .5f;
- right_child_res = (opposite + parent_node.m_centroid) * .5f;
- return;
- }
- for (uint iter = 0; iter < 2; iter++)
- {
- double next_axis[N];
- utils::zero_object(next_axis);
- for (uint i = 0; i < parent_node.m_vectors.size(); i++)
- {
- const double weight = m_training_vecs[parent_node.m_vectors[i]].second;
- VectorType v(m_training_vecs[parent_node.m_vectors[i]].first - parent_node.m_centroid);
- double dot = (v * axis) * weight;
- for (uint j = 0; j < N; j++)
- next_axis[j] += dot * v[j];
- }
- double w = 0.0f;
- for (uint j = 0; j < N; j++)
- w += next_axis[j] * next_axis[j];
- if (w > 0.0f)
- {
- w = 1.0f / sqrt(w);
- for (uint j = 0; j < N; j++)
- axis[j] = static_cast<float>(next_axis[j] * w);
- }
- else
- break;
- }
- VectorType left_child(0.0f);
- VectorType right_child(0.0f);
- double left_weight = 0.0f;
- double right_weight = 0.0f;
- for (uint i = 0; i < parent_node.m_vectors.size(); i++)
- {
- const float weight = (float)m_training_vecs[parent_node.m_vectors[i]].second;
- const VectorType& v = m_training_vecs[parent_node.m_vectors[i]].first;
- double t = (v - parent_node.m_centroid) * axis;
- if (t < 0.0f)
- {
- left_child += v * weight;
- left_weight += weight;
- }
- else
- {
- right_child += v * weight;
- right_weight += weight;
- }
- }
- if ((left_weight > 0.0f) && (right_weight > 0.0f))
- {
- left_child_res = left_child * (float)(1.0f / left_weight);
- right_child_res = right_child * (float)(1.0f / right_weight);
- }
- else
- {
- left_child_res = (furthest + parent_node.m_centroid) * .5f;
- right_child_res = (opposite + parent_node.m_centroid) * .5f;
- }
- }
- #endif
- // thread safety warning: shared state!
- crnlib::vector<uint> m_left_children;
- crnlib::vector<uint> m_right_children;
- void split_node(uint index)
- {
- vq_node& parent_node = m_nodes[index];
- if (parent_node.m_vectors.size() == 1)
- return;
- VectorType left_child, right_child;
- if (m_quick)
- compute_split_estimate(left_child, right_child, parent_node);
- else
- compute_split_pca(left_child, right_child, parent_node);
- uint64 left_weight = 0;
- uint64 right_weight = 0;
- float prev_total_variance = 1e+10f;
- float left_variance = 0.0f;
- float right_variance = 0.0f;
- const uint cMaxLoops = m_quick ? 2 : 8;
- for (uint total_loops = 0; total_loops < cMaxLoops; total_loops++)
- {
- m_left_children.resize(0);
- m_right_children.resize(0);
- VectorType new_left_child(cClear);
- VectorType new_right_child(cClear);
- double left_ttsum = 0.0f;
- double right_ttsum = 0.0f;
- left_weight = 0;
- right_weight = 0;
- for (uint i = 0; i < parent_node.m_vectors.size(); i++)
- {
- const VectorType& v = m_training_vecs[parent_node.m_vectors[i]].first;
- const uint weight = m_training_vecs[parent_node.m_vectors[i]].second;
- double left_dist2 = left_child.squared_distance(v);
- double right_dist2 = right_child.squared_distance(v);
- if (left_dist2 < right_dist2)
- {
- m_left_children.push_back(parent_node.m_vectors[i]);
- new_left_child += (v * (float)weight);
- left_weight += weight;
- left_ttsum += v.dot(v) * weight;
- }
- else
- {
- m_right_children.push_back(parent_node.m_vectors[i]);
- new_right_child += (v * (float)weight);
- right_weight += weight;
- right_ttsum += v.dot(v) * weight;
- }
- }
- if ((!left_weight) || (!right_weight))
- {
- parent_node.m_unsplittable = true;
- return;
- }
- left_variance = (float)(left_ttsum - (new_left_child.dot(new_left_child) / left_weight));
- right_variance = (float)(right_ttsum - (new_right_child.dot(new_right_child) / right_weight));
- new_left_child *= (1.0f / left_weight);
- new_right_child *= (1.0f / right_weight);
- left_child = new_left_child;
- left_weight = left_weight;
- right_child = new_right_child;
- right_weight = right_weight;
- float total_variance = left_variance + right_variance;
- if (total_variance < .00001f)
- break;
- //const float variance_delta_thresh = .00001f;
- const float variance_delta_thresh = .00125f;
- if (((prev_total_variance - total_variance) / total_variance) < variance_delta_thresh)
- break;
- prev_total_variance = total_variance;
- }
- const uint left_child_index = m_nodes.size();
- const uint right_child_index = m_nodes.size() + 1;
- parent_node.m_left = m_nodes.size();
- parent_node.m_right = m_nodes.size() + 1;
- parent_node.m_codebook_index = m_split_index;
- m_split_index++;
- m_nodes.resize(m_nodes.size() + 2);
- // parent_node is invalid now, because m_nodes has been changed
- vq_node& left_child_node = m_nodes[left_child_index];
- vq_node& right_child_node = m_nodes[right_child_index];
- left_child_node.m_centroid = left_child;
- left_child_node.m_total_weight = left_weight;
- left_child_node.m_vectors.swap(m_left_children);
- left_child_node.m_variance = left_variance;
- if ((left_child_node.m_vectors.size() > 1) && (left_child_node.m_variance > 0.0f))
- insert_heap(left_child_index);
- right_child_node.m_centroid = right_child;
- right_child_node.m_total_weight = right_weight;
- right_child_node.m_vectors.swap(m_right_children);
- right_child_node.m_variance = right_variance;
- if ((right_child_node.m_vectors.size() > 1) && (right_child_node.m_variance > 0.0f))
- insert_heap(right_child_index);
- }
- };
- } // namespace crnlib
|