clustering.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. /*************************************************************************
  2. * Copyright (c) 2011 AT&T Intellectual Property
  3. * All rights reserved. This program and the accompanying materials
  4. * are made available under the terms of the Eclipse Public License v1.0
  5. * which accompanies this distribution, and is available at
  6. * https://www.eclipse.org/legal/epl-v10.html
  7. *
  8. * Contributors: Details at https://graphviz.org
  9. *************************************************************************/
  10. #define STANDALONE
  11. #include <math.h>
  12. #include <sparse/general.h>
  13. #include <sparse/SparseMatrix.h>
  14. #include <sparse/clustering.h>
  15. #include <stdbool.h>
  16. #include <util/alloc.h>
  17. static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_init(SparseMatrix A, int level){
  18. int n = A->n, i, j;
  19. assert(A->type == MATRIX_TYPE_REAL);
  20. assert(SparseMatrix_is_symmetric(A, false));
  21. if (!A) return NULL;
  22. assert(A->m == n);
  23. Multilevel_Modularity_Clustering grid = gv_alloc(sizeof(struct Multilevel_Modularity_Clustering_struct));
  24. grid->level = level;
  25. grid->n = n;
  26. grid->A = A;
  27. grid->P = NULL;
  28. grid->next = NULL;
  29. grid->prev = NULL;
  30. grid->delete_top_level_A = false;
  31. grid->matching = gv_calloc(n, sizeof(double));
  32. grid->deg = NULL;
  33. grid->agglomerate_regardless = false;
  34. if (level == 0){
  35. double modularity = 0;
  36. int *ia = A->ia, *ja = A->ja;
  37. double deg_total = 0;
  38. double *deg, *a = A->a;
  39. grid->deg_total = 0.;
  40. grid->deg = gv_calloc(n, sizeof(double));
  41. deg = grid->deg;
  42. double *indeg = gv_calloc(n, sizeof(double));
  43. for (i = 0; i < n; i++){
  44. deg[i] = 0;
  45. indeg[i] = 0.;
  46. for (j = ia[i]; j < ia[i+1]; j++){
  47. deg[i] += a[j];
  48. if (ja[j] == i) indeg[i] = a[j];
  49. }
  50. deg_total += deg[i];
  51. }
  52. deg_total = fmax(deg_total, 1);
  53. for (i = 0; i < n; i++){
  54. modularity += (indeg[i] - deg[i]*deg[i]/deg_total)/deg_total;
  55. }
  56. grid->deg_total = deg_total;
  57. grid->deg = deg;
  58. grid->modularity = modularity;
  59. free(indeg);
  60. }
  61. return grid;
  62. }
  63. static void Multilevel_Modularity_Clustering_delete(Multilevel_Modularity_Clustering grid){
  64. if (!grid) return;
  65. if (grid->A){
  66. if (grid->level == 0) {
  67. if (grid->delete_top_level_A) SparseMatrix_delete(grid->A);
  68. } else {
  69. SparseMatrix_delete(grid->A);
  70. }
  71. }
  72. SparseMatrix_delete(grid->P);
  73. free(grid->matching);
  74. free(grid->deg);
  75. Multilevel_Modularity_Clustering_delete(grid->next);
  76. free(grid);
  77. }
  78. static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_establish(Multilevel_Modularity_Clustering grid, int ncluster_target){
  79. int *matching = grid->matching;
  80. SparseMatrix A = grid->A;
  81. int n = grid->n, level = grid->level, nc = 0;
  82. double modularity = 0;
  83. int *ia = A->ia, *ja = A->ja;
  84. double *deg = grid->deg;
  85. int i, j, jj, jc, jmax;
  86. double inv_deg_total = 1./ grid->deg_total;
  87. double gain;
  88. double maxgain;
  89. double total_gain = 0;
  90. modularity = grid->modularity;
  91. double *deg_new = gv_calloc(n, sizeof(double));
  92. double *deg_inter = gv_calloc(n, sizeof(double));
  93. int *mask = gv_calloc(n, sizeof(int));
  94. for (i = 0; i < n; i++) mask[i] = -1;
  95. assert(n == A->n);
  96. for (i = 0; i < n; i++) matching[i] = UNMATCHED;
  97. /* gain in merging node i into cluster j is
  98. deg(i,j)/deg_total - 2*deg(i)*deg(j)/deg_total^2
  99. */
  100. double *a = A->a;
  101. for (i = 0; i < n; i++){
  102. if (matching[i] != UNMATCHED) continue;
  103. /* accumulate connections between i and clusters */
  104. for (j = ia[i]; j < ia[i+1]; j++){
  105. jj = ja[j];
  106. if (jj == i) continue;
  107. if ((jc=matching[jj]) != UNMATCHED){
  108. if (mask[jc] != i) {
  109. mask[jc] = i;
  110. deg_inter[jc] = a[j];
  111. } else {
  112. deg_inter[jc] += a[j];
  113. }
  114. }
  115. }
  116. maxgain = 0;
  117. jmax = -1;
  118. for (j = ia[i]; j < ia[i+1]; j++){
  119. jj = ja[j];
  120. if (jj == i) continue;
  121. if ((jc=matching[jj]) == UNMATCHED){
  122. /* the first 2 is due to the fact that deg_iter gives edge weight from i to jj, but there are also edges from jj to i */
  123. gain = (2*a[j] - 2*deg[i]*deg[jj]*inv_deg_total)*inv_deg_total;
  124. } else {
  125. if (deg_inter[jc] > 0){
  126. /* the first 2 is due to the fact that deg_iter gives edge weight from i to jc, but there are also edges from jc to i */
  127. gain = (2*deg_inter[jc] - 2*deg[i]*deg_new[jc]*inv_deg_total)*inv_deg_total;
  128. // printf("mod = %f deg_inter[jc] =%f, deg[i] = %f, deg_new[jc]=%f, gain = %f\n",modularity, deg_inter[jc],deg[i],deg_new[jc],gain);
  129. deg_inter[jc] = -1; /* so that we do not redo the calulation when we hit another neighbor in cluster jc */
  130. } else {
  131. gain = -1;
  132. }
  133. }
  134. if (jmax < 0 || gain > maxgain){
  135. maxgain = gain;
  136. jmax = jj;
  137. }
  138. }
  139. /* now merge i and jmax */
  140. if (maxgain > 0 || grid->agglomerate_regardless){
  141. total_gain += maxgain;
  142. jc = matching[jmax];
  143. if (jc == UNMATCHED){
  144. //fprintf(stderr, "maxgain=%f, merge %d, %d\n",maxgain, i, jmax);
  145. matching[i] = matching[jmax] = nc;
  146. deg_new[nc] = deg[i] + deg[jmax];
  147. nc++;
  148. } else {
  149. //fprintf(stderr, "maxgain=%f, merge with existing cluster %d, %d\n",maxgain, i, jc);
  150. deg_new[jc] += deg[i];
  151. matching[i] = jc;
  152. }
  153. } else {
  154. assert(maxgain <= 0);
  155. matching[i] = nc;
  156. deg_new[nc] = deg[i];
  157. nc++;
  158. }
  159. }
  160. if (Verbose) fprintf(stderr,"modularity = %f new modularity = %f level = %d, n = %d, nc = %d, gain = %g\n", modularity, modularity + total_gain,
  161. level, n, nc, total_gain);
  162. if (ncluster_target > 0){
  163. if (nc <= ncluster_target && n >= ncluster_target){
  164. if (n - ncluster_target > ncluster_target - nc){/* ncluster = nc */
  165. } else if (n - ncluster_target <= ncluster_target - nc){/* ncluster_target close to n */
  166. fprintf(stderr,"ncluster_target = %d, close to n=%d\n", ncluster_target, n);
  167. for (i = 0; i < n; i++) matching[i] = i;
  168. free(deg_new);
  169. goto RETURN;
  170. }
  171. } else if (n < ncluster_target){
  172. fprintf(stderr,"n < target\n");
  173. for (i = 0; i < n; i++) matching[i] = i;
  174. free(deg_new);
  175. goto RETURN;
  176. }
  177. }
  178. if (nc >= 1 && (total_gain > 0 || nc < n)){
  179. /* now set up restriction and prolongation operator */
  180. SparseMatrix P, R, R0, B, cA;
  181. double one = 1.;
  182. Multilevel_Modularity_Clustering cgrid;
  183. R0 = SparseMatrix_new(nc, n, 1, MATRIX_TYPE_REAL, FORMAT_COORD);
  184. for (i = 0; i < n; i++){
  185. jj = matching[i];
  186. SparseMatrix_coordinate_form_add_entry(R0, jj, i, &one);
  187. }
  188. R = SparseMatrix_from_coordinate_format(R0);
  189. SparseMatrix_delete(R0);
  190. P = SparseMatrix_transpose(R);
  191. B = SparseMatrix_multiply(R, A);
  192. SparseMatrix_delete(R);
  193. if (!B) {
  194. free(deg_new);
  195. goto RETURN;
  196. }
  197. cA = SparseMatrix_multiply(B, P);
  198. SparseMatrix_delete(B);
  199. if (!cA) {
  200. free(deg_new);
  201. goto RETURN;
  202. }
  203. grid->P = P;
  204. level++;
  205. cgrid = Multilevel_Modularity_Clustering_init(cA, level);
  206. cgrid->deg = deg_new;
  207. cgrid->modularity = grid->modularity + total_gain;
  208. cgrid->deg_total = grid->deg_total;
  209. cgrid = Multilevel_Modularity_Clustering_establish(cgrid, ncluster_target);
  210. grid->next = cgrid;
  211. cgrid->prev = grid;
  212. } else {
  213. /* if we want a small number of cluster but right now we have too many, we will force agglomeration */
  214. if (ncluster_target > 0 && nc > ncluster_target && !(grid->agglomerate_regardless)){
  215. grid->agglomerate_regardless = true;
  216. free(deg_inter);
  217. free(mask);
  218. free(deg_new);
  219. return Multilevel_Modularity_Clustering_establish(grid, ncluster_target);
  220. }
  221. /* no more improvement, stop and final clustering found */
  222. for (i = 0; i < n; i++) matching[i] = i;
  223. free(deg_new);
  224. }
  225. RETURN:
  226. free(deg_inter);
  227. free(mask);
  228. return grid;
  229. }
  230. static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_new(SparseMatrix A0, int ncluster_target){
  231. /* ncluster_target is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters
  232. is desired. The resulting clustering will give as close to this number as possible.
  233. If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity.
  234. . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2,
  235. . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be
  236. . selected among nc or nc2, which ever is closer to ncluster_target.
  237. Default: ncluster_target <= 0 */
  238. Multilevel_Modularity_Clustering grid;
  239. SparseMatrix A = A0;
  240. if (!SparseMatrix_is_symmetric(A, false) || A->type != MATRIX_TYPE_REAL){
  241. A = SparseMatrix_get_real_adjacency_matrix_symmetrized(A);
  242. }
  243. grid = Multilevel_Modularity_Clustering_init(A, 0);
  244. grid = Multilevel_Modularity_Clustering_establish(grid, ncluster_target);
  245. if (A != A0) grid->delete_top_level_A = true; // be sure to clean up later
  246. return grid;
  247. }
  248. static void hierachical_modularity_clustering(SparseMatrix A, int ncluster_target,
  249. int *nclusters, int **assignment, double *modularity){
  250. /* find a clustering of vertices by maximize modularity
  251. A: symmetric square matrix n x n. If real value, value will be used as edges weights, otherwise edge weights are considered as 1.
  252. ncluster_target: is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters
  253. is desired. The resulting clustering will give as close to this number as possible.
  254. If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity.
  255. . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2,
  256. . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be
  257. . selected among nc or nc2, which ever is closer to ncluster_target.
  258. Default: ncluster_target <= 0
  259. nclusters: on output the number of clusters
  260. assignment: dimension n. Node i is assigned to cluster "assignment[i]". 0 <= assignment < nclusters
  261. */
  262. Multilevel_Modularity_Clustering grid, cgrid;
  263. int *matching, i;
  264. SparseMatrix P;
  265. assert(A->m == A->n);
  266. *modularity = 0.;
  267. grid = Multilevel_Modularity_Clustering_new(A, ncluster_target);
  268. /* find coarsest */
  269. cgrid = grid;
  270. while (cgrid->next){
  271. cgrid = cgrid->next;
  272. }
  273. /* project clustering up */
  274. double *u = gv_calloc(cgrid->n, sizeof(double));
  275. for (i = 0; i < cgrid->n; i++) u[i] = (double) (cgrid->matching)[i];
  276. *nclusters = cgrid->n;
  277. *modularity = cgrid->modularity;
  278. while (cgrid->prev){
  279. double *v = NULL;
  280. P = cgrid->prev->P;
  281. SparseMatrix_multiply_vector(P, u, &v);
  282. free(u);
  283. u = v;
  284. cgrid = cgrid->prev;
  285. }
  286. if (*assignment){
  287. matching = *assignment;
  288. } else {
  289. matching = gv_calloc(grid->n, sizeof(int));
  290. *assignment = matching;
  291. }
  292. for (i = 0; i < grid->n; i++) (matching)[i] = (int) u[i];
  293. free(u);
  294. Multilevel_Modularity_Clustering_delete(grid);
  295. }
  296. void modularity_clustering(SparseMatrix A, bool inplace, int ncluster_target,
  297. int *nclusters, int **assignment, double *modularity){
  298. /* find a clustering of vertices by maximize modularity
  299. A: symmetric square matrix n x n. If real value, value will be used as edges weights, otherwise edge weights are considered as 1.
  300. inplace: whether A can e modified. If true, A will be modified by removing diagonal.
  301. ncluster_target: is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters
  302. is desired. The resulting clustering will give as close to this number as possible.
  303. If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity.
  304. . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2,
  305. . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be
  306. . selected among nc or nc2, which ever is closer to ncluster_target.
  307. Default: ncluster_target <= 0
  308. nclusters: on output the number of clusters
  309. assignment: dimension n. Node i is assigned to cluster "assignment[i]". 0 <= assignment < nclusters
  310. */
  311. SparseMatrix B;
  312. assert(A->m == A->n);
  313. B = SparseMatrix_symmetrize(A, false);
  314. if (!inplace && B == A) {
  315. B = SparseMatrix_copy(A);
  316. }
  317. B = SparseMatrix_remove_diagonal(B);
  318. if (B->type != MATRIX_TYPE_REAL) B = SparseMatrix_set_entries_to_real_one(B);
  319. hierachical_modularity_clustering(B, ncluster_target, nclusters, assignment, modularity);
  320. if (B != A) SparseMatrix_delete(B);
  321. }