ht.c 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
  1. #include <stdio.h>
  2. #include <stdint.h>
  3. #include <time.h>
  4. #define NOB_IMPLEMENTATION
  5. #include "nob.h"
  6. typedef struct {
  7. Nob_String_View key;
  8. size_t value;
  9. bool occupied;
  10. } FreqKV;
  11. typedef struct {
  12. FreqKV *items;
  13. size_t count;
  14. size_t capacity;
  15. } FreqKVs;
  16. FreqKV *find_key(FreqKVs haystack, Nob_String_View needle)
  17. {
  18. for (size_t i = 0; i < haystack.count; ++i) {
  19. if (nob_sv_eq(haystack.items[i].key, needle)) {
  20. return &haystack.items[i];
  21. }
  22. }
  23. return NULL;
  24. }
  25. int compare_freqkv_count_reversed(const void *a, const void *b)
  26. {
  27. const FreqKV *akv = a;
  28. const FreqKV *bkv = b;
  29. return (int)bkv->value - (int)akv->value;
  30. }
  31. double clock_get_secs(void)
  32. {
  33. struct timespec ts = {0};
  34. int ret = clock_gettime(CLOCK_MONOTONIC, &ts);
  35. assert(ret == 0);
  36. return (double)ts.tv_sec + ts.tv_nsec*1e-9;
  37. }
  38. void naive_analysis(Nob_String_View content, const char *file_path)
  39. {
  40. nob_log(NOB_INFO, "Analyzing %s linearly", file_path);
  41. nob_log(NOB_INFO, " Size: %zu bytes", content.count);
  42. double begin = clock_get_secs();
  43. FreqKVs freq = {0}; // TODO: a bit of memory leak
  44. size_t count = 0;
  45. for (; content.count > 0; ++count) {
  46. content = nob_sv_trim_left(content);
  47. Nob_String_View token = nob_sv_chop_by_space(&content);
  48. FreqKV *kv = find_key(freq, token);
  49. if (kv) {
  50. kv->value += 1;
  51. } else {
  52. nob_da_append(&freq, ((FreqKV) {
  53. .key = token,
  54. .value = 1
  55. }));
  56. }
  57. }
  58. double end = clock_get_secs();
  59. nob_log(NOB_INFO, " Tokens: %zu tokens", freq.count);
  60. qsort(freq.items, freq.count, sizeof(freq.items[0]), compare_freqkv_count_reversed);
  61. nob_log(NOB_INFO, " Top 10 tokens");
  62. for (size_t i = 0; i < 10 && i < freq.count; ++i) {
  63. nob_log(NOB_INFO, " %zu: "SV_Fmt" => %zu", i, SV_Arg(freq.items[i].key), freq.items[i].value);
  64. }
  65. nob_log(NOB_INFO, " Elapsed time %.03lfs", end - begin);
  66. }
  67. uint32_t djb2(uint8_t *buf, size_t buf_size)
  68. {
  69. uint32_t hash = 5381;
  70. for (size_t i = 0; i < buf_size; ++i) {
  71. hash = ((hash << 5) + hash) + (uint32_t)buf[i]; /* hash * 33 + c */
  72. }
  73. return hash;
  74. }
  75. #define hash_init(ht, cap) \
  76. do { \
  77. (ht)->items = malloc(sizeof(*(ht)->items)*cap); \
  78. memset((ht)->items, 0, sizeof(*(ht)->items)*cap); \
  79. (ht)->count = 0; \
  80. (ht)->capacity = (cap); \
  81. } while(0)
  82. #define hash_find(ht, key, hash, eq)
  83. bool hash_analysis(Nob_String_View content, const char *file_path)
  84. {
  85. nob_log(NOB_INFO, "Analyzing %s with Hash Table", file_path);
  86. nob_log(NOB_INFO, " Size: %zu bytes", content.count);
  87. double begin = clock_get_secs();
  88. FreqKVs ht = {0};
  89. hash_init(&ht, 1000*1000);
  90. size_t count = 0;
  91. for (; content.count > 0; ++count) {
  92. content = nob_sv_trim_left(content);
  93. Nob_String_View token = nob_sv_chop_by_space(&content);
  94. uint32_t h = djb2((uint8_t*)token.data, token.count)%ht.capacity;
  95. for (size_t i = 0; i < ht.capacity && ht.items[h].occupied && !nob_sv_eq(ht.items[h].key, token); ++i) {
  96. h = (h + 1)%ht.capacity;
  97. }
  98. if (ht.items[h].occupied) {
  99. if (!nob_sv_eq(ht.items[h].key, token)) {
  100. nob_log(NOB_ERROR, "Table overflow");
  101. return false;
  102. }
  103. ht.items[h].value += 1;
  104. } else {
  105. ht.items[h].occupied = true;
  106. ht.items[h].key = token;
  107. ht.items[h].value = 1;
  108. }
  109. }
  110. double end = clock_get_secs();
  111. FreqKVs freq = {0};
  112. for (size_t i = 0; i < ht.capacity; ++i) {
  113. if (ht.items[i].occupied) {
  114. nob_da_append(&freq, ht.items[i]);
  115. }
  116. }
  117. qsort(freq.items, freq.count, sizeof(freq.items[0]), compare_freqkv_count_reversed);
  118. nob_log(NOB_INFO, " Tokens: %zu tokens", freq.count);
  119. nob_log(NOB_INFO, " Top 10 tokens");
  120. for (size_t i = 0; i < 10 && i < freq.count; ++i) {
  121. nob_log(NOB_INFO, " %zu: "SV_Fmt" => %zu", i, SV_Arg(freq.items[i].key), freq.items[i].value);
  122. }
  123. nob_log(NOB_INFO, " Elapsed time %.03lfs", end - begin);
  124. return true;
  125. }
  126. int main(int argc, char **argv)
  127. {
  128. const char *program = nob_shift_args(&argc, &argv);
  129. if (argc <= 0) {
  130. nob_log(NOB_ERROR, "No input is provided");
  131. nob_log(NOB_INFO, "Usage: %s <input.txt>", program);
  132. return 1;
  133. }
  134. const char *file_path = nob_shift_args(&argc, &argv);
  135. Nob_String_Builder buf = {0};
  136. if (!nob_read_entire_file(file_path, &buf)) return 1;
  137. Nob_String_View content = {
  138. .data = buf.items,
  139. .count = buf.count,
  140. };
  141. naive_analysis(content, file_path);
  142. if (!hash_analysis(content, file_path)) return 1;
  143. return 0;
  144. }