2
0

data.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631
  1. /*
  2. * Copyright (c) Facebook, Inc.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under both the BSD-style license (found in the
  6. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  7. * in the COPYING file in the root directory of this source tree).
  8. * You may select, at your option, one of the above-listed licenses.
  9. */
  10. #include "data.h"
  11. #include <assert.h>
  12. #include <errno.h>
  13. #include <stdio.h>
  14. #include <string.h>
  15. #include <stdlib.h> /* free() */
  16. #include <sys/stat.h>
  17. #include <curl/curl.h>
  18. #include "mem.h"
  19. #include "util.h"
  20. #define XXH_STATIC_LINKING_ONLY
  21. #include "xxhash.h"
  22. /**
  23. * Data objects
  24. */
  /** Common URL prefix of every regression-data release asset. */
  #define REGRESSION_RELEASE_BASE \
      "https://github.com/facebook/zstd/releases/download/regression-data/"
  /**
   * Expands to the full download URL of the release asset `x`.
   * `x` must be a string literal: the URL is built by adjacent-literal
   * concatenation at compile time.
   */
  #define REGRESSION_RELEASE(x) REGRESSION_RELEASE_BASE x
  27. data_t silesia = {
  28. .name = "silesia",
  29. .type = data_type_dir,
  30. .data =
  31. {
  32. .url = REGRESSION_RELEASE("silesia.tar.zst"),
  33. .xxhash64 = 0x48a199f92f93e977LL,
  34. },
  35. };
  36. data_t silesia_tar = {
  37. .name = "silesia.tar",
  38. .type = data_type_file,
  39. .data =
  40. {
  41. .url = REGRESSION_RELEASE("silesia.tar.zst"),
  42. .xxhash64 = 0x48a199f92f93e977LL,
  43. },
  44. };
  45. data_t github = {
  46. .name = "github",
  47. .type = data_type_dir,
  48. .data =
  49. {
  50. .url = REGRESSION_RELEASE("github.tar.zst"),
  51. .xxhash64 = 0xa9b1b44b020df292LL,
  52. },
  53. .dict =
  54. {
  55. .url = REGRESSION_RELEASE("github.dict.zst"),
  56. .xxhash64 = 0x1eddc6f737d3cb53LL,
  57. },
  58. };
  59. data_t github_tar = {
  60. .name = "github.tar",
  61. .type = data_type_file,
  62. .data =
  63. {
  64. .url = REGRESSION_RELEASE("github.tar.zst"),
  65. .xxhash64 = 0xa9b1b44b020df292LL,
  66. },
  67. .dict =
  68. {
  69. .url = REGRESSION_RELEASE("github.dict.zst"),
  70. .xxhash64 = 0x1eddc6f737d3cb53LL,
  71. },
  72. };
  73. static data_t* g_data[] = {
  74. &silesia,
  75. &silesia_tar,
  76. &github,
  77. &github_tar,
  78. NULL,
  79. };
  80. data_t const* const* data = (data_t const* const*)g_data;
  81. /**
  82. * data helpers.
  83. */
  84. int data_has_dict(data_t const* data) {
  85. return data->dict.url != NULL;
  86. }
  87. /**
  88. * data buffer helper functions (documented in header).
  89. */
  90. data_buffer_t data_buffer_create(size_t const capacity) {
  91. data_buffer_t buffer = {};
  92. buffer.data = (uint8_t*)malloc(capacity);
  93. if (buffer.data == NULL)
  94. return buffer;
  95. buffer.capacity = capacity;
  96. return buffer;
  97. }
  98. data_buffer_t data_buffer_read(char const* filename) {
  99. data_buffer_t buffer = {};
  100. uint64_t const size = UTIL_getFileSize(filename);
  101. if (size == UTIL_FILESIZE_UNKNOWN) {
  102. fprintf(stderr, "unknown size for %s\n", filename);
  103. return buffer;
  104. }
  105. buffer.data = (uint8_t*)malloc(size);
  106. if (buffer.data == NULL) {
  107. fprintf(stderr, "malloc failed\n");
  108. return buffer;
  109. }
  110. buffer.capacity = size;
  111. FILE* file = fopen(filename, "rb");
  112. if (file == NULL) {
  113. fprintf(stderr, "file null\n");
  114. goto err;
  115. }
  116. buffer.size = fread(buffer.data, 1, buffer.capacity, file);
  117. fclose(file);
  118. if (buffer.size != buffer.capacity) {
  119. fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity);
  120. goto err;
  121. }
  122. return buffer;
  123. err:
  124. free(buffer.data);
  125. memset(&buffer, 0, sizeof(buffer));
  126. return buffer;
  127. }
  128. data_buffer_t data_buffer_get_data(data_t const* data) {
  129. data_buffer_t const kEmptyBuffer = {};
  130. if (data->type != data_type_file)
  131. return kEmptyBuffer;
  132. return data_buffer_read(data->data.path);
  133. }
  134. data_buffer_t data_buffer_get_dict(data_t const* data) {
  135. data_buffer_t const kEmptyBuffer = {};
  136. if (!data_has_dict(data))
  137. return kEmptyBuffer;
  138. return data_buffer_read(data->dict.path);
  139. }
  140. int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
  141. size_t const size =
  142. buffer1.size < buffer2.size ? buffer1.size : buffer2.size;
  143. int const cmp = memcmp(buffer1.data, buffer2.data, size);
  144. if (cmp != 0)
  145. return cmp;
  146. if (buffer1.size < buffer2.size)
  147. return -1;
  148. if (buffer1.size == buffer2.size)
  149. return 0;
  150. assert(buffer1.size > buffer2.size);
  151. return 1;
  152. }
  153. void data_buffer_free(data_buffer_t buffer) {
  154. free(buffer.data);
  155. }
  156. /**
  157. * data filenames helpers.
  158. */
  159. FileNamesTable* data_filenames_get(data_t const* data)
  160. {
  161. char const* const path = data->data.path;
  162. return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ );
  163. }
  164. /**
  165. * data buffers helpers.
  166. */
  167. data_buffers_t data_buffers_get(data_t const* data) {
  168. data_buffers_t buffers = {.size = 0};
  169. FileNamesTable* const filenames = data_filenames_get(data);
  170. if (filenames == NULL) return buffers;
  171. if (filenames->tableSize == 0) {
  172. UTIL_freeFileNamesTable(filenames);
  173. return buffers;
  174. }
  175. data_buffer_t* buffersPtr =
  176. (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr));
  177. if (buffersPtr == NULL) {
  178. UTIL_freeFileNamesTable(filenames);
  179. return buffers;
  180. }
  181. buffers.buffers = (data_buffer_t const*)buffersPtr;
  182. buffers.size = filenames->tableSize;
  183. for (size_t i = 0; i < filenames->tableSize; ++i) {
  184. buffersPtr[i] = data_buffer_read(filenames->fileNames[i]);
  185. if (buffersPtr[i].data == NULL) {
  186. data_buffers_t const kEmptyBuffer = {};
  187. data_buffers_free(buffers);
  188. UTIL_freeFileNamesTable(filenames);
  189. return kEmptyBuffer;
  190. }
  191. }
  192. UTIL_freeFileNamesTable(filenames);
  193. return buffers;
  194. }
  195. /**
  196. * Frees the data buffers.
  197. */
  198. void data_buffers_free(data_buffers_t buffers) {
  199. free((data_buffer_t*)buffers.buffers);
  200. }
  201. /**
  202. * Initialization and download functions.
  203. */
  /** Heap copy of the cache directory path: set by data_init(), released by
   *  data_finish(). NULL (static zero-initialization) while unset. */
  static char *g_data_dir;
  /**
   * mkdir -p: creates `indir` and every missing parent directory, each with
   * mode S_IRWXU.
   *
   * @return 0 on success, EINVAL for a NULL or empty path, ENOMEM if the
   *         working copy cannot be allocated, otherwise the errno reported
   *         by the failing mkdir().
   */
  static int ensure_directory_exists(char const* indir) {
      /* An empty path would make the scan below step past the terminator. */
      if (indir == NULL || indir[0] == '\0')
          return EINVAL;

      char* const dir = strdup(indir);
      if (dir == NULL)
          return ENOMEM;

      int ret = 0;
      char* end = dir;
      do {
          /* Find the next directory level. Starting at end + 1 skips a
           * leading '/' as well as the separator found on the previous pass. */
          for (++end; *end != '\0' && *end != '/'; ++end)
              ;
          /* End the string there, make the directory, and restore the string. */
          char const save = *end;
          *end = '\0';
          int const isdir = UTIL_isDirectory(dir);
          int const mkdir_ret = mkdir(dir, S_IRWXU);
          /* Capture errno before anything else can overwrite it. */
          int const mkdir_errno = errno;
          *end = save;
          /* It's okay if the directory already exists. */
          if (mkdir_ret != 0 && !(mkdir_errno == EEXIST && isdir)) {
              ret = mkdir_errno;
              fprintf(stderr, "mkdir() failed\n");
              break;
          }
      } while (*end != '\0');

      free(dir);
      return ret;
  }
  /**
   * Concatenates up to three strings into a newly malloc()ed buffer.
   * `str1` and `str2` are required; `str3` may be NULL and is then skipped.
   *
   * @return The NUL-terminated result (caller frees), or NULL if the
   *         allocation fails.
   */
  static char* cat3(char const* str1, char const* str2, char const* str3) {
      size_t const len1 = strlen(str1);
      size_t const len2 = strlen(str2);
      size_t const len3 = (str3 != NULL) ? strlen(str3) : 0;

      char* const joined = (char*)malloc(len1 + len2 + len3 + 1);
      if (joined == NULL)
          return NULL;

      /* Append each piece at a moving cursor, then terminate once. */
      char* cursor = joined;
      memcpy(cursor, str1, len1);
      cursor += len1;
      memcpy(cursor, str2, len2);
      cursor += len2;
      if (len3 != 0) {
          memcpy(cursor, str3, len3);
          cursor += len3;
      }
      *cursor = '\0';

      assert(strlen(joined) == len1 + len2 + len3);
      return joined;
  }
  /** Concatenates two strings into a newly malloc()ed buffer (see cat3()). */
  static char* cat2(char const* str1, char const* str2) {
      char const* const no_third_part = NULL;
      return cat3(str1, str2, no_third_part);
  }
  255. /**
  256. * State needed by the curl callback.
  257. * It takes data from curl, hashes it, and writes it to the file.
  258. */
  259. typedef struct {
  260. FILE* file;
  261. XXH64_state_t xxhash64;
  262. int error;
  263. } curl_data_t;
  264. /** Create the curl state. */
  265. static curl_data_t curl_data_create(
  266. data_resource_t const* resource,
  267. data_type_t type) {
  268. curl_data_t cdata = {};
  269. XXH64_reset(&cdata.xxhash64, 0);
  270. assert(UTIL_isDirectory(g_data_dir));
  271. if (type == data_type_file) {
  272. /* Decompress the resource and store to the path. */
  273. char* cmd = cat3("zstd -dqfo '", resource->path, "'");
  274. if (cmd == NULL) {
  275. cdata.error = ENOMEM;
  276. return cdata;
  277. }
  278. cdata.file = popen(cmd, "w");
  279. free(cmd);
  280. } else {
  281. /* Decompress and extract the resource to the cache directory. */
  282. char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'");
  283. if (cmd == NULL) {
  284. cdata.error = ENOMEM;
  285. return cdata;
  286. }
  287. cdata.file = popen(cmd, "w");
  288. free(cmd);
  289. }
  290. if (cdata.file == NULL) {
  291. cdata.error = errno;
  292. }
  293. return cdata;
  294. }
  295. /** Free the curl state. */
  296. static int curl_data_free(curl_data_t cdata) {
  297. return pclose(cdata.file);
  298. }
  299. /** curl callback. Updates the hash, and writes to the file. */
  300. static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
  301. curl_data_t* cdata = (curl_data_t*)ptr;
  302. size_t const written = fwrite(data, size, count, cdata->file);
  303. XXH64_update(&cdata->xxhash64, data, written * size);
  304. return written;
  305. }
  306. static int curl_download_resource(
  307. CURL* curl,
  308. data_resource_t const* resource,
  309. data_type_t type) {
  310. curl_data_t cdata;
  311. /* Download the data. */
  312. if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0)
  313. return EINVAL;
  314. if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0)
  315. return EINVAL;
  316. cdata = curl_data_create(resource, type);
  317. if (cdata.error != 0)
  318. return cdata.error;
  319. int const curl_err = curl_easy_perform(curl);
  320. int const close_err = curl_data_free(cdata);
  321. if (curl_err) {
  322. fprintf(
  323. stderr,
  324. "downloading '%s' for '%s' failed\n",
  325. resource->url,
  326. resource->path);
  327. return EIO;
  328. }
  329. if (close_err) {
  330. fprintf(stderr, "writing data to '%s' failed\n", resource->path);
  331. return EIO;
  332. }
  333. /* check that the file exists. */
  334. if (type == data_type_file && !UTIL_isRegularFile(resource->path)) {
  335. fprintf(stderr, "output file '%s' does not exist\n", resource->path);
  336. return EIO;
  337. }
  338. if (type == data_type_dir && !UTIL_isDirectory(resource->path)) {
  339. fprintf(
  340. stderr, "output directory '%s' does not exist\n", resource->path);
  341. return EIO;
  342. }
  343. /* Check that the hash matches. */
  344. if (XXH64_digest(&cdata.xxhash64) != resource->xxhash64) {
  345. fprintf(
  346. stderr,
  347. "checksum does not match: 0x%llxLL != 0x%llxLL\n",
  348. (unsigned long long)XXH64_digest(&cdata.xxhash64),
  349. (unsigned long long)resource->xxhash64);
  350. return EINVAL;
  351. }
  352. return 0;
  353. }
  354. /** Download a single data object. */
  355. static int curl_download_datum(CURL* curl, data_t const* data) {
  356. int ret;
  357. ret = curl_download_resource(curl, &data->data, data->type);
  358. if (ret != 0)
  359. return ret;
  360. if (data_has_dict(data)) {
  361. ret = curl_download_resource(curl, &data->dict, data_type_file);
  362. if (ret != 0)
  363. return ret;
  364. }
  365. return ret;
  366. }
  367. /** Download all the data. */
  368. static int curl_download_data(data_t const* const* data) {
  369. if (curl_global_init(CURL_GLOBAL_ALL) != 0)
  370. return EFAULT;
  371. curl_data_t cdata = {};
  372. CURL* curl = curl_easy_init();
  373. int err = EFAULT;
  374. if (curl == NULL)
  375. return EFAULT;
  376. if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0)
  377. goto out;
  378. if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0)
  379. goto out;
  380. if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0)
  381. goto out;
  382. assert(data != NULL);
  383. for (; *data != NULL; ++data) {
  384. if (curl_download_datum(curl, *data) != 0)
  385. goto out;
  386. }
  387. err = 0;
  388. out:
  389. curl_easy_cleanup(curl);
  390. curl_global_cleanup();
  391. return err;
  392. }
  393. /** Fill the path member variable of the data objects. */
  394. static int data_create_paths(data_t* const* data, char const* dir) {
  395. size_t const dirlen = strlen(dir);
  396. assert(data != NULL);
  397. for (; *data != NULL; ++data) {
  398. data_t* const datum = *data;
  399. datum->data.path = cat3(dir, "/", datum->name);
  400. if (datum->data.path == NULL)
  401. return ENOMEM;
  402. if (data_has_dict(datum)) {
  403. datum->dict.path = cat2(datum->data.path, ".dict");
  404. if (datum->dict.path == NULL)
  405. return ENOMEM;
  406. }
  407. }
  408. return 0;
  409. }
  410. /** Free the path member variable of the data objects. */
  411. static void data_free_paths(data_t* const* data) {
  412. assert(data != NULL);
  413. for (; *data != NULL; ++data) {
  414. data_t* datum = *data;
  415. free((void*)datum->data.path);
  416. free((void*)datum->dict.path);
  417. datum->data.path = NULL;
  418. datum->dict.path = NULL;
  419. }
  420. }
  /** File name of the cache stamp, stored directly inside the cache directory. */
  static const char kStampName[] = "STAMP";
  422. static void xxh_update_le(XXH64_state_t* state, uint64_t data) {
  423. if (!MEM_isLittleEndian())
  424. data = MEM_swap64(data);
  425. XXH64_update(state, &data, sizeof(data));
  426. }
  427. /** Hash the data to create the stamp. */
  428. static uint64_t stamp_hash(data_t const* const* data) {
  429. XXH64_state_t state;
  430. XXH64_reset(&state, 0);
  431. assert(data != NULL);
  432. for (; *data != NULL; ++data) {
  433. data_t const* datum = *data;
  434. /* We don't care about the URL that we fetch from. */
  435. /* The path is derived from the name. */
  436. XXH64_update(&state, datum->name, strlen(datum->name));
  437. xxh_update_le(&state, datum->data.xxhash64);
  438. xxh_update_le(&state, datum->dict.xxhash64);
  439. xxh_update_le(&state, datum->type);
  440. }
  441. return XXH64_digest(&state);
  442. }
  443. /** Check if the stamp matches the stamp in the cache directory. */
  444. static int stamp_check(char const* dir, data_t const* const* data) {
  445. char* stamp = cat3(dir, "/", kStampName);
  446. uint64_t const expected = stamp_hash(data);
  447. XXH64_canonical_t actual;
  448. FILE* stampfile = NULL;
  449. int matches = 0;
  450. if (stamp == NULL)
  451. goto out;
  452. if (!UTIL_isRegularFile(stamp)) {
  453. fprintf(stderr, "stamp does not exist: recreating the data cache\n");
  454. goto out;
  455. }
  456. stampfile = fopen(stamp, "rb");
  457. if (stampfile == NULL) {
  458. fprintf(stderr, "could not open stamp: recreating the data cache\n");
  459. goto out;
  460. }
  461. size_t b;
  462. if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) {
  463. fprintf(stderr, "invalid stamp: recreating the data cache\n");
  464. goto out;
  465. }
  466. matches = (expected == XXH64_hashFromCanonical(&actual));
  467. if (matches)
  468. fprintf(stderr, "stamp matches: reusing the cached data\n");
  469. else
  470. fprintf(stderr, "stamp does not match: recreating the data cache\n");
  471. out:
  472. free(stamp);
  473. if (stampfile != NULL)
  474. fclose(stampfile);
  475. return matches;
  476. }
  477. /** On success write a new stamp, on failure delete the old stamp. */
  478. static int
  479. stamp_write(char const* dir, data_t const* const* data, int const data_err) {
  480. char* stamp = cat3(dir, "/", kStampName);
  481. FILE* stampfile = NULL;
  482. int err = EIO;
  483. if (stamp == NULL)
  484. return ENOMEM;
  485. if (data_err != 0) {
  486. err = data_err;
  487. goto out;
  488. }
  489. XXH64_canonical_t hash;
  490. XXH64_canonicalFromHash(&hash, stamp_hash(data));
  491. stampfile = fopen(stamp, "wb");
  492. if (stampfile == NULL)
  493. goto out;
  494. if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1)
  495. goto out;
  496. err = 0;
  497. fprintf(stderr, "stamped new data cache\n");
  498. out:
  499. if (err != 0)
  500. /* Ignore errors. */
  501. unlink(stamp);
  502. free(stamp);
  503. if (stampfile != NULL)
  504. fclose(stampfile);
  505. return err;
  506. }
  507. int data_init(char const* dir) {
  508. int err;
  509. if (dir == NULL)
  510. return EINVAL;
  511. /* This must be first to simplify logic. */
  512. err = ensure_directory_exists(dir);
  513. if (err != 0)
  514. return err;
  515. /* Save the cache directory. */
  516. g_data_dir = strdup(dir);
  517. if (g_data_dir == NULL)
  518. return ENOMEM;
  519. err = data_create_paths(g_data, dir);
  520. if (err != 0)
  521. return err;
  522. /* If the stamp matches then we are good to go.
  523. * This must be called before any modifications to the data cache.
  524. * After this point, we MUST call stamp_write() to update the STAMP,
  525. * since we've updated the data cache.
  526. */
  527. if (stamp_check(dir, data))
  528. return 0;
  529. err = curl_download_data(data);
  530. if (err != 0)
  531. goto out;
  532. out:
  533. /* This must be last, since it must know if data_init() succeeded. */
  534. stamp_write(dir, data, err);
  535. return err;
  536. }
  537. void data_finish(void) {
  538. data_free_paths(g_data);
  539. free(g_data_dir);
  540. g_data_dir = NULL;
  541. }