fastaredux.cpp 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. #pragma warning(disable:4838)
  2. #pragma warning(disable:4305)
  3. #define _GNU_SOURCE
  4. #include <stdlib.h>
  5. #include <stdio.h>
  6. #include <string.h>
  7. #include <assert.h>
  8. #define MIN(x, y) ((x < y) ? x : y)
  9. #define LINELEN 60
  10. #define SLOTS 4095
  11. struct aminoacid {
  12. char c;
  13. float p;
  14. };
  15. static struct aminoacid *lu[SLOTS + 1];
  16. static void repeat_fasta(const char *alu, size_t n)
  17. {
  18. const size_t alulen = strlen(alu);
  19. assert(alulen < 1024);
  20. char buf[1024 + LINELEN];
  21. size_t pos = 0, bytes;
  22. memcpy(buf, alu, alulen);
  23. memcpy(buf + alulen, alu, LINELEN);
  24. while (n) {
  25. bytes = MIN(LINELEN, n);
  26. //fwrite_unlocked(buf + pos, bytes, 1, stdout);
  27. //putchar_unlocked('\n');
  28. pos += bytes;
  29. if (pos > alulen)
  30. pos -= alulen;
  31. n -= bytes;
  32. }
  33. }
  34. static void acc_probs(struct aminoacid *table)
  35. {
  36. struct aminoacid *iter = table;
  37. while ((++iter)->c) {
  38. iter->p += (iter - 1)->p;
  39. }
  40. for (int i = 0; i <= SLOTS; ++i) {
  41. while (i > (table->p * SLOTS))
  42. ++table;
  43. lu[i] = table;
  44. }
  45. }
  46. static float rng(float max)
  47. {
  48. const unsigned int IM = 139968, IA = 3877, IC = 29573;
  49. static unsigned int seed = 42;
  50. seed = (seed * IA + IC) % IM;
  51. return max * seed / IM;
  52. }
  53. static char nextc()
  54. {
  55. float r;
  56. struct aminoacid *iter;
  57. r = rng(1.0f);
  58. iter = lu[(int) (r * SLOTS)];
  59. while (iter->p < r)
  60. ++iter;
  61. return iter->c;
  62. }
  63. static void random_fasta(struct aminoacid *table, size_t n)
  64. {
  65. size_t i, lines = n / LINELEN;
  66. const size_t chars_left = n % LINELEN;
  67. char buf[LINELEN + 1];
  68. while (lines--) {
  69. for (i = 0; i < LINELEN; ++i) {
  70. buf[i] = nextc();
  71. }
  72. buf[i] = '\n';
  73. //fwrite_unlocked(buf, i + 1, 1, stdout);
  74. }
  75. for (i = 0; i < chars_left; ++i)
  76. buf[i] = nextc();
  77. //fwrite_unlocked(buf, i, 1, stdout);
  78. }
  79. void FastaRedux(int n)
  80. {
  81. //const size_t n = (argc > 1) ? atoi(argv[1]) : 1000;
  82. const char alu [] =
  83. "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTG"
  84. "GGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGA"
  85. "GACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAA"
  86. "AATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAAT"
  87. "CCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAAC"
  88. "CCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTG"
  89. "CACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA";
  90. struct aminoacid iub [] = {
  91. { 'a', 0.27 },
  92. { 'c', 0.12 },
  93. { 'g', 0.12 },
  94. { 't', 0.27 },
  95. { 'B', 0.02 },
  96. { 'D', 0.02 },
  97. { 'H', 0.02 },
  98. { 'K', 0.02 },
  99. { 'M', 0.02 },
  100. { 'N', 0.02 },
  101. { 'R', 0.02 },
  102. { 'S', 0.02 },
  103. { 'V', 0.02 },
  104. { 'W', 0.02 },
  105. { 'Y', 0.02 },
  106. { 0, 0 }
  107. };
  108. struct aminoacid homosapiens [] = {
  109. { 'a', 0.3029549426680 },
  110. { 'c', 0.1979883004921 },
  111. { 'g', 0.1975473066391 },
  112. { 't', 0.3015094502008 },
  113. { 0, 0 }
  114. };
  115. //fputs_unlocked(">ONE Homo sapiens alu\n", stdout);
  116. repeat_fasta(alu, n * 2);
  117. //fputs_unlocked(">TWO IUB ambiguity codes\n", stdout);
  118. acc_probs(iub);
  119. random_fasta(iub, n * 3);
  120. //fputs_unlocked(">THREE Homo sapiens frequency\n", stdout);
  121. acc_probs(homosapiens);
  122. random_fasta(homosapiens, n * 5);
  123. //putchar_unlocked('\n');
  124. }