fasta.lua 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. -- Fasta benchmark from benchmarks game
  2. -- https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/fasta.html
  3. --
  4. -- Translated from the Java and Python versions found at
  5. -- https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/fasta-java-2.html
  6. -- https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/fasta-python3-1.html
  7. --
  8. -- This differs from the Lua versions of the benchmark in some aspects
  9. -- * Don't use load/eval metaprogramming
  10. -- * Repeat fasta now works correctly when #alu < 60
  11. -- * Use linear search (actually faster than binary search in the tests)
  12. -- * Use // integer division
  13. local IM = 139968
  14. local IA = 3877
  15. local IC = 29573
  16. local seed = 42
  17. local function random(max)
  18. seed = (seed * IA + IC) % IM
  19. return max * seed / IM;
  20. end
  21. local WIDTH = 60
  22. local function print_fasta_header(id, desc)
  23. io.write(">" .. id .. " " .. desc .. "\n")
  24. end
  25. local function repeat_fasta(id, desc, alu, n)
  26. print_fasta_header(id, desc)
  27. local alusize = #alu
  28. local aluwrap = alu .. alu
  29. while #aluwrap < alusize + WIDTH do
  30. aluwrap = aluwrap .. alu
  31. end
  32. local lines = n // WIDTH
  33. local last_line = n % WIDTH
  34. local start = 0 -- (This index is 0-based bacause of the % operator)
  35. for _ = 1, lines do
  36. local stop = start + WIDTH
  37. io.write(string.sub(aluwrap, start+1, stop))
  38. io.write("\n")
  39. start = stop % alusize
  40. end
  41. if last_line > 0 then
  42. io.write(string.sub(aluwrap, start+1, start + last_line))
  43. io.write("\n")
  44. end
  45. end
  46. local function linear_search(ps, p)
  47. for i = 1, #ps do
  48. if ps[i]>= p then
  49. return i
  50. end
  51. end
  52. return 1
  53. end
  54. local function random_fasta(id, desc, frequencies, n)
  55. print_fasta_header(id, desc)
  56. -- Prepare the cummulative probability table
  57. local nitems = #frequencies
  58. local letters = {}
  59. local probs = {}
  60. do
  61. local total = 0.0
  62. for i = 1, nitems do
  63. local o = frequencies[i]
  64. local c = o[1]
  65. local p = o[2]
  66. total = total + p
  67. letters[i] = c
  68. probs[i] = total
  69. end
  70. probs[nitems] = 1.0
  71. end
  72. -- Generate the output
  73. local col = 0
  74. for _ = 1, n do
  75. local ix = linear_search(probs, random(1.0))
  76. local c = letters[ix]
  77. io.write(c)
  78. col = col + 1
  79. if col >= WIDTH then
  80. io.write("\n")
  81. col = 0
  82. end
  83. end
  84. if col > 0 then
  85. io.write("\n")
  86. end
  87. end
  88. local HUMAN_ALU =
  89. "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG" ..
  90. "GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA" ..
  91. "CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT" ..
  92. "ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA" ..
  93. "GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG" ..
  94. "AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC" ..
  95. "AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"
  96. local IUB = {
  97. { 'a', 0.27 },
  98. { 'c', 0.12 },
  99. { 'g', 0.12 },
  100. { 't', 0.27 },
  101. { 'B', 0.02 },
  102. { 'D', 0.02 },
  103. { 'H', 0.02 },
  104. { 'K', 0.02 },
  105. { 'M', 0.02 },
  106. { 'N', 0.02 },
  107. { 'R', 0.02 },
  108. { 'S', 0.02 },
  109. { 'V', 0.02 },
  110. { 'W', 0.02 },
  111. { 'Y', 0.02 },
  112. }
  113. local HOMO_SAPIENS = {
  114. { 'a', 0.3029549426680 },
  115. { 'c', 0.1979883004921 },
  116. { 'g', 0.1975473066391 },
  117. { 't', 0.3015094502008 },
  118. }
  119. return function(N)
  120. N = N or 100
  121. repeat_fasta("ONE", "Homo sapiens alu", HUMAN_ALU, N*2)
  122. random_fasta('TWO', 'IUB ambiguity codes', IUB, N*3)
  123. random_fasta('THREE', 'Homo sapiens frequency', HOMO_SAPIENS, N*5)
  124. end