knucleotide.lua 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. -- k-nucleotide norm benchmark from benchmarks game
  2. -- https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/knucleotide.html#knucleotide
  3. --
  4. -- Original Lua code by Mike Pall:
  5. -- https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/knucleotide-lua-2.html
  6. --
  7. -- Original Ruby code:
  8. -- https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/knucleotide-mri-2.html
  9. --
  10. -- Modifications by Hugo Gualandi:
  11. -- * reindented the code
  12. -- * rewrote in more idiomatic lua
  13. --
  14. -- Expected output (N = 5500):
  15. -- 1.274224153
  16. --
  17. -- Reads the last sequence generated by the FASTA benchmark
  18. --
  19. local function read_input(f)
  20. for line in f:lines() do
  21. if string.sub(line, 1, 6) == ">THREE" then
  22. break
  23. end
  24. end
  25. local lines = {}
  26. for line in f:lines() do
  27. table.insert(lines, line)
  28. end
  29. return string.upper(table.concat(lines))
  30. end
  31. --
  32. -- Compute the frequency table for substrings of size `length`.
  33. -- The particular iteration order, using a moving reading frame, is mandatory.
  34. --
  35. local function compute_freqs(seq, length)
  36. local freqs = setmetatable({}, { __index = function() return 0 end })
  37. local n = #seq - length + 1
  38. for f = 1, length do
  39. for i = f, n, length do
  40. local s = string.sub(seq, i, i+length-1)
  41. freqs[s] = freqs[s] + 1
  42. end
  43. end
  44. return n, freqs
  45. end
  46. --
  47. -- Print the most common subsequences, in decreasing order
  48. --
  49. local function sort_by_freq(seq, length)
  50. local n, freq = compute_freqs(seq, length)
  51. local seqs = {}
  52. for k, _ in pairs(freq) do
  53. seqs[#seqs+1] = k
  54. end
  55. table.sort(seqs, function(a, b)
  56. local fa, fb = freq[a], freq[b]
  57. return (fa == fb) and (a < b) or (fa > fb)
  58. end)
  59. for _, c in ipairs(seqs) do
  60. io.write(string.format("%s %0.3f\n", c, freq[c]*100/n))
  61. end
  62. io.write("\n")
  63. end
  64. local function single_freq(seq, s)
  65. local _, freq = compute_freqs(seq, #s)
  66. io.write(freq[s], "\t", s, "\n")
  67. end
  68. return function(N)
  69. local f, err = io.open("fasta-output-"..tostring(N)..".txt", "r")
  70. if not f then error(err) end
  71. local seq = read_input(f)
  72. sort_by_freq(seq, 1)
  73. sort_by_freq(seq, 2)
  74. single_freq(seq, "GGT")
  75. single_freq(seq, "GGTA")
  76. single_freq(seq, "GGTATT")
  77. single_freq(seq, "GGTATTTTAATT")
  78. single_freq(seq, "GGTATTTTAATTTATAGT")
  79. end