12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- -- k-nucleotide norm benchmark from benchmarks game
- -- https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/knucleotide.html#knucleotide
- --
- -- Original Lua code by Mike Pall:
- -- https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/knucleotide-lua-2.html
- --
- -- Original Ruby code:
- -- https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/knucleotide-mri-2.html
- --
- -- Modifications by Hugo Gualandi:
- -- * reindented the code
- -- * rewrote in more idiomatic lua
- --
- -- Expected output (N = 5500):
- -- 1.274224153
- --
- -- Reads the last sequence generated by the FASTA benchmark
- --
- local function read_input(f)
- for line in f:lines() do
- if string.sub(line, 1, 6) == ">THREE" then
- break
- end
- end
- local lines = {}
- for line in f:lines() do
- table.insert(lines, line)
- end
- return string.upper(table.concat(lines))
- end
- --
- -- Compute the frequency table for substrings of size `length`.
- -- The particular iteration order, using a moving reading frame, is mandatory.
- --
- local function compute_freqs(seq, length)
- local freqs = setmetatable({}, { __index = function() return 0 end })
- local n = #seq - length + 1
- for f = 1, length do
- for i = f, n, length do
- local s = string.sub(seq, i, i+length-1)
- freqs[s] = freqs[s] + 1
- end
- end
- return n, freqs
- end
- --
- -- Print the most common subsequences, in decreasing order
- --
- local function sort_by_freq(seq, length)
- local n, freq = compute_freqs(seq, length)
- local seqs = {}
- for k, _ in pairs(freq) do
- seqs[#seqs+1] = k
- end
- table.sort(seqs, function(a, b)
- local fa, fb = freq[a], freq[b]
- return (fa == fb) and (a < b) or (fa > fb)
- end)
- for _, c in ipairs(seqs) do
- io.write(string.format("%s %0.3f\n", c, freq[c]*100/n))
- end
- io.write("\n")
- end
- local function single_freq(seq, s)
- local _, freq = compute_freqs(seq, #s)
- io.write(freq[s], "\t", s, "\n")
- end
- return function(N)
- local f, err = io.open("fasta-output-"..tostring(N)..".txt", "r")
- if not f then error(err) end
- local seq = read_input(f)
- sort_by_freq(seq, 1)
- sort_by_freq(seq, 2)
- single_freq(seq, "GGT")
- single_freq(seq, "GGTA")
- single_freq(seq, "GGTATT")
- single_freq(seq, "GGTATTTTAATT")
- single_freq(seq, "GGTATTTTAATTTATAGT")
- end
|