token-delta.py 8.0 KB

#!/usr/bin/env python

import os
import re
import subprocess
import sys
import tempfile

###
class DeltaAlgorithm(object):
    def __init__(self):
        self.cache = set()

    def test(self, changes):
        # Subclasses must override this; return True if 'changes' is still
        # "interesting" (i.e. still reproduces the behavior being reduced).
        raise NotImplementedError

    ###
    def getTestResult(self, changes):
        # There is no reason to cache successful tests because we will
        # always reduce the changeset when we see one.

        changeset = frozenset(changes)
        if changeset in self.cache:
            return False
        elif not self.test(changes):
            self.cache.add(changeset)
            return False
        else:
            return True
    def run(self, changes, force=False):
        # Make sure the initial test passes; if not, then either (a) the
        # user doesn't expect monotonicity, and we may end up doing O(N^2)
        # tests, or (b) the test is wrong. Avoid the O(N^2) case unless the
        # user requests it.
        if not force:
            if not self.getTestResult(changes):
                raise ValueError('Initial test passed to delta fails.')

        # Check the empty set first to quickly find poor test functions.
        if self.getTestResult(set()):
            return set()
        else:
            return self.delta(changes, self.split(changes))
    def split(self, S):
        """split(set) -> [sets]

        Partition a set into one or two pieces.
        """

        # There are many ways to split; we could do a better job with more
        # context information (but then the API becomes grosser).
        L = list(S)
        mid = len(L) // 2
        if mid == 0:
            return L,
        else:
            return L[:mid], L[mid:]
    def delta(self, c, sets):
        # assert(reduce(set.union, sets, set()) == c)

        # If there is nothing left we can remove, we are done.
        if len(sets) <= 1:
            return c

        # Look for a passing subset.
        res = self.search(c, sets)
        if res is not None:
            return res

        # Otherwise, partition sets if possible; if not we are done.
        refined = sum(map(list, map(self.split, sets)), [])
        if len(refined) == len(sets):
            return c

        return self.delta(c, refined)
    def search(self, c, sets):
        for i, S in enumerate(sets):
            # If the test passes on this subset alone, recurse.
            if self.getTestResult(S):
                return self.delta(S, self.split(S))

            # Otherwise, if we have more than two sets, see if the test
            # passes without this subset.
            if len(sets) > 2:
                complement = sum(sets[:i] + sets[i+1:], [])
                if self.getTestResult(complement):
                    return self.delta(complement, sets[:i] + sets[i+1:])
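
# Illustrative only (not part of the original tool): a minimal sketch of how
# DeltaAlgorithm is meant to be subclassed. The names ListDelta and
# 'interesting' are hypothetical; the real subclass used by this script is
# TMBDDelta below.
class ListDelta(DeltaAlgorithm):
    def __init__(self, interesting):
        super(ListDelta, self).__init__()
        self.interesting = interesting

    def test(self, changes):
        # 'changes' is a candidate subset of whatever was passed to run();
        # return True if it is still "interesting".
        return self.interesting(set(changes))

# For example, ListDelta(lambda s: 3 in s and 7 in s).run(range(10)) should
# reduce toward a small set that still contains 3 and 7.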
###

class Token:
    def __init__(self, type, data, flags, file, line, column):
        self.type = type
        self.data = data
        self.flags = flags
        self.file = file
        self.line = line
        self.column = column

kTokenRE = re.compile(r"""([a-z_]+) '(.*)'\t(.*)\tLoc=<(.*):(.*):(.*)>""",
                      re.DOTALL | re.MULTILINE)
def getTokens(path):
    p = subprocess.Popen(['clang', '-dump-raw-tokens', path],
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    out, err = p.communicate()

    tokens = []
    collect = None
    for ln in err.split('\n'):
        # Silly programmers refuse to print in simple machine readable
        # formats. Whatever.
        if collect is None:
            collect = ln
        else:
            collect = collect + '\n' + ln
        if 'Loc=<' in ln and ln.endswith('>'):
            ln, collect = collect, None
            tokens.append(Token(*kTokenRE.match(ln).groups()))

    return tokens
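
# Illustrative only: a synthetic line shaped the way kTokenRE expects it (the
# exact clang -dump-raw-tokens output can vary between versions), showing which
# groups feed the Token constructor:
#
#   >>> kTokenRE.match("identifier 'foo'\t[StartOfLine]\tLoc=<t.c:1:1>").groups()
#   ('identifier', 'foo', '[StartOfLine]', 't.c', '1', '1')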
###

class TMBDDelta(DeltaAlgorithm):
    def __init__(self, testProgram, tokenLists, log):
        def patchName(name, suffix):
            base, ext = os.path.splitext(name)
            return base + '.' + suffix + ext
        super(TMBDDelta, self).__init__()
        self.testProgram = testProgram
        self.tokenLists = tokenLists
        self.tempFiles = [patchName(f, 'tmp')
                          for f, _ in self.tokenLists]
        self.targetFiles = [patchName(f, 'ok')
                            for f, _ in self.tokenLists]
        self.log = log
        self.numTests = 0
    def writeFiles(self, changes, fileNames):
        assert len(fileNames) == len(self.tokenLists)
        byFile = [[] for i in self.tokenLists]
        for i, j in changes:
            byFile[i].append(j)

        for i, (file, tokens) in enumerate(self.tokenLists):
            f = open(fileNames[i], 'w')
            for j in byFile[i]:
                f.write(tokens[j])
            f.close()

        return byFile
    def test(self, changes):
        self.numTests += 1

        byFile = self.writeFiles(changes, self.tempFiles)

        if self.log:
            print >>sys.stderr, 'TEST - ',
            if self.log > 1:
                for i, (file, _) in enumerate(self.tokenLists):
                    indices = byFile[i]
                    if i:
                        sys.stderr.write('\n ')
                    sys.stderr.write('%s:%d tokens: [' % (file, len(byFile[i])))
                    # Print the surviving token indices as compressed
                    # start:end ranges.
                    prev = None
                    for j in byFile[i]:
                        if prev is None or j != prev + 1:
                            if prev is not None:
                                sys.stderr.write('%d][' % prev)
                            sys.stderr.write(str(j))
                            sys.stderr.write(':')
                        prev = j
                    if byFile[i]:
                        sys.stderr.write(str(byFile[i][-1]))
                    sys.stderr.write('] ')
            else:
                print >>sys.stderr, ', '.join(['%s:%d tokens' % (file, len(byFile[i]))
                                               for i, (file, _) in enumerate(self.tokenLists)]),

        p = subprocess.Popen([self.testProgram] + self.tempFiles)
        res = p.wait() == 0

        if res:
            self.writeFiles(changes, self.targetFiles)

        if self.log:
            print >>sys.stderr, '=> %s' % res
        else:
            if res:
                print '\nSUCCESS (%d tokens)' % len(changes)
            else:
                sys.stderr.write('.')

        return res
    def run(self):
        res = super(TMBDDelta, self).run([(i, j)
                                          for i, (file, tokens) in enumerate(self.tokenLists)
                                          for j in range(len(tokens))])
        self.writeFiles(res, self.targetFiles)
        if not self.log:
            print >>sys.stderr
        return res
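
# Illustrative only: the <test program> handed to TMBDDelta is a separate
# executable, invoked as "<test program> foo.tmp.c bar.tmp.c ...", that must
# exit 0 exactly when the reduced files are still interesting. A hypothetical
# checker (e.g. saved as check.py and made executable) might look like:
#
#   #!/usr/bin/env python
#   import subprocess, sys
#   # "Interesting" here means clang still rejects (or crashes on) the input.
#   rc = subprocess.call(['clang', '-fsyntax-only'] + sys.argv[1:])
#   sys.exit(0 if rc != 0 else 1)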
def tokenBasedMultiDelta(program, files, log):
    # Read in the lists of tokens.
    tokenLists = [(file, [t.data for t in getTokens(file)])
                  for file in files]

    numTokens = sum([len(tokens) for _, tokens in tokenLists])
    print "Delta on %s with %d tokens." % (', '.join(files), numTokens)

    tbmd = TMBDDelta(program, tokenLists, log)

    res = tbmd.run()

    print "Finished %s with %d tokens (in %d tests)." % (', '.join(tbmd.targetFiles),
                                                          len(res),
                                                          tbmd.numTests)
def main():
    from optparse import OptionParser, OptionGroup
    parser = OptionParser("%prog <test program> {files+}")
    parser.add_option("", "--debug", dest="debugLevel",
                      help="set debug level [default %default]",
                      action="store", type=int, default=0)
    (opts, args) = parser.parse_args()

    if len(args) <= 1:
        parser.error('Invalid number of arguments.')

    program, files = args[0], args[1:]

    md = tokenBasedMultiDelta(program, files, log=opts.debugLevel)
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        print >>sys.stderr, 'Interrupted.'
        os._exit(1)  # Avoid freeing our giant cache.
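
# Example invocation (hypothetical paths):
#   token-delta.py ./check.py foo.c bar.c
# Intermediate candidates are written as foo.tmp.c / bar.tmp.c and the best
# passing reduction so far is kept in foo.ok.c / bar.ok.c; pass --debug=1 or
# --debug=2 for progressively more logging.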