parsePolyglot.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. #!/usr/bin/env python
  2. from __future__ import print_function
  3. import os, re, sys
# == globals
# Debug switch read by the debug() helper inside parseYAML(): when true, the
# parser prints its trace messages to stdout. The command-line entry point
# sets it from the -d/--debug option.
printDebug = False
  6. try:
  7. unicode
  8. except NameError:
  9. unicode = str
  10. # ==
  11. class yamlValue(unicode):
  12. linenumber = None
  13. def __new__(cls, value, linenumber=None):
  14. if isinstance(value, unicode):
  15. real = unicode.__new__(cls, value)
  16. else:
  17. real = unicode.__new__(cls, value, "utf-8")
  18. if linenumber is not None:
  19. real.linenumber = int(linenumber)
  20. return real
  21. def __repr__(self):
  22. real = super(yamlValue, self).__repr__()
  23. return real.lstrip('u')
  24. def parseYAML(source):
  25. def debug(message):
  26. if printDebug and message:
  27. message = str(message).rstrip()
  28. if message:
  29. print(message)
  30. sys.stdout.flush()
  31. commentLineRegex = re.compile('^\s*#')
  32. yamlLineRegex = re.compile('^(?P<indent> *)((?P<itemMarker>- +)(?P<itemContent>.*)|((?P<key>[\w\.]+)(?P<keyExtra>: *))?(?P<content>.*))\s*$')
  33. def parseYAML_inner(source, indent):
  34. returnItem = None
  35. for linenumber, line in source:
  36. if line == '': # no newline, so EOF
  37. break
  38. debug('line %d (%d):%s' % (linenumber, indent, line))
  39. if line.strip() == '' or commentLineRegex.match(line): # empty or comment line, ignore
  40. debug('\tempty/comment line')
  41. continue
  42. # - parse line
  43. parsedLine = yamlLineRegex.match(line)
  44. if not parsedLine:
  45. raise Exception('Unparseable YAML line %d: %s' % (linenumber, line.rstrip()))
  46. lineIndent = len(parsedLine.group('indent'))
  47. lineItemMarker = parsedLine.group('itemMarker')
  48. lineKey = parsedLine.group('key') or ''
  49. lineKeyExtra = parsedLine.group('keyExtra') or ''
  50. lineContent = (parsedLine.group('content') or parsedLine.group('itemContent') or '').strip()
  51. # - handle end-of-sections
  52. if lineIndent < indent:
  53. # we have dropped out of this item, push back the line and return what we have
  54. source.send((linenumber, line))
  55. debug('\tout one level')
  56. return returnItem
  57. # - array item
  58. if lineItemMarker:
  59. debug('\tarray item')
  60. # item in an array
  61. if returnItem is None:
  62. debug('\tnew array, indent is %d' % lineIndent)
  63. returnItem = []
  64. indent = lineIndent
  65. elif not isinstance(returnItem, list):
  66. raise Exception('Bad YAML, got a list item while working on a %s on line %d: %s' % (returnItem.__class__.__name__, linenumber, line.rstrip()))
  67. indentLevel = lineIndent + len(lineItemMarker)
  68. source.send((linenumber, (' ' * (indentLevel) )+ lineContent))
  69. returnItem += [parseYAML_inner(source=source, indent=indent + 1)]
  70. # - dict item
  71. elif lineKey:
  72. debug('\tdict item')
  73. if returnItem is None:
  74. debug('\tnew dict, indent is %d' % lineIndent)
  75. # new dict
  76. returnItem = {}
  77. indent = lineIndent
  78. elif not isinstance(returnItem, dict):
  79. raise Exception('Bad YAML, got a dict value while working on a %s on line %d: %s' % (returnItem.__class__.__name__, linenumber, line.rstrip()))
  80. indentLevel = lineIndent + len(lineKey) + len(lineKeyExtra)
  81. source.send((linenumber, (' ' * indentLevel) + lineContent))
  82. returnItem[lineKey] = parseYAML_inner(source=source, indent=indent + 1)
  83. # - data - one or more lines of text
  84. else:
  85. debug('\tvalue')
  86. if returnItem is None:
  87. returnItem = yamlValue('', linenumber)
  88. if lineContent.strip() in ('|', '|-', '>'):
  89. continue # yaml multiline marker
  90. elif not isinstance(returnItem, yamlValue):
  91. raise Exception('Bad YAML, got a value while working on a %s on line %d: %s' % (returnItem.__class__.__name__, linenumber, line.rstrip()))
  92. if returnItem:
  93. returnItem = yamlValue(returnItem + "\n" + lineContent, returnItem.linenumber) # str subclasses are not fun
  94. else:
  95. returnItem = yamlValue(lineContent, linenumber)
  96. return returnItem
  97. def parseYAML_generator(source):
  98. if hasattr(source, 'capitalize'):
  99. if os.path.isfile(source):
  100. source = open(source, 'r')
  101. else:
  102. source = source.splitlines(True)
  103. elif hasattr(source, 'readlines'):
  104. pass # the for loop will already work
  105. backlines = []
  106. for linenumber, line in enumerate(source):
  107. backline = None
  108. usedLine = False
  109. while usedLine is False or backlines:
  110. if backlines:
  111. backline = yield backlines.pop()
  112. else:
  113. usedLine = True
  114. backline = yield (linenumber + 1, line)
  115. while backline: # loops returning None for every send()
  116. assert isinstance(backline, tuple)
  117. assert isinstance(backline[0], int)
  118. backlines.append(backline)
  119. backline = yield None
  120. return parseYAML_inner(parseYAML_generator(source), indent=0)
  121. if __name__ == '__main__':
  122. import optparse, pprint
  123. parser = optparse.OptionParser()
  124. parser.add_option("-d", "--debug", dest="debug", action="store_true", default=False, help="print debug information")
  125. (options, args) = parser.parse_args()
  126. printDebug = options.debug
  127. if len(args) < 1:
  128. parser.error('%s needs files to process' % os.path.basename(__file__))
  129. for filePath in args:
  130. if not os.path.isfile(filePath):
  131. sys.exit('target is not an existing file: %s' % os.path.basename(__file__))
  132. for filePath in args:
  133. print('=== %s' % filePath)
  134. pprint.pprint(parseYAML(filePath))