123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307 |
- #!/usr/bin/env python
- # This file is part of exrex.
- #
- # exrex is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Affero General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # exrex is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Affero General Public License for more details.
- #
- # You should have received a copy of the GNU Affero General Public License
- # along with exrex. If not, see < http://www.gnu.org/licenses/ >.
- #
- # (C) 2012- by Adam Tauber, <[email protected]>
- try:
- from future_builtins import map, range
- except:
- pass
- from re import sre_parse
- from itertools import product, chain, tee
- from random import choice,randint
- import string
# Names exported by ``from <module> import *``; the underscore-prefixed
# helpers and the command line functions are deliberately left out.
__all__ = ('generate', 'CATEGORIES', 'count', 'parse', 'getone')
# Character pools used when expanding regex categories.  Every value is a
# list of one-character strings; 'category_any' covers code points 32..122.
CATEGORIES = {
    'category_space': sorted(sre_parse.WHITESPACE),
    'category_digit': sorted(sre_parse.DIGITS),
    'category_any': list(map(chr, range(32, 123))),
    'category_word': sorted(
        frozenset(string.ascii_letters + string.digits + "_")
    ),
}
def comb(g, i):
    """Yield every concatenation of an item of ``g`` with an item of ``i``.

    The items of ``i`` are replayed once for each item of ``g``;
    ``itertools.tee`` keeps a replayable copy, so ``i`` may be a one-shot
    iterator.
    """
    suffixes = i
    for prefix in g:
        # consume one copy now, keep the other for the next prefix
        replay, suffixes = tee(suffixes)
        for suffix in replay:
            yield prefix + suffix
def mappend(g, c):
    """Yield each item of ``g`` with ``c`` appended to it."""
    for prefix in g:
        extended = prefix + c
        yield extended
def _in(d):
    """Expand the items of a character class into a list of characters.

    :param d: items of an ``in`` node, each an ``(opcode, argument)`` pair
        with opcode ``'range'``, ``'literal'``, ``'category'`` or
        ``'negate'``; items with any other opcode are ignored
    :returns: list of one-character strings the class can produce

    A ``'negate'`` item resets the result to a copy of
    ``CATEGORIES['category_any']``; every item after it removes its
    characters from the result instead of adding them.
    """
    ret = []
    neg = False
    for item in d:
        kind = item[0]
        if kind == 'negate':
            ret = list(CATEGORIES['category_any'])
            neg = True
            continue
        if kind == 'range':
            subs = map(chr, range(item[1][0], item[1][1] + 1))
        elif kind == 'literal':
            subs = [chr(item[1])]
        elif kind == 'category':
            # unknown categories contribute a single empty string
            subs = CATEGORIES.get(item[1], [''])
        else:
            continue
        if not neg:
            ret.extend(subs)
            continue
        for char in subs:
            try:
                ret.remove(char)
            except ValueError:
                # the character is not in the pool, so nothing to exclude
                pass
    return ret
def prods(orig, ran, items):
    """Yield each prefix followed by every repetition of ``items``.

    :param orig: iterable of prefix strings
    :param ran: iterable of repetition counts
    :param items: iterable of strings to repeat
    :returns: generator of ``prefix + ''.join(combination)`` for every
        prefix, every count ``r`` in ``ran`` and every element of
        ``itertools.product(items, repeat=r)``
    """
    # Both sequences are walked once per prefix (and ``items`` once per
    # count), so one-shot iterators must be materialised up front or every
    # pass after the first would silently see them empty.
    items = tuple(items)
    ran = tuple(ran)
    for prefix in orig:
        for repeat in ran:
            for combination in product(items, repeat=repeat):
                yield prefix + ''.join(combination)
def ggen(g1, f, *args, **kwargs):
    """Combine each item of ``g1`` with the results of ``f(*args, **kwargs)``.

    ``f`` is called again for every item of ``g1``, so it may return a
    one-shot generator.

    :param g1: iterable of prefixes
    :param f: callable returning either an iterable of suffixes or an
        integer
    :returns: generator yielding ``prefix + suffix`` for every suffix, or
        the integer itself when ``f`` returned one
    """
    # Local import keeps this block self-contained.  ``Integral`` also
    # matches Python 2 ``long``, which ``isinstance(x, int)`` does not, so
    # very large integer results are no longer mistaken for an iterable.
    from numbers import Integral

    for prefix in g1:
        result = f(*args, **kwargs)
        if isinstance(result, Integral):
            yield result
        else:
            for suffix in result:
                yield prefix + suffix
def _gen(d, limit=20, count=False):
    """Expand a parsed regular expression into the strings it produces.

    :param d: parsed pattern, a sequence of ``(opcode, argument)`` items
    :param limit: maximum number of distinct repetition counts expanded
        for a repeat whose range is at least that wide
    :param count: when true, return the number of strings instead of the
        strings themselves
    :returns: an iterable of strings, or an int when ``count`` is true
    :raises Exception: when the pattern contains an unsupported opcode

    Literals and anchors do not change the running count, so a pattern
    made only of them is counted as 0.
    """
    ret = ['']
    strings = 0
    for item in d:
        op = item[0]
        if op == 'in':
            subs = _in(item[1])
            if count:
                strings = (strings or 1) * len(subs)
            ret = comb(ret, subs)
        elif op == 'literal':
            ret = mappend(ret, chr(item[1]))
        elif op == 'category':
            subs = CATEGORIES.get(item[1], [''])
            if count:
                strings = (strings or 1) * len(subs)
            ret = comb(ret, subs)
        elif op == 'any':
            subs = CATEGORIES['category_any']
            if count:
                strings = (strings or 1) * len(subs)
            ret = comb(ret, subs)
        elif op == 'max_repeat':
            low, high = item[1][0], item[1][1]
            # A real list (not a lazy filter object) is needed because it
            # is measured with len() and iterated once per repeat count.
            chars = [s for s in _gen(list(item[1][2]), limit) if s]
            if high + 1 - low >= limit:
                ran = range(low, low + limit)
            else:
                ran = range(low, high + 1)
            if count:
                # Each string built so far combines with every repetition,
                # so the totals multiply rather than add.
                repeated = sum(pow(len(chars), n) for n in ran)
                strings = (strings or 1) * repeated
            ret = prods(ret, ran, chars)
        elif op == 'branch':
            subs = list(chain.from_iterable(
                _gen(list(x), limit) for x in item[1][1]))
            if count:
                strings = (strings or 1) * (len(subs) or 1)
            ret = comb(ret, subs)
        elif op == 'subpattern':
            if count:
                inner = _gen(item[1][1], limit=limit, count=True)
                strings = (strings or 1) * (inner or 1)
            ret = ggen(ret, _gen, item[1][1], limit=limit, count=False)
        elif op == 'at':
            # ignore ^ and $
            continue
        elif op == 'not_literal':
            subs = list(CATEGORIES['category_any'])
            subs.remove(chr(item[1]))
            if count:
                strings = (strings or 1) * len(subs)
            ret = comb(ret, subs)
        elif op == 'assert':
            # assertions are skipped; they add nothing to the output
            continue
        else:
            raise Exception('[!] cannot handle expression ' + repr(item))
    if count:
        return strings
    return ret
def _randone(d, limit=20):
    """Build one random string from a parsed regular expression.

    :param d: parsed pattern, a sequence of ``(opcode, argument)`` items
    :param limit: maximum number of distinct repetition counts considered
        for a repeat whose range is at least that wide
    :returns: the generated string
    :raises Exception: when the pattern contains an unsupported opcode
    """
    ret = ''
    for item in d:
        op = item[0]
        if op == 'in':
            ret += choice(_in(item[1]))
        elif op == 'literal':
            ret += chr(item[1])
        elif op == 'category':
            ret += choice(CATEGORIES.get(item[1], ['']))
        elif op == 'any':
            ret += choice(CATEGORIES['category_any'])
        elif op == 'max_repeat':
            # choice() needs a real sequence, not a lazy filter object
            chars = [s for s in _gen(list(item[1][2]), limit) if s]
            low, high = item[1][0], item[1][1]
            if high + 1 - low >= limit:
                # randint() includes both ends, so the capped bound is
                # low + limit - 1: the same counts _gen expands, and never
                # more than the maximum the regex allows.
                high = max(low, low + limit - 1)
            # Nothing to repeat when the sub-pattern only yields empty
            # strings; choice() would raise on the empty list.
            if chars:
                for _ in range(randint(low, high)):
                    ret += choice(chars)
        elif op == 'branch':
            alternatives = list(chain.from_iterable(
                _gen(list(x), limit) for x in item[1][1]))
            ret += choice(alternatives)
        elif op == 'subpattern':
            ret += _randone(item[1][1], limit)
        elif op == 'at':
            # ignore ^ and $
            continue
        elif op == 'assert':
            # skipped, as _gen does, so both entry points accept the same
            # patterns
            continue
        elif op == 'not_literal':
            pool = list(CATEGORIES['category_any'])
            pool.remove(chr(item[1]))
            ret += choice(pool)
        else:
            raise Exception('[!] cannot handle expression "%s"' % str(item))
    return ret
def parse(s):
    """Parse a regular expression with ``sre_parse``.

    :param s: Regular expression
    :type s: str
    :rtype: list
    :returns: the items of the parsed pattern, copied into a plain list
    """
    return list(sre_parse.parse(s))
def generate(s, limit=20):
    """Return an iterable over the strings produced for a regular expression.

    :param s: Regular expression
    :type s: str
    :param limit: Range limit
    :type limit: int
    :returns: iterable of strings
    """
    pattern = parse(s)
    return _gen(pattern, limit)
def count(s, limit=20):
    """Return the number of strings produced for a regular expression.

    :param s: Regular expression
    :type s: str
    :param limit: Range limit
    :type limit: int
    :rtype: int
    :returns: number of strings
    """
    pattern = parse(s)
    return _gen(pattern, limit, count=True)
def getone(regex_string, limit=20):
    """Return one randomly built string for a regular expression.

    :param regex_string: Regular expression
    :type regex_string: str
    :param limit: Range limit
    :type limit: int
    :rtype: str
    """
    pattern = parse(regex_string)
    return _randone(pattern, limit)
def argparser(args=None):
    """Parse the command line options.

    :param args: list of argument strings to parse; ``None`` (the default)
        makes ``argparse`` read ``sys.argv[1:]``
    :type args: list or None
    :rtype: dict
    :returns: mapping with the keys ``output``, ``limit``, ``count``,
        ``random``, ``delimiter``, ``verbose`` and ``regex``
    """
    import argparse
    from sys import stdout

    argp = argparse.ArgumentParser(
        description='exrex - regular expression string generator')
    argp.add_argument(
        '-o', '--output',
        help='Output file - default is STDOUT',
        metavar='FILE',
        default=stdout,
        type=argparse.FileType('w'),
    )
    argp.add_argument(
        '-l', '--limit',
        help='Max limit for range size - default is 20',
        default=20,
        action='store',
        type=int,
        metavar='N',
    )
    argp.add_argument(
        '-c', '--count',
        help='Count matching strings',
        default=False,
        action='store_true',
    )
    argp.add_argument(
        '-r', '--random',
        help='Returns a random string that matches to the regex',
        default=False,
        action='store_true',
    )
    argp.add_argument(
        '-d', '--delimiter',
        help='Delimiter - default is \\n',
        default='\n',
    )
    argp.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Verbose mode',
        default=False,
    )
    argp.add_argument(
        'regex',
        metavar='REGEX',
        help='REGEX string',
    )
    return vars(argp.parse_args(args))
def __main__():
    """Command line entry point.

    Reads the options with ``argparser`` and writes the result to the
    selected output: the parsed pattern in verbose mode, then either the
    string count, one random string, or every generated string, each
    followed by the delimiter.  Exits with status 1 when the generator
    cannot be built.
    """
    from sys import exit, stderr
    # Example patterns:
    # 'as(d|f)qw(e|r|s)[a-zA-Z]{2,3}'
    # 'as(QWE|Z([XC]|Y|U)V){2,3}asdf'
    # '.?'
    # '.+'
    # 'asdf.{1,4}qwer{2,5}'
    # 'a(b)?(c)?(d)?'
    # 'a[b][c][d]?[e]?'
    args = argparser()
    out = args['output']
    delimiter = args['delimiter']
    regex = args['regex']
    limit = args['limit']
    if args['verbose']:
        # parse() takes only the pattern; passing limit raised TypeError
        out.write('%r%s' % (parse(regex), delimiter))
    if args['count']:
        out.write('%d%s' % (count(regex, limit=limit), delimiter))
        exit(0)
    if args['random']:
        out.write('%s%s' % (getone(regex, limit=limit), delimiter))
        exit(0)
    try:
        g = generate(regex, limit)
    except Exception as e:
        stderr.write('[!] Error:  %s\n' % (e,))
        exit(1)
    for s in g:
        try:
            out.write(s + delimiter)
        except (IOError, ValueError, KeyboardInterrupt):
            # stop quietly when the output goes away (closed pipe or file,
            # unencodable text) or the user interrupts
            break
# Run the command line interface only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    __main__()
|