#!/usr/bin/env python
# regex automated generation
# by sandro gauci
# do not distribute yet
"""Derive a regular expression that matches every string in a sample set.

The expression is grown one atom at a time against the first sample; an atom
is kept only if the extended expression still matches all samples, and runs
of the same atom are collapsed into ``{n}`` / ``{n,m}`` quantifiers.

The code is written to run unchanged on Python 2.6+ and Python 3.
"""

import re
import string

# Literal single-character atoms, tried first and in this order.
_LITERAL_ATOMS = (
    list(string.ascii_lowercase)
    + list(string.ascii_uppercase)
    + list(string.digits)
    + list(string.whitespace)
)

# Extra atoms tried (after the literals) only when ``groups`` is true.
# Order matters: the first atom that fits every sample wins.
_GROUP_ATOMS = [r'\.', '-', r'\d', '[a-fA-F0-9]', r'\w', r'\s']


def regexbrute(teststring, initre='', groups=True):
    """Yield every one-atom extension of ``initre`` that matches ``teststring``.

    teststring -- the sample the extended expression must match (``re.match``,
                  i.e. anchored at the start only).
    initre     -- the expression built so far.
    groups     -- when true, also try ``\\.``, ``-`` and the character
                  classes in addition to literal letters, digits and
                  whitespace.
    """
    candidates = list(_LITERAL_ATOMS)
    if groups:
        candidates += _GROUP_ATOMS
    for atom in candidates:
        testre = initre + atom
        if re.match(testre, teststring) is not None:
            yield testre


def regextest(regex, strings):
    """Return True when ``regex`` matches (from the start) every string."""
    return all(re.match(regex, text) is not None for text in strings)


def findrepetition(regex, strings):
    """Append a repetition quantifier to the last atom of ``regex`` if it fits.

    Let ``minimum`` be the largest n >= 2 such that ``regex{k}`` matches all
    strings for every k in 2..n.  If there is no such n, ``regex`` is returned
    unchanged.  Otherwise higher counts are probed one by one while at least
    one string still matches; if any does, ``regex{minimum,maximum}`` is
    returned, else ``regex{minimum}``.

    An empty ``strings`` sequence returns ``regex`` unchanged (there is
    nothing to measure a repetition against).
    """
    if not strings:
        # Without samples "every string matches" is vacuously true for any
        # count, so the search below would never terminate.
        return regex

    minimum = 1
    count = 2
    while regextest('%s{%d}' % (regex, count), strings):
        minimum = count
        count += 1
    if minimum == 1:
        return regex

    # ``count`` is now the first repetition that not every sample satisfies;
    # keep going while at least one sample still does.
    maximum = 1
    while any(re.match('%s{%d}' % (regex, count), text) is not None
              for text in strings):
        maximum = count
        count += 1

    if maximum > 1:
        return '%s{%d,%d}' % (regex, minimum, maximum)
    return '%s{%d}' % (regex, minimum)


def generateregex(stuff, groups=True):
    """Build a regular expression matching every string in ``stuff``.

    stuff  -- sequence of sample strings; the first one drives the search.
    groups -- passed through to ``regexbrute``.

    Returns '' for an empty sequence.  Generation stops as soon as no
    single-atom extension matches all samples, so the result may describe
    only a common prefix of the samples.
    """
    strings = stuff
    if not strings:
        return ''
    sample = strings[0]
    currentregex = ''
    while True:
        extended = None
        # Candidates all match ``sample``; take the first that fits them all.
        for candidate in regexbrute(sample, currentregex, groups=groups):
            if regextest(candidate, strings):
                extended = candidate
                break
        if extended is None:
            break
        currentregex = findrepetition(extended, strings)
    return currentregex


def cleanact(stuff):
    """Drop entries whose leading token is not the dominant one.

    Each entry's first whitespace-separated token is scored: every entry
    carrying that token adds the number of entries that start with it.
    Tokens whose score differs from the highest score are "loosers", and
    every entry that starts with a looser token is removed.

    The list is filtered in place and the same list object is returned.
    Blank entries have no leading token; they are not scored and are left
    in the list.  An empty list is returned unchanged.
    """
    candidates = {}
    for thing in stuff:
        tokens = thing.split()
        if not tokens:
            continue
        prefix = tokens[0]
        hits = sum(1 for other in stuff if other.startswith(prefix))
        candidates[prefix] = candidates.get(prefix, 0) + hits
    if not candidates:
        return stuff

    maxmatch = max(candidates.values())
    loosers = tuple(c for c in candidates if candidates[c] != maxmatch)
    # Rebuild through slice assignment: removing items from a list while
    # iterating over it skips the element that follows each removal.
    stuff[:] = [thing for thing in stuff if not thing.startswith(loosers)]
    return stuff


def getbestmatch(stuff):
    """Clean ``stuff`` with ``cleanact`` and generate a regex without groups."""
    return generateregex(cleanact(stuff), groups=False)


if __name__ == "__main__":
    # Derive the pattern of the To-tag values generated by Asterisk.
    stuff = \
"""as79dc6a19
as71abb6a5
as1539e695
as2f5a4a5c
as737b2d15
as522cf696
as3c28c041
as18f51e5d
as30c67500
as143d9f50
as56961a2b
as4fa1e751
as79e8cfed
as0d71fedd
as4877352b
as25e96203
as316bc91f
as70f90a8b
as0e210b64
as059414ff
as76d576ba
as552be295
as71fe09e4
as629ac20c
as7e38fdd3
as45d033e9
as3a50e3fd
as5d0f1be9
as719d50e0
as1182ddf2
as6aa1839d
as1f24be2b
as162ff2b4
as2f17bf61
as1beee964
as14ecdccd
as265d1496
as26c2f484
as201a90c8
as166fb461
as7b3bdcea
as1a39fc3a
as494a8065
as72ec8ee9
as033518df
as428039ab
as24d181bc
as3079f6da
as0444e4cc
as75e5e132
as66515468
as5a3df9fd
as4b49c60a
as0cfbcf3a
as781231da
as2a59181b
as26c345af
as0fe9f831
as537ffa69
as13ae012e
as0c3f869a
as14650226
as58ab1e9b
as75c397e3""".splitlines()
    print(generateregex(stuff))