0

When the set of illegal characters is made up of many separate ranges and individual code points, what is the most efficient way to check strings against that set?

I timed two methods, and one is surprisingly much slower than the other (see the code below, assuming my timing is sound). Can the search-pattern method below be improved on? The solution does not have to be restricted to regexes.

import re
import timeit

# Match pattern: a negated character class repeated up to the end of the
# string, so a successful match means no illegal character was encountered.
_matchParts = (
    r'[^',
    r'\u0000-\u0008',   # C0 controls below TAB
    r'\u000B\u000C',    # VT and FF; TAB U+0009, LF U+000A and CR U+000D are left out, i.e. legal
    r'\u000E-\u001F',   # remaining C0 controls
    r'\u007F',          # DEL
    r'\u0080-\u009F',   # the whole C1 block
    r'\u2028\u2029',    # LS and PS Unicode newlines
    r'\uD800-\uDFFF',   # surrogate block
    r'\uFFFE\uFFFF',    # non-characters
    # BOM. NOTE: the stated intent is to allow a BOM at the start of the
    # stream, but as written the class rejects U+FEFF at every position.
    r'\uFEFF]*$',
)
matchPat = re.compile(''.join(_matchParts))

# Search pattern: a character class listing every illegal code point, so a
# successful search means the string contains at least one illegal character.
_searchParts = (
    r'[',
    r'\u0000-\u0008',   # C0 controls below TAB
    r'\u000B\u000C',    # VT and FF; TAB U+0009, LF U+000A and CR U+000D are left out, i.e. legal
    r'\u000E-\u001F',   # remaining C0 controls
    r'\u007F',          # DEL
    r'\u0080-\u009F',   # the whole C1 block
    r'\u2028\u2029',    # LS and PS Unicode newlines
    r'\uD800-\uDFFF',   # surrogate block
    r'\uFFFE\uFFFF',    # non-characters
    # BOM. NOTE: the stated intent is to allow a BOM at the start of the
    # stream, but as written the class flags U+FEFF at every position.
    r'\uFEFF]',
)
searchPat = re.compile(''.join(_searchParts))

# Sample input containing only legal characters (ASCII letters, digits,
# spaces, commas and hyphens); it is the string both timed functions check.
s = 'allow TAB 0009, LF 000A, and CR 000D -- only allowed at the start of the stream'

def fmatch(s):
    """Report whether ``s`` is free of illegal characters, using ``matchPat``.

    Args:
        s: The string to validate.

    Returns:
        True if ``matchPat`` matches ``s`` from its start, False otherwise.
    """
    # Return the verdict instead of storing it in a throwaway local that was
    # never read (and was left unbound when the match failed).
    return matchPat.match(s) is not None

def fsearch(s):
    """Report whether ``s`` is free of illegal characters, using ``searchPat``.

    Args:
        s: The string to validate.

    Returns:
        True if ``searchPat`` finds no illegal character anywhere in ``s``,
        False as soon as one is found.
    """
    # Return the verdict instead of storing it in a throwaway local that was
    # never read (and was left unbound when nothing illegal was found).
    return searchPat.search(s) is None

# Benchmark: run each validator one million times on the sample string and
# print the labelled total elapsed time reported by timeit after each run.
callCount = 1000000

matchSeconds = timeit.timeit("fmatch(s)", setup="from __main__ import fmatch,s", number=callCount)
print('fmatch==', matchSeconds)

searchSeconds = timeit.timeit("fsearch(s)", setup="from __main__ import fsearch,s", number=callCount)
print('fsearch==', searchSeconds)


$ python3 valid.py
fmatch== 5.631323281995719
fsearch== 1.320517893997021
4

0 回答 0