我认为直接对字符串进行处理，或者只修改单个正则表达式模式，效率会更高。遗憾的是，后一种做法只能返回真或假。
无论如何，我按照要求编写了一个脚本。它目前对每个月都接受 1 到 31 日，不过要添加更多限制应该相当容易。
from re import sub, match
def match_string(input, pattern, valid_words, date_format):
    """Validate ``input`` against a ``#``-placeholder ``pattern``.

    Placeholders in ``pattern`` are runs of ``#``: ``#`` stands for a number,
    ``##`` for a word from ``valid_words`` and ``###`` for a date written in
    ``date_format``. Every other character of ``pattern`` is literal text.
    In ``date_format`` the directives ``%d``, ``%m`` and ``%y`` each stand for
    a run of digits; every other character is literal text.

    The whole input has to match the pattern. All problems found are
    collected and reported together, dates first, then words, then numbers.

    Args:
        input: The text to validate.
        pattern: Template containing ``#``, ``##`` and ``###`` placeholders.
        valid_words: Container of the words accepted for ``##``.
        date_format: Date layout built from ``%d``, ``%m``, ``%y`` and
            literal separators, e.g. ``'%m-%d-%y'``.

    Returns:
        ``True`` when everything is valid, otherwise a string with one
        ``Error: ...`` line per problem.
    """
    # Function-scope import: the file header only brings in ``sub`` and ``match``.
    import re

    def compile_template(template, token_regex, group_regex):
        # Build an anchored regex from ``template`` and return it together
        # with the tokens it replaced, in order of appearance.
        tokens = []
        parts = []
        # With a capturing group, re.split alternates literal text (even
        # indexes) with the matched tokens (odd indexes).
        for index, piece in enumerate(re.split('(%s)' % token_regex, template)):
            if index % 2:
                tokens.append(piece)
                parts.append(group_regex)
            else:
                # Literal text must not be interpreted as regex syntax.
                parts.append(re.escape(piece))
        # Anchor at the end so a trailing lazy group cannot stop after a
        # single character and so trailing garbage is rejected.
        return re.compile(''.join(parts) + r'\Z'), tokens

    # makes sure that input and pattern are compatible
    pattern_regex, placeholders = compile_template(pattern, '#{1,3}', '(.+?)')
    found = pattern_regex.match(input)
    if not found:
        return 'Error: Input doesn\'t match pattern!'

    # converts the date_format to a regex
    date_regex, date_fields = compile_template(date_format, '%[dmy]', r'(\d+)')

    # sorts the captured values by placeholder width
    dates, words, numbers = [], [], []
    by_width = {1: numbers, 2: words, 3: dates}
    for placeholder, value in zip(placeholders, found.groups()):
        by_width[len(placeholder)].append(value)

    errors = []

    # checks the dates
    for date in dates:
        parsed = date_regex.match(date)
        if not parsed:
            errors.append('Error: %s is not a valid date!' % date)
            continue
        fields = list(zip(date_fields, parsed.groups()))
        # Day problems are reported before month problems, whatever the
        # order of the directives in date_format.
        for directive, label, upper in (('%d', 'day', 31), ('%m', 'month', 12)):
            for field, value in fields:
                if field == directive and not 1 <= int(value) <= upper:
                    errors.append('Error: %s is not a valid %s!' % (value, label))

    # checks the generic words
    for word in words:
        if word.strip() not in valid_words:
            errors.append('Error: %s is not a valid word!' % word)

    # checks the numbers: the whole capture has to consist of digits
    for number in numbers:
        if not re.match(r'\d+\Z', number):
            errors.append('Error: %s is not a valid number!' % number)

    if not errors:
        return True
    return '\n'.join(errors)
# Demo runs of the regex-based match_string; a blank line separates the results.
# Calling print(...) with one argument gives the same output on Python 2 and 3,
# whereas the print statement is a syntax error on Python 3.
print(match_string('1 and 2 are numbers foo and bar are strings 12-1-2013 is a date', '# and # are numbers ## and ## are strings ### is a date', ['foo', 'bar'], '%m-%d-%y'))
print('')
print(match_string('1 is a number foo is a string 12-1-2013 is a date', '# is a number ## is a string ### is a date', ['foo'], '%m-%d-%y'))
print('')
print(match_string('foo is a number bar is a string 12-1-2013 is a date', '# is a number ## is a string ### is a date', ['foo'], '%m-%d-%y'))
print('')
print(match_string('1 is a number bar is a string 12-1-2013 is a date', '# is a number ## is a string ### is a date', ['foo'], '%m-%d-%y'))
print('')
print(match_string('1 is a number foo is a string January is a date', '# is a number ## is a string ### is a date', ['foo'], '%m-%d-%y'))
产生结果:
True
True
Error: bar is not a valid word!
Error: foo is not a valid number!
Error: bar is not a valid word!
Error: January is not a valid date!
如您所见，您的第二个示例有两个错误，而不是一个。
编辑:
我在不使用正则表达式的情况下重写了这个程序，它应该更高效。它看起来只遍历输入一次，但仍会通过 startswith() 方法重复读取某些字符。
此版本在检测到错误时立即返回。因此它只会检测每个输入的第一个错误。
def match_string(input, pattern, valid_words, date_format):
    """Validate ``input`` against ``pattern`` in one left-to-right scan.

    Placeholders in ``pattern`` are runs of ``#``: ``#`` stands for a run of
    digits, ``##`` for one of ``valid_words`` and ``###`` for a date written
    in ``date_format``. Every other character of ``pattern`` must appear
    literally in ``input``. In ``date_format`` the directives ``%y``, ``%m``
    and ``%d`` each consume a run of digits (months are limited to 1-12 and
    days to 1-31); every other character is literal text.

    Scanning stops at the first problem, so only the first error of an input
    is reported. Input and pattern must both be consumed completely; input
    that is too short or has trailing text is reported as a mismatch.

    The call and its arguments are printed before validation starts.

    Args:
        input: The text to validate.
        pattern: Template containing ``#``, ``##`` and ``###`` placeholders.
        valid_words: Iterable of the words accepted for ``##``.
        date_format: Date layout, e.g. ``'%m-%d-%y'``.

    Returns:
        ``True`` when the input is valid, otherwise a multi-line error string
        showing the offsets and remaining text of input, pattern and (for
        date errors) date format.
    """
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('\n> match_string(\'%s\', \'%s\', %s, \'%s\')' % (input, pattern, valid_words, date_format))

    digits = '0123456789'
    date_directives = ('%y', '%m', '%d')
    input_len = len(input)
    pattern_len = len(pattern)
    format_len = len(date_format)
    # Longest words first, so a word is not shadowed by a shorter valid word
    # that happens to be its prefix.
    candidates = sorted(valid_words, key=len, reverse=True)

    def skip_digits(index):
        # Return the index of the first non-digit at or after ``index``.
        while index < input_len and input[index] in digits:
            index += 1
        return index

    def error(message, input_at, pattern_at, format_at=None):
        # Build the error report; the date-format line is added for date errors only.
        text = '%s\n input %d -> "...%s"\n pattern %d -> "...%s"' % (
            message, input_at, input[input_at:], pattern_at, pattern[pattern_at:])
        if format_at is not None:
            text += '\n date format %d -> "...%s"' % (format_at, date_format[format_at:])
        return text

    input_index = 0
    pattern_index = 0
    while input_index < input_len and pattern_index < pattern_len:
        if pattern[pattern_index] != '#':
            # literal pattern character
            if input[input_index] != pattern[pattern_index]:
                return error('Error: input and pattern do not match', input_index, pattern_index)
            input_index += 1
            pattern_index += 1
            continue

        # Consume a run of at most three '#'. The bounds check keeps a
        # pattern that ends in '#' or '##' from indexing past its end.
        placeholder_at = pattern_index
        width = 0
        while width < 3 and pattern_index < pattern_len and pattern[pattern_index] == '#':
            pattern_index += 1
            width += 1

        if width == 1:
            # validate number
            if input[input_index] not in digits:
                return error('Error: expected a number', input_index, placeholder_at)
            input_index = skip_digits(input_index)
        elif width == 2:
            # validate word
            for word in candidates:
                if input.startswith(word, input_index):
                    input_index += len(word)
                    break
            else:
                return error('Error: expected a valid word', input_index, placeholder_at)
        else:
            # validate date: walk the whole date format, so a date that is
            # cut short by the end of the input is an error, not a success
            format_index = 0
            while format_index < format_len:
                directive = date_format[format_index:format_index + 2]
                if directive in date_directives:
                    # Directives are checked before literals so that the text
                    # '%m' in the input cannot stand in for a month.
                    start = input_index
                    input_index = skip_digits(start)
                    if input_index == start:
                        return error('Error: input doesn\'t match date format', start, placeholder_at, format_index)
                    number = int(input[start:input_index])
                    if directive == '%m' and not 1 <= number <= 12:
                        return error('Error: expected a month between 1 and 12', start, placeholder_at, format_index)
                    if directive == '%d' and not 1 <= number <= 31:
                        return error('Error: expected a day between 1 and 31', start, placeholder_at, format_index)
                    format_index += 2
                elif input_index < input_len and input[input_index] == date_format[format_index]:
                    # literal separator
                    input_index += 1
                    format_index += 1
                else:
                    return error('Error: input doesn\'t match date format', input_index, placeholder_at, format_index)

    # Anything left over on either side means the input was too short or too long.
    if input_index < input_len or pattern_index < pattern_len:
        return error('Error: input and pattern do not match', input_index, pattern_index)
    return True
# Demo runs of the scanning match_string, covering valid input and each error kind.
# Calling print(...) with one argument gives the same output on Python 2 and 3,
# whereas the print statement is a syntax error on Python 3.
print(match_string('1 and 2 are numbers foo and bar are strings 12-1-2013 is a date', '# and # are numbers ## and ## are strings ### is a date', ['foo', 'bar'], '%m-%d-%y'))
print(match_string('1 is a number foo is a string 12-1-2013 is a date', '# is a number ## is a string ### is a date', ['foo'], '%m-%d-%y'))
print(match_string('foo is a number bar is a string 12-1-2013 is a date', '# is a number ## is a string ### is a date', ['foo'], '%m-%d-%y'))
print(match_string('1 is a number bar is a string 12-1-2013 is a date', '# is a number ## is a string ### is a date', ['foo'], '%m-%d-%y'))
print(match_string('1 is a number foo is a string January is a date', '# is a number ## is a string ### is a date', ['foo'], '%m-%d-%y'))
print(match_string('1 and 2 are numbers foo and bar are strings 15-1-2013 is a date', '# and # are numbers ## and ## are strings ### is a date', ['foo', 'bar'], '%m-%d-%y'))
print(match_string('1 and 2 are numbers foo and bar are strings 08-42-2013 is a date', '# and # are numbers ## and ## are strings ### is a date', ['foo', 'bar'], '%m-%d-%y'))
print(match_string('1 and 2 are numbers foo and bar are strings 08;4;2013 is a date', '# and # are numbers ## and ## are strings ### is a date', ['foo', 'bar'], '%m-%d-%y'))
print(match_string('1 and 2 are numbers foo and bar are strings 08-4-2013 is a date', '# and # are numbers ## and ## are strings ### is a date', ['foo', 'bar'], '~%m-%d-%y'))
产生结果(我添加了更多测试):
> match_string('1 and 2 are numbers foo and bar are strings 12-1-2013 is a date', '# and # are numbers ## and ## are strings ### is a date', ['foo', 'bar'], '%m-%d-%y')
True
> match_string('1 is a number foo is a string 12-1-2013 is a date', '# is a number ## is a string ### is a date', ['foo'], '%m-%d-%y')
True
> match_string('foo is a number bar is a string 12-1-2013 is a date', '# is a number ## is a string ### is a date', ['foo'], '%m-%d-%y')
Error: expected a number
input 0 -> "...foo is a number bar is a string 12-1-2013 is a date"
pattern 0 -> "...# is a number ## is a string ### is a date"
> match_string('1 is a number bar is a string 12-1-2013 is a date', '# is a number ## is a string ### is a date', ['foo'], '%m-%d-%y')
Error: expected a valid word
input 14 -> "...bar is a string 12-1-2013 is a date"
pattern 14 -> "...## is a string ### is a date"
> match_string('1 is a number foo is a string January is a date', '# is a number ## is a string ### is a date', ['foo'], '%m-%d-%y')
Error: input doesn't match date format
input 30 -> "...January is a date"
pattern 29 -> "...### is a date"
date format 0 -> "...%m-%d-%y"
> match_string('1 and 2 are numbers foo and bar are strings 15-1-2013 is a date', '# and # are numbers ## and ## are strings ### is a date', ['foo', 'bar'], '%m-%d-%y')
Error: expected a month between 1 and 12
input 44 -> "...15-1-2013 is a date"
pattern 42 -> "...### is a date"
date format 0 -> "...%m-%d-%y"
> match_string('1 and 2 are numbers foo and bar are strings 08-42-2013 is a date', '# and # are numbers ## and ## are strings ### is a date', ['foo', 'bar'], '%m-%d-%y')
Error: expected a day between 1 and 31
input 47 -> "...42-2013 is a date"
pattern 42 -> "...### is a date"
date format 3 -> "...%d-%y"
> match_string('1 and 2 are numbers foo and bar are strings 08;4;2013 is a date', '# and # are numbers ## and ## are strings ### is a date', ['foo', 'bar'], '%m-%d-%y')
Error: input doesn't match date format
input 46 -> "...;4;2013 is a date"
pattern 42 -> "...### is a date"
date format 2 -> "...-%d-%y"
> match_string('1 and 2 are numbers foo and bar are strings 08-4-2013 is a date', '# and # are numbers ## and ## are strings ### is a date', ['foo', 'bar'], '~%m-%d-%y')
Error: input doesn't match date format
input 44 -> "...08-4-2013 is a date"
pattern 42 -> "...### is a date"
date format 0 -> "...~%m-%d-%y"