根据马克的回答,我用 pyparsing 写了一个小脚本,
对我来说它更清晰(还包括可选的对象键匹配):
#!/usr/bin/env python
from pyparsing import Word, alphanums, Or, Optional, Combine
# Demo grammar: recognise the four common S3 URL layouts with pyparsing and
# print the pieces (scheme, bucket, optional key) found in each sample URL.

# URL scheme prefix, exposed on the parse result as 'schema'.
schema = Or(['http://', 'https://']).setResultsName('schema')
# One or more letters, digits or hyphens.  Dots and slashes are excluded, so a
# word never runs across a host label or a path segment.
word = Word(alphanums + '-', min=1)
bucket_name = word.setResultsName('bucket')
# NOTE: no alternative below consumes a literal "s3-" before `region`, so the
# captured region is the whole host label (e.g. "s3-aws-region-name").
region = word.setResultsName('region')
# Optional object key: a single '/'-prefixed word (no nested paths or dots).
key = Optional('/' + word.setResultsName('key'))

# "bucket.s3.amazonaws.com"
opt1 = Combine(schema + bucket_name + '.s3.amazonaws.com' + key)
# "bucket.s3-aws-region.amazonaws.com"
opt2 = Combine(schema + bucket_name + '.' + region + '.amazonaws.com' + key)
# "s3.amazonaws.com/bucket"
opt3 = Combine(schema + 's3.amazonaws.com/' + bucket_name + key)
# "s3-aws-region.amazonaws.com/bucket"
opt4 = Combine(schema + region + ".amazonaws.com/" + bucket_name + key)

# Sample URLs: each of the four layouts, first without and then with a key.
tests = [
    "http://bucket-name.s3.amazonaws.com",
    "https://bucket-name.s3-aws-region-name.amazonaws.com",
    "http://s3.amazonaws.com/bucket-name",
    "https://s3-aws-region-name.amazonaws.com/bucket-name",
    "http://bucket-name.s3.amazonaws.com/key-name",
    "https://bucket-name.s3-aws-region-name.amazonaws.com/key-name",
    "http://s3.amazonaws.com/bucket-name/key-name",
    "https://s3-aws-region-name.amazonaws.com/bucket-name/key-name",
]

# Try every layout against the input; the matched text is stored as 'url'.
s3_url = Or([opt1, opt2, opt3, opt4]).setResultsName('url')

for test in tests:
    result = s3_url.parseString(test)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("found url: " + str(result.url))
    print("schema: " + str(result.schema))
    print("bucket name: " + str(result.bucket))
    print("key name: " + str(result.key))
起初我是修改了马克的脚本,让它同时提取对象键(key):
def parse_s3_url(url):
    """Split an S3 URL into its bucket name, region and object key.

    Recognised layouts (http or https, each optionally followed by "/key"):

    * http://bucket.s3.amazonaws.com
    * http://bucket.s3-aws-region.amazonaws.com
    * http://s3.amazonaws.com/bucket
    * http://s3-aws-region.amazonaws.com/bucket

    The regional forms are also accepted with a dot instead of a dash
    ("s3.aws-region"). Bucket names containing dots are not recognised in
    the host-based layouts.

    Args:
        url: The URL string to inspect.

    Returns:
        A ``(bucket, region, key)`` tuple. ``region`` is ``None`` when the
        URL names no region, ``key`` is ``None`` when there is no object
        path (or only a trailing slash). The key is everything after the
        bucket, including nested path segments; a query string, if present,
        is not stripped. ``(None, None, None)`` is returned when the URL
        matches none of the layouts.
    """
    # Virtual-hosted style: the bucket is the first host label.  Dots are
    # escaped so that only the literal amazonaws.com host matches.
    match = re.match(
        r'https?://([^./]+)\.s3(?:[.-]([^./]+))?\.amazonaws\.com(?:/(.*))?$',
        url)
    if match:
        bucket, region, key = match.groups()
        # An empty key (bare trailing slash) is reported as "no key".
        return bucket, region, key or None

    # Path style: the bucket is the first path segment after the host.
    match = re.match(
        r'https?://s3(?:[.-]([^./]+))?\.amazonaws\.com/([^/]+)(?:/(.*))?$',
        url)
    if match:
        region, bucket, key = match.groups()
        return bucket, region, key or None

    return None, None, None