我有一个用 django 开发的网站,其中包含大约 25000 个子网址。我需要一种方法来列出网站中的所有网址,并定期检查是否存在失效链接,因此我更希望它能作为脚本运行。
我应该采用哪种方法?有什么建议吗?
我有一个用 django 开发的网站,其中包含大约 25000 个子网址。我需要一种方法来列出网站中的所有网址,并定期检查是否存在失效链接,因此我更希望它能作为脚本运行。
我应该采用哪种方法?有什么建议吗?
这是基于 @sneawo 的出色答案改进而来的一个类。特点包括:
测试根 urlconf(settings.ROOT_URLCONF)及其所包含的 urlconf 中的所有 URL 模式;
欢迎改进。
from django import test
from django.core.urlresolvers import reverse
from django.conf import settings
import importlib
class UrlsTest(test.TestCase):
    """Smoke test that issues a GET for every named pattern in the root urlconf."""

    def test_responses(self, allowed_http_codes=None, credentials=None,
                       logout_url="", default_kwargs=None, quiet=False):
        r"""
        Test all patterns in the root urlconf and included ones.
        Do GET requests only.

        A pattern is skipped if any of the conditions applies:
            - pattern has no name in urlconf
            - pattern expects any positional parameters
            - pattern expects keyword parameters that are not specified
              in @default_kwargs

        If response code is not in @allowed_http_codes, fail the test.
        @allowed_http_codes defaults to [200, 302, 405].

        If @credentials dict is specified (e.g. username and password),
        login before running the tests.

        If @logout_url is specified, then check if we accidentally logged out
        the client while testing, and login again.

        Specify @default_kwargs to be used for patterns that expect keyword
        parameters, e.g. if you specify default_kwargs={'username': 'testuser'},
        then for pattern url(r'^accounts/(?P<username>[\.\w-]+)/$'
        the url /accounts/testuser/ will be tested.

        If @quiet=False, print all the urls checked. If status code of the
        response is not 200, print the status code.
        """
        # Resolve defaults per call instead of in the signature, so no mutable
        # object is shared between invocations.
        if allowed_http_codes is None:
            allowed_http_codes = [200, 302, 405]
        if credentials is None:
            credentials = {}
        if default_kwargs is None:
            default_kwargs = {}

        module = importlib.import_module(settings.ROOT_URLCONF)
        if credentials:
            self.client.login(**credentials)

        def check_urls(urlpatterns, prefix=''):
            """Recursively check @urlpatterns; @prefix is the accumulated namespace."""
            for pattern in urlpatterns:
                if hasattr(pattern, 'url_patterns'):
                    # this is an included urlconf
                    new_prefix = prefix
                    if pattern.namespace:
                        new_prefix = prefix + (":" if prefix else "") + pattern.namespace
                    check_urls(pattern.url_patterns, prefix=new_prefix)
                    # NOTE(review): there is deliberately no `continue` here; the
                    # include entry itself is still examined below. Presumably it
                    # has no name and ends up reported as SKIP -- confirm.
                params = {}
                skip = False
                regex = pattern.regex
                if regex.groups > 0:
                    # the url expects parameters
                    # use default_kwargs supplied
                    named_groups = set(regex.groupindex)
                    if regex.groups > len(named_groups) \
                            or named_groups - set(default_kwargs):
                        # there are positional parameters OR
                        # keyword parameters that are not supplied in default_kwargs
                        # so we skip the url
                        skip = True
                    else:
                        # every named group is covered by default_kwargs here
                        params = {key: default_kwargs[key] for key in named_groups}
                if hasattr(pattern, "name") and pattern.name:
                    name = pattern.name
                else:
                    # if pattern has no name, skip it
                    skip = True
                    name = ""
                fullname = (prefix + ":" + name) if prefix else name
                if not skip:
                    url = reverse(fullname, kwargs=params)
                    response = self.client.get(url)
                    self.assertIn(response.status_code, allowed_http_codes)
                    # print status code if it is not 200
                    status = "" if response.status_code == 200 else str(response.status_code) + " "
                    if not quiet:
                        print(status + url)
                    if url == logout_url and credentials:
                        # if we just tested logout, then login again
                        self.client.login(**credentials)
                else:
                    if not quiet:
                        print("SKIP " + regex.pattern + " " + fullname)

        check_urls(module.urlpatterns)
可以以 django-extensions 中的 show_urls 命令作为起点(参见其文档):
python manage.py show_urls
对于没有参数的简单 url,可以使用这样的测试:
from django import test
from django.core.urlresolvers import reverse
from foo.urls import urlpatterns
class UrlsTest(test.TestCase):
    """Check that each entry of ``urlpatterns`` answers a GET request with 200."""

    def test_responses(self):
        """Reverse every pattern by its name, fetch it and require HTTP 200."""
        for pattern in urlpatterns:
            # Only suitable for simple, named patterns that take no arguments.
            path = reverse(pattern.name)
            result = self.client.get(path)
            status = result.status_code
            self.assertEqual(status, 200)
如果您的页面已经上传到 Web 服务器,零编码的解决方案是使用免费的 W3C Link Checker。它会尝试访问在页面中找到的每个链接,并给出一份清晰的摘要。
在 Django 2.2.x 中,我不得不使用@sneawo 出色答案的这个稍微修改过的版本:
from django import test
from django.urls import reverse, URLPattern
from myapp.urls import urlpatterns
class MyAppUrlsTest(test.SimpleTestCase):
    """GET every named, argument-free URL pattern of the app and expect 200."""

    def test_responses(self):
        """Follow redirects and assert the final status code is 200."""
        for entry in urlpatterns:
            # For now, perform only GET requests and ignore URLs that need
            # arguments; non-URLPattern entries and unnamed patterns are
            # ignored as well.
            testable = (
                isinstance(entry, URLPattern)
                and not entry.pattern.regex.groups
                and entry.name
            )
            if testable:
                target = reverse(entry.name)
                reply = self.client.get(target, follow=True)
                self.assertEqual(reply.status_code, 200)
请注意,对于需要参数的视图,我的处理方式是直接忽略它们。对于我这个特定的、简单的用例,这也让我可以通过不在我的 urlpatterns 中为某个视图指定 name,来把它排除在测试之外。
另请参阅https://github.com/encode/django-rest-framework/pull/5500#issue-146618375。
我采用的方法与使用 reverse 的方法略有不同:实际加载站点,查找所有 "href",然后依次跟随这些链接。下面的代码会以层次结构的形式打印所有请求。目前它断言响应代码为 200(在跟随重定向之后);如果您要测试 25000 个子网址,那么只记录响应代码、事后再搜索输出可能更合适。
from django.conf import settings
from django.test.testcases import TestCase
import re
from urlparse import urlsplit, urljoin
class GenericTestCase(TestCase):
    """Crawl the site through the test client, starting at '/' and following hrefs."""

    fixtures = []

    def test_links(self):
        """Set up the href patterns and the visited set, then crawl from '/'."""
        self.p1 = re.compile(r'href="([^"]*)"')
        self.p2 = re.compile(r"href='([^']*)'")
        self.visited_urls = set()
        self.visit('/', 0)

    def visit(self, url, depth):
        """
        Fetch @url (following redirects), assert the final status is 200 and
        recurse into every link found in the response.

        @depth is the nesting level; it controls the number of leading dashes
        printed in front of the url.
        """
        # Trailing comma: Python 2 print statement without a newline, so that
        # the redirect target (or an empty string) completes the same line.
        print('-' * depth + url),
        self.visited_urls.add(url)
        response = self.client.get(url, follow=True)
        if response.redirect_chain:
            # Continue with the path of the last hop of the redirect chain.
            url = urlsplit(response.redirect_chain[-1][0]).path
            print(' => ' + url)
            if url in self.visited_urls:
                # The redirect target was already handled; nothing more to do.
                return
            self.visited_urls.add(url)
        else:
            print('')
        # assertEqual instead of the deprecated alias assertEquals.
        self.assertEqual(response.status_code, 200)
        refs = self.get_refs(response.content)
        for relative_url in refs:
            absolute_url = urljoin(url, relative_url)
            if not self.skip_url(absolute_url, relative_url):
                self.visit(absolute_url, depth + 1)

    def skip_url(self, absolute_url, relative_url):
        """Return True if the link should not be visited."""
        return absolute_url in self.visited_urls \
            or ':' in absolute_url \
            or absolute_url.startswith(settings.STATIC_URL) \
            or relative_url.startswith('#')
        # NOTE(review): the ':' test presumably filters out links carrying a
        # scheme (external http:, mailto:, ...) -- confirm.

    def get_refs(self, text):
        """Return the set of href values (double- or single-quoted) in @text."""
        urls = set()
        urls.update(self.p1.findall(text))
        urls.update(self.p2.findall(text))
        return urls