我的数据库中有一张包含近 2000 万条记录的表,我需要通过正则表达式对它进行搜索。当记录数在 10 万左右时一切正常,但现在查询耗时很长,有时甚至会超时。由于这张表的记录数预计会增长到 200 亿以上,我是否需要迁移到 SQL 数据库(例如 PostgreSQL)、Elasticsearch 或类似的方案?
我目前使用 djongo 将 Django 连接到 MongoDB。有没有办法在保持现有这套设置不变的前提下提高查询效率,还是必须改用其他数据库才能实现快速搜索?
我的模型定义如下:
from djongo import models as model
class User(model.Model):
    """One user record; every attribute is stored as a string column.

    All fields are ``CharField``s with a non-null default, so a record
    that lacks a value holds ``''`` (or ``'unknown'`` for ``source``).
    """

    # NOTE(review): no field declares an index here; whether the backing
    # collection has indexes for the searched fields is not visible from
    # this model -- confirm before tuning the regex searches.
    email = model.CharField(default="", max_length=50)
    source = model.CharField(max_length=150, default="unknown")
    username = model.CharField(default="", max_length=150)
    # presumably a password hash and its salt -- verify against the
    # code that writes these records
    hash = model.CharField(default="", max_length=255)
    salt = model.CharField(default="", max_length=255)
    ipaddress = model.CharField(default="", max_length=50)
    lastipaddress = model.CharField(default="", max_length=50)
    name = model.CharField(default="", max_length=150)
    # Kept as free-form text rather than a date type.
    dateofbirth = model.CharField(default="", max_length=100)
    phonenumber = model.CharField(default="", max_length=100)
    firstname = model.CharField(default="", max_length=150)
    lastname = model.CharField(default="", max_length=150)
    address = model.CharField(default="", max_length=255)

    # Djongo manager: exposes the ``mongo_*`` pass-through calls
    # (e.g. ``mongo_find``) in addition to the regular ORM API.
    objects = model.DjongoManager()
当 POST 请求发送到 Django 时,会调用下面这个方法:
@api_view(['POST'])
@authentication_classes([authentication.TokenAuthentication])
@permission_classes([permissions.IsAdminUser])
def search(request):
    """Handle a POST search request issued by an admin user.

    Request body fields:
        query:    required; the search text.
        type:     field to search ('email', 'domain' or 'username').
        wildcard: 'true' / 'false' (string); defaults to 'false'.
        regex:    'true' / 'false' (string); defaults to 'false'.

    Returns:
        * HTTP 400 when ``query`` is missing.
        * The response produced by ``getRecordsFromDB`` when ``wildcard``
          or ``regex`` is 'true'.
        * An empty list response otherwise (no other search mode is
          implemented in this view).
    """
    data = request.data

    if 'query' not in data:
        # ``status`` must be passed by keyword: the first positional
        # argument of ``Response`` is the body, which would yield a
        # 200 response whose payload is the number 400.
        return Response(status=status.HTTP_400_BAD_REQUEST)

    # NOTE(review): the original code used ``type``, ``wildcard`` and
    # ``regex`` without ever assigning them (``type`` resolved to the
    # builtin, the other two raised NameError).  Reading them from the
    # request body is the assumed intent -- confirm the field names.
    search_type = data.get('type')
    wildcard = data.get('wildcard', 'false')
    regex = data.get('regex', 'false')

    # NOTE(review): ``search_type`` is later used as the MongoDB field
    # name; consider rejecting values outside the supported set.
    obj = {
        'query': data['query'],
        'type': search_type,
        'wildcard': wildcard,
        'regex': regex,
        # Needed by the paginator inside getRecordsFromDB.
        'request': request,
    }

    if wildcard == 'true' or regex == 'true':
        # Called directly: submitting to a thread pool and immediately
        # blocking on the result only added a thread hop.
        return getRecordsFromDB(obj)

    # A DRF view must return a Response, not a bare list.
    return Response([])
上述方法会调用下面这个函数来执行正则表达式查询:
def _wildcard_to_regex(query, search_type):
    """Translate a ``*`` wildcard query into an anchored regex string."""
    # Inspect the raw query: after translation a leading ``*`` has
    # become ``[``, so checking the translated text could never detect it.
    leading_wildcard = query.startswith('*')
    trailing_wildcard = query.endswith('*')

    # Escape everything, then turn each escaped ``*`` into the wildcard class.
    pattern = re.escape(query).replace(r'\*', r'[a-zA-Z0-9-_.]*')

    # Domain searches get their own prefix later, so they are not
    # anchored at the start here.
    if not leading_wildcard and search_type != 'domain':
        pattern = r'^' + pattern
    if len(pattern) > 1 and not trailing_wildcard:
        pattern = pattern + r'$'
    return pattern


def _regex_search_response(field, pattern, request, max_limit):
    """Run a ``$regex`` query on ``field`` and return a paginated response."""
    mongo_filter = {field: {'$regex': pattern}}
    try:
        paginator = LimitOffsetPagination()
        match_count = User.objects.mongo_find(mongo_filter).count()
        if match_count > max_limit:
            return Response(status=status.HTTP_507_INSUFFICIENT_STORAGE)
        cursor = User.objects.mongo_find(mongo_filter)
        page = paginator.paginate_queryset(cursor, request)
        serializer = UserSerializer(page, many=True)
        return paginator.get_paginated_response(serializer.data)
    except Exception as err:
        # Boundary handler: report the failure and answer with 422
        # instead of letting the view crash.
        print(f'Other error occurred: {err}')
        return Response(status=status.HTTP_422_UNPROCESSABLE_ENTITY)


def getRecordsFromDB(obj):
    """Execute a regex or wildcard search against the ``User`` collection.

    Args:
        obj: dict with the keys
            'query'    -- the search text (a regex, or a ``*`` wildcard
                          expression),
            'type'     -- field to search; 'domain' is rewritten into a
                          search on 'email',
            'wildcard' -- 'true' / 'false' (string),
            'regex'    -- 'true' / 'false' (string),
            'request'  -- the incoming request, used for pagination.
            The dict is not modified.

    Returns:
        A DRF ``Response``:
        * the paginated, serialized matches on success;
        * HTTP 507 when more than 10000 records match;
        * HTTP 422 when the query or pagination raises;
        * HTTP 400 when the flags are not exactly one of
          regex-only / wildcard-only (previously this path raised
          NameError on an undefined ``records``).
    """
    max_limit = 10000
    search_type = obj['type']
    query = obj['query']
    wildcard = obj['wildcard']
    regex = obj['regex']

    if wildcard == "false" and regex == "true":
        pattern = query
    elif wildcard == "true" and regex == "false":
        pattern = _wildcard_to_regex(query, search_type)
    else:
        return Response(status=status.HTTP_400_BAD_REQUEST)

    field = search_type
    if search_type == 'domain':
        # A domain is matched as the part of the email after the '@'.
        pattern = r'.+@{1}' + pattern
        field = 'email'

    # ``.get`` keeps a missing request inside the helper's error
    # handling (422) rather than raising KeyError here.
    return _regex_search_response(field, pattern, obj.get('request'), max_limit)