python - 使用 Django REST 框架的 Solr 搜索结果

Question

我正在使用 Django REST Framework 构建我们的 Web API，并使用 Solr 来支持搜索。目前，在 ListAPIView 的子类中，我重写了 get_queryset()，让它返回一个包含 Solr 搜索结果的 QuerySet：

class ClipList(generics.ListAPIView):
    """
    List all Clips.

    Permissions: IsAuthenticatedOrReadOnly

    Parameters:
    query -- Search for Clips. EX: clips/?query=aaron%20rodgers
    """
    model = Clip
    serializer_class = ClipSerializer
    permission_classes = (permissions.IsAuthenticatedOrReadOnly,)

    def get_queryset(self):
        """
        Return every Clip, narrowed to the Solr hits when `query` is supplied.

        Without a `query` URL parameter the unfiltered queryset is returned.
        With one, the query is sent to the Solr "/select" handler and the
        queryset is restricted to the ids found in the response.
        """
        # The request is an attribute of the view instance; a bare `request`
        # is not defined inside this method.
        query = self.request.GET.get('query', None)

        queryset = Clip.objects.all()
        if query is None:
            return queryset

        conn = solr.Solr(settings.SOLR_URL)
        sh = solr.SearchHandler(conn, "/select")
        response = sh(query)
        ids = [result['id'] for result in response.results]

        # filter the initial queryset based on the Clip identifiers from the Solr response
        # PROBLEM: This does not preserve the order of the results as it equates to
        # `SELECT * FROM clips WHERE id in (75, 89, 106, 45)`.
        # SQL is not guaranteed to return the results in the same order used in the WHERE clause.
        # There is no way (that I'm aware of) to do this in SQL.
        return queryset.filter(pk__in=ids)

但是，正如代码注释中所解释的，这样做不会保留结果的顺序。我知道可以构造一个由 Clip 对象组成的 Python 集合（set），但那样就会失去 Django QuerySet 的惰性求值，而结果集可能很大，并且需要分页。

我查看了Haystack，我的理解是上面使用 Haystack 的代码如下所示：

    def get_queryset(self):
        """
        Return the Haystack search results whose content matches the
        `query` URL parameter.
        """
        # `query` is None when the parameter is absent; it is handed to the
        # filter unchanged in either case.
        query = self.request.GET.get('query', None)
        return SearchQuerySet().filter(content=query)

这非常简单，并且能保持结果的顺序，但是 Django REST Framework 无法序列化 SearchQuerySet。

REST 框架中是否有我可以覆盖的方法允许序列化SearchQuerySets？或者有没有办法在不使用 Haystack 或 Python 集的情况下保持排名结果的顺序？

score 5 · Accepted Answer

我们想出了这个解决方案。它模仿了 Django 和 REST 框架的习语。

我先简要说明一下（希望代码和注释能给出更深入的解释 :)）。我们没有使用常规的 Django Page 来支持分页，而是使用了一个 FakePage，它可以用一个与对象列表不耦合的总数来初始化。这就是所谓的“hack”。是的，它是一个“hack”，但它是一个非常简单的解决方案。我们的替代方案是重新实现一个 QuerySet。对我们来说，简单的解决方案总是胜出。它虽然简单，但可重用且性能良好。

在 FakePage 的基础上，我们有一个抽象的 SearchListModelMixin 类，它知道如何获取使用 FakePage 的序列化器。这个 mixin 应用于具体的视图类，例如 SearchListCreateAPIView。代码如下。如果其他人有需要，我可以解释得更透彻。

class FakePage(object):
    """
    Fake page used by Django paginator.
    Required for wrapping responses from Solr.

    The page holds only the objects of the requested page, while the total
    number of hits is supplied separately by the caller instead of being
    derived from the object list.
    """

    def __init__(self, object_list, number, total_count, per_page=None):
        """
        Create fake page instance.

        Args:
            object_list: list of objects, represented by a page
            number: 1-based page number
            total_count: total count of objects (in all pages)
            per_page: optional page size. When omitted, the length of
                `object_list` is used, which is only accurate for a full
                page; pass the real page size to get correct page counts
                and indexes on a short last page.
        """
        self.object_list = object_list
        self.number = number
        # Plain attribute (not a method): callers read `page.total_count`.
        self.total_count = total_count
        if per_page is None:
            # Fall back to the number of objects actually on this page.
            per_page = len(object_list)
        self.per_page = per_page
        if self.per_page > 0:
            # Ceiling division, so a trailing partial page is counted too
            # (25 hits at 10 per page is 3 pages, not 2).
            self.num_pages = -(-total_count // self.per_page)
        else:
            self.num_pages = 0

    def __repr__(self):
        return '<Page %s of %s>' % (self.number, self.num_pages)

    def __len__(self):
        return len(self.object_list)

    def __getitem__(self, index):
        if not isinstance(index, (slice,) + six.integer_types):
            raise TypeError
        # The object_list is converted to a list so that if it was a QuerySet
        # it won't be a database hit per __getitem__.
        if not isinstance(self.object_list, list):
            self.object_list = list(self.object_list)
        return self.object_list[index]

    def has_next(self):
        """Return True when a page follows this one."""
        return self.number < self.num_pages

    def has_previous(self):
        """Return True when a page precedes this one."""
        return self.number > 1

    def has_other_pages(self):
        """Return True when this is not the only page."""
        return self.has_previous() or self.has_next()

    def next_page_number(self):
        """Return the next page number, or raise EmptyPage on the last page."""
        if self.has_next():
            return self.number + 1
        raise EmptyPage('Next page does not exist')

    def previous_page_number(self):
        """Return the previous page number, or raise EmptyPage on the first page."""
        if self.has_previous():
            return self.number - 1
        raise EmptyPage('Previous page does not exist')

    def start_index(self):
        """
        Returns the 1-based index of the first object on this page,
        relative to total objects in the paginator.
        """
        # Special case, return zero if no items.
        if self.total_count == 0:
            return 0
        return (self.per_page * (self.number - 1)) + 1

    def end_index(self):
        """
        Returns the 1-based index of the last object on this page,
        relative to total objects found (hits).
        """
        # Special case for the last page because there can be orphans.
        if self.number == self.num_pages:
            return self.total_count
        return self.number * self.per_page

class SearchType(object):
    """
    This enum-like class defines types of Solr searches
    that can be used in APIViews.
    """
    QUERY = 1
    RELATED = 2


class SearchListModelMixin(object):
    """
    List search results or a queryset.

    Concrete views must provide `get_search_results()` returning a
    `(list, int)` pair: the objects of the requested page and the total
    number of hits. `get_solr_results()` is a ready-made implementation
    backed by Solr.
    """
    # Set this attribute to make a custom URL parameter for the query.
    # EX: ../widgets?query=something
    query_param = 'query'

    # Set this attribute to define the type of search.
    # This determines the source of the query value.
    # For example, for regular text search,
    # the query comes from a URL param. However, for a related search,
    # the query is a unique identifier in the URL itself.
    # (SearchType is defined above this class because it is read here, at
    # class-definition time.)
    search_type = SearchType.QUERY

    def search_list(self, request, *args, **kwargs):
        """
        Return a paginated response of search results.

        Falls back to the regular `list()` behaviour when the search type is
        QUERY and no query parameter was sent.

        Raises:
            ImproperlyConfigured: if the view has no `get_search_results()`
                or it does not return a `(list, int)` pair.
        """
        # Get the query from the URL parameters dictionary.
        query = self.request.GET.get(self.query_param, None)

        # If there is no query use default REST Framework behavior.
        if query is None and self.search_type == SearchType.QUERY:
            return self.list(request, *args, **kwargs)

        if not hasattr(self, 'get_search_results'):
            raise ImproperlyConfigured("'%s' must define 'get_search_results()'"
                                       % self.__class__.__name__)

        # Get the page of objects and the total count of the results.
        self.object_list, total_count = self.get_search_results()
        # Either element being of the wrong type is a configuration error,
        # hence `or` rather than `and`.
        if not isinstance(self.object_list, list) or not isinstance(total_count, int):
            raise ImproperlyConfigured("'%s.get_search_results()' must return (list, int)"
                                       % self.__class__.__name__)

        # Normally, we would be serializing a QuerySet,
        # which is lazily evaluated and has the entire result set.
        # Here, we just have a Python list containing only the elements for the
        # requested page. Thus, we must generate a fake page,
        # simulating a Django page in order to fully support pagination.
        # Otherwise, the `count` field would be equal to the page size.
        page_number = int(self.request.GET.get(self.page_kwarg, 1))
        page = FakePage(self.object_list, page_number, total_count)

        # Prepare a SearchPaginationSerializer
        # with the object_serializer_class
        # set to the serializer_class of the APIView.
        serializer = self.get_search_pagination_serializer(page)

        return Response(serializer.data)

    def get_search_pagination_serializer(self, page):
        """
        Return a serializer instance to use with paginated search data.
        """
        class SerializerClass(SearchPaginationSerializer):
            class Meta:
                object_serializer_class = self.get_serializer_class()

        context = self.get_serializer_context()
        return SerializerClass(instance=page, context=context)

    def get_solr_results(self, sort=None):
        """
        This method is optional. It encapsulates logic for a Solr request
        for a list of ids using pagination. Another method could be provided to
        do the search request.

        Args:
            sort: optional Solr sort expression; omitted from the request
                when falsy.

        Returns:
            A `(objects, total)` tuple: the model instances of the requested
            page in the order Solr returned their ids, and the `numFound`
            value of the Solr response.

        Raises:
            ImproperlyConfigured: if `search_type` is not a known SearchType.
        """
        # Use the view's own queryset hook so that get_queryset() overrides
        # on the concrete view are honoured. NOTE: a get_queryset() override
        # must therefore not call get_solr_results() itself.
        queryset = self.get_queryset()

        # Pick the query and the Solr handler with respect to
        # the type of search.
        if self.search_type == SearchType.QUERY:
            query = self.request.GET.get(self.query_param, None)
            handler_path = "/select"
        elif self.search_type == SearchType.RELATED:
            query = str(self.kwargs[self.lookup_field])
            handler_path = "/morelikethis"
        else:
            raise ImproperlyConfigured(
                "'%s.search_type' must be SearchType.QUERY or SearchType.RELATED"
                % self.__class__.__name__)

        conn = solr.Solr(settings.SOLR_URL)
        sh = solr.SearchHandler(conn, handler_path)

        # Get the pagination information and populate kwargs.
        page_num = int(self.request.GET.get(self.page_kwarg, 1))
        per_page = self.get_paginate_by()
        offset = (page_num - 1) * per_page
        kwargs = {'rows': per_page, 'start': offset}
        if sort:
            kwargs['sort'] = sort

        # Perform the Solr request and build a list of results.
        # For now, this only gets the id field, but this could be
        # customized later.
        response = sh(query, 'id', **kwargs)
        results = [int(r['id']) for r in response.results]

        # Get a dictionary representing a page of objects, keyed by pk.
        object_dict = queryset.in_bulk(results)

        # Rebuild the Solr ordering; ids missing from the queryset are skipped.
        sorted_objects = [object_dict[pk] for pk in results if pk in object_dict]
        return sorted_objects, response.numFound


class SearchPaginationSerializer(pagination.BasePaginationSerializer):
    """
    Pagination serializer for pages of search results.

    `count` is read from the page's `total_count` attribute, while `next`
    and `previous` are declared with source='*'.
    """
    # Total number of hits, taken from the page's `total_count`.
    count = serializers.Field(source='total_count')
    # NOTE(review): source='*' presumably hands the whole page object to the
    # field -- confirm against the REST framework version in use.
    next = pagination.NextPageField(source='*')
    previous = pagination.PreviousPageField(source='*')


class SearchListCreateAPIView(SearchListModelMixin, generics.ListCreateAPIView):
    """
    Concrete view for listing search results, a queryset, or creating a model instance.
    """
    def get(self, request, *args, **kwargs):
        # Hand GET requests to search_list() with the arguments untouched.
        response = self.search_list(request, *args, **kwargs)
        return response


class SearchListAPIView(SearchListModelMixin, generics.ListAPIView):
    """
    Concrete view for listing search results or a queryset.
    """
    def get(self, request, *args, **kwargs):
        # Hand GET requests to search_list() with the arguments untouched.
        response = self.search_list(request, *args, **kwargs)
        return response


class WidgetList(SearchListCreateAPIView):
    """
    List all Widgets or create a new Widget.
    """
    model = Widget
    queryset = Widget.objects.all()
    serializer_class = WidgetSerializer
    permission_classes = (permissions.IsAuthenticatedOrReadOnly,)
    search_type = SearchType.QUERY  # this is default, but explicitly declared here

    # Solr sort expression handed to get_solr_results(). The visible code
    # never defined the bare name `solr_sort`, so it is declared here;
    # with None, get_solr_results() adds no sort parameter to the request.
    solr_sort = None

    def get_queryset(self):
        """
        The method implemented for database powered results.
        I'm simply illustrating the default behavior here.
        """
        queryset = super(WidgetList, self).get_queryset()
        return queryset

    def get_search_results(self):
        """
        The method implemented for search engine powered results.
        """
        return self.get_solr_results(self.solr_sort)

class RelatedList(SearchListAPIView):
    """
    List all related Widgets.
    """
    model = Widget
    queryset = Widget.objects.all()
    # Same serializer as WidgetList (the original name was misspelled).
    serializer_class = WidgetSerializer
    permission_classes = (permissions.IsAuthenticatedOrReadOnly,)
    # RELATED makes the search use the identifier from the URL kwargs
    # instead of a `query` URL parameter.
    search_type = SearchType.RELATED

    def get_search_results(self):
        """
        Return the (objects, total) pair for the related search, without a
        custom sort.
        """
        return self.get_solr_results()

python - 使用 Django REST 框架的 Solr 搜索结果

1 回答 1

Related

Reference