
I have a list of content items, and for each item a list of the users who watched it. I want to use Python to build a chart like the one in the image below.

I know that the radius of a circle is proportional to the number of users who watched that content, and that the distance between circles is proportional to the number of shared users.

So I am interested in any approach to this problem (an algorithm or an existing package). Also, perhaps someone knows what such charts are called (a link cloud?).

[image: example of the desired chart]

Do you have any ideas?
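To make the layout requirement concrete, here is a rough sketch of the kind of thing I have in mind (the toy data, the 1 - Jaccard-overlap distance, and the use of sklearn's MDS plus matplotlib circles are only illustrative assumptions on my part, not a fixed requirement):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import MDS

# hypothetical toy data (invented for illustration): title -> set of viewer ids
views = {
    "A": {1, 2, 3, 4, 5},
    "B": {3, 4, 5, 6},
    "C": {7, 8},
}
titles = list(views)
n = len(titles)

# pairwise dissimilarity: 1 - Jaccard overlap of the two viewer sets
dist = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        a, b = views[titles[i]], views[titles[j]]
        dist[i, j] = 1 - len(a & b) / float(len(a | b))

# place the items in 2D so that plot distances follow the precomputed dissimilarities
coords = MDS(n_components=2, dissimilarity="precomputed", random_state=0).fit_transform(dist)

# one circle per item, radius proportional to the audience size (scaled down for plotting)
fig, ax = plt.subplots()
for (x, y), title in zip(coords, titles):
    ax.add_patch(plt.Circle((x, y), 0.05 * len(views[title]), alpha=0.4))
    ax.annotate(title, (x, y), ha="center")
ax.set_aspect("equal")
ax.autoscale_view()
plt.show()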


1 Answer


Thanks to everyone who replied. I think it will be useful if I describe here how I solved my problem.

Here is the code. First I cluster the data with hierarchical / k-means clustering, then I build a simple dictionary and convert it into d3-style JSON. The JSON is used together with the HTML from the example at http://bl.ocks.org/mbostock/4063530.

#coding: utf-8
import argparse
import json
import logging
import sys
import numpy
import pylab
from sklearn.cluster import KMeans
from common import init_db_connection
from numpy import array
from scipy.cluster import hierarchy

logger = logging.getLogger("build2")

CLUSTERING_TYPE_KMEANS = "KMEANS"
CLUSTERING_TYPE_HIERARCHY = "HIERARCHY"

def get_coord_names(cursor, limit):
    sql = """
        SELECT DISTINCT unnest(user_ids) AS user_id
        FROM (
            SELECT user_ids
            FROM content_watched_by_users cw
                JOIN content c ON c.id = cw.content_id
            ORDER BY array_length(user_ids, 1) DESC
            LIMIT %s
        ) AS t
        ORDER BY user_id;
    """
    logger.info(cursor.mogrify(sql, (limit,)))
    cursor.execute(sql, (limit,))

    coord_names = [x[0] for x in cursor]
    return coord_names


def get_matrix_of_observations_and_objects(cursor, coords_name, limit):
    sql = """
        SELECT c.title, user_ids
        FROM content_watched_by_users cw
            JOIN content c ON c.id = cw.content_id
        ORDER BY array_length(user_ids, 1) DESC LIMIT %s"""
    logger.info(cursor.mogrify(sql, (limit,)))
    cursor.execute(sql, (limit,))

    logger.info(u"Начинаем получать матрицу наблюдений и массив объектов")
    matrix = []
    objects = []

    for title, user_ids in cursor:
        logger.info(u"Processing %s", title)
        objects.append((title, len(user_ids)))

        # binary row over all user coordinates: 1 if this user watched the item
        row = [0] * len(coords_name)
        for user_id in user_ids:
            try:
                row[coords_name.index(user_id)] = 1
            except ValueError:
                logger.error(u"Something is wrong with user_ids %s", user_ids)

        matrix.append(row)
    logger.info(u"Матрица наблюдений и массив объектов получены")
    return array(matrix), objects


def fcluster_to_d3_dict(fcluster, objects, name_cluster=False):
    d = {"name": "", "children": []}
    for i in range(max(fcluster)):
        d["children"].append({"name": "", "children": []})

    for index, parent_id in enumerate(fcluster):
        parent = d["children"][parent_id - 1]
        parent["children"].append({"name": objects[index][0], "size": objects[index][1]})

        if name_cluster and not parent["name"]:
            parent["name"] = objects[index][0]
    return d


def code_to_d3_dict(code, objects, name_cluster=False):
    d = {"name": "", "children": []}
    for i in range(max(code) + 1):
        d["children"].append({"name": "", "children": []})

    for index, parent_id in enumerate(code):
        parent = d["children"][parent_id]
        parent["children"].append({"name": objects[index][0], "size": objects[index][1]})

        if name_cluster and not parent["name"]:
            parent["name"] = objects[index][0]
    return d


def save_to_json(result_dict, output_file="d3/flare.json"):
    logger.info(u"Dumping the result to JSON")
    with open(output_file, "w") as f:
        json.dump(result_dict, f)
    logger.info(u"JSON saved to: %s", output_file)


def hierarchy_clustering(matrix, objects, threshold, name_cluster):
    # Ward linkage on the observation matrix, then flat clusters cut at the given distance threshold
    Z = hierarchy.linkage(matrix, method='ward')
    fcluster = hierarchy.fcluster(Z, threshold, 'distance')

    # save the dendrogram as a visual sanity check
    hierarchy.dendrogram(Z)
    pylab.savefig("temp.png")

    logger.info(fcluster)
    result_dict = fcluster_to_d3_dict(fcluster, objects, name_cluster)
    return result_dict


def kmeans_clustering(matrix, objects, k, name_cluster=False):
    # the matrix is binary (0/1), so this simply inverts it before clustering
    S = 1 - (matrix / numpy.max(matrix))
    db = KMeans(n_clusters=k).fit(S)
    logger.info(db.labels_)
    result_dict = code_to_d3_dict(db.labels_, objects, name_cluster)
    return result_dict


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=u'Script that builds the data for a nice-looking graph')
    # database
    parser.add_argument('--db_host', default="localhost", type=str, dest="db_host",
                        help=u'Database host, default: localhost')
    parser.add_argument('--db_port', default="5432", type=str, dest="db_port",
                        help=u'Database port, default: 5432')
    parser.add_argument('--db_name', default="da_test", type=str, dest="db_name",
                        help=u'Database name, default: da_test')
    parser.add_argument('--db_user', default="da", type=str, dest="db_user",
                        help=u'Database user, default: da')
    # general
    parser.add_argument("--log-level", default='INFO', type=str, dest="log_level",
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], help=u"Logging level")
    parser.add_argument('-l', '--limit', required=True, type=int, dest="limit",
                        help=u'Number of content items in the sample. '
                             u'Content is sorted by the number of users who watched it')
    parser.add_argument('-o', '--output', required=True, type=str, dest="output_file_path",
                        help=u'Where to save the resulting JSON')
    parser.add_argument('-n', '--name_cluster', action="store_true", dest="name_cluster",
                        help=u'Name each cluster after its first element')
    parser.add_argument("-c", "--clustering", default=CLUSTERING_TYPE_KMEANS, type=str, dest="clustering_type",
                        choices=[CLUSTERING_TYPE_KMEANS, CLUSTERING_TYPE_HIERARCHY], help=u"Clustering type")
    # kmeans
    parser.add_argument('-k', '--max_k', type=int, dest="max_k",
                        help=u'Maximum number of clusters. Only for kmeans')
    # hierarchical
    parser.add_argument('-t', '--threshold', type=float, dest="threshold",
                        help=u'Distance threshold for cutting into flat clusters. Only for hierarchical clustering')

    args = parser.parse_args()
    logging.basicConfig(stream=sys.stdout, level=getattr(logging, args.log_level), format="%(asctime)s :: %(message)s")

    connection = init_db_connection(args.db_host, args.db_port, args.db_user, args.db_name)
    cursor = connection.cursor()
    coords_name = get_coord_names(cursor, args.limit)
    matrix, objects = get_matrix_of_observations_and_objects(cursor, coords_name, args.limit)
    connection.close()

    if args.clustering_type == CLUSTERING_TYPE_KMEANS:
        result_dict = kmeans_clustering(matrix, objects, args.max_k, args.name_cluster)
    elif args.clustering_type == CLUSTERING_TYPE_HIERARCHY:
        result_dict = hierarchy_clustering(matrix, objects, args.threshold, args.name_cluster)
    else:
        raise Exception(u"Неизвестный тип кластеризации")
    save_to_json(result_dict, args.output_file_path)
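For reference, here is a tiny run of code_to_d3_dict on made-up labels and objects (the titles and audience sizes are invented for illustration); it shows the flare-style dictionary that the script feeds to the circle-packing example at http://bl.ocks.org/mbostock/4063530:

import json

toy_labels = [0, 0, 1]                                            # cluster id per content item
toy_objects = [("Film A", 120), ("Film B", 80), ("Film C", 45)]   # (title, audience size)

print(json.dumps(code_to_d3_dict(toy_labels, toy_objects, name_cluster=True), indent=2))
# produces (abridged):
# {"name": "",
#  "children": [
#    {"name": "Film A", "children": [{"name": "Film A", "size": 120},
#                                    {"name": "Film B", "size": 80}]},
#    {"name": "Film C", "children": [{"name": "Film C", "size": 45}]}]}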

The result looks like this:

[image: k-means clustering result]
