0

我在 GitHub( https://github.com/bonzanini/Book-SocialMediaMiningPython/blob/master/Chap02-03/twitter_cluster_users.py )上找到了这段代码,它使用 k-means 根据用户在 Twitter 上的个人简介(bio)对用户资料进行聚类。我有一个 .jsonl 文件,其中每一行的格式如下:

 {"id": 66634375, "id_str": "66634375", "name": "JonathanvanderGeer", "screen_name": "jonathanvdgeer", "location": "\u00dcT: 52.079226,4.282848", "description": "politiek adviseur van vice-premier en LNV-minister Carola Schouten| gelovig|familiemens|levensgenieter|1984|hardloper|toneel|RvT-lid theaterschool Rabarber"....}

但在打印各个聚类时,我需要同时看到每条个人简介(bio)对应的用户名,该怎么做?另外,我还想把位置信息也纳入聚类:将位置(location)作为第二列加入矩阵,并用这两个特征一起计算 TF-IDF。这要如何实现?

import sys
import json
from argparse import ArgumentParser
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


def get_parser():
    """Build the command-line parser for the follower clustering script.

    Returns:
        ArgumentParser: parser exposing ``filename``, ``k``, ``min_df``,
        ``max_df``, ``max_features``, ``use_idf``, ``min_ngram`` and
        ``max_ngram`` on the parsed namespace.
    """
    parser = ArgumentParser("Clustering of followers")
    # Both the input file and the cluster count are mandatory: without them
    # the script would fail later with an obscure error on a None value.
    parser.add_argument('--filename', required=True,
                        help='JSONL file with one user profile per line')
    parser.add_argument('--k', type=int, required=True,
                        help='number of clusters')
    parser.add_argument('--min-df', type=int, default=2,
                        help='min_df passed to the TF-IDF vectorizer')
    parser.add_argument('--max-df', type=float, default=0.8,
                        help='max_df passed to the TF-IDF vectorizer')
    parser.add_argument('--max-features', type=int, default=None,
                        help='max_features passed to the TF-IDF vectorizer')
    parser.add_argument('--no-idf', dest='use_idf', default=True,
                        action='store_false',
                        help='disable inverse-document-frequency weighting')
    parser.add_argument('--min-ngram', type=int, default=1,
                        help='lower bound of the n-gram range')
    parser.add_argument('--max-ngram', type=int, default=1,
                        help='upper bound of the n-gram range')
    return parser


if __name__ == '__main__':
    # Cluster user bios read from a JSONL file with k-means over TF-IDF
    # features, then print a sample of the bios in each cluster.
    parser = get_parser()
    args = parser.parse_args()

    # Read one JSON profile per line. The file is only needed for this step,
    # so it is closed before the (potentially long) vectorizing/clustering.
    users = []
    with open(args.filename, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            profile = json.loads(line)
            # A missing or null description becomes an empty document so the
            # vectorizer, which expects strings, does not fail on it.
            users.append(profile.get('description') or '')

    vectorizer = TfidfVectorizer(max_df=args.max_df,
                                 min_df=args.min_df,
                                 max_features=args.max_features,
                                 stop_words='english',
                                 ngram_range=(args.min_ngram, args.max_ngram),
                                 use_idf=args.use_idf)

    X = vectorizer.fit_transform(users)
    print("Data dimensions: {}".format(X.shape))
    print(X)

    # get_feature_names() was removed in newer scikit-learn releases in
    # favour of get_feature_names_out(); use whichever is available.
    feature_names = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = vectorizer.get_feature_names
    for i, feature in enumerate(feature_names()):
        print(i, feature)

    km = KMeans(n_clusters=args.k)
    km.fit(X)

    # Group the bios by the cluster label assigned to each row of X;
    # km.labels_ is in the same order as `users`.
    clusters = defaultdict(list)
    for desc, label in zip(users, km.labels_):
        clusters[label].append(desc)

    # Print up to 10 user descriptions for each cluster.
    for label, descriptions in clusters.items():
        print('---------- Cluster {}'.format(label))
        for desc in descriptions[:10]:
            print(desc)
4

0 回答 0