我在 GitHub(https://github.com/bonzanini/Book-SocialMediaMiningPython/blob/master/Chap02-03/twitter_cluster_users.py )上找到了这段代码,它根据用户在 Twitter 上的个人简介(bio),使用 k-means 对用户资料进行聚类。我有一个 .jsonl 文件,其中每一行的格式如下:
{"id": 66634375, "id_str": "66634375", "name": "JonathanvanderGeer", "screen_name": "jonathanvdgeer", "location": "\u00dcT: 52.079226,4.282848", "description": "politiek adviseur van vice-premier en LNV-minister Carola Schouten| gelovig|familiemens|levensgenieter|1984|hardloper|toneel|RvT-lid theaterschool Rabarber"....}
但在打印各个聚类时,我还需要知道每条个人简介对应的用户名,该怎么做?另外,我想在聚类中加入位置信息:把位置(location)作为第二列特征加入矩阵,并用这两个特征一起计算 TF-IDF。这要如何实现?
import sys
import json
from argparse import ArgumentParser
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
def get_parser():
    """Build the command-line parser for the follower-clustering script.

    Returns:
        ArgumentParser: parser exposing ``--filename``, ``--k``, ``--min-df``,
        ``--max-df``, ``--max-features``, ``--no-idf``, ``--min-ngram`` and
        ``--max-ngram``.
    """
    # The first positional parameter of ArgumentParser is ``prog`` (the
    # program name shown in the usage line), so the human-readable text has
    # to be passed explicitly as ``description``.
    parser = ArgumentParser(description="Clustering of followers")
    # Both options are used unconditionally by the script; requiring them
    # yields a clear usage error instead of a failure further down.
    parser.add_argument('--filename', required=True,
                        help='JSON Lines file with one user profile per line')
    parser.add_argument('--k', type=int, required=True,
                        help='number of clusters')
    parser.add_argument('--min-df', type=int, default=2,
                        help='min_df passed to the TF-IDF vectorizer')
    parser.add_argument('--max-df', type=float, default=0.8,
                        help='max_df passed to the TF-IDF vectorizer')
    parser.add_argument('--max-features', type=int, default=None,
                        help='max_features passed to the TF-IDF vectorizer')
    parser.add_argument('--no-idf', dest='use_idf', default=True,
                        action='store_false',
                        help='disable inverse-document-frequency weighting')
    parser.add_argument('--min-ngram', type=int, default=1,
                        help='lower bound of the n-gram range')
    parser.add_argument('--max-ngram', type=int, default=1,
                        help='upper bound of the n-gram range')
    return parser
if __name__ == '__main__':
    # Script entry point: read user bios from a JSON Lines file, vectorize
    # them with TF-IDF, cluster them with k-means and print a sample of the
    # bios assigned to each cluster.
    parser = get_parser()
    args = parser.parse_args()

    # Load one bio per profile. The file is closed as soon as loading is
    # done; nothing below needs it open.
    users = []
    with open(args.filename, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            profile = json.loads(line)
            # A missing or null description becomes an empty string so the
            # vectorizer always receives text and positions in ``users``
            # stay aligned with the cluster labels.
            users.append(profile.get('description') or '')

    vectorizer = TfidfVectorizer(max_df=args.max_df,
                                 min_df=args.min_df,
                                 max_features=args.max_features,
                                 stop_words='english',
                                 ngram_range=(args.min_ngram, args.max_ngram),
                                 use_idf=args.use_idf)
    X = vectorizer.fit_transform(users)
    print("Data dimensions: {}".format(X.shape))
    print(X)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older releases.
    get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                 or vectorizer.get_feature_names)
    for i, feature in enumerate(get_names()):
        print(i, feature)

    km = KMeans(n_clusters=args.k)
    km.fit(X)

    # Group the bios by the cluster label assigned to the same row index.
    clusters = defaultdict(list)
    for i, label in enumerate(km.labels_):
        clusters[label].append(users[i])

    # Print up to 10 user descriptions for each cluster.
    for label, descriptions in clusters.items():
        print('---------- Cluster {}'.format(label))
        for desc in descriptions[:10]:
            print(desc)