首先定义您的数据。在我的例子中,数据是维基百科的所有页面及其浏览量(同时剔除了特殊页面):
-- Title prefixes (each including its trailing colon) that mark pages to exclude.
WITH wiki_prefixes AS (
  SELECT [
    'File:', 'Talk:', 'Template_talk:', 'Wikipedia:',
    'Category:', 'User_talk:', 'Page:', 'Template:',
    'Category_talk:', 'User:', 'Author:', 'Portal:',
    'Wikipedia_talk:', 'Portal_talk:', 'File_talk:', 'Draft:',
    'Help:', 'Draft_talk:', 'en:', 'Book_talk:',
    'Module:', 'MOS:', 'Special:', 'Book:'
  ] AS x
)
-- Source rows minus the titles '-' and 'Main_Page', and minus every title whose
-- leading "<text>:" segment appears in wiki_prefixes.
, data AS (
  SELECT *
  FROM `fh-bigquery.wikipedia_extracts.201912_en_totals`
  WHERE title NOT IN ('-', 'Main_Page')
    AND (
      -- A title with no colon cannot carry any of the prefixes.
      title NOT LIKE '%:%'
      -- Otherwise compare the text up to and including the first colon
      -- against the prefix list.
      OR REGEXP_EXTRACT(title, '[^:]*:') NOT IN UNNEST((SELECT x FROM wiki_prefixes))
    )
)
获得要绘制的数据后,您可以使用以下查询将每一行放入 100 个或 1000 个桶之一,并计算每个桶累计占总量的百分比:
-- One output row per bucket: the bucket's own page count and view total, plus the
-- running view total and the percentage of all views accumulated up to that bucket.
SELECT
  ROUND(100 * cum_views / total_views, 3) AS cum_percent,
  cum_views,
  bucket,
  pages,
  views,
  sample_titles,
  total_views
FROM (
  SELECT
    -- bucket is unique per row after the GROUP BY below, so ordering the
    -- window by it produces a running total across buckets.
    SUM(views) OVER (ORDER BY bucket) AS cum_views,
    bucket,
    pages,
    views,
    sample_titles,
    -- Grand total of views across all buckets, repeated on every row.
    SUM(views) OVER () AS total_views
  FROM (
    SELECT
      -- Scale the row number by (row count + 1) / 1000 so the ratio stays
      -- below 1000, then shift by one to get 1-based bucket ids.
      -- NOTE(review): fhoffa.x.int is an external persistent UDF; presumably
      -- it truncates to an integer -- confirm its definition.
      1 + fhoffa.x.int(row_num / (SELECT (1 + COUNT(*)) / 1000 FROM data)) AS bucket,
      COUNT(*) AS pages,
      SUM(views) AS views,
      -- Up to three titles per bucket, highest view counts first.
      STRING_AGG(title ORDER BY views DESC LIMIT 3) AS sample_titles
    FROM (
      -- Number the pages from the fewest views to the most.
      SELECT
        title,
        views,
        ROW_NUMBER() OVER (ORDER BY views) AS row_num
      FROM data
    )
    GROUP BY bucket
  )
)
如果您想在 Data Studio 中将其可视化,请参考以下步骤: