我创建了一个带有重复记录的新表。我正在尝试找到对记录进行重复数据删除的最有效方法,因为这将在具有数百万条记录的表上运行。如果您使用多个 CTE 嵌套,那么您的数据结构是什么,处理是在内存中完成还是在有大量数据时写入临时表是否重要。
create or replace table t1.cte4 as
WITH t1 AS (
SELECT 1 as id,'eren' AS last_name UNION ALL
SELECT 1 as id,'yilmaz' AS last_name UNION ALL
SELECT 1 as id,'kaya' AS last_name UNION ALL
SELECT 1 as id,'kaya' AS last_name UNION ALL
SELECT 2 as id,'smith' AS last_name UNION ALL
SELECT 2 as id,'jones' AS last_name UNION ALL
SELECT 2 as id,'jones' AS last_name UNION ALL
SELECT 2 as id,'jones' AS last_name UNION ALL
SELECT 2 as id,'brown' AS last_name
)
SELECT id,ARRAY_AGG(STRUCT(last_name)) AS last_name_rec
FROM t1
GROUP BY id;
我可以按如下方式删除重复项。
QUERY 1 How to dedup the concat_struct ?
select id,
STRING_AGG( distinct ln.last_name ,'~') as concat_string,
ARRAY_AGG(STRUCT( ln.last_name )) as concat_struct
from `t1.cte4`, unnest(last_name_rec) ln
group by id;
QUERY 2 Is there a better way then this to dedup?
select distinct id,
TO_JSON_STRING(ARRAY_AGG(ln.last_name) OVER (PARTITION BY id)) json_string
from `t1.cte4`, unnest(last_name_rec) ln
group by id,
ln.last_name;
我如何将它从表中取出,而不是使用 CTE。这不会重复数据删除。
select id, ARRAY_AGG(STRUCT( ln.last_name )) as concat_struct
from t1.cte4,
unnest(last_name_rec) ln group by id;
我不能这样做。
select id, ARRAY_AGG(distinct STRUCT( ln.last_name )) as concat_struct from t1.cte4,
unnest(last_name_rec) ln group by id;