sql - 有没有办法使用高级 BigQuery 功能进行这种类似 Wordle 的 SQL 查询？

Question

我正在尝试使用 BigQuery 使用类似于 Wordle 的规则进行一些单词/字母分析。

规则大家都懂：

🟨 字母存在，但位置不对
🟩 字母位置正确
⬜ 字母不存在。

我有一个有效的查询，但出于三个原因我并不十分满意：

它看起来不太有 BigQuery 的风格
对于大量单词，查询不能很好地扩展
如果要扩展到（比如说）7 个字母的单词，查询会变得非常、非常、非常难看且混乱——任何贬义的形容词都可以轻松用来形容它

我这样做的方式模糊地让我想起了递归 CTE，但遗憾的是，这些对我来说完全难以捉摸......

让它更快并不是真正的问题，但我想知道是否有更高级/更优雅（暂时找不到更好的说法）的方式来得到相同的结果，例如递归 CTE 或 TVF。我猜使用 JavaScript UDF 也是一种选择，但我想避开它们，转而探索纯 SQL 的替代方案。

这是丑陋的查询：

-- Wordle-style scoring of every word against every other word (BigQuery
-- Standard SQL). For each (guess, solution) pair it reports:
--   correct : letters in the right position            (marked 🟩 in w1)
--   present : letters found elsewhere in the solution  (marked 🟨 in w1)
--   absent  : remaining letters                        (shown as ⬜ in wordle)
-- w1 is the guess with scored letters replaced by their marker; w2 is the
-- solution with every already-matched letter replaced by ❌ so that a solution
-- letter can be matched at most once (this handles repeated guess letters).
-- NOTE: the check_first .. check_fifth steps hard-code 5-letter words.
with words as (
    select normalized_word as word, 'solution' as word_type
    from unnest(["SCAPE","PETER","SMACK","MAMMA"]) as normalized_word
),
-- One row per (word, letter). The position comes from the array offset so it
-- is deterministic; a row_number() without an order by is not.
split_word as (
    select word, letter, word_type, letter_offset + 1 as pos
    from words, unnest(split(word, '')) as letter with offset as letter_offset
),
-- Greens: compare guess (a) and solution (b) position by position.
-- Each marker is a single character, so w1 and w2 keep the word's length and
-- the positional substring/left/right calls below stay aligned.
check_correct as (
    select
        a.word as word1,
        any_value(a.word_type) as word_type,
        b.word as word2,
        string_agg(if(a.letter = b.letter, '🟩', a.letter), '' order by a.pos) as w1,
        string_agg(if(a.letter = b.letter, '❌', b.letter), '' order by a.pos) as w2,
        countif(a.letter = b.letter) as c,
        0 as p
    from split_word as a
    inner join split_word as b
        on a.pos = b.pos
    where a.word <> b.word
      and b.word_type = 'solution'
    group by a.word, b.word
),
-- Yellows, one guess position per step: if the k-th character of w1 still
-- occurs in w2, count it as present, mark it 🟨 in w1 and consume its first
-- occurrence in w2 by overwriting it with ❌. A 🟩 at position k never matches
-- because w2 contains only letters and ❌.
check_first as (
    select word1, word_type, word2, c,
    if(instr(w2, substring(w1, 1, 1)) > 0, p + 1, p) as p,
    if(instr(w2, substring(w1, 1, 1)) > 0, concat('🟨', right(w1, 4)), w1) as w1,
    if(instr(w2, substring(w1, 1, 1)) > 0,
      concat(
        left(w2, instr(w2, substring(w1, 1, 1)) - 1),
        '❌',
        right(w2, length(w2) - instr(w2, substring(w1, 1, 1)))),
      w2) as w2
    from check_correct
),
check_second as (
    select word1, word_type, word2, c,
    if(instr(w2, substring(w1, 2, 1)) > 0, p + 1, p) as p,
    if(instr(w2, substring(w1, 2, 1)) > 0, concat(left(w1, 1), '🟨', right(w1, 3)), w1) as w1,
    if(instr(w2, substring(w1, 2, 1)) > 0,
      concat(
        left(w2, instr(w2, substring(w1, 2, 1)) - 1),
        '❌',
        right(w2, length(w2) - instr(w2, substring(w1, 2, 1)))),
      w2) as w2
    from check_first
),
check_third as (
    select word1, word_type, word2, c,
    if(instr(w2, substring(w1, 3, 1)) > 0, p + 1, p) as p,
    if(instr(w2, substring(w1, 3, 1)) > 0, concat(left(w1, 2), '🟨', right(w1, 2)), w1) as w1,
    if(instr(w2, substring(w1, 3, 1)) > 0,
      concat(
        left(w2, instr(w2, substring(w1, 3, 1)) - 1),
        '❌',
        right(w2, length(w2) - instr(w2, substring(w1, 3, 1)))),
      w2) as w2
    from check_second
),
check_fourth as (
    select word1, word_type, word2, c,
    if(instr(w2, substring(w1, 4, 1)) > 0, p + 1, p) as p,
    if(instr(w2, substring(w1, 4, 1)) > 0, concat(left(w1, 3), '🟨', right(w1, 1)), w1) as w1,
    if(instr(w2, substring(w1, 4, 1)) > 0,
      concat(
        left(w2, instr(w2, substring(w1, 4, 1)) - 1),
        '❌',
        right(w2, length(w2) - instr(w2, substring(w1, 4, 1)))),
      w2) as w2
    from check_third
),
check_fifth as (
    select word1, word_type, word2, c,
    if(instr(w2, substring(w1, 5, 1)) > 0, p + 1, p) as p,
    if(instr(w2, substring(w1, 5, 1)) > 0, concat(left(w1, 4), '🟨'), w1) as w1,
    -- w2 is not needed after the last position; it is updated for completeness
    if(instr(w2, substring(w1, 5, 1)) > 0,
      concat(
        left(w2, instr(w2, substring(w1, 5, 1)) - 1),
        '❌',
        right(w2, length(w2) - instr(w2, substring(w1, 5, 1)))),
      w2) as w2
    from check_fourth
),
-- Whatever is still an upper-case letter in w1 was neither green nor yellow.
final_result as (
    select
        word1 as guess,
        word_type as guess_type,
        word2 as solution,
        c as correct,
        p as present,
        length(w1) - c - p as absent,
        regexp_replace(w1, r'([A-Z])', '⬜') as wordle,
        w1,
        w2
    from check_fifth
)

select guess, guess_type, solution, correct, present, absent, wordle, w1, w2
from final_result
order by guess, solution;

结果如下所示：

注意一些有趣的边缘情况：第 2、3、5 行的猜测词中有重复的字母。我这个丑陋的查询通过在解决方案中“消耗”已匹配的字母（用 ❌ 替换它们）来避免再次匹配，从而处理这种情况。这就是 w2 的唯一目的：避免同一个字母被匹配多次。

更新

米哈伊尔的解决方案几乎可以工作，但当猜测单词有重复字母时失败：

score 2 · Accepted Answer

考虑以下方法 - 适用于任意数量的字母

-- Wordle-style scoring for every ordered pair of distinct words (BigQuery
-- Standard SQL). Letters are compared by array offset, so there is no
-- hard-coded word length.
-- Known limitations of this approach (visible in the yellows CTEs below):
--   * a letter that is green at any position is never reported as yellow;
--   * every non-green occurrence of a yellow letter is marked yellow, however
--     many times the letter occurs in the solution;
--   * `present` counts distinct letters, not positions.
with words as (
    select normalized_word as word, 'solution' as word_type
    from unnest(["SHAPE","PETER","TAPES","JUMBO","NINJA","MAMMA"]) as normalized_word
),
-- Every (guess, solution) combination, including a word with itself;
-- identical pairs are filtered out by the CTEs that consume this one.
pairs as (
    select t1.word as guess, t2.word as solution
    from words as t1
    cross join words as t2
),
-- One row per guess letter: 🟩 when the solution has the same letter at the
-- same offset, ⬜ otherwise.
greens as (
    select guess, solution, x, offset, color
    from pairs t, unnest(array(
        select as struct x, offset, if(x = y, '🟩', '⬜') as color
        from unnest(split(guess, '')) x with offset
        left join unnest(split(solution, '')) y with offset
        using (offset)
    ))
    where guess != solution
),
-- Distinct guess letters that occur in the solution at a different offset.
yellows_temp as (
    select guess, solution, x, color
    from pairs t, unnest(array(
        select as struct x, '🟨' as color
        from unnest(split(guess, '')) x with offset as pos1
        join unnest(split(solution, '')) y with offset as pos2
        on x = y and pos1 != pos2
        group by x, color
    ))
    where guess != solution
),
-- Drop yellow candidates whose letter is already green somewhere in the pair.
yellows as (
    select guess, solution, x, '🟨' as color
    from yellows_temp y
    left join (
        select guess, solution, x
        from greens
        where color = '🟩'
    ) g
    using (guess, solution, x)
    where g.x is null
)
-- yellows has at most one row per (guess, solution, x), so the left join
-- below does not multiply the per-letter rows coming from greens.
select
    guess,
    solution,
    countif(g.color = '🟩') as correct,
    count(distinct if(g.color = '⬜' and y.color = '🟨', y.x, null)) as present,
    string_agg(if(g.color = '⬜' and y.color = '🟨', '🟨', g.color), '' order by offset) as wordle,
    string_agg(if(g.color = '⬜' and y.color = '🟨', '🟨', if(g.color = '🟩', '🟩', g.x)), '' order by offset) as w1
from greens g
left join yellows y
using (guess, solution, x)
group by guess, solution
order by guess, solution;

带输出

w2 就留给您自己完成了——以上面的查询为起点，应该相对简单 :o)

sql - 有没有办法使用高级 BigQuery 功能进行这种类似 Wordle 的 SQL 查询？

1 回答 1

Related

Reference