1

有一个非常大的表(超过 2 亿行)
sID int, wordID int (PK sID, wordID)

想要找到 wordID 集合完全相同(且没有多余 wordID)的 sID。
对于拥有超过 100 个 wordID 的 sID,完全匹配的可能性会下降,因此可以接受将范围限制在 100 个以内
(但希望能做到 1000 个)

如果这是学校,sID 是班级,wordID 是学生。
然后我想找到拥有完全相同学生的班级。

sID, wordID
1, 1
1, 2
1, 3
2, 2
2, 3
3, 1
3, 4
5, 1
5, 2
6, 2
6, 3
7, 1
7, 2
8, 1
8, 2

sID 6 和 2 具有完全相同的 wordID
sID 5、7 和 8 具有完全相同的 wordID

这是我目前写出的代码
我想去掉针对 #temp3_sID1_sID2 的那两条 delete 语句,改为在它们上方的 insert 中直接处理
不过任何思路我都愿意尝试
毕竟要创建一张 2 亿行的表来做测试并不容易

  -- Reset the session-local work tables so the script can be re-run on the same
  -- connection. Every drop is guarded: an unguarded DROP TABLE raises an error
  -- on the first run, when the temp table does not exist yet.
  if object_id('tempdb..#temp_sID_wordCount') is not null
    drop table #temp_sID_wordCount
  if object_id('tempdb..#temp_count_wordID_sID') is not null
    drop table #temp_count_wordID_sID
  if object_id('tempdb..#temp3_wordID_sID_forThatCount') is not null
    drop table #temp3_wordID_sID_forThatCount
  if object_id('tempdb..#temp3_sID1_sID2') is not null
    drop table #temp3_sID1_sID2
  if object_id('tempdb..#temp3_sID1_sID2_keep') is not null
    drop table #temp3_sID1_sID2_keep

  -- One row per sID with the number of wordIDs it has.
  create table #temp_sID_wordCount  (sID int primary key, ccount int not null)
  -- (sID, wordID) rows tagged with the sID's word count; keyed by count first so
  -- all rows for one count are stored together.
  create table #temp_count_wordID_sID  (ccount int not null, wordID int not null, sID int not null, primary key (ccount, wordID, sID)) 
  -- Work table: the (wordID, sID) rows for the word count currently being processed.
  create table #temp3_wordID_sID_forThatCount  (wordID int not null, sID int not null, primary key(wordID, sID))
  -- Accumulated result across all word counts.
  create table #temp3_sID1_sID2_keep  (sID1 int not null, sID2 int not null, primary key(sID1, sID2))
  -- Work table: candidate sID pairs for the word count currently being processed.
  create table #temp3_sID1_sID2  (sID1 int not null, sID2 int not null, primary key(sID1, sID2))
  -- Step 1: number of wordIDs per sID. Two sIDs can only hold identical wordID
  -- sets when these counts are equal.
  -- NOTE(review): with (nolock) permits dirty reads; if [FTSindexWordOnce] is
  -- written to while this runs, rows may be missed or read twice (a doubled row
  -- would violate the target primary key) - confirm the table is static.
  insert into #temp_sID_wordCount (sID, ccount)
  select sID, count(*) as ccount 
   FROM [FTSindexWordOnce] with (nolock)
   group by sID 
   order by sID;
  -- Sanity check: how many sIDs have at most 100 words.
  select count(*) from #temp_sID_wordCount where ccount <= 100;  -- 701,966

  -- Step 2: copy the (wordID, sID) rows of the sIDs in scope, tagged with the
  -- sID's word count. Only sIDs with 1 to 10 words are loaded here.
  truncate table #temp_count_wordID_sID
  insert into #temp_count_wordID_sID (ccount, wordID, sID)
  select #temp_sID_wordCount.ccount, [FTSindexWordOnce].wordID, [FTSindexWordOnce].sID 
    from #temp_sID_wordCount
    join [FTSindexWordOnce] with (nolock) 
      on [FTSindexWordOnce].sID = #temp_sID_wordCount.sID
   where #temp_sID_wordCount.ccount >= 1
     and #temp_sID_wordCount.ccount <= 10
   order by #temp_sID_wordCount.ccount, [FTSindexWordOnce].wordID, [FTSindexWordOnce].sID;
  -- Sanity check: rows loaded into #temp_count_wordID_sID.
  -- NOTE(review): the original counted #temp_sID_wordCount here, which looks like
  -- a copy-paste slip; the figure below is the author's recorded value and was
  -- presumably taken with a wider ccount range - TODO confirm.
  select count(*) from #temp_count_wordID_sID;  -- 34,860,090

    -- For each distinct word count, find the sIDs whose wordID sets are identical
    -- and append them to #temp3_sID1_sID2_keep as (sID1, sID2) rows, where sID1 is
    -- the smallest sID of a group and sID2 is every member of that group
    -- (including sID1 itself). sIDs that match nothing else are not stored.
    truncate table #temp3_sID1_sID2_keep

    -- local: the cursor is scoped to this batch; fast_forward: read-only, forward-only.
    declare cur cursor local fast_forward for 
    select top 10 ccount from #temp_count_wordID_sID group by ccount order by ccount

    open cur
    declare @count int
    fetch next from cur into @count
    while (@@FETCH_STATUS = 0)
    begin
      truncate table #temp3_wordID_sID_forThatCount
      truncate table #temp3_sID1_sID2

      -- wordID and sID for that unique word count;
      -- sets can only be exact matches if they have the same word count
      insert into #temp3_wordID_sID_forThatCount (wordID, sID)
      select       #temp_count_wordID_sID.wordID
                 , #temp_count_wordID_sID.sID
      from #temp_count_wordID_sID
      where #temp_count_wordID_sID.ccount = @count
      order by  #temp_count_wordID_sID.wordID, #temp_count_wordID_sID.sID 

      -- All pairs (sID1 <= sID2) that share exactly @count words. Both sIDs have
      -- exactly @count words, so sharing @count words means the sets are identical.
      -- Every sID is paired with itself, and a group of n sIDs yields n*(n+1)/2 rows.
      insert into #temp3_sID1_sID2 (sID1, sID2)
      select w1.sID, w2.sID
        from #temp3_wordID_sID_forThatCount as w1
        join #temp3_wordID_sID_forThatCount as w2
          on w1.wordID = w2.wordID
         and w1.sID <= w2.sID         
       group by w1.sID, w2.sID
       having count(*) = @count
       order by w1.sID, w2.sID

      -- Keep one row per (group leader, member) in a single pass instead of
      -- deleting from the work table afterwards:
      --   leaderID  = smallest sID1 paired with this sID2, i.e. the smallest sID
      --               of the group (drops the "double dips" led by a later member)
      --   groupSize = rows led by this sID1; 1 means it only matched itself
      --               (drops the groups of 1)
      insert into #temp3_sID1_sID2_keep (sID1, sID2)
      select pairs.sID1
           , pairs.sID2
        from (select sID1
                   , sID2
                   , min(sID1) over (partition by sID2) as leaderID
                   , count(*)  over (partition by sID1) as groupSize
                from #temp3_sID1_sID2) as pairs
       where pairs.sID1 = pairs.leaderID
         and pairs.groupSize > 1
       order by pairs.sID1, pairs.sID2

      fetch next from cur into @count
    end
    close cur
    deallocate cur

 -- Final result: the kept (sID1, sID2) rows, where sID1 identifies the group.
 -- Columns are listed and ordered by name rather than via * and ordinal
 -- positions; the table is private to this session, so no locking hint is needed.
 select sID1, sID2
 FROM #temp3_sID1_sID2_keep
 order by sID1, sID2
4

1 回答 1

1

所以,正如我所看到的,任务是找到相等的子集。

首先我们可以找到相等的子集对:

-- Pairs of sIDs whose wordID sets are equal: both sIDs must have the same number
-- of wordIDs, and the number of wordIDs they share must equal that number.
-- Each pair is reported once, larger sID first.
;with sidCounts as (
    select sID, count(wordID) as cnt
    from [Table]
    group by sID
)
select hi.sID, lo.sID
from sidCounts as hi
    inner join sidCounts as lo
        on lo.cnt = hi.cnt
       and lo.sID < hi.sID
    cross apply (
        -- number of wordIDs the two sIDs have in common
        select count(*)
        from [Table] as a
            inner join [Table] as b on b.wordID = a.wordID
        where a.sID = hi.sID and b.sID = lo.sID
    ) as shared(cnt)
where shared.cnt = hi.cnt

输出是:

sID        sID
----------- -----------
6           2
7           5
8           5
8           7

然后,如有必要,可以将对组合成组:

sID         gNum
----------- -----------
2           1
6           1
5           2
7           2
8           2

请参阅下面的 SqlFiddle 示例中的详细信息。

SqlFiddle Sample


另一种方法是为每个子集数据计算哈希函数:

-- Fingerprint every sID by hashing its wordID list, ordered by wordID and
-- delimited with '|'; sIDs with identical wordID sets get identical hashes.
-- varchar(11) holds any int including the sign. With varchar(10) an 11-character
-- value such as -2147483648 is rendered as '*', so different sets could collide.
-- NOTE(review): older SQL Server versions cap hashbytes input at 8000 bytes, so
-- very long wordID lists may not hash there - confirm against the target version.
-- Equal hashes mark candidates; compare the sets directly if collisions matter.
;with a as (
    select distinct sID from [Table]
)
select sID,
    hashbytes('sha1', (
        select cast(wordID as varchar(11)) + '|'
        from [Table]
        where sID = a.sID
        order by wordID
        for xml path(''))) as wordSetHash
from a

然后可以根据哈希值对子集进行分组。

SqlFiddle Sample

最后一种方法在我的机器上处理约 1000 万行测试数据(2 万个 sID 值,每个最多 1000 个 wordID)用时不到一分钟。您还可以先排除那些 wordID 数量与其他任何 sID 都不相同的 sID,以进一步优化。

于 2013-10-24T22:29:24.650 回答