0

我需要按近似日期连接两个表,使 `old` 表中的每一行只与 `new` 表中日期最接近的那一行匹配——一对一。`new` 表中的行不允许重复出现——每一行只能匹配一次,并且取时间差最小的那一对。

这是一些尝试的示例:

-- Sample table `new`: the rows that `old` rows should be paired with (by ID and nearest date).
CREATE TABLE `new` (
    `ID`   INT(2),
    `date` DATETIME,
    `new`  VARCHAR(1)
);

INSERT INTO `new`
    (`ID`, `date`, `new`)
VALUES
    (1, '2016-03-02 12:20:00', 't'),
    (1, '2016-03-07 12:20:00', 'u'),
    (1, '2016-04-02 12:20:00', 'v'),
    (2, '2016-04-12 11:03:00', 'x');

-- Sample table `old`: every row here should be paired with at most one row of `new`.
CREATE TABLE `old` (
    `ID`   INT(2),
    `date` DATETIME,
    `old`  VARCHAR(1)
);

INSERT INTO `old`
    (`ID`, `date`, `old`)
VALUES
    (1, '2016-03-07 12:20:00', 'a'),
    (1, '2016-04-02 12:20:00', 'b'),
    (1, '2016-03-01 10:09:00', 'c'),
    (1, '2015-04-12 10:09:00', 'd'),
    (1, '2016-03-03 12:20:00', 'e');

我期待的输出是这样的:

   ID            old.date  old            new.date  new
    1 2016-03-07 12:20:00    a 2016-03-07 12:20:00    u
    1 2016-04-02 12:20:00    b 2016-04-02 12:20:00    v
    1 2016-03-01 10:09:00    c                NULL NULL
    1 2015-04-12 10:09:00    d                NULL NULL
    1 2016-03-03 12:20:00    e 2016-03-02 12:20:00    t
    2                NULL NULL 2016-04-12 11:03:00    x

我能够通过以下方式接近这一点:

-- Emulated FULL OUTER JOIN (LEFT JOIN UNION RIGHT JOIN): pairs rows that share
-- an ID and whose dates are less than 2 whole days apart. Every pair inside
-- that window is returned, not only the closest one.
SELECT o.ID, o.date, o.old, n.ID, n.date, n.new
  FROM old AS o
  LEFT JOIN new AS n
    ON o.ID = n.ID
   AND ABS(TIMESTAMPDIFF(DAY, o.date, n.date)) < 2
UNION
SELECT o.ID, o.date, o.old, n.ID, n.date, n.new
  FROM old AS o
  RIGHT JOIN new AS n
    ON o.ID = n.ID
   AND ABS(TIMESTAMPDIFF(DAY, o.date, n.date)) < 2
ORDER BY old

但显然,这样会在指定的时间窗口内匹配到多行,而不只是最佳匹配。调整天数阈值对我来说不是解决办法,因为实际上我要连接的是两个非常大的表,必须使用一个时间窗口,而在这个窗口内许多行都会有多个匹配项。

4

2 回答 2

0

似乎实现近似一对一连接的唯一方法是在存储过程中使用游标(cursor)。

感谢 @Strawberry 为我指明了正确的方向——您会在下面看到您的代码片段被复用了。这是最终对我有效的解决方案。它输出的记录顺序有所不同,但至少是真正的一对一匹配。

DROP PROCEDURE IF EXISTS amerge;

DELIMITER //

CREATE PROCEDURE amerge()

BEGIN
  /*
   * Greedy one-to-one nearest-date merge of tables old and new.
   *
   * Parameters: none.
   * Result set: id_1, date_1, id_2, date_2, old, new, diff
   *   - one row per row of old, paired with at most one row of new that has
   *     the same ID. diff is the absolute distance in whole hours between
   *     the two dates (for an unpaired old row it is the distance to its
   *     nearest new row, or NULL when new has no row with that ID).
   *   - one extra row per row of new that was not paired (old side is NULL).
   *
   * Algorithm: rows of old are visited in ascending order of their minimum
   * distance. A row is paired only if the new row at exactly that minimum
   * distance is still free. Otherwise it stays unpaired (there is no
   * fallback to the second-nearest row).
   *
   * Side effects: uses the session temporary tables t1, t2 and t3. They are
   * dropped on entry and on exit, so the procedure can be called repeatedly.
   */

  /* Necessary declarations */
  DECLARE o_ID INT DEFAULT 0;
  DECLARE o_date DATETIME;          -- DATETIME, so comparisons with date_1 need no string coercion
  DECLARE o_old VARCHAR(2);
  DECLARE o_mdiff BIGINT;           -- whole hours, compared for equality with t2.diff
  DECLARE has_match INT DEFAULT 0;
  DECLARE done INT DEFAULT FALSE;
  -- The ORDER BY lives on the cursor query: the greedy pass relies on the
  -- smallest difference being processed first, and row order of a table
  -- created with CREATE ... SELECT ... ORDER BY is not guaranteed on read.
  DECLARE cursor1 CURSOR FOR
    SELECT ID, date, old, mdiff
      FROM t1
     ORDER BY mdiff ASC, ID ASC, date ASC;
  DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;

  /* Leftovers from an earlier call in this session would make CREATE fail */
  DROP TEMPORARY TABLE IF EXISTS t1, t2, t3;

  /* Temporary tables */
  -- copy of old with mdiff = minimum distance to any new row of the same ID.
  -- LEFT JOIN keeps old rows whose ID does not occur in new (mdiff is NULL).
  -- Every selected column is grouped, so no primary key is required.
  CREATE TEMPORARY TABLE t1
    SELECT old.ID, old.date, old.old,
           MIN(ABS(TIMESTAMPDIFF(HOUR, old.date, new.date))) AS mdiff
      FROM old
      LEFT JOIN new ON new.ID = old.ID
     GROUP BY old.ID, old.date, old.old;

  -- every old/new pair with the same ID, with its absolute distance in hours
  CREATE TEMPORARY TABLE t2
    SELECT old.ID AS ID_1, old.date AS date_1,
           new.ID AS ID_2, new.date AS date_2,
           old.old AS old, new.new AS new,
           ABS(TIMESTAMPDIFF(HOUR, old.date, new.date)) AS diff
      FROM old
     INNER JOIN new ON new.ID = old.ID;

  -- empty table to fill in with the results
  CREATE TEMPORARY TABLE t3
    (id_1 INT, date_1 DATETIME, id_2 INT, date_2 DATETIME, old VARCHAR(2), new VARCHAR(2), diff FLOAT);

  /* Cursor */
  OPEN cursor1;
  getparams: LOOP
    FETCH cursor1 INTO o_ID, o_date, o_old, o_mdiff;
    IF done THEN
      LEAVE getparams;
    END IF;

    -- SELECT EXISTS(...) always yields exactly one row, so it can never fire
    -- the NOT FOUND handler and end the loop early.
    SELECT EXISTS (
             SELECT 1
               FROM t2
              WHERE t2.ID_1 = o_ID
                AND t2.date_1 = o_date
                AND t2.old <=> o_old
                AND t2.diff = o_mdiff
           )
      INTO has_match;

    IF has_match THEN
      -- The nearest new row is still free: record the pair ...
      INSERT INTO t3 (id_1, date_1, id_2, date_2, old, new, diff)
        SELECT t2.ID_1, t2.date_1, t2.ID_2, t2.date_2, t2.old, t2.new, t2.diff
          FROM t2
         WHERE t2.ID_1 = o_ID
           AND t2.date_1 = o_date
           AND t2.old <=> o_old
           AND t2.diff = o_mdiff
         ORDER BY t2.date_2
         LIMIT 1;
      -- ... and remove that new row from the candidate pairs. The subquery is
      -- restricted to the current old row (ID, date and value), so an old row
      -- with the same date but another ID cannot influence the delete.
      DELETE FROM t2
       WHERE t2.ID_2 = o_ID
         AND t2.date_2 IN (SELECT t3.date_2
                             FROM t3
                            WHERE t3.id_1 = o_ID
                              AND t3.date_1 = o_date
                              AND t3.old <=> o_old);
    ELSE
      -- The nearest new row was already taken (or none exists): keep the old row unpaired
      INSERT INTO t3 (id_1, date_1, id_2, date_2, old, new, diff)
        VALUES (o_ID, o_date, NULL, NULL, o_old, NULL, o_mdiff);
    END IF;
  END LOOP getparams;
  CLOSE cursor1;

  /* Workaround for error of reopening temp tables in MySQL:
     the output query needs the results twice, so t2 becomes a copy of t3 */
  DROP TEMPORARY TABLE t2;
  CREATE TEMPORARY TABLE t2
    SELECT id_1, date_1, id_2, date_2, old, new, diff FROM t3;

  /* Output */
  SELECT id_1, date_1, id_2, date_2, old, new, diff FROM t2
  UNION
  -- rows of new that were never paired with any row of old
  SELECT NULL AS ID_1, NULL AS date_1, new.ID AS ID_2, new.date AS date_2, NULL AS old, new.new AS new, NULL AS diff
    FROM new
    LEFT JOIN t3 ON t3.ID_2 = new.ID AND t3.date_2 = new.date
   WHERE t3.ID_2 IS NULL;

  /* Do not leave generically named temporary tables behind in the session */
  DROP TEMPORARY TABLE IF EXISTS t1, t2, t3;

END //

DELIMITER ;

CALL amerge();

输出如下(使用上面示例中的数据,并将主键 PRIMARY KEY 设置为 ID + date):

id_1             date_1 id_2             date_2  old  new diff  
1   2016-03-07 12:20:00 1   2016-03-07 12:20:00    a    u    0  
1   2016-04-02 12:20:00 1   2016-04-02 12:20:00    b    v    0  
1   2016-03-03 12:20:00 1   2016-03-02 12:20:00    e    t   24  
1   2016-03-01 10:09:00                    NULL    c NULL   26  
1   2015-04-12 10:09:00                    NULL    d NULL 7802  
                   NULL 2   2016-04-12 11:03:00 NULL    x NULL      
于 2020-10-13T18:48:20.773 回答
0

考虑以下...

-- Recreate the sample table new from scratch; (ID, date) identifies a row.
DROP TABLE IF EXISTS new;

CREATE TABLE new (
    ID   INT NOT NULL,
    date DATETIME,
    new  CHAR(1),
    PRIMARY KEY (ID, date)
);

INSERT INTO new (ID, date, new)
VALUES
    (1, '2016-03-02 12:20:00', 't'),
    (1, '2016-03-07 12:20:00', 'u'),
    (1, '2016-04-02 12:20:00', 'v'),
    (2, '2016-04-12 11:03:00', 'x');

-- Recreate the sample table old from scratch; (ID, date) identifies a row.
-- The DROP mirrors the one issued for table new, so the whole script can be
-- re-run without failing on an already existing table.
DROP TABLE IF EXISTS old;

CREATE TABLE old 
(ID INT NOT NULL
,date DATETIME
,old CHAR(1)
,PRIMARY KEY(ID,date)
);

INSERT INTO old (ID, date, old) VALUES
(1, '2016-03-07 12:20:00', 'a'),
(1, '2016-04-02 12:20:00', 'b'),
(1, '2016-03-01 10:09:00', 'c'),
(1, '2015-04-12 10:09:00', 'd'),
(1, '2016-03-03 12:20:00', 'e');
    
-- Emulated FULL OUTER JOIN between old and new on "nearest date".
-- The derived table computes, per old row, the smallest absolute distance in
-- seconds to any new row with the same id. Each half then joins new on exactly
-- that distance. A new row can still be returned for several old rows.

-- Half 1: every row of the derived table, with its nearest new row(s).
SELECT nearest.id   AS old_id
     , nearest.date AS old_date
     , nearest.old
     , n.id         AS new_id
     , n.date       AS new_date
     , n.new
  FROM
     ( SELECT old.id
            , old.date
            , old.old
            , MIN(ABS(UNIX_TIMESTAMP(old.date) - UNIX_TIMESTAMP(new.date))) AS delta
         FROM old
        INNER JOIN new
           ON new.id = old.id
        GROUP BY old.id
               , old.date
     ) AS nearest
  LEFT JOIN new AS n
    ON n.id = nearest.id
   AND ABS(UNIX_TIMESTAMP(nearest.date) - UNIX_TIMESTAMP(n.date)) = nearest.delta

 UNION

-- Half 2: every new row, including those that are nobody's nearest match.
SELECT nearest.id   AS old_id
     , nearest.date AS old_date
     , nearest.old
     , n.id         AS new_id
     , n.date       AS new_date
     , n.new
  FROM new AS n
  LEFT JOIN
     ( SELECT old.id
            , old.date
            , old.old
            , MIN(ABS(UNIX_TIMESTAMP(old.date) - UNIX_TIMESTAMP(new.date))) AS delta
         FROM old
        INNER JOIN new
           ON new.id = old.id
        GROUP BY old.id
               , old.date
     ) AS nearest
    ON n.id = nearest.id
   AND ABS(UNIX_TIMESTAMP(nearest.date) - UNIX_TIMESTAMP(n.date)) = nearest.delta

-- Rows without an old value sort last.
 ORDER BY old IS NULL
        , old
;

+--------+---------------------+------+--------+---------------------+------+
| old_id | old_date            | old  | new_id | new_date            | new  |
+--------+---------------------+------+--------+---------------------+------+
|      1 | 2016-03-07 12:20:00 | a    |      1 | 2016-03-07 12:20:00 | u    |
|      1 | 2016-04-02 12:20:00 | b    |      1 | 2016-04-02 12:20:00 | v    |
|      1 | 2016-03-01 10:09:00 | c    |      1 | 2016-03-02 12:20:00 | t    |
|      1 | 2015-04-12 10:09:00 | d    |      1 | 2016-03-02 12:20:00 | t    |
|      1 | 2016-03-03 12:20:00 | e    |      1 | 2016-03-02 12:20:00 | t    |
|   NULL | NULL                | NULL |      2 | 2016-04-12 11:03:00 | x    |
+--------+---------------------+------+--------+---------------------+------+

至于这个难题的最后一部分——去除重复的匹配——我可能会在应用程序代码中处理。

于 2020-10-10T09:12:51.577 回答