5

我正在尝试实现 Kimball 数据集市,在维度表中使用 -1 和 -2 行分别表示迟到的维度和 NULL 业务键。下面的示例代码为事实和维度数据创建了一个暂存表(staging table),并为数据集市创建了两个维度表和一个事实表。以下是包含示例数据的 SQL 代码:

-- Staging table holding the raw sales rows; each row carries the sale itself,
-- the customer attributes and the salesperson business key side by side.
-- To rebuild from scratch, run first:  DROP TABLE dbo.stg_sales
CREATE TABLE dbo.stg_sales
(
    -- Auto-numbered row identifier; also the primary key.
    stg_sales_id       INT IDENTITY(1, 1) NOT NULL
        CONSTRAINT pk_stg_sales PRIMARY KEY,
    sales_number       INT           NOT NULL,
    sales_amt          INT           NULL,
    -- Customer business key plus the customer attributes delivered with the sale.
    cust_number        INT           NULL,
    cust_firstname     NVARCHAR(50)  NULL,
    cust_lastname      NVARCHAR(100) NULL,
    cust_address       NVARCHAR(500) NULL,
    -- Salesperson business key; nullable because the source may not supply it.
    salesperson_number INT           NULL
)

go

-- Sample staging rows.
-- The table is referenced as dbo.stg_sales (the name it is created under) so the
-- statements do not depend on the caller's default schema.
-- Sales 123 and 124 share customer 2342; sale 126 has no salesperson business key;
-- sale 127 belongs to customer 4444.
INSERT INTO dbo.stg_sales
       (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES (123, 434, 2342, 'Jim', 'Moriaty', 'something', 23);

INSERT INTO dbo.stg_sales
       (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES (124, 234, 2342, 'Jim', 'Moriaty', 'something', 23);

INSERT INTO dbo.stg_sales
       (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES (125, 434, 4545, 'Joe', 'Esk', 'someother', 24);

INSERT INTO dbo.stg_sales
       (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES (126, 434, 5555, 'Daniel', 'Hart', 'Someaddr', NULL); --salesperson_number business key missing here

INSERT INTO dbo.stg_sales
       (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number)
VALUES (127, 333, 4444, 'Pat', 'Smith', 'Someaddr', 30);

-- Show what was staged, in insertion order.
SELECT stg_sales_id,
       sales_number,
       sales_amt,
       cust_number,
       cust_firstname,
       cust_lastname,
       cust_address,
       salesperson_number
FROM   dbo.stg_sales
ORDER  BY stg_sales_id;

--create a dimension and fact tables
-- Customer dimension.
--   customer_wid : surrogate key (identity).
--   cust_number  : business key taken from the source system.
--   is_current   : row-version flag, restricted to 0/1.
--                  NOTE(review): presumably 1 marks the active version of a customer - confirm.
-- The table-level constraints are separated from the column list by commas, as
-- the documented CREATE TABLE grammar requires.
-- To rebuild from scratch, run first:  DROP TABLE dbo.dim_customer
CREATE TABLE dbo.dim_customer
  (
     customer_wid   INT IDENTITY(1, 1) NOT NULL,
     cust_number    INT NULL,
     cust_firstname NVARCHAR(50) NULL,
     cust_lastname  NVARCHAR(100) NULL,
     cust_address   NVARCHAR(500) NULL,
     date_insert    DATETIME2 NOT NULL DEFAULT (Getdate()),
     date_update    DATETIME2 NULL,
     is_current     BIT NOT NULL,
     CONSTRAINT pk_dim_customer PRIMARY KEY (customer_wid),
     CONSTRAINT chk_is_current CHECK (is_current IN (0, 1))
  )

go

-- Seed the two reserved customer rows with explicit negative surrogate keys.
-- IDENTITY_INSERT has to be ON while customer_wid is supplied by hand.
SET IDENTITY_INSERT dbo.dim_customer ON

-- Reserved row -1: 'unknown'.
INSERT INTO dbo.dim_customer
       (customer_wid, cust_number, cust_firstname, cust_lastname, cust_address,
        date_insert, date_update, is_current)
VALUES (-1, -1, 'unknown', 'unknown', 'unknown', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, 1)

-- Reserved row -2: 'Error'.
INSERT INTO dbo.dim_customer
       (customer_wid, cust_number, cust_firstname, cust_lastname, cust_address,
        date_insert, date_update, is_current)
VALUES (-2, -2, 'Error', 'Error', 'Error', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, 1)

SET IDENTITY_INSERT dbo.dim_customer OFF

-- Load one dimension row per distinct customer found in staging.
-- Customer 4444 is deliberately left out to simulate a late arriving dimension,
-- i.e. a fact whose corresponding dimension row does not exist yet.
INSERT INTO dbo.dim_customer
       (cust_number, cust_firstname, cust_lastname, cust_address, is_current)
SELECT stg.cust_number,
       stg.cust_firstname,
       stg.cust_lastname,
       stg.cust_address,
       1 AS is_current
FROM   dbo.stg_sales AS stg
WHERE  stg.cust_number <> 4444
GROUP  BY stg.cust_number,
          stg.cust_firstname,
          stg.cust_lastname,
          stg.cust_address

-- Show the resulting dimension.
SELECT *
FROM   dbo.dim_customer

-- Rebuild the salesperson dimension.
-- The drop is guarded so the script also runs on a database where the table has
-- not been created yet (an unconditional DROP TABLE raises an error there).
-- If dbo.f_sales already exists, its foreign key fk_salesperson_wid blocks this
-- drop; drop dbo.f_sales first in that case.
IF OBJECT_ID(N'dbo.dim_salesperson', N'U') IS NOT NULL
    DROP TABLE dbo.dim_salesperson

--create salesperson table
--   salesperson_wid    : surrogate key (identity).
--   salesperson_number : business key taken from the source system.
--   is_current         : row-version flag, restricted to 0/1.
-- The table-level constraints are separated from the column list by commas, as
-- the documented CREATE TABLE grammar requires.
CREATE TABLE dbo.dim_salesperson
  (
     salesperson_wid       INT IDENTITY(1, 1) NOT NULL,
     salesperson_number    INT NULL,
     salesperson_firstname NVARCHAR(50) NULL,
     salesperson_lastname  NVARCHAR(100) NULL,
     salesperson_address   NVARCHAR(500) NULL,
     date_insert           DATETIME2 NOT NULL DEFAULT (Getdate()),
     date_update           DATETIME2 NULL,
     is_current            BIT NOT NULL,
     CONSTRAINT pk_dim_salesperson PRIMARY KEY (salesperson_wid),
     CONSTRAINT chk_dim_salesperson_is_current CHECK (is_current IN (0, 1))
  )

go

-- Seed the two reserved salesperson rows with explicit negative surrogate keys.
-- IDENTITY_INSERT has to be ON while salesperson_wid is supplied by hand.
SET IDENTITY_INSERT dbo.dim_salesperson ON

-- Reserved row -1: 'Not available'.
INSERT INTO dbo.dim_salesperson
       (salesperson_wid, salesperson_number, salesperson_firstname, salesperson_lastname,
        salesperson_address, date_insert, date_update, is_current)
VALUES (-1, -1, 'Not available', 'Not available', 'Not available',
        CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, 1)

-- Reserved row -2: 'Error'.
INSERT INTO dbo.dim_salesperson
       (salesperson_wid, salesperson_number, salesperson_firstname, salesperson_lastname,
        salesperson_address, date_insert, date_update, is_current)
VALUES (-2, -2, 'Error', 'Error', 'Error',
        CURRENT_TIMESTAMP, CURRENT_TIMESTAMP, 1)

SET IDENTITY_INSERT dbo.dim_salesperson OFF

-- Regular salespeople; salesperson_wid is generated by the identity column.
INSERT INTO dbo.dim_salesperson
       (salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, is_current)
VALUES (23, 'John', 'Fox', 'something', 1)

INSERT INTO dbo.dim_salesperson
       (salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, is_current)
VALUES (24, 'Hadley', 'Fox', 'something', 1)

INSERT INTO dbo.dim_salesperson
       (salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, is_current)
VALUES (30, 'Ashley', 'Fox', 'something', 1)

-- Show the dimension next to the staged rows it will be matched against.
SELECT *
FROM   dbo.dim_salesperson

SELECT *
FROM   dbo.stg_sales

--create and populate the fact table
-- Sales fact table: one row per sales_number (primary key).
-- Both surrogate keys are NOT NULL and foreign-keyed to their dimension, so every
-- fact row must point at an existing dimension row.
-- The table-level constraints are separated from the column list and from each
-- other by commas, as the documented CREATE TABLE grammar requires.
-- To rebuild from scratch, run first:  DROP TABLE dbo.f_sales
CREATE TABLE dbo.f_sales
  (
     sales_number    INT NOT NULL,
     customer_wid    INT NOT NULL,
     salesperson_wid INT NOT NULL,
     sales_amt       INT NULL,
     CONSTRAINT pk_f_sales PRIMARY KEY (sales_number),
     CONSTRAINT fk_customer_wid FOREIGN KEY (customer_wid) REFERENCES
     dbo.dim_customer(customer_wid),
     CONSTRAINT fk_salesperson_wid FOREIGN KEY (salesperson_wid) REFERENCES
     dbo.dim_salesperson(salesperson_wid)
  )

--populate the fact table
-- Surrogate-key resolution, applied the same way to both dimensions:
--   * business key is NULL in the source          -> -2
--   * business key present, no dimension row yet  -> -1 (late arriving dimension)
--   * otherwise                                   -> the matched dimension key
-- The NULL test is made on the staging column, not on the join result, because a
-- LEFT JOIN returns NULL for both situations and ISNULL/COALESCE alone cannot tell
-- them apart.
-- Only is_current = 1 dimension rows are matched, so a business key that has more
-- than one row in a dimension cannot duplicate a sales_number (the fact PK).
INSERT INTO dbo.f_sales
       (sales_number,
        customer_wid,
        salesperson_wid,
        sales_amt)
SELECT stg.sales_number,
       CASE
         WHEN stg.cust_number IS NULL THEN -2
         ELSE COALESCE(dimcust.customer_wid, -1)
       END AS customer_wid,
       CASE
         WHEN stg.salesperson_number IS NULL THEN -2
         ELSE COALESCE(dimsp.salesperson_wid, -1)
       END AS salesperson_wid,
       stg.sales_amt
FROM   dbo.stg_sales AS stg
       LEFT JOIN dbo.dim_customer AS dimcust
              ON stg.cust_number = dimcust.cust_number
                 AND dimcust.is_current = 1
       LEFT JOIN dbo.dim_salesperson AS dimsp
              ON stg.salesperson_number = dimsp.salesperson_number
                 AND dimsp.is_current = 1;

-- Show the loaded facts.
SELECT sales_number,
       customer_wid,
       salesperson_wid,
       sales_amt
FROM   dbo.f_sales
ORDER  BY sales_number;

如何为源系统中缺少业务键的行分配 -2?关于此实现背后的理论,可以参阅 Kimball 的文章:

这基本上是我想要实现的目标:

处理维度和度量中的 NULL

编辑:

我想我可以在左连接中使用 COALESCE 或 ISNULL,它似乎产生了正确的结果:

-- Alternative fact load: a NULL business key is mapped to -2 inside the join
-- condition, so it matches the reserved row whose business key is -2.
-- A business key that is present but has no dimension row yet (late arriving
-- dimension) finds no match; COALESCE then falls back to -1 for BOTH dimensions.
-- Without that fallback salesperson_wid would be NULL and the insert would fail
-- on the NOT NULL column of dbo.f_sales.
-- Only is_current = 1 dimension rows are matched, so a business key that has more
-- than one row in a dimension cannot duplicate a sales_number (the fact PK).
-- Run against an empty dbo.f_sales: reloading the same staging rows violates the PK.
INSERT INTO dbo.f_sales
       (sales_number,
        customer_wid,
        salesperson_wid,
        sales_amt)
SELECT stg.sales_number,
       COALESCE(dimcust.customer_wid, -1) AS customer_wid,
       COALESCE(dimsp.salesperson_wid, -1) AS salesperson_wid,
       stg.sales_amt
FROM   dbo.stg_sales AS stg
       LEFT JOIN dbo.dim_customer AS dimcust
              ON COALESCE(stg.cust_number, -2) = dimcust.cust_number
                 AND dimcust.is_current = 1
       LEFT JOIN dbo.dim_salesperson AS dimsp
              ON COALESCE(stg.salesperson_number, -2) = dimsp.salesperson_number
                 AND dimsp.is_current = 1;
4

2 回答 2

4

纯粹作为一种查找技术

-- add nullable keys to the staging table
ALTER TABLE dbo.stg_sales ADD
    sales_person_wid INTEGER NULL,
    customer_wid     INTEGER NULL;

-- The batch separator is required: statements that reference the new columns
-- cannot be compiled in the same batch as the ALTER TABLE that adds them.
go

-- insert to staging table here (as in your example)

-- lookup sales person key
-- The dimension's surrogate key column is salesperson_wid; it is copied into the
-- staging column sales_person_wid. Only is_current = 1 rows are matched, so each
-- staging row gets at most one candidate key.
UPDATE s
SET    s.sales_person_wid = p.salesperson_wid
FROM   dbo.stg_sales AS s
       INNER JOIN dbo.dim_salesperson AS p
               ON p.salesperson_number = s.salesperson_number
                  AND p.is_current = 1;

-- decide what to do with missing business keys
-- business key is NULL in the source -> reserved row -2
UPDATE dbo.stg_sales
SET    sales_person_wid = -2
WHERE  sales_person_wid IS NULL
       AND salesperson_number IS NULL;

-- business key present but not in the dimension yet (late arriving dimension)
-- -> reserved row -1
UPDATE dbo.stg_sales
SET    sales_person_wid = -1
WHERE  sales_person_wid IS NULL;


-- do similar for customer

-- now all keys in staging table are not null

-- load to fact table

但是,通常的做法是在提取或清洗期间就为事务分配特殊的(未知、不适用、错误)业务键。换句话说,可以在记录进入暂存表之前就为其分配特殊的 Error 业务键。

最后要注意的是,迟到的维度意味着业务键对于业务(源)系统来说是已知的(因此 salesperson_number 不会是 NULL),但它尚未出现在维度表中。您需要将此事务保存在某处,并在该记录到达维度之后(大约一天后)再尝试更新 FK(salesperson_wid)。

于 2012-10-12T16:16:02.637 回答
0

我认为“迟到的事实”指的是:与事实相关的维度是缓慢变化的,而事实本身到达得晚了。假设您有一个销售系统,其中有一个名为“Opal Fruit”的产品,后来该产品更名为“Starburst”。公司想跟踪更名是否会影响销售额,因此建立了类型 2 缓慢变化维度(SCD)。在 SCD 中跟踪产品名称,因此每次产品名称更改时,都会为该产品新增一条记录,使用相同的业务(自然)键但不同的代理键,并记录该名称的生效日期和失效日期。假设名称为“Opal Fruit”的版本生效日期为 1970 年 1 月 1 日,失效日期为 1995 年 3 月 28 日;名称为“Starburst”的版本生效日期为 1995 年 3 月 29 日,失效日期为 NULL。

现在,假设在 1995 年 4 月 1 日,也就是更名之后的某一天,他们收到了一些交易日期为 1995 年 4 月 1 日的销售交易,以及一些交易日期为 1995 年 3 月 27 日的销售交易。迟到意味着某个维度对应的交易是在该维度(此处为产品维度)发生变化之后才到达的。因此,交易日期为 1995 年 3 月 27 日的交易需要使用产品名称为“Opal Fruit”的产品维度行的代理键,而交易日期为 1995 年 4 月 1 日的交易需要使用产品名称为“Starburst”的产品维度行的代理键。您需要做的是找出在交易日期当天有效的“Opal Fruit / Starburst”产品版本,并在交易载入销售事实表时将该版本的代理键分配给交易。

这就是我对“迟到的事实”的理解。在其对应的维度成员到达之前就已经到达的事实,应该算是“早到的事实”,而不是“迟到的事实”。也许这只是术语上的差异。

于 2012-10-22T14:45:15.423 回答