我在这里得到了很多帮助,利用所有这些信息,我可以组合出一个可行的解决方案。但是,我可能搞砸了一些事情,导致低权重的名称重复。根据我用较高权重标记的常用名称,我想要一个更均匀分布的全名列表。重复应该没问题,但它们可能是 TOP First names + TOP Last names 的组合。这不是我观察到的行为。
SELECT count(full_name) as total, full_name FROM _get_sample_data(20000) group by (full_name) order by total DESC
上面的 SELECT 中最多有 13 个重复的名字,那些重复的名字是那些被标记为低权重的名字。
功能:
CREATE
OR REPLACE FUNCTION _get_sample_data(p_number_of_records INT) RETURNS TABLE(
full_name TEXT,
email TEXT,
phone TEXT,
document TEXT,
date_of_birth DATE
) AS $$
WITH first_names_weighted AS
(
SELECT
first_name,
SUM(ratio) OVER (
ORDER BY
first_name) - ratio AS lower_bound,
SUM(ratio) OVER (
ORDER BY
first_name) AS upper_bound
FROM
common_first_name
),
middle_names_weighted AS
(
SELECT
first_name AS middle_name,
SUM(ratio) OVER (
ORDER BY
first_name) - ratio AS lower_bound,
SUM(ratio) OVER (
ORDER BY
first_name) AS upper_bound
FROM
common_first_name
),
last_names_weighted AS
(
SELECT
last_name,
SUM(ratio) OVER (
ORDER BY
last_name) - ratio AS lower_bound,
SUM(ratio) OVER (
ORDER BY
last_name) AS upper_bound
FROM
common_last_name
),
email_domain_weighted AS
(
SELECT
domain_name,
SUM(ratio) OVER (
ORDER BY
domain_name ) - ratio AS lower_bound,
SUM(ratio) OVER (
ORDER BY
domain_name ) AS upper_bound
FROM
common_email_domain
),
document_type_weighted AS
(
SELECT
country_iso_document_type,
SUM(ratio) OVER (
ORDER BY
country_iso_document_type ) - ratio AS lower_bound,
SUM(ratio) OVER (
ORDER BY
country_iso_document_type ) AS upper_bound
FROM
common_document_type
),
year_of_birth_weighted AS
(
SELECT
YEAR,
SUM(ratio) OVER (
ORDER BY
YEAR ) - ratio AS lower_bound,
SUM(ratio) OVER (
ORDER BY
YEAR ) AS upper_bound
FROM
common_year_of_birth
),
randoms AS
(
SELECT
random() * (
SELECT
SUM(ratio)
FROM
common_first_name ) AS f_random,
random() * (
SELECT
SUM(ratio)
FROM
common_first_name ) AS m_random,
random() * (
SELECT
SUM(ratio)
FROM
common_last_name ) AS l_random,
random() * (
SELECT
SUM(ratio)
FROM
common_email_domain ) AS e_random,
random() * (
SELECT
SUM(ratio)
FROM
common_document_type ) AS d_random,
random() * (
SELECT
SUM(ratio)
FROM
common_year_of_birth ) AS y_random
FROM
generate_series(1, p_number_of_records )
)
SELECT
--r,
BTRIM(first_name)||' '||
CASE
WHEN
random() < 0.5
THEN
''
ELSE
BTRIM(middle_name)||' '
END
||BTRIM(last_name)||
CASE
WHEN
random() < 0.9
THEN
''
ELSE
CASE
WHEN
random() < 0.1
THEN
' Junior'
ELSE
CASE
WHEN
random() < 0.1
THEN
' II'
ELSE
CASE
WHEN
random() < 0.1
THEN
' III'
ELSE
CASE
WHEN
random() < 0.1
THEN
' IV'
ELSE
' Jr.'
END
END
END
END
END
AS full_name, --TODO: Honorif and Suffix
LOWER( SUBSTRING( first_name, 1, 3 + (random() * (length(first_name) - 3))::INTEGER) ||
CASE
WHEN
random() < 0.7
THEN
'.'
ELSE
CASE
WHEN
random() > 0.5
THEN
'_'
ELSE
''
END
END
|| SUBSTRING( last_name, 1, 3 + (random() * (length(last_name) - 3))::INTEGER)||
CASE
WHEN
random() > 0.5
THEN
'.'
ELSE
''
END
||
round((1800 + random()*1200))::TEXT || '@' || domain_name )
AS email,
'+1 ' || lpad(round(random() * 999)::text, 3, '0') || ' ' || lpad(round(random() * 999)::text, 3, '0') || ' ' || lpad(round(random() * 9999)::text, 4, '0') AS phone,
CASE
WHEN
POSITION('.sin' IN country_iso_document_type) > 1
THEN
country_iso_document_type || '.' || lpad((round(random() * 999))::TEXT, 3, '0') || '-' || lpad((round(random() * 999))::TEXT, 3, '0') || '-' || lpad((round(random() * 999))::TEXT, 3, '0')
ELSE
CASE
WHEN
POSITION('.driver' IN country_iso_document_type) > 1
THEN
country_iso_document_type || '.' || chr((65 + random()*25)::INT) || lpad(CAST(round(random() * 99999) AS text), 5, '0') || '-' || lpad((round(random() * 99999))::TEXT, 5, '0') || '-' || lpad((round(random() * 99999))::TEXT, 5, '0')
ELSE
CASE
WHEN
POSITION('.passport' IN country_iso_document_type) > 1
THEN
country_iso_document_type || '.' || chr((65 + random()*25)::INT) || chr((65 + random()*25)::INT) || lpad((round(random() * 999999999999))::TEXT, 12, '0')
ELSE
NULL
END
END
END
AS document,
make_date(YEAR, (round(1 + random() * 11))::INTEGER, 1) + make_interval(days => (round(1 + random() * 30))::INTEGER) AS dob
FROM
randoms r
CROSS JOIN
first_names_weighted f
CROSS JOIN
last_names_weighted l
CROSS JOIN
email_domain_weighted e
CROSS JOIN
document_type_weighted d
CROSS JOIN
year_of_birth_weighted y
CROSS JOIN
middle_names_weighted m
WHERE
f.lower_bound <= r.f_random
AND r.f_random <= f.upper_bound
AND l.lower_bound <= r.l_random
AND r.l_random <= l.upper_bound
AND e.lower_bound <= r.e_random
AND r.e_random <= e.upper_bound
AND d.lower_bound <= r.d_random
AND r.d_random <= d.upper_bound
AND y.lower_bound <= r.y_random
AND r.y_random <= y.upper_bound
AND m.lower_bound <= r.m_random
AND r.m_random <= m.upper_bound;
$$ language sql;
我有 1000 个最常见的名字和 1000 个最常见的姓氏(我从互联网上获得的)。约翰应该更频繁,但事实并非如此。
正在生成数据,但不知何故,它们并没有反映我在具有通用名称的表中放置的权重。
PS:如果有人觉得有用,我可以分享表结构和 SAMPLE DATA。
我正在使用 supabase.io (PG 13.3)
请,任何帮助将不胜感激。
谢谢