我有很多文本字段需要处理。处理它们的第一步是规范化所用的字符集。我需要输出字符串只包含以下内容:
A-Z、0-9 和空格,并且希望所有小写字母都转换为大写字母。
所以我在 PL/SQL 中使用了以下代码:
-- Normalise X so that it holds only upper-case A-Z, 0-9 and single spaces.
X := UPPER(
       TRIM(
         REGEXP_REPLACE(
           -- Inner pass: any character outside 0-9, A-Z, a-z and space becomes a space.
           REGEXP_REPLACE(X, '[^0-9A-Za-z ]', ' '),
           -- Outer pass: every run of one or more spaces collapses to one space.
           '( )* ',
           ' '
         )
       )
     );
这个比较慢。什么会更快?
我有很多文本字段需要处理。处理它们的第一步是规范化所用的字符集。我需要输出字符串只包含以下内容:
A-Z、0-9 和空格,并且希望所有小写字母都转换为大写字母。
所以我在 PL/SQL 中使用了以下代码:
-- Step 1: replace every character that is not 0-9, A-Z, a-z or space with a space.
X := REGEXP_REPLACE(X, '[^0-9A-Za-z ]', ' ');
-- Step 2: collapse each run of spaces into a single space.
X := REGEXP_REPLACE(X, '( )* ', ' ');
-- Step 3: strip leading/trailing spaces and fold to upper case.
X := UPPER(TRIM(X));
这个比较慢。什么会更快?
您可以尝试这种方法,根据一些(非常)粗略的测试,它看起来要快得多。它是一个本机编译(native)的函数:
CREATE OR REPLACE FUNCTION clean_string(
    in_string IN VARCHAR2)
RETURN VARCHAR2 AS
    -- Purpose: return in_string reduced to upper-case A-Z, 0-9 and single
    --          spaces. Every other character is treated as a space, runs of
    --          spaces collapse to one, and leading/trailing spaces are removed.
    -- Param:   in_string - text to normalise; NULL/empty input yields NULL.
    -- Returns: the normalised text (NULL when nothing but spaces remains).

    -- 32767 is the PL/SQL VARCHAR2 ceiling. The result never has more
    -- characters than the input, so this cannot overflow for any PL/SQL caller.
    out_string  VARCHAR2(32767);
    -- Character (not byte) semantics: SUBSTR returns one character, which may
    -- occupy several bytes in a multibyte database character set.
    in_char     VARCHAR2(1 CHAR);
    -- NUMBER on purpose: ASCII() of a multibyte character can be very large.
    dec_char    NUMBER;
    -- TRUE while the last character appended to out_string was a space.
    prev_space  BOOLEAN := FALSE;
BEGIN
    -- NVL makes the loop range well-defined for NULL/empty input.
    FOR pos IN 1 .. NVL(LENGTH(in_string), 0)
    LOOP
        in_char  := SUBSTR(in_string, pos, 1);
        dec_char := ASCII(in_char);

        IF (dec_char BETWEEN 48 AND 57)       -- 0-9
           OR (dec_char BETWEEN 65 AND 90)    -- A-Z
           OR (dec_char BETWEEN 97 AND 122)   -- a-z
        THEN
            -- Alphanumeric: keep it (case is folded once, at the end).
            out_string := out_string || in_char;
            prev_space := FALSE;
        ELSIF NOT prev_space THEN
            -- First non-alphanumeric of a run: emit exactly one space.
            out_string := out_string || ' ';
            prev_space := TRUE;
        END IF;
        -- Further non-alphanumerics in the same run are dropped.
    END LOOP;

    RETURN TRIM(UPPER(out_string));
END;
-- Set the PL/SQL code type for this session to NATIVE.
ALTER SESSION SET PLSQL_CODE_TYPE=NATIVE;
-- Recompile clean_string; no REUSE SETTINGS clause is given, so presumably the
-- session setting above applies to this compilation (confirm in the Oracle docs).
ALTER function clean_string COMPILE;
为了测试,我从一个表中提取了 500 万行并清理了一些字符串:
SET SERVEROUTPUT ON
DECLARE
    -- Rows with clean_string applied to every column inside the query itself.
    CURSOR sel_cur1 IS
        SELECT name,       clean_string(name)       AS cln_name,
               address1,   clean_string(address1)   AS cln_addr1,
               address2,   clean_string(address2)   AS cln_addr2,
               city,       clean_string(city)       AS cln_city,
               state,      clean_string(state)      AS cln_state,
               postalcode, clean_string(postalcode) AS cln_zip
          FROM my_table
         WHERE ROWNUM <= 5000000;

    -- The same rows, uncleaned; the cleaning is done by the PL/SQL loop bodies.
    CURSOR sel_cur2 IS
        SELECT name, address1, address2, city, state, postalcode
          FROM my_table
         WHERE ROWNUM <= 5000000;

    v_rows      INTEGER := 0;
    v_name      VARCHAR2(100);
    v_addr1     VARCHAR2(100);
    v_addr2     VARCHAR2(100);
    v_city      VARCHAR2(100);
    v_state     VARCHAR2(100);
    v_zip       VARCHAR2(100);
    v_elapsed   INTERVAL DAY TO SECOND(4);
    v_started   TIMESTAMP;
    v_finished  TIMESTAMP;
BEGIN
    -- Run 1: fetch raw rows and call clean_string from PL/SQL.
    v_started := SYSTIMESTAMP;
    FOR r IN sel_cur2
    LOOP
        v_rows  := v_rows + 1;
        v_name  := clean_string(r.name);
        v_addr1 := clean_string(r.address1);
        v_addr2 := clean_string(r.address2);
        v_city  := clean_string(r.city);
        v_state := clean_string(r.state);
        v_zip   := clean_string(r.postalcode);
    END LOOP;
    v_finished := SYSTIMESTAMP;
    v_elapsed  := v_finished - v_started;
    DBMS_OUTPUT.PUT_LINE('Procedural approach timing: ' || v_elapsed);

    -- Run 2: the query calls clean_string; the loop only counts rows.
    v_rows    := 0;
    v_started := SYSTIMESTAMP;
    FOR r IN sel_cur1
    LOOP
        v_rows := v_rows + 1;
    END LOOP;
    v_finished := SYSTIMESTAMP;
    v_elapsed  := v_finished - v_started;
    DBMS_OUTPUT.PUT_LINE('SQL approach timing: ' || v_elapsed);

    -- Run 3: fetch raw rows and apply the nested REGEXP_REPLACE expression.
    v_rows    := 0;
    v_started := SYSTIMESTAMP;
    FOR r IN sel_cur2
    LOOP
        v_rows  := v_rows + 1;
        v_name  := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(r.name, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
        v_addr1 := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(r.address1, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
        v_addr2 := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(r.address2, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
        v_city  := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(r.city, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
        v_state := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(r.state, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
        v_zip   := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(r.postalcode, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
    END LOOP;
    v_finished := SYSTIMESTAMP;
    v_elapsed  := v_finished - v_started;
    DBMS_OUTPUT.PUT_LINE('Existing approach timing: ' || v_elapsed);
END;
输出是:
Procedural approach timing: +00 00:02:04.0320
SQL approach timing: +00 00:02:49.4326
Existing approach timing: +00 00:05:50.1607
此外,本机编译似乎只对过程式方法有帮助(对从 SQL 查询中调用函数的方式没有帮助),但这两种方式似乎都比 regexp_replace 方案快得多。希望有帮助。
首先,让我说我并没有真正回答我自己的问题,但我接受了 tbone 的回答。提供此答案的原因是评论不允许我发布我真正想要的内容。
我通过一些调整创建了一个与 tbone 几乎相同的函数,通过更改我处理小写字符范围的方式摆脱了 UPPER,并将数字更改为 binary_integer。
FUNCTION CLEAN_STRING(IN_STRING in VARCHAR2) RETURN VARCHAR2
AS
  -- Purpose: return IN_STRING reduced to upper-case A-Z, 0-9 and single
  --          spaces. Lower-case letters are upper-cased, every other
  --          character is treated as a space, runs of spaces collapse to one
  --          and leading/trailing spaces are removed.
  -- Param:   IN_STRING - text to normalise; NULL/empty input yields NULL.
  -- Returns: the normalised text (NULL when nothing but spaces remains).
  OUT_STRING VARCHAR2(32767);
  -- Character (not byte) semantics: SUBSTR returns one character, which may
  -- occupy several bytes in a multibyte database character set.
  IN_CHAR    VARCHAR2(1 CHAR);
  -- NUMBER rather than BINARY_INTEGER: in a multibyte database character set
  -- ASCII() of a 4-byte character exceeds 2**31-1 and would overflow.
  DEC_CHAR   NUMBER;
  -- TRUE while the last character appended to OUT_STRING was a space.
  PREV_SPACE BOOLEAN := FALSE;
BEGIN
  -- NVL makes the loop range well-defined for NULL/empty input.
  FOR CNT IN 1 .. NVL(LENGTH(IN_STRING), 0)
  LOOP
    IN_CHAR  := SUBSTR(IN_STRING, CNT, 1);
    DEC_CHAR := ASCII(IN_CHAR);
    IF (DEC_CHAR >= 48 AND DEC_CHAR <= 57) OR
       (DEC_CHAR >= 65 AND DEC_CHAR <= 90)
    THEN
      -- Digit or upper-case letter: keep as is.
      OUT_STRING := OUT_STRING || IN_CHAR;
      PREV_SPACE := FALSE;
    ELSIF (DEC_CHAR >= 97 AND DEC_CHAR <= 122)
    THEN
      -- Lower-case letter: shift by 32 to its upper-case code.
      OUT_STRING := OUT_STRING || CHR(DEC_CHAR - 32);
      PREV_SPACE := FALSE;
    ELSIF NOT PREV_SPACE
    THEN
      -- First non-alphanumeric of a run: emit exactly one space.
      OUT_STRING := OUT_STRING || ' ';
      PREV_SPACE := TRUE;
    END IF;
    -- Further non-alphanumerics in the same run are dropped.
  END LOOP;
  RETURN TRIM(OUT_STRING);
END CLEAN_STRING;
然后我像 tbone 一样创建了一个简单的测试装置,但我测试了三个不同的例程。首先,我验证它们是否都返回相同的结果,然后对每个例程进行计时。这是测试台;
set serveroutput on
DECLARE
  -- Sample of diagnosis texts used both for the equivalence check and the timings.
  CURSOR PATHMAST_CURS
  IS
    SELECT PATHMAST_TEXT_DIAGNOSIS FROM PATHMAST WHERE ROWNUM < 100000;
  -- Characters TRANSLATE must rewrite: a-z (upper-cased) followed by the
  -- special characters that must become spaces.
  TRANSLATE_FROM CONSTANT VARCHAR2(200 CHAR) :=
    'abcdefghijklmnopqrstuvwxyz`~!@#$%^&*()''_+-={[}]|/\":;,.<>?µ’±€'||chr(9);
  -- TRANSLATE deletes any from-character that has no counterpart in the
  -- to-string, so pad with one space per special character instead of
  -- relying on a hand-typed run of blanks.
  TRANSLATE_TO CONSTANT VARCHAR2(200 CHAR) :=
    RPAD('ABCDEFGHIJKLMNOPQRSTUVWXYZ', LENGTH(TRANSLATE_FROM), ' ');
  DUMMY CLOB;
  DUMMY_1 CLOB;
  DUMMY_2 CLOB;
  l_interval interval day to second(4);
  l_start timestamp;
  l_end timestamp;
  diff_count_1 binary_integer := 0;
  diff_count_2 binary_integer := 0;

  -- NULL-safe inequality: a plain != yields NULL (not TRUE) when exactly one
  -- side is NULL, which would hide that mismatch from the counters.
  FUNCTION DIFFERS(A IN CLOB, B IN CLOB) RETURN BOOLEAN
  IS
  BEGIN
    IF A IS NULL OR B IS NULL
    THEN
      RETURN NOT (A IS NULL AND B IS NULL);
    END IF;
    RETURN A != B;
  END DIFFERS;
BEGIN
  -- Pass 1: verify the three routines agree, with the regexp result as reference.
  FOR PATH_REC IN PATHMAST_CURS
  LOOP
    DUMMY := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(NVL(PATH_REC.PATHMAST_TEXT_DIAGNOSIS,' '), '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
    DUMMY_1 := pathmast_utility_3.CLEAN_STRING(NVL(PATH_REC.PATHMAST_TEXT_DIAGNOSIS,' '));
    DUMMY_2 := regexp_replace(trim(translate(NVL(PATH_REC.PATHMAST_TEXT_DIAGNOSIS,' '),TRANSLATE_FROM,TRANSLATE_TO)),'( )* ',' ');
    IF DIFFERS(DUMMY_1, DUMMY)
    THEN
      diff_count_1 := diff_count_1 + 1;
    END IF;
    IF DIFFERS(DUMMY_2, DUMMY)
    THEN
      diff_count_2 := diff_count_2 + 1;
      dbms_output.put_line('Regexp: ' || DUMMY);
      dbms_output.put_line('Translate: ' || DUMMY_2);
    END IF;
  END LOOP;
  dbms_output.put_line('CLEAN_STRING differences: ' || diff_count_1);
  dbms_output.put_line('Translate differences: ' || diff_count_2);
  -- Pass 2: time the nested REGEXP_REPLACE expression.
  l_start := systimestamp;
  FOR PATH_REC IN PATHMAST_CURS
  LOOP
    DUMMY := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(PATH_REC.PATHMAST_TEXT_DIAGNOSIS, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
  END LOOP;
  l_end := systimestamp;
  l_interval := l_end - l_start;
  dbms_output.put_line('Regexp approach timing: ' || l_interval);
  -------------------------------------------------
  -- Pass 3: time the hand-written CLEAN_STRING function.
  l_start := systimestamp;
  FOR PATH_REC IN PATHMAST_CURS
  LOOP
    DUMMY := pathmast_utility_3.CLEAN_STRING(PATH_REC.PATHMAST_TEXT_DIAGNOSIS);
  END LOOP;
  l_end := systimestamp;
  l_interval := l_end - l_start;
  dbms_output.put_line('CLEAN_STRING approach timing: ' || l_interval);
  -------------------------------------------------
  -- Pass 4: time TRANSLATE plus a single REGEXP_REPLACE for space runs.
  l_start := systimestamp;
  FOR PATH_REC IN PATHMAST_CURS
  LOOP
    DUMMY := regexp_replace(trim(translate(NVL(PATH_REC.PATHMAST_TEXT_DIAGNOSIS,' '),TRANSLATE_FROM,TRANSLATE_TO)),'( )* ',' ');
  END LOOP;
  l_end := systimestamp;
  l_interval := l_end - l_start;
  dbms_output.put_line('TRANSLATE approach timing: ' || l_interval);
  -------------------------------------------------
END;
这是结果;
anonymous block completed
CLEAN_STRING differences: 0
Translate differences: 0
Regexp approach timing: +00 00:00:52.9160
CLEAN_STRING approach timing: +00 00:00:05.5220
TRANSLATE approach timing: +00 00:00:13.4320
以上测试均未使用本机(native)编译。所以 tbone 的方法是大赢家。谢谢。
如果出于任何原因您想要/需要使用 TRANSLATE 版本,您应该以编程方式构建转换字符串,以涵盖所有特殊字符。
也许,您可以使用 TRANSLATE 而不是正则表达式来删除特殊字符并将小写转换为大写。
-- Normalise x: upper-case a-z, turn the listed special characters into
-- spaces, trim, then collapse runs of two or more spaces into one.
regexp_replace(
trim(
translate(x,
'abcdefghijklmnopqrstuvwxyz`~!@#$%^&*()_+-={[}]|/\"'':;,.<>?',
-- TRANSLATE deletes from-characters that have no counterpart in the
-- to-string, so pad A-Z with spaces up to the from-list length
-- (26 letters + 32 special characters = 58).
rpad('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 58, ' ')
)
),
' {2,}',
' '
)
在一个有 1000 行的表上进行了尝试,其中该列存放的是长度在 1 到 4000 之间的随机字符串。结果耗时减少了大约 35%。(未在 PL/SQL 中尝试。)