create extension pg_trgm; create table info ( patid integer, fullname jsonb, constraint info_pk primary key (patid), constraint fullname_exists check ( fullname ? 'lname'::text and fullname ? 'fname'::text and fullname ? 'sname'::text ), constraint fullname_notnull check ( (fullname ->> 'lname'::text) is not null and (fullname ->> 'fname'::text) is not null ) ); create function fullname(in_fullname jsonb) returns text language plpgsql immutable as $$ begin return regexp_replace( lower( trim( coalesce(in_fullname->>'lname', '') || ' ' || coalesce(in_fullname->>'fname', '') || ' ' || coalesce(in_fullname->>'sname', '') ) ), '', '', 'g' ); exception when others then raise exception '%', sqlerrm; end; $$;
create index info_gist_idx on info using gist (fullname(fullname) gist_trgm_ops); CREATE INDEX Time: 15054,102 ms explain (analyze, buffers) select patid, fullname(fullname) <-> ' ' as dist from info order by dist limit 10; Limit (cost=0.28..4.35 rows=10 width=8) (actual time=157.378..157.688 rows=10 loops=1) Buffers: shared hit=5743 -> Index Scan using info_gist_idx on info (cost=0.28..126822.96 rows=312084 width=8) (actual time=157.371..157.655 rows=10 loops=1) Order By: (fullname(fullname) <-> ' '::text) Buffers: shared hit=5743 Planning time: 0.225 ms Execution time: 158.223 ms (7 rows)
create index info_trgm_idx on info using gin(fullname(fullname) gin_trgm_ops); CREATE INDEX Time: 10163,401 ms explain (analyze, buffers) select patid, similarity(fullname(fullname), ' ' ) as sml from info where true and fullname(fullname) % ' ' order by sml desc limit 10; Limit (cost=1180.22..1180.25 rows=10 width=8) (actual time=133.086..133.117 rows=8 loops=1) Buffers: shared hit=5741 -> Sort (cost=1180.22..1181.00 rows=312 width=8) (actual time=133.080..133.090 rows=8 loops=1) Sort Key: (similarity(fullname(fullname), ' '::text)) DESC Sort Method: quicksort Memory: 25kB Buffers: shared hit=5741 -> Bitmap Heap Scan on info (cost=26.70..1173.48 rows=312 width=8) (actual time=132.828..133.048 rows=8 loops=1) Recheck Cond: (fullname(fullname) % ' '::text) Heap Blocks: exact=7 Buffers: shared hit=5741 -> Bitmap Index Scan on info_gist_idx (cost=0.00..26.62 rows=312 width=0) (actual time=132.699..132.699 rows=8 loops=1) Index Cond: (fullname(fullname) % ' '::text) Buffers: shared hit=5734 Planning time: 0.573 ms Execution time: 133.225 ms (15 rows)
(||)<->()<->()
without accessing the table for more information about the order of the token in the tuple. Moreover, the recommendation for gin is the physical existence of the tsvector column, since all found pointers to tuples will have to be rechecked in the table. And if physically there is no tsvector column (you built it with a function for the index), then for each tuple you will have to perform an additional tsvector calculation. In general, the rum in this story will be much more productive. create extension rum; create index info_rum_idx on info using rum ( to_tsvector('simple'::regconfig, fullname(fullname)) rum_tsvector_ops ); CREATE INDEX Time: 7.545s (7 seconds) create table patname ( lex text, constraint patname_uniq_idx unique (lex) ); create index patname_fuzzy_idx on patname using gin (lex gin_trgm_ops); CREATE INDEX Time: 0.596s insert into patname (lex) select word from ts_stat($$ select to_tsvector('simple', fullname(fullname)) from info $$); explain (analyze, buffers) with fio as ( select lexeme as lex, positions[1] as pos from unnest(to_tsvector('simple',' ')) ), query as( select to_tsquery('simple', string_agg(q.tq,'&')) as q from ( select f.pos, '('||string_agg(p.lex,'|')||')' as tq from fio as f join patname as p on p.lex % f.lex group by f.pos ) as q ) select to_tsvector('simple'::regconfig, fullname(fullname)) <=> (select q from query) as rank, * from info where to_tsvector('simple'::regconfig, fullname(fullname)) @@ (select q from query) order by to_tsvector('simple'::regconfig, fullname(fullname)) <=> (select q from query); Sort (cost=6453.71..6457.61 rows=1560 width=100) (actual time=68.201..68.202 rows=1 loops=1) Sort Key: ((to_tsvector('simple'::regconfig, fullname(info.fullname)) <=> $3)) Sort Method: quicksort Memory: 25kB Buffers: shared hit=536 CTE fio -> Function Scan on unnest (cost=0.00..0.10 rows=10 width=34) (actual time=0.023..0.034 rows=3 loops=1) CTE query -> Aggregate (cost=1484.60..1484.86 rows=1 width=32) (actual time=11.829..11.830 rows=1 loops=1) Buffers: shared hit=325 -> HashAggregate (cost=1484.30..1484.48 rows=10 width=34) (actual time=11.640..11.644 rows=2 loops=1) Group Key: f.pos Buffers: shared hit=325 -> Nested Loop (cost=16.58..1480.53 rows=755 width=19) (actual time=2.940..11.442 rows=62 loops=1) Buffers: shared hit=325 -> CTE Scan on fio f (cost=0.00..0.20 rows=10 width=34) (actual time=0.028..0.053 rows=3 loops=1) -> Bitmap Heap Scan on patname p (cost=16.58..147.28 rows=75 width=17) (actual time=1.905..3.717 rows=21 loops=3) Recheck Cond: (lex % f.lex) Rows Removed by Index Recheck: 321 Heap Blocks: exact=275 Buffers: shared hit=325 -> Bitmap Index Scan on patname_fuzzy_idx (cost=0.00..16.57 rows=75 width=0) (actual time=1.277..1.277 rows=342 loops=3) Index Cond: (lex % f.lex) Buffers: shared hit=50 InitPlan 3 (returns $3) -> CTE Scan on query (cost=0.00..0.02 rows=1 width=32) (actual time=0.004..0.006 rows=1 loops=1) InitPlan 4 (returns $4) -> CTE Scan on query query_1 (cost=0.00..0.02 rows=1 width=32) (actual time=11.834..11.839 rows=1 loops=1) Buffers: shared hit=325 -> Bitmap Heap Scan on info (cost=31.99..4885.97 rows=1560 width=100) (actual time=68.184..68.187 rows=1 loops=1) Recheck Cond: (to_tsvector('simple'::regconfig, fullname(fullname)) @@ $4) Heap Blocks: exact=1 Buffers: shared hit=536 -> Bitmap Index Scan on info_rum_idx (cost=0.00..31.60 rows=1560 width=0) (actual time=67.847..67.847 rows=1 loops=1) Index Cond: (to_tsvector('simple'::regconfig, fullname(fullname)) @@ $4) Buffers: shared hit=517 Planning time: 5.012 ms Execution time: 68.583 ms (37 rows)
create or replace function phoneme (in_lexeme text) returns text language plpython3u immutable as $$ import re class Lexeme: def __init__(self, body): """ :type body: str """ self.body = body.lower().strip() # self._vowels = {"(?:|||)": "", "[]": "", "[]": "", "[]": ""} # self._consonants = {"": "", "": "", "": "", "": ""} # , _deafening_chars self._deafening_chars = ["", "", "", "", "", "", ""] # self._removable_chars = {"[]": ""} def _remove_double_chars(self): return Lexeme("".join((char for num, char in enumerate(self.body) if char != self.body[num - 1]))) def _deafen_consonants(self): modified_body = "" for num, char in enumerate(self.body): if char in self._consonants and ( num < len(self.body) - 1 and self.body[num + 1] in self._deafening_chars or num == len(self.body) - 1 ): modified_body += self._consonants[char] else: modified_body += char return Lexeme(modified_body) @staticmethod def _regexp_replace(text, char_dict): modified_body = text for item in char_dict: modified_body = re.sub(item, char_dict[item], modified_body) return Lexeme(modified_body) def _replace_vowels(self): return self._regexp_replace(self.body, self._vowels) def _remove_chars(self): return self._regexp_replace(self.body, self._removable_chars) def metaphone(self): return self._remove_chars()._replace_vowels()._deafen_consonants()._remove_double_chars().body return Lexeme(in_lexeme).metaphone() $$; create or replace function metaphone (in_phonemes text) returns text language plpgsql immutable as $$ begin return ( select string_agg(q.lex,' ') from ( select phoneme(lexeme) as lex from unnest(to_tsvector('simple', in_phonemes)) order by positions ) as q ); exception when others then raise '%', SQLERRM using errcode = SQLSTATE; end; $$; create index info_metaphone_idx on info ( metaphone(fullname(fullname)) text_pattern_ops ); CREATE INDEX Time: 114.757s (a minute) explain (analyze, buffers) select patid, fullname from info where metaphone(fullname(fullname)) like regexp_replace(metaphone(' '),'\s','%','g')||'%' limit 10; Limit (cost=76.03..1388.96 rows=10 width=96) (actual time=22.452..129.944 rows=3 loops=1) Buffers: shared hit=239 -> Bitmap Heap Scan on info (cost=76.03..4146.10 rows=31 width=96) (actual time=22.447..129.927 rows=3 loops=1) Filter: (metaphone(fullname(fullname)) ~~ '%%%'::text) Rows Removed by Filter: 244 Heap Blocks: exact=234 Buffers: shared hit=239 -> Bitmap Index Scan on info_metaphone_idx (cost=0.00..76.02 rows=1560 width=0) (actual time=0.061..0.061 rows=247 loops=1) Index Cond: ((metaphone(fullname(fullname)) ~>=~ ''::text) AND (metaphone(fullname(fullname)) ~<~ ''::text)) Buffers: shared hit=5 Planning time: 1.012 ms Execution time: 129.977 ms (12 rows) Time: 131,802 ms
create index info_metaphone_trgm_idx on info using gin (metaphone(fullname(fullname)) gin_trgm_ops); CREATE INDEX Time: 124.713s (2 minutes) explain (analyze, buffers) select patid, fullname from info where metaphone(fullname(fullname)) like '%'||regexp_replace(metaphone(' '),'\s','%','g')||'%' limit 10; Limit (cost=92.24..134.98 rows=10 width=96) (actual time=9.562..10.638 rows=3 loops=1) Buffers: shared hit=103 -> Bitmap Heap Scan on info (cost=92.24..224.74 rows=31 width=96) (actual time=9.554..10.617 rows=3 loops=1) Recheck Cond: (metaphone(fullname(fullname)) ~~ '%%%%'::text) Heap Blocks: exact=2 Buffers: shared hit=103 -> Bitmap Index Scan on info_metaphone_trgm_idx (cost=0.00..92.23 rows=31 width=0) (actual time=8.354..8.354 rows=3 loops=1) Index Cond: (metaphone(fullname(fullname)) ~~ '%%%%'::text) Buffers: shared hit=101 Planning time: 2.029 ms Execution time: 10.726 ms (11 rows) Time: 14,480 ms
create or replace function phoneme (in_lexeme text) returns text language plpgsql immutable as $$ declare res varchar(100) DEFAULT ''; begin res := lower(in_lexeme); res := regexp_replace(res,'[]','','g'); res := regexp_replace(res,'(|||)','','g'); res := regexp_replace(res,'[]','','g'); res := regexp_replace(res,'[]','','g'); res := regexp_replace(res,'','','g'); res := regexp_replace(res,'([]|$)','\1','g'); res := regexp_replace(res,'([]|$)','\1','g'); res := regexp_replace(res,'([]|$)','\1','g'); res := regexp_replace(res,'([]|$)','\1','g'); res := regexp_replace(res,'([]|$)','\1','g'); res := regexp_replace(res,'','','g'); res := regexp_replace(res,'','','g'); res := regexp_replace(res,'(.)\1','\1','g'); return res; exception when others then raise exception '%', sqlerrm; end; $$;
Search type | Index creation time | Index size | Missing search speed | Remarks |
Gist trigrams | 15 sec | 45 MB | 158 ms | |
Gin trigrams | 10 sec | 18 MB | 133 ms | |
Trigrams and full text search | 7.6 seconds | 18.8 MB | 68 ms | Worse selectivity, you need to maintain a dictionary of tokens |
Metaphone btree | 114 seconds | 22 MB | 131 ms | Plpython3u insecure language |
Metaphone trigram | 124 sec | 15 MB | 14 ms | Plpython3u insecure language |
The implementation of the metagram trigram from movEAX | 77.8 seconds | 16 MB | 14 ms | Plpython3u insecure language |
Implementation of Ivan Milovanov on plpgsql | 72.0 seconds | 16 MB | 14 ms |
select metaphone(' ') similar to mquery(' ');
create or replace function mquery(in_fullname text) returns text language plpgsql immutable as $$ declare res text; begin res := metaphone(in_fullname); res := regexp_replace(res, '(|)', '(|)', 'g'); res := regexp_replace(res, '(|)', '(|)', 'g'); res := regexp_replace(res, '(|)', '(|)', 'g'); res := regexp_replace(res, '(|)', '(|)', 'g'); res := regexp_replace(res, '(|)', '(|)', 'g'); res := regexp_replace(res, '\s', '%', 'g'); return '%'||res||'%'; exception when others then raise exception '%', sqlerrm; end; $$;
Source: https://habr.com/ru/post/341142/
All Articles