PostgreSQL 實踐 - 內容社區(如論壇)圖式搜索應用

標簽

PostgreSQL , 圖數據庫 , 論壇搜索 , 推薦 , 數組 , smlar相似搜索

背景

通常一個內容社區網站可能需要記錄這麼一些數據: 文章，用戶，標簽。

還有三者之間的關係，包括，文章的標簽，用戶閱讀了文章，用戶收藏了文章，用戶關注了某用戶，用戶是某篇文章的作者。

最終要實現毫無人道的查詢，例如：

閱讀了此篇文章的人還在閱讀什麼其他文章，和我愛好相近的人都有哪些等等等等。

其中文章數量幾千萬，用戶數量接近一千萬。

如何實現這樣的需求呢？

實際上PostgreSQL裏麵的數組、smlar實現這個需求非常的方便。下麵開始設計和壓測。

數組用於存儲正向和反向關係，標簽等。

smlar用於查詢相似的數組（找出愛好相似的人）。

設計

元數據

1、用戶表

create table users(  
  uid int primary key,             -- 用戶ID  
  info text,           -- 附加信息  
  crt_time timestamp   -- 時間  
);

2、標簽表

create table tags(  
  tagid int primary key,              -- 標簽ID  
  info text,              -- 附加信息   
  crt_time timestamp      -- 時間  
);

3、文章表

create table arts(  
  artid int primary key,      -- 文章ID  
  info text,      -- 附加信息、內容  
  uids int[],     -- 用戶IDs（作者可能是多人，所以使用數組）  
  tags int[]      -- 標簽  
);

關係數據

1、正向關係

1.1、文章被誰看過

create table art_uids_view (  
  artid int primary key,  
  uids int[]  
);

1.2、文章被誰收藏過

create table art_uids_like (  
  artid int primary key,  
  uids int[]  
);

2、反向關係

2.1、用戶看過哪些文章，包含哪些標簽

create table uid_arts_view (  
  uid int primary key,  
  arts int[],  
  tags int[]  
);

2.2、用戶收藏了哪些文章，包含哪些標簽

create table uid_arts_like (  
  uid int primary key,  
  arts int[],  
  tags int[]  
);

查詢

1、閱讀了此篇文章的其他人還在閱讀什麼其他文章，(過濾當前文章、以及我閱讀過的文章)。

邏輯如下，寫成UDF即可：

create extension intarray ;  -- 創建intarry插件，簡化數組的加減  
  
select (uids - $current_uid) into v1 from art_uids_view where artid = $current_artid ;  -- 取出閱讀了當前文章的所有用戶(減去當前用戶)  
  
select (unnest(arts) as arts, count(*) as cnt) into v2 from uid_arts_view where uid = any (v1) group by 1 ;  -- 獲取閱讀了同樣文章的人，還閱讀了哪些文章  
  
select arts into v3 from uid_arts_view where uid= $current_uid ;          -- 當前用戶閱讀了哪些文章  
  
result = v2.arts - v3 ;   -- 其他人閱讀的所有文章 減去 當前用戶閱讀的文章，得到其他人閱讀的文章。  按重疊數從大到小排序，推薦給用戶

UDF如下，都能使用索引，都是聚合後的點查，性能很讚：

create or replace function rec_arts_view(  
  i1 int,  -- 文章ID  
  i2 int,  -- 當前用戶ID  
  i3 int   -- limit  
) returns setof int as $$  
declare  
  res int[];  -- 結果  
  v1 int[];   -- 文章被哪些用戶閱讀了  
begin  
    
  -- 文章被哪些用戶閱讀了  
  select (uids - i2) into v1 from art_uids_view where artid = i1 ;  
    
  -- 閱讀了這篇文章的其他用戶，閱讀了哪些文章，排除當前用戶已閱讀的，按重複率排序，返回N條。  
  -- 如果閱讀了該文章的其他人，還閱讀了很多其他文章，排序可能會略耗時。
  return query  
  select t1.arts from   
  (  
    select unnest(arts) arts, count(*) cnt from uid_arts_view where uid = any (v1) group by 1  
  ) t1  
  left join  
  (  
    select unnest(arts) arts, 1 cnt from uid_arts_view where uid= i2   
  ) t2  
  on (t1.arts=t2.arts)   
  where t2.* is null  
  order by t1.cnt desc  
  limit i3;  
  
end;  
$$ language plpgsql strict;

2、與我（閱讀文章）愛好相近的人有哪些，走GIN索引，性能很讚。

create extension smlar;  
  
set smlar.type='overlap';        
set smlar.threshold=?;             -- 設置重疊閾值  
    
select arts into v1 from uid_arts_view where uid = ?;       -- 我閱讀了哪些文章    
  
select      
    *,      
    smlar( arts, v1, 'N.i' )       -- 其他人與我閱讀的文章的重疊數是多少  
  from      
    uid_arts_view      
  where      
    arts % v1                      -- where cosine similarity >= smlar.threshold      
;

3、與我（閱讀文章標簽）愛好相近的人有哪些。

與2類似，略。

4、與我（收藏文章）愛好相近的人有哪些。

與2類似，略。

5、與我（收藏文章標簽）愛好相近的人有哪些。

與2類似，略。

生成正反向關係的UDF

使用UDF，減少交互次數，完成以下幾類業務邏輯的操作。UDF可以使用plpgsql編寫，很簡單，本文略：

https://www.postgresql.org/docs/10/static/plpgsql.html

1、新建文章的行為，自動產生標簽，並更新或追加標簽表。

insert into tags values ();  
  
insert into arts values ();

2、閱讀行為，修改正向反向關係。

文章的tags信息從arts裏獲取  
  
insert into art_uids_view values ();  
  
insert into uid_arts_view values ();

3、收藏行為，修改正向反向關係。

文章的tags信息從arts裏獲取  
  
insert into art_uids_like values ();  
  
insert into uid_arts_like values ();

索引

-- smlar 相似查詢  
create index idx_gin_1 on art_uids_view using gin ( uids _int4_sml_ops );    
create index idx_gin_2 on art_uids_like using gin ( uids _int4_sml_ops );    
  
create index idx_gin_3 on uid_arts_view using gin ( arts _int4_sml_ops );    
create index idx_gin_4 on uid_arts_view using gin ( tags _int4_sml_ops );    
  
create index idx_gin_5 on uid_arts_like using gin ( arts _int4_sml_ops );    
create index idx_gin_6 on uid_arts_like using gin ( tags _int4_sml_ops );    
  
create index idx_gin_7 on art_uids_view using gin ( uids _int4_sml_ops );    
create index idx_gin_8 on art_uids_like using gin ( uids _int4_sml_ops );

可選索引

-- 數組相交、包含查詢  
create index idx_gin_01 on art_uids_view using gin ( uids gin__int_ops );    
create index idx_gin_02 on art_uids_like using gin ( uids gin__int_ops );    
  
create index idx_gin_03 on uid_arts_view using gin ( arts gin__int_ops );    
create index idx_gin_04 on uid_arts_view using gin ( tags gin__int_ops );    
  
create index idx_gin_05 on uid_arts_like using gin ( arts gin__int_ops );    
create index idx_gin_06 on uid_arts_like using gin ( tags gin__int_ops );    
  
create index idx_gin_07 on art_uids_view using gin ( uids gin__int_ops );    
create index idx_gin_08 on art_uids_like using gin ( uids gin__int_ops );

填充測試數據

1、生成1000萬用戶

insert into users select id, md5(id::text), now() from generate_series(1,10000000) t(id);

2、生成10萬標簽

insert into tags select id, md5(id::text), now() from generate_series(1,100000) t(id);

3、生成5000萬文章

create or replace function gen_arr(int,int) returns int[] as $$  
  select array(select ceil(random()*$1) from generate_series(1,$2))::int[];  
$$ language sql strict;

insert into arts select id, md5(id::text),   
  gen_arr(10000000 ,3),    
  gen_arr(100000 ,10)   
from generate_series(1,50000000) t(id);

4、生成正向關係，平均每篇文章被500人閱讀，被50人收藏。

insert into art_uids_view select id, gen_arr(10000000, 500) from generate_series(1,50000000) t(id);  
  
insert into art_uids_like select id, gen_arr(10000000, 50) from generate_series(1,50000000) t(id);

5、生成反向關係（按理說，反向關係和正向關係應該一一對應，為了測試方便，我這裏就不對應了，測試效果是一樣的）

平均每人閱讀1000篇文章，涉及500個標簽。收藏100篇文章，涉及50個標簽。

insert into uid_arts_view select id, gen_arr(50000000, 1000), gen_arr(100000, 500) from generate_series(1,10000000) t(id);  
  
insert into uid_arts_like select id, gen_arr(50000000, 100), gen_arr(100000, 50) from generate_series(1,10000000) t(id);

性能測試

1、閱讀了此篇文章的其他人還在閱讀什麼其他文章，(過濾當前文章、以及我閱讀過的文章)。

select rec_arts_view(1,2,10);    -- 文章ID=1， 當前用戶ID=2，  返回10條推薦文章給當前用戶。

其他人一共閱讀了約50萬其他文章，獲取加排序耗時：200毫秒。

postgres=# select count(*) from rec_arts_view(1,4,1000000);
 count  
--------
 497524
(1 row)

Time: 565.524 ms

postgres=# select count(*) from rec_arts_view(1,4,10);
 count 
-------
    10
(1 row)

Time: 198.368 ms

2、與我（閱讀文章）愛好相近的人有哪些。

set smlar.type='overlap';        
set smlar.threshold=10;             -- 設置重疊閾值  
    
select arts into v1 from uid_arts_view where uid = 1;       -- 我閱讀了哪些文章  
  
select      
    *,      
    smlar( arts, v1, 'N.i' )       -- 其他人與我閱讀的文章的重疊數是多少  
  from      
    uid_arts_view      
  where      
    arts % v1                      -- where cosine similarity >= smlar.threshold      
;

耗時：2.4毫秒。

 Bitmap Heap Scan on public.uid_arts_view  (cost=933.50..29296.93 rows=10000 width=72) (actual time=1.955..2.351 rows=2 loops=1)
   Output: uid, arts, tags, smlar(arts, '{25213311,49886221,45108456,27929256,20760231,35023889,17507871,43947072,48578113,41561690,39557908,26852171,29310889,5029778,24892381,12174141,9191797,41397570,25795440,27806324,28635176}'::integer[], 'N.i'::text)
   Recheck Cond: (uid_arts_view.arts % '{25213311,49886221,45108456,27929256,20760231,35023889,17507871,43947072,48578113,41561690,39557908,26852171,29310889,5029778,24892381,12174141,9191797,41397570,25795440,27806324,28635176}'::integer[])
   Heap Blocks: exact=2
   Buffers: shared hit=107
   ->  Bitmap Index Scan on idx_gin_3  (cost=0.00..931.00 rows=10000 width=0) (actual time=1.506..1.506 rows=2 loops=1)
         Index Cond: (uid_arts_view.arts % '{25213311,49886221,45108456,27929256,20760231,35023889,17507871,43947072,48578113,41561690,39557908,26852171,29310889,5029778,24892381,12174141,9191797,41397570,25795440,27806324,28635176}'::integer[])
         Buffers: shared hit=85
 Planning time: 0.110 ms
 Execution time: 2.378 ms
(10 rows)

預計算與性能優化

前麵的推薦文章、找相似的人。指的是實時查詢的性能，而實際這些操作都可以預計算的(因為文章增量不會太大、而且文章的閱讀人群變化不會太大)，例如一天刷新一次，那麼像用戶推薦相似用戶，推薦相似文章時，有預計算則直接查詢結果，那性能會提升到0.0N毫秒級響應。沒有預計算的新文章，則實時查詢（並更新到預計算的表中），也能夠毫秒級響應。

預計算還可以做成另一種模式，當有人查詢這篇文章時，根據上次預計算的時間，決定是否需要重新查詢，並更新它。（也就是說，實時計算 + 緩存 + 緩存超時的模式。）

邏輯如下

select xxx from pre_view_tbl where xxx=xxx;  -- 查詢緩存，return

-- 寫入或更新緩存
if not found then
  -- 同步寫入
  insert into pre_view_tbl select xxxx returning *; -- 實時計算, 並返回
else if mod_time < (now() - 超時閾值) then
  -- 異步
  delete from pre_view_tbl where xxx=xxx;
  insert into pre_view_tbl select xxxx; -- 實時計算
end if;

小結

3分開發，7分運營。內容網站與社交軟件類似，運營是重頭戲。運營中關鍵的一環是圈子，圈子可以聚人氣，形成圈子往往靠的是推薦，推薦的源頭又是行為，推薦什麼樣的內容、人給目標，靠的是行為。所謂物以類聚，人以群居，就是這個理。

PostgreSQL 的數組、smlar實現高效的歸類查詢、推薦需求非常的方便。

1、數組用於存儲正向和反向關係，標簽等。

2、smlar用於查詢相似的數組（找出愛好相似的人）。

在社交運營、內容運營場景中，非常方便、高效。

熱點人、熱點文章也不在話下，在其他案例中已經測試過，可以參考本文末尾。

參考

https://www.postgresql.org/docs/10/static/plpgsql.html

《電商內容去重\內容篩選應用(實時識別轉載\盜圖\侵權?) - 文本、圖片集、商品集、數組相似判定的優化和索引技術》

https://www.postgresql.org/docs/10/static/intarray.html

計數、實時需求也口可以使用流計算，案例參考：

《三體高可用PCC大賽 - facebook\微博 like場景 - 數據庫設計與性能壓測》

https://github.com/bitnine-oss/agensgraph

最後更新：2017-10-28 23:34:00

PostgreSQL 實踐 - 內容社區(如論壇)圖式搜索應用

標簽

背景

如何實現這樣的需求呢？

設計

元數據

關係數據

查詢

生成正反向關係的UDF

索引

填充測試數據

性能測試

預計算與性能優化

小結

參考

上一篇： PostgreSQL flashback(閃回) 功能實現與介紹

下一篇： PostgreSQL 空間切割(st_split)功能擴展 - 空間對象網格化

相關內容

熱門內容

最新內容