閱讀305 返回首頁    go 阿裏雲 go 技術社區[雲棲]


PostgreSQL 數據庫巡檢模板和腳本。

https://github.com/digoal/pgsql_admin_script
#!/bin/bash

# 已在CentOS 6.x上進行測試
# author: digoal
# 2015-10
# 用法  . ./generate_report.sh >/tmp/report.log 2>&1
# 生成目錄   grep -E "^----->>>|^\|" /tmp/report.log | sed 's/^----->>>---->>>/    /' | sed '1 i\ \ 目錄\n\n' | sed '$ a\ \n\n\ \ 正文\n\n'

# 請將以下變量修改為與當前環境一致, 並且確保使用這個配置連接任何數據庫都不需要輸入密碼
export PGDATA=/data01/pg_root_1921
export PGHOST=127.0.0.1
export PGPORT=1921
export PGDATABASE=postgres
export PGUSER=postgres
export PGHOME=/opt/pgsql


export DATE=`date +"%Y%m%d%H%M"`
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib:$LD_LIBRARY_PATH
export PATH=$PGHOME/bin:$PATH:.


# 記住當前目錄
PWD=`pwd`

# 獲取postgresql日誌目錄
pg_log_dir=`grep '^\ *[a-z]' $PGDATA/postgresql.conf|awk -F "#" '{print $1}'|grep log_directory|awk -F "=" '{print $2}'`

# 檢查是否standby
is_standby=`psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE -q -A -t -c 'select pg_is_in_recovery()'`


echo "    ----- PostgreSQL 巡檢報告 -----  "
echo "    ===== $DATE        =====  "


if [ $is_standby == 't' ]; then
echo "    ===== 這是standby節點     =====  "
else
echo "    ===== 這是primary節點     =====  "
fi
echo ""


echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo "|                      操作係統信息                       |"
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo ""

echo "----->>>---->>>  主機名: "
hostname -s
echo ""
echo "----->>>---->>>  主機網絡信息: "
ifconfig
echo ""
echo "----->>>---->>>  操作係統內核: "
uname -a
echo ""
echo "----->>>---->>>  內存(MB): "
free -m
echo ""
echo "----->>>---->>>  CPU: "
lscpu
echo ""
echo "----->>>---->>>  塊設備: "
lsblk
echo ""
echo "----->>>---->>>  拓撲: "
lstopo-no-graphics
echo ""
echo "----->>>---->>>  進程樹: "
pstree -a -A -c -l -n -p -u -U -Z
echo ""
echo "----->>>---->>>  操作係統配置: "
echo "----->>>---->>>  /etc/sysctl.conf "
grep "^[a-z]" /etc/sysctl.conf
echo ""
echo "----->>>---->>>  /etc/security/limits.conf "
grep -v "^#" /etc/security/limits.conf|grep -v "^$"
echo ""
echo "----->>>---->>>  /etc/security/limits.d/*.conf "
grep -v "^#" /etc/security/limits.d/*.conf|grep -v "^$"
echo ""
echo "----->>>---->>>  /etc/sysconfig/iptables "
cat /etc/sysconfig/iptables
echo ""
echo "----->>>---->>>  sysctl -a 信息: "
sysctl -a
echo ""
echo "----->>>---->>>  硬盤SMART信息(需要root): "
for i in `smartctl --scan|awk '{print $1}'`; do echo -e "\n\nDEVICE $i"; smartctl -x $i; done
echo ""
echo "----->>>---->>>  /var/log/boot.log "
cat /var/log/boot.log
echo ""
echo "----->>>---->>>  /var/log/cron(需要root) "
cat /var/log/cron
echo ""
echo "----->>>---->>>  /var/log/dmesg "
cat /var/log/dmesg
echo ""
echo "----->>>---->>>  /var/log/messages(需要root) "
tail -n 500 /var/log/messages
echo ""
echo "----->>>---->>>  /var/log/secure(需要root) "
cat /var/log/secure
echo ""
echo "----->>>---->>>  /var/log/wtmp "
who -a /var/log/wtmp
echo -e "\n"


echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo "|                       數據庫信息                        |"
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo ""

echo "----->>>---->>>  數據庫版本: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select version()'

echo "----->>>---->>>  用戶已安裝的插件版本: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),* from pg_extension'
done

echo "----->>>---->>>  用戶使用了多少種數據類型: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),b.typname,count(*) from pg_attribute a,pg_type b where a.atttypid=b.oid and a.attrelid in (select oid from pg_class where relnamespace not in (select oid from pg_namespace where nspname ~ $$^pg_$$ or nspname=$$information_schema$$)) group by 1,2 order by 3 desc'
done

echo "----->>>---->>>  用戶創建了多少對象: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),rolname,nspname,relkind,count(*) from pg_class a,pg_authid b,pg_namespace c where a.relnamespace=c.oid and a.relowner=b.oid and nspname !~ $$^pg_$$ and nspname<>$$information_schema$$ group by 1,2,3,4 order by 5 desc'
done

echo "----->>>---->>>  用戶對象占用空間的柱狀圖: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),buk this_buk_no,cnt rels_in_this_buk,pg_size_pretty(min) buk_min,pg_size_pretty(max) buk_max from( select row_number() over (partition by buk order by tsize),tsize,buk,min(tsize) over (partition by buk),max(tsize) over (partition by buk),count(*) over (partition by buk) cnt from ( select pg_relation_size(a.oid) tsize, width_bucket(pg_relation_size(a.oid),tmin,tmax,10) buk from (select min(pg_relation_size(a.oid)) tmin,max(pg_relation_size(a.oid)) tmax from pg_class a,pg_namespace c where a.relnamespace=c.oid and nspname !~ $$^pg_$$ and nspname<>$$information_schema$$) t, pg_class a,pg_namespace c where a.relnamespace=c.oid and nspname !~ $$^pg_$$ and nspname<>$$information_schema$$ ) t)t where row_number=1;'
done

echo "----->>>---->>>  當前用戶的操作係統定時任務: "
echo "I am `whoami`"
crontab -l
echo "建議: "
echo "    仔細檢查定時任務的必要性, 以及定時任務的成功與否的評判標準, 以及監控措施. "
echo "    請以啟動數據庫的OS用戶執行本腳本. "
echo -e "\n"


common() {
# 進入pg_log工作目錄
cd $PGDATA
eval cd $pg_log_dir

echo "----->>>---->>>  獲取pg_hba.conf md5值: "
md5sum $PGDATA/pg_hba.conf
echo "建議: "
echo "    主備md5值一致(判斷主備配置文件是否內容一致的一種手段, 或者使用diff)."
echo -e "\n"

echo "----->>>---->>>  獲取pg_hba.conf配置: "
grep '^\ *[a-z]' $PGDATA/pg_hba.conf
echo "建議: "
echo "    主備配置盡量保持一致, 注意trust和password認證方法的危害(password方法 驗證時網絡傳輸密碼明文, 建議改為md5), 建議除了unix socket可以使用trust以外, 其他都使用md5或者LDAP認證方法."
echo "    建議先設置白名單(超級用戶允許的來源IP, 可以訪問的數據庫), 再設置黑名單(不允許超級用戶登陸, reject), 再設置白名單(普通應用), 參考pg_hba.conf中的描述. "
echo -e "\n"

echo "----->>>---->>>  獲取postgresql.conf md5值: "
md5sum $PGDATA/postgresql.conf
echo "建議: "
echo "    主備md5值一致(判斷主備配置文件是否內容一致的一種手段, 或者使用diff)."
echo -e "\n"

echo "----->>>---->>>  獲取postgresql.conf配置: "
grep '^\ *[a-z]' $PGDATA/postgresql.conf|awk -F "#" '{print $1}'
echo "建議: "
echo "    主備配置盡量保持一致, 配置合理的參數值."
echo -e "    建議修改的參數列表如下  ( 假設操作係統內存為128GB, 數據庫獨占操作係統, 數據庫版本9.4.x )  : 
echo ""
listen_addresses = '0.0.0.0'            # 監聽所有IPV4地址
port = 1921                             # 監聽非默認端口
max_connections = 4000                  # 最大允許連接數
superuser_reserved_connections = 20     # 為超級用戶保留的連接
unix_socket_directories = '.'           # unix socket文件目錄最好放在$PGDATA中, 確保安全
unix_socket_permissions = 0700          # 確保權限安全
tcp_keepalives_idle = 30                # 間歇性發送TCP心跳包, 防止連接被網絡設備中斷.
tcp_keepalives_interval = 10
tcp_keepalives_count = 10
shared_buffers = 16GB                   # 數據庫自己管理的共享內存大小
huge_pages = try                        # 盡量使用大頁, 需要操作係統支持, 配置vm.nr_hugepages*2MB大於shared_buffers.
maintenance_work_mem = 512MB            # 可以加速創建索引, 回收垃圾(假設沒有設置autovacuum_work_mem)
autovacuum_work_mem = 512MB             # 可以加速回收垃圾
shared_preload_libraries = 'auth_delay,passwordcheck,pg_stat_statements,auto_explain'           # 建議防止暴力破解, 密碼複雜度檢測, 開啟pg_stat_statements, 開啟auto_explain, 參考 https://blog.163.com/digoal@126/blog/static/16387704020149852941586  
bgwriter_delay = 10ms                   # bgwriter process間隔多久調用write接口(注意不是fsync)將shared buffer中的dirty page寫到文件係統.
bgwriter_lru_maxpages = 1000            # 一個周期最多寫多少髒頁
max_worker_processes = 20               # 如果要使用worker process, 最多可以允許fork 多少個worker進程.
wal_level = logical                     # 如果將來打算使用logical複製, 最後先配置好, 不需要停機再改.
synchronous_commit = off                # 如果磁盤的IOPS能力一般, 建議使用異步提交來提高性能, 但是數據庫crash或操作係統crash時, 最多可能丟失2*wal_writer_delay時間段產生的事務日誌(在wal buffer中). 
wal_sync_method = open_datasync         # 使用pg_test_fsync測試wal所在磁盤的fsync接口, 使用性能好的.
wal_buffers = 16MB
wal_writer_delay = 10ms
checkpoint_segments = 1024              # 等於shared_buffers除以單個wal segment的大小.
checkpoint_timeout = 30min
checkpoint_completion_target = 0.2
archive_mode = on                       # 最好先開啟, 否則需要重啟數據庫來修改
archive_command = '/bin/date'           # 最好先開啟, 否則需要重啟數據庫來修改, 將來修改為正確的命令例如, test ! -f /home/postgres/archivedir/pg_root/%f && cp %p /home/postgres/archivedir/pg_root/%f
max_wal_senders = 32                    # 最多允許多少個wal sender進程.
wal_keep_segments = 2048                # 在pg_xlog目錄中保留的WAL文件數, 根據流複製業務的延遲情況和pg_xlog目錄大小來預估.
max_replication_slots = 32              # 最多允許多少個複製插槽
hot_standby = on
max_standby_archive_delay = 300s        # 如果備庫要被用於隻讀, 有大的查詢的情況下, 如果遇到conflicts, 可以考慮調整這個值來避免conflict造成cancel query.
max_standby_streaming_delay = 300s      # 如果備庫要被用於隻讀, 有大的查詢的情況下, 如果遇到conflicts, 可以考慮調整這個值來避免conflict造成cancel query.
wal_receiver_status_interval = 1s
hot_standby_feedback = on
random_page_cost = 2                    # 根據IO能力調整
effective_cache_size = 100GB            # 調整為與內存一樣大, 或者略小(減去shared_buffer). 用來評估OS PAGE CACHE可以用到的內存大小.
log_destination = 'csvlog'
logging_collector = on
log_truncate_on_rotation = on
log_rotation_size = 10MB
log_min_duration_statement = 1s
log_checkpoints = on
log_connections = on
log_disconnections = on
log_error_verbosity = verbose           # 在日誌中輸出代碼位置
log_lock_waits = on
log_statement = 'ddl'
autovacuum = on
log_autovacuum_min_duration = 0
autovacuum_max_workers = 10
autovacuum_naptime = 30s                # 快速喚醒, 防止膨脹
autovacuum_vacuum_scale_factor = 0.02   # 當垃圾超過比例時, 啟動垃圾回收工作進程
autovacuum_analyze_scale_factor = 0.1
auth_delay.milliseconds = 5000          # 認證失敗, 延遲多少毫秒反饋
auto_explain.log_min_duration = 5000    # 記錄超過多少毫秒的SQL當時的執行計劃
auto_explain.log_analyze = true
auto_explain.log_verbose = true
auto_explain.log_buffers = true
auto_explain.log_nested_statements = true
pg_stat_statements.track_utility=off

    建議的操作係統配置(根據實際情況修改) : 
vi /etc/sysctl.conf
# add by digoal.zhou
net.ipv4.ip_forward = 0
net.ipv4.conf.default.rp_filter = 1
net.ipv4.conf.default.accept_source_route = 0
kernel.sysrq = 0
kernel.core_uses_pid = 1
net.ipv4.tcp_syncookies = 1
net.bridge.bridge-nf-call-ip6tables = 0
net.bridge.bridge-nf-call-iptables = 0
net.bridge.bridge-nf-call-arptables = 0
kernel.msgmnb = 65536
kernel.msgmax = 65536
kernel.shmmax = 68719476736
kernel.shmall = 4294967296
kernel.shmmni = 4096
kernel.sem = 50100 64128000 50100 1280
fs.file-max = 76724600
net.ipv4.ip_local_port_range = 9000 65000
net.core.rmem_default = 1048576
net.core.rmem_max = 4194304
net.core.wmem_default = 262144
net.core.wmem_max = 1048576
net.ipv4.tcp_tw_recycle = 1
net.ipv4.tcp_max_syn_backlog = 4096
net.core.netdev_max_backlog = 10000
net.ipv4.netfilter.ip_conntrack_max = 655360
fs.aio-max-nr = 1048576
net.ipv4.tcp_timestamps = 0
vm.overcommit_memory = 0
net.ipv4.tcp_tw_recycle=1
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_keepalive_time = 72 
net.ipv4.tcp_keepalive_probes = 9 
net.ipv4.tcp_keepalive_intvl = 7
vm.zone_reclaim_mode=1
vm.dirty_background_ratio = 10
vm.dirty_background_bytes = 1024000000
vm.dirty_ratio = 60
vm.dirty_bytes = 0
vm.dirty_writeback_centisecs = 500
vm.dirty_expire_centisecs = 3000
vm.swappiness=0
net.ipv4.tcp_syncookies = 0
net.ipv4.tcp_tw_reuse = 0
net.ipv4.tcp_tw_recycle = 1
net.ipv4.tcp_fin_timeout = 30
vm.nr_hugepages=102400

vi /etc/security/limits.conf
* soft    nofile  1024000
* hard    nofile  1024000
* soft    nproc   unlimited
* hard    nproc   unlimited
* soft    core    unlimited
* hard    core    unlimited
* soft    memlock unlimited
* hard    memlock unlimited

rm -f /etc/security/limits.d/90-nproc.conf
\n "

echo "----->>>---->>>  用戶或數據庫級別定製參數: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select * from pg_db_role_setting'
echo "建議: "
echo "    定製參數需要關注, 優先級高於數據庫的啟動參數和配置文件中的參數, 特別是排錯時需要關注. "
echo -e "\n"


echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo "|                   數據庫錯誤日誌分析                    |"
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo ""

echo "----->>>---->>>  獲取錯誤日誌信息: "
cat *.csv | grep -E "^[0-9]" | grep -E "WARNING|ERROR|FATAL|PANIC" | awk -F "," '{print $12" , "$13" , "$14}'|sort|uniq -c|sort -rn
echo "建議: "
echo "    參考 https://www.postgresql.org/docs/current/static/errcodes-appendix.html ."
echo -e "\n"

echo "----->>>---->>>  獲取連接請求情況: "
find . -name "*.csv" -type f -mtime -28 -exec grep "connection authorized" {} +|awk -F "," '{print $2,$3,$5}'|sed 's/\:[0-9]*//g'|sort|uniq -c|sort -n -r
echo "建議: "
echo "    連接請求非常多時, 請考慮應用層使用連接池, 或者使用pgbouncer連接池. "
echo -e "\n"

echo "----->>>---->>>  獲取認證失敗情況: "
find . -name "*.csv" -type f -mtime -28 -exec grep "password authentication failed" {} +|awk -F "," '{print $2,$3,$5}'|sed 's/\:[0-9]*//g'|sort|uniq -c|sort -n -r
echo "建議: "
echo "    認證失敗次數很多時, 可能是有用戶在暴力破解, 建議使用auth_delay插件防止暴力破解. "
echo -e "\n"


echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo "|                   數據庫慢SQL日誌分析                   |"
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo ""

echo "----->>>---->>>  慢查詢統計: "
cat *.csv|awk -F "," '{print $1" "$2" "$3" "$8" "$14}' |grep "duration:"|grep -v "plan:"|awk '{print $1" "$4" "$5" "$6}'|sort|uniq -c|sort -rn
echo "建議: "
echo "    輸出格式(條數,日期,用戶,數據庫,QUERY,耗時ms). "
echo "    慢查詢反映執行時間超過log_min_duration_statement的SQL, 可以根據實際情況分析數據庫或SQL語句是否有優化空間. "
echo ""
echo "----->>>---->>>  慢查詢分布頭10條的執行時間, ms: "
cat *.csv|awk -F "," '{print $1" "$2" "$3" "$8" "$14}' |grep "duration:"|grep -v "plan:"|awk '{print $1" "$4" "$5" "$6" "$7" "$8}'|sort -k 6 -n|head -n 10
echo ""
echo "----->>>---->>>  慢查詢分布尾10條的執行時間, ms: "
cat *.csv|awk -F "," '{print $1" "$2" "$3" "$8" "$14}' |grep "duration:"|grep -v "plan:"|awk '{print $1" "$4" "$5" "$6" "$7" "$8}'|sort -k 6 -n|tail -n 10
echo -e "\n"

echo "----->>>---->>>  auto_explain 分析統計: "
cat *.csv|awk -F "," '{print $1" "$2" "$3" "$8" "$14}' |grep "plan:"|grep "duration:"|awk '{print $1" "$4" "$5" "$6}'|sort|uniq -c|sort -rn
echo "建議: "
echo "    輸出格式(條數,日期,用戶,數據庫,QUERY). "
echo "    慢查詢反映執行時間超過auto_explain.log_min_duration的SQL, 可以根據實際情況分析數據庫或SQL語句是否有優化空間, 分析csvlog中auto_explain的輸出可以了解語句超時時的執行計劃詳情. "
echo -e "\n"


echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo "|                   數據庫空間使用分析                    |"
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo ""

echo "----->>>---->>>  輸出文件係統剩餘空間: "
df -m
echo "建議: "
echo "    注意預留足夠的空間給數據庫. "
echo -e "\n"

echo "----->>>---->>>  輸出表空間對應目錄: "
echo $PGDATA
ls -la $PGDATA/pg_tblspc/
echo "建議: "
echo "    注意表空間如果不是軟鏈接, 注意是否刻意所為, 正常情況下應該是軟鏈接. "
echo -e "\n"

echo "----->>>---->>>  輸出表空間使用情況: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select spcname,pg_tablespace_location(oid),pg_size_pretty(pg_tablespace_size(oid)) from pg_tablespace order by pg_tablespace_size(oid) desc'
echo "建議: "
echo "    注意檢查表空間所在文件係統的剩餘空間, (默認表空間在$PGDATA/base目錄下), IOPS分配是否均勻, OS的sysstat包可以觀察IO使用率. "
echo -e "\n"

echo "----->>>---->>>  輸出數據庫使用情況: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select datname,pg_size_pretty(pg_database_size(oid)) from pg_database order by pg_database_size(oid) desc'
echo "建議: "
echo "    注意檢查數據庫的大小, 是否需要清理曆史數據. "
echo -e "\n"

echo "----->>>---->>>  TOP 10 size對象: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),b.nspname,c.relname,c.relkind,pg_size_pretty(pg_relation_size(c.oid)),a.seq_scan,a.seq_tup_read,a.idx_scan,a.idx_tup_fetch,a.n_tup_ins,a.n_tup_upd,a.n_tup_del,a.n_tup_hot_upd,a.n_live_tup,a.n_dead_tup from pg_stat_all_tables a, pg_class c,pg_namespace b where c.relnamespace=b.oid and c.relkind=$$r$$ and a.relid=c.oid order by pg_relation_size(c.oid) desc limit 10'
done
echo "建議: "
echo "    經驗值: 單表超過8GB, 並且這個表需要頻繁更新 或 刪除+插入的話, 建議對表根據業務邏輯進行合理拆分後獲得更好的性能, 以及便於對膨脹索引進行維護; 如果是隻讀的表, 建議適當結合SQL語句進行優化. "
echo -e "\n"


echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo "|                     數據庫連接分析                      |"
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo ""

echo "----->>>---->>>  當前活躍度: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select now(),state,count(*) from pg_stat_activity group by 1,2'
echo "建議: "
echo "    如果active狀態很多, 說明數據庫比較繁忙. 如果idle in transaction很多, 說明業務邏輯設計可能有問題. 如果idle很多, 可能使用了連接池, 並且可能沒有自動回收連接到連接池的最小連接數. "
echo -e "\n"

echo "----->>>---->>>  總剩餘連接數: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select max_conn,used,res_for_super,max_conn-used-res_for_super res_for_normal from (select count(*) used from pg_stat_activity) t1,(select setting::int res_for_super from pg_settings where name=$$superuser_reserved_connections$$) t2,(select setting::int max_conn from pg_settings where name=$$max_connections$$) t3'
echo "建議: "
echo "    給超級用戶和普通用戶設置足夠的連接, 以免不能登錄數據庫. "
echo -e "\n"

echo "----->>>---->>>  用戶連接數限製: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select a.rolname,a.rolconnlimit,b.connects from pg_authid a,(select usename,count(*) connects from pg_stat_activity group by usename) b where a.rolname=b.usename order by b.connects desc'
echo "建議: "
echo "    給用戶設置足夠的連接數, alter role ... CONNECTION LIMIT . "
echo -e "\n"

echo "----->>>---->>>  數據庫連接限製: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select a.datname, a.datconnlimit, b.connects from pg_database a,(select datname,count(*) connects from pg_stat_activity group by datname) b where a.datname=b.datname order by b.connects desc'
echo "建議: "
echo "    給數據庫設置足夠的連接數, alter database ... CONNECTION LIMIT . "
echo -e "\n"


echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo "|                     數據庫性能分析                      |"
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo ""

echo "----->>>---->>>  TOP 5 SQL : total_cpu_time "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -x -c 'select c.rolname,b.datname,a.total_time/a.calls per_call_time,a.* from pg_stat_statements a,pg_database b,pg_authid c where a.userid=c.oid and a.dbid=b.oid order by a.total_time desc limit 5'
echo "建議: "
echo "    檢查SQL是否有優化空間, 配合auto_explain插件在csvlog中觀察LONG SQL的執行計劃是否正確. "
echo -e "\n"

echo "----->>>---->>>  索引數超過4並且SIZE大於10MB的表: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(), t2.nspname, t1.relname, pg_size_pretty(pg_relation_size(t1.oid)), t3.idx_cnt from pg_class t1, pg_namespace t2, (select indrelid,count(*) idx_cnt from pg_index group by 1 having count(*)>4) t3 where t1.oid=t3.indrelid and t1.relnamespace=t2.oid and pg_relation_size(t1.oid)/1024/1024.0>10 order by t3.idx_cnt desc'
done
echo "建議: "
echo "    索引數量太多, 影響表的增刪改性能, 建議檢查是否有不需要的索引. "
echo -e "\n"

echo "----->>>---->>>  上次巡檢以來未使用或使用較少的索引: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),t2.schemaname,t2.relname,t2.indexrelname,t2.idx_scan,t2.idx_tup_read,t2.idx_tup_fetch,pg_size_pretty(pg_relation_size(indexrelid)) from pg_stat_all_tables t1,pg_stat_all_indexes t2 where t1.relid=t2.relid and t2.idx_scan<10 and t2.schemaname not in ($$pg_toast$$,$$pg_catalog$$) and indexrelid not in (select conindid from pg_constraint where contype in ($$p$$,$$u$$,$$f$$)) and pg_relation_size(indexrelid)>65536 order by pg_relation_size(indexrelid) desc'
done
echo "建議: "
echo "    建議和應用開發人員確認後, 刪除不需要的索引. "
echo -e "\n"

echo "----->>>---->>>  數據庫統計信息, 回滾比例, 命中比例, 數據塊讀寫時間, 死鎖, 複製衝突: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select datname,round(100*(xact_rollback::numeric/(case when xact_commit > 0 then xact_commit else 1 end + xact_rollback)),2)||$$ %$$ rollback_ratio, round(100*(blks_hit::numeric/(case when blks_read>0 then blks_read else 1 end + blks_hit)),2)||$$ %$$ hit_ratio, blk_read_time, blk_write_time, conflicts, deadlocks from pg_stat_database'
echo "建議: "
echo "    回滾比例大說明業務邏輯可能有問題, 命中率小說明shared_buffer要加大, 數據塊讀寫時間長說明塊設備的IO性能要提升, 死鎖次數多說明業務邏輯有問題, 複製衝突次數多說明備庫可能在跑LONG SQL. "
echo -e "\n"

echo "----->>>---->>>  檢查點, bgwriter 統計信息: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -x -c 'select * from pg_stat_bgwriter'
echo "建議: "
echo "    checkpoint_write_time多說明檢查點持續時間長, 檢查點過程中產生了較多的髒頁. "
echo "    checkpoint_sync_time代表檢查點開始時的shared buffer中的髒頁被同步到磁盤的時間, 如果時間過長, 並且數據庫在檢查點時性能較差, 考慮一下提升塊設備的IOPS能力. "
echo "    buffers_backend_fsync太多說明需要加大shared buffer 或者 減小bgwriter_delay參數. "
echo -e "\n"


echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo "|                     數據庫垃圾分析                      |"
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo ""

echo "----->>>---->>>  表引膨脹檢查: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -x -c 'SELECT
  current_database() AS db, schemaname, tablename, reltuples::bigint AS tups, relpages::bigint AS pages, otta,
  ROUND(CASE WHEN otta=0 OR sml.relpages=0 OR sml.relpages=otta THEN 0.0 ELSE sml.relpages/otta::numeric END,1) AS tbloat,
  CASE WHEN relpages < otta THEN 0 ELSE relpages::bigint - otta END AS wastedpages,
  CASE WHEN relpages < otta THEN 0 ELSE bs*(sml.relpages-otta)::bigint END AS wastedbytes,
  CASE WHEN relpages < otta THEN $$0 bytes$$::text ELSE (bs*(relpages-otta))::bigint || $$ bytes$$ END AS wastedsize,
  iname, ituples::bigint AS itups, ipages::bigint AS ipages, iotta,
  ROUND(CASE WHEN iotta=0 OR ipages=0 OR ipages=iotta THEN 0.0 ELSE ipages/iotta::numeric END,1) AS ibloat,
  CASE WHEN ipages < iotta THEN 0 ELSE ipages::bigint - iotta END AS wastedipages,
  CASE WHEN ipages < iotta THEN 0 ELSE bs*(ipages-iotta) END AS wastedibytes,
  CASE WHEN ipages < iotta THEN $$0 bytes$$ ELSE (bs*(ipages-iotta))::bigint || $$ bytes$$ END AS wastedisize,
  CASE WHEN relpages < otta THEN
    CASE WHEN ipages < iotta THEN 0 ELSE bs*(ipages-iotta::bigint) END
    ELSE CASE WHEN ipages < iotta THEN bs*(relpages-otta::bigint)
      ELSE bs*(relpages-otta::bigint + ipages-iotta::bigint) END
  END AS totalwastedbytes
FROM (
  SELECT
    nn.nspname AS schemaname,
    cc.relname AS tablename,
    COALESCE(cc.reltuples,0) AS reltuples,
    COALESCE(cc.relpages,0) AS relpages,
    COALESCE(bs,0) AS bs,
    COALESCE(CEIL((cc.reltuples*((datahdr+ma-
      (CASE WHEN datahdr%ma=0 THEN ma ELSE datahdr%ma END))+nullhdr2+4))/(bs-20::float)),0) AS otta,
    COALESCE(c2.relname,$$?$$) AS iname, COALESCE(c2.reltuples,0) AS ituples, COALESCE(c2.relpages,0) AS ipages,
    COALESCE(CEIL((c2.reltuples*(datahdr-12))/(bs-20::float)),0) AS iotta -- very rough approximation, assumes all cols
  FROM
     pg_class cc
  JOIN pg_namespace nn ON cc.relnamespace = nn.oid AND nn.nspname <> $$information_schema$$
  LEFT JOIN
  (
    SELECT
      ma,bs,foo.nspname,foo.relname,
      (datawidth+(hdr+ma-(case when hdr%ma=0 THEN ma ELSE hdr%ma END)))::numeric AS datahdr,
      (maxfracsum*(nullhdr+ma-(case when nullhdr%ma=0 THEN ma ELSE nullhdr%ma END))) AS nullhdr2
    FROM (
      SELECT
        ns.nspname, tbl.relname, hdr, ma, bs,
        SUM((1-coalesce(null_frac,0))*coalesce(avg_width, 2048)) AS datawidth,
        MAX(coalesce(null_frac,0)) AS maxfracsum,
        hdr+(
          SELECT 1+count(*)/8
          FROM pg_stats s2
          WHERE null_frac<>0 AND s2.schemaname = ns.nspname AND s2.tablename = tbl.relname
        ) AS nullhdr
      FROM pg_attribute att 
      JOIN pg_class tbl ON att.attrelid = tbl.oid
      JOIN pg_namespace ns ON ns.oid = tbl.relnamespace 
      LEFT JOIN pg_stats s ON s.schemaname=ns.nspname
      AND s.tablename = tbl.relname
      AND s.inherited=false
      AND s.attname=att.attname,
      (
        SELECT
          (SELECT current_setting($$block_size$$)::numeric) AS bs,
            CASE WHEN SUBSTRING(SPLIT_PART(v, $$ $$, 2) FROM $$#"[0-9]+.[0-9]+#"%$$ for $$#$$)
              IN ($$8.0$$,$$8.1$$,$$8.2$$) THEN 27 ELSE 23 END AS hdr,
          CASE WHEN v ~ $$mingw32$$ OR v ~ $$64-bit$$ THEN 8 ELSE 4 END AS ma
        FROM (SELECT version() AS v) AS foo
      ) AS constants
      WHERE att.attnum > 0 AND tbl.relkind=$$r$$
      GROUP BY 1,2,3,4,5
    ) AS foo
  ) AS rs
  ON cc.relname = rs.relname AND nn.nspname = rs.nspname
  LEFT JOIN pg_index i ON indrelid = cc.oid
  LEFT JOIN pg_class c2 ON c2.oid = i.indexrelid
) AS sml order by wastedbytes desc limit 5'
done
echo "建議: "
echo "    根據浪費的字節數, 設置合適的autovacuum_vacuum_scale_factor, 大表如果頻繁的有更新或刪除和插入操作, 建議設置較小的autovacuum_vacuum_scale_factor來降低浪費空間. "
echo "    同時還需要打開autovacuum, 根據服務器的內存大小, CPU核數, 設置足夠大的autovacuum_work_mem 或 autovacuum_max_workers 或 maintenance_work_mem, 以及足夠小的 autovacuum_naptime . "
echo "    同時還需要分析是否對大數據庫使用了邏輯備份pg_dump, 係統中是否經常有長SQL, 長事務. 這些都有可能導致膨脹. "
echo "    使用pg_reorg或者vacuum full可以回收膨脹的空間. "
echo "    參考: https://blog.163.com/digoal@126/blog/static/1638770402015329115636287/ "
echo -e "\n"


echo "----->>>---->>>  索引膨脹檢查: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -x -c 'SELECT
  current_database() AS db, schemaname, tablename, reltuples::bigint AS tups, relpages::bigint AS pages, otta,
  ROUND(CASE WHEN otta=0 OR sml.relpages=0 OR sml.relpages=otta THEN 0.0 ELSE sml.relpages/otta::numeric END,1) AS tbloat,
  CASE WHEN relpages < otta THEN 0 ELSE relpages::bigint - otta END AS wastedpages,
  CASE WHEN relpages < otta THEN 0 ELSE bs*(sml.relpages-otta)::bigint END AS wastedbytes,
  CASE WHEN relpages < otta THEN $$0 bytes$$::text ELSE (bs*(relpages-otta))::bigint || $$ bytes$$ END AS wastedsize,
  iname, ituples::bigint AS itups, ipages::bigint AS ipages, iotta,
  ROUND(CASE WHEN iotta=0 OR ipages=0 OR ipages=iotta THEN 0.0 ELSE ipages/iotta::numeric END,1) AS ibloat,
  CASE WHEN ipages < iotta THEN 0 ELSE ipages::bigint - iotta END AS wastedipages,
  CASE WHEN ipages < iotta THEN 0 ELSE bs*(ipages-iotta) END AS wastedibytes,
  CASE WHEN ipages < iotta THEN $$0 bytes$$ ELSE (bs*(ipages-iotta))::bigint || $$ bytes$$ END AS wastedisize,
  CASE WHEN relpages < otta THEN
    CASE WHEN ipages < iotta THEN 0 ELSE bs*(ipages-iotta::bigint) END
    ELSE CASE WHEN ipages < iotta THEN bs*(relpages-otta::bigint)
      ELSE bs*(relpages-otta::bigint + ipages-iotta::bigint) END
  END AS totalwastedbytes
FROM (
  SELECT
    nn.nspname AS schemaname,
    cc.relname AS tablename,
    COALESCE(cc.reltuples,0) AS reltuples,
    COALESCE(cc.relpages,0) AS relpages,
    COALESCE(bs,0) AS bs,
    COALESCE(CEIL((cc.reltuples*((datahdr+ma-
      (CASE WHEN datahdr%ma=0 THEN ma ELSE datahdr%ma END))+nullhdr2+4))/(bs-20::float)),0) AS otta,
    COALESCE(c2.relname,$$?$$) AS iname, COALESCE(c2.reltuples,0) AS ituples, COALESCE(c2.relpages,0) AS ipages,
    COALESCE(CEIL((c2.reltuples*(datahdr-12))/(bs-20::float)),0) AS iotta -- very rough approximation, assumes all cols
  FROM
     pg_class cc
  JOIN pg_namespace nn ON cc.relnamespace = nn.oid AND nn.nspname <> $$information_schema$$
  LEFT JOIN
  (
    SELECT
      ma,bs,foo.nspname,foo.relname,
      (datawidth+(hdr+ma-(case when hdr%ma=0 THEN ma ELSE hdr%ma END)))::numeric AS datahdr,
      (maxfracsum*(nullhdr+ma-(case when nullhdr%ma=0 THEN ma ELSE nullhdr%ma END))) AS nullhdr2
    FROM (
      SELECT
        ns.nspname, tbl.relname, hdr, ma, bs,
        SUM((1-coalesce(null_frac,0))*coalesce(avg_width, 2048)) AS datawidth,
        MAX(coalesce(null_frac,0)) AS maxfracsum,
        hdr+(
          SELECT 1+count(*)/8
          FROM pg_stats s2
          WHERE null_frac<>0 AND s2.schemaname = ns.nspname AND s2.tablename = tbl.relname
        ) AS nullhdr
      FROM pg_attribute att 
      JOIN pg_class tbl ON att.attrelid = tbl.oid
      JOIN pg_namespace ns ON ns.oid = tbl.relnamespace 
      LEFT JOIN pg_stats s ON s.schemaname=ns.nspname
      AND s.tablename = tbl.relname
      AND s.inherited=false
      AND s.attname=att.attname,
      (
        SELECT
          (SELECT current_setting($$block_size$$)::numeric) AS bs,
            CASE WHEN SUBSTRING(SPLIT_PART(v, $$ $$, 2) FROM $$#"[0-9]+.[0-9]+#"%$$ for $$#$$)
              IN ($$8.0$$,$$8.1$$,$$8.2$$) THEN 27 ELSE 23 END AS hdr,
          CASE WHEN v ~ $$mingw32$$ OR v ~ $$64-bit$$ THEN 8 ELSE 4 END AS ma
        FROM (SELECT version() AS v) AS foo
      ) AS constants
      WHERE att.attnum > 0 AND tbl.relkind=$$r$$
      GROUP BY 1,2,3,4,5
    ) AS foo
  ) AS rs
  ON cc.relname = rs.relname AND nn.nspname = rs.nspname
  LEFT JOIN pg_index i ON indrelid = cc.oid
  LEFT JOIN pg_class c2 ON c2.oid = i.indexrelid
) AS sml order by wastedibytes desc limit 5'
done
echo "建議: "
echo "    如果索引膨脹太大, 會影響性能, 建議重建索引, create index CONCURRENTLY ... . "
echo -e "\n"

echo "----->>>---->>>  垃圾數據: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),schemaname,relname,n_dead_tup from pg_stat_all_tables where n_live_tup>0 and n_dead_tup/n_live_tup>0.2 and schemaname not in ($$pg_toast$$,$$pg_catalog$$) order by n_dead_tup desc limit 5'
done
echo "建議: "
echo "    通常垃圾過多, 可能是因為無法回收垃圾, 或者回收垃圾的進程繁忙或沒有及時喚醒, 或者沒有開啟autovacuum, 或在短時間內產生了大量的垃圾 . "
echo "    可以等待autovacuum進行處理, 或者手工執行vacuum table . "
echo -e "\n"

echo "----->>>---->>>  未引用的大對象: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
vacuumlo -n -h $PGHOST -p $PGPORT -U $PGUSER $db -w
echo ""
done
echo "建議: "
echo "    如果大對象沒有被引用時, 建議刪除, 否則就類似於內存泄露, 使用vacuumlo可以刪除未被引用的大對象, 例如: vacuumlo -l 1000 -h $PGHOST -p $PGPORT -U $PGUSER $db -w . "
echo "    應用開發時, 注意及時刪除不需要使用的大對象, 使用lo_unlink 或 驅動對應的API . "
echo "    參考 https://www.postgresql.org/docs/9.4/static/largeobjects.html "
echo -e "\n"


echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo "|                     數據庫年齡分析                      |"
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo ""

echo "----->>>---->>>  數據庫年齡: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select datname,age(datfrozenxid),2^31-age(datfrozenxid) age_remain from pg_database order by age(datfrozenxid) desc'
echo "建議: "
echo "    數據庫的年齡正常情況下應該小於vacuum_freeze_table_age, 如果剩餘年齡小於5億, 建議人為幹預, 將LONG SQL或事務殺掉後, 執行vacuum freeze . "
echo -e "\n"

echo "----->>>---->>>  表年齡: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER --pset=pager=off -d $PGDATABASE -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),rolname,nspname,relkind,relname,age(relfrozenxid),2^31-age(relfrozenxid) age_remain from pg_authid t1 join pg_class t2 on t1.oid=t2.relowner join pg_namespace t3 on t2.relnamespace=t3.oid where t2.relkind in ($$t$$,$$r$$) order by age(relfrozenxid) desc limit 5'
done
echo "建議: "
echo "    表的年齡正常情況下應該小於vacuum_freeze_table_age, 如果剩餘年齡小於5億, 建議人為幹預, 將LONG SQL或事務殺掉後, 執行vacuum freeze . "
echo -e "\n"

echo "----->>>---->>>  長事務, 2PC: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -x -c 'select datname,usename,query,xact_start,now()-xact_start xact_duration,query_start,now()-query_start query_duration,state from pg_stat_activity where state<>$$idle$$ and (backend_xid is not null or backend_xmin is not null) and now()-xact_start > interval $$30 min$$ order by xact_start'
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -x -c 'select name,statement,prepare_time,now()-prepare_time,parameter_types,from_sql from pg_prepared_statements where now()-prepare_time > interval $$30 min$$ order by prepare_time'
echo "建議: "
echo "    長事務過程中產生的垃圾, 無法回收, 建議不要在數據庫中運行LONG SQL, 或者錯開DML高峰時間去運行LONG SQL. 2PC事務一定要記得盡快結束掉, 否則可能會導致數據庫膨脹. "
echo "    參考: https://blog.163.com/digoal@126/blog/static/1638770402015329115636287/ "
echo -e "\n"


echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo "|               數據庫XLOG, 流複製狀態分析                |"
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo ""

echo "----->>>---->>>  是否開啟歸檔, 自動垃圾回收: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select name,setting from pg_settings where name in ($$archive_mode$$,$$autovacuum$$,$$archive_command$$)'
echo "建議: "
echo "    建議開啟自動垃圾回收, 開啟歸檔. "
echo -e "\n"

echo "----->>>---->>>  歸檔統計信息: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select pg_xlogfile_name(pg_current_xlog_location()) now_xlog, * from pg_stat_archiver'
echo "建議: "
echo "    如果當前的XLOG文件和最後一個歸檔失敗的XLOG文件之間相差很多個文件, 建議盡快排查歸檔失敗的原因, 以便修複, 否則pg_xlog目錄可能會撐爆. "
echo -e "\n"

echo "----->>>---->>>  流複製統計信息: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -x -c 'select pg_xlog_location_diff(pg_current_xlog_location(),flush_location), * from pg_stat_replication'
echo "建議: "
echo "    關注流複製的延遲, 如果延遲非常大, 建議排查網絡帶寬, 以及本地讀xlog的性能, 遠程寫xlog的性能. "
echo -e "\n"

echo "----->>>---->>>  流複製插槽: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select pg_xlog_location_diff(pg_current_xlog_location(),restart_lsn), * from pg_replication_slots'
echo "建議: "
echo "    如果restart_lsn和當前XLOG相差非常大的字節數, 需要排查slot的訂閱者是否能正常接收XLOG, 或者訂閱者是否正常. 長時間不將slot的數據取走, pg_xlog目錄可能會撐爆. "
echo -e "\n"


echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo "|                數據庫安全或潛在風險分析                 |"
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo ""

echo "----->>>---->>>  密碼泄露檢查: "
echo "    檢查 ~/.psql_history :  "
grep -i "password" ~/.psql_history|grep -i -E "role|group|user"
echo ""
echo "    檢查 *.csv :  "
cat *.csv | grep -E "^[0-9]" | grep -i -r -E "role|group|user" |grep -i "password"|grep -i -E "create|alter"
echo ""
echo "    檢查 $PGDATA/recovery.* :  "
grep -i "password" ../recovery.*
echo ""
echo "    檢查 pg_stat_statements :  "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -c 'select query from pg_stat_statements where (query ~* $$group$$ or query ~* $$user$$ or query ~* $$role$$) and query ~* $$password$$'
echo "    檢查 pg_authid :  "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select * from pg_authid where rolpassword !~ $$^md5$$ or length(rolpassword)<>35'
echo "    檢查 pg_user_mappings, pg_views :  "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -c 'select current_database(),* from pg_user_mappings where umoptions::text ~* $$password$$'
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -c 'select current_database(),* from pg_views where definition ~* $$password$$ and definition ~* $$dblink$$'
done
echo "建議: "
echo "    如果以上輸出顯示密碼已泄露, 盡快修改, 並通過參數避免密碼又被記錄到以上文件中(psql -n) (set log_statement='none'; set log_min_duration_statement=-1; set log_duration=off; set pg_stat_statements.track_utility=off;) . "
echo "    明文密碼不安全, 建議使用create|alter role ... encrypted password. "
echo "    在fdw, dblink based view中不建議使用密碼明文. "
echo "    在recovery.*的配置中不要使用密碼, 不安全, 可以使用.pgpass配置密碼 . "
echo -e "\n"

echo "----->>>---->>>  簡單密碼檢查: "
echo "    1. 檢查已有密碼是否簡單, 從crackdb庫提取密碼字典, 挨個檢查 :  "
echo "    檢查 md5('$pwd'||'$username')是否與pg_authid.rolpassword匹配 :  "
echo "    匹配則說明用戶使用了簡單密碼 :  "
echo ""
echo "    2. 事前檢查參考 https://blog.163.com/digoal@126/blog/static/16387704020149852941586"
echo -e "\n"

echo "----->>>---->>>  用戶密碼到期時間: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select rolname,rolvaliduntil from pg_authid order by rolvaliduntil'
echo "建議: "
echo "    到期後, 用戶將無法登陸, 記得修改密碼, 同時將密碼到期時間延長到某個時間或無限時間, alter role ... VALID UNTIL 'timestamp' . "
echo -e "\n"

echo "----->>>---->>>  SQL注入風險分析: "
cat *.csv | grep -E "^[0-9]" | grep exec_simple_query |awk -F "," '{print $2" "$3" "$5" "$NF}'|sed 's/\:[0-9]*//g'|sort|uniq -c|sort -n -r
echo "建議: "
echo "    調用exec_simple_query有風險, 允許多個SQL封裝在一個接口中調用, 建議程序使用綁定變量規避SQL注入風險, 或者程序端使用SQL注入過濾插件. "
echo -e "\n"

echo "----->>>---->>>  普通用戶對象上的規則安全檢查: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -c 'select current_database(),a.schemaname,a.tablename,a.rulename,a.definition from pg_rules a,pg_namespace b,pg_class c,pg_authid d where a.schemaname=b.nspname and a.tablename=c.relname and d.oid=c.relowner and not d.rolsuper union all select current_database(),a.schemaname,a.viewname,a.viewowner,a.definition from pg_views a,pg_namespace b,pg_class c,pg_authid d where a.schemaname=b.nspname and a.viewname=c.relname and d.oid=c.relowner and not d.rolsuper'
done
echo "建議: "
echo "    防止普通用戶在規則中設陷阱, 注意有危險的security invoker的函數調用, 超級用戶可能因為規則觸發後誤調用這些危險函數(以invoker角色). "
echo "    參考 https://blog.163.com/digoal@126/blog/static/16387704020155131217736/ "
echo -e "\n"

echo "----->>>---->>>  普通用戶自定義函數安全檢查: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -c 'select current_database(),b.rolname,c.nspname,a.proname from pg_proc a,pg_authid b,pg_namespace c where a.proowner=b.oid and a.pronamespace=c.oid and not b.rolsuper and not a.prosecdef'
done
echo "建議: "
echo "    防止普通用戶在函數中設陷阱, 注意有危險的security invoker的函數調用, 超級用戶可能因為觸發器觸發後誤調用這些危險函數(以invoker角色). "
echo "    參考 https://blog.163.com/digoal@126/blog/static/16387704020155131217736/ "
echo -e "\n"

echo "----->>>---->>>  unlogged table 和 哈希索引: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),t3.rolname,t2.nspname,t1.relname from pg_class t1,pg_namespace t2,pg_authid t3 where t1.relnamespace=t2.oid and t1.relowner=t3.oid and t1.relpersistence=$$u$$'
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),pg_get_indexdef(oid) from pg_class where relkind=$$i$$ and pg_get_indexdef(oid) ~ $$USING hash$$'
done
echo "建議: "
echo "    unlogged table和hash index不記錄XLOG, 無法使用流複製或者log shipping的方式複製到standby節點, 如果在standby節點執行某些SQL, 可能導致報錯或查不到數據. "
echo "    在數據庫CRASH後無法修複unlogged table和hash index, 不建議使用. "
echo "    PITR對unlogged table和hash index也不起作用. "
echo -e "\n"

echo "----->>>---->>>  剩餘可使用次數不足1000萬次的序列檢查: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off <<EOF
create or replace function f(OUT v_datname name, OUT v_role name, OUT v_nspname name, OUT v_relname name, OUT v_times_remain int8) returns setof record as \$\$
declare
begin
  v_datname := current_database();
  for v_role,v_nspname,v_relname in select rolname,nspname,relname from pg_authid t1 , pg_class t2 , pg_namespace t3 where t1.oid=t2.relowner and t2.relnamespace=t3.oid and t2.relkind='S' 
  LOOP
    execute 'select (max_value-last_value)/increment_by from "'||v_nspname||'"."'||v_relname||'" where not is_cycled' into v_times_remain;
    return next;
  end loop;
end;
\$\$ language plpgsql;

select * from f() where v_times_remain is not null and v_times_remain < 10240000 order by v_times_remain limit 10;
EOF
done
echo "建議: "
echo "    序列剩餘使用次數到了之後, 將無法使用, 報錯, 請開發人員關注. "
echo -e "\n"

echo "----->>>---->>>  觸發器, 事件觸發器: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),relname,tgname,proname,tgenabled from pg_trigger t1,pg_class t2,pg_proc t3 where t1.tgfoid=t3.oid and t1.tgrelid=t2.oid'
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),rolname,proname,evtname,evtevent,evtenabled,evttags from pg_event_trigger t1,pg_proc t2,pg_authid t3 where t1.evtfoid=t2.oid and t1.evtowner=t3.oid'
done
echo "建議: "
echo "    請管理員注意觸發器和事件觸發器的必要性. "
echo -e "\n"

echo "----->>>---->>>  檢查是否使用了a-z 0-9 _ 以外的字母作為對象名: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -c 'select distinct datname from (select datname,regexp_split_to_table(datname,$$$$) word from pg_database) t where (not (ascii(word) >=97 and ascii(word) <=122)) and (not (ascii(word) >=48 and ascii(word) <=57)) and ascii(word)<>95'
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),relname,relkind from (select relname,relkind,regexp_split_to_table(relname,$$$$) word from pg_class) t where (not (ascii(word) >=97 and ascii(word) <=122)) and (not (ascii(word) >=48 and ascii(word) <=57)) and ascii(word)<>95 group by 1,2,3'
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(), typname from (select typname,regexp_split_to_table(typname,$$$$) word from pg_type) t where (not (ascii(word) >=97 and ascii(word) <=122)) and (not (ascii(word) >=48 and ascii(word) <=57)) and ascii(word)<>95 group by 1,2'
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(), proname from (select proname,regexp_split_to_table(proname,$$$$) word from pg_proc where proname !~ $$^RI_FKey_$$) t where (not (ascii(word) >=97 and ascii(word) <=122)) and (not (ascii(word) >=48 and ascii(word) <=57)) and ascii(word)<>95 group by 1,2'
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select current_database(),nspname,relname,attname from (select nspname,relname,attname,regexp_split_to_table(attname,$$$$) word from pg_class a,pg_attribute b,pg_namespace c where a.oid=b.attrelid and a.relnamespace=c.oid ) t where (not (ascii(word) >=97 and ascii(word) <=122)) and (not (ascii(word) >=48 and ascii(word) <=57)) and ascii(word)<>95 group by 1,2,3,4'
done
echo "建議: "
echo "    建議任何identify都隻使用 a-z, 0-9, _ (例如表名, 列名, 視圖名, 函數名, 類型名, 數據庫名, schema名, 物化視圖名等等). "
echo -e "\n"

echo "----->>>---->>>  鎖等待: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE -x --pset=pager=off <<EOF
create or replace function f_lock_level(i_mode text) returns int as \$\$
declare
begin
  case i_mode
    when 'INVALID' then return 0;
    when 'AccessShareLock' then return 1;
    when 'RowShareLock' then return 2;
    when 'RowExclusiveLock' then return 3;
    when 'ShareUpdateExclusiveLock' then return 4;
    when 'ShareLock' then return 5;
    when 'ShareRowExclusiveLock' then return 6;
    when 'ExclusiveLock' then return 7;
    when 'AccessExclusiveLock' then return 8;
    else return 0;
  end case;
end; 
\$\$ language plpgsql strict;

with t_wait as                     
(select a.mode,a.locktype,a.database,a.relation,a.page,a.tuple,a.classid,a.objid,a.objsubid,a.pid,a.virtualtransaction,a.virtualxid,a,transactionid,b.query,b.xact_start,b.query_start,b.usename,b.datname from pg_locks a,pg_stat_activity b where a.pid=b.pid and not a.granted),
t_run as 
(select a.mode,a.locktype,a.database,a.relation,a.page,a.tuple,a.classid,a.objid,a.objsubid,a.pid,a.virtualtransaction,a.virtualxid,a,transactionid,b.query,b.xact_start,b.query_start,b.usename,b.datname from pg_locks a,pg_stat_activity b where a.pid=b.pid and a.granted) 
select r.locktype,r.mode r_mode,r.usename r_user,r.datname r_db,r.relation::regclass,r.pid r_pid,r.xact_start r_xact_start,r.query_start r_query_start,now()-r.query_start r_locktime,r.query r_query,
w.mode w_mode,w.pid w_pid,w.xact_start w_xact_start,w.query_start w_query_start,now()-w.query_start w_locktime,w.query w_query  
from t_wait w,t_run r where
  r.locktype is not distinct from w.locktype and
  r.database is not distinct from w.database and
  r.relation is not distinct from w.relation and
  r.page is not distinct from w.page and
  r.tuple is not distinct from w.tuple and
  r.classid is not distinct from w.classid and
  r.objid is not distinct from w.objid and
  r.objsubid is not distinct from w.objsubid and
  r.transactionid is not distinct from w.transactionid and
  r.pid <> w.pid
  order by f_lock_level(w.mode)+f_lock_level(r.mode) desc,r.xact_start;
EOF
echo "建議: "
echo "    鎖等待狀態, 反映業務邏輯的問題或者SQL性能有問題, 建議深入排查持鎖的SQL. "
echo -e "\n"

echo "----->>>---->>>  繼承關係檢查: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -q -c 'select inhrelid::regclass,inhparent::regclass,inhseqno from pg_inherits order by 2,3'
done
echo "建議: "
echo "    如果使用繼承來實現分區表, 注意分區表的觸發器中邏輯是否正常, 對於時間模式的分區表是否需要及時加分區, 修改觸發器函數 . "
echo "    建議繼承表的權限統一, 如果權限不一致, 可能導致某些用戶查詢時權限不足. "
echo -e "\n"


echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo "|                      重置統計信息                       |"
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo ""

echo "----->>>---->>>  重置統計信息: "
for db in `psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -t -A -q -c 'select datname from pg_database where datname not in ($$template0$$, $$template1$$)'`
do
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $db --pset=pager=off -c 'select pg_stat_reset()'
done
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -c 'select pg_stat_reset_shared($$bgwriter$$)'
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -c 'select pg_stat_reset_shared($$archiver$$)'

echo "----->>>---->>>  重置pg_stat_statements統計信息: "
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE --pset=pager=off -q -A -c 'select pg_stat_statements_reset()'

}  # common function end


primary() {
echo "----->>>---->>>  獲取recovery.done md5值: "
md5sum $PGDATA/recovery.done
echo "建議: "
echo "    主備md5值一致(判斷主備配置文件是否內容一致的一種手段, 或者使用diff)."
echo -e "\n"

echo "----->>>---->>>  獲取recovery.done配置: "
grep '^\ *[a-z]' $PGDATA/recovery.done|awk -F "#" '{print $1}'
echo "建議: "
echo "    在primary_conninfo中不要配置密碼, 容易泄露. 建議為流複製用戶創建replication角色的用戶, 並且配置pg_hba.conf隻允許需要的來源IP連接. "
echo -e "\n"
}  # primary function end


standby() {
echo "----->>>---->>>  獲取recovery.conf md5值: "
md5sum $PGDATA/recovery.conf
echo "建議: "
echo "    主備md5值一致(判斷主備配置文件是否內容一致的一種手段, 或者使用diff)."
echo -e "\n"

echo "----->>>---->>>  獲取recovery.conf配置: "
grep '^\ *[a-z]' $PGDATA/recovery.conf|awk -F "#" '{print $1}'
echo "建議: "
echo "    在primary_conninfo中不要配置密碼, 容易泄露. 建議為流複製用戶創建replication角色的用戶, 並且配置pg_hba.conf隻允許需要的來源IP連接. "
echo -e "\n"
}  # standby function end


adds() {
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo "|                        附加信息                         |"
echo "|+++++++++++++++++++++++++++++++++++++++++++++++++++++++++|"
echo ""

echo "----->>>---->>>  附件1 : `date -d '-1 day' +"%Y-%m-%d"` 操作係統sysstat收集的統計信息 "
sar -A -f /var/log/sa/sa`date -d '-1 day' +%d`
echo -e "\n"

echo "----->>>---->>>  其他建議: "
echo "    其他建議的巡檢項: "
echo "        HA 狀態是否正常, 例如檢查HA程序, 檢查心跳表的延遲. "
echo "        sar io, load, ...... "
echo "    巡檢結束後, 清理csv日誌 "
}  # adds function end


if [ $is_standby == 't' ]; then
standby
else
primary
fi

common
adds
cd $pwd
return 0

#  備注
#  csv日誌分析需要優化
#  某些操作需要root

最後更新:2017-04-01 13:37:07

  上一篇:go 在雲上搭建大規模實時數據流處理係統
  下一篇:go 在線應用雲調試係統--Zdebugger