DBA必備技能:通過truss跟蹤解決監聽無法啟動案例
作者簡介:劉斌,雲和恩墨高級技術專家,擅長數據庫故障診斷分析,數據庫性能優化,自動化運維開發,堅持學習、寫作、分享,
在Oracle DBA的日常工作中,通過各種跟蹤手段,從數據庫內外部發現問題,最終找到解決方案,是必備的重要技能。
以下這則案例,就是通過OS係統級別的跟蹤,快速定位並解決問題的一個例證。在Oracle數據庫的跟蹤時,OS上Truss是非常重要的工具。
登錄數據庫主機發現一個節點監聽異常:
嚐試手工啟動,一樣報錯:
grid@xxxxdbb:/home/grid $lsnrctl statuss
LSNRCTL for IBM/AIX RISC System/6000: Version 11.2.0.4.0 - Production on 04-MAR-2016 01:23:34
Copyright (c) 1991, 2013, Oracle. All rights reserved.
Connecting to (DESCRIPTION=(ADDRESS=(PROTOCOL=IPC)(KEY=LISTENER)))
TNS-12541: TNS:no listener
TNS-12560: TNS:protocol adapter error
TNS-00511: No listener
IBM/AIX RISC System/6000 Error: 79: Connection refused
第一懷疑就是hosts文件被改了,查看host沒發現修改過。從上麵看出是使用ipc方式啟動,嚐試truss看看能不能有信息。
在Linux上調用truss跟蹤一個操作非常簡單,以下跟蹤了lsnrctl start操作的過程:
truss lsnrctl start
execve("/usr/bin/lsnrctl", 0x2FF22BF8, 0x200138A8) Err#2 ENOENT
execve("/etc/lsnrctl", 0x2FF22BF8, 0x200138A8) Err#2 ENOENT
execve("/usr/sbin/lsnrctl", 0x2FF22BF8, 0x200138A8) Err#2 ENOENT
execve("/usr/ucb/lsnrctl", 0x2FF22BF8, 0x200138A8) Err#2 ENOENT
execve("/home/grid/bin/lsnrctl", 0x2FF22BF8, 0x200138A8) Err#2 ENOENT
execve("/usr/bin/X11/lsnrctl", 0x2FF22BF8, 0x200138A8) Err#2 ENOENT
execve("/sbin/lsnrctl", 0x2FF22BF8, 0x200138A8) Err#2 ENOENT
execve("./lsnrctl", 0x2FF22BF8, 0x200138A8) Err#2 ENOENT
execve("/u01/app/11.2.0.4/grid/bin/lsnrctl", 0x2FF22BF8, 0x200138A8) argc: 2
kusla(2, 0x09FFFFFFF0001170) Err#1 EPERM
read_sysconfig(0x09001000A07D1550,, 0x09001000A0806E68) = 0x0000000000000000
sbrk(0x0000000000000000) = 0x00000001101D54D8
vmgetinfo(0x0FFFFFFFFFFFF140, 7, 16) = 0
sbrk(0x0000000000000000) = 0x00000001101D54D8
sbrk(0x0000000000000008) = 0x00000001101D54D8
__libc_sbrk(0x0000000000010020) = 0x00000001101D54E0
thread_init(0x0900000000520760, 0x09001000A0888470) =
sbrk(0x0000000000000000) = 0x00000001101E5500
vmgetinfo(0x0FFFFFFFFFFFF780, 7, 16) = 0
smcr_procattr(0, 1, 0x0FFFFFFFFFFFF778) Err#109 ENOSYS
getrpid(-1, -1, 648535941220675944) = 8912948
_getpid() = 8912948
getprocs64(0x00000001101D7BF0, 5024, , 1) = 1
appulimit(1005, 0) = 0x0FFFFFFFFE000000
_thread_self() = 66453675
thread_setmystate(0x0000000000000000, 0x0FFFFFFFFFFFF2A0) = 0
.....省略....
_getpid() = 8912948
accessx("/etc/secvars.cfg", 04, 0) = 0
statx("/etc/secvars.cfg", 0x0FFFFFFFFFFF6FB0, 176, 0) = 0
_getpid() = 8912948
accessx("/etc/passwd", 04, 0) = 0
statx("/etc/passwd", 0x0FFFFFFFFFFF6E20, 176, 0) = 0
accessx("/etc/security/passwd", 04, 0) Err#13 EACCES
accessx("/etc/passwd", 04, 0) = 0
statx("/etc/passwd", 0x0FFFFFFFFFFF6B50, 176, 0) = 0
accessx("/etc/passwd", 04, 0) = 0
statx("/etc/passwd", 0x0FFFFFFFFFFF6C20, 176, 0) = 0
_getpid() = 8912948
accessx("/etc/passwd", 04, 0) = 0
statx("/etc/passwd", 0x0FFFFFFFFFFF6E20, 176, 0) = 0
accessx("/etc/security/passwd", 04, 0) Err#13 EACCES
accessx("/etc/passwd", 04, 0) = 0
statx("/etc/passwd", 0x0FFFFFFFFFFF6B50, 176, 0) = 0
accessx("/etc/passwd", 04, 0) = 0
statx("/etc/passwd", 0x0FFFFFFFFFFF6C20, 176, 0) = 0
accessx("/etc/security/passwd", 04, 0) Err#13 EACCES
close(5) = 0
gethostname(0x0FFFFFFFFFFF9C00, 512) = 0
access("/tmp/.oracle", 0) = 0
chmod("/tmp/.oracle", 01777) Err#1 EPERM
socket(1, 1, 0) = 5
access("/tmp/.oracle/sLISTENER", 0) = 0
connext(5, 0x0FFFFFFFFFFF5BA8, 1025) Err#79 ECONNREFUSED
-->注意這裏,發現要讀取/tmp/.oracle/sLISTENER
access("/tmp/.oracle/sLISTENER", 0) = 0
_nsleep(0x0FFFFFFFFFFF5580, 0x0FFFFFFFFFFF5650) = 0
close(5) = 0
socket(1, 1, 0) = 5
connext(5, 0x0FFFFFFFFFFF5BA8, 1025) Err#79 ECONNREFUSED
access("/tmp/.oracle/sLISTENER", 0) = 0
-->注意,這裏大量讀取出錯的信息拋出
_nsleep(0x0FFFFFFFFFFF5580, 0x0FFFFFFFFFFF5650) = 0
close(5) = 0
socket(1, 1, 0) = 5
connext(5, 0x0FFFFFFFFFFF5BA8, 1025) Err#79 ECONNREFUSED
access("/tmp/.oracle/sLISTENER", 0) = 0
_nsleep(0x0FFFFFFFFFFF5580, 0x0FFFFFFFFFFF5650) = 0
close(5) = 0
。。。。。
socket(1, 1, 0) = 5
connext(5, 0x0FFFFFFFFFFF5BA8, 1025) Err#79 ECONNREFUSED
access("/tmp/.oracle/sLISTENER", 0) = 0
_nsleep(0x0FFFFFFFFFFF5580, 0x0FFFFFFFFFFF5650) = 0
close(5) = 0
socket(1, 1, 0) = 5
connext(5, 0x0FFFFFFFFFFF5BA8, 1025) Err#79 ECONNREFUSED
access("/tmp/.oracle/sLISTENER", 0) = 0
close(5) = 0
kopen("/u01/app/11.2.0.4/grid/network/mesg/tnsus.msb", O_RDONLY) = 5
kfcntl(5, F_SETFD, 0x0000000000000001) = 0
lseek(5, 0, 0) = 0
kread(5, "1513 "011303\t\t\0\0\0\0".., 256) = 256
lseek(5, 512, 0) = 512
kread(5, " 19B\0\0\0\0\0\0\0\0\0\0".., 512) = 512
lseek(5, 1024, 0) = 1024
kread(5, "\0\t\012\01A\0 &\0 -\0 F".., 172) = 172
lseek(5, 19456, 0) = 19456
kread(5, "\00F04 $\0\0\0 b04 %\0\0".., 512) = 512
Starting /u01/app/11.2.0.4/grid/bin/tnslsnr: please wait...
kwrite(1, " S t a r t i n g / u 0".., 61) = 61
kfcntl(1, F_GETFL, 0x0000000000000008) = 67110914
pipe(0x0FFFFFFFFFFF80E0) = 0
pipe(0x0FFFFFFFFFFF80D8) = 0
sigprocmask(0, 0x09001000A090DB70, 0x09001000A090DB90) = 0
kfork() = 19988596
kread(8, " N T P 0 1 4 4 8 3 5 7".., 64) = 14
_getpid() = 8912948
kfcntl(8, F_SETFD, 0x0000000000000001) = 0
kwrite(7, "\0 \001\0\0\001 :01 ,".., 179) = 179
kread(8, "\0\f\0\004\0\0\0 "\001 ., 8208) = 492
close(7) = 0
close(8) = 0
--> 在經過了連續的嚐試無法鎖定文件後,出錯,提示無法啟動:
TNSLSNR for IBM/AIX RISC System/6000: Version 11.2.0.4.0 - Production
System parameter file is /u01/app/11.2.0.4/grid/network/admin/listener.ora
Log messages written to /u01/app/grid/diag/tnslsnr/xxxxdbb/listener/alert/log.xml
Error listening on: (DESCRIPTION=(ADDRESS=(PROTOCOL=IPC)(KEY=LISTENER)))
TNS-12555: TNS:permission denied
TNS-12560: TNS:protocol adapter error
TNS-00525: Insufficient privilege for operation
IBM/AIX RISC System/6000 Error: 1: Not owner
kwrite(1, " T N S L S N R f o r ".., 470) = 470
kwrite(1, "\n", 1) = 1
lseek(5, 19968, 0) = 19968
kread(5, "\0\r04 5\0\0\0 V04 6\0\0".., 512) = 512
Listener failed to start. See the error message(s) above...
kwrite(1, " L i s t e n e r f a i".., 61) = 61
kfcntl(1, F_GETFL, 0x0000000000000008) = 67110914
close(4) = 0
kfcntl(1, F_GETFL, 0x0000000000000008) = 67110914
close(5) = 0
kfcntl(1, F_GETFL, 0x0000000013F600AB) = 67110914
kfcntl(2, F_GETFL, 0x0000000013F600AB) = 67110914
_exit(1)
嚐試刪除/tmp/.oracle/sLISTENER後監聽正常啟動.
事實上,在 /tmp/.oracle 目錄,或者有些平台在 /var/tmp/.oracle 目錄,存放一些數據庫運行時的臨時文件,包括基於socket的協議監聽臨時文件。在某些異常情況下,數據庫關閉時沒有及時清理這些文件,導致啟動出現問題。這個案例就是這樣的情況之一。
以下是一段相關描述:
The hidden directory '/var/tmp/.oracle' (or /tmp/.oracle on some platforms) or its content was removed while instances & the CRS stack were up and running. Typically this directory contains a number of "special" socket files that are used by local clients to connect via the IPC protocol (sqlnet) to various Oracle processes including the TNS listener, the CSS, CRS & EVM daemons or even database or ASM instances. These files are created when the "listening" process starts.
這是應用truss的診斷案例一則,與大家分享。
本文出自數據和雲公眾號,原文鏈接
最後更新:2017-07-18 11:03:43