[pgpool-general: 7370] Watchdog New Primary & Standby shutdown when Node 0 Fails
Joe Madden
Joe.Madden at mottmac.com
Fri Dec 18 03:57:08 JST 2020
Hi List,
I've got a Pgpool-II setup with three nodes:

| Pgpool Node 0 (192.168.40.66) | Pgpool Node 1 (192.168.40.67) | Pgpool Node 2 (192.168.40.64) |

These communicate over a switch with the backends:

| PostgreSQL 12 Primary (192.168.40.61) | PostgreSQL 12 Standby (192.168.40.60) |
This works fine: standby nodes 1 and 2 can be shut down, restarted etc. without an issue. But when node 0 is shut down, one of the child processes fails and causes nodes 1 and 2 to shut down about 60 seconds after failover.
I feel like this could be a bug. Our configurations on all three nodes are identical, bar the weight parameter and the node id, of course.
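For reference, watchdog and backend state can be checked from any node via the delegate IP with something like the following (the pcp/database user names here are just placeholders):

  pcp_watchdog_info -h 192.168.40.70 -p 9898 -U pgpool -v
  psql -h 192.168.40.70 -p 9999 -U pgpool postgres -c 'show pool_nodes'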
Config:
# ----------------------------
# pgPool-II configuration file
# ----------------------------
#
# This file consists of lines of the form:
#
# name = value
#
# Whitespace may be used. Comments are introduced with "#" anywhere on a line.
# The complete list of parameter names and allowed values can be found in the
# pgPool-II documentation.
#
# This file is read on server startup and when the server receives a SIGHUP
# signal. If you edit the file on a running system, you have to SIGHUP the
# server for the changes to take effect, or use "pgpool reload". Some
# parameters, which are marked below, require a server shutdown and restart to
# take effect.
#
#------------------------------------------------------------------------------
# BACKEND CLUSTERING MODE
# Choose one of: 'streaming_replication', 'native_replication',
# 'logical_replication', 'slony', 'raw' or 'snapshot_isolation'
# (change requires restart)
#------------------------------------------------------------------------------
backend_clustering_mode = 'streaming_replication'
#------------------------------------------------------------------------------
# CONNECTIONS
#------------------------------------------------------------------------------
# - pgpool Connection Settings -
listen_addresses = '*'
# Host name or IP address to listen on:
# '*' for all, '' for no TCP/IP connections
# (change requires restart)
port = 9999
# Port number
# (change requires restart)
socket_dir = '/tmp'
# Unix domain socket path
# The Debian package defaults to
# /var/run/postgresql
# (change requires restart)
reserved_connections = 0
# Number of reserved connections.
# Pgpool-II does not accept connections if over
# num_init_children - reserved_connections.
# - pgpool Communication Manager Connection Settings -
pcp_listen_addresses = '*'
# Host name or IP address for pcp process to listen on:
# '*' for all, '' for no TCP/IP connections
# (change requires restart)
pcp_port = 9898
# Port number for pcp
# (change requires restart)
pcp_socket_dir = '/tmp'
# Unix domain socket path for pcp
# The Debian package defaults to
# /var/run/postgresql
# (change requires restart)
listen_backlog_multiplier = 2
# Set the backlog parameter of listen(2) to
# num_init_children * listen_backlog_multiplier.
# (change requires restart)
serialize_accept = off
# whether to serialize accept() call to avoid thundering herd problem
# (change requires restart)
# - Backend Connection Settings -
backend_hostname0 = '192.168.40.61'
# Host name or IP address to connect to for backend 0
backend_port0 = 5432
# Port number for backend 0
backend_weight0 = 1
# Weight for backend 0 (only in load balancing mode)
backend_data_directory0 = '/var/lib/pgsql/12/data/'
# Data directory for backend 0
backend_flag0 = 'ALLOW_TO_FAILOVER'
# Controls various backend behavior
# ALLOW_TO_FAILOVER, DISALLOW_TO_FAILOVER
# or ALWAYS_PRIMARY
backend_application_name0 = '192.168.40.61'
# walsender's application_name, used for "show pool_nodes" command
# - Backend Connection Settings -
backend_hostname1 = '192.168.40.60'
# Host name or IP address to connect to for backend 1
backend_port1 = 5432
# Port number for backend 1
backend_weight1 = 1
# Weight for backend 1 (only in load balancing mode)
backend_data_directory1 = '/var/lib/pgsql/12/data/'
# Data directory for backend 1
backend_flag1 = 'ALLOW_TO_FAILOVER'
# Controls various backend behavior
# ALLOW_TO_FAILOVER, DISALLOW_TO_FAILOVER
# or ALWAYS_PRIMARY
backend_application_name1 = '192.168.40.60'
# walsender's application_name, used for "show pool_nodes" command
# - Authentication -
enable_pool_hba = on
# Use pool_hba.conf for client authentication
pool_passwd = 'pool_passwd'
# File name of pool_passwd for md5 authentication.
# "" disables pool_passwd.
# (change requires restart)
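# (For reference, pool_passwd entries can be generated with pgpool's pg_md5
#  tool, e.g. pg_md5 --md5auth --username=pgpool 'secret' -- the user name
#  and password here are only examples.)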
authentication_timeout = 1min
# Delay in seconds to complete client authentication
# 0 means no timeout.
allow_clear_text_frontend_auth = off
# Allow Pgpool-II to use clear text password authentication
# with clients, when pool_passwd does not
# contain the user password
# - SSL Connections -
ssl = off
# Enable SSL support
# (change requires restart)
#ssl_key = 'server.key'
# SSL private key file
# (change requires restart)
#ssl_cert = 'server.crt'
# SSL public certificate file
# (change requires restart)
#ssl_ca_cert = ''
# Single PEM format file containing
# CA root certificate(s)
# (change requires restart)
#ssl_ca_cert_dir = ''
# Directory containing CA root certificate(s)
# (change requires restart)
#ssl_crl_file = ''
# SSL certificate revocation list file
# (change requires restart)
ssl_ciphers = 'HIGH:MEDIUM:+3DES:!aNULL'
# Allowed SSL ciphers
# (change requires restart)
ssl_prefer_server_ciphers = off
# Use server's SSL cipher preferences,
# rather than the client's
# (change requires restart)
ssl_ecdh_curve = 'prime256v1'
# Name of the curve to use in ECDH key exchange
ssl_dh_params_file = ''
# Name of the file containing Diffie-Hellman parameters used
# for the so-called ephemeral DH family of SSL ciphers.
#ssl_passphrase_command=''
# Sets an external command to be invoked when a passphrase
# for decrypting an SSL file needs to be obtained
# (change requires restart)
#------------------------------------------------------------------------------
# POOLS
#------------------------------------------------------------------------------
# - Concurrent session and pool size -
num_init_children = 32
# Number of concurrent sessions allowed
# (change requires restart)
max_pool = 4
# Number of connection pool caches per connection
# (change requires restart)
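# (Note: with these values pgpool may open up to
#  num_init_children * max_pool = 32 * 4 = 128 connections per backend,
#  so each backend's max_connections needs headroom for that.)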
# - Life time -
child_life_time = 5min
# Pool exits after being idle for this many seconds
child_max_connections = 0
# Pool exits after receiving that many connections
# 0 means no exit
connection_life_time = 0
# Connection to backend closes after being idle for this many seconds
# 0 means no close
client_idle_limit = 0
# Client is disconnected after being idle for that many seconds
# (even inside an explicit transaction!)
# 0 means no disconnection
#------------------------------------------------------------------------------
# LOGS
#------------------------------------------------------------------------------
# - Where to log -
log_destination = 'stderr'
# Where to log
# Valid values are combinations of stderr,
# and syslog. Default to stderr.
# - What to log -
log_line_prefix = '%t: pid %p: ' # printf-style string to output at beginning of each log line.
log_connections = off
# Log connections
log_disconnections = off
# Log disconnections
log_hostname = off
# Hostname will be shown in ps status
# and in logs if connections are logged
log_statement = off
# Log all statements
log_per_node_statement = off
# Log all statements
# with node and backend information
log_client_messages = off
# Log any client messages
log_standby_delay = 'if_over_threshold'
# Log standby delay
# Valid values are combinations of always,
# if_over_threshold, none
# - Syslog specific -
syslog_facility = 'LOCAL0'
# Syslog local facility. Default to LOCAL0
syslog_ident = 'pgpool'
# Syslog program identification string
# Default to 'pgpool'
# - Debug -
#log_error_verbosity = default # terse, default, or verbose messages
#client_min_messages = notice # values in order of decreasing detail:
# debug5
# debug4
# debug3
# debug2
# debug1
# log
# notice
# warning
# error
log_min_messages = debug5 # values in order of decreasing detail:
# debug5
# debug4
# debug3
# debug2
# debug1
# info
# notice
# warning
# error
# log
# fatal
# panic
# This is used when logging to stderr:
#logging_collector = off # Enable capturing of stderr
# into log files.
# (change requires restart)
# -- Only used if logging_collector is on ---
#log_directory = '/tmp/pgpool_log' # directory where log files are written,
# can be absolute
#log_filename = 'pgpool-%Y-%m-%d_%H%M%S.log'
# log file name pattern,
# can include strftime() escapes
#log_file_mode = 0600 # creation mode for log files,
# begin with 0 to use octal notation
#log_truncate_on_rotation = off # If on, an existing log file with the
# same name as the new log file will be
# truncated rather than appended to.
# But such truncation only occurs on
# time-driven rotation, not on restarts
# or size-driven rotation. Default is
# off, meaning append to existing files
# in all cases.
#log_rotation_age = 1d # Automatic rotation of logfiles will
# happen after that time (minutes).
# 0 disables time based rotation.
#log_rotation_size = 10MB # Automatic rotation of logfiles will
# happen after that much (KB) log output.
# 0 disables size based rotation.
#------------------------------------------------------------------------------
# FILE LOCATIONS
#------------------------------------------------------------------------------
pid_file_name = '/var/run/pgpool/pgpool.pid'
# PID file name
# Can be specified as relative to the
# location of pgpool.conf file or
# as an absolute path
# (change requires restart)
logdir = '/tmp'
# Directory of pgPool status file
# (change requires restart)
#------------------------------------------------------------------------------
# CONNECTION POOLING
#------------------------------------------------------------------------------
connection_cache = on
# Activate connection pools
# (change requires restart)
# Semicolon separated list of queries
# to be issued at the end of a session
# The default is for 8.3 and later
reset_query_list = 'ABORT; DISCARD ALL'
# The following one is for 8.2 and before
#reset_query_list = 'ABORT; RESET ALL; SET SESSION AUTHORIZATION DEFAULT'
#------------------------------------------------------------------------------
# REPLICATION MODE
#------------------------------------------------------------------------------
replicate_select = off
# Replicate SELECT statements
# when in replication mode
# replicate_select is higher priority than
# load_balance_mode.
insert_lock = off
# Automatically locks a dummy row or a table
# with INSERT statements to keep SERIAL data
# consistency
# Without SERIAL, no lock will be issued
lobj_lock_table = ''
# When rewriting lo_creat command in
# replication mode, specify table name to
# lock
# - Degenerate handling -
replication_stop_on_mismatch = off
# On disagreement with the packet kind
# sent from backend, degenerate the node
# which is most likely "minority"
# If off, just force this session to exit
failover_if_affected_tuples_mismatch = off
# On disagreement with the number of affected
# tuples in UPDATE/DELETE queries, then
# degenerate the node which is most likely
# "minority".
# If off, just abort the transaction to
# keep the consistency
#------------------------------------------------------------------------------
# LOAD BALANCING MODE
#------------------------------------------------------------------------------
load_balance_mode = on
# Activate load balancing mode
# (change requires restart)
ignore_leading_white_space = on
# Ignore leading white spaces of each query
read_only_function_list = ''
# Comma separated list of function names
# that don't write to database
# Regexp are accepted
write_function_list = ''
# Comma separated list of function names
# that write to database
# Regexp are accepted
# If both read_only_function_list and write_function_list
# are empty, the function's volatile property is checked.
# If it's volatile, the function is regarded as a
# writing function.
primary_routing_query_pattern_list = ''
# Semicolon separated list of query patterns
# that should be sent to primary node
# Regexp are accepted
# valid for streaming replication mode only.
database_redirect_preference_list = ''
# comma separated list of pairs of database and node id.
# example: postgres:primary,mydb[0-4]:1,mydb[5-9]:2'
# valid for streaming replication mode only.
app_name_redirect_preference_list = ''
# comma separated list of pairs of app name and node id.
# example: 'psql:primary,myapp[0-4]:1,myapp[5-9]:standby'
# valid for streaming replication mode only.
allow_sql_comments = off
# if on, ignore SQL comments when judging if load balance or
# query cache is possible.
# If off, SQL comments effectively prevent the judgment
# (pre 3.4 behavior).
disable_load_balance_on_write = 'transaction'
# Load balance behavior when write query is issued
# in an explicit transaction.
#
# Valid values:
#
# 'transaction' (default):
# if a write query is issued, subsequent
# read queries will not be load balanced
# until the transaction ends.
#
# 'trans_transaction':
# if a write query is issued, subsequent
# read queries in an explicit transaction
# will not be load balanced until the session ends.
#
# 'dml_adaptive':
# Queries on the tables that have already been
# modified within the current explicit transaction will
# not be load balanced until the end of the transaction.
#
# 'always':
# if a write query is issued, read queries will
# not be load balanced until the session ends.
#
# Note that any query not in an explicit transaction
# is not affected by the parameter.
dml_adaptive_object_relationship_list = ''
# comma separated list of object pairs
# [object]:[dependent-object], to disable load balancing
# of dependent objects within the explicit transaction
# after WRITE statement is issued on (depending-on) object.
#
# example: 'tb_t1:tb_t2,insert_tb_f_func():tb_f,tb_v:my_view'
# Note: function name in this list must also be present in
# the write_function_list
# only valid for disable_load_balance_on_write = 'dml_adaptive'.
statement_level_load_balance = off
# Enables statement level load balancing
#------------------------------------------------------------------------------
# NATIVE REPLICATION MODE
#------------------------------------------------------------------------------
# - Streaming -
sr_check_period = 10
# Streaming replication check period
# Disabled (0) by default
sr_check_user = 'repmgr'
# Streaming replication check user
# This is necessary even if you disable streaming
# replication delay check by sr_check_period = 0
sr_check_password = '###################'
# Password for streaming replication check user
# Leaving it empty will make Pgpool-II first look for the
# password in the pool_passwd file before using the empty password
sr_check_database = 'repmgr'
# Database name for streaming replication check
delay_threshold = 10000000
# Threshold before not dispatching query to standby node
# Unit is in bytes
# Disabled (0) by default
# - Special commands -
follow_primary_command = ''
# Executes this command after main node failover
# Special values:
# %d = failed node id
# %h = failed node host name
# %p = failed node port number
# %D = failed node database cluster path
# %m = new main node id
# %H = new main node hostname
# %M = old main node id
# %P = old primary node id
# %r = new main port number
# %R = new main database cluster path
# %N = old primary node hostname
# %S = old primary node port number
# %% = '%' character
#------------------------------------------------------------------------------
# HEALTH CHECK GLOBAL PARAMETERS
#------------------------------------------------------------------------------
health_check_period = 5
# Health check period
# Disabled (0) by default
health_check_timeout = 20
# Health check timeout
# 0 means no timeout
health_check_user = 'pgpool'
# Health check user
health_check_password = '#############################'
# Password for health check user
# Leaving it empty will make Pgpool-II first look for the
# password in the pool_passwd file before using the empty password
health_check_database = 'postgres'
# Database name for health check. If '', tries 'postgres' first, then 'template1'.
health_check_max_retries = 3
# Maximum number of times to retry a failed health check before giving up.
health_check_retry_delay = 1
# Amount of time to wait (in seconds) between retries.
connect_timeout = 10000
# Timeout value in milliseconds before giving up connecting to a backend.
# Default is 10000 ms (10 seconds). Users on a flaky network may want to
# increase the value. 0 means no timeout.
# Note that this value is not only used for health checks,
# but also for ordinary connections to backends.
#------------------------------------------------------------------------------
# HEALTH CHECK PER NODE PARAMETERS (OPTIONAL)
#------------------------------------------------------------------------------
#health_check_period0 = 0
#health_check_timeout0 = 20
#health_check_user0 = 'nobody'
#health_check_password0 = ''
#health_check_database0 = ''
#health_check_max_retries0 = 0
#health_check_retry_delay0 = 1
#connect_timeout0 = 10000
#------------------------------------------------------------------------------
# FAILOVER AND FAILBACK
#------------------------------------------------------------------------------
#failover_command = '/opt/pgpool/scripts/failover.sh %d %h %p %D %m %H %M %P %r %R'
failover_command = '/etc/pgpool-II/failover.sh %d %H %h %p %D %m %M %P %r %R %N %S'
# Executes this command at failover
# Special values:
# %d = failed node id
# %h = failed node host name
# %p = failed node port number
# %D = failed node database cluster path
# %m = new main node id
# %H = new main node hostname
# %M = old main node id
# %P = old primary node id
# %r = new main port number
# %R = new main database cluster path
# %N = old primary node hostname
# %S = old primary node port number
# %% = '%' character
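# For context, /etc/pgpool-II/failover.sh simply receives the placeholders
# above as positional arguments. A simplified sketch (not the exact script;
# the ssh + pg_ctl promotion step and binary path are assumptions):
#
#   #!/bin/bash
#   FAILED_NODE_ID="$1"      # %d
#   NEW_MAIN_HOST="$2"       # %H
#   FAILED_HOST="$3"         # %h
#   FAILED_PORT="$4"         # %p
#   FAILED_PGDATA="$5"       # %D
#   NEW_MAIN_ID="$6"         # %m
#   OLD_MAIN_ID="$7"         # %M
#   OLD_PRIMARY_ID="$8"      # %P
#   NEW_MAIN_PORT="$9"       # %r
#   NEW_MAIN_PGDATA="${10}"  # %R
#   OLD_PRIMARY_HOST="${11}" # %N
#   OLD_PRIMARY_PORT="${12}" # %S
#   # Promote the new main only if the failed node was the primary.
#   if [ "$FAILED_NODE_ID" = "$OLD_PRIMARY_ID" ]; then
#       ssh postgres@"$NEW_MAIN_HOST" \
#           "/usr/pgsql-12/bin/pg_ctl promote -D $NEW_MAIN_PGDATA"
#   fi
#   exit 0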
failback_command = ''
# Executes this command at failback.
# Special values:
# %d = failed node id
# %h = failed node host name
# %p = failed node port number
# %D = failed node database cluster path
# %m = new main node id
# %H = new main node hostname
# %M = old main node id
# %P = old primary node id
# %r = new main port number
# %R = new main database cluster path
# %N = old primary node hostname
# %S = old primary node port number
# %% = '%' character
failover_on_backend_error = on
# Initiates failover when reading/writing to the
# backend communication socket fails
# If set to off, pgpool will report an
# error and disconnect the session.
detach_false_primary = on
# Detach false primary if on. Only
# valid in streaming replication
# mode and with PostgreSQL 9.6 or
# after.
search_primary_node_timeout = 5min
# Timeout in seconds to search for the
# primary node when a failover occurs.
# 0 means no timeout, keep searching
# for a primary node forever.
#------------------------------------------------------------------------------
# ONLINE RECOVERY
#------------------------------------------------------------------------------
recovery_user = 'nobody'
# Online recovery user
recovery_password = ''
# Online recovery password
# Leaving it empty will make Pgpool-II first look for the
# password in the pool_passwd file before using the empty password
recovery_1st_stage_command = ''
# Executes a command in first stage
recovery_2nd_stage_command = ''
# Executes a command in second stage
recovery_timeout = 90
# Timeout in seconds to wait for the
# recovering node's postmaster to start up
# 0 means no wait
client_idle_limit_in_recovery = 0
# Client is disconnected after being idle
# for that many seconds in the second stage
# of online recovery
# 0 means no disconnection
# -1 means immediate disconnection
auto_failback = on
# Detached backend nodes are reattached automatically
# if replication_state is 'streaming'.
auto_failback_interval = 1min
# Min interval of executing auto_failback in
# seconds.
#------------------------------------------------------------------------------
# WATCHDOG
#------------------------------------------------------------------------------
# - Enabling -
use_watchdog = on
# Activates watchdog
# (change requires restart)
# -Connection to up stream servers -
trusted_servers = ''
# List of trusted servers used
# to confirm network connectivity
# (hostA,hostB,hostC,...)
# (change requires restart)
ping_path = '/bin'
# ping command path
# (change requires restart)
# - Watchdog communication Settings -
hostname0 = '192.168.40.66'
# Host name or IP address of pgpool node
# for watchdog connection
# (change requires restart)
wd_port0 = 9000
# Port number for watchdog service
# (change requires restart)
pgpool_port0 = 9999
# Port number for pgpool
# (change requires restart)
hostname1 = '192.168.40.67'
wd_port1 = 9000
pgpool_port1 = 9999
hostname2 = '192.168.40.64'
wd_port2 = 9000
pgpool_port2 = 9999
wd_priority = 90
# priority of this watchdog in leader election
# (change requires restart)
wd_authkey = '###################################'
# Authentication key for watchdog communication
# (change requires restart)
wd_ipc_socket_dir = '/tmp'
# Unix domain socket path for watchdog IPC socket
# The Debian package defaults to
# /var/run/postgresql
# (change requires restart)
# - Virtual IP control Setting -
delegate_IP = '192.168.40.70'
# delegate IP address
# If this is empty, the virtual IP is never brought up.
# (change requires restart)
if_cmd_path = '/sbin'
# path to the directory where if_up/down_cmd exists
# If if_up/down_cmd starts with "/", if_cmd_path will be ignored.
# (change requires restart)
if_up_cmd = '/usr/bin/sudo /sbin/ip addr add $_IP_$/24 dev eth0 label eth0:0'
# startup delegate IP command
# (change requires restart)
if_down_cmd = '/usr/bin/sudo /sbin/ip addr del $_IP_$/24 dev eth0'
# shutdown delegate IP command
# (change requires restart)
arping_path = '/usr/sbin'
# arping command path
# If arping_cmd starts with "/", arping_path will be ignored.
# (change requires restart)
arping_cmd = '/usr/bin/sudo /usr/sbin/arping -U $_IP_$ -w 1 -I eth0'
# arping command
# (change requires restart)
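# Since if_up_cmd, if_down_cmd and arping_cmd all go through sudo, the user
# running pgpool needs passwordless sudo for those binaries, e.g. a sudoers
# entry along these lines (assuming pgpool runs as the postgres user):
#
#   postgres ALL=(root) NOPASSWD: /sbin/ip, /usr/sbin/arping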
# - Behavior on escalation Setting -
clear_memqcache_on_escalation = on
# Clear all the query cache on shared memory
# when a standby pgpool escalates to active pgpool
# (= virtual IP holder).
# This should be off if clients connect to pgpool
# without using the virtual IP.
# (change requires restart)
wd_escalation_command = ''
# Executes this command at escalation on new active pgpool.
# (change requires restart)
wd_de_escalation_command = ''
# Executes this command when leader pgpool resigns from being leader.
# (change requires restart)
# - Watchdog consensus settings for failover -
failover_when_quorum_exists = on
# Only perform backend node failover
# when the watchdog cluster holds the quorum
# (change requires restart)
failover_require_consensus = on
# Perform failover when a majority of Pgpool-II nodes
# agrees on the backend node status change
# (change requires restart)
allow_multiple_failover_requests_from_node = off
# A Pgpool-II node can cast multiple votes
# for building the consensus on failover
# (change requires restart)
enable_consensus_with_half_votes = off
# apply majority rule for consensus and quorum computation
# at 50% of votes in a cluster with even number of nodes.
# when enabled the existence of quorum and consensus
# on failover is resolved after receiving half of the
# total votes in the cluster, otherwise both these
# decisions require at least one more vote than
# half of the total votes.
# (change requires restart)
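# (With three watchdog nodes and enable_consensus_with_half_votes = off,
#  quorum and failover consensus each require at least 2 of the 3 nodes.)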
# - Lifecheck Setting -
# -- common --
wd_monitoring_interfaces_list = 'any' # Comma separated list of interfaces names to monitor.
# If any interface from the list is active, the watchdog will
# consider the network to be fine
# 'any' to enable monitoring on all interfaces except loopback
# '' to disable monitoring
# (change requires restart)
wd_lifecheck_method = 'heartbeat'
# Method of watchdog lifecheck ('heartbeat' or 'query' or 'external')
# (change requires restart)
wd_interval = 10
# lifecheck interval (sec) > 0
# (change requires restart)
# -- heartbeat mode --
heartbeat_hostname0 = '192.168.40.66'
# Host name or IP address used
# for sending heartbeat signal.
# (change requires restart)
heartbeat_port0 = 9694
# Port number used for receiving/sending heartbeat signal
# Usually this is the same as heartbeat_portX.
# (change requires restart)
heartbeat_device0 = 'eth0'
# Name of the NIC device (such as 'eth0')
# used for sending/receiving heartbeat
# signal to/from destination 0.
# This works only when this is not empty
# and pgpool has root privilege.
# (change requires restart)
heartbeat_hostname1 = '192.168.40.67'
heartbeat_port1 = 9694
heartbeat_device1 = 'eth0'
heartbeat_hostname2 = '192.168.40.64'
heartbeat_port2 = 9694
heartbeat_device2 = 'eth0'
wd_heartbeat_keepalive = 2
# Interval time of sending heartbeat signal (sec)
# (change requires restart)
wd_heartbeat_deadtime = 30
# Deadtime interval for heartbeat signal (sec)
# (change requires restart)
# -- query mode --
wd_life_point = 3
# lifecheck retry times
# (change requires restart)
wd_lifecheck_query = 'SELECT 1'
# lifecheck query to pgpool from watchdog
# (change requires restart)
wd_lifecheck_dbname = 'template1'
# Database name connected for lifecheck
# (change requires restart)
wd_lifecheck_user = 'nobody'
# watchdog user monitoring pgpools in lifecheck
# (change requires restart)
wd_lifecheck_password = ''
# Password for watchdog user in lifecheck
# Leaving it empty will make Pgpool-II first look for the
# password in the pool_passwd file before using the empty password
# (change requires restart)
#------------------------------------------------------------------------------
# OTHERS
#------------------------------------------------------------------------------
relcache_expire = 0
# Life time of relation cache in seconds.
# 0 means no cache expiration (the default).
# The relation cache is used to cache
# query results against the PostgreSQL system
# catalog to obtain various information,
# including table structures or whether a
# table is temporary or not. The cache is
# maintained in a pgpool child's local memory
# and is kept as long as the child survives.
# If someone modifies a table by using
# ALTER TABLE or the like, the relcache is
# not consistent anymore.
# For this purpose, relcache_expire
# controls the life time of the cache.
relcache_size = 256
# Number of relation cache
# entry. If you see frequently:
# "pool_search_relcache: cache replacement happend"
# in the pgpool log, you might want to increate this number.
check_temp_table = catalog
# Temporary table check method. catalog, trace or none.
# Default is catalog.
check_unlogged_table = on
# If on, enable unlogged table check in SELECT statements.
# This initiates queries against the system catalog of primary/main,
# thus increasing the load on the primary.
# If you are absolutely sure that your system never uses unlogged tables
# and you want to save access to primary/main, you could turn this off.
# Default is on.
enable_shared_relcache = on
# If on, the relation cache is stored in the memory cache,
# which is shared among child processes.
# Default is on.
# (change requires restart)
relcache_query_target = primary # Target node to send relcache queries. Default is primary node.
# If load_balance_node is specified, queries will be sent to load balance node.
#------------------------------------------------------------------------------
# IN MEMORY QUERY MEMORY CACHE
#------------------------------------------------------------------------------
memory_cache_enabled = off
# If on, use the memory cache functionality, off by default
# (change requires restart)
memqcache_method = 'shmem'
# Cache storage method. Either 'shmem' (shared memory) or
# 'memcached'. 'shmem' by default
# (change requires restart)
memqcache_memcached_host = 'localhost'
# Memcached host name or IP address. Mandatory if
# memqcache_method = 'memcached'.
# Defaults to localhost.
# (change requires restart)
memqcache_memcached_port = 11211
# Memcached port number. Mandatory if memqcache_method = 'memcached'.
# Defaults to 11211.
# (change requires restart)
memqcache_total_size = 64MB
# Total memory size in bytes for storing memory cache.
# Mandatory if memqcache_method = 'shmem'.
# Defaults to 64MB.
# (change requires restart)
memqcache_max_num_cache = 1000000
# Total number of cache entries. Mandatory
# if memqcache_method = 'shmem'.
# Each cache entry consumes 48 bytes on shared memory.
# Defaults to 1,000,000 (45.8MB).
# (change requires restart)
memqcache_expire = 0
# Memory cache entry life time specified in seconds.
# 0 means infinite life time. 0 by default.
# (change requires restart)
memqcache_auto_cache_invalidation = on
# If on, invalidation of query cache is triggered by corresponding
# DDL/DML/DCL(and memqcache_expire). If off, it is only triggered
# by memqcache_expire. on by default.
# (change requires restart)
memqcache_maxcache = 400kB
# Maximum SELECT result size in bytes.
# Must be smaller than memqcache_cache_block_size. Defaults to 400KB.
# (change requires restart)
memqcache_cache_block_size = 1MB
# Cache block size in bytes. Mandatory if memqcache_method = 'shmem'.
# Defaults to 1MB.
# (change requires restart)
memqcache_oiddir = '/var/log/pgpool/oiddir'
# Temporary work directory to record table oids
# (change requires restart)
cache_safe_memqcache_table_list = ''
# Comma separated list of table names to memcache
# that don't write to database
# Regexp are accepted
cache_unsafe_memqcache_table_list = ''
# Comma separated list of table names not to memcache
# that don't write to database
# Regexp are accepted
Error output:
Dec 17 18:39:49 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:49: pid 1332675: LOG: setting the local watchdog node name to "192.168.40.67:9999 Linux SVD-SLB02"
Dec 17 18:39:49 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:49: pid 1332675: LOG: watchdog cluster is configured with 2 remote nodes
Dec 17 18:39:49 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:49: pid 1332675: LOG: watchdog remote node:0 on 192.168.40.66:9000
Dec 17 18:39:49 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:49: pid 1332675: LOG: watchdog remote node:1 on 192.168.40.64:9000
Dec 17 18:39:49 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:49: pid 1332675: LOG: ensure availibility on any interface
Dec 17 18:39:49 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:49: pid 1332675: LOG: watchdog node state changed from [DEAD] to [LOADING]
Dec 17 18:39:49 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:49: pid 1332675: LOG: new outbound connection to 192.168.40.64:9000
Dec 17 18:39:50 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:50: pid 1332675: LOG: new watchdog node connection is received from "192.168.40.66:62151"
Dec 17 18:39:50 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:50: pid 1332675: LOG: new node joined the cluster hostname:"192.168.40.66" port:9000 pgpool_port:9999
Dec 17 18:39:50 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:50: pid 1332675: DETAIL: Pgpool-II version:"4.2.0" watchdog messaging version: 1.2
Dec 17 18:39:53 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:53: pid 1332675: LOG: watchdog node state changed from [LOADING] to [INITIALIZING]
Dec 17 18:39:54 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:54: pid 1332675: LOG: watchdog node state changed from [INITIALIZING] to [STANDING FOR LEADER]
Dec 17 18:39:54 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:54: pid 1332675: LOG: watchdog node state changed from [STANDING FOR LEADER] to [PARTICIPATING IN ELECTION]
Dec 17 18:39:54 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:54: pid 1332675: LOG: watchdog node state changed from [PARTICIPATING IN ELECTION] to [INITIALIZING]
Dec 17 18:39:54 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:54: pid 1332675: LOG: setting the remote node "192.168.40.66:9999 Linux SVD-SLB01" as watchdog cluster leader
Dec 17 18:39:55 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:55: pid 1332675: LOG: watchdog node state changed from [INITIALIZING] to [STANDBY]
Dec 17 18:39:55 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:55: pid 1332675: LOG: successfully joined the watchdog cluster as standby node
Dec 17 18:39:55 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:55: pid 1332675: DETAIL: our join coordinator request is accepted by cluster leader node "192.168.40.66:9999 Linux SVD-SLB01"
Dec 17 18:39:55 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:55: pid 1332675: LOG: new IPC connection received
Dec 17 18:39:55 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:55: pid 1332675: LOG: new IPC connection received
Dec 17 18:39:55 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:55: pid 1332675: LOG: new IPC connection received
Dec 17 18:39:55 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:55: pid 1332675: LOG: new IPC connection received
Dec 17 18:39:55 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:55: pid 1332675: LOG: received the get data request from local pgpool-II on IPC interface
Dec 17 18:39:55 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:55: pid 1332675: LOG: get data request from local pgpool-II node received on IPC interface is forwarded to leader watchdog node "192.168.40.66:9999 Linux SVD-SLB01"
Dec 17 18:39:55 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:55: pid 1332675: DETAIL: waiting for the reply...
Dec 17 18:39:59 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:59: pid 1332675: LOG: new watchdog node connection is received from "192.168.40.64:15019"
Dec 17 18:39:59 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:59: pid 1332675: LOG: new node joined the cluster hostname:"192.168.40.64" port:9000 pgpool_port:9999
Dec 17 18:39:59 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:39:59: pid 1332675: DETAIL: Pgpool-II version:"4.2.0" watchdog messaging version: 1.2
Dec 17 18:40:00 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:00: pid 1332675: LOG: new outbound connection to 192.168.40.66:9000
Dec 17 18:40:44 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:44: pid 1332675: LOG: remote node "192.168.40.66:9999 Linux SVD-SLB01" is shutting down
Dec 17 18:40:44 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:44: pid 1332675: LOG: watchdog cluster has lost the coordinator node
Dec 17 18:40:44 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:44: pid 1332675: LOG: removing the remote node "192.168.40.66:9999 Linux SVD-SLB01" from watchdog cluster leader
Dec 17 18:40:44 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:44: pid 1332675: LOG: We have lost the cluster leader node "192.168.40.66:9999 Linux SVD-SLB01"
Dec 17 18:40:44 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:44: pid 1332675: LOG: watchdog node state changed from [STANDBY] to [JOINING]
Dec 17 18:40:44 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:44: pid 1332675: LOG: watchdog node state changed from [JOINING] to [INITIALIZING]
Dec 17 18:40:45 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:45: pid 1332675: LOG: watchdog node state changed from [INITIALIZING] to [STANDING FOR LEADER]
Dec 17 18:40:45 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:45: pid 1332675: LOG: watchdog node state changed from [STANDING FOR LEADER] to [LEADER]
Dec 17 18:40:45 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:45: pid 1332675: LOG: I am announcing my self as leader/coordinator watchdog node
Dec 17 18:40:45 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:45: pid 1332675: LOG: I am the cluster leader node
Dec 17 18:40:45 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:45: pid 1332675: DETAIL: our declare coordinator message is accepted by all nodes
Dec 17 18:40:45 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:45: pid 1332675: LOG: setting the local node "192.168.40.67:9999 Linux SVD-SLB02" as watchdog cluster leader
Dec 17 18:40:45 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:45: pid 1332675: LOG: I am the cluster leader node but we do not have enough nodes in cluster
Dec 17 18:40:45 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:45: pid 1332675: DETAIL: waiting for the quorum to start escalation process
Dec 17 18:40:45 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:45: pid 1332675: LOG: new IPC connection received
Dec 17 18:40:46 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:46: pid 1332675: LOG: adding watchdog node "192.168.40.64:9999 Linux SVD-WEB01" to the standby list
Dec 17 18:40:46 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:46: pid 1332675: LOG: quorum found
Dec 17 18:40:46 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:46: pid 1332675: DETAIL: starting escalation process
Dec 17 18:40:46 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:46: pid 1332675: LOG: escalation process started with PID:1332782
Dec 17 18:40:46 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:46: pid 1332675: LOG: new IPC connection received
Dec 17 18:40:46 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:46: pid 1332675: LOG: new IPC connection received
Dec 17 18:40:50 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:40:50: pid 1332675: LOG: watchdog escalation process with pid: 1332782 exit with SUCCESS.
Dec 17 18:41:03 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:41:03: pid 1332675: LOG: new watchdog node connection is received from "192.168.40.66:55496"
Dec 17 18:41:03 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:41:03: pid 1332675: LOG: new node joined the cluster hostname:"192.168.40.66" port:9000 pgpool_port:9999
Dec 17 18:41:03 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:41:03: pid 1332675: DETAIL: Pgpool-II version:"4.2.0" watchdog messaging version: 1.2
Dec 17 18:41:03 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:41:03: pid 1332675: LOG: The newly joined node:"192.168.40.66:9999 Linux SVD-SLB01" had left the cluster because it was shutdown
Dec 17 18:41:03 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:41:03: pid 1332675: LOG: new outbound connection to 192.168.40.66:9000
Dec 17 18:41:04 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:41:04: pid 1332675: LOG: adding watchdog node "192.168.40.66:9999 Linux SVD-SLB01" to the standby list
Dec 17 18:42:51 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:42:51: pid 1332675: LOG: read from socket failed, remote end closed the connection
Dec 17 18:42:51 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:42:51: pid 1332675: LOG: client socket of 192.168.40.66:9999 Linux SVD-SLB01 is closed
Dec 17 18:42:51 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:42:51: pid 1332675: LOG: read from socket failed, remote end closed the connection
Dec 17 18:42:51 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:42:51: pid 1332675: LOG: outbound socket of 192.168.40.66:9999 Linux SVD-SLB01 is closed
Dec 17 18:42:51 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:42:51: pid 1332675: LOG: remote node "192.168.40.66:9999 Linux SVD-SLB01" is not reachable
Dec 17 18:42:51 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:42:51: pid 1332675: DETAIL: marking the node as lost
Dec 17 18:42:51 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:42:51: pid 1332675: LOG: remote node "192.168.40.66:9999 Linux SVD-SLB01" is lost
Dec 17 18:42:51 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:42:51: pid 1332675: LOG: removing watchdog node "192.168.40.66:9999 Linux SVD-SLB01" from the standby list
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332675: LOG: new IPC connection received
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332675: LOG: read from socket failed, remote end closed the connection
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332675: LOG: client socket of 192.168.40.64:9999 Linux SVD-WEB01 is closed
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332675: LOG: remote node "192.168.40.64:9999 Linux SVD-WEB01" is shutting down
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332675: LOG: removing watchdog node "192.168.40.64:9999 Linux SVD-WEB01" from the standby list
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332675: LOG: We have lost the quorum
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332675: LOG: received node status change ipc message
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332675: DETAIL: No heartbeat signal from node
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332675: WARNING: watchdog life-check reported, we are disconnected from the network
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332675: DETAIL: changing the state to LOST
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332675: LOG: watchdog node state changed from [LEADER] to [LOST]
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332675: FATAL: system has lost the network
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332675: LOG: Watchdog is shutting down
Dec 17 18:43:25 SVD-SLB02 pgpool[1332673]: 2020-12-17 18:43:25: pid 1332673: LOG: watchdog child process with pid: 1332675 exits with status 768
Does anyone have any suggestions as to what this could be?
Note: if I play around with the weights I can get another node to hold the VIP, but it still shuts down when node 0 is shut down.
It does not shut down when any of the other nodes are shut down; only when node 0 is.
Thanks,
Joe