-
Notifications
You must be signed in to change notification settings - Fork 836
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
synchronise issue #1558
Comments
Strangelove would like to piggyback on this issue. We are seeing similar problems with our Evmos RPC nodes. They often fall behind. However, the tendermint RPC still reports the node as healthy (caught up). This behavior prevents removing unhealthy nodes from our Load Balancer. It also causes manual SRE work to restart the problem nodes. Thus far, rebooting the node causes it to catch back up, but only temporarily. The compute resources are not taxed. No OOMs or maxed out CPU. We use Google compute:
We've tried:
So far, no luck. We run dozens of other RPC nodes for other chains and only Evmos exhibits this problem. Therefore, we can only conclude the root cause is specific to Evmos. Here's an example config:

config.toml:

proxy_app = "tcp://127.0.0.1:26658"
moniker = "evmos-mainnet-fullnode-0"
fast_sync = true
db_backend = "goleveldb"
db_dir = "data"
log_level = "info"
log_format = "plain"
genesis_file = "config/genesis.json"
priv_validator_key_file = "config/priv_validator_key.json"
priv_validator_state_file = "data/priv_validator_state.json"
priv_validator_laddr = ""
node_key_file = "config/node_key.json"
abci = "socket"
filter_peers = false
[rpc]
laddr = "tcp://0.0.0.0:26657"
cors_allowed_origins = [
"*"
]
cors_allowed_methods = [
"HEAD",
"GET",
"POST"
]
cors_allowed_headers = [
"Origin",
"Accept",
"Content-Type",
"X-Requested-With",
"X-Server-Time"
]
grpc_laddr = ""
grpc_max_open_connections = 900
unsafe = false
max_open_connections = 900
max_subscription_clients = 100
max_subscriptions_per_client = 5
experimental_subscription_buffer_size = 200
experimental_websocket_write_buffer_size = 200
experimental_close_on_slow_client = false
timeout_broadcast_tx_commit = "10s"
max_body_bytes = 1000000
max_header_bytes = 1048576
tls_cert_file = ""
tls_key_file = ""
pprof_laddr = "localhost:6060"
[p2p]
laddr = "tcp://0.0.0.0:26656"
external_address = "34.174.21.162:26656"
seeds = "20e1000e88125698264454a884812746c2eb4807@seeds.lavenderfive.com:13456,40f4fac63da8b1ce8f850b0fa0f79b2699d2ce72@seed.evmos.jerrychong.com:26656,5740e4a36e646e80cc5648daf5e983e5b5d8f265@54.39.18.27:26656,588cedb70fa1d98c14a2f2c1456bfa41e1a156a8@evmos-sentry.mercury-nodes.net:29539,86bd5cb6e762f673f1706e5889e039d5406b4b90@evmos.seed.node75.org:10756,ade4d8bc8cbe014af6ebdf3cb7b1e9ad36f412c0@seeds.polkachu.com:13456,babc3f3f7804933265ec9c40ad94f4da8e9e0017@seed.rhinostake.com:13456,e1b058e5cfa2b836ddaa496b10911da62dcf182e@evmos-seed-1.allnodes.me:26656,e726816f42831689eab9378d5d577f1d06d25716@evmos-seed-2.allnodes.me:26656"
persistent_peers = "763a16523057b317238ecf11dd58d0115d638b34@evmos-mainnet-fullnode-p2p-1.strangelove.svc.cluster.local:26656,e09d707ac82f41f84c1e61555aedf20bd45d9c66@evmos-mainnet-fullnode-p2p-2.strangelove.svc.cluster.local:26656,2b5bc46b5ee567dc75c9036e3acbe37c397f32f3@35.247.100.131:26656,42daddcdb8540971b610e9389aefc96d7c05f869@34.83.13.56:26656,9b41e64b005388f41218866cb1f48b7de6728a6f@35.230.49.7:26656,5740e4a36e646e80cc5648daf5e983e5b5d8f265@54.39.18.27:26656,588cedb70fa1d98c14a2f2c1456bfa41e1a156a8@evmos-sentry.mercury-nodes.net:29539"
upnp = false
addr_book_file = "config/addrbook.json"
addr_book_strict = true
max_num_inbound_peers = 120
max_num_outbound_peers = 60
unconditional_peer_ids = "763a16523057b317238ecf11dd58d0115d638b34,e09d707ac82f41f84c1e61555aedf20bd45d9c66,2b5bc46b5ee567dc75c9036e3acbe37c397f32f3,42daddcdb8540971b610e9389aefc96d7c05f869,9b41e64b005388f41218866cb1f48b7de6728a6f"
persistent_peers_max_dial_period = "0s"
flush_throttle_timeout = "100ms"
max_packet_msg_payload_size = 1024
send_rate = 5120000
recv_rate = 5120000
pex = true
seed_mode = false
private_peer_ids = "763a16523057b317238ecf11dd58d0115d638b34,e09d707ac82f41f84c1e61555aedf20bd45d9c66"
allow_duplicate_ip = false
handshake_timeout = "20s"
dial_timeout = "3s"
[mempool]
version = "v0"
recheck = true
broadcast = true
wal_dir = ""
size = 10000
max_txs_bytes = 1073741824
cache_size = 10000
keep-invalid-txs-in-cache = false
max_tx_bytes = 1048576
max_batch_bytes = 0
ttl-duration = "0s"
ttl-num-blocks = 0
[statesync]
enable = false
rpc_servers = ""
trust_height = 0
trust_hash = ""
trust_period = "112h0m0s"
discovery_time = "15s"
temp_dir = ""
chunk_request_timeout = "10s"
chunk_fetchers = "4"
[fastsync]
version = "v0"
[consensus]
wal_file = "data/cs.wal/wal"
timeout_propose = "3s"
timeout_propose_delta = "500ms"
timeout_prevote = "1s"
timeout_prevote_delta = "500ms"
timeout_precommit = "1s"
timeout_precommit_delta = "500ms"
timeout_commit = "5s"
double_sign_check_height = 0
skip_timeout_commit = false
create_empty_blocks = true
create_empty_blocks_interval = "0s"
peer_gossip_sleep_duration = "100ms"
peer_query_maj23_sleep_duration = "2s"
[storage]
discard_abci_responses = false
[tx_index]
indexer = "kv"
psql-conn = ""
[instrumentation]
prometheus = true
prometheus_listen_addr = ":26660"
max_open_connections = 3
namespace = "tendermint"

app.toml:

minimum-gas-prices = "20000000000aevmos"
pruning = "custom"
pruning-keep-recent = "5000"
pruning-interval = "17"
halt-height = 0
halt-time = 0
min-retain-blocks = 200000
inter-block-cache = true
index-events = []
iavl-cache-size = 781250
iavl-disable-fastnode = false
iavl-lazy-loading = false
app-db-backend = ""
pruning-keep-every = "0"
[telemetry]
service-name = ""
enabled = false
enable-hostname = false
enable-hostname-label = false
enable-service-label = false
prometheus-retention-time = 0
global-labels = []
[api]
enable = true
swagger = true
address = "tcp://0.0.0.0:1317"
max-open-connections = 1000
rpc-read-timeout = 10
rpc-write-timeout = 0
rpc-max-body-bytes = 1000000
enabled-unsafe-cors = true
[rosetta]
enable = false
address = ":8080"
blockchain = "app"
network = "network"
retries = 3
offline = false
enable-fee-suggestion = false
gas-to-suggest = 200000
denom-to-suggest = "uatom"
[grpc]
enable = true
address = "0.0.0.0:9090"
max-recv-msg-size = "10485760"
max-send-msg-size = "2147483647"
[grpc-web]
enable = true
address = "0.0.0.0:9091"
enable-unsafe-cors = false
[state-sync]
snapshot-interval = 5000
snapshot-keep-recent = 2
[store]
streamers = []
[streamers]
[streamers.file]
keys = [
"*"
]
write_dir = ""
prefix = ""
output-metadata = "true"
stop-node-on-error = "true"
fsync = "false"
[evm]
tracer = ""
max-tx-gas-wanted = 0
[json-rpc]
enable = true
address = "127.0.0.1:8545"
ws-address = "127.0.0.1:8546"
api = "eth,net,web3"
gas-cap = 25000000
evm-timeout = "5s"
txfee-cap = 1
filter-cap = 200
feehistory-cap = 100
logs-cap = 10000
block-range-cap = 10000
http-timeout = "30s"
http-idle-timeout = "2m0s"
allow-unprotected-txs = false
max-open-connections = 0
enable-indexer = false
metrics-address = "127.0.0.1:6065"
fix-revert-gas-refund-height = 0
[tls]
certificate-path = ""
key-path = ""
I'm also affected by this. Not sure how to troubleshoot further. I have now increased the value of the snapshot-interval to 60k to see if it reduces the gap between our node and the network.
@DavidNix @sinaLong258 did you find out a way to fix this issue?
As I mentioned before, restarting the node causes it to report accurate heights again, but only temporarily. We're building mitigation steps into https://github.com/strangelove-ventures/cosmos-operator which we use for all our RPC deployments. It's a workaround, and hopefully the Evmos core team fixes the root cause.
Is there any syncable archive snapshot at the moment? Sync performance seems terrible — under 1 block per second — even though blocks are mainly empty.
We've always had luck with Polkachu snapshots.
This issue is stale because it has been open 45 days with no activity. Remove the stale label, or comment, or this issue will be closed.
@fedekunze Same here. Any updates?
We built a feature into the Cosmos Operator to detect height lag and restart pods if they fall too far behind. We've found Injective lags behind as well. Of course, our solution is specific to Kubernetes. An alternative solution is a cronjob that compares heights and reboots processes if the height starts to lag. Both of these are, of course, workarounds at best. It's up to the SDK and/or Comet teams to address and fix whatever the root cause is.
Did you guys try the v14-rc to see if it improves the situation?
This issue is stale because it has been open 45 days with no activity. Remove the stale label, or comment, or this issue will be closed.
No Stale
This issue is stale because it has been open 45 days with no activity. Remove the stale label, or comment, or this issue will be closed.
No Stale
Using Version 15 here, also affected by the block sync lag.
Thanks for the contribution!
This issue is stale because it has been open 45 days with no activity. Remove the stale label, or comment, or this issue will be closed.
No stale
Context
I use snapshots to sync nodes. Block sync delays often occur after syncing to the latest height.
As time goes on, our node always falls behind the latest block; the gap between the height of the node and the height of the latest block increases. And I cannot find any abnormal messages in the logs.
I used evmosd v12.1.2 on Amazon Linux release 2.
Is there any configuration item I should use to resolve this issue?
The text was updated successfully, but these errors were encountered: