synchronise issue #1558

Open
sinaLong258 opened this issue May 8, 2023 · 19 comments
Labels
being reviewed (Core team is investigating it) · bug (Something isn't working)

Comments

@sinaLong258

sinaLong258 commented May 8, 2023

Context

I use snapshots to sync nodes. After syncing to the latest height, block sync delays often occur.

As time goes on, our node always falls behind the latest block: the gap between the node's height and the latest block height keeps increasing. I cannot find any abnormal messages in the logs.

I'm using evmosd v12.1.2 on Amazon Linux release 2.

Is there any configuration item I should use to resolve this issue?

@DavidNix

Strangelove would like to piggyback on this issue. We are seeing similar problems with our Evmos RPC nodes.

They often fall behind. However, the Tendermint RPC /status endpoint reports that the node is caught up. Sometimes the node does catch up, but often it lags behind and cannot recover.

This behavior prevents us from removing unhealthy nodes from our load balancer and creates manual SRE work to restart the problem nodes. So far, rebooting a node causes /status to report accurately and allows the node to catch up.
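For reference, a minimal sketch of querying the sync fields mentioned above, assuming the default Tendermint RPC port 26657 on localhost:

import json
import urllib.request

# Query the Tendermint RPC /status endpoint and print the sync fields.
with urllib.request.urlopen("http://localhost:26657/status", timeout=5) as resp:
    sync = json.load(resp)["result"]["sync_info"]

print("catching_up:        ", sync["catching_up"])
print("latest_block_height:", sync["latest_block_height"])
print("latest_block_time:  ", sync["latest_block_time"])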

The compute resources are not taxed. No OOMs or maxed out CPU.

We use Google compute:

  • n2-highmem-8 (8 vCPUs | 64 GiB)
  • pd-ssd (Performance SSD persistent disks)

We've tried:

  • Increasing peers
  • Using private peers
  • Increasing IOPs
  • Pruning more aggressively

So far, no luck.

We run dozens of other RPC nodes for other chains and only Evmos exhibits this problem. Therefore, we can only conclude the root cause is specific to Evmos.

Here's an example config:

config.toml

proxy_app = "tcp://127.0.0.1:26658"
moniker = "evmos-mainnet-fullnode-0"
fast_sync = true
db_backend = "goleveldb"
db_dir = "data"
log_level = "info"
log_format = "plain"
genesis_file = "config/genesis.json"
priv_validator_key_file = "config/priv_validator_key.json"
priv_validator_state_file = "data/priv_validator_state.json"
priv_validator_laddr = ""
node_key_file = "config/node_key.json"
abci = "socket"
filter_peers = false

[rpc]
laddr = "tcp://0.0.0.0:26657"
cors_allowed_origins = [
  "*"
]
cors_allowed_methods = [
  "HEAD",
  "GET",
  "POST"
]
cors_allowed_headers = [
  "Origin",
  "Accept",
  "Content-Type",
  "X-Requested-With",
  "X-Server-Time"
]
grpc_laddr = ""
grpc_max_open_connections = 900
unsafe = false
max_open_connections = 900
max_subscription_clients = 100
max_subscriptions_per_client = 5
experimental_subscription_buffer_size = 200
experimental_websocket_write_buffer_size = 200
experimental_close_on_slow_client = false
timeout_broadcast_tx_commit = "10s"
max_body_bytes = 1000000
max_header_bytes = 1048576
tls_cert_file = ""
tls_key_file = ""
pprof_laddr = "localhost:6060"

[p2p]
laddr = "tcp://0.0.0.0:26656"
external_address = "34.174.21.162:26656"
seeds = "20e1000e88125698264454a884812746c2eb4807@seeds.lavenderfive.com:13456,40f4fac63da8b1ce8f850b0fa0f79b2699d2ce72@seed.evmos.jerrychong.com:26656,5740e4a36e646e80cc5648daf5e983e5b5d8f265@54.39.18.27:26656,588cedb70fa1d98c14a2f2c1456bfa41e1a156a8@evmos-sentry.mercury-nodes.net:29539,86bd5cb6e762f673f1706e5889e039d5406b4b90@evmos.seed.node75.org:10756,ade4d8bc8cbe014af6ebdf3cb7b1e9ad36f412c0@seeds.polkachu.com:13456,babc3f3f7804933265ec9c40ad94f4da8e9e0017@seed.rhinostake.com:13456,e1b058e5cfa2b836ddaa496b10911da62dcf182e@evmos-seed-1.allnodes.me:26656,e726816f42831689eab9378d5d577f1d06d25716@evmos-seed-2.allnodes.me:26656"
persistent_peers = "763a16523057b317238ecf11dd58d0115d638b34@evmos-mainnet-fullnode-p2p-1.strangelove.svc.cluster.local:26656,e09d707ac82f41f84c1e61555aedf20bd45d9c66@evmos-mainnet-fullnode-p2p-2.strangelove.svc.cluster.local:26656,2b5bc46b5ee567dc75c9036e3acbe37c397f32f3@35.247.100.131:26656,42daddcdb8540971b610e9389aefc96d7c05f869@34.83.13.56:26656,9b41e64b005388f41218866cb1f48b7de6728a6f@35.230.49.7:26656,5740e4a36e646e80cc5648daf5e983e5b5d8f265@54.39.18.27:26656,588cedb70fa1d98c14a2f2c1456bfa41e1a156a8@evmos-sentry.mercury-nodes.net:29539"
upnp = false
addr_book_file = "config/addrbook.json"
addr_book_strict = true
max_num_inbound_peers = 120
max_num_outbound_peers = 60
unconditional_peer_ids = "763a16523057b317238ecf11dd58d0115d638b34,e09d707ac82f41f84c1e61555aedf20bd45d9c66,2b5bc46b5ee567dc75c9036e3acbe37c397f32f3,42daddcdb8540971b610e9389aefc96d7c05f869,9b41e64b005388f41218866cb1f48b7de6728a6f"
persistent_peers_max_dial_period = "0s"
flush_throttle_timeout = "100ms"
max_packet_msg_payload_size = 1024
send_rate = 5120000
recv_rate = 5120000
pex = true
seed_mode = false
private_peer_ids = "763a16523057b317238ecf11dd58d0115d638b34,e09d707ac82f41f84c1e61555aedf20bd45d9c66"
allow_duplicate_ip = false
handshake_timeout = "20s"
dial_timeout = "3s"

[mempool]
version = "v0"
recheck = true
broadcast = true
wal_dir = ""
size = 10000
max_txs_bytes = 1073741824
cache_size = 10000
keep-invalid-txs-in-cache = false
max_tx_bytes = 1048576
max_batch_bytes = 0
ttl-duration = "0s"
ttl-num-blocks = 0

[statesync]
enable = false
rpc_servers = ""
trust_height = 0
trust_hash = ""
trust_period = "112h0m0s"
discovery_time = "15s"
temp_dir = ""
chunk_request_timeout = "10s"
chunk_fetchers = "4"

[fastsync]
version = "v0"

[consensus]
wal_file = "data/cs.wal/wal"
timeout_propose = "3s"
timeout_propose_delta = "500ms"
timeout_prevote = "1s"
timeout_prevote_delta = "500ms"
timeout_precommit = "1s"
timeout_precommit_delta = "500ms"
timeout_commit = "5s"
double_sign_check_height = 0
skip_timeout_commit = false
create_empty_blocks = true
create_empty_blocks_interval = "0s"
peer_gossip_sleep_duration = "100ms"
peer_query_maj23_sleep_duration = "2s"

[storage]
discard_abci_responses = false

[tx_index]
indexer = "kv"
psql-conn = ""

[instrumentation]
prometheus = true
prometheus_listen_addr = ":26660"
max_open_connections = 3
namespace = "tendermint"

app.toml

minimum-gas-prices = "20000000000aevmos"
pruning = "custom"
pruning-keep-recent = "5000"
pruning-interval = "17"
halt-height = 0
halt-time = 0
min-retain-blocks = 200000
inter-block-cache = true
index-events = []
iavl-cache-size = 781250
iavl-disable-fastnode = false
iavl-lazy-loading = false
app-db-backend = ""
pruning-keep-every = "0"

[telemetry]
service-name = ""
enabled = false
enable-hostname = false
enable-hostname-label = false
enable-service-label = false
prometheus-retention-time = 0
global-labels = []

[api]
enable = true
swagger = true
address = "tcp://0.0.0.0:1317"
max-open-connections = 1000
rpc-read-timeout = 10
rpc-write-timeout = 0
rpc-max-body-bytes = 1000000
enabled-unsafe-cors = true

[rosetta]
enable = false
address = ":8080"
blockchain = "app"
network = "network"
retries = 3
offline = false
enable-fee-suggestion = false
gas-to-suggest = 200000
denom-to-suggest = "uatom"

[grpc]
enable = true
address = "0.0.0.0:9090"
max-recv-msg-size = "10485760"
max-send-msg-size = "2147483647"

[grpc-web]
enable = true
address = "0.0.0.0:9091"
enable-unsafe-cors = false

[state-sync]
snapshot-interval = 5000
snapshot-keep-recent = 2

[store]
streamers = []

[streamers]

  [streamers.file]
  keys = [
    "*"
  ]
  write_dir = ""
  prefix = ""
  output-metadata = "true"
  stop-node-on-error = "true"
  fsync = "false"

[evm]
tracer = ""
max-tx-gas-wanted = 0

[json-rpc]
enable = true
address = "127.0.0.1:8545"
ws-address = "127.0.0.1:8546"
api = "eth,net,web3"
gas-cap = 25000000
evm-timeout = "5s"
txfee-cap = 1
filter-cap = 200
feehistory-cap = 100
logs-cap = 10000
block-range-cap = 10000
http-timeout = "30s"
http-idle-timeout = "2m0s"
allow-unprotected-txs = false
max-open-connections = 0
enable-indexer = false
metrics-address = "127.0.0.1:6065"
fix-revert-gas-refund-height = 0

[tls]
certificate-path = ""
key-path = ""

@heitorPB

I'm also affected by this. Not sure how to troubleshoot further. I've now increased snapshot-interval to 60k to see if it reduces the gap between our node and the network.

@heitorPB

@DavidNix @sinaLong258 did you find out a way to fix this issue?

@DavidNix

DavidNix commented May 30, 2023

As I mentioned before, restarting the node makes /status report accurately and allows it to catch up.

We're building mitigation steps into https://github.com/strangelove-ventures/cosmos-operator which we use for all our RPC deployments.

It's a workaround and hopefully the Evmos core team fixes the root cause.

@hitchhooker

Is there any syncable archive snapshot at the moment? Sync performance seems terrible, under 1 block per second, even though blocks are mostly empty.

@DavidNix

We've always had luck with Polkachu snapshots.

@github-actions

This issue is stale because it has been open 45 days with no activity. Remove Status: Stale label or comment or this will be closed in 7 days.

@fernando-galaxystaking

fernando-galaxystaking commented Jul 22, 2023

@fedekunze Same here. Any updates?

@DavidNix

DavidNix commented Jul 24, 2023

We built a feature into the Cosmos Operator to detect height lag and restart pods if they fall too far behind. We've found that Injective lags behind as well. Of course, our solution is specific to Kubernetes.

An alternative solution is a cronjob that compares heights and reboots processes if the height starts to lag.

Both of these are, of course, workarounds at best. It's up to the SDK and/or Comet teams to address and fix whatever the root cause is.
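For illustration, a rough sketch of the cronjob approach described above; the reference RPC URL, the lag threshold, and the systemd unit name (evmosd) are assumptions to adapt to your own setup:

import json
import subprocess
import urllib.request

LOCAL_RPC = "http://localhost:26657/status"
REFERENCE_RPC = "https://evmos-rpc.example.com/status"  # hypothetical reference node
MAX_LAG_BLOCKS = 20  # arbitrary threshold; tune to your tolerance

def latest_height(url):
    # Read latest_block_height from a Tendermint /status response.
    with urllib.request.urlopen(url, timeout=5) as resp:
        return int(json.load(resp)["result"]["sync_info"]["latest_block_height"])

lag = latest_height(REFERENCE_RPC) - latest_height(LOCAL_RPC)
if lag > MAX_LAG_BLOCKS:
    # Restart mechanism is environment-specific; systemd shown here as one example.
    subprocess.run(["systemctl", "restart", "evmosd"], check=True)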

@heitorPB

heitorPB commented Aug 1, 2023

Did you guys try the v14-rc to see if it improves the situation?

@github-actions

This issue is stale because it has been open 45 days with no activity. Remove Status: Stale label or comment or this will be closed in 7 days.

@VictorTrustyDev
Contributor

No Stale

@github-actions

github-actions bot commented Nov 2, 2023

This issue is stale because it has been open 45 days with no activity. Remove Status: Stale label or comment or this will be closed in 7 days.

@VictorTrustyDev
Contributor

No Stale

@peter-krypto

Using version 15 here, also affected by the block sync lag.


linear bot commented Jan 23, 2024

ENG-2469 Issue 1558

@ramacarlucho added the "bug" and "being reviewed" labels on Jan 24, 2024
@ramacarlucho
Contributor

Thanks for the contribution!
Gonna bring this up with the rest of the team.

@github-actions

This issue is stale because it has been open 45 days with no activity. Remove Status: Stale label or comment or this will be closed in 7 days.

@MalteHerrmann
Contributor

no stale
