Keepalive fix, auto-reconnect on disconnect, tab icon fix, video playback guard

Keepalive: tokio::time::sleep inside select! was resetting every iteration, so
keepalives never fired; affected connections were then zombie-reaped by the
remote side (10min timeout with no pings). Switched to tokio::time::interval,
which ticks reliably even when other select branches fire.

Auto-reconnect: unexpected disconnects (stream error, not SocialDisconnectNotice)
now attempt direct reconnect after 3s delay using last known address from peers
table or social route. Falls back to notify_growth() if direct reconnect fails.

Tab icons: updateTabBadge was using textContent which destroyed the icon and
label spans inside tab buttons. Now updates only the .tab-label span and manages
a separate .tab-badge element.

Video playback: feed re-render is now skipped while any video or audio element
is actively playing, preventing the playback restarts and audio echo caused by
destroying and recreating media elements in the DOM.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Scott Reimers 2026-03-22 23:27:41 -04:00
parent 68afc40b16
commit 6320a82852
4 changed files with 118 additions and 12 deletions

View file

@ -3197,6 +3197,13 @@ impl ConnectionManager {
self.log_activity(ActivityLevel::Warn, ActivityCategory::Connection, "Mesh empty".into(), None);
self.notify_recovery();
}
// Signal growth loop to fill the empty slot (don't wait 10min for rebalance)
let total_slots = self.preferred_slots + self.local_slots + self.wide_slots;
if remaining < total_slots {
self.notify_growth();
}
}
/// Notify watchers that a previously disconnected peer has reconnected.
@ -4495,7 +4502,10 @@ impl ConnectionManager {
last_activity: Arc<AtomicU64>,
) {
let our_stable_id = conn.stable_id();
let keepalive_interval = std::time::Duration::from_secs(MESH_KEEPALIVE_INTERVAL_SECS);
// Use interval (not sleep) so the timer ticks reliably even when other select branches fire.
// tokio::time::sleep inside select! restarts on every loop iteration — keepalive would never fire.
let mut keepalive_tick = tokio::time::interval(std::time::Duration::from_secs(MESH_KEEPALIVE_INTERVAL_SECS));
keepalive_tick.tick().await; // consume the immediate first tick
loop {
tokio::select! {
uni_result = conn.accept_uni() => {
@ -4534,7 +4544,7 @@ impl ConnectionManager {
}
}
}
_ = tokio::time::sleep(keepalive_interval) => {
_ = keepalive_tick.tick() => {
// Send lightweight keepalive ping — keeps NAT mapping alive
// and prevents zombie detection on the remote side
if let Ok(mut send) = conn.open_uni().await {
@ -4551,15 +4561,72 @@ impl ConnectionManager {
}
}
// Connection ended — only clean up if this is still the active connection
// (a reconnect may have already replaced our entry with a newer connection)
// Connection ended unexpectedly — clean up and attempt reconnect
let (is_current, peer_addr, has_social_route) = {
let mut cm = conn_mgr.lock().await;
let is_current = cm.connections.get(&remote_node_id)
.map_or(false, |pc| pc.connection.stable_id() == our_stable_id);
if is_current {
// Gather reconnect info before disconnect clears it
let storage = cm.storage.get().await;
let addr = storage.get_peer_record(&remote_node_id).ok().flatten()
.and_then(|r| r.addresses.first().cloned())
.or_else(|| storage.get_social_route(&remote_node_id).ok().flatten()
.and_then(|r| r.addresses.first().cloned()));
let has_route = storage.has_social_route(&remote_node_id).unwrap_or(false);
drop(storage);
cm.disconnect_peer(&remote_node_id).await;
(true, addr, has_route)
} else {
debug!(peer = hex::encode(remote_node_id), "Skipping disconnect — connection was replaced by reconnect");
(false, None, false)
}
};
// Attempt reconnect for unexpected disconnects (not intentional SocialDisconnectNotice)
if is_current {
if let Some(addr) = peer_addr {
let cm_arc = Arc::clone(&conn_mgr);
tokio::spawn(async move {
// Brief delay to let the disconnect settle and avoid reconnect storms
tokio::time::sleep(std::time::Duration::from_secs(3)).await;
// Check if already reconnected (by the other side or growth loop)
{
let cm = cm_arc.lock().await;
if cm.connections.contains_key(&remote_node_id) || cm.sessions.contains_key(&remote_node_id) {
return; // Already reconnected
}
}
if let Ok(eid) = iroh::EndpointId::from_bytes(&remote_node_id) {
let ep_addr = iroh::EndpointAddr::from(eid).with_ip_addr(addr);
let endpoint = {
let cm = cm_arc.lock().await;
cm.endpoint.clone()
};
match ConnectionManager::connect_to_unlocked(&endpoint, ep_addr).await {
Ok(conn) => {
let mut cm = cm_arc.lock().await;
if !cm.connections.contains_key(&remote_node_id) {
cm.register_new_connection(remote_node_id, conn, &[addr], PeerSlotKind::Local).await;
info!(peer = hex::encode(remote_node_id), "Auto-reconnected after unexpected disconnect");
cm.log_activity(ActivityLevel::Info, ActivityCategory::Connection,
format!("Auto-reconnected to {}", &hex::encode(remote_node_id)[..8]), Some(remote_node_id));
}
}
Err(e) => {
debug!(peer = hex::encode(remote_node_id), error = %e, "Auto-reconnect failed");
// Signal growth loop as fallback
let cm = cm_arc.lock().await;
cm.notify_growth();
}
}
}
});
} else {
// No known address — signal growth loop to find new peers
let cm = conn_mgr.lock().await;
cm.notify_growth();
}
}
}

View file

@ -591,8 +591,24 @@ const TAB_BASE_LABELS = { feed: 'Feed', myposts: 'My Posts', people: 'People', m
// Update a tab button's unread-count badge without destroying its children.
// The tab button contains an icon span and a .tab-label span; writing
// tab.textContent (the old approach, left over here from the diff) wiped both
// and then updated a detached label node. Instead, touch only the .tab-label
// text and manage a separate .tab-badge element.
//
// tabName: key into TAB_BASE_LABELS / the .tab[data-tab] selector.
// count:   unread count; 0 or less removes the badge.
function updateTabBadge(tabName, count) {
    const tab = document.querySelector(`.tab[data-tab="${tabName}"]`);
    if (!tab) return;
    // Update the label span only (preserves the sibling icon span)
    const label = tab.querySelector('.tab-label');
    const base = TAB_BASE_LABELS[tabName] || tabName;
    if (label) {
        label.textContent = base;
    }
    // Update or create the badge span
    let badge = tab.querySelector('.tab-badge');
    if (count > 0) {
        if (!badge) {
            badge = document.createElement('span');
            badge.className = 'tab-badge';
            tab.appendChild(badge);
        }
        badge.textContent = count;
    } else if (badge) {
        // No unread items — drop the badge entirely rather than showing "0"
        badge.remove();
    }
}
let _lastFeedViewMs = 0;
@ -743,6 +759,18 @@ async function loadFeed(force) {
} catch (_) {}
}
// Skip full re-render if any video/audio is actively playing (prevents echo/restart)
const mediaPlaying = [...feedList.querySelectorAll('video, audio')].some(el => !el.paused);
if (mediaPlaying) {
// Don't destroy the DOM while media is playing — re-render on next cycle when stopped
return;
}
// Revoke old object URLs to prevent memory leaks
feedList.querySelectorAll('video[src^="blob:"], audio[src^="blob:"], img[src^="blob:"]').forEach(el => {
if (el.src.startsWith('blob:')) URL.revokeObjectURL(el.src);
});
// Preserve expanded comment threads
const expandedComments = new Set();
feedList.querySelectorAll('.comment-thread:not(.hidden)').forEach(el => {
@ -780,6 +808,13 @@ async function loadMyPosts(force) {
const fp = mine.map(p => `${p.id}:${(p.reactionCounts||[]).map(r=>r.emoji+r.count).join(',')}:${p.commentCount||0}`).join('|');
if (!force && fp === _myPostsFingerprint) return;
_myPostsFingerprint = fp;
// Skip re-render if media is playing
const mediaPlaying = [...myPostsList.querySelectorAll('video, audio')].some(el => !el.paused);
if (mediaPlaying) return;
// Revoke old blob URLs
myPostsList.querySelectorAll('video[src^="blob:"], audio[src^="blob:"], img[src^="blob:"]').forEach(el => {
if (el.src.startsWith('blob:')) URL.revokeObjectURL(el.src);
});
const expandedComments = new Set();
myPostsList.querySelectorAll('.comment-thread:not(.hidden)').forEach(el => {
const postEl = el.closest('.post');

View file

@ -44,7 +44,7 @@
<p>This is the canonical technical reference for ItsGoin. It describes the vision, the architecture, and the current state of every subsystem &mdash; with full implementation detail. This document is versioned; each update records what changed.</p>
<div class="card" style="margin-top: 1rem;">
<strong style="font-size: 0.85rem; text-transform: uppercase; letter-spacing: 0.05em;">Changelog</strong>
<p style="margin-top: 0.5rem;"><strong>v0.4.3</strong> (2026-03-22): Lock contention overhaul &mdash; all conn_mgr lock holds during network I/O eliminated. PostFetch, TcpPunch, PullFromPeer, FetchEngagement, ResolveAddress, AnchorProbe, WormLookup, ContentSearch now use brief locks for data gathering only. Bi-stream handlers (BlobRequest, WormQuery, RelayIntroduce, PostFetchRequest, ManifestRefresh) fully lock-free for I/O. ConnectionActor hoists shared Arcs (storage, blob_store, endpoint) for lock-free access. ResolveAddress adds 5s per-query timeout (was unbounded). Worm cascade uses connection snapshots. Initial exchange failure now aborts mesh upgrade (was silently continuing). connect_to_peer/connect_to_anchor use 15s timeout. StoragePool &mdash; 8 concurrent SQLite connections in WAL mode replace single Mutex&lt;Storage&gt;. Reads run fully parallel; writes serialize only at SQLite level. Bottom nav bar for mobile/tablet (&le;768px) with icon tabs. Text sizes: XS 75%, S 100%, M 125% (default), L 150%, XL 200%. Text size persisted to localStorage for instant restore. Fix: blocking_lock panic inside async runtime (prevented app startup). StoragePool reduced to 4 connections for Android compatibility.</p>
<p style="margin-top: 0.5rem;"><strong>v0.4.3</strong> (2026-03-22): Lock contention overhaul &mdash; all conn_mgr lock holds during network I/O eliminated. PostFetch, TcpPunch, PullFromPeer, FetchEngagement, ResolveAddress, AnchorProbe, WormLookup, ContentSearch now use brief locks for data gathering only. Bi-stream handlers (BlobRequest, WormQuery, RelayIntroduce, PostFetchRequest, ManifestRefresh) fully lock-free for I/O. ConnectionActor hoists shared Arcs (storage, blob_store, endpoint) for lock-free access. ResolveAddress adds 5s per-query timeout (was unbounded). Worm cascade uses connection snapshots. Initial exchange failure now aborts mesh upgrade (was silently continuing). connect_to_peer/connect_to_anchor use 15s timeout. StoragePool &mdash; 8 concurrent SQLite connections in WAL mode replace single Mutex&lt;Storage&gt;. Reads run fully parallel; writes serialize only at SQLite level. Bottom nav bar for mobile/tablet (&le;768px) with icon tabs. Text sizes: XS 75%, S 100%, M 125% (default), L 150%, XL 200%. Text size persisted to localStorage for instant restore. Fix: blocking_lock panic inside async runtime (prevented app startup). StoragePool reduced to 4 connections for Android compatibility. Keepalive fix &mdash; tokio::time::sleep inside select! was resetting every loop iteration, keepalives never fired; switched to tokio::time::interval. Auto-reconnect on unexpected disconnect &mdash; 3s delay then direct reconnect to last known address; falls back to growth loop. notify_growth on disconnect &mdash; immediately signals growth loop to fill empty slot instead of waiting 10min rebalance. Tab badge fix &mdash; updateTabBadge was using textContent which destroyed icon+label spans; now updates only the label and manages badge span separately. Feed re-render skip during media playback &mdash; prevents video echo from DOM destruction.</p>
<p><strong>v0.4.2</strong> (2026-03-22): Welcome screen &mdash; startup shows &ldquo;How&rsquo;s it goin?&rdquo; with staggered counters (connections, posts, messages, reacts, comments) while backend bootstraps. Status ticker &mdash; header ticker for new posts, messages, reactions, comments, connection changes. Notification improvements &mdash; Tauri plugin &rarr; Web Notification &rarr; notify-rust fallback chain, Linux native notifications. Responsive text scaling &mdash; Small/Normal/Large (100%/150%/200%), persisted via settings. Diagnostics popover &mdash; diagnostics moved from inline section to overlay, connections on-demand, timers removed. Share details lightbox with QR code. Connect string prefers external address (UPnP/public IPv6/observed). Stale N1 fix &mdash; disconnected social routes excluded from N1 share. Replication handler fix &mdash; actively fetches posts + blobs from requester after accepting replication. Hole punch fix &mdash; target-side registers publicly routable remote address for relay introduction. Replication semaphore (3 concurrent max). Peer labels show truncated node ID.</p>
<p><strong>v0.4.1</strong> (2026-03-21): Security hardening &mdash; reaction signatures (ed25519), comment signature verification on receipt, reaction removal authorization, BlobHeader author verification. Lock contention fixes &mdash; ManifestPush discovery (cm lock released during I/O), pull request handler (filter without lock), pull sender (split into brief locks), engagement checker (batch writes per chunk). Data cleanup &mdash; post deletion cleans downstream/upstream/seen tables.</p>
<p><strong>v0.4.0</strong> (2026-03-21): Protocol v4 &mdash; header-driven sync. ManifestPush as primary post notification. Slim PullSyncRequest (per-author timestamps, not full post ID list). Tiered engagement checks (5min/1hr/4hr/24hr by content age). Multi-upstream (3 max) with fallback chain. Auto-prefetch followed authors &lt;90d. Self Last Encounter per-author tracking. Encrypted-but-not-for-us CDN caching. Serial engagement polling. ~90% bandwidth reduction for established nodes.</p>

View file

@ -81,6 +81,10 @@
<li><strong>Bottom nav bar</strong> &mdash; Mobile/tablet (&le;768px) gets a fixed bottom navigation bar with icon tabs. Desktop keeps the top tab bar.</li>
<li><strong>Text size update</strong> &mdash; Five options: XS (75%), S (100%), M (125% default), L (150%), XL (200%). Persisted to localStorage for instant restore on startup.</li>
<li><strong>Startup fix</strong> &mdash; Fixed blocking_lock panic that prevented app from launching (async runtime conflict). StoragePool reduced to 4 connections for Android compatibility.</li>
<li><strong>Keepalive fix</strong> &mdash; Mesh keepalive pings were never firing due to timer reset bug in select loop. Connections were being zombie-reaped instead of kept alive.</li>
<li><strong>Auto-reconnect</strong> &mdash; Unexpected disconnects now trigger immediate reconnect attempt (3s delay, then direct connect to last known address). Falls back to growth loop if direct fails.</li>
<li><strong>Tab icon fix</strong> &mdash; Badge updates were destroying tab icons on mobile. Now updates label and badge separately.</li>
<li><strong>Video playback</strong> &mdash; Feed re-render skipped while video/audio is playing to prevent echo and restart.</li>
</ul>
<div class="changelog-date">v0.4.2 &mdash; March 22, 2026</div>