optimize memory issues (#767)

* optimize memory issues

1. introduce jemalloc support, which can dump the current memory usage (a hedged sketch of this kind of allocator setup follows below)
2. reduce the GlobalEvent broadcaster's memory usage
3. reduce TCP & UDP tunnel memory usage

TODO: if a peer conn tunnel hangs, the unbounded channel used by peer RPC
may consume a lot of memory; this should be improved.

* select a port from 15888 upward when the configured port is 0 (see the port-probing sketch below)
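Item 1 is about making memory usage observable. The PR's actual wiring is not shown in the hunks below; this is a minimal, hedged sketch of what jemalloc support typically looks like in a Rust binary, using the tikv-jemallocator and tikv-jemalloc-ctl crates (the crate choice and versions are assumptions, not taken from this commit):

```rust
// Assumed Cargo.toml additions (not from this commit):
//   tikv-jemallocator = "0.6"
//   tikv-jemalloc-ctl = "0.6"
use tikv_jemalloc_ctl::{epoch, stats};

// Route all allocations through jemalloc.
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

// Dump current memory usage. jemalloc caches its stats, so the epoch
// must be advanced to refresh them before reading.
fn dump_memory_usage() {
    epoch::advance().unwrap();
    let allocated = stats::allocated::read().unwrap();
    let resident = stats::resident::read().unwrap();
    println!("allocated: {allocated} bytes, resident: {resident} bytes");
}

fn main() {
    let _buf: Vec<u8> = vec![0; 16 * 1024 * 1024];
    dump_memory_usage();
}
```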
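For the port-selection bullet, a sketch of the probing idea: when the configured port is 0, scan upward from 15888 and take the first bindable port instead of letting the OS hand out a random ephemeral one. The helper name, the probe range's upper bound, and the TCP-only probe are assumptions for illustration; the commit's real logic may differ:

```rust
use std::net::{Ipv4Addr, TcpListener};

// Hypothetical helper: returns the configured port if it is nonzero,
// otherwise probes 15888.. and picks the first port that can be bound.
// The probe listener is dropped again, so a race with another process
// grabbing the same port is still possible.
fn select_listen_port(configured: u16) -> Option<u16> {
    if configured != 0 {
        return Some(configured);
    }
    (15888..=15988).find(|&port| TcpListener::bind((Ipv4Addr::UNSPECIFIED, port)).is_ok())
}

fn main() {
    // With port 0 configured, this should print a port >= 15888.
    println!("selected port: {:?}", select_listen_port(0));
}
```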
Author: Sijie.Sun
Date: 2025-04-09 23:05:49 +08:00
Committed by: GitHub
Parent: 3c0d85c9db
Commit: 01e3ad99ca

16 changed files with 491 additions and 178 deletions


@@ -118,9 +118,21 @@ impl Peer {
     }
 
     pub async fn add_peer_conn(&self, mut conn: PeerConn) {
-        conn.set_close_event_sender(self.close_event_sender.clone());
+        let close_event_sender = self.close_event_sender.clone();
+        let close_notifier = conn.get_close_notifier();
+        tokio::spawn(async move {
+            let conn_id = close_notifier.get_conn_id();
+            if let Some(mut waiter) = close_notifier.get_waiter().await {
+                let _ = waiter.recv().await;
+            }
+            if let Err(e) = close_event_sender.send(conn_id).await {
+                tracing::warn!(?conn_id, "failed to send close event: {}", e);
+            }
+        });
+
         conn.start_recv_loop(self.packet_recv_chan.clone()).await;
         conn.start_pingpong();
         self.global_ctx
             .issue_event(GlobalCtxEvent::PeerConnAdded(conn.get_conn_info()));
         self.conns.insert(conn.get_conn_id(), Arc::new(conn));


@@ -13,7 +13,7 @@ use futures::{StreamExt, TryFutureExt};
 use prost::Message;
 use tokio::{
-    sync::{broadcast, mpsc, Mutex},
+    sync::{broadcast, Mutex},
     task::JoinSet,
     time::{timeout, Duration},
 };
@@ -50,6 +50,41 @@ pub type PeerConnId = uuid::Uuid;
 const MAGIC: u32 = 0xd1e1a5e1;
 const VERSION: u32 = 1;
 
+pub struct PeerConnCloseNotify {
+    conn_id: PeerConnId,
+    sender: Arc<std::sync::Mutex<Option<broadcast::Sender<()>>>>,
+}
+
+impl PeerConnCloseNotify {
+    fn new(conn_id: PeerConnId) -> Self {
+        let (sender, _) = broadcast::channel(1);
+        Self {
+            conn_id,
+            sender: Arc::new(std::sync::Mutex::new(Some(sender))),
+        }
+    }
+
+    fn notify_close(&self) {
+        self.sender.lock().unwrap().take();
+    }
+
+    pub async fn get_waiter(&self) -> Option<broadcast::Receiver<()>> {
+        if let Some(sender) = self.sender.lock().unwrap().as_mut() {
+            let receiver = sender.subscribe();
+            return Some(receiver);
+        }
+        None
+    }
+
+    pub fn get_conn_id(&self) -> PeerConnId {
+        self.conn_id
+    }
+
+    pub fn is_closed(&self) -> bool {
+        self.sender.lock().unwrap().is_none()
+    }
+}
+
 pub struct PeerConn {
     conn_id: PeerConnId,
@@ -66,7 +101,7 @@ pub struct PeerConn {
     info: Option<HandshakeRequest>,
     is_client: Option<bool>,
 
-    close_event_sender: Option<mpsc::Sender<PeerConnId>>,
+    close_event_notifier: Arc<PeerConnCloseNotify>,
 
     ctrl_resp_sender: broadcast::Sender<ZCPacket>,
@@ -88,7 +123,7 @@ impl Debug for PeerConn {
 impl PeerConn {
     pub fn new(my_peer_id: PeerId, global_ctx: ArcGlobalCtx, tunnel: Box<dyn Tunnel>) -> Self {
         let tunnel_info = tunnel.info();
-        let (ctrl_sender, _ctrl_receiver) = broadcast::channel(100);
+        let (ctrl_sender, _ctrl_receiver) = broadcast::channel(8);
 
         let peer_conn_tunnel_filter = StatsRecorderTunnelFilter::new();
         let throughput = peer_conn_tunnel_filter.filter_output();
@@ -97,8 +132,10 @@ impl PeerConn {
         let (recv, sink) = (mpsc_tunnel.get_stream(), mpsc_tunnel.get_sink());
 
+        let conn_id = PeerConnId::new_v4();
+
         PeerConn {
-            conn_id: PeerConnId::new_v4(),
+            conn_id: conn_id.clone(),
             my_peer_id,
             global_ctx,
@@ -114,7 +151,8 @@ impl PeerConn {
             info: None,
             is_client: None,
 
-            close_event_sender: None,
+            close_event_notifier: Arc::new(PeerConnCloseNotify::new(conn_id)),
 
             ctrl_resp_sender: ctrl_sender,
@@ -267,10 +305,8 @@ impl PeerConn {
         let mut stream = self.recv.lock().await.take().unwrap();
         let sink = self.sink.clone();
         let sender = packet_recv_chan.clone();
-        let close_event_sender = self.close_event_sender.clone().unwrap();
-        let conn_id = self.conn_id;
+        let close_event_notifier = self.close_event_notifier.clone();
         let ctrl_sender = self.ctrl_resp_sender.clone();
-        let _conn_info = self.get_conn_info();
+        let conn_info_for_instrument = self.get_conn_info();
 
         self.tasks.spawn(
@@ -312,9 +348,7 @@ impl PeerConn {
                 tracing::info!("end recving peer conn packet");
                 drop(sink);
 
-                if let Err(e) = close_event_sender.send(conn_id).await {
-                    tracing::error!(error = ?e, "peer conn close event send error");
-                }
+                close_event_notifier.notify_close();
 
                 task_ret
             }
@@ -335,17 +369,14 @@ impl PeerConn {
             self.throughput.clone(),
         );
-        let close_event_sender = self.close_event_sender.clone().unwrap();
-        let conn_id = self.conn_id;
+        let close_event_notifier = self.close_event_notifier.clone();
 
         self.tasks.spawn(async move {
            pingpong.pingpong().await;
 
             tracing::warn!(?pingpong, "pingpong task exit");
 
-            if let Err(e) = close_event_sender.send(conn_id).await {
-                tracing::warn!("close event sender error: {:?}", e);
-            }
+            close_event_notifier.notify_close();
 
             Ok(())
         });
@@ -373,8 +404,8 @@ impl PeerConn {
         ret
     }
 
-    pub fn set_close_event_sender(&mut self, sender: mpsc::Sender<PeerConnId>) {
-        self.close_event_sender = Some(sender);
+    pub fn get_close_notifier(&self) -> Arc<PeerConnCloseNotify> {
+        self.close_event_notifier.clone()
     }
 
     pub fn get_stats(&self) -> PeerConnStats {
@@ -405,6 +436,13 @@ impl PeerConn {
     }
 }
 
+impl Drop for PeerConn {
+    fn drop(&mut self) {
+        // if someone drops the conn manually, make sure the notifier still fires.
+        self.close_event_notifier.notify_close();
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use std::sync::Arc;
@@ -496,15 +534,13 @@ mod tests {
             s_peer.do_handshake_as_server()
         );
 
-        s_peer.set_close_event_sender(tokio::sync::mpsc::channel(1).0);
         s_peer.start_recv_loop(create_packet_recv_chan().0).await;
 
         // do not start ping for s; s only responds to pings from c
         assert!(c_ret.is_ok());
         assert!(s_ret.is_ok());
 
-        let (close_send, mut close_recv) = tokio::sync::mpsc::channel(1);
-        c_peer.set_close_event_sender(close_send);
+        let close_notifier = c_peer.get_close_notifier();
         c_peer.start_pingpong();
         c_peer.start_recv_loop(create_packet_recv_chan().0).await;
@@ -520,9 +556,9 @@ mod tests {
         tokio::time::sleep(Duration::from_secs(15)).await;
 
         if conn_closed {
-            assert!(close_recv.try_recv().is_ok());
+            assert!(close_notifier.is_closed());
         } else {
-            assert!(close_recv.try_recv().is_err());
+            assert!(!close_notifier.is_closed());
         }
     }


@@ -347,21 +347,43 @@ impl PeerManager {
     async fn start_peer_conn_close_event_handler(&self) {
         let dmap = self.directly_connected_conn_map.clone();
         let mut event_recv = self.global_ctx.subscribe();
+        let peer_map = self.peers.clone();
+        use tokio::sync::broadcast::error::RecvError;
         self.tasks.lock().await.spawn(async move {
-            while let Ok(event) = event_recv.recv().await {
-                match event {
-                    GlobalCtxEvent::PeerConnRemoved(info) => {
-                        if let Some(set) = dmap.get_mut(&info.peer_id) {
-                            let conn_id = info.conn_id.parse().unwrap();
-                            let old = set.remove(&conn_id);
-                            tracing::info!(
-                                ?old,
-                                ?info,
-                                "try remove conn id from directly connected map"
-                            );
+            loop {
+                match event_recv.recv().await {
+                    Err(RecvError::Closed) => {
+                        tracing::error!("peer conn close event handler exit");
+                        break;
+                    }
+                    Err(RecvError::Lagged(_)) => {
+                        tracing::warn!("peer conn close event handler lagged");
+                        event_recv = event_recv.resubscribe();
+                        let alive_conns = peer_map.get_alive_conns();
+                        for p in dmap.iter_mut() {
+                            p.retain(|x| alive_conns.contains_key(&(*p.key(), *x)));
+                        }
+                        dmap.retain(|_, v| !v.is_empty());
+                    }
+                    Ok(event) => {
+                        if let GlobalCtxEvent::PeerConnRemoved(info) = event {
+                            let mut need_remove = false;
+                            if let Some(set) = dmap.get_mut(&info.peer_id) {
+                                let conn_id = info.conn_id.parse().unwrap();
+                                let old = set.remove(&conn_id);
+                                tracing::info!(
+                                    ?old,
+                                    ?info,
+                                    "try remove conn id from directly connected map"
+                                );
+                                need_remove = set.is_empty();
+                            }
+                            if need_remove {
+                                dmap.remove(&info.peer_id);
+                            }
                         }
                     }
-                    _ => {}
                 }
             }
         });


@@ -27,6 +27,7 @@ pub struct PeerMap {
     peer_map: DashMap<PeerId, Arc<Peer>>,
     packet_send: PacketRecvChan,
     routes: RwLock<Vec<ArcRoute>>,
+    alive_conns: Arc<DashMap<(PeerId, PeerConnId), PeerConnInfo>>,
 }
 
 impl PeerMap {
@@ -37,6 +38,7 @@ impl PeerMap {
             peer_map: DashMap::new(),
             packet_send,
             routes: RwLock::new(Vec::new()),
+            alive_conns: Arc::new(DashMap::new()),
         }
     }
@@ -48,6 +50,7 @@
     }
 
     pub async fn add_new_peer_conn(&self, peer_conn: PeerConn) {
+        self.maintain_alive_conns(&peer_conn);
         let peer_id = peer_conn.get_peer_id();
         let no_entry = self.peer_map.get(&peer_id).is_none();
         if no_entry {
@@ -60,6 +63,30 @@
         }
     }
 
+    fn maintain_alive_conns(&self, peer_conn: &PeerConn) {
+        let close_notifier = peer_conn.get_close_notifier();
+        let alive_conns_weak = Arc::downgrade(&self.alive_conns);
+        let conn_id = close_notifier.get_conn_id();
+        let conn_info = peer_conn.get_conn_info();
+        self.alive_conns
+            .insert((conn_info.peer_id, conn_id.clone()), conn_info.clone());
+        tokio::spawn(async move {
+            if let Some(mut waiter) = close_notifier.get_waiter().await {
+                let _ = waiter.recv().await;
+            }
+            let mut alive_conn_count = 0;
+            if let Some(alive_conns) = alive_conns_weak.upgrade() {
+                alive_conns.remove(&(conn_info.peer_id, conn_id)).unwrap();
+                alive_conn_count = alive_conns.len();
+            }
+            tracing::debug!(
+                ?conn_id,
+                "peer conn is closed, current alive conns: {}",
+                alive_conn_count
+            );
+        });
+    }
+
     fn get_peer_by_id(&self, peer_id: PeerId) -> Option<Arc<Peer>> {
         self.peer_map.get(&peer_id).map(|v| v.clone())
     }
@@ -284,6 +311,13 @@ impl PeerMap {
         Ok(!self.has_peer(gateway_id))
     }
 
+    pub fn get_alive_conns(&self) -> DashMap<(PeerId, PeerConnId), PeerConnInfo> {
+        self.alive_conns
+            .iter()
+            .map(|v| (v.key().clone(), v.value().clone()))
+            .collect()
+    }
 }
 
 impl Drop for PeerMap {
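
A note on the close-notification idiom introduced in the PeerConn hunks above: PeerConnCloseNotify signals closure by dropping its broadcast::Sender, so every waiter's recv() completes with Err(RecvError::Closed) and nothing is ever buffered. A self-contained sketch of the same pattern (all names here are illustrative, not from the codebase):

```rust
use std::sync::{Arc, Mutex};
use tokio::sync::broadcast;

// Closing is signalled by dropping the Sender: all subscribed receivers
// wake up with Err(RecvError::Closed), and no message stays buffered.
struct CloseNotify {
    sender: Arc<Mutex<Option<broadcast::Sender<()>>>>,
}

impl CloseNotify {
    fn new() -> Self {
        let (sender, _) = broadcast::channel(1);
        Self {
            sender: Arc::new(Mutex::new(Some(sender))),
        }
    }

    fn notify_close(&self) {
        // Taking the sender out of the Option drops it.
        self.sender.lock().unwrap().take();
    }

    fn get_waiter(&self) -> Option<broadcast::Receiver<()>> {
        self.sender.lock().unwrap().as_ref().map(|s| s.subscribe())
    }
}

#[tokio::main]
async fn main() {
    let notify = CloseNotify::new();
    let mut waiter = notify.get_waiter().unwrap();
    let task = tokio::spawn(async move {
        // Err(RecvError::Closed) is the close signal here.
        let _ = waiter.recv().await;
        println!("conn closed");
    });
    notify.notify_close();
    task.await.unwrap();
}
```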