This commit is contained in:
2025-12-24 01:36:12 +00:00
commit c14080ff4e
21 changed files with 4531 additions and 0 deletions

185
src/events/handler.rs Normal file
View File

@@ -0,0 +1,185 @@
use crate::ai::{AIClient, DiagnosticEngine};
use crate::k8s::{KubeClient, NodeEvent, NodeEventType, PodEvent, PodEventType};
use std::sync::Arc;
use tokio::sync::Semaphore;
use tracing::{error, info, warn};
/// Routes Kubernetes node and pod events to AI-backed diagnostics.
///
/// The type derives `Clone`; because the semaphore sits behind an `Arc`, every
/// clone draws permits from the same pool.
#[derive(Clone)]
pub struct EventHandler {
// Cloned into a fresh `DiagnosticEngine` for each diagnosis run.
kube_client: KubeClient,
// Passed by reference to the diagnostic engine's `diagnose_*` calls.
ai_client: AIClient,
// Semaphore to limit concurrent AI diagnoses; sized by `max_concurrent` in `new`.
diagnosis_semaphore: Arc<Semaphore>,
}
impl EventHandler {
pub fn new(kube_client: KubeClient, ai_client: AIClient, max_concurrent: usize) -> Self {
Self {
kube_client,
ai_client,
diagnosis_semaphore: Arc::new(Semaphore::new(max_concurrent)),
}
}
/// Handle node event and trigger AI diagnostics if needed
pub async fn handle_node_event(&self, event: NodeEvent) {
match event.event_type {
NodeEventType::BecameNotReady => {
warn!(
node = %event.node_name,
"Node became NotReady, starting AI diagnostics"
);
self.diagnose_node(&event.node_name).await;
}
NodeEventType::ConditionChanged {
ref condition_type,
ref status,
ref reason,
} => {
warn!(
node = %event.node_name,
condition = %condition_type,
status = %status,
reason = ?reason,
"Problematic condition detected, starting AI diagnostics"
);
self.diagnose_node(&event.node_name).await;
}
NodeEventType::BecameReady => {
info!(node = %event.node_name, "Node became Ready");
}
}
}
/// Handle pod event and trigger AI diagnostics if needed
pub async fn handle_pod_event(&self, event: PodEvent) {
let problem_description = match &event.event_type {
PodEventType::HighRestartCount { count } => {
warn!(
pod = %event.pod_name,
namespace = %event.namespace,
restart_count = count,
"Pod has high restart count"
);
format!("High restart count: {} restarts", count)
}
PodEventType::CrashLoopBackOff => {
warn!(
pod = %event.pod_name,
namespace = %event.namespace,
"Pod in CrashLoopBackOff"
);
"Container is in CrashLoopBackOff state".to_string()
}
PodEventType::ImagePullError => {
warn!(
pod = %event.pod_name,
namespace = %event.namespace,
"Pod has image pull error"
);
"Failed to pull container image".to_string()
}
PodEventType::Pending { reason } => {
warn!(
pod = %event.pod_name,
namespace = %event.namespace,
reason = ?reason,
"Pod stuck in Pending"
);
format!(
"Pod stuck in Pending state. Reason: {}",
reason.as_deref().unwrap_or("Unknown")
)
}
PodEventType::Failed { reason } => {
warn!(
pod = %event.pod_name,
namespace = %event.namespace,
reason = ?reason,
"Pod in Failed state"
);
format!(
"Pod in Failed state. Reason: {}",
reason.as_deref().unwrap_or("Unknown")
)
}
PodEventType::ContainerCreating { duration_seconds } => {
warn!(
pod = %event.pod_name,
namespace = %event.namespace,
duration_seconds = duration_seconds,
"Container creating for too long"
);
format!(
"Container has been creating for {} seconds",
duration_seconds
)
}
};
self.diagnose_pod(&event.namespace, &event.pod_name, &problem_description)
.await;
}
async fn diagnose_node(&self, node_name: &str) {
// Acquire semaphore permit to limit concurrency
let _permit = self.diagnosis_semaphore.acquire().await.unwrap();
info!(node = %node_name, "Starting AI diagnosis (acquired permit)");
let diagnostic_engine = DiagnosticEngine::new(self.kube_client.clone());
match diagnostic_engine
.diagnose_nodes(&self.ai_client, vec![node_name.to_string()])
.await
{
Ok(diagnosis) => {
info!(
node = %node_name,
diagnosis = %diagnosis,
"AI diagnosis completed"
);
}
Err(e) => {
error!(
node = %node_name,
error = %e,
"AI diagnosis failed"
);
}
}
// Permit is automatically released when _permit is dropped
}
async fn diagnose_pod(&self, namespace: &str, pod_name: &str, problem: &str) {
// Acquire semaphore permit to limit concurrency
let _permit = self.diagnosis_semaphore.acquire().await.unwrap();
info!(pod = %pod_name, namespace = %namespace, "Starting AI diagnosis (acquired permit)");
let diagnostic_engine = DiagnosticEngine::new(self.kube_client.clone());
match diagnostic_engine
.diagnose_pod(&self.ai_client, namespace, pod_name, problem)
.await
{
Ok(diagnosis) => {
info!(
pod = %pod_name,
namespace = %namespace,
diagnosis = %diagnosis,
"AI diagnosis completed"
);
}
Err(e) => {
error!(
pod = %pod_name,
namespace = %namespace,
error = %e,
"AI diagnosis failed"
);
}
}
// Permit is automatically released when _permit is dropped
}
}

3
src/events/mod.rs Normal file
View File

@@ -0,0 +1,3 @@
// The `handler` module stays private; only `EventHandler` is re-exported
// from this module.
mod handler;
pub use handler::EventHandler;