Init
This commit is contained in:
185
src/events/handler.rs
Normal file
185
src/events/handler.rs
Normal file
@@ -0,0 +1,185 @@
|
||||
use crate::ai::{AIClient, DiagnosticEngine};
|
||||
use crate::k8s::{KubeClient, NodeEvent, NodeEventType, PodEvent, PodEventType};
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::Semaphore;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct EventHandler {
|
||||
kube_client: KubeClient,
|
||||
ai_client: AIClient,
|
||||
// Semaphore to limit concurrent AI diagnoses
|
||||
diagnosis_semaphore: Arc<Semaphore>,
|
||||
}
|
||||
|
||||
impl EventHandler {
|
||||
pub fn new(kube_client: KubeClient, ai_client: AIClient, max_concurrent: usize) -> Self {
|
||||
Self {
|
||||
kube_client,
|
||||
ai_client,
|
||||
diagnosis_semaphore: Arc::new(Semaphore::new(max_concurrent)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle node event and trigger AI diagnostics if needed
|
||||
pub async fn handle_node_event(&self, event: NodeEvent) {
|
||||
match event.event_type {
|
||||
NodeEventType::BecameNotReady => {
|
||||
warn!(
|
||||
node = %event.node_name,
|
||||
"Node became NotReady, starting AI diagnostics"
|
||||
);
|
||||
self.diagnose_node(&event.node_name).await;
|
||||
}
|
||||
NodeEventType::ConditionChanged {
|
||||
ref condition_type,
|
||||
ref status,
|
||||
ref reason,
|
||||
} => {
|
||||
warn!(
|
||||
node = %event.node_name,
|
||||
condition = %condition_type,
|
||||
status = %status,
|
||||
reason = ?reason,
|
||||
"Problematic condition detected, starting AI diagnostics"
|
||||
);
|
||||
self.diagnose_node(&event.node_name).await;
|
||||
}
|
||||
NodeEventType::BecameReady => {
|
||||
info!(node = %event.node_name, "Node became Ready");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle pod event and trigger AI diagnostics if needed
|
||||
pub async fn handle_pod_event(&self, event: PodEvent) {
|
||||
let problem_description = match &event.event_type {
|
||||
PodEventType::HighRestartCount { count } => {
|
||||
warn!(
|
||||
pod = %event.pod_name,
|
||||
namespace = %event.namespace,
|
||||
restart_count = count,
|
||||
"Pod has high restart count"
|
||||
);
|
||||
format!("High restart count: {} restarts", count)
|
||||
}
|
||||
PodEventType::CrashLoopBackOff => {
|
||||
warn!(
|
||||
pod = %event.pod_name,
|
||||
namespace = %event.namespace,
|
||||
"Pod in CrashLoopBackOff"
|
||||
);
|
||||
"Container is in CrashLoopBackOff state".to_string()
|
||||
}
|
||||
PodEventType::ImagePullError => {
|
||||
warn!(
|
||||
pod = %event.pod_name,
|
||||
namespace = %event.namespace,
|
||||
"Pod has image pull error"
|
||||
);
|
||||
"Failed to pull container image".to_string()
|
||||
}
|
||||
PodEventType::Pending { reason } => {
|
||||
warn!(
|
||||
pod = %event.pod_name,
|
||||
namespace = %event.namespace,
|
||||
reason = ?reason,
|
||||
"Pod stuck in Pending"
|
||||
);
|
||||
format!(
|
||||
"Pod stuck in Pending state. Reason: {}",
|
||||
reason.as_deref().unwrap_or("Unknown")
|
||||
)
|
||||
}
|
||||
PodEventType::Failed { reason } => {
|
||||
warn!(
|
||||
pod = %event.pod_name,
|
||||
namespace = %event.namespace,
|
||||
reason = ?reason,
|
||||
"Pod in Failed state"
|
||||
);
|
||||
format!(
|
||||
"Pod in Failed state. Reason: {}",
|
||||
reason.as_deref().unwrap_or("Unknown")
|
||||
)
|
||||
}
|
||||
PodEventType::ContainerCreating { duration_seconds } => {
|
||||
warn!(
|
||||
pod = %event.pod_name,
|
||||
namespace = %event.namespace,
|
||||
duration_seconds = duration_seconds,
|
||||
"Container creating for too long"
|
||||
);
|
||||
format!(
|
||||
"Container has been creating for {} seconds",
|
||||
duration_seconds
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
self.diagnose_pod(&event.namespace, &event.pod_name, &problem_description)
|
||||
.await;
|
||||
}
|
||||
|
||||
async fn diagnose_node(&self, node_name: &str) {
|
||||
// Acquire semaphore permit to limit concurrency
|
||||
let _permit = self.diagnosis_semaphore.acquire().await.unwrap();
|
||||
|
||||
info!(node = %node_name, "Starting AI diagnosis (acquired permit)");
|
||||
|
||||
let diagnostic_engine = DiagnosticEngine::new(self.kube_client.clone());
|
||||
|
||||
match diagnostic_engine
|
||||
.diagnose_nodes(&self.ai_client, vec![node_name.to_string()])
|
||||
.await
|
||||
{
|
||||
Ok(diagnosis) => {
|
||||
info!(
|
||||
node = %node_name,
|
||||
diagnosis = %diagnosis,
|
||||
"AI diagnosis completed"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
node = %node_name,
|
||||
error = %e,
|
||||
"AI diagnosis failed"
|
||||
);
|
||||
}
|
||||
}
|
||||
// Permit is automatically released when _permit is dropped
|
||||
}
|
||||
|
||||
async fn diagnose_pod(&self, namespace: &str, pod_name: &str, problem: &str) {
|
||||
// Acquire semaphore permit to limit concurrency
|
||||
let _permit = self.diagnosis_semaphore.acquire().await.unwrap();
|
||||
|
||||
info!(pod = %pod_name, namespace = %namespace, "Starting AI diagnosis (acquired permit)");
|
||||
|
||||
let diagnostic_engine = DiagnosticEngine::new(self.kube_client.clone());
|
||||
|
||||
match diagnostic_engine
|
||||
.diagnose_pod(&self.ai_client, namespace, pod_name, problem)
|
||||
.await
|
||||
{
|
||||
Ok(diagnosis) => {
|
||||
info!(
|
||||
pod = %pod_name,
|
||||
namespace = %namespace,
|
||||
diagnosis = %diagnosis,
|
||||
"AI diagnosis completed"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
pod = %pod_name,
|
||||
namespace = %namespace,
|
||||
error = %e,
|
||||
"AI diagnosis failed"
|
||||
);
|
||||
}
|
||||
}
|
||||
// Permit is automatically released when _permit is dropped
|
||||
}
|
||||
}
|
||||
3
src/events/mod.rs
Normal file
3
src/events/mod.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
mod handler;
|
||||
|
||||
pub use handler::EventHandler;
|
||||
Reference in New Issue
Block a user