mirror of
https://github.com/compute-blade-community/compute-blade-agent.git
synced 2026-04-21 17:45:43 +02:00
feat: add event-driven handlers
Signed-off-by: Matthias Riegler <matthias.riegler@ankorstore.com>
This commit is contained in:
103
cmd/agent/main.go
Normal file
103
cmd/agent/main.go
Normal file
@@ -0,0 +1,103 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/signal"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"github.com/xvzf/computeblade-agent/internal/agent"
|
||||
"github.com/xvzf/computeblade-agent/pkg/ledengine"
|
||||
"github.com/xvzf/computeblade-agent/pkg/log"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
func main() {
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// setup logger
|
||||
|
||||
zapLogger := zap.Must(zap.NewDevelopment()).With(zap.String("app", "computeblade-agent"))
|
||||
_ = zap.ReplaceGlobals(zapLogger.With(zap.String("scope", "global")))
|
||||
baseCtx := log.IntoContext(context.Background(), zapLogger)
|
||||
|
||||
ctx, cancelCtx := context.WithCancelCause(baseCtx)
|
||||
defer cancelCtx(context.Canceled)
|
||||
|
||||
agent, err := agent.NewComputeBladeAgent(agent.ComputeBladeAgentConfig{
|
||||
IdleLedColor: ledengine.LedColorGreen(0.05),
|
||||
IdentifyLedColor: ledengine.LedColorPurple(0.05),
|
||||
CriticalLedColor: ledengine.LedColorRed(0.3),
|
||||
StealthModeEnabled: false,
|
||||
DefaultFanSpeed: 40,
|
||||
CriticalTemperature: 60,
|
||||
})
|
||||
if err != nil {
|
||||
log.FromContext(ctx).Error("Failed to create agent", zap.Error(err))
|
||||
cancelCtx(err)
|
||||
}
|
||||
|
||||
// setup stop signal handlers
|
||||
sigs := make(chan os.Signal, 1)
|
||||
signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
// Wait for context cancel or signal
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case sig := <-sigs:
|
||||
// On signal, cancel context
|
||||
cancelCtx(fmt.Errorf("signal %s received", sig))
|
||||
}
|
||||
}()
|
||||
|
||||
// Run agent
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
err := agent.Run(ctx)
|
||||
if err != nil && err != context.Canceled {
|
||||
log.FromContext(ctx).Error("Failed to run agent", zap.Error(err))
|
||||
cancelCtx(err)
|
||||
}
|
||||
}()
|
||||
|
||||
// setup prometheus endpoint
|
||||
promHandler := http.NewServeMux()
|
||||
promHandler.Handle("/metrics", promhttp.Handler())
|
||||
server := &http.Server{Addr: ":9666", Handler: promHandler}
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
err := server.ListenAndServe()
|
||||
if err != nil && err != http.ErrServerClosed {
|
||||
log.FromContext(ctx).Error("Failed to start prometheus server", zap.Error(err))
|
||||
cancelCtx(err)
|
||||
}
|
||||
}()
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
<-ctx.Done()
|
||||
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
err := server.Shutdown(shutdownCtx)
|
||||
if err != nil {
|
||||
log.FromContext(ctx).Error("Failed to shutdown prometheus server", zap.Error(err))
|
||||
}
|
||||
}()
|
||||
|
||||
// Wait for context cancel
|
||||
wg.Wait()
|
||||
if err := ctx.Err(); err != nil && err != context.Canceled {
|
||||
log.FromContext(ctx).Fatal("Exiting", zap.Error(err))
|
||||
} else {
|
||||
log.FromContext(ctx).Info("Exiting")
|
||||
}
|
||||
}
|
||||
334
internal/agent/agent.go
Normal file
334
internal/agent/agent.go
Normal file
@@ -0,0 +1,334 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"sync"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
"github.com/xvzf/computeblade-agent/pkg/hal"
|
||||
"github.com/xvzf/computeblade-agent/pkg/ledengine"
|
||||
"github.com/xvzf/computeblade-agent/pkg/log"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
var (
|
||||
// eventCounter is a prometheus counter that counts the number of events handled by the agent
|
||||
eventCounter = promauto.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "computeblade_agent",
|
||||
Name: "events_count",
|
||||
Help: "ComputeBlade Agent internal event handler statistics (handled events)",
|
||||
}, []string{"type"})
|
||||
|
||||
// droppedEventCounter is a prometheus counter that counts the number of events dropped by the agent
|
||||
droppedEventCounter = promauto.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "computeblade_agent",
|
||||
Name: "events_dropped_count",
|
||||
Help: "ComputeBlade Agent internal event handler statistics (dropped events)",
|
||||
}, []string{"type"})
|
||||
)
|
||||
|
||||
type Event int
|
||||
|
||||
const (
|
||||
NoopEvent = iota
|
||||
IdentifyEvent
|
||||
IdentifyConfirmEvent
|
||||
CriticalEvent
|
||||
CriticalResetEvent
|
||||
EdgeButtonEvent
|
||||
)
|
||||
|
||||
func (e Event) String() string {
|
||||
switch e {
|
||||
case NoopEvent:
|
||||
return "noop"
|
||||
case IdentifyEvent:
|
||||
return "identify"
|
||||
case IdentifyConfirmEvent:
|
||||
return "identify_confirm"
|
||||
case CriticalEvent:
|
||||
return "critical"
|
||||
case CriticalResetEvent:
|
||||
return "critical_reset"
|
||||
case EdgeButtonEvent:
|
||||
return "edge_button"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
type ComputeBladeAgentConfig struct {
|
||||
|
||||
// IdleLedColor is the color of the edge LED when the blade is idle mode
|
||||
IdleLedColor hal.LedColor
|
||||
// IdentifyLedColor is the color of the edge LED when the blade is in identify mode
|
||||
IdentifyLedColor hal.LedColor
|
||||
// CriticalLedColor is the color of the top(!) LED when the blade is in critical mode.
|
||||
// In the circumstance when >1 blades are in critical mode, the identidy function can be used to find the right blade
|
||||
CriticalLedColor hal.LedColor
|
||||
|
||||
// StealthModeEnabled indicates whether stealth mode is enabled
|
||||
StealthModeEnabled bool
|
||||
|
||||
// DefaultFanSpeed is the default fan speed in percent. Usually 40% is sufficient
|
||||
DefaultFanSpeed uint
|
||||
|
||||
// Critical temperature of the compute blade (used to trigger critical mode)
|
||||
CriticalTemperature uint
|
||||
}
|
||||
|
||||
type ComputeBladeAgent interface {
|
||||
Run(ctx context.Context) error
|
||||
}
|
||||
|
||||
type computeBladeAgentImpl struct {
|
||||
opts ComputeBladeAgentConfig
|
||||
blade hal.ComputeBladeHal
|
||||
state *computebladeState
|
||||
edgeLedEngine ledengine.LedEngine
|
||||
topLedEngine ledengine.LedEngine
|
||||
|
||||
eventChan chan Event
|
||||
}
|
||||
|
||||
func NewComputeBladeAgent(opts ComputeBladeAgentConfig) (ComputeBladeAgent, error) {
|
||||
var err error
|
||||
|
||||
// blade, err := hal.NewCm4Hal(hal.ComputeBladeHalOpts{
|
||||
blade, err := hal.NewCm4Hal(hal.ComputeBladeHalOpts{
|
||||
FanUnit: hal.FanUnitStandard, // FIXME: support smart fan unit
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
edgeLedEngine := ledengine.NewLedEngine(ledengine.LedEngineOpts{
|
||||
LedIdx: hal.LedEdge,
|
||||
Hal: blade,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
topLedEngine := ledengine.NewLedEngine(ledengine.LedEngineOpts{
|
||||
LedIdx: hal.LedTop,
|
||||
Hal: blade,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &computeBladeAgentImpl{
|
||||
opts: opts,
|
||||
blade: blade,
|
||||
edgeLedEngine: edgeLedEngine,
|
||||
topLedEngine: topLedEngine,
|
||||
state: NewComputeBladeState(),
|
||||
eventChan: make(chan Event, 10), // backlog of 10 events. They should process fast but we e.g. don't want to miss button presses
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (a *computeBladeAgentImpl) Run(origCtx context.Context) error {
|
||||
var wg sync.WaitGroup
|
||||
ctx, cancelCtx := context.WithCancelCause(origCtx)
|
||||
defer a.cleanup(ctx)
|
||||
|
||||
log.FromContext(ctx).Info("Starting ComputeBlade agent")
|
||||
|
||||
// Ingest noop event to initialise metrics
|
||||
a.state.RegisterEvent(NoopEvent)
|
||||
|
||||
// Set defaults
|
||||
if err := a.blade.SetFanSpeed(uint8(a.opts.DefaultFanSpeed)); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := a.blade.SetStealthMode(a.opts.StealthModeEnabled); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Start edge button event handler
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
log.FromContext(ctx).Info("Starting edge button event handler")
|
||||
for {
|
||||
err := a.blade.WaitForEdgeButtonPress(ctx)
|
||||
if err != nil && err != context.Canceled {
|
||||
log.FromContext(ctx).Error("Edge button event handler failed", zap.Error(err))
|
||||
cancelCtx(err)
|
||||
} else if err != nil {
|
||||
return
|
||||
}
|
||||
select {
|
||||
case a.eventChan <- Event(EdgeButtonEvent):
|
||||
default:
|
||||
log.FromContext(ctx).Warn("Edge button press event dropped due to backlog")
|
||||
droppedEventCounter.WithLabelValues(Event(EdgeButtonEvent).String()).Inc()
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Start top LED engine
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
log.FromContext(ctx).Info("Starting top LED engine")
|
||||
err := a.runTopLedEngine(ctx)
|
||||
if err != nil && err != context.Canceled {
|
||||
log.FromContext(ctx).Error("Top LED engine failed", zap.Error(err))
|
||||
cancelCtx(err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Start edge LED engine
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
log.FromContext(ctx).Info("Starting edge LED engine")
|
||||
err := a.runEdgeLedEngine(ctx)
|
||||
if err != nil && err != context.Canceled {
|
||||
log.FromContext(ctx).Error("Edge LED engine failed", zap.Error(err))
|
||||
cancelCtx(err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Start event handler
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
log.FromContext(ctx).Info("Starting event handler")
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case event := <-a.eventChan:
|
||||
err := a.handleEvent(ctx, event)
|
||||
if err != nil && err != context.Canceled {
|
||||
log.FromContext(ctx).Error("Event handler failed", zap.Error(err))
|
||||
cancelCtx(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
wg.Wait()
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
// cleanup restores sane defaults before exiting. Ignores canceled context!
|
||||
func (a *computeBladeAgentImpl) cleanup(ctx context.Context) {
|
||||
log.FromContext(ctx).Info("Exiting, restoring safe settings")
|
||||
if err := a.blade.SetFanSpeed(100); err != nil {
|
||||
log.FromContext(ctx).Error("Failed to set fan speed to 100%", zap.Error(err))
|
||||
}
|
||||
if err := a.blade.SetLed(hal.LedEdge, hal.LedColor{}); err != nil {
|
||||
log.FromContext(ctx).Error("Failed to set edge LED to off", zap.Error(err))
|
||||
}
|
||||
if err := a.blade.SetLed(hal.LedTop, hal.LedColor{}); err != nil {
|
||||
log.FromContext(ctx).Error("Failed to set edge LED to off", zap.Error(err))
|
||||
}
|
||||
}
|
||||
|
||||
func (a *computeBladeAgentImpl) handleEvent(ctx context.Context, event Event) error {
|
||||
log.FromContext(ctx).Info("Handling event", zap.String("event", event.String()))
|
||||
eventCounter.WithLabelValues(event.String()).Inc()
|
||||
|
||||
// register event in state
|
||||
a.state.RegisterEvent(event)
|
||||
|
||||
// Dispatch incoming events to the right handler(s)
|
||||
switch event {
|
||||
case CriticalEvent:
|
||||
// Handle critical event
|
||||
return a.handleCriticalActive(ctx)
|
||||
case CriticalResetEvent:
|
||||
// Handle critical event
|
||||
return a.handleCriticalReset(ctx)
|
||||
case IdentifyEvent:
|
||||
// Handle identify event
|
||||
return a.handleIdentifyActive(ctx)
|
||||
case IdentifyConfirmEvent:
|
||||
// Handle identify event
|
||||
return a.handleIdentifyClear(ctx)
|
||||
case EdgeButtonEvent:
|
||||
// Handle edge button press to toggle identify mode
|
||||
event := Event(IdentifyEvent)
|
||||
if a.state.IdentifyActive() {
|
||||
event = Event(IdentifyConfirmEvent)
|
||||
}
|
||||
select {
|
||||
case a.eventChan <- Event(event):
|
||||
default:
|
||||
log.FromContext(ctx).Warn("Edge button press event dropped due to backlog")
|
||||
droppedEventCounter.WithLabelValues(event.String()).Inc()
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (a *computeBladeAgentImpl) handleIdentifyActive(ctx context.Context) error {
|
||||
log.FromContext(ctx).Info("Identify active")
|
||||
return a.edgeLedEngine.SetPattern(ledengine.NewBurstPattern(hal.LedColor{}, a.opts.IdentifyLedColor))
|
||||
}
|
||||
|
||||
func (a *computeBladeAgentImpl) handleIdentifyClear(ctx context.Context) error {
|
||||
log.FromContext(ctx).Info("Identify confirmed/cleared")
|
||||
return a.edgeLedEngine.SetPattern(ledengine.NewStaticPattern(a.opts.IdleLedColor))
|
||||
}
|
||||
|
||||
func (a *computeBladeAgentImpl) handleCriticalActive(ctx context.Context) error {
|
||||
log.FromContext(ctx).Warn("Blade in critical state, setting fan speed to 100% and turning on LEDs")
|
||||
|
||||
// Set fan speed to 100%
|
||||
setFanspeedError := a.blade.SetFanSpeed(100)
|
||||
|
||||
// Disable stealth mode (turn on LEDs)
|
||||
setStealthModeError := a.blade.SetStealthMode(false)
|
||||
|
||||
// Set critical pattern for top LED
|
||||
setPatternTopLedErr := a.topLedEngine.SetPattern(
|
||||
ledengine.NewSlowBlinkPattern(hal.LedColor{}, a.opts.CriticalLedColor),
|
||||
)
|
||||
// Combine errors, but don't stop execution flow for now
|
||||
return errors.Join(setFanspeedError, setStealthModeError, setPatternTopLedErr)
|
||||
}
|
||||
|
||||
func (a *computeBladeAgentImpl) handleCriticalReset(ctx context.Context) error {
|
||||
log.FromContext(ctx).Info("Critical state cleared, setting fan speed to default and restoring LEDs to default state")
|
||||
// Set fan speed to 100%
|
||||
if err := a.blade.SetFanSpeed(uint8(a.opts.DefaultFanSpeed)); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Reset stealth mode
|
||||
if err := a.blade.SetStealthMode(a.opts.StealthModeEnabled); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Set top LED off
|
||||
if err := a.topLedEngine.SetPattern(ledengine.NewStaticPattern(hal.LedColor{})); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (a *computeBladeAgentImpl) Close() error {
|
||||
return errors.Join(a.blade.Close())
|
||||
}
|
||||
|
||||
// runTopLedEngine runs the top LED engine
|
||||
func (a *computeBladeAgentImpl) runTopLedEngine(ctx context.Context) error {
|
||||
// FIXME the top LED is only used to indicate emergency situations
|
||||
a.edgeLedEngine.SetPattern(ledengine.NewStaticPattern(hal.LedColor{}))
|
||||
return a.edgeLedEngine.Run(ctx)
|
||||
}
|
||||
|
||||
// runEdgeLedEngine runs the edge LED engine
|
||||
func (a *computeBladeAgentImpl) runEdgeLedEngine(ctx context.Context) error {
|
||||
a.edgeLedEngine.SetPattern(ledengine.NewStaticPattern(a.opts.IdleLedColor))
|
||||
return a.edgeLedEngine.Run(ctx)
|
||||
}
|
||||
102
internal/agent/state.go
Normal file
102
internal/agent/state.go
Normal file
@@ -0,0 +1,102 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
)
|
||||
|
||||
var (
|
||||
stateMetric = promauto.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: "computeblade_state",
|
||||
Name: "state",
|
||||
Help: "ComputeBlade state (label values are critical, identify, normal)",
|
||||
}, []string{"state"})
|
||||
)
|
||||
|
||||
type computebladeState struct {
|
||||
mutex sync.Mutex
|
||||
|
||||
// identifyActive indicates whether the blade is currently in identify mode
|
||||
identifyActive bool
|
||||
identifyClearChan chan struct{}
|
||||
// criticalActive indicates whether the blade is currently in critical mode
|
||||
criticalActive bool
|
||||
criticalClearChan chan struct{}
|
||||
}
|
||||
|
||||
func NewComputeBladeState() *computebladeState {
|
||||
return &computebladeState{
|
||||
identifyClearChan: make(chan struct{}),
|
||||
criticalClearChan: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *computebladeState) RegisterEvent(event Event) {
|
||||
s.mutex.Lock()
|
||||
defer s.mutex.Unlock()
|
||||
switch event {
|
||||
case IdentifyEvent:
|
||||
s.identifyActive = true
|
||||
case IdentifyConfirmEvent:
|
||||
s.identifyActive = false
|
||||
close(s.identifyClearChan)
|
||||
s.identifyClearChan = make(chan struct{})
|
||||
case CriticalEvent:
|
||||
s.criticalActive = true
|
||||
s.identifyActive = false
|
||||
case CriticalResetEvent:
|
||||
s.criticalActive = false
|
||||
close(s.criticalClearChan)
|
||||
s.criticalClearChan = make(chan struct{})
|
||||
}
|
||||
|
||||
// Set identify state metric
|
||||
if s.identifyActive {
|
||||
stateMetric.WithLabelValues("identify").Set(1)
|
||||
} else {
|
||||
stateMetric.WithLabelValues("identify").Set(0)
|
||||
}
|
||||
|
||||
// Set critical state metric
|
||||
if s.criticalActive {
|
||||
stateMetric.WithLabelValues("critical").Set(1)
|
||||
} else {
|
||||
stateMetric.WithLabelValues("critical").Set(0)
|
||||
}
|
||||
|
||||
// Set critical state metric
|
||||
if !s.criticalActive && !s.identifyActive {
|
||||
stateMetric.WithLabelValues("normal").Set(1)
|
||||
} else {
|
||||
stateMetric.WithLabelValues("normal").Set(0)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *computebladeState) IdentifyActive() bool {
|
||||
return s.identifyActive
|
||||
}
|
||||
|
||||
func (s *computebladeState) WaitForIdentifyClear(ctx context.Context) error {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-s.identifyClearChan:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func (s *computebladeState) CriticalActive() bool {
|
||||
return s.criticalActive
|
||||
}
|
||||
|
||||
func (s *computebladeState) WaitForCriticalClear(ctx context.Context) error {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-s.criticalClearChan:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user