Skip to content

Nomad Platform Deployment

Complete guide for deploying Productify on HashiCorp Nomad with detailed setup instructions and monitoring.

Prerequisites

Required Software

  • Nomad: 1.6+ cluster (server and client nodes)
  • PostgreSQL: 16+ database
  • Docker: Runtime on all Nomad client nodes

System Requirements

  • Memory: 8 GB RAM minimum per node (16 GB recommended)
  • CPU: 4 cores minimum per node
  • Storage: 50 GB free disk space per node
  • Network: Low latency between nodes (< 10ms for same datacenter)

Nomad Installation

Linux (AMD64)

bash
# Download Nomad
wget https://releases.hashicorp.com/nomad/1.7.3/nomad_1.7.3_linux_amd64.zip

# Extract
unzip nomad_1.7.3_linux_amd64.zip

# Move to system path
sudo mv nomad /usr/local/bin/

# Verify installation
nomad version

macOS (Homebrew)

bash
# Install Nomad
brew install nomad

# Verify installation
nomad version

Other Platforms

See the official Nomad installation guide for other platforms and installation methods.

Nomad Agent Setup

Development Mode (Single Node)

For local testing and development:

bash
# Start Nomad in development mode (server + client)
sudo nomad agent -dev

# In another terminal, verify
nomad node status
nomad server members

# Access Web UI
open http://localhost:4646

Important Nomad Ports:

  • 4646: HTTP API and Web UI
  • 4647: RPC (Remote Procedure Call)
  • 4648: Serf WAN (gossip protocol)

Production Cluster

For production, separate server and client nodes are recommended.

Server Node:

bash
sudo nomad agent -config=/etc/nomad.d/server.hcl

server.hcl:

hcl
datacenter = "dc1"
data_dir = "/var/lib/nomad"

server {
  enabled = true
  bootstrap_expect = 3
}

Client Node:

bash
sudo nomad agent -config=/etc/nomad.d/client.hcl

client.hcl:

hcl
datacenter = "dc1"
data_dir = "/var/lib/nomad"

client {
  enabled = true
}

plugin "docker" {
  config {
    allow_privileged = false
  }
}

Prerequisites

  • Nomad 1.6+ cluster
  • PostgreSQL 16+ database
  • Docker runtime on Nomad clients

Complete Job Specification

Manager Job

hcl
job "manager" {
  datacenters = ["dc1"]
  type = "service"

  group "api" {
    count = 3

    update {
      max_parallel = 1
      health_check = "checks"
      min_healthy_time = "10s"
      healthy_deadline = "3m"
      auto_revert = true
    }

    network {
      port "http" {
        to = 8080
      }
    }

    service {
      name = "manager-api"
      port = "http"
      tags = ["productify", "api"]

      check {
        type = "http"
        path = "/health"
        interval = "10s"
        timeout = "2s"
      }
    }

    task "server" {
      driver = "docker"

      config {
        image = "ghcr.io/productifyfw/manager:${VERSION}"
        ports = ["http"]
      }

      env {
        PFY_RUN_MODE = "api"
      }

      template {
        data = <<EOH
PFY_DB_HOST={{ key "manager/db/host" }}
PFY_DB_PORT={{ key "manager/db/port" }}
PFY_DB_USER={{ key "manager/db/user" }}
PFY_DB_PASSWORD={{ key "manager/db/password" }}
PFY_DB_NAME={{ key "manager/db/name" }}
PFY_DB_SSLMODE=require

PFY_POCKET_ID_HOST={{ key "manager/pocketid/host" }}
PFY_POCKET_ID_API_KEY={{ key "manager/pocketid/api_key" }}

PFY_ENV=production
PFY_RUN_MODE=api
PFY_PORT={{ env "NOMAD_PORT_http" }}
EOH
        destination = "secrets/env"
        env = true
      }

      resources {
        cpu    = 500
        memory = 512
      }
    }
  }

  group "executor" {
    count = 1

    constraint {
      operator = "distinct_hosts"
      value = "true"
    }

    network {
      port "metrics" {
        to = 9090
      }
    }

    service {
      name = "manager-executor"
      port = "metrics"
      tags = ["productify", "executor"]
    }

    task "executor" {
      driver = "docker"

      config {
        image = "ghcr.io/productifyfw/manager:${VERSION}"
      }

      env {
        PFY_RUN_MODE = "executor"
      }

      template {
        data = <<EOH
PFY_DB_HOST={{ key "manager/db/host" }}
PFY_DB_PORT={{ key "manager/db/port" }}
PFY_DB_USER={{ key "manager/db/user" }}
PFY_DB_PASSWORD={{ key "manager/db/password" }}
PFY_DB_NAME={{ key "manager/db/name" }}
EOH
        destination = "secrets/env"
        env = true
      }

      resources {
        cpu    = 200
        memory = 256
      }
    }
  }
}

Optimizer Job

hcl
job "optimizer" {
  datacenters = ["dc1"]
  type = "service"

  group "optimizer" {
    count = 2

    network {
      port "http" {
        to = 8000
      }
      port "metrics" {
        to = 9090
      }
    }

    service {
      name = "optimizer"
      port = "http"
      tags = ["productify", "autoscaler"]

      check {
        type = "http"
        path = "/health"
        interval = "10s"
        timeout = "2s"
      }
    }

    task "server" {
      driver = "docker"

      config {
        image = "ghcr.io/productifyfw/optimizer:${VERSION}"
        ports = ["http", "metrics"]
      }

      env {
        CACHE_SIZE = "10"
        FORECAST_HORIZON = "60"
        LOG_LEVEL = "INFO"
      }

      resources {
        cpu    = 1000
        memory = 1024
      }
    }
  }
}

Proxy Job

hcl
job "proxy" {
  datacenters = ["dc1"]
  type = "system"  # Deploy on all nodes

  group "caddy" {
    network {
      port "http" {
        static = 80
        to = 80
      }
      port "https" {
        static = 443
        to = 443
      }
      port "admin" {
        to = 2019
      }
    }

    service {
      name = "proxy"
      port = "http"
      tags = ["productify", "proxy"]

      check {
        type = "http"
        port = "admin"
        path = "/health"
        interval = "10s"
        timeout = "2s"
      }
    }

    task "caddy" {
      driver = "docker"

      config {
        image = "caddy:2.7-alpine"
        ports = ["http", "https", "admin"]

        volumes = [
          "local/Caddyfile:/etc/caddy/Caddyfile"
        ]
      }

      template {
        data = <<EOH
{
  admin :2019
  # auto_https is enabled by default; use "auto_https off" only to disable it
  email {{ key "proxy/acme_email" }}
}

*.{{ key "proxy/domain" }} {
  tls {
    dns cloudflare {{ key "proxy/cloudflare_token" }}
  }

  @manager host manager.{{ key "proxy/domain" }}
  handle @manager {
    reverse_proxy {
      {{- range service "manager-api" }}
      to {{ .Address }}:{{ .Port }}
      {{- end }}

      lb_policy least_conn
      health_uri /health
      health_interval 10s
    }
  }

  @tenant host *.{{ key "proxy/domain" }}
  handle @tenant {
    reverse_proxy {
      {{- range service "tenant-proxy" }}
      to {{ .Address }}:{{ .Port }}
      {{- end }}
    }
  }
}
EOH
        destination = "local/Caddyfile"
        change_mode = "signal"
        change_signal = "SIGHUP"
      }

      resources {
        cpu    = 200
        memory = 128
      }
    }
  }
}

Autoscaler Plugin Job

hcl
job "nomad-autoscaler" {
  datacenters = ["dc1"]
  type = "service"

  group "autoscaler" {
    count = 1

    network {
      port "http" {
        to = 8080
      }
      port "prometheus" {
        to = 8081
      }
    }

    service {
      name = "nomad-autoscaler"
      port = "http"

      check {
        type = "http"
        path = "/v1/health"
        interval = "10s"
        timeout = "2s"
      }
    }

    task "autoscaler" {
      driver = "docker"

      config {
        image = "hashicorp/nomad-autoscaler:0.4.0"

        volumes = [
          "local/config.hcl:/etc/autoscaler.hcl",
          "local/policies:/policies"
        ]

        args = ["agent", "-config=/etc/autoscaler.hcl"]
      }

      # Autoscaler configuration
      template {
        data = <<EOH
plugin_dir = "/plugins"

nomad {
  address = "http://{{ env "NOMAD_IP_http" }}:4646"
}

policy_dir = "/policies"

http {
  bind_address = "0.0.0.0"
  bind_port    = 8080
}

telemetry {
  prometheus_metrics = true
  prometheus_retention_time = "24h"
}
EOH
        destination = "local/config.hcl"
      }

      # Scaling policy for Manager API
      template {
        data = <<EOH
scaling "horizontal" {
  enabled = true
  min     = 2
  max     = 10

  policy {
    evaluation_interval = "10s"
    cooldown           = "30s"

    target "nomadscaler" {
      namespace     = "default"
      job           = "manager"
      group         = "api"
      optimizer_url = "http://{{- with service "optimizer" }}{{ with index . 0 }}{{ .Address }}:{{ .Port }}{{ end }}{{ end }}"
      cache_size    = 10
    }

    check "cpu" {
      source = "nomad-apm"
      query  = "avg_cpu_percent"

      strategy "target-value" {
        target = 75
      }
    }

    check "memory" {
      source = "nomad-apm"
      query  = "avg_memory_percent"

      strategy "target-value" {
        target = 80
      }
    }
  }
}
EOH
        destination = "local/policies/manager-api.hcl"
      }

      # Copy plugin binary
      artifact {
        source = "https://releases.productify.io/nomadscaler/latest/nomadscaler"
        destination = "plugins/nomadscaler"
        mode = "file"
      }

      resources {
        cpu    = 200  # MHz
        memory = 256  # MB
      }
    }
  }
}

Monitoring

Monitor Optimizer and Nomad Autoscaler metrics using Prometheus:

yaml
# prometheus.yml
scrape_configs:
  - job_name: "optimizer"
    static_configs:
      - targets: ["optimizer:8000"]

  - job_name: "nomad-autoscaler"
    static_configs:
      - targets: ["nomad-autoscaler:8080"]

  - job_name: "nomad"
    static_configs:
      - targets: ["localhost:4646"]

Deployment Commands

Deploy Manager Component

bash
cd manager/nomad

# Validate job specification
nomad job validate manager.nomad.hcl

# Plan the deployment (dry run)
nomad job plan manager.nomad.hcl

# Deploy the job
nomad job run manager.nomad.hcl

# Check job status
nomad job status manager

# View allocations (running instances)
nomad alloc status <alloc-id>

# Follow logs for a specific allocation
nomad alloc logs -f <alloc-id> server

Manager Job Components:

  • API Group: 3 instances (horizontally scaled)
    • Serves GraphQL API
    • Handles HTTP requests
    • Registers with Nomad service discovery
  • Executor Group: 1 instance
    • Runs scheduled jobs and triggers
    • Processes background tasks

Deploy Proxy and Authentication

bash
cd proxy/nomad

# Deploy Pocket ID first
nomad job validate pocketid.nomad.hcl
nomad job run pocketid.nomad.hcl

# Wait for Pocket ID to be healthy
nomad job status pocketid

# Then deploy proxy
nomad job validate proxy.nomad.hcl
nomad job run proxy.nomad.hcl

# Check status
nomad job status proxy

# View service endpoints
nomad service list

Deploy Autoscaler Stack

bash
cd autoscaler/nomadscaler/config

# Deploy autoscaler with optimizer and Prometheus
nomad job validate autoscaler.hcl
nomad job run autoscaler.hcl

# Check status (the job inside autoscaler.hcl is named "nomad-autoscaler")
nomad job status nomad-autoscaler

# Access Prometheus UI
open http://localhost:9090

# Verify metrics collection
curl http://localhost:9090/api/v1/targets

Autoscaler Components:

  • Nomad Autoscaler Plugin: Evaluates scaling policies
  • Optimizer Service: Provides ML-based scaling decisions (Python/FastAPI)
  • Prometheus: Collects and stores metrics

Deploy All Jobs (Complete Stack)

bash
# Set version variable
export VERSION="1.0.0"

# Deploy in order
nomad job run -var="VERSION=${VERSION}" manager.nomad.hcl
nomad job run pocketid.nomad.hcl
nomad job run proxy.nomad.hcl
nomad job run -var="VERSION=${VERSION}" optimizer.nomad.hcl
nomad job run autoscaler.nomad.hcl

# Verify all jobs are running
nomad job status

Redeploy Updated Versions

bash
# Set version variable
export VERSION="1.0.0"

# Deploy jobs
nomad job run -var="VERSION=${VERSION}" manager.nomad.hcl
nomad job run -var="VERSION=${VERSION}" optimizer.nomad.hcl
nomad job run proxy.nomad.hcl
nomad job run autoscaler.nomad.hcl

Rolling Update

bash
# Update Manager (nomad job inspect nests the job under the "Job" key)
nomad job run -check-index $(nomad job inspect manager | jq -r .Job.JobModifyIndex) manager.nomad.hcl

# Monitor deployment
nomad deployment status <deployment-id>

# Watch deployment progress
watch -n 2 'nomad job status manager | head -20'

Rollback

bash
# View job versions
nomad job history manager

# Revert to previous version
nomad job revert manager <version>

# Stop a failed deployment
nomad deployment fail <deployment-id>

Monitoring and Verification

Nomad Web UI

Access the Nomad UI at http://localhost:4646:

  • Jobs: View all running jobs, their status, and allocations
  • Allocations: See details of each running container instance
  • Nodes: Infrastructure information and resource usage
  • Topology: Visual overview of the cluster state
  • Evaluations: View scheduling decisions

Check Job Health

bash
# Job overview
nomad job status manager

# Specific allocation details
nomad alloc status <alloc-id>

# Service health checks
nomad service info manager-api

# Recent job history
nomad job history manager

# View deployment status
nomad deployment status <deployment-id>

Prometheus Metrics

Access Prometheus at http://localhost:9090 and query metrics:

bash
# Manager API availability
curl 'http://localhost:9090/api/v1/query?query=up{job="manager"}'

# CPU usage across cluster
curl 'http://localhost:9090/api/v1/query?query=nomad_client_host_cpu_user'

# Memory usage per allocation
curl 'http://localhost:9090/api/v1/query?query=nomad_client_alloc_memory_usage'

# HTTP request rate (if exposed by app)
curl 'http://localhost:9090/api/v1/query?query=rate(http_requests_total[5m])'

# Active connections
curl 'http://localhost:9090/api/v1/query?query=nomad_client_alloc_network_rx_bytes'

Test Autoscaler

Verify autoscaler functionality by generating load:

bash
# Install Apache Bench (if not already installed)
# Ubuntu/Debian: apt-get install apache2-utils
# macOS: ab ships with the OS (no Homebrew install needed)

# Generate load
ab -n 10000 -c 100 http://localhost:8080/api/health

# Watch scaling events in real-time
nomad job history manager

# Monitor allocation count changes
watch -n 2 'nomad job status manager | grep "Allocations"'

# View autoscaler decision logs
nomad alloc logs -f <autoscaler-alloc-id> autoscaler

# Check optimizer service health
curl http://localhost:8000/health

# View recent scaling decisions
curl http://localhost:8000/metrics | grep scaling

Service Discovery Verification

bash
# List all registered services
nomad service list

# Get specific service information
nomad service info manager-api

High Availability

Executor Instance

Only one Executor instance should run. The Executor uses database-level locking to ensure single-instance execution across the cluster.

Increase the count for API instances to scale horizontally:

hcl
group "api" {
  count = 5  # Scale as needed
  # ...
}

Troubleshooting

Job Won't Start

bash
# Validate job file syntax
nomad job validate <job-file>.hcl

# Get detailed error message
nomad job status <job-name>

# Check allocation-level errors
nomad alloc status <alloc-id>

# View allocation logs
nomad alloc logs <alloc-id> <task-name>

# Check Docker driver status
docker ps
docker images

Network Connectivity Issues

bash
# Check service discovery
nomad service list
nomad service info <service-name>

# Verify port mappings
nomad alloc status <alloc-id> | grep -A 10 "Ports"

# Enter container for debugging
nomad alloc exec -task <task-name> <alloc-id> /bin/sh

# Test connectivity from inside container
nomad alloc exec <alloc-id> curl http://localhost:8080/health
nomad alloc exec <alloc-id> ping postgres

Database Connection Problems

bash
# Test database connectivity
nomad alloc exec <postgres-alloc-id> psql -U postgres -d productify -c "SELECT version();"

# Check database logs
nomad alloc logs <postgres-alloc-id>

Autoscaler Not Scaling

bash
# Check scaling policies
nomad scaling policy list

# View policy details
nomad scaling policy info <policy-id>

# Test manual scaling (group name is required for multi-group jobs)
nomad job scale manager api 5

# Verify Prometheus targets
curl http://localhost:9090/api/v1/targets

# Check optimizer logs
nomad alloc logs -f <optimizer-alloc-id> optimizer

# Verify autoscaler plugin
nomad alloc logs -f <autoscaler-alloc-id> autoscaler

High Resource Usage

bash
# Check cluster resources
nomad node status

# View resource allocation
nomad status

# Check specific node usage
nomad node status <node-id>

# See resource constraints
nomad job inspect manager | jq '.Job.TaskGroups[].Tasks[].Resources'

Check Job Status

bash
nomad job status manager
nomad alloc status <alloc-id>
nomad alloc logs <alloc-id>

Networking Issues

bash
# Check allocations
nomad alloc status -verbose <alloc-id>

# Exec into container
nomad alloc exec <alloc-id> sh

Advanced Configuration

Auto-Revert on Failure

hcl
update {
  max_parallel = 1
  health_check = "checks"
  min_healthy_time = "10s"
  healthy_deadline = "5m"
  auto_revert = true
  auto_promote = false
}

Canary Deployments

hcl
update {
  max_parallel = 1
  canary = 1
  min_healthy_time = "30s"
  healthy_deadline = "10m"
  auto_promote = false
  auto_revert = true
}

Resource Limits

hcl
resources {
  cpu    = 1000  # MHz
  memory = 1024  # MB

  memory_max = 2048  # Max memory before OOM
}

Spread Across Nodes

hcl
spread {
  attribute = "${node.unique.id}"
  weight    = 100
}

See Also