Grafana monitoring with Docker. Part 2 - Traces with Tempo

Intro

In the previous article we configured Grafana, Loki, and Alloy for log gathering.

In this article we go through configuring tracing with Tempo. We assume the system configuration from Part 1 of the article series is already in place (at least the Grafana web GUI configuration and the Caddy web server).

Tracing answers the question: if we have some long-running process, how can we easily see how much time each step takes? Profiling can give the same answer, but profiling is hard to control and hard to navigate. Tracing works best for backend applications with many different network requests (to databases, or HTTP requests to APIs and to your own other services). For example, a long request handler can be split into one span per step, as in the sketch below.
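
A minimal sketch with the OpenTelemetry Go tracing API (the names here are illustrative; tracer setup is covered later in the article, and without it the tracer is a harmless no-op):

package main

import (
	"context"
	"time"

	"go.opentelemetry.io/otel"
)

var tracer = otel.Tracer("demo")

// handleOrder creates one span per step of a long process;
// the trace view then renders each step as a bar with its exact duration.
func handleOrder(ctx context.Context) {
	ctx, span := tracer.Start(ctx, "handle order")
	defer span.End()

	_, dbSpan := tracer.Start(ctx, "query database")
	time.Sleep(50 * time.Millisecond) // stands in for a real SQL query
	dbSpan.End()

	_, apiSpan := tracer.Start(ctx, "call payment api")
	time.Sleep(120 * time.Millisecond) // stands in for a real HTTP call
	apiSpan.End()
}

func main() { handleOrder(context.Background()) }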

Note

As a weak substitute for instrumentation in Go there is an [eBPF-based tool](https://github.com/open-telemetry/opentelemetry-go-instrumentation), but it is highly limited: your logs and metrics will not be connected to traces, and it works only for a specific subset of libraries that you can't easily change. We will not cover this tool in this series of articles, since it does not look like a good default approach.

Raising Tempo

Important

We provide the docker-compose way of configuration as the demo example, because developers are more likely to be familiar and comfortable with docker-compose than with Terraform. We ourselves use Terraform for this configuration and recommend using it instead of docker-compose if you can. The book "Terraform: Up and Running" is an excellent place to start with it.


version: '3.8'

services:
  tempo:
    build:
      dockerfile: ./Dockerfile.tempo
      context: .
    container_name: tempo
    user: root # tempo needs write access to the tempo_data volume (see the chmod step after deploy)
    entrypoint: ["sh", "-c"]
    command: ["/tempo -config.file=/etc/tempo.yaml"]
    networks:
      grafana:
        aliases:
          - tempo
    restart: always
    logging:
      driver: json-file
      options:
        mode: non-blocking
        max-buffer-size: 500m
    volumes:
      - tempo_data:/var/tempo
    mem_limit: 1000m

  alloy-traces:
    build:
      dockerfile: ./Dockerfile.alloy.traces
      context: .
    container_name: alloy-traces
    entrypoint: ["/bin/alloy"]
    command: ["run","/etc/alloy/config.alloy","--storage.path=/var/lib/alloy/data"]
    restart: always
    logging:
      driver: json-file
      options:
        mode: non-blocking
        max-buffer-size: 500m
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    networks:
      grafana:
        aliases:
          - alloy-traces
    mem_limit: 1000m

volumes:
  tempo_data:
    name: "tempo_data"

networks:
  grafana:
    name: grafana
    external: true

Participating configs:

Dockerfile.tempo

FROM grafana/tempo:2.7.2
COPY infra/tf/modules/docker_stack/monitoring/tempo.yaml /etc/tempo.yaml

tempo.yaml

stream_over_http_enabled: true
server:
  http_listen_port: 3200
  log_level: info

query_frontend:
  search:
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09
    metadata_slo:
      duration_slo: 5s
      throughput_bytes_slo: 1.073741824e+09
  trace_by_id:
    duration_slo: 100ms
  metrics:
    max_duration: 200h
    query_backend_after: 5m
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09

distributor:
  receivers:
    otlp:
      protocols:
        http:
          endpoint: "tempo:4318"
        grpc:
          endpoint: "tempo:4317"

compactor:
  compaction:
    block_retention: 24h

metrics_generator:
  registry:
    external_labels:
      source: tempo
      cluster: docker-compose
  storage:
    path: /var/tempo/generator/wal
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true
  traces_storage:
    path: /var/tempo/generator/traces
  processor:
    local_blocks:
      filter_server_spans: false
      flush_to_storage: true

storage:
  trace:
    backend: local
    wal:
      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks

overrides:
  defaults:
    metrics_generator:
      processors: [service-graphs, span-metrics, local-blocks]
      generate_native_histograms: both

Dockerfile.alloy.traces

FROM grafana/alloy:v1.8.3
COPY infra/tf/modules/docker_stack/monitoring/cfg.traces.alloy /etc/alloy/config.alloy

cfg.traces.alloy

logging {
  level  = "info"
  format = "logfmt"
}

otelcol.receiver.otlp "receiver" {
  debug_metrics {
    disable_high_cardinality_metrics = true
  }

  grpc {
    endpoint = "0.0.0.0:4317"
  }
  http {
    endpoint = "0.0.0.0:4318"
  }
  output {
    metrics = [otelcol.processor.transform.default.input]
    logs    = [otelcol.processor.transform.default.input]
    traces  = [otelcol.processor.transform.default.input]
  }
}

otelcol.processor.transform "default" {
  error_mode = "ignore"

  trace_statements {
    context = "resource"
    statements = [
      `limit(attributes, 500, [])`,
      `truncate_all(attributes, 20480)`,
    ]
  }

  trace_statements {
    context = "span"
    statements = [
      `limit(attributes, 500, [])`,
      `truncate_all(attributes, 20480)`,
    ]
  }
  output {
    metrics = [otelcol.processor.batch.default.input]
    logs    = [otelcol.processor.batch.default.input]
    traces  = [otelcol.processor.batch.default.input]
  }
}

otelcol.processor.batch "default" {
  output {
    metrics = [otelcol.exporter.prometheus.default.input]
    logs    = [otelcol.exporter.loki.default.input]
    traces  = [otelcol.exporter.otlphttp.default.input]
  }
}

otelcol.exporter.otlphttp "default" {
  client {
    endpoint = coalesce(sys.env("TEMPO_URL"),"http://tempo:4318/")
    tls {
        insecure             = true
        insecure_skip_verify = true
    }
  }
}

tracing {
    sampling_fraction = encoding.from_json(coalesce(sys.env("SAMPLING_FRACTION"),"1"))
    write_to = [otelcol.exporter.otlphttp.default.input]
}

otelcol.exporter.prometheus "default" {
  forward_to = [prometheus.remote_write.local.receiver]

  include_target_info = true
  include_scope_info = true
  resource_to_telemetry_conversion = true
}

prometheus.remote_write "local" {
  endpoint {
    url = coalesce(sys.env("PROMETHEUS_URL"),"http://prometheus:9090/api/v1/write")
  }
}

otelcol.exporter.loki "default" {
  forward_to = [loki.write.local.receiver]
}

loki.write "local" {
  endpoint {
    url = coalesce(sys.env("LOKI_URL"), "http://loki:3100/loki/api/v1/push")
    tenant_id = ""
  }
}

Proceed to apply the deployment to raise the tracing part of the stack (or use OpenTofu (Terraform) to raise everything together as modules from ./main.tf):

git clone --recurse-submodules https://github.com/darklab8/blog
cd blog/articles/article_detailed/article_20250609_grafana/code_examples

# if docker-compose way
export DOCKER_HOST=ssh://root@homelab
docker ps
docker compose -f docker-compose.tracing.yaml build
docker compose -f docker-compose.tracing.yaml up -d tempo
docker compose -f docker-compose.tracing.yaml up -d alloy-traces

# if opentofu way
tofu init
tofu apply

# after deploy, grant Tempo the rights it needs to persist data and initialize
chmod -R a+rw /var/lib/docker/volumes/tempo_data

Note

We presume "grafana" and "caddy" were raised in the previous part of the article, about Loki. If that did not happen, start there first: https://darklab8.github.io/blog/article_grafana_loki.html . Raising grafana and caddy is fairly simple: "docker compose up -d grafana ; docker compose up -d caddy ; chmod -R a+rw /var/lib/docker/volumes/grafana_data". The code and configs for raising them are in the same folder as docker-compose.tracing.yaml.

Demo application to test it

export DOCKER_HOST=ssh://root@homelab
docker compose -f docker-compose.app-traces.yaml build
docker compose -f docker-compose.app-traces.yaml run -it app-traces-go

with the following code deployed:

package main

import (
	"context"
	"errors"
	"fmt"
	"math/rand/v2"
	"time"

	"github.com/darklab8/go-typelog/otlp"
	"github.com/darklab8/go-utils/typelog"
	"go.opentelemetry.io/otel"
)

type WebEndpoint struct {
	pattern      string
	max_duration float64
	url          func() string
}

var WebEndpoints = []WebEndpoint{
	{
		pattern:      "/index.html",
		max_duration: 0.1,
		url:          func() string { return "/index.html" },
	},
	{
		pattern:      "/some_pattern1",
		max_duration: 1,
		url:          func() string { return "/some_pattern1" },
	},
	{
		pattern:      "/another_pattern",
		max_duration: 2,
		url:          func() string { return "/another_pattern" },
	},
	{
		pattern:      "/books/__book_id__",
		max_duration: 0.1,
		url:          func() string { return fmt.Sprintf("books/%d", rand.IntN(100)) },
	},
	{
		pattern:      "/books/__book_id__/page/__page_id__",
		max_duration: 0.2,
		url:          func() string { return fmt.Sprintf("books/%d/page/%d", rand.IntN(100), rand.IntN(1000)) },
	},
}

var (
	logger *typelog.Logger = typelog.NewLogger("go-demo-app")
	Tracer                 = otel.Tracer("go-demo-app")
)

// NestedAction demonstrates a child span: it shows up nested
// under the "web request" span in the trace view.
func NestedAction(ctx_span context.Context) {
	_, span := Tracer.Start(ctx_span, "nested action")
	defer span.End()
}

func doRun() {
	time_start := time.Now()
	fmt.Println("started run", time_start)
	ctx_span, span := Tracer.Start(context.Background(), "web request")
	defer span.End()

	time.Sleep(3 * time.Second)

	web_endpoint := WebEndpoints[rand.IntN(len(WebEndpoints))]
	duration := rand.Float64() * web_endpoint.max_duration
	pattern := web_endpoint.pattern
	logger.InfoCtx(ctx_span, "web request",
		typelog.String("url_pattern", pattern),
		typelog.Float64("duration", duration),
		typelog.String("url_path", web_endpoint.url()),
	)
	NestedAction(ctx_span)
	fmt.Println("fninished run", time.Now(), time.Since(time_start))
	time.Sleep(3 * time.Second)
}

func main() {
	fmt.Println("starting app-traces")
	ctx := context.Background()

	// How to initialize tracing itself is copy-pasted from https://opentelemetry.io/docs/languages/go/getting-started/
	// into https://github.com/darklab8/go-utils/blob/master/otlp/setup.go
	otelShutdown, err := otlp.SetupOTelSDK(ctx) // Set up OpenTelemetry.
	if err != nil {
		fmt.Println("error to initialize tracing, err=", err.Error())
	}
	defer func() { // Handle shutdown properly so nothing leaks.
		err = errors.Join(err, otelShutdown(context.Background()))
	}()
	fmt.Println("configured tracing")
	for {
		doRun()
		time.Sleep(30 * time.Second)
	}
}
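
The otlp.SetupOTelSDK helper follows the official getting-started guide linked in the comment above. For reference, a minimal sketch of such a setup with the stock OpenTelemetry Go SDK (the endpoint matches the alloy-traces receiver configured earlier; the package and function names here are illustrative, not the actual helper's code):

package otlpsetup

import (
	"context"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
	"go.opentelemetry.io/otel/sdk/resource"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
)

// SetupTracing builds an OTLP/HTTP exporter pointed at the alloy-traces
// container and installs a tracer provider as the global one.
// The returned function flushes buffered spans and shuts the provider down.
func SetupTracing(ctx context.Context) (func(context.Context) error, error) {
	exporter, err := otlptracehttp.New(ctx,
		otlptracehttp.WithEndpoint("alloy-traces:4318"), // host:port, without scheme
		otlptracehttp.WithInsecure(),                    // plain HTTP inside the docker network
	)
	if err != nil {
		return nil, err
	}
	tp := sdktrace.NewTracerProvider(
		sdktrace.WithBatcher(exporter),
		sdktrace.WithResource(resource.NewSchemaless(
			attribute.String("service.name", "go-demo-app"), // how Tempo will label our traces
		)),
	)
	otel.SetTracerProvider(tp)
	return tp.Shutdown, nil
}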

and we see in its logs that it is working:

> starting app-traces
> configured tracing
> started run 2026-04-27 01:33:20.263967654 +0000 UTC m=+0.004603667
> finished run 2026-04-27 01:33:23.264569558 +0000 UTC m=+3.005205571 3.000601995s

If everything is all right and no errors appear at any level in the chain (application → Alloy → Tempo), you will then see traces in your tracing Drilldown interface!
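
In Grafana Explore you can also query them with TraceQL, for example { resource.service.name = "go-demo-app" } (assuming your SDK setup sets service.name to go-demo-app, as the setup sketch above does).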

In the real world, tracing is most useful for backend applications, and it is best to turn it on by default for all network-interacting libraries by writing some kind of middleware. Then it will be able to tell you that your issue is in a specific SQL request, an Elasticsearch query, or a specific HTTP request. And since the tracing is distributed, the trace will also show how the workload behaves inside the called service! For HTTP in Go this can look like the sketch below.
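
For HTTP in Go, such middleware already exists in the otelhttp contrib package; a minimal sketch (the route and port are illustrative):

package main

import (
	"log"
	"net/http"

	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
)

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("/books/", func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte("ok")) // each request handled here gets its own server span
	})

	// otelhttp.NewHandler wraps the whole mux: every incoming request starts
	// a span and extracts the incoming trace context, so distributed traces
	// continue across services.
	handler := otelhttp.NewHandler(mux, "http.server")

	// For outgoing requests, wrap the client transport the same way so the
	// trace context is injected into request headers:
	client := &http.Client{Transport: otelhttp.NewTransport(http.DefaultTransport)}
	_ = client

	log.Fatal(http.ListenAndServe(":8080", handler))
}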

Note

In pet projects the usefulness of tracing is honestly very limited, since it is doubtful that a pet project has network interactions long enough to require tracing to debug them; your database will rarely reach a load level that needs debugging. In pet projects you will benefit more from logging and metrics monitoring. At real backend work, though, tracing is the most useful system to have; I would dare to say it is potentially even more useful than any other type of monitoring.