package link import ( "bytes" "errors" "fmt" "os" "path/filepath" "runtime" "strconv" "strings" "unsafe" "github.com/cilium/ebpf" "github.com/cilium/ebpf/asm" "github.com/cilium/ebpf/internal" "github.com/cilium/ebpf/internal/sys" "github.com/cilium/ebpf/internal/unix" ) // Getting the terminology right is usually the hardest part. For posterity and // for staying sane during implementation: // // - trace event: Representation of a kernel runtime hook. Filesystem entries // under /events. Can be tracepoints (static), kprobes or uprobes. // Can be instantiated into perf events (see below). // - tracepoint: A predetermined hook point in the kernel. Exposed as trace // events in (sub)directories under /events. Cannot be closed or // removed, they are static. // - k(ret)probe: Ephemeral trace events based on entry or exit points of // exported kernel symbols. kprobe-based (tracefs) trace events can be // created system-wide by writing to the /kprobe_events file, or // they can be scoped to the current process by creating PMU perf events. // - u(ret)probe: Ephemeral trace events based on user provides ELF binaries // and offsets. uprobe-based (tracefs) trace events can be // created system-wide by writing to the /uprobe_events file, or // they can be scoped to the current process by creating PMU perf events. // - perf event: An object instantiated based on an existing trace event or // kernel symbol. Referred to by fd in userspace. // Exactly one eBPF program can be attached to a perf event. Multiple perf // events can be created from a single trace event. Closing a perf event // stops any further invocations of the attached eBPF program. var ( tracefsPath = "/sys/kernel/debug/tracing" errInvalidInput = errors.New("invalid input") ) const ( perfAllThreads = -1 ) type perfEventType uint8 const ( tracepointEvent perfEventType = iota kprobeEvent kretprobeEvent uprobeEvent uretprobeEvent ) // A perfEvent represents a perf event kernel object. Exactly one eBPF program // can be attached to it. It is created based on a tracefs trace event or a // Performance Monitoring Unit (PMU). type perfEvent struct { // The event type determines the types of programs that can be attached. typ perfEventType // Group and name of the tracepoint/kprobe/uprobe. group string name string // PMU event ID read from sysfs. Valid IDs are non-zero. pmuID uint64 // ID of the trace event read from tracefs. Valid IDs are non-zero. tracefsID uint64 // User provided arbitrary value. cookie uint64 // This is the perf event FD. fd *sys.FD } func (pe *perfEvent) Close() error { if err := pe.fd.Close(); err != nil { return fmt.Errorf("closing perf event fd: %w", err) } switch pe.typ { case kprobeEvent, kretprobeEvent: // Clean up kprobe tracefs entry. if pe.tracefsID != 0 { return closeTraceFSProbeEvent(kprobeType, pe.group, pe.name) } case uprobeEvent, uretprobeEvent: // Clean up uprobe tracefs entry. if pe.tracefsID != 0 { return closeTraceFSProbeEvent(uprobeType, pe.group, pe.name) } case tracepointEvent: // Tracepoint trace events don't hold any extra resources. return nil } return nil } // perfEventLink represents a bpf perf link. type perfEventLink struct { RawLink pe *perfEvent } func (pl *perfEventLink) isLink() {} // Pinning requires the underlying perf event FD to stay open. // // | PerfEvent FD | BpfLink FD | Works | // |--------------|------------|-------| // | Open | Open | Yes | // | Closed | Open | No | // | Open | Closed | No (Pin() -> EINVAL) | // | Closed | Closed | No (Pin() -> EINVAL) | // // There is currently no pretty way to recover the perf event FD // when loading a pinned link, so leave as not supported for now. func (pl *perfEventLink) Pin(string) error { return fmt.Errorf("perf event link pin: %w", ErrNotSupported) } func (pl *perfEventLink) Unpin() error { return fmt.Errorf("perf event link unpin: %w", ErrNotSupported) } func (pl *perfEventLink) Close() error { if err := pl.pe.Close(); err != nil { return fmt.Errorf("perf event link close: %w", err) } return pl.fd.Close() } func (pl *perfEventLink) Update(prog *ebpf.Program) error { return fmt.Errorf("perf event link update: %w", ErrNotSupported) } // perfEventIoctl implements Link and handles the perf event lifecycle // via ioctl(). type perfEventIoctl struct { *perfEvent } func (pi *perfEventIoctl) isLink() {} // Since 4.15 (e87c6bc3852b "bpf: permit multiple bpf attachments for a single perf event"), // calling PERF_EVENT_IOC_SET_BPF appends the given program to a prog_array // owned by the perf event, which means multiple programs can be attached // simultaneously. // // Before 4.15, calling PERF_EVENT_IOC_SET_BPF more than once on a perf event // returns EEXIST. // // Detaching a program from a perf event is currently not possible, so a // program replacement mechanism cannot be implemented for perf events. func (pi *perfEventIoctl) Update(prog *ebpf.Program) error { return fmt.Errorf("perf event ioctl update: %w", ErrNotSupported) } func (pi *perfEventIoctl) Pin(string) error { return fmt.Errorf("perf event ioctl pin: %w", ErrNotSupported) } func (pi *perfEventIoctl) Unpin() error { return fmt.Errorf("perf event ioctl unpin: %w", ErrNotSupported) } func (pi *perfEventIoctl) Info() (*Info, error) { return nil, fmt.Errorf("perf event ioctl info: %w", ErrNotSupported) } // attach the given eBPF prog to the perf event stored in pe. // pe must contain a valid perf event fd. // prog's type must match the program type stored in pe. func attachPerfEvent(pe *perfEvent, prog *ebpf.Program) (Link, error) { if prog == nil { return nil, errors.New("cannot attach a nil program") } if prog.FD() < 0 { return nil, fmt.Errorf("invalid program: %w", sys.ErrClosedFd) } switch pe.typ { case kprobeEvent, kretprobeEvent, uprobeEvent, uretprobeEvent: if t := prog.Type(); t != ebpf.Kprobe { return nil, fmt.Errorf("invalid program type (expected %s): %s", ebpf.Kprobe, t) } case tracepointEvent: if t := prog.Type(); t != ebpf.TracePoint { return nil, fmt.Errorf("invalid program type (expected %s): %s", ebpf.TracePoint, t) } default: return nil, fmt.Errorf("unknown perf event type: %d", pe.typ) } if err := haveBPFLinkPerfEvent(); err == nil { return attachPerfEventLink(pe, prog) } return attachPerfEventIoctl(pe, prog) } func attachPerfEventIoctl(pe *perfEvent, prog *ebpf.Program) (*perfEventIoctl, error) { if pe.cookie != 0 { return nil, fmt.Errorf("cookies are not supported: %w", ErrNotSupported) } // Assign the eBPF program to the perf event. err := unix.IoctlSetInt(pe.fd.Int(), unix.PERF_EVENT_IOC_SET_BPF, prog.FD()) if err != nil { return nil, fmt.Errorf("setting perf event bpf program: %w", err) } // PERF_EVENT_IOC_ENABLE and _DISABLE ignore their given values. if err := unix.IoctlSetInt(pe.fd.Int(), unix.PERF_EVENT_IOC_ENABLE, 0); err != nil { return nil, fmt.Errorf("enable perf event: %s", err) } pi := &perfEventIoctl{pe} // Close the perf event when its reference is lost to avoid leaking system resources. runtime.SetFinalizer(pi, (*perfEventIoctl).Close) return pi, nil } // Use the bpf api to attach the perf event (BPF_LINK_TYPE_PERF_EVENT, 5.15+). // // https://github.com/torvalds/linux/commit/b89fbfbb854c9afc3047e8273cc3a694650b802e func attachPerfEventLink(pe *perfEvent, prog *ebpf.Program) (*perfEventLink, error) { fd, err := sys.LinkCreatePerfEvent(&sys.LinkCreatePerfEventAttr{ ProgFd: uint32(prog.FD()), TargetFd: pe.fd.Uint(), AttachType: sys.BPF_PERF_EVENT, BpfCookie: pe.cookie, }) if err != nil { return nil, fmt.Errorf("cannot create bpf perf link: %v", err) } pl := &perfEventLink{RawLink{fd: fd}, pe} // Close the perf event when its reference is lost to avoid leaking system resources. runtime.SetFinalizer(pl, (*perfEventLink).Close) return pl, nil } // unsafeStringPtr returns an unsafe.Pointer to a NUL-terminated copy of str. func unsafeStringPtr(str string) (unsafe.Pointer, error) { p, err := unix.BytePtrFromString(str) if err != nil { return nil, err } return unsafe.Pointer(p), nil } // getTraceEventID reads a trace event's ID from tracefs given its group and name. // The kernel requires group and name to be alphanumeric or underscore. // // name automatically has its invalid symbols converted to underscores so the caller // can pass a raw symbol name, e.g. a kernel symbol containing dots. func getTraceEventID(group, name string) (uint64, error) { name = sanitizeSymbol(name) tid, err := uint64FromFile(tracefsPath, "events", group, name, "id") if errors.Is(err, os.ErrNotExist) { return 0, fmt.Errorf("trace event %s/%s: %w", group, name, os.ErrNotExist) } if err != nil { return 0, fmt.Errorf("reading trace event ID of %s/%s: %w", group, name, err) } return tid, nil } // getPMUEventType reads a Performance Monitoring Unit's type (numeric identifier) // from /sys/bus/event_source/devices//type. // // Returns ErrNotSupported if the pmu type is not supported. func getPMUEventType(typ probeType) (uint64, error) { et, err := uint64FromFile("/sys/bus/event_source/devices", typ.String(), "type") if errors.Is(err, os.ErrNotExist) { return 0, fmt.Errorf("pmu type %s: %w", typ, ErrNotSupported) } if err != nil { return 0, fmt.Errorf("reading pmu type %s: %w", typ, err) } return et, nil } // openTracepointPerfEvent opens a tracepoint-type perf event. System-wide // [k,u]probes created by writing to /[k,u]probe_events are tracepoints // behind the scenes, and can be attached to using these perf events. func openTracepointPerfEvent(tid uint64, pid int) (*sys.FD, error) { attr := unix.PerfEventAttr{ Type: unix.PERF_TYPE_TRACEPOINT, Config: tid, Sample_type: unix.PERF_SAMPLE_RAW, Sample: 1, Wakeup: 1, } fd, err := unix.PerfEventOpen(&attr, pid, 0, -1, unix.PERF_FLAG_FD_CLOEXEC) if err != nil { return nil, fmt.Errorf("opening tracepoint perf event: %w", err) } return sys.NewFD(fd) } // uint64FromFile reads a uint64 from a file. All elements of path are sanitized // and joined onto base. Returns error if base no longer prefixes the path after // joining all components. func uint64FromFile(base string, path ...string) (uint64, error) { l := filepath.Join(path...) p := filepath.Join(base, l) if !strings.HasPrefix(p, base) { return 0, fmt.Errorf("path '%s' attempts to escape base path '%s': %w", l, base, errInvalidInput) } data, err := os.ReadFile(p) if err != nil { return 0, fmt.Errorf("reading file %s: %w", p, err) } et := bytes.TrimSpace(data) return strconv.ParseUint(string(et), 10, 64) } // Probe BPF perf link. // // https://elixir.bootlin.com/linux/v5.16.8/source/kernel/bpf/syscall.c#L4307 // https://github.com/torvalds/linux/commit/b89fbfbb854c9afc3047e8273cc3a694650b802e var haveBPFLinkPerfEvent = internal.FeatureTest("bpf_link_perf_event", "5.15", func() error { prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ Name: "probe_bpf_perf_link", Type: ebpf.Kprobe, Instructions: asm.Instructions{ asm.Mov.Imm(asm.R0, 0), asm.Return(), }, License: "MIT", }) if err != nil { return err } defer prog.Close() _, err = sys.LinkCreatePerfEvent(&sys.LinkCreatePerfEventAttr{ ProgFd: uint32(prog.FD()), AttachType: sys.BPF_PERF_EVENT, }) if errors.Is(err, unix.EINVAL) { return internal.ErrNotSupported } if errors.Is(err, unix.EBADF) { return nil } return err }) // isValidTraceID implements the equivalent of a regex match // against "^[a-zA-Z_][0-9a-zA-Z_]*$". // // Trace event groups, names and kernel symbols must adhere to this set // of characters. Non-empty, first character must not be a number, all // characters must be alphanumeric or underscore. func isValidTraceID(s string) bool { if len(s) < 1 { return false } for i, c := range []byte(s) { switch { case c >= 'a' && c <= 'z': case c >= 'A' && c <= 'Z': case c == '_': case i > 0 && c >= '0' && c <= '9': default: return false } } return true }