diff --git a/internal/cri/opts/spec_opts.go b/internal/cri/opts/spec_opts.go index 9dda38af3aff..148026a526bb 100644 --- a/internal/cri/opts/spec_opts.go +++ b/internal/cri/opts/spec_opts.go @@ -301,6 +301,23 @@ func WithoutNamespace(t runtimespec.LinuxNamespaceType) oci.SpecOpts { } } +// WithNamespacePath updates namespace with existing path. +func WithNamespacePath(t runtimespec.LinuxNamespaceType, nsPath string) oci.SpecOpts { + return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { + if s.Linux == nil { + return fmt.Errorf("Linux spec is required") + } + + for i, ns := range s.Linux.Namespaces { + if ns.Type == t { + s.Linux.Namespaces[i].Path = nsPath + return nil + } + } + return fmt.Errorf("no such namespace %s", t) + } +} + // WithPodNamespaces sets the pod namespaces for the container func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32, uids, gids []runtimespec.LinuxIDMapping) oci.SpecOpts { namespaces := config.GetNamespaceOptions() diff --git a/internal/cri/server/podsandbox/helpers_linux.go b/internal/cri/server/podsandbox/helpers_linux.go index 2d9ea718a91c..5f16e6781d01 100644 --- a/internal/cri/server/podsandbox/helpers_linux.go +++ b/internal/cri/server/podsandbox/helpers_linux.go @@ -40,6 +40,7 @@ import ( "github.com/containerd/containerd/v2/core/snapshots" "github.com/containerd/containerd/v2/internal/cri/seutil" "github.com/containerd/containerd/v2/pkg/seccomp" + "github.com/containerd/containerd/v2/pkg/sys" ) const ( @@ -88,6 +89,50 @@ func (c *Controller) getSandboxDevShm(id string) string { return filepath.Join(c.getVolatileSandboxRootDir(id), "shm") } +// getSandboxPinnedNamespaces returns the pinned namespaces directory inside the +// sandbox state directory. +func (c *Controller) getSandboxPinnedNamespaces(id string) string { + return filepath.Join(c.getVolatileSandboxRootDir(id), "pinned-namespaces") +} + +// getSandboxPinnedUserNamespace returns the pinned user namespace file. +func (c *Controller) getSandboxPinnedUserNamespace(id string) string { + return filepath.Join(c.getSandboxPinnedNamespaces(id), "user") +} + +// pinUserNamespace persists user namespace in namespace filesystem. +func (c *Controller) pinUserNamespace(sandboxID string, netnsPath string) error { + nsPath := c.getSandboxPinnedUserNamespace(sandboxID) + + baseDir := filepath.Dir(nsPath) + if err := os.MkdirAll(baseDir, 0755); err != nil { + return fmt.Errorf("failed to init pinned-namespaces directory %s: %w", baseDir, err) + } + + emptyFd, err := os.OpenFile(nsPath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666) + if err != nil { + return fmt.Errorf("failed to create empty file %s: %w", nsPath, err) + } + emptyFd.Close() + + netnsFd, err := os.Open(netnsPath) + if err != nil { + return fmt.Errorf("failed to open netns(%s): %w", netnsPath, err) + } + defer netnsFd.Close() + + usernsFd, err := sys.GetUsernsForNamespace(netnsFd.Fd()) + if err != nil { + return fmt.Errorf("failed to get user namespace for netns(%s): %w", netnsPath, err) + } + defer usernsFd.Close() + + if err = unix.Mount(usernsFd.Name(), nsPath, "none", unix.MS_BIND, ""); err != nil { + return fmt.Errorf("failed to bind mount ns src: %v at %s: %w", usernsFd.Name(), nsPath, err) + } + return nil +} + func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) { var labels []string diff --git a/internal/cri/server/podsandbox/sandbox_run.go b/internal/cri/server/podsandbox/sandbox_run.go index 389689703c10..702b568019fc 100644 --- a/internal/cri/server/podsandbox/sandbox_run.go +++ b/internal/cri/server/podsandbox/sandbox_run.go @@ -95,6 +95,39 @@ func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.Controll labels["oci_runtime_type"] = ociRuntime.Type + // Create sandbox container root directories. + sandboxRootDir := c.getSandboxRootDir(id) + if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil { + return cin, fmt.Errorf("failed to create sandbox root directory %q: %w", + sandboxRootDir, err) + } + defer func() { + if retErr != nil && cleanupErr == nil { + // Cleanup the sandbox root directory. + if cleanupErr = c.os.RemoveAll(sandboxRootDir); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove sandbox root directory %q", + sandboxRootDir) + } + } + }() + + volatileSandboxRootDir := c.getVolatileSandboxRootDir(id) + if err := c.os.MkdirAll(volatileSandboxRootDir, 0755); err != nil { + return cin, fmt.Errorf("failed to create volatile sandbox root directory %q: %w", + volatileSandboxRootDir, err) + } + defer func() { + if retErr != nil && cleanupErr == nil { + deferCtx, deferCancel := ctrdutil.DeferContext() + defer deferCancel() + // Cleanup the volatile sandbox root directory. + if cleanupErr = ensureRemoveAll(deferCtx, volatileSandboxRootDir); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove volatile sandbox root directory %q", + volatileSandboxRootDir) + } + } + }() + // Create sandbox container. // NOTE: sandboxContainerSpec SHOULD NOT have side // effect, e.g. accessing/creating files, so that we can test @@ -164,37 +197,6 @@ func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.Controll } }() - // Create sandbox container root directories. - sandboxRootDir := c.getSandboxRootDir(id) - if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil { - return cin, fmt.Errorf("failed to create sandbox root directory %q: %w", - sandboxRootDir, err) - } - defer func() { - if retErr != nil && cleanupErr == nil { - // Cleanup the sandbox root directory. - if cleanupErr = c.os.RemoveAll(sandboxRootDir); cleanupErr != nil { - log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove sandbox root directory %q", - sandboxRootDir) - } - } - }() - - volatileSandboxRootDir := c.getVolatileSandboxRootDir(id) - if err := c.os.MkdirAll(volatileSandboxRootDir, 0755); err != nil { - return cin, fmt.Errorf("failed to create volatile sandbox root directory %q: %w", - volatileSandboxRootDir, err) - } - defer func() { - if retErr != nil && cleanupErr == nil { - // Cleanup the volatile sandbox root directory. - if cleanupErr = c.os.RemoveAll(volatileSandboxRootDir); cleanupErr != nil { - log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove volatile sandbox root directory %q", - volatileSandboxRootDir) - } - } - }() - // Setup files required for the sandbox. if err = c.setupSandboxFiles(id, config); err != nil { return cin, fmt.Errorf("failed to setup sandbox files: %w", err) diff --git a/internal/cri/server/podsandbox/sandbox_run_linux.go b/internal/cri/server/podsandbox/sandbox_run_linux.go index 832df4dc390d..b9ad3047d142 100644 --- a/internal/cri/server/podsandbox/sandbox_run_linux.go +++ b/internal/cri/server/podsandbox/sandbox_run_linux.go @@ -103,6 +103,11 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC case runtime.NamespaceMode_POD: specOpts = append(specOpts, oci.WithUserNamespace(uids, gids)) usernsEnabled = true + + if err := c.pinUserNamespace(id, nsPath); err != nil { + return nil, fmt.Errorf("failed to pin user namespace: %w", err) + } + specOpts = append(specOpts, customopts.WithNamespacePath(runtimespec.UserNamespace, c.getSandboxPinnedUserNamespace(id))) default: return nil, fmt.Errorf("unsupported user namespace mode: %q", mode) } diff --git a/internal/cri/server/podsandbox/sandbox_run_linux_test.go b/internal/cri/server/podsandbox/sandbox_run_linux_test.go index 8935a9e31861..3166365044bb 100644 --- a/internal/cri/server/podsandbox/sandbox_run_linux_test.go +++ b/internal/cri/server/podsandbox/sandbox_run_linux_test.go @@ -17,9 +17,11 @@ package podsandbox import ( + "context" "os" "path/filepath" "strconv" + "syscall" "testing" "github.com/moby/sys/userns" @@ -32,11 +34,15 @@ import ( v1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/containerd/containerd/v2/internal/cri/annotations" + criconfig "github.com/containerd/containerd/v2/internal/cri/config" "github.com/containerd/containerd/v2/internal/cri/opts" + "github.com/containerd/containerd/v2/pkg/netns" ostesting "github.com/containerd/containerd/v2/pkg/os/testing" + "github.com/containerd/containerd/v2/pkg/sys" + "github.com/containerd/containerd/v2/pkg/testutil" ) -func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { +func getRunPodSandboxTestData(criCfg criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { config := &runtime.PodSandboxConfig{ Metadata: &runtime.PodSandboxMetadata{ Name: "test-name", @@ -94,7 +100,7 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf } assert.Contains(t, spec.Mounts, runtimespec.Mount{ - Source: "/test/root/sandboxes/test-id/resolv.conf", + Source: filepath.Join(criCfg.RootDir, "sandboxes/test-id/resolv.conf"), Destination: resolvConfPath, Type: "bind", Options: []string{"rbind", "ro", "nosuid", "nodev", "noexec"}, @@ -105,8 +111,10 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf } func TestLinuxSandboxContainerSpec(t *testing.T) { + testutil.RequiresRoot(t) + testID := "test-id" - nsPath := "test-cni" + idMap := runtime.IDMapping{ HostId: 1000, ContainerId: 1000, @@ -118,15 +126,30 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { Size: 10, } + netnsBasedir := t.TempDir() + t.Cleanup(func() { + assert.NoError(t, unmountRecursive(context.Background(), netnsBasedir)) + }) + + var netNs *netns.NetNS + uerr := sys.UnshareAfterEnterUserns("1000:1000:10", "1000:1000:10", syscall.CLONE_NEWNET, func(pid int) error { + var err error + netNs, err = netns.NewNetNSFromPID(netnsBasedir, uint32(pid)) + return err + }) + require.NoError(t, uerr) + + nsPath := netNs.GetPath() + for _, test := range []struct { desc string configChange func(*runtime.PodSandboxConfig) - specCheck func(*testing.T, *runtimespec.Spec) + specCheck func(*testing.T, *Controller, *runtimespec.Spec) expectErr bool }{ { desc: "spec should reflect original config", - specCheck: func(t *testing.T, spec *runtimespec.Spec) { + specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { // runtime spec should have expected namespaces enabled by default. require.NotNil(t, spec.Linux) assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ @@ -162,10 +185,11 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { }, } }, - specCheck: func(t *testing.T, spec *runtimespec.Spec) { + specCheck: func(t *testing.T, c *Controller, spec *runtimespec.Spec) { require.NotNil(t, spec.Linux) assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ Type: runtimespec.UserNamespace, + Path: filepath.Join(c.config.StateDir, "sandboxes", testID, "pinned-namespaces", "user"), }) assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") }, @@ -181,7 +205,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { }, } }, - specCheck: func(t *testing.T, spec *runtimespec.Spec) { + specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { // runtime spec should disable expected namespaces in host mode. require.NotNil(t, spec.Linux) assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ @@ -213,10 +237,11 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { }, } }, - specCheck: func(t *testing.T, spec *runtimespec.Spec) { + specCheck: func(t *testing.T, c *Controller, spec *runtimespec.Spec) { require.NotNil(t, spec.Linux) assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ Type: runtimespec.UserNamespace, + Path: filepath.Join(c.config.StateDir, "sandboxes", testID, "pinned-namespaces", "user"), }) require.Equal(t, spec.Linux.UIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) require.Equal(t, spec.Linux.GIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) @@ -314,7 +339,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { SupplementalGroups: []int64{1111, 2222}, } }, - specCheck: func(t *testing.T, spec *runtimespec.Spec) { + specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { require.NotNil(t, spec.Process) assert.Contains(t, spec.Process.User.AdditionalGids, uint32(1111)) assert.Contains(t, spec.Process.User.AdditionalGids, uint32(2222)) @@ -328,7 +353,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { "net.ipv4.ping_group_range": "1 1000", } }, - specCheck: func(t *testing.T, spec *runtimespec.Spec) { + specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { require.NotNil(t, spec.Process) assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "500") assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "1 1000") @@ -344,7 +369,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { MemoryLimitInBytes: 1024, } }, - specCheck: func(t *testing.T, spec *runtimespec.Spec) { + specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { value, ok := spec.Annotations[annotations.SandboxCPUPeriod] assert.True(t, ok) assert.EqualValues(t, strconv.FormatInt(100, 10), value) @@ -365,7 +390,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { }, { desc: "sandbox sizing annotations should not be set if LinuxContainerResources were not provided", - specCheck: func(t *testing.T, spec *runtimespec.Spec) { + specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { _, ok := spec.Annotations[annotations.SandboxCPUPeriod] assert.False(t, ok) _, ok = spec.Annotations[annotations.SandboxCPUQuota] @@ -381,7 +406,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { configChange: func(c *runtime.PodSandboxConfig) { c.Linux.Resources = &v1.LinuxContainerResources{} }, - specCheck: func(t *testing.T, spec *runtimespec.Spec) { + specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) { value, ok := spec.Annotations[annotations.SandboxCPUPeriod] assert.True(t, ok) assert.EqualValues(t, "0", value) @@ -400,9 +425,17 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { test := test t.Run(test.desc, func(t *testing.T) { c := newControllerService() + c.config.RootDir = t.TempDir() + c.config.StateDir = t.TempDir() + + defer func() { + assert.NoError(t, unmountRecursive(context.Background(), c.config.StateDir)) + }() + c.config.EnableUnprivilegedICMP = true c.config.EnableUnprivilegedPorts = true - config, imageConfig, specCheck := getRunPodSandboxTestData() + + config, imageConfig, specCheck := getRunPodSandboxTestData(c.config) if test.configChange != nil { test.configChange(config) } @@ -416,7 +449,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { assert.NotNil(t, spec) specCheck(t, testID, spec) if test.specCheck != nil { - test.specCheck(t, spec) + test.specCheck(t, c, spec) } }) } @@ -757,6 +790,3 @@ options timeout:1 }) } } - -// TODO(random-liu): [P1] Add unit test for different error cases to make sure -// the function cleans up on error properly. diff --git a/internal/cri/server/podsandbox/sandbox_run_other_test.go b/internal/cri/server/podsandbox/sandbox_run_other_test.go index 196955f7fb6b..772b9b846561 100644 --- a/internal/cri/server/podsandbox/sandbox_run_other_test.go +++ b/internal/cri/server/podsandbox/sandbox_run_other_test.go @@ -21,12 +21,13 @@ package podsandbox import ( "testing" + criconfig "github.com/containerd/containerd/v2/internal/cri/config" imagespec "github.com/opencontainers/image-spec/specs-go/v1" runtimespec "github.com/opencontainers/runtime-spec/specs-go" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" ) -func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { +func getRunPodSandboxTestData(_ criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { config := &runtime.PodSandboxConfig{} imageConfig := &imagespec.ImageConfig{} specCheck := func(t *testing.T, id string, spec *runtimespec.Spec) { diff --git a/internal/cri/server/podsandbox/sandbox_run_test.go b/internal/cri/server/podsandbox/sandbox_run_test.go index f74a24dc33fb..74dd779aee6b 100644 --- a/internal/cri/server/podsandbox/sandbox_run_test.go +++ b/internal/cri/server/podsandbox/sandbox_run_test.go @@ -27,8 +27,14 @@ import ( runtime "k8s.io/cri-api/pkg/apis/runtime/v1" sandboxstore "github.com/containerd/containerd/v2/internal/cri/store/sandbox" + "github.com/containerd/containerd/v2/pkg/testutil" ) +func TestEmpty(t *testing.T) { + // NOTE: It's used to register -test.root for all platforms. + testutil.RequiresRoot(t) +} + func TestSandboxContainerSpec(t *testing.T) { switch goruntime.GOOS { case "darwin": @@ -97,7 +103,7 @@ func TestSandboxContainerSpec(t *testing.T) { test := test t.Run(test.desc, func(t *testing.T) { c := newControllerService() - config, imageConfig, specCheck := getRunPodSandboxTestData() + config, imageConfig, specCheck := getRunPodSandboxTestData(c.config) if test.configChange != nil { test.configChange(config) } @@ -154,7 +160,9 @@ func TestTypeurlMarshalUnmarshalSandboxMeta(t *testing.T) { Name: "sandbox_1", NetNSPath: "/home/cloud", } - meta.Config, _, _ = getRunPodSandboxTestData() + + c := newControllerService() + meta.Config, _, _ = getRunPodSandboxTestData(c.config) if test.configChange != nil { test.configChange(meta.Config) } diff --git a/internal/cri/server/podsandbox/sandbox_run_windows_test.go b/internal/cri/server/podsandbox/sandbox_run_windows_test.go index d31ec68355e9..18e86d395539 100644 --- a/internal/cri/server/podsandbox/sandbox_run_windows_test.go +++ b/internal/cri/server/podsandbox/sandbox_run_windows_test.go @@ -25,10 +25,11 @@ import ( runtime "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/containerd/containerd/v2/internal/cri/annotations" + criconfig "github.com/containerd/containerd/v2/internal/cri/config" "github.com/containerd/containerd/v2/internal/cri/opts" ) -func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { +func getRunPodSandboxTestData(criCfg criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) { config := &runtime.PodSandboxConfig{ Metadata: &runtime.PodSandboxMetadata{ Name: "test-name", @@ -100,7 +101,7 @@ func TestSandboxWindowsNetworkNamespace(t *testing.T) { nsPath := "test-cni" c := newControllerService() - config, imageConfig, specCheck := getRunPodSandboxTestData() + config, imageConfig, specCheck := getRunPodSandboxTestData(c.config) spec, err := c.sandboxContainerSpec(testID, config, imageConfig, nsPath, nil) assert.NoError(t, err) assert.NotNil(t, spec) diff --git a/internal/cri/server/sandbox_run.go b/internal/cri/server/sandbox_run.go index 2ab43bb39620..5c049b9f883c 100644 --- a/internal/cri/server/sandbox_run.go +++ b/internal/cri/server/sandbox_run.go @@ -167,18 +167,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox } // Setup the network namespace if host networking wasn't requested. - if !hostNetwork(config) && !userNsEnabled { - // XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too. - // We can't move this to a function, as the defer calls need to be executed if other - // errors are returned in this function. So, we would need more refactors to move - // this code to a function and the idea was to not change the current code for - // !userNsEnabled case, therefore doing it would defeat the purpose. - // - // The difference between the cases is the use of netns.NewNetNS() vs - // netns.NewNetNSFromPID(). - // - // To simplify this, in the future, we should just remove this case (podNetwork && - // !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled). + if !hostNetwork(config) { span.AddEvent("setup pod network") netStart := time.Now() // If it is not in host network namespace then create a namespace and set the sandbox @@ -189,7 +178,13 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox if c.config.NetNSMountsUnderStateDir { netnsMountDir = filepath.Join(c.config.StateDir, "netns") } - sandbox.NetNS, err = netns.NewNetNS(netnsMountDir) + + if !userNsEnabled { + sandbox.NetNS, err = netns.NewNetNS(netnsMountDir) + } else { + usernsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions() + sandbox.NetNS, err = c.setupNetnsWithinUserns(netnsMountDir, usernsOpts) + } if err != nil { return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) } @@ -284,92 +279,6 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err) } - if !hostNetwork(config) && userNsEnabled { - // If userns is enabled, then the netns was created by the OCI runtime - // on controller.Start(). The OCI runtime needs to create the netns - // because, if userns is in use, the netns needs to be owned by the - // userns. So, let the OCI runtime just handle this for us. - // If the netns is not owned by the userns several problems will happen. - // For instance, the container will lack permission (even if - // capabilities are present) to modify the netns or, even worse, the OCI - // runtime will fail to mount sysfs: - // https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164 - // - // Note we do this after controller.Start(), as before that we - // can't get the PID for the sandbox that we need for the netns. - // Doing a controller.Status() call before that fails (can't - // find the sandbox) so we can't get the PID. - netStart := time.Now() - - // If it is not in host network namespace then create a namespace and set the sandbox - // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network - // namespaces. If the pod is in host network namespace then both are empty and should not - // be used. - var netnsMountDir = "/var/run/netns" - if c.config.NetNSMountsUnderStateDir { - netnsMountDir = filepath.Join(c.config.StateDir, "netns") - } - - sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, ctrl.Pid) - if err != nil { - return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) - } - - // Update network namespace in the store, which is used to generate the container's spec - sandbox.NetNSPath = sandbox.NetNS.GetPath() - defer func() { - // Remove the network namespace only if all the resource cleanup is done - if retErr != nil && cleanupErr == nil { - if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil { - log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id) - return - } - sandbox.NetNSPath = "" - } - }() - - if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil { - return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err) - } - // Save sandbox metadata to store - if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil { - return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err) - } - - // Define this defer to teardownPodNetwork prior to the setupPodNetwork function call. - // This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource - // creation functions. - defer func() { - // Remove the network namespace only if all the resource cleanup is done. - if retErr != nil && cleanupErr == nil { - deferCtx, deferCancel := util.DeferContext() - defer deferCancel() - // Teardown network if an error is returned. - if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil { - log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id) - } - - } - }() - - // Setup network for sandbox. - // Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524) - // rely on the assumption that CRI shim will not be querying the network namespace to check the - // network states such as IP. - // In future runtime implementation should avoid relying on CRI shim implementation details. - // In this case however caching the IP will add a subtle performance enhancement by avoiding - // calls to network namespace of the pod to query the IP of the veth interface on every - // SandboxStatus request. - if err := c.setupPodNetwork(ctx, &sandbox); err != nil { - return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err) - } - sandboxCreateNetworkTimer.UpdateSince(netStart) - - span.AddEvent("finished pod network setup", - tracing.Attribute("pod.network.setup.duration", time.Since(netStart).String()), - ) - } - // TODO: get rid of this. sandbox object should no longer have Container field. if ociRuntime.Sandboxer == string(criconfig.ModePodSandbox) { container, err := c.client.LoadContainer(ctx, id) diff --git a/internal/cri/server/sandbox_run_linux.go b/internal/cri/server/sandbox_run_linux.go index 810bcc7d3c61..b671d4e82d2c 100644 --- a/internal/cri/server/sandbox_run_linux.go +++ b/internal/cri/server/sandbox_run_linux.go @@ -18,9 +18,14 @@ package server import ( "fmt" + "syscall" + + "github.com/containerd/containerd/v2/pkg/netns" + "github.com/containerd/containerd/v2/pkg/sys" "github.com/containernetworking/plugins/pkg/ns" "github.com/vishvananda/netlink" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" ) func (c *criService) bringUpLoopback(netns string) error { @@ -35,3 +40,44 @@ func (c *criService) bringUpLoopback(netns string) error { } return nil } + +func (c *criService) setupNetnsWithinUserns(netnsMountDir string, opt *runtime.UserNamespace) (*netns.NetNS, error) { + if opt.GetMode() != runtime.NamespaceMode_POD { + return nil, fmt.Errorf("required pod-level user namespace setting") + } + + uidMaps := opt.GetUids() + if len(uidMaps) != 1 { + return nil, fmt.Errorf("required only one uid mapping, but got %d uid mapping(s)", len(uidMaps)) + } + if uidMaps[0] == nil { + return nil, fmt.Errorf("required only one uid mapping, but got empty uid mapping") + } + + gidMaps := opt.GetGids() + if len(gidMaps) != 1 { + return nil, fmt.Errorf("required only one gid mapping, but got %d gid mapping(s)", len(gidMaps)) + } + if gidMaps[0] == nil { + return nil, fmt.Errorf("required only one gid mapping, but got empty gid mapping") + } + + var netNs *netns.NetNS + var err error + uerr := sys.UnshareAfterEnterUserns( + fmt.Sprintf("%d:%d:%d", uidMaps[0].ContainerId, uidMaps[0].HostId, uidMaps[0].Length), + fmt.Sprintf("%d:%d:%d", gidMaps[0].ContainerId, gidMaps[0].HostId, gidMaps[0].Length), + syscall.CLONE_NEWNET, + func(pid int) error { + netNs, err = netns.NewNetNSFromPID(netnsMountDir, uint32(pid)) + if err != nil { + return fmt.Errorf("failed to mount netns from pid %d: %w", pid, err) + } + return nil + }, + ) + if uerr != nil { + return nil, uerr + } + return netNs, nil +} diff --git a/internal/cri/server/sandbox_run_other.go b/internal/cri/server/sandbox_run_other.go index 8f1a2b209051..7404770122e6 100644 --- a/internal/cri/server/sandbox_run_other.go +++ b/internal/cri/server/sandbox_run_other.go @@ -18,6 +18,17 @@ package server +import ( + "fmt" + + "github.com/containerd/containerd/v2/pkg/netns" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + func (c *criService) bringUpLoopback(string) error { return nil } + +func (c *criService) setupNetnsWithinUserns(basedir string, cfg *runtime.UserNamespace) (*netns.NetNS, error) { + return nil, fmt.Errorf("unsupported to setup netns within userns on unix platform") +} diff --git a/internal/cri/server/sandbox_run_windows.go b/internal/cri/server/sandbox_run_windows.go index 99ec7b2fd374..4b7f0ea64225 100644 --- a/internal/cri/server/sandbox_run_windows.go +++ b/internal/cri/server/sandbox_run_windows.go @@ -16,6 +16,17 @@ package server +import ( + "fmt" + + "github.com/containerd/containerd/v2/pkg/netns" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + func (c *criService) bringUpLoopback(string) error { return nil } + +func (c *criService) setupNetnsWithinUserns(basedir string, cfg *runtime.UserNamespace) (*netns.NetNS, error) { + return nil, fmt.Errorf("unsupported to setup netns within userns on windows platform") +}