diff options
Diffstat (limited to 'libgo/go/syscall/exec_linux.go')
-rw-r--r-- | libgo/go/syscall/exec_linux.go | 194 |
1 files changed, 151 insertions, 43 deletions
diff --git a/libgo/go/syscall/exec_linux.go b/libgo/go/syscall/exec_linux.go index f035e6b..215ecc2 100644 --- a/libgo/go/syscall/exec_linux.go +++ b/libgo/go/syscall/exec_linux.go @@ -19,6 +19,9 @@ import ( //sysnb rawMount(source *byte, target *byte, fstype *byte, flags uintptr, data *byte) (err Errno) //mount(source *byte, target *byte, fstype *byte, flags _C_long, data *byte) _C_int +//sysnb rawOpenat(dirfd int, pathname *byte, flags int, perm uint32) (fd int, err Errno) +//openat(dirfd _C_int, pathname *byte, flags _C_int, perm Mode_t) _C_int + // SysProcIDMap holds Container ID to Host ID mappings used for User Namespaces in Linux. // See user_namespaces(7). type SysProcIDMap struct { @@ -94,18 +97,44 @@ func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr if sys.UidMappings != nil || sys.GidMappings != nil { Close(p[0]) - err := writeUidGidMappings(pid, sys) var err2 Errno - if err != nil { - err2 = err.(Errno) + // uid/gid mappings will be written after fork and unshare(2) for user + // namespaces. + if sys.Unshareflags&CLONE_NEWUSER == 0 { + if err := writeUidGidMappings(pid, sys); err != nil { + err2 = err.(Errno) + } } - RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) + raw_write(p[1], (*byte)(unsafe.Pointer(&err2)), int(unsafe.Sizeof(err2))) Close(p[1]) } return pid, 0 } +const _LINUX_CAPABILITY_VERSION_3 = 0x20080522 + +type capHeader struct { + version uint32 + pid int32 +} + +type capData struct { + effective uint32 + permitted uint32 + inheritable uint32 +} +type caps struct { + hdr capHeader + data [2]capData +} + +// See CAP_TO_INDEX in linux/capability.h: +func capToIndex(cap uintptr) uintptr { return cap >> 5 } + +// See CAP_TO_MASK in linux/capability.h: +func capToMask(cap uintptr) uint32 { return 1 << uint(cap&31) } + // forkAndExecInChild1 implements the body of forkAndExecInChild up to // the parent's post-fork path. This is a separate function so we can // separate the child's and parent's stack frames if we're using @@ -131,12 +160,33 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att // Declare all variables at top in case any // declarations require heap allocation (e.g., err1). var ( - err2 Errno - nextfd int - i int - r2 int + err2 Errno + nextfd int + i int + r2 int + caps caps + fd1 int + puid, psetgroups, pgid []byte + uidmap, setgroups, gidmap []byte ) + if sys.UidMappings != nil { + puid = []byte("/proc/self/uid_map\000") + uidmap = formatIDMappings(sys.UidMappings) + } + + if sys.GidMappings != nil { + psetgroups = []byte("/proc/self/setgroups\000") + pgid = []byte("/proc/self/gid_map\000") + + if sys.GidMappingsEnableSetgroups { + setgroups = []byte("allow\000") + } else { + setgroups = []byte("deny\000") + } + gidmap = formatIDMappings(sys.GidMappings) + } + // Record parent PID so child can test if it has died. ppid := raw_getpid() @@ -187,7 +237,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att // Enable the "keep capabilities" flag to set ambient capabilities later. if len(sys.AmbientCaps) > 0 { - _, _, err1 = RawSyscall6(SYS_PRCTL, PR_SET_KEEPCAPS, 1, 0, 0, 0, 0) + _, err1 = raw_prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) if err1 != 0 { goto childerror } @@ -195,14 +245,14 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att // Wait for User ID/Group ID mappings to be written. if sys.UidMappings != nil || sys.GidMappings != nil { - if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(p[1]), 0, 0); err1 != 0 { + if err1 = raw_close(p[1]); err1 != 0 { goto childerror } - r1, _, err1 = RawSyscall(SYS_READ, uintptr(p[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) + r2, err1 = raw_read(p[0], (*byte)(unsafe.Pointer(&err2)), int(unsafe.Sizeof(err2))) if err1 != 0 { goto childerror } - if r1 != unsafe.Sizeof(err2) { + if r2 != int(unsafe.Sizeof(err2)) { err1 = EINVAL goto childerror } @@ -248,6 +298,46 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att if err1 != 0 { goto childerror } + + if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.GidMappings != nil { + dirfd := int(_AT_FDCWD) + if fd1, err1 = rawOpenat(dirfd, &psetgroups[0], O_WRONLY, 0); err1 != 0 { + goto childerror + } + _, err1 = raw_write(fd1, &setgroups[0], len(setgroups)) + if err1 != 0 { + goto childerror + } + if err1 = raw_close(fd1); err1 != 0 { + goto childerror + } + + if fd1, err1 = rawOpenat(dirfd, &pgid[0], O_WRONLY, 0); err1 != 0 { + goto childerror + } + _, err1 = raw_write(fd1, &gidmap[0], len(gidmap)) + if err1 != 0 { + goto childerror + } + if err1 = raw_close(fd1); err1 != 0 { + goto childerror + } + } + + if sys.Unshareflags&CLONE_NEWUSER != 0 && sys.UidMappings != nil { + dirfd := int(_AT_FDCWD) + if fd1, err1 = rawOpenat(dirfd, &puid[0], O_WRONLY, 0); err1 != 0 { + goto childerror + } + _, err1 = raw_write(fd1, &uidmap[0], len(uidmap)) + if err1 != 0 { + goto childerror + } + if err1 = raw_close(fd1); err1 != 0 { + goto childerror + } + } + // The unshare system call in Linux doesn't unshare mount points // mounted with --shared. Systemd mounts / with --shared. For a // long discussion of the pros and cons of this see debian bug 739593. @@ -294,11 +384,32 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att } } - for _, c := range sys.AmbientCaps { - _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0) - if err1 != 0 { + if len(sys.AmbientCaps) != 0 { + // Ambient capabilities were added in the 4.3 kernel, + // so it is safe to always use _LINUX_CAPABILITY_VERSION_3. + caps.hdr.version = _LINUX_CAPABILITY_VERSION_3 + + if _, _, err1 = RawSyscall(SYS_CAPGET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 { + goto childerror + } + + for _, c := range sys.AmbientCaps { + // Add the c capability to the permitted and inheritable capability mask, + // otherwise we will not be able to add it to the ambient capability mask. + caps.data[capToIndex(c)].permitted |= capToMask(c) + caps.data[capToIndex(c)].inheritable |= capToMask(c) + } + + if _, _, err1 = RawSyscall(SYS_CAPSET, uintptr(unsafe.Pointer(&caps.hdr)), uintptr(unsafe.Pointer(&caps.data[0])), 0); err1 != 0 { goto childerror } + + for _, c := range sys.AmbientCaps { + _, _, err1 = RawSyscall6(SYS_PRCTL, PR_CAP_AMBIENT, uintptr(PR_CAP_AMBIENT_RAISE), c, 0, 0, 0) + if err1 != 0 { + goto childerror + } + } } // Chdir @@ -329,6 +440,22 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att } } + // Detach fd 0 from tty + if sys.Noctty { + _, err1 = raw_ioctl(0, TIOCNOTTY, 0) + if err1 != 0 { + goto childerror + } + } + + // Set the controlling TTY to Ctty + if sys.Setctty { + _, err1 = raw_ioctl(sys.Ctty, TIOCSCTTY, 1) + if err1 != 0 { + goto childerror + } + } + // Pass 1: look for fd[i] < i and move those up above len(fd) // so that pass 2 won't stomp on an fd it needs later. if pipe < nextfd { @@ -386,22 +513,6 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att raw_close(i) } - // Detach fd 0 from tty - if sys.Noctty { - _, err1 = raw_ioctl(0, TIOCNOTTY, 0) - if err1 != 0 { - goto childerror - } - } - - // Set the controlling TTY to Ctty - if sys.Setctty { - _, err1 = raw_ioctl(sys.Ctty, TIOCSCTTY, sys.Ctty) - if err1 != 0 { - goto childerror - } - } - // Enable tracing if requested. // Do this right before exec so that we don't unnecessarily trace the runtime // setting up after the fork. See issue #21428. @@ -440,25 +551,22 @@ func forkExecPipe(p []int) (err error) { return } -// writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path. -func writeIDMappings(path string, idMap []SysProcIDMap) error { - fd, err := Open(path, O_RDWR, 0) - if err != nil { - return err - } - - data := "" +func formatIDMappings(idMap []SysProcIDMap) []byte { + var data []byte for _, im := range idMap { - data = data + itoa(im.ContainerID) + " " + itoa(im.HostID) + " " + itoa(im.Size) + "\n" + data = append(data, []byte(itoa(im.ContainerID)+" "+itoa(im.HostID)+" "+itoa(im.Size)+"\n")...) } + return data +} - bytes, err := ByteSliceFromString(data) +// writeIDMappings writes the user namespace User ID or Group ID mappings to the specified path. +func writeIDMappings(path string, idMap []SysProcIDMap) error { + fd, err := Open(path, O_RDWR, 0) if err != nil { - Close(fd) return err } - if _, err := Write(fd, bytes); err != nil { + if _, err := Write(fd, formatIDMappings(idMap)); err != nil { Close(fd) return err } |