There are quite some use cases where users run into the current limit for
{g,u}id mappings. Consider a user requesting us to map everything but 999, and
1001 for a given range of 1000000000 with a sub{g,u}id layout of:
some-user:100000:1000000000
some-user:999:1
some-user:1000:1
some-user:1001:1
some-user:1002:1
This translates to:
MAPPING-TYPE | CONTAINER | HOST | RANGE |
-------------|-----------|---------|-----------|
uid | 999 | 999 | 1 |
uid | 1001 | 1001 | 1 |
uid | 0 | 1000000 | 999 |
uid | 1000 | 1001000 | 1 |
uid | 1002 | 1001002 | 999998998 |
------------------------------------------------
gid | 999 | 999 | 1 |
gid | 1001 | 1001 | 1 |
gid | 0 | 1000000 | 999 |
gid | 1000 | 1001000 | 1 |
gid | 1002 | 1001002 | 999998998 |
which is already the current limit.
As discussed at LPC simply bumping the number of limits is not going to work
since this would mean that struct uid_gid_map won't fit into a single cache-line
anymore thereby regressing performance for the base-cases. The same problem
seems to arise when using a single pointer. So the idea is to use
struct uid_gid_extent {
u32 first;
u32 lower_first;
u32 count;
};
struct uid_gid_map { /* 64 bytes -- 1 cache line */
u32 nr_extents;
union {
struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
struct {
struct uid_gid_extent *forward;
struct uid_gid_extent *reverse;
};
};
};
For the base cases we will only use the struct uid_gid_extent extent member. If
we go over UID_GID_MAP_MAX_BASE_EXTENTS mappings we perform a single 4k
kmalloc() which means we can have a maximum of 340 mappings
(340 * size(struct uid_gid_extent) = 4080). For the latter case we use two
pointers "forward" and "reverse". The forward pointer points to an array sorted
by "first" and the reverse pointer points to an array sorted by "lower_first".
We can then perform binary search on those arrays.
Performance Testing:
When Eric introduced the extent-based struct uid_gid_map approach he measured
the performanc impact of his idmap changes:
> My benchmark consisted of going to single user mode where nothing else was
> running. On an ext4 filesystem opening 1,000,000 files and looping through all
> of the files 1000 times and calling fstat on the individuals files. This was
> to ensure I was benchmarking stat times where the inodes were in the kernels
> cache, but the inode values were not in the processors cache. My results:
> v3.4-rc1: ~= 156ns (unmodified v3.4-rc1 with user namespace support disabled)
> v3.4-rc1-userns-: ~= 155ns (v3.4-rc1 with my user namespace patches and user namespace support disabled)
> v3.4-rc1-userns+: ~= 164ns (v3.4-rc1 with my user namespace patches and user namespace support enabled)
I used an identical approach on my laptop. Here's a thorough description of what
I did. I built a 4.14.0-rc4 mainline kernel with my new idmap patches applied. I
booted into single user mode and used an ext4 filesystem to open/create
1,000,000 files. Then I looped through all of the files calling fstat() on each
of them 1000 times and calculated the mean fstat() time for a single file. (The
test program can be found below.)
Here are the results. For fun, I compared the first version of my patch which
scaled linearly with the new version of the patch:
| # MAPPINGS | PATCH-V1 | PATCH-NEW |
|--------------|------------|-----------|
| 0 mappings | 158 ns | 158 ns |
| 1 mappings | 164 ns | 157 ns |
| 2 mappings | 170 ns | 158 ns |
| 3 mappings | 175 ns | 161 ns |
| 5 mappings | 187 ns | 165 ns |
| 10 mappings | 218 ns | 199 ns |
| 50 mappings | 528 ns | 218 ns |
| 100 mappings | 980 ns | 229 ns |
| 200 mappings | 1880 ns | 239 ns |
| 300 mappings | 2760 ns | 240 ns |
| 340 mappings | not tested | 248 ns |
Here's the test program I used. I asked Eric what he did and this is a more
"advanced" implementation of the idea. It's pretty straight-forward:
#define __GNU_SOURCE
#define __STDC_FORMAT_MACROS
#include <errno.h>
#include <dirent.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
int main(int argc, char *argv[])
{
int ret;
size_t i, k;
int fd[1000000];
int times[1000];
char pathname[4096];
struct stat st;
struct timeval t1, t2;
uint64_t time_in_mcs;
uint64_t sum = 0;
if (argc != 2) {
fprintf(stderr, "Please specify a directory where to create "
"the test files\n");
exit(EXIT_FAILURE);
}
for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) {
sprintf(pathname, "%s/idmap_test_%zu", argv[1], i);
fd[i]= open(pathname, O_RDWR | O_CREAT, S_IXUSR | S_IXGRP | S_IXOTH);
if (fd[i] < 0) {
ssize_t j;
for (j = i; j >= 0; j--)
close(fd[j]);
exit(EXIT_FAILURE);
}
}
for (k = 0; k < 1000; k++) {
ret = gettimeofday(&t1, NULL);
if (ret < 0)
goto close_all;
for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) {
ret = fstat(fd[i], &st);
if (ret < 0)
goto close_all;
}
ret = gettimeofday(&t2, NULL);
if (ret < 0)
goto close_all;
time_in_mcs = (1000000 * t2.tv_sec + t2.tv_usec) -
(1000000 * t1.tv_sec + t1.tv_usec);
printf("Total time in micro seconds: %" PRIu64 "\n",
time_in_mcs);
printf("Total time in nanoseconds: %" PRIu64 "\n",
time_in_mcs * 1000);
printf("Time per file in nanoseconds: %" PRIu64 "\n",
(time_in_mcs * 1000) / 1000000);
times[k] = (time_in_mcs * 1000) / 1000000;
}
close_all:
for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++)
close(fd[i]);
if (ret < 0)
exit(EXIT_FAILURE);
for (k = 0; k < 1000; k++) {
sum += times[k];
}
printf("Mean time per file in nanoseconds: %" PRIu64 "\n", sum / 1000);
exit(EXIT_SUCCESS);;
}
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
CC: Serge Hallyn <serge@hallyn.com>
CC: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
175 lines
4.3 KiB
C
175 lines
4.3 KiB
C
#ifndef _LINUX_USER_NAMESPACE_H
|
|
#define _LINUX_USER_NAMESPACE_H
|
|
|
|
#include <linux/kref.h>
|
|
#include <linux/nsproxy.h>
|
|
#include <linux/ns_common.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/rwsem.h>
|
|
#include <linux/sysctl.h>
|
|
#include <linux/err.h>
|
|
|
|
#define UID_GID_MAP_MAX_BASE_EXTENTS 5
|
|
#define UID_GID_MAP_MAX_EXTENTS 340
|
|
|
|
struct uid_gid_extent {
|
|
u32 first;
|
|
u32 lower_first;
|
|
u32 count;
|
|
};
|
|
|
|
struct uid_gid_map { /* 64 bytes -- 1 cache line */
|
|
u32 nr_extents;
|
|
union {
|
|
struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
|
|
struct {
|
|
struct uid_gid_extent *forward;
|
|
struct uid_gid_extent *reverse;
|
|
};
|
|
};
|
|
};
|
|
|
|
#define USERNS_SETGROUPS_ALLOWED 1UL
|
|
|
|
#define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
|
|
|
|
struct ucounts;
|
|
|
|
enum ucount_type {
|
|
UCOUNT_USER_NAMESPACES,
|
|
UCOUNT_PID_NAMESPACES,
|
|
UCOUNT_UTS_NAMESPACES,
|
|
UCOUNT_IPC_NAMESPACES,
|
|
UCOUNT_NET_NAMESPACES,
|
|
UCOUNT_MNT_NAMESPACES,
|
|
UCOUNT_CGROUP_NAMESPACES,
|
|
#ifdef CONFIG_INOTIFY_USER
|
|
UCOUNT_INOTIFY_INSTANCES,
|
|
UCOUNT_INOTIFY_WATCHES,
|
|
#endif
|
|
UCOUNT_COUNTS,
|
|
};
|
|
|
|
struct user_namespace {
|
|
struct uid_gid_map uid_map;
|
|
struct uid_gid_map gid_map;
|
|
struct uid_gid_map projid_map;
|
|
atomic_t count;
|
|
struct user_namespace *parent;
|
|
int level;
|
|
kuid_t owner;
|
|
kgid_t group;
|
|
struct ns_common ns;
|
|
unsigned long flags;
|
|
|
|
/* Register of per-UID persistent keyrings for this namespace */
|
|
#ifdef CONFIG_PERSISTENT_KEYRINGS
|
|
struct key *persistent_keyring_register;
|
|
struct rw_semaphore persistent_keyring_register_sem;
|
|
#endif
|
|
struct work_struct work;
|
|
#ifdef CONFIG_SYSCTL
|
|
struct ctl_table_set set;
|
|
struct ctl_table_header *sysctls;
|
|
#endif
|
|
struct ucounts *ucounts;
|
|
int ucount_max[UCOUNT_COUNTS];
|
|
} __randomize_layout;
|
|
|
|
struct ucounts {
|
|
struct hlist_node node;
|
|
struct user_namespace *ns;
|
|
kuid_t uid;
|
|
int count;
|
|
atomic_t ucount[UCOUNT_COUNTS];
|
|
};
|
|
|
|
extern struct user_namespace init_user_ns;
|
|
|
|
bool setup_userns_sysctls(struct user_namespace *ns);
|
|
void retire_userns_sysctls(struct user_namespace *ns);
|
|
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
|
|
void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
|
|
|
|
#ifdef CONFIG_USER_NS
|
|
|
|
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
|
|
{
|
|
if (ns)
|
|
atomic_inc(&ns->count);
|
|
return ns;
|
|
}
|
|
|
|
extern int create_user_ns(struct cred *new);
|
|
extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
|
|
extern void __put_user_ns(struct user_namespace *ns);
|
|
|
|
static inline void put_user_ns(struct user_namespace *ns)
|
|
{
|
|
if (ns && atomic_dec_and_test(&ns->count))
|
|
__put_user_ns(ns);
|
|
}
|
|
|
|
struct seq_operations;
|
|
extern const struct seq_operations proc_uid_seq_operations;
|
|
extern const struct seq_operations proc_gid_seq_operations;
|
|
extern const struct seq_operations proc_projid_seq_operations;
|
|
extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *);
|
|
extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *);
|
|
extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *);
|
|
extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
|
|
extern int proc_setgroups_show(struct seq_file *m, void *v);
|
|
extern bool userns_may_setgroups(const struct user_namespace *ns);
|
|
extern bool in_userns(const struct user_namespace *ancestor,
|
|
const struct user_namespace *child);
|
|
extern bool current_in_userns(const struct user_namespace *target_ns);
|
|
struct ns_common *ns_get_owner(struct ns_common *ns);
|
|
#else
|
|
|
|
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
|
|
{
|
|
return &init_user_ns;
|
|
}
|
|
|
|
static inline int create_user_ns(struct cred *new)
|
|
{
|
|
return -EINVAL;
|
|
}
|
|
|
|
static inline int unshare_userns(unsigned long unshare_flags,
|
|
struct cred **new_cred)
|
|
{
|
|
if (unshare_flags & CLONE_NEWUSER)
|
|
return -EINVAL;
|
|
return 0;
|
|
}
|
|
|
|
static inline void put_user_ns(struct user_namespace *ns)
|
|
{
|
|
}
|
|
|
|
static inline bool userns_may_setgroups(const struct user_namespace *ns)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
static inline bool in_userns(const struct user_namespace *ancestor,
|
|
const struct user_namespace *child)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
static inline bool current_in_userns(const struct user_namespace *target_ns)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
static inline struct ns_common *ns_get_owner(struct ns_common *ns)
|
|
{
|
|
return ERR_PTR(-EPERM);
|
|
}
|
|
#endif
|
|
|
|
#endif /* _LINUX_USER_H */
|