Merge tag 'seccomp-v4.14-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/kees...

author Linus Torvalds <[email protected]>

Sat, 23 Sep 2017 02:16:41 +0000 (16:16 -1000)

committer Linus Torvalds <[email protected]>

Sat, 23 Sep 2017 02:16:41 +0000 (16:16 -1000)
author Linus Torvalds <[email protected]>
Sat, 23 Sep 2017 02:16:41 +0000 (16:16 -1000)
committer Linus Torvalds <[email protected]>
Sat, 23 Sep 2017 02:16:41 +0000 (16:16 -1000)
diff --combined Documentation/networking/filter.txt

index 789b74dbe1d96af46fae8a3504a242eaa0b11c3c,73aa0f12156d12a70c4bf7a6f6903a7d8ae0e25b..87814859cfc21c5c6a64c1dfb83a1d6cff596731
--- 1/Documentation/networking/filter.txt
--- 2/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@@ -45,7 -45,7 +45,7 @@@ in many more places. There's xt_bpf fo
   qdisc layer, SECCOMP-BPF (SECure COMPuting [1]), and lots of other places
   such as team driver, PTP code, etc where BPF is being used.
   
- - [1] Documentation/prctl/seccomp_filter.txt
+ + [1] Documentation/userspace-api/seccomp_filter.rst
   
   Original BPF paper:
   
@@@ -337,7 -337,7 +337,7 @@@ Examples for low-level BPF
     jeq #14, good           /* __NR_rt_sigprocmask */
     jeq #13, good           /* __NR_rt_sigaction */
     jeq #35, good           /* __NR_nanosleep */
-   bad: ret #0             /* SECCOMP_RET_KILL */
+   bad: ret #0             /* SECCOMP_RET_KILL_THREAD */
     good: ret #0x7fff0000   /* SECCOMP_RET_ALLOW */
   
   The above example code can be placed into a file (here called "foo"), and
@@@ -596,8 -596,8 +596,8 @@@ skb pointer). All constraints and restr
   before a conversion to the new layout is being done behind the scenes!
   
   Currently, the classic BPF format is being used for JITing on most 32-bit
- -architectures, whereas x86-64, aarch64, s390x, powerpc64, sparc64 perform JIT
- -compilation from eBPF instruction set.
+ +architectures, whereas x86-64, aarch64, s390x, powerpc64, sparc64, arm32 perform
+ +JIT compilation from eBPF instruction set.
   
   Some core changes of the new internal format:
   
@@@ -793,7 -793,7 +793,7 @@@
       bpf_exit
   
     After the call the registers R1-R5 contain junk values and cannot be read.
- -  In the future an eBPF verifier can be used to validate internal BPF programs.
+ +  An in-kernel eBPF verifier is used to validate internal BPF programs.
   
   Also in the new design, eBPF is limited to 4096 insns, which means that any
   program will terminate quickly and will only call a fixed number of kernel
@@@ -906,10 -906,6 +906,10 @@@ If BPF_CLASS(code) == BPF_JMP, BPF_OP(c
     BPF_JSGE  0x70  /* eBPF only: signed '>=' */
     BPF_CALL  0x80  /* eBPF only: function call */
     BPF_EXIT  0x90  /* eBPF only: function return */
+ +  BPF_JLT   0xa0  /* eBPF only: unsigned '<' */
+ +  BPF_JLE   0xb0  /* eBPF only: unsigned '<=' */
+ +  BPF_JSLT  0xc0  /* eBPF only: signed '<' */
+ +  BPF_JSLE  0xd0  /* eBPF only: signed '<=' */
   
   So BPF_ADD | BPF_X | BPF_ALU means 32-bit addition in both classic BPF
   and eBPF. There are only two registers in classic BPF, so it means A += X.
@@@ -1021,7 -1017,7 +1021,7 @@@ At the start of the program the registe
   and has type PTR_TO_CTX.
   If verifier sees an insn that does R2=R1, then R2 has now type
   PTR_TO_CTX as well and can be used on the right hand side of expression.
- -If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=UNKNOWN_VALUE,
+ +If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=SCALAR_VALUE,
   since addition of two valid pointers makes invalid pointer.
   (In 'secure' mode verifier will reject any type of pointer arithmetic to make
   sure that kernel addresses don't leak to unprivileged users)
@@@ -1043,7 -1039,7 +1043,7 @@@ is a correct program. If there was R1 i
   been rejected.
   
   load/store instructions are allowed only with registers of valid types, which
- -are PTR_TO_CTX, PTR_TO_MAP, FRAME_PTR. They are bounds and alignment checked.
+ +are PTR_TO_CTX, PTR_TO_MAP, PTR_TO_STACK. They are bounds and alignment checked.
   For example:
    bpf_mov R1 = 1
    bpf_mov R2 = 2
@@@ -1062,7 -1058,7 +1062,7 @@@ intends to load a word from address R6 
   If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know
   that offset 8 of size 4 bytes can be accessed for reading, otherwise
   the verifier will reject the program.
- -If R6=FRAME_PTR, then access should be aligned and be within
+ +If R6=PTR_TO_STACK, then access should be aligned and be within
   stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8,
   so it will fail verification, since it's out of bounds.
   
@@@ -1073,7 -1069,7 +1073,7 @@@ For example
     bpf_ld R0 = *(u32 *)(R10 - 4)
     bpf_exit
   is invalid program.
- -Though R10 is correct read-only register and has type FRAME_PTR
+ +Though R10 is correct read-only register and has type PTR_TO_STACK
   and R10 - 4 is within stack bounds, there were no stores into that location.
   
   Pointer register spill/fill is tracked as well, since four (R6-R9)
@@@ -1098,71 -1094,6 +1098,71 @@@ all use cases
   
   See details of eBPF verifier in kernel/bpf/verifier.c
   
+ +Register value tracking
+ +-----------------------
+ +In order to determine the safety of an eBPF program, the verifier must track
+ +the range of possible values in each register and also in each stack slot.
+ +This is done with 'struct bpf_reg_state', defined in include/linux/
+ +bpf_verifier.h, which unifies tracking of scalar and pointer values.  Each
+ +register state has a type, which is either NOT_INIT (the register has not been
+ +written to), SCALAR_VALUE (some value which is not usable as a pointer), or a
+ +pointer type.  The types of pointers describe their base, as follows:
+ +    PTR_TO_CTX          Pointer to bpf_context.
+ +    CONST_PTR_TO_MAP    Pointer to struct bpf_map.  "Const" because arithmetic
+ +                        on these pointers is forbidden.
+ +    PTR_TO_MAP_VALUE    Pointer to the value stored in a map element.
+ +    PTR_TO_MAP_VALUE_OR_NULL
+ +                        Either a pointer to a map value, or NULL; map accesses
+ +                        (see section 'eBPF maps', below) return this type,
+ +                        which becomes a PTR_TO_MAP_VALUE when checked != NULL.
+ +                        Arithmetic on these pointers is forbidden.
+ +    PTR_TO_STACK        Frame pointer.
+ +    PTR_TO_PACKET       skb->data.
+ +    PTR_TO_PACKET_END   skb->data + headlen; arithmetic forbidden.
+ +However, a pointer may be offset from this base (as a result of pointer
+ +arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable
+ +offset'.  The former is used when an exactly-known value (e.g. an immediate
+ +operand) is added to a pointer, while the latter is used for values which are
+ +not exactly known.  The variable offset is also used in SCALAR_VALUEs, to track
+ +the range of possible values in the register.
+ +The verifier's knowledge about the variable offset consists of:
+ +* minimum and maximum values as unsigned
+ +* minimum and maximum values as signed
+ +* knowledge of the values of individual bits, in the form of a 'tnum': a u64
+ +'mask' and a u64 'value'.  1s in the mask represent bits whose value is unknown;
+ +1s in the value represent bits known to be 1.  Bits known to be 0 have 0 in both
+ +mask and value; no bit should ever be 1 in both.  For example, if a byte is read
+ +into a register from memory, the register's top 56 bits are known zero, while
+ +the low 8 are unknown - which is represented as the tnum (0x0; 0xff).  If we
+ +then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0;
+ +0x1ff), because of potential carries.
+ +Besides arithmetic, the register state can also be updated by conditional
+ +branches.  For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch
+ +it will have a umin_value (unsigned minimum value) of 9, whereas in the 'false'
+ +branch it will have a umax_value of 8.  A signed compare (with BPF_JSGT or
+ +BPF_JSGE) would instead update the signed minimum/maximum values.  Information
+ +from the signed and unsigned bounds can be combined; for instance if a value is
+ +first tested < 8 and then tested s> 4, the verifier will conclude that the value
+ +is also > 4 and s< 8, since the bounds prevent crossing the sign boundary.
+ +PTR_TO_PACKETs with a variable offset part have an 'id', which is common to all
+ +pointers sharing that same variable offset.  This is important for packet range
+ +checks: after adding some variable to a packet pointer, if you then copy it to
+ +another register and (say) add a constant 4, both registers will share the same
+ +'id' but one will have a fixed offset of +4.  Then if it is bounds-checked and
+ +found to be less than a PTR_TO_PACKET_END, the other register is now known to
+ +have a safe range of at least 4 bytes.  See 'Direct packet access', below, for
+ +more on PTR_TO_PACKET ranges.
+ +The 'id' field is also used on PTR_TO_MAP_VALUE_OR_NULL, common to all copies of
+ +the pointer returned from a map lookup.  This means that when one copy is
+ +checked and found to be non-NULL, all copies can become PTR_TO_MAP_VALUEs.
+ +As well as range-checking, the tracked information is also used for enforcing
+ +alignment of pointer accesses.  For instance, on most systems the packet pointer
+ +is 2 bytes after a 4-byte alignment.  If a program adds 14 bytes to that to jump
+ +over the Ethernet header, then reads IHL and adds (IHL * 4), the resulting
+ +pointer will have a variable offset known to be 4n+2 for some n, so adding the 2
+ +bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses through
+ +that pointer are safe.
+ +
   Direct packet access
   --------------------
   In cls_bpf and act_bpf programs the verifier allows direct access to the packet
@@@ -1190,7 -1121,7 +1190,7 @@@ it now points to 'skb->data + 14' and a
   which is zero bytes.
   
   More complex packet access may look like:
- - R0=imm1 R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp
+ + R0=inv1 R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp
    6:  r0 = *(u8 *)(r3 +7) /* load 7th byte from the packet */
    7:  r4 = *(u8 *)(r3 +12)
    8:  r4 *= 14
@@@ -1204,31 -1135,26 +1204,31 @@@
   16:  r2 += 8
   17:  r1 = *(u32 *)(r1 +80) /* load skb->data_end */
   18:  if r2 > r1 goto pc+2
- - R0=inv56 R1=pkt_end R2=pkt(id=2,off=8,r=8) R3=pkt(id=2,off=0,r=8) R4=inv52 R5=pkt(id=0,off=14,r=14) R10=fp
+ + R0=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R1=pkt_end R2=pkt(id=2,off=8,r=8) R3=pkt(id=2,off=0,r=8) R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)) R5=pkt(id=0,off=14,r=14) R10=fp
   19:  r1 = *(u8 *)(r3 +4)
   The state of the register R3 is R3=pkt(id=2,off=0,r=8)
   id=2 means that two 'r3 += rX' instructions were seen, so r3 points to some
   offset within a packet and since the program author did
   'if (r3 + 8 > r1) goto err' at insn #18, the safe range is [R3, R3 + 8).
- -The verifier only allows 'add' operation on packet registers. Any other
- -operation will set the register state to 'unknown_value' and it won't be
+ +The verifier only allows 'add'/'sub' operations on packet registers. Any other
+ +operation will set the register state to 'SCALAR_VALUE' and it won't be
   available for direct packet access.
   Operation 'r3 += rX' may overflow and become less than original skb->data,
- -therefore the verifier has to prevent that. So it tracks the number of
- -upper zero bits in all 'uknown_value' registers, so when it sees
- -'r3 += rX' instruction and rX is more than 16-bit value, it will error as:
- -"cannot add integer value with N upper zero bits to ptr_to_packet"
+ +therefore the verifier has to prevent that.  So when it sees 'r3 += rX'
+ +instruction and rX is more than 16-bit value, any subsequent bounds-check of r3
+ +against skb->data_end will not give us 'range' information, so attempts to read
+ +through the pointer will give "invalid access to packet" error.
   Ex. after insn 'r4 = *(u8 *)(r3 +12)' (insn #7 above) the state of r4 is
- -R4=inv56 which means that upper 56 bits on the register are guaranteed
- -to be zero. After insn 'r4 *= 14' the state becomes R4=inv52, since
- -multiplying 8-bit value by constant 14 will keep upper 52 bits as zero.
- -Similarly 'r2 >>= 48' will make R2=inv48, since the shift is not sign
- -extending. This logic is implemented in evaluate_reg_alu() function.
+ +R4=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) which means that upper 56 bits
+ +of the register are guaranteed to be zero, and nothing is known about the lower
+ +8 bits. After insn 'r4 *= 14' the state becomes
+ +R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)), since multiplying an 8-bit
+ +value by constant 14 will keep upper 52 bits as zero, also the least significant
+ +bit will be zero as 14 is even.  Similarly 'r2 >>= 48' will make
+ +R2=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff)), since the shift is not sign
+ +extending.  This logic is implemented in adjust_reg_min_max_vals() function,
+ +which calls adjust_ptr_min_max_vals() for adding pointer to scalar (or vice
+ +versa) and adjust_scalar_min_max_vals() for operations on two scalars.
   
   The end result is that bpf program author can access packet directly
   using normal C code as:
@@@ -1288,22 -1214,6 +1288,22 @@@ The map is defined by
     . key size in bytes
     . value size in bytes
   
+ +Pruning
+ +-------
+ +The verifier does not actually walk all possible paths through the program.  For
+ +each new branch to analyse, the verifier looks at all the states it's previously
+ +been in when at this instruction.  If any of them contain the current state as a
+ +subset, the branch is 'pruned' - that is, the fact that the previous state was
+ +accepted implies the current state would be as well.  For instance, if in the
+ +previous state, r1 held a packet-pointer, and in the current state, r1 holds a
+ +packet-pointer with a range as long or longer and at least as strict an
+ +alignment, then r1 is safe.  Similarly, if r2 was NOT_INIT before then it can't
+ +have been used by any path from that point, so any value in r2 (including
+ +another NOT_INIT) is safe.  The implementation is in the function regsafe().
+ +Pruning considers not only the registers but also the stack (and any spilled
+ +registers it may hold).  They must all be safe for the branch to be pruned.
+ +This is implemented in states_equal().
+ +
   Understanding eBPF verifier messages
   ------------------------------------
   
diff --combined Documentation/sysctl/kernel.txt

index ce61d1fe08cacb5af99260241b058d139838029a,995c42cf86baac4330fbab79cb4bcca4411e9cc3..694968c7523cc28620c8ac51a28a33dc1b14336e
--- 1/Documentation/sysctl/kernel.txt
--- 2/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@@ -61,7 -61,6 +61,7 @@@ show up in /proc/sys/kernel
   - perf_cpu_time_max_percent
   - perf_event_paranoid
   - perf_event_max_stack
+ +- perf_event_mlock_kb
   - perf_event_max_contexts_per_stack
   - pid_max
   - powersave-nap               [ PPC only ]
@@@ -75,6 -74,7 +75,7 @@@
   - reboot-cmd                  [ SPARC only ]
   - rtsig-max
   - rtsig-nr
+ - seccomp/                    ==> Documentation/userspace-api/seccomp_filter.rst
   - sem
   - sem_next_id               [ sysv ipc ]
   - sg-big-buff                 [ generic SCSI device (sg) ]
@@@ -655,9 -655,7 +656,9 @@@ Controls use of the performance events 
   users (without CAP_SYS_ADMIN).  The default value is 2.
   
    -1: Allow use of (almost) all events by all users
- ->=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
+ +     Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
+ +>=0: Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN
+ +     Disallow raw tracepoint access by users without CAP_SYS_ADMIN
   >=1: Disallow CPU event access by users without CAP_SYS_ADMIN
   >=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
   
@@@ -676,14 -674,6 +677,14 @@@ The default value is 127
   
   ==============================================================
   
+ +perf_event_mlock_kb:
+ +
+ +Control size of per-cpu ring buffer not counted against mlock limit.
+ +
+ +The default value is 512 + 1 page
+ +
+ +==============================================================
+ +
   perf_event_max_contexts_per_stack:
   
   Controls maximum number of stack frame context entries for
diff --combined include/linux/audit.h

index 74d4d4e8e3db9f173611dc169a20955a7c89dc45,8c30f06d639d485dc360f83ab35a51d57e1bb77b..cb708eb8accc59d3dafee57c7c696647e80be471
--- 1/include/linux/audit.h
--- 2/include/linux/audit.h
+++ b/include/linux/audit.h
@@@ -314,11 -314,7 +314,7 @@@ void audit_core_dumps(long signr)
   
   static inline void audit_seccomp(unsigned long syscall, long signr, int code)
   {
-       if (!audit_enabled)
-               return;
- 
-       /* Force a record to be reported if a signal was delivered. */
-       if (signr || unlikely(!audit_dummy_context()))
+       if (audit_enabled && unlikely(!audit_dummy_context()))
                 __audit_seccomp(syscall, signr, code);
   }
   
@@@ -351,7 -347,7 +347,7 @@@ extern int __audit_socketcall(int nargs
   extern int __audit_sockaddr(int len, void *addr);
   extern void __audit_fd_pair(int fd1, int fd2);
   extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr);
- -extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout);
+ +extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout);
   extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification);
   extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
   extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
@@@ -412,7 -408,7 +408,7 @@@ static inline void audit_mq_open(int of
         if (unlikely(!audit_dummy_context()))
                 __audit_mq_open(oflag, mode, attr);
   }
- -static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout)
+ +static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout)
   {
         if (unlikely(!audit_dummy_context()))
                 __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout);
@@@ -549,7 -545,7 +545,7 @@@ static inline void audit_mq_open(int of
   { }
   static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len,
                                      unsigned int msg_prio,
- -                                   const struct timespec *abs_timeout)
+ +                                   const struct timespec64 *abs_timeout)
   { }
   static inline void audit_mq_notify(mqd_t mqdes,
                                    const struct sigevent *notification)
diff --combined tools/testing/selftests/seccomp/seccomp_bpf.c

index 4d6f92a9df6b4aaa0bec897b108bf7cafcae41ef,0683cd543cd5fb634c3fdc6a691d18827cffcfff..67c3e276430363754e8354dfb2bb7f2485a26c66
--- 1/tools/testing/selftests/seccomp/seccomp_bpf.c
--- 2/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@@ -68,17 -68,7 +68,7 @@@
   #define SECCOMP_MODE_FILTER 2
   #endif
   
- #ifndef SECCOMP_RET_KILL
- #define SECCOMP_RET_KILL        0x00000000U /* kill the task immediately */
- #define SECCOMP_RET_TRAP        0x00030000U /* disallow and force a SIGSYS */
- #define SECCOMP_RET_ERRNO       0x00050000U /* returns an errno */
- #define SECCOMP_RET_TRACE       0x7ff00000U /* pass to a tracer or disallow */
- #define SECCOMP_RET_ALLOW       0x7fff0000U /* allow */
- 
- /* Masks for the return value sections. */
- #define SECCOMP_RET_ACTION      0x7fff0000U
- #define SECCOMP_RET_DATA        0x0000ffffU
- 
+ #ifndef SECCOMP_RET_ALLOW
   struct seccomp_data {
         int nr;
         __u32 arch;
@@@ -87,6 -77,70 +77,70 @@@
   };
   #endif
   
+ #ifndef SECCOMP_RET_KILL_PROCESS
+ #define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
+ #define SECCOMP_RET_KILL_THREAD        0x00000000U /* kill the thread */
+ #endif
+ #ifndef SECCOMP_RET_KILL
+ #define SECCOMP_RET_KILL       SECCOMP_RET_KILL_THREAD
+ #define SECCOMP_RET_TRAP       0x00030000U /* disallow and force a SIGSYS */
+ #define SECCOMP_RET_ERRNO      0x00050000U /* returns an errno */
+ #define SECCOMP_RET_TRACE      0x7ff00000U /* pass to a tracer or disallow */
+ #define SECCOMP_RET_ALLOW      0x7fff0000U /* allow */
+ #endif
+ #ifndef SECCOMP_RET_LOG
+ #define SECCOMP_RET_LOG                0x7ffc0000U /* allow after logging */
+ #endif
+ 
+ #ifndef __NR_seccomp
+ # if defined(__i386__)
+ #  define __NR_seccomp 354
+ # elif defined(__x86_64__)
+ #  define __NR_seccomp 317
+ # elif defined(__arm__)
+ #  define __NR_seccomp 383
+ # elif defined(__aarch64__)
+ #  define __NR_seccomp 277
+ # elif defined(__hppa__)
+ #  define __NR_seccomp 338
+ # elif defined(__powerpc__)
+ #  define __NR_seccomp 358
+ # elif defined(__s390__)
+ #  define __NR_seccomp 348
+ # else
+ #  warning "seccomp syscall number unknown for this architecture"
+ #  define __NR_seccomp 0xffff
+ # endif
+ #endif
+ 
+ #ifndef SECCOMP_SET_MODE_STRICT
+ #define SECCOMP_SET_MODE_STRICT 0
+ #endif
+ 
+ #ifndef SECCOMP_SET_MODE_FILTER
+ #define SECCOMP_SET_MODE_FILTER 1
+ #endif
+ 
+ #ifndef SECCOMP_GET_ACTION_AVAIL
+ #define SECCOMP_GET_ACTION_AVAIL 2
+ #endif
+ 
+ #ifndef SECCOMP_FILTER_FLAG_TSYNC
+ #define SECCOMP_FILTER_FLAG_TSYNC 1
+ #endif
+ 
+ #ifndef SECCOMP_FILTER_FLAG_LOG
+ #define SECCOMP_FILTER_FLAG_LOG 2
+ #endif
+ 
+ #ifndef seccomp
+ int seccomp(unsigned int op, unsigned int flags, void *args)
+ {
+       errno = 0;
+       return syscall(__NR_seccomp, op, flags, args);
+ }
+ #endif
+ 
   #if __BYTE_ORDER == __LITTLE_ENDIAN
   #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
   #elif __BYTE_ORDER == __BIG_ENDIAN
@@@ -107,7 -161,7 +161,7 @@@ TEST(mode_strict_support
         ASSERT_EQ(0, ret) {
                 TH_LOG("Kernel does not support CONFIG_SECCOMP");
         }
- -      syscall(__NR_exit, 1);
+ +      syscall(__NR_exit, 0);
   }
   
   TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
@@@ -136,7 -190,7 +190,7 @@@ TEST(no_new_privs_support
         }
   }
   
- /* Tests kernel support by checking for a copy_from_user() fault on * NULL. */
+ /* Tests kernel support by checking for a copy_from_user() fault on NULL. */
   TEST(mode_filter_support)
   {
         long ret;
@@@ -342,6 -396,28 +396,28 @@@ TEST(empty_prog
         EXPECT_EQ(EINVAL, errno);
   }
   
+ TEST(log_all)
+ {
+       struct sock_filter filter[] = {
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
+       };
+       struct sock_fprog prog = {
+               .len = (unsigned short)ARRAY_SIZE(filter),
+               .filter = filter,
+       };
+       long ret;
+       pid_t parent = getppid();
+ 
+       ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+       ASSERT_EQ(0, ret);
+ 
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+       ASSERT_EQ(0, ret);
+ 
+       /* getppid() should succeed and be logged (no check for logging) */
+       EXPECT_EQ(parent, syscall(__NR_getppid));
+ }
+ 
   TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
   {
         struct sock_filter filter[] = {
@@@ -520,6 -596,117 +596,117 @@@ TEST_SIGNAL(KILL_one_arg_six, SIGSYS
         close(fd);
   }
   
+ /* This is a thread task to die via seccomp filter violation. */
+ void *kill_thread(void *data)
+ {
+       bool die = (bool)data;
+ 
+       if (die) {
+               prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
+               return (void *)SIBLING_EXIT_FAILURE;
+       }
+ 
+       return (void *)SIBLING_EXIT_UNKILLED;
+ }
+ 
+ /* Prepare a thread that will kill itself or both of us. */
+ void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process)
+ {
+       pthread_t thread;
+       void *status;
+       /* Kill only when calling __NR_prctl. */
+       struct sock_filter filter_thread[] = {
+               BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+                       offsetof(struct seccomp_data, nr)),
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+       };
+       struct sock_fprog prog_thread = {
+               .len = (unsigned short)ARRAY_SIZE(filter_thread),
+               .filter = filter_thread,
+       };
+       struct sock_filter filter_process[] = {
+               BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+                       offsetof(struct seccomp_data, nr)),
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_PROCESS),
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+       };
+       struct sock_fprog prog_process = {
+               .len = (unsigned short)ARRAY_SIZE(filter_process),
+               .filter = filter_process,
+       };
+ 
+       ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+               TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+       }
+ 
+       ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
+                            kill_process ? &prog_process : &prog_thread));
+ 
+       /*
+        * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
+        * flag cannot be downgraded by a new filter.
+        */
+       ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
+ 
+       /* Start a thread that will exit immediately. */
+       ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
+       ASSERT_EQ(0, pthread_join(thread, &status));
+       ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
+ 
+       /* Start a thread that will die immediately. */
+       ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
+       ASSERT_EQ(0, pthread_join(thread, &status));
+       ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
+ 
+       /*
+        * If we get here, only the spawned thread died. Let the parent know
+        * the whole process didn't die (i.e. this thread, the spawner,
+        * stayed running).
+        */
+       exit(42);
+ }
+ 
+ TEST(KILL_thread)
+ {
+       int status;
+       pid_t child_pid;
+ 
+       child_pid = fork();
+       ASSERT_LE(0, child_pid);
+       if (child_pid == 0) {
+               kill_thread_or_group(_metadata, false);
+               _exit(38);
+       }
+ 
+       ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ 
+       /* If only the thread was killed, we'll see exit 42. */
+       ASSERT_TRUE(WIFEXITED(status));
+       ASSERT_EQ(42, WEXITSTATUS(status));
+ }
+ 
+ TEST(KILL_process)
+ {
+       int status;
+       pid_t child_pid;
+ 
+       child_pid = fork();
+       ASSERT_LE(0, child_pid);
+       if (child_pid == 0) {
+               kill_thread_or_group(_metadata, true);
+               _exit(38);
+       }
+ 
+       ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ 
+       /* If the entire process was killed, we'll see SIGSYS. */
+       ASSERT_TRUE(WIFSIGNALED(status));
+       ASSERT_EQ(SIGSYS, WTERMSIG(status));
+ }
+ 
   /* TODO(wad) add 64-bit versus 32-bit arg tests. */
   TEST(arg_out_of_range)
   {
@@@ -541,26 -728,30 +728,30 @@@
         EXPECT_EQ(EINVAL, errno);
   }
   
+ #define ERRNO_FILTER(name, errno)                                     \
+       struct sock_filter _read_filter_##name[] = {                    \
+               BPF_STMT(BPF_LD|BPF_W|BPF_ABS,                          \
+                       offsetof(struct seccomp_data, nr)),             \
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),       \
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | errno),     \
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),             \
+       };                                                              \
+       struct sock_fprog prog_##name = {                               \
+               .len = (unsigned short)ARRAY_SIZE(_read_filter_##name), \
+               .filter = _read_filter_##name,                          \
+       }
+ 
+ /* Make sure basic errno values are correctly passed through a filter. */
   TEST(ERRNO_valid)
   {
-       struct sock_filter filter[] = {
-               BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
-                       offsetof(struct seccomp_data, nr)),
-               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
-               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | E2BIG),
-               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
-       };
-       struct sock_fprog prog = {
-               .len = (unsigned short)ARRAY_SIZE(filter),
-               .filter = filter,
-       };
+       ERRNO_FILTER(valid, E2BIG);
         long ret;
         pid_t parent = getppid();
   
         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
         ASSERT_EQ(0, ret);
   
-       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
         ASSERT_EQ(0, ret);
   
         EXPECT_EQ(parent, syscall(__NR_getppid));
@@@ -568,26 -759,17 +759,17 @@@
         EXPECT_EQ(E2BIG, errno);
   }
   
+ /* Make sure an errno of zero is correctly handled by the arch code. */
   TEST(ERRNO_zero)
   {
-       struct sock_filter filter[] = {
-               BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
-                       offsetof(struct seccomp_data, nr)),
-               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
-               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 0),
-               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
-       };
-       struct sock_fprog prog = {
-               .len = (unsigned short)ARRAY_SIZE(filter),
-               .filter = filter,
-       };
+       ERRNO_FILTER(zero, 0);
         long ret;
         pid_t parent = getppid();
   
         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
         ASSERT_EQ(0, ret);
   
-       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
         ASSERT_EQ(0, ret);
   
         EXPECT_EQ(parent, syscall(__NR_getppid));
@@@ -595,26 -777,21 +777,21 @@@
         EXPECT_EQ(0, read(0, NULL, 0));
   }
   
+ /*
+  * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
+  * This tests that the errno value gets capped correctly, fixed by
+  * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
+  */
   TEST(ERRNO_capped)
   {
-       struct sock_filter filter[] = {
-               BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
-                       offsetof(struct seccomp_data, nr)),
-               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
-               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | 4096),
-               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
-       };
-       struct sock_fprog prog = {
-               .len = (unsigned short)ARRAY_SIZE(filter),
-               .filter = filter,
-       };
+       ERRNO_FILTER(capped, 4096);
         long ret;
         pid_t parent = getppid();
   
         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
         ASSERT_EQ(0, ret);
   
-       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
         ASSERT_EQ(0, ret);
   
         EXPECT_EQ(parent, syscall(__NR_getppid));
@@@ -622,6 -799,37 +799,37 @@@
         EXPECT_EQ(4095, errno);
   }
   
+ /*
+  * Filters are processed in reverse order: last applied is executed first.
+  * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
+  * SECCOMP_RET_DATA mask results will follow the most recently applied
+  * matching filter return (and not the lowest or highest value).
+  *
+  * The test installs three errno filters carrying the values 11, 13 and 12,
+  * in that order, and expects read() to fail with errno 12: the value of the
+  * filter installed last, which is neither the smallest nor the largest.
+  *
+  * NOTE(review): ERRNO_FILTER() is defined outside this hunk. Judging by the
+  * open-coded filter it replaces in ERRNO_capped, it presumably declares
+  * prog_<name>, returning SECCOMP_RET_ERRNO | <value> for __NR_read and
+  * SECCOMP_RET_ALLOW otherwise -- confirm against the macro.
+  */
+ TEST(ERRNO_order)
+ {
+       ERRNO_FILTER(first,  11);
+       ERRNO_FILTER(second, 13);
+       ERRNO_FILTER(third,  12);       /* installed last below */
+       long ret;
+       pid_t parent = getppid();       /* sampled before any filter is installed */
+ 
+       ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+       ASSERT_EQ(0, ret);
+ 
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
+       ASSERT_EQ(0, ret);
+ 
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
+       ASSERT_EQ(0, ret);
+ 
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
+       ASSERT_EQ(0, ret);
+ 
+       EXPECT_EQ(parent, syscall(__NR_getppid));       /* getppid must still work */
+       EXPECT_EQ(-1, read(0, NULL, 0));
+       EXPECT_EQ(12, errno);   /* the value given to prog_third, not 11 or 13 */
+ }
+ 
   FIXTURE_DATA(TRAP) {        /* per-test state carried by the TRAP fixture */
         struct sock_fprog prog;        /* filter program descriptor; presumably filled in by the fixture setup (not visible in this hunk) -- TODO confirm */
   };
@@@ -735,6 -943,7 +943,7 @@@ TEST_F(TRAP, handler
   
   FIXTURE_DATA(precedence) {
         struct sock_fprog allow;
+       struct sock_fprog log;
         struct sock_fprog trace;
         struct sock_fprog error;
         struct sock_fprog trap;
@@@ -746,6 -955,13 +955,13 @@@ FIXTURE_SETUP(precedence
         struct sock_filter allow_insns[] = {
                 BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
         };
+       struct sock_filter log_insns[] = {
+               BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+                       offsetof(struct seccomp_data, nr)),
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
+       };
         struct sock_filter trace_insns[] = {
                 BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
                         offsetof(struct seccomp_data, nr)),
@@@ -782,6 -998,7 +998,7 @@@
         memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
         self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
         FILTER_ALLOC(allow);
+       FILTER_ALLOC(log);
         FILTER_ALLOC(trace);
         FILTER_ALLOC(error);
         FILTER_ALLOC(trap);
@@@ -792,6 -1009,7 +1009,7 @@@ FIXTURE_TEARDOWN(precedence
   {
   #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
         FILTER_FREE(allow);
+       FILTER_FREE(log);
         FILTER_FREE(trace);
         FILTER_FREE(error);
         FILTER_FREE(trap);
@@@ -809,6 -1027,8 +1027,8 @@@ TEST_F(precedence, allow_ok
   
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
         ASSERT_EQ(0, ret);
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+       ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
         ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
@@@ -833,6 -1053,8 +1053,8 @@@ TEST_F_SIGNAL(precedence, kill_is_highe
   
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
         ASSERT_EQ(0, ret);
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+       ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
         ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
@@@ -864,6 -1086,8 +1086,8 @@@ TEST_F_SIGNAL(precedence, kill_is_highe
         ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
         ASSERT_EQ(0, ret);
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+       ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
         ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
@@@ -885,6 -1109,8 +1109,8 @@@ TEST_F_SIGNAL(precedence, trap_is_secon
   
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
         ASSERT_EQ(0, ret);
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+       ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
         ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
@@@ -910,6 -1136,8 +1136,8 @@@ TEST_F_SIGNAL(precedence, trap_is_secon
         ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
         ASSERT_EQ(0, ret);
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+       ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
         ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
@@@ -931,6 -1159,8 +1159,8 @@@ TEST_F(precedence, errno_is_third
   
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
         ASSERT_EQ(0, ret);
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+       ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
         ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
@@@ -949,6 -1179,8 +1179,8 @@@ TEST_F(precedence, errno_is_third_in_an
         ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
         ASSERT_EQ(0, ret);
   
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+       ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
         ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
@@@ -971,6 -1203,8 +1203,8 @@@ TEST_F(precedence, trace_is_fourth
   
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
         ASSERT_EQ(0, ret);
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+       ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
         ASSERT_EQ(0, ret);
         /* Should work just fine. */
@@@ -992,12 -1226,54 +1226,54 @@@ TEST_F(precedence, trace_is_fourth_in_a
         ASSERT_EQ(0, ret);
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
         ASSERT_EQ(0, ret);
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+       ASSERT_EQ(0, ret);
         /* Should work just fine. */
         EXPECT_EQ(parent, syscall(__NR_getppid));
         /* No ptracer */
         EXPECT_EQ(-1, syscall(__NR_getpid));
   }
   
+ TEST_F(precedence, log_is_fifth)      /* order: allow filter, then log filter */
+ {
+       pid_t mypid, parent;
+       long ret;
+ 
+       mypid = getpid();       /* both pids are sampled before any filter is installed */
+       parent = getppid();
+       ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+       ASSERT_EQ(0, ret);
+ 
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+       ASSERT_EQ(0, ret);
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+       ASSERT_EQ(0, ret);
+       /*
+        * Should work just fine: the log filter only singles out getpid, so
+        * both installed filters return SECCOMP_RET_ALLOW for getppid.
+        */
+       EXPECT_EQ(parent, syscall(__NR_getppid));
+       /*
+        * Should also work just fine: the log filter returns SECCOMP_RET_LOG
+        * for getpid, and with only the allow and log filters installed the
+        * call is still expected to return the pid sampled above.
+        */
+       EXPECT_EQ(mypid, syscall(__NR_getpid));
+ }
+ 
+ TEST_F(precedence, log_is_fifth_in_any_order) /* order: log filter, then allow filter */
+ {
+       pid_t mypid, parent;
+       long ret;
+ 
+       mypid = getpid();       /* both pids are sampled before any filter is installed */
+       parent = getppid();
+       ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+       ASSERT_EQ(0, ret);
+ 
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+       ASSERT_EQ(0, ret);
+       ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+       ASSERT_EQ(0, ret);
+       /*
+        * Should work just fine: the log filter only singles out getpid, so
+        * both installed filters return SECCOMP_RET_ALLOW for getppid.
+        */
+       EXPECT_EQ(parent, syscall(__NR_getppid));
+       /*
+        * Should also work just fine: same expectation as in log_is_fifth,
+        * with the two filters installed in the opposite order. getpid gets
+        * SECCOMP_RET_LOG from the log filter and must still return the pid
+        * sampled above.
+        */
+       EXPECT_EQ(mypid, syscall(__NR_getpid));
+ }
+ 
   #ifndef PTRACE_O_TRACESECCOMP       /* local fallback, used only when nothing defined it earlier */
   #define PTRACE_O_TRACESECCOMP 0x00000080
   #endif      /* PTRACE_O_TRACESECCOMP */
@@@ -1262,6 -1538,13 +1538,13 @@@ TEST_F(TRACE_poke, getpid_runs_normally
   # error "Do not know how to find your architecture's registers and syscalls"
   #endif
   
+ /*
+  * When the syscall return can't be changed, stub out the tests for it.
+  *
+  * With SYSCALL_NUM_RET_SHARE_REG defined, "val" is ignored and "action" is
+  * expected to evaluate to -1; otherwise "action" is expected to evaluate to
+  * "val". Either way the comparison is against the value of "action" itself;
+  * the macro does not look at errno.
+  */
+ #ifdef SYSCALL_NUM_RET_SHARE_REG
+ # define EXPECT_SYSCALL_RETURN(val, action)   EXPECT_EQ(-1, action)
+ #else
+ # define EXPECT_SYSCALL_RETURN(val, action)   EXPECT_EQ(val, action)
+ #endif
+ 
   /* Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
    * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
    */
@@@ -1357,7 -1640,7 +1640,7 @@@ void change_syscall(struct __test_metad
   #ifdef SYSCALL_NUM_RET_SHARE_REG
                 TH_LOG("Can't modify syscall return on this architecture");
   #else
-               regs.SYSCALL_RET = 1;
+               regs.SYSCALL_RET = EPERM;
   #endif
   
   #ifdef HAVE_GETREGS
@@@ -1426,6 -1709,8 +1709,8 @@@ void tracer_ptrace(struct __test_metada
   
         if (nr == __NR_getpid)
                 change_syscall(_metadata, tracee, __NR_getppid);
+       if (nr == __NR_open)
+               change_syscall(_metadata, tracee, -1);
   }
   
   FIXTURE_DATA(TRACE_syscall) {
@@@ -1480,6 -1765,28 +1765,28 @@@ FIXTURE_TEARDOWN(TRACE_syscall
                 free(self->prog.filter);
   }
   
+ TEST_F(TRACE_syscall, ptrace_syscall_redirected)
+ {
+       /*
+        * Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer: the tracer
+        * held by the fixture is torn down and a new one running
+        * tracer_ptrace() is stored in its place.
+        * NOTE(review): the trailing "true" presumably selects PTRACE_SYSCALL
+        * style tracing -- confirm against setup_trace_fixture().
+        */
+       teardown_trace_fixture(_metadata, self->tracer);
+       self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
+                                          true);
+ 
+       /*
+        * Tracer will redirect getpid to getppid (tracer_ptrace() passes
+        * __NR_getppid to change_syscall() when it sees __NR_getpid), so the
+        * value returned here must differ from self->mypid.
+        */
+       EXPECT_NE(self->mypid, syscall(__NR_getpid));
+ }
+ 
+ TEST_F(TRACE_syscall, ptrace_syscall_dropped)
+ {
+       /*
+        * Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer: the tracer
+        * held by the fixture is torn down and a new one running
+        * tracer_ptrace() is stored in its place.
+        * NOTE(review): the trailing "true" presumably selects PTRACE_SYSCALL
+        * style tracing -- confirm against setup_trace_fixture().
+        */
+       teardown_trace_fixture(_metadata, self->tracer);
+       self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
+                                          true);
+ 
+       /*
+        * Tracer should skip the open syscall, resulting in EPERM:
+        * tracer_ptrace() calls change_syscall() with -1 when it sees
+        * __NR_open. EXPECT_SYSCALL_RETURN() compares the value returned by
+        * syscall() itself with EPERM (or with -1 when
+        * SYSCALL_NUM_RET_SHARE_REG is defined); errno is not examined here.
+        * The syscall is issued without arguments.
+        */
+       EXPECT_SYSCALL_RETURN(EPERM, syscall(__NR_open));
+ }
+ 
   TEST_F(TRACE_syscall, syscall_allowed)
   {
         long ret;
@@@ -1520,13 -1827,8 +1827,8 @@@ TEST_F(TRACE_syscall, syscall_dropped
         ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
         ASSERT_EQ(0, ret);
   
- #ifdef SYSCALL_NUM_RET_SHARE_REG
-       /* gettid has been skipped */
-       EXPECT_EQ(-1, syscall(__NR_gettid));
- #else
         /* gettid has been skipped and an altered return value stored. */
-       EXPECT_EQ(1, syscall(__NR_gettid));
- #endif
+       EXPECT_SYSCALL_RETURN(EPERM, syscall(__NR_gettid));
         EXPECT_NE(self->mytid, syscall(__NR_gettid));
   }
   
@@@ -1557,6 -1859,7 +1859,7 @@@ TEST_F(TRACE_syscall, skip_after_RET_TR
         ASSERT_EQ(0, ret);
   
         /* Tracer will redirect getpid to getppid, and we should see EPERM. */
+       errno = 0;
         EXPECT_EQ(-1, syscall(__NR_getpid));
         EXPECT_EQ(EPERM, errno);
   }
@@@ -1654,47 -1957,6 +1957,6 @@@ TEST_F_SIGNAL(TRACE_syscall, kill_after
         EXPECT_NE(self->mypid, syscall(__NR_getpid));
   }
   
- #ifndef __NR_seccomp
- # if defined(__i386__)
- #  define __NR_seccomp 354
- # elif defined(__x86_64__)
- #  define __NR_seccomp 317
- # elif defined(__arm__)
- #  define __NR_seccomp 383
- # elif defined(__aarch64__)
- #  define __NR_seccomp 277
- # elif defined(__hppa__)
- #  define __NR_seccomp 338
- # elif defined(__powerpc__)
- #  define __NR_seccomp 358
- # elif defined(__s390__)
- #  define __NR_seccomp 348
- # else
- #  warning "seccomp syscall number unknown for this architecture"
- #  define __NR_seccomp 0xffff
- # endif
- #endif
- 
- #ifndef SECCOMP_SET_MODE_STRICT
- #define SECCOMP_SET_MODE_STRICT 0
- #endif
- 
- #ifndef SECCOMP_SET_MODE_FILTER
- #define SECCOMP_SET_MODE_FILTER 1
- #endif
- 
- #ifndef SECCOMP_FILTER_FLAG_TSYNC
- #define SECCOMP_FILTER_FLAG_TSYNC 1
- #endif
- 
- #ifndef seccomp
- int seccomp(unsigned int op, unsigned int flags, void *args)
- {
-       errno = 0;
-       return syscall(__NR_seccomp, op, flags, args);
- }
- #endif
- 
   TEST(seccomp_syscall)
   {
         struct sock_filter filter[] = {
@@@ -1783,6 -2045,67 +2045,67 @@@ TEST(seccomp_syscall_mode_lock
         }
   }
   
+ /*
+  * Test detection of known and unknown filter flags. Userspace needs to be able
+  * to check if a filter flag is supported by the current kernel and a good way
+  * of doing that is by attempting to enter filter mode, with the flag bit in
+  * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates
+  * that the flag is valid and EINVAL indicates that the flag is invalid.
+  *
+  * Four kinds of probe are made, each with a NULL _args_ pointer:
+  *   1. every flag listed in flags[], one at a time     -> expect EFAULT
+  *   2. all of those flags OR-ed together               -> expect EFAULT
+  *   3. -1 converted to unsigned int (every bit set)    -> expect EINVAL
+  *   4. the last flags[] entry shifted left by one bit  -> expect EINVAL
+  * Probe 4 relies on the last entry of flags[] being the highest flag bit
+  * this test knows about; its failure message asks whether a new flag needs
+  * to be added to flags[].
+  */
+ TEST(detect_seccomp_filter_flags)
+ {
+       unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
+                                SECCOMP_FILTER_FLAG_LOG };
+       unsigned int flag, all_flags;
+       int i;
+       long ret;
+ 
+       /* Test detection of known-good filter flags */
+       for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
+               flag = flags[i];
+               ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
+               ASSERT_NE(ENOSYS, errno) {
+                       TH_LOG("Kernel does not support seccomp syscall!");
+               }
+               EXPECT_EQ(-1, ret);
+               EXPECT_EQ(EFAULT, errno) {
+                       TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!",
+                              flag);
+               }
+ 
+               all_flags |= flag;      /* accumulated for the combined probe below */
+       }
+ 
+       /* Test detection of all known-good filter flags */
+       ret = seccomp(SECCOMP_SET_MODE_FILTER, all_flags, NULL);
+       EXPECT_EQ(-1, ret);
+       EXPECT_EQ(EFAULT, errno) {
+               TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
+                      all_flags);
+       }
+ 
+       /* Test detection of an unknown filter flag */
+       flag = -1;      /* flag is unsigned int, so this sets every bit */
+       ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
+       EXPECT_EQ(-1, ret);
+       EXPECT_EQ(EINVAL, errno) {
+               TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!",
+                      flag);
+       }
+ 
+       /*
+        * Test detection of an unknown filter flag that may simply need to be
+        * added to this test
+        */
+       flag = flags[ARRAY_SIZE(flags) - 1] << 1;       /* next bit above the last flags[] entry */
+       ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
+       EXPECT_EQ(-1, ret);
+       EXPECT_EQ(EINVAL, errno) {
+               TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?",
+                      flag);
+       }
+ }
+ 
   TEST(TSYNC_first)
   {
         struct sock_filter filter[] = {
@@@ -2421,6 -2744,99 +2744,99 @@@ TEST(syscall_restart
                 _metadata->passed = 0;
   }
   
+ TEST_SIGNAL(filter_flag_log, SIGSYS)  /* exercises SECCOMP_FILTER_FLAG_LOG */
+ {
+       struct sock_filter allow_filter[] = {
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),     /* allows every syscall */
+       };
+       struct sock_filter kill_filter[] = {
+               BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+                       offsetof(struct seccomp_data, nr)),
+               BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),     /* getpid: fall through; anything else: skip one */
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),      /* reached for getpid only */
+               BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+       };
+       struct sock_fprog allow_prog = {
+               .len = (unsigned short)ARRAY_SIZE(allow_filter),
+               .filter = allow_filter,
+       };
+       struct sock_fprog kill_prog = {
+               .len = (unsigned short)ARRAY_SIZE(kill_filter),
+               .filter = kill_filter,
+       };
+       long ret;
+       pid_t parent = getppid();       /* sampled before any filter is installed */
+ 
+       ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+       ASSERT_EQ(0, ret);
+ 
+       /*
+        * Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode:
+        * the call must fail, and must fail with EINVAL.
+        */
+       ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
+                     &allow_prog);
+       ASSERT_NE(ENOSYS, errno) {
+               TH_LOG("Kernel does not support seccomp syscall!");
+       }
+       EXPECT_NE(0, ret) {
+               TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
+       }
+       EXPECT_EQ(EINVAL, errno) {
+               TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
+       }
+ 
+       /* Verify that a simple, permissive filter can be added with no flags */
+       ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
+       EXPECT_EQ(0, ret);
+ 
+       /*
+        * See if the same filter can be added with the FILTER_FLAG_LOG flag.
+        * EINVAL at this point is taken to mean the flag itself is unknown to
+        * the running kernel, since the same program was just accepted
+        * without it.
+        */
+       ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
+                     &allow_prog);
+       ASSERT_NE(EINVAL, errno) {
+               TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
+       }
+       EXPECT_EQ(0, ret);
+ 
+       /* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
+       ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
+                     &kill_prog);
+       EXPECT_EQ(0, ret);
+ 
+       EXPECT_EQ(parent, syscall(__NR_getppid));       /* not matched by kill_filter */
+       /*
+        * getpid() should never return: kill_filter answers SECCOMP_RET_KILL
+        * for it.
+        * NOTE(review): the SIGSYS argument of TEST_SIGNAL presumably names
+        * the signal the harness expects this test to die from -- confirm
+        * against the harness definition.
+        */
+       EXPECT_EQ(0, syscall(__NR_getpid));
+ }
+ 
+ TEST(get_action_avail)        /* probes SECCOMP_GET_ACTION_AVAIL for known and unknown actions */
+ {
+       __u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
+                           SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
+                           SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
+       __u32 unknown_action = 0x10000000U;     /* not one of the actions listed above */
+       int i;
+       long ret;
+ 
+       ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]);        /* first probe doubles as a feature check */
+       ASSERT_NE(ENOSYS, errno) {
+               TH_LOG("Kernel does not support seccomp syscall!");
+       }
+       ASSERT_NE(EINVAL, errno) {
+               TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
+       }
+       EXPECT_EQ(ret, 0);
+ 
+       for (i = 0; i < ARRAY_SIZE(actions); i++) {     /* starts at 0, so actions[0] is probed again */
+               ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]);
+               EXPECT_EQ(ret, 0) {
+                       TH_LOG("Expected action (0x%X) not available!",
+                              actions[i]);
+               }
+       }
+ 
+       /*
+        * Check that an unknown action is handled properly (EOPNOTSUPP): the
+        * call must return -1 and leave errno set to EOPNOTSUPP.
+        */
+       ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action);
+       EXPECT_EQ(ret, -1);
+       EXPECT_EQ(errno, EOPNOTSUPP);
+ }
+ 
   /*
    * TODO:
    * - add microbenchmarks
@@@ -2429,6 -2845,8 +2845,8 @@@
    * - endianness checking when appropriate
    * - 64-bit arg prodding
    * - arch value testing (x86 modes especially)
+  * - verify that FILTER_FLAG_LOG filters generate log messages
+  * - verify that RET_LOG generates log messages
    * - ...
    */
author	Linus Torvalds <[email protected]>
	Sat, 23 Sep 2017 02:16:41 +0000 (16:16 -1000)
committer	Linus Torvalds <[email protected]>
	Sat, 23 Sep 2017 02:16:41 +0000 (16:16 -1000)
		1	2
Documentation/networking/filter.txt	patch \|	diff1 \|	diff2 \|	blob \| history
Documentation/sysctl/kernel.txt	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/audit.h	patch \|	diff1 \|	diff2 \|	blob \| history
tools/testing/selftests/seccomp/seccomp_bpf.c	patch \|	diff1 \|	diff2 \|	blob \| history