From 81c94e76ce8e14a487bca210b3fd92ca661516ac Mon Sep 17 00:00:00 2001
From: Paul Moore <pmoore@redhat.com>
Date: Thu, 6 Feb 2014 07:34:02 -0500
Subject: selinux: fix the output of ./scripts/get_maintainer.pl for SELinux

Correctly tag the SELinux mailing list as moderated for non-subscribers
and do some shuffling of the SELinux maintainers to try and make things
more clear when the scripts/get_maintainer.pl script is used.

 # ./scripts/get_maintainer.pl -f security/selinux
 Paul Moore <paul@paul-moore.com> (supporter:SELINUX SECURITY...)
 Stephen Smalley <sds@tycho.nsa.gov> (supporter:SELINUX SECURITY...)
 Eric Paris <eparis@parisplace.org> (supporter:SELINUX SECURITY...)
 James Morris <james.l.morris@oracle.com> (supporter:SECURITY SUBSYSTEM)
 selinux@tycho.nsa.gov (moderated list:SELINUX SECURITY...)
 linux-security-module@vger.kernel.org (open list:SECURITY SUBSYSTEM)
 linux-kernel@vger.kernel.org (open list)

Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Eric Paris <eparis@parisplace.org>
Cc: James Morris <james.l.morris@oracle.com>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 MAINTAINERS | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 6a6e4ac72287..bfe8da460f7a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7583,11 +7583,10 @@ M:	Security Officers <security@kernel.org>
 S:	Supported
 
 SELINUX SECURITY MODULE
+M:	Paul Moore <paul@paul-moore.com>
 M:	Stephen Smalley <sds@tycho.nsa.gov>
-M:	James Morris <james.l.morris@oracle.com>
 M:	Eric Paris <eparis@parisplace.org>
-M:	Paul Moore <paul@paul-moore.com>
-L:	selinux@tycho.nsa.gov (subscribers-only, general discussion)
+L:	selinux@tycho.nsa.gov (moderated for non-subscribers)
 W:	http://selinuxproject.org
 T:	git git://git.infradead.org/users/pcmoore/selinux
 S:	Supported
-- 
cgit v1.2.3


From 0909c0ae999c325b9d34c6f4710f40730ae3bc24 Mon Sep 17 00:00:00 2001
From: Paul Moore <pmoore@redhat.com>
Date: Fri, 28 Feb 2014 07:23:24 -0500
Subject: selinux: put the mmap() DAC controls before the MAC controls

It turns out that doing the SELinux MAC checks for mmap() before the
DAC checks was causing users and the SELinux policy folks headaches
as users were seeing a lot of SELinux AVC denials for the
memprotect:mmap_zero permission that would have also been denied by
the normal DAC capability checks (CAP_SYS_RAWIO).

Example:

 # cat mmap_test.c
  #include <stdlib.h>
  #include <stdio.h>
  #include <errno.h>
  #include <sys/mman.h>

  int main(int argc, char *argv[])
  {
        int rc;
        void *mem;

        mem = mmap(0x0, 4096,
                   PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
        if (mem == MAP_FAILED)
                return errno;
        printf("mem = %p\n", mem);
        munmap(mem, 4096);

        return 0;
  }
 # gcc -g -O0 -o mmap_test mmap_test.c
 # ./mmap_test
 mem = (nil)
 # ausearch -m AVC | grep mmap_zero
 type=AVC msg=audit(...): avc:  denied  { mmap_zero }
   for pid=1025 comm="mmap_test"
   scontext=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
   tcontext=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
   tclass=memprotect

This patch corrects things so that when the above example is run by a
user without CAP_SYS_RAWIO the SELinux AVC is no longer generated as
the DAC capability check fails before the SELinux permission check.

Signed-off-by: Paul Moore <pmoore@redhat.com>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
---
 security/selinux/hooks.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4b34847208cc..a3230de656e4 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3204,24 +3204,20 @@ error:
 
 static int selinux_mmap_addr(unsigned long addr)
 {
-	int rc = 0;
-	u32 sid = current_sid();
+	int rc;
+
+	/* do DAC check on address space usage */
+	rc = cap_mmap_addr(addr);
+	if (rc)
+		return rc;
 
-	/*
-	 * notice that we are intentionally putting the SELinux check before
-	 * the secondary cap_file_mmap check.  This is such a likely attempt
-	 * at bad behaviour/exploit that we always want to get the AVC, even
-	 * if DAC would have also denied the operation.
-	 */
 	if (addr < CONFIG_LSM_MMAP_MIN_ADDR) {
+		u32 sid = current_sid();
 		rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT,
 				  MEMPROTECT__MMAP_ZERO, NULL);
-		if (rc)
-			return rc;
 	}
 
-	/* do DAC check on address space usage */
-	return cap_mmap_addr(addr);
+	return rc;
 }
 
 static int selinux_mmap_file(struct file *file, unsigned long reqprot,
-- 
cgit v1.2.3


From eee3094683fbc7fe6bcdaef58c1ef31f8460cdca Mon Sep 17 00:00:00 2001
From: Paul Moore <pmoore@redhat.com>
Date: Wed, 5 Mar 2014 15:54:57 -0500
Subject: selinux: correctly label /proc inodes in use before the policy is
 loaded

This patch is based on an earlier patch by Eric Paris, he describes
the problem below:

  "If an inode is accessed before policy load it will get placed on a
   list of inodes to be initialized after policy load.  After policy
   load we call inode_doinit() which calls inode_doinit_with_dentry()
   on all inodes accessed before policy load.  In the case of inodes
   in procfs that means we'll end up at the bottom where it does:

     /* Default to the fs superblock SID. */
     isec->sid = sbsec->sid;

     if ((sbsec->flags & SE_SBPROC) && !S_ISLNK(inode->i_mode)) {
             if (opt_dentry) {
                     isec->sclass = inode_mode_to_security_class(...)
                     rc = selinux_proc_get_sid(opt_dentry,
                                               isec->sclass,
                                               &sid);
                     if (rc)
                             goto out_unlock;
                     isec->sid = sid;
             }
     }

   Since opt_dentry is null, we'll never call selinux_proc_get_sid()
   and will leave the inode labeled with the label on the superblock.
   I believe a fix would be to mimic the behavior of xattrs.  Look
   for an alias of the inode.  If it can't be found, just leave the
   inode uninitialized (and pick it up later) if it can be found, we
   should be able to call selinux_proc_get_sid() ..."

On a system exhibiting this problem, you will notice a lot of files in
/proc with the generic "proc_t" type (at least the ones that were
accessed early in the boot), for example:

   # ls -Z /proc/sys/kernel/shmmax | awk '{ print $4 " " $5 }'
   system_u:object_r:proc_t:s0 /proc/sys/kernel/shmmax

However, with this patch in place we see the expected result:

   # ls -Z /proc/sys/kernel/shmmax | awk '{ print $4 " " $5 }'
   system_u:object_r:sysctl_kernel_t:s0 /proc/sys/kernel/shmmax

Cc: Eric Paris <eparis@redhat.com>
Signed-off-by: Paul Moore <pmoore@redhat.com>
Acked-by: Eric Paris <eparis@redhat.com>
---
 security/selinux/hooks.c | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index a3230de656e4..8b1656f053f8 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1418,15 +1418,33 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent
 		isec->sid = sbsec->sid;
 
 		if ((sbsec->flags & SE_SBPROC) && !S_ISLNK(inode->i_mode)) {
-			if (opt_dentry) {
-				isec->sclass = inode_mode_to_security_class(inode->i_mode);
-				rc = selinux_proc_get_sid(opt_dentry,
-							  isec->sclass,
-							  &sid);
-				if (rc)
-					goto out_unlock;
-				isec->sid = sid;
-			}
+			/* We must have a dentry to determine the label on
+			 * procfs inodes */
+			if (opt_dentry)
+				/* Called from d_instantiate or
+				 * d_splice_alias. */
+				dentry = dget(opt_dentry);
+			else
+				/* Called from selinux_complete_init, try to
+				 * find a dentry. */
+				dentry = d_find_alias(inode);
+			/*
+			 * This can be hit on boot when a file is accessed
+			 * before the policy is loaded.  When we load policy we
+			 * may find inodes that have no dentry on the
+			 * sbsec->isec_head list.  No reason to complain as
+			 * these will get fixed up the next time we go through
+			 * inode_doinit() with a dentry, before these inodes
+			 * could be used again by userspace.
+			 */
+			if (!dentry)
+				goto out_unlock;
+			isec->sclass = inode_mode_to_security_class(inode->i_mode);
+			rc = selinux_proc_get_sid(dentry, isec->sclass, &sid);
+			dput(dentry);
+			if (rc)
+				goto out_unlock;
+			isec->sid = sid;
 		}
 		break;
 	}
-- 
cgit v1.2.3


From 626b9740fa73cad043e136bfb3b6fca68a4f8a7c Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Tue, 29 Apr 2014 11:29:04 -0700
Subject: selinux:  Report permissive mode in avc: denied messages.

We cannot presently tell from an avc: denied message whether access was in
fact denied or was allowed due to global or per-domain permissive mode.
Add a permissive= field to the avc message to reflect this information.

Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 security/selinux/avc.c         | 7 ++++++-
 security/selinux/hooks.c       | 5 +++--
 security/selinux/include/avc.h | 4 ++--
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index fc3e6628a864..a18f1fa6440b 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -444,11 +444,15 @@ static void avc_audit_post_callback(struct audit_buffer *ab, void *a)
 	avc_dump_query(ab, ad->selinux_audit_data->ssid,
 			   ad->selinux_audit_data->tsid,
 			   ad->selinux_audit_data->tclass);
+	if (ad->selinux_audit_data->denied) {
+		audit_log_format(ab, " permissive=%u",
+				 ad->selinux_audit_data->result ? 0 : 1);
+	}
 }
 
 /* This is the slow part of avc audit with big stack footprint */
 noinline int slow_avc_audit(u32 ssid, u32 tsid, u16 tclass,
-		u32 requested, u32 audited, u32 denied,
+		u32 requested, u32 audited, u32 denied, int result,
 		struct common_audit_data *a,
 		unsigned flags)
 {
@@ -477,6 +481,7 @@ noinline int slow_avc_audit(u32 ssid, u32 tsid, u16 tclass,
 	sad.tsid = tsid;
 	sad.audited = audited;
 	sad.denied = denied;
+	sad.result = result;
 
 	a->selinux_audit_data = &sad;
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index d58946dca8c9..889cf4c3c3fa 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2770,6 +2770,7 @@ static int selinux_inode_follow_link(struct dentry *dentry, struct nameidata *na
 
 static noinline int audit_inode_permission(struct inode *inode,
 					   u32 perms, u32 audited, u32 denied,
+					   int result,
 					   unsigned flags)
 {
 	struct common_audit_data ad;
@@ -2780,7 +2781,7 @@ static noinline int audit_inode_permission(struct inode *inode,
 	ad.u.inode = inode;
 
 	rc = slow_avc_audit(current_sid(), isec->sid, isec->sclass, perms,
-			    audited, denied, &ad, flags);
+			    audited, denied, result, &ad, flags);
 	if (rc)
 		return rc;
 	return 0;
@@ -2822,7 +2823,7 @@ static int selinux_inode_permission(struct inode *inode, int mask)
 	if (likely(!audited))
 		return rc;
 
-	rc2 = audit_inode_permission(inode, perms, audited, denied, flags);
+	rc2 = audit_inode_permission(inode, perms, audited, denied, rc, flags);
 	if (rc2)
 		return rc2;
 	return rc;
diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h
index f53ee3c58d0f..ddf8eec03f21 100644
--- a/security/selinux/include/avc.h
+++ b/security/selinux/include/avc.h
@@ -102,7 +102,7 @@ static inline u32 avc_audit_required(u32 requested,
 }
 
 int slow_avc_audit(u32 ssid, u32 tsid, u16 tclass,
-		   u32 requested, u32 audited, u32 denied,
+		   u32 requested, u32 audited, u32 denied, int result,
 		   struct common_audit_data *a,
 		   unsigned flags);
 
@@ -137,7 +137,7 @@ static inline int avc_audit(u32 ssid, u32 tsid,
 	if (likely(!audited))
 		return 0;
 	return slow_avc_audit(ssid, tsid, tclass,
-			      requested, audited, denied,
+			      requested, audited, denied, result,
 			      a, 0);
 }
 
-- 
cgit v1.2.3


From 4f189988a0a5890db597ec48fc0e8b09922f290a Mon Sep 17 00:00:00 2001
From: Paul Moore <pmoore@redhat.com>
Date: Thu, 15 May 2014 11:16:06 -0400
Subject: selinux: reject setexeccon() on MNT_NOSUID applications with -EACCES

We presently prevent processes from using setexecon() to set the
security label of exec()'d processes when NO_NEW_PRIVS is enabled by
returning an error; however, we silently ignore setexeccon() when
exec()'ing from a nosuid mounted filesystem.  This patch makes things
a bit more consistent by returning an error in the setexeccon()/nosuid
case.

Signed-off-by: Paul Moore <pmoore@redhat.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
---
 security/selinux/hooks.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 889cf4c3c3fa..b03b0776955a 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2123,11 +2123,13 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
 		new_tsec->exec_sid = 0;
 
 		/*
-		 * Minimize confusion: if no_new_privs and a transition is
-		 * explicitly requested, then fail the exec.
+		 * Minimize confusion: if no_new_privs or nosuid and a
+		 * transition is explicitly requested, then fail the exec.
 		 */
 		if (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)
 			return -EPERM;
+		if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
+			return -EACCES;
 	} else {
 		/* Check for a default transition on this program. */
 		rc = security_transition_sid(old_tsec->sid, isec->sid,
-- 
cgit v1.2.3


From 612c353178c45250fbec271e7e8e75596d5cbbea Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 15 May 2014 15:02:53 -0400
Subject: selinux: conditionally reschedule in mls_convert_context while
 loading selinux policy

On a slow machine (with debugging enabled), upgrading selinux policy may take
a considerable amount of time. Long enough that the softlockup detector
gets triggered.

The backtrace looks like this..

 > BUG: soft lockup - CPU#2 stuck for 23s! [load_policy:19045]
 > Call Trace:
 >  [<ffffffff81221ddf>] symcmp+0xf/0x20
 >  [<ffffffff81221c27>] hashtab_search+0x47/0x80
 >  [<ffffffff8122e96c>] mls_convert_context+0xdc/0x1c0
 >  [<ffffffff812294e8>] convert_context+0x378/0x460
 >  [<ffffffff81229170>] ? security_context_to_sid_core+0x240/0x240
 >  [<ffffffff812221b5>] sidtab_map+0x45/0x80
 >  [<ffffffff8122bb9f>] security_load_policy+0x3ff/0x580
 >  [<ffffffff810788a8>] ? sched_clock_cpu+0xa8/0x100
 >  [<ffffffff810786dd>] ? sched_clock_local+0x1d/0x80
 >  [<ffffffff810788a8>] ? sched_clock_cpu+0xa8/0x100
 >  [<ffffffff8103096a>] ? __change_page_attr_set_clr+0x82a/0xa50
 >  [<ffffffff810786dd>] ? sched_clock_local+0x1d/0x80
 >  [<ffffffff810788a8>] ? sched_clock_cpu+0xa8/0x100
 >  [<ffffffff8103096a>] ? __change_page_attr_set_clr+0x82a/0xa50
 >  [<ffffffff810788a8>] ? sched_clock_cpu+0xa8/0x100
 >  [<ffffffff81534ddc>] ? retint_restore_args+0xe/0xe
 >  [<ffffffff8109c82d>] ? trace_hardirqs_on_caller+0xfd/0x1c0
 >  [<ffffffff81279a2e>] ? trace_hardirqs_on_thunk+0x3a/0x3f
 >  [<ffffffff810d28a8>] ? rcu_irq_exit+0x68/0xb0
 >  [<ffffffff81534ddc>] ? retint_restore_args+0xe/0xe
 >  [<ffffffff8121e947>] sel_write_load+0xa7/0x770
 >  [<ffffffff81139633>] ? vfs_write+0x1c3/0x200
 >  [<ffffffff81210e8e>] ? security_file_permission+0x1e/0xa0
 >  [<ffffffff8113952b>] vfs_write+0xbb/0x200
 >  [<ffffffff811581c7>] ? fget_light+0x397/0x4b0
 >  [<ffffffff81139c27>] SyS_write+0x47/0xa0
 >  [<ffffffff8153bde4>] tracesys+0xdd/0xe2

Stephen Smalley suggested:

 > Maybe put a cond_resched() within the ebitmap_for_each_positive_bit()
 > loop in mls_convert_context()?

That seems to do the trick. Tested by downgrading and re-upgrading selinux-policy-targeted.

Signed-off-by: Dave Jones <davej@redhat.com>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 security/selinux/ss/mls.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c
index c85bc1ec040c..d307b37ddc2b 100644
--- a/security/selinux/ss/mls.c
+++ b/security/selinux/ss/mls.c
@@ -492,6 +492,8 @@ int mls_convert_context(struct policydb *oldp,
 			rc = ebitmap_set_bit(&bitmap, catdatum->value - 1, 1);
 			if (rc)
 				return rc;
+
+			cond_resched();
 		}
 		ebitmap_destroy(&c->range.level[l].cat);
 		c->range.level[l].cat = bitmap;
-- 
cgit v1.2.3


From 47dd0b76ace953bd2c0479076db0d3e3b9594003 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 15 May 2014 15:03:53 -0400
Subject: selinux: conditionally reschedule in hashtab_insert while loading
 selinux policy

After silencing the sleeping warning in mls_convert_context() I started
seeing similar traces from hashtab_insert. Do a cond_resched there too.

Signed-off-by: Dave Jones <davej@redhat.com>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 security/selinux/ss/hashtab.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/security/selinux/ss/hashtab.c b/security/selinux/ss/hashtab.c
index 933e735bb185..2cc496149842 100644
--- a/security/selinux/ss/hashtab.c
+++ b/security/selinux/ss/hashtab.c
@@ -6,6 +6,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
+#include <linux/sched.h>
 #include "hashtab.h"
 
 struct hashtab *hashtab_create(u32 (*hash_value)(struct hashtab *h, const void *key),
@@ -40,6 +41,8 @@ int hashtab_insert(struct hashtab *h, void *key, void *datum)
 	u32 hvalue;
 	struct hashtab_node *prev, *cur, *newnode;
 
+	cond_resched();
+
 	if (!h || h->nel == HASHTAB_MAX_NODES)
 		return -EINVAL;
 
-- 
cgit v1.2.3


From aa65506f198c482b52ba6592c0023eca8b4bf8bd Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.m@jp.panasonic.com>
Date: Tue, 17 Jun 2014 17:31:40 -0400
Subject: selinux, kbuild: remove unnecessary $(hostprogs-y) from clean-files

Files added to hostprogs-y are cleaned. (See scripts/Makefile.clean)
Adding them to clean-files is redundant.

Signed-off-by: Masahiro Yamada <yamada.m@jp.panasonic.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 scripts/selinux/genheaders/Makefile | 1 -
 scripts/selinux/mdp/Makefile        | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/selinux/genheaders/Makefile b/scripts/selinux/genheaders/Makefile
index 417b165008ee..1d1ac51359e3 100644
--- a/scripts/selinux/genheaders/Makefile
+++ b/scripts/selinux/genheaders/Makefile
@@ -2,4 +2,3 @@ hostprogs-y	:= genheaders
 HOST_EXTRACFLAGS += -Isecurity/selinux/include
 
 always		:= $(hostprogs-y)
-clean-files	:= $(hostprogs-y)
diff --git a/scripts/selinux/mdp/Makefile b/scripts/selinux/mdp/Makefile
index eb365b333441..dba7eff69a00 100644
--- a/scripts/selinux/mdp/Makefile
+++ b/scripts/selinux/mdp/Makefile
@@ -2,4 +2,4 @@ hostprogs-y	:= mdp
 HOST_EXTRACFLAGS += -Isecurity/selinux/include
 
 always		:= $(hostprogs-y)
-clean-files	:= $(hostprogs-y) policy.* file_contexts
+clean-files	:= policy.* file_contexts
-- 
cgit v1.2.3


From 5c7001b84be55917a99c35ce96df7e67b3c40cea Mon Sep 17 00:00:00 2001
From: Himangi Saraogi <himangi774@gmail.com>
Date: Tue, 17 Jun 2014 01:41:05 +0530
Subject: SELinux: use ARRAY_SIZE

ARRAY_SIZE is more concise to use when the size of an array is divided
by the size of its type or the size of its first element.

The Coccinelle semantic patch that makes this change is as follows:

// <smpl>
@@
type T;
T[] E;
@@

- (sizeof(E)/sizeof(E[...]))
+ ARRAY_SIZE(E)
// </smpl>

Signed-off-by: Himangi Saraogi <himangi774@gmail.com>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 security/selinux/ss/policydb.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index 9c5cdc2caaef..56eb65f67cb1 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -2608,7 +2608,7 @@ static int mls_write_range_helper(struct mls_range *r, void *fp)
 	if (!eq)
 		buf[2] = cpu_to_le32(r->level[1].sens);
 
-	BUG_ON(items > (sizeof(buf)/sizeof(buf[0])));
+	BUG_ON(items > ARRAY_SIZE(buf));
 
 	rc = put_entry(buf, sizeof(u32), items, fp);
 	if (rc)
@@ -2990,7 +2990,7 @@ static int role_write(void *vkey, void *datum, void *ptr)
 	if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
 		buf[items++] = cpu_to_le32(role->bounds);
 
-	BUG_ON(items > (sizeof(buf)/sizeof(buf[0])));
+	BUG_ON(items > ARRAY_SIZE(buf));
 
 	rc = put_entry(buf, sizeof(u32), items, fp);
 	if (rc)
@@ -3040,7 +3040,7 @@ static int type_write(void *vkey, void *datum, void *ptr)
 	} else {
 		buf[items++] = cpu_to_le32(typdatum->primary);
 	}
-	BUG_ON(items > (sizeof(buf) / sizeof(buf[0])));
+	BUG_ON(items > ARRAY_SIZE(buf));
 	rc = put_entry(buf, sizeof(u32), items, fp);
 	if (rc)
 		return rc;
@@ -3069,7 +3069,7 @@ static int user_write(void *vkey, void *datum, void *ptr)
 	buf[items++] = cpu_to_le32(usrdatum->value);
 	if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
 		buf[items++] = cpu_to_le32(usrdatum->bounds);
-	BUG_ON(items > (sizeof(buf) / sizeof(buf[0])));
+	BUG_ON(items > ARRAY_SIZE(buf));
 	rc = put_entry(buf, sizeof(u32), items, fp);
 	if (rc)
 		return rc;
-- 
cgit v1.2.3


From 4b6f405f72112175a5e044b015c4119b3a5b6f52 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Sun, 15 Jun 2014 23:02:51 +0900
Subject: selinux: introduce str_read() helper

There're some code duplication for reading a string value during
policydb_read().  Add str_read() helper to fix it.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 security/selinux/ss/policydb.c | 133 ++++++++++++-----------------------------
 1 file changed, 37 insertions(+), 96 deletions(-)

diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index 56eb65f67cb1..bc2a586f095c 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -1080,6 +1080,26 @@ out:
  * binary representation file.
  */
 
+static int str_read(char **strp, gfp_t flags, void *fp, u32 len)
+{
+	int rc;
+	char *str;
+
+	str = kmalloc(len + 1, flags);
+	if (!str)
+		return -ENOMEM;
+
+	/* it's expected the caller should free the str */
+	*strp = str;
+
+	rc = next_entry(str, fp, len);
+	if (rc)
+		return rc;
+
+	str[len] = '\0';
+	return 0;
+}
+
 static int perm_read(struct policydb *p, struct hashtab *h, void *fp)
 {
 	char *key = NULL;
@@ -1100,15 +1120,9 @@ static int perm_read(struct policydb *p, struct hashtab *h, void *fp)
 	len = le32_to_cpu(buf[0]);
 	perdatum->value = le32_to_cpu(buf[1]);
 
-	rc = -ENOMEM;
-	key = kmalloc(len + 1, GFP_KERNEL);
-	if (!key)
-		goto bad;
-
-	rc = next_entry(key, fp, len);
+	rc = str_read(&key, GFP_KERNEL, fp, len);
 	if (rc)
 		goto bad;
-	key[len] = '\0';
 
 	rc = hashtab_insert(h, key, perdatum);
 	if (rc)
@@ -1146,15 +1160,9 @@ static int common_read(struct policydb *p, struct hashtab *h, void *fp)
 	comdatum->permissions.nprim = le32_to_cpu(buf[2]);
 	nel = le32_to_cpu(buf[3]);
 
-	rc = -ENOMEM;
-	key = kmalloc(len + 1, GFP_KERNEL);
-	if (!key)
-		goto bad;
-
-	rc = next_entry(key, fp, len);
+	rc = str_read(&key, GFP_KERNEL, fp, len);
 	if (rc)
 		goto bad;
-	key[len] = '\0';
 
 	for (i = 0; i < nel; i++) {
 		rc = perm_read(p, comdatum->permissions.table, fp);
@@ -1321,25 +1329,14 @@ static int class_read(struct policydb *p, struct hashtab *h, void *fp)
 
 	ncons = le32_to_cpu(buf[5]);
 
-	rc = -ENOMEM;
-	key = kmalloc(len + 1, GFP_KERNEL);
-	if (!key)
-		goto bad;
-
-	rc = next_entry(key, fp, len);
+	rc = str_read(&key, GFP_KERNEL, fp, len);
 	if (rc)
 		goto bad;
-	key[len] = '\0';
 
 	if (len2) {
-		rc = -ENOMEM;
-		cladatum->comkey = kmalloc(len2 + 1, GFP_KERNEL);
-		if (!cladatum->comkey)
-			goto bad;
-		rc = next_entry(cladatum->comkey, fp, len2);
+		rc = str_read(&cladatum->comkey, GFP_KERNEL, fp, len2);
 		if (rc)
 			goto bad;
-		cladatum->comkey[len2] = '\0';
 
 		rc = -EINVAL;
 		cladatum->comdatum = hashtab_search(p->p_commons.table, cladatum->comkey);
@@ -1422,15 +1419,9 @@ static int role_read(struct policydb *p, struct hashtab *h, void *fp)
 	if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
 		role->bounds = le32_to_cpu(buf[2]);
 
-	rc = -ENOMEM;
-	key = kmalloc(len + 1, GFP_KERNEL);
-	if (!key)
-		goto bad;
-
-	rc = next_entry(key, fp, len);
+	rc = str_read(&key, GFP_KERNEL, fp, len);
 	if (rc)
 		goto bad;
-	key[len] = '\0';
 
 	rc = ebitmap_read(&role->dominates, fp);
 	if (rc)
@@ -1495,14 +1486,9 @@ static int type_read(struct policydb *p, struct hashtab *h, void *fp)
 		typdatum->primary = le32_to_cpu(buf[2]);
 	}
 
-	rc = -ENOMEM;
-	key = kmalloc(len + 1, GFP_KERNEL);
-	if (!key)
-		goto bad;
-	rc = next_entry(key, fp, len);
+	rc = str_read(&key, GFP_KERNEL, fp, len);
 	if (rc)
 		goto bad;
-	key[len] = '\0';
 
 	rc = hashtab_insert(h, key, typdatum);
 	if (rc)
@@ -1565,14 +1551,9 @@ static int user_read(struct policydb *p, struct hashtab *h, void *fp)
 	if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
 		usrdatum->bounds = le32_to_cpu(buf[2]);
 
-	rc = -ENOMEM;
-	key = kmalloc(len + 1, GFP_KERNEL);
-	if (!key)
-		goto bad;
-	rc = next_entry(key, fp, len);
+	rc = str_read(&key, GFP_KERNEL, fp, len);
 	if (rc)
 		goto bad;
-	key[len] = '\0';
 
 	rc = ebitmap_read(&usrdatum->roles, fp);
 	if (rc)
@@ -1616,14 +1597,9 @@ static int sens_read(struct policydb *p, struct hashtab *h, void *fp)
 	len = le32_to_cpu(buf[0]);
 	levdatum->isalias = le32_to_cpu(buf[1]);
 
-	rc = -ENOMEM;
-	key = kmalloc(len + 1, GFP_ATOMIC);
-	if (!key)
-		goto bad;
-	rc = next_entry(key, fp, len);
+	rc = str_read(&key, GFP_ATOMIC, fp, len);
 	if (rc)
 		goto bad;
-	key[len] = '\0';
 
 	rc = -ENOMEM;
 	levdatum->level = kmalloc(sizeof(struct mls_level), GFP_ATOMIC);
@@ -1664,14 +1640,9 @@ static int cat_read(struct policydb *p, struct hashtab *h, void *fp)
 	catdatum->value = le32_to_cpu(buf[1]);
 	catdatum->isalias = le32_to_cpu(buf[2]);
 
-	rc = -ENOMEM;
-	key = kmalloc(len + 1, GFP_ATOMIC);
-	if (!key)
-		goto bad;
-	rc = next_entry(key, fp, len);
+	rc = str_read(&key, GFP_ATOMIC, fp, len);
 	if (rc)
 		goto bad;
-	key[len] = '\0';
 
 	rc = hashtab_insert(h, key, catdatum);
 	if (rc)
@@ -1968,18 +1939,12 @@ static int filename_trans_read(struct policydb *p, void *fp)
 			goto out;
 		len = le32_to_cpu(buf[0]);
 
-		rc = -ENOMEM;
-		name = kmalloc(len + 1, GFP_KERNEL);
-		if (!name)
-			goto out;
-
-		ft->name = name;
-
 		/* path component string */
-		rc = next_entry(name, fp, len);
+		rc = str_read(&name, GFP_KERNEL, fp, len);
 		if (rc)
 			goto out;
-		name[len] = 0;
+
+		ft->name = name;
 
 		rc = next_entry(buf, fp, sizeof(u32) * 4);
 		if (rc)
@@ -2045,17 +2010,10 @@ static int genfs_read(struct policydb *p, void *fp)
 		if (!newgenfs)
 			goto out;
 
-		rc = -ENOMEM;
-		newgenfs->fstype = kmalloc(len + 1, GFP_KERNEL);
-		if (!newgenfs->fstype)
-			goto out;
-
-		rc = next_entry(newgenfs->fstype, fp, len);
+		rc = str_read(&newgenfs->fstype, GFP_KERNEL, fp, len);
 		if (rc)
 			goto out;
 
-		newgenfs->fstype[len] = 0;
-
 		for (genfs_p = NULL, genfs = p->genfs; genfs;
 		     genfs_p = genfs, genfs = genfs->next) {
 			rc = -EINVAL;
@@ -2091,15 +2049,9 @@ static int genfs_read(struct policydb *p, void *fp)
 			if (!newc)
 				goto out;
 
-			rc = -ENOMEM;
-			newc->u.name = kmalloc(len + 1, GFP_KERNEL);
-			if (!newc->u.name)
-				goto out;
-
-			rc = next_entry(newc->u.name, fp, len);
+			rc = str_read(&newc->u.name, GFP_KERNEL, fp, len);
 			if (rc)
 				goto out;
-			newc->u.name[len] = 0;
 
 			rc = next_entry(buf, fp, sizeof(u32));
 			if (rc)
@@ -2189,16 +2141,10 @@ static int ocontext_read(struct policydb *p, struct policydb_compat_info *info,
 					goto out;
 				len = le32_to_cpu(buf[0]);
 
-				rc = -ENOMEM;
-				c->u.name = kmalloc(len + 1, GFP_KERNEL);
-				if (!c->u.name)
-					goto out;
-
-				rc = next_entry(c->u.name, fp, len);
+				rc = str_read(&c->u.name, GFP_KERNEL, fp, len);
 				if (rc)
 					goto out;
 
-				c->u.name[len] = 0;
 				rc = context_read_and_validate(&c->context[0], p, fp);
 				if (rc)
 					goto out;
@@ -2240,16 +2186,11 @@ static int ocontext_read(struct policydb *p, struct policydb_compat_info *info,
 				if (c->v.behavior > SECURITY_FS_USE_MAX)
 					goto out;
 
-				rc = -ENOMEM;
 				len = le32_to_cpu(buf[1]);
-				c->u.name = kmalloc(len + 1, GFP_KERNEL);
-				if (!c->u.name)
-					goto out;
-
-				rc = next_entry(c->u.name, fp, len);
+				rc = str_read(&c->u.name, GFP_KERNEL, fp, len);
 				if (rc)
 					goto out;
-				c->u.name[len] = 0;
+
 				rc = context_read_and_validate(&c->context[0], p, fp);
 				if (rc)
 					goto out;
-- 
cgit v1.2.3


From 4bb9398300a3a2c691e5c0ad6b9cfa78775e767e Mon Sep 17 00:00:00 2001
From: Gideon Israel Dsouza <gidisrael@gmail.com>
Date: Wed, 11 Jun 2014 21:25:30 +0530
Subject: security: Used macros from compiler.h instead of __attribute__((...))

To increase compiler portability there is <linux/compiler.h> which
provides convenience macros for various gcc constructs.  Eg: __packed
for __attribute__((packed)).

This patch is part of a large task I've taken to clean the gcc
specific attributes and use the the macros instead.

Signed-off-by: Gideon Israel Dsouza <gidisrael@gmail.com>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 security/selinux/include/security.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index ce7852cf526b..d1e0b239b602 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -8,6 +8,7 @@
 #ifndef _SELINUX_SECURITY_H_
 #define _SELINUX_SECURITY_H_
 
+#include <linux/compiler.h>
 #include <linux/dcache.h>
 #include <linux/magic.h>
 #include <linux/types.h>
@@ -220,7 +221,7 @@ struct selinux_kernel_status {
 	/*
 	 * The version > 0 supports above members.
 	 */
-} __attribute__((packed));
+} __packed;
 
 extern void selinux_status_update_setenforce(int enforcing);
 extern void selinux_status_update_policyload(int seqno);
-- 
cgit v1.2.3


From f004afe60db5b98f2b981978fde8a0d4c6298c5d Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Sun, 15 Jun 2014 01:19:01 +0900
Subject: selinux: simple cleanup for cond_read_node()

The node->cur_state and len can be read in a single call of next_entry().
And setting len before reading is a dead write so can be eliminated.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
(Minor tweak to the length parameter in the call to next_entry())
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 security/selinux/ss/conditional.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c
index 377d148e7157..f09cc7268b65 100644
--- a/security/selinux/ss/conditional.c
+++ b/security/selinux/ss/conditional.c
@@ -402,19 +402,14 @@ static int cond_read_node(struct policydb *p, struct cond_node *node, void *fp)
 	int rc;
 	struct cond_expr *expr = NULL, *last = NULL;
 
-	rc = next_entry(buf, fp, sizeof(u32));
+	rc = next_entry(buf, fp, sizeof(u32) * 2);
 	if (rc)
 		return rc;
 
 	node->cur_state = le32_to_cpu(buf[0]);
 
-	len = 0;
-	rc = next_entry(buf, fp, sizeof(u32));
-	if (rc)
-		return rc;
-
 	/* expr */
-	len = le32_to_cpu(buf[0]);
+	len = le32_to_cpu(buf[1]);
 
 	for (i = 0; i < len; i++) {
 		rc = next_entry(buf, fp, sizeof(u32) * 2);
-- 
cgit v1.2.3


From 6e51f9cbfa04a92b40e7f9c1e76c8ecbff534a22 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Sun, 15 Jun 2014 01:19:02 +0900
Subject: selinux: fix a possible memory leak in cond_read_node()

The cond_read_node() should free the given node on error path as it's
not linked to p->cond_list yet.  This is done via cond_node_destroy()
but it's not called when next_entry() fails before the expr loop.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 security/selinux/ss/conditional.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c
index f09cc7268b65..62c6773be0b7 100644
--- a/security/selinux/ss/conditional.c
+++ b/security/selinux/ss/conditional.c
@@ -404,7 +404,7 @@ static int cond_read_node(struct policydb *p, struct cond_node *node, void *fp)
 
 	rc = next_entry(buf, fp, sizeof(u32) * 2);
 	if (rc)
-		return rc;
+		goto err;
 
 	node->cur_state = le32_to_cpu(buf[0]);
 
-- 
cgit v1.2.3


From f31e799459659ae88c341aeac16a8a5efb1271d4 Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hp.com>
Date: Mon, 23 Jun 2014 11:28:51 -0400
Subject: selinux: no recursive read_lock of policy_rwlock in
 security_genfs_sid()

With the introduction of fair queued rwlock, recursive read_lock()
may hang the offending process if there is a write_lock() somewhere
in between.

With recursive read_lock checking enabled, the following error was
reported:

=============================================
[ INFO: possible recursive locking detected ]
3.16.0-rc1 #2 Tainted: G            E
---------------------------------------------
load_policy/708 is trying to acquire lock:
 (policy_rwlock){.+.+..}, at: [<ffffffff8125b32a>]
security_genfs_sid+0x3a/0x170

but task is already holding lock:
 (policy_rwlock){.+.+..}, at: [<ffffffff8125b48c>]
security_fs_use+0x2c/0x110

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(policy_rwlock);
  lock(policy_rwlock);

This patch fixes the occurrence of recursive read_lock() of
policy_rwlock by adding a helper function __security_genfs_sid()
which requires caller to take the lock before calling it. The
security_fs_use() was then modified to call the new helper function.

Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 security/selinux/ss/services.c | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 4bca49414a40..2aa9d172dc7e 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2277,7 +2277,7 @@ out:
 }
 
 /**
- * security_genfs_sid - Obtain a SID for a file in a filesystem
+ * __security_genfs_sid - Helper to obtain a SID for a file in a filesystem
  * @fstype: filesystem type
  * @path: path from root of mount
  * @sclass: file security class
@@ -2286,11 +2286,13 @@ out:
  * Obtain a SID to use for a file in a filesystem that
  * cannot support xattr or use a fixed labeling behavior like
  * transition SIDs or task SIDs.
+ *
+ * The caller must acquire the policy_rwlock before calling this function.
  */
-int security_genfs_sid(const char *fstype,
-		       char *path,
-		       u16 orig_sclass,
-		       u32 *sid)
+static inline int __security_genfs_sid(const char *fstype,
+				       char *path,
+				       u16 orig_sclass,
+				       u32 *sid)
 {
 	int len;
 	u16 sclass;
@@ -2301,8 +2303,6 @@ int security_genfs_sid(const char *fstype,
 	while (path[0] == '/' && path[1] == '/')
 		path++;
 
-	read_lock(&policy_rwlock);
-
 	sclass = unmap_class(orig_sclass);
 	*sid = SECINITSID_UNLABELED;
 
@@ -2336,10 +2336,32 @@ int security_genfs_sid(const char *fstype,
 	*sid = c->sid[0];
 	rc = 0;
 out:
-	read_unlock(&policy_rwlock);
 	return rc;
 }
 
+/**
+ * security_genfs_sid - Obtain a SID for a file in a filesystem
+ * @fstype: filesystem type
+ * @path: path from root of mount
+ * @sclass: file security class
+ * @sid: SID for path
+ *
+ * Acquire policy_rwlock before calling __security_genfs_sid() and release
+ * it afterward.
+ */
+int security_genfs_sid(const char *fstype,
+		       char *path,
+		       u16 orig_sclass,
+		       u32 *sid)
+{
+	int retval;
+
+	read_lock(&policy_rwlock);
+	retval = __security_genfs_sid(fstype, path, orig_sclass, sid);
+	read_unlock(&policy_rwlock);
+	return retval;
+}
+
 /**
  * security_fs_use - Determine how to handle labeling for a filesystem.
  * @sb: superblock in question
@@ -2370,7 +2392,8 @@ int security_fs_use(struct super_block *sb)
 		}
 		sbsec->sid = c->sid[0];
 	} else {
-		rc = security_genfs_sid(fstype, "/", SECCLASS_DIR, &sbsec->sid);
+		rc = __security_genfs_sid(fstype, "/", SECCLASS_DIR,
+					  &sbsec->sid);
 		if (rc) {
 			sbsec->behavior = SECURITY_FS_USE_NONE;
 			rc = 0;
-- 
cgit v1.2.3


From 615e51fdda6f274e94b1e905fcaf6111e0d9aa20 Mon Sep 17 00:00:00 2001
From: Paul Moore <pmoore@redhat.com>
Date: Thu, 26 Jun 2014 14:33:56 -0400
Subject: selinux: reduce the number of calls to synchronize_net() when
 flushing caches

When flushing the AVC, such as during a policy load, the various
network caches are also flushed, with each making a call to
synchronize_net() which has shown to be expensive in some cases.
This patch consolidates the network cache flushes into a single AVC
callback which only calls synchronize_net() once for each AVC cache
flush.

Reported-by: Jaejyn Shin <flagon22bass@gmail.com>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 security/selinux/hooks.c           | 14 ++++++++++++++
 security/selinux/include/netif.h   |  2 ++
 security/selinux/include/netnode.h |  2 ++
 security/selinux/include/netport.h |  2 ++
 security/selinux/netif.c           | 15 +--------------
 security/selinux/netnode.c         | 15 +--------------
 security/selinux/netport.c         | 15 +--------------
 7 files changed, 23 insertions(+), 42 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 336f0a04450e..39bc8c94b969 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -161,6 +161,17 @@ static int selinux_peerlbl_enabled(void)
 	return (selinux_policycap_alwaysnetwork || netlbl_enabled() || selinux_xfrm_enabled());
 }
 
+static int selinux_netcache_avc_callback(u32 event)
+{
+	if (event == AVC_CALLBACK_RESET) {
+		sel_netif_flush();
+		sel_netnode_flush();
+		sel_netport_flush();
+		synchronize_net();
+	}
+	return 0;
+}
+
 /*
  * initialise the security for the init task
  */
@@ -5993,6 +6004,9 @@ static __init int selinux_init(void)
 	if (register_security(&selinux_ops))
 		panic("SELinux: Unable to register with kernel.\n");
 
+	if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET))
+		panic("SELinux: Unable to register AVC netcache callback\n");
+
 	if (selinux_enforcing)
 		printk(KERN_DEBUG "SELinux:  Starting in enforcing mode\n");
 	else
diff --git a/security/selinux/include/netif.h b/security/selinux/include/netif.h
index 43d507242b42..57c6eae81eac 100644
--- a/security/selinux/include/netif.h
+++ b/security/selinux/include/netif.h
@@ -17,6 +17,8 @@
 #ifndef _SELINUX_NETIF_H_
 #define _SELINUX_NETIF_H_
 
+void sel_netif_flush(void);
+
 int sel_netif_sid(int ifindex, u32 *sid);
 
 #endif	/* _SELINUX_NETIF_H_ */
diff --git a/security/selinux/include/netnode.h b/security/selinux/include/netnode.h
index df7a5ed6c694..937668dd3024 100644
--- a/security/selinux/include/netnode.h
+++ b/security/selinux/include/netnode.h
@@ -27,6 +27,8 @@
 #ifndef _SELINUX_NETNODE_H
 #define _SELINUX_NETNODE_H
 
+void sel_netnode_flush(void);
+
 int sel_netnode_sid(void *addr, u16 family, u32 *sid);
 
 #endif
diff --git a/security/selinux/include/netport.h b/security/selinux/include/netport.h
index 4d965b83d735..d1ce896b2cb0 100644
--- a/security/selinux/include/netport.h
+++ b/security/selinux/include/netport.h
@@ -26,6 +26,8 @@
 #ifndef _SELINUX_NETPORT_H
 #define _SELINUX_NETPORT_H
 
+void sel_netport_flush(void);
+
 int sel_netport_sid(u8 protocol, u16 pnum, u32 *sid);
 
 #endif
diff --git a/security/selinux/netif.c b/security/selinux/netif.c
index 694e9e43855f..3c3de4ca0ebc 100644
--- a/security/selinux/netif.c
+++ b/security/selinux/netif.c
@@ -240,7 +240,7 @@ static void sel_netif_kill(int ifindex)
  * Remove all entries from the network interface table.
  *
  */
-static void sel_netif_flush(void)
+void sel_netif_flush(void)
 {
 	int idx;
 	struct sel_netif *netif;
@@ -252,15 +252,6 @@ static void sel_netif_flush(void)
 	spin_unlock_bh(&sel_netif_lock);
 }
 
-static int sel_netif_avc_callback(u32 event)
-{
-	if (event == AVC_CALLBACK_RESET) {
-		sel_netif_flush();
-		synchronize_net();
-	}
-	return 0;
-}
-
 static int sel_netif_netdev_notifier_handler(struct notifier_block *this,
 					     unsigned long event, void *ptr)
 {
@@ -291,10 +282,6 @@ static __init int sel_netif_init(void)
 
 	register_netdevice_notifier(&sel_netif_netdev_notifier);
 
-	err = avc_add_callback(sel_netif_avc_callback, AVC_CALLBACK_RESET);
-	if (err)
-		panic("avc_add_callback() failed, error %d\n", err);
-
 	return err;
 }
 
diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c
index 03a72c32afd7..ddf315260839 100644
--- a/security/selinux/netnode.c
+++ b/security/selinux/netnode.c
@@ -283,7 +283,7 @@ int sel_netnode_sid(void *addr, u16 family, u32 *sid)
  * Remove all entries from the network address table.
  *
  */
-static void sel_netnode_flush(void)
+void sel_netnode_flush(void)
 {
 	unsigned int idx;
 	struct sel_netnode *node, *node_tmp;
@@ -300,15 +300,6 @@ static void sel_netnode_flush(void)
 	spin_unlock_bh(&sel_netnode_lock);
 }
 
-static int sel_netnode_avc_callback(u32 event)
-{
-	if (event == AVC_CALLBACK_RESET) {
-		sel_netnode_flush();
-		synchronize_net();
-	}
-	return 0;
-}
-
 static __init int sel_netnode_init(void)
 {
 	int iter;
@@ -322,10 +313,6 @@ static __init int sel_netnode_init(void)
 		sel_netnode_hash[iter].size = 0;
 	}
 
-	ret = avc_add_callback(sel_netnode_avc_callback, AVC_CALLBACK_RESET);
-	if (ret != 0)
-		panic("avc_add_callback() failed, error %d\n", ret);
-
 	return ret;
 }
 
diff --git a/security/selinux/netport.c b/security/selinux/netport.c
index d35379781c2c..73ac6784d091 100644
--- a/security/selinux/netport.c
+++ b/security/selinux/netport.c
@@ -217,7 +217,7 @@ int sel_netport_sid(u8 protocol, u16 pnum, u32 *sid)
  * Remove all entries from the network address table.
  *
  */
-static void sel_netport_flush(void)
+void sel_netport_flush(void)
 {
 	unsigned int idx;
 	struct sel_netport *port, *port_tmp;
@@ -234,15 +234,6 @@ static void sel_netport_flush(void)
 	spin_unlock_bh(&sel_netport_lock);
 }
 
-static int sel_netport_avc_callback(u32 event)
-{
-	if (event == AVC_CALLBACK_RESET) {
-		sel_netport_flush();
-		synchronize_net();
-	}
-	return 0;
-}
-
 static __init int sel_netport_init(void)
 {
 	int iter;
@@ -256,10 +247,6 @@ static __init int sel_netport_init(void)
 		sel_netport_hash[iter].size = 0;
 	}
 
-	ret = avc_add_callback(sel_netport_avc_callback, AVC_CALLBACK_RESET);
-	if (ret != 0)
-		panic("avc_add_callback() failed, error %d\n", ret);
-
 	return ret;
 }
 
-- 
cgit v1.2.3


From 4da6daf4d3df5a977e4623963f141a627fd2efce Mon Sep 17 00:00:00 2001
From: Paul Moore <pmoore@redhat.com>
Date: Thu, 10 Jul 2014 10:17:48 -0400
Subject: selinux: fix the default socket labeling in sock_graft()

The sock_graft() hook has special handling for AF_INET, AF_INET, and
AF_UNIX sockets as those address families have special hooks which
label the sock before it is attached its associated socket.
Unfortunately, the sock_graft() hook was missing a default approach
to labeling sockets which meant that any other address family which
made use of connections or the accept() syscall would find the
returned socket to be in an "unlabeled" state.  This was recently
demonstrated by the kcrypto/AF_ALG subsystem and the newly released
cryptsetup package (cryptsetup v1.6.5 and later).

This patch preserves the special handling in selinux_sock_graft(),
but adds a default behavior - setting the sock's label equal to the
associated socket - which resolves the problem with AF_ALG and
presumably any other address family which makes use of accept().

Cc: stable@vger.kernel.org
Signed-off-by: Paul Moore <pmoore@redhat.com>
Tested-by: Milan Broz <gmazyland@gmail.com>
---
 include/linux/security.h |  5 ++++-
 security/selinux/hooks.c | 13 +++++++++++--
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/include/linux/security.h b/include/linux/security.h
index 6478ce3252c7..794be735ff4b 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -987,7 +987,10 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	Retrieve the LSM-specific secid for the sock to enable caching of network
  *	authorizations.
  * @sock_graft:
- *	Sets the socket's isec sid to the sock's sid.
+ *	This hook is called in response to a newly created sock struct being
+ *	grafted onto an existing socket and allows the security module to
+ *	perform whatever security attribute management is necessary for both
+ *	the sock and socket.
  * @inet_conn_request:
  *	Sets the openreq's sid to socket's sid with MLS portion taken from peer sid.
  * @inet_csk_clone:
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 336f0a04450e..b3a6754e932b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4499,9 +4499,18 @@ static void selinux_sock_graft(struct sock *sk, struct socket *parent)
 	struct inode_security_struct *isec = SOCK_INODE(parent)->i_security;
 	struct sk_security_struct *sksec = sk->sk_security;
 
-	if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6 ||
-	    sk->sk_family == PF_UNIX)
+	switch (sk->sk_family) {
+	case PF_INET:
+	case PF_INET6:
+	case PF_UNIX:
 		isec->sid = sksec->sid;
+		break;
+	default:
+		/* by default there is no special labeling mechanism for the
+		 * sksec label so inherit the label from the parent socket */
+		BUG_ON(sksec->sid != SECINITSID_UNLABELED);
+		sksec->sid = isec->sid;
+	}
 	sksec->sclass = isec->sclass;
 }
 
-- 
cgit v1.2.3


From 2c50b964823ebb7f0a098878c399ce859cd38e9e Mon Sep 17 00:00:00 2001
From: Dmitry Kasatkin <d.kasatkin@samsung.com>
Date: Fri, 13 Jun 2014 18:55:47 +0300
Subject: ima: remove unnecessary i_mutex locking from
 ima_rdwr_violation_check()

Before 2.6.39 inode->i_readcount was maintained by IMA. It was not atomic
and protected using spinlock. For 2.6.39, i_readcount was converted to
atomic and maintaining was moved VFS layer. Spinlock for some unclear
reason was replaced by i_mutex.

After analyzing the code, we came to conclusion that i_mutex locking is
unnecessary, especially when an IMA policy has not been defined.

This patch removes i_mutex locking from ima_rdwr_violation_check().

Signed-off-by: Dmitry Kasatkin <d.kasatkin@samsung.com>
Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
---
 security/integrity/ima/ima_main.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 09baa335ebc7..cf1c3696c72e 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -88,8 +88,6 @@ static void ima_rdwr_violation_check(struct file *file)
 	if (!S_ISREG(inode->i_mode) || !ima_initialized)
 		return;
 
-	mutex_lock(&inode->i_mutex);	/* file metadata: permissions, xattr */
-
 	if (mode & FMODE_WRITE) {
 		if (atomic_read(&inode->i_readcount) && IS_IMA(inode)) {
 			struct integrity_iint_cache *iint;
@@ -104,8 +102,6 @@ static void ima_rdwr_violation_check(struct file *file)
 			send_writers = true;
 	}
 
-	mutex_unlock(&inode->i_mutex);
-
 	if (!send_tomtou && !send_writers)
 		return;
 
-- 
cgit v1.2.3


From 209b43ca64a6f2b0c7ac66b457f530c52d608c3e Mon Sep 17 00:00:00 2001
From: Dmitry Kasatkin <d.kasatkin@samsung.com>
Date: Fri, 13 Jun 2014 18:55:48 +0300
Subject: ima: delay template descriptor lookup until use

process_measurement() always calls ima_template_desc_current(),
including when an IMA policy has not been defined.

This patch delays template descriptor lookup until action is
determined.

Signed-off-by: Dmitry Kasatkin <d.kasatkin@samsung.com>
Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
---
 security/integrity/ima/ima_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index cf1c3696c72e..f474c608fa11 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -159,7 +159,7 @@ static int process_measurement(struct file *file, const char *filename,
 {
 	struct inode *inode = file_inode(file);
 	struct integrity_iint_cache *iint;
-	struct ima_template_desc *template_desc = ima_template_desc_current();
+	struct ima_template_desc *template_desc;
 	char *pathbuf = NULL;
 	const char *pathname = NULL;
 	int rc = -ENOMEM, action, must_appraise, _func;
@@ -203,6 +203,7 @@ static int process_measurement(struct file *file, const char *filename,
 		goto out_digsig;
 	}
 
+	template_desc = ima_template_desc_current();
 	if (strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) == 0) {
 		if (action & IMA_APPRAISE_SUBMASK)
 			xattr_ptr = &xattr_value;
-- 
cgit v1.2.3


From 7e9001f663636116fdc2ea7978f0350849ced624 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Mon, 16 Jun 2014 15:52:07 -0400
Subject: audit: fix dangling keywords in integrity ima message output

Replace spaces in op keyword labels in log output since userspace audit tools
can't parse orphaned keywords.

Reported-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
---
 security/integrity/ima/ima_appraise.c | 2 +-
 security/integrity/ima/ima_policy.c   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index d3113d4aaa3c..59ac90275070 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -214,7 +214,7 @@ int ima_appraise_measurement(int func, struct integrity_iint_cache *iint,
 		hash_start = 1;
 	case IMA_XATTR_DIGEST:
 		if (iint->flags & IMA_DIGSIG_REQUIRED) {
-			cause = "IMA signature required";
+			cause = "IMA-signature-required";
 			status = INTEGRITY_FAIL;
 			break;
 		}
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 40a7488f6721..cea84d8bd7be 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -332,7 +332,7 @@ void __init ima_init_policy(void)
 void ima_update_policy(void)
 {
 	static const char op[] = "policy_update";
-	const char *cause = "already exists";
+	const char *cause = "already-exists";
 	int result = 1;
 	int audit_info = 0;
 
@@ -659,7 +659,7 @@ ssize_t ima_parse_add_rule(char *rule)
 	/* Prevent installed policy from changing */
 	if (ima_rules != &ima_default_rules) {
 		integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL,
-				    NULL, op, "already exists",
+				    NULL, op, "already-exists",
 				    -EACCES, audit_info);
 		return -EACCES;
 	}
@@ -685,7 +685,7 @@ ssize_t ima_parse_add_rule(char *rule)
 	if (result) {
 		kfree(entry);
 		integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL,
-				    NULL, op, "invalid policy", result,
+				    NULL, op, "invalid-policy", result,
 				    audit_info);
 		return result;
 	}
-- 
cgit v1.2.3


From 3bcced39ea7d1b0da0a991e221f53de480c6b60b Mon Sep 17 00:00:00 2001
From: Dmitry Kasatkin <d.kasatkin@samsung.com>
Date: Wed, 26 Feb 2014 17:05:20 +0200
Subject: ima: use ahash API for file hash calculation

Async hash API allows the use of HW acceleration for hash calculation.
It may give significant performance gain and/or reduce power consumption,
which might be very beneficial for battery powered devices.

This patch introduces hash calculation using ahash API. ahash performance
depends on the data size and the particular HW. Depending on the specific
system, shash performance may be better.

This patch defines 'ahash_minsize' module parameter, which is used to
define the minimal file size to use with ahash.  If this minimum file size
is not set or the file is smaller than defined by the parameter, shash will
be used.

Changes in v3:
- kernel parameter replaced with module parameter
- pr_crit replaced with pr_crit_ratelimited
- more comment changes - Mimi

Changes in v2:
- ima_ahash_size became as ima_ahash
- ahash pre-allocation moved out from __init code to be able to use
  ahash crypto modules. Ahash allocated once on the first use.
- hash calculation falls back to shash if ahash allocation/calculation fails
- complex initialization separated from variable declaration
- improved comments

Signed-off-by: Dmitry Kasatkin <d.kasatkin@samsung.com>
Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
---
 Documentation/kernel-parameters.txt |   9 ++
 security/integrity/ima/ima_crypto.c | 187 +++++++++++++++++++++++++++++++++++-
 2 files changed, 192 insertions(+), 4 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c1b9aa8c5a52..775fe039e060 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1313,6 +1313,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			Formats: { "ima" | "ima-ng" }
 			Default: "ima-ng"
 
+	ima.ahash_minsize= [IMA] Minimum file size for asynchronous hash usage
+			Format: <min_file_size>
+			Set the minimal file size for using asynchronous hash.
+			If left unspecified, ahash usage is disabled.
+
+			ahash performance varies for different data sizes on
+			different crypto accelerators. This option can be used
+			to achieve the best performance for a particular HW.
+
 	init=		[KNL]
 			Format: <full_path>
 			Run specified binary instead of /sbin/init as init
diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index ccd0ac8fa9a0..b9e5120559d4 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -16,6 +16,8 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/ratelimit.h>
 #include <linux/file.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
@@ -25,7 +27,18 @@
 #include <crypto/hash_info.h>
 #include "ima.h"
 
+struct ahash_completion {
+	struct completion completion;
+	int err;
+};
+
+/* minimum file size for ahash use */
+static unsigned long ima_ahash_minsize;
+module_param_named(ahash_minsize, ima_ahash_minsize, ulong, 0644);
+MODULE_PARM_DESC(ahash_minsize, "Minimum file size for ahash use");
+
 static struct crypto_shash *ima_shash_tfm;
+static struct crypto_ahash *ima_ahash_tfm;
 
 /**
  * ima_kernel_read - read file content
@@ -93,9 +106,146 @@ static void ima_free_tfm(struct crypto_shash *tfm)
 		crypto_free_shash(tfm);
 }
 
-/*
- * Calculate the MD5/SHA1 file digest
- */
+static struct crypto_ahash *ima_alloc_atfm(enum hash_algo algo)
+{
+	struct crypto_ahash *tfm = ima_ahash_tfm;
+	int rc;
+
+	if ((algo != ima_hash_algo && algo < HASH_ALGO__LAST) || !tfm) {
+		tfm = crypto_alloc_ahash(hash_algo_name[algo], 0, 0);
+		if (!IS_ERR(tfm)) {
+			if (algo == ima_hash_algo)
+				ima_ahash_tfm = tfm;
+		} else {
+			rc = PTR_ERR(tfm);
+			pr_err("Can not allocate %s (reason: %d)\n",
+			       hash_algo_name[algo], rc);
+		}
+	}
+	return tfm;
+}
+
+static void ima_free_atfm(struct crypto_ahash *tfm)
+{
+	if (tfm != ima_ahash_tfm)
+		crypto_free_ahash(tfm);
+}
+
+static void ahash_complete(struct crypto_async_request *req, int err)
+{
+	struct ahash_completion *res = req->data;
+
+	if (err == -EINPROGRESS)
+		return;
+	res->err = err;
+	complete(&res->completion);
+}
+
+static int ahash_wait(int err, struct ahash_completion *res)
+{
+	switch (err) {
+	case 0:
+		break;
+	case -EINPROGRESS:
+	case -EBUSY:
+		wait_for_completion(&res->completion);
+		reinit_completion(&res->completion);
+		err = res->err;
+		/* fall through */
+	default:
+		pr_crit_ratelimited("ahash calculation failed: err: %d\n", err);
+	}
+
+	return err;
+}
+
+static int ima_calc_file_hash_atfm(struct file *file,
+				   struct ima_digest_data *hash,
+				   struct crypto_ahash *tfm)
+{
+	loff_t i_size, offset;
+	char *rbuf;
+	int rc, read = 0, rbuf_len;
+	struct ahash_request *req;
+	struct scatterlist sg[1];
+	struct ahash_completion res;
+
+	hash->length = crypto_ahash_digestsize(tfm);
+
+	req = ahash_request_alloc(tfm, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	init_completion(&res.completion);
+	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
+				   CRYPTO_TFM_REQ_MAY_SLEEP,
+				   ahash_complete, &res);
+
+	rc = ahash_wait(crypto_ahash_init(req), &res);
+	if (rc)
+		goto out1;
+
+	i_size = i_size_read(file_inode(file));
+
+	if (i_size == 0)
+		goto out2;
+
+	rbuf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!rbuf) {
+		rc = -ENOMEM;
+		goto out1;
+	}
+
+	if (!(file->f_mode & FMODE_READ)) {
+		file->f_mode |= FMODE_READ;
+		read = 1;
+	}
+
+	for (offset = 0; offset < i_size; offset += rbuf_len) {
+		rbuf_len = ima_kernel_read(file, offset, rbuf, PAGE_SIZE);
+		if (rbuf_len < 0) {
+			rc = rbuf_len;
+			break;
+		}
+		if (rbuf_len == 0)
+			break;
+
+		sg_init_one(&sg[0], rbuf, rbuf_len);
+		ahash_request_set_crypt(req, sg, NULL, rbuf_len);
+
+		rc = ahash_wait(crypto_ahash_update(req), &res);
+		if (rc)
+			break;
+	}
+	if (read)
+		file->f_mode &= ~FMODE_READ;
+	kfree(rbuf);
+out2:
+	if (!rc) {
+		ahash_request_set_crypt(req, NULL, hash->digest, 0);
+		rc = ahash_wait(crypto_ahash_final(req), &res);
+	}
+out1:
+	ahash_request_free(req);
+	return rc;
+}
+
+static int ima_calc_file_ahash(struct file *file, struct ima_digest_data *hash)
+{
+	struct crypto_ahash *tfm;
+	int rc;
+
+	tfm = ima_alloc_atfm(hash->algo);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	rc = ima_calc_file_hash_atfm(file, hash, tfm);
+
+	ima_free_atfm(tfm);
+
+	return rc;
+}
+
 static int ima_calc_file_hash_tfm(struct file *file,
 				  struct ima_digest_data *hash,
 				  struct crypto_shash *tfm)
@@ -156,7 +306,7 @@ out:
 	return rc;
 }
 
-int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash)
+static int ima_calc_file_shash(struct file *file, struct ima_digest_data *hash)
 {
 	struct crypto_shash *tfm;
 	int rc;
@@ -172,6 +322,35 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash)
 	return rc;
 }
 
+/*
+ * ima_calc_file_hash - calculate file hash
+ *
+ * Asynchronous hash (ahash) allows using HW acceleration for calculating
+ * a hash. ahash performance varies for different data sizes on different
+ * crypto accelerators. shash performance might be better for smaller files.
+ * The 'ima.ahash_minsize' module parameter allows specifying the best
+ * minimum file size for using ahash on the system.
+ *
+ * If the ima.ahash_minsize parameter is not specified, this function uses
+ * shash for the hash calculation.  If ahash fails, it falls back to using
+ * shash.
+ */
+int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash)
+{
+	loff_t i_size;
+	int rc;
+
+	i_size = i_size_read(file_inode(file));
+
+	if (ima_ahash_minsize && i_size >= ima_ahash_minsize) {
+		rc = ima_calc_file_ahash(file, hash);
+		if (!rc)
+			return 0;
+	}
+
+	return ima_calc_file_shash(file, hash);
+}
+
 /*
  * Calculate the hash of template data
  */
-- 
cgit v1.2.3


From 6edf7a89260859c5e72861dc4e6e169495f076c8 Mon Sep 17 00:00:00 2001
From: Dmitry Kasatkin <d.kasatkin@samsung.com>
Date: Tue, 6 May 2014 14:47:13 +0300
Subject: ima: introduce multi-page collect buffers

Use of multiple-page collect buffers reduces:
1) the number of block IO requests
2) the number of asynchronous hash update requests

Second is important for HW accelerated hashing, because significant
amount of time is spent for preparation of hash update operation,
which includes configuring acceleration HW, DMA engine, etc...
Thus, HW accelerators are more efficient when working on large
chunks of data.

This patch introduces usage of multi-page collect buffers. Buffer size
can be specified using 'ahash_bufsize' module parameter. Default buffer
size is 4096 bytes.

Changes in v3:
- kernel parameter replaced with module parameter

Signed-off-by: Dmitry Kasatkin <d.kasatkin@samsung.com>
Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
---
 Documentation/kernel-parameters.txt |  8 +++
 security/integrity/ima/ima_crypto.c | 98 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 775fe039e060..8b2ab548b6e4 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1322,6 +1322,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			different crypto accelerators. This option can be used
 			to achieve the best performance for a particular HW.
 
+	ima.ahash_bufsize= [IMA] Asynchronous hash buffer size
+			Format: <bufsize>
+			Set hashing buffer size. Default: 4k.
+
+			ahash performance varies for different chunk sizes on
+			different crypto accelerators. This option can be used
+			to achieve best performance for particular HW.
+
 	init=		[KNL]
 			Format: <full_path>
 			Run specified binary instead of /sbin/init as init
diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index b9e5120559d4..84b938f86bea 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -37,6 +37,33 @@ static unsigned long ima_ahash_minsize;
 module_param_named(ahash_minsize, ima_ahash_minsize, ulong, 0644);
 MODULE_PARM_DESC(ahash_minsize, "Minimum file size for ahash use");
 
+/* default is 0 - 1 page. */
+static int ima_maxorder;
+static unsigned int ima_bufsize = PAGE_SIZE;
+
+static int param_set_bufsize(const char *val, const struct kernel_param *kp)
+{
+	unsigned long long size;
+	int order;
+
+	size = memparse(val, NULL);
+	order = get_order(size);
+	if (order >= MAX_ORDER)
+		return -EINVAL;
+	ima_maxorder = order;
+	ima_bufsize = PAGE_SIZE << order;
+	return 0;
+}
+
+static struct kernel_param_ops param_ops_bufsize = {
+	.set = param_set_bufsize,
+	.get = param_get_uint,
+};
+#define param_check_bufsize(name, p) __param_check(name, p, unsigned int)
+
+module_param_named(ahash_bufsize, ima_bufsize, bufsize, 0644);
+MODULE_PARM_DESC(ahash_bufsize, "Maximum ahash buffer size");
+
 static struct crypto_shash *ima_shash_tfm;
 static struct crypto_ahash *ima_ahash_tfm;
 
@@ -106,6 +133,68 @@ static void ima_free_tfm(struct crypto_shash *tfm)
 		crypto_free_shash(tfm);
 }
 
+/**
+ * ima_alloc_pages() - Allocate contiguous pages.
+ * @max_size:       Maximum amount of memory to allocate.
+ * @allocated_size: Returned size of actual allocation.
+ * @last_warn:      Should the min_size allocation warn or not.
+ *
+ * Tries to do opportunistic allocation for memory first trying to allocate
+ * max_size amount of memory and then splitting that until zero order is
+ * reached. Allocation is tried without generating allocation warnings unless
+ * last_warn is set. Last_warn set affects only last allocation of zero order.
+ *
+ * By default, ima_maxorder is 0 and it is equivalent to kmalloc(GFP_KERNEL)
+ *
+ * Return pointer to allocated memory, or NULL on failure.
+ */
+static void *ima_alloc_pages(loff_t max_size, size_t *allocated_size,
+			     int last_warn)
+{
+	void *ptr;
+	int order = ima_maxorder;
+	gfp_t gfp_mask = __GFP_WAIT | __GFP_NOWARN | __GFP_NORETRY;
+
+	if (order)
+		order = min(get_order(max_size), order);
+
+	for (; order; order--) {
+		ptr = (void *)__get_free_pages(gfp_mask, order);
+		if (ptr) {
+			*allocated_size = PAGE_SIZE << order;
+			return ptr;
+		}
+	}
+
+	/* order is zero - one page */
+
+	gfp_mask = GFP_KERNEL;
+
+	if (!last_warn)
+		gfp_mask |= __GFP_NOWARN;
+
+	ptr = (void *)__get_free_pages(gfp_mask, 0);
+	if (ptr) {
+		*allocated_size = PAGE_SIZE;
+		return ptr;
+	}
+
+	*allocated_size = 0;
+	return NULL;
+}
+
+/**
+ * ima_free_pages() - Free pages allocated by ima_alloc_pages().
+ * @ptr:  Pointer to allocated pages.
+ * @size: Size of allocated buffer.
+ */
+static void ima_free_pages(void *ptr, size_t size)
+{
+	if (!ptr)
+		return;
+	free_pages((unsigned long)ptr, get_order(size));
+}
+
 static struct crypto_ahash *ima_alloc_atfm(enum hash_algo algo)
 {
 	struct crypto_ahash *tfm = ima_ahash_tfm;
@@ -169,6 +258,7 @@ static int ima_calc_file_hash_atfm(struct file *file,
 	struct ahash_request *req;
 	struct scatterlist sg[1];
 	struct ahash_completion res;
+	size_t rbuf_size;
 
 	hash->length = crypto_ahash_digestsize(tfm);
 
@@ -190,7 +280,11 @@ static int ima_calc_file_hash_atfm(struct file *file,
 	if (i_size == 0)
 		goto out2;
 
-	rbuf = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	/*
+	 * Try to allocate maximum size of memory.
+	 * Fail if even a single page cannot be allocated.
+	 */
+	rbuf = ima_alloc_pages(i_size, &rbuf_size, 1);
 	if (!rbuf) {
 		rc = -ENOMEM;
 		goto out1;
@@ -219,7 +313,7 @@ static int ima_calc_file_hash_atfm(struct file *file,
 	}
 	if (read)
 		file->f_mode &= ~FMODE_READ;
-	kfree(rbuf);
+	ima_free_pages(rbuf, rbuf_size);
 out2:
 	if (!rc) {
 		ahash_request_set_crypt(req, NULL, hash->digest, 0);
-- 
cgit v1.2.3


From 32c2e6752ff0f48fe03b9e1c7c64bde580a840d2 Mon Sep 17 00:00:00 2001
From: Dmitry Kasatkin <d.kasatkin@samsung.com>
Date: Tue, 6 May 2014 14:54:27 +0300
Subject: ima: provide double buffering for hash calculation

The asynchronous hash API allows initiating a hash calculation and
then performing other tasks, while waiting for the hash calculation
to complete.

This patch introduces usage of double buffering for simultaneous
hashing and reading of the next chunk of data from storage.

Changes in v3:
- better comments

Signed-off-by: Dmitry Kasatkin <d.kasatkin@samsung.com>
Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
---
 security/integrity/ima/ima_crypto.c | 65 ++++++++++++++++++++++++++++---------
 1 file changed, 49 insertions(+), 16 deletions(-)

diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index 84b938f86bea..0bd732843fe7 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -253,12 +253,12 @@ static int ima_calc_file_hash_atfm(struct file *file,
 				   struct crypto_ahash *tfm)
 {
 	loff_t i_size, offset;
-	char *rbuf;
-	int rc, read = 0, rbuf_len;
+	char *rbuf[2] = { NULL, };
+	int rc, read = 0, rbuf_len, active = 0, ahash_rc = 0;
 	struct ahash_request *req;
 	struct scatterlist sg[1];
 	struct ahash_completion res;
-	size_t rbuf_size;
+	size_t rbuf_size[2];
 
 	hash->length = crypto_ahash_digestsize(tfm);
 
@@ -284,36 +284,69 @@ static int ima_calc_file_hash_atfm(struct file *file,
 	 * Try to allocate maximum size of memory.
 	 * Fail if even a single page cannot be allocated.
 	 */
-	rbuf = ima_alloc_pages(i_size, &rbuf_size, 1);
-	if (!rbuf) {
+	rbuf[0] = ima_alloc_pages(i_size, &rbuf_size[0], 1);
+	if (!rbuf[0]) {
 		rc = -ENOMEM;
 		goto out1;
 	}
 
+	/* Only allocate one buffer if that is enough. */
+	if (i_size > rbuf_size[0]) {
+		/*
+		 * Try to allocate secondary buffer. If that fails fallback to
+		 * using single buffering. Use previous memory allocation size
+		 * as baseline for possible allocation size.
+		 */
+		rbuf[1] = ima_alloc_pages(i_size - rbuf_size[0],
+					  &rbuf_size[1], 0);
+	}
+
 	if (!(file->f_mode & FMODE_READ)) {
 		file->f_mode |= FMODE_READ;
 		read = 1;
 	}
 
 	for (offset = 0; offset < i_size; offset += rbuf_len) {
-		rbuf_len = ima_kernel_read(file, offset, rbuf, PAGE_SIZE);
-		if (rbuf_len < 0) {
-			rc = rbuf_len;
-			break;
+		if (!rbuf[1] && offset) {
+			/* Not using two buffers, and it is not the first
+			 * read/request, wait for the completion of the
+			 * previous ahash_update() request.
+			 */
+			rc = ahash_wait(ahash_rc, &res);
+			if (rc)
+				goto out3;
+		}
+		/* read buffer */
+		rbuf_len = min_t(loff_t, i_size - offset, rbuf_size[active]);
+		rc = ima_kernel_read(file, offset, rbuf[active], rbuf_len);
+		if (rc != rbuf_len)
+			goto out3;
+
+		if (rbuf[1] && offset) {
+			/* Using two buffers, and it is not the first
+			 * read/request, wait for the completion of the
+			 * previous ahash_update() request.
+			 */
+			rc = ahash_wait(ahash_rc, &res);
+			if (rc)
+				goto out3;
 		}
-		if (rbuf_len == 0)
-			break;
 
-		sg_init_one(&sg[0], rbuf, rbuf_len);
+		sg_init_one(&sg[0], rbuf[active], rbuf_len);
 		ahash_request_set_crypt(req, sg, NULL, rbuf_len);
 
-		rc = ahash_wait(crypto_ahash_update(req), &res);
-		if (rc)
-			break;
+		ahash_rc = crypto_ahash_update(req);
+
+		if (rbuf[1])
+			active = !active; /* swap buffers, if we use two */
 	}
+	/* wait for the last update request to complete */
+	rc = ahash_wait(ahash_rc, &res);
+out3:
 	if (read)
 		file->f_mode &= ~FMODE_READ;
-	ima_free_pages(rbuf, rbuf_size);
+	ima_free_pages(rbuf[0], rbuf_size[0]);
+	ima_free_pages(rbuf[1], rbuf_size[1]);
 out2:
 	if (!rc) {
 		ahash_request_set_crypt(req, NULL, hash->digest, 0);
-- 
cgit v1.2.3


From c04f9d61caa34fc83e3517e3092874c9607c19c3 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 18 Jul 2014 11:28:33 -0700
Subject: MAINTAINERS: create seccomp entry

Add myself as seccomp maintainer.

Suggested-by: James Morris <jmorris@namei.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 MAINTAINERS | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index e31c87474739..55762cba8516 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7953,6 +7953,16 @@ S:	Maintained
 F:	drivers/mmc/host/sdhci.*
 F:	drivers/mmc/host/sdhci-pltfm.[ch]
 
+SECURE COMPUTING
+M:	Kees Cook <keescook@chromium.org>
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git seccomp
+S:	Supported
+F:	kernel/seccomp.c
+F:	include/uapi/linux/seccomp.h
+F:	include/linux/seccomp.h
+K:	\bsecure_computing
+K:	\bTIF_SECCOMP\b
+
 SECURE DIGITAL HOST CONTROLLER INTERFACE, OPEN FIRMWARE BINDINGS (SDHCI-OF)
 M:	Anton Vorontsov <anton@enomsg.org>
 L:	linuxppc-dev@lists.ozlabs.org
-- 
cgit v1.2.3


From d78ab02c2c194257a03355fbb79eb721b381d105 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 21 May 2014 15:02:11 -0700
Subject: seccomp: create internal mode-setting function

In preparation for having other callers of the seccomp mode setting
logic, split the prctl entry point away from the core logic that performs
seccomp mode setting.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
---
 kernel/seccomp.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 301bbc24739c..afb916c7e890 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -473,7 +473,7 @@ long prctl_get_seccomp(void)
 }
 
 /**
- * prctl_set_seccomp: configures current->seccomp.mode
+ * seccomp_set_mode: internal function for setting seccomp mode
  * @seccomp_mode: requested mode to use
  * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
  *
@@ -486,7 +486,7 @@ long prctl_get_seccomp(void)
  *
  * Returns 0 on success or -EINVAL on failure.
  */
-long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
 {
 	long ret = -EINVAL;
 
@@ -517,3 +517,15 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
 out:
 	return ret;
 }
+
+/**
+ * prctl_set_seccomp: configures current->seccomp.mode
+ * @seccomp_mode: requested mode to use
+ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+{
+	return seccomp_set_mode(seccomp_mode, filter);
+}
-- 
cgit v1.2.3


From 1f41b450416e689b9b7c8bfb750a98604f687a9b Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 25 Jun 2014 15:38:02 -0700
Subject: seccomp: extract check/assign mode helpers

To support splitting mode 1 from mode 2, extract the mode checking and
assignment logic into common functions.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
---
 kernel/seccomp.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index afb916c7e890..9df7def86c3b 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -194,7 +194,23 @@ static u32 seccomp_run_filters(int syscall)
 	}
 	return ret;
 }
+#endif /* CONFIG_SECCOMP_FILTER */
 
+static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
+{
+	if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
+		return false;
+
+	return true;
+}
+
+static inline void seccomp_assign_mode(unsigned long seccomp_mode)
+{
+	current->seccomp.mode = seccomp_mode;
+	set_tsk_thread_flag(current, TIF_SECCOMP);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
 /**
  * seccomp_attach_filter: Attaches a seccomp filter to current.
  * @fprog: BPF program to install
@@ -490,8 +506,7 @@ static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
 {
 	long ret = -EINVAL;
 
-	if (current->seccomp.mode &&
-	    current->seccomp.mode != seccomp_mode)
+	if (!seccomp_may_assign_mode(seccomp_mode))
 		goto out;
 
 	switch (seccomp_mode) {
@@ -512,8 +527,7 @@ static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
 		goto out;
 	}
 
-	current->seccomp.mode = seccomp_mode;
-	set_thread_flag(TIF_SECCOMP);
+	seccomp_assign_mode(seccomp_mode);
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From 3b23dd12846215eff4afb073366b80c0c4d7543e Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 25 Jun 2014 15:55:25 -0700
Subject: seccomp: split mode setting routines

Separates the two mode setting paths to make things more readable with
fewer #ifdefs within function bodies.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
---
 kernel/seccomp.c | 71 ++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 48 insertions(+), 23 deletions(-)

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 9df7def86c3b..05cac2c2eca1 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -489,48 +489,66 @@ long prctl_get_seccomp(void)
 }
 
 /**
- * seccomp_set_mode: internal function for setting seccomp mode
- * @seccomp_mode: requested mode to use
- * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
- *
- * This function may be called repeatedly with a @seccomp_mode of
- * SECCOMP_MODE_FILTER to install additional filters.  Every filter
- * successfully installed will be evaluated (in reverse order) for each system
- * call the task makes.
+ * seccomp_set_mode_strict: internal function for setting strict seccomp
  *
  * Once current->seccomp.mode is non-zero, it may not be changed.
  *
  * Returns 0 on success or -EINVAL on failure.
  */
-static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
+static long seccomp_set_mode_strict(void)
 {
+	const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
 	long ret = -EINVAL;
 
 	if (!seccomp_may_assign_mode(seccomp_mode))
 		goto out;
 
-	switch (seccomp_mode) {
-	case SECCOMP_MODE_STRICT:
-		ret = 0;
 #ifdef TIF_NOTSC
-		disable_TSC();
+	disable_TSC();
 #endif
-		break;
+	seccomp_assign_mode(seccomp_mode);
+	ret = 0;
+
+out:
+
+	return ret;
+}
+
 #ifdef CONFIG_SECCOMP_FILTER
-	case SECCOMP_MODE_FILTER:
-		ret = seccomp_attach_user_filter(filter);
-		if (ret)
-			goto out;
-		break;
-#endif
-	default:
+/**
+ * seccomp_set_mode_filter: internal function for setting seccomp filter
+ * @filter: struct sock_fprog containing filter
+ *
+ * This function may be called repeatedly to install additional filters.
+ * Every filter successfully installed will be evaluated (in reverse order)
+ * for each system call the task makes.
+ *
+ * Once current->seccomp.mode is non-zero, it may not be changed.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+static long seccomp_set_mode_filter(char __user *filter)
+{
+	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
+	long ret = -EINVAL;
+
+	if (!seccomp_may_assign_mode(seccomp_mode))
+		goto out;
+
+	ret = seccomp_attach_user_filter(filter);
+	if (ret)
 		goto out;
-	}
 
 	seccomp_assign_mode(seccomp_mode);
 out:
 	return ret;
 }
+#else
+static inline long seccomp_set_mode_filter(char __user *filter)
+{
+	return -EINVAL;
+}
+#endif
 
 /**
  * prctl_set_seccomp: configures current->seccomp.mode
@@ -541,5 +559,12 @@ out:
  */
 long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
 {
-	return seccomp_set_mode(seccomp_mode, filter);
+	switch (seccomp_mode) {
+	case SECCOMP_MODE_STRICT:
+		return seccomp_set_mode_strict();
+	case SECCOMP_MODE_FILTER:
+		return seccomp_set_mode_filter(filter);
+	default:
+		return -EINVAL;
+	}
 }
-- 
cgit v1.2.3


From 48dc92b9fc3926844257316e75ba11eb5c742b2c Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 25 Jun 2014 16:08:24 -0700
Subject: seccomp: add "seccomp" syscall

This adds the new "seccomp" syscall with both an "operation" and "flags"
parameter for future expansion. The third argument is a pointer value,
used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

In addition to the TSYNC flag later in this patch series, there is a
non-zero chance that this syscall could be used for configuring a fixed
argument area for seccomp-tracer-aware processes to pass syscall arguments
in the future. Hence, the use of "seccomp" not simply "seccomp_add_filter"
for this syscall. Additionally, this syscall uses operation, flags,
and user pointer for arguments because strictly passing arguments via
a user pointer would mean seccomp itself would be unable to trivially
filter the seccomp syscall itself.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
---
 arch/Kconfig                      |  1 +
 arch/x86/syscalls/syscall_32.tbl  |  1 +
 arch/x86/syscalls/syscall_64.tbl  |  1 +
 include/linux/syscalls.h          |  2 ++
 include/uapi/asm-generic/unistd.h |  4 ++-
 include/uapi/linux/seccomp.h      |  4 +++
 kernel/seccomp.c                  | 55 +++++++++++++++++++++++++++++++++++----
 kernel/sys_ni.c                   |  3 +++
 8 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 97ff872c7acc..0eae9df35b88 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -321,6 +321,7 @@ config HAVE_ARCH_SECCOMP_FILTER
 	  - secure_computing is called from a ptrace_event()-safe context
 	  - secure_computing return value is checked and a return value of -1
 	    results in the system call being skipped immediately.
+	  - seccomp syscall wired up
 
 config SECCOMP_FILTER
 	def_bool y
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index d6b867921612..7527eac24122 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -360,3 +360,4 @@
 351	i386	sched_setattr		sys_sched_setattr
 352	i386	sched_getattr		sys_sched_getattr
 353	i386	renameat2		sys_renameat2
+354	i386	seccomp			sys_seccomp
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index ec255a1646d2..16272a6c12b7 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -323,6 +323,7 @@
 314	common	sched_setattr		sys_sched_setattr
 315	common	sched_getattr		sys_sched_getattr
 316	common	renameat2		sys_renameat2
+317	common	seccomp			sys_seccomp
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b0881a0ed322..1713977ee26f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
 			 unsigned long idx1, unsigned long idx2);
 asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
+asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
+			    const char __user *uargs);
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 333640608087..65acbf0e2867 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -699,9 +699,11 @@ __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
 __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
 #define __NR_renameat2 276
 __SYSCALL(__NR_renameat2, sys_renameat2)
+#define __NR_seccomp 277
+__SYSCALL(__NR_seccomp, sys_seccomp)
 
 #undef __NR_syscalls
-#define __NR_syscalls 277
+#define __NR_syscalls 278
 
 /*
  * All syscalls below here should go away really,
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index ac2dc9f72973..b258878ba754 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -10,6 +10,10 @@
 #define SECCOMP_MODE_STRICT	1 /* uses hard-coded filter. */
 #define SECCOMP_MODE_FILTER	2 /* uses user-supplied filter. */
 
+/* Valid operations for seccomp syscall. */
+#define SECCOMP_SET_MODE_STRICT	0
+#define SECCOMP_SET_MODE_FILTER	1
+
 /*
  * All BPF programs must return a 32-bit value.
  * The bottom 16-bits are for optional return data.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 05cac2c2eca1..f0652578af75 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -18,6 +18,7 @@
 #include <linux/compat.h>
 #include <linux/sched.h>
 #include <linux/seccomp.h>
+#include <linux/syscalls.h>
 
 /* #define SECCOMP_DEBUG 1 */
 
@@ -314,7 +315,7 @@ free_prog:
  *
  * Returns 0 on success and non-zero otherwise.
  */
-static long seccomp_attach_user_filter(char __user *user_filter)
+static long seccomp_attach_user_filter(const char __user *user_filter)
 {
 	struct sock_fprog fprog;
 	long ret = -EFAULT;
@@ -517,6 +518,7 @@ out:
 #ifdef CONFIG_SECCOMP_FILTER
 /**
  * seccomp_set_mode_filter: internal function for setting seccomp filter
+ * @flags:  flags to change filter behavior
  * @filter: struct sock_fprog containing filter
  *
  * This function may be called repeatedly to install additional filters.
@@ -527,11 +529,16 @@ out:
  *
  * Returns 0 on success or -EINVAL on failure.
  */
-static long seccomp_set_mode_filter(char __user *filter)
+static long seccomp_set_mode_filter(unsigned int flags,
+				    const char __user *filter)
 {
 	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
 	long ret = -EINVAL;
 
+	/* Validate flags. */
+	if (flags != 0)
+		goto out;
+
 	if (!seccomp_may_assign_mode(seccomp_mode))
 		goto out;
 
@@ -544,12 +551,35 @@ out:
 	return ret;
 }
 #else
-static inline long seccomp_set_mode_filter(char __user *filter)
+static inline long seccomp_set_mode_filter(unsigned int flags,
+					   const char __user *filter)
 {
 	return -EINVAL;
 }
 #endif
 
+/* Common entry point for both prctl and syscall. */
+static long do_seccomp(unsigned int op, unsigned int flags,
+		       const char __user *uargs)
+{
+	switch (op) {
+	case SECCOMP_SET_MODE_STRICT:
+		if (flags != 0 || uargs != NULL)
+			return -EINVAL;
+		return seccomp_set_mode_strict();
+	case SECCOMP_SET_MODE_FILTER:
+		return seccomp_set_mode_filter(flags, uargs);
+	default:
+		return -EINVAL;
+	}
+}
+
+SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
+			 const char __user *, uargs)
+{
+	return do_seccomp(op, flags, uargs);
+}
+
 /**
  * prctl_set_seccomp: configures current->seccomp.mode
  * @seccomp_mode: requested mode to use
@@ -559,12 +589,27 @@ static inline long seccomp_set_mode_filter(char __user *filter)
  */
 long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
 {
+	unsigned int op;
+	char __user *uargs;
+
 	switch (seccomp_mode) {
 	case SECCOMP_MODE_STRICT:
-		return seccomp_set_mode_strict();
+		op = SECCOMP_SET_MODE_STRICT;
+		/*
+		 * Setting strict mode through prctl always ignored filter,
+		 * so make sure it is always NULL here to pass the internal
+		 * check in do_seccomp().
+		 */
+		uargs = NULL;
+		break;
 	case SECCOMP_MODE_FILTER:
-		return seccomp_set_mode_filter(filter);
+		op = SECCOMP_SET_MODE_FILTER;
+		uargs = filter;
+		break;
 	default:
 		return -EINVAL;
 	}
+
+	/* prctl interface doesn't have flags, so they are always zero. */
+	return do_seccomp(op, 0, uargs);
 }
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 36441b51b5df..2904a2105914 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -213,3 +213,6 @@ cond_syscall(compat_sys_open_by_handle_at);
 
 /* compare kernel pointers */
 cond_syscall(sys_kcmp);
+
+/* operate on Secure Computing state */
+cond_syscall(sys_seccomp);
-- 
cgit v1.2.3


From 839669714f0a85d677283690e6e164fb698ce206 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 10 Jun 2014 15:40:23 -0700
Subject: ARM: add seccomp syscall

Wires up the new seccomp syscall.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
---
 arch/arm/include/uapi/asm/unistd.h | 1 +
 arch/arm/kernel/calls.S            | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index ba94446c72d9..e21b4a069701 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -409,6 +409,7 @@
 #define __NR_sched_setattr		(__NR_SYSCALL_BASE+380)
 #define __NR_sched_getattr		(__NR_SYSCALL_BASE+381)
 #define __NR_renameat2			(__NR_SYSCALL_BASE+382)
+#define __NR_seccomp			(__NR_SYSCALL_BASE+383)
 
 /*
  * This may need to be greater than __NR_last_syscall+1 in order to
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index 8f51bdcdacbb..bea85f97f363 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -392,6 +392,7 @@
 /* 380 */	CALL(sys_sched_setattr)
 		CALL(sys_sched_getattr)
 		CALL(sys_renameat2)
+		CALL(sys_seccomp)
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
 #define syscalls_counted
-- 
cgit v1.2.3


From 8855d608c145c1ca0e26f4da00741080bb49d80d Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 10 Jun 2014 15:45:09 -0700
Subject: MIPS: add seccomp syscall

Wires up the new seccomp syscall.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
---
 arch/mips/include/uapi/asm/unistd.h | 15 +++++++++------
 arch/mips/kernel/scall32-o32.S      |  1 +
 arch/mips/kernel/scall64-64.S       |  1 +
 arch/mips/kernel/scall64-n32.S      |  1 +
 arch/mips/kernel/scall64-o32.S      |  1 +
 5 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/arch/mips/include/uapi/asm/unistd.h b/arch/mips/include/uapi/asm/unistd.h
index 5805414777e0..9bc13eaf9d67 100644
--- a/arch/mips/include/uapi/asm/unistd.h
+++ b/arch/mips/include/uapi/asm/unistd.h
@@ -372,16 +372,17 @@
 #define __NR_sched_setattr		(__NR_Linux + 349)
 #define __NR_sched_getattr		(__NR_Linux + 350)
 #define __NR_renameat2			(__NR_Linux + 351)
+#define __NR_seccomp			(__NR_Linux + 352)
 
 /*
  * Offset of the last Linux o32 flavoured syscall
  */
-#define __NR_Linux_syscalls		351
+#define __NR_Linux_syscalls		352
 
 #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
 
 #define __NR_O32_Linux			4000
-#define __NR_O32_Linux_syscalls		351
+#define __NR_O32_Linux_syscalls		352
 
 #if _MIPS_SIM == _MIPS_SIM_ABI64
 
@@ -701,16 +702,17 @@
 #define __NR_sched_setattr		(__NR_Linux + 309)
 #define __NR_sched_getattr		(__NR_Linux + 310)
 #define __NR_renameat2			(__NR_Linux + 311)
+#define __NR_seccomp			(__NR_Linux + 312)
 
 /*
  * Offset of the last Linux 64-bit flavoured syscall
  */
-#define __NR_Linux_syscalls		311
+#define __NR_Linux_syscalls		312
 
 #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */
 
 #define __NR_64_Linux			5000
-#define __NR_64_Linux_syscalls		311
+#define __NR_64_Linux_syscalls		312
 
 #if _MIPS_SIM == _MIPS_SIM_NABI32
 
@@ -1034,15 +1036,16 @@
 #define __NR_sched_setattr		(__NR_Linux + 313)
 #define __NR_sched_getattr		(__NR_Linux + 314)
 #define __NR_renameat2			(__NR_Linux + 315)
+#define __NR_seccomp			(__NR_Linux + 316)
 
 /*
  * Offset of the last N32 flavoured syscall
  */
-#define __NR_Linux_syscalls		315
+#define __NR_Linux_syscalls		316
 
 #endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */
 
 #define __NR_N32_Linux			6000
-#define __NR_N32_Linux_syscalls		315
+#define __NR_N32_Linux_syscalls		316
 
 #endif /* _UAPI_ASM_UNISTD_H */
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S
index 3245474f19d5..ab02d14f1b5c 100644
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -578,3 +578,4 @@ EXPORT(sys_call_table)
 	PTR	sys_sched_setattr
 	PTR	sys_sched_getattr		/* 4350 */
 	PTR	sys_renameat2
+	PTR	sys_seccomp
diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S
index be2fedd4ae33..010dccf128ec 100644
--- a/arch/mips/kernel/scall64-64.S
+++ b/arch/mips/kernel/scall64-64.S
@@ -431,4 +431,5 @@ EXPORT(sys_call_table)
 	PTR	sys_sched_setattr
 	PTR	sys_sched_getattr		/* 5310 */
 	PTR	sys_renameat2
+	PTR	sys_seccomp
 	.size	sys_call_table,.-sys_call_table
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index c1dbcda4b816..c3b3b6525df5 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -424,4 +424,5 @@ EXPORT(sysn32_call_table)
 	PTR	sys_sched_setattr
 	PTR	sys_sched_getattr
 	PTR	sys_renameat2			/* 6315 */
+	PTR	sys_seccomp
 	.size	sysn32_call_table,.-sysn32_call_table
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index f1343ccd7ed7..bb1550b1f501 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -557,4 +557,5 @@ EXPORT(sys32_call_table)
 	PTR	sys_sched_setattr
 	PTR	sys_sched_getattr		/* 4350 */
 	PTR	sys_renameat2
+	PTR	sys_seccomp
 	.size	sys32_call_table,.-sys32_call_table
-- 
cgit v1.2.3


From 1d4457f99928a968767f6405b4a1f50845aa15fd Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 21 May 2014 15:23:46 -0700
Subject: sched: move no_new_privs into new atomic flags

Since seccomp transitions between threads requires updates to the
no_new_privs flag to be atomic, the flag must be part of an atomic flag
set. This moves the nnp flag into a separate task field, and introduces
accessors.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
---
 fs/exec.c                  |  4 ++--
 include/linux/sched.h      | 18 +++++++++++++++---
 kernel/seccomp.c           |  2 +-
 kernel/sys.c               |  4 ++--
 security/apparmor/domain.c |  4 ++--
 5 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index a3d33fe592d6..0f5c272410f6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1234,7 +1234,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	 * This isn't strictly necessary, but it makes it harder for LSMs to
 	 * mess up.
 	 */
-	if (current->no_new_privs)
+	if (task_no_new_privs(current))
 		bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
 
 	t = p;
@@ -1272,7 +1272,7 @@ int prepare_binprm(struct linux_binprm *bprm)
 	bprm->cred->egid = current_egid();
 
 	if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
-	    !current->no_new_privs &&
+	    !task_no_new_privs(current) &&
 	    kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
 	    kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
 		/* Set-uid? */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 306f4f0c987a..0fd19055bb64 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1307,13 +1307,12 @@ struct task_struct {
 				 * execve */
 	unsigned in_iowait:1;
 
-	/* task may not gain privileges */
-	unsigned no_new_privs:1;
-
 	/* Revert to default priority/policy when forking */
 	unsigned sched_reset_on_fork:1;
 	unsigned sched_contributes_to_load:1;
 
+	unsigned long atomic_flags; /* Flags needing atomic access. */
+
 	pid_t pid;
 	pid_t tgid;
 
@@ -1967,6 +1966,19 @@ static inline void memalloc_noio_restore(unsigned int flags)
 	current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
 }
 
+/* Per-process atomic flags. */
+#define PFA_NO_NEW_PRIVS 0x00000001	/* May not gain new privileges. */
+
+static inline bool task_no_new_privs(struct task_struct *p)
+{
+	return test_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags);
+}
+
+static inline void task_set_no_new_privs(struct task_struct *p)
+{
+	set_bit(PFA_NO_NEW_PRIVS, &p->atomic_flags);
+}
+
 /*
  * task->jobctl flags
  */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index f0652578af75..d2596136b0d1 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -241,7 +241,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
 	 * This avoids scenarios where unprivileged tasks can affect the
 	 * behavior of privileged children.
 	 */
-	if (!current->no_new_privs &&
+	if (!task_no_new_privs(current) &&
 	    security_capable_noaudit(current_cred(), current_user_ns(),
 				     CAP_SYS_ADMIN) != 0)
 		return -EACCES;
diff --git a/kernel/sys.c b/kernel/sys.c
index 66a751ebf9d9..ce8129192a26 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1990,12 +1990,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		if (arg2 != 1 || arg3 || arg4 || arg5)
 			return -EINVAL;
 
-		current->no_new_privs = 1;
+		task_set_no_new_privs(current);
 		break;
 	case PR_GET_NO_NEW_PRIVS:
 		if (arg2 || arg3 || arg4 || arg5)
 			return -EINVAL;
-		return current->no_new_privs ? 1 : 0;
+		return task_no_new_privs(current) ? 1 : 0;
 	case PR_GET_THP_DISABLE:
 		if (arg2 || arg3 || arg4 || arg5)
 			return -EINVAL;
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index 452567d3a08e..d97cba3e3849 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -621,7 +621,7 @@ int aa_change_hat(const char *hats[], int count, u64 token, bool permtest)
 	 * There is no exception for unconfined as change_hat is not
 	 * available.
 	 */
-	if (current->no_new_privs)
+	if (task_no_new_privs(current))
 		return -EPERM;
 
 	/* released below */
@@ -776,7 +776,7 @@ int aa_change_profile(const char *ns_name, const char *hname, bool onexec,
 	 * no_new_privs is set because this aways results in a reduction
 	 * of permissions.
 	 */
-	if (current->no_new_privs && !unconfined(profile)) {
+	if (task_no_new_privs(current) && !unconfined(profile)) {
 		put_cred(cred);
 		return -EPERM;
 	}
-- 
cgit v1.2.3


From c8bee430dc52cfca6c1aab27752a89275d78d50f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 27 Jun 2014 15:16:33 -0700
Subject: seccomp: split filter prep from check and apply

In preparation for adding seccomp locking, move filter creation away
from where it is checked and applied. This will allow for locking where
no memory allocation is happening. The validation, filter attachment,
and seccomp mode setting can all happen under the future locks.

For extreme defensiveness, I've added a BUG_ON check for the calculated
size of the buffer allocation in case BPF_MAXINSN ever changes, which
shouldn't ever happen. The compiler should actually optimize out this
check since the test above it makes it impossible.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
---
 kernel/seccomp.c | 97 ++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 67 insertions(+), 30 deletions(-)

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index d2596136b0d1..58125160417c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -18,6 +18,7 @@
 #include <linux/compat.h>
 #include <linux/sched.h>
 #include <linux/seccomp.h>
+#include <linux/slab.h>
 #include <linux/syscalls.h>
 
 /* #define SECCOMP_DEBUG 1 */
@@ -27,7 +28,6 @@
 #include <linux/filter.h>
 #include <linux/ptrace.h>
 #include <linux/security.h>
-#include <linux/slab.h>
 #include <linux/tracehook.h>
 #include <linux/uaccess.h>
 
@@ -213,27 +213,23 @@ static inline void seccomp_assign_mode(unsigned long seccomp_mode)
 
 #ifdef CONFIG_SECCOMP_FILTER
 /**
- * seccomp_attach_filter: Attaches a seccomp filter to current.
+ * seccomp_prepare_filter: Prepares a seccomp filter for use.
  * @fprog: BPF program to install
  *
- * Returns 0 on success or an errno on failure.
+ * Returns filter on success or an ERR_PTR on failure.
  */
-static long seccomp_attach_filter(struct sock_fprog *fprog)
+static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 {
 	struct seccomp_filter *filter;
-	unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
-	unsigned long total_insns = fprog->len;
+	unsigned long fp_size;
 	struct sock_filter *fp;
 	int new_len;
 	long ret;
 
 	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
-		return -EINVAL;
-
-	for (filter = current->seccomp.filter; filter; filter = filter->prev)
-		total_insns += filter->prog->len + 4;  /* include a 4 instr penalty */
-	if (total_insns > MAX_INSNS_PER_PATH)
-		return -ENOMEM;
+		return ERR_PTR(-EINVAL);
+	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
+	fp_size = fprog->len * sizeof(struct sock_filter);
 
 	/*
 	 * Installing a seccomp filter requires that the task has
@@ -244,11 +240,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
 	if (!task_no_new_privs(current) &&
 	    security_capable_noaudit(current_cred(), current_user_ns(),
 				     CAP_SYS_ADMIN) != 0)
-		return -EACCES;
+		return ERR_PTR(-EACCES);
 
 	fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
 	if (!fp)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	/* Copy the instructions from fprog. */
 	ret = -EFAULT;
@@ -292,13 +288,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
 
 	sk_filter_select_runtime(filter->prog);
 
-	/*
-	 * If there is an existing filter, make it the prev and don't drop its
-	 * task reference.
-	 */
-	filter->prev = current->seccomp.filter;
-	current->seccomp.filter = filter;
-	return 0;
+	return filter;
 
 free_filter_prog:
 	kfree(filter->prog);
@@ -306,19 +296,20 @@ free_filter:
 	kfree(filter);
 free_prog:
 	kfree(fp);
-	return ret;
+	return ERR_PTR(ret);
 }
 
 /**
- * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
+ * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
  * @user_filter: pointer to the user data containing a sock_fprog.
  *
  * Returns 0 on success and non-zero otherwise.
  */
-static long seccomp_attach_user_filter(const char __user *user_filter)
+static struct seccomp_filter *
+seccomp_prepare_user_filter(const char __user *user_filter)
 {
 	struct sock_fprog fprog;
-	long ret = -EFAULT;
+	struct seccomp_filter *filter = ERR_PTR(-EFAULT);
 
 #ifdef CONFIG_COMPAT
 	if (is_compat_task()) {
@@ -331,9 +322,39 @@ static long seccomp_attach_user_filter(const char __user *user_filter)
 #endif
 	if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
 		goto out;
-	ret = seccomp_attach_filter(&fprog);
+	filter = seccomp_prepare_filter(&fprog);
 out:
-	return ret;
+	return filter;
+}
+
+/**
+ * seccomp_attach_filter: validate and attach filter
+ * @flags:  flags to change filter behavior
+ * @filter: seccomp filter to add to the current process
+ *
+ * Returns 0 on success, -ve on error.
+ */
+static long seccomp_attach_filter(unsigned int flags,
+				  struct seccomp_filter *filter)
+{
+	unsigned long total_insns;
+	struct seccomp_filter *walker;
+
+	/* Validate resulting filter length. */
+	total_insns = filter->prog->len;
+	for (walker = current->seccomp.filter; walker; walker = walker->prev)
+		total_insns += walker->prog->len + 4;  /* 4 instr penalty */
+	if (total_insns > MAX_INSNS_PER_PATH)
+		return -ENOMEM;
+
+	/*
+	 * If there is an existing filter, make it the prev and don't drop its
+	 * task reference.
+	 */
+	filter->prev = current->seccomp.filter;
+	current->seccomp.filter = filter;
+
+	return 0;
 }
 
 /* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -346,6 +367,14 @@ void get_seccomp_filter(struct task_struct *tsk)
 	atomic_inc(&orig->usage);
 }
 
+static inline void seccomp_filter_free(struct seccomp_filter *filter)
+{
+	if (filter) {
+		sk_filter_free(filter->prog);
+		kfree(filter);
+	}
+}
+
 /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
 void put_seccomp_filter(struct task_struct *tsk)
 {
@@ -354,8 +383,7 @@ void put_seccomp_filter(struct task_struct *tsk)
 	while (orig && atomic_dec_and_test(&orig->usage)) {
 		struct seccomp_filter *freeme = orig;
 		orig = orig->prev;
-		sk_filter_free(freeme->prog);
-		kfree(freeme);
+		seccomp_filter_free(freeme);
 	}
 }
 
@@ -533,21 +561,30 @@ static long seccomp_set_mode_filter(unsigned int flags,
 				    const char __user *filter)
 {
 	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
+	struct seccomp_filter *prepared = NULL;
 	long ret = -EINVAL;
 
 	/* Validate flags. */
 	if (flags != 0)
 		goto out;
 
+	/* Prepare the new filter before holding any locks. */
+	prepared = seccomp_prepare_user_filter(filter);
+	if (IS_ERR(prepared))
+		return PTR_ERR(prepared);
+
 	if (!seccomp_may_assign_mode(seccomp_mode))
 		goto out;
 
-	ret = seccomp_attach_user_filter(filter);
+	ret = seccomp_attach_filter(flags, prepared);
 	if (ret)
 		goto out;
+	/* Do not free the successfully attached filter. */
+	prepared = NULL;
 
 	seccomp_assign_mode(seccomp_mode);
 out:
+	seccomp_filter_free(prepared);
 	return ret;
 }
 #else
-- 
cgit v1.2.3


From dbd952127d11bb44a4ea30b08cc60531b6a23d71 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 27 Jun 2014 15:18:48 -0700
Subject: seccomp: introduce writer locking

Normally, task_struct.seccomp.filter is only ever read or modified by
the task that owns it (current). This property aids in fast access
during system call filtering as read access is lockless.

Updating the pointer from another task, however, opens up race
conditions. To allow cross-thread filter pointer updates, writes to the
seccomp fields are now protected by the sighand spinlock (which is shared
by all threads in the thread group). Read access remains lockless because
pointer updates themselves are atomic.  However, writes (or cloning)
often entail additional checking (like maximum instruction counts)
which require locking to perform safely.

In the case of cloning threads, the child is invisible to the system
until it enters the task list. To make sure a child can't be cloned from
a thread and left in a prior state, seccomp duplication is additionally
moved under the sighand lock. Then parent and child are certain have
the same seccomp state when they exit the lock.

Based on patches by Will Drewry and David Drysdale.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
---
 include/linux/seccomp.h |  6 +++---
 kernel/fork.c           | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/seccomp.c        | 16 +++++++++++++++-
 3 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 4054b0994071..9ff98b4bfe2e 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -14,11 +14,11 @@ struct seccomp_filter;
  *
  * @mode:  indicates one of the valid values above for controlled
  *         system calls available to a process.
- * @filter: The metadata and ruleset for determining what system calls
- *          are allowed for a task.
+ * @filter: must always point to a valid seccomp-filter or NULL as it is
+ *          accessed without locking during system call entry.
  *
  *          @filter must only be accessed from the context of current as there
- *          is no locking.
+ *          is no read locking.
  */
 struct seccomp {
 	int mode;
diff --git a/kernel/fork.c b/kernel/fork.c
index 6a13c46cd87d..ed4bc339c9dc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -315,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 		goto free_ti;
 
 	tsk->stack = ti;
+#ifdef CONFIG_SECCOMP
+	/*
+	 * We must handle setting up seccomp filters once we're under
+	 * the sighand lock in case orig has changed between now and
+	 * then. Until then, filter must be NULL to avoid messing up
+	 * the usage counts on the error path calling free_task.
+	 */
+	tsk->seccomp.filter = NULL;
+#endif
 
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
@@ -1081,6 +1090,39 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	return 0;
 }
 
+static void copy_seccomp(struct task_struct *p)
+{
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Must be called with sighand->lock held, which is common to
+	 * all threads in the group. Holding cred_guard_mutex is not
+	 * needed because this new task is not yet running and cannot
+	 * be racing exec.
+	 */
+	BUG_ON(!spin_is_locked(&current->sighand->siglock));
+
+	/* Ref-count the new filter user, and assign it. */
+	get_seccomp_filter(current);
+	p->seccomp = current->seccomp;
+
+	/*
+	 * Explicitly enable no_new_privs here in case it got set
+	 * between the task_struct being duplicated and holding the
+	 * sighand lock. The seccomp state and nnp must be in sync.
+	 */
+	if (task_no_new_privs(current))
+		task_set_no_new_privs(p);
+
+	/*
+	 * If the parent gained a seccomp mode after copying thread
+	 * flags and between before we held the sighand lock, we have
+	 * to manually enable the seccomp thread flag here.
+	 */
+	if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
+		set_tsk_thread_flag(p, TIF_SECCOMP);
+#endif
+}
+
 SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
 {
 	current->clear_child_tid = tidptr;
@@ -1196,7 +1238,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto fork_out;
 
 	ftrace_graph_init_task(p);
-	get_seccomp_filter(p);
 
 	rt_mutex_init_task(p);
 
@@ -1436,6 +1477,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	spin_lock(&current->sighand->siglock);
 
+	/*
+	 * Copy seccomp details explicitly here, in case they were changed
+	 * before holding sighand lock.
+	 */
+	copy_seccomp(p);
+
 	/*
 	 * Process group and session signals need to be delivered to just the
 	 * parent before the fork or both the parent and the child after the
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 58125160417c..d5543e787e4e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -199,6 +199,8 @@ static u32 seccomp_run_filters(int syscall)
 
 static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
 {
+	BUG_ON(!spin_is_locked(&current->sighand->siglock));
+
 	if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
 		return false;
 
@@ -207,6 +209,8 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
 
 static inline void seccomp_assign_mode(unsigned long seccomp_mode)
 {
+	BUG_ON(!spin_is_locked(&current->sighand->siglock));
+
 	current->seccomp.mode = seccomp_mode;
 	set_tsk_thread_flag(current, TIF_SECCOMP);
 }
@@ -332,6 +336,8 @@ out:
  * @flags:  flags to change filter behavior
  * @filter: seccomp filter to add to the current process
  *
+ * Caller must be holding current->sighand->siglock lock.
+ *
  * Returns 0 on success, -ve on error.
  */
 static long seccomp_attach_filter(unsigned int flags,
@@ -340,6 +346,8 @@ static long seccomp_attach_filter(unsigned int flags,
 	unsigned long total_insns;
 	struct seccomp_filter *walker;
 
+	BUG_ON(!spin_is_locked(&current->sighand->siglock));
+
 	/* Validate resulting filter length. */
 	total_insns = filter->prog->len;
 	for (walker = current->seccomp.filter; walker; walker = walker->prev)
@@ -529,6 +537,8 @@ static long seccomp_set_mode_strict(void)
 	const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
 	long ret = -EINVAL;
 
+	spin_lock_irq(&current->sighand->siglock);
+
 	if (!seccomp_may_assign_mode(seccomp_mode))
 		goto out;
 
@@ -539,6 +549,7 @@ static long seccomp_set_mode_strict(void)
 	ret = 0;
 
 out:
+	spin_unlock_irq(&current->sighand->siglock);
 
 	return ret;
 }
@@ -566,13 +577,15 @@ static long seccomp_set_mode_filter(unsigned int flags,
 
 	/* Validate flags. */
 	if (flags != 0)
-		goto out;
+		return -EINVAL;
 
 	/* Prepare the new filter before holding any locks. */
 	prepared = seccomp_prepare_user_filter(filter);
 	if (IS_ERR(prepared))
 		return PTR_ERR(prepared);
 
+	spin_lock_irq(&current->sighand->siglock);
+
 	if (!seccomp_may_assign_mode(seccomp_mode))
 		goto out;
 
@@ -584,6 +597,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
 
 	seccomp_assign_mode(seccomp_mode);
 out:
+	spin_unlock_irq(&current->sighand->siglock);
 	seccomp_filter_free(prepared);
 	return ret;
 }
-- 
cgit v1.2.3


From 3ba2530cc06eb4aee4f1f754f43d781e8a12ee09 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 27 Jun 2014 15:01:35 -0700
Subject: seccomp: allow mode setting across threads

This changes the mode setting helper to allow threads to change the
seccomp mode from another thread. We must maintain barriers to keep
TIF_SECCOMP synchronized with the rest of the seccomp state.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
---
 kernel/seccomp.c | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index d5543e787e4e..9065d2c79c56 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -173,21 +173,24 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
  */
 static u32 seccomp_run_filters(int syscall)
 {
-	struct seccomp_filter *f;
+	struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
 	struct seccomp_data sd;
 	u32 ret = SECCOMP_RET_ALLOW;
 
 	/* Ensure unexpected behavior doesn't result in failing open. */
-	if (WARN_ON(current->seccomp.filter == NULL))
+	if (unlikely(WARN_ON(f == NULL)))
 		return SECCOMP_RET_KILL;
 
+	/* Make sure cross-thread synced filter points somewhere sane. */
+	smp_read_barrier_depends();
+
 	populate_seccomp_data(&sd);
 
 	/*
 	 * All filters in the list are evaluated and the lowest BPF return
 	 * value always takes priority (ignoring the DATA).
 	 */
-	for (f = current->seccomp.filter; f; f = f->prev) {
+	for (; f; f = f->prev) {
 		u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd);
 
 		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
@@ -207,12 +210,18 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
 	return true;
 }
 
-static inline void seccomp_assign_mode(unsigned long seccomp_mode)
+static inline void seccomp_assign_mode(struct task_struct *task,
+				       unsigned long seccomp_mode)
 {
-	BUG_ON(!spin_is_locked(&current->sighand->siglock));
+	BUG_ON(!spin_is_locked(&task->sighand->siglock));
 
-	current->seccomp.mode = seccomp_mode;
-	set_tsk_thread_flag(current, TIF_SECCOMP);
+	task->seccomp.mode = seccomp_mode;
+	/*
+	 * Make sure TIF_SECCOMP cannot be set before the mode (and
+	 * filter) is set.
+	 */
+	smp_mb__before_atomic();
+	set_tsk_thread_flag(task, TIF_SECCOMP);
 }
 
 #ifdef CONFIG_SECCOMP_FILTER
@@ -435,12 +444,17 @@ static int mode1_syscalls_32[] = {
 
 int __secure_computing(int this_syscall)
 {
-	int mode = current->seccomp.mode;
 	int exit_sig = 0;
 	int *syscall;
 	u32 ret;
 
-	switch (mode) {
+	/*
+	 * Make sure that any changes to mode from another thread have
+	 * been seen after TIF_SECCOMP was seen.
+	 */
+	rmb();
+
+	switch (current->seccomp.mode) {
 	case SECCOMP_MODE_STRICT:
 		syscall = mode1_syscalls;
 #ifdef CONFIG_COMPAT
@@ -545,7 +559,7 @@ static long seccomp_set_mode_strict(void)
 #ifdef TIF_NOTSC
 	disable_TSC();
 #endif
-	seccomp_assign_mode(seccomp_mode);
+	seccomp_assign_mode(current, seccomp_mode);
 	ret = 0;
 
 out:
@@ -595,7 +609,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
 	/* Do not free the successfully attached filter. */
 	prepared = NULL;
 
-	seccomp_assign_mode(seccomp_mode);
+	seccomp_assign_mode(current, seccomp_mode);
 out:
 	spin_unlock_irq(&current->sighand->siglock);
 	seccomp_filter_free(prepared);
-- 
cgit v1.2.3


From c2e1f2e30daa551db3c670c0ccfeab20a540b9e1 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 5 Jun 2014 00:23:17 -0700
Subject: seccomp: implement SECCOMP_FILTER_FLAG_TSYNC

Applying restrictive seccomp filter programs to large or diverse
codebases often requires handling threads which may be started early in
the process lifetime (e.g., by code that is linked in). While it is
possible to apply permissive programs prior to process start up, it is
difficult to further restrict the kernel ABI to those threads after that
point.

This change adds a new seccomp syscall flag to SECCOMP_SET_MODE_FILTER for
synchronizing thread group seccomp filters at filter installation time.

When calling seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
filter) an attempt will be made to synchronize all threads in current's
threadgroup to its new seccomp filter program. This is possible iff all
threads are using a filter that is an ancestor to the filter current is
attempting to synchronize to. NULL filters (where the task is running as
SECCOMP_MODE_NONE) are also treated as ancestors allowing threads to be
transitioned into SECCOMP_MODE_FILTER. If prctrl(PR_SET_NO_NEW_PRIVS,
...) has been set on the calling thread, no_new_privs will be set for
all synchronized threads too. On success, 0 is returned. On failure,
the pid of one of the failing threads will be returned and no filters
will have been applied.

The race conditions against another thread are:
- requesting TSYNC (already handled by sighand lock)
- performing a clone (already handled by sighand lock)
- changing its filter (already handled by sighand lock)
- calling exec (handled by cred_guard_mutex)
The clone case is assisted by the fact that new threads will have their
seccomp state duplicated from their parent before appearing on the tasklist.

Holding cred_guard_mutex means that seccomp filters cannot be assigned
while in the middle of another thread's exec (potentially bypassing
no_new_privs or similar). The call to de_thread() may kill threads waiting
for the mutex.

Changes across threads to the filter pointer includes a barrier.

Based on patches by Will Drewry.

Suggested-by: Julien Tinnes <jln@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Andy Lutomirski <luto@amacapital.net>
---
 fs/exec.c                    |   2 +-
 include/linux/seccomp.h      |   2 +
 include/uapi/linux/seccomp.h |   3 +
 kernel/seccomp.c             | 135 ++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 140 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 0f5c272410f6..ab1f1200ce5d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1216,7 +1216,7 @@ EXPORT_SYMBOL(install_exec_creds);
 /*
  * determine how safe it is to execute the proposed program
  * - the caller must hold ->cred_guard_mutex to protect against
- *   PTRACE_ATTACH
+ *   PTRACE_ATTACH or seccomp thread-sync
  */
 static void check_unsafe_exec(struct linux_binprm *bprm)
 {
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 9ff98b4bfe2e..5d586a45a319 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -3,6 +3,8 @@
 
 #include <uapi/linux/seccomp.h>
 
+#define SECCOMP_FILTER_FLAG_MASK	(SECCOMP_FILTER_FLAG_TSYNC)
+
 #ifdef CONFIG_SECCOMP
 
 #include <linux/thread_info.h>
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index b258878ba754..0f238a43ff1e 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -14,6 +14,9 @@
 #define SECCOMP_SET_MODE_STRICT	0
 #define SECCOMP_SET_MODE_FILTER	1
 
+/* Valid flags for SECCOMP_SET_MODE_FILTER */
+#define SECCOMP_FILTER_FLAG_TSYNC	1
+
 /*
  * All BPF programs must return a 32-bit value.
  * The bottom 16-bits are for optional return data.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 9065d2c79c56..74f460179171 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -26,6 +26,7 @@
 #ifdef CONFIG_SECCOMP_FILTER
 #include <asm/syscall.h>
 #include <linux/filter.h>
+#include <linux/pid.h>
 #include <linux/ptrace.h>
 #include <linux/security.h>
 #include <linux/tracehook.h>
@@ -225,6 +226,114 @@ static inline void seccomp_assign_mode(struct task_struct *task,
 }
 
 #ifdef CONFIG_SECCOMP_FILTER
+/* Returns 1 if the parent is an ancestor of the child. */
+static int is_ancestor(struct seccomp_filter *parent,
+		       struct seccomp_filter *child)
+{
+	/* NULL is the root ancestor. */
+	if (parent == NULL)
+		return 1;
+	for (; child; child = child->prev)
+		if (child == parent)
+			return 1;
+	return 0;
+}
+
+/**
+ * seccomp_can_sync_threads: checks if all threads can be synchronized
+ *
+ * Expects sighand and cred_guard_mutex locks to be held.
+ *
+ * Returns 0 on success, -ve on error, or the pid of a thread which was
+ * either not in the correct seccomp mode or it did not have an ancestral
+ * seccomp filter.
+ */
+static inline pid_t seccomp_can_sync_threads(void)
+{
+	struct task_struct *thread, *caller;
+
+	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
+	BUG_ON(!spin_is_locked(&current->sighand->siglock));
+
+	/* Validate all threads being eligible for synchronization. */
+	caller = current;
+	for_each_thread(caller, thread) {
+		pid_t failed;
+
+		/* Skip current, since it is initiating the sync. */
+		if (thread == caller)
+			continue;
+
+		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
+		    (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
+		     is_ancestor(thread->seccomp.filter,
+				 caller->seccomp.filter)))
+			continue;
+
+		/* Return the first thread that cannot be synchronized. */
+		failed = task_pid_vnr(thread);
+		/* If the pid cannot be resolved, then return -ESRCH */
+		if (unlikely(WARN_ON(failed == 0)))
+			failed = -ESRCH;
+		return failed;
+	}
+
+	return 0;
+}
+
+/**
+ * seccomp_sync_threads: sets all threads to use current's filter
+ *
+ * Expects sighand and cred_guard_mutex locks to be held, and for
+ * seccomp_can_sync_threads() to have returned success already
+ * without dropping the locks.
+ *
+ */
+static inline void seccomp_sync_threads(void)
+{
+	struct task_struct *thread, *caller;
+
+	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
+	BUG_ON(!spin_is_locked(&current->sighand->siglock));
+
+	/* Synchronize all threads. */
+	caller = current;
+	for_each_thread(caller, thread) {
+		/* Skip current, since it needs no changes. */
+		if (thread == caller)
+			continue;
+
+		/* Get a task reference for the new leaf node. */
+		get_seccomp_filter(caller);
+		/*
+		 * Drop the task reference to the shared ancestor since
+		 * current's path will hold a reference.  (This also
+		 * allows a put before the assignment.)
+		 */
+		put_seccomp_filter(thread);
+		smp_store_release(&thread->seccomp.filter,
+				  caller->seccomp.filter);
+		/*
+		 * Opt the other thread into seccomp if needed.
+		 * As threads are considered to be trust-realm
+		 * equivalent (see ptrace_may_access), it is safe to
+		 * allow one thread to transition the other.
+		 */
+		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
+			/*
+			 * Don't let an unprivileged task work around
+			 * the no_new_privs restriction by creating
+			 * a thread that sets it up, enters seccomp,
+			 * then dies.
+			 */
+			if (task_no_new_privs(caller))
+				task_set_no_new_privs(thread);
+
+			seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
+		}
+	}
+}
+
 /**
  * seccomp_prepare_filter: Prepares a seccomp filter for use.
  * @fprog: BPF program to install
@@ -364,6 +473,15 @@ static long seccomp_attach_filter(unsigned int flags,
 	if (total_insns > MAX_INSNS_PER_PATH)
 		return -ENOMEM;
 
+	/* If thread sync has been requested, check that it is possible. */
+	if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
+		int ret;
+
+		ret = seccomp_can_sync_threads();
+		if (ret)
+			return ret;
+	}
+
 	/*
 	 * If there is an existing filter, make it the prev and don't drop its
 	 * task reference.
@@ -371,6 +489,10 @@ static long seccomp_attach_filter(unsigned int flags,
 	filter->prev = current->seccomp.filter;
 	current->seccomp.filter = filter;
 
+	/* Now that the new filter is in place, synchronize to all threads. */
+	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+		seccomp_sync_threads();
+
 	return 0;
 }
 
@@ -590,7 +712,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
 	long ret = -EINVAL;
 
 	/* Validate flags. */
-	if (flags != 0)
+	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
 		return -EINVAL;
 
 	/* Prepare the new filter before holding any locks. */
@@ -598,6 +720,14 @@ static long seccomp_set_mode_filter(unsigned int flags,
 	if (IS_ERR(prepared))
 		return PTR_ERR(prepared);
 
+	/*
+	 * Make sure we cannot change seccomp or nnp state via TSYNC
+	 * while another thread is in the middle of calling exec.
+	 */
+	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
+	    mutex_lock_killable(&current->signal->cred_guard_mutex))
+		goto out_free;
+
 	spin_lock_irq(&current->sighand->siglock);
 
 	if (!seccomp_may_assign_mode(seccomp_mode))
@@ -612,6 +742,9 @@ static long seccomp_set_mode_filter(unsigned int flags,
 	seccomp_assign_mode(current, seccomp_mode);
 out:
 	spin_unlock_irq(&current->sighand->siglock);
+	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+		mutex_unlock(&current->signal->cred_guard_mutex);
+out_free:
 	seccomp_filter_free(prepared);
 	return ret;
 }
-- 
cgit v1.2.3