diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 609d53f7a91..3c72a05dc32 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -50,6 +50,12 @@ #define STATE_PENDING 1 #define STATE_READY 2 +struct posix_msg_tree_node { + struct rb_node rb_node; + struct list_head msg_list; + int priority; +}; + struct ext_wait_queue { /* queue of sleeping tasks */ struct task_struct *task; struct list_head list; @@ -62,7 +68,7 @@ struct mqueue_inode_info { struct inode vfs_inode; wait_queue_head_t wait_q; - struct msg_msg **messages; + struct rb_root msg_tree; struct mq_attr attr; struct sigevent notify; @@ -110,6 +116,90 @@ static struct ipc_namespace *get_ns_from_inode(struct inode *inode) return ns; } +/* Auxiliary functions to manipulate messages' list */ +static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) +{ + struct rb_node **p, *parent = NULL; + struct posix_msg_tree_node *leaf; + + p = &info->msg_tree.rb_node; + while (*p) { + parent = *p; + leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node); + + if (likely(leaf->priority == msg->m_type)) + goto insert_msg; + else if (msg->m_type < leaf->priority) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + leaf = kzalloc(sizeof(*leaf), GFP_ATOMIC); + if (!leaf) + return -ENOMEM; + rb_init_node(&leaf->rb_node); + INIT_LIST_HEAD(&leaf->msg_list); + leaf->priority = msg->m_type; + rb_link_node(&leaf->rb_node, parent, p); + rb_insert_color(&leaf->rb_node, &info->msg_tree); + info->qsize += sizeof(struct posix_msg_tree_node); +insert_msg: + info->attr.mq_curmsgs++; + info->qsize += msg->m_ts; + list_add_tail(&msg->m_list, &leaf->msg_list); + return 0; +} + +static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) +{ + struct rb_node **p, *parent = NULL; + struct posix_msg_tree_node *leaf; + struct msg_msg *msg; + +try_again: + p = &info->msg_tree.rb_node; + while (*p) { + parent = *p; + /* + * During insert, low priorities go to the left and high to the + * right. On receive, we want the highest priorities first, so + * walk all the way to the right. + */ + p = &(*p)->rb_right; + } + if (!parent) { + if (info->attr.mq_curmsgs) { + pr_warn_once("Inconsistency in POSIX message queue, " + "no tree element, but supposedly messages " + "should exist!\n"); + info->attr.mq_curmsgs = 0; + } + return NULL; + } + leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node); + if (list_empty(&leaf->msg_list)) { + pr_warn_once("Inconsistency in POSIX message queue, " + "empty leaf node but we haven't implemented " + "lazy leaf delete!\n"); + rb_erase(&leaf->rb_node, &info->msg_tree); + info->qsize -= sizeof(struct posix_msg_tree_node); + kfree(leaf); + goto try_again; + } else { + msg = list_first_entry(&leaf->msg_list, + struct msg_msg, m_list); + list_del(&msg->m_list); + if (list_empty(&leaf->msg_list)) { + rb_erase(&leaf->rb_node, &info->msg_tree); + info->qsize -= sizeof(struct posix_msg_tree_node); + kfree(leaf); + } + } + info->attr.mq_curmsgs--; + info->qsize -= msg->m_ts; + return msg; +} + static struct inode *mqueue_get_inode(struct super_block *sb, struct ipc_namespace *ipc_ns, umode_t mode, struct mq_attr *attr) @@ -130,7 +220,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, if (S_ISREG(mode)) { struct mqueue_inode_info *info; - unsigned long mq_bytes, mq_msg_tblsz; + unsigned long mq_bytes, mq_treesize; inode->i_fop = &mqueue_file_operations; inode->i_size = FILENT_SIZE; @@ -144,6 +234,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, info->notify_user_ns = NULL; info->qsize = 0; info->user = NULL; /* set when all is ok */ + info->msg_tree = RB_ROOT; memset(&info->attr, 0, sizeof(info->attr)); info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max, ipc_ns->mq_msg_default); @@ -153,16 +244,25 @@ static struct inode *mqueue_get_inode(struct super_block *sb, info->attr.mq_maxmsg = attr->mq_maxmsg; info->attr.mq_msgsize = attr->mq_msgsize; } - mq_msg_tblsz = info->attr.mq_maxmsg * sizeof(struct msg_msg *); - if (mq_msg_tblsz > PAGE_SIZE) - info->messages = vmalloc(mq_msg_tblsz); - else - info->messages = kmalloc(mq_msg_tblsz, GFP_KERNEL); - if (!info->messages) - goto out_inode; + /* + * We used to allocate a static array of pointers and account + * the size of that array as well as one msg_msg struct per + * possible message into the queue size. That's no longer + * accurate as the queue is now an rbtree and will grow and + * shrink depending on usage patterns. We can, however, still + * account one msg_msg struct per message, but the nodes are + * allocated depending on priority usage, and most programs + * only use one, or a handful, of priorities. However, since + * this is pinned memory, we need to assume worst case, so + * that means the min(mq_maxmsg, max_priorities) * struct + * posix_msg_tree_node. + */ + mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) + + min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) * + sizeof(struct posix_msg_tree_node); - mq_bytes = (mq_msg_tblsz + - (info->attr.mq_maxmsg * info->attr.mq_msgsize)); + mq_bytes = mq_treesize + (info->attr.mq_maxmsg * + info->attr.mq_msgsize); spin_lock(&mq_lock); if (u->mq_bytes + mq_bytes < u->mq_bytes || @@ -253,9 +353,9 @@ static void mqueue_evict_inode(struct inode *inode) { struct mqueue_inode_info *info; struct user_struct *user; - unsigned long mq_bytes; - int i; + unsigned long mq_bytes, mq_treesize; struct ipc_namespace *ipc_ns; + struct msg_msg *msg; clear_inode(inode); @@ -265,17 +365,18 @@ static void mqueue_evict_inode(struct inode *inode) ipc_ns = get_ns_from_inode(inode); info = MQUEUE_I(inode); spin_lock(&info->lock); - for (i = 0; i < info->attr.mq_curmsgs; i++) - free_msg(info->messages[i]); - if (is_vmalloc_addr(info->messages)) - vfree(info->messages); - else - kfree(info->messages); + while ((msg = msg_get(info)) != NULL) + free_msg(msg); spin_unlock(&info->lock); /* Total amount of bytes accounted for the mqueue */ - mq_bytes = info->attr.mq_maxmsg * (sizeof(struct msg_msg *) - + info->attr.mq_msgsize); + mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) + + min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) * + sizeof(struct posix_msg_tree_node); + + mq_bytes = mq_treesize + (info->attr.mq_maxmsg * + info->attr.mq_msgsize); + user = info->user; if (user) { spin_lock(&mq_lock); @@ -495,26 +596,6 @@ static struct ext_wait_queue *wq_get_first_waiter( return list_entry(ptr, struct ext_wait_queue, list); } -/* Auxiliary functions to manipulate messages' list */ -static void msg_insert(struct msg_msg *ptr, struct mqueue_inode_info *info) -{ - int k; - - k = info->attr.mq_curmsgs - 1; - while (k >= 0 && info->messages[k]->m_type >= ptr->m_type) { - info->messages[k + 1] = info->messages[k]; - k--; - } - info->attr.mq_curmsgs++; - info->qsize += ptr->m_ts; - info->messages[k + 1] = ptr; -} - -static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) -{ - info->qsize -= info->messages[--info->attr.mq_curmsgs]->m_ts; - return info->messages[info->attr.mq_curmsgs]; -} static inline void set_cookie(struct sk_buff *skb, char code) { @@ -848,7 +929,8 @@ static inline void pipelined_receive(struct mqueue_inode_info *info) wake_up_interruptible(&info->wait_q); return; } - msg_insert(sender->msg, info); + if (msg_insert(sender->msg, info)) + return; list_del(&sender->list); sender->state = STATE_PENDING; wake_up_process(sender->task); @@ -936,7 +1018,12 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, pipelined_send(info, msg_ptr, receiver); } else { /* adds message to the queue */ - msg_insert(msg_ptr, info); + if (msg_insert(msg_ptr, info)) { + free_msg(msg_ptr); + ret = -ENOMEM; + spin_unlock(&info->lock); + goto out_fput; + } __do_notify(info); } inode->i_atime = inode->i_mtime = inode->i_ctime =