--- linux-2.6.9.orig/cluster/cman/Makefile	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/Makefile	2006-12-20 17:05:54.000000000 +0300
@@ -0,0 +1,6 @@
+cman-objs := cnxman.o config.o membership.o proc.o \
+	     sm_barrier.o sm_control.o sm_daemon.o sm_joinleave.o \
+	     sm_membership.o sm_message.o sm_misc.o sm_recover.o sm_services.o \
+	     sm_user.o
+
+obj-$(CONFIG_CLUSTER) := cman.o
--- linux-2.6.9.orig/cluster/cman/cnxman-private.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/cnxman-private.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,442 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CNXMAN_PRIVATE_H
+#define __CNXMAN_PRIVATE_H
+
+/* Protocol Version triplet */
+#define CNXMAN_MAJOR_VERSION 5
+#define CNXMAN_MINOR_VERSION 0
+#define CNXMAN_PATCH_VERSION 1
+
+#define CAP_CLUSTER CAP_SYS_ADMIN	/* Capability needed to manage the
+					 * cluster */
+#ifdef __KERNEL__
+
+/* How we announce ourself in console events */
+#define CMAN_NAME "CMAN"
+
+/* One of these per AF_CLUSTER socket; cluster_sk() converts a sock to this */
+struct cluster_sock {
+	/* WARNING: sk has to be the first member (cluster_sk() just casts) */
+	struct sock sk;
+
+	unsigned char port;	/* Bound port or zero */
+	int (*kernel_callback) (char *, int, char *, int, unsigned int);
+	void *service_data;
+};
+
+#define cluster_sk(__sk) ((struct cluster_sock *)(__sk)) /* sk is 1st member */
+
+/* We have one of these for each socket we use for communications */
+struct cl_comms_socket {
+	struct socket *sock;	/* The socket receive_message() reads from */
+	int broadcast;		/* This is a broadcast socket */
+	int recv_only;		/* This is the unicast receive end of a
+				 * multicast socket */
+	struct sockaddr_in6 saddr; /* Socket address, contains the sockaddr for
+				 * the remote end(s) */
+	int addr_len;		/* Length of above */
+	int number;		/* Internal socket number, used to cycle around
+				 * sockets in case of network errors */
+	struct file *file;	/* file pointer for user-passed in sockets */
+
+	wait_queue_t wait;
+
+	struct cl_comms_socket *peer;
+
+	/* The socket list (linkage on socket_list) */
+	struct list_head list;
+
+	/* On here when it has something to say */
+	struct list_head active_list;
+	unsigned long active;	/* Bit 1 set: cnxman_data_ready() won't requeue */
+};
+
+/* A client socket. We keep a list of these so we can notify clients of cluster
+ * events */
+struct cl_client_socket {
+	struct socket    *sock;	/* The client's socket */
+	struct list_head  list;	/* Linkage in the list of client sockets */
+};
+
+/* This structure is tacked onto the start of a cluster message packet for our
+ * own nefarious purposes. */
+struct cl_protheader {
+	unsigned char  tgtport; /* Target port number */
+	unsigned char  srcport; /* Source (originating) port number */
+	unsigned short seq;	/* Packet sequence number, little-endian */
+	unsigned short ack;	/* Inline ACK, little-endian */
+	unsigned short cluster;	/* Our cluster number, little-endian */
+	unsigned int   flags;
+	int            srcid;	/* Node ID of the sender, little-endian */
+	int            tgtid;	/* Node ID of the target or 0 for multicast
+				 * messages */
+};
+
+/* A cluster internal protocol message - port number 0 */
+struct cl_protmsg {
+	struct cl_protheader header;
+	unsigned char cmd;	/* One of the CLUSTER_CMD_* values */
+};
+
+/* A Cluster ACK message (header.ack carries the acknowledged sequence) */
+struct cl_ackmsg {
+	struct cl_protheader header;
+	unsigned char  cmd;	/* Always CLUSTER_CMD_ACK */
+	unsigned char  remport;	/* Remote port number the original message was
+				 * for */
+	unsigned char  aflags;	/* ACK flags 0=OK, bit 0 set=No listener */
+	unsigned char  pad;
+};
+
+/* A Cluster LISTENREQ/LISTENRESP message; it follows the cl_protheader */
+struct cl_listenmsg {
+	unsigned char  cmd;	/* CLUSTER_CMD_LISTENRESP/REQ */
+	unsigned char  target_port;	/* Port to probe */
+	unsigned char  listening;	/* Always 0 for LISTENREQ */
+	unsigned char  pad;
+	unsigned short tag;	/* PID of remote waiting process */
+};
+
+/* A Cluster PORTCLOSED message; it follows the cl_protheader */
+struct cl_closemsg {
+	unsigned char cmd;	/* CLUSTER_CMD_PORTCLOSED */
+	unsigned char port;	/* Port handed on to post_close_oob() */
+};
+
+/* Structure of a newly dead node, passed from cnxman to kmembershipd */
+struct cl_new_dead_node {
+	struct list_head     list;	/* presumably on new_dead_node_list */
+	struct cluster_node *node;	/* The node being reported */
+};
+
+/* Subcommands for BARRIER message */
+#define BARRIER_REGISTER 1
+#define BARRIER_CHANGE   2
+#define BARRIER_WAIT     4
+#define BARRIER_COMPLETE 5
+
+/* A Cluster BARRIER message; it follows the cl_protheader */
+struct cl_barriermsg {
+	unsigned char  cmd;	/* CLUSTER_CMD_BARRIER */
+	unsigned char  subcmd;	/* BARRIER sub command (BARRIER_* values) */
+	unsigned short pad;
+	unsigned int   flags;
+	unsigned int   nodes;
+	char name[MAX_BARRIER_NAME_LEN];
+};
+
+/* Membership services messages, the cl_protheader is added transparently */
+struct cl_mem_hello_msg {
+	unsigned char  cmd;	    /* presumably CLUSTER_MEM_HELLO - confirm */
+	unsigned char  flags;	    /* HELLO_FLAG_* bits */
+	unsigned short members;	    /* Number of nodes in the cluster,
+				     * little-endian */
+	unsigned int   generation;  /* Current cluster generation number */
+};
+
+struct cl_mem_endtrans_msg {
+	unsigned char  cmd;	/* presumably CLUSTER_MEM_ENDTRANS - confirm */
+	unsigned char  pad1;
+	unsigned short pad2;
+	unsigned int   quorum;
+	unsigned int   total_votes;
+	unsigned int   generation;	/* Current cluster generation number */
+	unsigned int   new_node_id;	/* If reason is a new node joining */
+};
+
+/* ACK types for JOINACK message */
+#define JOINACK_TYPE_OK   1	/* You can join */
+#define JOINACK_TYPE_NAK  2	/* You can NOT join */
+#define JOINACK_TYPE_WAIT 3	/* Wait a bit longer - cluster is in transition
+				 * already */
+
+struct cl_mem_joinack_msg {
+	unsigned char cmd;	/* presumably CLUSTER_MEM_JOINACK - confirm */
+	unsigned char acktype;	/* One of the JOINACK_TYPE_* values */
+};
+
+/* This is used by JOINREQ message */
+struct cl_mem_join_msg {
+	unsigned char  cmd;
+	unsigned char  votes;
+	unsigned short num_addr;	/* Number of addresses for this node */
+	unsigned int   expected_votes;
+        unsigned int   nodeid;	        /* node ID we want */
+	unsigned int   major_version;	/* Not backwards compatible */
+	unsigned int   minor_version;	/* Backwards compatible */
+	unsigned int   patch_version;	/* Backwards/forwards compatible */
+	unsigned int   config_version;
+        unsigned int   addr_len;        /* length of node addresses */
+        char           clustername[16];
+	/* Variable-length tail, not declared as fields of this struct:
+	 * <num_addr> addresses of `address_length` bytes, NUL-ended node name */
+};
+
+/* State transition start reasons: */
+#define TRANS_NONE           0  /* No current transition */
+#define TRANS_NEWNODE        1	/* A new node is joining the cluster */
+#define TRANS_REMNODE        2	/* a node has left the cluster */
+#define TRANS_ANOTHERREMNODE 3	/* A node left the cluster while we were in
+				 * transition */
+#define TRANS_NEWMASTER      4	/* We have had an election and I am the new
+				 * master */
+#define TRANS_CHECK          5	/* A consistency check was called for */
+#define TRANS_RESTART        6	/* Transition restarted because of a previous
+				 * timeout */
+#define TRANS_DEADMASTER     7	/* The master died during transition and I have
+				 * taken over */
+
+/* This is used to start a state transition */
+struct cl_mem_starttrans_msg {
+	unsigned char  cmd;
+	unsigned char  reason;	/* Why a start transition was started - see
+				 * the TRANS_* values above */
+	unsigned char  flags;
+	unsigned char  votes;
+	unsigned int   expected_votes;
+	unsigned int   generation;	/* Incremented for each STARTTRANS sent
+					 */
+	int            nodeid;	/* Node to be removed */
+	unsigned short num_addrs;
+	/* If reason == TRANS_NEWNODE: Followed by <num_addrs> addresses of
+	 * `address_length` bytes and a NUL-terminated node name */
+};
+
+struct cl_mem_startack_msg {
+	unsigned char  cmd;	/* presumably CLUSTER_MEM_STARTACK - confirm */
+	unsigned char  reason;	/* presumably a TRANS_* value - confirm */
+	unsigned short pad;
+	unsigned int   generation;
+};
+
+/* Reconfigure a cluster parameter */
+struct cl_mem_reconfig_msg {
+	unsigned char  cmd;
+	unsigned char  param;	/* One of the RECONFIG_PARAM_* values */
+	unsigned short pad;
+	unsigned int   value;	/* presumably the new value of param - confirm */
+};
+
+/* Tell the cluster a node has died */
+struct cl_mem_nodedown_msg {
+	unsigned char  cmd;	/* presumably CLUSTER_MEM_NODEDOWN - confirm */
+	unsigned char  reason;
+	unsigned short pad;
+	unsigned int   nodeid;	/* presumably the node that died - confirm */
+};
+
+/* Structure containing information about an outstanding listen request */
+struct cl_waiting_listen_request {
+	wait_queue_head_t waitq;	/* Woken when the LISTENRESP arrives */
+	int               result;	/* 'listening' value from the LISTENRESP */
+	int               waiting;	/* Cleared when the LISTENRESP arrives */
+	unsigned short    tag;		/* Matched against the LISTENRESP tag */
+	int               nodeid;
+	struct list_head  list;		/* Linkage on listenreq_list */
+};
+
+/* Messages from membership services */
+#define CLUSTER_MEM_JOINCONF   1
+#define CLUSTER_MEM_JOINREQ    2
+#define CLUSTER_MEM_LEAVE      3
+#define CLUSTER_MEM_HELLO      4
+#define CLUSTER_MEM_KILL       5
+#define CLUSTER_MEM_JOINACK    6
+#define CLUSTER_MEM_ENDTRANS   7
+#define CLUSTER_MEM_RECONFIG   8
+#define CLUSTER_MEM_MASTERVIEW 9
+#define CLUSTER_MEM_STARTTRANS 10
+#define CLUSTER_MEM_JOINREJ    11
+#define CLUSTER_MEM_VIEWACK    12
+#define CLUSTER_MEM_STARTACK   13
+#define CLUSTER_MEM_TRANSITION 14
+#define CLUSTER_MEM_NEWCLUSTER 15
+#define CLUSTER_MEM_CONFACK    16
+#define CLUSTER_MEM_NOMINATE   17
+#define CLUSTER_MEM_NODEDOWN   18
+
+/* Flags in the HELLO message */
+#define HELLO_FLAG_MASTER       1
+#define HELLO_FLAG_QUORATE      2
+
+/* Parameters for RECONFIG command */
+#define RECONFIG_PARAM_EXPECTED_VOTES 1
+#define RECONFIG_PARAM_NODE_VOTES     2
+#define RECONFIG_PARAM_CONFIG_VERSION 3
+
+/* Data associated with an outgoing socket */
+struct cl_socket {
+	struct file *file;	/* The real file */
+	struct socket *socket;	/* The real sock */
+	int num_nodes;		/* On this link */
+	int retransmit_count;	/* NOTE(review): no user visible in this chunk */
+};
+
+/* There's one of these for each node in the cluster */
+struct cluster_node {
+	struct list_head list;	/* Linkage on cluster_members_list */
+	char *name;		/* Node/host name of node */
+	struct list_head addr_list;
+	int us;			/* This node is us */
+	unsigned int node_id;	/* Unique node ID */
+	nodestate_t state;	/* NODESTATE_* value, e.g. NODESTATE_DEAD */
+	unsigned short last_seq_recv;
+	unsigned short last_ackneeded_seq_recv;
+	unsigned short last_seq_acked;	/* Updated by process_ack() */
+	unsigned short last_seq_sent;	/* Differs from above => ACK missing */
+	unsigned int votes;
+	unsigned int expected_votes;
+	unsigned int leave_reason;	/* e.g. CLUSTER_LEAVEFLAG_NORESPONSE */
+	unsigned int incarnation;	/* Incremented each time a node joins
+					 * the cluster */
+	unsigned long last_hello;	/* Jiffies */
+        struct timeval join_time;
+};
+
+/* This is how we keep a list of user processes that are listening for cluster
+ * membership events */
+struct notify_struct {
+	struct list_head list;	/* presumably on event_listener_list - confirm */
+	pid_t pid;		/* The listening process */
+	int signal;		/* presumably the signal sent to pid - confirm */
+};
+
+/* This is how we keep a list of kernel callbacks that are registered for
+ * cluster membership events (walked by notify_kernel_listeners()) */
+struct kernel_notify_struct {
+	struct list_head list;	/* Linkage on kernel_listener_list */
+	void (*callback) (kcl_callback_reason, long arg);
+};
+
+/* A message waiting to be sent */
+struct queued_message {
+	struct list_head list;	/* Linkage on messages_list */
+
+	struct socket *socket;
+	struct sockaddr_cl addr;
+	int addr_len;
+	int msg_len;		/* presumably bytes used in msg_buffer - confirm */
+	unsigned char port;
+	unsigned int flags;
+	char msg_buffer[MAX_CLUSTER_MESSAGE];
+};
+
+/* A barrier */
+struct cl_barrier {
+	struct list_head list;	/* presumably on barrier_list - confirm */
+
+	char name[MAX_BARRIER_NAME_LEN];
+	unsigned int flags;
+	enum { BARRIER_STATE_WAITING, BARRIER_STATE_INACTIVE,
+	       BARRIER_STATE_COMPLETE, BARRIER_STATE_DELETED } state;
+	unsigned int expected_nodes;
+	unsigned int registered_nodes;
+	atomic_t     got_nodes;
+	atomic_t     completed_nodes;
+	unsigned int inuse;
+	unsigned int waitsent;
+	unsigned int phase;	/* Completion phase */
+	unsigned int endreason;	/* Reason we were woken, usually 0 */
+	unsigned long timeout;	/* In seconds */
+
+	void (*callback) (char *name, int status);
+	wait_queue_head_t waitq;
+	struct semaphore lock;	/* To synch with cnxman messages */
+	spinlock_t phase2_spinlock;	/* Need to synchronise with timer
+					 * interrupts */
+	struct timer_list timer;	/* presumably the timeout timer - confirm */
+};
+
+/* Cluster protocol commands sent to port 0 */
+#define CLUSTER_CMD_ACK        1
+#define CLUSTER_CMD_LISTENREQ  2
+#define CLUSTER_CMD_LISTENRESP 3
+#define CLUSTER_CMD_PORTCLOSED 4
+#define CLUSTER_CMD_BARRIER    5
+
+extern struct cluster_node *find_node_by_addr(unsigned char *addr,
+					      int addr_len);
+extern struct cluster_node *find_node_by_nodeid(unsigned int id);
+extern struct cluster_node *find_node_by_name(char *name);
+extern void set_quorate(int);
+extern void notify_kernel_listeners(kcl_callback_reason reason, long arg);
+extern void notify_listeners(void);
+extern void free_nodeid_array(void);
+extern int send_reconfigure(int param, unsigned int value);
+extern int calculate_quorum(int, int, int *);
+extern void recalculate_quorum(int);
+extern int send_leave(unsigned char);
+extern int get_quorum(void);
+extern void set_votes(int, int);
+extern void kcl_wait_for_all_acks(void);
+extern char *membership_state(char *, int);
+extern char *leave_string(int reason);
+extern void a_node_just_died(struct cluster_node *node);
+extern void check_barrier_returns(void);
+extern int in_transition(void);
+extern void get_local_addresses(struct cluster_node *node);
+extern int add_node_address(struct cluster_node *node, unsigned char *addr, int len);
+extern void create_proc_entries(void);
+extern void cleanup_proc_entries(void);
+extern unsigned int get_highest_nodeid(void);
+extern int allocate_nodeid_array(void);
+extern void queue_oob_skb(struct socket *sock, int cmd);
+extern int new_temp_nodeid(char *addr, int addrlen);
+extern int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen);
+extern void purge_temp_nodeids(void);
+/* Hex-dump len bytes of addr into buf ("xx " per byte); buf needs len*3+1 */
+extern inline char *print_addr(unsigned char *addr, int len, char *buf)
+{
+	int i, ptr = 0;
+
+	buf[0] = '\0';	/* sprintf() never runs for len <= 0: keep buf valid */
+	for (i = 0; i < len; i++)
+		ptr += sprintf(buf + ptr, "%02x ", addr[i]);
+	return buf;
+}
+
+#define MAX_ADDR_PRINTED_LEN (address_length*3 + 1)
+
+/* Debug enabling macros. Sorry about the C++ comments but they're easier to
+ * get rid of than C ones... */
+
+// #define DEBUG_MEMB
+// #define DEBUG_COMMS
+// #define DEBUG_BARRIER
+
+/* Debug macros */
+#ifdef DEBUG_COMMS
+#define P_COMMS(fmt, args...) printk(KERN_DEBUG "cman comms: " fmt, ## args)
+#else
+#define P_COMMS(fmt, args...)
+#endif
+
+#ifdef DEBUG_BARRIER
+#define P_BARRIER(fmt, args...) printk(KERN_DEBUG "cman barrier: " fmt, ## args)
+#else
+#define P_BARRIER(fmt, args...)
+#endif
+
+#ifdef DEBUG_MEMB
+#define P_MEMB(fmt, args...) printk(KERN_DEBUG "cman memb: " fmt, ## args)
+#define C_MEMB(fmt, args...) printk(fmt, ## args)
+#else
+#define P_MEMB(fmt, args...)
+#define C_MEMB(fmt, args...)
+#endif
+
+#endif				/* __KERNEL__ */
+
+#endif
--- linux-2.6.9.orig/cluster/cman/cnxman.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/cnxman.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,4370 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#define EXPORT_SYMTAB
+#include <linux/init.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <net/sock.h>
+#include <linux/proc_fs.h>
+#include <linux/poll.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/uio.h>
+#include <cluster/cnxman.h>
+#include <cluster/service.h>
+
+#include "cnxman-private.h"
+#include "sm_control.h"
+#include "sm_user.h"
+#include "config.h"
+
+#define CMAN_RELEASE_NAME "2.6.9-45.8"
+
+static void process_incoming_packet(struct cl_comms_socket *csock,
+				    struct msghdr *msg, struct kvec *vec, int veclen, int len);
+static int cl_sendack(struct cl_comms_socket *sock, unsigned short seq,
+		      int addr_len, char *addr, unsigned char remport,
+		      unsigned char flag);
+static void send_listen_request(int nodeid, unsigned char port);
+static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
+				 unsigned char port, unsigned short tag);
+static void resend_last_message(void);
+static void start_ack_timer(void);
+static int send_queued_message(struct queued_message *qmsg);
+static void send_port_close_oob(unsigned char port);
+static void post_close_oob(unsigned char port, int nodeid);
+static void process_barrier_msg(struct cl_barriermsg *msg,
+				struct cluster_node *node);
+static struct cl_barrier *find_barrier(char *name);
+static void tidy_barriers(void);
+static void node_shutdown(void);
+static void node_cleanup(void);
+static int send_or_queue_message(struct socket *sock, void *buf, int len, struct sockaddr_cl *caddr,
+				 unsigned int flags);
+static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur);
+static struct cl_comms_socket *get_peer_interface(int if_num, int mcast);
+static void check_for_unacked_nodes(void);
+static void free_cluster_sockets(void);
+static uint16_t generate_cluster_id(char *name);
+static int is_valid_temp_nodeid(int nodeid);
+
+extern int start_membership_services(pid_t);
+extern int kcl_leave_cluster(int remove);
+extern int send_kill(int nodeid, int needack);
+extern void cman_set_realtime(struct task_struct *tsk, int prio);
+
+static struct proto_ops cl_proto_ops;
+static struct sock *master_sock;
+static kmem_cache_t *cluster_sk_cachep;
+
+/* Pointer to the pseudo node that maintains quorum in a 2node system */
+struct cluster_node *quorum_device = NULL;
+
+/* Array of "ports" allocated. This is just a list of pointers to the sock that
+ * has this port bound. Speed is a major issue here so 1-2K of allocated
+ * storage is worth sacrificing. Port 0 is reserved for protocol messages */
+static struct sock *port_array[256];
+static struct semaphore port_array_lock;
+
+/* Our cluster name & number */
+uint16_t cluster_id;
+char cluster_name[MAX_CLUSTER_NAME_LEN+1];
+
+/* Two-node mode: causes cluster to remain quorate if one of two nodes fails.
+ * No more than two nodes are permitted to join the cluster. */
+unsigned short two_node;
+
+/* Cluster configuration version that must be the same among members. */
+unsigned int config_version;
+
+/* Reference counting for cluster applications */
+atomic_t use_count;
+
+/* Length of sockaddr address for our comms protocol */
+unsigned int address_length;
+
+/* Message sending */
+static unsigned short cur_seq;	/* Last message sent */
+static unsigned int ack_count;	/* Number of acks received for message
+				 * 'cur_seq' */
+static unsigned int acks_expected;	/* Number of acks we expect to receive */
+static struct semaphore send_lock;
+static struct timer_list ack_timer;
+
+/* Saved packet information in case we need to resend it */
+static char saved_msg_buffer[MAX_CLUSTER_MESSAGE];
+static int saved_msg_len;
+static int retry_count;
+
+/* Task variables */
+static pid_t kcluster_pid;
+extern struct task_struct *membership_task;
+extern spinlock_t membership_task_lock;
+extern int quit_threads;
+
+wait_queue_head_t cnxman_waitq;
+
+/* Variables owned by membership services */
+extern int cluster_members;
+extern struct list_head cluster_members_list;
+extern struct semaphore cluster_members_lock;
+extern int we_are_a_cluster_member;
+extern int cluster_is_quorate;
+extern struct cluster_node *us;
+extern struct list_head new_dead_node_list;
+extern spinlock_t new_dead_node_lock;
+extern char nodename[];
+extern int wanted_nodeid;
+
+/* A list of processes listening for membership events */
+static struct list_head event_listener_list;
+static struct semaphore event_listener_lock;
+
+/* A list of kernel callbacks listening for membership events */
+static struct list_head kernel_listener_list;
+static struct semaphore kernel_listener_lock;
+
+/* A list of sockets we are listening on (and can transmit on...later) */
+static struct list_head socket_list;
+
+/* A list of all open cluster client sockets */
+static struct list_head client_socket_list;
+static struct semaphore client_socket_lock;
+
+/* A list of all current barriers */
+static struct list_head barrier_list;
+static struct semaphore barrier_list_lock;
+
+/* When a socket is ready for reading it goes on this queue */
+static spinlock_t active_socket_lock;
+static struct list_head active_socket_list;
+
+/* If the cnxman process is running and available for work */
+atomic_t cnxman_running;
+
+/* Flags set by timers etc for the mainloop to detect and act upon */
+static unsigned long mainloop_flags;
+
+#define ACK_TIMEOUT   1
+#define RESEND_NEEDED 2
+#define TIDY_BARRIERS 3
+
+/* A queue of messages waiting to be sent. If kcl_sendmsg is called outside of
+ * process context then the messages get put in here */
+static struct list_head messages_list;
+static struct semaphore messages_list_lock;
+
+static struct semaphore start_thread_sem;
+
+/* List of outstanding ISLISTENING requests */
+static struct list_head listenreq_list;
+static struct semaphore listenreq_lock;
+
+/* Any sending requests wait on this queue if necessary (eg inquorate, waiting
+ * ACK) */
+static DECLARE_WAIT_QUEUE_HEAD(socket_waitq);
+
+/* Wait for thread to exit properly */
+struct completion cluster_thread_comp;
+struct completion member_thread_comp;
+
+/* The resend delay to use; doubled each time a send is delayed. Documented as
+ * deci-seconds, but start_short_timer() applies it as resend_delay * HZ */
+static int resend_delay = 1;
+
+/* Highest numbered interface and the current default */
+static int num_interfaces;
+static struct cl_comms_socket *current_interface = NULL;
+
+struct temp_node
+{
+	int nodeid;
+	char addr[sizeof(struct sockaddr_in6)];	/* Raw address bytes */
+	int addrlen;	/* presumably bytes of addr in use - confirm */
+	struct list_head list;	/* presumably on tempnode_list - confirm */
+};
+static struct list_head tempnode_list;
+static struct semaphore tempnode_lock;
+
+
+/* This is what's squirrelled away in skb->cb */
+struct cb_info
+{
+	int  orig_nodeid;	/* presumably the sending node's ID - confirm */
+	char orig_port;		/* presumably the sender's port - confirm */
+	char oob;		/* presumably non-zero for OOB data - confirm */
+};
+
+
+/* Wake up any processes that are waiting to send. This is usually called when
+ * all the ACKs have been gathered up or when a node has left the cluster
+ * unexpectedly and we reckon there are no more acks to collect */
+static void unjam(void)
+{
+	wake_up_interruptible(&socket_waitq);	/* senders waiting to send */
+	wake_up_interruptible(&cnxman_waitq);	/* the cman_comms thread */
+}
+
+/* Map a struct sock back to its cl_comms_socket by walking socket_list.
+ * Returns NULL if no comms socket wraps sk. Used by the data_ready hook. */
+static inline struct cl_comms_socket *find_comms_by_sock(struct sock *sk)
+{
+	struct cl_comms_socket *cur;
+
+	list_for_each_entry(cur, &socket_list, list) {
+		if (cur->sock->sk != sk)
+			continue;
+
+		return cur;
+	}
+	return NULL;
+}
+
+/* Data available on socket: queue its comms socket for cluster_kthread() */
+static void cnxman_data_ready(struct sock *sk, int count_unused)
+{
+	struct cl_comms_socket *csock = find_comms_by_sock(sk);
+
+	/* Not one of our sockets - ASSERT ?? */
+	if (!csock)
+		return;
+
+	/* If we're already on the list then don't do it again */
+	if (test_and_set_bit(1, &csock->active))
+		return;
+
+	spin_lock_irq(&active_socket_lock);
+	list_add(&csock->active_list, &active_socket_list);
+	spin_unlock_irq(&active_socket_lock);
+	wake_up_interruptible(&cnxman_waitq);
+}
+
+/* One non-blocking receive on csock; returns recvmsg result (0 if too big) */
+static int receive_message(struct cl_comms_socket *csock, char *iobuf)
+{
+	struct sockaddr_in6 from;
+	struct msghdr hdr;
+	struct kvec iov;
+	int nbytes;
+
+	memset(&from, 0, sizeof (from));
+	hdr.msg_name = &from;
+	hdr.msg_namelen = sizeof (from);
+	hdr.msg_control = NULL;
+	hdr.msg_controllen = 0;
+	hdr.msg_flags = 0;
+
+	iov.iov_base = iobuf;
+	iov.iov_len = MAX_CLUSTER_MESSAGE;
+
+	nbytes = kernel_recvmsg(csock->sock, &hdr, &iov, 1,
+				MAX_CLUSTER_MESSAGE, MSG_DONTWAIT);
+
+	/* Re-point the vector at the whole buffer before handing it on */
+	iov.iov_base = iobuf;
+	iov.iov_len = MAX_CLUSTER_MESSAGE;
+
+	if (nbytes <= 0) {
+		if (nbytes != -EAGAIN)
+			printk(KERN_CRIT CMAN_NAME ": recvmsg failed: %d\n",
+			       nbytes);
+		return nbytes;
+	}
+	if (nbytes > MAX_CLUSTER_MESSAGE) {
+		printk(KERN_CRIT CMAN_NAME
+		       ": %d byte message far too big\n", nbytes);
+		return 0;
+	}
+	process_incoming_packet(csock, &hdr, &iov, 1, nbytes);
+	return nbytes;
+}
+
+/* Main loop of the "cman_comms" thread: receive, act on timer flags, send
+ * queued messages; exits on quit_threads or a pending signal. */
+static int cluster_kthread(void *unused)
+{
+	int len;
+	char *iobuf;
+	struct list_head *socklist;
+	struct cl_comms_socket *csock;
+	wait_queue_t cnxman_waitq_head;
+	sigset_t tmpsig;
+
+	daemonize("cman_comms");
+
+	/* Block everything but SIGKILL/SIGSTOP/SIGTERM (mask, not numbers) */
+	siginitsetinv(&tmpsig, sigmask(SIGKILL) | sigmask(SIGSTOP) |
+		      sigmask(SIGTERM));
+	sigprocmask(SIG_BLOCK, &tmpsig, NULL);
+
+	/* This is the waitq we can wake the process up with */
+	init_waitqueue_head(&cnxman_waitq);
+	init_waitqueue_entry(&cnxman_waitq_head, current);
+	add_wait_queue(&cnxman_waitq, &cnxman_waitq_head);
+
+	cman_set_realtime(current, 1);
+
+	/* Allow the sockets to start receiving */
+	list_for_each(socklist, &socket_list) {
+		csock = list_entry(socklist, struct cl_comms_socket, list);
+		clear_bit(1, &csock->active);
+	}
+
+	iobuf = kmalloc(MAX_CLUSTER_MESSAGE, GFP_KERNEL);
+	if (!iobuf) {
+		printk(KERN_CRIT CMAN_NAME
+		       ": Cannot allocate receive buffer for cluster comms\n");
+		/* The wait entry lives on our stack: unhook it before leaving.
+		 * NOTE(review): cluster_thread_comp is not completed here */
+		remove_wait_queue(&cnxman_waitq, &cnxman_waitq_head);
+		return -1;
+	}
+
+	complete(&cluster_thread_comp);
+
+	for (;;) {
+		struct list_head *temp;
+
+		/* Wait for activity on any of the sockets */
+		set_task_state(current, TASK_INTERRUPTIBLE);
+
+		if (list_empty(&active_socket_list))
+			schedule();
+		set_task_state(current, TASK_RUNNING);
+
+		if (quit_threads)
+			break;
+
+		/* Now receive any messages waiting for us */
+		spin_lock_irq(&active_socket_lock);
+		list_for_each_safe(socklist, temp, &active_socket_list) {
+			csock = list_entry(socklist, struct cl_comms_socket,
+					   active_list);
+
+			list_del(&csock->active_list);
+			clear_bit(1, &csock->active);
+
+			spin_unlock_irq(&active_socket_lock);
+
+			do {
+				len = receive_message(csock, iobuf);
+			} while (len > 0);
+
+			spin_lock_irq(&active_socket_lock);
+
+			if (len == 0)
+				break;	/* EOF on socket */
+		}
+		spin_unlock_irq(&active_socket_lock);
+
+		if (test_and_clear_bit(ACK_TIMEOUT, &mainloop_flags))
+			check_for_unacked_nodes();
+
+		if (test_and_clear_bit(TIDY_BARRIERS, &mainloop_flags))
+			tidy_barriers();
+
+		/* Resend any unacked messages */
+		if (test_and_clear_bit(RESEND_NEEDED, &mainloop_flags)
+		    && acks_expected)
+			resend_last_message();
+
+		/* Send any queued messages */
+		if (acks_expected == 0) {
+			struct list_head *msglist;
+
+			down(&messages_list_lock);
+			list_for_each_safe(msglist, temp, &messages_list) {
+				struct queued_message *qmsg =
+				    list_entry(msglist, struct queued_message,
+					       list);
+				int status = send_queued_message(qmsg);
+
+				if (status >= 0) {
+					/* Succeeded, remove it from the queue */
+					list_del(&qmsg->list);
+					kfree(qmsg);
+				}
+				/* Did it fail horribly ?? */
+				if (status < 0 && status != -EAGAIN) {
+					printk(KERN_INFO CMAN_NAME
+					       ": send_queued_message failed, error %d\n",
+					       status);
+					list_del(&qmsg->list);
+					kfree(qmsg);
+				}
+				break;	/* Only send one message at a time */
+			}
+			up(&messages_list_lock);
+		}
+
+		if (signal_pending(current))
+			break;
+	}
+	P_COMMS("closing down\n");
+
+	quit_threads = 1;	/* force other thread to die too */
+
+	/* Wait for membership thread to finish, that way any
+	   LEAVE message will get sent. */
+	spin_lock(&membership_task_lock);
+	if (membership_task) {
+		wake_up_process(membership_task);
+		spin_unlock(&membership_task_lock);
+		wait_for_completion(&member_thread_comp);
+	} else {
+		spin_unlock(&membership_task_lock);
+	}
+
+	node_shutdown();
+
+	if (timer_pending(&ack_timer))
+		del_timer(&ack_timer);
+
+	node_cleanup();
+	kfree(iobuf);
+
+	/* Unhook our on-stack wait entry so later wake-ups cannot touch it */
+	remove_wait_queue(&cnxman_waitq, &cnxman_waitq_head);
+	complete(&cluster_thread_comp);
+	return 0;
+}
+
+/* Run every registered kernel callback with (reason, arg), under the lock */
+void notify_kernel_listeners(kcl_callback_reason reason, long arg)
+{
+	struct kernel_notify_struct *listener;
+
+	down(&kernel_listener_lock);
+
+	/* Callbacks are invoked in list order with the semaphore held */
+	list_for_each_entry(listener, &kernel_listener_list, list)
+		listener->callback(reason, arg);
+
+	up(&kernel_listener_lock);
+}
+
+static void check_for_unacked_nodes()
+{
+	struct list_head *nodelist;
+	struct list_head *temp;
+	struct cluster_node *node;
+
+	clear_bit(RESEND_NEEDED, &mainloop_flags);
+	retry_count = 0;
+	/* Run from cluster_kthread() once ack_timer_fn() set ACK_TIMEOUT */
+	P_COMMS("Retry count exceeded -- looking for dead node\n");
+
+	/* Node did not ACK a message after <n> tries, remove it from the
+	 * cluster */
+	down(&cluster_members_lock);
+	list_for_each_safe(nodelist, temp, &cluster_members_list) {
+		node = list_entry(nodelist, struct cluster_node, list);
+
+		P_COMMS("checking node %s: last_acked = %d, last_seq_sent = %d\n",
+			node->name, node->last_seq_acked, node->last_seq_sent);
+		if (node->state != NODESTATE_DEAD &&
+		    node->last_seq_acked != node->last_seq_sent && !node->us) {
+			/* NOTE(review): 'temp' is not re-read after relocking */
+			/* Drop this lock or we can deadlock with membership */
+			up(&cluster_members_lock);
+
+			/* Start a state transition */
+			node->leave_reason = CLUSTER_LEAVEFLAG_NORESPONSE;
+			a_node_just_died(node);
+			down(&cluster_members_lock);
+		}
+	}
+	up(&cluster_members_lock);
+	acks_expected = ack_count = 0;	/* Nothing left to wait for */
+	unjam();
+	return;
+}
+
+/* ack_timer handler: flag either a resend or, once retry_count exceeds
+ * cman_config.max_retries, an ACK timeout, then wake the comms thread */
+static void ack_timer_fn(unsigned long arg)
+{
+	int flag;
+
+	P_COMMS("%ld: ack_timer fired, retries=%d\n", jiffies, retry_count);
+
+	/* Too many retries means ACK_TIMEOUT, otherwise resend last message */
+	retry_count++;
+	flag = (retry_count > cman_config.max_retries) ? ACK_TIMEOUT
+						       : RESEND_NEEDED;
+	set_bit(flag, &mainloop_flags);
+	wake_up_interruptible(&cnxman_waitq);
+}
+
+/* Called to resend a packet if sock_sendmsg was busy */
+static void short_timer_fn(unsigned long arg)
+{
+	P_COMMS("short_timer fired\n");
+
+	/* Resend last message; the main loop acts on RESEND_NEEDED */
+	resend_delay <<= 1;	/* Back off: double the delay for next time */
+	set_bit(RESEND_NEEDED, &mainloop_flags);
+	wake_up_interruptible(&cnxman_waitq);
+}
+
+static void start_ack_timer()
+{
+	ack_timer.function = ack_timer_fn;	/* Counts retries when it fires */
+	ack_timer.data = 0L;
+	mod_timer(&ack_timer, jiffies + HZ);	/* (Re)arm HZ jiffies from now */
+}
+
+static void start_short_timer(void)
+{
+	ack_timer.function = short_timer_fn;	/* Shares ack_timer itself */
+	ack_timer.data = 0L;
+	mod_timer(&ack_timer, jiffies + (resend_delay * HZ)); /* backed-off delay */
+}
+
+
+/* Look up the outstanding listen request carrying this tag on listenreq_list;
+ * returns NULL when there is none */
+static struct cl_waiting_listen_request *find_listen_request(unsigned short tag)
+{
+	struct cl_waiting_listen_request *req;
+
+	list_for_each_entry(req, &listenreq_list, list) {
+		if (req->tag != tag)
+			continue;
+
+		return req;
+	}
+	return NULL;
+}
+
+/* Count an ACK for seq from rem_node; unjam senders once all have arrived */
+static void process_ack(struct cluster_node *rem_node, unsigned short seq)
+{
+	unsigned short acked = le16_to_cpu(seq);
+
+	if (!rem_node || rem_node->state == NODESTATE_DEAD)
+		return;
+	/* This copes with duplicate acks from a multipathed host */
+	if (rem_node->last_seq_acked == acked)
+		return;
+	rem_node->last_seq_acked = acked;
+
+	/* Got em all: cancel the timer and release anyone waiting */
+	if (++ack_count < acks_expected)
+		return;
+	del_timer(&ack_timer);
+	acks_expected = 0;
+	unjam();
+}
+
+/*
+ * Handle a protocol message (one addressed to port 0).
+ *
+ * @csock:    comms socket the packet arrived on; used when sending replies
+ * @data:     start of the packet, beginning with struct cl_protheader
+ * @len:      packet length (not used by this function)
+ * @addr:     sender's address, passed on to cl_sendack()
+ * @addrlen:  length of @addr
+ * @rem_node: sending node, or NULL if it is not known
+ *
+ * Every command except CLUSTER_CMD_ACK is acknowledged with cl_sendack()
+ * before it is acted upon.
+ */
+static void process_cnxman_message(struct cl_comms_socket *csock, char *data,
+				   int len, char *addr, int addrlen,
+				   struct cluster_node *rem_node)
+{
+	struct cl_protheader *header = (struct cl_protheader *) data;
+	struct cl_protmsg *msg = (struct cl_protmsg *) data;
+	/* Command-specific body that follows the protocol header */
+	char *payload = data + sizeof (struct cl_protheader);
+
+	P_COMMS("Message on port 0 is %d\n", msg->cmd);
+
+	switch (msg->cmd) {
+	case CLUSTER_CMD_ACK: {
+		struct cl_ackmsg *ackmsg = (struct cl_ackmsg *) data;
+
+		/* Bit 0 of aflags set: warn that the remote node had no
+		 * listener on the port */
+		if (rem_node && (ackmsg->aflags & 1)) {
+			if (net_ratelimit())
+				printk(KERN_INFO CMAN_NAME
+				       ": WARNING no listener for port %d on node %s\n",
+				       ackmsg->remport, rem_node->name);
+		}
+		P_COMMS("Got ACK from %s. seq=%d (cur=%d)\n",
+			rem_node ? rem_node->name : "Unknown",
+			le16_to_cpu(ackmsg->header.ack), cur_seq);
+
+		/* ACK processing has already happened */
+		break;
+	}
+
+	/* Remote node asks whether we have a listener on a port; the
+	 * answer goes back via send_listen_response() */
+	case CLUSTER_CMD_LISTENREQ: {
+		struct cl_listenmsg *listenmsg =
+			(struct cl_listenmsg *) payload;
+
+		cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
+		send_listen_response(csock, le32_to_cpu(header->srcid),
+				     listenmsg->target_port, listenmsg->tag);
+		break;
+	}
+
+	/* Answer to one of our listen requests: record the result and wake
+	 * up the process waiting for it */
+	case CLUSTER_CMD_LISTENRESP: {
+		struct cl_listenmsg *listenmsg =
+			(struct cl_listenmsg *) payload;
+		struct cl_waiting_listen_request *listen_request;
+
+		cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
+
+		down(&listenreq_lock);
+		listen_request = find_listen_request(listenmsg->tag);
+		if (listen_request) {
+			listen_request->result = listenmsg->listening;
+			listen_request->waiting = 0;
+			wake_up_interruptible(&listen_request->waitq);
+		}
+		up(&listenreq_lock);
+		break;
+	}
+
+	case CLUSTER_CMD_PORTCLOSED: {
+		struct cl_closemsg *closemsg = (struct cl_closemsg *) payload;
+
+		cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
+		post_close_oob(closemsg->port, le32_to_cpu(header->srcid));
+		break;
+	}
+
+	case CLUSTER_CMD_BARRIER: {
+		struct cl_barriermsg *barriermsg =
+			(struct cl_barriermsg *) payload;
+
+		cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
+		/* Barrier messages from unknown nodes are ACKed but ignored */
+		if (rem_node)
+			process_barrier_msg(barriermsg, rem_node);
+		break;
+	}
+
+	default:
+		printk(KERN_ERR CMAN_NAME
+		       ": Unknown protocol message %d received\n", msg->cmd);
+		break;
+	}
+}
+
+/*
+ * Check whether @addr matches one of the addresses recorded for @node.
+ * Returns 1 on a match, 0 otherwise.
+ */
+static int valid_addr_for_node(struct cluster_node *node, char *addr)
+{
+	struct list_head *pos;
+	int matched = 0;
+
+	/* The first two bytes of each address are skipped: they hold the
+	 * Address Family, which is always in native byte order and so would
+	 * not match in a cluster that mixes big- and little-endian machines.
+	 */
+	list_for_each(pos, &node->addr_list) {
+		struct cluster_node_addr *candidate =
+			list_entry(pos, struct cluster_node_addr, list);
+
+		if (!memcmp(candidate->addr+2, addr+2, address_length-2)) {
+			matched = 1; /* TRUE */
+			break;
+		}
+	}
+
+	return matched;
+}
+
+/*
+ * Gather @len bytes from the kvec array starting at @vec into the flat
+ * buffer @data. Each kvec that is read from has its iov_base advanced and
+ * its iov_len reduced by the amount consumed. Empty kvecs are skipped.
+ * There is no bound on the array: the caller must supply at least @len
+ * bytes in total.
+ */
+static void memcpy_fromkvec(void *data, struct kvec *vec, int len)
+{
+	for (; len > 0; vec++) {
+		int chunk;
+
+		if (!vec->iov_len)
+			continue;
+
+		chunk = min_t(unsigned int, len, vec->iov_len);
+		memcpy(data, vec->iov_base, chunk);
+
+		data += chunk;
+		len -= chunk;
+		vec->iov_base += chunk;
+		vec->iov_len -= chunk;
+	}
+}
+
+/*
+ * Deliver the payload of a received message to whatever is bound to
+ * header->tgtport on this node.
+ *
+ * @csock:  comms socket the packet arrived on (used for sending the ACK)
+ * @header: protocol header of the packet
+ * @msg:    msghdr of the receive; msg_name/msg_namelen give the sender
+ * @iov:    kvec(s) holding the payload; consumed by memcpy_fromkvec() on
+ *          the paths that copy the data
+ * @veclen: number of entries in @iov
+ * @len:    payload length in bytes
+ *
+ * Returns @len, or -ENOMEM if a buffer could not be allocated. Note that
+ * @len is also returned when nobody is listening and when queueing the skb
+ * fails.
+ *
+ * port_array_lock is taken on entry and released on every return path.
+ */
+static int send_to_user_port(struct cl_comms_socket *csock,
+			     struct cl_protheader *header,
+			     struct msghdr *msg,
+			     struct kvec *iov, int veclen,
+			     int len)
+{
+	struct sk_buff *skb;
+	struct cb_info *cbinfo;
+	int err;
+	int flags = le32_to_cpu(header->flags);
+
+	/* Get the port number and look for a listener */
+	down(&port_array_lock);
+	if (port_array[header->tgtport]) {
+		struct cluster_sock *c = cluster_sk(port_array[header->tgtport]);
+
+		/* ACK it, unless the sender asked for no ACK or expects a
+		 * reply (MSG_REPLYEXP) */
+		if (!(flags & MSG_NOACK) && !(flags & MSG_REPLYEXP)) {
+
+			cl_sendack(csock, header->seq, msg->msg_namelen,
+				   msg->msg_name, header->tgtport, 0);
+		}
+
+		/* Call a callback if there is one. The lock is dropped
+		 * before the callback runs. */
+		if (c->kernel_callback) {
+			up(&port_array_lock);
+			if (veclen == 1) {
+				/* Single kvec: hand the buffer over as-is */
+				c->kernel_callback(iov->iov_base,
+						   iov->iov_len,
+						   msg->msg_name, msg->msg_namelen,
+						   le32_to_cpu(header->srcid));
+
+			}
+			else { /* Unroll iov, this Hardly ever Happens */
+				char *data;
+				data = kmalloc(len, GFP_KERNEL);
+				if (!data)
+					return -ENOMEM;
+
+				/* Flatten the kvecs into one temporary
+				 * buffer for the callback */
+				memcpy_fromkvec(data, iov, len);
+				c->kernel_callback(data, len,
+						   msg->msg_name, msg->msg_namelen,
+						   le32_to_cpu(header->srcid));
+				kfree(data);
+			}
+			return len;
+		}
+
+		/* Otherwise put it into an SKB and pass it onto the recvmsg
+		 * mechanism */
+		skb = alloc_skb(len, GFP_KERNEL);
+		if (!skb) {
+			up(&port_array_lock);
+			printk(KERN_INFO CMAN_NAME
+			       ": Failed to allocate skb\n");
+			return -ENOMEM;
+		}
+
+		skb_put(skb, len);
+		memcpy_fromkvec(skb->data, iov, len);
+
+		/* Put metadata into cb[] */
+		cbinfo = (struct cb_info *)skb->cb;
+		cbinfo->orig_nodeid = le32_to_cpu(header->srcid);
+		cbinfo->orig_port = header->srcport;
+		cbinfo->oob = 0;
+
+		if ((err =
+		     sock_queue_rcv_skb(port_array[header->tgtport], skb)) < 0) {
+
+			/* Queueing failed: log it and drop the message */
+			printk(KERN_INFO CMAN_NAME
+			       ": Error queueing request to port %d: %d\n",
+			       header->tgtport, err);
+			kfree_skb(skb);
+
+			/* If the port was MEMBERSHIP then we have to die */
+			if (header->tgtport == CLUSTER_PORT_MEMBERSHIP) {
+				up(&port_array_lock);
+				send_leave(CLUSTER_LEAVEFLAG_PANIC);
+				panic("membership stopped responding");
+			}
+		}
+		up(&port_array_lock);
+
+	}
+	else {
+		/* ACK it, but set the flag bit so remote end knows no-one
+		 * caught it */
+		if (!(flags & MSG_NOACK))
+			cl_sendack(csock, header->seq,
+				   msg->msg_namelen, msg->msg_name,
+				   header->tgtport, 1);
+
+		/* Nobody listening, drop it */
+		up(&port_array_lock);
+	}
+	return len;
+}
+
+/*
+ * Filter one received packet and dispatch it.
+ *
+ * @csock:  comms socket the packet arrived on
+ * @msg:    msghdr of the receive; msg_name/msg_namelen give the sender
+ * @vec:    kvec holding the packet, starting with struct cl_protheader
+ * @veclen: number of entries in @vec
+ * @len:    total packet length
+ *
+ * Packets for another cluster, from ourselves, for another node, duplicates
+ * and packets from an address that does not belong to the claimed node are
+ * dropped. What remains goes to process_cnxman_message() when the target
+ * port is 0, otherwise to send_to_user_port().
+ *
+ * NOTE: This routine knows (assumes!) that there is only one
+ * iov element passed into it: the header is read from vec[0] and only
+ * vec[0] is adjusted to skip it.
+ */
+static void process_incoming_packet(struct cl_comms_socket *csock,
+				    struct msghdr *msg,
+				    struct kvec *vec, int veclen, int len)
+{
+	char *data = vec->iov_base;
+	char *addr = msg->msg_name;
+	int addrlen = msg->msg_namelen;
+	struct cl_protheader *header = (struct cl_protheader *) data;
+	int flags = le32_to_cpu(header->flags);
+	struct cluster_node *rem_node =
+		find_node_by_nodeid(le32_to_cpu(header->srcid));
+
+	P_COMMS("seen message, from %d for %d, sequence num = %d, rem_node=%p, state=%d\n",
+	     le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
+	     le16_to_cpu(header->seq), rem_node,
+	     rem_node ? rem_node->state : -1);
+
+	/* If the remote end is being coy about its node ID then look it up by
+	 * address */
+	if (!rem_node && header->srcid == 0) {
+		rem_node = find_node_by_addr(addr, addrlen);
+	}
+
+	/* If this node is an ex-member then treat it as unknown */
+	if (rem_node && rem_node->state != NODESTATE_MEMBER
+	    && rem_node->state != NODESTATE_JOINING)
+		rem_node = NULL;
+
+	/* Ignore messages not for our cluster */
+	if (le16_to_cpu(header->cluster) != cluster_id) {
+		/* NOTE(review): header->cluster is printed here without the
+		 * le16_to_cpu() used in the comparison above */
+		P_COMMS("Dumping message - wrong cluster ID (us=%d, msg=%d)\n",
+			cluster_id, header->cluster);
+		goto incoming_finish;
+	}
+
+	/* If the message is from us then just dump it */
+	if (rem_node && rem_node->us)
+		goto incoming_finish;
+
+	/* If we can't find the nodeid then check for our own messages the hard
+	 * way - this only happens during joining */
+	if (!rem_node) {
+		struct list_head *socklist;
+		struct cl_comms_socket *clsock;
+
+		/* Compare the sender address with the bound address of each
+		 * of our receive-only sockets */
+		list_for_each(socklist, &socket_list) {
+			clsock =
+			    list_entry(socklist, struct cl_comms_socket, list);
+
+			if (clsock->recv_only) {
+
+				if (memcmp(addr, &clsock->saddr, address_length) == 0) {
+					goto incoming_finish;
+				}
+			}
+		}
+
+	}
+
+	/* Ignore messages not for us (a target id of 0 is accepted by
+	 * every node) */
+	if (le32_to_cpu(header->tgtid) > 0 && us
+	    && le32_to_cpu(header->tgtid) != us->node_id) {
+		goto incoming_finish;
+	}
+
+	P_COMMS("got message, from %d for %d, sequence num = %d\n",
+		le32_to_cpu(header->srcid), le32_to_cpu(header->tgtid),
+		le16_to_cpu(header->seq));
+
+	/* A non-zero ack field acknowledges one of our messages. It is
+	 * passed on still in wire byte order; process_ack() converts it. */
+	if (header->ack && rem_node) {
+		process_ack(rem_node, header->ack);
+	}
+
+	/* Have we received this message before ? If so just ignore it, it's a
+	 * resend for someone else's benefit. The difference is taken in
+	 * 16-bit signed arithmetic so the comparison copes with sequence
+	 * number wrap-around. */
+	if (!(flags & MSG_NOACK) &&
+	    rem_node && rem_node->last_seq_recv &&
+	    (short)((short)le16_to_cpu(header->seq) - (short)rem_node->last_seq_recv) <= 0) {
+		P_COMMS("Discarding message -  seq = %d, last_seen = %d\n",
+			header->seq, rem_node->last_seq_recv);
+		/* Still need to ACK it though, in case it was the ACK that got
+		 * lost */
+		cl_sendack(csock, header->seq, addrlen, addr, header->tgtport, 0);
+		goto incoming_finish;
+	}
+
+	/* Check that the message is from the node we think it is from */
+	if (rem_node && !valid_addr_for_node(rem_node, addr)) {
+		return;
+	}
+
+	/* If it's a new node then assign it a temporary node ID. This
+	 * overwrites srcid in the received header itself. */
+	if (!rem_node)
+		header->srcid = cpu_to_le32(new_temp_nodeid(addr, addrlen));
+
+	P_COMMS("Got message: flags = %x, port = %d, we_are_a_member = %d\n",
+		flags, header->tgtport, we_are_a_cluster_member);
+
+
+	/* If we are not part of the cluster then ignore multicast messages
+	 * that need an ACK as we will confuse the sender who is only expecting
+	 * ACKS from bona fide members */
+	if ((flags & MSG_MULTICAST) &&
+	    !(flags & MSG_NOACK) && !we_are_a_cluster_member) {
+		P_COMMS
+		    ("Discarding message - multicast and we are not a cluster member. port=%d flags=%x\n",
+		     header->tgtport, flags);
+		goto incoming_finish;
+	}
+
+	/* Save the sequence number of this message so we can ignore duplicates
+	 * (above). Ignore ACKS(seq==0)! */
+	if (rem_node && header->seq) {
+		rem_node->last_seq_recv = le16_to_cpu(header->seq);
+		if (!(flags & MSG_NOACK))
+			rem_node->last_ackneeded_seq_recv = le16_to_cpu(header->seq);
+	}
+
+	/* Is it a protocol message? */
+	if (header->tgtport == 0) {
+		process_cnxman_message(csock, data, len, addr, addrlen,
+				       rem_node);
+		goto incoming_finish;
+	}
+
+	/* Skip past the header to the data */
+	vec[0].iov_base = data + sizeof (struct cl_protheader);
+	vec[0].iov_len -= sizeof (struct cl_protheader);
+	len -= sizeof (struct cl_protheader);
+
+	/* The return value (length or -ENOMEM) is not examined */
+	send_to_user_port(csock, header, msg, vec, veclen, len);
+
+      incoming_finish:
+	return;
+}
+
+/*
+ * Allocate and initialise the struct sock behind an AF_CLUSTER socket.
+ *
+ * @sock: the socket to attach to; may be NULL, in which case no proto ops
+ *        are installed
+ * @gfp:  allocation flags, also stored as the sock's sk_allocation
+ *
+ * Returns the new sock (unbound: port 0, no service data) or NULL if
+ * sk_alloc() fails.
+ */
+static struct sock *cl_alloc_sock(struct socket *sock, int gfp)
+{
+	struct cluster_sock *c;
+	struct sock *sk;
+
+	sk = sk_alloc(AF_CLUSTER, gfp, sizeof (struct cluster_sock),
+		      cluster_sk_cachep);
+	if (sk == NULL)
+		return NULL;
+
+	if (sock)
+		sock->ops = &cl_proto_ops;
+	sock_init_data(sock, sk);
+
+	sk->sk_family = PF_CLUSTER;
+	sk->sk_allocation = gfp;
+	sk->sk_no_check = 1;
+	sk->sk_destruct = NULL;
+
+	/* Cluster-specific part */
+	c = cluster_sk(sk);
+	c->port = 0;
+	c->service_data = NULL;
+
+	return sk;
+}
+
+/*
+ * Release an AF_CLUSTER socket: free its port, tell the rest of the cluster
+ * the listener has gone, detach service-manager data, tear down the comms
+ * sockets if an unstarted master socket is closed, drop the sock and remove
+ * the socket from the client list.
+ *
+ * client_socket_lock is held for the whole function. Always returns 0.
+ */
+static int cl_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct cl_client_socket *csock;
+	struct list_head *socklist;
+	struct list_head *tmp;
+
+	down(&client_socket_lock);
+	if (sk) {
+		/* Remove port allocations if it's a bound socket */
+		struct cluster_sock *c = cluster_sk(sk);
+
+		down(&port_array_lock);
+		if (c->port) {
+			port_array[c->port] = NULL;
+		}
+		up(&port_array_lock);
+
+		/* Tell other nodes in the cluster that this listener is going
+		 * away */
+		if (atomic_read(&cnxman_running) && c->port)
+			send_port_close_oob(c->port);
+
+		/* Socket carries service data: let sm_sock_release() deal
+		 * with it */
+		if (c->service_data)
+			sm_sock_release(sock);
+
+		/* Master socket released ? */
+		if (sk->sk_protocol == CLPROTO_MASTER) {
+			master_sock = NULL;
+
+			/* If this socket is being freed and cnxman is not
+			 * started then free all the comms sockets as either
+			 * the userland "join" process has crashed or the
+			 * join failed.
+			 */
+			if (!atomic_read(&cnxman_running)) {
+				quit_threads = 1;
+				free_cluster_sockets();
+			}
+		}
+
+		/* Detach the sock from the socket. An extra reference is
+		 * taken around the lock/release pair, then two references
+		 * are dropped: the extra one and the socket's own. */
+		sock_orphan(sk);
+		sock_hold(sk);
+		lock_sock(sk);
+		release_sock(sk);
+		sock_put(sk);
+		sock_put(sk);
+		sock->sk = NULL;
+	}
+
+	/* Remove it from the list of clients */
+	list_for_each_safe(socklist, tmp, &client_socket_list) {
+		csock = list_entry(socklist, struct cl_client_socket, list);
+
+		if (csock->sock == sock) {
+			list_del(&csock->list);
+			kfree(csock);
+			break;
+		}
+	}
+	up(&client_socket_lock);
+
+	return 0;
+}
+
+/*
+ * Create an AF_CLUSTER socket.
+ *
+ * Returns 0 on success, or:
+ *   -ESOCKTNOSUPPORT  the socket is not SOCK_DGRAM
+ *   -EPERM            master socket requested without CAP_CLUSTER
+ *   -EBUSY            a master socket already exists
+ *   -ENETDOWN         non-master socket requested while cnxman is not running
+ *   -ENOBUFS          the sock could not be allocated
+ *   -ENOMEM           the client list entry could not be allocated
+ */
+static int cl_create(struct socket *sock, int protocol)
+{
+	struct cl_client_socket *clsock;
+	struct sock *sk;
+
+	/* All are datagrams */
+	if (sock->type != SOCK_DGRAM)
+		return -ESOCKTNOSUPPORT;
+
+	if (protocol == CLPROTO_MASTER) {
+		if (!capable(CAP_CLUSTER))
+			return -EPERM;
+
+		/* Can only have one master socket */
+		if (master_sock)
+			return -EBUSY;
+	} else if (!atomic_read(&cnxman_running)) {
+		/* cnxman not running and a client was requested */
+		return -ENETDOWN;
+	}
+
+	sk = cl_alloc_sock(sock, GFP_KERNEL);
+	if (sk == NULL)
+		return -ENOBUFS;
+
+	sk->sk_protocol = protocol;
+
+	if (protocol == CLPROTO_MASTER)
+		master_sock = sk;
+
+	if (protocol != CLPROTO_CLIENT)
+		return 0;
+
+	/* Add client sockets to the list */
+	clsock = kmalloc(sizeof (struct cl_client_socket), GFP_KERNEL);
+	if (!clsock) {
+		/* Undo the sock allocation done above */
+		cl_release(sock);
+		return -ENOMEM;
+	}
+	clsock->sock = sock;
+
+	down(&client_socket_lock);
+	list_add(&clsock->list, &client_socket_list);
+	up(&client_socket_lock);
+
+	return 0;
+}
+
+/*
+ * Bind an AF_CLUSTER socket to a cluster port.
+ *
+ * Requires CAP_NET_BIND_SERVICE. Returns 0 on success, -EPERM without the
+ * capability, -EINVAL for an already-bound socket or a malformed address
+ * (wrong length, wrong family, port 0), -EADDRINUSE if the port is taken,
+ * and -ENOTCONN if the cluster went down while waiting for membership.
+ */
+static int cl_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_cl *saddr = (struct sockaddr_cl *) uaddr;
+	struct cluster_sock *c = cluster_sk(sk);
+
+	if (!capable(CAP_NET_BIND_SERVICE))
+		return -EPERM;
+
+	/* sk_zapped is cleared below once a bind succeeds, so zero here
+	 * means the socket has been bound before */
+	if (sk->sk_zapped == 0)
+		return -EINVAL;
+
+	if (addr_len != sizeof (struct sockaddr_cl))
+		return -EINVAL;
+
+	if (saddr->scl_family != AF_CLUSTER)
+		return -EINVAL;
+
+	if (saddr->scl_port == 0)
+		return -EINVAL;	/* Port 0 is reserved for protocol messages */
+
+	/* Claim the port; check and assignment happen under the lock */
+	down(&port_array_lock);
+
+	if (port_array[saddr->scl_port]) {
+		up(&port_array_lock);
+		return -EADDRINUSE;
+	}
+
+	port_array[saddr->scl_port] = sk;
+
+	up(&port_array_lock);
+
+	c->port = saddr->scl_port;
+	sk->sk_zapped = 0;
+
+	/* If we are not a cluster member yet then make the client wait until
+	 * we are, this allows nodes to start cluster clients at the same time
+	 * as cluster services but they will wait until membership is achieved.
+	 * This looks odd in bind() (open would seem more obvious) but we need
+	 * to know which port number is being used so that things like
+	 * membership services don't get blocked
+	 */
+
+	/* Ports up to HIGH_PROTECTED_PORT never wait here.
+	 * NOTE(review): the loop sleeps in TASK_INTERRUPTIBLE but never
+	 * tests signal_pending(), so a pending signal is not reported to
+	 * the caller - confirm this is intended. */
+	if (saddr->scl_port > HIGH_PROTECTED_PORT)
+		while (!we_are_a_cluster_member || !cluster_is_quorate
+		       || in_transition()) {
+			DECLARE_WAITQUEUE(wq, current);
+			struct task_struct *tsk = current;
+
+			set_task_state(tsk, TASK_INTERRUPTIBLE);
+			add_wait_queue(&socket_waitq, &wq);
+
+			/* Re-test after queueing so a wake-up between the
+			 * loop test and add_wait_queue() is not missed */
+			if (!we_are_a_cluster_member || !cluster_is_quorate
+			    || in_transition())
+				schedule();
+
+			set_task_state(tsk, TASK_RUNNING);
+			remove_wait_queue(&socket_waitq, &wq);
+
+			/* We were woken up because the cluster is going down,
+			 * ...and we never got a chance to do any work! (sob)
+			 * The port stays recorded in port_array and c->port
+			 * on this path. */
+			if (atomic_read(&cnxman_running) == 0 || quit_threads) {
+				return -ENOTCONN;
+			}
+		}
+
+	return 0;
+}
+
+/*
+ * Report the address of an AF_CLUSTER socket: family AF_CLUSTER, the bound
+ * port (0 if unbound) and zero flags. @peer is ignored. *@uaddr_len is set
+ * to sizeof(struct sockaddr_cl). Always returns 0.
+ */
+static int cl_getname(struct socket *sock, struct sockaddr *uaddr,
+		      int *uaddr_len, int peer)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_cl *out = (struct sockaddr_cl *) uaddr;
+
+	*uaddr_len = sizeof (*out);
+
+	/* Fill in the address while holding the socket lock */
+	lock_sock(sk);
+	out->scl_port = cluster_sk(sk)->port;
+	out->scl_flags = 0;
+	out->scl_family = AF_CLUSTER;
+	release_sock(sk);
+
+	return 0;
+}
+
+/* poll() handler for AF_CLUSTER sockets: defers entirely to datagram_poll() */
+static unsigned int cl_poll(struct file *file, struct socket *sock,
+			    poll_table *wait)
+{
+	unsigned int mask = datagram_poll(file, sock, wait);
+
+	return mask;
+}
+
+/*
+ * Copy internal node format to userland format.
+ *
+ * @node:  kernel-side description of the node (source)
+ * @unode: userland-format structure to fill in (destination, kernel memory)
+ *
+ * The name is copied with strcpy(), so @unode->name must be large enough
+ * for @node->name including its terminator. unode->size is set to
+ * sizeof(struct cl_cluster_node).
+ */
+void copy_to_usernode(struct cluster_node *node,
+			     struct cl_cluster_node *unode)
+{
+	unode->size = sizeof (struct cl_cluster_node);
+	strcpy(unode->name, node->name);
+
+	/* Identity */
+	unode->node_id = node->node_id;
+	unode->us = node->us;
+	unode->incarnation = node->incarnation;
+
+	/* Membership status */
+	unode->state = node->state;
+	unode->votes = node->votes;
+	unode->leave_reason = node->leave_reason;
+}
+
+/*
+ * Register a socket handed in from userland as a cluster comms socket.
+ *
+ * @broadcast: non-zero for the broadcast/multicast (sending) socket of an
+ *             interface, zero for its receive-only socket
+ * @number:    interface number the socket belongs to
+ * @sock:      the socket itself
+ * @file:      struct file the socket came from, remembered in the entry
+ *
+ * Returns 0 on success or -ENOMEM. Side effects: updates num_interfaces,
+ * current_interface (first receive-only socket seen) and address_length,
+ * and redirects the socket's sk_data_ready to cnxman_data_ready.
+ */
+static int add_clsock(int broadcast, int number, struct socket *sock,
+		      struct file *file)
+{
+	struct cl_comms_socket *newsock;
+	struct cl_comms_socket *peer;
+
+	newsock = kmalloc(sizeof (struct cl_comms_socket), GFP_KERNEL);
+	if (!newsock)
+		return -ENOMEM;
+	memset(newsock, 0, sizeof (*newsock));
+
+	newsock->number = number;
+	newsock->sock = sock;
+	newsock->file = file;
+
+	/* A socket is either the broadcast one or the receive-only one */
+	newsock->broadcast = broadcast ? 1 : 0;
+	newsock->recv_only = broadcast ? 0 : 1;
+
+	newsock->addr_len = sizeof(struct sockaddr_in6);
+
+	/* Mark it active until cnxman thread is running and ready to process
+	 * messages */
+	set_bit(1, &newsock->active);
+
+	/* Find out what it's bound to; getname() stores the real address
+	 * length in addr_len */
+	sock->ops->getname(sock, (struct sockaddr *)&newsock->saddr,
+			   &newsock->addr_len, 0);
+
+	num_interfaces = max(num_interfaces, newsock->number);
+	if (!current_interface && newsock->recv_only)
+		current_interface = newsock;
+
+	/* Get peer, if this fails because we're the first socket with this
+	   number then that's fine. The subsequent call will fill in both */
+	peer = get_peer_interface(number, !broadcast);
+	if (peer) {
+		newsock->peer = peer;
+		peer->peer = newsock;
+	}
+
+	/* Hook data_ready */
+	sock->sk->sk_data_ready = cnxman_data_ready;
+
+	/* Make an attempt to keep them in order */
+	list_add_tail(&newsock->list, &socket_list);
+
+	address_length = newsock->addr_len;
+	return 0;
+}
+
+/* ioctl processing functions */
+
+/*
+ * SIOCCLUSTER_SET_VERSION: set the cluster configuration version.
+ *
+ * @arg: user pointer to a struct cl_version. Its major/minor/patch must
+ *       equal this module's protocol version; only the config field is
+ *       taken over.
+ *
+ * Returns 0 on success (including when the version is unchanged), -EPERM
+ * without CAP_CLUSTER, -EINVAL for a NULL @arg or a protocol version
+ * mismatch, -ENOENT if we are not a cluster member, -EFAULT on a bad pointer.
+ */
+static int do_ioctl_set_version(unsigned long arg)
+{
+	struct cl_version *u_version = (struct cl_version *) arg;
+	struct cl_version version;
+	int same_protocol;
+
+	if (!capable(CAP_CLUSTER))
+		return -EPERM;
+
+	if (arg == 0)
+		return -EINVAL;
+
+	if (!we_are_a_cluster_member)
+		return -ENOENT;
+
+	if (copy_from_user(&version, u_version, sizeof(struct cl_version)))
+		return -EFAULT;
+
+	same_protocol = version.major == CNXMAN_MAJOR_VERSION &&
+			version.minor == CNXMAN_MINOR_VERSION &&
+			version.patch == CNXMAN_PATCH_VERSION;
+	if (!same_protocol)
+		return -EINVAL;
+
+	/* Only announce a reconfigure when the version really changes */
+	if (config_version != version.config) {
+		config_version = version.config;
+		send_reconfigure(RECONFIG_PARAM_CONFIG_VERSION, config_version);
+	}
+
+	return 0;
+}
+
+/*
+ * SIOCCLUSTER_GETMEMBERS: return the active membership list.
+ *
+ * @arg: 0, or a user pointer to a struct cl_cluster_nodelist whose
+ *       max_members gives the capacity of the user array at ->nodes.
+ *
+ * With @arg == 0 the current value of cluster_members is returned.
+ * Otherwise every node in NODESTATE_MEMBER is copied to the user array and
+ * the number of entries written is returned.
+ *
+ * Errors: -ENOENT if we are not a cluster member, -EFAULT on a bad user
+ * pointer, -E2BIG if max_members is smaller than cluster_members.
+ */
+static int do_ioctl_get_members(unsigned long arg)
+{
+	struct cluster_node *node;
+	/* Kernel copies */
+	struct cl_cluster_node user_format_node;
+	struct cl_cluster_nodelist user_format_nodelist;
+	/* User space array ptr */
+	struct cl_cluster_node *user_node;
+	struct list_head *nodelist;
+	int num_nodes = 0;
+
+	if (!we_are_a_cluster_member)
+		return -ENOENT;
+
+	if (arg == 0)
+		return cluster_members;
+
+	if (copy_from_user(&user_format_nodelist, (void __user *)arg, sizeof(struct cl_cluster_nodelist)))
+		return -EFAULT;
+
+	down(&cluster_members_lock);
+
+	if (user_format_nodelist.max_members < cluster_members) {
+		num_nodes = -E2BIG;
+		goto out_unlock;
+	}
+
+	user_node = user_format_nodelist.nodes;
+
+	list_for_each(nodelist, &cluster_members_list) {
+		node = list_entry(nodelist, struct cluster_node, list);
+		if (node->state != NODESTATE_MEMBER)
+			continue;
+
+		/* Clear the staging copy first: copy_to_usernode() only
+		 * strcpy()s the name, so without this the bytes after the
+		 * terminator (and any padding) would hand uninitialised
+		 * kernel stack to userspace. */
+		memset(&user_format_node, 0, sizeof (user_format_node));
+		copy_to_usernode(node, &user_format_node);
+
+		if (copy_to_user(user_node, &user_format_node,
+				 sizeof (struct cl_cluster_node))) {
+			num_nodes = -EFAULT;
+			goto out_unlock;
+		}
+		user_node++;
+		num_nodes++;
+	}
+
+      out_unlock:
+	up(&cluster_members_lock);
+
+	return num_nodes;
+}
+
+/*
+ * SIOCCLUSTER_GETALLMEMBERS: return the full membership list, whatever the
+ * state of each node.
+ *
+ * @arg: 0 to just count the nodes, or a user pointer to a
+ *       struct cl_cluster_nodelist whose max_members gives the capacity of
+ *       the user array at ->nodes.
+ *
+ * Returns the number of nodes on cluster_members_list (each of which has
+ * been copied out when @arg is non-zero).
+ *
+ * Errors: -ENOENT if we are not a cluster member; -EFAULT on a bad user
+ * pointer or when the user array is too small for the list (error code kept
+ * as it was for existing callers).
+ *
+ * The capacity is checked BEFORE each entry is written. Checking after the
+ * copy, as was done previously, wrote one entry beyond the end of the
+ * user's array before reporting the error.
+ */
+static int do_ioctl_get_all_members(unsigned long arg)
+{
+	struct cluster_node *node;
+	/* Kernel copies */
+	struct cl_cluster_node user_format_node;
+	struct cl_cluster_nodelist user_format_nodelist;
+	/* User space array ptr; only meaningful when arg is non-zero */
+	struct cl_cluster_node *user_node = NULL;
+	struct list_head *nodelist;
+	int num_nodes = 0;
+
+	if (!we_are_a_cluster_member)
+		return -ENOENT;
+
+	if (arg) {
+		if (copy_from_user(&user_format_nodelist,
+				   (void __user *)arg,
+				   sizeof(struct cl_cluster_nodelist)))
+			return -EFAULT;
+
+		/* Only read the list header once it has been filled in */
+		user_node = user_format_nodelist.nodes;
+	}
+
+	down(&cluster_members_lock);
+
+	list_for_each(nodelist, &cluster_members_list) {
+		node = list_entry(nodelist, struct cluster_node, list);
+		if (arg) {
+			/* No room left for this entry */
+			if (user_format_nodelist.max_members <= 0) {
+				num_nodes = -EFAULT;
+				break;
+			}
+			user_format_nodelist.max_members--;
+
+			/* Clear the staging copy so that bytes after the
+			 * name's terminator (and any padding) do not carry
+			 * kernel stack contents to userspace */
+			memset(&user_format_node, 0,
+			       sizeof (user_format_node));
+			copy_to_usernode(node, &user_format_node);
+
+			if (copy_to_user(user_node, &user_format_node,
+					 sizeof (struct cl_cluster_node))) {
+				num_nodes = -EFAULT;
+				break;
+			}
+			user_node++;
+		}
+		num_nodes++;
+	}
+
+	up(&cluster_members_lock);
+
+	return num_nodes;
+}
+
+
+/*
+ * SIOCCLUSTER_GETCLUSTER: copy the cluster number (cluster_id) and the
+ * cluster name, including its terminating NUL, into the user's
+ * struct cl_cluster_info at @arg.
+ *
+ * Returns 0, -ENOENT if we are not a cluster member, or -EFAULT if either
+ * copy fails.
+ */
+static int do_ioctl_get_cluster(unsigned long arg)
+{
+	struct cl_cluster_info __user *info = (struct cl_cluster_info *)arg;
+	int failed;
+
+	if (!we_are_a_cluster_member)
+		return -ENOENT;
+
+	failed = copy_to_user(&info->number, &cluster_id, sizeof(cluster_id)) ||
+		 copy_to_user(&info->name, cluster_name, strlen(cluster_name)+1);
+
+	return failed ? -EFAULT : 0;
+}
+
+/*
+ * SIOCCLUSTER_GETNODE: look up one node and return its details.
+ *
+ * @arg: user pointer to a struct cl_cluster_node used for both input and
+ *       output. If its name is non-empty the node is looked up by name;
+ *       otherwise by node_id, where a node_id of 0 means this node.
+ *
+ * Returns 0 with the structure filled in, -ENOENT if we are not a cluster
+ * member or the node is not found, -EFAULT on a bad user pointer.
+ */
+static int do_ioctl_get_node(unsigned long arg)
+{
+	struct cluster_node *node;
+	struct cl_cluster_node k_node, *u_node;
+
+	if (!we_are_a_cluster_member)
+		return -ENOENT;
+
+	u_node = (struct cl_cluster_node *) arg;
+
+	if (copy_from_user(&k_node, u_node, sizeof(struct cl_cluster_node)))
+		return -EFAULT;
+
+	/* The name comes straight from userspace and need not be
+	 * terminated; force a terminator so the string lookup below cannot
+	 * read beyond the end of the buffer. */
+	k_node.name[sizeof (k_node.name) - 1] = '\0';
+
+	if (!k_node.name[0]) {
+		/* No name given: look up by id, 0 meaning ourselves */
+		if (k_node.node_id == 0)
+			k_node.node_id = us->node_id;
+		node = find_node_by_nodeid(k_node.node_id);
+	}
+	else
+		node = find_node_by_name(k_node.name);
+
+	if (!node)
+		return -ENOENT;
+
+	copy_to_usernode(node, &k_node);
+
+	if (copy_to_user(u_node, &k_node, sizeof(struct cl_cluster_node)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * SIOCCLUSTER_SETEXPECTED_VOTES: lower the cluster's expected votes to @arg.
+ *
+ * The quorum that would result is computed first and the request is
+ * rejected unless it lies between half the total votes and the total votes.
+ * Member nodes whose expected_votes exceed @arg are then reduced to @arg,
+ * quorum is recalculated and the change is announced.
+ *
+ * Returns 0, -EPERM without CAP_CLUSTER, -EINVAL for @arg == 0 or an
+ * unacceptable quorum, -ENOENT if we are not a cluster member.
+ */
+static int do_ioctl_set_expected(unsigned long arg)
+{
+	struct list_head *pos;
+	unsigned int total_votes;
+	unsigned int newquorum;
+
+	if (!capable(CAP_CLUSTER))
+		return -EPERM;
+
+	if (arg == 0)
+		return -EINVAL;
+
+	if (!we_are_a_cluster_member)
+		return -ENOENT;
+
+	newquorum = calculate_quorum(1, arg, &total_votes);
+	if (newquorum < total_votes / 2)
+		return -EINVAL;
+	if (newquorum > total_votes)
+		return -EINVAL;
+
+	/* Now do it */
+	down(&cluster_members_lock);
+	list_for_each(pos, &cluster_members_list) {
+		struct cluster_node *node =
+			list_entry(pos, struct cluster_node, list);
+
+		if (node->state != NODESTATE_MEMBER)
+			continue;
+		if (node->expected_votes > arg)
+			node->expected_votes = arg;
+	}
+	up(&cluster_members_lock);
+
+	recalculate_quorum(1);
+
+	send_reconfigure(RECONFIG_PARAM_EXPECTED_VOTES, arg);
+	sm_member_update(cluster_is_quorate);
+
+	return 0;
+}
+
+/*
+ * SIOCCLUSTER_KILLNODE: forcibly remove the node whose id is @arg.
+ *
+ * Returns 0, -EPERM without CAP_CLUSTER, -ENOTCONN if cnxman is not
+ * running, -ENOENT if we are not a cluster member, and -EINVAL if the node
+ * is unknown, is this node, or is not in NODESTATE_MEMBER.
+ */
+static int do_ioctl_kill_node(unsigned long arg)
+{
+	struct cluster_node *victim;
+
+	if (!capable(CAP_CLUSTER))
+		return -EPERM;
+
+	if (!atomic_read(&cnxman_running))
+		return -ENOTCONN;
+
+	if (!we_are_a_cluster_member)
+		return -ENOENT;
+
+	/* Must exist, must not be us (can't kill us), must be a member */
+	victim = find_node_by_nodeid(arg);
+	if (victim == NULL || victim->us ||
+	    victim->state != NODESTATE_MEMBER)
+		return -EINVAL;
+
+	/* Just in case it is alive, send a KILL message */
+	send_kill(arg, 1);
+
+	/* Then treat it locally as having died, recording why */
+	victim->leave_reason = CLUSTER_LEAVEFLAG_KILLED;
+	a_node_just_died(victim);
+
+	return 0;
+}
+
+/*
+ * SIOCCLUSTER_BARRIER: ioctl front end to the barrier functions.
+ *
+ * @arg: user pointer to a struct cl_barrier_info; its cmd field selects
+ *       which kcl_barrier_*() function is called with the remaining fields.
+ *
+ * Returns whatever the selected barrier function returns, or -EPERM without
+ * CAP_CLUSTER, -ENOTCONN if cnxman is not running, -ENOENT if we are not a
+ * cluster member, -EFAULT on a bad pointer, -EINVAL for an unknown cmd.
+ */
+static int do_ioctl_barrier(unsigned long arg)
+{
+	struct cl_barrier_info info;
+	int ret;
+
+	if (!capable(CAP_CLUSTER))
+		return -EPERM;
+
+	if (!atomic_read(&cnxman_running))
+		return -ENOTCONN;
+
+	if (!we_are_a_cluster_member)
+		return -ENOENT;
+
+	if (copy_from_user(&info, (void *)arg, sizeof(info))  != 0)
+		return -EFAULT;
+
+	switch (info.cmd) {
+	case BARRIER_IOCTL_REGISTER:
+		ret = kcl_barrier_register(info.name, info.flags, info.arg);
+		break;
+	case BARRIER_IOCTL_CHANGE:
+		ret = kcl_barrier_setattr(info.name, info.flags, info.arg);
+		break;
+	case BARRIER_IOCTL_WAIT:
+		ret = kcl_barrier_wait(info.name);
+		break;
+	case BARRIER_IOCTL_DELETE:
+		ret = kcl_barrier_delete(info.name);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * SIOCCLUSTER_ISLISTENING: find out whether a node has a listener on a
+ * given port.
+ *
+ * @arg: user pointer to a struct cl_listen_request (nodeid, port). A nodeid
+ *       of 0 means this node.
+ *
+ * For the local node the answer comes straight from port_array. For a
+ * remote node a listen request is sent and the caller sleeps until the
+ * response arrives (process_cnxman_message() fills in the result and
+ * clears ->waiting) or a signal is pending.
+ *
+ * Returns 1/0 for the local node, the remote node's answer otherwise, or
+ * -EINVAL (NULL @arg), -ENOENT (not a member / unknown node), -EFAULT,
+ * -ENOTCONN (node not a member, or cnxman stopped), -ENOMEM, -EINTR
+ * (signal while waiting for a transition to finish), -ERESTARTSYS (signal
+ * while waiting for the response).
+ */
+static int do_ioctl_islistening(unsigned long arg)
+{
+	DECLARE_WAITQUEUE(wq, current);
+	struct cl_listen_request rq;
+	struct cluster_node *rem_node;
+	int nodeid;
+	int result;
+	struct cl_waiting_listen_request *listen_request;
+
+	if (!arg)
+		return -EINVAL;
+
+	if (!we_are_a_cluster_member)
+		return -ENOENT;
+
+	if (copy_from_user(&rq, (void *) arg, sizeof (rq)) != 0)
+		return -EFAULT;
+
+	/* Node id 0 is shorthand for ourselves */
+	nodeid = rq.nodeid;
+	if (!nodeid)
+		nodeid = us->node_id;
+
+	rem_node = find_node_by_nodeid(nodeid);
+
+	/* Node not in the cluster */
+	if (!rem_node)
+		return -ENOENT;
+
+	if (rem_node->state != NODESTATE_MEMBER)
+		return -ENOTCONN;
+
+	/* If the request is for us then just look in the ports
+	 * array.
+	 * NOTE(review): port_array is read here without port_array_lock,
+	 * and rq.port comes from userspace - confirm its type cannot index
+	 * past the array. */
+	if (rem_node->us)
+		return (port_array[rq.port] != 0) ? 1 : 0;
+
+	/* For a remote node we need to send a request out */
+
+	/* If we are in transition then wait until we are not */
+	while (in_transition()) {
+		set_task_state(current, TASK_INTERRUPTIBLE);
+		add_wait_queue(&socket_waitq, &wq);
+
+		/* Re-test after queueing so a wake-up is not missed */
+		if (in_transition())
+			schedule();
+
+		set_task_state(current, TASK_RUNNING);
+		remove_wait_queue(&socket_waitq, &wq);
+
+		if (signal_pending(current))
+			return -EINTR;
+	}
+
+	/* Were we shut down before it completed ? */
+	if (!atomic_read(&cnxman_running))
+		return -ENOTCONN;
+
+	listen_request =
+		kmalloc(sizeof (struct cl_waiting_listen_request),
+			GFP_KERNEL);
+	if (!listen_request)
+		return -ENOMEM;
+
+	/* Build the request. The tag used to match the response is the
+	 * caller's pid. */
+	listen_request->waiting = 1;
+	listen_request->result = 0;
+	listen_request->tag = current->pid;
+	listen_request->nodeid = nodeid;
+	init_waitqueue_head(&listen_request->waitq);
+
+	/* Queue it before sending so the response can always find it */
+	down(&listenreq_lock);
+	list_add(&listen_request->list, &listenreq_list);
+	up(&listenreq_lock);
+
+	/* Now wait for the response to come back. rq.nodeid is non-zero
+	 * here: a zero id resolves to ourselves and returned above. */
+	send_listen_request(rq.nodeid, rq.port);
+
+	while (listen_request->waiting) {
+		set_task_state(current, TASK_INTERRUPTIBLE);
+		add_wait_queue(&listen_request->waitq, &wq);
+
+		if (listen_request->waiting)
+			schedule();
+
+		set_task_state(current, TASK_RUNNING);
+		remove_wait_queue(&listen_request->waitq, &wq);
+
+		if (signal_pending(current)) {
+			result = -ERESTARTSYS;
+			goto end_listen;
+		}
+	}
+	result = listen_request->result;
+
+ end_listen:
+	/* Unlink and free under the lock so the response handler cannot be
+	 * using the entry at the same time */
+	down(&listenreq_lock);
+	list_del(&listen_request->list);
+	kfree(listen_request);
+	up(&listenreq_lock);
+	return result;
+}
+
+/*
+ * SIOCCLUSTER_SET_VOTES: change the number of votes this node has to @arg.
+ *
+ * The new value is installed provisionally so that the resulting quorum can
+ * be calculated; if that quorum is not between half the total votes and the
+ * total votes, the old value is restored.
+ *
+ * Returns 0, -EPERM without CAP_CLUSTER, -ENOENT if we are not a cluster
+ * member, -EINVAL if the resulting quorum is unacceptable.
+ */
+static int do_ioctl_set_votes(unsigned long arg)
+{
+	unsigned int total_votes;
+	unsigned int newquorum;
+	int saved_votes;
+
+	if (!capable(CAP_CLUSTER))
+		return -EPERM;
+
+	if (!we_are_a_cluster_member)
+		return -ENOENT;
+
+	/* Check votes is valid: try the new value, keeping the old one
+	 * in case we have to back out */
+	saved_votes = us->votes;
+	us->votes = arg;
+
+	newquorum = calculate_quorum(1, 0, &total_votes);
+
+	if (newquorum >= total_votes / 2 && newquorum <= total_votes) {
+		recalculate_quorum(1);
+		send_reconfigure(RECONFIG_PARAM_NODE_VOTES, arg);
+		return 0;
+	}
+
+	us->votes = saved_votes;
+	return -EINVAL;
+}
+
+/*
+ * SIOCCLUSTER_PASS_SOCKET: take over a socket opened by userland for use as
+ * a cluster comms socket.
+ *
+ * @arg: user pointer to a struct cl_passed_sock (fd, interface number and
+ *       multicast flag).
+ *
+ * On success the reference taken on the file by fget() is kept; the file
+ * pointer is stored by add_clsock(). On any failure it is dropped again.
+ *
+ * Returns 0, -EPERM without CAP_CLUSTER, -EALREADY if cnxman is already
+ * running, -EFAULT on a bad pointer, -EBADF if the fd is not open,
+ * -ENOTSOCK if the fd does not refer to a socket, or the error from
+ * add_clsock().
+ */
+static int do_ioctl_pass_socket(unsigned long arg)
+{
+	struct cl_passed_sock sock_info;
+	struct inode *inode;
+	struct file *file;
+	int error;
+
+	if (!capable(CAP_CLUSTER))
+		return -EPERM;
+
+	if (atomic_read(&cnxman_running))
+		return -EALREADY;
+
+	if (copy_from_user(&sock_info, (void *)arg, sizeof(sock_info)))
+		return -EFAULT;
+
+	file = fget(sock_info.fd);
+	if (!file)
+		return -EBADF;
+
+	/* SOCKET_I() only does pointer arithmetic on the inode, so it must
+	 * not be applied to the inode of a regular file, pipe, etc.
+	 * Refuse anything that is not a socket. */
+	inode = file->f_dentry->d_inode;
+	if (!S_ISSOCK(inode->i_mode)) {
+		fput(file);
+		return -ENOTSOCK;
+	}
+
+	error =	add_clsock(sock_info.multicast,
+			   sock_info.number, SOCKET_I(inode),
+			   file);
+	if (error)
+		fput(file);
+
+	return error;
+}
+
+/*
+ * SIOCCLUSTER_SET_NODENAME: copy up to MAX_CLUSTER_MEMBER_NAME_LEN bytes of
+ * the user string at @arg into nodename. Only allowed before cnxman is
+ * started.
+ *
+ * Returns 0, -EPERM without CAP_CLUSTER, -EALREADY if cnxman is running,
+ * -EFAULT if the string cannot be read.
+ */
+static int do_ioctl_set_nodename(unsigned long arg)
+{
+	long copied;
+
+	if (!capable(CAP_CLUSTER))
+		return -EPERM;
+
+	if (atomic_read(&cnxman_running))
+		return -EALREADY;
+
+	copied = strncpy_from_user(nodename, (void *)arg,
+				   MAX_CLUSTER_MEMBER_NAME_LEN);
+	if (copied < 0)
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * SIOCCLUSTER_SET_NODEID: record the node id (@arg, 0..4096) that this node
+ * wants to use. Only allowed before cnxman is started.
+ *
+ * Returns 0, -EPERM without CAP_CLUSTER, -EALREADY if cnxman is running,
+ * -EINVAL if the id is out of range.
+ */
+static int do_ioctl_set_nodeid(unsigned long arg)
+{
+	int nodeid = (int)arg;
+
+	if (!capable(CAP_CLUSTER))
+		return -EPERM;
+
+	if (atomic_read(&cnxman_running))
+		return -EALREADY;
+
+	if (nodeid < 0)
+		return -EINVAL;
+	if (nodeid > 4096)
+		return -EINVAL;
+
+	wanted_nodeid = nodeid;
+	return 0;
+}
+
+/*
+ * SIOCCLUSTER_JOIN_CLUSTER: start cluster operation on this node.
+ *
+ * @arg: user pointer to a struct cl_join_cluster_info (cluster name, votes,
+ *       expected votes, two_node flag, config version).
+ *
+ * Records the parameters, starts the cluster kernel thread, marks cnxman as
+ * running, starts membership services and finally calls sm_start().
+ *
+ * Returns 0, -EPERM without CAP_CLUSTER, -EALREADY if cnxman is already
+ * running, -EFAULT on a bad pointer, -EINVAL if the cluster name is too
+ * long, -ENOTCONN if no comms sockets have been passed in, -ENOMEM, or the
+ * error from starting either thread.
+ */
+static int do_ioctl_join_cluster(unsigned long arg)
+{
+	struct cl_join_cluster_info join_info;
+	pid_t membership_pid;
+
+	if (!capable(CAP_CLUSTER))
+		return -EPERM;
+
+	if (atomic_read(&cnxman_running))
+		return -EALREADY;
+
+	if (copy_from_user(&join_info, (void *)arg, sizeof (struct cl_join_cluster_info) ))
+		return -EFAULT;
+
+	/* NOTE(review): cluster_name comes from userspace and is not
+	 * guaranteed to be terminated before strlen() walks it, and the
+	 * strncpy() below does not terminate a name of exactly
+	 * MAX_CLUSTER_NAME_LEN characters - confirm the buffer sizes. */
+	if (strlen(join_info.cluster_name) > MAX_CLUSTER_NAME_LEN)
+		return -EINVAL;
+
+	/* At least one socket must have been handed over beforehand */
+	if (list_empty(&socket_list))
+		return -ENOTCONN;
+
+	set_votes(join_info.votes, join_info.expected_votes);
+	cluster_id = generate_cluster_id(join_info.cluster_name);
+	strncpy(cluster_name, join_info.cluster_name, MAX_CLUSTER_NAME_LEN);
+	two_node = join_info.two_node;
+	config_version = join_info.config_version;
+
+	/* Reset run-time state left over from any earlier run */
+	quit_threads = 0;
+	acks_expected = 0;
+	init_completion(&cluster_thread_comp);
+	init_completion(&member_thread_comp);
+	if (allocate_nodeid_array())
+		return -ENOMEM;
+
+	kcluster_pid = kernel_thread(cluster_kthread, NULL, 0);
+	if (kcluster_pid < 0)
+		return kcluster_pid;
+
+	/* Wait for the thread to signal cluster_thread_comp, then re-arm
+	 * the completion for its next use */
+	wait_for_completion(&cluster_thread_comp);
+	init_completion(&cluster_thread_comp);
+
+	atomic_set(&cnxman_running, 1);
+
+	/* Make sure we have a node name */
+	if (nodename[0] == '\0')
+		strcpy(nodename, system_utsname.nodename);
+
+	membership_pid = start_membership_services(kcluster_pid);
+	if (membership_pid < 0) {
+		/* Ask the cluster thread to quit and wait for it to signal
+		 * cluster_thread_comp.
+		 * NOTE(review): cnxman_running is left set on this path -
+		 * presumably the exiting thread clears it; confirm. */
+		quit_threads = 1;
+		wait_for_completion(&cluster_thread_comp);
+		init_completion(&member_thread_comp);
+		return membership_pid;
+	}
+
+	sm_start();
+	return 0;
+}
+
+/*
+ * Leave the cluster.
+ *
+ * @leave_flags: reason/flags recorded as this node's leave_reason. With
+ *               CLUSTER_LEAVEFLAG_FORCE set, the membership, transition and
+ *               use-count checks are skipped.
+ *
+ * Sets quit_threads, wakes cnxman_waitq and waits for
+ * cluster_thread_comp to be signalled before resetting use_count.
+ *
+ * Returns 0, -EPERM without CAP_CLUSTER, -ENOTCONN if cnxman is not running
+ * or (unforced) use_count is non-zero, -ENOENT if (unforced) we are not a
+ * member, -EBUSY if (unforced) a transition is in progress.
+ */
+static int do_ioctl_leave_cluster(unsigned long leave_flags)
+{
+	unsigned long forced = leave_flags & CLUSTER_LEAVEFLAG_FORCE;
+
+	if (!capable(CAP_CLUSTER))
+		return -EPERM;
+
+	if (!atomic_read(&cnxman_running))
+		return -ENOTCONN;
+
+	/* FORCE overrides several checks */
+	if (!forced && !we_are_a_cluster_member)
+		return -ENOENT;
+	if (!forced && in_transition())
+		return -EBUSY;
+	if (!forced && atomic_read(&use_count))
+		return -ENOTCONN;
+
+	us->leave_reason = leave_flags;
+
+	/* Tell the threads to stop, then wait for the completion */
+	quit_threads = 1;
+	wake_up_interruptible(&cnxman_waitq);
+	wait_for_completion(&cluster_thread_comp);
+
+	atomic_set(&use_count, 0);
+	return 0;
+}
+
+/*
+ * ioctl entry point for AF_CLUSTER sockets.
+ *
+ * Most commands are handed straight to the matching do_ioctl_*() or kcl_*()
+ * helper.  PASS_SOCKET, SET_NODENAME, SET_NODEID and JOIN_CLUSTER are only
+ * accepted on a CLPROTO_MASTER socket; on any other socket they fail with
+ * -EOPNOTSUPP.  Commands not listed here are forwarded to sm_ioctl() as long
+ * as cnxman is running.
+ *
+ * Returns a negative errno on failure.  On success the value is whatever the
+ * command's helper returned; ISQUORATE, ISACTIVE and GET_JOINCOUNT return the
+ * requested value directly.
+ */
+static int cl_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	int err = -EOPNOTSUPP;
+	struct list_head *proclist;
+	struct list_head *tmp;
+	struct notify_struct *notify;
+	struct cl_version cnxman_version;
+	struct cl_quorumdevice_info qd_info;
+
+	switch (cmd) {
+		/* Process requests notification of cluster events */
+	case SIOCCLUSTER_NOTIFY:
+		notify = kmalloc(sizeof (struct notify_struct), GFP_KERNEL);
+		if (!notify)
+			return -ENOMEM;
+		notify->pid = current->pid;
+		notify->signal = arg;
+		down(&event_listener_lock);
+		list_add(&notify->list, &event_listener_list);
+		up(&event_listener_lock);
+		err = 0;
+		break;
+
+		/* Process is no longer interested in cluster events */
+	case SIOCCLUSTER_REMOVENOTIFY:
+		/* Stays -EINVAL unless at least one entry for this pid is
+		 * found.  Must be negative: a positive value would be
+		 * handed back to the caller as a successful return. */
+		err = -EINVAL;
+
+		down(&event_listener_lock);
+		list_for_each_safe(proclist, tmp, &event_listener_list) {
+			notify =
+			    list_entry(proclist, struct notify_struct, list);
+			if (notify->pid == current->pid) {
+				list_del(&notify->list);
+				kfree(notify);
+				err = 0;
+			}
+		}
+		up(&event_listener_lock);
+		break;
+
+		/* Return the cnxman version number */
+	case SIOCCLUSTER_GET_VERSION:
+		if (!arg)
+			return -EINVAL;
+		err = 0;
+		cnxman_version.major = CNXMAN_MAJOR_VERSION;
+		cnxman_version.minor = CNXMAN_MINOR_VERSION;
+		cnxman_version.patch = CNXMAN_PATCH_VERSION;
+		cnxman_version.config = config_version;
+		if (copy_to_user((void *) arg, &cnxman_version,
+				 sizeof (struct cl_version))) {
+			return -EFAULT;
+		}
+		break;
+
+		/* Set the cnxman config version number */
+	case SIOCCLUSTER_SET_VERSION:
+		err = do_ioctl_set_version(arg);
+		break;
+
+		/* Return the active membership list */
+	case SIOCCLUSTER_GETMEMBERS:
+		err = do_ioctl_get_members(arg);
+		break;
+
+		/* Return the full membership list include dead nodes */
+	case SIOCCLUSTER_GETALLMEMBERS:
+		err = do_ioctl_get_all_members(arg);
+		break;
+
+	case SIOCCLUSTER_GETNODE:
+		err = do_ioctl_get_node(arg);
+		break;
+
+	case SIOCCLUSTER_GETCLUSTER:
+		err = do_ioctl_get_cluster(arg);
+		break;
+
+	case SIOCCLUSTER_ISQUORATE:
+		return cluster_is_quorate;
+
+	case SIOCCLUSTER_ISACTIVE:
+		return atomic_read(&cnxman_running);
+
+	case SIOCCLUSTER_SETEXPECTED_VOTES:
+		err = do_ioctl_set_expected(arg);
+		break;
+
+		/* Change the number of votes for this node */
+	case SIOCCLUSTER_SET_VOTES:
+		err = do_ioctl_set_votes(arg);
+		break;
+
+		/* Return 1 if the specified node is listening on a given port */
+	case SIOCCLUSTER_ISLISTENING:
+		err = do_ioctl_islistening(arg);
+		break;
+
+		/* Forcibly kill a node */
+	case SIOCCLUSTER_KILLNODE:
+		err = do_ioctl_kill_node(arg);
+		break;
+
+	case SIOCCLUSTER_GET_JOINCOUNT:
+		if (!capable(CAP_CLUSTER))
+			return -EPERM;
+		else
+			return atomic_read(&use_count);
+
+		/* ioctl interface to the barrier system */
+	case SIOCCLUSTER_BARRIER:
+		err = do_ioctl_barrier(arg);
+		break;
+
+	case SIOCCLUSTER_PASS_SOCKET:
+		if (sock->sk->sk_protocol != CLPROTO_MASTER)
+			err = -EOPNOTSUPP;
+		else
+			err = do_ioctl_pass_socket(arg);
+		break;
+
+	case SIOCCLUSTER_SET_NODENAME:
+		if (sock->sk->sk_protocol != CLPROTO_MASTER)
+			err = -EOPNOTSUPP;
+		else
+			err = do_ioctl_set_nodename(arg);
+		break;
+
+	case SIOCCLUSTER_SET_NODEID:
+		if (sock->sk->sk_protocol != CLPROTO_MASTER)
+			err = -EOPNOTSUPP;
+		else
+			err = do_ioctl_set_nodeid(arg);
+		break;
+
+	case SIOCCLUSTER_JOIN_CLUSTER:
+		if (sock->sk->sk_protocol != CLPROTO_MASTER)
+			err = -EOPNOTSUPP;
+		else
+			err = do_ioctl_join_cluster(arg);
+		break;
+
+	case SIOCCLUSTER_LEAVE_CLUSTER:
+		err = do_ioctl_leave_cluster(arg);
+		break;
+
+	case SIOCCLUSTER_QD_REGISTER:
+		if (copy_from_user(&qd_info, (void *)arg, sizeof(qd_info)))
+			return -EFAULT;
+
+		err = kcl_register_quorum_device(qd_info.name, qd_info.votes);
+		break;
+
+	case SIOCCLUSTER_QD_POLL:
+		err = kcl_quorum_device_available((int)arg);
+		break;
+
+	case SIOCCLUSTER_QD_UNREGISTER:
+		err = kcl_unregister_quorum_device();
+		break;
+
+	default:
+		/* Everything else belongs to the service manager, which is
+		 * only consulted while cnxman is running */
+		if (!atomic_read(&cnxman_running))
+			return -ENOTCONN;
+
+		err = sm_ioctl(sock, cmd, arg);
+	}
+	return err;
+}
+
+/*
+ * shutdown() handler.  Only a full shutdown (how == SHUTDOWN_MASK) is
+ * accepted.
+ *
+ * Returns -ENOTCONN for an unconnected socket, 0 if the socket is already
+ * disconnecting or the shutdown was recorded, and -EINVAL for any other
+ * value of "how".
+ */
+static int cl_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	int rc;
+
+	lock_sock(sk);
+
+	if (sock->state == SS_UNCONNECTED) {
+		rc = -ENOTCONN;
+	}
+	else if (sock->state == SS_DISCONNECTING) {
+		/* Nothing left to do */
+		rc = 0;
+	}
+	else if (how != SHUTDOWN_MASK) {
+		rc = -EINVAL;
+	}
+	else {
+		sk->sk_shutdown = how;
+		rc = 0;
+	}
+
+	release_sock(sk);
+
+	return rc;
+}
+
+
+/* We'll be giving out reward points next... */
+/* Transmit a packet on one comms socket, leaving the caller's kvec array
+ * exactly as it was passed in.  Should be protected by the send semaphore.
+ *
+ * When ACKs are outstanding and this message needs one, a send that moved no
+ * data starts the short (retransmit) timer; a send that moved data sets
+ * resend_delay to 1.  Returns the kernel_sendmsg() result. */
+static int __send_and_save(struct cl_comms_socket *csock, struct msghdr *msg,
+			   struct kvec *vec, int veclen,
+			   int size, int needack)
+{
+	struct kvec backup[veclen];
+	const size_t vec_bytes = sizeof (struct kvec) * veclen;
+	int sent;
+
+	/* sendmsg mucks around with the IO vectors and we might want to send
+	 * the same stuff out more than once (for different interfaces), so
+	 * keep a pristine copy */
+	memcpy(backup, vec, vec_bytes);
+
+	sent = kernel_sendmsg(csock->sock, msg, vec, veclen, size);
+
+	if (sent < 0) {
+		printk(KERN_ERR CMAN_NAME ": sendmsg failed: %d\n", sent);
+	}
+	else if (acks_expected && needack) {
+		if (sent == 0) {
+			/* It didn't go: start the retransmit timer */
+			start_short_timer();
+		}
+		else {
+			resend_delay = 1;
+		}
+	}
+
+	/* Put the IOVs back the way the caller had them */
+	memcpy(vec, backup, vec_bytes);
+
+	return sent;
+}
+
+/* Retransmit the message held in saved_msg_buffer.  The previous interface
+ * is assumed to be at fault, so the send goes out on the next one. */
+static void resend_last_message()
+{
+	struct msghdr msg;
+	struct kvec vec;
+	int sent;
+
+	P_COMMS("%ld resending last message: %d bytes: port=%d, cmd=%d\n",
+		jiffies, saved_msg_len, saved_msg_buffer[0],
+		saved_msg_buffer[6]);
+
+	/* Move on to the next interface */
+	current_interface = get_next_interface(current_interface);
+	if (num_interfaces > 1)
+		printk(KERN_WARNING CMAN_NAME ": Now using interface %d\n",
+		       current_interface->number);
+
+	memset(&msg, 0, sizeof (msg));
+	msg.msg_name = &current_interface->peer->saddr;
+	msg.msg_namelen = current_interface->peer->addr_len;
+
+	vec.iov_base = saved_msg_buffer;
+	vec.iov_len = saved_msg_len;
+
+	sent = kernel_sendmsg(current_interface->sock, &msg, &vec, 1, saved_msg_len);
+
+	if (sent < 0) {
+		printk(KERN_ERR CMAN_NAME ": resend failed: %d\n", sent);
+	}
+	else if (sent == 0) {
+		/* Try indefinitely to send this, the backlog must die down
+		 * eventually !? */
+		start_short_timer();
+	}
+	else {
+		/* Send succeeded, continue waiting for ACKS */
+		start_ack_timer();
+	}
+}
+
+/*
+ * recvmsg() handler for AF_CLUSTER sockets.
+ *
+ * If cnxman is no longer running, every queued skb is discarded and 0 (EOF)
+ * is returned.  Otherwise the next datagram is copied to the caller.  OOB
+ * messages are thrown away unless the caller passed MSG_OOB, and a delivered
+ * OOB message is flagged with MSG_OOB in msg->msg_flags.  Data that does not
+ * fit in "size" bytes is truncated and MSG_TRUNC is set.
+ *
+ * Returns the number of bytes copied, 0 after shutdown, or a negative errno.
+ */
+static int cl_recvmsg(struct kiocb *iocb, struct socket *sock,
+		      struct msghdr *msg, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_cl *sin = (struct sockaddr_cl *) msg->msg_name;
+	struct sk_buff *skb;
+	struct cb_info *cbinfo;
+	int copied, err = 0;
+	int isoob = 0;
+
+	/* Socket was notified of shutdown, remove any pending skbs and return
+	 * EOF.  MSG_PEEK is masked out: a peek leaves the skb on the queue,
+	 * so the drain loop would fetch the same skb forever. */
+	if (!atomic_read(&cnxman_running)) {
+		while ((skb = skb_recv_datagram(sk, flags & ~MSG_PEEK,
+						MSG_DONTWAIT, &err)))
+			skb_free_datagram(sk, skb);
+		return 0;	/* cnxman has left the building */
+	}
+
+	/* Generic datagram code does most of the work. If the user is not
+	 * interested in OOB messages then ignore them */
+	do {
+		skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
+		if (!skb)
+			goto out;
+
+		cbinfo = (struct cb_info *)skb->cb;
+		isoob = cbinfo->oob;
+
+		/* If it is OOB and the user doesn't want it, then throw it away. */
+		if (isoob && !(flags & MSG_OOB)) {
+			skb_free_datagram(sk, skb);
+
+			/* If we peeked (?) an OOB but the user doesn't want it
+			   then we need to discard it or we'll loop forever */
+			if (flags & MSG_PEEK) {
+				skb = skb_recv_datagram(sk, flags & ~MSG_PEEK,
+							MSG_DONTWAIT, &err);
+				if (skb)
+					skb_free_datagram(sk, skb);
+			}
+		}
+	}
+	while (isoob && !(flags & MSG_OOB));
+
+	copied = skb->len;
+	if (copied > size) {
+		copied = size;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+	err = memcpy_toiovec(msg->msg_iov, skb->data, copied);
+
+	if (err)
+		goto out_free;
+
+	/* Report where the message came from, if the caller asked */
+	if (msg->msg_name && msg->msg_namelen) {
+		memset(msg->msg_name, 0, msg->msg_namelen);
+
+		if (msg->msg_namelen >= sizeof (struct sockaddr_cl)) {
+
+			/* Nodeid is in native byte order - anything else is just
+			 * perverse */
+			sin->scl_nodeid = cbinfo->orig_nodeid;
+		}
+		msg->msg_namelen = sizeof (struct sockaddr_cl);
+		sin->scl_port = cbinfo->orig_port;
+	}
+
+	if (isoob) {
+		msg->msg_flags |= MSG_OOB;
+	}
+
+	sock_recv_timestamp(msg, sk, skb);
+
+	err = copied;
+
+      out_free:
+	skb_free_datagram(sk, skb);
+
+      out:
+	return err;
+}
+
+/* Send a message out on all interfaces: every entry of socket_list that has
+ * recv_only set is used.  Negative (temporary) node IDs are addressed via
+ * get_addr_from_temp_nodeid(), everything else goes to the socket's peer
+ * address.
+ *
+ * Failures are counted across calls; a success resets the count.  Once the
+ * count reaches num_interfaces * max_retries the cluster threads are told to
+ * quit.  Returns the result of the last send attempted (0 if none). */
+static int send_to_all_ints(int nodeid, struct msghdr *our_msg,
+			    struct kvec *vec, int veclen, int size, int flags)
+{
+	static int errors = 0;
+	struct cl_comms_socket *clsock;
+	struct sockaddr_in6 daddr;
+	int result = 0;
+
+	our_msg->msg_name = &daddr;
+
+	list_for_each_entry(clsock, &socket_list, list) {
+
+		/* Don't send out of a broadcast socket */
+		if (!clsock->recv_only)
+			continue;
+
+		if (nodeid < 0) {
+			/* For temporary node IDs send to the node's real IP
+			 * address */
+			get_addr_from_temp_nodeid(nodeid, (char *)&daddr, &our_msg->msg_namelen);
+		}
+		else {
+			memcpy(&daddr, &clsock->peer->saddr, clsock->peer->addr_len);
+			our_msg->msg_namelen = clsock->peer->addr_len;
+		}
+
+		result = __send_and_save(clsock, our_msg, vec, veclen,
+					 size + sizeof (struct cl_protheader),
+					 !(flags & MSG_NOACK));
+
+		/* A single good send wipes the slate clean */
+		if (result >= 0)
+			errors = 0;
+		else
+			errors++;
+	}
+
+	/* If all the interfaces error then die */
+	if (errors >= num_interfaces * cman_config.max_retries) {
+		printk(KERN_ERR CMAN_NAME ": No functional network interfaces, leaving cluster\n");
+		quit_threads = 1;
+		wake_up_interruptible(&cnxman_waitq);
+	}
+	return result;
+}
+
+
+/* Internal common send message routine.
+ *
+ * Builds a cl_protheader for the message and sends it, either through the
+ * local loopback shortcut, on all interfaces, or on current_interface only.
+ *
+ * sock   - sending client socket, or NULL (source port is then 0: "us")
+ * msg    - carries the optional destination (msg_name / msg_namelen) and
+ *          the MSG_* flags
+ * vec    - payload vectors, veclen of them, "size" bytes in total
+ * port   - target port
+ *
+ * The whole send path is serialised with send_lock.  The caller is made to
+ * wait while a previous ACK-needing message is outstanding, or while the
+ * port is above HIGH_PROTECTED_PORT and the cluster is inquorate or in
+ * transition.
+ *
+ * Returns the send result, or -EINVAL (message too big), -ENOTCONN (not
+ * running, shutting down, or unknown node id), -EAGAIN (would have to wait
+ * but MSG_DONTWAIT was given or we are kclusterd) or -ERESTARTSYS (signal
+ * arrived while waiting). */
+static int __sendmsg(struct socket *sock, struct msghdr *msg,
+		     struct kvec *vec, int veclen, int size,
+		     unsigned char port)
+{
+	int result = 0, i;
+	int flags = msg->msg_flags;
+	struct msghdr our_msg;
+	struct sockaddr_cl *caddr = msg->msg_name;
+	struct cl_protheader header;
+	struct kvec vectors[veclen + 1];
+	unsigned char srcport;
+	int nodeid = 0;
+
+	if (size > MAX_CLUSTER_MESSAGE)
+		return -EINVAL;
+	if (!atomic_read(&cnxman_running))
+		return -ENOTCONN;
+
+	if (caddr)
+		nodeid = caddr->scl_nodeid;
+
+	/* Check that the node id (if present) is valid */
+	if (msg->msg_namelen && (!find_node_by_nodeid(nodeid) &&
+				 !is_valid_temp_nodeid(nodeid))) {
+		return -ENOTCONN;
+	}
+
+	/* If there's no sending client socket then the source
+	   port is 0: "us" */
+	if (sock) {
+		struct cluster_sock *csock = cluster_sk(sock->sk);
+		srcport = csock->port;
+	}
+	else {
+		srcport = 0;
+	}
+
+	/* We can only have one send outstanding at a time so we might as well
+	 * lock the whole send mechanism */
+	down(&send_lock);
+
+	/* Wait until this message is allowed to go: the port must not be
+	 * blocked, and an ACK-needing message needs the previous one's ACKs
+	 * to have been collected first */
+	while ((port > HIGH_PROTECTED_PORT
+		&& (!cluster_is_quorate || in_transition()))
+	       || (acks_expected > 0 && !(msg->msg_flags & MSG_NOACK))) {
+
+		DECLARE_WAITQUEUE(wq, current);
+		struct task_struct *tsk = current;
+
+		if (quit_threads) {
+			up(&send_lock);
+			return -ENOTCONN;
+		}
+
+		if (flags & MSG_DONTWAIT) {
+			up(&send_lock);
+			return -EAGAIN;
+		}
+
+		/* kclusterd must never sleep here; bounce the send back to
+		 * the caller instead */
+		if (current->pid == kcluster_pid) {
+			P_COMMS
+			    ("Tried to make kclusterd wait, port=%d, acks_count=%d, expected=%d\n",
+			     port, ack_count, acks_expected);
+			up(&send_lock);
+			return -EAGAIN;
+		}
+
+		P_COMMS("%s process waiting. acks=%d, expected=%d\n", tsk->comm,
+			ack_count, acks_expected);
+
+		set_task_state(tsk, TASK_INTERRUPTIBLE);
+		add_wait_queue(&socket_waitq, &wq);
+
+		/* Re-test the condition now that we are on the wait queue,
+		 * and drop send_lock for the duration of the sleep */
+		if ((port > HIGH_PROTECTED_PORT
+		     && (!cluster_is_quorate || in_transition()))
+		    || (acks_expected > 0)) {
+
+			up(&send_lock);
+			schedule();
+			down(&send_lock);
+		}
+
+		set_task_state(tsk, TASK_RUNNING);
+		remove_wait_queue(&socket_waitq, &wq);
+
+		/* Going down */
+		if (quit_threads) {
+			up(&send_lock);
+			return -ENOTCONN;
+		}
+
+		if (signal_pending(current)) {
+			up(&send_lock);
+			return -ERESTARTSYS;
+		}
+
+		/* Were we shut down in the meantime ? */
+		if (!atomic_read(&cnxman_running)) {
+			up(&send_lock);
+			return -ENOTCONN;
+		}
+
+	}
+
+	memset(&our_msg, 0, sizeof (our_msg));
+
+	/* Build the header */
+	header.tgtport = port;
+	header.srcport = srcport;
+	header.flags = msg->msg_flags; /* byte-swapped later */
+	header.cluster = cpu_to_le16(cluster_id);
+	header.srcid = us ? cpu_to_le32(us->node_id) : 0;
+	header.tgtid = caddr ? cpu_to_le32(nodeid) : 0;
+
+	/* Sequence number zero is never used */
+	if (++cur_seq == 0)
+		++cur_seq;
+
+	header.seq = cpu_to_le16(cur_seq);
+	header.ack = 0;
+
+	/* Carry the target's last_ackneeded_seq_recv in the ack field of a
+	 * unicast message */
+	if (header.tgtid) {
+		struct cluster_node *remnode;
+
+		remnode = find_node_by_nodeid(nodeid);
+		if (remnode)  {
+			header.ack = cpu_to_le16(remnode->last_ackneeded_seq_recv);
+		}
+	}
+
+	/* Set the MULTICAST flag on messages with no particular destination */
+	if (!msg->msg_namelen) {
+		header.flags |= MSG_MULTICAST;
+		header.tgtid = 0;
+	}
+
+	/* Loopback shortcut */
+	/* NOTE(review): "us" is dereferenced here without the NULL test that
+	 * the header.srcid assignment above uses - confirm it cannot be NULL
+	 * at this point */
+	if (nodeid == us->node_id && nodeid != 0) {
+
+		up(&send_lock);
+		header.flags |= MSG_NOACK; /* Don't ack it! */
+
+		return send_to_user_port(NULL, &header, msg, vec, veclen, size);
+	}
+
+	/* Copy the existing kvecs into our array and add the header on at the
+	 * beginning */
+	vectors[0].iov_base = &header;
+	vectors[0].iov_len = sizeof (header);
+	for (i = 0; i < veclen; i++) {
+		vectors[i + 1] = vec[i];
+	}
+
+
+        /* Work out how many ACKS are wanted - *don't* reset acks_expected to
+	 * zero if no acks are required as an ACK-needed message may still be
+	 * outstanding */
+	if (!(msg->msg_flags & MSG_NOACK)) {
+		if (msg->msg_namelen)
+			acks_expected = 1;	/* Unicast */
+		else
+			acks_expected = max(cluster_members - 1, 0);
+
+	}
+
+	P_COMMS
+	    ("Sending message - tgt=%d port %d required %d acks, seq=%d, flags=%x\n",
+	     nodeid, header.tgtport,
+	     (msg->msg_flags & MSG_NOACK) ? 0 : acks_expected,
+	     le16_to_cpu(header.seq), header.flags);
+
+	/* Don't include temp nodeids in the message itself */
+	/* NOTE(review): tgtid was stored through cpu_to_le32() above, so
+	 * whether this test can ever be true depends on the field's type and
+	 * the host byte order - confirm */
+	if (header.tgtid < 0)
+		header.tgtid = 0;
+
+	header.flags = cpu_to_le32(header.flags);
+
+	/* For non-member sends we use all the interfaces */
+	if ((nodeid < 0) || (flags & MSG_ALLINT)) {
+
+		result = send_to_all_ints(nodeid, &our_msg, vectors, veclen+1,
+					  size, msg->msg_flags);
+	}
+	else {
+		/* Send to only the current socket - resends will use the
+		 * others if necessary */
+		our_msg.msg_name = &current_interface->peer->saddr;
+		our_msg.msg_namelen = current_interface->peer->addr_len;
+
+		result =
+		    __send_and_save(current_interface, &our_msg,
+				    vectors, veclen+1,
+				    size + sizeof (header),
+				    !(msg->msg_flags & MSG_NOACK));
+	}
+
+	/* Make a note in each nodes' structure that it has been sent a message
+	 * so we can see which ones went astray */
+	if (!(flags & MSG_NOACK) && nodeid >= 0) {
+		if (msg->msg_namelen) {
+			struct cluster_node *node;
+
+			node = find_node_by_nodeid(le32_to_cpu(header.tgtid));
+			if (node)
+				node->last_seq_sent = cur_seq;
+		}
+		else {
+			struct cluster_node *node;
+			struct list_head *nodelist;
+
+			list_for_each(nodelist, &cluster_members_list) {
+				node =
+				    list_entry(nodelist, struct cluster_node,
+					       list);
+				if (node->state == NODESTATE_MEMBER) {
+					node->last_seq_sent = cur_seq;
+				}
+			}
+		}
+	}
+
+	/* if the client wants a broadcast message sending back to itself
+	   then loop it back */
+	if (nodeid == 0 && (flags & MSG_BCASTSELF)) {
+		header.flags |= cpu_to_le32(MSG_NOACK); /* Don't ack it! */
+
+		result = send_to_user_port(NULL, &header, msg, vec, veclen, size);
+	}
+
+	/* Save a copy of the message if we're expecting an ACK */
+	if (!(flags & MSG_NOACK) && acks_expected) {
+		struct cl_protheader *savhdr = (struct cl_protheader *) saved_msg_buffer;
+
+		memcpy_fromkvec(saved_msg_buffer, vectors,
+				size + sizeof (header));
+
+		saved_msg_len = size + sizeof (header);
+		retry_count = ack_count = 0;
+		clear_bit(RESEND_NEEDED, &mainloop_flags);
+
+		/* Clear the REPLYEXPected flag so we force a real ACK
+		   if it's necessary to resend this packet */
+		savhdr->flags &= ~MSG_REPLYEXP;
+		start_ack_timer();
+	}
+
+	up(&send_lock);
+	return result;
+}
+
+/*
+ * Copy a message onto messages_list and wake the cnxman_waitq sleeper so it
+ * can be sent later.
+ *
+ * The allocation uses GFP_ATOMIC when called from atomic context or with
+ * interrupts disabled, GFP_KERNEL otherwise.  "caddr" may be NULL, in which
+ * case the queued message carries no destination address.
+ *
+ * Returns 0 on success or -ENOMEM if the queue entry cannot be allocated.
+ *
+ * NOTE(review): "len" is not checked against the capacity of
+ * qmsg->msg_buffer here; callers are presumably expected to have bounded it
+ * already - confirm.
+ */
+static int queue_message(struct socket *sock, void *buf, int len,
+			 struct sockaddr_cl *caddr,
+			 unsigned char port, int flags)
+{
+	struct queued_message *qmsg;
+
+	qmsg = kmalloc(sizeof (struct queued_message),
+		       (in_atomic()
+			|| irqs_disabled())? GFP_ATOMIC : GFP_KERNEL);
+	if (qmsg == NULL)
+		return -ENOMEM;	/* a proper errno, not a bare -1 (-EPERM) */
+
+	memcpy(qmsg->msg_buffer, buf, len);
+	qmsg->msg_len = len;
+	if (caddr) {
+		memcpy(&qmsg->addr, caddr, sizeof (struct sockaddr_cl));
+		qmsg->addr_len = sizeof (struct sockaddr_cl);
+	}
+	else {
+		qmsg->addr_len = 0;
+	}
+	qmsg->flags = flags;
+	qmsg->port = port;
+	qmsg->socket = sock;
+
+	down(&messages_list_lock);
+	list_add_tail(&qmsg->list, &messages_list);
+	up(&messages_list_lock);
+
+	wake_up_interruptible(&cnxman_waitq);
+
+	return 0;
+}
+
+/*
+ * sendmsg() handler for AF_CLUSTER sockets.
+ *
+ * Not available on CLPROTO_MASTER sockets.  The message goes to the socket's
+ * bound port unless a caller with CAP_CLUSTER names a different one in the
+ * destination address.  The user data is copied into a kernel buffer and
+ * handed to __sendmsg().
+ *
+ * Returns the __sendmsg() result, or -EOPNOTSUPP, -EDESTADDRREQ (no port),
+ * -EINVAL (message larger than MAX_CLUSTER_MESSAGE), -ENOMEM or -EFAULT.
+ */
+static int cl_sendmsg(struct kiocb *iocb, struct socket *sock,
+		      struct msghdr *msg, size_t size)
+{
+	struct cluster_sock *c = cluster_sk(sock->sk);
+	char *buffer;
+	int status;
+	uint8_t port;
+	struct kvec vec;
+	struct sockaddr_cl *caddr = msg->msg_name;
+
+	if (sock->sk->sk_protocol == CLPROTO_MASTER)
+		return -EOPNOTSUPP;
+
+	port = c->port;
+
+	/* Only capable users can override the port number */
+	if (caddr && capable(CAP_CLUSTER) && caddr->scl_port)
+		port = caddr->scl_port;
+
+	if (port == 0)
+		return -EDESTADDRREQ;
+
+	/* Reject oversized messages before sizing an allocation from a
+	 * length supplied by userspace; __sendmsg() would refuse them
+	 * anyway, but only after the buffer had been allocated and filled */
+	if (size > MAX_CLUSTER_MESSAGE)
+		return -EINVAL;
+
+	/* Allocate a kernel buffer for the data so we can put it into a kvec */
+	buffer = kmalloc(size, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	if (memcpy_fromiovec(buffer, msg->msg_iov, size)) {
+		status = -EFAULT;
+		goto end_send;
+	}
+
+	vec.iov_len = size;
+	vec.iov_base = buffer;
+
+	status = __sendmsg(sock, msg, &vec, 1, size, port);
+
+ end_send:
+	kfree(buffer);
+
+	return status;
+}
+
+/* Kernel call to sendmsg.
+ *
+ * The message goes to the port in "caddr" when one is given there, otherwise
+ * to the port the socket is bound to.  From interrupt context, or when
+ * MSG_QUEUE is set, the message is queued instead of being sent directly.
+ *
+ * Returns the result of __sendmsg()/queue_message(), or -EINVAL (too big),
+ * -ENOTCONN (cnxman not running) or -EDESTADDRREQ (no port). */
+int kcl_sendmsg(struct socket *sock, void *buf, int size,
+		struct sockaddr_cl *caddr, int addr_len, unsigned int flags)
+{
+	struct cluster_sock *csk = cluster_sk(sock->sk);
+	struct msghdr msg;
+	struct kvec vec;
+	unsigned char port;
+
+	if (size > MAX_CLUSTER_MESSAGE)
+		return -EINVAL;
+	if (!atomic_read(&cnxman_running))
+		return -ENOTCONN;
+
+	/* An explicit port in the address wins over the bound one */
+	port = (caddr && caddr->scl_port) ? caddr->scl_port : csk->port;
+	if (!port)
+		return -EDESTADDRREQ;
+
+	/* If we have no process context then queue it up for kclusterd to
+	 * send. */
+	if (in_interrupt() || (flags & MSG_QUEUE))
+		return queue_message(sock, buf, size, caddr, port,
+				     flags & ~MSG_QUEUE);
+
+	memset(&msg, 0, sizeof (msg));
+	msg.msg_flags = flags;
+	msg.msg_namelen = addr_len;
+	msg.msg_name = caddr;
+
+	vec.iov_base = buf;
+	vec.iov_len = size;
+
+	return __sendmsg(sock, &msg, &vec, 1, size, port);
+}
+
+/* Try to send one entry taken from the queued-message list.  Returns -EAGAIN
+ * while the entry's port is blocked (port above HIGH_PROTECTED_PORT with the
+ * cluster inquorate or in transition), otherwise the __sendmsg() result. */
+static int send_queued_message(struct queued_message *qmsg)
+{
+	struct msghdr msg;
+	struct kvec vec;
+	int blocked;
+
+	/* Don't send blocked messages */
+	blocked = qmsg->port > HIGH_PROTECTED_PORT
+	    && (!cluster_is_quorate || in_transition());
+	if (blocked)
+		return -EAGAIN;
+
+	memset(&msg, 0, sizeof (msg));
+	msg.msg_flags = qmsg->flags;
+	msg.msg_namelen = qmsg->addr_len;
+	if (qmsg->addr_len)
+		msg.msg_name = &qmsg->addr;
+	else
+		msg.msg_name = NULL;
+
+	vec.iov_base = qmsg->msg_buffer;
+	vec.iov_len = qmsg->msg_len;
+
+	return __sendmsg(qmsg->socket, &msg, &vec, 1,
+			 qmsg->msg_len, qmsg->port);
+}
+
+/* Record "routine" as the kernel_callback of the cluster socket behind
+ * "sock".  Always returns 0. */
+int kcl_register_read_callback(struct socket *sock,
+			       int (*routine) (char *, int, char *, int,
+					       unsigned int))
+{
+	cluster_sk(sock->sk)->kernel_callback = routine;
+	return 0;
+}
+
+/* Used where we are in kclusterd context and we can't allow the task to wait
+ * as we are also responsible for processing the ACKs that do the wake up.
+ * The message is sent immediately (non-blocking, port 0) and queued only if
+ * that attempt comes back with -EAGAIN.
+ *
+ * Returns 0 if the message was sent, the queue_message() result if it had to
+ * be queued, or the send error otherwise. */
+static int send_or_queue_message(struct socket *sock, void *buf, int len,
+				 struct sockaddr_cl *caddr,
+				 unsigned int flags)
+{
+	struct msghdr msg;
+	struct kvec vec;
+	int status;
+
+	memset(&msg, 0, sizeof (msg));
+	msg.msg_flags = MSG_DONTWAIT | flags;
+	msg.msg_name = caddr;
+	if (caddr)
+		msg.msg_namelen = sizeof (struct sockaddr_cl);
+	else
+		msg.msg_namelen = 0;
+
+	vec.iov_base = buf;
+	vec.iov_len = len;
+
+	status = __sendmsg(NULL, &msg, &vec, 1, len, 0);
+
+	/* Couldn't go right now: park it for later */
+	if (status == -EAGAIN)
+		return queue_message(sock, buf, len, caddr, 0, flags);
+
+	/* Any other failure is fatal and is passed up; a positive byte count
+	 * is reported as plain success */
+	return (status > 0) ? 0 : status;
+}
+
+/* Send a listen request to a node: a CLUSTER_CMD_LISTENREQ for "port",
+ * tagged with the calling process' pid and sent with MSG_REPLYEXP. */
+static void send_listen_request(int nodeid, unsigned char port)
+{
+	struct sockaddr_cl dest;
+	struct cl_listenmsg req;
+
+	/* Destination: protocol port 0 on the given node */
+	memset(&dest, 0, sizeof (dest));
+	dest.scl_family = AF_CLUSTER;
+	dest.scl_nodeid = nodeid;
+	dest.scl_port = 0;
+
+	/* Message body */
+	req.cmd = CLUSTER_CMD_LISTENREQ;
+	req.tag = current->pid;
+	req.target_port = port;
+	req.listening = 0;
+
+	send_or_queue_message(NULL, &req, sizeof(req), &dest, MSG_REPLYEXP);
+}
+
+/* Answer a listen request: tell "nodeid" whether we have a listener on the
+ * requested port (listening = 1) or not (listening = 0).  "tag" is copied
+ * into the response.  The send result is not reported to the caller. */
+static void send_listen_response(struct cl_comms_socket *csock, int nodeid,
+				 unsigned char port, unsigned short tag)
+{
+	struct sockaddr_cl dest;
+	struct cl_listenmsg resp;
+
+	/* Destination: protocol port 0 on the requesting node */
+	memset(&dest, 0, sizeof (dest));
+	dest.scl_family = AF_CLUSTER;
+	dest.scl_nodeid = nodeid;
+	dest.scl_port = 0;
+
+	/* Build the message */
+	resp.cmd = CLUSTER_CMD_LISTENRESP;
+	resp.tag = tag;
+	resp.target_port = port;
+	resp.listening = port_array[port] ? 1 : 0;
+
+	send_or_queue_message(NULL, &resp, sizeof (resp), &dest, 0);
+}
+
+/* Send an ACK for sequence number "seq" (already in little-endian order) out
+ * of "csock".  It goes to "addr" when one is supplied, otherwise to the
+ * socket's own saddr.  Returns the kernel_sendmsg() result. */
+static int cl_sendack(struct cl_comms_socket *csock, unsigned short seq,
+		      int addr_len, char *addr, unsigned char remport,
+		      unsigned char flag)
+{
+	struct sockaddr_in6 daddr;
+	struct cl_ackmsg ackmsg;
+	struct msghdr msg;
+	struct kvec vec;
+	int result;
+
+#ifdef DEBUG_COMMS
+	char buf[MAX_ADDR_PRINTED_LEN];
+
+	P_COMMS("Sending ACK to %s, seq=%d\n",
+		print_addr(addr, address_length, buf), le16_to_cpu(seq));
+#endif
+
+	/* Pick the destination address */
+	if (!addr) {
+		memcpy(&daddr, &csock->saddr, csock->addr_len);
+		addr_len = csock->addr_len;
+	}
+	else {
+		memcpy(&daddr, addr, addr_len);
+	}
+
+	/* Build the header */
+	ackmsg.header.cluster = cpu_to_le16(cluster_id);
+	ackmsg.header.srcid = us ? cpu_to_le32(us->node_id) : 0;
+	ackmsg.header.tgtid = 0;	/* ACKS are unicast so we don't bother
+					 * to look this up */
+	ackmsg.header.srcport = 0;
+	ackmsg.header.tgtport = 0;	/* Protocol port */
+	ackmsg.header.flags = MSG_NOACK;
+	ackmsg.header.seq = 0;
+	ackmsg.header.ack = seq; /* already in LE order */
+
+	/* ... and the ACK payload */
+	ackmsg.cmd = CLUSTER_CMD_ACK;
+	ackmsg.aflags = flag;
+	ackmsg.remport = remport;
+
+	vec.iov_len = sizeof (ackmsg);
+	vec.iov_base = &ackmsg;
+
+	memset(&msg, 0, sizeof (msg));
+	msg.msg_namelen = addr_len;
+	msg.msg_name = &daddr;
+
+	result = kernel_sendmsg(csock->sock, &msg, &vec, 1, sizeof (ackmsg));
+	if (result < 0)
+		printk(KERN_CRIT CMAN_NAME ": error sending ACK: %d\n", result);
+
+	return result;
+}
+
+/* Wait for all ACKS to be gathered: sleep on socket_waitq until ack_count
+ * has reached acks_expected.  The sleep is interruptible, but the loop only
+ * ends once the counts match. */
+void kcl_wait_for_all_acks()
+{
+	while (ack_count < acks_expected) {
+
+		DECLARE_WAITQUEUE(wq, current);
+		struct task_struct *tsk = current;
+
+		set_task_state(tsk, TASK_INTERRUPTIBLE);
+		add_wait_queue(&socket_waitq, &wq);
+
+		/* Re-test after queueing ourselves so that we only sleep if
+		 * the ACKs are still outstanding */
+		if (ack_count < acks_expected) {
+			schedule();
+		}
+
+		set_task_state(tsk, TASK_RUNNING);
+		remove_wait_queue(&socket_waitq, &wq);
+	}
+}
+
+/* Tell all cluster nodes that the listener on "port" has gone away, by
+ * sending (or queueing) a CLUSTER_CMD_PORTCLOSED message with no specific
+ * destination. */
+static void send_port_close_oob(unsigned char port)
+{
+	struct cl_closemsg notice;
+
+	notice.port = port;
+	notice.cmd = CLUSTER_CMD_PORTCLOSED;
+
+	send_or_queue_message(NULL, &notice, sizeof (notice), NULL, 0);
+}
+
+/* A remote port has been closed - post an OOB message to the local listener
+ * on that port (if there is one).  The message is a cl_portclosed_oob with
+ * the originating node and port recorded in the skb control block.  Nothing
+ * is posted if there is no listener or the skb cannot be allocated. */
+static void post_close_oob(unsigned char port, int nodeid)
+{
+	struct cl_portclosed_oob *oobmsg;
+	struct sk_buff *skb;
+	struct sock *sock = port_array[port];
+	struct cb_info *cbinfo;
+
+	if (!sock) {
+		return;		/* No-one listening */
+	}
+
+	skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
+	if (!skb)
+		return;
+
+	skb_put(skb, sizeof (*oobmsg));
+	oobmsg = (struct cl_portclosed_oob *) skb->data;
+	oobmsg->port = port;
+	oobmsg->cmd = CLUSTER_OOB_MSG_PORTCLOSED;
+
+	cbinfo = (struct cb_info *)skb->cb;
+	cbinfo->oob = 1;
+	cbinfo->orig_nodeid = nodeid;
+	cbinfo->orig_port = port;
+
+	/* The skb is still ours if the socket refuses it, so free it rather
+	 * than leak it */
+	if (sock_queue_rcv_skb(sock, skb) < 0)
+		kfree_skb(skb);
+
+}
+
+/* Leave the cluster.
+ *
+ * Marks cnxman as stopped, releases anyone blocked in the send path, tells
+ * kernel listeners we are LEAVING, posts a zero-length skb to every client
+ * socket and drops it from the client list, stops the service manager, and
+ * finally wakes up everything waiting on a barrier or an ISLISTENING
+ * request. */
+static void node_shutdown()
+{
+	struct cl_barrier *barrier;
+	struct list_head *blist;
+	struct list_head *temp;
+	struct list_head *socklist;
+	struct cl_client_socket *csock;
+	struct sk_buff *null_skb;
+
+	if (we_are_a_cluster_member)
+		printk(KERN_INFO CMAN_NAME ": we are leaving the cluster. %s\n",
+		       us->leave_reason?leave_string(us->leave_reason):"");
+
+	atomic_set(&cnxman_running, 0);
+	unjam();
+
+	/* Notify kernel listeners first */
+	notify_kernel_listeners(LEAVING, 0);
+
+	/* Notify client sockets */
+	down(&client_socket_lock);
+	list_for_each_safe(socklist, temp, &client_socket_list) {
+		csock = list_entry(socklist, struct cl_client_socket, list);
+
+		/* The skb is still ours if the socket refuses it, so free it
+		 * rather than leak it */
+		null_skb = alloc_skb(0, GFP_KERNEL);
+		if (null_skb &&
+		    sock_queue_rcv_skb(csock->sock->sk, null_skb) < 0)
+			kfree_skb(null_skb);
+		list_del(&csock->list);
+		kfree(csock);
+	}
+	up(&client_socket_lock);
+	we_are_a_cluster_member = 0;
+	cluster_is_quorate = 0;
+
+	sm_stop(1);
+
+	/* Wake up any processes waiting for barriers */
+	down(&barrier_list_lock);
+	list_for_each(blist, &barrier_list) {
+		barrier = list_entry(blist, struct cl_barrier, list);
+
+		/* Cancel any timers */
+		if (timer_pending(&barrier->timer))
+			del_timer(&barrier->timer);
+
+		/* Force it to be auto-delete so it discards itself */
+		if (barrier->state == BARRIER_STATE_WAITING) {
+			barrier->flags |= BARRIER_ATTR_AUTODELETE;
+			wake_up_interruptible(&barrier->waitq);
+		}
+		else {
+			/* Nobody is sleeping on it: report the failure via
+			 * the callback, once */
+			if (barrier->callback) {
+				barrier->callback(barrier->name, -ENOTCONN);
+				barrier->callback = NULL;
+			}
+		}
+	}
+	up(&barrier_list_lock);
+
+	/* Wake up any processes waiting for ISLISTENING requests */
+	down(&listenreq_lock);
+	list_for_each(blist, &listenreq_list) {
+		struct cl_waiting_listen_request *lrequest =
+		    list_entry(blist, struct cl_waiting_listen_request, list);
+
+		if (lrequest->waiting)
+			wake_up_interruptible(&lrequest->waitq);
+	}
+	up(&listenreq_lock);
+}
+
+/* Release every comms socket on socket_list (unlink it, drop its file
+ * reference, free it) and reset the interface bookkeeping. */
+static void free_cluster_sockets()
+{
+	struct cl_comms_socket *csk, *next;
+
+	list_for_each_entry_safe(csk, next, &socket_list, list) {
+		list_del(&csk->list);
+		fput(csk->file);
+		kfree(csk);
+	}
+
+	current_interface = NULL;
+	num_interfaces = 0;
+}
+
+/* Tidy up after all the rest of the cluster bits have shut down: free the
+ * kernel listeners, cluster nodes and their addresses, queued messages,
+ * temporary node IDs, comms sockets and barriers, then reset the global
+ * state ready for another join. */
+static void node_cleanup()
+{
+	struct kernel_notify_struct *knotify, *knotify_next;
+	struct cl_comms_socket *csk, *csk_next;
+	struct cluster_node *node, *node_next;
+	struct cluster_node_addr *naddr, *naddr_next;
+	struct queued_message *qmsg, *qmsg_next;
+	struct temp_node *tnode, *tnode_next;
+	struct cl_barrier *barrier, *barrier_next;
+
+	/* Free list of kernel listeners */
+	list_for_each_entry_safe(knotify, knotify_next,
+				 &kernel_listener_list, list) {
+		list_del(&knotify->list);
+		kfree(knotify);
+	}
+
+	/* Mark the sockets as busy so they don't get added to the active
+	 * sockets list in the next few lines of code before we free them */
+	list_for_each_entry(csk, &socket_list, list)
+		set_bit(1, &csk->active);
+
+	/* Tidy the active sockets list */
+	list_for_each_entry_safe(csk, csk_next, &active_socket_list,
+				 active_list)
+		list_del(&csk->active_list);
+
+	/* Free the memory allocated to cluster nodes */
+	free_nodeid_array();
+	down(&cluster_members_lock);
+	us = NULL;
+	list_for_each_entry_safe(node, node_next, &cluster_members_list, list) {
+
+		/* Each node owns a list of addresses */
+		list_for_each_entry_safe(naddr, naddr_next, &node->addr_list,
+					 list) {
+			list_del(&naddr->list);
+			kfree(naddr);
+		}
+
+		list_del(&node->list);
+		kfree(node->name);
+		kfree(node);
+	}
+	cluster_members = 0;
+	up(&cluster_members_lock);
+
+	/* Clean the queued messages list */
+	down(&messages_list_lock);
+	list_for_each_entry_safe(qmsg, qmsg_next, &messages_list, list) {
+		list_del(&qmsg->list);
+		kfree(qmsg);
+	}
+	up(&messages_list_lock);
+
+	/* Clean the temp node IDs list. */
+	down(&tempnode_lock);
+	list_for_each_entry_safe(tnode, tnode_next, &tempnode_list, list) {
+		list_del(&tnode->list);
+		kfree(tnode);
+	}
+	up(&tempnode_lock);
+
+	/* Free the memory allocated to the outgoing sockets */
+	free_cluster_sockets();
+
+	/* Make sure that all the barriers are deleted */
+	down(&barrier_list_lock);
+	list_for_each_entry_safe(barrier, barrier_next, &barrier_list, list) {
+		list_del(&barrier->list);
+		kfree(barrier);
+	}
+	up(&barrier_list_lock);
+
+	/* Back to the initial state */
+	kcluster_pid = 0;
+	clear_bit(RESEND_NEEDED, &mainloop_flags);
+	acks_expected = 0;
+	wanted_nodeid = 0;
+	cur_seq = 0;
+	quorum_device = NULL;
+}
+
+/* Recompute "cluster_is_quorate" from the current vote total.  While it is 0
+ * all activity apart from protected ports is blocked.  A change in either
+ * direction is logged, and sleeping processes are woken whenever the cluster
+ * is quorate. */
+void set_quorate(int total_votes)
+{
+	const int quorate = (get_quorum() > total_votes) ? 0 : 1;
+
+	/* Only transitions are worth a log message */
+	if (quorate) {
+		if (!cluster_is_quorate)
+			printk(KERN_CRIT CMAN_NAME
+			       ": quorum regained, resuming activity\n");
+	}
+	else if (cluster_is_quorate) {
+		printk(KERN_CRIT CMAN_NAME
+		       ": quorum lost, blocking activity\n");
+	}
+
+	cluster_is_quorate = quorate;
+
+	/* Wake up any sleeping processes */
+	if (quorate)
+		unjam();
+}
+
+/* Post an OOB message carrying "cmd" to the receive queue of "sock".  The
+ * message has no port and no originating node.  Nothing is posted if the skb
+ * cannot be allocated. */
+void queue_oob_skb(struct socket *sock, int cmd)
+{
+	struct sk_buff *skb;
+	struct cb_info *cbinfo;
+	struct cl_portclosed_oob *oobmsg;
+
+	skb = alloc_skb(sizeof (*oobmsg), GFP_KERNEL);
+	if (!skb)
+		return;
+
+	skb_put(skb, sizeof (*oobmsg));
+	oobmsg = (struct cl_portclosed_oob *) skb->data;
+	oobmsg->port = 0;
+	oobmsg->cmd = cmd;
+
+	/* There is no remote node associated with this so
+	   clear out the field to avoid any accidents */
+	cbinfo = (struct cb_info *)skb->cb;
+	cbinfo->oob = 1;
+	cbinfo->orig_nodeid = 0;
+	cbinfo->orig_port = 0;
+
+	/* The skb is still ours if the socket refuses it, so free it rather
+	 * than leak it */
+	if (sock_queue_rcv_skb(sock->sk, skb) < 0)
+		kfree_skb(skb);
+}
+
+/* Notify interested parties that the cluster configuration has changed:
+ * kernel listeners first, then registered processes by signal, then client
+ * sockets with a CLUSTER_OOB_MSG_STATECHANGE OOB message. */
+void notify_listeners()
+{
+	struct notify_struct *listener, *next_listener;
+	struct cl_client_socket *client;
+
+	/* Do kernel listeners first */
+	notify_kernel_listeners(CLUSTER_RECONFIG, 0);
+
+	/* Now we deign to tell userspace */
+	down(&event_listener_lock);
+	list_for_each_entry_safe(listener, next_listener,
+				 &event_listener_list, list) {
+
+		if (kill_proc(listener->pid, listener->signal, 0) != -ESRCH)
+			continue;
+
+		/* The process has gone away: forget about it */
+		list_del(&listener->list);
+		kfree(listener);
+	}
+	up(&event_listener_lock);
+
+	/* Tell userspace processes which want OOB messages */
+	down(&client_socket_lock);
+	list_for_each_entry(client, &client_socket_list, list)
+		queue_oob_skb(client->sock, CLUSTER_OOB_MSG_STATECHANGE);
+	up(&client_socket_lock);
+}
+
+/* This fills in the list of all addresses for the local node: the saddr of
+ * every socket on socket_list that has recv_only set is added to "node". */
+void get_local_addresses(struct cluster_node *node)
+{
+	struct cl_comms_socket *csk;
+
+	list_for_each_entry(csk, &socket_list, list) {
+		if (!csk->recv_only)
+			continue;
+
+		add_node_address(node, (char *) &csk->saddr, address_length);
+	}
+}
+
+
+/* Derive a 16-bit cluster ID from the cluster name: for each character the
+ * running value is doubled and the character added; the low 16 bits of the
+ * final value are returned.  An empty name gives 0.
+ *
+ * The accumulator is unsigned so that long names wrap instead of overflowing
+ * a signed int; the low 16 bits are the same either way.  The string is
+ * walked once rather than calling strlen() on every iteration. */
+static uint16_t generate_cluster_id(char *name)
+{
+	unsigned int value = 0;
+	const char *p;
+
+	for (p = name; *p != '\0'; p++)
+		value = (value << 1) + *p;
+
+	return value & 0xFFFF;
+}
+
+/* Return the next comms socket we can use after "cur".  Interface numbers
+ * run from 1 to num_interfaces and wrap round.  With a single interface (or
+ * none) "cur" itself is returned.  Not finding the wanted interface is a
+ * BUG(). */
+static struct cl_comms_socket *get_next_interface(struct cl_comms_socket *cur)
+{
+	struct cl_comms_socket *candidate;
+	int wanted;
+
+	/* Fast path for single interface systems */
+	if (num_interfaces <= 1)
+		return cur;
+
+	/* Next number, wrapping back to the first interface */
+	wanted = cur->number + 1;
+	if (wanted > num_interfaces)
+		wanted = 1;
+
+	/* Find the socket with this number, I could optimise this by starting
+	 * at the current i/f but most systems are going to have a small number
+	 * of them anyway */
+	list_for_each_entry(candidate, &socket_list, list) {
+		if (candidate->recv_only && candidate->number == wanted)
+			return candidate;
+	}
+
+	BUG();
+	return NULL;
+}
+
+/* Look up the comms socket numbered IF_NUM whose broadcast flag equals
+ * MCAST. Returns NULL when no such socket is on the list. */
+static struct cl_comms_socket *get_peer_interface(int if_num, int mcast)
+{
+	struct cl_comms_socket *csock;
+
+	list_for_each_entry(csock, &socket_list, list) {
+		if (csock->number != if_num || csock->broadcast != mcast)
+			continue;
+		return csock;
+	}
+
+	return NULL;
+}
+
+
+/* Find the barrier called NAME, or NULL if there is none.
+ * MUST be called with the barrier list lock held. */
+static struct cl_barrier *find_barrier(char *name)
+{
+	struct cl_barrier *found;
+
+	list_for_each_entry(found, &barrier_list, list) {
+		if (!strcmp(name, found->name))
+			return found;
+	}
+
+	/* Ran off the end of the list without a match */
+	return NULL;
+}
+
+/* Free every barrier that has been flagged BARRIER_STATE_DELETED. Takes
+ * barrier_list_lock itself, so must not be called with it held. */
+static void tidy_barriers(void)
+{
+	struct cl_barrier *bar, *next;
+
+	down(&barrier_list_lock);
+	list_for_each_entry_safe(bar, next, &barrier_list, list) {
+		if (bar->state != BARRIER_STATE_DELETED)
+			continue;
+
+		P_BARRIER("Deleting barrier %s\n", bar->name);
+		list_del(&bar->list);
+		kfree(bar);
+	}
+	up(&barrier_list_lock);
+}
+
+/* Phase 1 check: when got_nodes reaches the expected count (all cluster
+ * members if expected_nodes is 0) enter phase 2 and queue COMPLETE. */
+static void check_barrier_complete_phase1(struct cl_barrier *barrier)
+{
+	struct cl_barriermsg bmsg;
+
+	if (atomic_read(&barrier->got_nodes) !=
+	    (barrier->expected_nodes ? barrier->expected_nodes : cluster_members))
+		return;
+
+	atomic_inc(&barrier->completed_nodes);	/* We have completed */
+	barrier->phase = 2;	/* Wait for complete phase II */
+
+	/* The whole struct is queued for sending, so zero it first: the bytes
+	 * after the name's NUL must not carry stale kernel stack contents */
+	memset(&bmsg, 0, sizeof (bmsg));
+	bmsg.cmd = CLUSTER_CMD_BARRIER;
+	bmsg.subcmd = BARRIER_COMPLETE;
+	strcpy(bmsg.name, barrier->name);
+	/* Queued, not sent directly: we are in cnxman context, must not block */
+	P_BARRIER("Sending COMPLETE for %s\n", barrier->name);
+	queue_message(NULL, (char *) &bmsg, sizeof (bmsg), NULL, 0, 0);
+}
+
+/* Finish the barrier if it timed out or all nodes completed; a no-op if it
+ * is already COMPLETE. Returns 1 if it was flagged for deletion, else 0. */
+static int check_barrier_complete_phase2(struct cl_barrier *barrier, int status)
+{
+	spin_lock_irq(&barrier->phase2_spinlock);
+	/* Act once only, on a timeout or when every expected node has completed */
+	if (barrier->state != BARRIER_STATE_COMPLETE &&
+	    (status == -ETIMEDOUT ||
+	     atomic_read(&barrier->completed_nodes) ==
+	     ((barrier->expected_nodes != 0)
+	      ? barrier->expected_nodes : cluster_members))) {
+		/* A normal completion cancels the pending timeout timer */
+		if (status == 0 && barrier->timeout)
+			del_timer(&barrier->timer);
+		barrier->endreason = status;
+
+		/* Wake up listener (state WAITING is set by kcl_barrier_wait) */
+		if (barrier->state == BARRIER_STATE_WAITING) {
+			wake_up_interruptible(&barrier->waitq);
+		}
+		else {
+			/* Additional tasks we have to do if the user was not
+			 * waiting... */
+			/* Call the callback (once; always with status 0) */
+			if (barrier->callback) {
+				barrier->callback(barrier->name, 0);
+				barrier->callback = NULL;
+			}
+			/* Flag it for removal if it's AUTO-DELETE. We can't
+			 * actually remove it here because we can't take the
+			 * barrier semaphore in timer context */
+			if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
+				barrier->state = BARRIER_STATE_DELETED;
+				set_bit(TIDY_BARRIERS, &mainloop_flags);
+				wake_up_interruptible(&cnxman_waitq);
+				spin_unlock_irq(&barrier->phase2_spinlock);
+				return 1;
+			}
+		}
+		barrier->state = BARRIER_STATE_COMPLETE;
+	}
+	spin_unlock_irq(&barrier->phase2_spinlock);
+	return 0;
+}
+
+/* Timer callback: ARG is the barrier whose timeout has just expired */
+static void barrier_timer_fn(unsigned long arg)
+{
+	struct cl_barrier *expired = (struct cl_barrier *) arg;
+
+	/* Reset the phase; the intent is that later messages are too late */
+	expired->phase = 0;
+
+	/* Complete the barrier with a timeout status */
+	check_barrier_complete_phase2(expired, -ETIMEDOUT);
+}
+
+/* Process a BARRIER message (WAIT or COMPLETE) received from another node */
+static void process_barrier_msg(struct cl_barriermsg *msg,
+				struct cluster_node *node)
+{
+	struct cl_barrier *barrier;
+
+	down(&barrier_list_lock);
+	barrier = find_barrier(msg->name);
+	up(&barrier_list_lock);
+	/* NOTE(review): barrier is used below after the list lock is dropped */
+	/* Ignore other peoples messages, in_transition() is needed here so
+	 * that joining nodes will see their barrier messages before the
+	 * we_are_a_cluster_member is set */
+	if (!we_are_a_cluster_member && !in_transition())
+		return;
+	if (!barrier)
+		return;
+
+	P_BARRIER("Got %d for %s, from node %s\n", msg->subcmd, msg->name,
+		  node ? node->name : "unknown");
+
+	switch (msg->subcmd) {
+	case BARRIER_WAIT:
+		down(&barrier->lock);
+		if (barrier->phase == 0)
+			barrier->phase = 1;
+
+		if (barrier->phase == 1) {
+			atomic_inc(&barrier->got_nodes);
+			check_barrier_complete_phase1(barrier);
+		}
+		else {
+			printk(KERN_WARNING CMAN_NAME
+			       ": got WAIT barrier not in phase 1 %s (%d)\n",
+			       msg->name, barrier->phase);
+
+		}
+		up(&barrier->lock);
+		break;
+
+	case BARRIER_COMPLETE:
+		down(&barrier->lock);
+		atomic_inc(&barrier->completed_nodes);
+
+		/* First node to get all the WAIT messages sends COMPLETE, so
+		 * we all complete */
+		if (barrier->phase == 1) {
+			if (barrier->expected_nodes)
+				atomic_set(&barrier->got_nodes, barrier->expected_nodes);
+			else
+				atomic_set(&barrier->got_nodes, cluster_members);
+			check_barrier_complete_phase1(barrier);
+		}
+
+		if (barrier->phase == 2) {
+			/* If it was flagged for deletion (ret==1) we return
+			 * with the mutex still held, deliberately */
+			if (check_barrier_complete_phase2(barrier, 0) == 1)
+				return;
+		}
+		up(&barrier->lock);
+		break;
+	}
+}
+
+/* In-kernel membership API */
+/* Put CALLBACK on the kernel listener list. Returns 0 or -ENOMEM. */
+int kcl_add_callback(void (*callback) (kcl_callback_reason, long arg))
+{
+	struct kernel_notify_struct *entry = kmalloc(sizeof (*entry), GFP_KERNEL);
+
+	if (entry == NULL)
+		return -ENOMEM;
+
+	entry->callback = callback;
+	down(&kernel_listener_lock);
+	list_add(&entry->list, &kernel_listener_list);
+	up(&kernel_listener_lock);
+
+	return 0;
+}
+
+/* Unregister CALLBACK: unlink and free the first listener entry that holds
+ * it. Returns 0 on success, -EINVAL if no entry matches. */
+int kcl_remove_callback(void (*callback) (kcl_callback_reason, long arg))
+{
+	struct kernel_notify_struct *entry, *next;
+	int ret = -EINVAL;
+
+	down(&kernel_listener_lock);
+	list_for_each_entry_safe(entry, next, &kernel_listener_list, list) {
+		if (entry->callback != callback)
+			continue;
+		list_del(&entry->list);
+		kfree(entry);
+		ret = 0;
+		break;
+	}
+	up(&kernel_listener_lock);
+	return ret;
+}
+
+/* Return quorate status: the current value of cluster_is_quorate */
+int kcl_is_quorate(void)
+{
+	return cluster_is_quorate;
+}
+
+/* Return the address list of the node with id NODEID, or NULL when no
+ * such node is found. The list itself is returned, not a copy. */
+struct list_head *kcl_get_node_addresses(int nodeid)
+{
+	struct cluster_node *found;
+
+	found = find_node_by_nodeid(nodeid);
+
+	return found ? &found->addr_list : NULL;
+}
+
+static void copy_to_kclnode(struct cluster_node *node,
+			    struct kcl_cluster_node *knode)
+{
+	knode->size = sizeof (*knode);	/* size of the structure filled in */
+	knode->node_id = node->node_id;
+	knode->us = node->us;
+	knode->votes = node->votes;
+	knode->state = node->state;
+	knode->incarnation = node->incarnation;
+	knode->leave_reason = node->leave_reason;
+	strcpy(knode->name, node->name);	/* assumes the name fits - TODO confirm */
+}
+
+/* Return the info for a node given its address. If addr is NULL then return
+ * OUR info. Returns 0 on success, -1 if no matching node is known. */
+int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
+			 struct kcl_cluster_node *n)
+{
+	struct cluster_node *node;
+
+	/* They want us */
+	if (addr == NULL)
+		node = us;
+	else
+		node = find_node_by_addr(addr, addr_len);
+
+	/* No such node, or "us" is not set: the by-name and by-nodeid
+	 * lookups reject a NULL "us" too, so don't dereference it here */
+	if (node == NULL)
+		return -1;
+
+	copy_to_kclnode(node, n);
+	return 0;
+}
+
+/* Look a node up by NAME and copy its details into N; a NULL name means
+ * the local node. Returns 0, or -1 if there is no such node. */
+int kcl_get_node_by_name(unsigned char *name, struct kcl_cluster_node *n)
+{
+	struct cluster_node *node;
+
+	/* NULL is shorthand for "us", which may itself be unset */
+	if (name != NULL)
+		node = find_node_by_name(name);
+	else
+		node = us;
+
+	/* Both lookups signal failure with a NULL pointer */
+	if (node == NULL)
+		return -1;
+
+	/* Hand the result back in the caller's structure */
+	copy_to_kclnode(node, n);
+	return 0;
+}
+
+/* As above but by node id (MUCH faster); a NODEID of 0 means the local
+ * node. Returns 0, or -1 if there is no such node. */
+int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n)
+{
+	struct cluster_node *node;
+
+	/* 0 is shorthand for "us", which may itself be unset */
+	if (nodeid != 0)
+		node = find_node_by_nodeid(nodeid);
+	else
+		node = us;
+
+	/* Both lookups signal failure with a NULL pointer */
+	if (node == NULL)
+		return -1;
+
+	/* Hand the result back in the caller's structure */
+	copy_to_kclnode(node, n);
+
+	return 0;
+}
+
+/* Count every node on the members list, whatever its state. If LIST is
+ * non-NULL a kmalloc'd copy of each node is also added to it; nodes whose
+ * copy can't be allocated are neither added nor counted. */
+int kcl_get_all_members(struct list_head *list)
+{
+	struct cluster_node *node;
+	struct kcl_cluster_node *copy;
+	int count = 0;
+
+	down(&cluster_members_lock);
+	list_for_each_entry(node, &cluster_members_list, list) {
+		/* Caller only wants the number */
+		if (list == NULL) {
+			count++;
+			continue;
+		}
+
+		copy = kmalloc(sizeof (struct kcl_cluster_node), GFP_KERNEL);
+		if (copy == NULL)
+			continue;
+
+		copy_to_kclnode(node, copy);
+		list_add(&copy->list, list);
+		count++;
+	}
+	up(&cluster_members_lock);
+
+	return count;
+}
+
+/* Like kcl_get_all_members() but restricted to nodes whose state is
+ * NODESTATE_MEMBER. With a NULL LIST only the count is produced; nodes
+ * whose copy can't be allocated are neither added nor counted. */
+int kcl_get_members(struct list_head *list)
+{
+	struct cluster_node *node;
+	struct kcl_cluster_node *copy;
+	int count = 0;
+
+	down(&cluster_members_lock);
+	list_for_each_entry(node, &cluster_members_list, list) {
+		if (node->state != NODESTATE_MEMBER)
+			continue;
+
+		/* Caller only wants the number */
+		if (list == NULL) {
+			count++;
+			continue;
+		}
+
+		copy = kmalloc(sizeof (struct kcl_cluster_node), GFP_KERNEL);
+		if (copy == NULL)
+			continue;
+
+		copy_to_kclnode(node, copy);
+		list_add(&copy->list, list);
+		count++;
+	}
+	up(&cluster_members_lock);
+
+	return count;
+}
+
+/* Copy current members' node ids into IDBUF (room for SIZE entries). At most
+ * SIZE ids are stored, but the return value is always the total number of
+ * members, so a NULL or too-small buffer still yields the full count. */
+int kcl_get_member_ids(uint32_t *idbuf, int size)
+{
+	struct cluster_node *node;
+	int num_nodes = 0;
+
+	/* A negative SIZE used to count as "room left" for ever and let the
+	 * loop write past the buffer; treat it like an empty buffer. */
+	if (idbuf == NULL || size < 0)
+		size = 0;
+
+	down(&cluster_members_lock);
+	list_for_each_entry(node, &cluster_members_list, list) {
+		if (node->state != NODESTATE_MEMBER)
+			continue;
+		/* Store while there is room, but always keep counting */
+		if (num_nodes < size)
+			idbuf[num_nodes] = node->node_id;
+		num_nodes++;
+	}
+	up(&cluster_members_lock);
+
+	return num_nodes;
+}
+
+/* Barrier API */
+/* Register barrier NAME; NODES is the node count to wait for (0 = all) */
+int kcl_barrier_register(char *name, unsigned int flags, unsigned int nodes)
+{
+	struct cl_barrier *barrier;
+	unsigned int expected;
+
+	/* We are not joined to a cluster */
+	if (!we_are_a_cluster_member)
+		return -ENOENT;
+
+	/* Must have a valid name */
+	if (name == NULL || strlen(name) > MAX_BARRIER_NAME_LEN - 1)
+		return -EINVAL;
+
+	/* We don't do this yet */
+	if (flags & BARRIER_ATTR_MULTISTEP)
+		return -ENOTSUPP;
+
+	down(&barrier_list_lock);
+	/* Already registered? Sample expected_nodes before dropping the list
+	 * lock: the barrier may be freed as soon as we let go of it. */
+	if ((barrier = find_barrier(name))) {
+		expected = barrier->expected_nodes;
+		up(&barrier_list_lock);
+		if (nodes == expected)
+			return 0;
+		printk(KERN_WARNING CMAN_NAME
+		       ": Barrier registration failed for '%s', expected nodes=%d, requested=%d\n",
+		       name, expected, nodes);
+		return -EINVAL;
+	}
+
+	/* Build a new struct and add it to the list */
+	barrier = kmalloc(sizeof (struct cl_barrier), GFP_KERNEL);
+	if (barrier == NULL) {
+		up(&barrier_list_lock);
+		return -ENOMEM;
+	}
+	memset(barrier, 0, sizeof (*barrier));	/* also clears endreason */
+	strcpy(barrier->name, name);
+	barrier->flags = flags;
+	barrier->expected_nodes = nodes;
+	atomic_set(&barrier->got_nodes, 0);
+	atomic_set(&barrier->completed_nodes, 0);
+	barrier->registered_nodes = 1;
+	spin_lock_init(&barrier->phase2_spinlock);
+	barrier->state = BARRIER_STATE_INACTIVE;
+	init_MUTEX(&barrier->lock);
+	/* Ready the wait queue now: a wake-up can arrive before any waiter */
+	init_waitqueue_head(&barrier->waitq);
+
+	list_add(&barrier->list, &barrier_list);
+	up(&barrier_list_lock);
+	return 0;
+}
+
+/* Enable a barrier: send WAIT to the cluster the first time round and see
+ * whether the barrier is thereby complete. Called with barrier->lock held;
+ * the lock is released on every return path. ATTR is unused here. */
+static int barrier_setattr_enabled(struct cl_barrier *barrier,
+				   unsigned int attr, unsigned long arg)
+{
+	int status = 0;
+
+	/* Can't disable a barrier */
+	if (!arg) {
+		up(&barrier->lock);
+		return -EINVAL;
+	}
+
+	/* We need to send WAIT now because the user may not
+	 * actually call kcl_barrier_wait() */
+	if (!barrier->waitsent) {
+		struct cl_barriermsg bmsg;
+
+		/* Zero the whole message: flags was left unset here and the
+		 * tail of name[] would otherwise be stale stack contents */
+		memset(&bmsg, 0, sizeof (bmsg));
+		bmsg.cmd = CLUSTER_CMD_BARRIER;
+		bmsg.subcmd = BARRIER_WAIT;
+		strcpy(bmsg.name, barrier->name);
+
+		barrier->waitsent = 1;
+		barrier->phase = 1;
+		atomic_inc(&barrier->got_nodes);
+
+		/* Start the timer if one was wanted */
+		if (barrier->timeout) {
+			init_timer(&barrier->timer);
+			barrier->timer.function = barrier_timer_fn;
+			barrier->timer.data = (unsigned long) barrier;
+			mod_timer(&barrier->timer, jiffies + (barrier->timeout * HZ));
+		}
+
+		/* Barrier WAIT and COMPLETE messages are always queued so
+		 * that they go out in the right order. Otherwise one could
+		 * be sent from the user process and the other from cnxman,
+		 * and COMPLETE might /just/ slide in before WAIT. */
+		P_BARRIER("Sending WAIT for %s\n", barrier->name);
+		status = queue_message(NULL, &bmsg, sizeof (bmsg), NULL, 0, 0);
+		if (status < 0)
+			goto out;
+		status = 0;
+
+		/* It might have been reached now */
+		if (barrier->state != BARRIER_STATE_COMPLETE
+		    && barrier->phase == 1)
+			check_barrier_complete_phase1(barrier);
+	}
+
+	/* Pick up the result before unlocking; afterwards the barrier is no
+	 * longer ours to read */
+	if (barrier->state == BARRIER_STATE_COMPLETE)
+		status = barrier->endreason;
+ out:
+	up(&barrier->lock);
+	return status;	/* 0: nothing to propagate */
+}
+
+/* Change attribute ATTR of barrier NAME to ARG. Returns 0 on success (or if
+ * the barrier has already completed), -ENOENT, -EINVAL or -ENOTSUPP. */
+int kcl_barrier_setattr(char *name, unsigned int attr, unsigned long arg)
+{
+	struct cl_barrier *barrier;
+	int active;
+	int ret = 0;
+
+	down(&barrier_list_lock);
+	barrier = find_barrier(name);
+	up(&barrier_list_lock);
+	if (!barrier)
+		return -ENOENT;
+
+	down(&barrier->lock);
+	if (barrier->state == BARRIER_STATE_COMPLETE)
+		goto out;
+
+	/* Timeout, node count and callback can only be changed while the
+	 * barrier is inactive: nobody waiting and WAIT not yet sent */
+	active = barrier->state == BARRIER_STATE_WAITING || barrier->waitsent;
+
+	switch (attr) {
+	case BARRIER_SETATTR_AUTODELETE:
+		if (arg)
+			barrier->flags |= BARRIER_ATTR_AUTODELETE;
+		else
+			barrier->flags &= ~BARRIER_ATTR_AUTODELETE;
+		break;
+
+	case BARRIER_SETATTR_TIMEOUT:
+		if (active)
+			ret = -EINVAL;
+		else
+			barrier->timeout = arg;
+		break;
+
+	case BARRIER_SETATTR_MULTISTEP:
+		ret = -ENOTSUPP;
+		break;
+
+	case BARRIER_SETATTR_ENABLED:
+		/* The helper drops barrier->lock itself on every path */
+		return barrier_setattr_enabled(barrier, attr, arg);
+
+	case BARRIER_SETATTR_NODES:
+		/* This used to return -EINVAL with barrier->lock still held */
+		if (active)
+			ret = -EINVAL;
+		else
+			barrier->expected_nodes = arg;
+		break;
+
+	case BARRIER_SETATTR_CALLBACK:
+		/* Likewise; the callback is local, not sent to other nodes */
+		if (active)
+			ret = -EINVAL;
+		else
+			barrier->callback = (void (*)(char *, int)) arg;
+		break;
+	default:	/* unknown attributes are accepted and ignored */
+		break;
+	}
+ out:
+	up(&barrier->lock);
+	return ret;
+}
+
+/* Unregister barrier NAME and free it; -ENOENT if there is no such barrier */
+int kcl_barrier_delete(char *name)
+{
+	struct cl_barrier *barrier;
+
+	down(&barrier_list_lock);
+	barrier = find_barrier(name);
+	if (!barrier) {
+		up(&barrier_list_lock);
+		return -ENOENT;
+	}
+	list_del(&barrier->list);
+	/* The timer is armed only once WAIT went out with a timeout set */
+	if (barrier->waitsent && barrier->timeout)
+		del_timer_sync(&barrier->timer);
+	kfree(barrier);
+	up(&barrier_list_lock);
+	return 0;
+}
+
+int kcl_barrier_cancel(char *name)
+{
+	struct cl_barrier *barrier;
+
+	/* Cancel barrier NAME: -ENOENT if it does not exist, otherwise 0 */
+	down(&barrier_list_lock);
+	if (!(barrier = find_barrier(name))) {
+		up(&barrier_list_lock);
+		return -ENOENT;
+	}
+	down(&barrier->lock);
+	/* Lock order here: barrier_list_lock first, then barrier->lock */
+	barrier->endreason = -ENOTCONN;
+	/* The callback is told -ECONNRESET although endreason is -ENOTCONN */
+	if (barrier->callback) {
+		barrier->callback(barrier->name, -ECONNRESET);
+		barrier->callback = NULL;
+	}
+	/* NOTE(review): assumes the timer was set up whenever timeout is set */
+	if (barrier->timeout)
+		del_timer(&barrier->timer);
+
+	/* Remove it if it's AUTO-DELETE */
+	if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
+		list_del(&barrier->list);
+		up(&barrier->lock);
+		kfree(barrier);
+		up(&barrier_list_lock);
+		return 0;
+	}
+	/* Otherwise wake a sleeping waiter so it can see the new endreason */
+	if (barrier->state == BARRIER_STATE_WAITING)
+		wake_up_interruptible(&barrier->waitq);
+
+	up(&barrier->lock);
+	up(&barrier_list_lock);
+	return 0;
+}
+
+/* Wait for barrier NAME to complete, enabling it first. Returns the
+ * barrier's end reason (0 on success), -ENOENT if it does not exist,
+ * -EINTR on a signal, or -ENOTCONN if cnxman is not (or no longer) running. */
+int kcl_barrier_wait(char *name)
+{
+	struct cl_barrier *barrier;
+	int ret;
+
+	if (!atomic_read(&cnxman_running))
+		return -ENOTCONN;
+
+	/* Enable it */
+	kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, 1L);
+
+	down(&barrier_list_lock);
+	/* See if it still exists - enable may have deleted it! */
+	if (!(barrier = find_barrier(name))) {
+		up(&barrier_list_lock);
+		return -ENOENT;
+	}
+	down(&barrier->lock);
+	up(&barrier_list_lock);
+
+	/* If it has already completed then return the status. Read it
+	 * before unlocking: the barrier is not ours to touch afterwards */
+	if (barrier->state == BARRIER_STATE_COMPLETE) {
+		ret = barrier->endreason;
+		up(&barrier->lock);
+		return ret;
+	}
+
+	barrier->state = BARRIER_STATE_WAITING;
+
+	/* Have we all reached the barrier? */
+	while (atomic_read(&barrier->completed_nodes) !=
+	       ((barrier->expected_nodes == 0)
+		? cluster_members : barrier->expected_nodes)
+	       && barrier->endreason == 0) {
+		wait_queue_t wq;
+
+		init_waitqueue_entry(&wq, current);
+		init_waitqueue_head(&barrier->waitq);
+
+		/* Wait for em all */
+		set_task_state(current, TASK_INTERRUPTIBLE);
+		add_wait_queue(&barrier->waitq, &wq);
+
+		if (atomic_read(&barrier->completed_nodes) !=
+		    ((barrier->expected_nodes ==
+		      0) ? cluster_members : barrier->expected_nodes)
+		    && barrier->endreason == 0) {
+			up(&barrier->lock);
+			schedule();
+			down(&barrier->lock);
+		}
+
+		remove_wait_queue(&barrier->waitq, &wq);
+		set_task_state(current, TASK_RUNNING);
+
+		if (signal_pending(current)) {
+			barrier->endreason = -EINTR;
+			break;
+		}
+	}
+	barrier->state = BARRIER_STATE_INACTIVE;
+
+	if (barrier->timeout)
+		del_timer(&barrier->timer);
+
+	/* Barrier has been reached on all nodes, call the callback */
+	if (barrier->callback) {
+		barrier->callback(barrier->name, barrier->endreason);
+		barrier->callback = NULL;
+	}
+	atomic_set(&barrier->got_nodes, 0);
+
+	ret = barrier->endreason;	/* the reason we were woken */
+
+	/* Remove it if it's AUTO-DELETE. NOTE(review): the list lock is taken
+	 * under barrier->lock here, the reverse of the order used above */
+	if (barrier->flags & BARRIER_ATTR_AUTODELETE) {
+		down(&barrier_list_lock);
+		list_del(&barrier->list);
+		up(&barrier_list_lock);
+		up(&barrier->lock);
+		kfree(barrier);
+	}
+	else {
+		up(&barrier->lock);
+	}
+
+	/* We were woken up because the node left the cluster ? */
+	if (!atomic_read(&cnxman_running))
+		ret = -ENOTCONN;
+
+	return ret;
+}
+
+/* This is called from membership services when a node has left the cluster.
+ * Barriers that have sent WAIT are signalled: those with a fixed node count
+ * get -ESRCH so they know to do something else; those left at 0 nodes get
+ * 0 (success), but only when the new number of cluster members equals the
+ * barrier's registered_nodes. After that, listen requests still waiting on
+ * nodes that are no longer members are failed with -ENOTCONN. */
+void check_barrier_returns(void)
+{
+	struct cl_barrier *barrier;
+	struct cl_waiting_listen_request *lrequest;
+	struct cluster_node *node;
+	int status = 0;
+
+	/* Part 1: tell the barriers that are in progress. Note that the
+	 * callback below runs with barrier_list_lock held. */
+	down(&barrier_list_lock);
+	list_for_each_entry(barrier, &barrier_list, list) {
+		/* Only barriers for which we have sent WAIT are of interest;
+		 * the rest are left alone */
+		if (!barrier->waitsent)
+			continue;
+
+		if (barrier->expected_nodes != 0) {
+			/* Fixed node count: always signalled */
+			status = -ESRCH;
+		}
+		else {
+			/* Dynamic member barrier: signalled only when the
+			 * membership now matches what it registered */
+			if (barrier->registered_nodes != cluster_members)
+				continue;
+
+			status = 0;
+		}
+
+		/* A task blocked in kcl_barrier_wait() is handed the status
+		 * and woken up... */
+		if (barrier->state == BARRIER_STATE_WAITING) {
+			barrier->endreason = status;
+			wake_up_interruptible(&barrier->waitq);
+			continue;
+		}
+
+		/* ...otherwise the callback, if there is one, gets it */
+		if (barrier->callback)
+			barrier->callback(barrier->name, status);
+	}
+	up(&barrier_list_lock);
+
+	/* Part 2 check for outstanding listen requests for dead nodes and
+	 * cancel them */
+	down(&listenreq_lock);
+	list_for_each_entry(lrequest, &listenreq_list, list) {
+		node = find_node_by_nodeid(lrequest->nodeid);
+
+		/* Unknown node, or one that is still a member: keep waiting */
+		if (!node || node->state == NODESTATE_MEMBER)
+			continue;
+
+		/* Fail the request and wake whoever is sleeping on it */
+		lrequest->result = -ENOTCONN;
+		lrequest->waiting = 0;
+		wake_up_interruptible(&lrequest->waitq);
+	}
+	up(&listenreq_lock);
+}
+
+/* Copy the address recorded for temporary node id NODEID into ADDR and its
+ * length into *ADDRLEN. Returns 1 if the id was found, 0 if not. */
+int get_addr_from_temp_nodeid(int nodeid, char *addr, int *addrlen)
+{
+	struct temp_node *tn;
+	int found = 0;
+#ifdef DEBUG_COMMS
+	char buf[MAX_ADDR_PRINTED_LEN];
+#endif
+
+	down(&tempnode_lock);
+	list_for_each_entry(tn, &tempnode_list, list) {
+		if (tn->nodeid != nodeid)
+			continue;
+
+		/* ADDR must have room for tn->addrlen bytes */
+		memcpy(addr, tn->addr, tn->addrlen);
+		*addrlen = tn->addrlen;
+		P_COMMS("get_temp_nodeid. id %d:\n: %s\n",
+			tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
+		found = 1;
+		break;
+	}
+	up(&tempnode_lock);
+	return found;
+}
+
+/* Create a new temporary node ID for ADDR, or return the one it already
+   has. IDs are negative, counting down from -1. The list will usually hold
+   only one item, but nothing stops many nodes booting at the same time.
+   NOTE(review): the failure return (-1) is also the first ID handed out. */
+int new_temp_nodeid(char *addr, int addrlen)
+{
+	struct temp_node *tn;
+	int result = -1;
+	int candidate = 0;
+	int clash;
+#ifdef DEBUG_COMMS
+	char buf[MAX_ADDR_PRINTED_LEN];
+#endif
+
+	P_COMMS("new_temp_nodeid needed for\n: %s\n",
+		print_addr(addr, addrlen, buf));
+
+	down(&tempnode_lock);
+
+	/* An address we have seen before keeps the ID it was given */
+	list_for_each_entry(tn, &tempnode_list, list) {
+		P_COMMS("new_temp_nodeid list. id %d:\n: %s\n",
+			tn->nodeid, print_addr(tn->addr, tn->addrlen, buf));
+		if (tn->addrlen != addrlen || memcmp(tn->addr, addr, addrlen))
+			continue;
+		P_COMMS("reused temp node ID %d\n", tn->nodeid);
+		result = tn->nodeid;
+		goto unlock;
+	}
+
+	/* Otherwise step down through -1, -2, ... until one is unused */
+	do {
+		clash = 0;
+		candidate--;
+		list_for_each_entry(tn, &tempnode_list, list) {
+			if (tn->nodeid == candidate) {
+				clash = 1;
+				break;
+			}
+		}
+	} while (clash);
+
+	tn = kmalloc(sizeof(struct temp_node), GFP_KERNEL);
+	if (tn) {
+		memcpy(tn->addr, addr, addrlen);
+		tn->addrlen = addrlen;
+		tn->nodeid = candidate;
+		list_add_tail(&tn->list, &tempnode_list);
+		result = candidate;
+		P_COMMS("new temp nodeid = %d\n", candidate);
+	}
+ unlock:
+	up(&tempnode_lock);
+	return result;
+}
+
+/* Return 1 if NODEID is on the temporary node ID list, 0 otherwise */
+static int is_valid_temp_nodeid(int nodeid)
+{
+	struct temp_node *tn;
+	int valid = 0;
+
+	down(&tempnode_lock);
+	list_for_each_entry(tn, &tempnode_list, list) {
+		if (tn->nodeid == nodeid) {
+			valid = 1;
+			break;
+		}
+	}
+	/* Trace the verdict while still under the lock */
+	P_COMMS("is_valid_temp_nodeid. %d = %d\n", nodeid, valid);
+	up(&tempnode_lock);
+	return valid;
+}
+
+/*
+ * Remove any temp nodeIDs that refer to now-valid cluster members.
+ */
+void purge_temp_nodeids(void)
+{
+	struct temp_node *tn;
+	struct temp_node *tmp;
+	struct cluster_node *node;
+	struct cluster_node_addr *nodeaddr;
+
+	down(&tempnode_lock);
+	down(&cluster_members_lock);
+
+	/*
+	 * The ordering of these nested lists is deliberately
+	 * arranged for the fewest list traversals overall
+	 */
+	/* For each node... */
+	list_for_each_entry(node, &cluster_members_list, list) {
+		if (node->state != NODESTATE_MEMBER)
+			continue;
+
+		/* ...We check the temp node ID list... */
+		list_for_each_entry_safe(tn, tmp, &tempnode_list, list) {
+			/* ...against that node's address */
+			list_for_each_entry(nodeaddr, &node->addr_list, list) {
+				if (memcmp(nodeaddr->addr, tn->addr, tn->addrlen))
+					continue;
+				/* tn is gone once freed: stop comparing it
+				 * against this node's other addresses */
+				list_del(&tn->list);
+				kfree(tn);
+				break;
+			}
+		}
+	}
+	up(&cluster_members_lock);
+	up(&tempnode_lock);
+}
+
+
+/* Quorum device functions */
+/* Create the quorum device pseudo-node NAME carrying VOTES votes. It starts
+ * out NODESTATE_DEAD. Fails with -EBUSY if one is already registered,
+ * -EINVAL if NAME is a known node and -ENOMEM if allocation fails. */
+int kcl_register_quorum_device(char *name, int votes)
+{
+	if (quorum_device != NULL)
+		return -EBUSY;
+	if (find_node_by_name(name) != NULL)
+		return -EINVAL;
+
+	quorum_device = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
+	if (quorum_device == NULL)
+		return -ENOMEM;
+	memset(quorum_device, 0, sizeof (struct cluster_node));
+
+	quorum_device->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
+	if (quorum_device->name == NULL) {
+		kfree(quorum_device);
+		quorum_device = NULL;
+		return -ENOMEM;
+	}
+	strcpy(quorum_device->name, name);
+
+	/* Keep this list valid so it doesn't confuse other code */
+	INIT_LIST_HEAD(&quorum_device->addr_list);
+	quorum_device->state = NODESTATE_DEAD;
+	quorum_device->votes = votes;
+	return 0;
+}
+
+/* Drop the quorum device, freeing the node and name that registration
+ * allocated. Returns -EINVAL if there is none or it is still a member. */
+int kcl_unregister_quorum_device(void)
+{
+	if (!quorum_device || quorum_device->state == NODESTATE_MEMBER)
+		return -EINVAL;
+	kfree(quorum_device->name);
+	kfree(quorum_device);
+	quorum_device = NULL;
+	return 0;
+}
+
+/* Mark the quorum device available (YESNO != 0: DEAD -> MEMBER, hello time
+ * noted) or not (MEMBER -> DEAD). Returns -EINVAL if there is no device. */
+int kcl_quorum_device_available(int yesno)
+{
+	int from = NODESTATE_MEMBER, to = NODESTATE_DEAD;
+
+	if (quorum_device == NULL)
+		return -EINVAL;
+
+	/* Available: note the hello and turn the transition round */
+	if (yesno) {
+		quorum_device->last_hello = jiffies;
+		from = NODESTATE_DEAD;
+		to = NODESTATE_MEMBER;
+	}
+	if (quorum_device->state == from) {
+		quorum_device->state = to;
+		recalculate_quorum(0);
+	}
+	return 0;
+}
+
+/* APIs for cluster ref counting. */
+/* Take a reference on the cluster: pins this module and bumps use_count.
+ * Returns 0 on success, -ENOTCONN if cnxman is not running or the module
+ * reference could not be taken. */
+int kcl_addref_cluster(void)
+{
+	if (!atomic_read(&cnxman_running))
+		return -ENOTCONN;
+
+	/* No module reference, no cluster reference */
+	if (!try_module_get(THIS_MODULE))
+		return -ENOTCONN;
+
+	atomic_inc(&use_count);
+	return 0;
+}
+
+int kcl_releaseref_cluster(void)	/* undoes kcl_addref_cluster() */
+{
+	if (!atomic_read(&cnxman_running))
+		return -ENOTCONN;	/* nothing is released in this case */
+	atomic_dec(&use_count);
+	module_put(THIS_MODULE);
+	return 0;
+}
+
+/* Store a kmalloc'd copy of the cluster name in *CNAME; 0 or -ENOMEM */
+int kcl_cluster_name(char **cname)
+{
+	size_t len = strlen(cluster_name) + 1;	/* includes the NUL */
+	char *copy = kmalloc(len, GFP_KERNEL);
+
+	if (copy == NULL)
+		return -ENOMEM;
+	memcpy(copy, cluster_name, len);
+	*cname = copy;
+	return 0;
+}
+
+int kcl_get_current_interface(void)
+{
+	return current_interface->number;	/* NOTE(review): no NULL check on current_interface */
+}
+
+/* Socket registration stuff: AF_CLUSTER sockets are created by cl_create() */
+static struct net_proto_family cl_family_ops = {
+	.family = AF_CLUSTER,
+	.create = cl_create,
+	.owner  = THIS_MODULE,
+};
+
+static struct proto_ops cl_proto_ops = {
+	.family      = AF_CLUSTER,
+	/* Socket operations; some are filled with the generic sock_no_* stubs */
+	.release     = cl_release,
+	.bind        = cl_bind,
+	.connect     = sock_no_connect,
+	.socketpair  = sock_no_socketpair,
+	.accept      = sock_no_accept,
+	.getname     = cl_getname,
+	.poll        = cl_poll,
+	.ioctl       = cl_ioctl,
+	.listen      = sock_no_listen,
+	.shutdown    = cl_shutdown,
+	.setsockopt  = sock_no_setsockopt,
+	.getsockopt  = sock_no_getsockopt,
+	.sendmsg     = cl_sendmsg,
+	.recvmsg     = cl_recvmsg,
+	.mmap        = sock_no_mmap,
+	.sendpage    = sock_no_sendpage,
+	.owner       = THIS_MODULE,
+};
+
+#ifdef MODULE	/* module metadata, only emitted when MODULE is defined */
+MODULE_DESCRIPTION("Cluster Connection and Service Manager");
+MODULE_AUTHOR("Red Hat, Inc");
+MODULE_LICENSE("GPL");
+#endif /* MODULE */
+
+/* Module initialisation. Locks and lists are set up before the AF_CLUSTER
+ * family is registered, so they are valid before a socket can be created. */
+static int __init cluster_init(void)
+{
+	int err;
+
+	printk("CMAN %s (built %s %s) installed\n",
+	       CMAN_RELEASE_NAME, __DATE__, __TIME__);
+
+	init_MUTEX(&start_thread_sem);
+	init_MUTEX(&send_lock);
+	init_MUTEX(&barrier_list_lock);
+	init_MUTEX(&cluster_members_lock);
+	init_MUTEX(&port_array_lock);
+	init_MUTEX(&messages_list_lock);
+	init_MUTEX(&listenreq_lock);
+	init_MUTEX(&client_socket_lock);
+	init_MUTEX(&event_listener_lock);
+	init_MUTEX(&kernel_listener_lock);
+	init_MUTEX(&tempnode_lock);
+	spin_lock_init(&active_socket_lock);
+	spin_lock_init(&new_dead_node_lock);
+	spin_lock_init(&membership_task_lock);
+	init_timer(&ack_timer);
+	INIT_LIST_HEAD(&event_listener_list);
+	INIT_LIST_HEAD(&kernel_listener_list);
+	INIT_LIST_HEAD(&socket_list);
+	INIT_LIST_HEAD(&client_socket_list);
+	INIT_LIST_HEAD(&active_socket_list);
+	INIT_LIST_HEAD(&barrier_list);
+	INIT_LIST_HEAD(&messages_list);
+	INIT_LIST_HEAD(&listenreq_list);
+	INIT_LIST_HEAD(&cluster_members_list);
+	INIT_LIST_HEAD(&new_dead_node_list);
+	INIT_LIST_HEAD(&tempnode_list);
+	atomic_set(&cnxman_running, 0);
+
+	cluster_sk_cachep = kmem_cache_create("cluster_sock",
+					      sizeof (struct cluster_sock), 0,
+					      SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!cluster_sk_cachep) {
+		printk(KERN_CRIT
+		       "cluster_init: Cannot create cluster_sock SLAB cache\n");
+		return -ENOMEM;
+	}
+
+	err = sock_register(&cl_family_ops);
+	if (err) {
+		printk(KERN_INFO "Unable to register cluster socket type\n");
+		kmem_cache_destroy(cluster_sk_cachep);
+		return err;
+	}
+
+#ifdef CONFIG_PROC_FS
+	create_proc_entries();
+#endif
+	sm_init();
+	return 0;
+}
+
+static void __exit cluster_exit(void)
+{
+#ifdef CONFIG_PROC_FS
+	cleanup_proc_entries();
+#endif
+	/* Unregister the socket family, then destroy the sock slab cache */
+	sock_unregister(AF_CLUSTER);
+	kmem_cache_destroy(cluster_sk_cachep);
+}
+
+module_init(cluster_init);
+module_exit(cluster_exit);
+/* Exported symbols: comms, membership and reference-counting API */
+EXPORT_SYMBOL(kcl_sendmsg);
+EXPORT_SYMBOL(kcl_register_read_callback);
+EXPORT_SYMBOL(kcl_add_callback);
+EXPORT_SYMBOL(kcl_remove_callback);
+EXPORT_SYMBOL(kcl_get_members);
+EXPORT_SYMBOL(kcl_get_member_ids);
+EXPORT_SYMBOL(kcl_get_all_members);
+EXPORT_SYMBOL(kcl_is_quorate);
+EXPORT_SYMBOL(kcl_get_node_by_addr);
+EXPORT_SYMBOL(kcl_get_node_by_name);
+EXPORT_SYMBOL(kcl_get_node_by_nodeid);
+EXPORT_SYMBOL(kcl_get_node_addresses);
+EXPORT_SYMBOL(kcl_addref_cluster);
+EXPORT_SYMBOL(kcl_releaseref_cluster);
+EXPORT_SYMBOL(kcl_cluster_name);
+/* Barrier API */
+EXPORT_SYMBOL(kcl_barrier_register);
+EXPORT_SYMBOL(kcl_barrier_setattr);
+EXPORT_SYMBOL(kcl_barrier_delete);
+EXPORT_SYMBOL(kcl_barrier_wait);
+EXPORT_SYMBOL(kcl_barrier_cancel);
+/* Quorum device API */
+EXPORT_SYMBOL(kcl_register_quorum_device);
+EXPORT_SYMBOL(kcl_unregister_quorum_device);
+EXPORT_SYMBOL(kcl_quorum_device_available);
+/* Service functions, plus the current-interface query */
+EXPORT_SYMBOL(kcl_register_service);
+EXPORT_SYMBOL(kcl_unregister_service);
+EXPORT_SYMBOL(kcl_join_service);
+EXPORT_SYMBOL(kcl_leave_service);
+EXPORT_SYMBOL(kcl_global_service_id);
+EXPORT_SYMBOL(kcl_start_done);
+EXPORT_SYMBOL(kcl_get_services);
+EXPORT_SYMBOL(kcl_get_current_interface);
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
--- linux-2.6.9.orig/cluster/cman/config.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/config.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,51 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "config.h"
+
+/* Config file defaults; timer values are in seconds (scaled by HZ at use) */
+
+#define DEFAULT_JOIN_WAIT_TIME   16	/* Time to wait while sending JOINREQ
+					 * messages. Should be at least twice
+					 * the HELLO timer, probably 3x */
+#define DEFAULT_JOIN_TIMEOUT     30	/* How long we wait after getting a
+					 * JOINACK before regarding that node
+					 * as dead */
+#define DEFAULT_HELLO_TIMER       5	/* Period between HELLO messages */
+#define DEFAULT_DEADNODE_TIMER   21	/* If we don't get a message from a
+					 * node in this period kill it */
+#define DEFAULT_TRANSITION_TIMER 15	/* Maximum time a state transition
+					 * should take */
+#define DEFAULT_JOINCONF_TIMER    5	/* Time allowed to a node to respond to
+					 * a JOINCONF message */
+#define DEFAULT_MAX_NODES       128	/* Max allowed nodes */
+#define DEFAULT_TRANSITION_RESTARTS  10	/* Maximum number of transition
+					 * restarts before we die */
+#define DEFAULT_SM_DEBUG_SIZE	256	/* Size in bytes of SM debug buffer */
+
+#define DEFAULT_NEWCLUSTER_TIMEOUT 16   /* Time to send NEWCLUSTER messages */
+#define DEFAULT_MAX_RETRIES 5		/* Number of times we resend a message */
+
+struct config_info cman_config = {	/* global settings, seeded with the defaults */
+	.joinwait_timeout    = DEFAULT_JOIN_WAIT_TIME,
+	.joinconf_timeout    = DEFAULT_JOINCONF_TIMER,
+	.join_timeout        = DEFAULT_JOIN_TIMEOUT,
+	.hello_timer         = DEFAULT_HELLO_TIMER,
+	.deadnode_timeout    = DEFAULT_DEADNODE_TIMER,
+	.transition_timeout  = DEFAULT_TRANSITION_TIMER,
+	.transition_restarts = DEFAULT_TRANSITION_RESTARTS,
+	.max_nodes           = DEFAULT_MAX_NODES,
+	.sm_debug_size       = DEFAULT_SM_DEBUG_SIZE,
+	.newcluster_timeout  = DEFAULT_NEWCLUSTER_TIMEOUT,
+	.max_retries         = DEFAULT_MAX_RETRIES,
+};
--- linux-2.6.9.orig/cluster/cman/config.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/config.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,33 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CONFIG_DOT_H__
+#define __CONFIG_DOT_H__
+
+struct config_info {
+	int joinwait_timeout;	/* how long to stay in JOINWAIT before giving up */
+	int joinconf_timeout;	/* time a node gets to answer a JOINCONF */
+	int join_timeout;	/* wait for join progress before falling back to JOINWAIT */
+	int hello_timer;	/* period between HELLO messages */
+	int deadnode_timeout;	/* silence after which a node is killed */
+	int transition_timeout;	/* longest a state transition should take */
+	int transition_restarts;	/* transition restarts allowed before we die */
+	int max_nodes;		/* maximum allowed nodes */
+	int sm_debug_size;	/* size in bytes of the SM debug buffer */
+	int newcluster_timeout;	/* how long NEWCLUSTER messages are sent */
+	int max_retries;	/* number of times a message is resent */
+};
+
+extern struct config_info cman_config;
+
+#endif				/* __CONFIG_DOT_H__ */
--- linux-2.6.9.orig/cluster/cman/kjoin.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/kjoin.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,238 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <linux/list.h>
+#include <cluster/cnxman.h>
+#include <linux/in.h>
+
+#include "cnxman-private.h"
+
+static struct socket *mcast_sock;	/* UDP socket bound to the broadcast address */
+static struct socket *recv_sock;	/* UDP socket bound to our own IP address */
+static struct socket *cluster_sock;	/* AF_CLUSTER master socket made by a join */
+
+extern short cluster_id;	/* compared with the id in a join request */
+extern int join_count;	/* bumped when cnxman is already running */
+extern struct semaphore join_count_lock;	/* held around the join_count update */
+extern atomic_t cnxman_running;	/* non-zero: a join only bumps join_count */
+
+/*
+ * Join the cluster from kernel space.  @join_info supplies the cluster id,
+ * our IP address, the broadcast address and the UDP port, and is itself
+ * handed to cnxman with CLU_JOIN_CLUSTER.
+ *
+ * When cnxman is already running only join_count is incremented (-EINVAL if
+ * the cluster id differs).  Otherwise a broadcast and a receive UDP socket
+ * are created, bound, and passed to a new AF_CLUSTER master socket.
+ *
+ * Returns 0 on success, the negative error from socket creation or bind, or
+ * -1 when a setsockopt step fails.  Every failure path releases the sockets
+ * this call created and clears their file-scope pointers, so nothing leaks
+ * and kcl_leave_cluster() never picks up a released cluster_sock.
+ */
+int kcl_join_cluster(struct cl_join_cluster_info *join_info)
+{
+	int result;
+	int one = 1, error;
+	int have_master = 0;	/* cluster_sock was created by this call */
+	unsigned int ipaddr = join_info->ipaddr, brdaddr = join_info->brdaddr;
+	unsigned short port = join_info->port;
+	mm_segment_t fs;
+	struct sockaddr_in saddr;
+	struct kcl_multicast_sock mcast_info;
+
+	down(&join_count_lock);
+	if (atomic_read(&cnxman_running)) {
+		error = 0;
+		if (join_info->cluster_id == cluster_id)
+			join_count++;
+		else
+			error = -EINVAL;
+		up(&join_count_lock);
+		return error;
+	}
+	up(&join_count_lock);
+
+	result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &mcast_sock);
+	if (result < 0) {
+		printk(KERN_ERR CMAN_NAME ": Can't create Multicast socket\n");
+		return result;
+	}
+
+	result = sock_create(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &recv_sock);
+	if (result < 0) {
+		printk(KERN_ERR CMAN_NAME ": Can't create Receive socket\n");
+		/* Only the multicast socket exists at this point */
+		sock_release(mcast_sock);
+		mcast_sock = NULL;
+		return result;
+	}
+
+	/* The option value is a kernel variable, hence set_fs(get_ds()) */
+	fs = get_fs();
+	set_fs(get_ds());
+	error = sock_setsockopt(mcast_sock, SOL_SOCKET, SO_BROADCAST,
+				(void *) &one, sizeof (int));
+	set_fs(fs);
+	if (error) {
+		printk("Error %d Setting master socket to SO_BROADCAST\n",
+		       error);
+		result = -1;
+		goto fail;
+	}
+
+	/* Bind the multicast socket */
+	saddr.sin_family = AF_INET;
+	saddr.sin_port = htons(port);
+	saddr.sin_addr.s_addr = cpu_to_be32(brdaddr);
+	result = mcast_sock->ops->bind(mcast_sock, (struct sockaddr *) &saddr,
+				       sizeof (saddr));
+	if (result < 0) {
+		printk(KERN_ERR CMAN_NAME ": Can't bind multicast socket\n");
+		goto fail;
+	}
+
+	/* Bind the receive socket to our IP address */
+	saddr.sin_family = AF_INET;
+	saddr.sin_port = htons(port);
+	saddr.sin_addr.s_addr = cpu_to_be32(ipaddr);
+	result = recv_sock->ops->bind(recv_sock, (struct sockaddr *) &saddr,
+				      sizeof (saddr));
+	if (result < 0) {
+		printk(KERN_ERR CMAN_NAME ": Can't bind receive socket\n");
+		goto fail;
+	}
+
+	/* Create the cluster master socket */
+	result = sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER,
+			     &cluster_sock);
+	if (result < 0) {
+		printk(KERN_ERR CMAN_NAME
+		       ": Can't create cluster master socket\n");
+		goto fail;
+	}
+	have_master = 1;
+
+	/* Pass the multicast socket to kernel space */
+	mcast_info.sock = mcast_sock;
+	mcast_info.number = 1;
+
+	/* All three setsockopt calls below run under set_fs(get_ds()) */
+	fs = get_fs();
+	set_fs(get_ds());
+
+	error = cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
+					      KCL_SET_MULTICAST,
+					      (void *) &mcast_info,
+					      sizeof (mcast_info));
+	if (error) {
+		printk(CMAN_NAME
+		       ": Unable to pass multicast socket to cnxman, %d\n",
+		       error);
+		goto fail_restore_fs;
+	}
+
+	mcast_info.sock = recv_sock;
+	error = cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
+					      KCL_SET_RCVONLY,
+					      (void *) &mcast_info,
+					      sizeof (mcast_info));
+	if (error) {
+		printk(CMAN_NAME
+		       ": Unable to pass receive socket to cnxman, %d\n",
+		       error);
+		goto fail_restore_fs;
+	}
+
+	/* This setsockopt expects usermode variables */
+	if (cluster_sock->ops->setsockopt(cluster_sock, CLPROTO_MASTER,
+					  CLU_JOIN_CLUSTER,
+					  (void *) join_info,
+					  sizeof (struct cl_join_cluster_info))) {
+		printk(CMAN_NAME ": Unable to join cluster\n");
+		goto fail_restore_fs;
+	}
+	set_fs(fs);
+
+	return 0;
+
+fail_restore_fs:
+	set_fs(fs);
+	result = -1;
+fail:
+	/* Release order: multicast, receive, then the master socket */
+	sock_release(mcast_sock);
+	sock_release(recv_sock);
+	if (have_master) {
+		sock_release(cluster_sock);
+		cluster_sock = NULL;
+	}
+	mcast_sock = recv_sock = NULL;
+	return result;
+}
+
+/*
+ * Leave the cluster; @remove is passed to cnxman with CLU_LEAVE_CLUSTER.
+ * Uses the master socket kept by kcl_join_cluster(), or a temporary one,
+ * and releases it.  Returns 0 or the socket-creation/setsockopt error.
+ */
+int kcl_leave_cluster(int remove)
+{
+	mm_segment_t fs;
+	int rem = remove;
+	int ret = 0;
+	struct socket *shutdown_sock = cluster_sock;
+
+	cluster_sock = NULL;
+
+	if (!shutdown_sock) {
+		/* No socket kept from a join: create a temporary master socket.
+		 * Nothing else to release on failure - mcast_sock/recv_sock are
+		 * NULL or no longer ours by now. */
+		ret = sock_create(AF_CLUSTER, SOCK_DGRAM, CLPROTO_MASTER,
+				  &shutdown_sock);
+		if (ret < 0) {
+			printk(KERN_ERR CMAN_NAME
+			       ": Can't create cluster master socket\n");
+			return ret;
+		}
+	}
+
+	/* The option value is a kernel variable, hence set_fs(get_ds()) */
+	fs = get_fs();
+	set_fs(get_ds());
+	ret = shutdown_sock->ops->setsockopt(shutdown_sock, CLPROTO_MASTER,
+					     CLU_LEAVE_CLUSTER, (void *) &rem,
+					     sizeof (int));
+	set_fs(fs);
+	if (ret)
+		printk(KERN_ERR CMAN_NAME ": Unable to leave cluster, %d\n",
+		       ret);
+
+	sock_release(shutdown_sock);
+
+	return ret;
+}
+
+/* 
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
--- linux-2.6.9.orig/cluster/cman/membership.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/membership.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,3347 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <asm/uaccess.h>
+#include <linux/list.h>
+#include <cluster/cnxman.h>
+
+#include "cnxman-private.h"
+#include "config.h"
+#include "sm_control.h"
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+/* Barrier name for membership transitions. %d is the cluster generation number
+ */
+#define MEMBERSHIP_BARRIER_NAME	"TRANSITION.%d"
+
+/* Variables also used by connection manager */
+struct list_head cluster_members_list;
+struct semaphore cluster_members_lock;
+int cluster_members;		/* Number of ACTIVE members, not a count of
+				 * nodes in the list */
+int we_are_a_cluster_member;
+int cluster_is_quorate;
+int quit_threads;
+struct task_struct *membership_task;
+spinlock_t membership_task_lock;
+struct cluster_node *us;
+
+static struct task_struct *hello_task;
+static struct semaphore hello_task_lock;
+
+/* Variables that belong to the connection manager */
+extern wait_queue_head_t cnxman_waitq;
+extern struct completion member_thread_comp;
+extern struct cluster_node *quorum_device;
+extern unsigned short two_node;
+extern char cluster_name[];
+extern unsigned int config_version;
+extern unsigned int address_length;
+
+static struct socket *mem_socket;
+static pid_t kcluster_pid;
+
+static char iobuf[MAX_CLUSTER_MESSAGE];
+static char scratchbuf[MAX_CLUSTER_MESSAGE + 100];
+
+/* Our node name, usually system_utsname.nodename, but can be overridden */
+char nodename[MAX_CLUSTER_MEMBER_NAME_LEN + 1];
+
+/* Node ID that we want. The default of zero means
+ * it will be allocated by the cluster join mechanism
+ */
+int wanted_nodeid;
+
+static spinlock_t members_by_nodeid_lock;
+static int sizeof_members_array;	/* Can dynamically increase (vmalloc
+					 * permitting) */
+static struct cluster_node **members_by_nodeid;
+
+#define MEMBER_INCREMENT_SIZE 10
+
+static int votes = 1;		/* Votes this node has */
+static int expected_votes = 1;	/* Total expected votes in the cluster */
+static unsigned int quorum;	/* Quorum, fewer votes than this and we stop
+				 * work */
+static int leavereason;		/* Saved for the duration of a state transition */
+static int transitionreason;	/* Reason this transition was initiated */
+static unsigned int highest_nodeid;	/* Highest node ID known to the cluster */
+static struct timer_list transition_timer;	/* Kicks in if the transition
+						 * doesn't complete in a
+						 * reasonable time */
+static struct timer_list hello_timer;	/* Timer to send HELLOs on */
+static unsigned long join_time;	/* The time that we got our JOIN-ACK */
+static unsigned long start_time; /* The time that we were started */
+static int joinconf_count;	/* Number of JOINCONF messages we have sent to
+				 * a new node */
+static unsigned long wake_flags;/* Reason we were woken */
+
+/* Flags in above */
+#define WAKE_FLAG_DEADNODE    1
+#define WAKE_FLAG_TRANSTIMER  2
+
+/* The time the transition finished */
+static unsigned long transition_end_time;
+
+/* A list of nodes that cnxman tells us are dead. I hope this never has more
+ * than one element in it but I can't take that chance. only non-static so it
+ * can be initialised in module_load. */
+struct list_head new_dead_node_list;
+spinlock_t new_dead_node_lock;
+
+static int do_membership_packet(struct msghdr *msg, char *buf, int len);
+static int do_process_joinreq(struct msghdr *msg, char *buf, int len);
+static int do_process_joinack(struct msghdr *msg, char *buf, int len);
+static int do_process_joinconf(struct msghdr *msg, char *buf, int len);
+static int do_process_leave(struct msghdr *msg, char *buf, int len);
+static int do_process_hello(struct msghdr *msg, char *buf, int len);
+static int do_process_kill(struct msghdr *msg, char *buf, int len);
+static int do_process_reconfig(struct msghdr *msg, char *buf, int len);
+static int do_process_starttrans(struct msghdr *msg, char *buf, int len);
+static int do_process_nodedown(struct msghdr *msg, char *buf, int len);
+static int do_process_masterview(struct msghdr *msg, char *buf, int len);
+static int do_process_endtrans(struct msghdr *msg, char *buf, int len);
+static int do_process_viewack(struct msghdr *msg, char *buf, int len);
+static int do_process_startack(struct msghdr *msg, char *buf, int len);
+static int do_process_newcluster(struct msghdr *msg, char *buf, int len);
+static int do_process_nominate(struct msghdr *msg, char *buf, int len);
+static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
+			     unsigned int flags, unsigned int flags2);
+static int send_joinreq(struct sockaddr_cl *addr, int addr_len);
+static int send_startack(struct sockaddr_cl *addr, int addr_len);
+static int send_hello(void);
+static int send_master_hello(void);
+static int send_newcluster(void);
+static int end_transition(void);
+static int dispatch_messages(struct socket *mem_socket);
+static void check_for_dead_nodes(void);
+static void confirm_joiner(void);
+static void reset_hello_time(void);
+static int add_us(void);
+static int send_joinconf(void);
+static int init_membership_services(void);
+static int elect_master(struct cluster_node **, int disallow_node);
+static void trans_timer_expired(unsigned long arg);
+static void hello_timer_expired(unsigned long arg);
+static void join_or_form_cluster(void);
+static int do_timer_wakeup(void);
+static int start_transition(unsigned char reason, struct cluster_node *node);
+static uint32_t low32_of_ip(void);
+static void remove_joiner(int tell_wait);
+int send_leave(unsigned char);
+int send_reconfigure(int, unsigned int);
+
+#ifdef DEBUG_MEMB
+static char *msgname(int msg);
+/* Tracing wrapper: log the message name and length, then really send it */
+static int debug_sendmsg(struct socket *sock, void *buf, int size,
+			 struct sockaddr_cl *caddr, int addr_len,
+			 unsigned int flags)
+{
+	char *cmd_name = msgname(((char *) buf)[0]);
+	P_MEMB("%ld: sending %s, len=%d\n", jiffies, cmd_name, size);
+	return kcl_sendmsg(sock, buf, size, caddr, addr_len, flags);
+}
+#define kcl_sendmsg debug_sendmsg	/* later calls in this file use the wrapper */
+#endif
+
+/* State of the node */
+static enum { STARTING, NEWCLUSTER, JOINING, JOINWAIT, JOINACK, TRANSITION,
+	    TRANSITION_COMPLETE, MEMBER, REJECTED, LEFT_CLUSTER, MASTER
+} node_state = LEFT_CLUSTER;
+
+/* Sub-state when we are MASTER */
+static enum { MASTER_START, MASTER_COLLECT, MASTER_CONFIRM,
+	    MASTER_COMPLETE } master_state;
+
+/* Number of responses collected while a master controlling a state transition */
+static int responses_collected;
+static int responses_expected;
+
+/* Current cluster generation number */
+int cluster_generation = 1;
+
+/* When another node initiates a transition, store its pointer in here so
+ * we can check for other nodes trying to spoof us */
+static struct cluster_node *master_node = NULL;
+
+/* The node that wants to join us */
+static struct cluster_node *joining_node = NULL;
+static int joining_temp_nodeid;
+
+/* Last time a HELLO message was sent */
+unsigned long last_hello;
+
+/* When we got our JOINWAIT or NEWCLUSTER */
+unsigned long joinwait_time;
+
+/* Number of times a transition has restarted when we were master */
+int transition_restarts;
+
+/* Variables used by the master to collect cluster status during a transition */
+static int agreeing_nodes;
+static int dissenting_nodes;
+static uint8_t *node_opinion = NULL;
+#define OPINION_AGREE    1
+#define OPINION_DISAGREE 2
+
+
+/* None of our threads is CPU intensive, but if they don't run when they are
+ * supposed to, the node can get kicked out of the cluster.  So give @tsk the
+ * SCHED_FIFO policy with real-time priority @prio. */
+void cman_set_realtime(struct task_struct *tsk, int prio)
+{
+	tsk->policy = SCHED_FIFO;
+	tsk->rt_priority = prio;
+}
+
+/* Set node id of a node, also add it to the members array and expand the array
+ * if necessary.  A nodeid of zero is ignored; running out of memory for the
+ * larger array panics. */
+static inline void set_nodeid(struct cluster_node *node, int nodeid)
+{
+	if (!nodeid)
+		return;
+
+	node->node_id = nodeid;
+	if (nodeid >= sizeof_members_array) {
+		int new_size = sizeof_members_array + MEMBER_INCREMENT_SIZE;
+		struct cluster_node **new_array;
+
+		/* Slot [nodeid] itself must exist, hence "<=" and not "<" */
+		if (new_size <= nodeid)
+			new_size = nodeid + MEMBER_INCREMENT_SIZE;
+
+		new_array = vmalloc((new_size) * sizeof (struct cluster_node *));
+		if (new_array) {
+			spin_lock(&members_by_nodeid_lock);
+			memcpy(new_array, members_by_nodeid,
+			       sizeof_members_array *
+			       sizeof (struct cluster_node *));
+			memset(&new_array[sizeof_members_array], 0,
+			       (new_size - sizeof_members_array) *
+			       sizeof (struct cluster_node *));
+			vfree(members_by_nodeid);
+
+			members_by_nodeid = new_array;
+			sizeof_members_array = new_size;
+			spin_unlock(&members_by_nodeid_lock);
+		}
+		else {
+			panic("No memory for more nodes");
+		}
+	}
+	notify_kernel_listeners(NEWNODE, (long) nodeid);
+
+	/* The old node may be a failed joiner, in which case we can overwrite it with
+	   the new node (but never free the node we are about to install) */
+	if (members_by_nodeid[nodeid] && members_by_nodeid[nodeid] != node &&
+	    members_by_nodeid[nodeid]->state == NODESTATE_JOINING) {
+		struct cluster_node *stale = members_by_nodeid[nodeid];
+
+		P_MEMB("Removing failed joining node %s (%d)\n",
+		       stale->name, stale->node_id);
+
+		down(&cluster_members_lock);
+		list_del(&stale->list);
+		up(&cluster_members_lock);
+
+		spin_lock(&members_by_nodeid_lock);
+		members_by_nodeid[nodeid] = NULL;
+		spin_unlock(&members_by_nodeid_lock);
+
+		kfree(stale);
+	}
+
+	if (members_by_nodeid[nodeid] &&
+	    members_by_nodeid[nodeid] != node) {
+		printk(KERN_ERR CMAN_NAME ": Attempt to re-add node with id %d\n", nodeid);
+		printk(KERN_ERR CMAN_NAME ": existing node is %s\n", members_by_nodeid[nodeid]->name);
+		printk(KERN_ERR CMAN_NAME ": new node is %s\n", node->name);
+		BUG();
+	}
+
+	spin_lock(&members_by_nodeid_lock);
+	members_by_nodeid[nodeid] = node;
+	spin_unlock(&members_by_nodeid_lock);
+}
+
+/* Heartbeat thread: check for dead nodes (while MEMBER), sleep, send a HELLO. */
+static int hello_kthread(void *unused)
+{
+	struct task_struct *tsk = current;
+	sigset_t tmpsig;
+
+	daemonize("cman_hbeat");
+
+	/* Block everything but SIGKILL/SIGSTOP/SIGTERM.  NOTE(review): siginitset()
+	 * takes a bit mask, not OR-ed signal numbers - confirm this does that. */
+	siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
+	sigprocmask(SIG_BLOCK, &tmpsig, NULL);
+
+	down(&hello_task_lock);
+	hello_task = tsk;	/* lets membership_kthread wake us when it exits */
+	up(&hello_task_lock);
+
+	mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
+	cman_set_realtime(current, 1);
+
+	while (node_state != REJECTED && node_state != LEFT_CLUSTER &&
+	       quit_threads == 0) {
+		/* Scan the nodes list for dead nodes */
+		if (node_state == MEMBER)
+			check_for_dead_nodes();
+
+		set_task_state(current, TASK_INTERRUPTIBLE);
+		schedule();	/* sleep until something wakes this task */
+		set_task_state(current, TASK_RUNNING);
+
+		if (node_state != REJECTED && node_state != LEFT_CLUSTER)
+			send_hello();
+	}
+	if (timer_pending(&hello_timer))
+		del_timer(&hello_timer);
+
+	down(&hello_task_lock);
+	hello_task = NULL;	/* no heartbeat thread left to wake */
+	up(&hello_task_lock);
+	P_MEMB("heartbeat closing down\n");
+	return 0;
+}
+
+/* Drain new_dead_node_list; the lock is dropped around a_node_just_died(). */
+static void process_dead_nodes(void)
+{
+	struct list_head *pos, *next;
+
+	spin_lock(&new_dead_node_lock);
+	list_for_each_safe(pos, next, &new_dead_node_list) {
+		struct cl_new_dead_node *entry =
+		    list_entry(pos, struct cl_new_dead_node, list);
+
+		list_del(&entry->list);
+		if (entry->node->state == NODESTATE_MEMBER) {
+			spin_unlock(&new_dead_node_lock);
+			a_node_just_died(entry->node);
+			spin_lock(&new_dead_node_lock);
+		}
+		kfree(entry);
+	}
+	spin_unlock(&new_dead_node_lock);
+}
+
+/* This is the membership "daemon". A client of cnxman (but symbiotic with it)
+ * that keeps track of and controls cluster membership. */
+static int membership_kthread(void *unused)
+{
+	struct task_struct *tsk = current;
+	sigset_t tmpsig;
+
+	daemonize("cman_memb");
+
+	/* Block everything but SIGKILL/SIGSTOP/SIGTERM.  NOTE(review): siginitset()
+	 * takes a bit mask, not OR-ed signal numbers - confirm this does that. */
+	siginitset(&tmpsig, SIGKILL | SIGSTOP | SIGTERM);
+	sigprocmask(SIG_BLOCK, &tmpsig, NULL);
+
+	spin_lock(&membership_task_lock);
+	membership_task = tsk;
+	spin_unlock(&membership_task_lock);
+	cman_set_realtime(current, 1);
+
+	/* Open the socket */
+	if (init_membership_services())
+		return -1;	/* NOTE(review): membership_task stays set on this path */
+
+	add_us();	/* return value is not checked */
+	joining_node = us;
+
+	init_timer(&hello_timer);
+	hello_timer.function = hello_timer_expired;
+	hello_timer.data = 0L;
+
+	/* Do joining stuff */
+	join_or_form_cluster();
+
+	transition_end_time = jiffies;
+
+	/* Main loop */
+	while (node_state != REJECTED && node_state != LEFT_CLUSTER && !quit_threads) {
+		struct task_struct *tsk = current;
+
+		DECLARE_WAITQUEUE(wait, tsk);
+
+		tsk->state = TASK_INTERRUPTIBLE;
+		add_wait_queue(mem_socket->sk->sk_sleep, &wait);
+
+		/* Only sleep if there is neither a queued message nor a wake flag */
+		if (!skb_peek(&mem_socket->sk->sk_receive_queue) &&
+		    wake_flags == 0) {
+			if (node_state == JOINACK ||
+			    node_state == JOINWAIT)
+				schedule_timeout(HZ);	/* bounded, so the join timeouts below get checked */
+			else
+				schedule();
+		}
+
+		tsk->state = TASK_RUNNING;
+		remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
+
+		/* Are we being shut down? */
+		if (node_state == LEFT_CLUSTER || quit_threads ||
+		    signal_pending(current))
+			break;
+
+		/* Were we woken by a dead node passed down from cnxman ? */
+		if (test_and_clear_bit(WAKE_FLAG_DEADNODE, &wake_flags)) {
+			process_dead_nodes();
+		}
+
+		/* Process received messages. If dispatch_messages() returns an
+		 * error then we shut down */
+		if (skb_peek(&mem_socket->sk->sk_receive_queue)) {
+			if (dispatch_messages(mem_socket) < 0)
+				goto leave_cluster;
+		}
+
+		/* Messages may cause us to quit */
+		if (quit_threads)
+			goto leave_cluster;
+
+		/* Check this again here, in case nodes die while we're doing stuff */
+		if (test_and_clear_bit(WAKE_FLAG_DEADNODE, &wake_flags)) {
+			process_dead_nodes();
+		}
+
+		/* Were we woken by the transition timer firing ? */
+		if (test_and_clear_bit(WAKE_FLAG_TRANSTIMER, &wake_flags)) {
+			switch (do_timer_wakeup()) {
+			case -1:
+				continue;	/* back to the top of the main loop */
+			case 0:
+				break;		/* carry on with the checks below */
+			case +1:
+				goto leave_cluster;
+			}
+		}
+
+		/* Got a JOINACK but no JOIN-CONF, start waiting for HELLO
+		 * messages again */
+		if (node_state == JOINACK &&
+		    time_after(jiffies,
+			       join_time + cman_config.join_timeout * HZ)) {
+			P_MEMB
+			    ("Waited a long time for a join-conf, going back to JOINWAIT state\n");
+			node_state = JOINWAIT;
+			joinwait_time = jiffies;
+		}
+
+		/* Have we had an ACK for our JOINREQ message ? */
+		if (node_state == JOINING &&
+		    time_after(jiffies,
+			       join_time + cman_config.join_timeout * HZ)) {
+			P_MEMB("didn't get JOINACK, going back to JOINWAIT\n");
+			node_state = JOINWAIT;
+			joinwait_time = jiffies;
+		}
+
+		/* Have we been in joinwait for too long... */
+		if (node_state == JOINWAIT &&
+		    time_after(jiffies,
+			       joinwait_time + cman_config.joinwait_timeout * HZ)) {
+			printk(KERN_WARNING CMAN_NAME
+			       ": Been in JOINWAIT for too long - giving up\n");
+			goto leave_cluster;
+		}
+	}
+
+      leave_cluster:
+
+	/* Wake up the heartbeat thread so it can exit */
+	down(&hello_task_lock);
+	if (hello_task)
+		wake_up_process(hello_task);
+	up(&hello_task_lock);
+
+	if (timer_pending(&transition_timer))
+		del_timer(&transition_timer);
+
+	node_state = LEFT_CLUSTER;
+	P_MEMB("closing down\n");
+	quit_threads = 1;	/* force other thread to exit too */
+
+	send_leave(us->leave_reason);	/* sent before the socket is released */
+	sock_release(mem_socket);
+	highest_nodeid = 0;
+	joining_node = NULL;
+	master_node = NULL;
+	complete(&member_thread_comp);	/* lets a waiter on this completion proceed */
+
+	spin_lock(&membership_task_lock);
+	membership_task = NULL;
+	spin_unlock(&membership_task_lock);
+	return 0;
+}
+
+/* Things to do in the main thread when the transition timer has woken us.
+ * Usually this happens when a transition is taking too long and we need to
+ * take remedial action.
+ *
+ * returns: -1 continue the main loop; 0 carry on processing; +1 leave cluster */
+static int do_timer_wakeup()
+{
+	P_MEMB("Timer wakeup - checking for dead master node %ld\n", jiffies);
+
+	/* Resend JOINCONF if it got lost on the wire */
+	if (node_state == MASTER && master_state == MASTER_CONFIRM) {
+		mod_timer(&transition_timer,
+			  jiffies + cman_config.joinconf_timeout * HZ);
+		if (++joinconf_count < cman_config.max_retries) {
+			P_MEMB("Resending JOINCONF\n");
+			send_joinconf();
+		}
+		else {
+			P_MEMB("JOINCONF not acked, removing node\n");
+			joining_node->state = NODESTATE_DEAD;	/* NOTE(review): assumes joining_node is non-NULL */
+			start_transition(TRANS_REMNODE, joining_node);
+			remove_joiner(1);
+			joining_node = NULL;
+		}
+		return -1;
+	}
+
+	/* A joining node probably died */
+	if (cluster_members == 1) {
+		end_transition();
+		return -1;
+	}
+
+	/* See if the master is still there */
+	if (node_state == TRANSITION || node_state == TRANSITION_COMPLETE) {
+
+		/* If we are in transition and master_node is NULL then we are
+		 * waiting for ENDTRANS after JOIN-CONF */
+		if (!master_node) {
+			/* Hmmm. master died after sending JOINCONF, we'll have
+			 * to die as we are in mid-transition */
+			printk(KERN_INFO CMAN_NAME
+			       ": Master died after JOINCONF, we must leave the cluster\n");
+			quit_threads = 1;
+			return +1;
+		}
+
+		/* No messages from the master - see if it's still there */
+		if (master_node->state == NODESTATE_MEMBER) {
+			send_master_hello();
+			mod_timer(&transition_timer,
+				  jiffies +
+				  cman_config.transition_timeout * HZ);
+		}
+
+		/* If the master is dead then elect a new one */
+		if (master_node->state == NODESTATE_DEAD) {
+
+			struct cluster_node *node;
+
+			P_MEMB("Master node is dead...Election!\n");
+			if (elect_master(&node, 0)) {
+
+				/* We are master now, all kneel */
+				master_node->leave_reason = CLUSTER_LEAVEFLAG_NORESPONSE;
+				start_transition(TRANS_DEADMASTER, master_node);
+			}
+			else {
+				/* Leave the job to someone on more pay */
+				master_node = node;	/* the node elect_master() handed back */
+				mod_timer(&transition_timer,
+					  jiffies +
+					  cman_config.transition_timeout * HZ);
+			}
+		}
+	}
+
+	/* If we are the master node then restart the transition */
+	if (node_state == MASTER) {
+		start_transition(TRANS_RESTART, us);
+	}
+
+	return 0;
+}
+
+/* Become a member of a brand-new cluster: mark ourselves MEMBER, take the
+ * wanted node id (or 1 if none was asked for), recalculate quorum, pass the
+ * quorate flag to sm_member_update(), send a HELLO, start the heartbeat. */
+static void form_cluster(void)
+{
+	printk(KERN_INFO CMAN_NAME ": forming a new cluster\n");
+	node_state = MEMBER;
+	we_are_a_cluster_member = TRUE;
+	us->state = NODESTATE_MEMBER;
+	set_nodeid(us, wanted_nodeid ? wanted_nodeid : 1);
+	recalculate_quorum(0);
+	sm_member_update(cluster_is_quorate);
+	send_hello();
+	kernel_thread(hello_kthread, NULL, 0);
+}
+
+/* Wait on the membership socket for at most a fifth of the joinwait timeout
+ * (no wait at all if something is already queued), then hand every queued
+ * message to dispatch_messages().  Shared by both waiting phases of
+ * join_or_form_cluster(). */
+static void wait_for_join_messages(void)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	set_task_state(current, TASK_INTERRUPTIBLE);
+	add_wait_queue(mem_socket->sk->sk_sleep, &wait);
+
+	if (!skb_peek(&mem_socket->sk->sk_receive_queue))
+		schedule_timeout((cman_config.joinwait_timeout * HZ) / 5);
+
+	set_task_state(current, TASK_RUNNING);
+	remove_wait_queue(mem_socket->sk->sk_sleep, &wait);
+
+	while (skb_peek(&mem_socket->sk->sk_receive_queue))
+		dispatch_messages(mem_socket);
+}
+
+/* This does the initial JOIN part of the membership process. Actually most of
+ * it is done in the message processing routines but this is the main loop that
+ * controls it. The side-effect of this routine is "node_state" which tells the
+ * real main loop (in the kernel thread routine) what to do next.
+ *
+ * Phase 1: listen for up to joinwait_timeout while node_state is STARTING.
+ * Phase 2: if still STARTING, switch to NEWCLUSTER and send NEWCLUSTER
+ *          messages for up to newcluster_timeout; if message processing
+ *          puts us back to STARTING, phase 1 is restarted.
+ * Phase 3: if still NEWCLUSTER, form a new cluster ourselves; otherwise just
+ *          note the time in last_hello.
+ *
+ * quit_threads being set during either wait forces node_state to LEFT_CLUSTER. */
+static void join_or_form_cluster()
+{
+	printk(KERN_INFO CMAN_NAME
+	       ": Waiting to join or form a Linux-cluster\n");
+
+ restart_joinwait:
+	/* (Re)start the clocks for the listening phase */
+	join_time = 0;
+	start_time = jiffies;
+	joinwait_time = jiffies;
+	last_hello = 0;
+
+	/* Listen for HELLO or NEWCLUSTER messages */
+	do {
+		wait_for_join_messages();
+
+		if (quit_threads)
+			node_state = LEFT_CLUSTER;
+	} while (node_state == STARTING &&
+		 time_before(jiffies,
+			     start_time + cman_config.joinwait_timeout * HZ));
+
+	/* Still STARTING after listening: restart the clocks for NEWCLUSTER */
+	if (node_state == STARTING) {
+		start_time = jiffies;
+		joinwait_time = jiffies;
+		node_state = NEWCLUSTER;
+	}
+
+	/* If we didn't hear any HELLO messages then start sending NEWCLUSTER messages */
+	while (node_state == NEWCLUSTER &&
+	       time_before(jiffies,
+			   start_time + cman_config.newcluster_timeout * HZ)) {
+		/* Announce ourselves, then wait for and process any replies */
+		send_newcluster();
+		wait_for_join_messages();
+
+		/* Did we get a lower "NEWCLUSTER" message ? */
+		if (node_state == STARTING) {
+			P_MEMB("NEWCLUSTER: restarting joinwait\n");
+			goto restart_joinwait;
+		}
+
+		if (quit_threads)
+			node_state = LEFT_CLUSTER;
+	}
+
+	/* If we didn't hear any HELLO messages then form a new cluster */
+	if (node_state == NEWCLUSTER)
+		form_cluster();
+	else
+		last_hello = jiffies;
+}
+
+/* Remember @cluster_pid, set up the transition timer, clear wake_flags and
+ * start the membership thread; returns kernel_thread()'s result. */
+int start_membership_services(pid_t cluster_pid)
+{
+	wake_flags = 0L;
+	kcluster_pid = cluster_pid;
+
+	init_timer(&transition_timer);
+	transition_timer.data = 0L;
+	transition_timer.function = trans_timer_expired;
+	return kernel_thread(membership_kthread, NULL, 0);
+}
+
+/* Open and bind the membership socket; returns 0 or a negative error. */
+static int init_membership_services()
+{
+	int result;
+	struct sockaddr_cl saddr;
+	struct socket *sock;
+
+	init_MUTEX(&hello_task_lock);
+	result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
+	if (result < 0) {
+		printk(KERN_ERR CMAN_NAME
+		       ": Can't create cluster socket for membership services\n");
+		return result;
+	}
+	mem_socket = sock;
+
+	/* Bind to our port */
+	saddr.scl_family = AF_CLUSTER;
+	saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
+	result =
+	    sock->ops->bind(sock, (struct sockaddr *) &saddr, sizeof (saddr));
+	if (result < 0) {
+		printk(KERN_ERR CMAN_NAME
+		       ": Can't bind to cluster membership services port\n");
+		sock_release(sock);
+		mem_socket = NULL;	/* don't keep a pointer to the released socket */
+		return result;
+	}
+	node_state = STARTING;	/* join_or_form_cluster() listens while STARTING */
+	return 0;
+}
+
+static int send_joinconf()
+{
+	struct sockaddr_cl dest;
+	int rc;
+
+	/* JOINCONF goes to the joiner's temporary node ID; without one we cannot proceed */
+	if (!joining_temp_nodeid) {
+		printk(KERN_DEBUG CMAN_NAME ": Failed to join node '%s'\n",
+		       joining_node ? joining_node->name : "unknown");
+		remove_joiner(0);
+		return -1;
+	}
+
+	/* Enter MASTER_CONFIRM and send our cluster view to the joiner as a JOINCONF */
+	master_state = MASTER_CONFIRM;
+	dest.scl_family = AF_CLUSTER;
+	dest.scl_port = CLUSTER_PORT_MEMBERSHIP;
+	dest.scl_nodeid = joining_temp_nodeid;
+	rc = send_cluster_view(CLUSTER_MEM_JOINCONF, &dest, MSG_NOACK, 0);
+	if (rc < 0)
+		printk(KERN_WARNING CMAN_NAME ": Error %d sending JOINCONF\n", rc);
+
+	return rc;
+}
+
+static int send_joinreq(struct sockaddr_cl *addr, int addr_len)
+{
+	char *msgbuf = scratchbuf;
+	struct list_head *addrlist;
+	int ptr = sizeof (struct cl_mem_join_msg);
+	unsigned short num_addr = 0;
+	struct cluster_node_addr *nodeaddr;
+	struct cl_mem_join_msg *msg = (struct cl_mem_join_msg *) msgbuf;
+	/* Build a JOINREQ in scratchbuf: fixed header, then our addresses, then our name */
+	msg->cmd = CLUSTER_MEM_JOINREQ;
+	msg->votes = votes;
+	msg->expected_votes = cpu_to_le32(expected_votes);
+	msg->nodeid         = cpu_to_le32(wanted_nodeid);
+	msg->major_version  = cpu_to_le32(CNXMAN_MAJOR_VERSION);
+	msg->minor_version  = cpu_to_le32(CNXMAN_MINOR_VERSION);
+	msg->patch_version  = cpu_to_le32(CNXMAN_PATCH_VERSION);
+	msg->config_version = cpu_to_le32(config_version);
+	msg->addr_len       = cpu_to_le32(address_length);
+	strcpy(msg->clustername, cluster_name);	/* NOTE(review): unbounded copy - confirm it fits */
+
+	/* Append each of our addresses, address_length bytes apiece, after the header */
+	list_for_each(addrlist, &us->addr_list) {
+		nodeaddr = list_entry(addrlist, struct cluster_node_addr, list);
+
+		memcpy(msgbuf + ptr, nodeaddr->addr, address_length);
+		ptr += address_length;
+		num_addr++;
+	}
+	msg->num_addr = cpu_to_le16(num_addr);
+
+	/* And our name, including its terminating NUL */
+	strcpy(msgbuf + ptr, nodename);
+	ptr += strlen(nodename) + 1;
+
+	return kcl_sendmsg(mem_socket, msgbuf, ptr,
+			   addr, addr_len, MSG_NOACK);
+}
+
+static int send_startack(struct sockaddr_cl *addr, int addr_len)
+{
+	struct cl_mem_startack_msg ack;
+	/* STARTACK carries our current cluster generation, little-endian on the wire */
+	ack.generation = cpu_to_le32(cluster_generation);
+	ack.cmd = CLUSTER_MEM_STARTACK;
+
+	return kcl_sendmsg(mem_socket, &ack, sizeof (ack), addr, addr_len, MSG_REPLYEXP);
+}
+
+static int send_newcluster()
+{
+	/* Wire format: one command byte followed by a little-endian 32-bit value */
+	char packet[1 + sizeof(uint32_t)];
+	uint32_t low_le = cpu_to_le32(low32_of_ip());
+
+	packet[0] = CLUSTER_MEM_NEWCLUSTER;
+	memcpy(packet + 1, &low_le, sizeof(low_le));
+
+	/* No destination address is given (NULL, 0); MSG_NOACK is the only send flag */
+	return kcl_sendmsg(mem_socket, packet, sizeof(packet),
+			   NULL, 0, MSG_NOACK);
+}
+
+static int send_hello()
+{
+	struct cl_mem_hello_msg hello;
+	int rc;
+
+	/* Advertise member count, generation and whether we are quorate */
+	hello.flags = 0;
+	if (cluster_is_quorate)
+		hello.flags = HELLO_FLAG_QUORATE;
+	hello.cmd = CLUSTER_MEM_HELLO;
+	hello.generation = cpu_to_le32(cluster_generation);
+	hello.members = cpu_to_le16(cluster_members);
+
+	/* No destination address (NULL, 0); sent with MSG_NOACK | MSG_ALLINT */
+	rc = kcl_sendmsg(mem_socket, &hello, sizeof(hello), NULL, 0, MSG_NOACK | MSG_ALLINT);
+	last_hello = jiffies;	/* stamped whether or not the send succeeded */
+	return rc;
+}
+
+/* Send a HELLO flagged HELLO_FLAG_MASTER straight to the master, which must ACK
+ * it (send flags are 0 rather than MSG_NOACK). Nodes in transition use this to
+ * check it is alive; if it does not ACK, cnxman signals it dead and we restart */
+static int send_master_hello()
+{
+	struct cl_mem_hello_msg hello_msg;
+	int status;
+	struct sockaddr_cl saddr;
+
+	hello_msg.cmd = CLUSTER_MEM_HELLO;
+	hello_msg.members = cpu_to_le16(cluster_members);
+	hello_msg.flags = HELLO_FLAG_MASTER |
+		          (cluster_is_quorate ? HELLO_FLAG_QUORATE : 0);
+	hello_msg.generation = cpu_to_le32(cluster_generation);
+	/* NOTE(review): master_node is dereferenced unchecked - presumably always set here */
+	saddr.scl_family = AF_CLUSTER;
+	saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
+	saddr.scl_nodeid = master_node->node_id;
+
+	status = kcl_sendmsg(mem_socket, &hello_msg,
+			     sizeof(struct cl_mem_hello_msg),
+			     &saddr, sizeof (saddr), 0);
+
+	last_hello = jiffies;	/* updated regardless of the send status */
+
+	return status;
+}
+
+/* transition_timer callback: the timer has expired, meaning we sent a transition
+ * message that was not ACKed. 'arg' is not used by this function. */
+static void trans_timer_expired(unsigned long arg)
+{
+	P_MEMB("Transition timer fired %ld\n", jiffies);
+	/* Record why we are waking the membership thread, then wake it */
+	set_bit(WAKE_FLAG_TRANSTIMER, &wake_flags);
+	wake_up_process(membership_task);
+}
+
+static void hello_timer_expired(unsigned long arg)
+{
+	P_MEMB("Hello timer fired %ld\n", jiffies);
+
+	/* Periodic timer: re-arm ourselves for the next hello interval first */
+	mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
+
+	/* Only poke the hello thread once node_state has reached TRANSITION or later */
+	if (node_state >= TRANSITION && !wake_up_process(hello_task))
+		P_MEMB("Failed to wake up hello thread\n");
+}
+
+static int wait_for_completion_barrier(void)
+{
+	int status;
+	char barriername[MAX_BARRIER_NAME_LEN];
+	/* Returns -1 if the barrier cannot be registered, else kcl_barrier_wait()'s status */
+	/* One barrier name per generation; snprintf keeps it inside barriername[] */
+	snprintf(barriername, sizeof(barriername), MEMBERSHIP_BARRIER_NAME, cluster_generation);
+	/* Make sure we all complete together */
+	P_MEMB("Waiting for completion barrier: %d members\n", cluster_members);
+	status = kcl_barrier_register(barriername, 0, cluster_members);
+	if (status < 0) {
+		printk(KERN_ERR CMAN_NAME ": Error registering barrier: %d\n", status);
+		return -1;
+	}
+	kcl_barrier_setattr(barriername, BARRIER_SETATTR_TIMEOUT,
+			    cman_config.transition_timeout);
+	status = kcl_barrier_wait(barriername);
+	kcl_barrier_delete(barriername);	/* always removed, whatever the wait returned */
+
+	P_MEMB("Completion barrier reached : status = %d\n", status);
+	return status;
+}
+
+/* Called at the end of a state transition when we are the master. Always returns 0 */
+static int end_transition()
+{
+	struct cl_mem_endtrans_msg msg;
+	int total_votes;
+	int status;
+
+	/* Cancel the timer */
+	del_timer(&transition_timer);
+
+	confirm_joiner();
+	/* Recompute quorum; it may only go down when leavereason is non-zero */
+	quorum = calculate_quorum(leavereason, leavereason?cluster_members:0, &total_votes);
+	/* Announce ENDTRANS (no destination address given) under a bumped generation number */
+	msg.cmd = CLUSTER_MEM_ENDTRANS;
+	msg.quorum = cpu_to_le32(quorum);
+	msg.generation = cpu_to_le32(++cluster_generation);
+	msg.total_votes = cpu_to_le32(total_votes);
+	if (joining_node && transitionreason == TRANS_NEWNODE) {
+		msg.new_node_id = cpu_to_le32(joining_node->node_id);
+	}
+	else {
+		msg.new_node_id = 0;
+	}
+	status = kcl_sendmsg(mem_socket, &msg, sizeof (msg), NULL, 0, 0);	/* NOTE(review): status is never checked */
+
+	/* When that's all settled down, do the transition completion barrier */
+	kcl_wait_for_all_acks();
+
+	/* We check this below too, but this can save us 3 seconds in a transition */
+	if (test_bit(WAKE_FLAG_DEADNODE, &wake_flags)) {
+		P_MEMB("Node died during ACK collection - restart\n");
+		remove_joiner(0);
+		return 0;
+	}
+
+	if (wait_for_completion_barrier() != 0) {
+		P_MEMB("Barrier timed out - restart\n");
+
+		/* Restart here only if no node death is flagged (WAKE_FLAG_DEADNODE clear) */
+		if (!test_bit(WAKE_FLAG_DEADNODE, &wake_flags)) {
+			remove_joiner(0);
+			start_transition(TRANS_RESTART, us);
+		}
+		return 0;
+	}
+	/* Transition complete: drop joiner bookkeeping, update quorate state, go back to MEMBER */
+	joining_temp_nodeid = 0;
+	joining_node = NULL;
+	purge_temp_nodeids();
+
+	set_quorate(total_votes);
+
+	notify_listeners();
+	reset_hello_time();
+
+	/* Tell any waiting barriers that we had a transition */
+	check_barrier_returns();
+
+	leavereason = 0;
+	transitionreason = TRANS_NONE;
+	node_state = MEMBER;
+	transition_end_time = jiffies;
+
+	sm_member_update(cluster_is_quorate);
+
+	return 0;
+}
+
+int send_reconfigure(int param, unsigned int value)
+{
+	char msgbuf[66];
+	struct cl_mem_reconfig_msg *reconf = (struct cl_mem_reconfig_msg *) msgbuf;
+	/* An EXPECTED_VOTES change may lower our local expected_votes, never raise it */
+	if (param == RECONFIG_PARAM_EXPECTED_VOTES && value < expected_votes)
+		expected_votes = value;
+
+	reconf->value = cpu_to_le32(value);
+	reconf->param = param;
+	reconf->cmd = CLUSTER_MEM_RECONFIG;
+
+	/* Only the struct-sized prefix of msgbuf is sent; no destination, flags 0 */
+	return kcl_sendmsg(mem_socket, msgbuf, sizeof (*reconf), NULL, 0, 0);
+}
+
+static int send_joinack(char *addr, int addr_len, unsigned char acktype)
+{
+	struct sockaddr_cl *dest = (struct sockaddr_cl *)addr;
+	struct cl_mem_joinack_msg ack;
+
+	/* Reply carrying the JOINACK command and the caller-supplied acktype */
+	ack.acktype = acktype;
+	ack.cmd = CLUSTER_MEM_JOINACK;
+	return kcl_sendmsg(mem_socket, &ack, sizeof (ack), dest, addr_len, MSG_NOACK);
+}
+
+/* Only send a leave message to one node in the cluster so that it can master
+ * the state transition, otherwise we get a "thundering herd" of potential
+ * masters fighting it out. Returns 0, or the -ve send error (we then stay put) */
+int send_leave(unsigned char flags)
+{
+	unsigned char msg[2];
+	struct sockaddr_cl saddr;
+	struct cluster_node *node = NULL;
+	int status;
+
+	if (!mem_socket)
+		return 0;
+
+	saddr.scl_family = AF_CLUSTER;
+	saddr.scl_port = CLUSTER_PORT_MEMBERSHIP;
+
+	/* If we are in transition then use the current master */
+	if (node_state == TRANSITION) {
+		node = master_node;
+	}
+	if (!node) {
+		/* If we are the master or not in transition then pick a node
+		 * almost at random */
+		struct list_head *nodelist;
+
+		down(&cluster_members_lock);
+		list_for_each(nodelist, &cluster_members_list) {
+			node = list_entry(nodelist, struct cluster_node, list);
+			if (node->state == NODESTATE_MEMBER && !node->us)
+				break;
+			node = NULL;	/* not eligible: must not survive the loop */
+		}
+		up(&cluster_members_lock);
+	}
+
+	/* If no other live member was found we are alone - there is no-one to tell */
+	if (node && !node->us) {
+		saddr.scl_nodeid = node->node_id;
+
+		P_MEMB("Sending LEAVE to %s\n", node->name);
+		msg[0] = CLUSTER_MEM_LEAVE;
+		msg[1] = flags;
+		status = kcl_sendmsg(mem_socket, msg, 2,
+				     &saddr, sizeof (saddr),
+				     MSG_NOACK);
+		if (status < 0)
+			return status;
+	}
+
+	/* And exit */
+	node_state = LEFT_CLUSTER;
+	wake_up_process(membership_task);
+	return 0;
+}
+
+int send_kill(int nodeid, int needack)
+{
+	struct sockaddr_cl target;
+	char cmd = CLUSTER_MEM_KILL;
+	/* MSG_NOACK is set unless the caller asks for an ACK (needack != 0) */
+	int sendflags = needack ? 0 : MSG_NOACK;
+
+	/* Address the membership port on the node being killed */
+	target.scl_nodeid = nodeid;
+	target.scl_port = CLUSTER_PORT_MEMBERSHIP;
+	target.scl_family = AF_CLUSTER;
+	return kcl_sendmsg(mem_socket, &cmd, 1, &target, sizeof (target), sendflags);
+}
+
+/* Tell the rest of the cluster that node 'nodeid' has gone down, and why
+ * ('reason'). Returns whatever kcl_sendmsg() returns. */
+static int send_nodedown(int nodeid, unsigned char reason)
+{
+	struct cl_mem_nodedown_msg note;
+
+	note.cmd = CLUSTER_MEM_NODEDOWN;
+	note.nodeid = cpu_to_le32(nodeid);
+	note.reason = reason;
+
+	/* No destination address is supplied and no send flags are set */
+	return kcl_sendmsg(mem_socket, (char *)&note, sizeof(note), NULL, 0, 0);
+}
+
+/* Process a message: dispatch one membership packet on its first byte (the command) */
+static int do_membership_packet(struct msghdr *msg, char *buf, int len)
+{
+	int result = -1;
+	struct sockaddr_cl *saddr = msg->msg_name;
+	struct cluster_node *node;
+	/* The sender is looked up only for the debug trace below; node may be NULL */
+	node = find_node_by_nodeid(saddr->scl_nodeid);
+
+	P_MEMB("got membership message : %s, from (%d) %s, len = %d\n",
+	       msgname(*buf), saddr->scl_nodeid, node ? node->name : "unknown", len);
+	/* Several handlers are gated on our state; result stays -1 unless a handler sets it */
+	switch (*buf) {
+	case CLUSTER_MEM_JOINREQ:
+		result = do_process_joinreq(msg, buf, len);
+		break;
+
+	case CLUSTER_MEM_LEAVE:
+		if (we_are_a_cluster_member)
+			result = do_process_leave(msg, buf, len);
+		break;
+
+	case CLUSTER_MEM_HELLO:
+		result = do_process_hello(msg, buf, len);
+		break;
+
+	case CLUSTER_MEM_KILL:
+		if (we_are_a_cluster_member)
+			result = do_process_kill(msg, buf, len);
+		break;
+
+	case CLUSTER_MEM_JOINCONF:
+		if (node_state == JOINACK) {
+			do_process_joinconf(msg, buf, len);
+		}
+		break;
+
+	case CLUSTER_MEM_CONFACK:
+		if (node_state == MASTER && master_state == MASTER_CONFIRM) {
+			end_transition();
+		}
+		break;
+
+	case CLUSTER_MEM_MASTERVIEW:
+		if (node_state == TRANSITION)
+			do_process_masterview(msg, buf, len);
+		break;
+
+	case CLUSTER_MEM_JOINACK:
+		if (node_state == JOINING || node_state == JOINWAIT) {
+			do_process_joinack(msg, buf, len);
+		}
+		break;
+	case CLUSTER_MEM_RECONFIG:
+		if (we_are_a_cluster_member) {
+			do_process_reconfig(msg, buf, len);
+		}
+		break;
+
+	case CLUSTER_MEM_STARTTRANS:
+		result = do_process_starttrans(msg, buf, len);
+		break;
+
+	case CLUSTER_MEM_NODEDOWN:
+		result = do_process_nodedown(msg, buf, len);
+		break;
+
+	case CLUSTER_MEM_ENDTRANS:
+		result = do_process_endtrans(msg, buf, len);
+		break;
+
+	case CLUSTER_MEM_VIEWACK:
+		if (node_state == MASTER && master_state == MASTER_COLLECT)
+			result = do_process_viewack(msg, buf, len);
+		break;
+
+	case CLUSTER_MEM_STARTACK:
+		if (node_state == MASTER)
+			result = do_process_startack(msg, buf, len);
+		break;
+
+	case CLUSTER_MEM_NEWCLUSTER:
+		result = do_process_newcluster(msg, buf, len);
+		break;
+
+	case CLUSTER_MEM_NOMINATE:
+		if (node_state != MASTER)
+			result = do_process_nominate(msg, buf, len);
+		break;
+
+	default:
+		printk(KERN_ERR CMAN_NAME
+		       ": Unknown membership services message %d received from node %d port %d\n",
+		       *buf, saddr->scl_nodeid, saddr->scl_port);
+		break;
+
+	}
+	return result;
+}
+
+/* Returns -ve to reject membership of the cluster, 0 to accept membership, or
+ * +ve to ignore the request (node already joining). 'len' is not used here. */
+static int check_duplicate_node(char *name, struct msghdr *msg, int len)
+{
+	struct cluster_node *node;
+	struct sockaddr_cl *saddr = (struct sockaddr_cl *)msg->msg_name;
+	char addr[address_length];
+	int addrlen;
+	/* Over-long names are rejected with -3 before any lookup is attempted */
+	if (strlen(name) >= MAX_CLUSTER_MEMBER_NAME_LEN)
+		return -3;
+
+	/* See if we already have a cluster member with that name... */
+	node = find_node_by_name(name);
+	if (node && node->state != NODESTATE_DEAD) {
+		/* A node of that name that is still JOINING: ignore the repeat request */
+		if (node->state == NODESTATE_JOINING)
+			return +1;
+
+		printk(KERN_WARNING CMAN_NAME
+		       ": Rejecting cluster membership application from %s - already have a node with that name\n",
+		       name);
+		return -1;
+
+	}
+
+	/* Need to check the node's address too */
+	if (get_addr_from_temp_nodeid(saddr->scl_nodeid, addr, &addrlen) &&
+	    (node = find_node_by_addr(addr, addrlen)) &&
+	    node->state != NODESTATE_DEAD) {
+
+		if (node->state == NODESTATE_JOINING)
+			return +1;
+
+		printk(KERN_WARNING CMAN_NAME
+		       ": Rejecting cluster membership application from %s - already have a node with that address\n",
+		       name);
+		return -1;
+	}
+	return 0;
+}
+
+/* Start (or restart) a state transition with this node as master. Always returns 0 */
+static int start_transition(unsigned char reason, struct cluster_node *node)
+{
+	char *startbuf = scratchbuf;
+	struct cl_mem_starttrans_msg *msg =
+	    (struct cl_mem_starttrans_msg *) startbuf;
+	/* 'node' is the node the transition is about; its details go into STARTTRANS */
+	P_MEMB("Start transition - reason = %d(last reason = %d)\n", reason, transitionreason);
+
+	/* If this is a restart then zero the counters */
+	if (reason == TRANS_RESTART || reason == TRANS_NEWMASTER) {
+		agreeing_nodes = 0;
+		dissenting_nodes = 0;
+		if (node_opinion) {
+			kfree(node_opinion);
+			node_opinion = NULL;
+		}
+		responses_collected = 0;
+
+		/* Make sure we restart with the right new node if applicable. */
+		if (transitionreason == TRANS_NEWNODE && joining_node)
+			node = joining_node;
+
+		/* If we are a new master then try to restart the transition proper */
+		if (reason == TRANS_NEWMASTER) {
+			reason = transitionreason;
+			if (reason == TRANS_NEWNODE) {
+				if (joining_node)
+					node = joining_node;
+				else
+					reason = TRANS_NEWMASTER;
+			}
+		}
+	}
+
+	/* If we have timed out too many times then just die */
+	if (reason == TRANS_RESTART
+	    && ++transition_restarts > cman_config.transition_restarts) {
+		printk(KERN_WARNING CMAN_NAME
+		       ": too many transition restarts - will die\n");
+		us->leave_reason = CLUSTER_LEAVEFLAG_INCONSISTENT;
+		node_state = LEFT_CLUSTER;
+		quit_threads = 1;
+		wake_up_process(membership_task);
+		wake_up_interruptible(&cnxman_waitq);
+		return 0;
+	}
+	if (reason != TRANS_RESTART)
+		transition_restarts = 0;
+
+	/* Only keep the original state transition reason in the global
+	 * variable. */
+	if (reason != TRANS_ANOTHERREMNODE && reason != TRANS_NEWMASTER &&
+	    reason != TRANS_RESTART && reason != TRANS_DEADMASTER)
+		transitionreason = reason;
+
+	if (reason == TRANS_DEADMASTER)
+		transitionreason = TRANS_REMNODE;
+
+	/* Save the info of the requesting node */
+	if (reason == TRANS_NEWNODE)
+		joining_node = node;
+	/* Become master and expect one STARTACK from each of the other members */
+	node_state = MASTER;
+	master_state = MASTER_START;
+	responses_collected = 0;
+	responses_expected = cluster_members - 1;
+
+	/* If we are on our own then just do it */
+	if (responses_expected == 0) {
+		P_MEMB("We are on our own...lonely here\n");
+		responses_collected--;	/* -1, so the callee's pre-increment lands on 0 == expected */
+		do_process_startack(NULL, NULL, 0);
+	}
+	else {
+		int ptr = sizeof (struct cl_mem_starttrans_msg);
+		struct list_head *addrlist;
+		unsigned short num_addrs = 0;
+		int flags = MSG_REPLYEXP;
+
+		/* Send the STARTTRANS message */
+		msg->cmd = CLUSTER_MEM_STARTTRANS;
+		msg->reason = reason;
+		msg->votes = node->votes;
+		msg->expected_votes = cpu_to_le32(node->expected_votes);
+		msg->generation = cpu_to_le32(++cluster_generation);
+		msg->nodeid = cpu_to_le32(node->node_id);
+		msg->flags = node->leave_reason;
+
+		if (reason == TRANS_NEWNODE) {
+			/* Add the addresses */
+			list_for_each(addrlist, &node->addr_list) {
+				struct cluster_node_addr *nodeaddr =
+				    list_entry(addrlist,
+					       struct cluster_node_addr, list);
+
+				memcpy(startbuf + ptr, nodeaddr->addr,
+				       address_length);
+				ptr += address_length;
+				num_addrs++;
+			}
+
+			/* And the name */
+			strcpy(startbuf + ptr, node->name);
+			ptr += strlen(node->name) + 1;
+		}
+
+		/* If another node died then we must queue the STARTTRANS
+		 * messages so that membershipd can carry on processing the
+		 * other replies */
+		if (reason == TRANS_ANOTHERREMNODE)
+			flags |= MSG_QUEUE;
+
+		msg->num_addrs = cpu_to_le16(num_addrs);
+		kcl_sendmsg(mem_socket, msg, ptr, NULL, 0, flags);	/* NOTE(review): send result is ignored */
+
+		/* Set a timer in case we don't get 'em all back */
+		mod_timer(&transition_timer,
+			  jiffies + cman_config.transition_timeout * HZ);
+	}
+	return 0;
+}
+
+/* A node has died - decide what to do, based on the caller's thread and on node_state */
+void a_node_just_died(struct cluster_node *node)
+{
+	/* If we are not in the context of kmembershipd then stick it on the
+	 * list and wake it */
+	if (current != membership_task) {
+		struct cl_new_dead_node *newnode =
+		    kmalloc(sizeof (struct cl_new_dead_node), GFP_KERNEL);
+		if (!newnode)
+			return;	/* NOTE(review): the node death is dropped if allocation fails */
+		newnode->node = node;
+		spin_lock(&new_dead_node_lock);
+		list_add_tail(&newnode->list, &new_dead_node_list);
+		set_bit(WAKE_FLAG_DEADNODE, &wake_flags);
+		spin_unlock(&new_dead_node_lock);
+		wake_up_process(membership_task);
+		P_MEMB("Passing dead node %s onto kmembershipd\n", node->name);
+		return;
+	}
+	/* From here on we are running as kmembershipd itself */
+	printk(KERN_WARNING CMAN_NAME ": removing node %s from the cluster : %s\n",
+	       node->name, leave_string(node->leave_reason));
+
+	/* Remove it */
+	down(&cluster_members_lock);
+	if (node->state == NODESTATE_MEMBER)
+		cluster_members--;
+	node->state = NODESTATE_DEAD;
+	node->last_seq_recv = 0;
+	up(&cluster_members_lock);
+
+	send_nodedown(node->node_id, node->leave_reason);
+
+	/* Notify listeners */
+	notify_kernel_listeners(DIED, (long) node->node_id);
+
+	/* If we are in normal operation then become master and initiate a
+	 * state-transition */
+	if (node_state == MEMBER) {
+		start_transition(TRANS_REMNODE, node);
+		return;
+	}
+
+	/* If we are a slave in transition then see if it's the master that has
+	 * failed. If not then ignore it. If it /is/ the master then elect a
+	 * new one */
+	if (node_state == TRANSITION) {
+		if (master_node == node) {
+			if (elect_master(&node, 0)) {
+				del_timer(&transition_timer);
+				node_state = MASTER;
+
+				master_node->leave_reason = CLUSTER_LEAVEFLAG_NORESPONSE;
+				start_transition(TRANS_DEADMASTER, master_node);
+			}
+			else {
+				/* Someone else can be in charge - phew! */
+			}
+		}
+		return;
+	}
+
+	/* If we are the master then we need to start the transition all over
+	 * again */
+	if (node_state == MASTER) {
+		/* Cancel timer */
+		del_timer(&transition_timer);
+
+		/* Restart the transition */
+		start_transition(TRANS_ANOTHERREMNODE, node);
+		transition_restarts = 0;
+		return;
+	}
+}
+
+/*
+ * Build up and send a set of messages consisting of the whole cluster view.
+ * The first byte is the command (cmd as passed in), the second is a flag byte:
+ * bit 0 is set in the first message, bit 1 in the last (NOTE both may be set if
+ * this is the only message sent). The rest is a set of packed node entries,
+ * which are NOT split over packets. */
+static int send_cluster_view(unsigned char cmd, struct sockaddr_cl *saddr,
+			     unsigned int flags, unsigned int flags2)
+{
+	int ptr = 2;
+	int len;
+	int status = 0;
+	int last_node_start = 2;
+	unsigned char first_packet_flag = 1;
+	struct list_head *nodelist;
+	struct list_head *temp;
+	struct cluster_node *node;
+	char *message = scratchbuf;
+
+	message[0] = cmd;
+	P_MEMB("send_cluster_view, msg=%d\n", cmd);
+
+	down(&cluster_members_lock);
+	list_for_each_safe(nodelist, temp, &cluster_members_list) {
+		node = list_entry(nodelist, struct cluster_node, list);
+
+		P_MEMB("Node %s (%d), state = %d\n", node->name, node->node_id, node->state);
+		/* Only MEMBER and DEAD nodes are included in the view */
+		if (node->state == NODESTATE_MEMBER || node->state == NODESTATE_DEAD) {
+			unsigned int evotes;
+			unsigned int node_id;
+			unsigned short num_addrs = 0;
+			unsigned short num_addrs_le;
+			struct list_head *addrlist;
+
+			last_node_start = ptr;
+			/* Entry: name length, name, state, address count, addresses, votes, expected votes, node ID */
+			message[ptr++] = len = strlen(node->name);
+			strcpy(&message[ptr], node->name);
+			ptr += len;
+
+			message[ptr++] = node->state;
+
+			/* Count the number of addresses this node has */
+			list_for_each(addrlist, &node->addr_list) {
+				num_addrs++;
+			}
+
+			num_addrs_le = cpu_to_le16(num_addrs);
+			memcpy(&message[ptr], &num_addrs_le, sizeof (short));
+			ptr += sizeof (short);
+
+			/* Pack em in */
+			list_for_each(addrlist, &node->addr_list) {
+
+				struct cluster_node_addr *nodeaddr =
+					list_entry(addrlist,
+						   struct cluster_node_addr, list);
+
+				memcpy(&message[ptr], nodeaddr->addr,
+				       address_length);
+				ptr += address_length;
+			}
+
+			message[ptr++] = node->votes;
+
+			evotes = cpu_to_le32(node->expected_votes);
+			memcpy(&message[ptr], &evotes, sizeof (int));
+			ptr += sizeof (int);
+
+			node_id = cpu_to_le32(node->node_id);
+			memcpy(&message[ptr], &node_id, sizeof (int));
+			ptr += sizeof (int);
+
+			/* If this entry took us past MAX_CLUSTER_MESSAGE, send everything before it */
+			if (ptr > MAX_CLUSTER_MESSAGE) {
+				message[1] = first_packet_flag;
+
+				up(&cluster_members_lock);	/* the lock is dropped across the send */
+				status = kcl_sendmsg(mem_socket, message,
+						     last_node_start, saddr,
+						     saddr ? sizeof (struct sockaddr_cl) : 0,
+						     flags);
+
+				if (status < 0)
+					goto send_fail;
+
+				down(&cluster_members_lock);
+
+				first_packet_flag = 0;
+				/* Copy the overflow back to the start of the
+				 * buffer for the next send */
+				memcpy(&message[2], &message[last_node_start],
+				       ptr - last_node_start);
+				ptr = ptr - last_node_start + 2;
+			}
+		}
+	}
+
+	up(&cluster_members_lock);
+
+	message[1] = first_packet_flag | 2;	/* The last may also be first */
+	status = kcl_sendmsg(mem_socket, message, ptr,
+			     saddr, saddr ? sizeof (struct sockaddr_cl) : 0,
+			     flags | flags2);
+      send_fail:
+
+	return status;
+}
+
+/* Promote the pending joiner (if there is one and it is still JOINING) to MEMBER */
+static void confirm_joiner()
+{
+	if (!joining_node || joining_node->state != NODESTATE_JOINING)
+		return;
+	down(&cluster_members_lock);
+	cluster_members++;
+	joining_node->state = NODESTATE_MEMBER;
+	up(&cluster_members_lock);
+}
+
+/* Reset the HELLO timestamp of every member node. We do this after a
+ * state-transition because HELLOs are disabled while it runs; without the reset
+ * the nodes would go on an uncontrolled culling-spree afterwards */
+static void reset_hello_time()
+{
+	struct list_head *pos;
+
+	down(&cluster_members_lock);
+	list_for_each(pos, &cluster_members_list) {
+		struct cluster_node *member =
+		    list_entry(pos, struct cluster_node, list);
+
+		/* Nodes in any other state keep their old timestamp */
+		if (member->state != NODESTATE_MEMBER)
+			continue;
+		member->last_hello = jiffies;
+	}
+	up(&cluster_members_lock);
+}
+
+/* Calculate the new quorum and return the value. Do *not* set it in here as
+ * cnxman calls this to check if a new expected_votes value is valid. If
+ * ret_total_votes is non-NULL it receives the total votes in the cluster */
+int calculate_quorum(int allow_decrease, int max_expected, int *ret_total_votes)
+{
+	struct list_head *nodelist;
+	struct cluster_node *node;
+	unsigned int total_votes = 0;
+	unsigned int highest_expected = 0;
+	unsigned int newquorum, q1, q2;
+	/* Sum the members' votes and find the largest expected_votes any member holds */
+	down(&cluster_members_lock);
+	list_for_each(nodelist, &cluster_members_list) {
+		node = list_entry(nodelist, struct cluster_node, list);
+
+		if (node->state == NODESTATE_MEMBER) {
+			highest_expected =
+			    max(highest_expected, node->expected_votes);
+			total_votes += node->votes;
+		}
+	}
+	up(&cluster_members_lock);
+	if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
+		total_votes += quorum_device->votes;
+	/* A positive max_expected overrides the value collected from the members */
+	if (max_expected > 0)
+		highest_expected = max_expected;
+
+	/* This quorum calculation is taken from the OpenVMS Cluster Systems
+	 * manual, but, then, you guessed that didn't you */
+	q1 = (highest_expected + 2) / 2;
+	q2 = (total_votes + 2) / 2;
+	newquorum = max(q1, q2);
+
+	/* Normally quorum never decreases but the system administrator can
+	 * force it down by setting expected votes to a maximum value */
+	if (!allow_decrease)
+		newquorum = max(quorum, newquorum);
+
+	/* The special two_node mode allows each of the two nodes to retain
+	 * quorum if the other fails.  Only one of the two should live past
+	 * fencing (as both nodes try to fence each other in split-brain.) */
+	if (two_node)
+		newquorum = 1;
+
+	if (ret_total_votes)
+		*ret_total_votes = total_votes;
+	return newquorum;
+}
+
+/* Recalculate cluster quorum, set quorate and notify changes */
+void recalculate_quorum(int allow_decrease)
+{
+	int total_votes;
+	/* max_expected is 0, so the members' own expected_votes values are used */
+	quorum = calculate_quorum(allow_decrease, 0, &total_votes);
+	set_quorate(total_votes);
+	notify_listeners();
+}
+
+/* Append a copy of 'addr' ('len' bytes) to the node's address list.
+ * Returns 0 on success, -1 if the list entry cannot be allocated. */
+int add_node_address(struct cluster_node *node, unsigned char *addr, int len)
+{
+	struct cluster_node_addr *entry = kmalloc(sizeof (*entry), GFP_KERNEL);
+
+	if (entry == NULL)
+		return -1;
+
+	/* NOTE(review): len is not checked against the size of entry->addr */
+	entry->addr_len = len;
+	memcpy(entry->addr, addr, len);
+	list_add_tail(&entry->list, &node->addr_list);
+	return 0;
+}
+
+static struct cluster_node *add_new_node(char *name, unsigned char votes,
+					 unsigned int expected_votes,
+					 int node_id, int state)
+{
+	struct cluster_node *newnode;
+	/* Create, or revive from DEAD, the entry for 'name'; NULL if it is already JOINING */
+	/* Look for a dead node with this name */
+	newnode = find_node_by_name(name);
+
+	/* Is it already joining */
+	if (newnode && newnode->state == NODESTATE_JOINING)
+		return NULL;
+
+	/* Update existing information */
+	if (newnode && newnode->state == NODESTATE_DEAD) {
+		newnode->last_hello = jiffies;
+		newnode->votes = votes;
+		newnode->expected_votes = expected_votes;
+		newnode->state = state;
+		newnode->us = 0;
+		newnode->leave_reason = 0;
+		newnode->last_seq_recv = 0;
+		newnode->last_seq_acked = 0;
+		newnode->last_seq_sent = 0;
+		newnode->incarnation++;
+		do_gettimeofday(&newnode->join_time);
+		/* Don't overwrite the node ID */
+
+		if (state == NODESTATE_MEMBER) {
+			down(&cluster_members_lock);
+			cluster_members++;
+			up(&cluster_members_lock);
+		}
+
+		printk(KERN_INFO CMAN_NAME ": node %s rejoining\n", name);
+		return newnode;
+	}
+	/* Otherwise (no entry, or one that is neither JOINING nor DEAD) allocate a new node */
+	newnode = kmalloc(sizeof (struct cluster_node), GFP_KERNEL);
+	if (!newnode)
+		goto alloc_err;
+
+	memset(newnode, 0, sizeof (struct cluster_node));
+	newnode->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
+	if (!newnode->name)
+		goto alloc_err1;
+
+	strcpy(newnode->name, name);
+	newnode->last_hello = jiffies;
+	newnode->votes = votes;
+	newnode->expected_votes = expected_votes;
+	newnode->state = state;
+	newnode->node_id = node_id;
+	newnode->us = 0;
+	newnode->leave_reason = 0;
+	newnode->last_seq_recv = 0;
+	newnode->last_seq_acked = 0;
+	newnode->last_seq_sent = 0;
+	newnode->incarnation = 0;
+	do_gettimeofday(&newnode->join_time);
+	INIT_LIST_HEAD(&newnode->addr_list);
+	set_nodeid(newnode, node_id);
+
+	/* Add the new node to the list */
+	down(&cluster_members_lock);
+	list_add(&newnode->list, &cluster_members_list);
+	if (state == NODESTATE_MEMBER)
+		cluster_members++;
+	up(&cluster_members_lock);
+
+	if (state == NODESTATE_MEMBER)
+		printk(KERN_INFO CMAN_NAME ": got node %s\n", name);
+
+	return newnode;
+
+      alloc_err1:
+	kfree(newnode);
+      alloc_err:
+	send_leave(CLUSTER_LEAVEFLAG_PANIC);
+	/* Allocation failure is treated as fatal: leave with the PANIC flag, log, then panic() */
+	printk(KERN_CRIT CMAN_NAME
+	       ": Cannot allocate memory for new cluster node %s\n", name);
+
+	panic("cluster memory allocation failed");
+
+	return NULL;
+}
+
+/* Remove node from a NODEDOWN message; returns the node found (or NULL) */
+static struct cluster_node *remove_node(int nodeid, unsigned char reason)
+{
+	struct cluster_node *node;
+
+	/* It may be a failed joiner */
+	if (joining_node && joining_node->node_id == nodeid) {
+		remove_joiner(0);
+	}
+
+	node = find_node_by_nodeid(nodeid);
+	if (node && node->state != NODESTATE_DEAD) {
+		printk(KERN_INFO CMAN_NAME ": node %s has been removed from the cluster : %s\n",
+		       node->name, leave_string(reason));
+		down(&cluster_members_lock);
+		/* Only full members are ever counted in cluster_members */
+		if (node->state == NODESTATE_MEMBER)
+			cluster_members--;
+		node->state = NODESTATE_DEAD;
+		up(&cluster_members_lock);
+		node->leave_reason = reason;
+		notify_kernel_listeners(DIED, (long) nodeid);
+
+		/* If this node is us then go quietly */
+		if (node->us) {
+			printk(KERN_INFO CMAN_NAME ": killed by NODEDOWN message\n");
+			node_state = LEFT_CLUSTER;
+			quit_threads = 1;
+			wake_up_process(membership_task);
+			wake_up_interruptible(&cnxman_waitq);
+		}
+	}
+	return node;
+}
+
+/* Add a node from a STARTTRANS or NOMINATE message */
+static void add_node_from_starttrans(struct msghdr *msg, char *buf, int len)
+{
+	/* Add the new node but don't fill in the ID until the master has
+	 * confirmed it */
+	struct cl_mem_starttrans_msg *startmsg =
+	    (struct cl_mem_starttrans_msg *)buf;
+	int ptr = sizeof (struct cl_mem_starttrans_msg);
+	int i;
+	char *name = buf + ptr + le16_to_cpu(startmsg->num_addrs) * address_length;
+	char *nodeaddr = buf + sizeof(struct cl_mem_starttrans_msg);
+	/* NOTE(review): num_addrs comes from the message and is not checked against len */
+	/* Remove any old joiner */
+	remove_joiner(0);
+
+	joining_node = add_new_node(name, startmsg->votes,
+				    le32_to_cpu(startmsg->expected_votes),
+				    le32_to_cpu(startmsg->nodeid), NODESTATE_JOINING);
+
+	/* add_new_node returns NULL if a node of that name is already JOINING */
+	if (!joining_node)
+		joining_node = find_node_by_name(name);
+
+	/* Add the node's addresses, but only if it has none recorded yet */
+	if (list_empty(&joining_node->addr_list)) {
+		for (i = 0; i < le16_to_cpu(startmsg->num_addrs); i++) {
+			add_node_address(joining_node, buf + ptr, address_length);
+			ptr += address_length;
+		}
+	}
+
+	/* Make sure we have a temp nodeid for the new node in case we
+	   become master */
+	joining_temp_nodeid = new_temp_nodeid(nodeaddr,
+					      address_length);
+}
+
+/* We have been nominated as master for a transition. The payload is read
+ * through the STARTTRANS message layout. Always returns 0. */
+static int do_process_nominate(struct msghdr *msg, char *buf, int len)
+{
+	struct cl_mem_starttrans_msg *nominate = (struct cl_mem_starttrans_msg *)buf;
+	struct cluster_node *subject = NULL;
+
+	P_MEMB("nominate reason is %d\n", nominate->reason);
+	remove_joiner(1);
+
+	/* For a new node, register it first so the transition can carry its details */
+	if (nominate->reason == TRANS_NEWNODE) {
+		add_node_from_starttrans(msg, buf, len);
+		subject = joining_node;
+	}
+
+	/* Start_transition needs some node info: fall back to ourselves */
+	start_transition(nominate->reason, subject ? subject : us);
+
+	return 0;
+}
+
+/* Got a STARTACK response from a node (or a direct call with buf == NULL). Returns 0 */
+static int do_process_startack(struct msghdr *msg, char *buf, int len)
+{
+	/* Only count STARTACKs while we are master AND still in the MASTER_START substate */
+	if (node_state != MASTER || master_state != MASTER_START) {
+		P_MEMB("Got StartACK when not in MASTER_STARTING substate\n");
+		return 0;
+	}
+	/* buf is NULL if we are called directly from start_transition */
+	if (buf) {
+		struct cl_mem_startack_msg *ackmsg =
+			(struct cl_mem_startack_msg *)buf;
+
+		/* Ignore any messages with old generation numbers in them */
+		if (le32_to_cpu(ackmsg->generation) != cluster_generation) {
+			P_MEMB("Got old generation START-ACK msg - ignoring\n");
+			return 0;
+		}
+	}
+
+	/* If we have all the responses in then move to the next stage */
+	if (++responses_collected == responses_expected) {
+
+		/* Behave a little differently if we are on our own */
+		if (cluster_members == 1) {
+			if (transitionreason == TRANS_NEWNODE) {
+				/* If the cluster is just us then confirm at
+				 * once */
+				joinconf_count = 0;
+				mod_timer(&transition_timer,
+					  jiffies +
+					  cman_config.joinconf_timeout * HZ);
+				if (send_joinconf() < 0)
+					end_transition();
+				return 0;
+			}
+			else {	/* Node leaving the cluster */
+				int total_votes;
+				quorum = calculate_quorum(leavereason, leavereason?cluster_members:0, &total_votes);
+				set_quorate(total_votes);
+				leavereason = 0;
+				joining_temp_nodeid = 0;
+				node_state = MEMBER;
+				notify_listeners();
+				sm_member_update(cluster_is_quorate);
+			}
+		}
+		else {
+			master_state = MASTER_COLLECT;
+			responses_collected = 0;
+			responses_expected = cluster_members - 1;
+			P_MEMB("Sending MASTERVIEW: expecting %d responses\n",
+			       responses_expected);
+
+			send_cluster_view(CLUSTER_MEM_MASTERVIEW, NULL, 0, MSG_REPLYEXP);
+
+			/* Set a timer in case we don't get 'em all back */
+			mod_timer(&transition_timer,
+				  jiffies +
+				  cman_config.transition_timeout * HZ);
+		}
+	}
+	return 0;
+}
+
+/* Got a VIEWACK response from a node.
+ *
+ * reply[1] is 1 when the sender agrees with the view we sent out and any
+ * other value when it disagrees. Each opinion is stored in node_opinion,
+ * indexed by the sender's node ID, until responses_expected replies have
+ * arrived. Then:
+ *  - if more nodes agree than dissent, every dissenting node is flagged
+ *    CLUSTER_LEAVEFLAG_INCONSISTENT and sent a kill, and the transition is
+ *    completed (or the joining node is sent its JOINCONF);
+ *  - otherwise we are the odd one out and leave the cluster ourselves.
+ *
+ * Returns 0 normally, -1 if we have left the cluster. */
+static int do_process_viewack(struct msghdr *msg, char *reply, int len)
+{
+	struct sockaddr_cl *saddr = msg->msg_name;
+	/* Number of entries in node_opinion: one per node ID plus some slack.
+	 * NOTE(review): this relies on highest_nodeid not changing while
+	 * opinions are being collected - confirm against the other writers
+	 * of highest_nodeid. */
+	int opinion_slots = 10 + highest_nodeid;
+
+	/* This has been known to happen, but I'm not sure why */
+	if (saddr->scl_nodeid < 1)
+		return 0;
+
+	/* The opinion lives in the second byte of the message */
+	if (len < 2)
+		return 0;
+
+	/* node_opinion is indexed by node ID; never write outside it */
+	if (saddr->scl_nodeid >= opinion_slots) {
+		P_MEMB("VIEWACK from out-of-range node %d - ignoring\n",
+		       saddr->scl_nodeid);
+		return 0;
+	}
+
+	if (node_opinion == NULL) {
+		node_opinion =
+		    kmalloc(opinion_slots * sizeof (uint8_t), GFP_KERNEL);
+		if (!node_opinion) {
+			panic(": malloc agree/dissent failed\n");
+		}
+		/* Zero the whole array: the kill loop below scans all of it */
+		memset(node_opinion, 0, opinion_slots * sizeof (uint8_t));
+	}
+
+	/* Keep a list of agreeing and dissenting nodes */
+	if (reply[1] == 1) {
+		/* ACK - remote node agrees with me */
+		P_MEMB("Node agrees\n");
+		node_opinion[saddr->scl_nodeid] = OPINION_AGREE;
+		agreeing_nodes++;
+	}
+	else {
+		/* Remote node disagrees */
+		P_MEMB("Node disagrees\n");
+		node_opinion[saddr->scl_nodeid] = OPINION_DISAGREE;
+		dissenting_nodes++;
+	}
+
+	P_MEMB("got %d responses, expected %d\n", responses_collected + 1,
+	       responses_expected);
+
+	/* Are all the results in yet ? */
+	if (++responses_collected == responses_expected) {
+		del_timer(&transition_timer);
+
+		P_MEMB("The results are in: %d agree, %d dissent\n",
+		       agreeing_nodes, dissenting_nodes);
+
+		if (agreeing_nodes > dissenting_nodes) {
+			/* Kill dissenting nodes. The array is indexed by
+			 * node ID, so scan every slot rather than just the
+			 * first responses_collected of them, and act on the
+			 * node that owns the slot rather than on whoever
+			 * sent the final VIEWACK. */
+			int i;
+
+			for (i = 1; i < opinion_slots; i++) {
+				if (node_opinion[i] == OPINION_DISAGREE) {
+					struct cluster_node *node;
+					node = find_node_by_nodeid(i);
+					if (node)
+						node->leave_reason = CLUSTER_LEAVEFLAG_INCONSISTENT;
+					send_kill(i, 1);
+				}
+			}
+		}
+		else {
+			/* We must leave the cluster as we are in a minority,
+			 * the rest of them can fight it out amongst
+			 * themselves. */
+			us->leave_reason = CLUSTER_LEAVEFLAG_INCONSISTENT;
+			agreeing_nodes = 0;
+			dissenting_nodes = 0;
+			kfree(node_opinion);
+			node_opinion = NULL;
+			node_state = LEFT_CLUSTER;
+			quit_threads = 1;
+			wake_up_process(membership_task);
+			wake_up_interruptible(&cnxman_waitq);
+			return -1;
+		}
+
+		/* Reset counters */
+		agreeing_nodes = 0;
+		dissenting_nodes = 0;
+		kfree(node_opinion);
+		node_opinion = NULL;
+
+		/* Confirm new node */
+		if (transitionreason == TRANS_NEWNODE) {
+			mod_timer(&transition_timer,
+				  jiffies + cman_config.joinconf_timeout * HZ);
+			joinconf_count = 0;
+			if (send_joinconf() >= 0)
+				return 0;
+			/* if send_joinconf failed then complete the transition here and now */
+		}
+
+		master_state = MASTER_COMPLETE;
+
+		end_transition();
+	}
+
+	return 0;
+}
+
+/* Forget about the node that is currently joining, if there is one.
+ *
+ * A brand-new node (incarnation 0) is unlinked from the members list and
+ * freed, otherwise we would end up knowing about a node that no-one else
+ * has and transitions get a bit fragile. A node that has been in the
+ * cluster before is just marked DEAD.
+ *
+ * If tell_wait is non-zero the joining node is first told to cancel its
+ * join and try again later.
+ */
+static void remove_joiner(int tell_wait)
+{
+	struct cluster_node *joiner;
+
+	if (!joining_node)
+		return;
+
+	if (tell_wait) {
+		struct sockaddr_cl dest;
+
+		/* The joiner is addressed by its temporary node ID */
+		dest.scl_family = AF_CLUSTER;
+		dest.scl_port = CLUSTER_PORT_MEMBERSHIP;
+		dest.scl_nodeid = joining_temp_nodeid;
+
+		P_MEMB("Postponing membership of node %s (incarnation=%d)\n",
+		       joining_node->name, joining_node->incarnation);
+		send_joinack((char *)&dest, sizeof(dest),
+			     JOINACK_TYPE_WAIT);
+	}
+
+	joiner = joining_node;
+	if (joiner->incarnation != 0) {
+		/* Seen before: keep the entry but mark it dead */
+		joiner->state = NODESTATE_DEAD;
+	}
+	else {
+		P_MEMB("Removing joining node %s\n", joiner->name);
+
+		down(&cluster_members_lock);
+		if (joiner->state == NODESTATE_MEMBER)
+			cluster_members--;
+		list_del(&joiner->list);
+		up(&cluster_members_lock);
+
+		/* Drop the by-ID lookup entry before the node goes away */
+		if (joiner->node_id)
+			members_by_nodeid[joiner->node_id] = NULL;
+		kfree(joiner);
+	}
+
+	joining_node = NULL;
+	joining_temp_nodeid = 0;
+}
+
+/* Got an ENDTRANS message.
+ *
+ * Only acted upon while node_state is TRANSITION or JOINACK, and only when
+ * the sender is the recorded master_node (if one is recorded). Adopts the
+ * generation, quorum and vote total carried in the message, waits on the
+ * completion barrier and then returns this node to MEMBER state.
+ *
+ * msg->msg_name is read as the sender's struct sockaddr_cl and buf as a
+ * struct cl_mem_endtrans_msg; len is not examined here.
+ * Always returns 0. */
+static int do_process_endtrans(struct msghdr *msg, char *buf, int len)
+{
+	struct cl_mem_endtrans_msg *endmsg =
+		(struct cl_mem_endtrans_msg *)buf;
+	struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
+
+	/* Someone else's state transition */
+	if (node_state != TRANSITION && node_state != JOINACK)
+		return 0;
+
+	/* Check we got it from the MASTER node */
+	if (master_node && master_node->node_id != saddr->scl_nodeid) {
+		printk(KERN_INFO
+		       "Got ENDTRANS from a node not the master: master: %d, sender: %d\n",
+		       master_node->node_id, saddr->scl_nodeid);
+		return 0;
+	}
+
+	/* The master answered, so the "master died" timeout is not needed */
+	del_timer(&transition_timer);
+
+	/* Set our new node id (only if we do not have one yet) */
+	if (endmsg->new_node_id && us->node_id == 0) {
+		set_nodeid(us, le32_to_cpu(endmsg->new_node_id));
+		P_MEMB("our new node ID is %d\n", us->node_id);
+	}
+
+	node_state = TRANSITION_COMPLETE;
+
+	/* A non-zero new_node_id confirms the joiner; zero means the joiner
+	 * (if any) is discarded */
+	if (endmsg->new_node_id)
+		confirm_joiner();
+	else
+		remove_joiner(0);
+
+	cluster_generation = le32_to_cpu(endmsg->generation);
+
+	/* If the barrier fails, go back to TRANSITION and re-arm the timer */
+	if (wait_for_completion_barrier() != 0) {
+		P_MEMB("Barrier timed out - restart client(ie do nowt)\n");
+		node_state = TRANSITION;
+		mod_timer(&transition_timer,
+			  jiffies + cman_config.transition_timeout * HZ);
+		return 0;
+	}
+
+	/* Take the quorum figures from the message */
+	quorum = le32_to_cpu(endmsg->quorum);
+	set_quorate(le32_to_cpu(endmsg->total_votes));
+	highest_nodeid = get_highest_nodeid();
+
+	/* Tell any waiting barriers that we had a transition */
+	check_barrier_returns();
+
+	purge_temp_nodeids();
+
+	/* Clear up */
+	master_node = NULL;
+	joining_node = NULL;
+	joining_temp_nodeid = 0;
+
+	node_state = MEMBER;
+	transitionreason = TRANS_NONE;
+
+	/* Notify other listeners that transition has completed */
+	notify_listeners();
+	reset_hello_time();
+	/* Recorded so do_process_hello can allow a grace period for HELLOs
+	 * that still carry the previous generation */
+	transition_end_time = jiffies;
+
+	sm_member_update(cluster_is_quorate);
+	return 0;
+}
+
+/* Turn a STARTTRANS message into NOMINATE and send it to the new master.
+ *
+ * startmsg is modified in place (its cmd field is overwritten) and then
+ * msglen bytes of it are sent to node nodeid on the membership port.
+ * Returns whatever kcl_sendmsg returns. */
+static int send_nominate(struct cl_mem_starttrans_msg *startmsg, int msglen,
+			 int nodeid)
+{
+	struct sockaddr_cl dest;
+
+	/* Same payload, different command */
+	startmsg->cmd = CLUSTER_MEM_NOMINATE;
+
+	dest.scl_family = AF_CLUSTER;
+	dest.scl_nodeid = nodeid;
+	dest.scl_port = CLUSTER_PORT_MEMBERSHIP;
+
+	return kcl_sendmsg(mem_socket, startmsg, msglen,
+			   &dest, sizeof (dest), 0);
+}
+
+/* Got a NODEDOWN message: remove the node it names, passing along the
+ * reason given in the message. Always returns 0. */
+static int do_process_nodedown(struct msghdr *msg, char *buf, int len)
+{
+	struct cl_mem_nodedown_msg *down_notice =
+		(struct cl_mem_nodedown_msg *)buf;
+	unsigned int dead_id = le32_to_cpu(down_notice->nodeid);
+
+	remove_node(dead_id, down_notice->reason);
+
+	return 0;
+}
+
+/* Got a STARTTRANS message.
+ *
+ * The message is ignored unless it comes from a node we know as a MEMBER,
+ * we are in MEMBER, TRANSITION or MASTER state, and its generation is not
+ * older than ours. What happens next depends on node_state:
+ *
+ *  MASTER     - we are running a transition of our own, so elect_master()
+ *               decides who wins. If we win we restart as TRANS_NEWMASTER;
+ *               if we lose we drop to TRANSITION and either nominate the
+ *               winner (when it is not the sender) or carry on below.
+ *  MEMBER     - normal case: remember the master, record any joining node,
+ *               send a STARTACK and arm the transition timer.
+ *  TRANSITION - a restart: update the master and joiner, send a STARTACK
+ *               and re-arm the transition timer.
+ *
+ * msg->msg_name is read as the sender's struct sockaddr_cl and buf as a
+ * struct cl_mem_starttrans_msg. Always returns 0. */
+static int do_process_starttrans(struct msghdr *msg, char *buf, int len)
+{
+	struct cl_mem_starttrans_msg *startmsg =
+		(struct cl_mem_starttrans_msg *)buf;
+	struct sockaddr_cl *saddr = (struct sockaddr_cl *) msg->msg_name;
+	struct cluster_node *node;
+	unsigned int newgen = le32_to_cpu(startmsg->generation);
+
+	/* Got a WHAT from WHOM? */
+	node = find_node_by_nodeid(saddr->scl_nodeid);
+	if (!node || node->state != NODESTATE_MEMBER)
+		return 0;
+
+	/* Someone else's state transition */
+	if (node_state != MEMBER &&
+	    node_state != TRANSITION && node_state != MASTER)
+		return 0;
+
+	/* Ignore old generation STARTTRANS messages. 0xFFFFFFFF is also
+	 * treated as old when our own generation is 0 */
+	if ((newgen < cluster_generation) ||
+	    (newgen == 0xFFFFFFFF && cluster_generation == 0)) {
+		P_MEMB("Ignoring STARTTRANS with old generation number\n");
+		return 0;
+	}
+
+	P_MEMB("Got starttrans: newgen = %d, oldgen = %d, reason = %d\n",
+	       newgen, cluster_generation, startmsg->reason);
+
+	/* Up the generation number */
+	cluster_generation = newgen;
+
+	/* If we are also a master then decide between us */
+	if (node_state == MASTER) {
+
+		/* Node ID that must not win the election, 0 for "no one" */
+		int not_master = 0;
+
+		/* If one node is doing a CHECK and another a "real" transition then prevent
+		   the CHECK from being master as it's a waste of time */
+		if (transitionreason != startmsg->reason) {
+			if (transitionreason == TRANS_CHECK)
+				not_master = us->node_id;
+			if (startmsg->reason == TRANS_CHECK)
+				not_master = saddr->scl_nodeid;
+		}
+
+		/* See if we really want the responsibility of being master.
+		 * elect_master() also updates node, which is used below */
+		if (elect_master(&node, not_master)) {
+
+			/* I reluctantly accept this position of responsibility
+			 */
+			P_MEMB("I elected myself master\n");
+
+			/* start_transition will re-establish this */
+			del_timer(&transition_timer);
+
+			start_transition(TRANS_NEWMASTER, node);
+			return 0;
+		}
+		else {
+			/* Back down */
+			P_MEMB("Backing down from MASTER status\n");
+			master_node = node;
+			node_state = TRANSITION;
+
+			/* If we were bringing a new node into the cluster then
+			 * we will have to abandon that now and tell the new
+			 * node to try again later */
+			if (transitionreason == TRANS_NEWNODE && joining_node) {
+				remove_joiner(1);
+			}
+
+			/* If the new master is not us OR the node we just got
+			 * the STARTTRANS from then make sure it knows it has
+			 * to be master */
+			if (saddr->scl_nodeid != node->node_id) {
+				send_nominate(startmsg, len, node->node_id);
+				return 0;
+			}
+
+			/* Fall through into the TRANSITION code below (node_state
+			 * was just set to TRANSITION) if we are obeying the
+			 * STARTTRANS we just received */
+		}
+	}
+
+	/* Do non-MASTER STARTTRANS bits */
+	if (node_state == MEMBER) {
+
+		P_MEMB("Normal transition start\n");
+
+		/* Save the master info */
+		master_node = find_node_by_nodeid(saddr->scl_nodeid);
+		node_state = TRANSITION;
+
+		if (startmsg->reason == TRANS_NEWNODE) {
+			add_node_from_starttrans(msg, buf, len);
+		}
+
+		send_startack(saddr, msg->msg_namelen);
+
+		/* Establish timer in case the master dies */
+		mod_timer(&transition_timer,
+			  jiffies + cman_config.transition_timeout * HZ);
+
+		return 0;
+	}
+
+	/* We are in transition but this may be a restart */
+	if (node_state == TRANSITION) {
+		/* Remember the previous joiner: add_node_from_starttrans
+		 * replaces joining_node */
+		struct cluster_node *oldjoin = joining_node;
+
+		master_node = find_node_by_nodeid(saddr->scl_nodeid);
+
+		/* Is it a new joining node ? This happens if a master is
+		 * usurped */
+		if (startmsg->reason == TRANS_NEWNODE) {
+
+			add_node_from_starttrans(msg, buf, len);
+		}
+
+		/* If this is a different node joining than the one we
+		 * were previously joining (probably cos the master is
+		 * a nominated one) then mark our "old" joiner as DEAD.
+		 * The original master will already have told the node
+		 * to go back into JOINWAIT state */
+		if (oldjoin && oldjoin != joining_node &&
+		    oldjoin->state == NODESTATE_JOINING)
+			oldjoin->state = NODESTATE_DEAD;
+
+		send_startack(saddr, msg->msg_namelen);
+
+		/* Is it a new master node? */
+		if (startmsg->reason == TRANS_NEWMASTER ||
+		    startmsg->reason == TRANS_DEADMASTER) {
+			P_MEMB("starttrans %s, node=%d\n",
+			       startmsg->reason ==
+			       TRANS_NEWMASTER ? "NEWMASTER" : "DEADMASTER",
+			       le32_to_cpu(startmsg->nodeid));
+
+			/* Store new master */
+			master_node = find_node_by_nodeid(saddr->scl_nodeid);
+		}
+
+
+		/* Restart the timer */
+		del_timer(&transition_timer);
+		mod_timer(&transition_timer,
+			  jiffies + cman_config.transition_timeout * HZ);
+	}
+
+	return 0;
+}
+
+
+/* Change a cluster parameter.
+ *
+ * buf is read as a struct cl_mem_reconfig_msg naming the parameter and its
+ * new value:
+ *  RECONFIG_PARAM_EXPECTED_VOTES - lower expected_votes (ours and that of
+ *      every MEMBER node) to the new value where it is currently higher,
+ *      then recalculate quorum;
+ *  RECONFIG_PARAM_NODE_VOTES     - set the votes of the sending node and
+ *      recalculate quorum;
+ *  RECONFIG_PARAM_CONFIG_VERSION - record the new config version.
+ *
+ * Returns -1 if the message is too short, otherwise 0. */
+static int do_process_reconfig(struct msghdr *msg, char *buf, int len)
+{
+	struct cl_mem_reconfig_msg *confmsg;
+	struct sockaddr_cl *saddr = msg->msg_name;
+	struct cluster_node *node;
+	unsigned int val;
+
+	/* Compare as signed so a negative length cannot slip through */
+	if (len < (int)sizeof(struct cl_mem_reconfig_msg))
+		return -1;
+
+	confmsg = (struct cl_mem_reconfig_msg *)buf;
+	val = le32_to_cpu(confmsg->value);
+
+	switch (confmsg->param) {
+
+	case RECONFIG_PARAM_EXPECTED_VOTES:
+		/* Set any nodes with expected_votes higher than the new value
+		 * down */
+		if (val > 0) {
+			down(&cluster_members_lock);
+			list_for_each_entry(node, &cluster_members_list, list) {
+				if (node->state == NODESTATE_MEMBER &&
+				    node->expected_votes > val) {
+					node->expected_votes = val;
+				}
+			}
+			up(&cluster_members_lock);
+			if (expected_votes > val)
+				expected_votes = val;
+		}
+		recalculate_quorum(1);	/* Allow decrease */
+		sm_member_update(cluster_is_quorate);
+		break;
+
+	case RECONFIG_PARAM_NODE_VOTES:
+		node = find_node_by_nodeid(saddr->scl_nodeid);
+		/* The sender may not be a node we know about */
+		if (!node) {
+			printk(KERN_INFO CMAN_NAME
+			       ": got votes change from unknown node %d\n",
+			       saddr->scl_nodeid);
+			break;
+		}
+		node->votes = val;
+		recalculate_quorum(1);	/* Allow decrease */
+		sm_member_update(cluster_is_quorate);
+		break;
+
+	case RECONFIG_PARAM_CONFIG_VERSION:
+		config_version = val;
+		break;
+
+	default:
+		printk(KERN_INFO CMAN_NAME
+		       ": got unknown parameter in reconfigure message. %d\n",
+		       confmsg->param);
+		break;
+	}
+	return 0;
+}
+
+/* Response from master node to our join request.
+ *
+ * Records the time of the reply and moves node_state according to the
+ * acknowledgement type: OK -> JOINACK, NAK -> REJECTED, WAIT -> JOINWAIT.
+ * Any other type only updates join_time. Always returns 0. */
+static int do_process_joinack(struct msghdr *msg, char *buf, int len)
+{
+	struct cl_mem_joinack_msg *reply =
+		(struct cl_mem_joinack_msg *)buf;
+
+	join_time = jiffies;
+
+	switch (reply->acktype) {
+	case JOINACK_TYPE_OK:
+		node_state = JOINACK;
+		break;
+
+	case JOINACK_TYPE_NAK:
+		printk(KERN_WARNING CMAN_NAME
+		       ": Cluster membership rejected\n");
+		P_MEMB("Got JOINACK NACK\n");
+		node_state = REJECTED;
+		break;
+
+	case JOINACK_TYPE_WAIT:
+		P_MEMB("Got JOINACK WAIT\n");
+		node_state = JOINWAIT;
+		/* Remember when we were told to wait */
+		joinwait_time = jiffies;
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/* Check a JOINREQ message for validity.
+ *
+ * joinmsg is the received message (a struct cl_mem_join_msg followed by
+ * num_addr addresses of address_length bytes and then the node name) and
+ * len the number of bytes received.
+ *
+ * Returns 0 if the node may join, -1 if we can't let the node join our
+ * cluster. Every rejection is logged with its reason. */
+static int validate_joinmsg(struct cl_mem_join_msg *joinmsg, int len)
+{
+	struct cluster_node *node;
+	char *ptr = (char *) joinmsg;
+	char *name;
+
+	/* Nothing in the message can be trusted unless the fixed part is
+	 * all there */
+	if (len < (int)sizeof (*joinmsg)) {
+		printk(KERN_WARNING CMAN_NAME
+		       ": JOIN-REQ message too short: %d bytes\n", len);
+		return -1;
+	}
+
+	/* Check version number */
+	if (le32_to_cpu(joinmsg->major_version) != CNXMAN_MAJOR_VERSION) {
+		/* Version number mismatch, don't use any part of the message
+		 * other than the version numbers as things may have moved */
+		printk(KERN_INFO CMAN_NAME
+		       ": Got join message from node running incompatible software. (us: %d.%d.%d, them: %d.%d.%d)\n",
+		       CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
+		       CNXMAN_PATCH_VERSION,
+		       le32_to_cpu(joinmsg->major_version),
+		       le32_to_cpu(joinmsg->minor_version),
+		       le32_to_cpu(joinmsg->patch_version));
+		return -1;
+	}
+
+	/* Sanity-check the num_addrs field otherwise we could oops. The
+	 * addresses follow the fixed header and must leave at least one
+	 * byte for the name that comes after them. */
+	if (le16_to_cpu(joinmsg->num_addr) * address_length >=
+	    len - (int)sizeof (*joinmsg)) {
+		printk(KERN_WARNING CMAN_NAME
+		       ": num_addr in JOIN-REQ message is rubbish: %d\n",
+		       le16_to_cpu(joinmsg->num_addr));
+		return -1;
+	}
+
+	/* Only now is it safe to point into the variable part */
+	ptr += sizeof (*joinmsg);
+	name = ptr + le16_to_cpu(joinmsg->num_addr) * address_length;
+
+	/* Check the cluster name matches */
+	if (strcmp(cluster_name, joinmsg->clustername)) {
+		printk(KERN_WARNING CMAN_NAME
+		       ": attempt to join with cluster name '%s' refused\n",
+		       joinmsg->clustername);
+		return -1;
+	}
+
+	/* Check we are not exceeding the maximum number of nodes */
+	if (cluster_members >= cman_config.max_nodes) {
+		printk(KERN_WARNING CMAN_NAME
+		       ": Join request from %s rejected, exceeds maximum number of nodes\n",
+		       name);
+		return -1;
+	}
+
+	/* Check that we don't exceed the two_node limit, if applicable */
+	if (two_node && cluster_members == 2) {
+		printk(KERN_WARNING CMAN_NAME ": Join request from %s "
+		       "rejected, exceeds two node limit\n", name);
+		return -1;
+	}
+
+	if (le32_to_cpu(joinmsg->config_version) != config_version) {
+		printk(KERN_WARNING CMAN_NAME ": Join request from %s "
+		       "rejected, config version local %u remote %u\n",
+		       name, config_version,
+		       le32_to_cpu(joinmsg->config_version));
+		return -1;
+	}
+
+	/* Validate requested static node ID: it must be free, or belong to
+	 * a dead node of the same name */
+	if (joinmsg->nodeid &&
+	    (node = find_node_by_nodeid(le32_to_cpu(joinmsg->nodeid))) &&
+	    (node->state != NODESTATE_DEAD ||
+	     (strcmp(node->name, name)))) {
+		printk(KERN_WARNING CMAN_NAME ": Join request from %s "
+		       "rejected, node ID %d already in use by %s\n",
+		       name, node->node_id, node->name);
+		return -1;
+	}
+	/* ...and a node we already know by name must be dead and must be
+	 * asking for the ID it had before */
+	if (joinmsg->nodeid &&
+	    (node = find_node_by_name(name)) &&
+	    (node->state != NODESTATE_DEAD ||
+	     node->node_id != le32_to_cpu(joinmsg->nodeid))) {
+		printk(KERN_WARNING CMAN_NAME ": Join request from %s "
+		       "rejected, wanted node %d but previously had %d\n",
+		       name, le32_to_cpu(joinmsg->nodeid), node->node_id);
+		return -1;
+	}
+
+	/* If these don't match then I don't know how the message
+	   arrived! However, I can't take the chance */
+	if (le32_to_cpu(joinmsg->addr_len) != address_length) {
+		printk(KERN_WARNING CMAN_NAME ": Join request from %s "
+		       "rejected, address length local: %u remote %u\n",
+		       name, address_length,
+		       le32_to_cpu(joinmsg->addr_len));
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/* Request to join the cluster. This makes us the master for this state
+ * transition.
+ *
+ * buf holds a struct cl_mem_join_msg followed by the joiner's addresses and
+ * name; msg->msg_name is the address the reply is sent to.
+ *
+ * The request is answered with JOINACK_TYPE_WAIT if we are busy with a
+ * transition, JOINACK_TYPE_NAK if it fails validation, and JOINACK_TYPE_OK
+ * otherwise, in which case the node is added in JOINING state and a
+ * TRANS_NEWNODE transition is started. Repeats of the same request within
+ * ten seconds are silently dropped. Always returns 0. */
+static int do_process_joinreq(struct msghdr *msg, char *buf, int len)
+{
+	static unsigned long last_joinreq = 0;
+	static char last_name[MAX_CLUSTER_MEMBER_NAME_LEN];
+	struct cl_mem_join_msg *joinmsg = (struct cl_mem_join_msg *)buf;
+	struct cluster_node *node;
+	char *ptr = (char *) joinmsg;
+	char *name;
+	int i;
+	struct sockaddr_cl *addr = msg->msg_name;
+
+	ptr += sizeof (*joinmsg);
+	name = ptr + le16_to_cpu(joinmsg->num_addr) * address_length;
+
+	/* If we are in a state transition then tell the new node to wait a bit
+	 * longer */
+	if (node_state != MEMBER) {
+		if (node_state == MASTER || node_state == TRANSITION) {
+			send_joinack(msg->msg_name, msg->msg_namelen,
+				      JOINACK_TYPE_WAIT);
+		}
+		return 0;
+	}
+
+	/* Reject application if message is invalid for any reason */
+	if (validate_joinmsg(joinmsg, len)) {
+		send_joinack(msg->msg_name, msg->msg_namelen,
+			     JOINACK_TYPE_NAK);
+		return 0;
+	}
+
+	/* The name comes straight off the wire and is copied into the
+	 * fixed-size last_name below, so it must fit, terminator included */
+	if (strnlen(name, MAX_CLUSTER_MEMBER_NAME_LEN) >=
+	    MAX_CLUSTER_MEMBER_NAME_LEN) {
+		printk(KERN_WARNING CMAN_NAME
+		       ": Join request rejected, node name is too long\n");
+		send_joinack(msg->msg_name, msg->msg_namelen,
+			     JOINACK_TYPE_NAK);
+		return 0;
+	}
+
+	/* Do we already know about this node? */
+	if (check_duplicate_node(name, msg, len) < 0) {
+		send_joinack(msg->msg_name, msg->msg_namelen,
+			     JOINACK_TYPE_NAK);
+		return 0;
+	}
+
+	/* Duplicate checking: Because joining messages do not have
+	 * sequence numbers we may get as many JOINREQ messages as we
+	 * have interfaces. This bit of code here just checks for
+	 * JOINREQ messages that come in from the same node in a small
+	 * period of time and removes the duplicates */
+	if (time_before(jiffies, last_joinreq + 10 * HZ)
+	    && strcmp(name, last_name) == 0) {
+		return 0;
+	}
+
+	/* OK, you can be in my gang */
+	last_joinreq = jiffies;
+	strcpy(last_name, name);	/* length checked above */
+
+	node = add_new_node(name, joinmsg->votes,
+			    le32_to_cpu(joinmsg->expected_votes),
+			    le32_to_cpu(joinmsg->nodeid),
+			    NODESTATE_JOINING);
+
+	/* add_new_node can return NULL; fall back to an existing entry of
+	 * the same name, as add_node_from_starttrans does */
+	if (!node)
+		node = find_node_by_name(name);
+
+	/* Still no node to work with: ask the joiner to try again later
+	 * rather than dereferencing NULL */
+	if (!node) {
+		send_joinack(msg->msg_name, msg->msg_namelen,
+			     JOINACK_TYPE_WAIT);
+		return 0;
+	}
+
+	/* A genuinely new node, assign it a genuinely new ID */
+	if (node->node_id == 0) {
+		set_nodeid(node, get_highest_nodeid()+1);
+		highest_nodeid = node->node_id;
+	}
+	P_MEMB("New node %s has id %d\n", node->name, node->node_id);
+
+	/* Add the node's addresses */
+	if (list_empty(&node->addr_list)) {
+		for (i = 0; i < le16_to_cpu(joinmsg->num_addr);
+		     i++) {
+			add_node_address(node, ptr, address_length);
+			ptr += address_length;
+		}
+	}
+	send_joinack(msg->msg_name, msg->msg_namelen,
+		     JOINACK_TYPE_OK);
+	joining_node = node;
+	/* The joiner is still known by the node ID it sent from */
+	joining_temp_nodeid = addr->scl_nodeid;
+
+	/* Start the state transition */
+	start_transition(TRANS_NEWNODE, node);
+
+	return 0;
+}
+
+/* A simple function to invent a small number based on the node name:
+ * the characters of nodename are summed and the low four bits of the sum
+ * are used, giving a value from 1 to 16. */
+static int node_hash(void)
+{
+	const char *ch;
+	int sum = 0;
+
+	for (ch = nodename; *ch != '\0'; ch++)
+		sum += *ch;
+
+	return 1 + (sum & 0xF);
+}
+
+
+/* Return the low 32 bits of our IP address.
+ *
+ * The value is taken from the last four bytes of the first address on our
+ * own node's address list. If those are all zero, the four bytes before
+ * them are used instead. Returns 0 if we have no address or the address is
+ * too short to hold 32 bits. */
+static uint32_t low32_of_ip(void)
+{
+	struct cluster_node_addr *addr;
+	uint32_t lowip = 0;
+
+	/* list_entry() on an empty list would hand back the list head
+	 * dressed up as an address */
+	if (list_empty(&us->addr_list) ||
+	    address_length < (int)sizeof(uint32_t))
+		return 0;
+
+	addr = list_entry(us->addr_list.next, struct cluster_node_addr, list);
+	memcpy(&lowip, addr->addr+address_length-sizeof(uint32_t), sizeof(uint32_t));
+
+	/* Fall back to the previous 32 bits, still counting from the END of
+	 * the address so the read stays inside it */
+	if (!lowip && address_length >= (int)(sizeof(uint32_t)*2))
+		memcpy(&lowip, addr->addr+address_length-sizeof(uint32_t)*2,
+		       sizeof(uint32_t));
+
+	return lowip;
+}
+
+/* A new node has stated its intent to form a new cluster. We may have
+ * something to say about that, depending on our own state:
+ *  STARTING   - postpone our own start by node_hash() seconds;
+ *  NEWCLUSTER - compare the address carried in the message with ours and
+ *               drop back to STARTING if theirs is the lower one;
+ *  MEMBER     - announce ourselves with a HELLO.
+ * Always returns 0. */
+static int do_process_newcluster(struct msghdr *msg, char *buf, int len)
+{
+	uint32_t otherip;
+
+	switch (node_state) {
+	case STARTING:
+		/* If we are also in STARTING state then back down for a
+		 * random period of time */
+		P_MEMB("got NEWCLUSTER, backing down for %d seconds\n", node_hash());
+		start_time = jiffies + node_hash() * HZ;
+		break;
+
+	case NEWCLUSTER:
+		/* The sender's address follows the command byte */
+		memcpy(&otherip, buf+1, sizeof(otherip));
+		otherip = le32_to_cpu(otherip);
+		P_MEMB("got NEWCLUSTER, remote ip = %x, us = %x\n", otherip, low32_of_ip());
+		if (otherip < low32_of_ip())
+			node_state = STARTING;
+		break;
+
+	case MEMBER:
+		send_hello();
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/* Called for each node by the node-message unpacker to compare the node
+ * described in the message (newnode) with our own record of it.
+ *
+ * addrs and num_addr are not examined. Returns 0 if we know the node and
+ * its votes, node ID and state agree; returns -1 if there is a mismatch
+ * and the caller will stop processing. */
+static int check_node(struct cluster_node *newnode, char *addrs,
+		      unsigned short num_addr)
+{
+	struct cluster_node *ours = find_node_by_name(newnode->name);
+	int state_ok;
+
+	P_MEMB("check_node: %s", newnode->name);
+
+	if (ours == NULL) {
+		C_MEMB("  - not found\n");
+		return -1;
+	}
+
+	/* Don't fail things if we have a node flagged as JOINING
+	   but the master thinks is DEAD */
+	state_ok = (ours->state == NODESTATE_JOINING ||
+		    ours->state == newnode->state);
+
+	if (ours->votes == newnode->votes &&
+	    ours->node_id == newnode->node_id &&
+	    state_ok) {
+		C_MEMB(" - OK\n");
+		return 0;
+	}
+
+	C_MEMB(" - wrong info: votes=%d(exp: %d) id=%d(exp: %d) state = %d(exp: %d)\n",
+	       ours->votes, newnode->votes, ours->node_id,
+	       newnode->node_id, ours->state, newnode->state);
+	return -1;
+}
+
+/* Called for each new node found in a JOINCONF message. Create a new node
+ * entry from the description in node and give it the num_addr addresses
+ * (address_length bytes each) found at addrs.
+ *
+ * Returns 0 on success, -1 if a node of that name already exists or the
+ * node could not be added. */
+static int add_node(struct cluster_node *node, char *addrs,
+		    unsigned short num_addr)
+{
+	struct cluster_node *created;
+	int n;
+
+	P_MEMB("add_node: %s, v:%d, e:%d, i:%d\n", node->name, node->votes,
+	       node->expected_votes, node->node_id);
+
+	if (find_node_by_name(node->name)) {
+		P_MEMB("Already got node with name %s\n", node->name);
+		return -1;
+	}
+
+	created = add_new_node(node->name, node->votes, node->expected_votes,
+			       node->node_id, node->state);
+	if (created == NULL) {
+		P_MEMB("Error adding node\n");
+		return -1;
+	}
+
+	/* Only fill in addresses if the entry has none yet */
+	if (!list_empty(&created->addr_list))
+		return 0;
+
+	for (n = 0; n < num_addr; n++) {
+		add_node_address(created,
+				 addrs + n * address_length, address_length);
+	}
+	return 0;
+}
+
+/* Call a specified routine for each node unpacked from the message.
+ *
+ * buf/len is a sequence of node records, each laid out as:
+ *   1 byte   name length
+ *   n bytes  name (not NUL-terminated)
+ *   1 byte   state
+ *   2 bytes  number of addresses (little-endian)
+ *   num_addr * address_length bytes of addresses
+ *   1 byte   votes
+ *   4 bytes  expected votes (little-endian)
+ *   4 bytes  node ID (little-endian)
+ *
+ * routine is called with a temporary cluster_node filled in from each
+ * record, plus a pointer to the record's addresses and their count.
+ *
+ * Returns the number of records whose state is NODESTATE_MEMBER, or -1 if
+ * routine returned an error or a record does not fit inside the message. */
+static int unpack_nodes(unsigned char *buf, int len,
+			int (*routine) (struct cluster_node *, char *,
+					unsigned short))
+{
+	int ptr = 0;
+	int num_nodes = 0;
+	char namebuf[MAX_CLUSTER_MEMBER_NAME_LEN];
+	struct cluster_node node;
+
+	node.name = namebuf;
+
+	while (ptr < len) {
+		int namelen = buf[ptr++];
+		unsigned int evotes;
+		unsigned int node_id;
+		unsigned short num_addr;
+		unsigned char *addrs;
+
+		/* The name must fit our buffer (with its terminator), and
+		 * the name, state byte and address count must all lie inside
+		 * the message */
+		if (namelen >= MAX_CLUSTER_MEMBER_NAME_LEN ||
+		    namelen + 1 + (int)sizeof (short) > len - ptr)
+			return -1;
+
+		memcpy(namebuf, &buf[ptr], namelen);
+		namebuf[namelen] = '\0';
+		ptr += namelen;
+
+		node.state = buf[ptr++];
+
+		memcpy(&num_addr, &buf[ptr], sizeof (short));
+		num_addr = le16_to_cpu(num_addr);
+		ptr += sizeof (short);
+
+		/* The addresses, votes byte, expected votes and node ID must
+		 * also lie inside the message */
+		if (num_addr * address_length + 1 + 2 * (int)sizeof (int) >
+		    len - ptr)
+			return -1;
+
+		/* Just make a note of the addrs "array" */
+		addrs = &buf[ptr];
+		ptr += num_addr * address_length;
+
+		node.votes = buf[ptr++];
+
+		memcpy(&evotes, &buf[ptr], sizeof (int));
+		node.expected_votes = le32_to_cpu(evotes);
+		ptr += sizeof (int);
+
+		memcpy(&node_id, &buf[ptr], sizeof (int));
+		node.node_id = le32_to_cpu(node_id);
+		ptr += sizeof (int);
+
+		/* Call the callback routine */
+		if (routine(&node, addrs, num_addr) < 0)
+			return -1;
+
+		/* Return the number of MEMBER nodes */
+		if (node.state == NODESTATE_MEMBER)
+			num_nodes++;
+	}
+	return num_nodes;
+}
+
+/* Got join confirmation from a master node. This message contains a list of
+ * cluster nodes which we unpack and build into our cluster nodes list. When we
+ * have the last message (bit 2 set in buf[1]) we become a member, go into
+ * TRANSITION state, acknowledge with a CONFACK and start sending HELLOs.
+ *
+ * Returns 0 on success. If the node list cannot be unpacked we give up on
+ * joining, go to LEFT_CLUSTER state and return -1. */
+static int do_process_joinconf(struct msghdr *msg, char *buf, int len)
+{
+	struct sockaddr_cl *sender;
+	char confack;
+
+	if (unpack_nodes(buf + 2, len - 2, add_node) < 0) {
+		printk(KERN_ERR CMAN_NAME
+		       ": Error procssing joinconf message - giving up on cluster join\n");
+		us->leave_reason = CLUSTER_LEAVEFLAG_PANIC;
+		node_state = LEFT_CLUSTER;
+		return -1;
+	}
+
+	/* Not the last message in the list: wait for the rest */
+	if (!(buf[1] & 2))
+		return 0;
+
+	sender = msg->msg_name;
+
+	us->state = NODESTATE_MEMBER;
+	node_state = TRANSITION;
+	we_are_a_cluster_member = TRUE;
+
+	/* The acknowledgement is a single command byte */
+	confack = CLUSTER_MEM_CONFACK;
+	kcl_sendmsg(mem_socket, &confack, 1, sender,
+		    sizeof (struct sockaddr_cl),
+		    MSG_NOACK);
+
+	kernel_thread(hello_kthread, NULL, 0);
+	mod_timer(&hello_timer, jiffies + cman_config.hello_timer * HZ);
+
+	return 0;
+}
+
+/* Got the master's view of the cluster - compare it with ours and tell it the
+ * result.
+ *
+ * The view may span several messages: bit 1 of buf[1] marks the first and
+ * bit 2 the last. Every node in every message is compared with our own
+ * records by check_node and the MEMBER nodes are counted. On the last
+ * message a two-byte VIEWACK is sent back whose second byte is 1 (ACK) if
+ * every node matched and the member count equals cluster_members, or 0
+ * (NAK) otherwise.
+ *
+ * Ignored unless we are in MEMBER, TRANSITION or MASTER state.
+ * Always returns 0. */
+static int do_process_masterview(struct msghdr *msg, char *buf, int len)
+{
+	char reply[2] = { CLUSTER_MEM_VIEWACK, 0 };
+	static int num_nodes;
+	/* Set once any message of the current view failed to unpack or
+	 * contained a node that does not match our records */
+	static int view_mismatch;
+	int unpacked;
+
+	/* Someone else's state transition */
+	if (node_state != MEMBER &&
+	    node_state != TRANSITION && node_state != MASTER)
+		return 0;
+
+	/* First message, zero the counter */
+	if (buf[1] & 1) {
+		num_nodes = 0;
+		view_mismatch = 0;
+	}
+
+	/* unpack_nodes returns -1 on a mismatch; that must force a NAK, not
+	 * be added into the node count where it could cancel out */
+	unpacked = unpack_nodes(buf + 2, len - 2, check_node);
+	if (unpacked < 0)
+		view_mismatch = 1;
+	else
+		num_nodes += unpacked;
+
+	/* Last message, check the count and reply */
+	if (buf[1] & 2) {
+		if (!view_mismatch && num_nodes == cluster_members) {
+			/* Send ACK */
+			reply[1] = 1;
+		}
+		else {
+			P_MEMB
+			    ("Got %d nodes in MASTERVIEW message, we think there s/b %d\n",
+			     num_nodes, cluster_members);
+			/* Send NAK */
+			reply[1] = 0;
+		}
+		kcl_sendmsg(mem_socket, reply, 2, msg->msg_name,
+			    msg->msg_namelen, 0);
+	}
+	return 0;
+}
+
+/* Got a LEAVE message from a node.
+ *
+ * If the sender is a node we know, its leave reason (the second byte of the
+ * message) is recorded, leavereason is set to 1 when that reason is
+ * CLUSTER_LEAVEFLAG_REMOVED and 0 otherwise, and the node is treated as
+ * having just died. Messages from unknown nodes are ignored.
+ * Always returns 0. */
+static int do_process_leave(struct msghdr *msg, char *buf, int len)
+{
+	struct sockaddr_cl *saddr = msg->msg_name;
+	struct cluster_node *leaver = find_node_by_nodeid(saddr->scl_nodeid);
+	unsigned char reason;
+
+	if (!leaver)
+		return 0;
+
+	reason = ((unsigned char *)buf)[1];
+	leaver->leave_reason = reason;
+
+	if (reason == CLUSTER_LEAVEFLAG_REMOVED)
+		leavereason = 1;
+	else
+		leavereason = 0;
+
+	a_node_just_died(leaver);
+	return 0;
+}
+
+/* Got a HELLO message from a node.
+ *
+ *  - While we are starting up or waiting to join (STARTING, JOINWAIT,
+ *    JOINING, NEWCLUSTER) a HELLO tells us there is a cluster to join, so a
+ *    join request is sent to its sender.
+ *  - While we are a MEMBER the sender's generation number and member count
+ *    are compared with ours; a disagreement starts a TRANS_CHECK
+ *    transition, otherwise the sender's last_hello time is refreshed. A
+ *    HELLO from a node we do not know, or think is dead, gets that node
+ *    killed if we are quorate.
+ *  - A HELLO flagged as coming from a master while we are not the master
+ *    starts a TRANS_CHECK transition.
+ *
+ * Always returns 0. */
+static int do_process_hello(struct msghdr *msg, char *buf, int len)
+{
+	struct cluster_node *node;
+	struct cl_mem_hello_msg *hellomsg =
+		(struct cl_mem_hello_msg *)buf;
+	struct sockaddr_cl *saddr = msg->msg_name;
+
+	/* We are starting up. Send a join message to the node whose HELLO we
+	 * just received */
+	if (node_state == STARTING || node_state == JOINWAIT ||
+	    node_state == JOINING  || node_state == NEWCLUSTER) {
+
+		printk(KERN_INFO CMAN_NAME ": sending membership request\n");
+
+		send_joinreq(saddr, msg->msg_namelen);
+		join_time = jiffies;
+		node_state = JOINING;
+		return 0;
+	}
+
+	/* Only process HELLOs if we are not in transition */
+	if (node_state == MEMBER) {
+
+		node = find_node_by_nodeid(saddr->scl_nodeid);
+		if (node && node->state != NODESTATE_DEAD) {
+
+			/* Check the cluster generation in the HELLO message.
+			 * NOTE: this may be different if the message crossed
+			 * on the wire with an END-TRANS so we allow a period
+			 * of grace in which this is allowable */
+			if (cluster_generation !=
+			    le32_to_cpu(hellomsg->generation)
+			    && node_state == MEMBER
+			    && time_after(jiffies,
+					  cman_config.hello_timer * HZ +
+					  transition_end_time)) {
+
+				printk(KERN_DEBUG CMAN_NAME
+				       ": bad generation number %d in HELLO message from %d, expected %d\n",
+				       le32_to_cpu(hellomsg->generation),
+				       saddr->scl_nodeid,
+				       cluster_generation);
+
+				start_transition(TRANS_CHECK, node);
+				return 0;
+			}
+
+			if (cluster_members != le16_to_cpu(hellomsg->members)
+			    && node_state == MEMBER) {
+				printk(KERN_DEBUG CMAN_NAME
+				       ": nmembers in HELLO message from %d does not match our view (got %d, exp %d)\n",
+				       saddr->scl_nodeid,
+				       le16_to_cpu(hellomsg->members),
+				       cluster_members);
+				start_transition(TRANS_CHECK, node);
+				return 0;
+			}
+			/* The message is OK - save the time */
+			node->last_hello = jiffies;
+		}
+		else {
+			/* This node is a danger to our valid cluster */
+			if (cluster_is_quorate) {
+				send_kill(saddr->scl_nodeid, 0);
+			}
+		}
+	}
+
+	/* If we get a master hello and we are not the master then start a CHECK transition, cos the
+	   real master must have gone away in a period of confusion */
+	if (node_state != MASTER && hellomsg->flags & HELLO_FLAG_MASTER) {
+		node = find_node_by_nodeid(saddr->scl_nodeid);
+
+		/* The sender may be unknown to us. start_transition needs
+		 * some node info, so fall back to our own node as
+		 * do_process_nominate does rather than passing NULL */
+		if (node == NULL)
+			node = us;
+		start_transition(TRANS_CHECK, node);
+	}
+
+	return 0;
+
+}
+
+/* Got a KILL message telling us to leave the cluster.
+ *
+ * Obeyed only when the sender is a node we know to be a MEMBER: we move to
+ * LEFT_CLUSTER state, ask our threads to quit and wake them up. A request
+ * from anyone else is ignored. Always returns 0. */
+static int do_process_kill(struct msghdr *msg, char *buf, int len)
+{
+	struct sockaddr_cl *saddr = msg->msg_name;
+	struct cluster_node *killer = find_node_by_nodeid(saddr->scl_nodeid);
+
+	if (!killer || killer->state != NODESTATE_MEMBER) {
+		P_MEMB("Asked to leave the cluster by a non-member. What a nerve!\n");
+		return 0;
+	}
+
+	printk(KERN_INFO CMAN_NAME
+	       ": Being told to leave the cluster by node %d\n",
+	       saddr->scl_nodeid);
+
+	node_state = LEFT_CLUSTER;
+	quit_threads = 1;
+	wake_up_process(membership_task);
+	wake_up_interruptible(&cnxman_waitq);
+
+	return 0;
+}
+
+/* Some cluster membership utility functions */
+
+/* Find a node by its name.
+ * Walks cluster_members_list under cluster_members_lock and returns the
+ * first node whose name equals name, or NULL if there is none. */
+struct cluster_node *find_node_by_name(char *name)
+{
+	struct cluster_node *candidate;
+	struct cluster_node *found = NULL;
+
+	down(&cluster_members_lock);
+	list_for_each_entry(candidate, &cluster_members_list, list) {
+		if (strcmp(candidate->name, name) == 0) {
+			found = candidate;
+			break;
+		}
+	}
+	up(&cluster_members_lock);
+
+	return found;
+}
+
+/* Find a node by one of its addresses.
+ * Try to avoid using this as it's slow and holds the members lock.
+ *
+ * Every address of every node is compared with addr, skipping the first two
+ * bytes and comparing the remaining address_length - 2. addr_len is not
+ * used. Returns the first matching node, or NULL if there is none. */
+struct cluster_node *find_node_by_addr(unsigned char *addr, int addr_len)
+{
+	struct cluster_node *candidate;
+	struct cluster_node_addr *known;
+	struct cluster_node *found = NULL;
+
+	down(&cluster_members_lock);
+
+	list_for_each_entry(candidate, &cluster_members_list, list) {
+		list_for_each_entry(known, &candidate->addr_list, list) {
+			if (memcmp(known->addr+2, addr+2, address_length-2) == 0) {
+				found = candidate;
+				goto out;
+			}
+		}
+	}
+
+out:
+	up(&cluster_members_lock);
+	return found;
+}
+
+/* This is the quick way to find a node */
+struct cluster_node *find_node_by_nodeid(unsigned int id)
+{
+	struct cluster_node *entry = NULL;
+
+	/* Ids at or beyond the size of the lookup array yield NULL */
+	if (id < sizeof_members_array) {
+		spin_lock(&members_by_nodeid_lock);
+		entry = members_by_nodeid[id];
+		spin_unlock(&members_by_nodeid_lock);
+	}
+	return entry;
+}
+
+static int dispatch_messages(struct socket *mem_socket)
+{
+	/* Drain the receive queue, passing each packet to do_membership_packet */
+	while (skb_peek(&mem_socket->sk->sk_receive_queue)) {
+		struct sockaddr_cl from;
+		struct msghdr hdr;
+		struct kvec iov;
+		int nbytes;
+
+		/* Something more important to do ? */
+		if (quit_threads)
+			return 0;
+		if (test_bit(WAKE_FLAG_DEADNODE, &wake_flags))
+			return 0;
+
+		memset(&from, 0, sizeof (from));
+
+		hdr.msg_name = &from;
+		hdr.msg_namelen = sizeof (from);
+		hdr.msg_control = NULL;
+		hdr.msg_controllen = 0;
+		hdr.msg_flags = 0;
+
+		iov.iov_base = iobuf;
+		iov.iov_len = MAX_CLUSTER_MESSAGE;
+
+		nbytes = kernel_recvmsg(mem_socket, &hdr, &iov, 1,
+					MAX_CLUSTER_MESSAGE, MSG_DONTWAIT);
+
+		/* -EAGAIN ends the loop quietly; any other non-positive is -1 */
+		if (nbytes == -EAGAIN)
+			return 0;
+		if (nbytes <= 0)
+			return -1;
+
+		/* Re-point msg_name at the sender address before dispatching */
+		hdr.msg_name = &from;
+		do_membership_packet(&hdr, iobuf, nbytes);
+	}
+
+	return 0;
+}
+
+/* Scan the nodes list for dead nodes */
+static void check_for_dead_nodes()
+{
+	struct cluster_node *cur;
+
+	down(&cluster_members_lock);
+	list_for_each_entry(cur, &cluster_members_list, list) {
+		/* Skip nodes already dead, heard from recently, or ourselves */
+		if (cur->state == NODESTATE_DEAD)
+			continue;
+		if (!time_after(jiffies, cur->last_hello +
+				cman_config.deadnode_timeout * HZ))
+			continue;
+		if (cur->us)
+			continue;
+
+		/* Only one dead node is handled per call; drop the lock first */
+		up(&cluster_members_lock);
+
+		P_MEMB("last hello was %ld, current time is %ld\n",
+		       cur->last_hello, jiffies);
+
+		cur->leave_reason = CLUSTER_LEAVEFLAG_DEAD;
+		leavereason = 0;
+
+		/* This is unlikely to work but it's worth a try! */
+		send_kill(cur->node_id, 0);
+
+		/* Start state transition */
+		a_node_just_died(cur);
+		return;
+	}
+	up(&cluster_members_lock);
+
+	/* Also check for a dead quorum device */
+	if (!quorum_device)
+		return;
+	if (quorum_device->state != NODESTATE_MEMBER)
+		return;
+	if (!time_after(jiffies, quorum_device->last_hello +
+			cman_config.deadnode_timeout * HZ))
+		return;
+
+	quorum_device->state = NODESTATE_DEAD;
+	printk(KERN_WARNING CMAN_NAME
+	       ": Quorum device %s timed out\n",
+	       quorum_device->name);
+	recalculate_quorum(0);
+}
+
+/* add "us" as a node in the cluster */
+static int add_us()
+{
+	struct cluster_node *self;
+	size_t name_size = strlen(nodename) + 1;
+
+	self = kmalloc(sizeof (*self), GFP_KERNEL);
+	if (!self) {
+		/* We cannot carry on without our own node structure: send
+		 * a leave message flagged as a panic, then panic */
+		send_leave(CLUSTER_LEAVEFLAG_PANIC);
+
+		printk(KERN_CRIT CMAN_NAME
+		       ": Cannot allocate memory for our node structure\n");
+		panic("Must die");
+
+		return -1;
+	}
+	memset(self, 0, sizeof (*self));
+
+	self->name = kmalloc(name_size, GFP_KERNEL);
+	if (!self->name) {
+		send_leave(CLUSTER_LEAVEFLAG_PANIC);
+
+		printk(KERN_CRIT CMAN_NAME
+		       ": Cannot allocate memory for node name\n");
+		kfree(self);
+
+		panic("Must die");
+
+		return -1;
+	}
+	memcpy(self->name, nodename, name_size);
+	self->last_hello = jiffies;
+	self->votes = votes;
+	self->expected_votes = expected_votes;
+	self->state = NODESTATE_JOINING;
+	self->node_id = 0;	/* Will get filled in by ENDTRANS message */
+	self->us = 1;
+	self->leave_reason = 0;
+	INIT_LIST_HEAD(&self->addr_list);
+	get_local_addresses(self);	/* Get from cnxman socket info */
+	do_gettimeofday(&self->join_time);
+
+	/* Put the new node on the members list and remember it as "us" */
+	down(&cluster_members_lock);
+	list_add(&self->list, &cluster_members_list);
+	cluster_members++;
+	up(&cluster_members_lock);
+	us = self;
+
+	return 0;
+}
+
+/* Return the highest known node_id */
+unsigned int get_highest_nodeid()
+{
+	struct cluster_node *cur;
+	unsigned int top = 0;
+
+	/*
+	 * Scan every entry on the members list under its lock, keeping
+	 * the largest node_id seen; an empty list yields 0.
+	 */
+	down(&cluster_members_lock);
+	list_for_each_entry(cur, &cluster_members_list, list) {
+		if (top < cur->node_id)
+			top = cur->node_id;
+	}
+	up(&cluster_members_lock);
+	return top;
+}
+
+/* Elect as master the MEMBER-state node with the lowest node ID. The winner
+ * is stored in *master_node and its "us" flag (set for our node) returned. */
+static int elect_master(struct cluster_node **master_node, int disallow_node)
+{
+	struct cluster_node *cand;
+	int id;
+
+	for (id = 1; id < sizeof_members_array; id++) {
+		cand = members_by_nodeid[id];
+		if (!cand || cand->state != NODESTATE_MEMBER)
+			continue;
+		*master_node = cand;
+		P_MEMB("Elected master is %s\n", cand->name);
+		return cand->us;
+	}
+	BUG();		/* reached only when no node is in MEMBER state */
+	return 0;
+}
+
+/* Called by node_cleanup in cnxman when we have left the cluster */
+void free_nodeid_array()
+{
+	vfree(members_by_nodeid);
+	members_by_nodeid = NULL;	/* lets allocate_nodeid_array() allocate afresh */
+	sizeof_members_array = 0;	/* find_node_by_nodeid() now rejects every id */
+}
+
+int allocate_nodeid_array()
+{
+	/* max_nodes is writable through /proc, so sample it exactly once */
+	int nodes = cman_config.max_nodes;
+
+	if (!members_by_nodeid) {
+		spin_lock_init(&members_by_nodeid_lock);
+		members_by_nodeid = vmalloc(nodes * sizeof (*members_by_nodeid));
+	} else {
+		/* An existing array is cleared at the size it already has */
+		nodes = sizeof_members_array;
+	}
+	if (!members_by_nodeid) {
+		printk(KERN_WARNING
+		       "Unable to allocate members array for %d members\n",
+		       nodes);
+		return -ENOMEM;
+	}
+	memset(members_by_nodeid, 0, nodes * sizeof (*members_by_nodeid));
+	sizeof_members_array = nodes;
+	return 0;
+}
+
+/* Store v and e in votes and expected_votes; add_us() copies both to our node */
+void set_votes(int v, int e)
+{
+	votes = v;
+	expected_votes = e;
+}
+
+int get_quorum()
+{
+	return quorum;	/* printed on the "Quorum:" line of /proc/cluster/status */
+}
+
+/* Lets cnxman ask whether activity should be blocked by a state transition */
+int in_transition()
+{
+	if (node_state == TRANSITION || node_state == TRANSITION_COMPLETE)
+		return 1;
+	return node_state == MASTER;
+}
+
+/* Return the current membership state as a string for the main line to put
+ * into /proc . The text is cut to fit buflen and is always NUL-terminated. */
+char *membership_state(char *buf, int buflen)
+{
+	if (buflen <= 0)	/* no room even for the terminator */
+		return buf;
+	switch (node_state) {
+	case STARTING:
+		snprintf(buf, buflen, "Starting");
+		break;
+	case NEWCLUSTER:
+		snprintf(buf, buflen, "New-Cluster?");
+		break;
+	case JOINING:
+		snprintf(buf, buflen, "Joining");
+		break;
+	case JOINWAIT:
+		snprintf(buf, buflen, "Join-Wait");
+		break;
+	case JOINACK:
+		snprintf(buf, buflen, "Join-Ack");
+		break;
+	case TRANSITION:
+		snprintf(buf, buflen, "State-Transition: Master is %s",
+			 master_node ? master_node->name : "Unknown");
+		break;
+	case MEMBER:
+		snprintf(buf, buflen, "Cluster-Member");
+		break;
+	case REJECTED:
+		snprintf(buf, buflen, "Rejected");
+		break;
+	case LEFT_CLUSTER:
+		snprintf(buf, buflen, "Not-in-Cluster");
+		break;
+	case TRANSITION_COMPLETE:
+		snprintf(buf, buflen, "Transition-Complete");
+		break;
+	case MASTER:
+		snprintf(buf, buflen, "Transition-Master");
+		break;
+	default:
+		snprintf(buf, buflen, "Unknown: code=%d", node_state);
+		break;
+	}
+	return buf;
+}
+
+char *leave_string(int reason)
+{
+	static char msg[32];	/* shared scratch space for unlisted reasons */
+
+	/* Only the low four bits of reason select the text */
+	switch (reason & 0xF) {
+	case CLUSTER_LEAVEFLAG_DOWN:
+		return "Shutdown";
+	case CLUSTER_LEAVEFLAG_KILLED:
+		return "Killed by another node";
+	case CLUSTER_LEAVEFLAG_PANIC:
+		return "Panic";
+	case CLUSTER_LEAVEFLAG_REMOVED:
+		return "Removed";
+	case CLUSTER_LEAVEFLAG_REJECTED:
+		return "Membership rejected";
+	case CLUSTER_LEAVEFLAG_INCONSISTENT:
+		return "Inconsistent cluster view";
+	case CLUSTER_LEAVEFLAG_DEAD:
+		return "Missed too many heartbeats";
+	case CLUSTER_LEAVEFLAG_NORESPONSE:
+		return "No response to messages";
+	}
+	sprintf(msg, "Reason is %d\n", reason);
+	return msg;
+}
+
+#ifdef DEBUG_MEMB
+/* Debug helper: map a CLUSTER_MEM_* message code to a printable name */
+static char *msgname(int msg)
+{
+	switch (msg) {
+	case CLUSTER_MEM_JOINCONF:
+		return "JOINCONF";
+	case CLUSTER_MEM_JOINREQ:
+		return "JOINREQ";
+	case CLUSTER_MEM_LEAVE:
+		return "LEAVE";
+	case CLUSTER_MEM_HELLO:
+		return "HELLO";
+	case CLUSTER_MEM_KILL:
+		return "KILL";
+	case CLUSTER_MEM_JOINACK:
+		return "JOINACK";
+	case CLUSTER_MEM_ENDTRANS:
+		return "ENDTRANS";
+	case CLUSTER_MEM_RECONFIG:
+		return "RECONFIG";
+	case CLUSTER_MEM_MASTERVIEW:
+		return "MASTERVIEW";
+	case CLUSTER_MEM_STARTTRANS:
+		return "STARTTRANS";
+	case CLUSTER_MEM_JOINREJ:
+		return "JOINREJ";
+	case CLUSTER_MEM_VIEWACK:
+		return "VIEWACK";
+	case CLUSTER_MEM_STARTACK:
+		return "STARTACK";
+	case CLUSTER_MEM_NEWCLUSTER:
+		return "NEWCLUSTER";
+	case CLUSTER_MEM_CONFACK:
+		return "CONFACK";
+	case CLUSTER_MEM_NOMINATE:
+		return "NOMINATE";
+	case CLUSTER_MEM_NODEDOWN:
+		return "NODEDOWN";
+	}
+
+	return "??UNKNOWN??";
+}
+
+#endif
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
--- linux-2.6.9.orig/cluster/cman/proc.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/proc.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,400 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/init.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/list.h>
+#include <linux/in.h>
+#include <net/sock.h>
+#include <cluster/cnxman.h>
+#include <cluster/service.h>
+
+#include "cnxman-private.h"
+#include "config.h"
+
+extern int cluster_members;
+extern struct list_head cluster_members_list;
+extern struct semaphore cluster_members_lock;
+extern struct cluster_node *quorum_device;
+extern int we_are_a_cluster_member;
+extern int cluster_is_quorate;
+extern uint16_t cluster_id;
+extern atomic_t use_count;
+extern unsigned int address_length;
+extern unsigned int config_version;
+extern char cluster_name[];
+extern char nodename[];
+extern struct cluster_node *us;
+static struct seq_operations cluster_info_op;
+
+int sm_proc_open(struct inode *inode, struct file *file);
+int sm_debug_info(char *b, char **start, off_t offset, int length);
+
+/* /proc interface to the configuration struct */
+static struct config_proc_info {
+    char *name;		/* file name created under cluster/config/cman */
+    int  *value;	/* cman_config field read and written through that file */
+} config_proc[] = {
+    {
+	.name = "joinwait_timeout",
+	.value = &cman_config.joinwait_timeout,
+    },
+    {
+	.name = "joinconf_timeout",
+	.value = &cman_config.joinconf_timeout,
+    },
+    {
+	.name = "join_timeout",
+	.value = &cman_config.join_timeout,
+    },
+    {
+	.name = "hello_timer",
+	.value = &cman_config.hello_timer,
+    },
+    {
+	.name = "deadnode_timeout",
+	.value = &cman_config.deadnode_timeout,
+    },
+    {
+	.name = "transition_timeout",
+	.value = &cman_config.transition_timeout,
+    },
+    {
+	.name = "transition_restarts",
+	.value = &cman_config.transition_restarts,
+    },
+    {
+	.name = "max_nodes",
+	.value = &cman_config.max_nodes,
+    },
+    {
+	.name = "sm_debug_size",
+	.value = &cman_config.sm_debug_size,
+    },
+    {
+	.name = "newcluster_timeout",
+	.value = &cman_config.newcluster_timeout,
+    },
+    {
+        .name = "max_retries",
+        .value = &cman_config.max_retries,
+    },
+
+};
+
+/* get_info handler for cluster/status: writes the text to b, returns its length */
+static int proc_cluster_status(char *b, char **start, off_t offset, int length)
+{
+    struct list_head *nodelist;
+    struct cluster_node *node;
+    struct cluster_node_addr *node_addr;
+    unsigned int total_votes = 0;
+    unsigned int max_expected = 0;
+    int c = 0;
+    /* offset is not used; length only bounds the membership_state() call */
+    c += sprintf(b+c,
+		 "Protocol version: %d.%d.%d\n",
+		 CNXMAN_MAJOR_VERSION, CNXMAN_MINOR_VERSION,
+		 CNXMAN_PATCH_VERSION);
+
+    c += sprintf(b+c,
+		 "Config version: %d\nCluster name: %s\nCluster ID: %d\nCluster Member: %s\nMembership state: ",
+		 config_version,
+		 cluster_name, cluster_id,
+		 we_are_a_cluster_member?"Yes":"No");
+    /* membership_state() writes the state name at b+c; then step c past it */
+    membership_state(b+c, length-c);
+    c += strlen(b+c);
+    c += sprintf(b+c, "\n");
+    /* Non-members get only the lines written so far */
+    if (!we_are_a_cluster_member)
+	return c;
+
+    /* Total the votes and find the largest expected_votes among MEMBER nodes */
+    down(&cluster_members_lock);
+    list_for_each(nodelist, &cluster_members_list) {
+	node = list_entry(nodelist, struct cluster_node, list);
+	if (node->state == NODESTATE_MEMBER) {
+	    total_votes += node->votes;
+	    max_expected =
+		max(max_expected, node->expected_votes);
+	}
+    }
+    up(&cluster_members_lock);
+    /* A quorum device in MEMBER state adds its votes to the total */
+    if (quorum_device && quorum_device->state == NODESTATE_MEMBER)
+	total_votes += quorum_device->votes;
+
+    c += sprintf(b+c,
+		 "Nodes: %d\nExpected_votes: %d\nTotal_votes: %d\nQuorum: %d  %s\n",
+		 cluster_members, max_expected, total_votes,
+		 get_quorum(),
+		 cluster_is_quorate ? " " : "Activity blocked");
+    c += sprintf(b+c, "Active subsystems: %d\n",
+		 atomic_read(&use_count));
+
+    c += sprintf(b+c, "Node name: %s\n", nodename);
+
+    if (us) {
+	    c += sprintf(b+c, "Node ID: %d\n", us->node_id);
+
+	    c += sprintf(b+c, "Node addresses: ");
+	    list_for_each_entry(node_addr, &us->addr_list, list) {
+		    struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)node_addr->addr;
+		    if (saddr->sin6_family == AF_INET6) {
+			    c += sprintf(b+c, "%x:%x:%x:%x:%x:%x:%x:%x  ",
+					 be16_to_cpu(saddr->sin6_addr.s6_addr16[0]),
+					 be16_to_cpu(saddr->sin6_addr.s6_addr16[1]),
+					 be16_to_cpu(saddr->sin6_addr.s6_addr16[2]),
+					 be16_to_cpu(saddr->sin6_addr.s6_addr16[3]),
+					 be16_to_cpu(saddr->sin6_addr.s6_addr16[4]),
+					 be16_to_cpu(saddr->sin6_addr.s6_addr16[5]),
+					 be16_to_cpu(saddr->sin6_addr.s6_addr16[6]),
+					 be16_to_cpu(saddr->sin6_addr.s6_addr16[7]));
+		    }
+		    else {	/* any family other than AF_INET6 is printed as a dotted quad */
+			    struct sockaddr_in *saddr4 = (struct sockaddr_in *)saddr;
+			    uint8_t *addr = (uint8_t *)&saddr4->sin_addr;
+			    c+= sprintf(b+c, "%u.%u.%u.%u  ",
+					addr[0], addr[1], addr[2], addr[3]);
+		    }
+	    }
+	    c += sprintf(b+c, "\n\n");
+    }
+    return c;
+}
+
+
+/* Allocate one of these for /proc/cluster/nodes so we can keep a track of where
+ * we are */
+struct cluster_seq_info {
+	int nodeid;		/* id of the entry to show next; 0 means the quorum device */
+	int highest_nodeid;	/* get_highest_nodeid() as sampled in cluster_seq_start() */
+};
+
+static int cluster_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cluster_info_op);	/* cluster_info_op supplies start/next/stop/show */
+}
+
+static void *cluster_seq_start(struct seq_file *m, loff_t * pos)
+{
+	struct cluster_seq_info *state = m->private;
+
+	/* cluster_seq_stop() frees and clears m->private, so allocate afresh
+	 * whenever it is empty */
+	if (!state) {
+		state = kmalloc(sizeof (*state), GFP_KERNEL);
+		m->private = state;
+		if (!state)
+			return NULL;
+	}
+
+	/* Keep highest_nodeid here so we don't need to keep traversing the
+	 * list to find it */
+	state->nodeid = *pos;
+	state->highest_nodeid = get_highest_nodeid();
+
+	/* Print the header */
+	if (*pos == 0)
+		seq_printf(m, "Node  Votes Exp Sts  Name\n");
+	return state;
+}
+
+static void *cluster_seq_next(struct seq_file *m, void *p, loff_t * pos)
+{
+	struct cluster_seq_info *state = p;
+
+	/* Advance to the next node id and mirror it into *pos */
+	state->nodeid++;
+	*pos = state->nodeid;
+
+	return (state->nodeid <= state->highest_nodeid) ? state : NULL;
+}
+
+static int cluster_seq_show(struct seq_file *m, void *p)
+{
+	struct cluster_seq_info *state = p;
+	struct cluster_node *entry;
+	char flag;
+
+	/*
+	 * Position "0" stands for the quorum device (if there is one);
+	 * every other position is looked up as a node id.
+	 */
+	entry = state->nodeid ? find_node_by_nodeid(state->nodeid)
+			      : quorum_device;
+
+	/* Nothing at this position: emit no line */
+	if (!entry)
+		return 0;
+
+	/* Make state printable */
+	switch (entry->state) {
+	case NODESTATE_MEMBER:
+		flag = 'M';
+		break;
+	case NODESTATE_JOINING:
+		flag = 'J';
+		break;
+	case NODESTATE_DEAD:
+		flag = 'X';
+		break;
+	default:
+		flag = '?';
+		break;
+	}
+
+	seq_printf(m, "%4d  %3d  %3d   %c   %s\n",
+		   entry->node_id, entry->votes, entry->expected_votes,
+		   flag, entry->name);
+
+	return 0;
+}
+
+static void cluster_seq_stop(struct seq_file *m, void *p)
+{
+	/* kfree() ignores NULL, so m->private needs no test beforehand.
+	 * Clearing it makes the next cluster_seq_start() allocate again. */
+	kfree(m->private);
+	m->private = NULL;
+}
+
+static struct seq_operations cluster_info_op = {
+	.start = cluster_seq_start,	/* sets up the cursor; prints the header at pos 0 */
+	.next = cluster_seq_next,	/* steps to the next node id */
+	.stop = cluster_seq_stop,	/* frees the cursor */
+	.show = cluster_seq_show	/* prints one node line */
+};
+
+static struct file_operations cluster_fops = {
+	.open = cluster_open,		/* attaches cluster_info_op via seq_open() */
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+	.owner = THIS_MODULE,
+};
+
+static struct file_operations service_fops = {
+	.open = sm_proc_open,		/* only declared in this file; used for cluster/services */
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+	.owner = THIS_MODULE,
+};
+
+static int cman_config_read_proc(char *page, char **start, off_t off, int count,
+				 int *eof, void *data)
+{
+	/* The file is a single decimal line; start, off and eof are unused */
+	int *setting = ((struct config_proc_info *) data)->value;
+	return snprintf(page, count, "%d\n", *setting);
+}
+
+static int cman_config_write_proc(struct file *file, const char __user *buffer,
+				  unsigned long count, void *data)
+{
+    struct config_proc_info *cinfo = data;
+    char buff[11];	/* at most 10 input characters plus the NUL */
+    int value, num;
+    char *end;
+
+    num = (count < 10) ? count : 10;
+    if (copy_from_user(buff, buffer, num))
+	    return -EFAULT;
+    buff[num] = '\0';
+
+    /* end == buff: no digits were parsed, so reject instead of storing 0 */
+    value = simple_strtoul(buff, &end, 10);
+    if (end == buff)
+	    return -EINVAL;
+    *cinfo->value = value;
+    return count;
+}
+
+/* Base of the config directory for cman */
+static struct proc_dir_entry *proc_cman_config;
+void create_proc_entries(void)
+{
+	struct proc_dir_entry *cluster_dir;
+	struct proc_dir_entry *pde;
+	int idx;
+
+	cluster_dir = proc_mkdir("cluster", NULL);
+	if (!cluster_dir)
+		return;
+	cluster_dir->owner = THIS_MODULE;
+
+	/* Config dir filled in by us and others */
+	if (!proc_mkdir("cluster/config", NULL))
+		return;
+
+	/* The informational files are optional: one that cannot be created
+	 * is simply left out */
+	pde = create_proc_entry("cluster/nodes", S_IRUGO, NULL);
+	if (pde)
+		pde->proc_fops = &cluster_fops;
+	pde = create_proc_entry("cluster/status", S_IRUGO, NULL);
+	if (pde)
+		pde->get_info = proc_cluster_status;
+	pde = create_proc_entry("cluster/services", S_IRUGO, NULL);
+	if (pde)
+		pde->proc_fops = &service_fops;
+
+	/* Config entries */
+	proc_cman_config = proc_mkdir("cluster/config/cman", NULL);
+	if (!proc_cman_config)
+		return;
+
+	/* One read/write file per config_proc[] entry */
+	for (idx = 0; idx < ARRAY_SIZE(config_proc); idx++) {
+		pde = create_proc_entry(config_proc[idx].name, 0660,
+					proc_cman_config);
+		if (!pde)
+			continue;
+		pde->data = &config_proc[idx];
+		pde->write_proc = cman_config_write_proc;
+		pde->read_proc = cman_config_read_proc;
+	}
+
+	pde = create_proc_entry("cluster/sm_debug", S_IRUGO, NULL);
+	if (pde)
+		pde->get_info = sm_debug_info;
+}
+
+void cleanup_proc_entries(void)
+{
+	int i;
+
+	remove_proc_entry("cluster/sm_debug", NULL);
+
+	/* The per-setting files exist only if their directory was created */
+	if (proc_cman_config) {
+		for (i = 0; i < ARRAY_SIZE(config_proc); i++)
+			remove_proc_entry(config_proc[i].name, proc_cman_config);
+		proc_cman_config = NULL;	/* stale once its directory is removed */
+	}
+	remove_proc_entry("cluster/config/cman", NULL);
+	remove_proc_entry("cluster/nodes", NULL);
+	remove_proc_entry("cluster/status", NULL);
+	remove_proc_entry("cluster/services", NULL);
+
+	/* Parent directories go last, and each is removed exactly once */
+	remove_proc_entry("cluster/config", NULL);
+	remove_proc_entry("cluster", NULL);
+}
--- linux-2.6.9.orig/cluster/cman/sm.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,109 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SM_DOT_H__
+#define __SM_DOT_H__
+
+/* 
+ * This is the main header file to be included in each Service Manager source
+ * file.
+ */
+
+#include <linux/list.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <net/sock.h>
+
+#include <cluster/cnxman.h>
+#include <cluster/service.h>
+
+#define SG_LEVELS (4)
+
+#include "sm_internal.h"
+#include "sm_barrier.h"
+#include "sm_control.h"
+#include "sm_daemon.h"
+#include "sm_joinleave.h"
+#include "sm_membership.h"
+#include "sm_message.h"
+#include "sm_misc.h"
+#include "sm_recover.h"
+#include "sm_services.h"
+
+extern struct list_head sm_sg[SG_LEVELS];
+extern struct semaphore sm_sglock;
+
+#ifndef TRUE
+#define TRUE (1)
+#endif
+
+#ifndef FALSE
+#define FALSE (0)
+#endif
+
+#define SM_ASSERT(x, do) \
+{ /* if x is false: report it, run the 'do' statements, then panic */ \
+  if (!(x)) \
+  { \
+    printk("\nSM:  Assertion failed on line %d of file %s\n" \
+               "SM:  assertion:  \"%s\"\n" \
+               "SM:  time = %lu\n", \
+               __LINE__, __FILE__, #x, jiffies); \
+    {do} \
+    printk("\n"); \
+    panic("SM:  Record message above and reboot.\n"); \
+  } \
+}
+
+#define SM_RETRY(do_this, until_this) \
+for (;;) /* run do_this and yield, repeating until until_this is true */ \
+{ \
+  do { do_this; schedule(); } while (0); \
+  if (until_this) \
+    break; \
+  printk("SM:  out of memory:  %s, %u\n", __FILE__, __LINE__); \
+  schedule();\
+}
+
+/* printk wrappers: log_print adds "SM: "; log_error also adds sg's global_id */
+#define log_print(fmt, args...) printk("SM: "fmt"\n", ##args)
+
+#define log_error(sg, fmt, args...) \
+	printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
+
+/* log_debug target: SM_DEBUG_CONSOLE = printk, SM_DEBUG_LOG = sm_debug_log(), SM_DEBUG_ALL = both */
+#define SM_DEBUG_LOG	/* selects the sm_debug_log() form of log_debug */
+
+#ifdef SM_DEBUG_CONSOLE
+#define log_debug(sg, fmt, args...) \
+	printk("SM: %08x " fmt "\n", (sg)->global_id , ##args)
+#endif
+
+#ifdef SM_DEBUG_LOG
+#define log_debug(sg, fmt, args...) sm_debug_log(sg, fmt, ##args); /* NOTE(review): expansion ends in its own ';' */
+#endif
+
+#ifdef SM_DEBUG_ALL
+#define log_debug(sg, fmt, args...) \
+do \
+{ \
+	printk("SM: %08x "fmt"\n", (sg)->global_id, ##args); \
+	sm_debug_log(sg, fmt, ##args); \
+} \
+while (0)
+#endif
+
+#endif				/* __SM_DOT_H__ */
--- linux-2.6.9.orig/cluster/cman/sm_barrier.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_barrier.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,233 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "sm.h"
+
+static struct list_head	barriers;	/* queued bc_entry items awaiting process_barriers() */
+static spinlock_t	barriers_lock;	/* taken around every access to barriers */
+
+struct bc_entry {
+	struct list_head list;	/* link in the barriers list */
+	uint32_t gid;		/* SG global id parsed from the barrier name */
+	int status;		/* status value handed to the barrier callback */
+	char type;		/* one of the SM_BARRIER_* values */
+};
+typedef struct bc_entry bc_entry_t;
+
+void init_barriers(void)
+{
+	INIT_LIST_HEAD(&barriers);	/* start with no queued callbacks */
+	spin_lock_init(&barriers_lock);
+}
+
+static int atoi(char *c)
+{
+	int total = 0;
+
+	/* Accumulate leading decimal digits; stop at the first non-digit */
+	for (; *c >= '0' && *c <= '9'; c++)
+		total = total * 10 + (*c - '0');
+
+	return total;
+}
+/* Queue one barrier result (gid, status, type) for process_barriers() */
+static void add_barrier_callback(char *name, int status, int type)
+{
+	char *p;
+	uint32_t gid;
+	bc_entry_t *be;
+
+	/* an ESRCH callback just means there was a cnxman transition */
+	if (status == -ESRCH)
+		return;
+
+	/* extract global id of SG from barrier name */
+	p = strstr(name, "sm.");
+	/* a name without "sm." fails the assertion below, which panics */
+	SM_ASSERT(p, printk("name=\"%s\" status=%d\n", name, status););
+
+	p += strlen("sm.");
+	gid = atoi(p);
+
+	be = kmalloc(sizeof(bc_entry_t), GFP_ATOMIC);
+	SM_ASSERT(be,);
+
+	be->gid = gid;
+	be->status = status;
+	be->type = type;
+	/* append under the lock, then signal DO_BARRIERS */
+	spin_lock(&barriers_lock);
+	list_add_tail(&be->list, &barriers);
+	spin_unlock(&barriers_lock);
+
+	wake_serviced(DO_BARRIERS);
+}
+
+static void callback_recovery_barrier(char *name, int status)
+{
+	add_barrier_callback(name, status, SM_BARRIER_RECOVERY);	/* sm_barrier() picks this for SM_BARRIER_RECOVERY */
+}
+
+static void callback_startdone_barrier_new(char *name, int status)
+{
+	add_barrier_callback(name, status, SM_BARRIER_STARTDONE_NEW);	/* sm_barrier() picks this for SM_BARRIER_STARTDONE_NEW */
+}
+
+static void callback_startdone_barrier(char *name, int status)
+{
+	add_barrier_callback(name, status, SM_BARRIER_STARTDONE);	/* sm_barrier() picks this for SM_BARRIER_STARTDONE */
+}
+
+int sm_barrier(char *name, int count, int type)
+{
+	unsigned long cb = 0;
+	int rv;
+
+	/* Register the named barrier, then set its autodelete, callback and
+	 * enabled attributes in that order.  Returns 0, or the first error
+	 * reported by the barrier calls. */
+
+	/* Pick the callback for this barrier type; other types leave it 0 */
+	if (type == SM_BARRIER_STARTDONE)
+		cb = (unsigned long) callback_startdone_barrier;
+	else if (type == SM_BARRIER_STARTDONE_NEW)
+		cb = (unsigned long) callback_startdone_barrier_new;
+	else if (type == SM_BARRIER_RECOVERY)
+		cb = (unsigned long) callback_recovery_barrier;
+
+	rv = kcl_barrier_register(name, 0, count);
+	if (rv) {
+		log_print("barrier register error %d", rv);
+		return rv;
+	}
+
+	/* From here on a failure must also delete the registered barrier */
+	rv = kcl_barrier_setattr(name, BARRIER_SETATTR_AUTODELETE, TRUE);
+	if (rv) {
+		log_print("barrier setattr autodel error %d", rv);
+		goto undo;
+	}
+
+	rv = kcl_barrier_setattr(name, BARRIER_SETATTR_CALLBACK, cb);
+	if (rv) {
+		log_print("barrier setattr cb error %d", rv);
+		goto undo;
+	}
+
+	rv = kcl_barrier_setattr(name, BARRIER_SETATTR_ENABLED, TRUE);
+	if (rv) {
+		log_print("barrier setattr enabled error %d", rv);
+		goto undo;
+	}
+
+	return 0;
+
+ undo:
+	kcl_barrier_delete(name);
+	return rv;
+}
+
+void process_startdone_barrier_new(sm_group_t *sg, int status)
+{
+	sm_sevent_t *event = sg->sevent;
+
+	/* Act only if SEFL_ALLOW_BARRIER was set; the test also clears it */
+	if (test_and_clear_bit(SEFL_ALLOW_BARRIER, &event->se_flags)) {
+		event->se_barrier_status = status;
+		event->se_state = SEST_BARRIER_DONE;
+		set_bit(SEFL_CHECK, &event->se_flags);
+		wake_serviced(DO_JOINLEAVE);
+	} else {
+		log_debug(event->se_sg, "ignore barrier cb status %d", status);
+	}
+}
+
+void process_startdone_barrier(sm_group_t *sg, int status)
+{
+	sm_uevent_t *event = &sg->uevent;
+
+	/* Act only if UEFL_ALLOW_BARRIER was set; the test also clears it */
+	if (test_and_clear_bit(UEFL_ALLOW_BARRIER, &event->ue_flags)) {
+		event->ue_barrier_status = status;
+		event->ue_state = UEST_BARRIER_DONE;
+		set_bit(UEFL_CHECK, &event->ue_flags);
+		wake_serviced(DO_MEMBERSHIP);
+	} else {
+		log_debug(sg, "ignore barrier cb status %d", status);
+	}
+}
+
+void process_recovery_barrier(sm_group_t *sg, int status)
+{
+	int waiting;
+
+	if (status) {
+		log_error(sg, "process_recovery_barrier status=%d", status);
+		return;
+	}
+
+	/* The barrier only matters while recovery is waiting on it */
+	waiting = (sg->state == SGST_RECOVER &&
+		   sg->recover_state == RECOVER_BARRIERWAIT);
+	if (!waiting) {
+		log_error(sg, "process_recovery_barrier state %d recover %d",
+			  sg->state, sg->recover_state);
+		return;
+	}
+	sg->recover_state = sg->recover_stop ? RECOVER_BARRIERDONE
+					     : RECOVER_STOP;
+	wake_serviced(DO_RECOVERIES);
+}
+
+void process_barriers(void)
+{
+	sm_group_t *sg;
+	bc_entry_t *be;
+
+	/* Drain the queue filled by add_barrier_callback(), one entry a pass */
+	while (1) {
+		be = NULL;
+		spin_lock(&barriers_lock);
+		if (!list_empty(&barriers)) {
+			be = list_entry(barriers.next, bc_entry_t, list);
+			list_del(&be->list);
+		}
+		spin_unlock(&barriers_lock);
+
+		if (!be)
+			break;
+
+		sg = sm_global_id_to_sg(be->gid);
+		if (!sg) {
+			log_print("process_barriers: no sg %08x", be->gid);
+			/* the entry is already unlinked, so free it here */
+			kfree(be);
+			break;
+		}
+
+		switch (be->type) {
+		case SM_BARRIER_STARTDONE_NEW:
+			process_startdone_barrier_new(sg, be->status);
+			break;
+		case SM_BARRIER_STARTDONE:
+			process_startdone_barrier(sg, be->status);
+			break;
+		case SM_BARRIER_RECOVERY:
+			process_recovery_barrier(sg, be->status);
+			break;
+		}
+
+		kfree(be);
+		schedule();
+	}
+}
--- linux-2.6.9.orig/cluster/cman/sm_barrier.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_barrier.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,29 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SM_BARRIER_DOT_H__
+#define __SM_BARRIER_DOT_H__
+
+#define SM_BARRIER_STARTDONE		(0)	/* handled by process_startdone_barrier() */
+#define SM_BARRIER_STARTDONE_NEW	(1)	/* handled by process_startdone_barrier_new() */
+#define SM_BARRIER_RECOVERY		(2)	/* handled by process_recovery_barrier() */
+#define SM_BARRIER_RESET		(3)	/* sm_barrier() selects no callback for this type */
+
+void init_barriers(void);
+void process_barriers(void);
+int sm_barrier(char *name, int count, int type);
+void process_startdone_barrier(sm_group_t *sg, int status);
+void process_startdone_barrier_new(sm_group_t *sg, int status);
+void process_recovery_barrier(sm_group_t *sg, int status);
+
+#endif
--- linux-2.6.9.orig/cluster/cman/sm_control.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_control.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,156 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "sm.h"
+#include "config.h"
+
+struct socket *		sm_socket;	/* created and bound in sm_start() */
+uint32_t *		sm_new_nodeids;	/* cman_config.max_nodes entries */
+uint32_t		sm_our_nodeid;
+int			sm_quorum, sm_quorum_next; /* _next set by sm_member_update() */
+struct list_head	sm_members;	/* sm_node_t entries, freed in sm_stop() */
+int			sm_member_count;
+
+
+/*
+ * Context: cnxman.  Called by cnxman when it has a new member list; stores
+ * quorate in sm_quorum_next and flags DO_START_RECOVERY for the daemon.
+ */
+
+void sm_member_update(int quorate)
+{
+	sm_quorum_next = quorate;
+	wake_serviced(DO_START_RECOVERY);
+}
+
+/*
+ * Context: cnxman.  Called when module is loaded, and again from sm_stop():
+ * clears the SM globals, then runs the init routine of each SM subsystem.
+ */
+
+void sm_init(void)
+{
+	sm_socket = NULL;
+	sm_new_nodeids = NULL;
+	sm_quorum = 0;
+	sm_quorum_next = 0;
+	sm_our_nodeid = 0;
+	INIT_LIST_HEAD(&sm_members);
+	sm_member_count = 0;
+
+	init_services();
+	init_messages();
+	init_barriers();
+	init_serviced();
+	init_recovery();
+	init_joinleave();
+	init_sm_misc();
+}
+
+/*
+ * Context: cnxman.  Called at beginning of cluster join procedure.  If any
+ * step fails, whatever was set up is undone and sm_socket is left NULL.
+ */
+
+void sm_start(void)
+{
+	struct sockaddr_cl saddr;
+	struct socket *sock;
+	int result;
+
+	/* Create a communication channel among service managers */
+	result = sock_create_kern(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT, &sock);
+	if (result < 0) {
+		log_print("can't create socket %d", result);
+		return;
+	}
+
+	sm_socket = sock;
+	saddr.scl_family = AF_CLUSTER;
+	saddr.scl_port = CLUSTER_PORT_SERVICES;
+
+	result = sock->ops->bind(sock, (struct sockaddr *) &saddr,
+				 sizeof(saddr));
+	if (result < 0) {
+		log_print("can't bind socket %d", result);
+		goto fail_release;
+	}
+
+	result = kcl_register_read_callback(sm_socket, sm_cluster_message);
+	if (result < 0) {
+		log_print("can't register read callback %d", result);
+		goto fail_release;
+	}
+
+	sm_new_nodeids = kmalloc(cman_config.max_nodes * sizeof(uint32_t),
+				 GFP_KERNEL);
+	if (!sm_new_nodeids) {
+		log_print("can't allocate nodeid array");
+		goto fail_release;
+	}
+
+	/* cnxman should call sm_member_update() once we've joined - then we
+	 * can get our first list of members and our own nodeid */
+	if (start_serviced() == 0)
+		return;
+
+	kfree(sm_new_nodeids);	/* daemon did not start: undo the allocation */
+	sm_new_nodeids = NULL;
+
+      fail_release:
+	sock_release(sm_socket);
+	sm_socket = NULL;
+}
+
+/*
+ * Context: cnxman
+ * Called before cnxman leaves the cluster.  Returns -EBUSY (and leaves
+ * everything running) while any SG is still joined, unless force is set,
+ * in which case SM is shut down regardless.  Returns 0 after a shutdown.
+ */
+
+int sm_stop(int force)
+{
+	sm_group_t *sg;
+	sm_node_t *node;
+	int level, busy = FALSE;
+
+	/* report the first SG still present on every level */
+	for (level = 0; level < SG_LEVELS; level++) {
+		if (list_empty(&sm_sg[level]))
+			continue;
+		sg = list_entry(sm_sg[level].next, sm_group_t, list);
+		log_error(sg, "sm_stop: SG still joined");
+		busy = TRUE;
+	}
+
+	if (busy && !force)
+		return -EBUSY;
+
+	stop_serviced();
+
+	if (sm_socket)
+		sock_release(sm_socket);
+
+	/* drain the cluster member list */
+	while (!list_empty(&sm_members)) {
+		node = list_entry(sm_members.next, sm_node_t, list);
+		list_del(&node->list);
+		sm_member_count--;
+		kfree(node);
+	}
+
+	kfree(sm_new_nodeids);
+	sm_init();
+	return 0;
+}
--- linux-2.6.9.orig/cluster/cman/sm_control.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_control.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,22 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SM_CONTROL_DOT_H__
+#define __SM_CONTROL_DOT_H__
+
+void sm_init(void);			/* reset all SM state */
+void sm_start(void);			/* open the SM socket, start the daemon */
+int sm_stop(int force);			/* 0, or -EBUSY if SGs remain and !force */
+void sm_member_update(int quorate);	/* record quorum, flag DO_START_RECOVERY */
+
+#endif
--- linux-2.6.9.orig/cluster/cman/sm_daemon.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_daemon.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,114 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "sm.h"
+
+static unsigned long		daemon_flags;	/* pending DO_* work bits */
+static struct task_struct *	daemon_task;	/* serviced thread, NULL if none */
+extern int			sm_quorum;
+
+void init_serviced(void)	/* reset daemon state; starts no thread */
+{
+	daemon_task = NULL;
+	daemon_flags = 0;
+}
+
+/* Record do_flag as pending work and wake the daemon if one exists. */
+void wake_serviced(int do_flag)
+{
+	set_bit(do_flag, &daemon_flags);
+	if (daemon_task)
+		wake_up_process(daemon_task);
+}
+
+/* Report whether serviced() has a pending flag it may act on right now. */
+static inline int got_work(void)
+{
+	int pending;
+
+	/* these four are handled regardless of quorum */
+	if (test_bit(DO_START_RECOVERY, &daemon_flags) ||
+	    test_bit(DO_MESSAGES, &daemon_flags) ||
+	    test_bit(DO_BARRIERS, &daemon_flags) ||
+	    test_bit(DO_CALLBACKS, &daemon_flags))
+		return 1;
+
+	/* everything else is only serviced while quorate */
+	if (!sm_quorum)
+		return 0;
+
+	pending = test_bit(DO_RECOVERIES, &daemon_flags);
+	if (pending || !no_recoveries())
+		return pending;
+
+	return test_bit(DO_JOINLEAVE, &daemon_flags) ||
+	       test_bit(DO_MEMBERSHIP, &daemon_flags);
+}
+
+/* Daemon thread: run the handler for each pending DO_* flag, then sleep. */
+static int serviced(void *arg)
+{
+	while (!kthread_should_stop()) {
+		if (test_and_clear_bit(DO_START_RECOVERY, &daemon_flags))
+			process_nodechange();
+
+		if (test_and_clear_bit(DO_MESSAGES, &daemon_flags))
+			process_messages();
+
+		if (test_and_clear_bit(DO_BARRIERS, &daemon_flags))
+			process_barriers();
+
+		if (test_and_clear_bit(DO_CALLBACKS, &daemon_flags))
+			process_callbacks();
+
+		if (sm_quorum) {
+			if (test_and_clear_bit(DO_RECOVERIES, &daemon_flags))
+				process_recoveries();
+			if (no_recoveries()) {
+				if (test_and_clear_bit(DO_JOINLEAVE,
+						       &daemon_flags))
+					process_joinleave();
+				if (test_and_clear_bit(DO_MEMBERSHIP,
+						       &daemon_flags))
+					process_membership();
+			}
+		}
+
+		/* test the stop flag too, or kthread_stop()'s wakeup is lost */
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!kthread_should_stop() && !got_work())
+			schedule();
+		set_current_state(TASK_RUNNING);
+	}
+	daemon_task = NULL;
+	return 0;
+}
+
+/* Start the cman_serviced thread; returns 0 or kthread_run()'s -errno. */
+int start_serviced(void)
+{
+	struct task_struct *p = kthread_run(serviced, NULL, "cman_serviced");
+
+	if (IS_ERR(p)) {
+		printk(KERN_ERR "can't start cman_serviced daemon\n");
+		return PTR_ERR(p);	/* not IS_ERR(p), which is just 1 */
+	}
+	daemon_task = p;
+	return 0;
+}
+
+void stop_serviced(void)
+{
+	if (daemon_task)	/* NULL when the daemon was never started */
+		kthread_stop(daemon_task);
+}
--- linux-2.6.9.orig/cluster/cman/sm_daemon.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_daemon.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,32 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SM_DAEMON_DOT_H__
+#define __SM_DAEMON_DOT_H__
+
+#define DO_RUN                  (0)
+#define DO_START_RECOVERY       (1)	/* serviced: process_nodechange() */
+#define DO_MESSAGES             (2)	/* serviced: process_messages() */
+#define DO_BARRIERS             (3)	/* serviced: process_barriers() */
+#define DO_CALLBACKS            (4)	/* serviced: process_callbacks() */
+#define DO_JOINLEAVE            (5)	/* serviced: process_joinleave() */
+#define DO_RECOVERIES           (6)	/* serviced: process_recoveries() */
+#define DO_MEMBERSHIP           (7)	/* serviced: process_membership() */
+#define DO_RESET		(8)
+
+void init_serviced(void);
+void wake_serviced(int do_flag);
+void stop_serviced(void);
+int start_serviced(void);
+
+#endif
--- linux-2.6.9.orig/cluster/cman/sm_internal.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_internal.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,232 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SM_INTERNAL_DOT_H__
+#define __SM_INTERNAL_DOT_H__
+
+/* 
+ * Any header files needed by this file should be included before it in sm.h.
+ * This file should only be included by sm.h.
+ */
+
+struct sm_group;
+struct sm_sevent;
+struct sm_uevent;
+struct sm_node;
+struct sm_msg;
+
+typedef struct sm_group sm_group_t;
+typedef struct sm_sevent sm_sevent_t;
+typedef struct sm_uevent sm_uevent_t;
+typedef struct sm_node sm_node_t;
+typedef struct sm_msg sm_msg_t;
+
+
+/* 
+ * Number of seconds to wait before trying again to join or leave an SG
+ */
+#define RETRY_DELAY		(2)
+
+
+/* 
+ * Service Event - what a node uses to join or leave an sg
+ */
+
+/* SE Flags */
+#define SEFL_CHECK              (0)
+#define SEFL_ALLOW_JOIN         (1)
+#define SEFL_ALLOW_JSTOP        (2)
+#define SEFL_ALLOW_LEAVE        (3)
+#define SEFL_ALLOW_LSTOP        (4)
+#define SEFL_ALLOW_STARTDONE    (5)
+#define SEFL_ALLOW_BARRIER      (6)
+#define SEFL_DELAY              (7)
+#define SEFL_DELAY_RECOVERY     (8)
+#define SEFL_LEAVE              (9)
+#define SEFL_CANCEL             (10)
+
+/* SE States */
+#define SEST_JOIN_BEGIN         (1)
+#define SEST_JOIN_ACKWAIT       (2)
+#define SEST_JOIN_ACKED         (3)
+#define SEST_JSTOP_ACKWAIT      (4)
+#define SEST_JSTOP_ACKED        (5)
+#define SEST_JSTART_SERVICEWAIT (6)
+#define SEST_JSTART_SERVICEDONE (7)
+#define SEST_BARRIER_WAIT       (8)
+#define SEST_BARRIER_DONE       (9)
+#define SEST_LEAVE_BEGIN        (10)
+#define SEST_LEAVE_ACKWAIT      (11)
+#define SEST_LEAVE_ACKED        (12)
+#define SEST_LSTOP_ACKWAIT      (13)
+#define SEST_LSTOP_ACKED        (14)
+#define SEST_LSTART_WAITREMOTE  (15)
+#define SEST_LSTART_REMOTEDONE  (16)
+
+struct sm_sevent {
+	struct list_head 	se_list;	/* new_event or joinleave_events */
+	unsigned int		se_id;		/* key used by find_sevent() */
+	sm_group_t *		se_sg;		/* SG being joined or left */
+	unsigned long 		se_flags;	/* SEFL_* bit numbers */
+	unsigned int 		se_state;	/* SEST_* */
+
+	int 			se_node_count;	/* cluster members counted */
+	int 			se_memb_count;	/* copy of sg->memb_count */
+	int 			se_reply_count;
+
+	uint32_t *		se_node_ids;	/* nodeid per reply slot */
+	char *			se_node_status;	/* STATUS_* per reply slot */
+	int 			se_len_ids;	/* byte length of node_ids */
+	int 			se_len_status;	/* byte length of node_status */
+
+	int 			se_barrier_status;
+	struct timer_list 	se_restart_timer; /* runs sev_restart() */
+};
+
+/* 
+ * Update Event - what an sg member uses to respond to an sevent 
+ */
+
+/* UE Flags */
+#define UEFL_ALLOW_STARTDONE    (0)
+#define UEFL_ALLOW_BARRIER      (1)
+#define UEFL_CANCEL             (2)
+#define UEFL_LEAVE              (3)
+#define UEFL_CHECK              (4)
+
+/* UE States */
+#define UEST_JSTOP              (1)
+#define UEST_JSTART_WAITCMD     (2)
+#define UEST_JSTART             (3)
+#define UEST_JSTART_SERVICEWAIT (4)
+#define UEST_JSTART_SERVICEDONE (5)
+#define UEST_BARRIER_WAIT       (6)
+#define UEST_BARRIER_DONE       (7)
+#define UEST_LSTOP              (8)
+#define UEST_LSTART_WAITCMD     (9)
+#define UEST_LSTART             (10)
+#define UEST_LSTART_SERVICEWAIT (11)
+#define UEST_LSTART_SERVICEDONE (12)
+
+struct sm_uevent {
+	unsigned int 		ue_state;	/* presumably UEST_* - confirm */
+	unsigned long 		ue_flags;	/* presumably UEFL_* bits - confirm */
+	uint32_t 		ue_id;
+	uint32_t 		ue_nodeid;
+	int 			ue_num_nodes;
+	int 			ue_barrier_status;
+	uint32_t 		ue_remote_seid;
+};
+
+/* 
+ * Service Group
+ */
+
+#define RECOVER_NONE		(0)
+#define RECOVER_STOP		(1)
+#define RECOVER_START		(2)
+#define RECOVER_STARTDONE	(3)
+#define RECOVER_BARRIERWAIT	(4)
+#define RECOVER_BARRIERDONE	(5)
+
+/* SG Flags */
+#define SGFL_SEVENT             (1)
+#define SGFL_UEVENT             (2)
+#define SGFL_NEED_RECOVERY      (3)
+
+/* SG States */
+#define SGST_NONE		(0)
+#define SGST_JOIN		(1)
+#define SGST_RUN		(2)
+#define SGST_RECOVER		(3)
+#define SGST_UEVENT		(4)
+
+struct sm_group {
+	struct list_head 	list;		/* list of sg's */
+	uint16_t 		level;
+	uint32_t 		local_id;
+	uint32_t 		global_id;	/* from sm_new_global_id() if first */
+	unsigned long 		flags;		/* SGFL_* bit numbers */
+	int 			state;		/* SGST_* */
+	int 			refcount;	/* references from reg/unreg */
+	void *			service_data;	/* data from the service */
+	struct kcl_service_ops *ops;		/* ops from the service */
+	struct completion 	event_comp;	/* completed by sevent_done() */
+
+	struct list_head 	memb;		/* Membership List for RC */
+	int 			memb_count;	/* number of nodes in memb */
+	struct list_head 	joining;	/* nodes joining the sg */
+	sm_sevent_t *		sevent;		/* cleared by do_finish_new() */
+	sm_uevent_t 		uevent;
+
+	int			recover_state;	/* presumably RECOVER_* - confirm */
+	int			recover_stop;
+	struct list_head 	recover_list;	/* recovery event list */
+	void *			recover_data;
+	char 			recover_barrier[MAX_BARRIER_NAME_LEN];
+
+	int 			namelen;	/* bytes of name sent in join req */
+	char 			name[1];	/* must be last field */
+};
+
+/* 
+ * Service Message
+ */
+
+/* SMSG Type */
+#define SMSG_JOIN_REQ           (1)
+#define SMSG_JOIN_REP           (2)
+#define SMSG_JSTOP_REQ          (3)
+#define SMSG_JSTOP_REP          (4)
+#define SMSG_JSTART_CMD         (5)
+#define SMSG_LEAVE_REQ          (6)
+#define SMSG_LEAVE_REP          (7)
+#define SMSG_LSTOP_REQ          (8)
+#define SMSG_LSTOP_REP          (9)
+#define SMSG_LSTART_CMD         (10)
+#define SMSG_LSTART_DONE        (11)
+#define SMSG_RECOVER		(12)
+
+/* SMSG Status */
+#define STATUS_POS              (1)
+#define STATUS_NEG              (2)
+#define STATUS_WAIT             (3)
+
+struct sm_msg {
+	uint8_t 		ms_type;	/* presumably SMSG_* - confirm */
+	uint8_t 		ms_status;	/* presumably STATUS_* - confirm */
+	uint16_t		ms_pad;
+	uint32_t 		ms_sevent_id;
+	uint32_t 		ms_global_sgid;
+	uint32_t 		ms_global_lastid;
+	uint16_t 		ms_sglevel;
+	uint16_t 		ms_length;
+	/* buf of ms_length bytes follows; senders copy it in at
+	 * msg + sizeof(sm_msg_t) */
+};
+
+/* 
+ * Node structure
+ */
+
+#define SNFL_NEED_RECOVERY  	(0)
+#define SNFL_CLUSTER_MEMBER	(1)
+#define SNFL_LEAVING        	(2)
+
+struct sm_node {
+	struct list_head 	list;		/* on sm_members or an sg->memb */
+	uint32_t 		id;		/* node id from cnxman */
+	unsigned long 		flags;		/* SNFL_* bit numbers */
+	int 			incarnation;	/* node incarnation number */
+};
+
+#endif				/* __SM_INTERNAL_DOT_H__ */
--- linux-2.6.9.orig/cluster/cman/sm_joinleave.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_joinleave.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,1291 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "sm.h"
+
+/*
+ * Routines used by nodes that are joining or leaving a SG.  These "sevent"
+ * routines initiate membership changes to a SG.  Existing SG members respond
+ * using the "uevent" membership update routines.
+ */
+
+extern uint32_t 		sm_our_nodeid;
+extern struct list_head 	sm_members;
+static struct list_head 	new_event;	/* filled by new_joinleave() */
+static spinlock_t 		new_event_lock;	/* held around new_event adds */
+static struct list_head		joinleave_events; /* searched by find_sevent() */
+
+void init_joinleave(void)	/* set up both sevent lists and new_event_lock */
+{
+	spin_lock_init(&new_event_lock);
+	INIT_LIST_HEAD(&new_event);
+	INIT_LIST_HEAD(&joinleave_events);
+}
+
+void new_joinleave(sm_sevent_t *sev)	/* queue sev, then kick the daemon */
+{
+	spin_lock(&new_event_lock);
+	list_add_tail(&sev->se_list, &new_event);
+	spin_unlock(&new_event_lock);
+	wake_serviced(DO_JOINLEAVE);	/* sev is already on new_event here */
+}
+
+/* Return the sevent on joinleave_events whose se_id equals id, else NULL. */
+sm_sevent_t *find_sevent(unsigned int id)
+{
+	sm_sevent_t *cur;
+
+	list_for_each_entry(cur, &joinleave_events, se_list)
+		if (cur->se_id == id)
+			return cur;
+	return NULL;
+}
+
+/* Drop the reply arrays of an sevent and zero its node/memb/reply counts. */
+static void release_sevent(sm_sevent_t *sev)
+{
+	sev->se_reply_count = 0;
+	sev->se_memb_count = 0;
+	sev->se_node_count = 0;
+	/* an array is only freed when its recorded length is nonzero */
+	if (sev->se_len_status) {
+		kfree(sev->se_node_status);
+		sev->se_node_status = NULL;
+	}
+	if (sev->se_len_ids) {
+		kfree(sev->se_node_ids);
+		sev->se_node_ids = NULL;
+	}
+}
+
+static int init_sevent(sm_sevent_t *sev)	/* returns 0 or -ENOMEM */
+{
+	sm_node_t *node;
+	int len1, len2, count, cluster_members = 0;
+
+	/* clear state from any previous attempt */
+	release_sevent(sev);
+
+	list_for_each_entry(node, &sm_members, list) {
+		if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
+			cluster_members++;
+	}
+
+	sev->se_node_count = cluster_members;
+	sev->se_memb_count = sev->se_sg->memb_count;
+
+	/*
+	 * When joining, we need a node array the size of the entire cluster
+	 * member list because we get responses from all nodes.  When leaving,
+	 * we only get responses from SG members, so the node array need only
+	 * be that large.
+	 */
+
+	if (sev->se_state < SEST_LEAVE_BEGIN)	/* i.e. any join state */
+		count = sev->se_node_count;
+	else
+		count = sev->se_memb_count;
+
+	len1 = count * sizeof(uint32_t);
+	sev->se_len_ids = len1;		/* a byte length, set before kmalloc */
+
+	sev->se_node_ids = (uint32_t *) kmalloc(len1, GFP_KERNEL);
+	if (!sev->se_node_ids)
+		goto fail;
+
+	len2 = count * sizeof (char);
+	sev->se_len_status = len2;	/* likewise a byte length */
+
+	sev->se_node_status = (char *) kmalloc(len2, GFP_KERNEL);
+	if (!sev->se_node_status)
+		goto fail_free;
+
+	memset(sev->se_node_status, 0, len2);
+	memset(sev->se_node_ids, 0, len1);
+
+	return 0;
+
+      fail_free:
+	kfree(sev->se_node_ids);
+	sev->se_node_ids = NULL;
+	sev->se_len_ids = 0;
+
+      fail:
+	return -ENOMEM;
+}
+
+/* Context: timer.  Clears SEFL_DELAY, sets SEFL_CHECK and wakes the daemon. */
+
+static void sev_restart(unsigned long data)
+{
+	sm_sevent_t *sev = (sm_sevent_t *) data;
+
+	clear_bit(SEFL_DELAY, &sev->se_flags);
+	set_bit(SEFL_CHECK, &sev->se_flags);
+	wake_serviced(DO_JOINLEAVE);
+}
+
+static void schedule_sev_restart(sm_sevent_t *sev)	/* arm the retry timer */
+{
+	init_timer(&sev->se_restart_timer);
+	sev->se_restart_timer.function = sev_restart;
+	sev->se_restart_timer.data = (long) sev;	/* handed to sev_restart() */
+	mod_timer(&sev->se_restart_timer, jiffies + (RETRY_DELAY * HZ));
+}
+
+void free_sg_memb(sm_group_t *sg)	/* empty sg->memb, freeing each node */
+{
+	sm_node_t *victim;
+
+	sg->memb_count = 0;
+	while (!list_empty(&sg->memb)) {
+		victim = list_entry(sg->memb.next, sm_node_t, list);
+		list_del(&victim->list);
+		kfree(victim);
+	}
+}
+
+/*
+ * 1.  First step in joining a SG - send a message to all nodes in the cluster
+ * asking to join the named SG.  If any nodes are members they will reply with
+ * a POS, or a WAIT (wait means try again, only one node can join at a time).
+ * If no one knows about this SG, they all send NEG replies which means we form
+ * the SG with just ourself as a member.
+ */
+
+static int send_join_notice(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+	sm_node_t *node;
+	char *msg;
+	int i = 0, error, namelen, len = 0;
+
+	/*
+	 * Create node array from member list in which to collect responses.
+	 */
+
+	error = init_sevent(sev);
+	if (error)
+		goto out;
+
+	list_for_each_entry(node, &sm_members, list) {
+		if (test_bit(SNFL_CLUSTER_MEMBER, &node->flags))
+			sev->se_node_ids[i++] = node->id;
+	}
+
+	/*
+	 * Create and send a join request message carrying the SG name.
+	 * NOTE(review): msg is used unchecked - confirm create_smsg() cannot
+	 * fail.  Other nodes then run process_join_request and reply to us;
+	 * we collect the responses in process_reply and check them in
+	 * check_join_notice.
+	 */
+
+	namelen = sg->namelen;
+	msg = create_smsg(sg, SMSG_JOIN_REQ, namelen, &len, sev);
+	memcpy(msg + sizeof(sm_msg_t), sg->name, namelen);
+
+	error = send_broadcast_message_sev(msg, len, sev);
+
+      out:
+	return error;
+}
+
+/*
+ * 2.  Second step in joining a SG - after we collect all replies to our join
+ * request, we look at them.  Returns 0 when the join may proceed and -1 when
+ * anyone told us to wait or failed to give a valid reply.
+ */
+
+static int check_join_notice(sm_sevent_t *sev)
+{
+	int pos = 0, wait = 0, neg = 0, restart = 0, i;
+
+	for (i = 0; i < sev->se_node_count; i++) {
+		char status = sev->se_node_status[i];
+
+		if (status == STATUS_POS) {
+			/* this node is in the SG and will be in new proposed
+			 * memb list */
+			pos++;
+		} else if (status == STATUS_WAIT) {
+			/* this node is in the SG but something else is
+			 * happening with it at the moment. */
+			wait++;
+		} else if (status == STATUS_NEG) {
+			/* this node has no record of the SG we're interested
+			 * in; when the node is ourself the slot is turned
+			 * into a POS */
+			neg++;
+			if (sev->se_node_ids[i] == sm_our_nodeid)
+				sev->se_node_status[i] = STATUS_POS;
+		} else {
+			/* we didn't get a valid response from this node,
+			 * restart the entire sev. */
+			restart++;
+		}
+	}
+
+	/*
+	 * A WAIT, an invalid reply, or no replies at all: report failure
+	 * to the caller.
+	 */
+	if (wait || restart || (!pos && !neg))
+		return -1;
+
+	/*
+	 * With at least one POS, all current members of this sg pos'ed our
+	 * entry.  With only NEGs we're the first in the cluster to join this
+	 * sg, so it needs a new global id.
+	 */
+	if (!pos)
+		sev->se_sg->global_id = sm_new_global_id(sev->se_sg->level);
+	return 0;
+}
+
+/*
+ * 3.  Third step in joining the SG - tell the nodes that are already members
+ * to "stop" the service.  We stop them so that everyone can restart with the
+ * new member (us!) added.  Returns 0, or a negative error with sg->memb freed.
+ */
+
+static int send_join_stop(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+	sm_node_t *node;
+	char *msg;
+	uint32_t be_count;
+	int i, len = 0, error = -ENOMEM;	/* reported if sm_new_node() fails */
+
+	/*
+	 * Form the SG memb list with us in it.
+	 */
+
+	for (i = 0; i < sev->se_node_count; i++) {
+		if (sev->se_node_status[i] != STATUS_POS)
+			continue;
+
+		node = sm_new_node(sev->se_node_ids[i]);
+		if (!node)
+			goto fail;
+
+		list_add_tail(&node->list, &sg->memb);
+		sg->memb_count++;
+	}
+
+	/*
+	 * Re-init the node vector in which to collect responses again.
+	 */
+
+	sev->se_memb_count = sg->memb_count;
+
+	memset(sev->se_node_status, 0, sev->se_len_status);
+	memset(sev->se_node_ids, 0, sev->se_len_ids);
+	i = 0;
+
+	list_for_each_entry(node, &sg->memb, list)
+		sev->se_node_ids[i++] = node->id;
+
+	/*
+	 * Create and send a stop message.
+	 *
+	 * Other nodes then run process_stop_request and process_join_stop and
+	 * reply to us.  They stop the sg we're trying to join if they agree.
+	 * We collect responses in process_reply and check them in
+	 * check_join_stop.
+	 */
+
+	msg = create_smsg(sg, SMSG_JSTOP_REQ, sizeof(uint32_t), &len, sev);
+	be_count = cpu_to_be32(sg->memb_count);	/* payload: big-endian count */
+	memcpy(msg + sizeof(sm_msg_t), &be_count, sizeof(uint32_t));
+
+	error = send_members_message_sev(sg, msg, len, sev);
+	if (error < 0)
+		goto fail;
+
+	return 0;
+
+      fail:
+	free_sg_memb(sg);
+	return error;
+}
+
+/*
+ * 4.  Fourth step in joining the SG - after we collect replies to our stop
+ * request, we look at them.  Everyone sending POS agrees with us joining and
+ * has stopped their SG.  If some nodes sent NEG, something is wrong and we
+ * don't have a good way to address that yet since some nodes may have sent
+ * POS.
+ *
+ * FIXME: even nodes replying with NEG should stop their SG so we can send an
+ * abort and have everyone at the same place to start from again.
+ */
+
+static int check_join_stop(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+	int i, pos = 0, neg = 0;
+
+	for (i = 0; i < sev->se_memb_count; i++) {
+		switch (sev->se_node_status[i]) {
+		case STATUS_POS:
+			pos++;
+			break;
+
+		case STATUS_NEG:
+			log_error(sg, "check_join_stop: neg from nodeid %u "
+				  "(%d, %d, %u)", sev->se_node_ids[i],
+				  pos, neg, sev->se_memb_count);
+			neg++;	/* only logged; the result depends on pos */
+			break;
+
+		default:
+			log_error(sg, "check_join_stop: unknown status=%u "
+				  "nodeid=%u", sev->se_node_status[i],
+				  sev->se_node_ids[i]);
+			neg++;
+			break;
+		}
+	}
+
+	if (pos == sg->memb_count)	/* every SG member replied POS */
+		return 0;
+
+	free_sg_memb(sg);	/* failed: throw away the proposed memb list */
+	return -1;
+}
+
+/*
+ * 5.  Fifth step in joining the SG - everyone has stopped their service and we
+ * all now start the service with us, the new member, added to the SG member
+ * list.  We send start to our own service here and send a message to the other
+ * members that they should also start their service.
+ */
+
+static int send_join_start(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+	sm_node_t *node;
+	uint32_t *memb;
+	char *msg;
+	int error, count = 0, len = 0;
+
+	/*
+	 * Create a start message (no payload) and send it to the SG members.
+	 */
+
+	msg = create_smsg(sg, SMSG_JSTART_CMD, 0, &len, sev);
+
+	error = send_members_message(sg, msg, len);
+	if (error < 0)
+		goto fail;
+
+	/*
+	 * Start the service ourself.  The chunk of memory with the member ids
+	 * must be freed by the service when it is done with it.
+	 */
+
+	SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(uint32_t), GFP_KERNEL),
+		 memb);
+
+	list_for_each_entry(node, &sg->memb, list)
+		memb[count++] = node->id;
+
+	set_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
+
+	sg->ops->start(sg->service_data, memb, count, sev->se_id,
+		       SERVICE_NODE_JOIN);
+	return 0;
+
+      fail:
+	free_sg_memb(sg);	/* send failed: drop the proposed memb list */
+	return error;
+}
+
+/*
+ * 6.  Sixth step in joining the SG - once the service has completed its start,
+ * it does a kcl_start_done() to signal us that it's done.  That gets us here
+ * and we do a barrier with all other members which join the barrier when their
+ * service is done starting.
+ */
+
+static int startdone_barrier_new(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+	char bname[MAX_BARRIER_NAME_LEN];
+	int error;
+
+	memset(bname, 0, MAX_BARRIER_NAME_LEN);
+	sev->se_barrier_status = -1;	/* nonzero until a barrier result is in */
+
+	set_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
+
+	/* If we're the only member, skip the barrier */
+	if (sg->memb_count == 1) {
+		process_startdone_barrier_new(sg, 0);
+		return 0;
+	}
+
+	snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.%u.%u",
+		 sg->global_id, sm_our_nodeid, sev->se_id, sg->memb_count);
+
+	error = sm_barrier(bname, sg->memb_count, SM_BARRIER_STARTDONE_NEW);
+	if (error)
+		goto fail;
+
+	return 0;
+
+      fail:
+	clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
+	sg->ops->stop(sg->service_data);	/* undo the start from step 5 */
+	free_sg_memb(sg);
+	return error;
+}
+
+/*
+ * 7.  Seventh step in joining the SG - check that the barrier we joined with
+ * all other members returned with a successful status.
+ */
+
+static int check_startdone_barrier_new(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+	int error = sev->se_barrier_status;	/* 0 means the barrier succeeded */
+
+	if (error) {
+		sg->ops->stop(sg->service_data);
+		free_sg_memb(sg);
+	}
+	return error;
+}
+
+/*
+ * 8.  Eighth step in joining the SG - send the service a "finish" indicating
+ * that all members have successfully started the service.
+ */
+
+static void do_finish_new(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+
+	sg->state = SGST_RUN;
+	sg->sevent = NULL;	/* the SG no longer has an sevent in progress */
+	clear_bit(SGFL_SEVENT, &sg->flags);
+
+	sg->ops->finish(sg->service_data, sev->se_id);
+}
+
+/*
+ * 9.  Ninth step in joining the SG - it's done so get rid of the sevent stuff
+ * and tell the process which initiated the join that it's done.
+ */
+
+static void sevent_done(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;	/* fetched now: sev is freed below */
+
+	list_del(&sev->se_list);
+	release_sevent(sev);
+	kfree(sev);
+	complete(&sg->event_comp);
+}
+
+/*
+ * Move through the steps of a join.  Summary:
+ *
+ * 1. Send a join notice to all cluster members.
+ * 2. Collect and check replies to the join notice.
+ * 3. Send a stop message to all SG members.
+ * 4. Collect and check replies to the stop message.
+ * 5. Send a start message to all SG members and start service ourself.
+ * 6. Use barrier to wait for all nodes to complete the start.
+ * 7. Check that all SG members joined the barrier.
+ * 8. Send finish to the service indicating that all nodes started it.
+ * 9. Clean up sevent and signal completion to the process that started the join
+ */
+
+static void process_join_sevent(sm_sevent_t *sev)
+{
+	int error = 0;
+
+	/*
+	 * We may cancel the current join attempt if another node is also
+	 * attempting to join or leave. (Only a single node can join or leave
+	 * at once.)  If cancelled, our join attempt will be restarted later.
+	 */
+
+	if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
+		error = 1;
+		goto cancel;
+	}
+
+	log_debug(sev->se_sg, "sevent state %u", sev->se_state);
+
+	switch (sev->se_state) {
+
+		/*
+		 * An sevent is created in kcl_join_service with a state of
+		 * JOIN_BEGIN.
+		 */
+
+	case SEST_JOIN_BEGIN:
+		sev->se_state = SEST_JOIN_ACKWAIT;
+		error = send_join_notice(sev);
+		break;
+
+		/*
+		 * se_state is changed from JOIN_ACKWAIT to JOIN_ACKED in
+		 * process_reply  (when all the replies have been received)
+		 */
+
+	case SEST_JOIN_ACKED:
+		error = check_join_notice(sev);
+		if (error)
+			break;
+
+		sev->se_state = SEST_JSTOP_ACKWAIT;
+		error = send_join_stop(sev);
+		break;
+
+		/*
+		 * se_state is changed from JSTOP_ACKWAIT to JSTOP_ACKED in
+		 * process_reply  (when all the replies have been received)
+		 */
+
+	case SEST_JSTOP_ACKED:
+		error = check_join_stop(sev);
+		if (error)
+			break;
+
+		sev->se_state = SEST_JSTART_SERVICEWAIT;
+		error = send_join_start(sev);
+		break;
+
+		/*
+		 * se_state is changed from JSTART_SERVICEWAIT to
+		 * JSTART_SERVICEDONE in kcl_start_done
+		 */
+
+	case SEST_JSTART_SERVICEDONE:
+		sev->se_state = SEST_BARRIER_WAIT;
+		error = startdone_barrier_new(sev);
+		break;
+
+		/*
+		 * se_state is changed from BARRIER_WAIT to BARRIER_DONE in
+		 * process_startdone_barrier_new
+		 */
+
+	case SEST_BARRIER_DONE:
+		error = check_startdone_barrier_new(sev);
+		if (error)
+			break;
+
+		do_finish_new(sev);
+		sevent_done(sev);	/* frees sev; error is 0, so no use below */
+		break;
+
+	default:
+		log_error(sev->se_sg, "no join processing for state %u",
+			  sev->se_state);
+	}
+
+      cancel:
+	if (error) {
+		/* restart the sevent from the beginning */
+		log_debug(sev->se_sg, "process_join error %d %lx", error,
+			  sev->se_flags);
+		sev->se_state = SEST_JOIN_BEGIN;
+		sev->se_sg->global_id = 0;
+		set_bit(SEFL_DELAY, &sev->se_flags);
+		schedule_sev_restart(sev);
+	}
+}
+
+/*
+ * 1.  First step in leaving an SG - send a message to other SG members asking
+ * to leave the SG.  Nodes that don't have another active sevent or uevent for
+ * this SG will return POS.
+ */
+
+static int send_leave_notice(sm_sevent_t *sev)
+{
+	sm_group_t *sg = sev->se_sg;
+	sm_node_t *node;
+	char *msg;
+	int i = 0, error = -1, len = 0;
+
+	/*
+	 * Create a node array from member list in which to collect responses.
+	 */
+
+	error = init_sevent(sev);
+	if (error)
+		goto out;
+
+	list_for_each_entry(node, &sg->memb, list)
+		sev->se_node_ids[i++] = node->id;
+
+	/*
+	 * Create and send a leave request message (it has no payload).
+	 */
+
+	msg = create_smsg(sg, SMSG_LEAVE_REQ, 0, &len, sev);
+
+	error = send_members_message_sev(sg, msg, len, sev);
+
+      out:
+	return error;
+}
+
+/*
+ * 2.  Second step in leaving an SG - after we collect all replies to our leave
+ * request, we look at them.  If anyone replied with WAIT, we abort our attempt
+ * at leaving and try again in a bit.
+ */
+
+static int check_leave_notice(sm_sevent_t *sev)
+{
+	int approvals = 0, deferrals = 0, invalid = 0;
+	int idx;
+
+	/*
+	 * Tally the reply recorded for each node.  STATUS_NEG replies are
+	 * recognised but do not influence the outcome; any status other than
+	 * POS, WAIT or NEG means we didn't get a valid response from that
+	 * node, which forces a restart of the entire sev.
+	 */
+	for (idx = 0; idx < sev->se_memb_count; idx++) {
+		int status = sev->se_node_status[idx];
+
+		if (status == STATUS_POS)
+			approvals++;
+		else if (status == STATUS_WAIT)
+			deferrals++;
+		else if (status != STATUS_NEG)
+			invalid++;
+	}
+
+	/*
+	 * Success needs at least one POS reply and no WAIT or invalid
+	 * replies; anything else makes the caller retry the leave.
+	 */
+	if (approvals && !deferrals && !invalid)
+		return 0;
+
+	/* all other outcomes abort this leave attempt */
+	return -1;
+}
+
+/*
+ * 3.  Third step in leaving the SG - tell the member nodes to "stop" the SG.
+ * They must be stopped in order to restart without us as a member.
+ */
+
+static int send_leave_stop(sm_sevent_t *sev)
+{
+	sm_group_t *group = sev->se_sg;
+	char *stop_req;
+	int stop_len = 0;
+	int rc;
+
+	/*
+	 * Wipe the status vector so that it only holds the replies to the
+	 * stop request sent below.
+	 */
+
+	memset(sev->se_node_status, 0, sev->se_len_status);
+
+	/*
+	 * Build the stop request and send it to the SG members.  A negative
+	 * send result is handed back without stopping our own service.
+	 */
+
+	stop_req = create_smsg(group, SMSG_LSTOP_REQ, 0, &stop_len, sev);
+
+	rc = send_members_message_sev(group, stop_req, stop_len, sev);
+	if (rc < 0)
+		return rc;
+
+	/*
+	 * The request went out: we and all others stop the SG now.
+	 */
+	group->ops->stop(group->service_data);
+	return rc;
+}
+
+/*
+ * 4.  Fourth step in leaving the SG - check the replies to our stop request.
+ * Same problem with getting different replies as check_join_stop.
+ */
+
+static int check_leave_stop(sm_sevent_t *sev)
+{
+	sm_group_t *grp = sev->se_sg;
+	int acks = 0, fails = 0;
+	int idx;
+
+	/*
+	 * Count the positive stop replies.  Every other status is logged and
+	 * counted as a failure; the failure count only appears in the log.
+	 */
+	for (idx = 0; idx < sev->se_memb_count; idx++) {
+		if (sev->se_node_status[idx] == STATUS_POS) {
+			acks++;
+			continue;
+		}
+		if (sev->se_node_status[idx] == STATUS_NEG) {
+			log_error(grp, "check_leave_stop: fail from nodeid %u "
+				  "(%d, %d, %u)", sev->se_node_ids[idx],
+				  acks, fails, sev->se_memb_count);
+		} else {
+			log_error(grp, "check_leave_stop: status %u nodeid %u",
+				  sev->se_node_status[idx], sev->se_node_ids[idx]);
+		}
+		fails++;
+	}
+
+	/* the stop only succeeded if the POS count equals the sg's count */
+	if (acks == grp->memb_count)
+		return 0;
+	return -1;
+}
+
+/*
+ * 5.  Fifth step in leaving the SG - tell the other SG members to restart the
+ * service without us.  We, of course, don't start our own stopped service.  If
+ * we're the last SG member and leaving, we jump right to the next step.
+ */
+
+static int send_leave_start(sm_sevent_t *sev)
+{
+	sm_group_t *group = sev->se_sg;
+	char *cmd;
+	int cmd_len = 0;
+
+	if (group->memb_count != 1) {
+		cmd = create_smsg(group, SMSG_LSTART_CMD, 0, &cmd_len, sev);
+		return send_members_message(group, cmd, cmd_len);
+	}
+	/* we are the only member: skip ahead to the final leave step */
+	sev->se_state = SEST_LSTART_REMOTEDONE;
+	set_bit(SEFL_CHECK, &sev->se_flags);
+	wake_serviced(DO_JOINLEAVE);
+	return 0;
+}
+
+/*
+ * Move through the steps of a leave.  Summary:
+ *
+ * 1. Send a leave notice to all SG members.
+ * 2. Collect and check replies to the leave notice.
+ * 3. Send a stop message to all SG members and stop our own SG.
+ * 4. Collect and check replies to the stop message.
+ * 5. Send a start message to SG members.
+ * 6. Clean up sevent and signal completion to the process that
+ *    started the leave.
+ */
+
+static void process_leave_sevent(sm_sevent_t *sev)
+{
+	int error = 0;
+
+	/*
+	 * We may cancel the current leave attempt if another node is also
+	 * attempting to join or leave. (Only a single node can join or leave
+	 * at once.)  Our leave attempt will be restarted after being
+	 * cancelled.  The error values 1-3 below only tell the cancel
+	 * reasons apart in the debug log.
+	 */
+	if (test_and_clear_bit(SEFL_CANCEL, &sev->se_flags)) {
+		error = 1;	/* a cancel was flagged on this sevent */
+		goto cancel;
+	}
+
+	if (test_bit(SGFL_UEVENT, &sev->se_sg->flags)) {
+		error = 2;	/* the sg has SGFL_UEVENT set */
+		goto cancel;
+	}
+
+	if (!list_empty(&sev->se_sg->joining)) {
+		error = 3;	/* the sg's joining list is not empty */
+		goto cancel;
+	}
+
+	log_debug(sev->se_sg, "sevent state %u", sev->se_state);
+
+	switch (sev->se_state) {
+
+		/*
+		 * An sevent is created in kcl_leave_service with a state of
+		 * LEAVE_BEGIN.
+		 */
+
+	case SEST_LEAVE_BEGIN:
+		sev->se_state = SEST_LEAVE_ACKWAIT;
+		error = send_leave_notice(sev);
+		break;
+
+		/*
+		 * se_state is changed from LEAVE_ACKWAIT to LEAVE_ACKED in
+		 * process_reply  (when all the replies have been received)
+		 */
+
+	case SEST_LEAVE_ACKED:
+		error = check_leave_notice(sev);
+		if (error)
+			break;
+
+		sev->se_state = SEST_LSTOP_ACKWAIT;
+		error = send_leave_stop(sev);
+		break;
+
+		/*
+		 * se_state is changed from LSTOP_ACKWAIT to LSTOP_ACKED in
+		 * process_reply
+		 */
+
+	case SEST_LSTOP_ACKED:
+		error = check_leave_stop(sev);
+		if (error)
+			break;
+
+		sev->se_state = SEST_LSTART_WAITREMOTE;
+		error = send_leave_start(sev);
+		break;
+
+		/*
+		 * se_state is changed from LSTART_WAITREMOTE to
+		 * LSTART_REMOTEDONE in process_leave_done
+		 */
+
+	case SEST_LSTART_REMOTEDONE:
+		sevent_done(sev);
+		break;
+
+	default:
+		log_error(sev->se_sg, "process_leave_sevent state=%u",
+			  sev->se_state);
+	}
+
+ cancel:
+	if (error) {
+		log_debug(sev->se_sg, "process_leave error %d %lx", error,
+			  sev->se_flags);
+		/* restart the sevent from the beginning */
+		sev->se_state = SEST_LEAVE_BEGIN;
+		set_bit(SEFL_DELAY, &sev->se_flags);
+		schedule_sev_restart(sev);
+	}
+}
+
+/*
+ * Sevent backout code.  Take appropriate steps when a recovery occurs while
+ * we're in the midst of an sevent.  The recovery may or may not affect the
+ * sevent.  If it does, it usually means cancelling the sevent and restarting
+ * it from the beginning once the recovery processing is done.
+ */
+
+/*
+ * If any of the nodes that replied with OK is dead, we give up on the current
+ * join attempt and restart.  Otherwise, this sevent can continue.
+ */
+
+static int backout_join_acked(sm_sevent_t *sev)
+{
+	sm_node_t *memb;
+	int idx;
+
+	for (idx = 0; idx < sev->se_node_count; idx++) {
+		if (sev->se_node_status[idx] != STATUS_POS)
+			continue;
+		/* this node replied POS: is it now flagged for recovery? */
+		list_for_each_entry(memb, &sm_members, list) {
+			if (memb->id == sev->se_node_ids[idx] &&
+			    test_bit(SNFL_NEED_RECOVERY, &memb->flags))
+				return TRUE;
+		}
+	}
+	return FALSE;
+}
+
+/*
+ * In this state our sg member list exists and mark_affected_sgs() will have
+ * set NEED_RECOVERY if any of the nodes in the sg we're joining is dead.  We
+ * restart the join process if this is the case, otherwise this sevent can
+ * continue.
+ */
+
+static int backout_jstop_ackwait(sm_sevent_t *sev)
+{
+	sm_group_t *group = sev->se_sg;
+
+	if (test_bit(SGFL_NEED_RECOVERY, &group->flags)) {
+		clear_bit(SEFL_ALLOW_JSTOP, &sev->se_flags);
+		free_sg_memb(group);
+		return TRUE;	/* the join has to be restarted */
+	}
+	return FALSE;		/* sg not flagged, sevent continues */
+}
+
+/*
+ * Same as previous.
+ */
+
+static int backout_jstop_acked(sm_sevent_t *sev)
+{
+	return backout_jstop_ackwait(sev);	/* handled identically */
+}
+
+/*
+ * If NEED_RECOVERY is set a member of the sg we're joining died while we were
+ * starting our service.  The recovery process will restart the service on all
+ * the prior sg members (not including those that died or us).  We will
+ * reattempt our join which should be accepted once the nodes are done with
+ * recovery.
+ */
+
+static int backout_jstart_servicewait(sm_sevent_t *sev)
+{
+	sm_group_t *group = sev->se_sg;
+
+	if (test_bit(SGFL_NEED_RECOVERY, &group->flags)) {
+		clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags);
+		group->ops->stop(group->service_data);
+		free_sg_memb(group);
+		return TRUE;	/* the join has to be restarted */
+	}
+	return FALSE;		/* sg not flagged, sevent continues */
+}
+
+/*
+ * Same as previous.
+ */
+
+static int backout_jstart_servicedone(sm_sevent_t *sev)
+{
+	return backout_jstart_servicewait(sev);	/* handled identically */
+}
+
+/*
+ * If NEED_RECOVERY is set a member of the sg we're joining died while we were
+ * waiting on the "all done" barrier.  Stop our service that we just started
+ * and cancel the barrier.  The recovery process will restart the service on
+ * all the prior sg members (not including those that died or us).  We will
+ * reattempt our join which should be accepted once the nodes are done with
+ * recovery.
+ */
+
+static int backout_barrier_wait(sm_sevent_t *sev)
+{
+	sm_group_t *group = sev->se_sg;
+	char barrier[MAX_BARRIER_NAME_LEN];
+
+	if (!test_bit(SGFL_NEED_RECOVERY, &group->flags))
+		return FALSE;
+
+	clear_bit(SEFL_ALLOW_BARRIER, &sev->se_flags);
+
+	group->ops->stop(group->service_data);
+
+	/* cancel the barrier named by sg id, our node, sevent id and count */
+	memset(barrier, 0, sizeof(barrier));
+	snprintf(barrier, sizeof(barrier), "sm.%u.%u.%u.%u", group->global_id,
+		 sm_our_nodeid, sev->se_id, group->memb_count);
+	kcl_barrier_cancel(barrier);
+
+	free_sg_memb(group);
+	return TRUE;
+}
+
+/*
+ * If NEED_RECOVERY is set, a member of the sg we just joined has failed.  The
+ * recovery began after the barrier callback.  If the result in the callback is
+ * "success" then we are joined, this sevent is finished and we'll process the
+ * sg within the forthcoming recovery with the other members.
+ *
+ * We rely upon cnxman to guarantee that once all nodes have joined a barrier,
+ * all nodes will receive the corresponding barrier callback *before any*
+ * receive an sm_member_update() due to one of those nodes failing just after
+ * joining the barrier.  If some nodes receive the sm_member_update() before
+ * the barrier callback and others receive the barrier callback before the
+ * sm_member_update() then they will disagree as to whether the node joining/
+ * leaving is in/out of the sg.
+ */
+
+static int backout_barrier_done(sm_sevent_t *sev)
+{
+	sm_group_t *group = sev->se_sg;
+
+	if (!test_bit(SGFL_NEED_RECOVERY, &group->flags))
+		return FALSE;
+
+	if (sev->se_barrier_status) {
+		group->ops->stop(group->service_data);
+		free_sg_memb(group);
+		return TRUE;	/* nonzero status: restart the join */
+	}
+	/* barrier status is zero: the join is complete, finish the sevent */
+	do_finish_new(sev);
+	sevent_done(sev);
+	return FALSE;
+}
+
+/*
+ * We've done nothing yet, just restart when recovery is done (if sg is flagged
+ * with recovery.)
+ */
+
+static int backout_leave_begin(sm_sevent_t *sev)
+{
+	sm_group_t *group = sev->se_sg;
+
+	/* nothing to undo; report whether the sg is flagged for recovery */
+	if (test_bit(SGFL_NEED_RECOVERY, &group->flags))
+		return TRUE;
+	return FALSE;
+}
+
+/*
+ * Ignore any replies to our leave notice and restart when recovery is done (if
+ * sg is flagged with recovery.)
+ */
+
+static int backout_leave_ackwait(sm_sevent_t *sev)
+{
+	sm_group_t *group = sev->se_sg;
+
+	if (test_bit(SGFL_NEED_RECOVERY, &group->flags)) {
+		/* recovery pending: clear ALLOW_LEAVE and restart later */
+		clear_bit(SEFL_ALLOW_LEAVE, &sev->se_flags);
+		return TRUE;
+	}
+	return FALSE;
+}
+
+/*
+ * Same as previous.
+ */
+
+static int backout_leave_acked(sm_sevent_t *sev)
+{
+	return backout_leave_ackwait(sev);	/* handled identically */
+}
+
+/*
+ * Ignore any stop replies.  All the members will be stopped anyway to do the
+ * recovery.  Let that happen and restart our leave when done.
+ */
+
+static int backout_lstop_ackwait(sm_sevent_t *sev)
+{
+	sm_group_t *group = sev->se_sg;
+
+	if (test_bit(SGFL_NEED_RECOVERY, &group->flags)) {
+		/* recovery pending: clear ALLOW_LSTOP and restart later */
+		clear_bit(SEFL_ALLOW_LSTOP, &sev->se_flags);
+		return TRUE;
+	}
+	return FALSE;
+}
+
+/*
+ * Same as previous.
+ */
+
+static int backout_lstop_acked(sm_sevent_t *sev)
+{
+	return backout_lstop_ackwait(sev);	/* handled identically */
+}
+
+/*
+ * All members will be stopped due to recovery and restarted by recovery
+ * processing.  That includes us, we have to retry the leave once the recovery
+ * is done.
+ */
+
+static int backout_lstart_waitremote(sm_sevent_t *sev)
+{
+	sm_group_t *group = sev->se_sg;
+	int flagged = test_bit(SGFL_NEED_RECOVERY, &group->flags);
+
+	/* no local state is undone here; a flagged sg just means the
+	 * caller restarts the leave */
+	return flagged ? TRUE : FALSE;
+}
+
+/*
+ * Reset an sevent to its beginning so it can be restarted.  This is necessary
+ * when recovery affects an SG while we're trying to join or leave (ie. a node
+ * in the SG fails).
+ */
+
+void backout_sevents(void)
+{
+	sm_sevent_t *sev, *safe;
+	int delay;
+
+	list_for_each_entry_safe(sev, safe, &joinleave_events, se_list) {
+		/* delay becomes TRUE when the sevent has to be restarted */
+		delay = FALSE;
+
+		log_debug(sev->se_sg, "backout sevent state %u", sev->se_state);
+
+		switch (sev->se_state) {
+
+		/* backout after kcl_join_service and before
+		 * send_join_notice */
+		case SEST_JOIN_BEGIN:
+			break;
+
+		/* backout after send_join_notice and before final
+		 * process_reply */
+		case SEST_JOIN_ACKWAIT:
+			clear_bit(SEFL_ALLOW_JOIN, &sev->se_flags);
+			sev->se_state = SEST_JOIN_BEGIN;
+			set_bit(SEFL_CHECK, &sev->se_flags);
+			wake_serviced(DO_JOINLEAVE);
+			break;
+
+		/* backout after final process_reply and before
+		 * check_join_notice */
+		case SEST_JOIN_ACKED:
+			delay = backout_join_acked(sev);
+			break;
+
+		/* backout after send_join_stop and before final
+		 * process_reply */
+		case SEST_JSTOP_ACKWAIT:
+			delay = backout_jstop_ackwait(sev);
+			break;
+
+		/* backout after final process_reply and before
+		 * check_join_stop */
+		case SEST_JSTOP_ACKED:
+			delay = backout_jstop_acked(sev);
+			break;
+
+		/* backout after send_join_start and before
+		 * kcl_start_done */
+		case SEST_JSTART_SERVICEWAIT:
+			delay = backout_jstart_servicewait(sev);
+			break;
+
+		/* backout after kcl_start_done and before
+		 * startdone_barrier_new */
+		case SEST_JSTART_SERVICEDONE:
+			delay = backout_jstart_servicedone(sev);
+			break;
+
+		/* backout after startdone_barrier_new and before
+		 * callback_startdone_barrier_new */
+		case SEST_BARRIER_WAIT:
+			delay = backout_barrier_wait(sev);
+			break;
+
+		/* backout after callback_startdone_barrier_new and
+		 * before check_startdone_barrier_new */
+		case SEST_BARRIER_DONE:
+			delay = backout_barrier_done(sev);
+			break;
+
+		/* backout after kcl_leave_service and before
+		 * send_leave_notice */
+		case SEST_LEAVE_BEGIN:
+			delay = backout_leave_begin(sev);
+			break;
+
+		/* backout after send_leave_notice and before final
+		 * process_reply */
+		case SEST_LEAVE_ACKWAIT:
+			delay = backout_leave_ackwait(sev);
+			break;
+
+		/* backout after final process_reply and before
+		 * check_leave_notice */
+		case SEST_LEAVE_ACKED:
+			delay = backout_leave_acked(sev);
+			break;
+
+		/* backout after send_leave_stop and before final
+		 * process_reply */
+		case SEST_LSTOP_ACKWAIT:
+			delay = backout_lstop_ackwait(sev);
+			break;
+
+		/* backout after final process_reply and before
+		 * check_leave_stop */
+		case SEST_LSTOP_ACKED:
+			delay = backout_lstop_acked(sev);
+			break;
+
+		/* backout after send_leave_start and before
+		 * process_lstart_done */
+		case SEST_LSTART_WAITREMOTE:
+			delay = backout_lstart_waitremote(sev);
+			break;
+
+		/* backout after process_lstart_done and before
+		 * process_leave_sevent */
+		case SEST_LSTART_REMOTEDONE:
+			sevent_done(sev);	/* the leave is simply completed */
+			delay = FALSE;
+			break;
+
+		default:
+			log_error(sev->se_sg, "backout_sevents: bad state %d",
+				  sev->se_state);
+		}
+		/* restart from the BEGIN state of the leave or join sequence */
+		if (delay) {
+			if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
+				sev->se_state = SEST_LEAVE_BEGIN;
+				set_bit(SEFL_DELAY_RECOVERY, &sev->se_flags);
+				set_bit(SEFL_CHECK, &sev->se_flags);
+				wake_serviced(DO_JOINLEAVE);
+			} else {
+				sev->se_state = SEST_JOIN_BEGIN;
+				set_bit(SEFL_CHECK, &sev->se_flags);
+				wake_serviced(DO_JOINLEAVE);
+			}
+		}
+	}
+}
+
+void process_joinleave(void)
+{
+	sm_sevent_t *cur = NULL, *next;
+
+	/* move at most one new event (the first) onto the working list */
+	spin_lock(&new_event_lock);
+	if (!list_empty(&new_event)) {
+		cur = list_entry(new_event.next, sm_sevent_t, se_list);
+		list_del(&cur->se_list);
+		list_add_tail(&cur->se_list, &joinleave_events);
+		set_bit(SEFL_CHECK, &cur->se_flags);
+	}
+	spin_unlock(&new_event_lock);
+
+	list_for_each_entry_safe(cur, next, &joinleave_events, se_list) {
+		/* CHECK is consumed even if the sevent turns out delayed */
+		if (!test_and_clear_bit(SEFL_CHECK, &cur->se_flags))
+			continue;
+		if (test_bit(SEFL_DELAY, &cur->se_flags) ||
+		    test_bit(SEFL_DELAY_RECOVERY, &cur->se_flags))
+			continue;
+		if (cur->se_state >= SEST_LEAVE_BEGIN)
+			process_leave_sevent(cur);
+		else
+			process_join_sevent(cur);
+	}
+}
--- linux-2.6.9.orig/cluster/cman/sm_joinleave.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_joinleave.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,23 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SM_JOINLEAVE_DOT_H__
+#define __SM_JOINLEAVE_DOT_H__
+
+void init_joinleave(void);
+void new_joinleave(sm_sevent_t *sev);
+void process_joinleave(void);
+void backout_sevents(void);
+sm_sevent_t *find_sevent(unsigned int id);
+
+#endif
--- linux-2.6.9.orig/cluster/cman/sm_membership.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_membership.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,701 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "sm.h"
+
+extern struct list_head		sm_members;
+
+/*
+ * Routines for SG members to handle other nodes joining or leaving the SG.
+ * These "uevent" membership update routines are the response to an "sevent" on
+ * a joining/leaving node.
+ */
+
+static void del_memb_node(sm_group_t *sg, uint32_t nodeid)
+{
+	sm_node_t *node;
+	/* remove the member with id nodeid from sg->memb; no-op if absent */
+	list_for_each_entry(node, &sg->memb, list) {
+		if (node->id != nodeid)
+			continue;
+		list_del(&node->list);	/* first match: unlink and free it */
+		kfree(node);
+		sg->memb_count--;
+		log_debug(sg, "del node %u count %d", nodeid, sg->memb_count);
+		break;	/* node is freed, so the walk must end here */
+	}
+}
+
+static void add_memb_node(sm_group_t *sg, sm_node_t *node)
+{
+	list_add_tail(&node->list, &sg->memb);	/* append to the member list */
+	sg->memb_count++;			/* keep the count in step */
+	log_debug(sg, "add node %u count %d", node->id, sg->memb_count);
+}
+
+/*
+ * Join 1.  The receive end of send_join_stop() from a node requesting to join
+ * the SG.  We stop the service so it can be restarted with the new node.
+ */
+
+static int process_join_stop(sm_group_t *sg)
+{
+	sm_uevent_t *uev = &sg->uevent;
+	sm_node_t *node;
+	sm_msg_t reply;
+	int error;
+
+	if (uev->ue_num_nodes != sg->memb_count + 1) {
+		log_error(sg, "process_join_stop: bad num nodes %u %u",
+			  uev->ue_num_nodes, sg->memb_count);
+		return -1;
+	}
+
+	sm_set_event_id(&uev->ue_id);
+
+	node = sm_find_joiner(sg, uev->ue_nodeid);	/* must be a known joiner */
+	if (!node) {
+		log_error(sg, "process_join_stop: no node %d", uev->ue_nodeid);
+		return -1;
+	}
+
+	sg->state = SGST_UEVENT;
+	sg->ops->stop(sg->service_data);
+
+	/* zero the whole reply so no uninitialized stack bytes are sent */
+	memset(&reply, 0, sizeof(reply));
+	reply.ms_type = SMSG_JSTOP_REP;
+	reply.ms_status = STATUS_POS;
+	reply.ms_sevent_id = uev->ue_remote_seid;
+	smsg_bswap_out(&reply);
+
+	error = send_nodeid_message((char *) &reply, sizeof(reply),
+				    uev->ue_nodeid);
+	return (error < 0) ? error : 0;	/* only negative results are errors */
+}
+
+/*
+ * Join 2.  The receive end of send_join_start() from a node joining the SG.
+ * We are re-starting the service with the new member added.
+ */
+
+static int process_join_start(sm_group_t *sg)
+{
+	sm_uevent_t *uev = &sg->uevent;
+	sm_node_t *joiner, *cur;
+	uint32_t *ids;
+	int nr = 0;
+
+	/* room for every current member plus the joiner; this memory is
+	 * passed to the service which must free it */
+	SM_RETRY(ids = kmalloc((sg->memb_count + 1) * sizeof(*ids),
+			       GFP_KERNEL), ids);
+
+	/* the joiner moves from the joining list onto the member list */
+	joiner = sm_find_joiner(sg, uev->ue_nodeid);
+	SM_ASSERT(joiner, printk("nodeid=%u\n", uev->ue_nodeid););
+	list_del(&joiner->list);
+	add_memb_node(sg, joiner);
+
+	/* copy the ids of the updated member list for the service */
+	list_for_each_entry(cur, &sg->memb, list)
+		ids[nr++] = cur->id;
+
+	set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
+
+	sg->ops->start(sg->service_data, ids, nr, uev->ue_id,
+		       SERVICE_NODE_JOIN);
+	return 0;
+}
+
+/*
+ * Join 3.  When done starting their local service, every previous SG member
+ * calls startdone_barrier() and the new/joining member calls
+ * startdone_barrier_new().  The barrier returns when everyone has started
+ * their service and joined the barrier.
+ */
+
+static int startdone_barrier(sm_group_t *sg)
+{
+	sm_uevent_t *uev = &sg->uevent;
+	char barrier[MAX_BARRIER_NAME_LEN];
+
+	/* start from a failing status; a zero status means success */
+	uev->ue_barrier_status = -1;
+	set_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
+
+	/* If we're the only member, skip the barrier */
+	if (sg->memb_count == 1) {
+		process_startdone_barrier(sg, 0);
+		return 0;
+	}
+
+	/*
+	 * Name the barrier after the sg's global id, the uevent's node id,
+	 * the remote sevent id and the member count, then hand it to
+	 * sm_barrier().
+	 */
+	memset(barrier, 0, sizeof(barrier));
+	snprintf(barrier, sizeof(barrier), "sm.%u.%u.%u.%u", sg->global_id,
+		 uev->ue_nodeid, uev->ue_remote_seid, sg->memb_count);
+	return sm_barrier(barrier, sg->memb_count, SM_BARRIER_STARTDONE);
+}
+
+/*
+ * Join 4.  Check that the "all started" barrier returned a successful status.
+ * The newly joined member calls check_startdone_barrier_new().
+ */
+
+static int check_startdone_barrier(sm_group_t *sg)
+{
+	/* the stored barrier status doubles as the error code (0 == ok) */
+	return sg->uevent.ue_barrier_status;
+}
+
+/*
+ * Join 5.  Send the service a "finish" indicating that all members have
+ * successfully started.  The newly joined member calls do_finish_new().
+ */
+
+static void do_finish(sm_group_t *sg)
+{
+	sg->state = SGST_RUN;			/* mark the sg as SGST_RUN */
+	clear_bit(SGFL_UEVENT, &sg->flags);	/* drop the sg's uevent flag */
+	sg->ops->finish(sg->service_data, sg->uevent.ue_id);
+}
+
+/*
+ * Join 6.  The uevent is done.  If this was a uevent for a node leaving the
+ * SG, then send a final message to the departed node signalling that the
+ * remaining nodes have restarted since it left.
+ */
+
+static void uevent_done(sm_group_t *sg)
+{
+	sm_uevent_t *uev = &sg->uevent;
+	sm_msg_t reply;
+
+	if (test_bit(UEFL_LEAVE, &uev->ue_flags)) {
+		memset(&reply, 0, sizeof(reply));	/* no stale stack bytes */
+		reply.ms_type = SMSG_LSTART_DONE;
+		reply.ms_status = STATUS_POS;
+		reply.ms_sevent_id = uev->ue_remote_seid;
+		smsg_bswap_out(&reply);
+		send_nodeid_message((char *)&reply, sizeof(reply), uev->ue_nodeid);
+	}
+	memset(&sg->uevent, 0, sizeof(sm_uevent_t));	/* reset the uevent */
+}
+
+/*
+ * Leave 1.  The receive end of send_leave_stop() from a node leaving the SG.
+ */
+
+static int process_leave_stop(sm_group_t *sg)
+{
+	sm_uevent_t *uev = &sg->uevent;
+	sm_msg_t reply;
+	int error;
+
+	sm_set_event_id(&uev->ue_id);
+
+	sg->state = SGST_UEVENT;
+	sg->ops->stop(sg->service_data);
+
+	/* zero the whole reply so no uninitialized stack bytes are sent */
+	memset(&reply, 0, sizeof(reply));
+	reply.ms_type = SMSG_LSTOP_REP;
+	reply.ms_status = STATUS_POS;
+	reply.ms_sevent_id = uev->ue_remote_seid;
+	smsg_bswap_out(&reply);
+
+	error = send_nodeid_message((char *) &reply, sizeof(reply),
+				    uev->ue_nodeid);
+	return (error < 0) ? error : 0;	/* only negative results are errors */
+}
+
+/*
+ * Leave 2.  The receive end of send_leave_start() from a node leaving the SG.
+ * We are re-starting the service (without the node that's left naturally.)
+ */
+
+static int process_leave_start(sm_group_t *sg)
+{
+	sm_uevent_t *uev = &sg->uevent;
+	sm_node_t *node;
+	uint32_t *memb;
+	int count = 0;
+
+	SM_ASSERT(sg->memb_count > 1,
+		  printk("memb_count=%u\n", sg->memb_count););
+
+	/* remove departed member from sg member list */
+	del_memb_node(sg, uev->ue_nodeid);
+
+	/* this memory is passed to the service which must free it; size it
+	 * from the list as it is now so the copy below cannot overrun it */
+	SM_RETRY(memb = kmalloc(sg->memb_count * sizeof(*memb), GFP_KERNEL),
+		 memb);
+
+	/* build member list to pass to service */
+	list_for_each_entry(node, &sg->memb, list)
+		memb[count++] = node->id;
+
+	/* allow us to accept the start_done callback for this start */
+	set_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
+
+	sg->ops->start(sg->service_data, memb, count, uev->ue_id,
+		       SERVICE_NODE_LEAVE);
+	return 0;
+}
+
+/*
+ * Move through the steps of another node joining or leaving the SG.
+ */
+
+static void process_one_uevent(sm_group_t *sg)
+{
+	sm_uevent_t *uev = &sg->uevent;
+	int error = 0;
+
+	log_debug(sg, "uevent state %u node %u", uev->ue_state, uev->ue_nodeid);
+
+	switch (uev->ue_state) {
+
+		/*
+		 * a uevent is initialized with state JSTOP in
+		 * process_stop_request
+		 */
+
+	case UEST_JSTOP:
+		uev->ue_state = UEST_JSTART_WAITCMD;
+		error = process_join_stop(sg);
+		break;
+
+		/*
+		 * ue_state is changed from JSTART_WAITCMD to JSTART in
+		 * process_start_request
+		 */
+
+	case UEST_JSTART:
+		uev->ue_state = UEST_JSTART_SERVICEWAIT;
+		error = process_join_start(sg);
+		break;
+
+		/*
+		 * ue_state is changed from JSTART_SERVICEWAIT to
+		 * JSTART_SERVICEDONE in kcl_start_done
+		 */
+
+	case UEST_JSTART_SERVICEDONE:
+		uev->ue_state = UEST_BARRIER_WAIT;
+		error = startdone_barrier(sg);
+		break;
+
+		/*
+		 * ue_state is changed from BARRIER_WAIT to BARRIER_DONE in
+		 * process_startdone_barrier
+		 */
+
+	case UEST_BARRIER_DONE:
+		error = check_startdone_barrier(sg);
+		if (error)
+			break;
+
+		do_finish(sg);
+		uevent_done(sg);
+		break;
+
+		/*
+		 * a uevent is initialized with state LSTOP in
+		 * process_stop_request
+		 */
+
+	case UEST_LSTOP:
+		uev->ue_state = UEST_LSTART_WAITCMD;
+		error = process_leave_stop(sg);
+		break;
+
+		/*
+		 * a uevent is changed from LSTART_WAITCMD to LSTART in
+		 * process_start_request
+		 */
+
+	case UEST_LSTART:
+		uev->ue_state = UEST_LSTART_SERVICEWAIT;
+		error = process_leave_start(sg);
+		break;
+
+		/*
+		 * a uevent is changed from LSTART_SERVICEWAIT to
+		 * LSTART_SERVICEDONE in kcl_start_done
+		 */
+
+	case UEST_LSTART_SERVICEDONE:
+		uev->ue_state = UEST_BARRIER_WAIT;
+		error = startdone_barrier(sg);
+		break;
+
+	default:
+		error = -1;	/* no processing step exists for this state */
+	}
+
+	/* If we encounter an error during these routines, we only log it,
+	   expecting that a node failure related to this sg will cause a
+	   recovery event to arrive and call cancel_one_uevent(). */
+
+	if (error)
+		log_error(sg, "process_one_uevent error %d state %u",
+			  error, uev->ue_state);
+}
+
+static sm_node_t *failed_memb(sm_group_t *sg, int *count)
+{
+	sm_node_t *memb, *global, *uev_node = NULL;
+
+	list_for_each_entry(memb, &sg->memb, list) {
+		global = sm_find_member(memb->id);
+		SM_ASSERT(global, );
+
+		if (!test_bit(SNFL_NEED_RECOVERY, &global->flags))
+			continue;
+		/* a failed member; remember it if it is the uevent's node */
+		(*count)++;
+		if (memb->id == sg->uevent.ue_nodeid)
+			uev_node = global;
+	}
+	return uev_node;
+}
+
+static void send_recover_msg(sm_group_t *sg)
+{
+	int msg_len = 0;
+	char *recover = create_smsg(sg, SMSG_RECOVER, 0, &msg_len, NULL);
+	/* send the SMSG_RECOVER message to the members of this sg */
+	send_members_message(sg, recover, msg_len);
+}
+
+static void cancel_barrier(sm_group_t *sg)
+{
+	sm_uevent_t *uev = &sg->uevent;
+	char barrier[MAX_BARRIER_NAME_LEN];
+
+	clear_bit(UEFL_ALLOW_BARRIER, &uev->ue_flags);
+
+	/* same name format as used by startdone_barrier() */
+	memset(barrier, 0, sizeof(barrier));
+	snprintf(barrier, sizeof(barrier), "sm.%u.%u.%u.%u", sg->global_id,
+		 uev->ue_nodeid, uev->ue_remote_seid, sg->memb_count);
+	kcl_barrier_cancel(barrier);
+}
+
+static void cancel_one_uevent(sm_group_t *sg, int *effected)
+{
+	sm_uevent_t *uev = &sg->uevent;
+	int failed_count;
+	sm_node_t *node, *failed_joiner, *failed_leaver;
+	/* back out the sg's uevent; *effected counts sgs newly flagged here */
+	log_debug(sg, "cancel uevent state %u node %u", uev->ue_state,
+		  uev->ue_nodeid);
+
+	switch (uev->ue_state) {
+
+	case UEST_JSTOP:
+	case UEST_JSTART_WAITCMD:
+	case UEST_JSTART:
+
+		sg->ops->stop(sg->service_data);
+
+		failed_count = 0;
+		failed_joiner = failed_memb(sg, &failed_count);
+		SM_ASSERT(!failed_joiner, );
+
+		node = sm_find_member(uev->ue_nodeid);
+		if (node && test_bit(SNFL_NEED_RECOVERY, &node->flags))
+			failed_joiner = node;
+
+		if (!failed_count) {
+			/* only joining node failed */
+			SM_ASSERT(failed_joiner, );
+			SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+			set_bit(SGFL_NEED_RECOVERY, &sg->flags);
+			(*effected)++;
+			/* some nodes may not have gotten a JSTOP message
+			   in which case this will tell them to begin
+			   recovery for this sg. */
+			send_recover_msg(sg);
+
+		} else {
+			/* a member node failed (and possibly joining node, it
+			   doesn't matter) */
+			SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+		}
+
+		clear_bit(SGFL_UEVENT, &sg->flags);
+		memset(uev, 0, sizeof(sm_uevent_t));
+		break;
+
+
+	case UEST_JSTART_SERVICEWAIT:
+	case UEST_JSTART_SERVICEDONE:
+
+		clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
+		sg->ops->stop(sg->service_data);
+
+		failed_count = 0;
+		failed_joiner = failed_memb(sg, &failed_count);
+		SM_ASSERT(failed_count, );
+		SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+
+		if (failed_count == 1 && failed_joiner) {
+			/* only joining node failed */
+
+		} else if (failed_count && failed_joiner) {
+			/* joining node and another member failed */
+
+		} else {
+			/* other member failed, joining node still alive */
+			SM_ASSERT(!failed_joiner, );
+			del_memb_node(sg, uev->ue_nodeid);
+		}
+
+		clear_bit(SGFL_UEVENT, &sg->flags);
+		memset(uev, 0, sizeof(sm_uevent_t));
+		break;
+
+
+	case UEST_LSTOP:
+	case UEST_LSTART_WAITCMD:
+	case UEST_LSTART:
+
+		sg->ops->stop(sg->service_data);
+
+		failed_count = 0;
+		failed_leaver = failed_memb(sg, &failed_count);
+		SM_ASSERT(failed_count, );
+		SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+
+		if (failed_count == 1 && failed_leaver) {
+			/* only leaving node failed */
+
+		} else if (failed_count && failed_leaver) {
+			/* leaving node and another member failed */
+
+		} else {
+			/* other member failed, leaving node still alive */
+			SM_ASSERT(!failed_leaver, );
+		}
+
+		clear_bit(SGFL_UEVENT, &sg->flags);
+		memset(uev, 0, sizeof(sm_uevent_t));
+		break;
+
+
+	case UEST_LSTART_SERVICEWAIT:
+	case UEST_LSTART_SERVICEDONE:
+
+		clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags);
+		sg->ops->stop(sg->service_data);
+
+		failed_count = 0;
+		failed_leaver = failed_memb(sg, &failed_count);
+		SM_ASSERT(!failed_leaver, );
+
+		node = sm_find_member(uev->ue_nodeid);
+		if (node && test_bit(SNFL_NEED_RECOVERY, &node->flags))
+			failed_leaver = node;
+
+		if (!failed_count) {
+			/* only leaving node failed */
+			SM_ASSERT(failed_leaver, );
+			SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+			set_bit(SGFL_NEED_RECOVERY, &sg->flags);
+			(*effected)++;
+
+		} else if (failed_count && failed_leaver) {
+			/* leaving node and another member failed */
+			SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+
+		} else {
+			/* other member failed, leaving node still alive */
+			SM_ASSERT(failed_count, );
+			SM_ASSERT(!failed_leaver, );
+			SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+			node = sm_new_node(sg->uevent.ue_nodeid);
+			add_memb_node(sg, node);
+		}
+
+		clear_bit(SGFL_UEVENT, &sg->flags);
+		memset(uev, 0, sizeof(sm_uevent_t));
+		break;
+
+
+	case UEST_BARRIER_WAIT:
+
+		if (test_bit(UEFL_LEAVE, &uev->ue_flags))
+			goto barrier_wait_leave;
+		/* NOTE(review): leave path skips stop + cancel_barrier; confirm */
+		sg->ops->stop(sg->service_data);
+		cancel_barrier(sg);
+
+ 	      barrier_wait_join:
+
+		failed_count = 0;
+		failed_joiner = failed_memb(sg, &failed_count);
+		SM_ASSERT(failed_count, );
+		SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+
+		if (failed_count == 1 && failed_joiner) {
+			/* only joining node failed */
+
+		} else if (failed_count && failed_joiner) {
+			/* joining node and another member failed */
+
+		} else {
+			/* other member failed, joining node still alive */
+			SM_ASSERT(!failed_joiner, );
+			del_memb_node(sg, uev->ue_nodeid);
+		}
+
+		clear_bit(SGFL_UEVENT, &sg->flags);
+		memset(uev, 0, sizeof(sm_uevent_t));
+		break;
+
+              barrier_wait_leave:
+
+		failed_count = 0;
+		failed_leaver = failed_memb(sg, &failed_count);
+		SM_ASSERT(!failed_leaver, );
+
+		node = sm_find_member(uev->ue_nodeid);
+		if (node && test_bit(SNFL_NEED_RECOVERY, &node->flags))
+			failed_leaver = node;
+
+		if (!failed_count) {
+			/* only leaving node failed */
+			SM_ASSERT(failed_leaver, );
+			SM_ASSERT(!test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+			set_bit(SGFL_NEED_RECOVERY, &sg->flags);
+			(*effected)++;
+
+		} else if (failed_count && failed_leaver) {
+			/* leaving node and another member failed */
+			SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+
+		} else {
+			/* other member failed, leaving node still alive */
+			SM_ASSERT(failed_count, );
+			SM_ASSERT(!failed_leaver, );
+			SM_ASSERT(test_bit(SGFL_NEED_RECOVERY, &sg->flags), );
+			node = sm_new_node(sg->uevent.ue_nodeid);
+			add_memb_node(sg, node);
+		}
+
+		clear_bit(SGFL_UEVENT, &sg->flags);
+		memset(uev, 0, sizeof(sm_uevent_t));
+		break;
+
+
+	case UEST_BARRIER_DONE:
+		/* a zero barrier status means the uevent succeeded: finish it */
+		if (!uev->ue_barrier_status) {
+			do_finish(sg);
+			uevent_done(sg);
+			break;
+		}
+		/* otherwise reuse the BARRIER_WAIT code after its stop/cancel */
+		if (test_bit(UEFL_LEAVE, &uev->ue_flags))
+			goto barrier_wait_leave;
+		else
+			goto barrier_wait_join;
+
+
+	default:
+		log_error(sg, "cancel_one_uevent: state %d", uev->ue_state);
+	}
+}
+
+void cancel_uevents(int *effected)
+{
+	sm_group_t *group;
+	sm_node_t *cur, *joiner;
+	int level;
+
+	/*
+	 * Pass 1: clear every dead node from the "interested in joining"
+	 * list of any SG.  The node is added to that list before the uevent
+	 * begins.
+	 */
+	list_for_each_entry(cur, &sm_members, list) {
+		if (!test_bit(SNFL_NEED_RECOVERY, &cur->flags))
+			continue;
+
+		for (level = 0; level < SG_LEVELS; level++) {
+			list_for_each_entry(group, &sm_sg[level], list) {
+				joiner = sm_find_joiner(group, cur->id);
+				if (!joiner)
+					continue;
+				log_debug(group, "clear joining node %u",
+					  joiner->id);
+				list_del(&joiner->list);
+				kfree(joiner);
+			}
+		}
+		schedule();
+	}
+
+	/*
+	 * Pass 2: adjust any uevents in sg's affected by the failed node(s).
+	 * We may have some cancelling to do if the sg is flagged as having a
+	 * failed member, or if a joining or leaving node has died.
+	 */
+	for (level = 0; level < SG_LEVELS; level++) {
+		list_for_each_entry(group, &sm_sg[level], list) {
+			if (!test_bit(SGFL_UEVENT, &group->flags))
+				continue;
+			if (!test_bit(SGFL_NEED_RECOVERY, &group->flags)) {
+				/* sg not flagged: only cancel when the
+				 * uevent's own node has failed */
+				if (!group->uevent.ue_nodeid)
+					continue;
+				cur = sm_find_member(group->uevent.ue_nodeid);
+				SM_ASSERT(cur, );
+				if (!test_bit(SNFL_NEED_RECOVERY, &cur->flags))
+					continue;
+			}
+			cancel_one_uevent(group, effected);
+		}
+		schedule();
+	}
+}
+
+void process_membership(void)
+{
+	sm_group_t *group;
+	int level;
+
+	down(&sm_sglock);
+
+	/*
+	 * Walk every sg on every level; only an sg with SGFL_UEVENT set and
+	 * whose uevent had UEFL_CHECK set (cleared by the test) is processed.
+	 */
+	for (level = 0; level < SG_LEVELS; level++) {
+		list_for_each_entry(group, &sm_sg[level], list) {
+			if (test_bit(SGFL_UEVENT, &group->flags) &&
+			    test_and_clear_bit(UEFL_CHECK,
+					       &group->uevent.ue_flags))
+				process_one_uevent(group);
+		}
+	}
+	up(&sm_sglock);
+}
--- linux-2.6.9.orig/cluster/cman/sm_membership.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_membership.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,20 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SM_MEMBERSHIP_DOT_H__
+#define __SM_MEMBERSHIP_DOT_H__
+
+void process_membership(void);		/* run uevents flagged UEFL_CHECK */
+void cancel_uevents(int *effected);	/* cancel uevents hit by node failure */
+
+#endif
--- linux-2.6.9.orig/cluster/cman/sm_message.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_message.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,856 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "sm.h"
+
+#define SMSG_BUF_SIZE (sizeof(sm_msg_t) + MAX_SERVICE_NAME_LEN + 1)
+
+extern struct socket *	sm_socket;	/* socket handed to kcl_sendmsg() */
+extern uint32_t 	sm_our_nodeid;	/* id of the local node */
+static uint32_t 	global_last_id;	/* highest global SG id seen/used */
+static struct list_head messages;	/* queue of received rq_entry's */
+static spinlock_t 	message_lock;	/* protects "messages" */
+static char		smsg_buf[SMSG_BUF_SIZE]; /* buffer used by create_smsg() */
+
+int send_nodeid_message(char *msg, int len, uint32_t nodeid);
+
+struct rq_entry {
+	struct list_head list;	/* link in "messages" */
+	char *msg;		/* kmalloc'ed copy of the message */
+	int len;		/* length of msg in bytes */
+	uint32_t nodeid;	/* id of the sending node */
+};
+typedef struct rq_entry rq_entry_t;
+
+/* Reset the global SG id counter and set up the (locked) receive queue. */
+void init_messages(void)
+{
+	spin_lock_init(&message_lock);
+	INIT_LIST_HEAD(&messages);
+	global_last_id = 1;
+}
+
+/* Returns (level << 24) | next global id, or 0 if either is out of range. */
+uint32_t sm_new_global_id(int level)
+{
+	/* The counter advances even when 0 (failure) is returned. */
+	uint32_t id = global_last_id++;
+
+	/* level must fit in the top byte, the counter in the low 24 bits */
+	if (level < 0 || level > 255 || id > 0x00FFFFFF)
+		return 0;
+
+	/* Widen before shifting: a promoted uint8_t << 24 can overflow int. */
+	id |= (uint32_t) level << 24;
+	return id;
+}
+
+static void smsg_copy_in(char *msg, sm_msg_t *smsg)
+{
+	sm_msg_t *wire = (sm_msg_t *) msg;	/* little-endian header */
+
+	smsg->ms_length = le16_to_cpu(wire->ms_length);
+	smsg->ms_sglevel = le16_to_cpu(wire->ms_sglevel);
+	smsg->ms_global_lastid = le32_to_cpu(wire->ms_global_lastid);
+	smsg->ms_global_sgid = le32_to_cpu(wire->ms_global_sgid);
+	smsg->ms_sevent_id = le32_to_cpu(wire->ms_sevent_id);
+	smsg->ms_status = wire->ms_status;
+	smsg->ms_type = wire->ms_type;
+}
+
+/* Convert the multi-byte header fields to little-endian, in place.  After
+ * this the fields no longer hold host-order values: do not read them. */
+
+void smsg_bswap_out(sm_msg_t *smsg)
+{
+	smsg->ms_length = cpu_to_le16(smsg->ms_length);
+	smsg->ms_sglevel = cpu_to_le16(smsg->ms_sglevel);
+	smsg->ms_global_lastid = cpu_to_le32(smsg->ms_global_lastid);
+	smsg->ms_global_sgid = cpu_to_le32(smsg->ms_global_sgid);
+	smsg->ms_sevent_id = cpu_to_le32(smsg->ms_sevent_id);
+}
+
+char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
+		  sm_sevent_t *sev)
+{
+	sm_msg_t *smsg = (sm_msg_t *) smsg_buf;
+	int fulllen = sizeof(sm_msg_t) + datalen;
+
+	/*
+	 * Messages are built in the single static smsg_buf, so the returned
+	 * pointer is only valid until the next create_smsg() call.
+	 */
+	memset(smsg_buf, 0, SMSG_BUF_SIZE);
+	SM_ASSERT(fulllen <= SMSG_BUF_SIZE,);
+
+	smsg->ms_sevent_id = sev ? sev->se_id : 0;
+	smsg->ms_length = datalen;
+	smsg->ms_sglevel = sg->level;
+	smsg->ms_global_sgid = sg->global_id;
+	smsg->ms_type = type;
+	smsg_bswap_out(smsg);	/* header goes out little-endian */
+	*msglen = fulllen;	/* header plus datalen bytes */
+	return smsg_buf;
+}
+
+/*
+ * Map a message type onto the sevent flag bit (SEFL_ALLOW_*) that gates it.
+ * A request and its reply share one bit, so the same helper serves both the
+ * sender (which sets the bit when it sends the request) and the receiver of
+ * the replies (which tests and finally clears it).
+ *
+ * Passing any type outside the four join/leave pairs trips SM_ASSERT.
+ */
+static unsigned int msgtype_to_flag(int type)
+{
+	unsigned int flag;
+
+	if (type == SMSG_JOIN_REQ || type == SMSG_JOIN_REP) {
+		flag = SEFL_ALLOW_JOIN;
+	} else if (type == SMSG_JSTOP_REQ || type == SMSG_JSTOP_REP) {
+		flag = SEFL_ALLOW_JSTOP;
+	} else if (type == SMSG_LEAVE_REQ || type == SMSG_LEAVE_REP) {
+		flag = SEFL_ALLOW_LEAVE;
+	} else if (type == SMSG_LSTOP_REQ || type == SMSG_LSTOP_REP) {
+		flag = SEFL_ALLOW_LSTOP;
+	} else {
+		/*
+		 * Not one of the gated types.  Note that flag is not
+		 * assigned on this path.
+		 */
+		SM_ASSERT(0, printk("msgtype_to_flag bad type %d\n", type););
+	}
+
+	return flag;
+}
+
+/* Nonzero if sev is currently waiting for messages of this type. */
+static int test_allowed_msgtype(sm_sevent_t *sev, int type)
+{
+	/* request and reply types resolve to the same SEFL_ALLOW_* bit */
+	return test_bit(msgtype_to_flag(type), &sev->se_flags);
+}
+
+/* Stop sev from accepting messages of this type. */
+static void clear_allowed_msgtype(sm_sevent_t *sev, int type)
+{
+	/* request and reply types resolve to the same SEFL_ALLOW_* bit */
+	clear_bit(msgtype_to_flag(type), &sev->se_flags);
+}
+
+/* Let sev accept messages of this type from now on. */
+static void set_allowed_msgtype(sm_sevent_t *sev, int type)
+{
+	/* request and reply types resolve to the same SEFL_ALLOW_* bit */
+	set_bit(msgtype_to_flag(type), &sev->se_flags);
+}
+
+/* Record or cross-check the SG's global id; 0 on success, -1 on error. */
+static int save_global_id(sm_sevent_t *sev, sm_msg_t *smsg)
+{
+	sm_group_t *sg = sev->se_sg;
+
+	if (smsg->ms_global_sgid == 0) {
+		log_error(sg, "save_global_id: zero sg id");
+		return -1;
+	}
+	if (sg->global_id == 0) {
+		sg->global_id = smsg->ms_global_sgid;
+		return 0;
+	}
+	if (sg->global_id == smsg->ms_global_sgid)
+		return 0;
+	log_error(sg, "save_global_id: id %x", smsg->ms_global_sgid);
+	return -1;
+}
+
+/*
+ * Keep track of the highest SG id which has been used in the cluster, in
+ * case we need to choose a new SG id.  Only the low 24 bits of
+ * ms_global_lastid are considered.
+ */
+static void save_lastid(sm_msg_t *smsg)
+{
+	uint32_t seen = smsg->ms_global_lastid & 0x00FFFFFF;
+
+	if (global_last_id < seen)
+		global_last_id = seen;
+}
+
+/*
+ * Return the ACKED state an sevent enters once all replies of msg_type are
+ * in.  cur_state is asserted to be the matching ACKWAIT state.
+ * A msg_type that is not one of the four reply types yields 0 and no
+ * assertion is made about cur_state.
+ */
+static int next_sev_state(int msg_type, int cur_state)
+{
+	if (msg_type == SMSG_JOIN_REP) {
+		SM_ASSERT(cur_state == SEST_JOIN_ACKWAIT,);
+		return SEST_JOIN_ACKED;
+	}
+	if (msg_type == SMSG_JSTOP_REP) {
+		SM_ASSERT(cur_state == SEST_JSTOP_ACKWAIT,);
+		return SEST_JSTOP_ACKED;
+	}
+	if (msg_type == SMSG_LEAVE_REP) {
+		SM_ASSERT(cur_state == SEST_LEAVE_ACKWAIT,);
+		return SEST_LEAVE_ACKED;
+	}
+	if (msg_type == SMSG_LSTOP_REP) {
+		SM_ASSERT(cur_state == SEST_LSTOP_ACKWAIT,);
+		return SEST_LSTOP_ACKED;
+	}
+
+	return 0;
+}
+
+/*
+ * Functions in sevent.c send messages to other nodes and then expect replies.
+ * This function collects the replies for the sevent messages and moves the
+ * sevent to the next stage when all the expected replies have been received.
+ * A reply for an unknown sevent, of a type the sevent is not waiting for, or
+ * a second reply from the same node is logged and dropped.
+ */
+
+static void process_reply(sm_msg_t *smsg, uint32_t nodeid)
+{
+	sm_sevent_t *sev;
+	int i, expected, type = smsg->ms_type;
+
+	/* Find the relevant sevent. */
+	sev = find_sevent(smsg->ms_sevent_id);
+	if (!sev) {
+		log_print("process_reply invalid id=%u nodeid=%u",
+			  smsg->ms_sevent_id, nodeid);
+		goto out;
+	}
+
+	/*
+	 * Check if this message type is what this sevent is waiting for.
+	 */
+
+	if (!test_allowed_msgtype(sev, type)) {
+		log_debug(sev->se_sg, "process_reply ignored type=%u nodeid=%u "
+			  "id=%u", type, nodeid, sev->se_id);
+		goto out;
+	}
+
+	/* JOIN_REP is counted against se_node_count, the rest se_memb_count */
+	expected =
+	    (type == SMSG_JOIN_REP) ? sev->se_node_count : sev->se_memb_count;
+
+	SM_ASSERT(expected * sizeof(uint32_t) <= sev->se_len_ids,
+		  printk("type=%d expected=%d len_ids=%d node_count=%d "
+			 "memb_count=%d\n", type, expected, sev->se_len_ids,
+			 sev->se_node_count, sev->se_memb_count););
+
+	SM_ASSERT(expected * sizeof(char) <= sev->se_len_status,
+		  printk("type=%d expected=%d len_status=%d node_count=%d "
+			 "memb_count=%d\n", type, expected, sev->se_len_status,
+			 sev->se_node_count, sev->se_memb_count););
+
+	for (i = 0; i < expected; i++) {
+		if (sev->se_node_ids[i] == nodeid) {
+			/*
+			 * Save the status from the replying node
+			 */
+			if (!sev->se_node_status[i])
+				sev->se_node_status[i] = smsg->ms_status;
+			else {
+				log_error(sev->se_sg, "process_reply duplicate "
+					  "id=%u nodeid=%u %u/%u",
+					  sev->se_id, nodeid,
+					  sev->se_node_status[i],
+					  smsg->ms_status);
+				goto out;
+			}
+
+			if (type == SMSG_JOIN_REP) {
+				save_lastid(smsg);
+
+				if (smsg->ms_status == STATUS_POS)
+					save_global_id(sev, smsg);
+			}
+
+			/*
+			 * Signal sm if we have all replies
+			 */
+
+			if (++sev->se_reply_count == expected) {
+				clear_allowed_msgtype(sev, type);
+				sev->se_state = next_sev_state(type,
+							       sev->se_state);
+				set_bit(SEFL_CHECK, &sev->se_flags);
+				wake_serviced(DO_JOINLEAVE);
+			}
+
+			break;
+		}
+	}
+
+      out:
+	return;
+}
+
+/*
+ * A node wants to join an SG and has run send_join_notice.  If we know nothing
+ * about the SG (or the requested level is out of range), then we have no
+ * objection - send back STATUS_NEG.  If we're a member of the SG, then send
+ * back STATUS_POS (go ahead and join) if there's no sevent or uevent of higher
+ * priority in progress (only a single join or leave is permitted for the SG at
+ * once).  If there happens to be a higher priority sevent/uevent in progress,
+ * send back STATUS_WAIT to defer the requested join for a bit.
+ */
+
+static void process_join_request(sm_msg_t *smsg, uint32_t nodeid, char *name)
+{
+	sm_group_t *sg = NULL;
+	sm_sevent_t *sev = NULL;
+	sm_node_t *node;
+	int found = FALSE;
+	int level = smsg->ms_sglevel;
+	sm_msg_t reply;
+
+	memset(&reply, 0, sizeof(reply));
+
+	down(&sm_sglock);
+	/* own request, or wire-supplied level outside sm_sg[]: not found */
+	if (nodeid == sm_our_nodeid || level < 0 || level >= SG_LEVELS)
+		goto next;
+
+	/*
+	 * search SG list for an SG with given name/len
+	 */
+
+	list_for_each_entry(sg, &sm_sg[level], list) {
+		if ((sg->namelen != smsg->ms_length) ||
+		    memcmp(sg->name, name, sg->namelen))
+			continue;
+		found = TRUE;
+		break;
+	}
+
+	/*
+	 * build reply message
+	 */
+
+      next:
+
+	if (!found) {
+		reply.ms_type = SMSG_JOIN_REP;
+		reply.ms_status = STATUS_NEG;
+		reply.ms_global_lastid = global_last_id;
+		reply.ms_sevent_id = smsg->ms_sevent_id;
+	} else {
+		reply.ms_type = SMSG_JOIN_REP;
+		reply.ms_status = STATUS_POS;
+		reply.ms_sevent_id = smsg->ms_sevent_id;
+		reply.ms_global_sgid = sg->global_id;
+		reply.ms_global_lastid = global_last_id;
+
+		/*
+		 * The node trying to join should wait and try again until
+		 * we're done with recovery.
+		 */
+
+		if (sg->state == SGST_RECOVER) {
+			reply.ms_status = STATUS_WAIT;
+			goto send;
+		}
+
+		/*
+		 * An sevent node trying to join may have gotten as far as
+		 * creating a uevent with us and then backed out.  That node
+		 * will retry joining from the beginning so we should not turn
+		 * them away.  If we're handling a uevent for another node,
+		 * tell the joining node to wait.
+		 */
+
+		if (test_bit(SGFL_UEVENT, &sg->flags)) {
+			if (sg->uevent.ue_nodeid != nodeid)
+				reply.ms_status = STATUS_WAIT;
+			goto send;
+		}
+
+		/*
+		 * We're trying to join or leave the SG at the moment.
+		 */
+
+		if (test_bit(SGFL_SEVENT, &sg->flags)) {
+			sev = sg->sevent;
+
+			/*
+			 * We're trying to leave.  Make the join wait until
+			 * we've left if we're beyond LEAVE_ACKWAIT.
+			 */
+
+			if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
+				if (sev->se_state > SEST_LEAVE_ACKED)
+					reply.ms_status = STATUS_WAIT;
+				else {
+					reply.ms_status = STATUS_POS;
+					clear_bit(SEFL_ALLOW_LEAVE,
+						  &sev->se_flags);
+					set_bit(SEFL_CANCEL, &sev->se_flags);
+				}
+			}
+
+			/*
+			 * We're trying to join.  Making the other join wait
+			 * until we're joined if we're beyond JOIN_ACKWAIT or
+			 * if we have a lower id.  (Send NEG to allow the other
+			 * node to go ahead because we're not in the SG.)
+			 */
+
+			else {
+				if (sev->se_state > SEST_JOIN_ACKED)
+					reply.ms_status = STATUS_WAIT;
+				else if (sm_our_nodeid < nodeid)
+					reply.ms_status = STATUS_WAIT;
+				else {
+					reply.ms_status = STATUS_NEG;
+					clear_bit(SEFL_ALLOW_JOIN,
+						  &sev->se_flags);
+					set_bit(SEFL_CANCEL, &sev->se_flags);
+				}
+			}
+
+			if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
+				set_bit(SEFL_CHECK, &sev->se_flags);
+				wake_serviced(DO_JOINLEAVE);
+			}
+			goto send;
+		}
+
+		/* no r,u,s event, stick with STATUS_POS */
+	}
+
+      send:
+
+	if (reply.ms_status == STATUS_POS) {
+		node = sm_find_joiner(sg, nodeid);
+		if (!node) {
+			node = sm_new_node(nodeid);
+			list_add_tail(&node->list, &sg->joining);
+		}
+	}
+
+	up(&sm_sglock);
+	smsg_bswap_out(&reply);
+	send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
+}
+
+/*
+ * Another node wants us to stop a service so it can join or leave the SG.  We
+ * do this by saving the request info in a uevent and having the sm thread do
+ * the processing and then replying.  A request from ourself is acked at once.
+ */
+
+static void process_stop_request(sm_msg_t *smsg, uint32_t nodeid,
+				 uint32_t *msgbuf)
+{
+	sm_group_t *sg;
+	sm_uevent_t *uev;
+	sm_msg_t reply;
+	int type = smsg->ms_type;
+
+	if (nodeid == sm_our_nodeid)
+		goto agree;
+
+	sg = sm_global_id_to_sg(smsg->ms_global_sgid);
+	if (!sg) {
+		log_print("process_stop_request: unknown sg id %x",
+			  smsg->ms_global_sgid);
+		return;
+	}
+
+	/*
+	 * We shouldn't get here with uevent already set.
+	 */
+	if (test_and_set_bit(SGFL_UEVENT, &sg->flags)) {
+		log_error(sg, "process_stop_request: uevent already set");
+		return;
+	}
+
+	uev = &sg->uevent;
+	uev->ue_nodeid = nodeid;
+	uev->ue_remote_seid = smsg->ms_sevent_id;
+	uev->ue_state = (type == SMSG_JSTOP_REQ) ? UEST_JSTOP : UEST_LSTOP;
+	/* only a join-stop reads the payload: a node count, big-endian */
+	if (type == SMSG_JSTOP_REQ)
+		uev->ue_num_nodes = be32_to_cpu(*msgbuf);
+	else
+		set_bit(UEFL_LEAVE, &uev->ue_flags);
+
+	/*
+	 * Do process_join_stop() or process_leave_stop().
+	 */
+	set_bit(UEFL_CHECK, &uev->ue_flags);
+	wake_serviced(DO_MEMBERSHIP);
+	return;
+
+      agree:
+	/* zero first: this path leaves several header fields unassigned */
+	memset(&reply, 0, sizeof(reply));
+	reply.ms_status = STATUS_POS;
+	reply.ms_type =
+	    (type == SMSG_JSTOP_REQ) ? SMSG_JSTOP_REP : SMSG_LSTOP_REP;
+	reply.ms_sevent_id = smsg->ms_sevent_id;
+	smsg_bswap_out(&reply);
+	send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
+}
+
+/*
+ * A JSTART/LSTART command moves the uevent that is open on the SG to its
+ * start state and wakes serviced to act on it.  Commands that carry our own
+ * node id are ignored.
+ */
+static void process_start_request(sm_msg_t *smsg, uint32_t nodeid)
+{
+	sm_group_t *sg;
+
+	if (nodeid == sm_our_nodeid)
+		return;
+
+	sg = sm_global_id_to_sg(smsg->ms_global_sgid);
+	if (!sg) {
+		log_print("process_start_request: unknown sg id %x",
+			  smsg->ms_global_sgid);
+		return;
+	}
+
+	if (!test_bit(SGFL_UEVENT, &sg->flags)) {
+		log_error(sg, "process_start_request: no uevent");
+		return;
+	}
+
+	/* anything other than SMSG_JSTART_CMD is treated as an LSTART */
+	sg->uevent.ue_state =
+	    (smsg->ms_type == SMSG_JSTART_CMD) ? UEST_JSTART : UEST_LSTART;
+
+	set_bit(UEFL_CHECK, &sg->uevent.ue_flags);
+	wake_serviced(DO_MEMBERSHIP);
+}
+
+/* A node wants to leave an SG.  The reply is STATUS_NEG when we don't know
+ * the SG or the node isn't a member of it; otherwise the rules below apply. */
+static void process_leave_request(sm_msg_t *smsg, uint32_t nodeid)
+{
+	sm_group_t *sg;
+	sm_node_t *node;
+	sm_msg_t reply;
+	sm_sevent_t *sev;
+	int found = FALSE;
+
+	/* zero the reply: not every header field is assigned below */
+	memset(&reply, 0, sizeof(reply));
+
+	sg = sm_global_id_to_sg(smsg->ms_global_sgid);
+	if (sg) {
+		if (nodeid == sm_our_nodeid)
+			found = TRUE;
+		else {
+			list_for_each_entry(node, &sg->memb, list) {
+				if (node->id != nodeid)
+					continue;
+				set_bit(SNFL_LEAVING, &node->flags);
+				found = TRUE;
+				break;
+			}
+		}
+	}
+
+	if (!found) {
+		reply.ms_type = SMSG_LEAVE_REP;
+		reply.ms_status = STATUS_NEG;
+		reply.ms_sevent_id = smsg->ms_sevent_id;
+	} else {
+		reply.ms_type = SMSG_LEAVE_REP;
+		reply.ms_status = STATUS_POS;
+		reply.ms_sevent_id = smsg->ms_sevent_id;
+
+		if (sg->state == SGST_RECOVER)
+			reply.ms_status = STATUS_WAIT;
+		else if (test_bit(SGFL_SEVENT, &sg->flags) &&
+			 nodeid != sm_our_nodeid) {
+			sev = sg->sevent;
+			/*
+			 * We're trying to join or leave at the moment.  If
+			 * we're past JOIN/LEAVE_ACKWAIT, we make the requestor
+			 * wait.  Otherwise, if joining we'll cancel to let the
+			 * leave happen first, or if we're leaving allow the
+			 * lower nodeid to leave first.
+			 */
+			if (test_bit(SEFL_LEAVE, &sev->se_flags)) {
+				if (sev->se_state > SEST_LEAVE_ACKWAIT)
+					reply.ms_status = STATUS_WAIT;
+				else if (sm_our_nodeid < nodeid)
+					reply.ms_status = STATUS_WAIT;
+				else {
+					reply.ms_status = STATUS_POS;
+					clear_bit(SEFL_ALLOW_LEAVE,
+						  &sev->se_flags);
+					set_bit(SEFL_CANCEL, &sev->se_flags);
+				}
+			} else {
+				if (sev->se_state > SEST_JOIN_ACKWAIT)
+					reply.ms_status = STATUS_WAIT;
+				else {
+					reply.ms_status = STATUS_NEG;
+					clear_bit(SEFL_ALLOW_JOIN,
+						  &sev->se_flags);
+					set_bit(SEFL_CANCEL, &sev->se_flags);
+				}
+			}
+
+			if (test_bit(SEFL_CANCEL, &sev->se_flags)) {
+				set_bit(SEFL_CHECK, &sev->se_flags);
+				wake_serviced(DO_JOINLEAVE);
+			}
+		}
+		else if (test_bit(SGFL_UEVENT, &sg->flags)) {
+			if (sg->uevent.ue_nodeid != nodeid)
+				reply.ms_status = STATUS_WAIT;
+		}
+	}
+
+	smsg_bswap_out(&reply);
+	send_nodeid_message((char *) &reply, sizeof(reply), nodeid);
+}
+
+/*
+ * Each remaining node will send us a done message.  The first one moves the
+ * sevent on from SEST_LSTART_WAITREMOTE; the later ones find it already
+ * moved on (or gone) and are ignored.
+ */
+
+static void process_lstart_done(sm_msg_t *smsg, uint32_t nodeid)
+{
+	sm_sevent_t *sev = find_sevent(smsg->ms_sevent_id);
+
+	/*
+	 * No sevent, or one that is not waiting for the remote nodes:
+	 * nothing to do.
+	 */
+	if (!sev || sev->se_state != SEST_LSTART_WAITREMOTE)
+		return;
+
+	sev->se_state = SEST_LSTART_REMOTEDONE;
+	set_bit(SEFL_CHECK, &sev->se_flags);
+	wake_serviced(DO_JOINLEAVE);
+}
+
+/*
+ * This function and everything it calls always runs in sm context.
+ */
+
+static void process_message(char *msg, uint32_t nodeid)
+{
+	sm_msg_t smsg;
+	char *payload = msg + sizeof(sm_msg_t);	/* data after the header */
+
+	smsg_copy_in(msg, &smsg);
+
+	switch (smsg.ms_type) {
+	/* requests from a joining or leaving node */
+	case SMSG_JOIN_REQ:
+		process_join_request(&smsg, nodeid, payload);
+		break;
+	case SMSG_LEAVE_REQ:
+		process_leave_request(&smsg, nodeid);
+		break;
+	case SMSG_JSTOP_REQ:
+		process_stop_request(&smsg, nodeid, (uint32_t *) payload);
+		break;
+	case SMSG_LSTOP_REQ:
+		/* no payload pointer is handed over for a leave-stop */
+		process_stop_request(&smsg, nodeid, NULL);
+		break;
+
+	/* start commands and their completion */
+	case SMSG_JSTART_CMD:
+	case SMSG_LSTART_CMD:
+		process_start_request(&smsg, nodeid);
+		break;
+	case SMSG_LSTART_DONE:
+		process_lstart_done(&smsg, nodeid);
+		break;
+
+	/* replies to requests this node sent */
+	case SMSG_JOIN_REP:
+	case SMSG_JSTOP_REP:
+	case SMSG_LEAVE_REP:
+	case SMSG_LSTOP_REP:
+		process_reply(&smsg, nodeid);
+		break;
+
+	case SMSG_RECOVER:
+		process_recover_msg(&smsg, nodeid);
+		break;
+
+	default:
+		log_print("process_message: unknown type %u nodeid %u",
+			  smsg.ms_type, nodeid);
+	}
+}
+
+/*
+ * Drain the receive queue, handing each message to process_message().
+ * Always called from sm context.
+ */
+void process_messages(void)
+{
+	rq_entry_t *head;
+
+	for (;;) {
+		/* detach the oldest entry; the lock only covers the list */
+		spin_lock(&message_lock);
+		if (list_empty(&messages)) {
+			spin_unlock(&message_lock);
+			return;
+		}
+		head = list_entry(messages.next, rq_entry_t, list);
+		list_del(&head->list);
+		spin_unlock(&message_lock);
+
+		process_message(head->msg, head->nodeid);
+		kfree(head->msg);
+		kfree(head);
+		schedule();	/* yield between messages */
+	}
+}
+
+
+/*
+ * Context: cnxman and sm
+ * Queue a private copy of msg for process_messages() and wake serviced.
+ */
+static int add_to_recvqueue(char *msg, int len, uint32_t nodeid)
+{
+	rq_entry_t *entry;
+
+	SM_RETRY(entry = kmalloc(sizeof(*entry), GFP_KERNEL), entry);
+	entry->nodeid = nodeid;
+	entry->len = len;
+
+	SM_RETRY(entry->msg = kmalloc(len, GFP_KERNEL), entry->msg);
+	memcpy(entry->msg, msg, len);
+
+	/* appended at the tail, so the queue is drained in arrival order */
+	spin_lock(&message_lock);
+	list_add_tail(&entry->list, &messages);
+	spin_unlock(&message_lock);
+
+	wake_serviced(DO_MESSAGES);
+	return 0;	/* always succeeds */
+}
+
+/*
+ * Context: cnxman
+ * Called by cnxman when a service manager message arrives.  Returns -EINVAL
+ * for a zero node id, or for a message too short to hold the sm_msg_t header
+ * that process_message() decodes without knowing the length.
+ */
+int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
+		       unsigned int node_id)
+{
+	if (!node_id || len < (int) sizeof(sm_msg_t))
+		return -EINVAL;
+	return add_to_recvqueue(msg, len, node_id);
+}
+
+/*
+ * These send routines are used by sm and are always called from sm context.
+ */
+
+int send_nodeid_message(char *msg, int len, uint32_t nodeid)
+{
+	struct sockaddr_cl saddr;
+	int rv;
+
+	/* a message to ourself goes straight onto our own receive queue */
+	if (nodeid == sm_our_nodeid) {
+		add_to_recvqueue(msg, len, nodeid);
+		return 0;
+	}
+
+	saddr.scl_nodeid = nodeid;
+	saddr.scl_port = CLUSTER_PORT_SERVICES;
+	saddr.scl_family = AF_CLUSTER;
+
+	rv = kcl_sendmsg(sm_socket, msg, len, &saddr, sizeof(saddr), 0);
+	if (rv >= 0)
+		return 0;	/* positive results are not passed on */
+
+	log_print("send_nodeid_message error %d to %u", rv, nodeid);
+	return rv;
+}
+
+/* Hand msg to kcl_sendmsg() with no destination address and queue a copy
+ * for ourself.  Returns 0, or the negative error from kcl_sendmsg(). */
+int send_broadcast_message(char *msg, int len)
+{
+	int rv = kcl_sendmsg(sm_socket, msg, len, NULL, 0, 0);
+
+	/* our own copy is queued whether or not the send worked */
+	add_to_recvqueue(msg, len, sm_our_nodeid);
+
+	if (rv >= 0)
+		return 0;
+
+	log_print("send_broadcast_message error %d", rv);
+	return rv;
+}
+
+/* Send msg to each node on sg->memb in turn, stopping at the first failure. */
+int send_members_message(sm_group_t *sg, char *msg, int len)
+{
+	sm_node_t *target;
+
+	list_for_each_entry(target, &sg->memb, list) {
+		int rv = send_nodeid_message(msg, len, target->id);
+		if (rv < 0)
+			return rv;
+	}
+	return 0;
+}
+
+/* As send_members_message(), but first arm sev to collect the replies. */
+int send_members_message_sev(sm_group_t *sg, char *msg, int len,
+			     sm_sevent_t * sev)
+{
+	int type = ((sm_msg_t *) msg)->ms_type;
+	int rv;
+
+	sev->se_reply_count = 0;
+	set_allowed_msgtype(sev, type);
+
+	rv = send_members_message(sg, msg, len);
+	if (rv < 0)
+		clear_allowed_msgtype(sev, type);	/* send failed */
+	return rv;
+}
+
+/* As send_broadcast_message(), but first arm sev to collect the replies. */
+int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev)
+{
+	int type = ((sm_msg_t *) msg)->ms_type;
+	int rv;
+
+	sev->se_reply_count = 0;
+	set_allowed_msgtype(sev, type);
+
+	rv = send_broadcast_message(msg, len);
+	if (rv < 0)
+		clear_allowed_msgtype(sev, type);	/* send failed */
+	return rv;
+}
--- linux-2.6.9.orig/cluster/cman/sm_message.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_message.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,34 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SM_MESSAGE_DOT_H__
+#define __SM_MESSAGE_DOT_H__
+
+void init_messages(void);
+uint32_t sm_new_global_id(int level);
+void smsg_bswap_out(sm_msg_t * smsg);
+char *create_smsg(sm_group_t *sg, int type, int datalen, int *msglen,
+		  sm_sevent_t *sev);
+/* Receive path: sm_cluster_message() queues, process_messages() drains */
+void process_messages(void);
+int sm_cluster_message(char *msg, int len, char *addr, int addr_len,
+		       unsigned int node_id);
+/* Send path; the _sev variants also arm the sevent for the replies */
+int send_nodeid_message(char *msg, int len, uint32_t nodeid);
+int send_broadcast_message(char *msg, int len);
+int send_broadcast_message_sev(char *msg, int len, sm_sevent_t * sev);
+int send_members_message(sm_group_t *sg, char *msg, int len);
+int send_members_message_sev(sm_group_t *sg, char *msg, int len,
+			     sm_sevent_t * sev);
+
+#endif
--- linux-2.6.9.orig/cluster/cman/sm_misc.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_misc.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,454 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "sm.h"
+#include "config.h"
+#include <linux/seq_file.h>
+
+#define MAX_DEBUG_MSG_LEN	(40)	/* longest line sm_debug_log() stores */
+
+extern struct list_head sm_members;
+static uint32_t		local_ids;	/* counter behind sm_new_local_id() */
+static uint32_t		event_id;	/* counter behind sm_set_event_id() */
+static spinlock_t	event_id_lock;	/* protects event_id */
+static char *		debug_buf;	/* debug ring buffer, NULL if none */
+static unsigned int	debug_size;	/* bytes in debug_buf */
+static unsigned int	debug_point;	/* next write offset in debug_buf */
+static int		debug_wrap;	/* set once debug_buf has wrapped */
+static spinlock_t	debug_lock;	/* protects the debug_* state */
+
+
+void init_sm_misc(void)
+{
+	/* id counters start at 1 */
+	spin_lock_init(&event_id_lock);
+	event_id = 1;
+	local_ids = 1;
+	/* debug log: empty until sm_debug_setup() installs a buffer */
+	spin_lock_init(&debug_lock);
+	debug_buf = NULL;
+	debug_size = debug_point = 0;
+	debug_wrap = 0;
+	sm_debug_setup(cman_config.sm_debug_size);
+}
+
+/* Allocate a zeroed sm_node_t for nodeid, stamped with its incarnation. */
+sm_node_t *sm_new_node(uint32_t nodeid)
+{
+	struct kcl_cluster_node kclnode;
+	sm_node_t *fresh;
+	int error = kcl_get_node_by_nodeid(nodeid, &kclnode);
+
+	/* the lookup is asserted to succeed */
+	SM_ASSERT(!error, printk("error = %d, nodeid = %u\n", error, nodeid););
+
+	SM_RETRY(fresh = kmalloc(sizeof(*fresh), GFP_KERNEL), fresh);
+	memset(fresh, 0, sizeof(*fresh));
+
+	fresh->incarnation = kclnode.incarnation;
+	fresh->id = nodeid;
+	return fresh;
+}
+
+/* Look nodeid up on sg's joining list; NULL if it is not there. */
+sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid)
+{
+	sm_node_t *cur;
+
+	list_for_each_entry(cur, &sg->joining, list)
+		if (nodeid == cur->id)
+			return cur;
+	return NULL;
+}
+
+/* Find nodeid on sm_members; a miss is logged and NULL is returned. */
+sm_node_t *sm_find_member(uint32_t nodeid)
+{
+	sm_node_t *cur;
+
+	list_for_each_entry(cur, &sm_members, list)
+		if (nodeid == cur->id)
+			return cur;
+	log_print("sm_find_member %u failed", nodeid);
+	return NULL;
+}
+
+/* Returns (level << 24) | next local id, or 0 if either is out of range. */
+uint32_t sm_new_local_id(int level)
+{
+	/* The counter advances even when 0 (failure) is returned. */
+	uint32_t id = local_ids++;
+
+	/* level must fit in the top byte, the counter in the low 24 bits */
+	if (level < 0 || level > 0xFF || id > 0x00FFFFFF)
+		return 0;
+
+	/* Widen before shifting: a promoted uint8_t << 24 can overflow int. */
+	id |= (uint32_t) level << 24;
+	return id;
+}
+
+/* The SG level is carried in the top byte of a local or global id. */
+int sm_id_to_level(uint32_t id)
+{
+	/* a uint32_t shifted right by 24 leaves only that byte (0..255) */
+	return (int) (id >> 24);
+}
+
+void sm_set_event_id(unsigned int *id)
+{
+	spin_lock(&event_id_lock);	/* serialises use of the counter */
+	*id = event_id++;		/* hand out the value, then advance */
+	spin_unlock(&event_id_lock);
+}
+
+/* Find the SG with this local id; NULL if there is none. */
+sm_group_t *sm_local_id_to_sg(int id)
+{
+	sm_group_t *sg, *match = NULL;
+	int level = sm_id_to_level(id);
+
+	/* the level byte can be 0..255; sm_sg[] is walked below SG_LEVELS */
+	if (level >= SG_LEVELS)
+		return NULL;
+	down(&sm_sglock);
+	list_for_each_entry(sg, &sm_sg[level], list) {
+		if (sg->local_id == id) {
+			match = sg;
+			break;
+		}
+	}
+	up(&sm_sglock);
+	return match;
+}
+
+/* Find the SG with this global id; NULL if there is none. */
+sm_group_t *sm_global_id_to_sg(int id)
+{
+	sm_group_t *sg, *match = NULL;
+	int level = sm_id_to_level(id);
+
+	/* ids arrive in messages: reject a level byte outside sm_sg[] */
+	if (level >= SG_LEVELS)
+		return NULL;
+	down(&sm_sglock);
+	list_for_each_entry(sg, &sm_sg[level], list) {
+		if (sg->global_id == id) {
+			match = sg;
+			break;
+		}
+	}
+	up(&sm_sglock);
+	return match;
+}
+
+/*
+ * Append "<global id> <formatted message>\n" to the debug ring buffer, cut
+ * to at most MAX_DEBUG_MSG_LEN bytes.  Does nothing without a buffer.
+ */
+void sm_debug_log(sm_group_t *sg, const char *fmt, ...)
+{
+	va_list va;
+	int i, n, len;
+	char buf[MAX_DEBUG_MSG_LEN+1];
+
+	spin_lock(&debug_lock);
+
+	/* a zero-sized buffer would never reach the wrap test below */
+	if (!debug_buf || !debug_size)
+		goto out;
+
+	memset(buf, 0, sizeof(buf));
+	n = snprintf(buf, MAX_DEBUG_MSG_LEN, "%08x ", sg->global_id);
+
+	va_start(va, fmt);
+	vsnprintf(buf+n, MAX_DEBUG_MSG_LEN - n, fmt, va);
+	va_end(va);
+
+	len = strlen(buf);
+	if (len > MAX_DEBUG_MSG_LEN-1)
+		len = MAX_DEBUG_MSG_LEN-1;
+	buf[len++] = '\n';	/* len now counts the newline too */
+
+	for (i = 0; i < len; i++) {
+		debug_buf[debug_point++] = buf[i];
+		if (debug_point == debug_size) {
+			debug_point = 0;
+			debug_wrap = 1;
+		}
+	}
+ out:
+	spin_unlock(&debug_lock);
+}
+
+void sm_debug_setup(int size)
+{
+	char *b = NULL;
+
+	/* Clamp before allocating; no buffer (logging off) if that fails */
+	if (size > (int) PAGE_SIZE)
+		size = PAGE_SIZE;
+	if (size > 0 && (b = kmalloc(size, GFP_KERNEL)))
+		memset(b, 0, size);
+	spin_lock(&debug_lock);
+	kfree(debug_buf);		/* kfree(NULL) is a no-op */
+	debug_size = b ? size : 0;
+	debug_point = 0;
+	debug_wrap = 0;
+	debug_buf = b;
+	spin_unlock(&debug_lock);
+}
+
+#ifdef CONFIG_PROC_FS
+static struct seq_operations sm_info_op;
+/* Cursor kept in seq_file->private while the services list is generated. */
+struct sm_seq_info
+{
+    int pos;		/* current seq_file position; 0 is the header line */
+    int level;		/* level reported by sm_walk() for sg */
+    sm_group_t *sg;	/* SG to show, NULL if none */
+};
+
+/* Copy the debug log, oldest byte first, into b; returns the byte count. */
+int sm_debug_info(char *b, char **start, off_t offset, int length)
+{
+	unsigned int i;
+	int n = 0;
+
+	spin_lock(&debug_lock);
+	/* stored byte-wise: no terminating NUL is written past the data */
+	if (debug_wrap) {
+		for (i = debug_point; i < debug_size; i++)
+			b[n++] = debug_buf[i];
+	}
+	for (i = 0; i < debug_point; i++)
+		b[n++] = debug_buf[i];
+	spin_unlock(&debug_lock);
+	return n;
+}
+
+
+
+/* Return the offset'th SG over all levels (1-based) and its level, or NULL
+ * with *rlevel == SG_LEVELS when there are fewer SGs than that. */
+static sm_group_t *sm_walk(loff_t offset, int *rlevel)
+{
+	sm_group_t *sg = NULL;
+	loff_t left = offset;
+	int level;
+
+	down(&sm_sglock);
+	for (level = 0; level < SG_LEVELS; level++) {
+		list_for_each_entry(sg, &sm_sg[level], list) {
+			if (--left == 0)
+				goto unlock;
+		}
+	}
+	sg = NULL;
+ unlock:
+	up(&sm_sglock);
+	*rlevel = level;
+
+	return sg;
+}
+
+
+/*
+ * seq_file start: (re)use the cursor hung off m->private.  Position 0 emits
+ * the column header and has no SG; position n > 0 looks up the n'th SG.
+ */
+static void *sm_seq_start(struct seq_file *m, loff_t * pos)
+{
+	struct sm_seq_info *ssi = m->private;
+
+	if (!ssi) {
+		ssi = kmalloc(sizeof(*ssi), GFP_KERNEL);
+		m->private = ssi;
+		if (!ssi)
+			return NULL;
+	}
+
+	ssi->sg = NULL;
+	ssi->level = 0;
+	ssi->pos = *pos;
+
+	if (*pos != 0) {
+		ssi->sg = sm_walk(ssi->pos, &ssi->level);
+		return ssi;
+	}
+	seq_printf(m,
+		   "Service          Name                              GID LID State     Code\n");
+	return ssi;
+}
+
+/* seq_file next: step the cursor to the following SG; NULL when none. */
+static void *sm_seq_next(struct seq_file *m, void *p, loff_t * pos)
+{
+	struct sm_seq_info *ssi = p;
+
+	ssi->pos++;
+	*pos = ssi->pos;
+	ssi->sg = sm_walk(ssi->pos, &ssi->level);
+
+	return ssi->sg ? ssi : NULL;
+}
+
+/* open() handler for /proc/cluster/services: attach the sm_info_op iterator */
+int sm_proc_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &sm_info_op);
+}
+
+static int sm_seq_show(struct seq_file *s, void *p)
+{
+    struct sm_seq_info *ssi = p;
+    sm_node_t *node;
+    int i;
+    /* Print one SG: a summary line followed by its bracketed member list. */
+    if (!ssi || !ssi->sg)
+	    return 0;	/* no SG at this position (e.g. the header position) */
+
+    /*
+     * Cluster Service: label chosen by the level stored in the cursor
+     */
+
+    switch (ssi->level) {
+    case SERVICE_LEVEL_FENCE:
+	seq_printf(s, "Fence Domain:    ");
+	break;
+    case SERVICE_LEVEL_GDLM:
+	seq_printf(s, "DLM Lock Space:  ");
+	break;
+    case SERVICE_LEVEL_GFS:
+	seq_printf(s, "GFS Mount Group: ");
+	break;
+    case SERVICE_LEVEL_USER:
+	seq_printf(s, "User:            ");
+	break;
+    }
+
+    /*
+     * Name: namelen bytes in quotes, space-padded up to the next column
+     */
+
+    seq_printf(s, "\"");
+    for (i = 0; i < ssi->sg->namelen; i++)
+	    seq_printf(s, "%c", ssi->sg->name[i]);
+    seq_printf(s, "\"");
+
+    for (; i < MAX_SERVICE_NAME_LEN-1; i++)
+	seq_printf(s, " ");
+
+    /*
+     * GID LID (sans level from top byte)
+     */
+
+    seq_printf(s, "%3u %3u ",
+	       (ssi->sg->global_id & 0x00FFFFFF),
+	       (ssi->sg->local_id & 0x00FFFFFF));
+
+    /*
+     * State (an unlisted state value prints nothing)
+     */
+
+    switch (ssi->sg->state) {
+    case SGST_NONE:
+	seq_printf(s, "none      ");
+	break;
+    case SGST_JOIN:
+	seq_printf(s, "join      ");
+	break;
+    case SGST_RUN:
+	seq_printf(s, "run       ");
+	break;
+    case SGST_RECOVER:
+	seq_printf(s, "recover %u ",
+		   ssi->sg->recover_state);
+	break;
+    case SGST_UEVENT:
+	seq_printf(s, "update    ");
+	break;
+    }
+
+    /*
+     * Code: S/U/N flag letters, '-', then sevent and/or uevent details
+     */
+
+    if (test_bit(SGFL_SEVENT, &ssi->sg->flags))
+	    seq_printf(s, "S");
+    if (test_bit(SGFL_UEVENT, &ssi->sg->flags))
+	    seq_printf(s, "U");
+    if (test_bit(SGFL_NEED_RECOVERY, &ssi->sg->flags))
+	    seq_printf(s, "N");
+
+    seq_printf(s, "-");
+
+    if (test_bit(SGFL_SEVENT, &ssi->sg->flags)
+	&& ssi->sg->sevent) {
+	seq_printf(s, "%u,%lx,%u",
+		   ssi->sg->sevent->se_state,
+		   ssi->sg->sevent->se_flags,
+		   ssi->sg->sevent->se_reply_count);
+    }
+
+    if (test_bit(SGFL_UEVENT, &ssi->sg->flags)) {
+	seq_printf(s, "%u,%lx,%u",
+		   ssi->sg->uevent.ue_state,
+		   ssi->sg->uevent.ue_flags,
+		   ssi->sg->uevent.ue_nodeid);
+    }
+
+    seq_printf(s, "\n");
+
+    /*
+     * node list: member ids, space separated, a newline after every 24
+     */
+
+    i = 0;
+
+    seq_printf(s, "[");
+
+    list_for_each_entry(node, &ssi->sg->memb, list) {
+	    if (i && !(i % 24))
+	            seq_printf(s, "\n");
+
+	    if (i)
+	            seq_printf(s, " ");
+
+	seq_printf(s, "%u", node->id);
+	i++;
+    }
+
+    seq_printf(s, "]\n\n");
+
+    return 0;
+}
+
+/* seq_file stop: release the cursor allocated by sm_seq_start(), if any. */
+static void sm_seq_stop(struct seq_file *m, void *p)
+{
+	/* kfree() ignores NULL, so no check of m->private is needed */
+	kfree(m->private);
+	m->private = NULL;
+}
+
+
+static struct seq_operations sm_info_op = {
+	.start	= sm_seq_start,	/* allocate/position the cursor */
+	.show	= sm_seq_show,	/* print the SG under the cursor */
+	.next	= sm_seq_next,	/* advance to the following SG */
+	.stop	= sm_seq_stop,	/* free the cursor */
+};
+
+
+#endif
--- linux-2.6.9.orig/cluster/cman/sm_misc.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_misc.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,29 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SM_MISC_DOT_H__
+#define __SM_MISC_DOT_H__
+
+void init_sm_misc(void);
+sm_node_t *sm_new_node(uint32_t nodeid);
+sm_node_t *sm_find_joiner(sm_group_t *sg, uint32_t nodeid);
+sm_node_t *sm_find_member(uint32_t nodeid);
+uint32_t sm_new_local_id(int level);
+int sm_id_to_level(uint32_t id);
+void sm_set_event_id(unsigned int *id);
+sm_group_t *sm_local_id_to_sg(int id);
+sm_group_t *sm_global_id_to_sg(int id);
+void sm_debug_log(sm_group_t *sg, const char *fmt, ...);
+void sm_debug_setup(int size);
+
+#endif
--- linux-2.6.9.orig/cluster/cman/sm_recover.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_recover.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,577 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "sm.h"
+#include "config.h"
+
+/*
+ * A collection of sg's which need to be recovered due to a failed member.
+ * These sg's are recovered in order of level.  An sg subject to cascading
+ * failures is moved from one of these structs to a newer one.
+ */
+
+/*
+ * There is a bug in the way SM manages multiple recovery events.
+ * A specific arrangement would be required to see this:
+ *
+ * - nodes A,B,C,D,E are in the cluster
+ * - nodes A,B,C,D,E are in the fence domain (FD)
+ * - A,B,C are using gfs X
+ * - C,D,E are using gfs Y
+ *
+ * The bug is possible on node C if A fails creating a
+ * recovery event for X (rev1), and just after that D fails
+ * creating a recovery event for Y (rev2).  If the two nodes
+ * fail at once there won't be a problem because a single
+ * recovery event will be created.  The timing of the
+ * consecutive failures would need to be just right.
+ *
+ * The problem arises when the group representing the
+ * fence domain (FD) is moved from rev1 into rev2.  This
+ * makes the groups in rev2 depend on FD recovery, but
+ * removes the dependency of rev1 groups on FD recovery.
+ * In actual fact, both rev1 and rev2 groups depend on
+ * FD recovery, but the code has no way right now to
+ * make two rev's depend on the same group.
+ *
+ * When the FD dependency is removed from rev1, recovery
+ * for the higher level groups in rev1 (which are the dlm
+ * and gfs groups for X) goes ahead without waiting for
+ * FD recovery to finish.
+ *
+ * Both A and D will still be fenced, and given how recovery
+ * works it's likely to happen before gfs recovery on X
+ * begins.  But, if gfs-X recovery happens to start before
+ * A is fenced, and A isn't really dead and comes back to
+ * life and writes to X, then X could be corrupted.  If
+ * manual fencing is used, then it becomes very likely that
+ * recovery for gfs-X happens before A is fenced, and you
+ * have to hope A won't come back to life and write to X.
+ */
+
+struct recover {
+	struct list_head	list;		/* entry on the recoveries list */
+	struct list_head	sgs[SG_LEVELS];	/* sg's to recover, by level */
+	int			event_id;	/* set by sm_set_event_id() */
+	int			cur_level;	/* level now being recovered */
+};
+typedef struct recover recover_t;
+
+
+extern uint32_t *	sm_new_nodeids;
+extern int		sm_quorum, sm_quorum_next;
+extern uint32_t		sm_our_nodeid;
+extern struct list_head	sm_members;
+extern int		sm_member_count;
+static struct list_head	recoveries;
+
+
+void init_recovery(void)
+{
+	INIT_LIST_HEAD(&recoveries);	/* start with no recovery events */
+}
+
+int no_recoveries(void)
+{
+	/* TRUE when no recovery event is queued on the recoveries
+	 * list, FALSE while at least one is still outstanding. */
+	return list_empty(&recoveries) ? TRUE : FALSE;
+}
+
+/* 
+ * This is the first thing called when a change is announced in cluster
+ * membership.  Nodes are marked as being a CLUSTER_MEMBER or not.  SM adds new
+ * nodes to its sm_members list which it's not seen before.  Nodes which were
+ * alive but are now gone are marked as "need recovery".
+ *
+ * The "need recovery" status of nodes is propagated to the node's SG's in
+ * mark_effected_sgs.  The effected SG's are themselves marked as needing
+ * recovery and in new_recovery the dead nodes are removed from the SG's
+ * individual member lists.  The "need recovery" status of nodes is cleared in
+ * adjust_members_done().
+ */
+
+static int adjust_members(void)
+{
+	sm_node_t *node;
+	struct kcl_cluster_node knode;
+	int i, error, num_nodes, sub = 0, add = 0, found;
+
+	/*
+	 * Get list of current members from cnxman (buffer is zeroed first)
+	 */
+
+	memset(sm_new_nodeids, 0, cman_config.max_nodes * sizeof(uint32_t));
+	num_nodes = kcl_get_member_ids(sm_new_nodeids, cman_config.max_nodes);
+
+	/*
+	 * Determine who's gone; ids matched here are zeroed in sm_new_nodeids
+	 */
+
+	list_for_each_entry(node, &sm_members, list) {
+		found = FALSE;
+		for (i = 0; i < num_nodes; i++) {
+			if (node->id == sm_new_nodeids[i]) {
+				found = TRUE;
+				sm_new_nodeids[i] = 0;
+				break;
+			}
+		}
+
+		if (found) {
+			error = kcl_get_node_by_nodeid(node->id, &knode);
+			SM_ASSERT(!error, printk("error=%d\n", error););
+
+			if (!test_bit(SNFL_CLUSTER_MEMBER, &node->flags)) {
+				/* former member is back */
+				set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
+				node->incarnation = knode.incarnation;
+				add++;
+			} else {
+				/* current member is still alive - if the
+				 * incarnation number is different it died and
+				 * returned between checks */
+				if (node->incarnation != knode.incarnation) {
+					set_bit(SNFL_NEED_RECOVERY,
+						&node->flags);
+					node->incarnation = knode.incarnation;
+					sub++;
+				}
+			}
+		} else {
+			/* current member has died */
+			if (test_and_clear_bit(SNFL_CLUSTER_MEMBER,
+					       &node->flags)) {
+				set_bit(SNFL_NEED_RECOVERY, &node->flags);
+				sub++;
+			}
+		}
+	}
+
+	/*
+	 * Look for new nodes: ids still non-zero matched no known member
+	 */
+
+	for (i = 0; i < num_nodes; i++) {
+		if (sm_new_nodeids[i]) {
+			node = sm_new_node(sm_new_nodeids[i]);
+			set_bit(SNFL_CLUSTER_MEMBER, &node->flags);
+			add++;
+			list_add_tail(&node->list, &sm_members);
+			sm_member_count++;
+		}
+	}
+
+	/*
+	 * Get our own nodeid (skipped once sm_our_nodeid is non-zero)
+	 */
+
+	if (!sm_our_nodeid) {
+		list_for_each_entry(node, &sm_members, list) {
+			error = kcl_get_node_by_nodeid(node->id, &knode);
+			SM_ASSERT(!error, printk("error=%d\n", error););
+
+			if (knode.us) {
+				sm_our_nodeid = knode.node_id;
+				break;
+			}
+		}
+	}
+
+	return sub;	/* nodes newly flagged SNFL_NEED_RECOVERY */
+}
+
+/*
+ * Given some number of dead nodes, flag SG's the dead nodes were part of.
+ * This requires a number of loops because each node structure does not keep a
+ * list of SG's it's in.
+ */
+
+static int mark_effected_sgs(void)
+{
+	sm_group_t *sg;
+	sm_node_t *node, *sgnode;
+	uint32_t dead_id;
+	int i, effected = 0;
+
+	down(&sm_sglock);
+
+	list_for_each_entry(node, &sm_members, list) {
+		if (!test_bit(SNFL_NEED_RECOVERY, &node->flags))
+			continue;
+
+		dead_id = node->id;
+
+		for (i = 0; i < SG_LEVELS; i++) {
+			list_for_each_entry(sg, &sm_sg[i], list) {
+				/* flag sg if the dead node is a member */
+				list_for_each_entry(sgnode, &sg->memb, list) {
+					if (sgnode->id == dead_id) {
+						set_bit(SGFL_NEED_RECOVERY,
+							&sg->flags);
+						effected++;
+						break;
+					}
+				}
+				schedule();
+			}
+		}
+		schedule();
+	}
+	up(&sm_sglock);
+
+	return effected;	/* one per (dead node, sg) match */
+}
+
+static recover_t *alloc_recover(void)
+{
+	recover_t *new_rev;
+	int level;
+
+	/* SM_RETRY presumably repeats the allocation until it succeeds, so
+	 * the result is used without a NULL check -- confirm in sm.h. */
+	SM_RETRY(new_rev = kmalloc(sizeof(*new_rev), GFP_KERNEL), new_rev);
+	memset(new_rev, 0, sizeof(*new_rev));
+
+	sm_set_event_id(&new_rev->event_id);
+
+	for (level = 0; level < SG_LEVELS; level++)
+		INIT_LIST_HEAD(&new_rev->sgs[level]);
+
+	return new_rev;
+}
+
+/*
+ * An in-progress revent re-start for an SG is interrupted by another node
+ * failure in the SG.  Cancel an outstanding barrier if there is one.  The SG
+ * will be moved to the new revent and re-started as part of that.
+ */
+
+static void cancel_prev_recovery(sm_group_t *sg)
+{
+	int rv;
+
+	if (sg->recover_state != RECOVER_BARRIERWAIT)
+		return;		/* no barrier outstanding to cancel */
+	rv = kcl_barrier_cancel(sg->recover_barrier);
+	if (rv)
+		log_error(sg, "cancel_prev_recovery: error %d", rv);
+}
+
+static void pre_recover_sg(sm_group_t *sg, recover_t *rev)
+{
+	if (sg->state == SGST_RECOVER) {	/* already on an older rev */
+		cancel_prev_recovery(sg);
+		list_del(&sg->recover_list);
+	}
+
+	sg->ops->stop(sg->service_data);	/* service stop callback */
+	sg->state = SGST_RECOVER;
+	sg->recover_state = RECOVER_NONE;	/* restart the state machine */
+	sg->recover_data = rev;
+	list_add(&sg->recover_list, &rev->sgs[sg->level]);
+}
+
+/*
+ * When adjust_members finds that some nodes are dead and mark_effected_sgs
+ * finds that some SG's are effected by departed nodes, this is called to
+ * collect together the SG's which need to be recovered.  An revent (recovery
+ * event) is the group of effected SG's.
+ */
+
+static int new_recovery(void)
+{
+	sm_group_t *sg;
+	recover_t *rev;
+	sm_node_t *node, *sgnode, *safe;
+	int i;
+
+	rev = alloc_recover();
+	list_add_tail(&rev->list, &recoveries);
+
+	down(&sm_sglock);
+
+	/*
+	 * Stop effected SG's and add them to the rev (SGST_JOIN sg's skipped)
+	 */
+
+	for (i = 0; i < SG_LEVELS; i++) {
+		list_for_each_entry(sg, &sm_sg[i], list) {
+			if (test_and_clear_bit(SGFL_NEED_RECOVERY, &sg->flags)){
+				if (sg->state == SGST_JOIN)
+					continue;
+				pre_recover_sg(sg, rev);
+			}
+		}
+		schedule();
+	}
+
+	/*
+	 * For an SG needing recovery, remove dead nodes from sg->memb list
+	 */
+
+	for (i = 0; i < SG_LEVELS; i++) {
+		list_for_each_entry(sg, &rev->sgs[i], recover_list) {
+
+			/* _safe iteration: entries are unlinked and freed */
+			list_for_each_entry_safe(sgnode, safe, &sg->memb, list){
+
+				node = sm_find_member(sgnode->id);
+				SM_ASSERT(node, printk("id %u\n", sgnode->id););
+
+				if (test_bit(SNFL_NEED_RECOVERY, &node->flags)){
+					list_del(&sgnode->list);
+					sg->memb_count--;
+					log_debug(sg, "remove node %u count %d",
+						  sgnode->id, sg->memb_count);
+					kfree(sgnode);
+				}
+				schedule();
+			}
+			schedule();
+		}
+	}
+
+	up(&sm_sglock);
+	rev->cur_level = 0;	/* recover_levels() starts at level 0 */
+	return 0;		/* no failure path; always 0 */
+}
+
+/*
+ * The NEED_RECOVERY bit on MML nodes is set in adjust_members() and is used in
+ * mark_effected_sgs() and new_recovery().  After that, we're done using the bit
+ * and we clear it here.
+ */
+
+static void adjust_members_done(void)
+{
+	sm_node_t *memb;
+	/* clear on every known node, whether or not it was flagged */
+	list_for_each_entry(memb, &sm_members, list)
+		clear_bit(SNFL_NEED_RECOVERY, &memb->flags);
+}
+
+/*
+ * Start the service of the given SG.  The service must be given an array of
+ * nodeids specifying the new sg membership.  The service is responsible to
+ * free this chunk of memory when done with it.
+ */
+
+static void start_sg(sm_group_t *sg, uint32_t event_id)
+{
+	sm_node_t *cur;
+	uint32_t *ids;		/* handed to the service, which frees it */
+	int n = 0;
+
+	SM_RETRY(ids = kmalloc(sg->memb_count * sizeof(*ids), GFP_KERNEL), ids);
+
+	list_for_each_entry(cur, &sg->memb, list) {
+		ids[n] = cur->id;
+		n++;
+	}
+
+	sg->ops->start(sg->service_data, ids, n, event_id, SERVICE_NODE_FAILED);
+}
+
+static void recovery_barrier(sm_group_t *sg)
+{
+	char bname[MAX_BARRIER_NAME_LEN];
+	int error;
+
+	/* bypass the barrier if we're the only member */
+	if (sg->memb_count == 1) {
+		process_recovery_barrier(sg, 0);
+		return;
+	}
+
+	/* Name the barrier after the sg id, the recover_stop flag and the
+	 * member count; sm_barrier() below is handed the same count. */
+	memset(bname, 0, MAX_BARRIER_NAME_LEN);
+	snprintf(bname, MAX_BARRIER_NAME_LEN, "sm.%u.%u.RECOV.%u",
+		 sg->global_id, sg->recover_stop, sg->memb_count);
+
+	/* Save the name for a later cancel.  Copy the whole zero-padded
+	 * buffer: snprintf's return can exceed its size on truncation. */
+	memcpy(sg->recover_barrier, bname, MAX_BARRIER_NAME_LEN);
+	error = sm_barrier(bname, sg->memb_count, SM_BARRIER_RECOVERY);
+	if (error)
+		log_error(sg, "recovery_barrier error %d: %s", error, bname);
+}
+
+static void recover_sg(sm_group_t *sg, int event_id)
+{
+	log_debug(sg, "recover state %d", sg->recover_state);
+
+	switch (sg->recover_state) {
+
+	case RECOVER_NONE:
+		/* must wait for recovery to stop sg on all nodes */
+		sg->recover_state = RECOVER_BARRIERWAIT;
+		sg->recover_stop = 0;
+		recovery_barrier(sg);
+		break;
+
+	case RECOVER_BARRIERWAIT:
+		break;		/* nothing to do until the barrier callback */
+
+	case RECOVER_STOP:
+		/* barrier callback sets state STOP */
+		sg->recover_stop = 1;
+		sg->recover_state = RECOVER_START;
+		start_sg(sg, event_id);
+		break;
+
+	case RECOVER_START:
+		break;		/* nothing to do until the service callback */
+
+	case RECOVER_STARTDONE:
+		/* service callback sets state STARTDONE */
+		sg->recover_state = RECOVER_BARRIERWAIT;
+		recovery_barrier(sg);
+		break;
+
+	case RECOVER_BARRIERDONE:
+		/* barrier callback sets state BARRIERDONE */
+		sg->ops->finish(sg->service_data, event_id);
+		list_del(&sg->recover_list);	/* sg leaves its revent */
+		sg->recover_state = RECOVER_NONE;
+		sg->state = SGST_RUN;
+
+		/* Continue a previous, interrupted attempt to leave the sg */
+		if (sg->sevent) {
+			sm_sevent_t *sev = sg->sevent;
+			log_debug(sg, "restart leave %lx", sev->se_flags);
+			clear_bit(SEFL_DELAY_RECOVERY, &sev->se_flags);
+			set_bit(SEFL_CHECK, &sev->se_flags);
+			wake_serviced(DO_JOINLEAVE);
+		}
+		break;
+
+	default:
+		log_error(sg, "invalid recover_state %u", sg->recover_state);
+	}
+}
+
+static void recover_level(recover_t *rev, int level)
+{
+	sm_group_t *sg, *safe;
+
+	list_for_each_entry_safe(sg, safe, &rev->sgs[level], recover_list) {
+		recover_sg(sg, rev->event_id);	/* may unlink sg */
+		schedule();
+	}
+}
+
+static void recover_levels(recover_t *rev)
+{
+	/* Drive the current level; move up only once its list has drained.
+	 * A level that still has sg's on it ends this pass. */
+	for (;;) {
+		recover_level(rev, rev->cur_level);
+		if (!list_empty(&rev->sgs[rev->cur_level]))
+			return;
+		if (rev->cur_level == SG_LEVELS - 1)
+			break;
+		rev->cur_level++;
+	}
+
+	/* every level is finished: retire the recovery event */
+	list_del(&rev->list);
+	kfree(rev);
+}
+
+/*
+ * Called by SM thread when the cluster is quorate.  It restarts
+ * SG's that were stopped in new_recovery() due to a member death.
+ * It waits for all SG's at level N to complete restart before
+ * restarting SG's at level N+1.
+ */
+
+void process_recoveries(void)
+{
+	recover_t *rev, *safe;
+
+	down(&sm_sglock);
+	list_for_each_entry_safe(rev, safe, &recoveries, list)
+		recover_levels(rev);	/* may unlink and free rev */
+	up(&sm_sglock);
+}
+
+/*
+ * The cnxman membership has changed.  Check if there's still quorum and
+ * whether any nodes have died.  If nodes have died, initiate recovery on any
+ * SG's they were in.  This begins immediately if the cluster remains quorate;
+ * if not this waits until the cluster regains quorum.
+ */
+
+void process_nodechange(void)
+{
+	int gone, effected;
+
+	if ((sm_quorum = sm_quorum_next))	/* assignment is intended */
+		wake_serviced(DO_RUN);
+
+	gone = adjust_members();
+	if (gone > 0) {
+		effected = mark_effected_sgs();
+
+		backout_sevents();
+		cancel_uevents(&effected);	/* gets &effected; may change it */
+
+		if (effected > 0) {
+			new_recovery();
+			wake_serviced(DO_RECOVERIES);
+		}
+	}
+	adjust_members_done();	/* clear the per-node NEED_RECOVERY bits */
+}
+
+int check_recovery(sm_group_t *sg, int event_id)
+{
+	recover_t *rev = (recover_t *) sg->recover_data;
+
+	/* 1 only while sg is recovering under the revent with this id */
+	if (sg->state != SGST_RECOVER || !rev)
+		return 0;
+	return rev->event_id == event_id;
+}
+
+void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid)
+{
+        sm_group_t *sg;
+	recover_t *rev;
+
+	sg = sm_global_id_to_sg(smsg->ms_global_sgid);
+	if (!sg) {
+		log_print("process_recover_msg: unknown sg id %x",
+			  smsg->ms_global_sgid);
+		return;
+	}
+
+	/* we already know about the recovery and can ignore the msg */
+	if (sg->state == SGST_RECOVER)
+		return;
+
+	if (test_bit(SGFL_UEVENT, &sg->flags)) {
+		/* we will initiate recovery on our own if we know about the
+		   uevent so we can ignore this */
+		log_debug(sg, "process_recover_msg: ignore from %u", nodeid);
+		return;
+	}
+	/* NOTE(review): sm_sglock is not taken here -- confirm caller holds it */
+	log_debug(sg, "recovery initiated by msg from %u", nodeid);
+	rev = alloc_recover();
+	list_add_tail(&rev->list, &recoveries);
+	pre_recover_sg(sg, rev);	/* a revent holding just this sg */
+	wake_serviced(DO_RECOVERIES);
+}
--- linux-2.6.9.orig/cluster/cman/sm_recover.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_recover.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SM_RECOVER_DOT_H__
+#define __SM_RECOVER_DOT_H__
+
+void init_recovery(void);
+void process_recoveries(void);
+void process_nodechange(void);
+int check_recovery(sm_group_t *sg, int event_id);
+void process_recover_msg(sm_msg_t *smsg, uint32_t nodeid);
+int no_recoveries(void);
+
+#endif
--- linux-2.6.9.orig/cluster/cman/sm_services.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_services.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,420 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "sm.h"
+
+static struct list_head	callbacks;
+static spinlock_t	callback_lock;
+static struct list_head	sg_registered[SG_LEVELS];
+
+/*
+ * These are the functions to register, join, leave, unregister, callback
+ * with/to the sm.
+ */
+
+struct sc_entry {
+	struct list_head list;	/* entry on the callbacks list */
+	uint32_t local_id;	/* local_id given to kcl_start_done() */
+	int event_id;		/* event_id given to kcl_start_done() */
+};
+typedef struct sc_entry sc_entry_t;
+
+void init_services(void)
+{
+	int i;
+
+	INIT_LIST_HEAD(&callbacks);
+	spin_lock_init(&callback_lock);	/* taken around callbacks list use */
+
+	for (i = 0; i < SG_LEVELS; i++) {
+		INIT_LIST_HEAD(&sm_sg[i]);
+		INIT_LIST_HEAD(&sg_registered[i]);
+	}
+	init_MUTEX(&sm_sglock);	/* held around sm_sg[]/sg_registered[] use */
+}
+
+/* Context: service.  Returns 0 and stores the sg's local id in *service_id,
+ * or -EINVAL, -EEXIST, -ENOMEM, or the error from kcl_addref_cluster(). */
+int kcl_register_service(char *name, int namelen, int level,
+			 struct kcl_service_ops *ops, int unique,
+			 void *servicedata, uint32_t *service_id)
+{
+	sm_group_t *sg;
+	int found = FALSE;
+	int error = -EINVAL;
+
+	/* level indexes sm_sg[] and sg_registered[]; reject negatives too */
+	if (level < 0 || level > SG_LEVELS - 1)
+		goto fail;
+	if (namelen < 0 || namelen > MAX_SERVICE_NAME_LEN)
+		goto fail;
+
+	error = kcl_addref_cluster();
+	if (error)
+		goto fail;
+
+	down(&sm_sglock);
+
+	list_for_each_entry(sg, &sm_sg[level], list) {
+		if ((sg->namelen == namelen) &&
+		    (!strncmp(sg->name, name, namelen))) {
+			found = TRUE;
+			goto next;
+		}
+	}
+
+	list_for_each_entry(sg, &sg_registered[level], list) {
+		if ((sg->namelen == namelen) &&
+		    (!strncmp(sg->name, name, namelen))) {
+			found = TRUE;
+			goto next;
+		}
+	}
+
+      next:
+	/* sg is meaningful here only when found is TRUE */
+	if (found && unique) {
+		error = -EEXIST;
+		goto fail_unlock;
+	}
+
+	if (found) {
+		sg->refcount++;		/* share the existing sg */
+		goto out;
+	}
+
+	sg = kmalloc(sizeof(sm_group_t) + namelen, GFP_KERNEL);
+	if (!sg) {
+		error = -ENOMEM;
+		goto fail_unlock;
+	}
+	memset(sg, 0, sizeof(sm_group_t) + namelen);
+
+	sg->refcount = 1;
+	sg->service_data = servicedata;
+	sg->ops = ops;
+	sg->level = level;
+	sg->namelen = namelen;
+	memcpy(sg->name, name, namelen);
+	sg->local_id = sm_new_local_id(level);
+	sg->state = SGST_NONE;
+	INIT_LIST_HEAD(&sg->memb);
+	INIT_LIST_HEAD(&sg->joining);
+	init_completion(&sg->event_comp);
+
+	list_add_tail(&sg->list, &sg_registered[level]);
+
+      out:
+	*service_id = sg->local_id;
+	up(&sm_sglock);
+	return 0;
+
+      fail_unlock:
+	up(&sm_sglock);
+	kcl_releaseref_cluster();
+      fail:
+	return error;
+}
+
+/* Context: service */
+
+void kcl_unregister_service(uint32_t local_id)
+{
+	sm_group_t *sg;
+	int level = sm_id_to_level(local_id);
+
+	down(&sm_sglock);
+	/* only sg_registered[] is searched, not the joined sm_sg[] lists */
+	list_for_each_entry(sg, &sg_registered[level], list) {
+		if (sg->local_id == local_id) {
+			SM_ASSERT(sg->refcount,);
+			sg->refcount--;
+
+			if (!sg->refcount) {
+				list_del(&sg->list);
+				kfree(sg);
+			}
+			kcl_releaseref_cluster();	/* ref from register */
+			break;
+		}
+	}
+	up(&sm_sglock);
+}
+
+/* Context: service */
+
+int kcl_join_service(uint32_t local_id)
+{
+	sm_group_t *sg;
+	sm_sevent_t *sev;
+	int level = sm_id_to_level(local_id);
+	int error, found = FALSE;
+
+	down(&sm_sglock);
+
+	list_for_each_entry(sg, &sg_registered[level], list) {
+		if (sg->local_id == local_id) {
+			found = TRUE;
+			break;
+		}
+	}
+
+	if (!found) {
+		up(&sm_sglock);
+		error = -ENOENT;
+		goto out;
+	}
+
+	if (sg->state != SGST_NONE) {
+		up(&sm_sglock);
+		error = -EINVAL;
+		goto out;
+	}
+
+	sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
+	if (!sev) {
+		up(&sm_sglock);
+		error = -ENOMEM;
+		goto out;
+	}
+
+	memset(sev, 0, sizeof (sm_sevent_t));
+	sev->se_state = SEST_JOIN_BEGIN;
+	sm_set_event_id(&sev->se_id);
+	sev->se_sg = sg;
+	sg->sevent = sev;
+	sg->state = SGST_JOIN;
+	set_bit(SGFL_SEVENT, &sg->flags);
+	list_del(&sg->list);		/* off sg_registered[] ... */
+	list_add_tail(&sg->list, &sm_sg[sg->level]);	/* ... onto sm_sg[] */
+
+	up(&sm_sglock);
+
+	/*
+	 * The join is processed asynchronously as a service event; we sleep
+	 * on sg->event_comp below until it is completed elsewhere. */
+
+	new_joinleave(sev);
+	wait_for_completion(&sg->event_comp);
+	error = 0;
+
+      out:
+	return error;
+}
+
+/* Context: service.  Returns 0 after the wait on sg->event_comp ends, or
+ * -ENOENT, -EINVAL (never joined), -EBUSY (event in progress), -ENOMEM. */
+int kcl_leave_service(uint32_t local_id)
+{
+	sm_group_t *sg;
+	sm_sevent_t *sev;
+	int error = -ENOENT;
+
+	sg = sm_local_id_to_sg(local_id);
+	if (!sg)
+		goto out;
+
+	/* sg was never joined */
+	error = -EINVAL;
+	if (sg->state == SGST_NONE)
+		goto out;
+
+	down(&sm_sglock);
+	/* may still be joining */
+	if (test_and_set_bit(SGFL_SEVENT, &sg->flags)) {
+		up(&sm_sglock);
+		error = -EBUSY;
+		goto out;
+	}
+
+	sev = kmalloc(sizeof(sm_sevent_t), GFP_KERNEL);
+	if (!sev) {
+		/* undo the claim above, else every later leave gets -EBUSY */
+		clear_bit(SGFL_SEVENT, &sg->flags);
+		up(&sm_sglock);
+		error = -ENOMEM;
+		goto out;
+	}
+
+	memset(sev, 0, sizeof (sm_sevent_t));
+	sev->se_state = SEST_LEAVE_BEGIN;
+	sm_set_event_id(&sev->se_id);
+	set_bit(SEFL_LEAVE, &sev->se_flags);
+	sev->se_sg = sg;
+	sg->sevent = sev;
+
+	up(&sm_sglock);
+
+	new_joinleave(sev);
+	wait_for_completion(&sg->event_comp);
+	error = 0;
+
+	down(&sm_sglock);
+	list_del(&sg->list);
+	list_add_tail(&sg->list, &sg_registered[sg->level]);
+	up(&sm_sglock);
+
+      out:
+	return error;
+}
+
+static void process_callback(uint32_t local_id, unsigned int event_id)
+{
+	sm_group_t *sg;
+	sm_sevent_t *sev;
+	sm_uevent_t *uev;
+
+	sg = sm_local_id_to_sg(local_id);
+	if (!sg)
+		return;
+
+	if (sg->state == SGST_RECOVER) {
+		if (!check_recovery(sg, event_id)) {
+			log_error(sg, "process_callback invalid recover "
+				  "event id %d", event_id);
+			return;
+		}
+
+		log_debug(sg, "cb recover state %u", sg->recover_state);
+
+		if (sg->recover_state == RECOVER_START)
+			sg->recover_state = RECOVER_STARTDONE;
+		else
+			log_error(sg, "process_callback recover state %u",
+				  sg->recover_state);
+		wake_serviced(DO_RECOVERIES);
+	}
+	/* start-done for the sg's own join/leave event (id must match) */
+	else if (test_bit(SGFL_SEVENT, &sg->flags) && sg->sevent &&
+		 (sg->sevent->se_id == event_id)) {
+		sev = sg->sevent;
+
+		if (test_and_clear_bit(SEFL_ALLOW_STARTDONE, &sev->se_flags) &&
+		    (sev->se_state == SEST_JSTART_SERVICEWAIT))
+			sev->se_state = SEST_JSTART_SERVICEDONE;
+
+		set_bit(SEFL_CHECK, &sev->se_flags);
+		wake_serviced(DO_JOINLEAVE);
+	}
+	/* start-done for the sg's uevent (id must match) */
+	else if (test_bit(SGFL_UEVENT, &sg->flags) &&
+		 (sg->uevent.ue_id == event_id)) {
+		uev = &sg->uevent;
+
+		if (test_and_clear_bit(UEFL_ALLOW_STARTDONE, &uev->ue_flags)) {
+			if (uev->ue_state == UEST_JSTART_SERVICEWAIT)
+				uev->ue_state = UEST_JSTART_SERVICEDONE;
+			else if (uev->ue_state == UEST_LSTART_SERVICEWAIT)
+				uev->ue_state = UEST_LSTART_SERVICEDONE;
+		}
+		set_bit(UEFL_CHECK, &uev->ue_flags);
+		wake_serviced(DO_MEMBERSHIP);
+	}
+	/* no recovery, sevent or uevent matched this callback */
+	else
+		log_error(sg, "ignoring service callback id=%x event=%u",
+			  local_id, event_id);
+}
+
+void process_callbacks(void)
+{
+	sc_entry_t *cb;
+
+	/* Pop one queued start-done at a time; the spinlock is dropped
+	 * before process_callback() and schedule() are called. */
+	for (;;) {
+		spin_lock(&callback_lock);
+		if (list_empty(&callbacks)) {
+			spin_unlock(&callback_lock);
+			break;
+		}
+		cb = list_entry(callbacks.next, sc_entry_t, list);
+		list_del(&cb->list);
+		spin_unlock(&callback_lock);
+
+		process_callback(cb->local_id, cb->event_id);
+		kfree(cb);
+		schedule();
+	}
+}
+
+/* Context: service */
+
+void kcl_start_done(uint32_t local_id, int event_id)
+{
+	sc_entry_t *se;
+
+	SM_RETRY(se = kmalloc(sizeof(sc_entry_t), GFP_KERNEL), se);
+	/* queue the ids; process_callbacks() dequeues and frees the entry */
+	se->local_id = local_id;
+	se->event_id = event_id;
+
+	spin_lock(&callback_lock);
+	list_add_tail(&se->list, &callbacks);
+	spin_unlock(&callback_lock);
+
+	wake_serviced(DO_CALLBACKS);
+}
+
+/* Context: service */
+
+void kcl_global_service_id(uint32_t local_id, uint32_t *global_id)
+{
+	sm_group_t *sg = sm_local_id_to_sg(local_id);
+	/* on lookup failure *global_id is left unmodified */
+	if (!sg)
+		log_print("kcl_global_service_id: can't find %x", local_id);
+	else
+		*global_id = sg->global_id;
+}
+
+static void copy_to_service(sm_group_t *sg, struct kcl_service *s)
+{
+	s->level = sg->level;
+	s->local_id = sg->local_id;
+	s->global_id = sg->global_id;
+	s->node_count = sg->memb_count;
+	strcpy(s->name, sg->name);	/* NOTE(review): unbounded copy */
+}
+
+int kcl_get_services(struct list_head *head, int level)
+{
+	sm_group_t *sg;
+	struct kcl_service *s;
+	int error = -ENOMEM, count = 0;
+
+	if (level < 0 || level > SG_LEVELS - 1)
+		return -EINVAL;		/* would index outside sm_sg[] */
+
+	down(&sm_sglock);
+	list_for_each_entry(sg, &sm_sg[level], list) {
+		if (test_bit(SGFL_SEVENT, &sg->flags))
+			continue;	/* sg has a service event pending */
+		if (head) {
+			s = kmalloc(sizeof(struct kcl_service), GFP_KERNEL);
+			if (!s)
+				goto out; /* entries already on head stay */
+			copy_to_service(sg, s);
+			list_add(&s->list, head);
+		}
+		count++;
+	}
+	error = count;		/* success: number of sgs counted */
+ out:
+	up(&sm_sglock);
+	return error;
+}
+
+/* These two global variables are listed in extern form in sm.h. */
+struct list_head sm_sg[SG_LEVELS];
+struct semaphore sm_sglock;
--- linux-2.6.9.orig/cluster/cman/sm_services.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_services.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,20 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SM_SERVICES_DOT_H__
+#define __SM_SERVICES_DOT_H__
+
+void init_services(void);
+void process_callbacks(void);
+
+#endif
--- linux-2.6.9.orig/cluster/cman/sm_user.c	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_user.c	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,569 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "sm.h"
+#include "cnxman-private.h"
+
+void copy_to_usernode(struct cluster_node *node, struct cl_cluster_node *unode);
+
+#define UST_REGISTER	1	/* set after a successful REGISTER ioctl */
+#define UST_UNREGISTER	2	/* set by the UNREGISTER ioctl */
+#define UST_JOIN	3	/* set by the JOIN ioctl before joining */
+#define UST_LEAVE	4	/* set by the LEAVE ioctl before leaving */
+#define UST_JOINED	5	/* set once kcl_join_service() has returned */
+
+struct event {
+	struct list_head 	list;		/* on user_service.events */
+	service_event_t		type;
+	service_start_t		start_type;	/* set for START events */
+	unsigned int		event_id;	/* set for START/FINISH */
+	unsigned int		last_stop;	/* last_*: copied from the */
+	unsigned int		last_start;	/* user_service counters */
+	unsigned int		last_finish;	/* by get_event() */
+	unsigned int		node_count;	/* set for START events */
+	uint32_t *		nodeids;	/* START only; owned here */
+};
+typedef struct event event_t;
+
+struct user_service {
+	uint32_t		local_id;	/* from kcl_register_service() */
+	pid_t			pid;		/* set by user_set_signal() */
+	int			signal;		/* sent by user_notify() */
+	struct socket *		sock;		/* NULL once released */
+	uint8_t			state;		/* one of UST_* */
+	uint8_t			async;		/* 1 while a thread runs */
+	struct semaphore	lock;
+	struct list_head	events;		/* queue of event_t */
+	spinlock_t		event_lock;	/* guards events, last_* */
+	unsigned int		last_stop;
+	unsigned int		last_start;
+	unsigned int		last_finish;
+	unsigned int		need_startdone;	/* pending start id or 0 */
+	unsigned int		node_count;	/* entries in nodeids */
+	uint32_t *		nodeids;	/* from last START fetched */
+	int			name_len;	/* valid bytes in name[] */
+	char			name[MAX_SERVICE_NAME_LEN];
+};
+typedef struct user_service user_service_t;
+
+
+/*
+ * Append ev to the service's event queue and update the stop/start/finish
+ * counters, all under event_lock.
+ */
+static void add_event(user_service_t *us, event_t *ev)
+{
+	spin_lock(&us->event_lock);
+	list_add_tail(&ev->list, &us->events);
+
+	if (ev->type == SERVICE_EVENT_STOP) {
+		/* a stop has no id of its own: record the latest start id */
+		us->last_stop = us->last_start;
+	} else if (ev->type == SERVICE_EVENT_START) {
+		us->last_start = ev->event_id;
+	} else if (ev->type == SERVICE_EVENT_FINISH) {
+		us->last_finish = ev->event_id;
+	}
+	/* SERVICE_EVENT_LEAVEDONE leaves the counters alone */
+	spin_unlock(&us->event_lock);
+}
+
+static event_t *get_event(user_service_t *us)
+{
+	event_t *ev = NULL;
+	/* Peek only: the event stays queued until del_event() removes it. */
+	spin_lock(&us->event_lock);
+	if (!list_empty(&us->events)) {
+		ev = list_entry(us->events.next, event_t, list);
+		ev->last_stop = us->last_stop;	/* stamp current counters */
+		ev->last_start = us->last_start;
+		ev->last_finish = us->last_finish;
+	}
+	spin_unlock(&us->event_lock);
+	return ev;	/* NULL when the queue is empty */
+}
+
+static void del_event(user_service_t *us, event_t *ev)
+{
+	spin_lock(&us->event_lock);
+	list_del(&ev->list);	/* unlink only; ev itself is not freed here */
+	spin_unlock(&us->event_lock);
+}
+
+static event_t *alloc_event(void)
+{
+	event_t *ev;	/* presumably SM_RETRY loops until non-NULL - confirm */
+	SM_RETRY(ev = (event_t *) kmalloc(sizeof(event_t), GFP_KERNEL), ev);
+	memset(ev, 0, sizeof(event_t));	/* all fields start as 0 / NULL */
+	return ev;
+}
+
+/* Alert user space about the service; us->lock must be held by the caller. */
+static void user_notify(user_service_t *us)
+{
+	if (us->sock)	/* OOB message on the service's socket */
+		queue_oob_skb(us->sock, CLUSTER_OOB_MSG_SERVICEEVENT);
+	if (us->pid && us->signal)	/* both set by user_set_signal() */
+		kill_proc(us->pid, us->signal, 0);
+}
+
+/* Map a SERVICE_NODE_* start reason onto the user-visible service_start_t. */
+static service_start_t start_type(int type)
+{
+	if (type == SERVICE_NODE_JOIN)
+		return SERVICE_START_JOIN;
+	if (type == SERVICE_NODE_LEAVE)
+		return SERVICE_START_LEAVE;
+	if (type == SERVICE_NODE_FAILED)
+		return SERVICE_START_FAILED;
+	/* unrecognised reason: plain 0, as before */
+	return 0;
+}
+
+/*
+ * Stop callback: queue a STOP event and notify, unless the socket is gone.
+ */
+static int user_stop(void *servicedata)
+{
+	user_service_t *us = servicedata;
+
+	down(&us->lock);
+	if (us->sock) {
+		event_t *stop = alloc_event();
+
+		stop->type = SERVICE_EVENT_STOP;
+		add_event(us, stop);
+		user_notify(us);
+	}
+	up(&us->lock);
+	return 0;
+}
+
+/* Start callback; owns nodeids (queued for the user, or freed here). */
+static int user_start(void *servicedata, uint32_t *nodeids, int count,
+		      int event_id, int type)
+{
+	user_service_t *us = (user_service_t *) servicedata;
+	event_t *ev;
+
+	down(&us->lock);
+	if (!us->sock) {
+		kcl_start_done(us->local_id, event_id);	/* user is gone */
+		kfree(nodeids);	/* else leaked: no event will carry it */
+		goto out;
+	}
+	us->need_startdone = event_id;
+
+	ev = alloc_event();
+	ev->type = SERVICE_EVENT_START;
+	ev->node_count = count;
+	ev->start_type = start_type(type);
+	ev->event_id = event_id;
+	ev->nodeids = nodeids;
+	add_event(us, ev);
+	user_notify(us);
+ out:
+	up(&us->lock);
+	return 0;
+}
+
+/*
+ * Finish callback: queue a FINISH event for event_id unless the socket is gone.
+ */
+static void user_finish(void *servicedata, int event_id)
+{
+	user_service_t *us = servicedata;
+
+	down(&us->lock);
+	if (us->sock) {
+		event_t *fin = alloc_event();
+
+		fin->type = SERVICE_EVENT_FINISH;
+		fin->event_id = event_id;
+		add_event(us, fin);
+		user_notify(us);
+	}
+	up(&us->lock);
+}
+
+struct kcl_service_ops user_service_ops = {	/* given to kcl_register_service() */
+	.stop = user_stop,
+	.start = user_start,
+	.finish = user_finish
+};
+
+/* Copy the service name in from user space and register a new
+ * user_service_t for it at SERVICE_LEVEL_USER; *us_data is set to it, or to
+ * NULL when kcl_register_service() fails. */
+static int user_register(char *u_name, user_service_t **us_data)
+{
+	user_service_t *us;
+	char name[MAX_SERVICE_NAME_LEN+1];
+	int len, error;
+
+	memset(name, 0, MAX_SERVICE_NAME_LEN+1);
+
+	/* stops at the NUL, so bytes beyond a short string are never read */
+	len = strncpy_from_user(name, u_name, MAX_SERVICE_NAME_LEN);
+	if (len < 0)
+		return -EFAULT;
+	if (!len)
+		return -EINVAL;
+
+	us = kmalloc(sizeof(user_service_t), GFP_KERNEL);
+	if (!us)
+		return -ENOMEM;
+	memset(us, 0, sizeof(user_service_t));
+	INIT_LIST_HEAD(&us->events);
+	spin_lock_init(&us->event_lock);
+	init_MUTEX(&us->lock);
+	us->name_len = len;
+	memcpy(us->name, name, len);
+
+	error = kcl_register_service(name, len, SERVICE_LEVEL_USER,
+				     &user_service_ops, TRUE, (void *) us,
+				     &us->local_id);
+	if (error) {
+		kfree(us);
+		us = NULL;
+	}
+	*us_data = us;
+	return error;
+}
+
+/* Unregister the service, then free its member list and all queued events. */
+static void user_unregister(user_service_t *us)
+{
+	event_t *ev;
+
+	kcl_unregister_service(us->local_id);
+	kfree(us->nodeids);	/* kfree(NULL) is a no-op */
+	us->nodeids = NULL;	/* us can outlive this call (UNREGISTER ioctl) */
+	us->node_count = 0;
+
+	while ((ev = get_event(us))) {
+		del_event(us, ev);
+		kfree(ev->nodeids);
+		kfree(ev);
+	}
+}
+
+/* Thread body started by user_join(): join, then clean up if the user left. */
+static int user_join_async(void *arg)
+{
+	user_service_t *us = arg;
+	int user_gone = 0;
+
+	daemonize("cman_userjoin");
+	kcl_join_service(us->local_id);	/* return value is not checked */
+
+	down(&us->lock);
+	us->state = UST_JOINED;
+	us->async = 0;
+	if (!us->sock) {	/* socket was released while we were joining */
+		if (us->need_startdone)
+			kcl_start_done(us->local_id, us->need_startdone);
+		user_gone = 1;
+	}
+	up(&us->lock);
+
+	if (user_gone) {	/* sm_sock_release() left the cleanup to us */
+		kcl_leave_service(us->local_id);
+		user_unregister(us);
+		kfree(us);
+	}
+	return 0;
+}
+
+/* Thread body started by user_leave(): leave, then free us or notify. */
+static int user_leave_async(void *arg)
+{
+	user_service_t *us = arg;
+
+	daemonize("cman_userleave");
+	kcl_leave_service(us->local_id);
+
+	down(&us->lock);
+	us->async = 0;
+	if (!us->sock) {	/* socket already released: free us here */
+		user_unregister(us);
+		kfree(us);	/* us->lock goes away with us, never up()ed */
+	} else {
+		event_t *ev = alloc_event();
+		ev->type = SERVICE_EVENT_LEAVEDONE;
+		add_event(us, ev);
+		user_notify(us);
+		up(&us->lock);
+	}
+
+	return 0;
+}
+
+/* Join the service: in this context if wait, else from a new kernel thread. */
+static int user_join(user_service_t *us, int wait)
+{
+	int error = 0;
+
+	if (wait) {
+		error = kcl_join_service(us->local_id);
+		us->state = UST_JOINED;	/* set even when error != 0 */
+	}
+	else {
+		us->async = 1;	/* cleared again by user_join_async() */
+		kernel_thread(user_join_async, us, 0);	/* result unchecked */
+	}
+	return error;
+}
+
+static void user_leave(user_service_t *us, int wait)
+{
+	if (wait)	/* leave in the caller's context */
+		kcl_leave_service(us->local_id);
+	else {		/* leave from a new kernel thread */
+		us->async = 1;	/* cleared again by user_leave_async() */
+		kernel_thread(user_leave_async, us, 0);	/* result unchecked */
+	}
+}
+
+static int user_start_done(user_service_t *us, unsigned int event_id)
+{
+	if (!us->need_startdone)	/* no start is outstanding */
+		return -EINVAL;
+	if (us->need_startdone == event_id)
+		us->need_startdone = 0;
+	kcl_start_done(us->local_id, event_id);	/* sent even if ids differ */
+	return 0;
+}
+
+static void user_set_signal(user_service_t *us, int signal)
+{
+	us->pid = current->pid;	/* the calling process is the target */
+	us->signal = signal;	/* sent from user_notify() */
+}
+
+/* Copy the oldest event to user_event: 1 = copied, 0 = none, or -EFAULT. */
+static int user_get_event(user_service_t *us,
+			  struct cl_service_event *user_event)
+{
+	event_t *ev;
+	struct cl_service_event event;
+
+	ev = get_event(us);
+	if (!ev)
+		return 0;
+
+	event.type        = ev->type;
+	event.start_type  = ev->start_type;
+	event.event_id	  = ev->event_id;
+	event.last_stop	  = ev->last_stop;
+	event.last_start  = ev->last_start;
+	event.last_finish = ev->last_finish;
+	event.node_count  = ev->node_count;
+
+	if (copy_to_user(user_event, &event, sizeof(struct cl_service_event)))
+		return -EFAULT;	/* ev is still queued */
+	del_event(us, ev);
+
+	if (ev->type == SERVICE_EVENT_START) {	/* adopt its member list */
+		if (us->nodeids)
+			kfree(us->nodeids);
+		us->nodeids = ev->nodeids;
+		us->node_count = ev->node_count;
+	}
+
+	kfree(ev);
+	return 1;
+}
+
+/* Copy the service's members to user space; returns how many were copied. */
+static int user_get_members(user_service_t *us,
+			    struct cl_cluster_nodelist *u_nodelist)
+{
+	struct cl_cluster_nodelist user_nodelist;
+	struct cl_cluster_node user_node, *u_node;
+	struct cluster_node *node;
+	unsigned int i;
+	int num_nodes = 0;
+
+	if (!u_nodelist)	/* NULL argument: just report the count */
+		return us->node_count;
+
+	if (copy_from_user(&user_nodelist, (void __user *) u_nodelist,
+			   sizeof(struct cl_cluster_nodelist)))
+		return -EFAULT;
+
+	if (user_nodelist.max_members < us->node_count)
+		return -E2BIG;	/* caller's array is too small */
+
+	u_node = user_nodelist.nodes;	/* user-space destination array */
+	for (i = 0; i < us->node_count; i++) {
+		node = find_node_by_nodeid(us->nodeids[i]);
+		if (!node)	/* ids with no node are skipped */
+			continue;
+
+		copy_to_usernode(node, &user_node);
+		if (copy_to_user(u_node, &user_node,
+				 sizeof(struct cl_cluster_node)))
+			return -EFAULT;
+
+		u_node++;
+		num_nodes++;
+	}
+	return num_nodes;
+}
+
+/* Copy the service's global id to *id; -EINVAL unless state is UST_JOINED. */
+static int user_global_id(user_service_t *us, uint32_t *id)
+{
+	uint32_t gid = 0;
+
+	if (us->state != UST_JOINED)
+		return -EINVAL;
+
+	kcl_global_service_id(us->local_id, &gid);
+	if (copy_to_user(id, &gid, sizeof(uint32_t)))
+		return -EFAULT;
+	return 0;
+}
+
+/* Re-register the service at level; allowed only in state UST_REGISTER. */
+static int user_set_level(user_service_t *us, int level)
+{
+	int prev_id = us->local_id;
+	int error;
+
+	if (us->state != UST_REGISTER)
+		return -EINVAL;
+
+	error = kcl_register_service(us->name, us->name_len, level,
+				     &user_service_ops, TRUE, (void *) us,
+				     &us->local_id);
+	if (error)
+		return error;
+	kcl_unregister_service(prev_id);	/* old one goes only on success */
+	return 0;
+}
+
+/* Dispatch the SIOCCLUSTER_SERVICE_* ioctls for the service tied to sock. */
+int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct cluster_sock *c = cluster_sk(sock->sk);
+	user_service_t *us = c->service_data;
+	int error = 0;
+
+	if (!us && cmd != SIOCCLUSTER_SERVICE_REGISTER)
+		return -EINVAL;	/* nothing registered on this socket */
+	switch (cmd) {
+	case SIOCCLUSTER_SERVICE_REGISTER:
+		error = user_register((char *) arg, &us);
+		if (!error) {
+			us->state = UST_REGISTER;
+			us->sock = sock;
+			c->service_data = us;	/* replaces any earlier us */
+		}
+		break;
+
+	case SIOCCLUSTER_SERVICE_UNREGISTER:
+		down(&us->lock);
+		us->state = UST_UNREGISTER;
+		user_unregister(us);	/* us stays attached to the socket */
+		up(&us->lock);
+		break;
+
+	case SIOCCLUSTER_SERVICE_JOIN:
+		us->state = UST_JOIN;	/* no us->lock, no state check */
+		user_join(us, 0);	/* async; return value dropped */
+		break;
+
+	case SIOCCLUSTER_SERVICE_LEAVE:
+		down(&us->lock);
+		if (us->state != UST_JOINED) {
+			error = -EBUSY;
+			up(&us->lock);
+		} else {
+			us->state = UST_LEAVE;
+			up(&us->lock);
+			user_leave(us, 0);	/* async leave */
+		}
+		break;
+
+	case SIOCCLUSTER_SERVICE_SETSIGNAL:
+		user_set_signal(us, (int) arg);
+		break;
+
+	case SIOCCLUSTER_SERVICE_STARTDONE:
+		error = user_start_done(us, (unsigned int) arg);
+		break;
+
+	case SIOCCLUSTER_SERVICE_GETEVENT:
+		error = user_get_event(us, (struct cl_service_event *) arg);
+		break;
+
+	case SIOCCLUSTER_SERVICE_GETMEMBERS:
+		error = user_get_members(us, (struct cl_cluster_nodelist *)arg);
+		break;
+
+	case SIOCCLUSTER_SERVICE_GLOBALID:
+		error = user_global_id(us, (uint32_t *) arg);
+		break;
+
+	case SIOCCLUSTER_SERVICE_SETLEVEL:
+		error = user_set_level(us, (int) arg);
+		break;
+
+	default:
+		error = -EINVAL;
+	}
+
+	return error;	/* GETEVENT/GETMEMBERS return counts through this */
+}
+
+/* Socket close: detach us from sock, then free it unless a thread will. */
+void sm_sock_release(struct socket *sock)
+{
+	struct cluster_sock *c = cluster_sk(sock->sk);
+	user_service_t *us = c->service_data;
+	int state;
+
+	if (!us)
+		return;
+
+	down(&us->lock);
+	us->sock = NULL;
+	c->service_data = NULL;
+	if (us->need_startdone)
+		kcl_start_done(us->local_id, us->need_startdone);
+	us->need_startdone = 0;	/* acked: the join thread must not repeat it */
+	if (us->async) {
+		/* async thread will clean up before exiting */
+		up(&us->lock);
+		return;
+	}
+	state = us->state;
+	up(&us->lock);
+
+	switch (state) {
+	case UST_JOIN:
+		break;
+	case UST_JOINED:
+		user_leave(us, 1);
+		/* fall through */
+	case UST_LEAVE:
+	case UST_REGISTER:
+		user_unregister(us);
+		/* fall through */
+	case UST_UNREGISTER:
+		kfree(us);
+		break;
+	}
+}
--- linux-2.6.9.orig/cluster/cman/sm_user.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/cluster/cman/sm_user.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,21 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SM_USER_DOT_H__
+#define __SM_USER_DOT_H__
+
+int sm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
+void sm_sock_release(struct socket *sock);
+void sm_sock_bind(struct socket *sock);
+
+#endif
--- linux-2.6.9.orig/include/cluster/cnxman-socket.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/include/cluster/cnxman-socket.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,244 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/* CMAN socket interface header,
+   may be included by user or kernel code */
+
+#ifndef __CNXMAN_SOCKET_H
+#define __CNXMAN_SOCKET_H
+
+/* A currently unused number. TIPC also uses this number and you're unlikely
+   to be using both.
+ */
+#define AF_CLUSTER 30
+#define PF_CLUSTER AF_CLUSTER
+
+/* Protocol(socket) types */
+#define CLPROTO_MASTER 2
+#define CLPROTO_CLIENT 3
+
+/* ioctls -- should register these properly */
+#define SIOCCLUSTER_NOTIFY            _IOW('x', 0x01, int)
+#define SIOCCLUSTER_REMOVENOTIFY      _IO( 'x', 0x02)
+#define SIOCCLUSTER_GETMEMBERS        _IOR('x', 0x03, struct cl_cluster_nodelist)
+#define SIOCCLUSTER_SETEXPECTED_VOTES _IOW('x', 0x04, int)
+#define SIOCCLUSTER_ISQUORATE         _IO( 'x', 0x05)
+#define SIOCCLUSTER_ISLISTENING       _IOW('x', 0x06, struct cl_listen_request)
+#define SIOCCLUSTER_GETALLMEMBERS     _IOR('x', 0x07, struct cl_cluster_nodelist)
+#define SIOCCLUSTER_SET_VOTES         _IOW('x', 0x08, int)
+#define SIOCCLUSTER_GET_VERSION       _IOR('x', 0x09, struct cl_version)
+#define SIOCCLUSTER_SET_VERSION       _IOW('x', 0x0a, struct cl_version)
+#define SIOCCLUSTER_ISACTIVE          _IO( 'x', 0x0b)
+#define SIOCCLUSTER_KILLNODE          _IOW('x', 0x0c, int)
+#define SIOCCLUSTER_GET_JOINCOUNT     _IO( 'x', 0x0d)
+#define SIOCCLUSTER_SERVICE_REGISTER  _IOW('x', 0x0e, char)
+#define SIOCCLUSTER_SERVICE_UNREGISTER _IO('x', 0x0f)
+#define SIOCCLUSTER_SERVICE_JOIN      _IO( 'x', 0x10)
+#define SIOCCLUSTER_SERVICE_LEAVE     _IO( 'x', 0x20)
+#define SIOCCLUSTER_SERVICE_SETSIGNAL _IOW('x', 0x30, int)
+#define SIOCCLUSTER_SERVICE_STARTDONE _IOW('x', 0x40, unsigned int)
+#define SIOCCLUSTER_SERVICE_GETEVENT  _IOR('x', 0x50, struct cl_service_event)
+#define SIOCCLUSTER_SERVICE_GETMEMBERS _IOR('x', 0x60, struct cl_cluster_nodelist)
+#define SIOCCLUSTER_SERVICE_GLOBALID  _IOR('x', 0x70, uint32_t)
+#define SIOCCLUSTER_SERVICE_SETLEVEL  _IOR('x', 0x80, int)
+#define SIOCCLUSTER_GETNODE	      _IOWR('x', 0x90, struct cl_cluster_node)
+#define SIOCCLUSTER_GETCLUSTER	      _IOWR('x', 0x91, struct cl_cluster_info)
+#define SIOCCLUSTER_BARRIER           _IOW('x', 0x0a0, struct cl_barrier_info)
+#define SIOCCLUSTER_QD_REGISTER       _IOW('x', 0x0a1, struct cl_quorumdevice_info)
+#define SIOCCLUSTER_QD_UNREGISTER     _IO('x', 0x0a2)
+#define SIOCCLUSTER_QD_POLL           _IOW('x', 0x0a3, int)
+
+/* These were setsockopts */
+#define SIOCCLUSTER_PASS_SOCKET       _IOW('x', 0x0b0, struct cl_passed_sock)
+#define SIOCCLUSTER_SET_NODENAME      _IOW('x', 0x0b1, char *)
+#define SIOCCLUSTER_SET_NODEID        _IOW('x', 0x0b2, int)
+#define SIOCCLUSTER_JOIN_CLUSTER      _IOW('x', 0x0b3, struct cl_join_cluster_info)
+#define SIOCCLUSTER_LEAVE_CLUSTER     _IOW('x', 0x0b4, int)
+
+
+/* Maximum size of a cluster message */
+#define MAX_CLUSTER_MESSAGE          1500
+#define MAX_CLUSTER_MEMBER_NAME_LEN   255
+#define MAX_BARRIER_NAME_LEN           33
+#define MAX_SA_ADDR_LEN                12
+#define MAX_CLUSTER_NAME_LEN           16
+
+/* Well-known cluster port numbers */
+#define CLUSTER_PORT_MEMBERSHIP  1	/* Mustn't block during cluster
+					 * transitions! */
+#define CLUSTER_PORT_SERVICES    2
+#define CLUSTER_PORT_SYSMAN      10	/* Remote execution daemon */
+#define CLUSTER_PORT_CLVMD       11	/* Cluster LVM daemon */
+#define CLUSTER_PORT_SLM         12	/* LVM SLM (simple lock manager) */
+
+/* Port numbers above this will be blocked when the cluster is inquorate or in
+ * transition */
+#define HIGH_PROTECTED_PORT      9
+
+/* Reasons for leaving the cluster */
+#define CLUSTER_LEAVEFLAG_DOWN     0	/* Normal shutdown */
+#define CLUSTER_LEAVEFLAG_KILLED   1
+#define CLUSTER_LEAVEFLAG_PANIC    2
+#define CLUSTER_LEAVEFLAG_REMOVED  3	/* This one can reduce quorum */
+#define CLUSTER_LEAVEFLAG_REJECTED 4	/* Not allowed into the cluster in the
+					 * first place */
+#define CLUSTER_LEAVEFLAG_INCONSISTENT 5	/* Our view of the cluster is
+						 * in a minority */
+#define CLUSTER_LEAVEFLAG_DEAD         6	/* Discovered to be dead */
+#define CLUSTER_LEAVEFLAG_NORESPONSE   7        /* Didn't ACK message */
+#define CLUSTER_LEAVEFLAG_FORCE     0x10	/* Forced by command-line */
+
+/* OOB messages sent to a local socket */
+#define CLUSTER_OOB_MSG_PORTCLOSED  1
+#define CLUSTER_OOB_MSG_STATECHANGE 2
+#define CLUSTER_OOB_MSG_SERVICEEVENT 3
+
+/* Sendmsg flags, these are above the normal sendmsg flags so they don't
+ * interfere */
+#define MSG_NOACK     0x010000	/* Don't need an ACK for this message */
+#define MSG_QUEUE     0x020000	/* Queue the message for sending later */
+#define MSG_MULTICAST 0x080000	/* Message was sent to all nodes in the cluster
+				 */
+#define MSG_ALLINT    0x100000	/* Send out of all interfaces */
+#define MSG_REPLYEXP  0x200000	/* Reply is expected */
+#define MSG_BCASTSELF 0x400000	/* Broadcast message also gets sent to us */
+
+typedef enum { NODESTATE_JOINING=1, NODESTATE_MEMBER,
+	       NODESTATE_DEAD } nodestate_t;
+
+
+struct sockaddr_cl {
+	unsigned short scl_family;
+	unsigned char scl_flags;
+	unsigned char scl_port;
+	int           scl_nodeid;
+};
+
+/*
+ * This is how we pass the multicast & receive sockets into kernel space.
+ */
+struct cl_passed_sock {
+	int fd;			/* FD of master socket to do multicast on */
+	int number;		/* Socket number, to match up recvonly & bcast
+				 * sockets */
+        int multicast;          /* Is it multicast or receive ? */
+};
+
+/* Cluster configuration info passed when we join the cluster */
+struct cl_join_cluster_info {
+	unsigned char votes;
+	unsigned int expected_votes;
+	unsigned int two_node;
+	unsigned int config_version;
+
+        char cluster_name[17];
+};
+
+
+/* This is the structure, per node, returned from the membership ioctl */
+struct cl_cluster_node {
+	unsigned int size;
+	unsigned int node_id;
+	unsigned int us;
+	unsigned int leave_reason;
+	unsigned int incarnation;
+	nodestate_t state;
+	char name[MAX_CLUSTER_MEMBER_NAME_LEN];
+	unsigned char votes;
+};
+
+/* The struct passed to the membership ioctls */
+struct cl_cluster_nodelist {
+        uint32_t max_members;
+        struct cl_cluster_node *nodes;
+};
+
+/* The struct passed to the quorum device register ioctl */
+struct cl_quorumdevice_info {
+        uint32_t votes;
+	char name[MAX_CLUSTER_MEMBER_NAME_LEN];
+
+};
+
+/* Structure passed to SIOCCLUSTER_ISLISTENING */
+struct cl_listen_request {
+	unsigned char port;
+        int           nodeid;
+};
+
+/* A Cluster PORTCLOSED message - received by a local user as an OOB message */
+struct cl_portclosed_oob {
+	unsigned char cmd;	/* CLUSTER_OOB_MSG_PORTCLOSED */
+	unsigned char port;
+};
+
+/* Get all version numbers or set the config version */
+struct cl_version {
+	unsigned int major;
+	unsigned int minor;
+	unsigned int patch;
+	unsigned int config;
+};
+
+/* structure passed to barrier ioctls */
+struct cl_barrier_info {
+	char cmd;
+	char name[MAX_BARRIER_NAME_LEN];
+	unsigned int flags;
+	unsigned long arg;
+};
+
+struct cl_cluster_info {
+	char name[MAX_CLUSTER_NAME_LEN+1];
+	uint16_t number;
+};
+
+typedef enum { SERVICE_EVENT_STOP, SERVICE_EVENT_START, SERVICE_EVENT_FINISH,
+		SERVICE_EVENT_LEAVEDONE } service_event_t;
+
+typedef enum { SERVICE_START_FAILED, SERVICE_START_JOIN, SERVICE_START_LEAVE }
+		service_start_t;
+
+struct cl_service_event {
+	service_event_t type;
+	service_start_t start_type;
+	unsigned int event_id;
+	unsigned int last_stop;
+	unsigned int last_start;
+	unsigned int last_finish;
+	unsigned int node_count;
+};
+
+
+/* Commands to the barrier ioctl */
+#define BARRIER_IOCTL_REGISTER 1
+#define BARRIER_IOCTL_CHANGE   2
+#define BARRIER_IOCTL_DELETE   3
+#define BARRIER_IOCTL_WAIT     4
+
+/* Attributes of a barrier - bitmask */
+#define BARRIER_ATTR_AUTODELETE 1
+#define BARRIER_ATTR_MULTISTEP  2
+#define BARRIER_ATTR_MANUAL     4
+#define BARRIER_ATTR_ENABLED    8
+#define BARRIER_ATTR_CALLBACK  16
+
+/* Attribute setting commands */
+#define BARRIER_SETATTR_AUTODELETE 1
+#define BARRIER_SETATTR_MULTISTEP  2
+#define BARRIER_SETATTR_ENABLED    3
+#define BARRIER_SETATTR_NODES      4
+#define BARRIER_SETATTR_CALLBACK   5
+#define BARRIER_SETATTR_TIMEOUT    6
+
+#endif
--- linux-2.6.9.orig/include/cluster/cnxman.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/include/cluster/cnxman.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,87 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CNXMAN_H
+#define __CNXMAN_H
+
+#include "linux/in6.h"
+#include "cluster/cnxman-socket.h"
+
+/* In-kernel API */
+
+/* This is the structure, per node, returned from the membership request */
+struct kcl_cluster_node {
+	unsigned int size;
+	unsigned int node_id;
+	unsigned int us;
+	unsigned int leave_reason;
+	unsigned int incarnation;
+	nodestate_t state;
+	struct list_head list;
+	char name[MAX_CLUSTER_MEMBER_NAME_LEN];
+	unsigned char votes;
+};
+
+struct cluster_node_addr {
+	struct list_head list;
+	unsigned char addr[sizeof(struct sockaddr_in6)];/* A large sockaddr */
+	int addr_len;
+};
+
+
+/* Reasons for a kernel membership callback */
+typedef enum { CLUSTER_RECONFIG, DIED, LEAVING, NEWNODE } kcl_callback_reason;
+
+/* Kernel version of above, the void *sock is a struct socket */
+struct kcl_multicast_sock {
+	void *sock;
+	int number;		/* Socket number, to match up recvonly & bcast
+				 * sockets */
+};
+
+extern int kcl_sendmsg(struct socket *sock, void *buf, int size,
+		       struct sockaddr_cl *caddr, int addr_len,
+		       unsigned int flags);
+extern int kcl_register_read_callback(struct socket *sock,
+				      int (*routine) (char *, int, char *, int,
+						      unsigned int));
+extern int kcl_add_callback(void (*callback) (kcl_callback_reason, long));
+extern int kcl_remove_callback(void (*callback) (kcl_callback_reason, long));
+extern int kcl_get_members(struct list_head *list);
+extern int kcl_get_member_ids(uint32_t * idbuf, int size);
+extern int kcl_get_all_members(struct list_head *list);
+extern int kcl_get_node_by_addr(unsigned char *addr, int addr_len,
+				struct kcl_cluster_node *n);
+extern int kcl_get_node_by_name(unsigned char *name,
+				struct kcl_cluster_node *n);
+extern int kcl_get_node_by_nodeid(int nodeid, struct kcl_cluster_node *n);
+extern int kcl_is_quorate(void);
+extern int kcl_addref_cluster(void);
+extern int kcl_releaseref_cluster(void);
+extern int kcl_cluster_name(char **cname);
+extern int kcl_get_current_interface(void);
+extern struct list_head *kcl_get_node_addresses(int nodeid);
+
+extern int kcl_barrier_register(char *name, unsigned int flags,
+				unsigned int nodes);
+extern int kcl_barrier_setattr(char *name, unsigned int attr,
+			       unsigned long arg);
+extern int kcl_barrier_delete(char *name);
+extern int kcl_barrier_wait(char *name);
+extern int kcl_barrier_cancel(char *name);
+
+extern int kcl_register_quorum_device(char *name, int votes);
+extern int kcl_unregister_quorum_device(void);
+extern int kcl_quorum_device_available(int yesno);
+
+#endif
--- linux-2.6.9.orig/include/cluster/service.h	1970-01-01 03:00:00.000000000 +0300
+++ linux-2.6.9.debug/include/cluster/service.h	2006-12-20 17:06:31.000000000 +0300
@@ -0,0 +1,102 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
+**  
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __SERVICE_DOT_H__
+#define __SERVICE_DOT_H__
+
+/* 
+ * Interface between service manager and services
+ */
+
+/* 
+ * Service levels are started in order from lowest, so level 0 is started on
+ * all nodes before level 1 is started.
+ */
+
+#define SERVICE_LEVEL_FENCE      (0)
+#define SERVICE_LEVEL_GDLM       (1)
+#define SERVICE_LEVEL_GFS        (2)
+#define SERVICE_LEVEL_USER	 (3)
+
+#define MAX_SERVICE_NAME_LEN     (33)
+
+/* 
+ * The type of start a service receives.  The start (and preceding stop) may be
+ * due to a node joining or leaving the SG or due to a node having failed.
+ */
+
+#define SERVICE_NODE_FAILED      (1)
+#define SERVICE_NODE_JOIN        (2)
+#define SERVICE_NODE_LEAVE       (3)
+
+
+struct kcl_service {
+	struct list_head list;	/* links entries built by kcl_get_services() */
+	uint16_t level;
+	uint32_t local_id;
+	uint32_t global_id;
+	int node_count;		/* filled from the group's memb_count */
+	char name[MAX_SERVICE_NAME_LEN];
+};
+
+int kcl_get_services(struct list_head *list, int level);
+
+
+/* 
+ * These routines run in CMAN context, so they must return quickly and cannot
+ * block.
+ */
+
+struct kcl_service_ops {
+	int (*stop) (void *servicedata);
+	int (*start) (void *servicedata, uint32_t *nodeids, int count,
+		      int event_id, int type);
+	void (*finish) (void *servicedata, int event_id);
+};
+
+/* 
+ * Register will cause CMAN to create a Service Group (SG) for the named
+ * instance of the service.  A local ID is returned which is used to join,
+ * leave and unregister the service.
+ */
+
+int kcl_register_service(char *name, int namelen, int level,
+			 struct kcl_service_ops *ops, int unique,
+			 void *servicedata, uint32_t *local_id);
+
+void kcl_unregister_service(uint32_t local_id);
+
+/* 
+ * Once a service is joined it will be managed by CMAN and receive start, stop,
+ * and finish calls.  After leave is called the service is no longer managed by
+ * CMAN.  The first start for a service may arrive before kcl_join_service()
+ * returns.
+ */
+
+int kcl_join_service(uint32_t local_id);
+int kcl_leave_service(uint32_t local_id);
+
+/* 
+ * After a service is started, it can ask for its cluster-wide unique ID.
+ */
+
+void kcl_global_service_id(uint32_t local_id, uint32_t * global_id);
+
+/* 
+ * Called by a service when it's done with a start().  Cannot be called from
+ * the start function.
+ */
+
+void kcl_start_done(uint32_t local_id, int event_id);
+
+#endif
