patch-2.3.15 linux/include/net/sock.h

diff -u --recursive --new-file v2.3.14/linux/include/net/sock.h linux/include/net/sock.h
@@ -85,42 +85,32 @@
 #include <net/irda/irda.h>
 #endif
 
+#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE)
+struct atm_vcc;
+#endif
+
 #ifdef CONFIG_FILTER
 #include <linux/filter.h>
 #endif
 
 #include <asm/atomic.h>
+#include <net/dst.h>
 
 #define MIN_WRITE_SPACE	2048
 
 /* The AF_UNIX specific socket options */
 struct unix_opt {
-	int 			family;
-	char *			name;
-	int  			locks;
 	struct unix_address	*addr;
 	struct dentry *		dentry;
 	struct semaphore	readsem;
 	struct sock *		other;
 	struct sock **		list;
 	struct sock *		gc_tree;
-	int			inflight;
-	atomic_t		user_count;
+	atomic_t		inflight;
+	rwlock_t		lock;
+	wait_queue_head_t	peer_wait;
 };
 
-#ifdef CONFIG_NETLINK
-struct netlink_callback;
-
-struct netlink_opt {
-	pid_t			pid;
-	unsigned		groups;
-	pid_t			dst_pid;
-	unsigned		dst_groups;
-	int			(*handler)(int unit, struct sk_buff *skb);
-	atomic_t		locks;
-	struct netlink_callback	*cb;
-};
-#endif
 
 /* Once the IPX ncpd patches are in these are going into protinfo. */
 #if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE)
@@ -196,6 +186,25 @@
 };
 #endif
 
+#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE)
+struct inet_opt
+{
+	int			ttl;			/* TTL setting */
+	int			tos;			/* TOS */
+	unsigned	   	cmsg_flags;
+	struct ip_options	*opt;
+	unsigned char		hdrincl;		/* Include headers ? */
+	__u8			mc_ttl;			/* Multicasting TTL */
+	__u8			mc_loop;		/* Loopback */
+	__u8			recverr;
+	__u8			pmtudisc;
+	int			mc_index;		/* Multicast device index */
+	__u32			mc_addr;
+	struct ip_mc_socklist	*mc_list;		/* Group array */
+};
+#endif
+
+
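
The fields gathered into inet_opt are the IP "private area" that a later hunk of this patch removes from struct sock itself; they are now reached through the protinfo union (extended below with an af_inet member). A minimal sketch of what the move means for code that touched these fields, where set_ip_ttl is a hypothetical helper and not part of this patch:

	/* Sketch only: set_ip_ttl is a hypothetical helper illustrating
	 * the field relocation performed by this patch.
	 */
	static void set_ip_ttl(struct sock *sk, int ttl)
	{
		sk->protinfo.af_inet.ttl = ttl;	/* was: sk->ip_ttl = ttl; */
	}
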
 /* This defines a selective acknowledgement block. */
 struct tcp_sack_block {
 	__u32	start_seq;
@@ -252,6 +261,7 @@
  */
  	__u32	snd_ssthresh;	/* Slow start size threshold		*/
  	__u16	snd_cwnd_cnt;	/* Linear increase counter		*/
+	__u16	snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
 	__u8	dup_acks;	/* Consecutive duplicate acks seen from other end */
 	__u8	delayed_acks;
 	__u16	user_mss;  	/* mss requested by user in ioctl */
@@ -287,7 +297,7 @@
         __u32	rcv_tsval;	/* Time stamp value             	*/
         __u32	rcv_tsecr;	/* Time stamp echo reply        	*/
         __u32	ts_recent;	/* Time stamp to echo next		*/
-        __u32	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
+        long	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
 	int	num_sacks;	/* Number of SACK blocks		*/
 	struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
 
@@ -306,6 +316,7 @@
 	struct open_request	**syn_wait_last;
 
 	int syn_backlog;	/* Backlog of received SYNs */
+	int write_pending;
 };
 
  	
@@ -344,6 +355,10 @@
  *		} tp_pinfo;
  *
  *	}
+ *
+ * The idea failed because the IPv6 transition assumes dual IP/IPv6 sockets.
+ * So net_pinfo is really IPv6-specific, and protinfo unifies all the other
+ * private areas.
  */
 
 /* Define this to get the sk->debug debugging facility. */
@@ -371,10 +386,6 @@
 } while(0);
 
 struct sock {
-	/* Local port binding hash linkage. */
-	struct sock		*bind_next;
-	struct sock		**bind_pprev;
-
 	/* Socket demultiplex comparisons on incoming packets. */
 	__u32			daddr;		/* Foreign IPv4 addr			*/
 	__u32			rcv_saddr;	/* Bound local IPv4 addr		*/
@@ -385,6 +396,8 @@
 	/* Main hash linkage for various protocol lookup tables. */
 	struct sock		*next;
 	struct sock		**pprev;
+	struct sock		*bind_next;
+	struct sock		**bind_pprev;
 
 	volatile unsigned char	state,		/* Connection state			*/
 				zapped;		/* In ax25 & ipx means not linked	*/
@@ -393,12 +406,14 @@
 	unsigned short		family;		/* Address family			*/
 	unsigned char		reuse,		/* SO_REUSEADDR setting			*/
 				nonagle;	/* Disable Nagle algorithm?		*/
+	atomic_t		refcnt;		/* Reference count			*/
 
 	socket_lock_t		lock;		/* Synchronizer...			*/
 	int			rcvbuf;		/* Size of receive buffer in bytes	*/
 
-	wait_queue_head_t	*sleep;	/* Sock wait queue			*/
+	wait_queue_head_t	*sleep;		/* Sock wait queue			*/
 	struct dst_entry	*dst_cache;	/* Destination cache			*/
+	rwlock_t		dst_lock;
 	atomic_t		rmem_alloc;	/* Receive queue bytes committed	*/
 	struct sk_buff_head	receive_queue;	/* Incoming packets			*/
 	atomic_t		wmem_alloc;	/* Transmit queue bytes committed	*/
@@ -437,6 +452,8 @@
 		struct sk_buff *tail;
 	} backlog;
 
+	rwlock_t		callback_lock;
+
 	/* Error queue, rarely used. */
 	struct sk_buff_head	error_queue;
 
@@ -487,6 +504,9 @@
 	union {
 		void *destruct_hook;
 	  	struct unix_opt	af_unix;
+#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE)
+		struct inet_opt af_inet;
+#endif
 #if defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE)
 		struct atalk_sock	af_at;
 #endif
@@ -512,32 +532,22 @@
 		rose_cb			*rose;
 #endif
 #ifdef CONFIG_NETLINK
-		struct netlink_opt	af_netlink;
+		struct netlink_opt	*af_netlink;
 #endif
 #if defined(CONFIG_ECONET) || defined(CONFIG_ECONET_MODULE)
 		struct econet_opt	*af_econet;
 #endif
+#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE)
+		struct atm_vcc		*af_atm;
+#endif
 #if defined(CONFIG_IRDA) || defined(CONFIG_IRDA_MODULE)
 		struct irda_sock        *irda;
 #endif
 	} protinfo;  		
 
-	/* IP 'private area' or will be eventually. */
-	int			ip_ttl;			/* TTL setting */
-	int			ip_tos;			/* TOS */
-	unsigned	   	ip_cmsg_flags;
-	struct ip_options	*opt;
-	unsigned char		ip_hdrincl;		/* Include headers ? */
-	__u8			ip_mc_ttl;		/* Multicasting TTL */
-	__u8			ip_mc_loop;		/* Loopback */
-	__u8			ip_recverr;
-	__u8			ip_pmtudisc;
-	int			ip_mc_index;		/* Multicast device index */
-	__u32			ip_mc_addr;
-	struct ip_mc_socklist	*ip_mc_list;		/* Group array */
 
-	/* This part is used for the timeout functions (timer.c). */
-	int			timeout;	/* What are we waiting for? */
+	/* This part is used for the timeout functions. */
+	spinlock_t		timer_lock;	/* Required until timer in core is repaired */
 	struct timer_list	timer;		/* This is the sock cleanup timer. */
 	struct timeval		stamp;
 
@@ -580,8 +590,9 @@
 	int			(*connect)(struct sock *sk,
 				        struct sockaddr *uaddr, 
 					int addr_len);
+	int			(*disconnect)(struct sock *sk, int flags);
 
-	struct sock *		(*accept) (struct sock *sk, int flags);
+	struct sock *		(*accept) (struct sock *sk, int flags, int *err);
 	void			(*retransmit)(struct sock *sk, int all);
 	void			(*write_wakeup)(struct sock *sk);
 	void			(*read_wakeup)(struct sock *sk);
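
With the extra int *err argument, accept can now report why it failed instead of only returning NULL. A hedged sketch of a caller adapted to the new signature (the surrounding logic is illustrative, not from this patch):

	int err = 0;
	struct sock *newsk;

	newsk = sk->prot->accept(sk, flags, &err);
	if (newsk == NULL)
		return err;	/* the failure reason is now passed back via *err */
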
@@ -621,14 +632,6 @@
 	int			inuse, highestinuse;
 };
 
-#define TIME_WRITE	1	/* Not yet used */
-#define TIME_RETRANS	2	/* Retransmit timer */
-#define TIME_DACK	3	/* Delayed ack timer */
-#define TIME_CLOSE	4
-#define TIME_KEEPOPEN	5
-#define TIME_DESTROY	6
-#define TIME_DONE	7	/* Used to absorb those last few packets */
-#define TIME_PROBE0	8
 
 /* About 10 seconds */
 #define SOCK_DESTROY_TIME (10*HZ)
@@ -640,23 +643,6 @@
 #define RCV_SHUTDOWN	1
 #define SEND_SHUTDOWN	2
 
-/* Per-protocol hash table implementations use this to make sure
- * nothing changes.
- */
-extern rwlock_t sockhash_lock;
-#define SOCKHASH_LOCK_READ()		read_lock_bh(&sockhash_lock)
-#define SOCKHASH_UNLOCK_READ()		read_unlock_bh(&sockhash_lock)
-#define SOCKHASH_LOCK_WRITE()		write_lock_bh(&sockhash_lock)
-#define SOCKHASH_UNLOCK_WRITE()		write_unlock_bh(&sockhash_lock)
-
-/* The following variants must _only_ be used when you know you
- * can only be executing in a BH context.
- */
-#define SOCKHASH_LOCK_READ_BH()		read_lock(&sockhash_lock)
-#define SOCKHASH_UNLOCK_READ_BH()	read_unlock(&sockhash_lock)
-#define SOCKHASH_LOCK_WRITE_BH()	write_lock(&sockhash_lock)
-#define SOCKHASH_UNLOCK_WRITE_BH()	write_unlock(&sockhash_lock)
-
 /* Used by processes to "lock" a socket state, so that
  * interrupts and bottom half handlers won't change it
  * from under us. It essentially blocks any incoming
@@ -667,8 +653,23 @@
  * the backlog queue.  This queue is processed by the
  * owner of the socket lock right before it is released.
  */
-extern void lock_sock(struct sock *sk);
-extern void release_sock(struct sock *sk);
+extern void __lock_sock(struct sock *sk);
+extern void __release_sock(struct sock *sk);
+#define lock_sock(__sk) \
+do {	spin_lock_bh(&((__sk)->lock.slock)); \
+	if ((__sk)->lock.users != 0) \
+		__lock_sock(__sk); \
+	(__sk)->lock.users = 1; \
+	spin_unlock_bh(&((__sk)->lock.slock)); \
+} while(0)
+#define release_sock(__sk) \
+do {	spin_lock_bh(&((__sk)->lock.slock)); \
+	(__sk)->lock.users = 0; \
+	if ((__sk)->backlog.tail != NULL) \
+		__release_sock(__sk); \
+	wake_up(&((__sk)->lock.wq)); \
+	spin_unlock_bh(&((__sk)->lock.slock)); \
+} while(0)
 
 /* BH context may only use the following locking interface. */
 #define bh_lock_sock(__sk)	spin_lock(&((__sk)->lock.slock))
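
lock_sock now marks the socket as owned by a process under the per-socket spinlock; while it is owned, bottom halves append incoming packets to sk->backlog, and release_sock drains that backlog before waking other waiters. A minimal sketch of the intended process-context pattern (my_sockopt is an illustrative name):

	/* Sketch only: typical process-context use of the new macros. */
	static int my_sockopt(struct sock *sk, int val)
	{
		lock_sock(sk);		/* BH input is diverted to sk->backlog */
		sk->rcvbuf = val;	/* safe: we own the socket */
		release_sock(sk);	/* runs __release_sock if a backlog formed */
		return 0;
	}
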
@@ -696,7 +697,6 @@
 
 extern struct sock *		sk_alloc(int family, int priority, int zero_it);
 extern void			sk_free(struct sock *sk);
-extern void			destroy_sock(struct sock *sk);
 
 extern struct sk_buff		*sock_wmalloc(struct sock *sk,
 					      unsigned long size, int force,
@@ -706,6 +706,7 @@
 					      int priority);
 extern void			sock_wfree(struct sk_buff *skb);
 extern void			sock_rfree(struct sk_buff *skb);
+extern void			sock_cfree(struct sk_buff *skb);
 extern unsigned long		sock_rspace(struct sock *sk);
 extern unsigned long		sock_wspace(struct sock *sk);
 
@@ -729,9 +730,7 @@
  * Functions to fill in entries in struct proto_ops when a protocol
  * does not implement a particular function.
  */
-extern int                      sock_no_dup(struct socket *, struct socket *);
-extern int                      sock_no_release(struct socket *, 
-						struct socket *);
+extern int                      sock_no_release(struct socket *);
 extern int                      sock_no_bind(struct socket *, 
 					     struct sockaddr *, int);
 extern int                      sock_no_connect(struct socket *,
@@ -760,6 +759,9 @@
 extern int                      sock_no_recvmsg(struct socket *,
 						struct msghdr *, int,
 						struct scm_cookie *);
+extern int			sock_no_mmap(struct file *file,
+					     struct socket *sock,
+					     struct vm_area_struct *vma);
 
 /*
  *	Default socket callbacks and setup code
@@ -815,6 +817,139 @@
 #endif /* CONFIG_FILTER */
 
 /*
+ * Socket reference counting postulates.
+ *
+ * * Each user of a socket SHOULD hold a reference count.
+ * * Each access point to a socket (a hash table bucket, a reference from
+ *   a list, a running timer, an skb in flight) MUST hold a reference count.
+ * * When the reference count hits 0, it will never increase again.
+ * * When the reference count hits 0, no references to this socket exist
+ *   from outside; the current process on the current CPU is the last
+ *   user and may/should destroy this socket.
+ * * sk_free may be called from any context: process, BH, IRQ. When
+ *   it is called, the socket has no references from outside -> sk_free
+ *   may release descendant resources allocated by the socket, but by
+ *   the time it is called the socket is NOT referenced by any
+ *   hash tables, lists etc.
+ * * Packets delivered from outside (from the network or from another
+ *   process) and enqueued on receive/error queues SHOULD NOT grab a
+ *   reference count while they sit in a queue. Otherwise packets would
+ *   leak when a socket is looked up by one CPU while unhashing is done
+ *   by another CPU. This applies to udp/raw and netlink (which leak to
+ *   the receive and error queues) and to tcp (which leaks to the
+ *   backlog). The packet socket does all its processing inside
+ *   ptype_lock, so it does not have this race condition. UNIX sockets
+ *   use a separate SMP lock, so they are not prone to it either.
+ */
+
+/* Grab a socket reference count. This operation is valid only
+   when sk has ALREADY been grabbed, e.g. it was found in a hash
+   table or a list and the lookup was made under a lock that
+   prevents hash table modifications.
+ */
+
+extern __inline__ void sock_hold(struct sock *sk)
+{
+	atomic_inc(&sk->refcnt);
+}
+
+/* Ungrab a socket in a context which guarantees that the socket
+   refcnt cannot hit zero, e.g. this holds in the context of any
+   socketcall.
+ */
+extern __inline__ void __sock_put(struct sock *sk)
+{
+	atomic_dec(&sk->refcnt);
+}
+
+/* Ungrab a socket and destroy it if this was the last reference. */
+extern __inline__ void sock_put(struct sock *sk)
+{
+	if (atomic_dec_and_test(&sk->refcnt))
+		sk_free(sk);
+}
+
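
Together with the postulates above, these helpers give the standard lookup discipline: pin the socket via the table lock, take a reference with sock_hold before the lock is dropped, and balance it with sock_put when the work is done. A sketch under that assumption (my_table_lock and my_lookup are hypothetical placeholders, not names from this patch):

	struct sock *sk;

	read_lock(&my_table_lock);
	sk = my_lookup(daddr, dport);	/* hypothetical hash lookup */
	if (sk)
		sock_hold(sk);		/* legal: the table lock still pins sk */
	read_unlock(&my_table_lock);

	if (sk) {
		/* ... use the socket ... */
		sock_put(sk);	/* frees sk if this was the last reference */
	}
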
+extern __inline__ struct dst_entry *
+__sk_dst_get(struct sock *sk)
+{
+	return sk->dst_cache;
+}
+
+extern __inline__ struct dst_entry *
+sk_dst_get(struct sock *sk)
+{
+	struct dst_entry *dst;
+
+	read_lock(&sk->dst_lock);
+	dst = sk->dst_cache;
+	if (dst)
+		dst_hold(dst);
+	read_unlock(&sk->dst_lock);
+	return dst;
+}
+
+extern __inline__ void
+__sk_dst_set(struct sock *sk, struct dst_entry *dst)
+{
+	struct dst_entry *old_dst;
+
+	old_dst = sk->dst_cache;
+	sk->dst_cache = dst;
+	dst_release(old_dst);
+}
+
+extern __inline__ void
+sk_dst_set(struct sock *sk, struct dst_entry *dst)
+{
+	write_lock(&sk->dst_lock);
+	__sk_dst_set(sk, dst);
+	write_unlock(&sk->dst_lock);
+}
+
+extern __inline__ void
+__sk_dst_reset(struct sock *sk)
+{
+	struct dst_entry *old_dst;
+
+	old_dst = sk->dst_cache;
+	sk->dst_cache = NULL;
+	dst_release(old_dst);
+}
+
+extern __inline__ void
+sk_dst_reset(struct sock *sk)
+{
+	write_lock(&sk->dst_lock);
+	__sk_dst_reset(sk);
+	write_unlock(&sk->dst_lock);
+}
+
+extern __inline__ struct dst_entry *
+__sk_dst_check(struct sock *sk, u32 cookie)
+{
+	struct dst_entry *dst = sk->dst_cache;
+
+	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+		sk->dst_cache = NULL;
+		return NULL;
+	}
+
+	return dst;
+}
+
+extern __inline__ struct dst_entry *
+sk_dst_check(struct sock *sk, u32 cookie)
+{
+	struct dst_entry *dst = sk_dst_get(sk);
+
+	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+		sk_dst_reset(sk);
+		return NULL;
+	}
+
+	return dst;
+}
+
+
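
The __-prefixed dst helpers are the unlocked fast path for callers who already hold the socket; the plain variants take sk->dst_lock. A sketch of a transmit path revalidating its cached route (passing cookie 0 is an assumption here; real callers supply a protocol-specific value):

	/* Illustrative only: revalidate the cached route before sending. */
	struct dst_entry *dst = sk_dst_check(sk, 0);

	if (dst == NULL) {
		/* The cache was empty or obsolete: redo the route lookup
		 * here and publish the result with sk_dst_set(sk, dst).
		 */
	}

Note that a successful sk_dst_check goes through sk_dst_get and therefore returns a held route; the caller drops it with dst_release() when done.
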
+/*
  * 	Queue a received datagram if it will fit. Stream and sequenced
  *	protocols can't normally use this as they need to fit buffers in
  *	and play with them.
@@ -825,6 +960,7 @@
 
 extern __inline__ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 {
+	sock_hold(sk);
 	skb->sk = sk;
 	skb->destructor = sock_wfree;
 	atomic_add(skb->truesize, &sk->wmem_alloc);
@@ -837,12 +973,16 @@
 	atomic_add(skb->truesize, &sk->rmem_alloc);
 }
 
+extern __inline__ void skb_set_owner_c(struct sk_buff *skb, struct sock *sk)
+{
+	sock_hold(sk);
+	skb->sk = sk;
+	skb->destructor = sock_cfree;
+}
+
 
 extern __inline__ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
-#ifdef CONFIG_FILTER
-	struct sk_filter *filter;
-#endif
 	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
 	   number of warnings when compiling with -W --ANK
 	 */
@@ -850,8 +990,21 @@
                 return -ENOMEM;
 
 #ifdef CONFIG_FILTER
-	if ((filter = sk->filter) != NULL && sk_filter(skb, filter))
-		return -EPERM;	/* Toss packet */
+	if (sk->filter) {
+		int err = 0;
+		struct sk_filter *filter;
+
+		/* It would deadlock if sock_queue_rcv_skb were
+		   called with the socket lock held! We assume that
+		   callers of this function are lock free.
+		 */
+		bh_lock_sock(sk);
+		if ((filter = sk->filter) != NULL && sk_filter(skb, filter))
+			err = -EPERM;
+		bh_unlock_sock(sk);
+		if (err)
+			return err;	/* Toss packet */
+	}
 #endif /* CONFIG_FILTER */
 
 	skb_set_owner_r(skb, sk);
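
Because the filter is now run under bh_lock_sock, sock_queue_rcv_skb may only be called without the socket lock held, as the comment above warns. A sketch of a lock-free datagram delivery path (my_deliver is an illustrative name):

	static int my_deliver(struct sock *sk, struct sk_buff *skb)
	{
		if (sock_queue_rcv_skb(sk, skb) < 0) {
			kfree_skb(skb);	/* rcvbuf full or filter returned -EPERM */
			return -1;
		}
		return 0;
	}
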
@@ -906,26 +1059,17 @@
 	return sock_wspace(sk) >= MIN_WRITE_SPACE;
 }
 
-/* 
- *	Declarations from timer.c 
- */
- 
-extern struct sock *timer_base;
-
-extern void net_delete_timer (struct sock *);
-extern void net_reset_timer (struct sock *, int, unsigned long);
-extern void net_timer (unsigned long);
-
 extern __inline__ int gfp_any(void)
 {
 	return in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
 }
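
gfp_any() lets helpers shared between process and interrupt context pick the right allocation priority, e.g. (the alloc_skb call is shown for illustration):

	skb = alloc_skb(size, gfp_any());	/* GFP_ATOMIC in IRQ/BH, GFP_KERNEL otherwise */
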
 
+
 /* 
  *	Enable debug/info messages 
  */
 
-#if 1
+#if 0
 #define NETDEBUG(x)	do { } while (0)
 #else
 #define NETDEBUG(x)	do { x; } while (0)
@@ -951,6 +1095,5 @@
 				remove_wait_queue((sk)->sleep, &wait); \
 				lock_sock(sk); \
 				}
-
 
 #endif	/* _SOCK_H */
