4 * An implementation of the DCCP protocol
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
12 #include <linux/config.h>
13 #include <linux/dccp.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/sched.h>
17 #include <linux/kernel.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
21 #include <linux/if_arp.h>
22 #include <linux/init.h>
23 #include <linux/random.h>
24 #include <net/checksum.h>
26 #include <net/inet_common.h>
28 #include <net/protocol.h>
32 #include <asm/semaphore.h>
33 #include <linux/spinlock.h>
34 #include <linux/timer.h>
35 #include <linux/delay.h>
36 #include <linux/poll.h>
37 #include <linux/dccp.h>
/* Per-CPU SNMP MIB counters for DCCP (see struct dccp_mib). */
42 DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics);

/* Count of sockets closed by their owner but not yet destroyed. */
44 atomic_t dccp_orphan_count = ATOMIC_INIT(0);

/* IPv4 hooks: receive path entry point and ICMP error handler.
 * NOTE(review): initializer appears truncated in this view (no visible
 * closing brace / .no_policy field). */
46 static struct net_protocol dccp_protocol = {
47 .handler = dccp_v4_rcv,
48 .err_handler = dccp_v4_err,
/*
 * dccp_packet_name - map a DCCP packet type (DCCP_PKT_*) to a printable name.
 * @type: packet type from the DCCP header.
 *
 * Returns a pointer to a static string.  Types >= DCCP_NR_PKT_TYPES are
 * rejected by the bound check below.  NOTE(review): the source is truncated
 * here -- the return for the out-of-range case is not visible in this view.
 */
51 const char *dccp_packet_name(const int type)
53 static const char *dccp_packet_names[] = {
54 [DCCP_PKT_REQUEST] = "REQUEST",
55 [DCCP_PKT_RESPONSE] = "RESPONSE",
56 [DCCP_PKT_DATA] = "DATA",
57 [DCCP_PKT_ACK] = "ACK",
58 [DCCP_PKT_DATAACK] = "DATAACK",
59 [DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
60 [DCCP_PKT_CLOSE] = "CLOSE",
61 [DCCP_PKT_RESET] = "RESET",
62 [DCCP_PKT_SYNC] = "SYNC",
63 [DCCP_PKT_SYNCACK] = "SYNCACK",
66 if (type >= DCCP_NR_PKT_TYPES)
69 return dccp_packet_names[type];
72 EXPORT_SYMBOL_GPL(dccp_packet_name);
/*
 * dccp_state_name - map a DCCP socket state to a printable name.
 * @state: DCCP_* socket state value.
 *
 * Out-of-range states return "INVALID STATE!".  NOTE(review): source is
 * truncated -- the DCCP_OPEN entry and some braces are not visible here.
 */
74 const char *dccp_state_name(const int state)
76 static char *dccp_state_names[] = {
78 [DCCP_REQUESTING] = "REQUESTING",
79 [DCCP_PARTOPEN] = "PARTOPEN",
80 [DCCP_LISTEN] = "LISTEN",
81 [DCCP_RESPOND] = "RESPOND",
82 [DCCP_CLOSING] = "CLOSING",
83 [DCCP_TIME_WAIT] = "TIME_WAIT",
84 [DCCP_CLOSED] = "CLOSED",
87 if (state >= DCCP_MAX_STATES)
88 return "INVALID STATE!";
90 return dccp_state_names[state];
93 EXPORT_SYMBOL_GPL(dccp_state_name);
/* Put the socket in the listening role and start the connection-request
 * queue via the generic inet_connection_sock helper (TCP_SYNQ_HSIZE
 * hash buckets, reused from TCP). */
95 static inline int dccp_listen_start(struct sock *sk)
97 dccp_sk(sk)->dccps_role = DCCP_ROLE_LISTEN;
98 return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
/*
 * dccp_disconnect - abort the connection and reset the socket to CLOSED.
 * @sk:    socket being disconnected
 * @flags: disconnect flags (not consulted in the visible code)
 *
 * NOTE(review): source is truncated -- the function's return statement
 * and closing brace are not visible in this view.
 */
101 int dccp_disconnect(struct sock *sk, int flags)
103 struct inet_connection_sock *icsk = inet_csk(sk);
104 struct inet_sock *inet = inet_sk(sk);
106 const int old_state = sk->sk_state;
108 if (old_state != DCCP_CLOSED)
109 dccp_set_state(sk, DCCP_CLOSED);
111 /* ABORT function of RFC793 */
112 if (old_state == DCCP_LISTEN) {
113 inet_csk_listen_stop(sk);
114 /* FIXME: do the active reset thing */
115 } else if (old_state == DCCP_REQUESTING)
116 sk->sk_err = ECONNRESET;
/* Stop timers and drop any queued receive/send data. */
118 dccp_clear_xmit_timers(sk);
119 __skb_queue_purge(&sk->sk_receive_queue);
120 if (sk->sk_send_head != NULL) {
121 __kfree_skb(sk->sk_send_head);
122 sk->sk_send_head = NULL;
/* Forget the source address unless the user explicitly bound one. */
127 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
128 inet_reset_saddr(sk);
131 sock_reset_flag(sk, SOCK_DONE);
133 icsk->icsk_backoff = 0;
134 inet_csk_delack_init(sk);
/* A socket bound to a local port must still hold its bind bucket. */
137 BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
139 sk->sk_error_report(sk);
144 * Wait for a DCCP event.
146 * Note that we don't need to lock the socket, as the upper poll layers
147 * take care of normal races (between the test and the event) and we don't
148 * go look at any of the socket buffers directly.
/* NOTE(review): source is truncated -- the declaration of `mask`, the
 * full parameter list and the final `return mask;` are not visible. */
150 static unsigned int dccp_poll(struct file *file, struct socket *sock,
154 struct sock *sk = sock->sk;
156 poll_wait(file, sk->sk_sleep, wait);
157 if (sk->sk_state == DCCP_LISTEN)
158 return inet_csk_listen_poll(sk);
160 /* Socket is not locked. We are protected from async events
161 by poll logic and correct handling of state changes
162 made by other threads is impossible in any case.
/* Fully shut down or closed: readable so reads can observe EOF/error. */
169 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
171 if (sk->sk_shutdown & RCV_SHUTDOWN)
172 mask |= POLLIN | POLLRDNORM;
/* Any state other than REQUESTING/RESPOND: check read/write readiness. */
175 if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
176 if (atomic_read(&sk->sk_rmem_alloc) > 0)
177 mask |= POLLIN | POLLRDNORM;
179 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
180 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
181 mask |= POLLOUT | POLLWRNORM;
182 } else { /* send SIGIO later */
183 set_bit(SOCK_ASYNC_NOSPACE,
184 &sk->sk_socket->flags);
185 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
187 /* Race breaker. If space is freed after
188 * wspace test but before the flags are set,
189 * IO signal will be lost.
191 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
192 mask |= POLLOUT | POLLWRNORM;
/* ioctl handler -- visibly a stub that only logs entry.
 * NOTE(review): body/return truncated in this view. */
199 int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
201 dccp_pr_debug("entry\n");
/* setsockopt handler: non-SOL_DCCP levels are delegated to the generic
 * IP layer; DCCP-level options are handled below (truncated in this view). */
205 int dccp_setsockopt(struct sock *sk, int level, int optname,
206 char __user *optval, int optlen)
208 dccp_pr_debug("entry\n");
210 if (level != SOL_DCCP)
211 return ip_setsockopt(sk, level, optname, optval, optlen);
/* getsockopt handler: mirrors dccp_setsockopt -- non-SOL_DCCP levels go
 * to ip_getsockopt; the DCCP-specific tail is truncated in this view. */
216 int dccp_getsockopt(struct sock *sk, int level, int optname,
217 char __user *optval, int __user *optlen)
219 dccp_pr_debug("entry\n");
221 if (level != SOL_DCCP)
222 return ip_getsockopt(sk, level, optname, optval, optlen);
/*
 * dccp_sendmsg - queue one user datagram for transmission.
 *
 * Each sendmsg produces exactly one packet (no coalescing -- DCCP
 * preserves packet boundaries).  Oversized messages (> current MSS
 * cache) are rejected.  NOTE(review): source is truncated -- the
 * declarations of rc/size/skb/timeo, error labels and the return are
 * not visible in this view.
 */
227 int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
230 const struct dccp_sock *dp = dccp_sk(sk);
231 const int flags = msg->msg_flags;
232 const int noblock = flags & MSG_DONTWAIT;
/* Refuse messages that don't fit in a single packet. */
237 if (len > dp->dccps_mss_cache)
241 timeo = sock_sndtimeo(sk, noblock);
244 * We have to use sk_stream_wait_connect here to set sk_write_pending,
245 * so that the trick in dccp_rcv_request_sent_state_process.
247 /* Wait for a connection to finish. */
248 if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING))
249 if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
/* Allocate one skb large enough for headers + payload. */
252 size = sk->sk_prot->max_header + len;
254 skb = sock_alloc_send_skb(sk, size, noblock, &rc);
259 skb_reserve(skb, sk->sk_prot->max_header);
260 rc = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
264 rc = dccp_write_xmit(sk, skb, len);
266 * XXX we don't use sk_write_queue, so just discard the packet.
267 * Current plan however is to _use_ sk_write_queue with
268 * an algorithm similar to tcp_sendmsg, where the main difference
269 * is that in DCCP we have to respect packet boundaries, so
270 * no coalescing of skbs.
272 * This bug was _quickly_ found & fixed by just looking at an OSTRA
273 * generated callgraph 8) -acme
/*
 * dccp_recvmsg - receive one queued packet into the user's iovec.
 *
 * Only DATA/DATAACK packets carry user payload; RESET/CLOSE act as a
 * "fin" and terminate the read loop.  NOTE(review): source is heavily
 * truncated -- lock/unlock, several labels (verify_sock_status, found_*)
 * and the dequeue/return tail are not visible in this view.
 */
285 int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
286 size_t len, int nonblock, int flags, int *addr_len)
288 const struct dccp_hdr *dh;
/* recvmsg on a listening socket is not meaningful. */
293 if (sk->sk_state == DCCP_LISTEN) {
298 timeo = sock_rcvtimeo(sk, nonblock);
/* Peek (don't dequeue yet) at the head of the receive queue. */
301 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
304 goto verify_sock_status;
/* Data-bearing packet types deliver payload to the user. */
308 if (dh->dccph_type == DCCP_PKT_DATA ||
309 dh->dccph_type == DCCP_PKT_DATAACK)
/* RESET/CLOSE terminate the connection from the reader's viewpoint. */
312 if (dh->dccph_type == DCCP_PKT_RESET ||
313 dh->dccph_type == DCCP_PKT_CLOSE) {
314 dccp_pr_debug("found fin ok!\n");
318 dccp_pr_debug("packet_type=%s\n",
319 dccp_packet_name(dh->dccph_type));
322 if (sock_flag(sk, SOCK_DONE)) {
328 len = sock_error(sk);
332 if (sk->sk_shutdown & RCV_SHUTDOWN) {
337 if (sk->sk_state == DCCP_CLOSED) {
338 if (!sock_flag(sk, SOCK_DONE)) {
339 /* This occurs when user tries to read
340 * from never connected socket.
/* Interrupted by a signal: translate to EINTR/ERESTARTSYS. */
354 if (signal_pending(current)) {
355 len = sock_intr_errno(timeo);
/* Block until data arrives or the timeout expires. */
359 sk_wait_data(sk, &timeo);
364 else if (len < skb->len)
365 msg->msg_flags |= MSG_TRUNC;
367 if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
368 /* Exception. Bailout! */
/* MSG_PEEK leaves the packet on the queue for the next read. */
373 if (!(flags & MSG_PEEK))
/*
 * inet_dccp_listen - socket-layer listen() entry for DCCP sockets.
 *
 * Valid only for unconnected SOCK_DCCP sockets in CLOSED or LISTEN
 * state; when already listening, only the backlog is adjusted.
 * NOTE(review): lock_sock/release_sock and the return path are
 * truncated in this view.
 */
382 static int inet_dccp_listen(struct socket *sock, int backlog)
384 struct sock *sk = sock->sk;
385 unsigned char old_state;
391 if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
394 old_state = sk->sk_state;
395 if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
398 /* Really, if the socket is already in listen state
399 * we can only allow the backlog to be adjusted.
401 if (old_state != DCCP_LISTEN) {
403 * FIXME: here it probably should be sk->sk_prot->listen_start
404 * see tcp_listen_start
406 err = dccp_listen_start(sk);
410 sk->sk_max_ack_backlog = backlog;
/* State-transition table for close(): each entry encodes the next state
 * (low bits, DCCP_STATE_MASK) plus an optional DCCP_ACTION_FIN flag
 * telling dccp_close() to send a CLOSE/CLOSEREQ. */
418 static const unsigned char dccp_new_state[] = {
419 /* current state: new state: action: */
421 [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
422 [DCCP_REQUESTING] = DCCP_CLOSED,
423 [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
424 [DCCP_LISTEN] = DCCP_CLOSED,
425 [DCCP_RESPOND] = DCCP_CLOSED,
426 [DCCP_CLOSING] = DCCP_CLOSED,
427 [DCCP_TIME_WAIT] = DCCP_CLOSED,
428 [DCCP_CLOSED] = DCCP_CLOSED,
/* Advance the socket along dccp_new_state[] for close().  Returns
 * non-zero (DCCP_ACTION_FIN) when the caller must send a CLOSE packet. */
431 static int dccp_close_state(struct sock *sk)
433 const int next = dccp_new_state[sk->sk_state];
434 const int ns = next & DCCP_STATE_MASK;
436 if (ns != sk->sk_state)
437 dccp_set_state(sk, ns);
439 return next & DCCP_ACTION_FIN;
/*
 * dccp_close - active close of a DCCP socket (called from sys_close path).
 * @sk:      socket to close
 * @timeout: linger timeout passed to sk_stream_wait_close()
 *
 * NOTE(review): source is truncated -- lock_sock/release_sock, the
 * adjudge_to_death label, bh locking and the final unlock/sock_put are
 * not all visible in this view.
 */
442 void dccp_close(struct sock *sk, long timeout)
448 sk->sk_shutdown = SHUTDOWN_MASK;
/* Listening socket: no connection to tear down, just stop listening. */
450 if (sk->sk_state == DCCP_LISTEN) {
451 dccp_set_state(sk, DCCP_CLOSED);
454 inet_csk_listen_stop(sk);
456 goto adjudge_to_death;
460 * We need to flush the recv. buffs. We do this only on the
461 * descriptor close, not protocol-sourced closes, because the
462 * reader process may not have drained the data yet!
464 /* FIXME: check for unread data */
465 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
/* Zero-linger: hard disconnect; otherwise follow the close state table. */
469 if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
470 /* Check zero linger _after_ checking for unread data. */
471 sk->sk_prot->disconnect(sk, 0);
472 } else if (dccp_close_state(sk)) {
473 dccp_send_close(sk, 1);
476 sk_stream_wait_close(sk, timeout);
480 * It is the last release_sock in its life. It will remove backlog.
484 * Now socket is owned by kernel and we acquire BH lock
485 * to finish close. No need to check for user refs.
489 BUG_TRAP(!sock_owned_by_user(sk));
495 * The last release_sock may have processed the CLOSE or RESET
496 * packet moving sock to CLOSED state, if not we have to fire
497 * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination"
498 * in draft-ietf-dccp-spec-11. -acme
500 if (sk->sk_state == DCCP_CLOSING) {
501 /* FIXME: should start at 2 * RTT */
502 /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */
503 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
504 inet_csk(sk)->icsk_rto,
507 /* Yeah, we should use sk->sk_prot->orphan_count, etc */
508 dccp_set_state(sk, DCCP_CLOSED);
/* Socket is now an orphan; destroy immediately if already CLOSED. */
512 atomic_inc(sk->sk_prot->orphan_count);
513 if (sk->sk_state == DCCP_CLOSED)
514 inet_csk_destroy_sock(sk);
516 /* Otherwise, socket is reprieved until protocol close. */
/* shutdown() handler -- visibly a stub that only logs entry.
 * NOTE(review): body truncated in this view. */
523 void dccp_shutdown(struct sock *sk, int how)
525 dccp_pr_debug("entry\n");
/* Socket-layer operations table for AF_INET/SOCK_DCCP sockets.  Mostly
 * generic inet/sock_common helpers; listen is DCCP-specific.
 * NOTE(review): initializer appears truncated (family/bind/poll/ioctl
 * entries and the closing brace are not visible). */
528 static struct proto_ops inet_dccp_ops = {
530 .owner = THIS_MODULE,
531 .release = inet_release,
533 .connect = inet_stream_connect,
534 .socketpair = sock_no_socketpair,
535 .accept = inet_accept,
536 .getname = inet_getname,
537 /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
540 /* FIXME: work on inet_listen to rename it to sock_common_listen */
541 .listen = inet_dccp_listen,
542 .shutdown = inet_shutdown,
543 .setsockopt = sock_common_setsockopt,
544 .getsockopt = sock_common_getsockopt,
545 .sendmsg = inet_sendmsg,
546 .recvmsg = sock_common_recvmsg,
547 .mmap = sock_no_mmap,
548 .sendpage = sock_no_sendpage,

/* Defined in af_inet.c; needed to register the DCCP protosw below. */
551 extern struct net_proto_family inet_family_ops;

/* Registration record tying IPPROTO_DCCP to dccp_v4_prot/inet_dccp_ops. */
553 static struct inet_protosw dccp_v4_protosw = {
555 .protocol = IPPROTO_DCCP,
556 .prot = &dccp_v4_prot,
557 .ops = &inet_dccp_ops,
564 * This is the global socket data structure used for responding to
565 * the Out-of-the-blue (OOTB) packets. A control sock will be created
566 * for this socket at the initialization time.
568 struct socket *dccp_ctl_socket;

570 static char dccp_ctl_socket_err_msg[] __initdata =
571 KERN_ERR "DCCP: Failed to create the control socket.\n";

/* Create the kernel-internal control socket used to emit OOTB replies
 * (e.g. Resets).  It is unhashed so the input path never delivers to it.
 * NOTE(review): error-path lines and return truncated in this view. */
573 static int __init dccp_ctl_sock_init(void)
575 int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP,
578 printk(dccp_ctl_socket_err_msg);
/* GFP_ATOMIC: replies may be generated from softirq context. */
580 dccp_ctl_socket->sk->sk_allocation = GFP_ATOMIC;
581 inet_sk(dccp_ctl_socket->sk)->uc_ttl = -1;
583 /* Unhash it so that IP input processing does not even
584 * see it, we do not wish this socket to see incoming
587 dccp_ctl_socket->sk->sk_prot->unhash(dccp_ctl_socket->sk);
593 #ifdef CONFIG_IP_DCCP_UNLOAD_HACK
/* Release the control socket on module unload (only compiled in when the
 * unload hack is configured). */
594 void dccp_ctl_sock_exit(void)
596 if (dccp_ctl_socket != NULL) {
597 sock_release(dccp_ctl_socket);
598 dccp_ctl_socket = NULL;
602 EXPORT_SYMBOL_GPL(dccp_ctl_sock_exit);
/* Allocate both per-CPU halves of the DCCP SNMP statistics; on failure
 * of the second allocation, the first is freed (goto cleanup, labels
 * truncated in this view). */
605 static int __init init_dccp_v4_mibs(void)
609 dccp_statistics[0] = alloc_percpu(struct dccp_mib);
610 if (dccp_statistics[0] == NULL)
613 dccp_statistics[1] = alloc_percpu(struct dccp_mib);
614 if (dccp_statistics[1] == NULL)
621 free_percpu(dccp_statistics[0]);
622 dccp_statistics[0] = NULL;
/* Module parameter: requested size of the established (ehash) table;
 * 0 means auto-size from available memory in dccp_init(). */
627 static int thash_entries;
628 module_param(thash_entries, int, 0444);
629 MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");

631 #ifdef CONFIG_IP_DCCP_DEBUG
/* Module parameter gating dccp_pr_debug() output. */
633 module_param(dccp_debug, int, 0444);
634 MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
/*
 * dccp_init - module init: register the protocol and size/allocate the
 * established (ehash) and bind (bhash) hash tables, sized from physical
 * memory like the TCP hash tables.  Error paths unwind in reverse order
 * via the goto labels at the bottom.
 * NOTE(review): source is truncated -- several lines (goal declaration,
 * do-loop openers, success return) are not visible in this view.
 */
637 static int __init dccp_init(void)
640 int ehash_order, bhash_order, i;
641 int rc = proto_register(&dccp_v4_prot, 1);
646 dccp_hashinfo.bind_bucket_cachep =
647 kmem_cache_create("dccp_bind_bucket",
648 sizeof(struct inet_bind_bucket), 0,
649 SLAB_HWCACHE_ALIGN, NULL, NULL);
650 if (!dccp_hashinfo.bind_bucket_cachep)
651 goto out_proto_unregister;
654 * Size and allocate the main established and bind bucket
657 * The methodology is similar to that of the buffer cache.
/* Auto-size goal from physical memory unless thash_entries overrides. */
659 if (num_physpages >= (128 * 1024))
660 goal = num_physpages >> (21 - PAGE_SHIFT);
662 goal = num_physpages >> (23 - PAGE_SHIFT);
665 goal = (thash_entries *
666 sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
667 for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
/* Round the ehash size down to a power of two; retry with smaller
 * orders if the page allocation fails. */
670 dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
671 sizeof(struct inet_ehash_bucket);
672 dccp_hashinfo.ehash_size >>= 1;
673 while (dccp_hashinfo.ehash_size &
674 (dccp_hashinfo.ehash_size - 1))
675 dccp_hashinfo.ehash_size--;
676 dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
677 __get_free_pages(GFP_ATOMIC, ehash_order);
678 } while (!dccp_hashinfo.ehash && --ehash_order > 0);
680 if (!dccp_hashinfo.ehash) {
681 printk(KERN_CRIT "Failed to allocate DCCP "
682 "established hash table\n");
683 goto out_free_bind_bucket_cachep;
/* << 1: the second half of ehash holds TIME_WAIT buckets. */
686 for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
687 rwlock_init(&dccp_hashinfo.ehash[i].lock);
688 INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
/* Bind hash starts at the same order as ehash, shrinking on failure. */
691 bhash_order = ehash_order;
694 dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
695 sizeof(struct inet_bind_hashbucket);
696 if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
699 dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
700 __get_free_pages(GFP_ATOMIC, bhash_order);
701 } while (!dccp_hashinfo.bhash && --bhash_order >= 0);
703 if (!dccp_hashinfo.bhash) {
704 printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n");
705 goto out_free_dccp_ehash;
708 for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
709 spin_lock_init(&dccp_hashinfo.bhash[i].lock);
710 INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
/* Remaining setup: MIBs, IP protocol hook, protosw, control socket. */
713 if (init_dccp_v4_mibs())
714 goto out_free_dccp_bhash;
717 if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP))
718 goto out_free_dccp_v4_mibs;
720 inet_register_protosw(&dccp_v4_protosw);
722 rc = dccp_ctl_sock_init();
724 goto out_unregister_protosw;
/* Error unwinding, strictly in reverse order of acquisition. */
727 out_unregister_protosw:
728 inet_unregister_protosw(&dccp_v4_protosw);
729 inet_del_protocol(&dccp_protocol, IPPROTO_DCCP);
730 out_free_dccp_v4_mibs:
731 free_percpu(dccp_statistics[0]);
732 free_percpu(dccp_statistics[1]);
733 dccp_statistics[0] = dccp_statistics[1] = NULL;
735 free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
736 dccp_hashinfo.bhash = NULL;
738 free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
739 dccp_hashinfo.ehash = NULL;
740 out_free_bind_bucket_cachep:
741 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
742 dccp_hashinfo.bind_bucket_cachep = NULL;
743 out_proto_unregister:
744 proto_unregister(&dccp_v4_prot);
748 static const char dccp_del_proto_err_msg[] __exitdata =
749 KERN_ERR "can't remove dccp net_protocol\n";

/* Module exit: unregister everything dccp_init() set up and free the
 * hash tables (orders recomputed from the stored table sizes). */
751 static void __exit dccp_fini(void)
753 inet_unregister_protosw(&dccp_v4_protosw);
755 if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0)
756 printk(dccp_del_proto_err_msg);
758 free_percpu(dccp_statistics[0]);
759 free_percpu(dccp_statistics[1]);
760 free_pages((unsigned long)dccp_hashinfo.bhash,
761 get_order(dccp_hashinfo.bhash_size *
762 sizeof(struct inet_bind_hashbucket)));
763 free_pages((unsigned long)dccp_hashinfo.ehash,
764 get_order(dccp_hashinfo.ehash_size *
765 sizeof(struct inet_ehash_bucket)));
766 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
767 proto_unregister(&dccp_v4_prot);
770 module_init(dccp_init);
771 module_exit(dccp_fini);

774 * __stringify doesn't like enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
775 * values directly.  Also cover the case where the protocol is not specified,
776 * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
778 MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6");
779 MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6");
780 MODULE_LICENSE("GPL");
782 MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");