/*
 * Pluggable TCP congestion control support and NewReno
 * congestion control.
 * Based on ideas from I/O scheduler support and Web100.
 *
 * Copyright (C) 2005 Stephen Hemminger <[email protected]>
 */

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <net/tcp.h>

static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);

/* Simple linear search, don't expect many entries! */
static struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
	struct tcp_congestion_ops *e;

	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
		if (strcmp(e->name, name) == 0)
			return e;
	}

	return NULL;
}

/*
 * Attach new congestion control algorithm to the list
 * of available options.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
	int ret = 0;

	/* all algorithms must implement ssthresh and cong_avoid ops */
	if (!ca->ssthresh || !ca->cong_avoid) {
		printk(KERN_ERR "TCP %s does not implement required ops\n",
		       ca->name);
		return -EINVAL;
	}

	spin_lock(&tcp_cong_list_lock);
	if (tcp_ca_find(ca->name)) {
		printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
		ret = -EEXIST;
	} else {
		list_add_tail_rcu(&ca->list, &tcp_cong_list);
		printk(KERN_INFO "TCP %s registered\n", ca->name);
	}
	spin_unlock(&tcp_cong_list_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);

/*
 * Remove congestion control algorithm, called from
 * the module's remove function.  Module ref counts are used
 * to ensure that this can't be done until all sockets using
 * that method are closed.
 */
void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
{
	spin_lock(&tcp_cong_list_lock);
	list_del_rcu(&ca->list);
	spin_unlock(&tcp_cong_list_lock);
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
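
/*
 * Example (illustration only, not part of this file): a minimal
 * congestion control module built on the two hooks above.  The module
 * name "tcp_example" and the reuse of the exported reno helpers are
 * assumptions for this sketch, not kernel code.
 *
 *	static struct tcp_congestion_ops tcp_example = {
 *		.name		= "example",
 *		.owner		= THIS_MODULE,
 *		.ssthresh	= tcp_reno_ssthresh,
 *		.cong_avoid	= tcp_reno_cong_avoid,
 *		.min_cwnd	= tcp_reno_min_cwnd,
 *	};
 *
 *	static int __init tcp_example_init(void)
 *	{
 *		return tcp_register_congestion_control(&tcp_example);
 *	}
 *
 *	static void __exit tcp_example_exit(void)
 *	{
 *		tcp_unregister_congestion_control(&tcp_example);
 *	}
 *
 *	module_init(tcp_example_init);
 *	module_exit(tcp_example_exit);
 *	MODULE_LICENSE("GPL");
 */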

/* Assign choice of congestion control. */
void tcp_init_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_congestion_ops *ca;

	if (icsk->icsk_ca_ops != &tcp_init_congestion_ops)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		if (try_module_get(ca->owner)) {
			icsk->icsk_ca_ops = ca;
			break;
		}
	}
	rcu_read_unlock();

	if (icsk->icsk_ca_ops->init)
		icsk->icsk_ca_ops->init(sk);
}

/* Manage refcounts on socket close. */
void tcp_cleanup_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ca_ops->release)
		icsk->icsk_ca_ops->release(sk);
	module_put(icsk->icsk_ca_ops->owner);
}

/* Used by sysctl to change default congestion control */
int tcp_set_default_congestion_control(const char *name)
{
	struct tcp_congestion_ops *ca;
	int ret = -ENOENT;

	spin_lock(&tcp_cong_list_lock);
	ca = tcp_ca_find(name);
#ifdef CONFIG_KMOD
	if (!ca) {
		spin_unlock(&tcp_cong_list_lock);

		request_module("tcp_%s", name);
		spin_lock(&tcp_cong_list_lock);
		ca = tcp_ca_find(name);
	}
#endif

	if (ca) {
		list_move(&ca->list, &tcp_cong_list);
		ret = 0;
	}
	spin_unlock(&tcp_cong_list_lock);

	return ret;
}
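
/*
 * Example (illustration only): the hook above is driven by the
 * net.ipv4.tcp_congestion_control sysctl, so the system default can be
 * changed at runtime through /proc/sys/net/ipv4/tcp_congestion_control.
 * Moving the chosen ops to the head of tcp_cong_list is what makes it
 * the default: tcp_init_congestion_control() takes the first entry on
 * the list that it can get a module reference on.
 */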

/* Get current default congestion control */
void tcp_get_default_congestion_control(char *name)
{
	struct tcp_congestion_ops *ca;

	/* We will always have reno... */
	BUG_ON(list_empty(&tcp_cong_list));

	rcu_read_lock();
	ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
	strncpy(name, ca->name, TCP_CA_NAME_MAX);
	rcu_read_unlock();
}

/* Change congestion control for socket */
int tcp_set_congestion_control(struct sock *sk, const char *name)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_congestion_ops *ca;
	int err = 0;

	rcu_read_lock();
	ca = tcp_ca_find(name);
	if (ca == icsk->icsk_ca_ops)
		goto out;

	if (!ca)
		err = -ENOENT;
	else if (!try_module_get(ca->owner))
		err = -EBUSY;
	else {
		tcp_cleanup_congestion_control(sk);
		icsk->icsk_ca_ops = ca;
		if (icsk->icsk_ca_ops->init)
			icsk->icsk_ca_ops->init(sk);
	}
 out:
	rcu_read_unlock();
	return err;
}
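
/*
 * Example (illustration only, not part of this file): userspace reaches
 * the per-socket hook above through the TCP_CONGESTION socket option.
 * A sketch, assuming a libc that exposes TCP_CONGESTION in
 * <netinet/tcp.h>:
 *
 *	#include <string.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <sys/socket.h>
 *
 *	static int set_cc(int fd, const char *name)
 *	{
 *		return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
 *				  name, strlen(name));
 *	}
 */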

/*
 * Linear increase during slow start
 */
void tcp_slow_start(struct tcp_sock *tp)
{
	if (sysctl_tcp_abc) {
		/* RFC3465: Slow Start
		 * TCP sender SHOULD increase cwnd by the number of
		 * previously unacknowledged bytes ACKed by each incoming
		 * acknowledgment, provided the increase is not more than L
		 */
		if (tp->bytes_acked < tp->mss_cache)
			return;

		/* We MAY increase by 2 if discovered delayed ack */
		if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) {
			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
				tp->snd_cwnd++;
		}
	}
	tp->bytes_acked = 0;

	if (tp->snd_cwnd < tp->snd_cwnd_clamp)
		tp->snd_cwnd++;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);

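/*
 * Worked example (not in the original source): with sysctl_tcp_abc set
 * to 1, an ACK must cover at least one full MSS of new data before
 * snd_cwnd grows by 1; with sysctl_tcp_abc > 1, an ACK covering two or
 * more MSS grows snd_cwnd by 2, compensating for delayed ACKs.  With
 * ABC off, every call grows snd_cwnd by 1, always subject to
 * snd_cwnd_clamp.
 */
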
/*
 * TCP Reno congestion control
 * This is a special case, used as the fallback as well.
 */
/* This is Jacobson's slow start and congestion avoidance.
 * SIGCOMM '88, p. 328.
 */
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
			 int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_is_cwnd_limited(sk, in_flight))
		return;

219 | /* In "safe" area, increase. */ |
220 | if (tp->snd_cwnd <= tp->snd_ssthresh) | |
221 | tcp_slow_start(tp); | |
9772efb9 SH |

	/* In dangerous area, increase slowly. */
	else if (sysctl_tcp_abc) {
		/* RFC3465: Appropriate Byte Count
		 * increase once for each full cwnd acked
		 */
		if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
			tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
				tp->snd_cwnd++;
		}
	} else {
		/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
		if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
				tp->snd_cwnd++;
			tp->snd_cwnd_cnt = 0;
		} else
			tp->snd_cwnd_cnt++;
	}
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);

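/*
 * Worked example (not in the original source): in congestion avoidance
 * with ABC off and snd_cwnd = 10, snd_cwnd_cnt must count ten ACKs
 * before snd_cwnd becomes 11, i.e. roughly one window of ACKs (about
 * one RTT) per increment.  That is the integer form of the
 * snd_cwnd += 1/snd_cwnd rule noted in the comment above.
 */
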
/* Slow start threshold is half the congestion window (min 2) */
u32 tcp_reno_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	return max(tp->snd_cwnd >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);

/* Lower bound on congestion window with halving. */
u32 tcp_reno_min_cwnd(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	return tp->snd_ssthresh/2;
}
EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);

struct tcp_congestion_ops tcp_reno = {
	.name		= "reno",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.min_cwnd	= tcp_reno_min_cwnd,
};

/* Initial congestion control used (until SYN).
 * Really reno under another name so we can tell the difference
 * during tcp_set_default_congestion_control().
 */
struct tcp_congestion_ops tcp_init_congestion_ops = {
	.name		= "",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.min_cwnd	= tcp_reno_min_cwnd,
};
EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);