Git Repo - linux.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* INET An implementation of the TCP/IP protocol suite for the LINUX
	3	* operating system. INET is implemented using the BSD Socket
	4	* interface as the means of communication with the user level.
	5	*
	6	* Implementation of the Transmission Control Protocol(TCP).
	7	*
	8	* Authors: Ross Biro
	9	* Fred N. van Kempen, <[email protected]>
	10	* Mark Evans, <[email protected]>
	11	* Corey Minyard <[email protected]>
	12	* Florian La Roche, <[email protected]>
	13	* Charles Hedrick, <[email protected]>
	14	* Linus Torvalds, <[email protected]>
	15	* Alan Cox, <[email protected]>
	16	* Matthew Dillon, <[email protected]>
	17	* Arnt Gulbrandsen, <[email protected]>
	18	* Jorge Cwik, <[email protected]>
	19	*/
	20
	21	/*
	22	* Changes: Pedro Roque : Retransmit queue handled by TCP.
	23	* : Fragmentation on mtu decrease
	24	* : Segment collapse on retransmit
	25	* : AF independence
	26	*
	27	* Linus Torvalds : send_delayed_ack
	28	* David S. Miller : Charge memory using the right skb
	29	* during syn/ack processing.
	30	* David S. Miller : Output engine completely rewritten.
	31	* Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
	32	* Cacophonix Gaul : draft-minshall-nagle-01
	33	* J Hadi Salim : ECN support
	34	*
	35	*/
	36
	37	#include <net/tcp.h>
	38
	39	#include <linux/compiler.h>
	40	#include <linux/module.h>
	41
	42	/* People can turn this off for buggy TCP's found in printers etc. */
	43	int sysctl_tcp_retrans_collapse __read_mostly = 1;
	44
	45	/* People can turn this on to work with those rare, broken TCPs that
	46	* interpret the window field as a signed quantity.
	47	*/
	48	int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
	49
	50	/* This limits the percentage of the congestion window which we
	51	* will allow a single TSO frame to consume. Building TSO frames
	52	* which are too large can cause TCP streams to be bursty.
	53	*/
	54	int sysctl_tcp_tso_win_divisor __read_mostly = 3;
	55
	56	int sysctl_tcp_mtu_probing __read_mostly = 0;
	57	int sysctl_tcp_base_mss __read_mostly = 512;
	58
	59	/* By default, RFC2861 behavior. */
	60	int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
	61
	62	static void tcp_event_new_data_sent(struct sock sk, struct sk_buff skb)
	63	{
	64	struct tcp_sock *tp = tcp_sk(sk);
	65	unsigned int prior_packets = tp->packets_out;
	66
	67	tcp_advance_send_head(sk, skb);
	68	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
	69
	70	/* Don't override Nagle indefinately with F-RTO */
	71	if (tp->frto_counter == 2)
	72	tp->frto_counter = 3;
	73
	74	tp->packets_out += tcp_skb_pcount(skb);
	75	if (!prior_packets)
	76	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
	77	inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	78	}
	79
	80	/* SND.NXT, if window was not shrunk.
	81	* If window has been shrunk, what should we make? It is not clear at all.
	82	* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
	83	* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
	84	* invalid. OK, let's make this for now:
	85	*/
	86	static inline __u32 tcp_acceptable_seq(struct sock *sk)
	87	{
	88	struct tcp_sock *tp = tcp_sk(sk);
	89
	90	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
	91	return tp->snd_nxt;
	92	else
	93	return tcp_wnd_end(tp);
	94	}
	95
	96	/* Calculate mss to advertise in SYN segment.
	97	* RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
	98	*
	99	* 1. It is independent of path mtu.
	100	* 2. Ideally, it is maximal possible segment size i.e. 65535-40.
	101	* 3. For IPv4 it is reasonable to calculate it from maximal MTU of
	102	* attached devices, because some buggy hosts are confused by
	103	* large MSS.
	104	* 4. We do not make 3, we advertise MSS, calculated from first
	105	* hop device mtu, but allow to raise it to ip_rt_min_advmss.
	106	* This may be overridden via information stored in routing table.
	107	* 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
	108	* probably even Jumbo".
	109	*/
	110	static __u16 tcp_advertise_mss(struct sock *sk)
	111	{
	112	struct tcp_sock *tp = tcp_sk(sk);
	113	struct dst_entry *dst = __sk_dst_get(sk);
	114	int mss = tp->advmss;
	115
	116	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
	117	mss = dst_metric(dst, RTAX_ADVMSS);
	118	tp->advmss = mss;
	119	}
	120
	121	return (__u16)mss;
	122	}
	123
	124	/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
	125	* This is the first part of cwnd validation mechanism. */
	126	static void tcp_cwnd_restart(struct sock sk, struct dst_entry dst)
	127	{
	128	struct tcp_sock *tp = tcp_sk(sk);
	129	s32 delta = tcp_time_stamp - tp->lsndtime;
	130	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
	131	u32 cwnd = tp->snd_cwnd;
	132
	133	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
	134
	135	tp->snd_ssthresh = tcp_current_ssthresh(sk);
	136	restart_cwnd = min(restart_cwnd, cwnd);
	137
	138	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
	139	cwnd >>= 1;
	140	tp->snd_cwnd = max(cwnd, restart_cwnd);
	141	tp->snd_cwnd_stamp = tcp_time_stamp;
	142	tp->snd_cwnd_used = 0;
	143	}
	144
	145	static void tcp_event_data_sent(struct tcp_sock *tp,
	146	struct sk_buff skb, struct sock sk)
	147	{
	148	struct inet_connection_sock *icsk = inet_csk(sk);
	149	const u32 now = tcp_time_stamp;
	150
	151	if (sysctl_tcp_slow_start_after_idle &&
	152	(!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
	153	tcp_cwnd_restart(sk, __sk_dst_get(sk));
	154
	155	tp->lsndtime = now;
	156
	157	/* If it is a reply for ato after last received
	158	* packet, enter pingpong mode.
	159	*/
	160	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
	161	icsk->icsk_ack.pingpong = 1;
	162	}
	163
	164	static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
	165	{
	166	tcp_dec_quickack_mode(sk, pkts);
	167	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
	168	}
	169
	170	/* Determine a window scaling and initial window to offer.
	171	* Based on the assumption that the given amount of space
	172	* will be offered. Store the results in the tp structure.
	173	* NOTE: for smooth operation initial space offering should
	174	* be a multiple of mss if possible. We assume here that mss >= 1.
	175	* This MUST be enforced by all callers.
	176	*/
	177	void tcp_select_initial_window(int __space, __u32 mss,
	178	__u32 rcv_wnd, __u32 window_clamp,
	179	int wscale_ok, __u8 *rcv_wscale)
	180	{
	181	unsigned int space = (__space < 0 ? 0 : __space);
	182
	183	/* If no clamp set the clamp to the max possible scaled window */
	184	if (*window_clamp == 0)
	185	(*window_clamp) = (65535 << 14);
	186	space = min(*window_clamp, space);
	187
	188	/* Quantize space offering to a multiple of mss if possible. */
	189	if (space > mss)
	190	space = (space / mss) * mss;
	191
	192	/* NOTE: offering an initial window larger than 32767
	193	* will break some buggy TCP stacks. If the admin tells us
	194	* it is likely we could be speaking with such a buggy stack
	195	* we will truncate our initial window offering to 32K-1
	196	* unless the remote has sent us a window scaling option,
	197	* which we interpret as a sign the remote TCP is not
	198	* misinterpreting the window field as a signed quantity.
	199	*/
	200	if (sysctl_tcp_workaround_signed_windows)
	201	(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	202	else
	203	(*rcv_wnd) = space;
	204
	205	(*rcv_wscale) = 0;
	206	if (wscale_ok) {
	207	/* Set window scaling on max possible window
	208	* See RFC1323 for an explanation of the limit to 14
	209	*/
	210	space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
	211	space = min_t(u32, space, *window_clamp);
	212	while (space > 65535 && (*rcv_wscale) < 14) {
	213	space >>= 1;
	214	(*rcv_wscale)++;
	215	}
	216	}
	217
	218	/* Set initial window to value enough for senders,
	219	* following RFC2414. Senders, not following this RFC,
	220	* will be satisfied with 2.
	221	*/
	222	if (mss > (1 << *rcv_wscale)) {
	223	int init_cwnd = 4;
	224	if (mss > 1460 * 3)
	225	init_cwnd = 2;
	226	else if (mss > 1460)
	227	init_cwnd = 3;
	228	if (rcv_wnd > init_cwnd mss)
	229	rcv_wnd = init_cwnd mss;
	230	}
	231
	232	/* Set the clamp no higher than max representable value */
	233	(window_clamp) = min(65535U << (rcv_wscale), *window_clamp);
	234	}
	235
	236	/* Chose a new window to advertise, update state in tcp_sock for the
	237	* socket, and return result with RFC1323 scaling applied. The return
	238	* value can be stuffed directly into th->window for an outgoing
	239	* frame.
	240	*/
	241	static u16 tcp_select_window(struct sock *sk)
	242	{
	243	struct tcp_sock *tp = tcp_sk(sk);
	244	u32 cur_win = tcp_receive_window(tp);
	245	u32 new_win = __tcp_select_window(sk);
	246
	247	/* Never shrink the offered window */
	248	if (new_win < cur_win) {
	249	/* Danger Will Robinson!
	250	* Don't update rcv_wup/rcv_wnd here or else
	251	* we will not be able to advertise a zero
	252	* window in time. --DaveM
	253	*
	254	* Relax Will Robinson.
	255	*/
	256	new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
	257	}
	258	tp->rcv_wnd = new_win;
	259	tp->rcv_wup = tp->rcv_nxt;
	260
	261	/* Make sure we do not exceed the maximum possible
	262	* scaled window.
	263	*/
	264	if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
	265	new_win = min(new_win, MAX_TCP_WINDOW);
	266	else
	267	new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
	268
	269	/* RFC1323 scaling applied */
	270	new_win >>= tp->rx_opt.rcv_wscale;
	271
	272	/* If we advertise zero window, disable fast path. */
	273	if (new_win == 0)
	274	tp->pred_flags = 0;
	275
	276	return new_win;
	277	}
	278
	279	static inline void TCP_ECN_send_synack(struct tcp_sock tp, struct sk_buff skb)
	280	{
	281	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
	282	if (!(tp->ecn_flags & TCP_ECN_OK))
	283	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
	284	}
	285
	286	static inline void TCP_ECN_send_syn(struct sock sk, struct sk_buff skb)
	287	{
	288	struct tcp_sock *tp = tcp_sk(sk);
	289
	290	tp->ecn_flags = 0;
	291	if (sysctl_tcp_ecn) {
	292	TCP_SKB_CB(skb)->flags \|= TCPCB_FLAG_ECE \| TCPCB_FLAG_CWR;
	293	tp->ecn_flags = TCP_ECN_OK;
	294	}
	295	}
	296
	297	static __inline__ void
	298	TCP_ECN_make_synack(struct request_sock req, struct tcphdr th)
	299	{
	300	if (inet_rsk(req)->ecn_ok)
	301	th->ece = 1;
	302	}
	303
	304	static inline void TCP_ECN_send(struct sock sk, struct sk_buff skb,
	305	int tcp_header_len)
	306	{
	307	struct tcp_sock *tp = tcp_sk(sk);
	308
	309	if (tp->ecn_flags & TCP_ECN_OK) {
	310	/* Not-retransmitted data segment: set ECT and inject CWR. */
	311	if (skb->len != tcp_header_len &&
	312	!before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
	313	INET_ECN_xmit(sk);
	314	if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
	315	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
	316	tcp_hdr(skb)->cwr = 1;
	317	skb_shinfo(skb)->gso_type \|= SKB_GSO_TCP_ECN;
	318	}
	319	} else {
	320	/* ACK or retransmitted segment: clear ECT\|CE */
	321	INET_ECN_dontxmit(sk);
	322	}
	323	if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
	324	tcp_hdr(skb)->ece = 1;
	325	}
	326	}
	327
	328	/* Constructs common control bits of non-data skb. If SYN/FIN is present,
	329	* auto increment end seqno.
	330	*/
	331	static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
	332	{
	333	skb->csum = 0;
	334
	335	TCP_SKB_CB(skb)->flags = flags;
	336	TCP_SKB_CB(skb)->sacked = 0;
	337
	338	skb_shinfo(skb)->gso_segs = 1;
	339	skb_shinfo(skb)->gso_size = 0;
	340	skb_shinfo(skb)->gso_type = 0;
	341
	342	TCP_SKB_CB(skb)->seq = seq;
	343	if (flags & (TCPCB_FLAG_SYN \| TCPCB_FLAG_FIN))
	344	seq++;
	345	TCP_SKB_CB(skb)->end_seq = seq;
	346	}
	347
	348	#define OPTION_SACK_ADVERTISE (1 << 0)
	349	#define OPTION_TS (1 << 1)
	350	#define OPTION_MD5 (1 << 2)
	351
	352	struct tcp_out_options {
	353	u8 options; /* bit field of OPTION_* */
	354	u8 ws; /* window scale, 0 to disable */
	355	u8 num_sack_blocks; /* number of SACK blocks to include */
	356	u16 mss; /* 0 to disable */
	357	__u32 tsval, tsecr; /* need to include OPTION_TS */
	358	};
	359
	360	static void tcp_options_write(__be32 ptr, struct tcp_sock tp,
	361	const struct tcp_out_options *opts,
	362	__u8 **md5_hash) {
	363	if (unlikely(OPTION_MD5 & opts->options)) {
	364	*ptr++ = htonl((TCPOPT_NOP << 24) \|
	365	(TCPOPT_NOP << 16) \|
	366	(TCPOPT_MD5SIG << 8) \|
	367	TCPOLEN_MD5SIG);
	368	md5_hash = (__u8 )ptr;
	369	ptr += 4;
	370	} else {
	371	*md5_hash = NULL;
	372	}
	373
	374	if (likely(OPTION_TS & opts->options)) {
	375	if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) {
	376	*ptr++ = htonl((TCPOPT_SACK_PERM << 24) \|
	377	(TCPOLEN_SACK_PERM << 16) \|
	378	(TCPOPT_TIMESTAMP << 8) \|
	379	TCPOLEN_TIMESTAMP);
	380	} else {
	381	*ptr++ = htonl((TCPOPT_NOP << 24) \|
	382	(TCPOPT_NOP << 16) \|
	383	(TCPOPT_TIMESTAMP << 8) \|
	384	TCPOLEN_TIMESTAMP);
	385	}
	386	*ptr++ = htonl(opts->tsval);
	387	*ptr++ = htonl(opts->tsecr);
	388	}
	389
	390	if (unlikely(opts->mss)) {
	391	*ptr++ = htonl((TCPOPT_MSS << 24) \|
	392	(TCPOLEN_MSS << 16) \|
	393	opts->mss);
	394	}
	395
	396	if (unlikely(OPTION_SACK_ADVERTISE & opts->options &&
	397	!(OPTION_TS & opts->options))) {
	398	*ptr++ = htonl((TCPOPT_NOP << 24) \|
	399	(TCPOPT_NOP << 16) \|
	400	(TCPOPT_SACK_PERM << 8) \|
	401	TCPOLEN_SACK_PERM);
	402	}
	403
	404	if (unlikely(opts->ws)) {
	405	*ptr++ = htonl((TCPOPT_NOP << 24) \|
	406	(TCPOPT_WINDOW << 16) \|
	407	(TCPOLEN_WINDOW << 8) \|
	408	opts->ws);
	409	}
	410
	411	if (unlikely(opts->num_sack_blocks)) {
	412	struct tcp_sack_block *sp = tp->rx_opt.dsack ?
	413	tp->duplicate_sack : tp->selective_acks;
	414	int this_sack;
	415
	416	*ptr++ = htonl((TCPOPT_NOP << 24) \|
	417	(TCPOPT_NOP << 16) \|
	418	(TCPOPT_SACK << 8) \|
	419	(TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
	420	TCPOLEN_SACK_PERBLOCK)));
	421
	422	for (this_sack = 0; this_sack < opts->num_sack_blocks;
	423	++this_sack) {
	424	*ptr++ = htonl(sp[this_sack].start_seq);
	425	*ptr++ = htonl(sp[this_sack].end_seq);
	426	}
	427
	428	if (tp->rx_opt.dsack) {
	429	tp->rx_opt.dsack = 0;
	430	tp->rx_opt.eff_sacks--;
	431	}
	432	}
	433	}
	434
	435	static unsigned tcp_syn_options(struct sock sk, struct sk_buff skb,
	436	struct tcp_out_options *opts,
	437	struct tcp_md5sig_key **md5) {
	438	struct tcp_sock *tp = tcp_sk(sk);
	439	unsigned size = 0;
	440
	441	#ifdef CONFIG_TCP_MD5SIG
	442	*md5 = tp->af_specific->md5_lookup(sk, sk);
	443	if (*md5) {
	444	opts->options \|= OPTION_MD5;
	445	size += TCPOLEN_MD5SIG_ALIGNED;
	446	}
	447	#else
	448	*md5 = NULL;
	449	#endif
	450
	451	/* We always get an MSS option. The option bytes which will be seen in
	452	* normal data packets should timestamps be used, must be in the MSS
	453	* advertised. But we subtract them from tp->mss_cache so that
	454	* calculations in tcp_sendmsg are simpler etc. So account for this
	455	* fact here if necessary. If we don't do this correctly, as a
	456	* receiver we won't recognize data packets as being full sized when we
	457	* should, and thus we won't abide by the delayed ACK rules correctly.
	458	* SACKs don't matter, we never delay an ACK when we have any of those
	459	* going out. */
	460	opts->mss = tcp_advertise_mss(sk);
	461	size += TCPOLEN_MSS_ALIGNED;
	462
	463	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
	464	opts->options \|= OPTION_TS;
	465	opts->tsval = TCP_SKB_CB(skb)->when;
	466	opts->tsecr = tp->rx_opt.ts_recent;
	467	size += TCPOLEN_TSTAMP_ALIGNED;
	468	}
	469	if (likely(sysctl_tcp_window_scaling)) {
	470	opts->ws = tp->rx_opt.rcv_wscale;
	471	if(likely(opts->ws))
	472	size += TCPOLEN_WSCALE_ALIGNED;
	473	}
	474	if (likely(sysctl_tcp_sack)) {
	475	opts->options \|= OPTION_SACK_ADVERTISE;
	476	if (unlikely(!(OPTION_TS & opts->options)))
	477	size += TCPOLEN_SACKPERM_ALIGNED;
	478	}
	479
	480	return size;
	481	}
	482
	483	static unsigned tcp_synack_options(struct sock *sk,
	484	struct request_sock *req,
	485	unsigned mss, struct sk_buff *skb,
	486	struct tcp_out_options *opts,
	487	struct tcp_md5sig_key **md5) {
	488	unsigned size = 0;
	489	struct inet_request_sock *ireq = inet_rsk(req);
	490	char doing_ts;
	491
	492	#ifdef CONFIG_TCP_MD5SIG
	493	*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
	494	if (*md5) {
	495	opts->options \|= OPTION_MD5;
	496	size += TCPOLEN_MD5SIG_ALIGNED;
	497	}
	498	#else
	499	*md5 = NULL;
	500	#endif
	501
	502	/* we can't fit any SACK blocks in a packet with MD5 + TS
	503	options. There was discussion about disabling SACK rather than TS in
	504	order to fit in better with old, buggy kernels, but that was deemed
	505	to be unnecessary. */
	506	doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok);
	507
	508	opts->mss = mss;
	509	size += TCPOLEN_MSS_ALIGNED;
	510
	511	if (likely(ireq->wscale_ok)) {
	512	opts->ws = ireq->rcv_wscale;
	513	if(likely(opts->ws))
	514	size += TCPOLEN_WSCALE_ALIGNED;
	515	}
	516	if (likely(doing_ts)) {
	517	opts->options \|= OPTION_TS;
	518	opts->tsval = TCP_SKB_CB(skb)->when;
	519	opts->tsecr = req->ts_recent;
	520	size += TCPOLEN_TSTAMP_ALIGNED;
	521	}
	522	if (likely(ireq->sack_ok)) {
	523	opts->options \|= OPTION_SACK_ADVERTISE;
	524	if (unlikely(!doing_ts))
	525	size += TCPOLEN_SACKPERM_ALIGNED;
	526	}
	527
	528	return size;
	529	}
	530
	531	static unsigned tcp_established_options(struct sock sk, struct sk_buff skb,
	532	struct tcp_out_options *opts,
	533	struct tcp_md5sig_key **md5) {
	534	struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
	535	struct tcp_sock *tp = tcp_sk(sk);
	536	unsigned size = 0;
	537
	538	#ifdef CONFIG_TCP_MD5SIG
	539	*md5 = tp->af_specific->md5_lookup(sk, sk);
	540	if (unlikely(*md5)) {
	541	opts->options \|= OPTION_MD5;
	542	size += TCPOLEN_MD5SIG_ALIGNED;
	543	}
	544	#else
	545	*md5 = NULL;
	546	#endif
	547
	548	if (likely(tp->rx_opt.tstamp_ok)) {
	549	opts->options \|= OPTION_TS;
	550	opts->tsval = tcb ? tcb->when : 0;
	551	opts->tsecr = tp->rx_opt.ts_recent;
	552	size += TCPOLEN_TSTAMP_ALIGNED;
	553	}
	554
	555	if (unlikely(tp->rx_opt.eff_sacks)) {
	556	const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
	557	opts->num_sack_blocks =
	558	min_t(unsigned, tp->rx_opt.eff_sacks,
	559	(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
	560	TCPOLEN_SACK_PERBLOCK);
	561	size += TCPOLEN_SACK_BASE_ALIGNED +
	562	opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
	563	}
	564
	565	return size;
	566	}
	567
	568	/* This routine actually transmits TCP packets queued in by
	569	* tcp_do_sendmsg(). This is used by both the initial
	570	* transmission and possible later retransmissions.
	571	* All SKB's seen here are completely headerless. It is our
	572	* job to build the TCP header, and pass the packet down to
	573	* IP so it can do the same plus pass the packet off to the
	574	* device.
	575	*
	576	* We are working here with either a clone of the original
	577	* SKB, or a fresh unique copy made by the retransmit engine.
	578	*/
	579	static int tcp_transmit_skb(struct sock sk, struct sk_buff skb, int clone_it,
	580	gfp_t gfp_mask)
	581	{
	582	const struct inet_connection_sock *icsk = inet_csk(sk);
	583	struct inet_sock *inet;
	584	struct tcp_sock *tp;
	585	struct tcp_skb_cb *tcb;
	586	struct tcp_out_options opts;
	587	unsigned tcp_options_size, tcp_header_size;
	588	struct tcp_md5sig_key *md5;
	589	__u8 *md5_hash_location;
	590	struct tcphdr *th;
	591	int err;
	592
	593	BUG_ON(!skb \|\| !tcp_skb_pcount(skb));
	594
	595	/* If congestion control is doing timestamping, we must
	596	* take such a timestamp before we potentially clone/copy.
	597	*/
	598	if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
	599	__net_timestamp(skb);
	600
	601	if (likely(clone_it)) {
	602	if (unlikely(skb_cloned(skb)))
	603	skb = pskb_copy(skb, gfp_mask);
	604	else
	605	skb = skb_clone(skb, gfp_mask);
	606	if (unlikely(!skb))
	607	return -ENOBUFS;
	608	}
	609
	610	inet = inet_sk(sk);
	611	tp = tcp_sk(sk);
	612	tcb = TCP_SKB_CB(skb);
	613	memset(&opts, 0, sizeof(opts));
	614
	615	if (unlikely(tcb->flags & TCPCB_FLAG_SYN))
	616	tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
	617	else
	618	tcp_options_size = tcp_established_options(sk, skb, &opts,
	619	&md5);
	620	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
	621
	622	if (tcp_packets_in_flight(tp) == 0)
	623	tcp_ca_event(sk, CA_EVENT_TX_START);
	624
	625	skb_push(skb, tcp_header_size);
	626	skb_reset_transport_header(skb);
	627	skb_set_owner_w(skb, sk);
	628
	629	/* Build TCP header and checksum it. */
	630	th = tcp_hdr(skb);
	631	th->source = inet->sport;
	632	th->dest = inet->dport;
	633	th->seq = htonl(tcb->seq);
	634	th->ack_seq = htonl(tp->rcv_nxt);
	635	(((__be16 )th) + 6) = htons(((tcp_header_size >> 2) << 12) \|
	636	tcb->flags);
	637
	638	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
	639	/* RFC1323: The window in SYN & SYN/ACK segments
	640	* is never scaled.
	641	*/
	642	th->window = htons(min(tp->rcv_wnd, 65535U));
	643	} else {
	644	th->window = htons(tcp_select_window(sk));
	645	}
	646	th->check = 0;
	647	th->urg_ptr = 0;
	648
	649	if (unlikely(tp->urg_mode &&
	650	between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
	651	th->urg_ptr = htons(tp->snd_up - tcb->seq);
	652	th->urg = 1;
	653	}
	654
	655	tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
	656	if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
	657	TCP_ECN_send(sk, skb, tcp_header_size);
	658
	659	#ifdef CONFIG_TCP_MD5SIG
	660	/* Calculate the MD5 hash, as we have all we need now */
	661	if (md5) {
	662	sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
	663	tp->af_specific->calc_md5_hash(md5_hash_location,
	664	md5, sk, NULL, skb);
	665	}
	666	#endif
	667
	668	icsk->icsk_af_ops->send_check(sk, skb->len, skb);
	669
	670	if (likely(tcb->flags & TCPCB_FLAG_ACK))
	671	tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
	672
	673	if (skb->len != tcp_header_size)
	674	tcp_event_data_sent(tp, skb, sk);
	675
	676	if (after(tcb->end_seq, tp->snd_nxt) \|\| tcb->seq == tcb->end_seq)
	677	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
	678
	679	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
	680	if (likely(err <= 0))
	681	return err;
	682
	683	tcp_enter_cwr(sk, 1);
	684
	685	return net_xmit_eval(err);
	686	}
	687
	688	/* This routine just queue's the buffer
	689	*
	690	* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
	691	* otherwise socket can stall.
	692	*/
	693	static void tcp_queue_skb(struct sock sk, struct sk_buff skb)
	694	{
	695	struct tcp_sock *tp = tcp_sk(sk);
	696
	697	/* Advance write_seq and place onto the write_queue. */
	698	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	699	skb_header_release(skb);
	700	tcp_add_write_queue_tail(sk, skb);
	701	sk->sk_wmem_queued += skb->truesize;
	702	sk_mem_charge(sk, skb->truesize);
	703	}
	704
	705	static void tcp_set_skb_tso_segs(struct sock sk, struct sk_buff skb,
	706	unsigned int mss_now)
	707	{
	708	if (skb->len <= mss_now \|\| !sk_can_gso(sk)) {
	709	/* Avoid the costly divide in the normal
	710	* non-TSO case.
	711	*/
	712	skb_shinfo(skb)->gso_segs = 1;
	713	skb_shinfo(skb)->gso_size = 0;
	714	skb_shinfo(skb)->gso_type = 0;
	715	} else {
	716	skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
	717	skb_shinfo(skb)->gso_size = mss_now;
	718	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
	719	}
	720	}
	721
	722	/* When a modification to fackets out becomes necessary, we need to check
	723	* skb is counted to fackets_out or not.
	724	*/
	725	static void tcp_adjust_fackets_out(struct sock sk, struct sk_buff skb,
	726	int decr)
	727	{
	728	struct tcp_sock *tp = tcp_sk(sk);
	729
	730	if (!tp->sacked_out \|\| tcp_is_reno(tp))
	731	return;
	732
	733	if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
	734	tp->fackets_out -= decr;
	735	}
	736
	737	/* Function to create two new TCP segments. Shrinks the given segment
	738	* to the specified size and appends a new segment with the rest of the
	739	* packet to the list. This won't be called frequently, I hope.
	740	* Remember, these are still headerless SKBs at this point.
	741	*/
	742	int tcp_fragment(struct sock sk, struct sk_buff skb, u32 len,
	743	unsigned int mss_now)
	744	{
	745	struct tcp_sock *tp = tcp_sk(sk);
	746	struct sk_buff *buff;
	747	int nsize, old_factor;
	748	int nlen;
	749	u16 flags;
	750
	751	BUG_ON(len > skb->len);
	752
	753	tcp_clear_retrans_hints_partial(tp);
	754	nsize = skb_headlen(skb) - len;
	755	if (nsize < 0)
	756	nsize = 0;
	757
	758	if (skb_cloned(skb) &&
	759	skb_is_nonlinear(skb) &&
	760	pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
	761	return -ENOMEM;
	762
	763	/* Get a new skb... force flag on. */
	764	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
	765	if (buff == NULL)
	766	return -ENOMEM; /* We'll just try again later. */
	767
	768	sk->sk_wmem_queued += buff->truesize;
	769	sk_mem_charge(sk, buff->truesize);
	770	nlen = skb->len - len - nsize;
	771	buff->truesize += nlen;
	772	skb->truesize -= nlen;
	773
	774	/* Correct the sequence numbers. */
	775	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	776	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	777	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
	778
	779	/* PSH and FIN should only be set in the second packet. */
	780	flags = TCP_SKB_CB(skb)->flags;
	781	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN \| TCPCB_FLAG_PSH);
	782	TCP_SKB_CB(buff)->flags = flags;
	783	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
	784
	785	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
	786	/* Copy and checksum data tail into the new buffer. */
	787	buff->csum = csum_partial_copy_nocheck(skb->data + len,
	788	skb_put(buff, nsize),
	789	nsize, 0);
	790
	791	skb_trim(skb, len);
	792
	793	skb->csum = csum_block_sub(skb->csum, buff->csum, len);
	794	} else {
	795	skb->ip_summed = CHECKSUM_PARTIAL;
	796	skb_split(skb, buff, len);
	797	}
	798
	799	buff->ip_summed = skb->ip_summed;
	800
	801	/* Looks stupid, but our code really uses when of
	802	* skbs, which it never sent before. --ANK
	803	*/
	804	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
	805	buff->tstamp = skb->tstamp;
	806
	807	old_factor = tcp_skb_pcount(skb);
	808
	809	/* Fix up tso_factor for both original and new SKB. */
	810	tcp_set_skb_tso_segs(sk, skb, mss_now);
	811	tcp_set_skb_tso_segs(sk, buff, mss_now);
	812
	813	/* If this packet has been sent out already, we must
	814	* adjust the various packet counters.
	815	*/
	816	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
	817	int diff = old_factor - tcp_skb_pcount(skb) -
	818	tcp_skb_pcount(buff);
	819
	820	tp->packets_out -= diff;
	821
	822	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
	823	tp->sacked_out -= diff;
	824	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
	825	tp->retrans_out -= diff;
	826
	827	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
	828	tp->lost_out -= diff;
	829
	830	/* Adjust Reno SACK estimate. */
	831	if (tcp_is_reno(tp) && diff > 0) {
	832	tcp_dec_pcount_approx_int(&tp->sacked_out, diff);
	833	tcp_verify_left_out(tp);
	834	}
	835	tcp_adjust_fackets_out(sk, skb, diff);
	836	}
	837
	838	/* Link BUFF into the send queue. */
	839	skb_header_release(buff);
	840	tcp_insert_write_queue_after(skb, buff, sk);
	841
	842	return 0;
	843	}
	844
	845	/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
	846	* eventually). The difference is that pulled data not copied, but
	847	* immediately discarded.
	848	*/
	849	static void __pskb_trim_head(struct sk_buff *skb, int len)
	850	{
	851	int i, k, eat;
	852
	853	eat = len;
	854	k = 0;
	855	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
	856	if (skb_shinfo(skb)->frags[i].size <= eat) {
	857	put_page(skb_shinfo(skb)->frags[i].page);
	858	eat -= skb_shinfo(skb)->frags[i].size;
	859	} else {
	860	skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
	861	if (eat) {
	862	skb_shinfo(skb)->frags[k].page_offset += eat;
	863	skb_shinfo(skb)->frags[k].size -= eat;
	864	eat = 0;
	865	}
	866	k++;
	867	}
	868	}
	869	skb_shinfo(skb)->nr_frags = k;
	870
	871	skb_reset_tail_pointer(skb);
	872	skb->data_len -= len;
	873	skb->len = skb->data_len;
	874	}
	875
	876	int tcp_trim_head(struct sock sk, struct sk_buff skb, u32 len)
	877	{
	878	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
	879	return -ENOMEM;
	880
	881	/* If len == headlen, we avoid __skb_pull to preserve alignment. */
	882	if (unlikely(len < skb_headlen(skb)))
	883	__skb_pull(skb, len);
	884	else
	885	__pskb_trim_head(skb, len - skb_headlen(skb));
	886
	887	TCP_SKB_CB(skb)->seq += len;
	888	skb->ip_summed = CHECKSUM_PARTIAL;
	889
	890	skb->truesize -= len;
	891	sk->sk_wmem_queued -= len;
	892	sk_mem_uncharge(sk, len);
	893	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
	894
	895	/* Any change of skb->len requires recalculation of tso
	896	* factor and mss.
	897	*/
	898	if (tcp_skb_pcount(skb) > 1)
	899	tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
	900
	901	return 0;
	902	}
	903
	904	/* Not accounting for SACKs here. */
	905	int tcp_mtu_to_mss(struct sock *sk, int pmtu)
	906	{
	907	struct tcp_sock *tp = tcp_sk(sk);
	908	struct inet_connection_sock *icsk = inet_csk(sk);
	909	int mss_now;
	910
	911	/* Calculate base mss without TCP options:
	912	It is MMS_S - sizeof(tcphdr) of rfc1122
	913	*/
	914	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
	915
	916	/* Clamp it (mss_clamp does not include tcp options) */
	917	if (mss_now > tp->rx_opt.mss_clamp)
	918	mss_now = tp->rx_opt.mss_clamp;
	919
	920	/* Now subtract optional transport overhead */
	921	mss_now -= icsk->icsk_ext_hdr_len;
	922
	923	/* Then reserve room for full set of TCP options and 8 bytes of data */
	924	if (mss_now < 48)
	925	mss_now = 48;
	926
	927	/* Now subtract TCP options size, not including SACKs */
	928	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
	929
	930	return mss_now;
	931	}
	932
	933	/* Inverse of above */
	934	int tcp_mss_to_mtu(struct sock *sk, int mss)
	935	{
	936	struct tcp_sock *tp = tcp_sk(sk);
	937	struct inet_connection_sock *icsk = inet_csk(sk);
	938	int mtu;
	939
	940	mtu = mss +
	941	tp->tcp_header_len +
	942	icsk->icsk_ext_hdr_len +
	943	icsk->icsk_af_ops->net_header_len;
	944
	945	return mtu;
	946	}
	947
	948	void tcp_mtup_init(struct sock *sk)
	949	{
	950	struct tcp_sock *tp = tcp_sk(sk);
	951	struct inet_connection_sock *icsk = inet_csk(sk);
	952
	953	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
	954	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
	955	icsk->icsk_af_ops->net_header_len;
	956	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
	957	icsk->icsk_mtup.probe_size = 0;
	958	}
	959
	960	/* Bound MSS / TSO packet size with the half of the window */
	961	static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
	962	{
	963	if (tp->max_window && pktsize > (tp->max_window >> 1))
	964	return max(tp->max_window >> 1, 68U - tp->tcp_header_len);
	965	else
	966	return pktsize;
	967	}
	968
	969	/* This function synchronize snd mss to current pmtu/exthdr set.
	970
	971	tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
	972	for TCP options, but includes only bare TCP header.
	973
	974	tp->rx_opt.mss_clamp is mss negotiated at connection setup.
	975	It is minimum of user_mss and mss received with SYN.
	976	It also does not include TCP options.
	977
	978	inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
	979
	980	tp->mss_cache is current effective sending mss, including
	981	all tcp options except for SACKs. It is evaluated,
	982	taking into account current pmtu, but never exceeds
	983	tp->rx_opt.mss_clamp.
	984
	985	NOTE1. rfc1122 clearly states that advertised MSS
	986	DOES NOT include either tcp or ip options.
	987
	988	NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
	989	are READ ONLY outside this function. --ANK (980731)
	990	*/
	991	unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
	992	{
	993	struct tcp_sock *tp = tcp_sk(sk);
	994	struct inet_connection_sock *icsk = inet_csk(sk);
	995	int mss_now;
	996
	997	if (icsk->icsk_mtup.search_high > pmtu)
	998	icsk->icsk_mtup.search_high = pmtu;
	999
	1000	mss_now = tcp_mtu_to_mss(sk, pmtu);
	1001	mss_now = tcp_bound_to_half_wnd(tp, mss_now);
	1002
	1003	/* And store cached results */
	1004	icsk->icsk_pmtu_cookie = pmtu;
	1005	if (icsk->icsk_mtup.enabled)
	1006	mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
	1007	tp->mss_cache = mss_now;
	1008
	1009	return mss_now;
	1010	}
	1011
	1012	/* Compute the current effective MSS, taking SACKs and IP options,
	1013	* and even PMTU discovery events into account.
	1014	*
	1015	* LARGESEND note: !urg_mode is overkill, only frames up to snd_up
	1016	* cannot be large. However, taking into account rare use of URG, this
	1017	* is not a big flaw.
	1018	*/
	1019	unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
	1020	{
	1021	struct tcp_sock *tp = tcp_sk(sk);
	1022	struct dst_entry *dst = __sk_dst_get(sk);
	1023	u32 mss_now;
	1024	u16 xmit_size_goal;
	1025	int doing_tso = 0;
	1026	unsigned header_len;
	1027	struct tcp_out_options opts;
	1028	struct tcp_md5sig_key *md5;
	1029
	1030	mss_now = tp->mss_cache;
	1031
	1032	if (large_allowed && sk_can_gso(sk) && !tp->urg_mode)
	1033	doing_tso = 1;
	1034
	1035	if (dst) {
	1036	u32 mtu = dst_mtu(dst);
	1037	if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
	1038	mss_now = tcp_sync_mss(sk, mtu);
	1039	}
	1040
	1041	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
	1042	sizeof(struct tcphdr);
	1043	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
	1044	* some common options. If this is an odd packet (because we have SACK
	1045	* blocks etc) then our calculated header_len will be different, and
	1046	* we have to adjust mss_now correspondingly */
	1047	if (header_len != tp->tcp_header_len) {
	1048	int delta = (int) header_len - tp->tcp_header_len;
	1049	mss_now -= delta;
	1050	}
	1051
	1052	xmit_size_goal = mss_now;
	1053
	1054	if (doing_tso) {
	1055	xmit_size_goal = ((sk->sk_gso_max_size - 1) -
	1056	inet_csk(sk)->icsk_af_ops->net_header_len -
	1057	inet_csk(sk)->icsk_ext_hdr_len -
	1058	tp->tcp_header_len);
	1059
	1060	xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
	1061	xmit_size_goal -= (xmit_size_goal % mss_now);
	1062	}
	1063	tp->xmit_size_goal = xmit_size_goal;
	1064
	1065	return mss_now;
	1066	}
	1067
	1068	/* Congestion window validation. (RFC2861) */
	1069	static void tcp_cwnd_validate(struct sock *sk)
	1070	{
	1071	struct tcp_sock *tp = tcp_sk(sk);
	1072
	1073	if (tp->packets_out >= tp->snd_cwnd) {
	1074	/* Network is feed fully. */
	1075	tp->snd_cwnd_used = 0;
	1076	tp->snd_cwnd_stamp = tcp_time_stamp;
	1077	} else {
	1078	/* Network starves. */
	1079	if (tp->packets_out > tp->snd_cwnd_used)
	1080	tp->snd_cwnd_used = tp->packets_out;
	1081
	1082	if (sysctl_tcp_slow_start_after_idle &&
	1083	(s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
	1084	tcp_cwnd_application_limited(sk);
	1085	}
	1086	}
	1087
	1088	/* Returns the portion of skb which can be sent right away without
	1089	* introducing MSS oddities to segment boundaries. In rare cases where
	1090	* mss_now != mss_cache, we will request caller to create a small skb
	1091	* per input skb which could be mostly avoided here (if desired).
	1092	*
	1093	* We explicitly want to create a request for splitting write queue tail
	1094	* to a small skb for Nagle purposes while avoiding unnecessary modulos,
	1095	* thus all the complexity (cwnd_len is always MSS multiple which we
	1096	* return whenever allowed by the other factors). Basically we need the
	1097	* modulo only when the receiver window alone is the limiting factor or
	1098	* when we would be allowed to send the split-due-to-Nagle skb fully.
	1099	*/
	1100	static unsigned int tcp_mss_split_point(struct sock sk, struct sk_buff skb,
	1101	unsigned int mss_now, unsigned int cwnd)
	1102	{
	1103	struct tcp_sock *tp = tcp_sk(sk);
	1104	u32 needed, window, cwnd_len;
	1105
	1106	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
	1107	cwnd_len = mss_now * cwnd;
	1108
	1109	if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
	1110	return cwnd_len;
	1111
	1112	needed = min(skb->len, window);
	1113
	1114	if (cwnd_len <= needed)
	1115	return cwnd_len;
	1116
	1117	return needed - needed % mss_now;
	1118	}
	1119
	1120	/* Can at least one segment of SKB be sent right now, according to the
	1121	* congestion window rules? If so, return how many segments are allowed.
	1122	*/
	1123	static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
	1124	struct sk_buff *skb)
	1125	{
	1126	u32 in_flight, cwnd;
	1127
	1128	/* Don't be strict about the congestion window for the final FIN. */
	1129	if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	1130	tcp_skb_pcount(skb) == 1)
	1131	return 1;
	1132
	1133	in_flight = tcp_packets_in_flight(tp);
	1134	cwnd = tp->snd_cwnd;
	1135	if (in_flight < cwnd)
	1136	return (cwnd - in_flight);
	1137
	1138	return 0;
	1139	}
	1140
	1141	/* This must be invoked the first time we consider transmitting
	1142	* SKB onto the wire.
	1143	*/
	1144	static int tcp_init_tso_segs(struct sock sk, struct sk_buff skb,
	1145	unsigned int mss_now)
	1146	{
	1147	int tso_segs = tcp_skb_pcount(skb);
	1148
	1149	if (!tso_segs \|\| (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
	1150	tcp_set_skb_tso_segs(sk, skb, mss_now);
	1151	tso_segs = tcp_skb_pcount(skb);
	1152	}
	1153	return tso_segs;
	1154	}
	1155
	1156	static inline int tcp_minshall_check(const struct tcp_sock *tp)
	1157	{
	1158	return after(tp->snd_sml,tp->snd_una) &&
	1159	!after(tp->snd_sml, tp->snd_nxt);
	1160	}
	1161
	1162	/* Return 0, if packet can be sent now without violation Nagle's rules:
	1163	* 1. It is full sized.
	1164	* 2. Or it contains FIN. (already checked by caller)
	1165	* 3. Or TCP_NODELAY was set.
	1166	* 4. Or TCP_CORK is not set, and all sent packets are ACKed.
	1167	* With Minshall's modification: all sent small packets are ACKed.
	1168	*/
	1169	static inline int tcp_nagle_check(const struct tcp_sock *tp,
	1170	const struct sk_buff *skb,
	1171	unsigned mss_now, int nonagle)
	1172	{
	1173	return (skb->len < mss_now &&
	1174	((nonagle & TCP_NAGLE_CORK) \|\|
	1175	(!nonagle && tp->packets_out && tcp_minshall_check(tp))));
	1176	}
	1177
	1178	/* Return non-zero if the Nagle test allows this packet to be
	1179	* sent now.
	1180	*/
	1181	static inline int tcp_nagle_test(struct tcp_sock tp, struct sk_buff skb,
	1182	unsigned int cur_mss, int nonagle)
	1183	{
	1184	/* Nagle rule does not apply to frames, which sit in the middle of the
	1185	* write_queue (they have no chances to get new data).
	1186	*
	1187	* This is implemented in the callers, where they modify the 'nonagle'
	1188	* argument based upon the location of SKB in the send queue.
	1189	*/
	1190	if (nonagle & TCP_NAGLE_PUSH)
	1191	return 1;
	1192
	1193	/* Don't use the nagle rule for urgent data (or for the final FIN).
	1194	* Nagle can be ignored during F-RTO too (see RFC4138).
	1195	*/
	1196	if (tp->urg_mode \|\| (tp->frto_counter == 2) \|\|
	1197	(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
	1198	return 1;
	1199
	1200	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
	1201	return 1;
	1202
	1203	return 0;
	1204	}
	1205
	1206	/* Does at least the first segment of SKB fit into the send window? */
	1207	static inline int tcp_snd_wnd_test(struct tcp_sock tp, struct sk_buff skb,
	1208	unsigned int cur_mss)
	1209	{
	1210	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
	1211
	1212	if (skb->len > cur_mss)
	1213	end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
	1214
	1215	return !after(end_seq, tcp_wnd_end(tp));
	1216	}
	1217
	1218	/* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
	1219	* should be put on the wire right now. If so, it returns the number of
	1220	* packets allowed by the congestion window.
	1221	*/
	1222	static unsigned int tcp_snd_test(struct sock sk, struct sk_buff skb,
	1223	unsigned int cur_mss, int nonagle)
	1224	{
	1225	struct tcp_sock *tp = tcp_sk(sk);
	1226	unsigned int cwnd_quota;
	1227
	1228	tcp_init_tso_segs(sk, skb, cur_mss);
	1229
	1230	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
	1231	return 0;
	1232
	1233	cwnd_quota = tcp_cwnd_test(tp, skb);
	1234	if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
	1235	cwnd_quota = 0;
	1236
	1237	return cwnd_quota;
	1238	}
	1239
	1240	int tcp_may_send_now(struct sock *sk)
	1241	{
	1242	struct tcp_sock *tp = tcp_sk(sk);
	1243	struct sk_buff *skb = tcp_send_head(sk);
	1244
	1245	return (skb &&
	1246	tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
	1247	(tcp_skb_is_last(sk, skb) ?
	1248	tp->nonagle : TCP_NAGLE_PUSH)));
	1249	}
	1250
	1251	/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
	1252	* which is put after SKB on the list. It is very much like
	1253	* tcp_fragment() except that it may make several kinds of assumptions
	1254	* in order to speed up the splitting operation. In particular, we
	1255	* know that all the data is in scatter-gather pages, and that the
	1256	* packet has never been sent out before (and thus is not cloned).
	1257	*/
	1258	static int tso_fragment(struct sock sk, struct sk_buff skb, unsigned int len,
	1259	unsigned int mss_now)
	1260	{
	1261	struct sk_buff *buff;
	1262	int nlen = skb->len - len;
	1263	u16 flags;
	1264
	1265	/* All of a TSO frame must be composed of paged data. */
	1266	if (skb->len != skb->data_len)
	1267	return tcp_fragment(sk, skb, len, mss_now);
	1268
	1269	buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);
	1270	if (unlikely(buff == NULL))
	1271	return -ENOMEM;
	1272
	1273	sk->sk_wmem_queued += buff->truesize;
	1274	sk_mem_charge(sk, buff->truesize);
	1275	buff->truesize += nlen;
	1276	skb->truesize -= nlen;
	1277
	1278	/* Correct the sequence numbers. */
	1279	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	1280	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	1281	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
	1282
	1283	/* PSH and FIN should only be set in the second packet. */
	1284	flags = TCP_SKB_CB(skb)->flags;
	1285	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN \| TCPCB_FLAG_PSH);
	1286	TCP_SKB_CB(buff)->flags = flags;
	1287
	1288	/* This packet was never sent out yet, so no SACK bits. */
	1289	TCP_SKB_CB(buff)->sacked = 0;
	1290
	1291	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
	1292	skb_split(skb, buff, len);
	1293
	1294	/* Fix up tso_factor for both original and new SKB. */
	1295	tcp_set_skb_tso_segs(sk, skb, mss_now);
	1296	tcp_set_skb_tso_segs(sk, buff, mss_now);
	1297
	1298	/* Link BUFF into the send queue. */
	1299	skb_header_release(buff);
	1300	tcp_insert_write_queue_after(skb, buff, sk);
	1301
	1302	return 0;
	1303	}
	1304
	1305	/* Try to defer sending, if possible, in order to minimize the amount
	1306	* of TSO splitting we do. View it as a kind of TSO Nagle test.
	1307	*
	1308	* This algorithm is from John Heffner.
	1309	*/
	1310	static int tcp_tso_should_defer(struct sock sk, struct sk_buff skb)
	1311	{
	1312	struct tcp_sock *tp = tcp_sk(sk);
	1313	const struct inet_connection_sock *icsk = inet_csk(sk);
	1314	u32 send_win, cong_win, limit, in_flight;
	1315
	1316	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
	1317	goto send_now;
	1318
	1319	if (icsk->icsk_ca_state != TCP_CA_Open)
	1320	goto send_now;
	1321
	1322	/* Defer for less than two clock ticks. */
	1323	if (tp->tso_deferred &&
	1324	((jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
	1325	goto send_now;
	1326
	1327	in_flight = tcp_packets_in_flight(tp);
	1328
	1329	BUG_ON(tcp_skb_pcount(skb) <= 1 \|\| (tp->snd_cwnd <= in_flight));
	1330
	1331	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
	1332
	1333	/* From in_flight test above, we know that cwnd > in_flight. */
	1334	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
	1335
	1336	limit = min(send_win, cong_win);
	1337
	1338	/* If a full-sized TSO skb can be sent, do it. */
	1339	if (limit >= sk->sk_gso_max_size)
	1340	goto send_now;
	1341
	1342	if (sysctl_tcp_tso_win_divisor) {
	1343	u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
	1344
	1345	/* If at least some fraction of a window is available,
	1346	* just use it.
	1347	*/
	1348	chunk /= sysctl_tcp_tso_win_divisor;
	1349	if (limit >= chunk)
	1350	goto send_now;
	1351	} else {
	1352	/* Different approach, try not to defer past a single
	1353	* ACK. Receiver should ACK every other full sized
	1354	* frame, so if we have space for more than 3 frames
	1355	* then send now.
	1356	*/
	1357	if (limit > tcp_max_burst(tp) * tp->mss_cache)
	1358	goto send_now;
	1359	}
	1360
	1361	/* Ok, it looks like it is advisable to defer. */
	1362	tp->tso_deferred = 1 \| (jiffies << 1);
	1363
	1364	return 1;
	1365
	1366	send_now:
	1367	tp->tso_deferred = 0;
	1368	return 0;
	1369	}
	1370
	1371	/* Create a new MTU probe if we are ready.
	1372	* Returns 0 if we should wait to probe (no cwnd available),
	1373	* 1 if a probe was sent,
	1374	* -1 otherwise
	1375	*/
	1376	static int tcp_mtu_probe(struct sock *sk)
	1377	{
	1378	struct tcp_sock *tp = tcp_sk(sk);
	1379	struct inet_connection_sock *icsk = inet_csk(sk);
	1380	struct sk_buff skb, nskb, *next;
	1381	int len;
	1382	int probe_size;
	1383	int size_needed;
	1384	int copy;
	1385	int mss_now;
	1386
	1387	/* Not currently probing/verifying,
	1388	* not in recovery,
	1389	* have enough cwnd, and
	1390	* not SACKing (the variable headers throw things off) */
	1391	if (!icsk->icsk_mtup.enabled \|\|
	1392	icsk->icsk_mtup.probe_size \|\|
	1393	inet_csk(sk)->icsk_ca_state != TCP_CA_Open \|\|
	1394	tp->snd_cwnd < 11 \|\|
	1395	tp->rx_opt.eff_sacks)
	1396	return -1;
	1397
	1398	/* Very simple search strategy: just double the MSS. */
	1399	mss_now = tcp_current_mss(sk, 0);
	1400	probe_size = 2 * tp->mss_cache;
	1401	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
	1402	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
	1403	/* TODO: set timer for probe_converge_event */
	1404	return -1;
	1405	}
	1406
	1407	/* Have enough data in the send queue to probe? */
	1408	if (tp->write_seq - tp->snd_nxt < size_needed)
	1409	return -1;
	1410
	1411	if (tp->snd_wnd < size_needed)
	1412	return -1;
	1413	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
	1414	return 0;
	1415
	1416	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
	1417	if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
	1418	if (!tcp_packets_in_flight(tp))
	1419	return -1;
	1420	else
	1421	return 0;
	1422	}
	1423
	1424	/* We're allowed to probe. Build it now. */
	1425	if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
	1426	return -1;
	1427	sk->sk_wmem_queued += nskb->truesize;
	1428	sk_mem_charge(sk, nskb->truesize);
	1429
	1430	skb = tcp_send_head(sk);
	1431
	1432	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
	1433	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
	1434	TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
	1435	TCP_SKB_CB(nskb)->sacked = 0;
	1436	nskb->csum = 0;
	1437	nskb->ip_summed = skb->ip_summed;
	1438
	1439	tcp_insert_write_queue_before(nskb, skb, sk);
	1440
	1441	len = 0;
	1442	tcp_for_write_queue_from_safe(skb, next, sk) {
	1443	copy = min_t(int, skb->len, probe_size - len);
	1444	if (nskb->ip_summed)
	1445	skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
	1446	else
	1447	nskb->csum = skb_copy_and_csum_bits(skb, 0,
	1448	skb_put(nskb, copy),
	1449	copy, nskb->csum);
	1450
	1451	if (skb->len <= copy) {
	1452	/* We've eaten all the data from this skb.
	1453	* Throw it away. */
	1454	TCP_SKB_CB(nskb)->flags \|= TCP_SKB_CB(skb)->flags;
	1455	tcp_unlink_write_queue(skb, sk);
	1456	sk_wmem_free_skb(sk, skb);
	1457	} else {
	1458	TCP_SKB_CB(nskb)->flags \|= TCP_SKB_CB(skb)->flags &
	1459	~(TCPCB_FLAG_FIN\|TCPCB_FLAG_PSH);
	1460	if (!skb_shinfo(skb)->nr_frags) {
	1461	skb_pull(skb, copy);
	1462	if (skb->ip_summed != CHECKSUM_PARTIAL)
	1463	skb->csum = csum_partial(skb->data,
	1464	skb->len, 0);
	1465	} else {
	1466	__pskb_trim_head(skb, copy);
	1467	tcp_set_skb_tso_segs(sk, skb, mss_now);
	1468	}
	1469	TCP_SKB_CB(skb)->seq += copy;
	1470	}
	1471
	1472	len += copy;
	1473
	1474	if (len >= probe_size)
	1475	break;
	1476	}
	1477	tcp_init_tso_segs(sk, nskb, nskb->len);
	1478
	1479	/* We're ready to send. If this fails, the probe will
	1480	* be resegmented into mss-sized pieces by tcp_write_xmit(). */
	1481	TCP_SKB_CB(nskb)->when = tcp_time_stamp;
	1482	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
	1483	/* Decrement cwnd here because we are sending
	1484	* effectively two packets. */
	1485	tp->snd_cwnd--;
	1486	tcp_event_new_data_sent(sk, nskb);
	1487
	1488	icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
	1489	tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
	1490	tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
	1491
	1492	return 1;
	1493	}
	1494
	1495	return -1;
	1496	}
	1497
	1498	/* This routine writes packets to the network. It advances the
	1499	* send_head. This happens as incoming acks open up the remote
	1500	* window for us.
	1501	*
	1502	* Returns 1, if no segments are in flight and we have queued segments, but
	1503	* cannot send anything now because of SWS or another problem.
	1504	*/
	1505	static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
	1506	{
	1507	struct tcp_sock *tp = tcp_sk(sk);
	1508	struct sk_buff *skb;
	1509	unsigned int tso_segs, sent_pkts;
	1510	int cwnd_quota;
	1511	int result;
	1512
	1513	/* If we are closed, the bytes will have to remain here.
	1514	* In time closedown will finish, we empty the write queue and all
	1515	* will be happy.
	1516	*/
	1517	if (unlikely(sk->sk_state == TCP_CLOSE))
	1518	return 0;
	1519
	1520	sent_pkts = 0;
	1521
	1522	/* Do MTU probing. */
	1523	if ((result = tcp_mtu_probe(sk)) == 0) {
	1524	return 0;
	1525	} else if (result > 0) {
	1526	sent_pkts = 1;
	1527	}
	1528
	1529	while ((skb = tcp_send_head(sk))) {
	1530	unsigned int limit;
	1531
	1532	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
	1533	BUG_ON(!tso_segs);
	1534
	1535	cwnd_quota = tcp_cwnd_test(tp, skb);
	1536	if (!cwnd_quota)
	1537	break;
	1538
	1539	if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
	1540	break;
	1541
	1542	if (tso_segs == 1) {
	1543	if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
	1544	(tcp_skb_is_last(sk, skb) ?
	1545	nonagle : TCP_NAGLE_PUSH))))
	1546	break;
	1547	} else {
	1548	if (tcp_tso_should_defer(sk, skb))
	1549	break;
	1550	}
	1551
	1552	limit = mss_now;
	1553	if (tso_segs > 1)
	1554	limit = tcp_mss_split_point(sk, skb, mss_now,
	1555	cwnd_quota);
	1556
	1557	if (skb->len > limit &&
	1558	unlikely(tso_fragment(sk, skb, limit, mss_now)))
	1559	break;
	1560
	1561	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	1562
	1563	if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
	1564	break;
	1565
	1566	/* Advance the send_head. This one is sent out.
	1567	* This call will increment packets_out.
	1568	*/
	1569	tcp_event_new_data_sent(sk, skb);
	1570
	1571	tcp_minshall_update(tp, mss_now, skb);
	1572	sent_pkts++;
	1573	}
	1574
	1575	if (likely(sent_pkts)) {
	1576	tcp_cwnd_validate(sk);
	1577	return 0;
	1578	}
	1579	return !tp->packets_out && tcp_send_head(sk);
	1580	}
	1581
	1582	/* Push out any pending frames which were held back due to
	1583	* TCP_CORK or attempt at coalescing tiny packets.
	1584	* The socket must be locked by the caller.
	1585	*/
	1586	void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
	1587	int nonagle)
	1588	{
	1589	struct sk_buff *skb = tcp_send_head(sk);
	1590
	1591	if (skb) {
	1592	if (tcp_write_xmit(sk, cur_mss, nonagle))
	1593	tcp_check_probe_timer(sk);
	1594	}
	1595	}
	1596
	1597	/* Send _single_ skb sitting at the send head. This function requires
	1598	* true push pending frames to setup probe timer etc.
	1599	*/
	1600	void tcp_push_one(struct sock *sk, unsigned int mss_now)
	1601	{
	1602	struct sk_buff *skb = tcp_send_head(sk);
	1603	unsigned int tso_segs, cwnd_quota;
	1604
	1605	BUG_ON(!skb \|\| skb->len < mss_now);
	1606
	1607	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
	1608	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
	1609
	1610	if (likely(cwnd_quota)) {
	1611	unsigned int limit;
	1612
	1613	BUG_ON(!tso_segs);
	1614
	1615	limit = mss_now;
	1616	if (tso_segs > 1)
	1617	limit = tcp_mss_split_point(sk, skb, mss_now,
	1618	cwnd_quota);
	1619
	1620	if (skb->len > limit &&
	1621	unlikely(tso_fragment(sk, skb, limit, mss_now)))
	1622	return;
	1623
	1624	/* Send it out now. */
	1625	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	1626
	1627	if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
	1628	tcp_event_new_data_sent(sk, skb);
	1629	tcp_cwnd_validate(sk);
	1630	return;
	1631	}
	1632	}
	1633	}
	1634
	1635	/* This function returns the amount that we can raise the
	1636	* usable window based on the following constraints
	1637	*
	1638	* 1. The window can never be shrunk once it is offered (RFC 793)
	1639	* 2. We limit memory per socket
	1640	*
	1641	* RFC 1122:
	1642	* "the suggested [SWS] avoidance algorithm for the receiver is to keep
	1643	* RECV.NEXT + RCV.WIN fixed until:
	1644	* RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
	1645	*
	1646	* i.e. don't raise the right edge of the window until you can raise
	1647	* it at least MSS bytes.
	1648	*
	1649	* Unfortunately, the recommended algorithm breaks header prediction,
	1650	* since header prediction assumes th->window stays fixed.
	1651	*
	1652	* Strictly speaking, keeping th->window fixed violates the receiver
	1653	* side SWS prevention criteria. The problem is that under this rule
	1654	* a stream of single byte packets will cause the right side of the
	1655	* window to always advance by a single byte.
	1656	*
	1657	* Of course, if the sender implements sender side SWS prevention
	1658	* then this will not be a problem.
	1659	*
	1660	* BSD seems to make the following compromise:
	1661	*
	1662	* If the free space is less than the 1/4 of the maximum
	1663	* space available and the free space is less than 1/2 mss,
	1664	* then set the window to 0.
	1665	* [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
	1666	* Otherwise, just prevent the window from shrinking
	1667	* and from being larger than the largest representable value.
	1668	*
	1669	* This prevents incremental opening of the window in the regime
	1670	* where TCP is limited by the speed of the reader side taking
	1671	* data out of the TCP receive queue. It does nothing about
	1672	* those cases where the window is constrained on the sender side
	1673	* because the pipeline is full.
	1674	*
	1675	* BSD also seems to "accidentally" limit itself to windows that are a
	1676	* multiple of MSS, at least until the free space gets quite small.
	1677	* This would appear to be a side effect of the mbuf implementation.
	1678	* Combining these two algorithms results in the observed behavior
	1679	* of having a fixed window size at almost all times.
	1680	*
	1681	* Below we obtain similar behavior by forcing the offered window to
	1682	* a multiple of the mss when it is feasible to do so.
	1683	*
	1684	* Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
	1685	* Regular options like TIMESTAMP are taken into account.
	1686	*/
	1687	u32 __tcp_select_window(struct sock *sk)
	1688	{
	1689	struct inet_connection_sock *icsk = inet_csk(sk);
	1690	struct tcp_sock *tp = tcp_sk(sk);
	1691	/* MSS for the peer's data. Previous versions used mss_clamp
	1692	* here. I don't know if the value based on our guesses
	1693	* of peer's MSS is better for the performance. It's more correct
	1694	* but may be worse for the performance because of rcv_mss
	1695	* fluctuations. --SAW 1998/11/1
	1696	*/
	1697	int mss = icsk->icsk_ack.rcv_mss;
	1698	int free_space = tcp_space(sk);
	1699	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
	1700	int window;
	1701
	1702	if (mss > full_space)
	1703	mss = full_space;
	1704
	1705	if (free_space < (full_space >> 1)) {
	1706	icsk->icsk_ack.quick = 0;
	1707
	1708	if (tcp_memory_pressure)
	1709	tp->rcv_ssthresh = min(tp->rcv_ssthresh,
	1710	4U * tp->advmss);
	1711
	1712	if (free_space < mss)
	1713	return 0;
	1714	}
	1715
	1716	if (free_space > tp->rcv_ssthresh)
	1717	free_space = tp->rcv_ssthresh;
	1718
	1719	/* Don't do rounding if we are using window scaling, since the
	1720	* scaled window will not line up with the MSS boundary anyway.
	1721	*/
	1722	window = tp->rcv_wnd;
	1723	if (tp->rx_opt.rcv_wscale) {
	1724	window = free_space;
	1725
	1726	/* Advertise enough space so that it won't get scaled away.
	1727	* Import case: prevent zero window announcement if
	1728	* 1<<rcv_wscale > mss.
	1729	*/
	1730	if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
	1731	window = (((window >> tp->rx_opt.rcv_wscale) + 1)
	1732	<< tp->rx_opt.rcv_wscale);
	1733	} else {
	1734	/* Get the largest window that is a nice multiple of mss.
	1735	* Window clamp already applied above.
	1736	* If our current window offering is within 1 mss of the
	1737	* free space we just keep it. This prevents the divide
	1738	* and multiply from happening most of the time.
	1739	* We also don't do any window rounding when the free space
	1740	* is too small.
	1741	*/
	1742	if (window <= free_space - mss \|\| window > free_space)
	1743	window = (free_space / mss) * mss;
	1744	else if (mss == full_space &&
	1745	free_space > window + (full_space >> 1))
	1746	window = free_space;
	1747	}
	1748
	1749	return window;
	1750	}
	1751
	1752	/* Attempt to collapse two adjacent SKB's during retransmission. */
	1753	static void tcp_retrans_try_collapse(struct sock sk, struct sk_buff skb,
	1754	int mss_now)
	1755	{
	1756	struct tcp_sock *tp = tcp_sk(sk);
	1757	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
	1758	int skb_size, next_skb_size;
	1759	u16 flags;
	1760
	1761	/* The first test we must make is that neither of these two
	1762	* SKB's are still referenced by someone else.
	1763	*/
	1764	if (skb_cloned(skb) \|\| skb_cloned(next_skb))
	1765	return;
	1766
	1767	skb_size = skb->len;
	1768	next_skb_size = next_skb->len;
	1769	flags = TCP_SKB_CB(skb)->flags;
	1770
	1771	/* Also punt if next skb has been SACK'd. */
	1772	if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
	1773	return;
	1774
	1775	/* Next skb is out of window. */
	1776	if (after(TCP_SKB_CB(next_skb)->end_seq, tcp_wnd_end(tp)))
	1777	return;
	1778
	1779	/* Punt if not enough space exists in the first SKB for
	1780	* the data in the second, or the total combined payload
	1781	* would exceed the MSS.
	1782	*/
	1783	if ((next_skb_size > skb_tailroom(skb)) \|\|
	1784	((skb_size + next_skb_size) > mss_now))
	1785	return;
	1786
	1787	BUG_ON(tcp_skb_pcount(skb) != 1 \|\| tcp_skb_pcount(next_skb) != 1);
	1788
	1789	tcp_highest_sack_combine(sk, next_skb, skb);
	1790
	1791	/* Ok. We will be able to collapse the packet. */
	1792	tcp_unlink_write_queue(next_skb, sk);
	1793
	1794	skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size),
	1795	next_skb_size);
	1796
	1797	if (next_skb->ip_summed == CHECKSUM_PARTIAL)
	1798	skb->ip_summed = CHECKSUM_PARTIAL;
	1799
	1800	if (skb->ip_summed != CHECKSUM_PARTIAL)
	1801	skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
	1802
	1803	/* Update sequence range on original skb. */
	1804	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
	1805
	1806	/* Merge over control information. */
	1807	flags \|= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
	1808	TCP_SKB_CB(skb)->flags = flags;
	1809
	1810	/* All done, get rid of second SKB and account for it so
	1811	* packet counting does not break.
	1812	*/
	1813	TCP_SKB_CB(skb)->sacked \|= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
	1814	if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_RETRANS)
	1815	tp->retrans_out -= tcp_skb_pcount(next_skb);
	1816	if (TCP_SKB_CB(next_skb)->sacked & TCPCB_LOST)
	1817	tp->lost_out -= tcp_skb_pcount(next_skb);
	1818	/* Reno case is special. Sigh... */
	1819	if (tcp_is_reno(tp) && tp->sacked_out)
	1820	tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
	1821
	1822	tcp_adjust_fackets_out(sk, next_skb, tcp_skb_pcount(next_skb));
	1823	tp->packets_out -= tcp_skb_pcount(next_skb);
	1824
	1825	/* changed transmit queue under us so clear hints */
	1826	tcp_clear_retrans_hints_partial(tp);
	1827	if (next_skb == tp->retransmit_skb_hint)
	1828	tp->retransmit_skb_hint = skb;
	1829
	1830	sk_wmem_free_skb(sk, next_skb);
	1831	}
	1832
	1833	/* Do a simple retransmit without using the backoff mechanisms in
	1834	* tcp_timer. This is used for path mtu discovery.
	1835	* The socket is already locked here.
	1836	*/
	1837	void tcp_simple_retransmit(struct sock *sk)
	1838	{
	1839	const struct inet_connection_sock *icsk = inet_csk(sk);
	1840	struct tcp_sock *tp = tcp_sk(sk);
	1841	struct sk_buff *skb;
	1842	unsigned int mss = tcp_current_mss(sk, 0);
	1843	u32 prior_lost = tp->lost_out;
	1844
	1845	tcp_for_write_queue(skb, sk) {
	1846	if (skb == tcp_send_head(sk))
	1847	break;
	1848	if (skb->len > mss &&
	1849	!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
	1850	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
	1851	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
	1852	tp->retrans_out -= tcp_skb_pcount(skb);
	1853	}
	1854	tcp_skb_mark_lost_uncond_verify(tp, skb);
	1855	}
	1856	}
	1857
	1858	tcp_clear_retrans_hints_partial(tp);
	1859
	1860	if (prior_lost == tp->lost_out)
	1861	return;
	1862
	1863	if (tcp_is_reno(tp))
	1864	tcp_limit_reno_sacked(tp);
	1865
	1866	tcp_verify_left_out(tp);
	1867
	1868	/* Don't muck with the congestion window here.
	1869	* Reason is that we do not increase amount of _data_
	1870	* in network, but units changed and effective
	1871	* cwnd/ssthresh really reduced now.
	1872	*/
	1873	if (icsk->icsk_ca_state != TCP_CA_Loss) {
	1874	tp->high_seq = tp->snd_nxt;
	1875	tp->snd_ssthresh = tcp_current_ssthresh(sk);
	1876	tp->prior_ssthresh = 0;
	1877	tp->undo_marker = 0;
	1878	tcp_set_ca_state(sk, TCP_CA_Loss);
	1879	}
	1880	tcp_xmit_retransmit_queue(sk);
	1881	}
	1882
	1883	/* This retransmits one SKB. Policy decisions and retransmit queue
	1884	* state updates are done by the caller. Returns non-zero if an
	1885	* error occurred which prevented the send.
	1886	*/
	1887	int tcp_retransmit_skb(struct sock sk, struct sk_buff skb)
	1888	{
	1889	struct tcp_sock *tp = tcp_sk(sk);
	1890	struct inet_connection_sock *icsk = inet_csk(sk);
	1891	unsigned int cur_mss;
	1892	int err;
	1893
	1894	/* Inconslusive MTU probe */
	1895	if (icsk->icsk_mtup.probe_size) {
	1896	icsk->icsk_mtup.probe_size = 0;
	1897	}
	1898
	1899	/* Do not sent more than we queued. 1/4 is reserved for possible
	1900	* copying overhead: fragmentation, tunneling, mangling etc.
	1901	*/
	1902	if (atomic_read(&sk->sk_wmem_alloc) >
	1903	min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
	1904	return -EAGAIN;
	1905
	1906	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
	1907	if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
	1908	BUG();
	1909	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
	1910	return -ENOMEM;
	1911	}
	1912
	1913	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
	1914	return -EHOSTUNREACH; /* Routing failure or similar. */
	1915
	1916	cur_mss = tcp_current_mss(sk, 0);
	1917
	1918	/* If receiver has shrunk his window, and skb is out of
	1919	* new window, do not retransmit it. The exception is the
	1920	* case, when window is shrunk to zero. In this case
	1921	* our retransmit serves as a zero window probe.
	1922	*/
	1923	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))
	1924	&& TCP_SKB_CB(skb)->seq != tp->snd_una)
	1925	return -EAGAIN;
	1926
	1927	if (skb->len > cur_mss) {
	1928	if (tcp_fragment(sk, skb, cur_mss, cur_mss))
	1929	return -ENOMEM; /* We'll try again later. */
	1930	}
	1931
	1932	/* Collapse two adjacent packets if worthwhile and we can. */
	1933	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
	1934	(skb->len < (cur_mss >> 1)) &&
	1935	(!tcp_skb_is_last(sk, skb)) &&
	1936	(tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
	1937	(skb_shinfo(skb)->nr_frags == 0 &&
	1938	skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
	1939	(tcp_skb_pcount(skb) == 1 &&
	1940	tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) &&
	1941	(sysctl_tcp_retrans_collapse != 0))
	1942	tcp_retrans_try_collapse(sk, skb, cur_mss);
	1943
	1944	/* Some Solaris stacks overoptimize and ignore the FIN on a
	1945	* retransmit when old data is attached. So strip it off
	1946	* since it is cheap to do so and saves bytes on the network.
	1947	*/
	1948	if (skb->len > 0 &&
	1949	(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	1950	tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
	1951	if (!pskb_trim(skb, 0)) {
	1952	/* Reuse, even though it does some unnecessary work */
	1953	tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1,
	1954	TCP_SKB_CB(skb)->flags);
	1955	skb->ip_summed = CHECKSUM_NONE;
	1956	}
	1957	}
	1958
	1959	/* Make a copy, if the first transmission SKB clone we made
	1960	* is still in somebody's hands, else make a clone.
	1961	*/
	1962	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	1963
	1964	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
	1965
	1966	if (err == 0) {
	1967	/* Update global TCP statistics. */
	1968	TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
	1969
	1970	tp->total_retrans++;
	1971
	1972	#if FASTRETRANS_DEBUG > 0
	1973	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
	1974	if (net_ratelimit())
	1975	printk(KERN_DEBUG "retrans_out leaked.\n");
	1976	}
	1977	#endif
	1978	if (!tp->retrans_out)
	1979	tp->lost_retrans_low = tp->snd_nxt;
	1980	TCP_SKB_CB(skb)->sacked \|= TCPCB_RETRANS;
	1981	tp->retrans_out += tcp_skb_pcount(skb);
	1982
	1983	/* Save stamp of the first retransmit. */
	1984	if (!tp->retrans_stamp)
	1985	tp->retrans_stamp = TCP_SKB_CB(skb)->when;
	1986
	1987	tp->undo_retrans++;
	1988
	1989	/* snd_nxt is stored to detect loss of retransmitted segment,
	1990	* see tcp_input.c tcp_sacktag_write_queue().
	1991	*/
	1992	TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
	1993	}
	1994	return err;
	1995	}
	1996
	1997	static int tcp_can_forward_retransmit(struct sock *sk)
	1998	{
	1999	const struct inet_connection_sock *icsk = inet_csk(sk);
	2000	struct tcp_sock *tp = tcp_sk(sk);
	2001
	2002	/* Forward retransmissions are possible only during Recovery. */
	2003	if (icsk->icsk_ca_state != TCP_CA_Recovery)
	2004	return 0;
	2005
	2006	/* No forward retransmissions in Reno are possible. */
	2007	if (tcp_is_reno(tp))
	2008	return 0;
	2009
	2010	/* Yeah, we have to make difficult choice between forward transmission
	2011	* and retransmission... Both ways have their merits...
	2012	*
	2013	* For now we do not retransmit anything, while we have some new
	2014	* segments to send. In the other cases, follow rule 3 for
	2015	* NextSeg() specified in RFC3517.
	2016	*/
	2017
	2018	if (tcp_may_send_now(sk))
	2019	return 0;
	2020
	2021	return 1;
	2022	}
	2023
	2024	/* This gets called after a retransmit timeout, and the initially
	2025	* retransmitted data is acknowledged. It tries to continue
	2026	* resending the rest of the retransmit queue, until either
	2027	* we've sent it all or the congestion window limit is reached.
	2028	* If doing SACK, the first ACK which comes back for a timeout
	2029	* based retransmit packet might feed us FACK information again.
	2030	* If so, we use it to avoid unnecessarily retransmissions.
	2031	*/
	2032	void tcp_xmit_retransmit_queue(struct sock *sk)
	2033	{
	2034	const struct inet_connection_sock *icsk = inet_csk(sk);
	2035	struct tcp_sock *tp = tcp_sk(sk);
	2036	struct sk_buff *skb;
	2037	struct sk_buff *hole = NULL;
	2038	u32 last_lost;
	2039	int mib_idx;
	2040	int fwd_rexmitting = 0;
	2041
	2042	if (!tp->lost_out)
	2043	tp->retransmit_high = tp->snd_una;
	2044
	2045	if (tp->retransmit_skb_hint) {
	2046	skb = tp->retransmit_skb_hint;
	2047	last_lost = TCP_SKB_CB(skb)->end_seq;
	2048	if (after(last_lost, tp->retransmit_high))
	2049	last_lost = tp->retransmit_high;
	2050	} else {
	2051	skb = tcp_write_queue_head(sk);
	2052	last_lost = tp->snd_una;
	2053	}
	2054
	2055	/* First pass: retransmit lost packets. */
	2056	tcp_for_write_queue_from(skb, sk) {
	2057	__u8 sacked = TCP_SKB_CB(skb)->sacked;
	2058
	2059	if (skb == tcp_send_head(sk))
	2060	break;
	2061	/* we could do better than to assign each time */
	2062	if (hole == NULL)
	2063	tp->retransmit_skb_hint = skb;
	2064
	2065	/* Assume this retransmit will generate
	2066	* only one packet for congestion window
	2067	* calculation purposes. This works because
	2068	* tcp_retransmit_skb() will chop up the
	2069	* packet to be MSS sized and all the
	2070	* packet counting works out.
	2071	*/
	2072	if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
	2073	return;
	2074
	2075	if (fwd_rexmitting) {
	2076	begin_fwd:
	2077	if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
	2078	break;
	2079	mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
	2080
	2081	} else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
	2082	tp->retransmit_high = last_lost;
	2083	if (!tcp_can_forward_retransmit(sk))
	2084	break;
	2085	/* Backtrack if necessary to non-L'ed skb */
	2086	if (hole != NULL) {
	2087	skb = hole;
	2088	hole = NULL;
	2089	}
	2090	fwd_rexmitting = 1;
	2091	goto begin_fwd;
	2092
	2093	} else if (!(sacked & TCPCB_LOST)) {
	2094	if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS))
	2095	hole = skb;
	2096	continue;
	2097
	2098	} else {
	2099	last_lost = TCP_SKB_CB(skb)->end_seq;
	2100	if (icsk->icsk_ca_state != TCP_CA_Loss)
	2101	mib_idx = LINUX_MIB_TCPFASTRETRANS;
	2102	else
	2103	mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
	2104	}
	2105
	2106	if (sacked & (TCPCB_SACKED_ACKED\|TCPCB_SACKED_RETRANS))
	2107	continue;
	2108
	2109	if (tcp_retransmit_skb(sk, skb))
	2110	return;
	2111	NET_INC_STATS_BH(sock_net(sk), mib_idx);
	2112
	2113	if (skb == tcp_write_queue_head(sk))
	2114	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
	2115	inet_csk(sk)->icsk_rto,
	2116	TCP_RTO_MAX);
	2117	}
	2118	}
	2119
	2120	/* Send a fin. The caller locks the socket for us. This cannot be
	2121	* allowed to fail queueing a FIN frame under any circumstances.
	2122	*/
	2123	void tcp_send_fin(struct sock *sk)
	2124	{
	2125	struct tcp_sock *tp = tcp_sk(sk);
	2126	struct sk_buff *skb = tcp_write_queue_tail(sk);
	2127	int mss_now;
	2128
	2129	/* Optimization, tack on the FIN if we have a queue of
	2130	* unsent frames. But be careful about outgoing SACKS
	2131	* and IP options.
	2132	*/
	2133	mss_now = tcp_current_mss(sk, 1);
	2134
	2135	if (tcp_send_head(sk) != NULL) {
	2136	TCP_SKB_CB(skb)->flags \|= TCPCB_FLAG_FIN;
	2137	TCP_SKB_CB(skb)->end_seq++;
	2138	tp->write_seq++;
	2139	} else {
	2140	/* Socket is locked, keep trying until memory is available. */
	2141	for (;;) {
	2142	skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
	2143	if (skb)
	2144	break;
	2145	yield();
	2146	}
	2147
	2148	/* Reserve space for headers and prepare control bits. */
	2149	skb_reserve(skb, MAX_TCP_HEADER);
	2150	/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
	2151	tcp_init_nondata_skb(skb, tp->write_seq,
	2152	TCPCB_FLAG_ACK \| TCPCB_FLAG_FIN);
	2153	tcp_queue_skb(sk, skb);
	2154	}
	2155	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
	2156	}
	2157
	2158	/* We get here when a process closes a file descriptor (either due to
	2159	* an explicit close() or as a byproduct of exit()'ing) and there
	2160	* was unread data in the receive queue. This behavior is recommended
	2161	* by RFC 2525, section 2.17. -DaveM
	2162	*/
	2163	void tcp_send_active_reset(struct sock *sk, gfp_t priority)
	2164	{
	2165	struct sk_buff *skb;
	2166
	2167	/* NOTE: No TCP options attached and we never retransmit this. */
	2168	skb = alloc_skb(MAX_TCP_HEADER, priority);
	2169	if (!skb) {
	2170	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
	2171	return;
	2172	}
	2173
	2174	/* Reserve space for headers and prepare control bits. */
	2175	skb_reserve(skb, MAX_TCP_HEADER);
	2176	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
	2177	TCPCB_FLAG_ACK \| TCPCB_FLAG_RST);
	2178	/* Send it off. */
	2179	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	2180	if (tcp_transmit_skb(sk, skb, 0, priority))
	2181	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
	2182
	2183	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
	2184	}
	2185
	2186	/* WARNING: This routine must only be called when we have already sent
	2187	* a SYN packet that crossed the incoming SYN that caused this routine
	2188	* to get called. If this assumption fails then the initial rcv_wnd
	2189	* and rcv_wscale values will not be correct.
	2190	*/
	2191	int tcp_send_synack(struct sock *sk)
	2192	{
	2193	struct sk_buff *skb;
	2194
	2195	skb = tcp_write_queue_head(sk);
	2196	if (skb == NULL \|\| !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) {
	2197	printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
	2198	return -EFAULT;
	2199	}
	2200	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) {
	2201	if (skb_cloned(skb)) {
	2202	struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
	2203	if (nskb == NULL)
	2204	return -ENOMEM;
	2205	tcp_unlink_write_queue(skb, sk);
	2206	skb_header_release(nskb);
	2207	__tcp_add_write_queue_head(sk, nskb);
	2208	sk_wmem_free_skb(sk, skb);
	2209	sk->sk_wmem_queued += nskb->truesize;
	2210	sk_mem_charge(sk, nskb->truesize);
	2211	skb = nskb;
	2212	}
	2213
	2214	TCP_SKB_CB(skb)->flags \|= TCPCB_FLAG_ACK;
	2215	TCP_ECN_send_synack(tcp_sk(sk), skb);
	2216	}
	2217	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	2218	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
	2219	}
	2220
	2221	/*
	2222	* Prepare a SYN-ACK.
	2223	*/
	2224	struct sk_buff tcp_make_synack(struct sock sk, struct dst_entry *dst,
	2225	struct request_sock *req)
	2226	{
	2227	struct inet_request_sock *ireq = inet_rsk(req);
	2228	struct tcp_sock *tp = tcp_sk(sk);
	2229	struct tcphdr *th;
	2230	int tcp_header_size;
	2231	struct tcp_out_options opts;
	2232	struct sk_buff *skb;
	2233	struct tcp_md5sig_key *md5;
	2234	__u8 *md5_hash_location;
	2235	int mss;
	2236
	2237	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
	2238	if (skb == NULL)
	2239	return NULL;
	2240
	2241	/* Reserve space for headers. */
	2242	skb_reserve(skb, MAX_TCP_HEADER);
	2243
	2244	skb->dst = dst_clone(dst);
	2245
	2246	mss = dst_metric(dst, RTAX_ADVMSS);
	2247	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
	2248	mss = tp->rx_opt.user_mss;
	2249
	2250	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
	2251	__u8 rcv_wscale;
	2252	/* Set this up on the first call only */
	2253	req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
	2254	/* tcp_full_space because it is guaranteed to be the first packet */
	2255	tcp_select_initial_window(tcp_full_space(sk),
	2256	mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
	2257	&req->rcv_wnd,
	2258	&req->window_clamp,
	2259	ireq->wscale_ok,
	2260	&rcv_wscale);
	2261	ireq->rcv_wscale = rcv_wscale;
	2262	}
	2263
	2264	memset(&opts, 0, sizeof(opts));
	2265	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	2266	tcp_header_size = tcp_synack_options(sk, req, mss,
	2267	skb, &opts, &md5) +
	2268	sizeof(struct tcphdr);
	2269
	2270	skb_push(skb, tcp_header_size);
	2271	skb_reset_transport_header(skb);
	2272
	2273	th = tcp_hdr(skb);
	2274	memset(th, 0, sizeof(struct tcphdr));
	2275	th->syn = 1;
	2276	th->ack = 1;
	2277	TCP_ECN_make_synack(req, th);
	2278	th->source = ireq->loc_port;
	2279	th->dest = ireq->rmt_port;
	2280	/* Setting of flags are superfluous here for callers (and ECE is
	2281	* not even correctly set)
	2282	*/
	2283	tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
	2284	TCPCB_FLAG_SYN \| TCPCB_FLAG_ACK);
	2285	th->seq = htonl(TCP_SKB_CB(skb)->seq);
	2286	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
	2287
	2288	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	2289	th->window = htons(min(req->rcv_wnd, 65535U));
	2290	#ifdef CONFIG_SYN_COOKIES
	2291	if (unlikely(req->cookie_ts))
	2292	TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
	2293	else
	2294	#endif
	2295	tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
	2296	th->doff = (tcp_header_size >> 2);
	2297	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
	2298
	2299	#ifdef CONFIG_TCP_MD5SIG
	2300	/* Okay, we have all we need - do the md5 hash if needed */
	2301	if (md5) {
	2302	tp->af_specific->calc_md5_hash(md5_hash_location,
	2303	md5, NULL, req, skb);
	2304	}
	2305	#endif
	2306
	2307	return skb;
	2308	}
	2309
	2310	/*
	2311	* Do all connect socket setups that can be done AF independent.
	2312	*/
	2313	static void tcp_connect_init(struct sock *sk)
	2314	{
	2315	struct dst_entry *dst = __sk_dst_get(sk);
	2316	struct tcp_sock *tp = tcp_sk(sk);
	2317	__u8 rcv_wscale;
	2318
	2319	/* We'll fix this up when we get a response from the other end.
	2320	* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	2321	*/
	2322	tp->tcp_header_len = sizeof(struct tcphdr) +
	2323	(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
	2324
	2325	#ifdef CONFIG_TCP_MD5SIG
	2326	if (tp->af_specific->md5_lookup(sk, sk) != NULL)
	2327	tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
	2328	#endif
	2329
	2330	/* If user gave his TCP_MAXSEG, record it to clamp */
	2331	if (tp->rx_opt.user_mss)
	2332	tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
	2333	tp->max_window = 0;
	2334	tcp_mtup_init(sk);
	2335	tcp_sync_mss(sk, dst_mtu(dst));
	2336
	2337	if (!tp->window_clamp)
	2338	tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
	2339	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
	2340	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
	2341	tp->advmss = tp->rx_opt.user_mss;
	2342
	2343	tcp_initialize_rcv_mss(sk);
	2344
	2345	tcp_select_initial_window(tcp_full_space(sk),
	2346	tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
	2347	&tp->rcv_wnd,
	2348	&tp->window_clamp,
	2349	sysctl_tcp_window_scaling,
	2350	&rcv_wscale);
	2351
	2352	tp->rx_opt.rcv_wscale = rcv_wscale;
	2353	tp->rcv_ssthresh = tp->rcv_wnd;
	2354
	2355	sk->sk_err = 0;
	2356	sock_reset_flag(sk, SOCK_DONE);
	2357	tp->snd_wnd = 0;
	2358	tcp_init_wl(tp, tp->write_seq, 0);
	2359	tp->snd_una = tp->write_seq;
	2360	tp->snd_sml = tp->write_seq;
	2361	tp->rcv_nxt = 0;
	2362	tp->rcv_wup = 0;
	2363	tp->copied_seq = 0;
	2364
	2365	inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
	2366	inet_csk(sk)->icsk_retransmits = 0;
	2367	tcp_clear_retrans(tp);
	2368	}
	2369
	2370	/*
	2371	* Build a SYN and send it off.
	2372	*/
	2373	int tcp_connect(struct sock *sk)
	2374	{
	2375	struct tcp_sock *tp = tcp_sk(sk);
	2376	struct sk_buff *buff;
	2377
	2378	tcp_connect_init(sk);
	2379
	2380	buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
	2381	if (unlikely(buff == NULL))
	2382	return -ENOBUFS;
	2383
	2384	/* Reserve space for headers. */
	2385	skb_reserve(buff, MAX_TCP_HEADER);
	2386
	2387	tp->snd_nxt = tp->write_seq;
	2388	tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN);
	2389	TCP_ECN_send_syn(sk, buff);
	2390
	2391	/* Send it off. */
	2392	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	2393	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
	2394	skb_header_release(buff);
	2395	__tcp_add_write_queue_tail(sk, buff);
	2396	sk->sk_wmem_queued += buff->truesize;
	2397	sk_mem_charge(sk, buff->truesize);
	2398	tp->packets_out += tcp_skb_pcount(buff);
	2399	tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
	2400
	2401	/* We change tp->snd_nxt after the tcp_transmit_skb() call
	2402	* in order to make this packet get counted in tcpOutSegs.
	2403	*/
	2404	tp->snd_nxt = tp->write_seq;
	2405	tp->pushed_seq = tp->write_seq;
	2406	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
	2407
	2408	/* Timer for repeating the SYN until an answer. */
	2409	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
	2410	inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	2411	return 0;
	2412	}
	2413
	2414	/* Send out a delayed ack, the caller does the policy checking
	2415	* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
	2416	* for details.
	2417	*/
	2418	void tcp_send_delayed_ack(struct sock *sk)
	2419	{
	2420	struct inet_connection_sock *icsk = inet_csk(sk);
	2421	int ato = icsk->icsk_ack.ato;
	2422	unsigned long timeout;
	2423
	2424	if (ato > TCP_DELACK_MIN) {
	2425	const struct tcp_sock *tp = tcp_sk(sk);
	2426	int max_ato = HZ / 2;
	2427
	2428	if (icsk->icsk_ack.pingpong \|\|
	2429	(icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
	2430	max_ato = TCP_DELACK_MAX;
	2431
	2432	/* Slow path, intersegment interval is "high". */
	2433
	2434	/* If some rtt estimate is known, use it to bound delayed ack.
	2435	* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
	2436	* directly.
	2437	*/
	2438	if (tp->srtt) {
	2439	int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
	2440
	2441	if (rtt < max_ato)
	2442	max_ato = rtt;
	2443	}
	2444
	2445	ato = min(ato, max_ato);
	2446	}
	2447
	2448	/* Stay within the limit we were given */
	2449	timeout = jiffies + ato;
	2450
	2451	/* Use new timeout only if there wasn't a older one earlier. */
	2452	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
	2453	/* If delack timer was blocked or is about to expire,
	2454	* send ACK now.
	2455	*/
	2456	if (icsk->icsk_ack.blocked \|\|
	2457	time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
	2458	tcp_send_ack(sk);
	2459	return;
	2460	}
	2461
	2462	if (!time_before(timeout, icsk->icsk_ack.timeout))
	2463	timeout = icsk->icsk_ack.timeout;
	2464	}
	2465	icsk->icsk_ack.pending \|= ICSK_ACK_SCHED \| ICSK_ACK_TIMER;
	2466	icsk->icsk_ack.timeout = timeout;
	2467	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
	2468	}
	2469
	2470	/* This routine sends an ack and also updates the window. */
	2471	void tcp_send_ack(struct sock *sk)
	2472	{
	2473	struct sk_buff *buff;
	2474
	2475	/* If we have been reset, we may not send again. */
	2476	if (sk->sk_state == TCP_CLOSE)
	2477	return;
	2478
	2479	/* We are not putting this on the write queue, so
	2480	* tcp_transmit_skb() will set the ownership to this
	2481	* sock.
	2482	*/
	2483	buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	2484	if (buff == NULL) {
	2485	inet_csk_schedule_ack(sk);
	2486	inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
	2487	inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
	2488	TCP_DELACK_MAX, TCP_RTO_MAX);
	2489	return;
	2490	}
	2491
	2492	/* Reserve space for headers and prepare control bits. */
	2493	skb_reserve(buff, MAX_TCP_HEADER);
	2494	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK);
	2495
	2496	/* Send it off, this clears delayed acks for us. */
	2497	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	2498	tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
	2499	}
	2500
	2501	/* This routine sends a packet with an out of date sequence
	2502	* number. It assumes the other end will try to ack it.
	2503	*
	2504	* Question: what should we make while urgent mode?
	2505	* 4.4BSD forces sending single byte of data. We cannot send
	2506	* out of window data, because we have SND.NXT==SND.MAX...
	2507	*
	2508	* Current solution: to send TWO zero-length segments in urgent mode:
	2509	* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
	2510	* out-of-date with SND.UNA-1 to probe window.
	2511	*/
	2512	static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
	2513	{
	2514	struct tcp_sock *tp = tcp_sk(sk);
	2515	struct sk_buff *skb;
	2516
	2517	/* We don't queue it, tcp_transmit_skb() sets ownership. */
	2518	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	2519	if (skb == NULL)
	2520	return -1;
	2521
	2522	/* Reserve space for headers and set control bits. */
	2523	skb_reserve(skb, MAX_TCP_HEADER);
	2524	/* Use a previous sequence. This should cause the other
	2525	* end to send an ack. Don't queue or clone SKB, just
	2526	* send it.
	2527	*/
	2528	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK);
	2529	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	2530	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
	2531	}
	2532
	2533	int tcp_write_wakeup(struct sock *sk)
	2534	{
	2535	struct tcp_sock *tp = tcp_sk(sk);
	2536	struct sk_buff *skb;
	2537
	2538	if (sk->sk_state == TCP_CLOSE)
	2539	return -1;
	2540
	2541	if ((skb = tcp_send_head(sk)) != NULL &&
	2542	before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
	2543	int err;
	2544	unsigned int mss = tcp_current_mss(sk, 0);
	2545	unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
	2546
	2547	if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
	2548	tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
	2549
	2550	/* We are probing the opening of a window
	2551	* but the window size is != 0
	2552	* must have been a result SWS avoidance ( sender )
	2553	*/
	2554	if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq \|\|
	2555	skb->len > mss) {
	2556	seg_size = min(seg_size, mss);
	2557	TCP_SKB_CB(skb)->flags \|= TCPCB_FLAG_PSH;
	2558	if (tcp_fragment(sk, skb, seg_size, mss))
	2559	return -1;
	2560	} else if (!tcp_skb_pcount(skb))
	2561	tcp_set_skb_tso_segs(sk, skb, mss);
	2562
	2563	TCP_SKB_CB(skb)->flags \|= TCPCB_FLAG_PSH;
	2564	TCP_SKB_CB(skb)->when = tcp_time_stamp;
	2565	err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
	2566	if (!err)
	2567	tcp_event_new_data_sent(sk, skb);
	2568	return err;
	2569	} else {
	2570	if (tp->urg_mode &&
	2571	between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
	2572	tcp_xmit_probe_skb(sk, 1);
	2573	return tcp_xmit_probe_skb(sk, 0);
	2574	}
	2575	}
	2576
	2577	/* A window probe timeout has occurred. If window is not closed send
	2578	* a partial packet else a zero probe.
	2579	*/
	2580	void tcp_send_probe0(struct sock *sk)
	2581	{
	2582	struct inet_connection_sock *icsk = inet_csk(sk);
	2583	struct tcp_sock *tp = tcp_sk(sk);
	2584	int err;
	2585
	2586	err = tcp_write_wakeup(sk);
	2587
	2588	if (tp->packets_out \|\| !tcp_send_head(sk)) {
	2589	/* Cancel probe timer, if it is not required. */
	2590	icsk->icsk_probes_out = 0;
	2591	icsk->icsk_backoff = 0;
	2592	return;
	2593	}
	2594
	2595	if (err <= 0) {
	2596	if (icsk->icsk_backoff < sysctl_tcp_retries2)
	2597	icsk->icsk_backoff++;
	2598	icsk->icsk_probes_out++;
	2599	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
	2600	min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
	2601	TCP_RTO_MAX);
	2602	} else {
	2603	/* If packet was not sent due to local congestion,
	2604	* do not backoff and do not remember icsk_probes_out.
	2605	* Let local senders to fight for local resources.
	2606	*
	2607	* Use accumulated backoff yet.
	2608	*/
	2609	if (!icsk->icsk_probes_out)
	2610	icsk->icsk_probes_out = 1;
	2611	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
	2612	min(icsk->icsk_rto << icsk->icsk_backoff,
	2613	TCP_RESOURCE_PROBE_INTERVAL),
	2614	TCP_RTO_MAX);
	2615	}
	2616	}
	2617
	/* Functions from this file published with EXPORT_SYMBOL().
	 * NOTE(review): presumably needed by modular users of the TCP
	 * output path - confirm against the callers before removing any.
	 */
	2618	EXPORT_SYMBOL(tcp_select_initial_window);
	2619	EXPORT_SYMBOL(tcp_connect);
	2620	EXPORT_SYMBOL(tcp_make_synack);
	2621	EXPORT_SYMBOL(tcp_simple_retransmit);
	2622	EXPORT_SYMBOL(tcp_sync_mss);
	2623	EXPORT_SYMBOL(tcp_mtup_init);