Continuing from the previous post, "Linux's strange half-open (SYN_RECV) queue length (Part 1)": we have already confirmed how the accept (full-connection) queue length is calculated, so now we move on to track down the half-open queue length.
I tried gradually lowering tcp_max_syn_backlog, but still could not see any change in the number of connections in the half-open (SYN_RECV) state.
Out of ideas, I turned to Google, but almost everything that came up was about SYN flood attacks. Has nobody ever looked into the actual length of the half-open queue?
After several frustrating days I finally found, late one night, an article titled 《关于半连接队列的释疑》 ("Clearing up questions about the half-open queue"). Exciting. Following the author's approach I started digging through the kernel source. Note that I am using kernel 2.6.32; the code differs between versions.
First, locate the tcp_v4_conn_request function, in net/ipv4/tcp_ipv4.c.
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
    struct inet_request_sock *ireq;
    struct tcp_options_received tmp_opt;
    struct request_sock *req;
    __be32 saddr = ip_hdr(skb)->saddr;
    __be32 daddr = ip_hdr(skb)->daddr;
    __u32 isn = TCP_SKB_CB(skb)->when;
    struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
    int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

    /* Never answer to SYNs send to broadcast or multicast */
    if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
        goto drop;

    /* TW buckets are converted to open requests without
     * limitations, they conserve resources and peer is
     * evidently real one.
     */
    // key function: inet_csk_reqsk_queue_is_full
    if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
        if (sysctl_tcp_syncookies) {
            want_cookie = 1;
        } else
#endif
        goto drop;
    }

    /* Accept backlog is full. If we have already queued enough
     * of warm entries in syn queue, drop request. It is better than
     * clogging syn queue with openreqs with exponentially increasing
     * timeout.
     */
    if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
        goto drop;

    req = inet_reqsk_alloc(&tcp_request_sock_ops);
    if (!req)
        goto drop;

    /* ... many lines omitted ... */
Follow the key function inet_csk_reqsk_queue_is_full into include/net/inet_connection_sock.h.
static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
    return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue);
}
Follow the key function reqsk_queue_is_full into include/net/request_sock.h.
static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
{
    // note: the check uses >> (right shift), not a greater-than comparison
    return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
}
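The subtle point is that a right shift, not a comparison, decides whether the queue is full: the result only becomes non-zero once qlen reaches 2^max_qlen_log. A tiny userspace sketch (not kernel code; max_qlen_log = 8 is the value we will arrive at later for this test setup) shows the effect:

#include <stdio.h>

/* Userspace model of reqsk_queue_is_full(): "full" once qlen >= 2^max_qlen_log. */
static int queue_is_full(unsigned int qlen, unsigned int max_qlen_log)
{
    return qlen >> max_qlen_log;
}

int main(void)
{
    unsigned int max_qlen_log = 8;  /* assumed value, matches the 256-entry case worked out below */

    printf("qlen=255 -> full=%d\n", queue_is_full(255, max_qlen_log)); /* 0: not full */
    printf("qlen=256 -> full=%d\n", queue_is_full(256, max_qlen_log)); /* 1: full, new SYNs get dropped */
    return 0;
}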
Now find the definitions of qlen and max_qlen_log, in include/net/request_sock.h.
/** struct listen_sock - listen state
 *
 * @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs
 */
struct listen_sock {
    u8                  max_qlen_log;   // 2^max_qlen_log = maximum length of the half-open queue
    /* 3 bytes hole, try to use */
    int                 qlen;           // current length of the half-open (SYN) queue
    int                 qlen_young;
    int                 clock_hand;
    u32                 hash_rnd;
    u32                 nr_table_entries;
    struct request_sock *syn_table[0];
};
So the key is how max_qlen_log is computed. The previous post already looked at the listen system call:
//file:net/socket.c
SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
    struct socket *sock;
    int err, fput_needed;
    int somaxconn;

    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (sock) {
        somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
        // the backlog is capped at somaxconn
        if ((unsigned)backlog > somaxconn)
            backlog = somaxconn;

        err = security_socket_listen(sock, backlog);
        if (!err)
            // this is the key call
            err = sock->ops->listen(sock, backlog);

        fput_light(sock->file, fput_needed);
    }
    return err;
}
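Note that this capping is silent: listen() still returns 0 even though the effective backlog is smaller than what was requested. A minimal userspace sketch (port 8080 is arbitrary; the backlog of 511 matches the scenario used later in this post):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0) {
        perror("socket");
        return 1;
    }

    struct sockaddr_in addr;
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(8080);    /* arbitrary test port */

    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind");
        return 1;
    }

    /* We ask for 511, but the kernel silently clamps it to
     * net.core.somaxconn (128 on the test machine), as shown above. */
    if (listen(fd, 511) < 0) {
        perror("listen");
        return 1;
    }

    pause();    /* keep listening so the queues can be observed */
    return 0;
}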
sock->ops->listen is actually inet_listen, in net/ipv4/af_inet.c.
int inet_listen(struct socket *sock, int backlog)
{
    struct sock *sk = sock->sk;
    unsigned char old_state;
    int err;

    lock_sock(sk);

    err = -EINVAL;
    if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
        goto out;

    old_state = sk->sk_state;
    if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
        goto out;

    /* Really, if the socket is already in listen state
     * we can only allow the backlog to be adjusted.
     */
    if (old_state != TCP_LISTEN) {
        // key function: inet_csk_listen_start
        err = inet_csk_listen_start(sk, backlog);
        if (err)
            goto out;
    }
    sk->sk_max_ack_backlog = backlog;
    err = 0;

out:
    release_sock(sk);
    return err;
}
Follow inet_csk_listen_start into net/ipv4/inet_connection_sock.c.
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
    struct inet_sock *inet = inet_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
    // key function: reqsk_queue_alloc
    int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);

    /* ... the rest is omitted ... */
}
Follow reqsk_queue_alloc into net/core/request_sock.c.
int reqsk_queue_alloc(struct request_sock_queue *queue,
                      unsigned int nr_table_entries)
{
    size_t lopt_size = sizeof(struct listen_sock);
    struct listen_sock *lopt;

    // From here on nr_table_entries gets adjusted; on kernels older than
    // 2.6.20, nr_table_entries is not modified at all
    nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
    nr_table_entries = max_t(u32, nr_table_entries, 8);
    nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
    // nr_table_entries is now final
    lopt_size += nr_table_entries * sizeof(struct request_sock *);
    if (lopt_size > PAGE_SIZE)
        lopt = __vmalloc(lopt_size,
                GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
                PAGE_KERNEL);
    else
        lopt = kzalloc(lopt_size, GFP_KERNEL);
    if (lopt == NULL)
        return -ENOMEM;

    // This loop determines lopt->max_qlen_log
    for (lopt->max_qlen_log = 3;
         (1 << lopt->max_qlen_log) < nr_table_entries;  // on kernels older than 2.6.20 this compares against sysctl_max_syn_backlog
         lopt->max_qlen_log++);

    get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
    rwlock_init(&queue->syn_wait_lock);
    queue->rskq_accept_head = NULL;
    lopt->nr_table_entries = nr_table_entries;

    write_lock_bh(&queue->syn_wait_lock);
    queue->listen_opt = lopt;
    write_unlock_bh(&queue->syn_wait_lock);

    return 0;
}
That is as far as the code goes. Now let's work through why the number of SYN_RECV connections on virtual machine S is 256 (a short sketch after the walkthrough reproduces the same computation).
nr_table_entries starts out as listen()'s second argument, int backlog, with the system's somaxconn as its upper bound.
Suppose somaxconn = 128, sysctl_max_syn_backlog = 4096 and backlog = 511; then nr_table_entries = 128 after the cap in sys_listen.
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
Take the smaller of the two: nr_table_entries = 128.
nr_table_entries = max_t(u32, nr_table_entries, 8);
Take the larger of the two: nr_table_entries = 128.
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); //roundup_pow_of_two – round the given value up to nearest power of two
roundup_pow_of_two(128 + 1) = 256
for (lopt->max_qlen_log = 3; (1 << lopt->max_qlen_log) < nr_table_entries; lopt->max_qlen_log++);
max_qlen_log = 8 (the loop stops once 1 << 8 = 256, which is no longer less than nr_table_entries = 256)
The half-open-queue fullness check is: queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
When queue->listen_opt->qlen reaches 256, reqsk_queue_is_full returns 1 and we go to drop.
So queue->listen_opt->qlen can range from 0 to 255, which is why 256 connections sit in the SYN_RECV state.
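To double-check the arithmetic, here is a small userspace sketch that mimics the min_t / max_t / roundup_pow_of_two steps and the max_qlen_log loop for the values above (a model of the 2.6.20+ code path, not kernel code):

#include <stdio.h>

/* Round v up to the next power of two (userspace stand-in for roundup_pow_of_two). */
static unsigned int round_up_pow2(unsigned int v)
{
    unsigned int p = 1;
    while (p < v)
        p <<= 1;
    return p;
}

int main(void)
{
    unsigned int somaxconn = 128, max_syn_backlog = 4096, backlog = 511;

    /* sys_listen(): backlog is capped at somaxconn */
    unsigned int nr_table_entries = backlog > somaxconn ? somaxconn : backlog;

    /* reqsk_queue_alloc() on 2.6.20+ */
    if (nr_table_entries > max_syn_backlog)
        nr_table_entries = max_syn_backlog;
    if (nr_table_entries < 8)
        nr_table_entries = 8;
    nr_table_entries = round_up_pow2(nr_table_entries + 1);

    unsigned int max_qlen_log = 3;
    while ((1u << max_qlen_log) < nr_table_entries)
        max_qlen_log++;

    printf("nr_table_entries = %u\n", nr_table_entries);   /* 256 */
    printf("max_qlen_log     = %u\n", max_qlen_log);       /* 8 */
    printf("half-open limit  = %u\n", 1u << max_qlen_log); /* 256 */
    return 0;
}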
So why were my colleague's test results different from mine?
Because on kernels older than 2.6.20, max_qlen_log is determined directly by sysctl_max_syn_backlog, so the half-open queue length simply equals sysctl_max_syn_backlog.
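A quick way to see the difference, under the assumption (per the comment in reqsk_queue_alloc above) that older kernels run the same kind of loop but compare against sysctl_max_syn_backlog instead of nr_table_entries:

#include <stdio.h>

int main(void)
{
    unsigned int max_syn_backlog = 4096;    /* value from the test scenario */
    unsigned int max_qlen_log = 3;

    /* Pre-2.6.20 (as described above): compare against sysctl_max_syn_backlog directly. */
    while ((1u << max_qlen_log) < max_syn_backlog)
        max_qlen_log++;

    /* Prints 12, i.e. a 4096-entry half-open queue, independent of backlog/somaxconn. */
    printf("max_qlen_log = %u, limit = %u\n", max_qlen_log, 1u << max_qlen_log);
    return 0;
}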
This post has run a bit long, but the mystery is finally solved.