继续上回:linux诡异的半连接(SYN_RECV)队列长度(一) 我们已经确认了全连接队列的长度计算,接下来继续寻找半连接队列长度。
试着慢慢减小tcp_max_syn_backlog的值,但还是看不到半连接状态数量的变化。
实在没什么思路,只能Google之,搜出来的基本都是关于SYN Flood的文章,难道没同学关注过半连接队列的长度吗?
困扰数日终于在某个夜晚被我找一篇题为《关于半连接队列的释疑》的文章,激动呐。根据作者提供的思路我开始翻代码,注意我用的内核版本2.6.32,不同版本代码也有差异。
首先定位到tcp_v4_conn_request函数,在文件netipv4tcp_ipv4.c中。
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct inet_request_sock *ireq;
struct tcp_options_received tmp_opt;
struct request_sock *req;
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif
/* Never answer to SYNs send to broadcast or multicast */
if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
goto drop;
/* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is
* evidently real one.
*/
//关键函数inet_csk_reqsk_queue_is_full
if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
want_cookie = 1;
} else
#endif
goto drop;
}
/* Accept backlog is full. If we have already queued enough
* of warm entries in syn queue, drop request. It is better than
* clogging syn queue with openreqs with exponentially increasing
* timeout.
*/
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
req = inet_reqsk_alloc(&tcp_request_sock_ops);
if (!req)
goto drop;
省略N多代码
跟进关键函数inet_csk_reqsk_queue_is_full,在文件includenetinet_connection_sock.h中。
static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue);
}
跟进关键函数reqsk_queue_is_full,在文件includenetrequest_sock.h中。
static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
{
//注意这里是用>>(右移)来判断的,不是大于号
return queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
}
查找qlen和max_qlen_log的定义,在文件includenetrequest_sock.h中。
/** struct listen_sock - listen state
*
* @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs
*/
struct listen_sock {
u8 max_qlen_log;// 2^max_qlen_log = 半连接队列最大长度
/* 3 bytes hole, try to use */
int qlen;//全连接队列的当前长度
int qlen_young;
int clock_hand;
u32 hash_rnd;
u32 nr_table_entries;
struct request_sock *syn_table[0];
};
可见关键是如何计算max_qlen_log,前一篇博客 提到了listen的系统调用:
//file:net/socket.c
SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
struct socket *sock;
int err, fput_needed;
int somaxconn;
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock) {
somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
//上限不超过somaxconn
if ((unsigned)backlog > somaxconn)
backlog = somaxconn;
err = security_socket_listen(sock, backlog);
if (!err)
//这里是关键。
err = sock->ops->listen(sock, backlog);
fput_light(sock->file, fput_needed);
}
return err;
}
sock->ops->listen其实是inet_listen,在文件netipv4af_inet.c中。
int inet_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
unsigned char old_state;
int err;
lock_sock(sk);
err = -EINVAL;
if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
goto out;
old_state = sk->sk_state;
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto out;
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
if (old_state != TCP_LISTEN) {
//关键函数inet_csk_listen_start
err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
}
sk->sk_max_ack_backlog = backlog;
err = 0;
out:
release_sock(sk);
return err;
}
跟进inet_csk_listen_start,在文件netipv4inet_connection_sock.c中。
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
//关键函数reqsk_queue_alloc
int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
//后面省略
}
跟进reqsk_queue_alloc,在文件netcorerequest_sock.c中。
int reqsk_queue_alloc(struct request_sock_queue *queue,
unsigned int nr_table_entries)
{
size_t lopt_size = sizeof(struct listen_sock);
struct listen_sock *lopt;
//这里开始影响到nr_table_entries的取值,内核版本小于2.6.20的话nr_table_entries是不会修改的
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
nr_table_entries = max_t(u32, nr_table_entries, 8);
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
//nr_table_entries到这里已经确定
lopt_size += nr_table_entries * sizeof(struct request_sock *);
if (lopt_size > PAGE_SIZE)
lopt = __vmalloc(lopt_size,
GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
PAGE_KERNEL);
else
lopt = kzalloc(lopt_size, GFP_KERNEL);
if (lopt == NULL)
return -ENOMEM;
//这里确定了lopt->max_qlen_log的值
for (lopt->max_qlen_log = 3;
(1 << lopt->max_qlen_log) < nr_table_entries;//内核版本小于2.6.20的话这里是sysctl_max_syn_backlog
lopt->max_qlen_log++);
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
rwlock_init(&queue->syn_wait_lock);
queue->rskq_accept_head = NULL;
lopt->nr_table_entries = nr_table_entries;
write_lock_bh(&queue->syn_wait_lock);
queue->listen_opt = lopt;
write_unlock_bh(&queue->syn_wait_lock);
return 0;
}
代码到此为止,然后我们计算一下为何在虚拟机S上的SYN_RECV状态数量会是256
nr_table_entries = listen的第二个参数int backlog ,上限是系统的somaxconn
若 somaxconn = 128 sysctl_max_syn_backlog = 4096 backlog = 511 则 nr_table_entries = 128
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
取两者较小的一个 nr_table_entries = 128
nr_table_entries = max_t(u32, nr_table_entries, 8);
取两者较大的一个 nr_table_entries = 128
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); //roundup_pow_of_two – round the given value up to nearest power of two
roundup_pow_of_two(128 + 1) = 256
for (lopt->max_qlen_log = 3; (1 << lopt->max_qlen_log) < nr_table_entries; lopt->max_qlen_log++);
max_qlen_log = 8
判断半连接队列是否满 queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
queue->listen_opt->qlen = 256 时reqsk_queue_is_full返回1 , 进入drop
所以queue->listen_opt->qlen 取值 0~255, 因此SYN_RECV状态数量会是 256
另外同事的测试结果为何与我的不同?
因为内核版本小于2.6.20的话max_qlen_log是直接由sysctl_max_syn_backlog决定的,所以半连接队列的长度就是等于sysctl_max_syn_backlog
文章有点长,不过总算是把问题给解决了。
转载请注明:爱开源 » linux诡异的半连接(SYN_RECV)队列长度(二)