我对服务器设置和拆卸进行了压力测试。主线程是服务器线程,另一个线程是客户端线程。
在客户端线程中,我在一个紧凑的循环里依次调用 socket()、connect()、send()、recv()、shutdown() 和 close()。我故意创建大量短连接。只要任何一个套接字调用出错,该循环就会终止。
在主线程中,我调用 listen(),然后启动客户端线程。我 accept() 少量连接,对每个连接调用一次 recv() 和一次 send(),然后对已接受的套接字调用 shutdown() 和 close()。之后我对监听套接字调用 shutdown() 和 close(),并 join 客户端线程。
大约每 1500 次迭代,我的客户端线程就会卡在 recv() 上,而主线程会卡在 pthread_join(client_thread) 中。“netstat -n -p tcp”显示两个处于 ESTABLISHED 状态的 TCP/IP 条目,连接的每一端各一个。我的调试输出表明,这次成功的客户端连接几乎与我对监听套接字调用 close() 同时发生。服务器从未为该客户端连接调用 accept()。如果此时我用 CTRL-C 终止程序,客户端一侧进入 FIN_WAIT_2,服务器一侧进入 CLOSE_WAIT。FIN_WAIT_2 最终会消失,但 CLOSE_WAIT 一直存在,即使注销并重新登录之后也是如此。
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netdb.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <pthread.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/un.h>
#include <unistd.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
// Address list filled in by getaddrinfo() in main(). start_server() and
// short_connect() read its first entry for socket(), bind() and connect().
struct addrinfo *res = 0;
// Incremented at the top of every short_connect() loop iteration and printed
// in that function's send/recv log lines.
int count = 0;
// Scope guard for a socket descriptor: when the object goes out of scope the
// descriptor stored in `s` is shut down in both directions and then closed,
// with a debug line printed before and after.
//
// The struct is deliberately left as a plain aggregate because callers both
// brace-initialise it (`tSocketCloser c = {fd};`) and assign `s` after
// default construction. It is copyable, so do not copy an instance: both
// copies would close the same descriptor.
struct tSocketCloser {
    int s;  // descriptor to release; a negative value means "nothing to close"

    ~tSocketCloser() {
        // socket() and accept() signal failure with -1. Calling shutdown()
        // or close() on that value only overwrites errno with EBADF and logs
        // a close that never happened, so skip the teardown entirely.
        if (s < 0)
            return;

        printf("clientClosing: %d\n", s);
        shutdown(s, SHUT_RDWR);
        close(s);
        printf("clientClosed: %d\n", s);
    }
};
// Creates the listening socket for one test iteration.
//
// Uses the first entry of the global `res`: creates a socket, enables
// SO_REUSEADDR on it, binds it to res->ai_addr and starts listening with a
// backlog of 1024.
//
// Returns the listening descriptor. On any failure a short message is printed
// and the process terminates via exit(-1).
int start_server() {
    const int listenSocket = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
    printf("listenSocket: %d\n", listenSocket);
    if (listenSocket == -1)
    {
        // Without this check a failed socket() would be reported below as an
        // SO_REUSEADDR failure, hiding the real cause.
        printf("socket %d\n", listenSocket);
        exit(-1);
    }

    int one = 1;
    int ret = ::setsockopt(listenSocket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
    if (ret)
    {
        printf("SO_REUSEADDR %d\n", ret);
        exit(-1);
    }
    /* SO_LINGER {1,0} and TCP_NODELAY were here */
    // O_NONBLOCK code was here

    // ai_addrlen already has the socklen_t type bind() expects; no cast needed.
    ret = ::bind(listenSocket, res->ai_addr, res->ai_addrlen);
    if (ret)
    {
        printf("Bind %d\n", ret);
        exit(-1);
    }

    ret = ::listen(listenSocket, 1024);
    if (ret)
    {
        printf("listen %d\n", ret);
        exit(-1);
    }
    return listenSocket;
}
// Serves a small random number of connections and then tears the listener down.
//
// Accepts rand() % 3 connections (0, 1 or 2). For each one it sets
// SO_NOSIGPIPE, performs one recv() and one send(), and lets tSocketCloser
// shut down and close the accepted socket at the end of the loop iteration.
// The loop stops early on the first failing call. Afterwards it sleeps for a
// random 0-999 microseconds, then calls shutdown() and close() on
// `listenSocket`.
//
// Any connection still sitting in the listen backlog when the loop ends is
// never accept()ed by this function.
void stop_server(int listenSocket) {
    const int iters = rand() % 3;
    for (int i = 0; i < iters; ++i)
    {
        struct sockaddr_storage clientAddress;
        // accept() takes a socklen_t in/out length; use the real type instead
        // of casting the address of an int.
        socklen_t size = sizeof(clientAddress);
        tSocketCloser otherSock;
        otherSock.s = ::accept(
            listenSocket,
            reinterpret_cast<struct sockaddr *>(&clientAddress),
            &size);
        printf("accept: %d\n", otherSock.s);
        if (otherSock.s == -1)
        {
            // Stop here so a failed accept() is not misreported as an
            // SO_NOSIGPIPE failure by the setsockopt() call below.
            break;
        }

        int one = 1;
        const int opt = setsockopt(otherSock.s, SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(one));
        if (opt)
        {
            printf("SO_NOSIGPIPE %d\n", opt);
            break;
        }

        char buffer[2048] = {0};
        if (recv(otherSock.s, buffer, sizeof(buffer), 0) == -1)
            break;
        // NOTE(review): the whole 2048-byte buffer is sent back regardless of
        // how many bytes recv() returned - confirm this is intended.
        if (send(otherSock.s, buffer, sizeof(buffer), 0) == -1)
            break;
    }

    // rand() never returns a negative value, so the remainder is already in
    // the range 0..999.
    usleep(static_cast<useconds_t>(rand() % 1000));

    printf("serverClosing: %d\n", listenSocket);
    shutdown(listenSocket, SHUT_RDWR);
    close(listenSocket);
    printf("serverClosed: %d\n", listenSocket);
}
void *short_connect(void *)
{
while(true) {
++count;
int connectSocket = -1;
int ret = 0;
int one = 1;
connectSocket = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
tSocketCloser closer = {connectSocket};
/* SO_LINGER {1,0} and TCP_NODELAY were here */
ret = setsockopt(connectSocket, SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(one));
if(ret)
{
printf("client SO_NOSIGPIPE %d\n", ret);
return NULL;
}
// O_NONBLOCK code was here
ret = connect(connectSocket, res->ai_addr, static_cast<int>(res->ai_addrlen));
if(ret)
{
printf("bad connect %d\n", ret);
return NULL;
}
printf("good connect %d\n",connectSocket);
char buffer[1024] = {0};
ret = send(connectSocket, buffer, sizeof(buffer), 0);
printf("%d: send %d\n", count, ret);
if(ret == -1)
return NULL;
ret = recv(connectSocket, buffer, sizeof(buffer), 0);
printf("%d: recv %d\n", count, ret);
if(ret == -1)
return NULL;
printf("Success!\n");
}
}
// Stress-test driver.
//
// Resolves the wildcard address for port 9999 into the global `res`, then runs
// 1000 rounds. Each round starts a listener, launches short_connect() on a
// client thread, runs stop_server() on this thread and finally joins the
// client thread.
//
// Exits with the getaddrinfo() error code if resolution fails, and with -1 if
// the client thread cannot be created. Returns 0 after all rounds complete.
int main() {
    struct addrinfo hints;
    char port[sizeof("65536") + 1] = "9999";
    std::memset(&hints, 0, sizeof(hints));
    hints.ai_family = PF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;

    // Wildcard address
    const int error = getaddrinfo(NULL, port, &hints, &res);
    if (error) {
        printf("getaddrinfo %d\n", error);
        exit(error);
    }

    for (int i = 0; i < 1000; ++i)
    {
        const int sock = start_server();

        pthread_t clientThread;
        const int rc = pthread_create(&clientThread, NULL, short_connect, NULL);
        if (rc)
        {
            // Without a client thread there is nothing valid to join, and
            // stop_server() could block in accept() forever, so bail out.
            printf("pthread_create %d\n", rc);
            close(sock);
            freeaddrinfo(res);
            exit(-1);
        }

        stop_server(sock);
        pthread_join(clientThread, NULL);
    }

    // Release the address list allocated by getaddrinfo().
    freeaddrinfo(res);
    res = 0;
    return 0;
}
这是一些稍微带注释的输出:
listenSocket: 4 //what a good run looks like...
good connect 5
accept: 6
42: send 1024
clientClosing: 6
42: recv 1024
Success!
clientClosing: 5
clientClosed: 5
clientClosed: 6
good connect 5
accept: 6
43: send 1024
clientClosing: 6
43: recv 1024
clientClosed: 6
Success!
clientClosing: 5
clientClosed: 5
good connect 5 //client connects
44: send 1024
serverClosing: 4 //server starting close...
serverClosed: 4 //server done closing
44: recv -1 //recv errors out, as it should. Note the lack of accept() calls
clientClosing: 5 //client teardown
clientClosed: 5
listenSocket: 4 //what a bad run looks like...
good connect 5
accept: 6
45: send 1024
clientClosing: 6
45: recv 1024
clientClosed: 6
Success!
clientClosing: 5
clientClosed: 5
good connect 5
accept: 6
46: send 1024
clientClosing: 6
clientClosed: 6
46: recv 1024
Success!
clientClosing: 5
clientClosed: 5
serverClosing: 4 //server starting close...
good connect 5 //client connect
serverClosed: 4 //server done closing
47: send 1024 //successful send from client
//stuck in recv(), so we get no further prints
所以最大的问题是……我怎样才能关闭监听套接字而不进入这种状态?CLOSE_WAIT 状态表明我需要关闭已接受的连接,但我手里并没有可供关闭的套接字/文件描述符。同样奇怪的是,杀死同时承载服务器和客户端的程序也不会使这些套接字被清理(几小时后 CLOSE_WAIT 套接字仍然出现在 netstat 中)。
这一切都发生在 OS X 10.8.3 x86_64 上。