How to maximize FreeRTOS+TCP throughput over a TCP connection

Hi,
I am using freertos_tcp for data transmission, and after I ported the stack, I tried to test the network performance. However, the performance is not good: the TCP connection speed is only about 600 kbps. I captured a Wireshark trace and found that the TCP ACK is frequently sent back with a big delay at the end of the window (~20 ms; some ACKs are also sent back immediately).
I attached a Wireshark capture. TCP ACK No. 1483 is delayed ~20 ms relative to the previously received data. Is there anything I can do to improve that?
In the capture, the 192.168.110.11 is the target that runs freertos_tcp. A linux machine tries to send data to it, but can only achieve poor speed.
I also found that when the 20 ms delay happens, the TCP ACK’s window size is mostly 3483; normally it is 6xxx or some other value.

I am using FreeRTOS 9 with the following TCP stack config:

/* The checksums will be checked and calculated by the STM32F4x ETH peripheral.
NOTE(review): this comment names the STM32F4x, but the driver discussed in this
thread targets an Aurix MCU - confirm that its MAC really offloads the
checksums before relying on these two settings. */
#define ipconfigDRIVER_INCLUDED_TX_IP_CHECKSUM		( 1 )
#define ipconfigDRIVER_INCLUDED_RX_IP_CHECKSUM		( 1 )

/* Default block times for socket receive and send calls.  NOTE(review):
presumably expressed in ticks - confirm against the FreeRTOS+TCP
configuration reference. */
#define ipconfigSOCK_DEFAULT_RECEIVE_BLOCK_TIME	( 5000 )
#define	ipconfigSOCK_DEFAULT_SEND_BLOCK_TIME	( 5000 )

/* Zero-copy is switched off in both directions, so the driver copies every
frame between its DMA buffers and the stack's network buffers. */
#define ipconfigZERO_COPY_RX_DRIVER			( 0 )
#define ipconfigZERO_COPY_TX_DRIVER			( 0 )

/* Include support for LLMNR: Link-local Multicast Name Resolution
(non-Microsoft) */
#define ipconfigUSE_LLMNR					( 1 )

/* Include support for NBNS: NetBIOS Name Service (Microsoft) */
#define ipconfigUSE_NBNS					( 0 )

/* DNS cache: enabled, 4 entries, names limited to 16 characters, and up to 4
attempts per DNS request. */
#define ipconfigUSE_DNS_CACHE				( 1 )
#define ipconfigDNS_CACHE_NAME_LENGTH		( 16 )
#define ipconfigDNS_CACHE_ENTRIES			( 4 )
#define ipconfigDNS_REQUEST_ATTEMPTS		( 4 )

/* The IP task runs two levels below the highest available priority. */
#define ipconfigIP_TASK_PRIORITY			( configMAX_PRIORITIES - 2 )

/* The IP task stack is five times the minimal stack size (in words). */
#define ipconfigIP_TASK_STACK_SIZE_WORDS	( configMINIMAL_STACK_SIZE * 5 )

/* Random numbers needed by the stack come from the application-supplied
uxRand(), which is defined elsewhere. */
extern UBaseType_t uxRand(void);
#define ipconfigRAND32()	uxRand()

/* Enable the network event hook. */
#define ipconfigUSE_NETWORK_EVENT_HOOK 1

/* Upper limit for blocking on a UDP send: 5000 ms converted to ticks. */
#define ipconfigUDP_MAX_SEND_BLOCK_TIME_TICKS ( 5000 / portTICK_PERIOD_MS )

/* DHCP is disabled, together with the DHCP-related options below. */
#define ipconfigUSE_DHCP				0
#define ipconfigDHCP_REGISTER_HOSTNAME	0
#define ipconfigDHCP_USES_UNICAST       0

/* 30000 ms converted to ticks.  NOTE(review): presumably only relevant when
DHCP is enabled - confirm. */
#define ipconfigMAXIMUM_DISCOVER_TX_PERIOD		( pdMS_TO_TICKS( 30000 ) )

/* ARP cache: 6 entries, at most 5 retransmissions, maximum entry age 150.
NOTE(review): the unit of the age value is not visible here - see the
FreeRTOS+TCP configuration reference. */
#define ipconfigARP_CACHE_ENTRIES		6

#define ipconfigMAX_ARP_RETRANSMISSIONS ( 5 )

#define ipconfigMAX_ARP_AGE			150

#define ipconfigINCLUDE_FULL_INET_ADDR	1

/* Number of network buffer descriptors.  ipconfigZERO_COPY_RX_DRIVER is 0
above, so the #else branch applies and 25 descriptors are available. */
#if( ipconfigZERO_COPY_RX_DRIVER != 0 )
	/* _HT_ Actually we should know the value of 'configNUM_RX_DESCRIPTORS' here. */
	#define ipconfigNUM_NETWORK_BUFFER_DESCRIPTORS		( 40 + 6 )
#else
	#define ipconfigNUM_NETWORK_BUFFER_DESCRIPTORS		25
#endif

/* The IP task's event queue holds 5 more entries than there are network
buffer descriptors (30 with the value selected above). */
#define ipconfigEVENT_QUEUE_LENGTH		( ipconfigNUM_NETWORK_BUFFER_DESCRIPTORS + 5 )

#define ipconfigALLOW_SOCKET_SEND_WITHOUT_BIND 1

/* USE_TCP: Use TCP and all its features */
#define ipconfigUSE_TCP				( 1 )

/* USE_WIN: Let TCP use windowing mechanism. */
#define ipconfigUSE_TCP_WIN			( 1 )


/* MTU of 1200 bytes.  The capture discussed in this thread shows TCP segments
carrying 1160 bytes, which replies in the thread describe as small for a LAN. */
#define ipconfigNETWORK_MTU					1200

/* Set ipconfigUSE_DNS to 1 to include a basic DNS client/resolver.  DNS is used
through the FreeRTOS_gethostbyname() API function. */
#define ipconfigUSE_DNS								1

/* If ipconfigREPLY_TO_INCOMING_PINGS is set to 1 then the IP stack will
generate replies to incoming ICMP echo (ping) requests. */
#define ipconfigREPLY_TO_INCOMING_PINGS				1

/* If ipconfigSUPPORT_OUTGOING_PINGS is set to 1 then the
FreeRTOS_SendPingRequest() API function is available. */
#define ipconfigSUPPORT_OUTGOING_PINGS				1

/* If ipconfigSUPPORT_SELECT_FUNCTION is set to 1 then the FreeRTOS_select()
(and associated) API function is available. */
#define ipconfigSUPPORT_SELECT_FUNCTION				1

#define ipconfigFILTER_OUT_NON_ETHERNET_II_FRAMES  1

/* NOTE(review): the receive task shown later in this thread calls
eConsiderFrameForProcessing() itself - confirm that this setting matches
what the driver/MAC actually filters. */
#define ipconfigETHERNET_DRIVER_FILTERS_FRAME_TYPES	1

/* NOTE(review): the name suggests this belongs to the Windows simulator port
and is presumably unused on this target - confirm. */
#define configWINDOWS_MAC_INTERRUPT_SIMULATOR_DELAY ( 2 / portTICK_PERIOD_MS )
#define ipconfigPACKET_FILLER_SIZE 2

#define ipconfigTCP_WIN_SEG_COUNT 64

/* Default Rx buffer for TCP sockets: 3 * 1460 = 4380 bytes.  Note that the
literal 1460 is used here rather than ipconfigTCP_MSS. */
#define ipconfigTCP_RX_BUFFER_LENGTH			( 3 * 1460 )

/* Define the size of Tx buffer for TCP sockets. */
#define ipconfigTCP_TX_BUFFER_LENGTH			( 2 * 1460 )

/* Any non-NULL address is accepted as a valid program address. */
#define ipconfigIS_VALID_PROG_ADDRESS(x) ( (x) != NULL )

/* Include support for TCP hang protection.  All sockets in a connecting or
disconnecting stage will timeout after a period of non-activity. */
#define ipconfigTCP_HANG_PROTECTION				( 1 )
#define ipconfigTCP_HANG_PROTECTION_TIME		( 30 )

/* Include support for TCP keep-alive messages. */
#define ipconfigTCP_KEEP_ALIVE				( 1 )
#define ipconfigTCP_KEEP_ALIVE_INTERVAL		( 20 ) /* in seconds */

/* Set to 1 or 0 to include/exclude FTP and HTTP functionality from the standard
server task. */
#define ipconfigUSE_FTP						1
#define ipconfigUSE_HTTP					1

/* Buffer and window sizes used by the FTP and HTTP servers respectively.  The
FTP and HTTP servers both execute in the standard server task.  The buffer
sizes are multiples of ipconfigTCP_MSS, which is not defined in this excerpt. */
#define ipconfigFTP_TX_BUFSIZE				( 4 * ipconfigTCP_MSS )
#define ipconfigFTP_TX_WINSIZE				( 2 )
#define ipconfigFTP_RX_BUFSIZE				( 8 * ipconfigTCP_MSS )
#define ipconfigFTP_RX_WINSIZE				( 4 )
#define ipconfigHTTP_TX_BUFSIZE				( 3 * ipconfigTCP_MSS )
#define ipconfigHTTP_TX_WINSIZE				( 2 )
#define ipconfigHTTP_RX_BUFSIZE				( 4 * ipconfigTCP_MSS )
#define ipconfigHTTP_RX_WINSIZE				( 4 )

/* UDP Logging related constants follow.  The standard UDP logging facility
writes formatted strings to a buffer, and creates a task that removes messages
from the buffer and sends them to the UDP address and port defined by the
constants that follow. */

/* Prototype for the function used to print out.  In this case the standard
UDP logging facility is used. */
extern int lUDPLoggingPrintf( const char *pcFormatString, ... );

/* Set to 1 to print out debug messages.  If ipconfigHAS_DEBUG_PRINTF is set to
1 then FreeRTOS_debug_printf should be defined to the function used to print
out the debugging messages.  It is 0 here, so FreeRTOS_debug_printf is not
defined by this file. */
#define ipconfigHAS_DEBUG_PRINTF	0
#if( ipconfigHAS_DEBUG_PRINTF == 1 )
	#define FreeRTOS_debug_printf(X)	lUDPLoggingPrintf X
#endif

/* Normal (non-debug) print messages are enabled and are routed to
lUDPLoggingPrintf(). */
#define ipconfigHAS_PRINTF			1
#if( ipconfigHAS_PRINTF == 1 )
	#define FreeRTOS_printf(X)			lUDPLoggingPrintf X
#endif

#define ipconfigDNS_USE_CALLBACKS			1
#define ipconfigSUPPORT_SIGNALS				1
#define configNETWORK_INTERFACE_TO_USE 4L

/* MAC address: 00:11:22:33:44:41. */
#define configMAC_ADDR0		0x00
#define configMAC_ADDR1		0x11
#define configMAC_ADDR2		0x22
#define configMAC_ADDR3		0x33
#define configMAC_ADDR4		0x44
#define configMAC_ADDR5		0x41

/* IP address: 192.168.110.11 (the target in the capture discussed in this
thread). */
#define configIP_ADDR0		192
#define configIP_ADDR1		168
#define configIP_ADDR2		110
#define configIP_ADDR3		11

/* Gateway 192.168.110.1 and DNS server 208.67.222.222. */
#define configGATEWAY_ADDR0	192
#define configGATEWAY_ADDR1	168
#define configGATEWAY_ADDR2	110
#define configGATEWAY_ADDR3	1
#define configDNS_SERVER_ADDR0 	208
#define configDNS_SERVER_ADDR1 	67
#define configDNS_SERVER_ADDR2 	222
#define configDNS_SERVER_ADDR3 	222

/* Default netmask configuration.  Used in ipconfigUSE_DNS is set to 0, or
ipconfigUSE_DNS is set to 1 but a DNS server cannot be contacted.
NOTE(review): the sentence above refers to DNS, but a default netmask is
presumably the fallback for DHCP (ipconfigUSE_DHCP) - confirm and correct.
The netmask is 255.255.255.0. */
#define configNET_MASK0		255
#define configNET_MASK1		255
#define configNET_MASK2		255
#define configNET_MASK3		0

/* The UDP port to which print messages are sent. */
#define configPRINT_PORT	( 15000 )


/* This section defines the iperf3 related macros. */

/* Build the iperf module, and let it echo UDP traffic. */
#define USE_IPERF						        1
#define ipconfigIPERF_DOES_ECHO_UDP		        1

/* iperf protocol version 3; stack size of the iperf task is 680.
NOTE(review): presumably in words, as with other FreeRTOS stack sizes -
confirm. */
#define ipconfigIPERF_VERSION					3
#define ipconfigIPERF_STACK_SIZE_IPERF_TASK		680

/* Buffer and window sizes for the iperf sockets.  The buffers are multiples
of ipconfigTCP_MSS, which is not defined in this excerpt; both windows are 6. */
#define ipconfigIPERF_TX_BUFSIZE				( 8 * ipconfigTCP_MSS )
#define ipconfigIPERF_TX_WINSIZE				( 6 )
#define ipconfigIPERF_RX_BUFSIZE				( 8 * ipconfigTCP_MSS )
#define ipconfigIPERF_RX_WINSIZE				( 6 )

/* The iperf module declares a character buffer to store its send data.
NOTE(review): the macro name says RECV while this comment says send data -
confirm which direction the buffer serves. */
#define ipconfigIPERF_RECV_BUFFER_SIZE			( 2 * ipconfigTCP_MSS )

Please tell us the FreeRTOS+TCP version, the MCU platform, which Ethernet driver you’re using, and what you have done so far to tune the TCP stack config (in FreeRTOSIPConfig.h).
Also, I assume you followed the advice on setting up the IP stack task priority accordingly.

In addition to Hartmut’s remarks:

In the PCAP I see SACK (Selective ACK) packets, meaning that incoming packets were missing. They lead to retransmissions. Can you explain why some packets get dropped?

The packets only contain 1160 bytes, which is small on a LAN. You can get better performance by using an MSS of e.g. 1460 bytes.

This is an interesting socket option: FREERTOS_SO_WIN_PROPERTIES. It lets you set all TCP window parameters, both the MSS as well as the buffer sizes.

The No 1483 tcp ack has ~20 ms delay

That is difficult to say without knowing the platform and network interface. Setting proper task priorities is indeed important.

EDIT
Next time, can you try to upload the original PCAP file (in a ZIP file)?

I tried to upload the zipped file, but it was rejected with the message “new users can not upload attachments”.
Can I configure freertos_tcp to make it send the ACK more frequently?
I think the retransmission happens because the host side did not get the ACK in time, so it thinks packets are being dropped.

And the udp seems working normally.

I tried to upload the zipped file, but it was rejected with the message “new users can not upload attachments”

Hello @rashed, can you give upload permission to @xiaoyizhu?

Can I config the Freertos_tcp and make it send the ack more frequently?

You can play with the macro tcpDELAYED_ACK_LONGER_DELAY_MS, which can be found in FreeRTOS_TCP_IP.c ( or in FreeRTOS_TCP_Transmission.c ). But that is not the way to go.

I would like to see the complete conversation in a PCAP file, so I can see if the decision was correct.

The delay is short when less than MSS bytes were received, the delay is longer ( about 20 ms ) when a full-size packet was received.

I think the retransmission is due to the host side not getting the ACK in time

That conclusion is not correct. The SACK ( Selective ACK ) packet indicates that a packet was really missing.

Imagine that packets 1, 2, 4 are received. The peer will tell in a SACK that packet 3 is missing. This packet will get a fast re-transmission.

Can you also answer Hartmut’s questions, please?

Hi Hartmut,
I have just attached my FreeRTOSIPConfig.h to my original post. As for the Ethernet driver on Aurix, I wrote the driver by referring to some official samples. I just want to test the driver performance; I found that the TCP performance is poor and am trying to find out the reason.

iperftcp18.pcap.zip (747.1 KB)
Thanks for your help , I got the permission to upload the pcap file.
One more question: given the MCU’s limited resources, is there a mechanism by which freertos_tcp sends an ACK once it has received several TCP packets, rather than being triggered by a timer?

Hello @xiaoyizhu, thank you for the PCAP file.
It looks like there is a severe problem in your network interface.
Every time when you see “SLE=xxx SRE=xxx”, a packet was dropped, got lost. I would recommend solving that problem first.

Can you share the code of your network interface?

Grateful if you can contribute both the Aurix kernel and tcp ports once done :wink:

Hi htibosh,
It’s true that a lot of packets (about 1/4) were dropped because the system does not have sufficient power to process them. But even after accounting for this problem, the throughput is still not good (700 kbps). I am trying to find a way to reduce the packet drop rate. Still, I’m wondering what other factors make the throughput so bad.
I can not post all the network interface due to policy limitation, but some code piece is as the following:
For registered rx interrupt:

/*
 * Ethernet Rx DMA interrupt handler.
 *
 * Does no frame handling itself: it only sends a task notification to the
 * task referenced by receiveHandler (presumably the handle of receiveTask -
 * confirm) and requests a context switch if that notification unblocked a
 * task that should run next.
 */
IFX_INTERRUPT(ETH_DMA_RX, 2, ISR_PRIORITY_GETH_RX)
{
    BaseType_t xTaskWoken = pdFALSE;

    /* Defer the actual reception work to task level. */
    vTaskNotifyGiveFromISR(receiveHandler, &xTaskWoken);

    /* Switch context on exit from the ISR if the notification requires it. */
    portYIELD_FROM_ISR(xTaskWoken);
}

The corresponding rx recv task is 

/*
 * Deferred-interrupt task that drains received Ethernet frames.
 *
 * Blocks until it is notified (by the Rx ISR), then handles every Rx
 * descriptor that is no longer owned by the DMA: the frame is copied into a
 * newly allocated network buffer, the DMA descriptor is handed back, and the
 * buffer is forwarded to the IP task as an eNetworkRxEvent.
 *
 * pvParameters: unused.
 *
 * Changes compared with the originally posted version:
 *  - the descriptor is returned to the DMA on every path, including a
 *    zero-length frame; previously that case left the descriptor untouched,
 *    so the inner loop never terminated and this task starved all
 *    lower-priority tasks, including the IP task;
 *  - the trace-macro comment is a well-formed comment again (the opening
 *    comment delimiter was missing, which broke compilation).
 */
static void receiveTask(void *pvParameters)
{
    NetworkBufferDescriptor_t *receiveBufferDescriptor;
    size_t xBytesReceived;
    IPStackEvent_t xRxEvent;

    (void)pvParameters;

    while (1) {
        /* Sleep until the Rx interrupt reports new data. */
        ulTaskNotifyTake(pdFALSE, portMAX_DELAY);

        /* Drain every descriptor the DMA has released. */
        while (!ETH_MAC_IsRxDescriptorOwnedByDma(&ETH_0, 0)) {
            xBytesReceived = ETH_MAC_GetRxDmaFrameSize(&ETH_0, 0);
            receiveBufferDescriptor = NULL;

            if (xBytesReceived > 0) {
                /* Block time 0: never wait for a buffer, drop the frame instead. */
                receiveBufferDescriptor = pxGetNetworkBufferWithDescriptor(xBytesReceived, 0);
                if (receiveBufferDescriptor != NULL) {
                    CopyDMAData2Mem(receiveBufferDescriptor->pucEthernetBuffer, xBytesReceived);
                }
            }

            /* Always give the descriptor back so the loop makes progress.
             * NOTE(review): a frame size of 0 presumably signals an
             * erroneous or empty frame - confirm with the MAC driver. */
            ETH_MAC_ReturnRxDmaDescriptor(&ETH_0, 0);
            ETH_MAC_ResumeRxDma(&ETH_0, 0);

            if (receiveBufferDescriptor == NULL) {
                if (xBytesReceived > 0) {
                    /* A real frame was dropped because no buffer was available. */
                    iptraceETHERNET_RX_EVENT_LOST();
                }
                continue;
            }

            receiveBufferDescriptor->xDataLength = xBytesReceived;

            if (eConsiderFrameForProcessing(receiveBufferDescriptor->pucEthernetBuffer) != eProcessBuffer) {
                /* Not a frame the stack should see: free the buffer again. */
                vReleaseNetworkBufferAndDescriptor(receiveBufferDescriptor);
                continue;
            }

            xRxEvent.eEventType = eNetworkRxEvent;
            xRxEvent.pvData = (void *)receiveBufferDescriptor;

            if (xSendEventStructToIPTask(&xRxEvent, 0) == pdFALSE) {
                /* The event could not be sent to the IP task: the buffer
                 * must be released here. */
                vReleaseNetworkBufferAndDescriptor(receiveBufferDescriptor);
                iptraceETHERNET_RX_EVENT_LOST();
            } else {
                /* Call the standard trace macro to log the occurrence. */
                iptraceNETWORK_INTERFACE_RECEIVE();
            }
        }
    }
}

You should quote code snippets by enclosing them with 3 tildes ‘~’ or backticks ‘`’ for better readability.
I think the driver code looks good so far. How capable is your MCU (clock) ? Is there anything else running hogging your CPU ? What’s the priority scheme regarding driver task, IP task and other application tasks ? Are there other high frequent interrupts ?
If the Ethernet driver ISR and deferred processing task have the highest priority, there should be no bottleneck or starvation. Can you instrument the code to see whether the driver task runs out of, or often waits for, network buffers? Which buffer allocation approach (1 or 2) are you using?
Also a zero copy driver would surely and noticeably improve performance.

The receiver/transmit task has a pri 9 and IP task has pri 8 in FreeRTOS.
The MCU runs at 300 MHz. The interrupt priority is not so high, but the CPU that runs the network doesn’t run many other tasks.
And the other interrupts are not triggered at a high frequency.

I will do a performance test on the rx task to see if it can respond in time. I am using buffer allocation 2, and I did not use a zero-copy driver. Regarding zero copy, I have a question: does the zero-copy driver use memory very efficiently? My system has only very limited resources (heap size ~50 kB).

Well, this is a pretty beefy MCU without any other high prio processing. You should achieve MUCH higher throughput ! You might search the forum for networking (performance) issues and you’ll find some higher numbers for e.g. STM32F4 running at 168 MHz. Also 50k heap should be more than sufficient to run the network stack if you don’t use too much sockets simultaneously and the application doesn’t need much heap.
I guess - if there are no other bugs or problems - you have an out-of-memory/buffer issue starving the network stack. You should be able to improve the performance by tuning the memory/network buffer related parameters.
A zero-copy driver is much more efficient regarding processing (no data copies) and saves a lot of memory (no copy buffers needed). That’s the way to go to develop an efficient application.
BTW is your MAC peripheral able to offload (ethernet frame/IP) checksum calculations ? Most MACs nowadays support that. This also saves a lot of processing power for obvious reasons.
Edit: We are talking about wired 100 MBit Full-Duplex ethernet, right ?

You know, the good thing about TCP is that there are no surprises. Your reception window is 6960 bytes long, which is equal to 6 full packets ( with an MSS of 1160 bytes ).
If the TCP connection is only one-way, at most 7 KB will be allocated in buffers.

Question: could you also attach your FreeRTOSIPConfig.h file? I am curious about your definition of ipconfigNETWORK_MTU and other defines.

Hartmut wrote:

Also 50k heap should be more than sufficient to run the network stack

Yes it is enough, but I would be careful with ‘only’ 50 KB of heap. Can you monitor the available amount of memory as time passes by? The number of dropped packets must really become zero, before you get a better performance :slight_smile:

Which heap_x.c are you using?

I guess I need check the out-of-memory/buffer issue. I will work on this.
Yes, we are talking about wired full-duplex Ethernet. Actually, in my config, it is 1000 Mbit full-duplex.

The FreeRTOSIPConfig.h is attached in my first post.
Thanks for the advice. I will try to monitor the memory as time passes, and try to find out how to reduce the packet drops.
I am using heap4

Also, if you are using the main branch of FreeRTOS+TCP repository, can you pull in the latest changes?

Recently @htibosch added a PR to fix a TCP timer issue: “Let the TCP timer become expired instead of active”.

I have found the throughput problem. Now the network speed is about 30+ Mbps. Thanks a lot for your help and for analysing the problems in my PCAP.
The root cause was some interrupt conflicts that caused the task switch to malfunction, so the rx interrupt task could not handle the packets in time.

Congratulations and thanks for reporting back !