FreeRTOS + lwIP TCP cannot receive large packages at high rates

I am using FreeRTOS with lwIP. It is working properly. I am able to send rgb8 images via Ethernet from the ZCU104 (Arm Cortex-A53) to a PC and vice versa. I can send and receive 640x480 images (921600 bytes) at 1 fps, or 2x2 images at 30 fps. However, when I try to send/receive 640x480 images at 30 fps, the issue appears. I can see in Wireshark (here is the .pcap file) that the packets arrive, and I can read the first 640x480x3 bytes. Afterwards, I cannot receive anymore and I cannot read what Wireshark is showing. There is a TCP window update, a couple of packets are sent and then only TCP retransmissions.

The task to receive is this:

/*
 * Overlays a 32-bit length with its four raw bytes. The receive code copies
 * the 4-byte length prefix into `array` and reads the value back through
 * `size`, so the resulting number follows the byte order of the CPU that
 * runs this code.
 */
union Usize {
	/* Raw bytes, in the order they were read from the socket. */
	unsigned char array[4];
	/* The same four bytes viewed as one 32-bit value. */
	uint32_t size;
};
/* Scratch instance used to convert the received length prefix. */
union Usize convert;
	/*
	 * Receive one length-prefixed message from the publisher socket once
	 * select() has flagged it as readable.
	 *
	 * Wire format: 4-byte length N, followed by N payload bytes.
	 * Stored format in data_to_subscribe: [ID][4 length bytes][N payload bytes].
	 *
	 * NOTE(review): total_bytes is assumed to be 0 when this fragment is
	 * entered, and any previously allocated data_to_subscribe is assumed to
	 * have been released by its consumer - confirm against the caller.
	 */
	if(FD_ISSET(subscriber->publisher_socket_to_receive_fd, fd_set))
	{
		/*
		 * TCP is a byte stream: a single call may deliver fewer than the 4
		 * requested bytes. Keep reading until the length prefix is complete,
		 * and stop on error or closed connection (return value <= 0).
		 */
		int header_bytes = 0;
		while (header_bytes < 4)
		{
			read_bytes = lwip_recvfrom(subscriber->publisher_socket_to_receive_fd, (unsigned char *)bufsize+header_bytes, 4-header_bytes, 0, NULL, NULL);
			if (read_bytes <= 0) {
				break;
			}
			header_bytes = header_bytes + read_bytes;
		}

		if (header_bytes == 4) {
			total_bytes = total_bytes + header_bytes;
			convert.array[0] = bufsize[0];
			convert.array[1] = bufsize[1];
			convert.array[2] = bufsize[2];
			convert.array[3] = bufsize[3];

			subscriber->data_subscribe_size = convert.size+1+4;	// 1 extra for the ID and 4 more to include the total length
			subscriber->data_to_subscribe = malloc(subscriber->data_subscribe_size);

			/*
			 * Only touch the buffer if the allocation succeeded. If it failed,
			 * the payload stays unread in the socket, so the stream is no
			 * longer aligned on a message boundary; the caller should treat
			 * this as fatal for the connection.
			 */
			if (subscriber->data_to_subscribe != NULL) {
				subscriber->data_to_subscribe[0] = subscriber->ID;	// ID depends on the HW blocks, so it is directly copied (hardcoded)
				subscriber->data_to_subscribe[1] = bufsize[0];
				subscriber->data_to_subscribe[2] = bufsize[1];
				subscriber->data_to_subscribe[3] = bufsize[2];
				subscriber->data_to_subscribe[4] = bufsize[3];

				while(total_bytes < convert.size+4)
				{
					read_bytes = lwip_recvfrom(subscriber->publisher_socket_to_receive_fd, subscriber->data_to_subscribe+total_bytes+1, convert.size-total_bytes+4, 0, NULL, NULL);	// Read the rest of the bytes
					if (read_bytes <= 0) {
						// Error or peer closed: adding this value would corrupt the byte count and the loop could never finish
						break;
					}
					total_bytes = total_bytes + read_bytes;
				}

				if (total_bytes >= convert.size+4) {
					// Complete message received: hand it over to the consumer
					subscriber->QueueFree = FALSE;
				} else {
					// Incomplete message: drop it instead of publishing a partial frame
					free(subscriber->data_to_subscribe);
					subscriber->data_to_subscribe = NULL;
				}
			}
		}
	}

This is the FreeRTOSConfig.h

/*
 * FreeRTOS kernel configuration as posted (FreeRTOSConfig.h).
 * Notes below either restate values that can be computed from this file or
 * are marked as assumptions to be confirmed against the FreeRTOS version in use.
 */
#define configUSE_PREEMPTION 1

#define configUSE_MUTEXES 1

#define configUSE_RECURSIVE_MUTEXES 1

#define configUSE_COUNTING_SEMAPHORES 1

#define configUSE_TIMERS 1

#define configUSE_IDLE_HOOK 0

#define configUSE_TICK_HOOK 0

#define configUSE_MALLOC_FAILED_HOOK 1

#define configUSE_TRACE_FACILITY 1

#define configUSE_16_BIT_TICKS 0

#define configUSE_APPLICATION_TASK_TAG 0

#define configUSE_CO_ROUTINES 0

/*
 * NOTE(review): presumably 100 ticks per second, i.e. one tick every 10 ms.
 * If so, an expression such as 1 / portTICK_PERIOD_MS is 1 / 10 == 0 in
 * integer arithmetic, which requests a delay of zero ticks - confirm.
 */
#define configTICK_RATE_HZ (100)

#define configMAX_PRIORITIES (8)

#define configMAX_CO_ROUTINE_PRIORITIES 2

#define configMINIMAL_STACK_SIZE ( ( unsigned short ) 200)

#define configTOTAL_HEAP_SIZE ( ( size_t ) ( 524288 ) )

#define configMAX_TASK_NAME_LEN 10

#define configIDLE_SHOULD_YIELD 1

/* Evaluates to 7 with configMAX_PRIORITIES set to 8. */
#define configTIMER_TASK_PRIORITY (configMAX_PRIORITIES - 1)

#define configTIMER_QUEUE_LENGTH 10

/* Evaluates to 400 with configMINIMAL_STACK_SIZE set to 200. */
#define configTIMER_TASK_STACK_DEPTH ((configMINIMAL_STACK_SIZE) * 2)

/*
 * Calls vApplicationAssert() with the current file and line when x compares
 * equal to 0.
 * NOTE(review): the expansion is a bare `if` statement, so when the macro is
 * used as the unbraced body of an outer if/else, the following `else` binds
 * to this inner `if`.
 */
#define configASSERT( x ) if( ( x ) == 0 ) vApplicationAssert( __FILE__, __LINE__ )

#define configUSE_QUEUE_SETS 1

#define configCHECK_FOR_STACK_OVERFLOW 2

#define configQUEUE_REGISTRY_SIZE 10

#define configUSE_STATS_FORMATTING_FUNCTIONS 1

#define configNUM_THREAD_LOCAL_STORAGE_POINTERS 0

#define configUSE_TICKLESS_IDLE	0
#define configTASK_RETURN_ADDRESS    NULL
/* INCLUDE_* switches: every one listed here is set to 1. */
#define INCLUDE_vTaskPrioritySet             1
#define INCLUDE_uxTaskPriorityGet            1
#define INCLUDE_vTaskDelete                  1
#define INCLUDE_vTaskCleanUpResources        1
#define INCLUDE_vTaskSuspend                 1
#define INCLUDE_vTaskDelayUntil              1
#define INCLUDE_vTaskDelay                   1
#define INCLUDE_eTaskGetState                1
#define INCLUDE_xTimerPendFunctionCall       1
#define INCLUDE_pcTaskGetTaskName            1
/* Tick timer and interrupt controller, taken from XPAR_* constants that are defined outside this file. */
#define configTIMER_ID XPAR_XTTCPS_0_DEVICE_ID

#define configTIMER_BASEADDR XPAR_XTTCPS_0_BASEADDR

#define configTIMER_INTERRUPT_ID XPAR_XTTCPS_0_INTR

#define configUNIQUE_INTERRUPT_PRIORITIES 32

#define configINTERRUPT_CONTROLLER_DEVICE_ID XPAR_SCUGIC_SINGLE_DEVICE_ID

#define configINTERRUPT_CONTROLLER_BASE_ADDRESS XPAR_SCUGIC_0_DIST_BASEADDR

#define configINTERRUPT_CONTROLLER_CPU_INTERFACE_OFFSET 0x10000

/* Prototypes for the functions the macros in this file expand to; their definitions are not shown here. */
void vApplicationAssert( const char *pcFile, uint32_t ulLine );
void FreeRTOS_SetupTickInterrupt( void );
#define configSETUP_TICK_INTERRUPT() FreeRTOS_SetupTickInterrupt()

void FreeRTOS_ClearTickInterrupt( void );
#define configCLEAR_TICK_INTERRUPT()	FreeRTOS_ClearTickInterrupt()

/* Run-time statistics are switched off; the two port macros below expand to nothing. */
#define configGENERATE_RUN_TIME_STATS 0

#define portCONFIGURE_TIMER_FOR_RUN_TIME_STATS()

#define portGET_RUN_TIME_COUNTER_VALUE()

#define configCOMMAND_INT_MAX_OUTPUT_SIZE 2096

/* Evaluates to 6 with configMAX_PRIORITIES set to 8. */
#define recmuCONTROLLING_TASK_PRIORITY ( configMAX_PRIORITIES - 2 )

/* Maps fabs() onto the compiler builtin. */
#define fabs( x ) __builtin_fabs( x )

#define configMAX_API_CALL_INTERRUPT_PRIORITY (18)

#define configUSE_PORT_OPTIMISED_TASK_SELECTION 1

And this is the lwipopts.h

/*
 * lwIP option overrides as posted (lwipopts.h): OS integration, memory pools,
 * ARP/ICMP/IP and UDP settings. Notes below restate values visible in this
 * file or are marked as assumptions to confirm.
 */
#define SYS_LIGHTWEIGHT_PROT 1


#define NO_SYS_NO_TIMERS 1

#define OS_IS_FREERTOS
/* Both priorities below are set to the same value, 2. */
#define DEFAULT_THREAD_PRIO 2
#define TCPIP_THREAD_PRIO (2)
#define TCPIP_THREAD_STACKSIZE 1024
#define DEFAULT_TCP_RECVMBOX_SIZE 	200
#define DEFAULT_ACCEPTMBOX_SIZE 	5
#define TCPIP_MBOX_SIZE		200
#define DEFAULT_UDP_RECVMBOX_SIZE 	100
#define DEFAULT_RAW_RECVMBOX_SIZE	30
#define LWIP_COMPAT_MUTEX 0
#define LWIP_ALLOW_MEM_FREE_FROM_OTHER_CONTEXT 1

#define LWIP_TCP_KEEPALIVE 0

/* Heap and memory-pool element counts. */
#define MEM_ALIGNMENT 64
#define MEM_SIZE 524288
#define MEMP_NUM_PBUF 1024
#define MEMP_NUM_UDP_PCB 4
#define MEMP_NUM_TCP_PCB 128
#define MEMP_NUM_TCP_PCB_LISTEN 64
#define MEMP_NUM_TCP_SEG 256
#define MEMP_NUM_SYS_TIMEOUT 8
#define MEMP_NUM_NETBUF 64
#define MEMP_NUM_NETCONN 64
#define MEMP_NUM_TCPIP_MSG_API 32
#define MEMP_NUM_TCPIP_MSG_INPKT 64

/*
 * MEMP_NUM_NETBUF, MEMP_NUM_NETCONN and MEMP_NUM_SYS_TIMEOUT are defined a
 * second time here with the same values as above, so the repetition does not
 * change anything.
 */
#define MEMP_NUM_NETBUF     64
#define MEMP_NUM_NETCONN    64
#define LWIP_PROVIDE_ERRNO  1
#define MEMP_NUM_SYS_TIMEOUT 8
/*
 * NOTE(review): 16384 * 3400 = 55,705,600; if the pbuf pool is sized as
 * element count times buffer size, this reserves roughly 55.7 MB - confirm
 * that this is intended and fits the target's memory.
 */
#define PBUF_POOL_SIZE 16384
#define PBUF_POOL_BUFSIZE 3400
#define PBUF_LINK_HLEN 16

#define ARP_TABLE_SIZE 10
#define ARP_QUEUEING 1

#define ICMP_TTL 255

#define IP_OPTIONS 0
#define IP_FORWARD 0
#define IP_REASSEMBLY 1
#define IP_FRAG 1
#define IP_REASS_MAX_PBUFS 128
#define IP_FRAG_MAX_MTU 3000
#define IP_DEFAULT_TTL 255
#define LWIP_CHKSUM_ALGORITHM 3

#define LWIP_UDP 1
#define UDP_TTL 255

/* TCP settings: segment size, send buffer, receive window and retransmission limits. */
#define LWIP_TCP 1
#define TCP_MSS 2920
#define TCP_SND_BUF 32768
#define TCP_WND 32768
#define TCP_TTL 255
#define TCP_MAXRTX 12
#define TCP_SYNMAXRTX 4
#define TCP_QUEUE_OOSEQ 1
/*
 * Evaluates to 179 (16 * 32768 / 2920, integer division). The expansion is
 * parenthesized so it keeps that value when used inside a larger expression,
 * e.g. as a divisor or next to a higher-precedence operator.
 */
#define TCP_SND_QUEUELEN   (16 * TCP_SND_BUF / TCP_MSS)
/*
 * All six software checksum switches (generate and check, for TCP/UDP/IP) are
 * 0 while both LWIP_FULL_CSUM_OFFLOAD_* switches are 1.
 * NOTE(review): presumably checksums are left to the Ethernet hardware - confirm.
 */
#define CHECKSUM_GEN_TCP 	0
#define CHECKSUM_GEN_UDP 	0
#define CHECKSUM_GEN_IP  	0
#define CHECKSUM_CHECK_TCP  0
#define CHECKSUM_CHECK_UDP  0
#define CHECKSUM_CHECK_IP 	0
#define LWIP_FULL_CSUM_OFFLOAD_RX  1
#define LWIP_FULL_CSUM_OFFLOAD_TX  1

#define MEMP_SEPARATE_POOLS 1
#define MEMP_NUM_FRAG_PBUF 256
#define IP_OPTIONS_ALLOWED 0
/* Takes whatever value TCP_MSS has (2920 where it is defined earlier in this file). */
#define TCP_OVERSIZE TCP_MSS

#define LWIP_DHCP 1
#define DHCP_DOES_ARP_CHECK 1

#define CONFIG_LINKSPEED_AUTODETECT 1

Thanks for the help.

Looking at your PCAP file:

Your x.x.x.100 device seems to send packets of up to 10KB, that is probably not true, the received data is presented in big chunks on the PC.

The x.x.x.156 device replies with ACK’s after every 2,892 bytes.

Do I see right that you send +/- 900 KB in 20 ms?
That is a lot, almost 50% of the total bandwidth?

What I further see in the PCAP is that the device receives all data, but in the end, it doesn’t increase its TCP windows size anymore.
The PC stops sending because the other party’s RX buffer is apparently full.

I would try to only receive the data and not update the video display, just to see if the transport still has hiccups.

That is right. 640x480x3 bytes, so 900 KB, are sent at 30 Hz.

That is exactly what I am doing. I only receive the images.
By the way, the PC is 100 and the FPGA is the 156

Strange: normally calling recv() should trigger sending a TCP WIN-change to the peer.
The last update is sent in packet 482.
After that, another 13 KB is received and acknowledged, but the WIN packet is not sent.
Do you think that you have also received the last 13 KB?
PS. Is your task blocking ( sleeping ) enough, does it leave CPU time for the other tasks?

I tried to put a printf inside the while loop to show the amount of received bytes:

printf("read: %d - total: %d\n", read_bytes, total_bytes);

When I did that, it was only blocked there (maybe as expected) and there was never a context switch, so I was now able to receive data without it blocking. However, I could not see the frequency. Then, I replaced the printf with a vTaskDelay(0) to force the context switch, but then the same behavior as at the beginning occurred.
I call the delay function with 0 as I want to come back to the task as soon as possible, in fact to avoid that the window is full and I cannot receive anymore. Isn’t that correct? (clearly not because it is not working :slight_smile:)

Is your socket configured as blocking? Your task must block somewhere.
With just a vTaskDelay(0), I wouldn’t be sure if a yield takes place.
What tasks do you have? Does lwIP run in its own task? What about the priorities?

I might be wrong, but I believe the socket is blocking; however, I am using select with a 50 us timeout, so the task does not block. With select I am checking on multiple sockets, not only one. Once I detect some activity, I check each of them to find which one was triggered. That is the reason for the first if in the code:

if(FD_ISSET(subscriber->publisher_socket_to_receive_fd, fd_set))

Regarding the vTaskDelay(0) you are right, it does not yield. I put a breakpoint there and, going step by step, it always comes back to that while loop. I changed it to vTaskDelay(1 / portTICK_PERIOD_MS) but it is still the same. The issue here is that I don’t want to wait in the order of ms but us, or come back to the task as soon as possible. My reasoning is that it can check if there is activity in the socket: copy the data from the TCP input buffer if there is activity in the socket, or yield and come back to check as soon as possible.
However, I kept going step by step. From what I can figure out, as I receive 30 fps but only read one frame every time I land on the breakpoint, the input buffer got full. At that point, the select function didn’t even time out, and that might be where it blocks. Could that be?

For the tasks it looks like this:
Startup task: I started from Xilinx’s example (like this thread), which acquires an IP address and handles the initialization of the connection. Again, following the example, this task has priority DEFAULT_THREAD_PRIO=2. This is where the “new tasks” (mine) are created. After this, task 1 is deleted and the “custom” tasks remain.

I create four tasks: one to receive from sockets, one to send to sockets, one to send data received from sockets to my hardware, and one to receive data from my hardware to send out to the sockets. They all have DEFAULT_THREAD_PRIO priority. The reasoning is the same as explained before. Each of them checks if there is activity (sockets or HW); if there is, it processes it. If there isn’t, it yields (with vTaskDelay(1ms)) and loops back when there is a context switch, so that is why I want to come back to each task as soon as possible, just to check if there is something to process. Therefore, a round-robin without fixed priority, so the tasks execute in order (T1, T2, T3, T4, T1, T2, T3, T4…), should be enough. Could that be the problem?

Just to point out that to create tasks I use (following the example) sys_thread_new which is defined as:

/*---------------------------------------------------------------------------*
 * Routine:  sys_thread_new
 *---------------------------------------------------------------------------*
 * Description:
 *      Starts a new thread with priority "prio" that will begin its
 *      execution in the function "thread()". The "arg" argument will be
 *      passed as an argument to the thread() function. The id of the new
 *      thread is returned. Both the id and the priority are system
 *      dependent.
 * Inputs:
 *      char *name              -- Name of thread
 *      void (* thread)(void *arg) -- Pointer to function to run.
 *      void *arg               -- Argument passed into function
 *      int stacksize           -- Required stack amount in bytes
 *      int prio                -- Thread priority
 * Outputs:
 *      sys_thread_t            -- Pointer to per-thread timeouts.
 *---------------------------------------------------------------------------*/
sys_thread_t sys_thread_new( const char *pcName, void( *pxThread )( void *pvParameters ), void *pvArg, int iStackSize, int iPriority )
{
xTaskHandle xCreatedTask;
portBASE_TYPE xResult;
sys_thread_t xReturn;

	xResult = xTaskCreate( pxThread, ( const char * const) pcName, iStackSize, pvArg, iPriority, &xCreatedTask );

I hope it was detailed enough for a better understanding.
Thanks again for the help.
if( xResult == pdPASS )
{
xReturn = xCreatedTask;
}
else
{
xReturn = NULL;
}

	return xReturn;
}

I would like to suggest a few things:

  • Make a version in which reading and writing to a socket is done within the same task. lwIP has had problems with sharing sockets among tasks. I’m not sure if those problems are solved.
    If you use select(), of course you can handle more than one socket within a task, both for reading and writing.

  • If the above doesn’t help, I would try a version without select(), just to see if it gets stuck there,

In general, I would let tasks block for much longer than 1 ms. Can’t your hardware trigger an ISR when it needs attention? From the ISR, you can call xTaskNotifyFromISR() to wake-up a task.

You might want to create a specific Video -TCP-task, because of the high volume.

  • When woken up by select(), you might want to call lwip_recv() multiple times in a non-blocking way, until there is nothing left to read, symbolically:
    /* Symbolic sketch, not compilable as written: select() is shown without its arguments. */
    for( ;; )
    {
        /* Wait here until a socket has something to read. */
        select();
        /* Drain the socket: keep reading until a call returns no data. */
        for( ;; )
        {
            /* MSG_DONTWAIT requests a non-blocking read, as described in the text above. */
            int rc = lwip_recv( socket, buf, sizeof buf, MSG_DONTWAIT );
            if( rc <= 0 )
            {
                /* Nothing read (or an error): go back to select(). */
                break;
            }
            /* Forward the rc bytes that were just received. */
            send_to_matrix( buf, rc );
        }
    }

I am using one socket to receive and another one to send so they are independent. Or do you mean I should have only one task for lwip related things (receiving and sending sockets) and one for HW related things?

I am using one socket to receive and another one to send so they are independent.

Oh, that’s good; then the problem is avoided.

Or do you mean I should have only one task for lwip related things
(receiving and sending sockets) and one for HW related things?

Receiving 500 Mbit per second and sending these data to a video matrix is a lot. Maybe that should be done in its own task and with a higher priority.

About the HW-related things: you wrote that a task is using vTaskDelay(1). It would be nice if that task could be triggered by an event instead of just waking up every second and polling.

But I would be curious to see how it goes if you don’t use select() but a blocking lwip_recv(). Let it block for e.g. 5 seconds.

Images are 640x480 in RGB so it is actually 26.4 Mbit per second. But I see your point of this being done with higher priority. And what about the sending part? Would it be better that both TCP related tasks have the same priority?

That is a very good point to improve and rely on xTaskNotifyFromISR() like you suggested previously. Just minor thing. On that note, isn’t it possible to have an interrupt to receive data from TCP?

On that note, isn’t it possible to have an interrupt to receive data from TCP?

I am sure that lwIP is using an interrupt on the reception of data.
So if you call lwip_recv() in a blocking way, or call select(), the API will block until either a time-out has been reached, or data have been received.

That blocking that you mention is the reason why I thought it would be good to keep polling, so the vTaskDelay should be as quick as possible.

Then, if I understood correctly, you would recommend to replace the busy wait for the HW with interrupts and the sending and receiving tasks for lwIP to have higher priorities, right?