FreeRTOS + lwIP TCP cannot receive large packages at high rates

aripod · June 16, 2020, 3:25pm

I am using FreeRTOS with lwIP. It is working properly. I am able to send rgb8 images via Ethernet from the ZCU104 (ARM Cortex-A53) to a PC and vice versa. I can send and receive 640x480 images (921600 bytes) at 1 fps, or 2x2 images at 30 fps. However, when I try to send/receive 640x480 images at 30 fps, the problem appears. I can see in Wireshark (here is the .pcap file) that the packets arrive, and I can read the first 640x480x3 bytes. Afterwards, I cannot receive anymore and I cannot read what Wireshark is showing. There is a TCP window update, a couple of packets are sent, and then only TCP retransmissions.

The task to receive is this:

// Overlays a 32-bit length with its four raw bytes, so that the 4-byte
// length prefix read from the socket can be copied in byte by byte
// (via .array) and then read back as one integer (via .size).
union Usize {
	unsigned char array[4];
	uint32_t size;
};
// NOTE(review): the value seen through .size depends on the byte order of the
// CPU; presumably sender and receiver use the same order — confirm.
union Usize convert;
	// Receive one length-prefixed message from the publisher socket when its
	// descriptor is marked in the fd set. The resulting buffer layout is:
	//   [0] = ID, [1..4] = 4-byte length prefix, [5..] = payload.
	if(FD_ISSET(subscriber->publisher_socket_to_receive_fd, fd_set))
	{
		// NOTE(review): only read_bytes > 0 is tested below. If fewer than 4
		// bytes come back, the remaining bufsize[] entries are whatever was
		// there before — confirm a short read cannot happen here.
		read_bytes = lwip_recvfrom(subscriber->publisher_socket_to_receive_fd, bufsize, 4, 0, NULL, NULL);	//Read first 4 bytes to get the total length of the message
		if (read_bytes > 0) {
			// NOTE(review): total_bytes is not reset anywhere in this
			// fragment; presumably it is zeroed before each message — verify.
			total_bytes = total_bytes + read_bytes;
			// Rebuild the 32-bit payload length from the four prefix bytes.
			convert.array[0] = bufsize[0];
			convert.array[1] = bufsize[1];
			convert.array[2] = bufsize[2];
			convert.array[3] = bufsize[3];

			subscriber->data_subscribe_size = convert.size+1+4;	// 1 extra for the ID and 4 ones to include the total length
			// NOTE(review): the malloc result is written to without a NULL
			// check, and nothing in this fragment frees an earlier buffer —
			// presumably the consumer frees it; verify.
			subscriber->data_to_subscribe = malloc((subscriber->data_subscribe_size)*sizeof(unsigned char));

			subscriber->data_to_subscribe[0] = subscriber->ID;	// ID depends on the HW blocks, so it is directly copied (hardcoded)
			subscriber->data_to_subscribe[1] = bufsize[0];
			subscriber->data_to_subscribe[2] = bufsize[1];
			subscriber->data_to_subscribe[3] = bufsize[2];
			subscriber->data_to_subscribe[4] = bufsize[3];
			// Keep reading until prefix + payload (convert.size + 4 bytes)
			// have been received. The "+1" in the destination skips the ID byte.
			// NOTE(review): read_bytes is added without being checked; a
			// return of 0 would make this loop spin forever and a negative
			// return would move total_bytes backwards — confirm how a closed
			// connection or an error is meant to be handled.
			while(total_bytes < convert.size+4)
			{
				read_bytes = lwip_recvfrom(subscriber->publisher_socket_to_receive_fd, subscriber->data_to_subscribe+total_bytes+1, convert.size-total_bytes+4, 0, NULL, NULL);	// Read the rest of the bytes
				total_bytes = total_bytes + read_bytes;
			}
			// Presumably tells the consumer that a complete message is waiting — TODO confirm.
 			subscriber->QueueFree = FALSE;
		}
	}

This is the FreeRTOSConfig.h

#define configUSE_PREEMPTION 1

#define configUSE_MUTEXES 1

#define configUSE_RECURSIVE_MUTEXES 1

#define configUSE_COUNTING_SEMAPHORES 1

#define configUSE_TIMERS 1

#define configUSE_IDLE_HOOK 0

#define configUSE_TICK_HOOK 0

#define configUSE_MALLOC_FAILED_HOOK 1

#define configUSE_TRACE_FACILITY 1

#define configUSE_16_BIT_TICKS 0

#define configUSE_APPLICATION_TASK_TAG 0

#define configUSE_CO_ROUTINES 0

#define configTICK_RATE_HZ (100)

#define configMAX_PRIORITIES (8)

#define configMAX_CO_ROUTINE_PRIORITIES 2

#define configMINIMAL_STACK_SIZE ( ( unsigned short ) 200)

#define configTOTAL_HEAP_SIZE ( ( size_t ) ( 524288 ) )

#define configMAX_TASK_NAME_LEN 10

#define configIDLE_SHOULD_YIELD 1

#define configTIMER_TASK_PRIORITY (configMAX_PRIORITIES - 1)

#define configTIMER_QUEUE_LENGTH 10

#define configTIMER_TASK_STACK_DEPTH ((configMINIMAL_STACK_SIZE) * 2)

#define configASSERT( x ) if( ( x ) == 0 ) vApplicationAssert( __FILE__, __LINE__ )

#define configUSE_QUEUE_SETS 1

#define configCHECK_FOR_STACK_OVERFLOW 2

#define configQUEUE_REGISTRY_SIZE 10

#define configUSE_STATS_FORMATTING_FUNCTIONS 1

#define configNUM_THREAD_LOCAL_STORAGE_POINTERS 0

#define configUSE_TICKLESS_IDLE	0
#define configTASK_RETURN_ADDRESS    NULL
#define INCLUDE_vTaskPrioritySet             1
#define INCLUDE_uxTaskPriorityGet            1
#define INCLUDE_vTaskDelete                  1
#define INCLUDE_vTaskCleanUpResources        1
#define INCLUDE_vTaskSuspend                 1
#define INCLUDE_vTaskDelayUntil              1
#define INCLUDE_vTaskDelay                   1
#define INCLUDE_eTaskGetState                1
#define INCLUDE_xTimerPendFunctionCall       1
#define INCLUDE_pcTaskGetTaskName            1
#define configTIMER_ID XPAR_XTTCPS_0_DEVICE_ID

#define configTIMER_BASEADDR XPAR_XTTCPS_0_BASEADDR

#define configTIMER_INTERRUPT_ID XPAR_XTTCPS_0_INTR

#define configUNIQUE_INTERRUPT_PRIORITIES 32

#define configINTERRUPT_CONTROLLER_DEVICE_ID XPAR_SCUGIC_SINGLE_DEVICE_ID

#define configINTERRUPT_CONTROLLER_BASE_ADDRESS XPAR_SCUGIC_0_DIST_BASEADDR

#define configINTERRUPT_CONTROLLER_CPU_INTERFACE_OFFSET 0x10000

void vApplicationAssert( const char *pcFile, uint32_t ulLine );
void FreeRTOS_SetupTickInterrupt( void );
#define configSETUP_TICK_INTERRUPT() FreeRTOS_SetupTickInterrupt()

void FreeRTOS_ClearTickInterrupt( void );
#define configCLEAR_TICK_INTERRUPT()	FreeRTOS_ClearTickInterrupt()

#define configGENERATE_RUN_TIME_STATS 0

#define portCONFIGURE_TIMER_FOR_RUN_TIME_STATS()

#define portGET_RUN_TIME_COUNTER_VALUE()

#define configCOMMAND_INT_MAX_OUTPUT_SIZE 2096

#define recmuCONTROLLING_TASK_PRIORITY ( configMAX_PRIORITIES - 2 )

#define fabs( x ) __builtin_fabs( x )

#define configMAX_API_CALL_INTERRUPT_PRIORITY (18)

#define configUSE_PORT_OPTIMISED_TASK_SELECTION 1

And this is the lwipopts.h

#define SYS_LIGHTWEIGHT_PROT 1


#define NO_SYS_NO_TIMERS 1

#define OS_IS_FREERTOS
#define DEFAULT_THREAD_PRIO 2
#define TCPIP_THREAD_PRIO (2)
#define TCPIP_THREAD_STACKSIZE 1024
#define DEFAULT_TCP_RECVMBOX_SIZE 	200
#define DEFAULT_ACCEPTMBOX_SIZE 	5
#define TCPIP_MBOX_SIZE		200
#define DEFAULT_UDP_RECVMBOX_SIZE 	100
#define DEFAULT_RAW_RECVMBOX_SIZE	30
#define LWIP_COMPAT_MUTEX 0
#define LWIP_ALLOW_MEM_FREE_FROM_OTHER_CONTEXT 1

#define LWIP_TCP_KEEPALIVE 0

#define MEM_ALIGNMENT 64
#define MEM_SIZE 524288
#define MEMP_NUM_PBUF 1024
#define MEMP_NUM_UDP_PCB 4
#define MEMP_NUM_TCP_PCB 128
#define MEMP_NUM_TCP_PCB_LISTEN 64
#define MEMP_NUM_TCP_SEG 256
#define MEMP_NUM_SYS_TIMEOUT 8
#define MEMP_NUM_NETBUF 64
#define MEMP_NUM_NETCONN 64
#define MEMP_NUM_TCPIP_MSG_API 32
#define MEMP_NUM_TCPIP_MSG_INPKT 64

#define MEMP_NUM_NETBUF     64
#define MEMP_NUM_NETCONN    64
#define LWIP_PROVIDE_ERRNO  1
#define MEMP_NUM_SYS_TIMEOUT 8
#define PBUF_POOL_SIZE 16384
#define PBUF_POOL_BUFSIZE 3400
#define PBUF_LINK_HLEN 16

#define ARP_TABLE_SIZE 10
#define ARP_QUEUEING 1

#define ICMP_TTL 255

#define IP_OPTIONS 0
#define IP_FORWARD 0
#define IP_REASSEMBLY 1
#define IP_FRAG 1
#define IP_REASS_MAX_PBUFS 128
#define IP_FRAG_MAX_MTU 3000
#define IP_DEFAULT_TTL 255
#define LWIP_CHKSUM_ALGORITHM 3

#define LWIP_UDP 1
#define UDP_TTL 255

/* TCP settings. Sizes are in bytes unless noted otherwise. */
#define LWIP_TCP 1
#define TCP_MSS 2920
#define TCP_SND_BUF 32768
#define TCP_WND 32768
#define TCP_TTL 255
#define TCP_MAXRTX 12
#define TCP_SYNMAXRTX 4
#define TCP_QUEUE_OOSEQ 1
/* Derived from the send buffer and segment size (integer division, 179 with
 * the values above). The expansion is parenthesised so that it stays a single
 * value when the macro is used inside a larger expression such as
 * `2 * TCP_SND_QUEUELEN` or `x % TCP_SND_QUEUELEN`. */
#define TCP_SND_QUEUELEN   (16 * TCP_SND_BUF / TCP_MSS)
#define CHECKSUM_GEN_TCP 	0
#define CHECKSUM_GEN_UDP 	0
#define CHECKSUM_GEN_IP  	0
#define CHECKSUM_CHECK_TCP  0
#define CHECKSUM_CHECK_UDP  0
#define CHECKSUM_CHECK_IP 	0
#define LWIP_FULL_CSUM_OFFLOAD_RX  1
#define LWIP_FULL_CSUM_OFFLOAD_TX  1

#define MEMP_SEPARATE_POOLS 1
#define MEMP_NUM_FRAG_PBUF 256
#define IP_OPTIONS_ALLOWED 0
#define TCP_OVERSIZE TCP_MSS

#define LWIP_DHCP 1
#define DHCP_DOES_ARP_CHECK 1

#define CONFIG_LINKSPEED_AUTODETECT 1

Thanks for the help.

htibosch · June 16, 2020, 3:56pm

Looking at your PCAP file:

Your x.x.x.100 device seems to send packets of up to 10KB, that is probably not true, the received data is presented in big chunks on the PC.

The x.x.x.156 device replies with ACK’s after every 2,892 bytes.

Do I see right that you send +/- 900 KB in 20 ms?
That is a lot, almost 50% of the total bandwidth?

What I further see in the PCAP is that the device receives all data, but in the end, it doesn’t increase its TCP windows size anymore.
The PC stops sending because the other party’s RX buffer is apparently full.

I would try to only receive the data and not update the video display, just to see if the transport still has hiccups.

aripod · June 16, 2020, 4:06pm

That is right. 640x480x3 bytes, so 900 KB, are sent at 30 Hz

That is exactly what I am doing. I only receive the images.
By the way, the PC is 100 and the FPGA is the 156

htibosch · June 16, 2020, 4:25pm

Strange: normally calling recv() should trigger sending a TCP WIN-change to the peer.
The last update is sent in packet 482.
After that, another 13 KB is received and acknowledged, but the WIN packet is not sent.
Do you think that you have also received the last 13 KB?
PS. Is your task blocking ( sleeping ) enough, does it leave CPU time for the other tasks?

aripod · June 16, 2020, 5:37pm

I tried to put a printf inside the while loop to show the amount of received bytes:

printf("read: %d - total: %d\n", read_bytes, total_bytes);

When I did that, it was only blocked there (maybe as expected) and there was never a context switch, so I was now able to receive data without it blocking. However, I could not see the frequency. Then I replaced the printf with a vTaskDelay(0) to force a context switch, but then the same behavior as at the beginning occurred.
I call the delay function with 0 because I want to come back to the task as soon as possible, precisely so that the window does not fill up and stop me from receiving. Isn’t that correct? (Clearly not, because it is not working.)

htibosch · June 16, 2020, 6:03pm

Is your socket configured as blocking? Your task must block somewhere.
With just a vTaskDelay(0), I wouldn’t be sure if a yield takes place.
What tasks do you have? Does lwIP run in its own task? What about the priorities?

aripod · June 17, 2020, 8:43am

I might be wrong, but I believe the socket is blocking. However, I am using select with a 50us timeout, so the task does not block. With select I am checking multiple sockets, not only one. Once I detect some activity, I check each of them to see which one got triggered. That is the reason for the first if in the code:

if(FD_ISSET(subscriber->publisher_socket_to_receive_fd, fd_set))

Regarding the vTaskDelay(0) you are right, it does not yield. I put a breakpoint there, and going step by step it always comes back to that while loop. I changed it to vTaskDelay(1 / portTICK_PERIOD_MS) but it is still the same. The issue here is that I don’t want to wait on the order of ms but of us, or come back to the task as soon as possible. My reasoning is that the task can check if there is activity on the socket: copy the data from the TCP input buffer if there is activity, or otherwise yield and come back to check as soon as possible.
However, I kept going step by step. From what I can figure out, as I receive 30 fps but only read one frame every time I land on the breakpoint, the input buffer got full. At that point, the select function didn’t even time out, and that might be where it blocks. Could that be?

For the tasks it looks like this:
Startup task: I started from Xilinx’s example (like this thread), which acquires an IP and handles the initialization of the connection. Again, following the example, this task has priority DEFAULT_THREAD_PRIO=2. This is where the “new tasks” (mine) are created. After this, task 1 is deleted and the “custom” tasks remain.

I create four tasks: one to receive from sockets, one to send to sockets, one to send data received from sockets to my hardware, and one to receive data from my hardware to send out to the sockets. They all have DEFAULT_THREAD_PRIO priority. The reasoning is the same as explained before. Each of them checks if there is activity (sockets or HW); if there is, it processes it. If there isn’t, it yields (with vTaskDelay(1ms)) and loops back after the next context switch, which is why I want to come back to each task as soon as possible, just to check if there is something to process. Therefore, a round-robin without fixed priority, so that the tasks execute in order (T1, T2, T3, T4, T1, T2, T3, T4…), should be enough. Could that be the problem?

Just to point out that to create tasks I use (following the example) sys_thread_new which is defined as:

/*---------------------------------------------------------------------------*
 * Routine:  sys_thread_new
 *---------------------------------------------------------------------------*
 * Description:
 *      Starts a new thread with priority "prio" that will begin its
 *      execution in the function "thread()". The "arg" argument will be
 *      passed as an argument to the thread() function. The id of the new
 *      thread is returned. Both the id and the priority are system
 *      dependent.
 * Inputs:
 *      char *name              -- Name of thread
 *      void (* thread)(void *arg) -- Pointer to function to run.
 *      void *arg               -- Argument passed into function
 *      int stacksize           -- Required stack amount in bytes
 *      int prio                -- Thread priority
 * Outputs:
 *      sys_thread_t            -- Pointer to per-thread timeouts.
 *---------------------------------------------------------------------------*/
sys_thread_t sys_thread_new( const char *pcName, void( *pxThread )( void *pvParameters ), void *pvArg, int iStackSize, int iPriority )
{
xTaskHandle xCreatedTask;
portBASE_TYPE xResult;
sys_thread_t xReturn;

	xResult = xTaskCreate( pxThread, ( const char * const) pcName, iStackSize, pvArg, iPriority, &xCreatedTask );

I hope it was detailed enough for a better understanding.
Thanks again for the help.
if( xResult == pdPASS )
{
xReturn = xCreatedTask;
}
else
{
xReturn = NULL;
}

	return xReturn;
}

htibosch · June 19, 2020, 9:09am

I would like to suggest a few things:

Make a version in which reading and writing to a socket is done within the same task. lwIP has had problems with sharing sockets among tasks. I’m not sure if those problems are solved.
If you use select(), of course you can handle more than one socket within a task, both for reading and writing.
If the above doesn’t help, I would try a version without select(), just to see if it gets stuck there,

In general, I would let tasks block for much longer than 1 ms. Can’t your hardware trigger an ISR when it needs attention? From the ISR, you can call xTaskNotifyFromISR() to wake-up a task.

You might want to create a specific Video -TCP-task, because of the high volume.

When woken up by select(), you might want to call lwip_recv() multiple times in a non-blocking way, until there is nothing left to read, symbolically:

    for( ;; )
    {
        select();
        for( ;; )
        {
            int rc = lwip_recv( socket, buf, sizeof buf, MSG_DONTWAIT );
            if( rc <= 0 )
            {
                break;
            }
            send_to_matrix( buf, rc );
        }
    }

aripod · June 19, 2020, 7:12pm

I am using one socket to receive and another one to send so they are independent. Or do you mean I should have only one task for lwip related things (receiving and sending sockets) and one for HW related things?

htibosch · June 20, 2020, 10:08am

I am using one socket to receive and another one to send so they are independent.

Oh, that’s good; then the problem is avoided.

Or do you mean I should have only one task for lwip related things
(receiving and sending sockets) and one for HW related things?

Receiving 500 Mbit per second and sending these data to a video matrix is a lot. Maybe that should be done in its own task and with a higher priority.

About the HW-related things: you wrote that a task is using vTaskDelay(1). It would be nice if that task could be triggered by an event instead of just waking up every second and polling.

But I would be curious to see how it goes if you don’t use select() but a blocking lwip_recv(). Let it block for e.g. 5 seconds.

aripod · June 20, 2020, 1:03pm

Images are 640x480 in RGB, so it is actually 26.4 MiB per second (about 221 Mbit per second). But I see your point about doing this with a higher priority. And what about the sending part? Would it be better for both TCP-related tasks to have the same priority?

That is a very good point to improve and rely on xTaskNotifyFromISR() like you suggested previously. Just minor thing. On that note, isn’t it possible to have an interrupt to receive data from TCP?

htibosch · June 20, 2020, 1:47pm

On that note, isn’t it possible to have an interrupt to receive data from TCP?

I am sure that lwIP is using an interrupt on the reception of data.
So if you call lwip_recv() in a blocking way, or call select(), the API will block until either a time-out has been reached, or data have been received.

aripod · June 20, 2020, 2:07pm

That blocking you mention is the reason why I thought it would be good to keep polling, so the vTaskDelay should be as short as possible.

Then, if I understood correctly, you would recommend to replace the busy wait for the HW with interrupts and the sending and receiving tasks for lwIP to have higher priorities, right?

aripod · July 7, 2020, 3:56pm

@htibosch I had to pause this and now I come back. I can trigger an ISR when the hardware needs attention, either to send or receive.
To make things clear:

Task 1: Checks if there is new incoming data from TCP on N sockets via select(). If there is something, it raises the corresponding flag (there is one per receiving socket). If there isn’t new data, it suspends with vTaskDelay().
Task 2: Checks if there is new incoming data from the HW. If there is something, it sends it over the corresponding TCP socket. If there isn’t new data, it suspends with vTaskDelay().
Task 3: Checks the flags from Task 1 to send data to the HW. It transfers the corresponding buffer (written by Task 1) if any flag is raised, or suspends with vTaskDelay().
Task 4: Waits until there is incoming data from the HW (busy waiting): a while loop checking the HW’s API with vTaskDelay(). After the busy waiting, it copies the incoming data from the HW to the corresponding buffer and raises the corresponding flag so that Task 2 knows which socket the data corresponds to.

Therefore based on what what you suggested, I am thinking to make the following changes:

Task 1: Keep checking if there is incoming data but increase the vTaskDelay(), to 15ms max (~ 60 fps)
Task 2: Perform the initialization that it does and instead of checking if there is data coming from the HW constantly, suspend the task with vTaskSuspend() rather than vTaskDelay().
Task 3: Perform the initialization that it does and instead of checking the flags risen in Task 1 constantly, suspend the task with vTaskSuspend() rather than vTaskDelay() and wait from Task 1 call xTaskYield() to come to Task 3.
Task 4: ISR and when the interrupt occurs, call xTaskNotifyFromISR() like you suggest to jump to Task 2.

This way, the only constantly running task would be Task 1 checking for incoming TCP data. Does that make sense?

Are vTaskYield and xTaskNotifyFromISR() the correct functions to use? Have I understood your idea correctly?
What should Task 1 do when yields to task 3? Can taskYield() and vTaskDelay() be combined in the same task?
Rather than taskYield I should user a semaphore, right?

Thank you for the help.

aripod · July 7, 2020, 5:33pm

Answering my own question: Better to use Task notifications: "Unblocking an RTOS task with a direct notification is 45% faster * and uses less RAM than unblocking a task with a binary semaphore." from https://www.freertos.org/RTOS-task-notifications.html. More specifically, I should use Task Notifications Used As Light Weight Binary Semaphores, right?

aripod · July 20, 2020, 8:03am

@htibosch I finally managed to combine the semaphores with Xilinx DMA interrupt so I am adapting the code to follow your suggestion.
In the meanwhile, I have a question regarding select and tasks. I have one specific task to check incoming data over TCP that looks like this:

	// Polling loop: wait briefly in select() on the monitored sockets, handle
	// whatever became readable, then sleep before the next check.
	while(1)
	{
		// Timeout is 0 s + 5 in tv_usec, i.e. 5 microseconds (not milliseconds).
		timeout.tv_sec = 0;
		timeout.tv_usec = 5;
		// The cleared set is immediately overwritten by the assignment below,
		// so the FD_ZERO has no lasting effect.
		FD_ZERO(&ready_sockets);
		ready_sockets = current_sockets; 	// This is needed because select is destructive and will return active fds in ready_sockets

		ret = select(maxfd+1, &ready_sockets, NULL, NULL, &timeout);
		if(ret==0)
		{
			//timeout
			//xil_printf("Select timmed out: %d\n", sret);
		}
		// NOTE(review): this branch is also taken when ret is negative, so a
		// select error is treated like "data available" — confirm intent.
		else
		{
            // Copy received that to the corresponding buffer
		}
		vTaskDelay(pdMS_TO_TICKS(30));	// Check new incoming data on the TCP buffer every 30ms
	}

Considering that there is a timeout on select, what happens to the task during that time? Is it blocked so the cpu can still handle the scheduler or the cpu is busy on that select call and the scheduler is not doing anything as this task is running?

aripod · July 21, 2020, 7:04am

I was able to finally send and receive 640x480 RGB images at 30 fps. The issue is now that it works for some time and then it completely stops. I attach the pcap file from Wireshark (https://we.tl/t-qhDWzZeOLQ).
@htibosch How can I determine if the problem is with the incoming buffer getting full?
I am now checking for new incoming data with select (timeout=5ms) and vTaskDelay 10ms, so in total there are ~15 ms. The rest is synchronized with semaphores. Should I give this task a higher priority than all the others, so that I read the incoming data regardless of the others and the buffer does not get full?

hs2 · July 21, 2020, 8:21am

@aripod Are you sure about the select timeout ?
In the code snippet posted it’s 5 microseconds if I got it right.
Also why polling ? That always implies a certain overhead by using the CPU even if there is nothing to do.
Just let the task block in recv to get the data, process it, and use e.g. vTaskDelay or perhaps vTaskDelayUntil to adjust/limit the frame rate. That would also eliminate the overhead of using 2 socket calls (select and recv) just to receive the data.
I guess you have 1 dedicated socket connection receiving the frames.
If the frame is processed/consumed in that task I can’t see how an input buffer can get exhausted. If you forward the data to another task/queue you could e.g. check the queue filling level for debugging.

aripod · July 21, 2020, 9:52am

@hs2 thanks for your answer. It would be good to use an interrupt so I could synchronize it with a semaphore when data is available rather than using vTaskDelay. Is that even possible?

Also, you mean I should remove select and directly call recv? But that is blocking if I understood correctly, which is what I should avoid.

hs2 · July 21, 2020, 10:06am

Is there a reason why you don’t want to block on recv ?
From the code posted it seems that receiving frames and … do something with it is the only thing to do by this task. Or is there any other (unrelated) action you do in this task ?
BTW There is no need to deal with interrupts at this level since the network stack with the help of the ethernet driver does all the work for you i.e. recv returns as soon as there is some data received by this socket.