Hi Michael,
Sorry to disturb you with this mail!
I am using the Linux 4.7.2 virtio driver on x86_64 with a smart NIC to validate PF and VF virtio performance.
The VF reaches more than 1 Mpps, but the PF only reaches about 500 Kpps, even with 8 virtio queues.
So I reviewed the virtio IOMMU patch in detail, and I am worried that the per-packet DMA map/unmap costs many cycles. Do you have any suggestions? Thanks a lot!
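As a rough back-of-envelope (assuming, say, a ~2 GHz core servicing one queue): at 1 Mpps there are only about 2000 cycles of budget per packet, so one dma_map plus one dma_unmap per packet, each of which can cost hundreds of cycles once an IOMMU is involved, would already eat a large share of that budget.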
Today I also reviewed the Intel 40G driver in Linux 4.7.2 and compared it with the virtio net driver. I focused on buffer management and listed the differences in the table below (a rough sketch contrasting the two RX patterns follows the table):
|              | Intel 40G                                                 | Virtio net                                                  |
| TX dma map   | 1. When transmit begins 2. DMA map per packet             | 1. When transmit begins 2. DMA map per packet               |
| TX dma unmap | 1. When transmit completes 2. DMA unmap per packet        | 1. Before the next transmit begins 2. DMA unmap per packet  |
| RX dma map   | 1. At initialization 2. When free > I40E_RX_BUFFER_WRITE  | 1. At initialization 2. When free > size/2                  |
| RX dma unmap | 1. When receive and reuse fails 2. When closing           | 1. When receiving a packet 2. DMA unmap per packet          |
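To make the RX difference concrete, here is a minimal sketch of the two patterns as I understand them (my_rx_buf, i40e_style_receive and virtio_style_refill are names I made up for illustration; this is not the real driver code):

#include <linux/dma-mapping.h>

/* Hypothetical RX buffer descriptor, only for illustration. */
struct my_rx_buf {
	struct page *page;
	dma_addr_t dma;
	unsigned int offset;
};

/*
 * i40e-style RX: the page stays DMA-mapped across many packets.  The
 * hot path only syncs the used region for the CPU and recycles the
 * page; dma_unmap_page() happens only when reuse fails or the ring is
 * torn down.
 */
static void i40e_style_receive(struct device *dev, struct my_rx_buf *buf,
			       unsigned int len)
{
	dma_sync_single_range_for_cpu(dev, buf->dma, buf->offset, len,
				      DMA_FROM_DEVICE);
	/* ... build the skb, flip buf->offset, hand the page back to HW ... */
}

/*
 * virtio-net-style RX (with the DMA API in use): detach_buf() unmaps
 * the buffer of every received packet, and refilling the ring maps a
 * brand-new buffer, i.e. one map plus one unmap per packet.
 */
static int virtio_style_refill(struct device *dev, void *buf,
			       unsigned int len, dma_addr_t *dma)
{
	*dma = dma_map_single(dev, buf, len, DMA_FROM_DEVICE);
	return dma_mapping_error(dev, *dma) ? -ENOMEM : 0;
}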
Looking forward to your reply.
Thanks,
Jason
BTW, these are the call paths I traced:
1. Intel Ethernet Controller XL710 (40G) driver:
TX:
i40e_lan_xmit_frame()
->i40e_xmit_frame_ring()
->i40e_tx_map()
{
......
dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
......
dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
DMA_TO_DEVICE);
......
}
i40e_clean_tx_irq()
{
......
/* unmap skb header data */
dma_unmap_single(tx_ring->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
/* clear tx_buffer data */
tx_buf->skb = NULL;
dma_unmap_len_set(tx_buf, len, 0);
......
/* unmap remaining buffers */
while (tx_desc != eop_desc) {
/* unmap any remaining paged data */
if (dma_unmap_len(tx_buf, len)) {
dma_unmap_page(tx_ring->dev,
dma_unmap_addr(tx_buf, dma),
dma_unmap_len(tx_buf, len),
DMA_TO_DEVICE);
dma_unmap_len_set(tx_buf, len, 0);
}
......
}
......
}
i40e_vsi_free_tx_resources()
->i40e_free_tx_resources()
->i40e_clean_tx_ring()
->i40e_unmap_and_free_tx_resource()
{
if (tx_buffer->skb) {
dev_kfree_skb_any(tx_buffer->skb);
if (dma_unmap_len(tx_buffer, len))
dma_unmap_single(ring->dev,
dma_unmap_addr(tx_buffer, dma),
dma_unmap_len(tx_buffer, len),
DMA_TO_DEVICE);
} else if (dma_unmap_len(tx_buffer, len)) {
dma_unmap_page(ring->dev,
dma_unmap_addr(tx_buffer, dma),
dma_unmap_len(tx_buffer, len),
DMA_TO_DEVICE);
}
......
}
RX:
i40e_vsi_configure()
->i40e_vsi_configure_rx()
{
......
/* set up individual rings */
for (i = 0; i < vsi->num_queue_pairs && !err; i++)
err = i40e_configure_rx_ring(vsi->rx_rings[i]);
......
}
->i40e_configure_rx_ring()
->i40e_alloc_rx_buffers()
{
......
do {
if (!i40e_alloc_mapped_page(rx_ring, bi))
goto no_buffers;
......
cleaned_count--;
} while (cleaned_count);
......
}
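For completeness, i40e_alloc_mapped_page() is where that mapping happens; as far as I can tell it only allocates and maps a page when the buffer does not already hold one, otherwise the existing mapping is reused (paraphrased and trimmed from my reading of the 4.7.2 source):
i40e_alloc_mapped_page()
{
	struct page *page = bi->page;
	dma_addr_t dma;

	/* a recycled buffer keeps its existing DMA mapping */
	if (likely(page))
		return true;

	page = dev_alloc_page();
	if (unlikely(!page))
		return false;

	/* map the whole page once; it is then reused across many packets */
	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
	......
}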
i40e_clean_rx_irq()
{
......
if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
failure = failure ||
i40e_alloc_rx_buffers(rx_ring, cleaned_count);
cleaned_count = 0;
}
......
/* we are reusing so sync this buffer for CPU use */
dma_sync_single_range_for_cpu(rx_ring->dev,
rx_buffer->dma,
rx_buffer->page_offset,
I40E_RXBUFFER_2048,
DMA_FROM_DEVICE);
......
}
i40e_vsi_free_rx_resources()
->i40e_free_rx_resources()
->i40e_clean_rx_ring()
{
......
dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
......
}
2. virtio net driver:
init_vqs()
->virtnet_alloc_queues()
->virtnet_find_vqs()
->vp_find_vqs()
->vp_try_to_find_vqs()
->vp_setup_vq()
->setup_vq()
TX:
start_xmit()
->free_old_xmit_skbs()
{
......
virtqueue_get_buf()
->detach_buf()
{
......
vring_unmap_one(vq, &vq->vring.desc[i]);
......
}
......
}
->xmit_skb()
virtqueue_add_outbuf()
->virtqueue_add()
{
......
dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
......
}
RX:
virtnet_receive()
->virtqueue_get_buf()
->detach_buf()
{
......
vring_unmap_one(vq, &vq->vring.desc[i]);
......
}
->receive_buf()
->try_fill_recv()
->add_recvbuf_small()
->virtqueue_add_inbuf()
->virtqueue_add()
{
......
dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
......
}
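And for reference, this is roughly where the per-packet cost enters in drivers/virtio/virtio_ring.c once the DMA API is used (paraphrased from my reading of the virtio DMA patches; the indirect-descriptor case and error handling are omitted):
vring_map_one_sg()
{
	/* legacy path: hand the guest-physical address straight to the device */
	if (!vring_use_dma_api(vq->vq.vdev))
		return (dma_addr_t)sg_phys(sg);

	/* DMA API path: one dma_map_page() per sg element, per packet */
	return dma_map_page(vring_dma_dev(vq),
			    sg_page(sg), sg->offset, sg->length,
			    direction);
}
vring_unmap_one()
{
	u16 flags;

	if (!vring_use_dma_api(vq->vq.vdev))
		return;

	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);

	/* DMA API path: one dma_unmap_page() per descriptor, per packet */
	dma_unmap_page(vring_dma_dev(vq),
		       virtio64_to_cpu(vq->vq.vdev, desc->addr),
		       virtio32_to_cpu(vq->vq.vdev, desc->len),
		       (flags & VRING_DESC_F_WRITE) ?
		       DMA_FROM_DEVICE : DMA_TO_DEVICE);
}
If I read this right, every descriptor costs one dma_map_page() in virtqueue_add() and one dma_unmap_page() in detach_buf() once vring_use_dma_api() returns true, which is why I suspect this is where the PF's cycles are going.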