* [PATCH net 1/2] vmxnet3: set the DMA mask before the first DMA map operation
From: hpreg @ 2018-05-14 12:14 UTC (permalink / raw)
To: netdev; +Cc: Regis Duchesne, Ronak Doshi, VMware, Inc., open list
The DMA mask must be set before, not after, the first DMA map operation, or
the first DMA map operation could in theory fail on some systems.
Fixes: b0eb57cb97e78 ("VMXNET3: Add support for virtual IOMMU")
Signed-off-by: Regis Duchesne <hpreg@vmware.com>
Acked-by: Ronak Doshi <doshir@vmware.com>
---
drivers/net/vmxnet3/vmxnet3_drv.c | 50 +++++++++++++++++++--------------------
drivers/net/vmxnet3/vmxnet3_int.h | 8 ++++---
2 files changed, 30 insertions(+), 28 deletions(-)
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 9ebe2a6..ed58685 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -2688,7 +2688,7 @@ vmxnet3_set_mac_addr(struct net_device *netdev, void *p)
/* ==================== initialization and cleanup routines ============ */
static int
-vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
+vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter)
{
int err;
unsigned long mmio_start, mmio_len;
@@ -2700,30 +2700,12 @@ vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
return err;
}
- if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) {
- if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)) != 0) {
- dev_err(&pdev->dev,
- "pci_set_consistent_dma_mask failed\n");
- err = -EIO;
- goto err_set_mask;
- }
- *dma64 = true;
- } else {
- if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) {
- dev_err(&pdev->dev,
- "pci_set_dma_mask failed\n");
- err = -EIO;
- goto err_set_mask;
- }
- *dma64 = false;
- }
-
err = pci_request_selected_regions(pdev, (1 << 2) - 1,
vmxnet3_driver_name);
if (err) {
dev_err(&pdev->dev,
"Failed to request region for adapter: error %d\n", err);
- goto err_set_mask;
+ goto err_enable_device;
}
pci_set_master(pdev);
@@ -2751,7 +2733,7 @@ vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
iounmap(adapter->hw_addr0);
err_ioremap:
pci_release_selected_regions(pdev, (1 << 2) - 1);
-err_set_mask:
+err_enable_device:
pci_disable_device(pdev);
return err;
}
@@ -3254,7 +3236,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
#endif
};
int err;
- bool dma64 = false; /* stupid gcc */
+ bool dma64;
u32 ver;
struct net_device *netdev;
struct vmxnet3_adapter *adapter;
@@ -3300,6 +3282,24 @@ vmxnet3_probe_device(struct pci_dev *pdev,
adapter->rx_ring_size = VMXNET3_DEF_RX_RING_SIZE;
adapter->rx_ring2_size = VMXNET3_DEF_RX_RING2_SIZE;
+ if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) {
+ if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)) != 0) {
+ dev_err(&pdev->dev,
+ "pci_set_consistent_dma_mask failed\n");
+ err = -EIO;
+ goto err_set_mask;
+ }
+ dma64 = true;
+ } else {
+ if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) {
+ dev_err(&pdev->dev,
+ "pci_set_dma_mask failed\n");
+ err = -EIO;
+ goto err_set_mask;
+ }
+ dma64 = false;
+ }
+
spin_lock_init(&adapter->cmd_lock);
adapter->adapter_pa = dma_map_single(&adapter->pdev->dev, adapter,
sizeof(struct vmxnet3_adapter),
@@ -3307,7 +3307,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
if (dma_mapping_error(&adapter->pdev->dev, adapter->adapter_pa)) {
dev_err(&pdev->dev, "Failed to map dma\n");
err = -EFAULT;
- goto err_dma_map;
+ goto err_set_mask;
}
adapter->shared = dma_alloc_coherent(
&adapter->pdev->dev,
@@ -3358,7 +3358,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
}
#endif /* VMXNET3_RSS */
- err = vmxnet3_alloc_pci_resources(adapter, &dma64);
+ err = vmxnet3_alloc_pci_resources(adapter);
if (err < 0)
goto err_alloc_pci;
@@ -3504,7 +3504,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
err_alloc_shared:
dma_unmap_single(&adapter->pdev->dev, adapter->adapter_pa,
sizeof(struct vmxnet3_adapter), PCI_DMA_TODEVICE);
-err_dma_map:
+err_set_mask:
free_netdev(netdev);
return err;
}
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index a332646..b379bed 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -69,10 +69,12 @@
/*
* Version numbers
*/
-#define VMXNET3_DRIVER_VERSION_STRING "1.4.14.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING "1.4.15.0-k"
-/* a 32-bit int, each byte encode a verion number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM 0x01040e00
+/* Each byte of this 32-bit integer encodes a version number in
+ * VMXNET3_DRIVER_VERSION_STRING.
+ */
+#define VMXNET3_DRIVER_VERSION_NUM 0x01040f00
#if defined(CONFIG_PCI_MSI)
/* RSS only makes sense if MSI-X is supported. */
--
2.7.4
^ permalink raw reply related
* [PATCH net 2/2] vmxnet3: use DMA memory barriers where required
From: hpreg @ 2018-05-14 12:14 UTC (permalink / raw)
To: netdev; +Cc: Regis Duchesne, Ronak Doshi, VMware, Inc., open list
The gen bits must be read first from (resp. written last to) DMA memory.
The proper way to enforce this on Linux is to call dma_rmb() (resp.
dma_wmb()).
Signed-off-by: Regis Duchesne <hpreg@vmware.com>
Acked-by: Ronak Doshi <doshir@vmware.com>
---
drivers/net/vmxnet3/vmxnet3_drv.c | 22 ++++++++++++++++++++++
drivers/net/vmxnet3/vmxnet3_int.h | 4 ++--
2 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index ed58685..27a9bb8 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -369,6 +369,11 @@ vmxnet3_tq_tx_complete(struct vmxnet3_tx_queue *tq,
gdesc = tq->comp_ring.base + tq->comp_ring.next2proc;
while (VMXNET3_TCD_GET_GEN(&gdesc->tcd) == tq->comp_ring.gen) {
+ /* Prevent any &gdesc->tcd field from being (speculatively)
+ * read before (&gdesc->tcd)->gen is read.
+ */
+ dma_rmb();
+
completed += vmxnet3_unmap_pkt(VMXNET3_TCD_GET_TXIDX(
&gdesc->tcd), tq, adapter->pdev,
adapter);
@@ -1103,6 +1108,11 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
gdesc->txd.tci = skb_vlan_tag_get(skb);
}
+ /* Ensure that the write to (&gdesc->txd)->gen will be observed after
+ * all other writes to &gdesc->txd.
+ */
+ dma_wmb();
+
/* finally flips the GEN bit of the SOP desc. */
gdesc->dword[2] = cpu_to_le32(le32_to_cpu(gdesc->dword[2]) ^
VMXNET3_TXD_GEN);
@@ -1298,6 +1308,12 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
*/
break;
}
+
+ /* Prevent any rcd field from being (speculatively) read before
+ * rcd->gen is read.
+ */
+ dma_rmb();
+
BUG_ON(rcd->rqID != rq->qid && rcd->rqID != rq->qid2 &&
rcd->rqID != rq->dataRingQid);
idx = rcd->rxdIdx;
@@ -1528,6 +1544,12 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
ring->next2comp = idx;
num_to_alloc = vmxnet3_cmd_ring_desc_avail(ring);
ring = rq->rx_ring + ring_idx;
+
+ /* Ensure that the writes to rxd->gen bits will be observed
+ * after all other writes to rxd objects.
+ */
+ dma_wmb();
+
while (num_to_alloc) {
vmxnet3_getRxDesc(rxd, &ring->base[ring->next2fill].rxd,
&rxCmdDesc);
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index b379bed..a2c554f 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -69,12 +69,12 @@
/*
* Version numbers
*/
-#define VMXNET3_DRIVER_VERSION_STRING "1.4.15.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING "1.4.16.0-k"
/* Each byte of this 32-bit integer encodes a version number in
* VMXNET3_DRIVER_VERSION_STRING.
*/
-#define VMXNET3_DRIVER_VERSION_NUM 0x01040f00
+#define VMXNET3_DRIVER_VERSION_NUM 0x01041000
#if defined(CONFIG_PCI_MSI)
/* RSS only makes sense if MSI-X is supported. */
--
2.7.4
^ permalink raw reply related
* Re: [kbuild-all] [PATCH net-next 2/2] sctp: add sctp_make_op_error_limited and reuse inner functions
From: Marcelo Ricardo Leitner @ 2018-05-14 12:16 UTC (permalink / raw)
To: Ye Xiaolong
Cc: kbuild test robot, Xin Long, Neil Horman, netdev, Vlad Yasevich,
linux-sctp, kbuild-all
In-Reply-To: <20180514114720.GA25280@yexl-desktop>
On Mon, May 14, 2018 at 07:47:20PM +0800, Ye Xiaolong wrote:
> On 05/14, Marcelo Ricardo Leitner wrote:
> >On Mon, May 14, 2018 at 03:40:53PM +0800, Ye Xiaolong wrote:
> >> >> config: x86_64-randconfig-x006-201817 (attached as .config)
> >> >> compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
> >> >> reproduce:
> >> >> # save the attached .config to linux build tree
> >> >> make ARCH=x86_64
> >> >>
> >> >> All errors (new ones prefixed by >>):
> >> >>
> >> >> net//sctp/sm_make_chunk.c: In function 'sctp_make_op_error_limited':
> >> >> >> net//sctp/sm_make_chunk.c:1260:9: error: implicit declaration of function 'sctp_mtu_payload'; did you mean 'sctp_do_peeloff'? [-Werror=implicit-function-declaration]
> >> >> size = sctp_mtu_payload(sp, size, sizeof(struct sctp_errhdr));
> >> >> ^~~~~~~~~~~~~~~~
> >> >> sctp_do_peeloff
> >> >> cc1: some warnings being treated as errors
> >> >
> >> >Seems the test didn't pick up the MTU refactor patchset yet.
> >>
> >> Do you mean your patchset require MTU refactor patchset as prerequisites?
> >
> >Yes.
>
> Then it is recommended to use '--base' option of git format-patch, it would record
> the base tree info in the first patch or cover letter, 0day bot would apply your
> patchset to right base according to it.
Nice. I wasn't aware of it. Thanks.
Considering that the MTU refactor patchset was already applied on
net-next when the bot did the test, why should I have to specify the
base?
Marcelo
^ permalink raw reply
* [PATCH net-next] cxgb4: add tc flower match support for tunnel VNI
From: Ganesh Goudar @ 2018-05-14 12:21 UTC (permalink / raw)
To: netdev, davem
Cc: nirranjan, indranil, venkatesh, Kumar Sanghvi, Ganesh Goudar
From: Kumar Sanghvi <kumaras@chelsio.com>
Adds support for matching flows based on tunnel VNI value.
Introduces fw APIs for allocating/removing MPS entries related
to encapsulation. And uses the same while adding/deleting filters
for offloading flows based on tunnel VNI match.
Signed-off-by: Kumar Sanghvi <kumaras@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 9 +++
drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 91 ++++++++++++++++++++--
.../net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 18 +++++
drivers/net/ethernet/chelsio/cxgb4/l2t.c | 2 +-
drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 86 ++++++++++++++++++++
drivers/net/ethernet/chelsio/cxgb4/t4_regs.h | 4 +
drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h | 34 ++++++++
7 files changed, 237 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index d2a7777..ce47daf 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -1045,6 +1045,7 @@ struct ch_sched_queue {
#define VF_BITWIDTH 8
#define IVLAN_BITWIDTH 16
#define OVLAN_BITWIDTH 16
+#define ENCAP_VNI_BITWIDTH 24
/* Filter matching rules. These consist of a set of ingress packet field
* (value, mask) tuples. The associated ingress packet field matches the
@@ -1075,6 +1076,7 @@ struct ch_filter_tuple {
uint32_t ivlan_vld:1; /* inner VLAN valid */
uint32_t ovlan_vld:1; /* outer VLAN valid */
uint32_t pfvf_vld:1; /* PF/VF valid */
+ uint32_t encap_vld:1; /* Encapsulation valid */
uint32_t macidx:MACIDX_BITWIDTH; /* exact match MAC index */
uint32_t fcoe:FCOE_BITWIDTH; /* FCoE packet */
uint32_t iport:IPORT_BITWIDTH; /* ingress port */
@@ -1085,6 +1087,7 @@ struct ch_filter_tuple {
uint32_t vf:VF_BITWIDTH; /* PCI-E VF ID */
uint32_t ivlan:IVLAN_BITWIDTH; /* inner VLAN */
uint32_t ovlan:OVLAN_BITWIDTH; /* outer VLAN */
+ uint32_t vni:ENCAP_VNI_BITWIDTH; /* VNI of tunnel */
/* Uncompressed header matching field rules. These are always
* available for field rules.
@@ -1703,6 +1706,12 @@ int t4_set_rxmode(struct adapter *adap, unsigned int mbox, unsigned int viid,
int t4_free_raw_mac_filt(struct adapter *adap, unsigned int viid,
const u8 *addr, const u8 *mask, unsigned int idx,
u8 lookup_type, u8 port_id, bool sleep_ok);
+int t4_free_encap_mac_filt(struct adapter *adap, unsigned int viid, int idx,
+ bool sleep_ok);
+int t4_alloc_encap_mac_filt(struct adapter *adap, unsigned int viid,
+ const u8 *addr, const u8 *mask, unsigned int vni,
+ unsigned int vni_mask, u8 dip_hit, u8 lookup_type,
+ bool sleep_ok);
int t4_alloc_raw_mac_filt(struct adapter *adap, unsigned int viid,
const u8 *addr, const u8 *mask, unsigned int idx,
u8 lookup_type, u8 port_id, bool sleep_ok);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
index aae9802..ac4d7f7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -265,6 +265,8 @@ static int validate_filter(struct net_device *dev,
fs->mask.pfvf_vld) ||
unsupported(fconf, VNIC_ID_F, fs->val.ovlan_vld,
fs->mask.ovlan_vld) ||
+ unsupported(fconf, VNIC_ID_F, fs->val.encap_vld,
+ fs->mask.encap_vld) ||
unsupported(fconf, VLAN_F, fs->val.ivlan_vld, fs->mask.ivlan_vld))
return -EOPNOTSUPP;
@@ -275,8 +277,12 @@ static int validate_filter(struct net_device *dev,
* carries that overlap, we need to translate any PF/VF
* specification into that internal format below.
*/
- if (is_field_set(fs->val.pfvf_vld, fs->mask.pfvf_vld) &&
- is_field_set(fs->val.ovlan_vld, fs->mask.ovlan_vld))
+ if ((is_field_set(fs->val.pfvf_vld, fs->mask.pfvf_vld) &&
+ is_field_set(fs->val.ovlan_vld, fs->mask.ovlan_vld)) ||
+ (is_field_set(fs->val.pfvf_vld, fs->mask.pfvf_vld) &&
+ is_field_set(fs->val.encap_vld, fs->mask.encap_vld)) ||
+ (is_field_set(fs->val.ovlan_vld, fs->mask.ovlan_vld) &&
+ is_field_set(fs->val.encap_vld, fs->mask.encap_vld)))
return -EOPNOTSUPP;
if (unsupported(iconf, VNIC_F, fs->val.pfvf_vld, fs->mask.pfvf_vld) ||
(is_field_set(fs->val.ovlan_vld, fs->mask.ovlan_vld) &&
@@ -306,6 +312,9 @@ static int validate_filter(struct net_device *dev,
fs->newvlan == VLAN_REWRITE))
return -EOPNOTSUPP;
+ if (fs->val.encap_vld &&
+ CHELSIO_CHIP_VERSION(adapter->params.chip) < CHELSIO_T6)
+ return -EOPNOTSUPP;
return 0;
}
@@ -705,6 +714,8 @@ int delete_filter(struct adapter *adapter, unsigned int fidx)
*/
void clear_filter(struct adapter *adap, struct filter_entry *f)
{
+ struct port_info *pi = netdev_priv(f->dev);
+
/* If the new or old filter have loopback rewriteing rules then we'll
* need to free any existing L2T, SMT, CLIP entries of filter
* rule.
@@ -715,6 +726,12 @@ void clear_filter(struct adapter *adap, struct filter_entry *f)
if (f->smt)
cxgb4_smt_release(f->smt);
+ if (f->fs.val.encap_vld && f->fs.val.ovlan_vld)
+ if (atomic_dec_and_test(&adap->mps_encap[f->fs.val.ovlan &
+ 0x1ff].refcnt))
+ t4_free_encap_mac_filt(adap, pi->viid,
+ f->fs.val.ovlan & 0x1ff, 0);
+
if ((f->fs.hash || is_t6(adap->params.chip)) && f->fs.type)
cxgb4_clip_release(f->dev, (const u32 *)&f->fs.val.lip, 1);
@@ -840,6 +857,10 @@ bool is_filter_exact_match(struct adapter *adap,
if (!is_hashfilter(adap))
return false;
+ /* Keep tunnel VNI match disabled for hash-filters for now */
+ if (fs->mask.encap_vld)
+ return false;
+
if (fs->type) {
if (is_inaddr_any(fs->val.fip, AF_INET6) ||
!is_addr_all_mask(fs->mask.fip, AF_INET6))
@@ -961,8 +982,12 @@ static u64 hash_filter_ntuple(struct ch_filter_specification *fs,
ntuple |= (u64)(fs->val.tos) << tp->tos_shift;
if (tp->vnic_shift >= 0) {
- if ((adap->params.tp.ingress_config & VNIC_F) &&
- fs->mask.pfvf_vld)
+ if ((adap->params.tp.ingress_config & USE_ENC_IDX_F) &&
+ fs->mask.encap_vld)
+ ntuple |= (u64)((fs->val.encap_vld << 16) |
+ (fs->val.ovlan)) << tp->vnic_shift;
+ else if ((adap->params.tp.ingress_config & VNIC_F) &&
+ fs->mask.pfvf_vld)
ntuple |= (u64)((fs->val.pfvf_vld << 16) |
(fs->val.pf << 13) |
(fs->val.vf)) << tp->vnic_shift;
@@ -1076,6 +1101,7 @@ static int cxgb4_set_hash_filter(struct net_device *dev,
struct filter_ctx *ctx)
{
struct adapter *adapter = netdev2adap(dev);
+ struct port_info *pi = netdev_priv(dev);
struct tid_info *t = &adapter->tids;
struct filter_entry *f;
struct sk_buff *skb;
@@ -1142,13 +1168,34 @@ static int cxgb4_set_hash_filter(struct net_device *dev,
f->fs.mask.ovlan = (fs->mask.pf << 13) | fs->mask.vf;
f->fs.val.ovlan_vld = fs->val.pfvf_vld;
f->fs.mask.ovlan_vld = fs->mask.pfvf_vld;
+ } else if (iconf & USE_ENC_IDX_F) {
+ if (f->fs.val.encap_vld) {
+ struct port_info *pi = netdev_priv(f->dev);
+ u8 match_all_mac[] = { 0, 0, 0, 0, 0, 0 };
+
+ /* allocate MPS TCAM entry */
+ ret = t4_alloc_encap_mac_filt(adapter, pi->viid,
+ match_all_mac,
+ match_all_mac,
+ f->fs.val.vni,
+ f->fs.mask.vni,
+ 0, 1, 1);
+ if (ret < 0)
+ goto free_atid;
+
+ atomic_inc(&adapter->mps_encap[ret].refcnt);
+ f->fs.val.ovlan = ret;
+ f->fs.mask.ovlan = 0xffff;
+ f->fs.val.ovlan_vld = 1;
+ f->fs.mask.ovlan_vld = 1;
+ }
}
size = sizeof(struct cpl_t6_act_open_req);
if (f->fs.type) {
ret = cxgb4_clip_get(f->dev, (const u32 *)&f->fs.val.lip, 1);
if (ret)
- goto free_atid;
+ goto free_mps;
skb = alloc_skb(size, GFP_KERNEL);
if (!skb) {
@@ -1163,7 +1210,7 @@ static int cxgb4_set_hash_filter(struct net_device *dev,
skb = alloc_skb(size, GFP_KERNEL);
if (!skb) {
ret = -ENOMEM;
- goto free_atid;
+ goto free_mps;
}
mk_act_open_req(f, skb,
@@ -1179,6 +1226,10 @@ static int cxgb4_set_hash_filter(struct net_device *dev,
free_clip:
cxgb4_clip_release(f->dev, (const u32 *)&f->fs.val.lip, 1);
+free_mps:
+ if (f->fs.val.encap_vld && f->fs.val.ovlan_vld)
+ t4_free_encap_mac_filt(adapter, pi->viid, f->fs.val.ovlan, 1);
+
free_atid:
cxgb4_free_atid(t, atid);
@@ -1360,6 +1411,27 @@ int __cxgb4_set_filter(struct net_device *dev, int filter_id,
f->fs.mask.ovlan = (fs->mask.pf << 13) | fs->mask.vf;
f->fs.val.ovlan_vld = fs->val.pfvf_vld;
f->fs.mask.ovlan_vld = fs->mask.pfvf_vld;
+ } else if (iconf & USE_ENC_IDX_F) {
+ if (f->fs.val.encap_vld) {
+ struct port_info *pi = netdev_priv(f->dev);
+ u8 match_all_mac[] = { 0, 0, 0, 0, 0, 0 };
+
+ /* allocate MPS TCAM entry */
+ ret = t4_alloc_encap_mac_filt(adapter, pi->viid,
+ match_all_mac,
+ match_all_mac,
+ f->fs.val.vni,
+ f->fs.mask.vni,
+ 0, 1, 1);
+ if (ret < 0)
+ goto free_clip;
+
+ atomic_inc(&adapter->mps_encap[ret].refcnt);
+ f->fs.val.ovlan = ret;
+ f->fs.mask.ovlan = 0x1ff;
+ f->fs.val.ovlan_vld = 1;
+ f->fs.mask.ovlan_vld = 1;
+ }
}
/* Attempt to set the filter. If we don't succeed, we clear
@@ -1376,6 +1448,13 @@ int __cxgb4_set_filter(struct net_device *dev, int filter_id,
}
return ret;
+
+free_clip:
+ if (is_t6(adapter->params.chip) && f->fs.type)
+ cxgb4_clip_release(f->dev, (const u32 *)&f->fs.val.lip, 1);
+ cxgb4_clear_ftid(&adapter->tids, filter_id,
+ fs->type ? PF_INET6 : PF_INET, chip_ver);
+ return ret;
}
static int cxgb4_del_hash_filter(struct net_device *dev, int filter_id,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
index 3656336..3ddd2c4 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c
@@ -194,6 +194,23 @@ static void cxgb4_process_flow_match(struct net_device *dev,
fs->mask.tos = mask->tos;
}
+ if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
+ struct flow_dissector_key_keyid *key, *mask;
+
+ key = skb_flow_dissector_target(cls->dissector,
+ FLOW_DISSECTOR_KEY_ENC_KEYID,
+ cls->key);
+ mask = skb_flow_dissector_target(cls->dissector,
+ FLOW_DISSECTOR_KEY_ENC_KEYID,
+ cls->mask);
+ fs->val.vni = be32_to_cpu(key->keyid);
+ fs->mask.vni = be32_to_cpu(mask->keyid);
+ if (fs->mask.vni) {
+ fs->val.encap_vld = 1;
+ fs->mask.encap_vld = 1;
+ }
+ }
+
if (dissector_uses_key(cls->dissector, FLOW_DISSECTOR_KEY_VLAN)) {
struct flow_dissector_key_vlan *key, *mask;
u16 vlan_tci, vlan_tci_mask;
@@ -247,6 +264,7 @@ static int cxgb4_validate_flow_match(struct net_device *dev,
BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) |
BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) |
BIT(FLOW_DISSECTOR_KEY_PORTS) |
+ BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) |
BIT(FLOW_DISSECTOR_KEY_VLAN) |
BIT(FLOW_DISSECTOR_KEY_IP))) {
netdev_warn(dev, "Unsupported key used: 0x%x\n",
diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c
index 1817a03..77c2c53 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/l2t.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c
@@ -491,7 +491,7 @@ u64 cxgb4_select_ntuple(struct net_device *dev,
if (tp->protocol_shift >= 0)
ntuple |= (u64)IPPROTO_TCP << tp->protocol_shift;
- if (tp->vnic_shift >= 0) {
+ if (tp->vnic_shift >= 0 && (tp->ingress_config & VNIC_F)) {
u32 viid = cxgb4_port_viid(dev);
u32 vf = FW_VIID_VIN_G(viid);
u32 pf = FW_VIID_PFN_G(viid);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 7cb3ef4..df5e7c7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -7513,6 +7513,43 @@ int t4_set_rxmode(struct adapter *adap, unsigned int mbox, unsigned int viid,
}
/**
+ * t4_free_encap_mac_filt - frees MPS entry at given index
+ * @adap: the adapter
+ * @viid: the VI id
+ * @idx: index of MPS entry to be freed
+ * @sleep_ok: call is allowed to sleep
+ *
+ * Frees the MPS entry at supplied index
+ *
+ * Returns a negative error number or zero on success
+ */
+int t4_free_encap_mac_filt(struct adapter *adap, unsigned int viid,
+ int idx, bool sleep_ok)
+{
+ struct fw_vi_mac_exact *p;
+ u8 addr[] = {0, 0, 0, 0, 0, 0};
+ struct fw_vi_mac_cmd c;
+ int ret = 0;
+ u32 exact;
+
+ memset(&c, 0, sizeof(c));
+ c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_VI_MAC_CMD) |
+ FW_CMD_REQUEST_F | FW_CMD_WRITE_F |
+ FW_CMD_EXEC_V(0) |
+ FW_VI_MAC_CMD_VIID_V(viid));
+ exact = FW_VI_MAC_CMD_ENTRY_TYPE_V(FW_VI_MAC_TYPE_EXACTMAC);
+ c.freemacs_to_len16 = cpu_to_be32(FW_VI_MAC_CMD_FREEMACS_V(0) |
+ exact |
+ FW_CMD_LEN16_V(1));
+ p = c.u.exact;
+ p->valid_to_idx = cpu_to_be16(FW_VI_MAC_CMD_VALID_F |
+ FW_VI_MAC_CMD_IDX_V(idx));
+ memcpy(p->macaddr, addr, sizeof(p->macaddr));
+ ret = t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, sleep_ok);
+ return ret;
+}
+
+/**
* t4_free_raw_mac_filt - Frees a raw mac entry in mps tcam
* @adap: the adapter
* @viid: the VI id
@@ -7563,6 +7600,55 @@ int t4_free_raw_mac_filt(struct adapter *adap, unsigned int viid,
}
/**
+ * t4_alloc_encap_mac_filt - Adds a mac entry in mps tcam with VNI support
+ * @adap: the adapter
+ * @viid: the VI id
+ * @mac: the MAC address
+ * @mask: the mask
+ * @vni: the VNI id for the tunnel protocol
+ * @vni_mask: mask for the VNI id
+ * @dip_hit: to enable DIP match for the MPS entry
+ * @lookup_type: MAC address for inner (1) or outer (0) header
+ * @sleep_ok: call is allowed to sleep
+ *
+ * Allocates an MPS entry with specified MAC address and VNI value.
+ *
+ * Returns a negative error number or the allocated index for this mac.
+ */
+int t4_alloc_encap_mac_filt(struct adapter *adap, unsigned int viid,
+ const u8 *addr, const u8 *mask, unsigned int vni,
+ unsigned int vni_mask, u8 dip_hit, u8 lookup_type,
+ bool sleep_ok)
+{
+ struct fw_vi_mac_cmd c;
+ struct fw_vi_mac_vni *p = c.u.exact_vni;
+ int ret = 0;
+ u32 val;
+
+ memset(&c, 0, sizeof(c));
+ c.op_to_viid = cpu_to_be32(FW_CMD_OP_V(FW_VI_MAC_CMD) |
+ FW_CMD_REQUEST_F | FW_CMD_WRITE_F |
+ FW_VI_MAC_CMD_VIID_V(viid));
+ val = FW_CMD_LEN16_V(1) |
+ FW_VI_MAC_CMD_ENTRY_TYPE_V(FW_VI_MAC_TYPE_EXACTMAC_VNI);
+ c.freemacs_to_len16 = cpu_to_be32(val);
+ p->valid_to_idx = cpu_to_be16(FW_VI_MAC_CMD_VALID_F |
+ FW_VI_MAC_CMD_IDX_V(FW_VI_MAC_ADD_MAC));
+ memcpy(p->macaddr, addr, sizeof(p->macaddr));
+ memcpy(p->macaddr_mask, mask, sizeof(p->macaddr_mask));
+
+ p->lookup_type_to_vni =
+ cpu_to_be32(FW_VI_MAC_CMD_VNI_V(vni) |
+ FW_VI_MAC_CMD_DIP_HIT_V(dip_hit) |
+ FW_VI_MAC_CMD_LOOKUP_TYPE_V(lookup_type));
+ p->vni_mask_pkd = cpu_to_be32(FW_VI_MAC_CMD_VNI_MASK_V(vni_mask));
+ ret = t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, sleep_ok);
+ if (ret == 0)
+ ret = FW_VI_MAC_CMD_IDX_G(be16_to_cpu(p->valid_to_idx));
+ return ret;
+}
+
+/**
* t4_alloc_raw_mac_filt - Adds a mac entry in mps tcam
* @adap: the adapter
* @viid: the VI id
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
index 276fdf2..843f8ca 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
@@ -1598,6 +1598,10 @@
#define VNIC_V(x) ((x) << VNIC_S)
#define VNIC_F VNIC_V(1U)
+#define USE_ENC_IDX_S 13
+#define USE_ENC_IDX_V(x) ((x) << USE_ENC_IDX_S)
+#define USE_ENC_IDX_F USE_ENC_IDX_V(1U)
+
#define CSUM_HAS_PSEUDO_HDR_S 10
#define CSUM_HAS_PSEUDO_HDR_V(x) ((x) << CSUM_HAS_PSEUDO_HDR_S)
#define CSUM_HAS_PSEUDO_HDR_F CSUM_HAS_PSEUDO_HDR_V(1U)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index 0e007ee..e6b2e95 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -2158,6 +2158,14 @@ struct fw_vi_mac_cmd {
__be64 data0m_pkd;
__be32 data1m[2];
} raw;
+ struct fw_vi_mac_vni {
+ __be16 valid_to_idx;
+ __u8 macaddr[6];
+ __be16 r7;
+ __u8 macaddr_mask[6];
+ __be32 lookup_type_to_vni;
+ __be32 vni_mask_pkd;
+ } exact_vni[2];
} u;
};
@@ -2205,6 +2213,32 @@ struct fw_vi_mac_cmd {
#define FW_VI_MAC_CMD_RAW_IDX_G(x) \
(((x) >> FW_VI_MAC_CMD_RAW_IDX_S) & FW_VI_MAC_CMD_RAW_IDX_M)
+#define FW_VI_MAC_CMD_LOOKUP_TYPE_S 31
+#define FW_VI_MAC_CMD_LOOKUP_TYPE_M 0x1
+#define FW_VI_MAC_CMD_LOOKUP_TYPE_V(x) ((x) << FW_VI_MAC_CMD_LOOKUP_TYPE_S)
+#define FW_VI_MAC_CMD_LOOKUP_TYPE_G(x) \
+ (((x) >> FW_VI_MAC_CMD_LOOKUP_TYPE_S) & FW_VI_MAC_CMD_LOOKUP_TYPE_M)
+#define FW_VI_MAC_CMD_LOOKUP_TYPE_F FW_VI_MAC_CMD_LOOKUP_TYPE_V(1U)
+
+#define FW_VI_MAC_CMD_DIP_HIT_S 30
+#define FW_VI_MAC_CMD_DIP_HIT_M 0x1
+#define FW_VI_MAC_CMD_DIP_HIT_V(x) ((x) << FW_VI_MAC_CMD_DIP_HIT_S)
+#define FW_VI_MAC_CMD_DIP_HIT_G(x) \
+ (((x) >> FW_VI_MAC_CMD_DIP_HIT_S) & FW_VI_MAC_CMD_DIP_HIT_M)
+#define FW_VI_MAC_CMD_DIP_HIT_F FW_VI_MAC_CMD_DIP_HIT_V(1U)
+
+#define FW_VI_MAC_CMD_VNI_S 0
+#define FW_VI_MAC_CMD_VNI_M 0xffffff
+#define FW_VI_MAC_CMD_VNI_V(x) ((x) << FW_VI_MAC_CMD_VNI_S)
+#define FW_VI_MAC_CMD_VNI_G(x) \
+ (((x) >> FW_VI_MAC_CMD_VNI_S) & FW_VI_MAC_CMD_VNI_M)
+
+#define FW_VI_MAC_CMD_VNI_MASK_S 0
+#define FW_VI_MAC_CMD_VNI_MASK_M 0xffffff
+#define FW_VI_MAC_CMD_VNI_MASK_V(x) ((x) << FW_VI_MAC_CMD_VNI_MASK_S)
+#define FW_VI_MAC_CMD_VNI_MASK_G(x) \
+ (((x) >> FW_VI_MAC_CMD_VNI_MASK_S) & FW_VI_MAC_CMD_VNI_MASK_M)
+
#define FW_RXMODE_MTU_NO_CHG 65535
struct fw_vi_rxmode_cmd {
--
2.1.0
^ permalink raw reply related
* [PATCH net 1/2] vmxnet3: set the DMA mask before the first DMA map operation
From: hpreg @ 2018-05-14 12:28 UTC (permalink / raw)
To: netdev; +Cc: Regis Duchesne, Ronak Doshi, VMware, Inc., open list
The DMA mask must be set before, not after, the first DMA map operation, or
the first DMA map operation could in theory fail on some systems.
Fixes: b0eb57cb97e78 ("VMXNET3: Add support for virtual IOMMU")
Signed-off-by: Regis Duchesne <hpreg@vmware.com>
Acked-by: Ronak Doshi <doshir@vmware.com>
---
drivers/net/vmxnet3/vmxnet3_drv.c | 50 +++++++++++++++++++--------------------
drivers/net/vmxnet3/vmxnet3_int.h | 8 ++++---
2 files changed, 30 insertions(+), 28 deletions(-)
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 9ebe2a6..ed58685 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -2688,7 +2688,7 @@ vmxnet3_set_mac_addr(struct net_device *netdev, void *p)
/* ==================== initialization and cleanup routines ============ */
static int
-vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
+vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter)
{
int err;
unsigned long mmio_start, mmio_len;
@@ -2700,30 +2700,12 @@ vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
return err;
}
- if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) {
- if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)) != 0) {
- dev_err(&pdev->dev,
- "pci_set_consistent_dma_mask failed\n");
- err = -EIO;
- goto err_set_mask;
- }
- *dma64 = true;
- } else {
- if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) {
- dev_err(&pdev->dev,
- "pci_set_dma_mask failed\n");
- err = -EIO;
- goto err_set_mask;
- }
- *dma64 = false;
- }
-
err = pci_request_selected_regions(pdev, (1 << 2) - 1,
vmxnet3_driver_name);
if (err) {
dev_err(&pdev->dev,
"Failed to request region for adapter: error %d\n", err);
- goto err_set_mask;
+ goto err_enable_device;
}
pci_set_master(pdev);
@@ -2751,7 +2733,7 @@ vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
iounmap(adapter->hw_addr0);
err_ioremap:
pci_release_selected_regions(pdev, (1 << 2) - 1);
-err_set_mask:
+err_enable_device:
pci_disable_device(pdev);
return err;
}
@@ -3254,7 +3236,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
#endif
};
int err;
- bool dma64 = false; /* stupid gcc */
+ bool dma64;
u32 ver;
struct net_device *netdev;
struct vmxnet3_adapter *adapter;
@@ -3300,6 +3282,24 @@ vmxnet3_probe_device(struct pci_dev *pdev,
adapter->rx_ring_size = VMXNET3_DEF_RX_RING_SIZE;
adapter->rx_ring2_size = VMXNET3_DEF_RX_RING2_SIZE;
+ if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) {
+ if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)) != 0) {
+ dev_err(&pdev->dev,
+ "pci_set_consistent_dma_mask failed\n");
+ err = -EIO;
+ goto err_set_mask;
+ }
+ dma64 = true;
+ } else {
+ if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) {
+ dev_err(&pdev->dev,
+ "pci_set_dma_mask failed\n");
+ err = -EIO;
+ goto err_set_mask;
+ }
+ dma64 = false;
+ }
+
spin_lock_init(&adapter->cmd_lock);
adapter->adapter_pa = dma_map_single(&adapter->pdev->dev, adapter,
sizeof(struct vmxnet3_adapter),
@@ -3307,7 +3307,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
if (dma_mapping_error(&adapter->pdev->dev, adapter->adapter_pa)) {
dev_err(&pdev->dev, "Failed to map dma\n");
err = -EFAULT;
- goto err_dma_map;
+ goto err_set_mask;
}
adapter->shared = dma_alloc_coherent(
&adapter->pdev->dev,
@@ -3358,7 +3358,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
}
#endif /* VMXNET3_RSS */
- err = vmxnet3_alloc_pci_resources(adapter, &dma64);
+ err = vmxnet3_alloc_pci_resources(adapter);
if (err < 0)
goto err_alloc_pci;
@@ -3504,7 +3504,7 @@ vmxnet3_probe_device(struct pci_dev *pdev,
err_alloc_shared:
dma_unmap_single(&adapter->pdev->dev, adapter->adapter_pa,
sizeof(struct vmxnet3_adapter), PCI_DMA_TODEVICE);
-err_dma_map:
+err_set_mask:
free_netdev(netdev);
return err;
}
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index a332646..b379bed 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -69,10 +69,12 @@
/*
* Version numbers
*/
-#define VMXNET3_DRIVER_VERSION_STRING "1.4.14.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING "1.4.15.0-k"
-/* a 32-bit int, each byte encode a verion number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM 0x01040e00
+/* Each byte of this 32-bit integer encodes a version number in
+ * VMXNET3_DRIVER_VERSION_STRING.
+ */
+#define VMXNET3_DRIVER_VERSION_NUM 0x01040f00
#if defined(CONFIG_PCI_MSI)
/* RSS only makes sense if MSI-X is supported. */
--
2.7.4
^ permalink raw reply related
* Re: [PATCH net-next v8 0/3] kernel: add support to collect hardware logs in crash recovery kernel
From: Eric W. Biederman @ 2018-05-14 13:11 UTC (permalink / raw)
To: David Miller
Cc: rahul.lakkireddy, netdev, kexec, linux-fsdevel, linux-kernel,
viro, stephen, akpm, torvalds, ganeshgr, nirranjan, indranil
In-Reply-To: <20180513.201131.1649805811466916646.davem@davemloft.net>
David Miller <davem@davemloft.net> writes:
> I'm deferring this patch series.
>
> If we can't get a reasonable review from an interested party in 10+
> days, that is not reasonable.
>
> Resubmit this once someone reviews it properly.
David I am out on vacation this week and last (the reason for the delay).
The last version of this that I looked at I gave my ack. All of my ABI
concerns had been addressed. The only outstanding change I believe was
the Eric Dumazet's asking about something being reviewed.
I just glanced over it again and I don't see any new issues introduced
by the last round of changes.
>From 10,000 feet flyover design perspectie and from an ABI perspective
this patchset seems fine.
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Eric
^ permalink raw reply
* Re: INFO: rcu detected stall in kfree_skbmem
From: Neil Horman @ 2018-05-14 13:34 UTC (permalink / raw)
To: Dmitry Vyukov
Cc: syzbot, Vladislav Yasevich, Marcelo Ricardo Leitner, linux-sctp,
Andrei Vagin, David Miller, Kirill Tkhai, LKML, netdev,
syzkaller-bugs
In-Reply-To: <CACT4Y+Z_+=VLbELVW69B7WEVqvri1xBMM5RMnMQynec_XgaE=w@mail.gmail.com>
On Fri, May 11, 2018 at 12:00:38PM +0200, Dmitry Vyukov wrote:
> On Mon, Apr 30, 2018 at 8:09 PM, syzbot
> <syzbot+fc78715ba3b3257caf6a@syzkaller.appspotmail.com> wrote:
> > Hello,
> >
> > syzbot found the following crash on:
> >
> > HEAD commit: 5d1365940a68 Merge
> > git://git.kernel.org/pub/scm/linux/kerne...
> > git tree: net-next
> > console output: https://syzkaller.appspot.com/x/log.txt?id=5667997129637888
> > kernel config:
> > https://syzkaller.appspot.com/x/.config?id=-5947642240294114534
> > dashboard link: https://syzkaller.appspot.com/bug?extid=fc78715ba3b3257caf6a
> > compiler: gcc (GCC) 8.0.1 20180413 (experimental)
> >
> > Unfortunately, I don't have any reproducer for this crash yet.
>
> This looks sctp-related, +sctp maintainers.
>
Looking at the entire trace, it appears that we are getting caught in the
kfree_skb that is getting triggered in enqueue_to_backlog which occurs when our
rx backlog list grows over netdev_max_backlog packets. That suggests to me that
whatever test(s) is/are causing this trace are queuing up a large number of
frames to be sent over the loopback interface, and are never/rarely getting
received. Looking up higher in the stack, in the sctp_generate_heartbeat_event
function, we (in addition to the rcu_read_lock in sctp_v6_xmit) we also hold the
socket lock during the entirety of the xmit operaion. Is it possible that we
are just enqueuing so many frames for xmit that we are blocking progress of
other threads using the same socket that we cross the RCU self detected stall
boundary? While its not a fix per se, it might be a worthwhile test to limit
the number of frames we flush in a single pass.
Neil
> > IMPORTANT: if you fix the bug, please add the following tag to the commit:
> > Reported-by: syzbot+fc78715ba3b3257caf6a@syzkaller.appspotmail.com
> >
> > INFO: rcu_sched self-detected stall on CPU
> > 1-...!: (1 GPs behind) idle=a3e/1/4611686018427387908
> > softirq=71980/71983 fqs=33
> > (t=125000 jiffies g=39438 c=39437 q=958)
> > rcu_sched kthread starved for 124829 jiffies! g39438 c39437 f0x0
> > RCU_GP_WAIT_FQS(3) ->state=0x0 ->cpu=0
> > RCU grace-period kthread stack dump:
> > rcu_sched R running task 23768 9 2 0x80000000
> > Call Trace:
> > context_switch kernel/sched/core.c:2848 [inline]
> > __schedule+0x801/0x1e30 kernel/sched/core.c:3490
> > schedule+0xef/0x430 kernel/sched/core.c:3549
> > schedule_timeout+0x138/0x240 kernel/time/timer.c:1801
> > rcu_gp_kthread+0x6b5/0x1940 kernel/rcu/tree.c:2231
> > kthread+0x345/0x410 kernel/kthread.c:238
> > ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:411
> > NMI backtrace for cpu 1
> > CPU: 1 PID: 20560 Comm: syz-executor4 Not tainted 4.16.0+ #1
> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> > Google 01/01/2011
> > Call Trace:
> > <IRQ>
> > __dump_stack lib/dump_stack.c:77 [inline]
> > dump_stack+0x1b9/0x294 lib/dump_stack.c:113
> > nmi_cpu_backtrace.cold.4+0x19/0xce lib/nmi_backtrace.c:103
> > nmi_trigger_cpumask_backtrace+0x151/0x192 lib/nmi_backtrace.c:62
> > arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38
> > trigger_single_cpu_backtrace include/linux/nmi.h:156 [inline]
> > rcu_dump_cpu_stacks+0x175/0x1c2 kernel/rcu/tree.c:1376
> > print_cpu_stall kernel/rcu/tree.c:1525 [inline]
> > check_cpu_stall.isra.61.cold.80+0x36c/0x59a kernel/rcu/tree.c:1593
> > __rcu_pending kernel/rcu/tree.c:3356 [inline]
> > rcu_pending kernel/rcu/tree.c:3401 [inline]
> > rcu_check_callbacks+0x21b/0xad0 kernel/rcu/tree.c:2763
> > update_process_times+0x2d/0x70 kernel/time/timer.c:1636
> > tick_sched_handle+0x9f/0x180 kernel/time/tick-sched.c:173
> > tick_sched_timer+0x45/0x130 kernel/time/tick-sched.c:1283
> > __run_hrtimer kernel/time/hrtimer.c:1386 [inline]
> > __hrtimer_run_queues+0x3e3/0x10a0 kernel/time/hrtimer.c:1448
> > hrtimer_interrupt+0x286/0x650 kernel/time/hrtimer.c:1506
> > local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1025 [inline]
> > smp_apic_timer_interrupt+0x15d/0x710 arch/x86/kernel/apic/apic.c:1050
> > apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:862
> > RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783
> > [inline]
> > RIP: 0010:kmem_cache_free+0xb3/0x2d0 mm/slab.c:3757
> > RSP: 0018:ffff8801db105228 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
> > RAX: 0000000000000007 RBX: ffff8800b055c940 RCX: 1ffff1003b2345a5
> > RDX: 0000000000000000 RSI: ffff8801d91a2d80 RDI: 0000000000000282
> > RBP: ffff8801db105248 R08: ffff8801d91a2cb8 R09: 0000000000000002
> > R10: ffff8801d91a2480 R11: 0000000000000000 R12: ffff8801d9848e40
> > R13: 0000000000000282 R14: ffffffff85b7f27c R15: 0000000000000000
> > kfree_skbmem+0x13c/0x210 net/core/skbuff.c:582
> > __kfree_skb net/core/skbuff.c:642 [inline]
> > kfree_skb+0x19d/0x560 net/core/skbuff.c:659
> > enqueue_to_backlog+0x2fc/0xc90 net/core/dev.c:3968
> > netif_rx_internal+0x14d/0xae0 net/core/dev.c:4181
> > netif_rx+0xba/0x400 net/core/dev.c:4206
> > loopback_xmit+0x283/0x741 drivers/net/loopback.c:91
> > __netdev_start_xmit include/linux/netdevice.h:4087 [inline]
> > netdev_start_xmit include/linux/netdevice.h:4096 [inline]
> > xmit_one net/core/dev.c:3053 [inline]
> > dev_hard_start_xmit+0x264/0xc10 net/core/dev.c:3069
> > __dev_queue_xmit+0x2724/0x34c0 net/core/dev.c:3584
> > dev_queue_xmit+0x17/0x20 net/core/dev.c:3617
> > neigh_hh_output include/net/neighbour.h:472 [inline]
> > neigh_output include/net/neighbour.h:480 [inline]
> > ip6_finish_output2+0x134e/0x2810 net/ipv6/ip6_output.c:120
> > ip6_finish_output+0x5fe/0xbc0 net/ipv6/ip6_output.c:154
> > NF_HOOK_COND include/linux/netfilter.h:277 [inline]
> > ip6_output+0x227/0x9b0 net/ipv6/ip6_output.c:171
> > dst_output include/net/dst.h:444 [inline]
> > NF_HOOK include/linux/netfilter.h:288 [inline]
> > ip6_xmit+0xf51/0x23f0 net/ipv6/ip6_output.c:277
> > sctp_v6_xmit+0x4a5/0x6b0 net/sctp/ipv6.c:225
> > sctp_packet_transmit+0x26f6/0x3ba0 net/sctp/output.c:650
> > sctp_outq_flush+0x1373/0x4370 net/sctp/outqueue.c:1197
> > sctp_outq_uncork+0x6a/0x80 net/sctp/outqueue.c:776
> > sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
> > sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
> > sctp_do_sm+0x596/0x7160 net/sctp/sm_sideeffect.c:1191
> > sctp_generate_heartbeat_event+0x218/0x450 net/sctp/sm_sideeffect.c:406
> > call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
> > expire_timers kernel/time/timer.c:1363 [inline]
> > __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
> > run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
> > __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
> > invoke_softirq kernel/softirq.c:365 [inline]
> > irq_exit+0x1d1/0x200 kernel/softirq.c:405
> > exiting_irq arch/x86/include/asm/apic.h:525 [inline]
> > smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
> > apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:862
> > </IRQ>
> > RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783
> > [inline]
> > RIP: 0010:lock_release+0x4d4/0xa10 kernel/locking/lockdep.c:3942
> > RSP: 0018:ffff8801971ce7b0 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
> > RAX: dffffc0000000000 RBX: 1ffff10032e39cfb RCX: 1ffff1003b234595
> > RDX: 1ffffffff11630ed RSI: 0000000000000002 RDI: 0000000000000282
> > RBP: ffff8801971ce8e0 R08: 1ffff10032e39cff R09: ffffed003b6246c2
> > R10: 0000000000000003 R11: 0000000000000001 R12: ffff8801d91a2480
> > R13: ffffffff88b8df60 R14: ffff8801d91a2480 R15: ffff8801971ce7f8
> > rcu_lock_release include/linux/rcupdate.h:251 [inline]
> > rcu_read_unlock include/linux/rcupdate.h:688 [inline]
> > __unlock_page_memcg+0x72/0x100 mm/memcontrol.c:1654
> > unlock_page_memcg+0x2c/0x40 mm/memcontrol.c:1663
> > page_remove_file_rmap mm/rmap.c:1248 [inline]
> > page_remove_rmap+0x6f2/0x1250 mm/rmap.c:1299
> > zap_pte_range mm/memory.c:1337 [inline]
> > zap_pmd_range mm/memory.c:1441 [inline]
> > zap_pud_range mm/memory.c:1470 [inline]
> > zap_p4d_range mm/memory.c:1491 [inline]
> > unmap_page_range+0xeb4/0x2200 mm/memory.c:1512
> > unmap_single_vma+0x1a0/0x310 mm/memory.c:1557
> > unmap_vmas+0x120/0x1f0 mm/memory.c:1587
> > exit_mmap+0x265/0x570 mm/mmap.c:3038
> > __mmput kernel/fork.c:962 [inline]
> > mmput+0x251/0x610 kernel/fork.c:983
> > exit_mm kernel/exit.c:544 [inline]
> > do_exit+0xe98/0x2730 kernel/exit.c:852
> > do_group_exit+0x16f/0x430 kernel/exit.c:968
> > get_signal+0x886/0x1960 kernel/signal.c:2469
> > do_signal+0x98/0x2040 arch/x86/kernel/signal.c:810
> > exit_to_usermode_loop+0x28a/0x310 arch/x86/entry/common.c:162
> > prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
> > syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
> > do_syscall_64+0x792/0x9d0 arch/x86/entry/common.c:292
> > entry_SYSCALL_64_after_hwframe+0x42/0xb7
> > RIP: 0033:0x455319
> > RSP: 002b:00007fa346e81ce8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
> > RAX: fffffffffffffe00 RBX: 000000000072bf80 RCX: 0000000000455319
> > RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000072bf80
> > RBP: 000000000072bf80 R08: 0000000000000000 R09: 000000000072bf58
> > R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
> > R13: 0000000000a3e81f R14: 00007fa346e829c0 R15: 0000000000000001
> >
> >
> > ---
> > This bug is generated by a bot. It may contain errors.
> > See https://goo.gl/tpsmEJ for more information about syzbot.
> > syzbot engineers can be reached at syzkaller@googlegroups.com.
> >
> > syzbot will keep track of this bug report.
> > If you forgot to add the Reported-by tag, once the fix for this bug is
> > merged
> > into any tree, please reply to this email with:
> > #syz fix: exact-commit-title
> > To mark this as a duplicate of another syzbot report, please reply with:
> > #syz dup: exact-subject-of-another-report
> > If it's a one-off invalid bug report, please reply with:
> > #syz invalid
> > Note: if the crash happens again, it will cause creation of a new bug
> > report.
> > Note: all commands must start from beginning of the line in the email body.
> >
> > --
> > You received this message because you are subscribed to the Google Groups
> > "syzkaller-bugs" group.
> > To unsubscribe from this group and stop receiving emails from it, send an
> > email to syzkaller-bugs+unsubscribe@googlegroups.com.
> > To view this discussion on the web visit
> > https://groups.google.com/d/msgid/syzkaller-bugs/000000000000a9b0e3056b14bfb2%40google.com.
> > For more options, visit https://groups.google.com/d/optout.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply
* Re: [PATCH v2] Revert "alx: remove WoL support"
From: David Miller @ 2018-05-14 13:35 UTC (permalink / raw)
To: acelan.kao
Cc: jcliburn, chris.snook, rakesh, netdev, emily.chien, andrew,
linux-kernel
In-Reply-To: <20180514032839.32079-1-acelan.kao@canonical.com>
From: AceLan Kao <acelan.kao@canonical.com>
Date: Mon, 14 May 2018 11:28:39 +0800
> This reverts commit bc2bebe8de8ed4ba6482c9cc370b0dd72ffe8cd2.
>
> The WoL feature is a must to pass Energy Star 6.1 and above,
> the power consumption will be measured during S3 with WoL is enabled.
>
> Reverting "alx: remove WoL support", and will try to fix the unintentional
> wake up issue when WoL is enabled.
>
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=61651
>
> Signed-off-by: AceLan Kao <acelan.kao@canonical.com>
First, we must fix the problem that caused WoL to be disabled.
Then, and only then, can you re-enable it.
Thank you.
^ permalink raw reply
* [bpf-next PATCH 0/5] bpf, doc: convert Documentation/bpf to RST-formatting
From: Jesper Dangaard Brouer @ 2018-05-14 13:42 UTC (permalink / raw)
To: Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer
Cc: netdev, Quentin Monnet, linux-man, linux-doc
The kernel is moving files under Documentation to use the RST
(reStructuredText) format and Sphinx [1]. This patchset converts the
files under Documentation/bpf/ into RST format. The Sphinx
integration is left as followup work.
[1] https://www.kernel.org/doc/html/latest/doc-guide/sphinx.html
This patchset have been uploaded as branch bpf_doc10 on github[2], so
reviewers can see how GitHub renders this.
[2] https://github.com/netoptimizer/linux/tree/bpf_doc10/Documentation/bpf
---
Jesper Dangaard Brouer (5):
bpf, doc: add basic README.rst file
bpf, doc: rename txt files to rst files
bpf, doc: convert bpf_design_QA.rst to use RST formatting
bpf, doc: convert bpf_devel_QA.rst to use RST formatting
bpf, doc: howto use/run the BPF selftests
Documentation/bpf/README.rst | 36 ++
Documentation/bpf/bpf_design_QA.rst | 221 ++++++++++++
Documentation/bpf/bpf_design_QA.txt | 156 ---------
Documentation/bpf/bpf_devel_QA.rst | 640 +++++++++++++++++++++++++++++++++++
Documentation/bpf/bpf_devel_QA.txt | 570 -------------------------------
5 files changed, 897 insertions(+), 726 deletions(-)
create mode 100644 Documentation/bpf/README.rst
create mode 100644 Documentation/bpf/bpf_design_QA.rst
delete mode 100644 Documentation/bpf/bpf_design_QA.txt
create mode 100644 Documentation/bpf/bpf_devel_QA.rst
delete mode 100644 Documentation/bpf/bpf_devel_QA.txt
^ permalink raw reply
* [bpf-next PATCH 1/5] bpf, doc: add basic README.rst file
From: Jesper Dangaard Brouer @ 2018-05-14 13:42 UTC (permalink / raw)
To: Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer
Cc: netdev, Quentin Monnet, linux-man, linux-doc
In-Reply-To: <152630528901.29210.9018565600000101307.stgit@firesoul>
A README.rst file in a directory have special meaning for sites like
github, which auto renders the contents. Plus search engines like
Google also index these README.rst files.
Auto rendering allow us to use links, for (re)directing eBPF users to
other places where docs live. The end-goal would be to direct users
towards https://www.kernel.org/doc/html/latest but we haven't written
the full docs yet, so we start out small and take this incrementally.
This directory itself contains some useful docs, which can be linked
to from the README.rst file (verified this works for github).
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
Documentation/bpf/README.rst | 36 ++++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)
create mode 100644 Documentation/bpf/README.rst
diff --git a/Documentation/bpf/README.rst b/Documentation/bpf/README.rst
new file mode 100644
index 000000000000..329469c33db8
--- /dev/null
+++ b/Documentation/bpf/README.rst
@@ -0,0 +1,36 @@
+=================
+BPF documentation
+=================
+
+This directory contains documentation for the BPF (Berkeley Packet
+Filter) facility, with a focus on the extended BPF version (eBPF).
+
+This kernel side documentation is still work in progress. The main
+textual documentation is (for historical reasons) described in
+`Documentation/networking/filter.txt`_, which describe both classical
+and extended BPF instruction-set.
+The Cilium project also maintains a `BPF and XDP Reference Guide`_
+that goes into great technical depth about the BPF Architecture.
+
+The primary info for the bpf syscall is available in the `man-pages`_
+for `bpf(2)`_.
+
+
+
+Frequently asked questions (FAQ)
+================================
+
+Two sets of Questions and Answers (Q&A) are maintained.
+
+* QA for common questions about BPF see: bpf_design_QA_
+
+* QA for developers interacting with BPF subsystem: bpf_devel_QA_
+
+
+.. Links:
+.. _bpf_design_QA: bpf_design_QA.txt
+.. _bpf_devel_QA: bpf_devel_QA.txt
+.. _Documentation/networking/filter.txt: ../networking/filter.txt
+.. _man-pages: https://www.kernel.org/doc/man-pages/
+.. _bpf(2): http://man7.org/linux/man-pages/man2/bpf.2.html
+.. _BPF and XDP Reference Guide: http://cilium.readthedocs.io/en/latest/bpf/
^ permalink raw reply related
* [bpf-next PATCH 2/5] bpf, doc: rename txt files to rst files
From: Jesper Dangaard Brouer @ 2018-05-14 13:42 UTC (permalink / raw)
To: Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer
Cc: netdev, Quentin Monnet, linux-man, linux-doc
In-Reply-To: <152630528901.29210.9018565600000101307.stgit@firesoul>
This will cause them to get auto rendered, e.g. when viewing them on GitHub.
Followup patches will correct the content to be RST compliant.
Also adjust README.rst to point to the renamed files.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
Documentation/bpf/README.rst | 4
Documentation/bpf/bpf_design_QA.rst | 156 ++++++++++
Documentation/bpf/bpf_design_QA.txt | 156 ----------
Documentation/bpf/bpf_devel_QA.rst | 570 +++++++++++++++++++++++++++++++++++
Documentation/bpf/bpf_devel_QA.txt | 570 -----------------------------------
5 files changed, 728 insertions(+), 728 deletions(-)
create mode 100644 Documentation/bpf/bpf_design_QA.rst
delete mode 100644 Documentation/bpf/bpf_design_QA.txt
create mode 100644 Documentation/bpf/bpf_devel_QA.rst
delete mode 100644 Documentation/bpf/bpf_devel_QA.txt
diff --git a/Documentation/bpf/README.rst b/Documentation/bpf/README.rst
index 329469c33db8..b9a80c9e9392 100644
--- a/Documentation/bpf/README.rst
+++ b/Documentation/bpf/README.rst
@@ -28,8 +28,8 @@ Two sets of Questions and Answers (Q&A) are maintained.
.. Links:
-.. _bpf_design_QA: bpf_design_QA.txt
-.. _bpf_devel_QA: bpf_devel_QA.txt
+.. _bpf_design_QA: bpf_design_QA.rst
+.. _bpf_devel_QA: bpf_devel_QA.rst
.. _Documentation/networking/filter.txt: ../networking/filter.txt
.. _man-pages: https://www.kernel.org/doc/man-pages/
.. _bpf(2): http://man7.org/linux/man-pages/man2/bpf.2.html
diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst
new file mode 100644
index 000000000000..f3e458a0bb2f
--- /dev/null
+++ b/Documentation/bpf/bpf_design_QA.rst
@@ -0,0 +1,156 @@
+BPF extensibility and applicability to networking, tracing, security
+in the linux kernel and several user space implementations of BPF
+virtual machine led to a number of misunderstanding on what BPF actually is.
+This short QA is an attempt to address that and outline a direction
+of where BPF is heading long term.
+
+Q: Is BPF a generic instruction set similar to x64 and arm64?
+A: NO.
+
+Q: Is BPF a generic virtual machine ?
+A: NO.
+
+BPF is generic instruction set _with_ C calling convention.
+
+Q: Why C calling convention was chosen?
+A: Because BPF programs are designed to run in the linux kernel
+ which is written in C, hence BPF defines instruction set compatible
+ with two most used architectures x64 and arm64 (and takes into
+ consideration important quirks of other architectures) and
+ defines calling convention that is compatible with C calling
+ convention of the linux kernel on those architectures.
+
+Q: can multiple return values be supported in the future?
+A: NO. BPF allows only register R0 to be used as return value.
+
+Q: can more than 5 function arguments be supported in the future?
+A: NO. BPF calling convention only allows registers R1-R5 to be used
+ as arguments. BPF is not a standalone instruction set.
+ (unlike x64 ISA that allows msft, cdecl and other conventions)
+
+Q: can BPF programs access instruction pointer or return address?
+A: NO.
+
+Q: can BPF programs access stack pointer ?
+A: NO. Only frame pointer (register R10) is accessible.
+ From compiler point of view it's necessary to have stack pointer.
+ For example LLVM defines register R11 as stack pointer in its
+ BPF backend, but it makes sure that generated code never uses it.
+
+Q: Does C-calling convention diminishes possible use cases?
+A: YES. BPF design forces addition of major functionality in the form
+ of kernel helper functions and kernel objects like BPF maps with
+ seamless interoperability between them. It lets kernel call into
+ BPF programs and programs call kernel helpers with zero overhead.
+ As all of them were native C code. That is particularly the case
+ for JITed BPF programs that are indistinguishable from
+ native kernel C code.
+
+Q: Does it mean that 'innovative' extensions to BPF code are disallowed?
+A: Soft yes. At least for now until BPF core has support for
+ bpf-to-bpf calls, indirect calls, loops, global variables,
+ jump tables, read only sections and all other normal constructs
+ that C code can produce.
+
+Q: Can loops be supported in a safe way?
+A: It's not clear yet. BPF developers are trying to find a way to
+ support bounded loops where the verifier can guarantee that
+ the program terminates in less than 4096 instructions.
+
+Q: How come LD_ABS and LD_IND instruction are present in BPF whereas
+ C code cannot express them and has to use builtin intrinsics?
+A: This is artifact of compatibility with classic BPF. Modern
+ networking code in BPF performs better without them.
+ See 'direct packet access'.
+
+Q: It seems not all BPF instructions are one-to-one to native CPU.
+ For example why BPF_JNE and other compare and jumps are not cpu-like?
+A: This was necessary to avoid introducing flags into ISA which are
+ impossible to make generic and efficient across CPU architectures.
+
+Q: why BPF_DIV instruction doesn't map to x64 div?
+A: Because if we picked one-to-one relationship to x64 it would have made
+ it more complicated to support on arm64 and other archs. Also it
+ needs div-by-zero runtime check.
+
+Q: why there is no BPF_SDIV for signed divide operation?
+A: Because it would be rarely used. llvm errors in such case and
+ prints a suggestion to use unsigned divide instead
+
+Q: Why BPF has implicit prologue and epilogue?
+A: Because architectures like sparc have register windows and in general
+ there are enough subtle differences between architectures, so naive
+ store return address into stack won't work. Another reason is BPF has
+ to be safe from division by zero (and legacy exception path
+ of LD_ABS insn). Those instructions need to invoke epilogue and
+ return implicitly.
+
+Q: Why BPF_JLT and BPF_JLE instructions were not introduced in the beginning?
+A: Because classic BPF didn't have them and BPF authors felt that compiler
+ workaround would be acceptable. Turned out that programs lose performance
+ due to lack of these compare instructions and they were added.
+ These two instructions is a perfect example what kind of new BPF
+ instructions are acceptable and can be added in the future.
+ These two already had equivalent instructions in native CPUs.
+ New instructions that don't have one-to-one mapping to HW instructions
+ will not be accepted.
+
+Q: BPF 32-bit subregisters have a requirement to zero upper 32-bits of BPF
+ registers which makes BPF inefficient virtual machine for 32-bit
+ CPU architectures and 32-bit HW accelerators. Can true 32-bit registers
+ be added to BPF in the future?
+A: NO. The first thing to improve performance on 32-bit archs is to teach
+ LLVM to generate code that uses 32-bit subregisters. Then second step
+ is to teach verifier to mark operations where zero-ing upper bits
+ is unnecessary. Then JITs can take advantage of those markings and
+ drastically reduce size of generated code and improve performance.
+
+Q: Does BPF have a stable ABI?
+A: YES. BPF instructions, arguments to BPF programs, set of helper
+ functions and their arguments, recognized return codes are all part
+ of ABI. However when tracing programs are using bpf_probe_read() helper
+ to walk kernel internal datastructures and compile with kernel
+ internal headers these accesses can and will break with newer
+ kernels. The union bpf_attr -> kern_version is checked at load time
+ to prevent accidentally loading kprobe-based bpf programs written
+ for a different kernel. Networking programs don't do kern_version check.
+
+Q: How much stack space a BPF program uses?
+A: Currently all program types are limited to 512 bytes of stack
+ space, but the verifier computes the actual amount of stack used
+ and both interpreter and most JITed code consume necessary amount.
+
+Q: Can BPF be offloaded to HW?
+A: YES. BPF HW offload is supported by NFP driver.
+
+Q: Does classic BPF interpreter still exist?
+A: NO. Classic BPF programs are converted into extend BPF instructions.
+
+Q: Can BPF call arbitrary kernel functions?
+A: NO. BPF programs can only call a set of helper functions which
+ is defined for every program type.
+
+Q: Can BPF overwrite arbitrary kernel memory?
+A: NO. Tracing bpf programs can _read_ arbitrary memory with bpf_probe_read()
+ and bpf_probe_read_str() helpers. Networking programs cannot read
+ arbitrary memory, since they don't have access to these helpers.
+ Programs can never read or write arbitrary memory directly.
+
+Q: Can BPF overwrite arbitrary user memory?
+A: Sort-of. Tracing BPF programs can overwrite the user memory
+ of the current task with bpf_probe_write_user(). Every time such
+ program is loaded the kernel will print warning message, so
+ this helper is only useful for experiments and prototypes.
+ Tracing BPF programs are root only.
+
+Q: When bpf_trace_printk() helper is used the kernel prints nasty
+ warning message. Why is that?
+A: This is done to nudge program authors into better interfaces when
+ programs need to pass data to user space. Like bpf_perf_event_output()
+ can be used to efficiently stream data via perf ring buffer.
+ BPF maps can be used for asynchronous data sharing between kernel
+ and user space. bpf_trace_printk() should only be used for debugging.
+
+Q: Can BPF functionality such as new program or map types, new
+ helpers, etc be added out of kernel module code?
+A: NO.
diff --git a/Documentation/bpf/bpf_design_QA.txt b/Documentation/bpf/bpf_design_QA.txt
deleted file mode 100644
index f3e458a0bb2f..000000000000
--- a/Documentation/bpf/bpf_design_QA.txt
+++ /dev/null
@@ -1,156 +0,0 @@
-BPF extensibility and applicability to networking, tracing, security
-in the linux kernel and several user space implementations of BPF
-virtual machine led to a number of misunderstanding on what BPF actually is.
-This short QA is an attempt to address that and outline a direction
-of where BPF is heading long term.
-
-Q: Is BPF a generic instruction set similar to x64 and arm64?
-A: NO.
-
-Q: Is BPF a generic virtual machine ?
-A: NO.
-
-BPF is generic instruction set _with_ C calling convention.
-
-Q: Why C calling convention was chosen?
-A: Because BPF programs are designed to run in the linux kernel
- which is written in C, hence BPF defines instruction set compatible
- with two most used architectures x64 and arm64 (and takes into
- consideration important quirks of other architectures) and
- defines calling convention that is compatible with C calling
- convention of the linux kernel on those architectures.
-
-Q: can multiple return values be supported in the future?
-A: NO. BPF allows only register R0 to be used as return value.
-
-Q: can more than 5 function arguments be supported in the future?
-A: NO. BPF calling convention only allows registers R1-R5 to be used
- as arguments. BPF is not a standalone instruction set.
- (unlike x64 ISA that allows msft, cdecl and other conventions)
-
-Q: can BPF programs access instruction pointer or return address?
-A: NO.
-
-Q: can BPF programs access stack pointer ?
-A: NO. Only frame pointer (register R10) is accessible.
- From compiler point of view it's necessary to have stack pointer.
- For example LLVM defines register R11 as stack pointer in its
- BPF backend, but it makes sure that generated code never uses it.
-
-Q: Does C-calling convention diminishes possible use cases?
-A: YES. BPF design forces addition of major functionality in the form
- of kernel helper functions and kernel objects like BPF maps with
- seamless interoperability between them. It lets kernel call into
- BPF programs and programs call kernel helpers with zero overhead.
- As all of them were native C code. That is particularly the case
- for JITed BPF programs that are indistinguishable from
- native kernel C code.
-
-Q: Does it mean that 'innovative' extensions to BPF code are disallowed?
-A: Soft yes. At least for now until BPF core has support for
- bpf-to-bpf calls, indirect calls, loops, global variables,
- jump tables, read only sections and all other normal constructs
- that C code can produce.
-
-Q: Can loops be supported in a safe way?
-A: It's not clear yet. BPF developers are trying to find a way to
- support bounded loops where the verifier can guarantee that
- the program terminates in less than 4096 instructions.
-
-Q: How come LD_ABS and LD_IND instruction are present in BPF whereas
- C code cannot express them and has to use builtin intrinsics?
-A: This is artifact of compatibility with classic BPF. Modern
- networking code in BPF performs better without them.
- See 'direct packet access'.
-
-Q: It seems not all BPF instructions are one-to-one to native CPU.
- For example why BPF_JNE and other compare and jumps are not cpu-like?
-A: This was necessary to avoid introducing flags into ISA which are
- impossible to make generic and efficient across CPU architectures.
-
-Q: why BPF_DIV instruction doesn't map to x64 div?
-A: Because if we picked one-to-one relationship to x64 it would have made
- it more complicated to support on arm64 and other archs. Also it
- needs div-by-zero runtime check.
-
-Q: why there is no BPF_SDIV for signed divide operation?
-A: Because it would be rarely used. llvm errors in such case and
- prints a suggestion to use unsigned divide instead
-
-Q: Why BPF has implicit prologue and epilogue?
-A: Because architectures like sparc have register windows and in general
- there are enough subtle differences between architectures, so naive
- store return address into stack won't work. Another reason is BPF has
- to be safe from division by zero (and legacy exception path
- of LD_ABS insn). Those instructions need to invoke epilogue and
- return implicitly.
-
-Q: Why BPF_JLT and BPF_JLE instructions were not introduced in the beginning?
-A: Because classic BPF didn't have them and BPF authors felt that compiler
- workaround would be acceptable. Turned out that programs lose performance
- due to lack of these compare instructions and they were added.
- These two instructions is a perfect example what kind of new BPF
- instructions are acceptable and can be added in the future.
- These two already had equivalent instructions in native CPUs.
- New instructions that don't have one-to-one mapping to HW instructions
- will not be accepted.
-
-Q: BPF 32-bit subregisters have a requirement to zero upper 32-bits of BPF
- registers which makes BPF inefficient virtual machine for 32-bit
- CPU architectures and 32-bit HW accelerators. Can true 32-bit registers
- be added to BPF in the future?
-A: NO. The first thing to improve performance on 32-bit archs is to teach
- LLVM to generate code that uses 32-bit subregisters. Then second step
- is to teach verifier to mark operations where zero-ing upper bits
- is unnecessary. Then JITs can take advantage of those markings and
- drastically reduce size of generated code and improve performance.
-
-Q: Does BPF have a stable ABI?
-A: YES. BPF instructions, arguments to BPF programs, set of helper
- functions and their arguments, recognized return codes are all part
- of ABI. However when tracing programs are using bpf_probe_read() helper
- to walk kernel internal datastructures and compile with kernel
- internal headers these accesses can and will break with newer
- kernels. The union bpf_attr -> kern_version is checked at load time
- to prevent accidentally loading kprobe-based bpf programs written
- for a different kernel. Networking programs don't do kern_version check.
-
-Q: How much stack space a BPF program uses?
-A: Currently all program types are limited to 512 bytes of stack
- space, but the verifier computes the actual amount of stack used
- and both interpreter and most JITed code consume necessary amount.
-
-Q: Can BPF be offloaded to HW?
-A: YES. BPF HW offload is supported by NFP driver.
-
-Q: Does classic BPF interpreter still exist?
-A: NO. Classic BPF programs are converted into extend BPF instructions.
-
-Q: Can BPF call arbitrary kernel functions?
-A: NO. BPF programs can only call a set of helper functions which
- is defined for every program type.
-
-Q: Can BPF overwrite arbitrary kernel memory?
-A: NO. Tracing bpf programs can _read_ arbitrary memory with bpf_probe_read()
- and bpf_probe_read_str() helpers. Networking programs cannot read
- arbitrary memory, since they don't have access to these helpers.
- Programs can never read or write arbitrary memory directly.
-
-Q: Can BPF overwrite arbitrary user memory?
-A: Sort-of. Tracing BPF programs can overwrite the user memory
- of the current task with bpf_probe_write_user(). Every time such
- program is loaded the kernel will print warning message, so
- this helper is only useful for experiments and prototypes.
- Tracing BPF programs are root only.
-
-Q: When bpf_trace_printk() helper is used the kernel prints nasty
- warning message. Why is that?
-A: This is done to nudge program authors into better interfaces when
- programs need to pass data to user space. Like bpf_perf_event_output()
- can be used to efficiently stream data via perf ring buffer.
- BPF maps can be used for asynchronous data sharing between kernel
- and user space. bpf_trace_printk() should only be used for debugging.
-
-Q: Can BPF functionality such as new program or map types, new
- helpers, etc be added out of kernel module code?
-A: NO.
diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst
new file mode 100644
index 000000000000..da57601153a0
--- /dev/null
+++ b/Documentation/bpf/bpf_devel_QA.rst
@@ -0,0 +1,570 @@
+This document provides information for the BPF subsystem about various
+workflows related to reporting bugs, submitting patches, and queueing
+patches for stable kernels.
+
+For general information about submitting patches, please refer to
+Documentation/process/. This document only describes additional specifics
+related to BPF.
+
+Reporting bugs:
+---------------
+
+Q: How do I report bugs for BPF kernel code?
+
+A: Since all BPF kernel development as well as bpftool and iproute2 BPF
+ loader development happens through the netdev kernel mailing list,
+ please report any found issues around BPF to the following mailing
+ list:
+
+ netdev@vger.kernel.org
+
+ This may also include issues related to XDP, BPF tracing, etc.
+
+ Given netdev has a high volume of traffic, please also add the BPF
+ maintainers to Cc (from kernel MAINTAINERS file):
+
+ Alexei Starovoitov <ast@kernel.org>
+ Daniel Borkmann <daniel@iogearbox.net>
+
+ In case a buggy commit has already been identified, make sure to keep
+ the actual commit authors in Cc as well for the report. They can
+ typically be identified through the kernel's git tree.
+
+ Please do *not* report BPF issues to bugzilla.kernel.org since it
+ is a guarantee that the reported issue will be overlooked.
+
+Submitting patches:
+-------------------
+
+Q: To which mailing list do I need to submit my BPF patches?
+
+A: Please submit your BPF patches to the netdev kernel mailing list:
+
+ netdev@vger.kernel.org
+
+ Historically, BPF came out of networking and has always been maintained
+ by the kernel networking community. Although these days BPF touches
+ many other subsystems as well, the patches are still routed mainly
+ through the networking community.
+
+ In case your patch has changes in various different subsystems (e.g.
+ tracing, security, etc), make sure to Cc the related kernel mailing
+ lists and maintainers from there as well, so they are able to review
+ the changes and provide their Acked-by's to the patches.
+
+Q: Where can I find patches currently under discussion for BPF subsystem?
+
+A: All patches that are Cc'ed to netdev are queued for review under netdev
+ patchwork project:
+
+ http://patchwork.ozlabs.org/project/netdev/list/
+
+ Those patches which target BPF, are assigned to a 'bpf' delegate for
+ further processing from BPF maintainers. The current queue with
+ patches under review can be found at:
+
+ https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
+
+ Once the patches have been reviewed by the BPF community as a whole
+ and approved by the BPF maintainers, their status in patchwork will be
+ changed to 'Accepted' and the submitter will be notified by mail. This
+ means that the patches look good from a BPF perspective and have been
+ applied to one of the two BPF kernel trees.
+
+ In case feedback from the community requires a respin of the patches,
+ their status in patchwork will be set to 'Changes Requested', and purged
+ from the current review queue. Likewise for cases where patches would
+ get rejected or are not applicable to the BPF trees (but assigned to
+ the 'bpf' delegate).
+
+Q: How do the changes make their way into Linux?
+
+A: There are two BPF kernel trees (git repositories). Once patches have
+ been accepted by the BPF maintainers, they will be applied to one
+ of the two BPF trees:
+
+ https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/
+ https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
+
+ The bpf tree itself is for fixes only, whereas bpf-next for features,
+ cleanups or other kind of improvements ("next-like" content). This is
+ analogous to net and net-next trees for networking. Both bpf and
+ bpf-next will only have a master branch in order to simplify against
+ which branch patches should get rebased to.
+
+ Accumulated BPF patches in the bpf tree will regularly get pulled
+ into the net kernel tree. Likewise, accumulated BPF patches accepted
+ into the bpf-next tree will make their way into net-next tree. net and
+ net-next are both run by David S. Miller. From there, they will go
+ into the kernel mainline tree run by Linus Torvalds. To read up on the
+ process of net and net-next being merged into the mainline tree, see
+ the netdev FAQ under:
+
+ Documentation/networking/netdev-FAQ.txt
+
+ Occasionally, to prevent merge conflicts, we might send pull requests
+ to other trees (e.g. tracing) with a small subset of the patches, but
+ net and net-next are always the main trees targeted for integration.
+
+ The pull requests will contain a high-level summary of the accumulated
+ patches and can be searched on netdev kernel mailing list through the
+ following subject lines (yyyy-mm-dd is the date of the pull request):
+
+ pull-request: bpf yyyy-mm-dd
+ pull-request: bpf-next yyyy-mm-dd
+
+Q: How do I indicate which tree (bpf vs. bpf-next) my patch should be
+ applied to?
+
+A: The process is the very same as described in the netdev FAQ, so
+ please read up on it. The subject line must indicate whether the
+ patch is a fix or rather "next-like" content in order to let the
+ maintainers know whether it is targeted at bpf or bpf-next.
+
+ For fixes eventually landing in bpf -> net tree, the subject must
+ look like:
+
+ git format-patch --subject-prefix='PATCH bpf' start..finish
+
+ For features/improvements/etc that should eventually land in
+ bpf-next -> net-next, the subject must look like:
+
+ git format-patch --subject-prefix='PATCH bpf-next' start..finish
+
+ If unsure whether the patch or patch series should go into bpf
+ or net directly, or bpf-next or net-next directly, it is not a
+ problem either if the subject line says net or net-next as target.
+ It is eventually up to the maintainers to do the delegation of
+ the patches.
+
+ If it is clear that patches should go into bpf or bpf-next tree,
+ please make sure to rebase the patches against those trees in
+ order to reduce potential conflicts.
+
+ In case the patch or patch series has to be reworked and sent out
+ again in a second or later revision, it is also required to add a
+ version number (v2, v3, ...) into the subject prefix:
+
+ git format-patch --subject-prefix='PATCH net-next v2' start..finish
+
+ When changes have been requested to the patch series, always send the
+ whole patch series again with the feedback incorporated (never send
+ individual diffs on top of the old series).
+
+Q: What does it mean when a patch gets applied to bpf or bpf-next tree?
+
+A: It means that the patch looks good for mainline inclusion from
+ a BPF point of view.
+
+ Be aware that this is not a final verdict that the patch will
+ automatically get accepted into net or net-next trees eventually:
+
+ On the netdev kernel mailing list reviews can come in at any point
+ in time. If discussions around a patch conclude that they cannot
+ get included as-is, we will either apply a follow-up fix or drop
+ them from the trees entirely. Therefore, we also reserve to rebase
+ the trees when deemed necessary. After all, the purpose of the tree
+ is to i) accumulate and stage BPF patches for integration into trees
+ like net and net-next, and ii) run extensive BPF test suite and
+ workloads on the patches before they make their way any further.
+
+ Once the BPF pull request was accepted by David S. Miller, then
+ the patches end up in net or net-next tree, respectively, and
+ make their way from there further into mainline. Again, see the
+ netdev FAQ for additional information e.g. on how often they are
+ merged to mainline.
+
+Q: How long do I need to wait for feedback on my BPF patches?
+
+A: We try to keep the latency low. The usual time to feedback will
+ be around 2 or 3 business days. It may vary depending on the
+ complexity of changes and current patch load.
+
+Q: How often do you send pull requests to major kernel trees like
+ net or net-next?
+
+A: Pull requests will be sent out rather often in order to not
+ accumulate too many patches in bpf or bpf-next.
+
+ As a rule of thumb, expect pull requests for each tree regularly
+ at the end of the week. In some cases pull requests could additionally
+ come also in the middle of the week depending on the current patch
+ load or urgency.
+
+Q: Are patches applied to bpf-next when the merge window is open?
+
+A: For the time when the merge window is open, bpf-next will not be
+ processed. This is roughly analogous to net-next patch processing,
+ so feel free to read up on the netdev FAQ about further details.
+
+ During those two weeks of merge window, we might ask you to resend
+ your patch series once bpf-next is open again. Once Linus released
+ a v*-rc1 after the merge window, we continue processing of bpf-next.
+
+ For non-subscribers to kernel mailing lists, there is also a status
+ page run by David S. Miller on net-next that provides guidance:
+
+ http://vger.kernel.org/~davem/net-next.html
+
+Q: I made a BPF verifier change, do I need to add test cases for
+ BPF kernel selftests?
+
+A: If the patch has changes to the behavior of the verifier, then yes,
+ it is absolutely necessary to add test cases to the BPF kernel
+ selftests suite. If they are not present and we think they are
+ needed, then we might ask for them before accepting any changes.
+
+ In particular, test_verifier.c is tracking a high number of BPF test
+ cases, including a lot of corner cases that LLVM BPF back end may
+ generate out of the restricted C code. Thus, adding test cases is
+ absolutely crucial to make sure future changes do not accidentally
+ affect prior use-cases. Thus, treat those test cases as: verifier
+ behavior that is not tracked in test_verifier.c could potentially
+ be subject to change.
+
+Q: When should I add code to samples/bpf/ and when to BPF kernel
+ selftests?
+
+A: In general, we prefer additions to BPF kernel selftests rather than
+ samples/bpf/. The rationale is very simple: kernel selftests are
+ regularly run by various bots to test for kernel regressions.
+
+ The more test cases we add to BPF selftests, the better the coverage
+ and the less likely it is that those could accidentally break. It is
+ not that BPF kernel selftests cannot demo how a specific feature can
+ be used.
+
+ That said, samples/bpf/ may be a good place for people to get started,
+ so it might be advisable that simple demos of features could go into
+ samples/bpf/, but advanced functional and corner-case testing rather
+ into kernel selftests.
+
+ If your sample looks like a test case, then go for BPF kernel selftests
+ instead!
+
+Q: When should I add code to the bpftool?
+
+A: The main purpose of bpftool (under tools/bpf/bpftool/) is to provide
+ a central user space tool for debugging and introspection of BPF programs
+ and maps that are active in the kernel. If UAPI changes related to BPF
+ enable for dumping additional information of programs or maps, then
+ bpftool should be extended as well to support dumping them.
+
+Q: When should I add code to iproute2's BPF loader?
+
+A: For UAPI changes related to the XDP or tc layer (e.g. cls_bpf), the
+ convention is that those control-path related changes are added to
+ iproute2's BPF loader as well from user space side. This is not only
+ useful to have UAPI changes properly designed to be usable, but also
+ to make those changes available to a wider user base of major
+ downstream distributions.
+
+Q: Do you accept patches as well for iproute2's BPF loader?
+
+A: Patches for the iproute2's BPF loader have to be sent to:
+
+ netdev@vger.kernel.org
+
+ While those patches are not processed by the BPF kernel maintainers,
+ please keep them in Cc as well, so they can be reviewed.
+
+ The official git repository for iproute2 is run by Stephen Hemminger
+ and can be found at:
+
+ https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git/
+
+ The patches need to have a subject prefix of '[PATCH iproute2 master]'
+ or '[PATCH iproute2 net-next]'. 'master' or 'net-next' describes the
+ target branch where the patch should be applied to. Meaning, if kernel
+ changes went into the net-next kernel tree, then the related iproute2
+ changes need to go into the iproute2 net-next branch, otherwise they
+ can be targeted at master branch. The iproute2 net-next branch will get
+ merged into the master branch after the current iproute2 version from
+ master has been released.
+
+ Like BPF, the patches end up in patchwork under the netdev project and
+ are delegated to 'shemminger' for further processing:
+
+ http://patchwork.ozlabs.org/project/netdev/list/?delegate=389
+
+Q: What is the minimum requirement before I submit my BPF patches?
+
+A: When submitting patches, always take the time and properly test your
+ patches *prior* to submission. Never rush them! If maintainers find
+ that your patches have not been properly tested, it is a good way to
+ get them grumpy. Testing patch submissions is a hard requirement!
+
+ Note, fixes that go to bpf tree *must* have a Fixes: tag included. The
+ same applies to fixes that target bpf-next, where the affected commit
+ is in net-next (or in some cases bpf-next). The Fixes: tag is crucial
+ in order to identify follow-up commits and tremendously helps for people
+ having to do backporting, so it is a must have!
+
+ We also don't accept patches with an empty commit message. Take your
+ time and properly write up a high quality commit message, it is
+ essential!
+
+ Think about it this way: other developers looking at your code a month
+ from now need to understand *why* a certain change has been done that
+ way, and whether there have been flaws in the analysis or assumptions
+ that the original author did. Thus providing a proper rationale and
+ describing the use-case for the changes is a must.
+
+ Patch submissions with >1 patch must have a cover letter which includes
+ a high level description of the series. This high level summary will
+ then be placed into the merge commit by the BPF maintainers such that
+ it is also accessible from the git log for future reference.
+
+Q: What do I need to consider when adding a new instruction or feature
+ that would require BPF JIT and/or LLVM integration as well?
+
+A: We try hard to keep all BPF JITs up to date such that the same user
+ experience can be guaranteed when running BPF programs on different
+ architectures without having the program punt to the less efficient
+ interpreter in case the in-kernel BPF JIT is enabled.
+
+ If you are unable to implement or test the required JIT changes for
+ certain architectures, please work together with the related BPF JIT
+ developers in order to get the feature implemented in a timely manner.
+ Please refer to the git log (arch/*/net/) to locate the necessary
+ people for helping out.
+
+ Also always make sure to add BPF test cases (e.g. test_bpf.c and
+ test_verifier.c) for new instructions, so that they can receive
+ broad test coverage and help run-time testing the various BPF JITs.
+
+ In case of new BPF instructions, once the changes have been accepted
+ into the Linux kernel, please implement support into LLVM's BPF back
+ end. See LLVM section below for further information.
+
+Stable submission:
+------------------
+
+Q: I need a specific BPF commit in stable kernels. What should I do?
+
+A: In case you need a specific fix in stable kernels, first check whether
+ the commit has already been applied in the related linux-*.y branches:
+
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/
+
+ If not the case, then drop an email to the BPF maintainers with the
+ netdev kernel mailing list in Cc and ask for the fix to be queued up:
+
+ netdev@vger.kernel.org
+
+ The process in general is the same as on netdev itself, see also the
+ netdev FAQ document.
+
+Q: Do you also backport to kernels not currently maintained as stable?
+
+A: No. If you need a specific BPF commit in kernels that are currently not
+ maintained by the stable maintainers, then you are on your own.
+
+ The current stable and longterm stable kernels are all listed here:
+
+ https://www.kernel.org/
+
+Q: The BPF patch I am about to submit needs to go to stable as well. What
+ should I do?
+
+A: The same rules apply as with netdev patch submissions in general, see
+ netdev FAQ under:
+
+ Documentation/networking/netdev-FAQ.txt
+
+ Never add "Cc: stable@vger.kernel.org" to the patch description, but
+ ask the BPF maintainers to queue the patches instead. This can be done
+ with a note, for example, under the "---" part of the patch which does
+ not go into the git log. Alternatively, this can be done as a simple
+ request by mail instead.
+
+Q: Where do I find currently queued BPF patches that will be submitted
+ to stable?
+
+A: Once patches that fix critical bugs got applied into the bpf tree, they
+ are queued up for stable submission under:
+
+ http://patchwork.ozlabs.org/bundle/bpf/stable/?state=*
+
+ They will be on hold there at minimum until the related commit made its
+ way into the mainline kernel tree.
+
+ After having been under broader exposure, the queued patches will be
+ submitted by the BPF maintainers to the stable maintainers.
+
+Testing patches:
+----------------
+
+Q: Which BPF kernel selftests version should I run my kernel against?
+
+A: If you run a kernel xyz, then always run the BPF kernel selftests from
+ that kernel xyz as well. Do not expect that the BPF selftest from the
+ latest mainline tree will pass all the time.
+
+ In particular, test_bpf.c and test_verifier.c have a large number of
+ test cases and are constantly updated with new BPF test sequences, or
+ existing ones are adapted to verifier changes e.g. due to verifier
+ becoming smarter and being able to better track certain things.
+
+LLVM:
+-----
+
+Q: Where do I find LLVM with BPF support?
+
+A: The BPF back end for LLVM is upstream in LLVM since version 3.7.1.
+
+ All major distributions these days ship LLVM with BPF back end enabled,
+ so for the majority of use-cases it is not required to compile LLVM by
+ hand anymore, just install the distribution provided package.
+
+ LLVM's static compiler lists the supported targets through 'llc --version',
+ make sure BPF targets are listed. Example:
+
+ $ llc --version
+ LLVM (http://llvm.org/):
+ LLVM version 6.0.0svn
+ Optimized build.
+ Default target: x86_64-unknown-linux-gnu
+ Host CPU: skylake
+
+ Registered Targets:
+ bpf - BPF (host endian)
+ bpfeb - BPF (big endian)
+ bpfel - BPF (little endian)
+ x86 - 32-bit X86: Pentium-Pro and above
+ x86-64 - 64-bit X86: EM64T and AMD64
+
+ For developers in order to utilize the latest features added to LLVM's
+ BPF back end, it is advisable to run the latest LLVM releases. Support
+ for new BPF kernel features such as additions to the BPF instruction
+ set are often developed together.
+
+ All LLVM releases can be found at: http://releases.llvm.org/
+
+Q: Got it, so how do I build LLVM manually anyway?
+
+A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have
+ that set up, proceed with building the latest LLVM and clang version
+ from the git repositories:
+
+ $ git clone http://llvm.org/git/llvm.git
+ $ cd llvm/tools
+ $ git clone --depth 1 http://llvm.org/git/clang.git
+ $ cd ..; mkdir build; cd build
+ $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" \
+ -DBUILD_SHARED_LIBS=OFF \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DLLVM_BUILD_RUNTIME=OFF
+ $ make -j $(getconf _NPROCESSORS_ONLN)
+
+ The built binaries can then be found in the build/bin/ directory, where
+ you can point the PATH variable to.
+
+Q: Should I notify BPF kernel maintainers about issues in LLVM's BPF code
+ generation back end or about LLVM generated code that the verifier
+ refuses to accept?
+
+A: Yes, please do! LLVM's BPF back end is a key piece of the whole BPF
+ infrastructure and it ties deeply into verification of programs from the
+ kernel side. Therefore, any issues on either side need to be investigated
+ and fixed whenever necessary.
+
+ Therefore, please make sure to bring them up at netdev kernel mailing
+ list and Cc BPF maintainers for LLVM and kernel bits:
+
+ Yonghong Song <yhs@fb.com>
+ Alexei Starovoitov <ast@kernel.org>
+ Daniel Borkmann <daniel@iogearbox.net>
+
+ LLVM also has an issue tracker where BPF related bugs can be found:
+
+ https://bugs.llvm.org/buglist.cgi?quicksearch=bpf
+
+ However, it is better to reach out through mailing lists with having
+ maintainers in Cc.
+
+Q: I have added a new BPF instruction to the kernel, how can I integrate
+ it into LLVM?
+
+A: LLVM has a -mcpu selector for the BPF back end in order to allow the
+ selection of BPF instruction set extensions. By default the 'generic'
+ processor target is used, which is the base instruction set (v1) of BPF.
+
+ LLVM has an option to select -mcpu=probe where it will probe the host
+ kernel for supported BPF instruction set extensions and selects the
+ optimal set automatically.
+
+ For cross-compilation, a specific version can be select manually as well.
+
+ $ llc -march bpf -mcpu=help
+ Available CPUs for this target:
+
+ generic - Select the generic processor.
+ probe - Select the probe processor.
+ v1 - Select the v1 processor.
+ v2 - Select the v2 processor.
+ [...]
+
+ Newly added BPF instructions to the Linux kernel need to follow the same
+ scheme, bump the instruction set version and implement probing for the
+ extensions such that -mcpu=probe users can benefit from the optimization
+ transparently when upgrading their kernels.
+
+ If you are unable to implement support for the newly added BPF instruction
+ please reach out to BPF developers for help.
+
+ By the way, the BPF kernel selftests run with -mcpu=probe for better
+ test coverage.
+
+Q: In some cases clang flag "-target bpf" is used but in other cases the
+ default clang target, which matches the underlying architecture, is used.
+ What is the difference and when I should use which?
+
+A: Although LLVM IR generation and optimization try to stay architecture
+ independent, "-target <arch>" still has some impact on generated code:
+
+ - BPF program may recursively include header file(s) with file scope
+ inline assembly codes. The default target can handle this well,
+ while bpf target may fail if bpf backend assembler does not
+ understand these assembly codes, which is true in most cases.
+
+ - When compiled without -g, additional elf sections, e.g.,
+ .eh_frame and .rela.eh_frame, may be present in the object file
+ with default target, but not with bpf target.
+
+ - The default target may turn a C switch statement into a switch table
+ lookup and jump operation. Since the switch table is placed
+ in the global readonly section, the bpf program will fail to load.
+ The bpf target does not support switch table optimization.
+ The clang option "-fno-jump-tables" can be used to disable
+ switch table generation.
+
+ - For clang -target bpf, it is guaranteed that pointer or long /
+ unsigned long types will always have a width of 64 bit, no matter
+ whether underlying clang binary or default target (or kernel) is
+ 32 bit. However, when native clang target is used, then it will
+ compile these types based on the underlying architecture's conventions,
+ meaning in case of 32 bit architecture, pointer or long / unsigned
+ long types e.g. in BPF context structure will have width of 32 bit
+ while the BPF LLVM back end still operates in 64 bit. The native
+ target is mostly needed in tracing for the case of walking pt_regs
+ or other kernel structures where CPU's register width matters.
+ Otherwise, clang -target bpf is generally recommended.
+
+ You should use default target when:
+
+ - Your program includes a header file, e.g., ptrace.h, which eventually
+ pulls in some header files containing file scope host assembly codes.
+ - You can add "-fno-jump-tables" to work around the switch table issue.
+
+ Otherwise, you can use bpf target. Additionally, you _must_ use bpf target
+ when:
+
+ - Your program uses data structures with pointer or long / unsigned long
+ types that interface with BPF helpers or context data structures. Access
+ into these structures is verified by the BPF verifier and may result
+ in verification failures if the native architecture is not aligned with
+ the BPF architecture, e.g. 64-bit. An example of this is
+ BPF_PROG_TYPE_SK_MSG require '-target bpf'
+
+Happy BPF hacking!
diff --git a/Documentation/bpf/bpf_devel_QA.txt b/Documentation/bpf/bpf_devel_QA.txt
deleted file mode 100644
index da57601153a0..000000000000
--- a/Documentation/bpf/bpf_devel_QA.txt
+++ /dev/null
@@ -1,570 +0,0 @@
-This document provides information for the BPF subsystem about various
-workflows related to reporting bugs, submitting patches, and queueing
-patches for stable kernels.
-
-For general information about submitting patches, please refer to
-Documentation/process/. This document only describes additional specifics
-related to BPF.
-
-Reporting bugs:
----------------
-
-Q: How do I report bugs for BPF kernel code?
-
-A: Since all BPF kernel development as well as bpftool and iproute2 BPF
- loader development happens through the netdev kernel mailing list,
- please report any found issues around BPF to the following mailing
- list:
-
- netdev@vger.kernel.org
-
- This may also include issues related to XDP, BPF tracing, etc.
-
- Given netdev has a high volume of traffic, please also add the BPF
- maintainers to Cc (from kernel MAINTAINERS file):
-
- Alexei Starovoitov <ast@kernel.org>
- Daniel Borkmann <daniel@iogearbox.net>
-
- In case a buggy commit has already been identified, make sure to keep
- the actual commit authors in Cc as well for the report. They can
- typically be identified through the kernel's git tree.
-
- Please do *not* report BPF issues to bugzilla.kernel.org since it
- is a guarantee that the reported issue will be overlooked.
-
-Submitting patches:
--------------------
-
-Q: To which mailing list do I need to submit my BPF patches?
-
-A: Please submit your BPF patches to the netdev kernel mailing list:
-
- netdev@vger.kernel.org
-
- Historically, BPF came out of networking and has always been maintained
- by the kernel networking community. Although these days BPF touches
- many other subsystems as well, the patches are still routed mainly
- through the networking community.
-
- In case your patch has changes in various different subsystems (e.g.
- tracing, security, etc), make sure to Cc the related kernel mailing
- lists and maintainers from there as well, so they are able to review
- the changes and provide their Acked-by's to the patches.
-
-Q: Where can I find patches currently under discussion for BPF subsystem?
-
-A: All patches that are Cc'ed to netdev are queued for review under netdev
- patchwork project:
-
- http://patchwork.ozlabs.org/project/netdev/list/
-
- Those patches which target BPF, are assigned to a 'bpf' delegate for
- further processing from BPF maintainers. The current queue with
- patches under review can be found at:
-
- https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
-
- Once the patches have been reviewed by the BPF community as a whole
- and approved by the BPF maintainers, their status in patchwork will be
- changed to 'Accepted' and the submitter will be notified by mail. This
- means that the patches look good from a BPF perspective and have been
- applied to one of the two BPF kernel trees.
-
- In case feedback from the community requires a respin of the patches,
- their status in patchwork will be set to 'Changes Requested', and purged
- from the current review queue. Likewise for cases where patches would
- get rejected or are not applicable to the BPF trees (but assigned to
- the 'bpf' delegate).
-
-Q: How do the changes make their way into Linux?
-
-A: There are two BPF kernel trees (git repositories). Once patches have
- been accepted by the BPF maintainers, they will be applied to one
- of the two BPF trees:
-
- https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/
- https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
-
- The bpf tree itself is for fixes only, whereas bpf-next for features,
- cleanups or other kind of improvements ("next-like" content). This is
- analogous to net and net-next trees for networking. Both bpf and
- bpf-next will only have a master branch in order to simplify against
- which branch patches should get rebased to.
-
- Accumulated BPF patches in the bpf tree will regularly get pulled
- into the net kernel tree. Likewise, accumulated BPF patches accepted
- into the bpf-next tree will make their way into net-next tree. net and
- net-next are both run by David S. Miller. From there, they will go
- into the kernel mainline tree run by Linus Torvalds. To read up on the
- process of net and net-next being merged into the mainline tree, see
- the netdev FAQ under:
-
- Documentation/networking/netdev-FAQ.txt
-
- Occasionally, to prevent merge conflicts, we might send pull requests
- to other trees (e.g. tracing) with a small subset of the patches, but
- net and net-next are always the main trees targeted for integration.
-
- The pull requests will contain a high-level summary of the accumulated
- patches and can be searched on netdev kernel mailing list through the
- following subject lines (yyyy-mm-dd is the date of the pull request):
-
- pull-request: bpf yyyy-mm-dd
- pull-request: bpf-next yyyy-mm-dd
-
-Q: How do I indicate which tree (bpf vs. bpf-next) my patch should be
- applied to?
-
-A: The process is the very same as described in the netdev FAQ, so
- please read up on it. The subject line must indicate whether the
- patch is a fix or rather "next-like" content in order to let the
- maintainers know whether it is targeted at bpf or bpf-next.
-
- For fixes eventually landing in bpf -> net tree, the subject must
- look like:
-
- git format-patch --subject-prefix='PATCH bpf' start..finish
-
- For features/improvements/etc that should eventually land in
- bpf-next -> net-next, the subject must look like:
-
- git format-patch --subject-prefix='PATCH bpf-next' start..finish
-
- If unsure whether the patch or patch series should go into bpf
- or net directly, or bpf-next or net-next directly, it is not a
- problem either if the subject line says net or net-next as target.
- It is eventually up to the maintainers to do the delegation of
- the patches.
-
- If it is clear that patches should go into bpf or bpf-next tree,
- please make sure to rebase the patches against those trees in
- order to reduce potential conflicts.
-
- In case the patch or patch series has to be reworked and sent out
- again in a second or later revision, it is also required to add a
- version number (v2, v3, ...) into the subject prefix:
-
- git format-patch --subject-prefix='PATCH net-next v2' start..finish
-
- When changes have been requested to the patch series, always send the
- whole patch series again with the feedback incorporated (never send
- individual diffs on top of the old series).
-
-Q: What does it mean when a patch gets applied to bpf or bpf-next tree?
-
-A: It means that the patch looks good for mainline inclusion from
- a BPF point of view.
-
- Be aware that this is not a final verdict that the patch will
- automatically get accepted into net or net-next trees eventually:
-
- On the netdev kernel mailing list reviews can come in at any point
- in time. If discussions around a patch conclude that they cannot
- get included as-is, we will either apply a follow-up fix or drop
- them from the trees entirely. Therefore, we also reserve to rebase
- the trees when deemed necessary. After all, the purpose of the tree
- is to i) accumulate and stage BPF patches for integration into trees
- like net and net-next, and ii) run extensive BPF test suite and
- workloads on the patches before they make their way any further.
-
- Once the BPF pull request was accepted by David S. Miller, then
- the patches end up in net or net-next tree, respectively, and
- make their way from there further into mainline. Again, see the
- netdev FAQ for additional information e.g. on how often they are
- merged to mainline.
-
-Q: How long do I need to wait for feedback on my BPF patches?
-
-A: We try to keep the latency low. The usual time to feedback will
- be around 2 or 3 business days. It may vary depending on the
- complexity of changes and current patch load.
-
-Q: How often do you send pull requests to major kernel trees like
- net or net-next?
-
-A: Pull requests will be sent out rather often in order to not
- accumulate too many patches in bpf or bpf-next.
-
- As a rule of thumb, expect pull requests for each tree regularly
- at the end of the week. In some cases pull requests could additionally
- come also in the middle of the week depending on the current patch
- load or urgency.
-
-Q: Are patches applied to bpf-next when the merge window is open?
-
-A: For the time when the merge window is open, bpf-next will not be
- processed. This is roughly analogous to net-next patch processing,
- so feel free to read up on the netdev FAQ about further details.
-
- During those two weeks of merge window, we might ask you to resend
- your patch series once bpf-next is open again. Once Linus released
- a v*-rc1 after the merge window, we continue processing of bpf-next.
-
- For non-subscribers to kernel mailing lists, there is also a status
- page run by David S. Miller on net-next that provides guidance:
-
- http://vger.kernel.org/~davem/net-next.html
-
-Q: I made a BPF verifier change, do I need to add test cases for
- BPF kernel selftests?
-
-A: If the patch has changes to the behavior of the verifier, then yes,
- it is absolutely necessary to add test cases to the BPF kernel
- selftests suite. If they are not present and we think they are
- needed, then we might ask for them before accepting any changes.
-
- In particular, test_verifier.c is tracking a high number of BPF test
- cases, including a lot of corner cases that LLVM BPF back end may
- generate out of the restricted C code. Thus, adding test cases is
- absolutely crucial to make sure future changes do not accidentally
- affect prior use-cases. Thus, treat those test cases as: verifier
- behavior that is not tracked in test_verifier.c could potentially
- be subject to change.
-
-Q: When should I add code to samples/bpf/ and when to BPF kernel
- selftests?
-
-A: In general, we prefer additions to BPF kernel selftests rather than
- samples/bpf/. The rationale is very simple: kernel selftests are
- regularly run by various bots to test for kernel regressions.
-
- The more test cases we add to BPF selftests, the better the coverage
- and the less likely it is that those could accidentally break. It is
- not that BPF kernel selftests cannot demo how a specific feature can
- be used.
-
- That said, samples/bpf/ may be a good place for people to get started,
- so it might be advisable that simple demos of features could go into
- samples/bpf/, but advanced functional and corner-case testing rather
- into kernel selftests.
-
- If your sample looks like a test case, then go for BPF kernel selftests
- instead!
-
-Q: When should I add code to the bpftool?
-
-A: The main purpose of bpftool (under tools/bpf/bpftool/) is to provide
- a central user space tool for debugging and introspection of BPF programs
- and maps that are active in the kernel. If UAPI changes related to BPF
- enable for dumping additional information of programs or maps, then
- bpftool should be extended as well to support dumping them.
-
-Q: When should I add code to iproute2's BPF loader?
-
-A: For UAPI changes related to the XDP or tc layer (e.g. cls_bpf), the
- convention is that those control-path related changes are added to
- iproute2's BPF loader as well from user space side. This is not only
- useful to have UAPI changes properly designed to be usable, but also
- to make those changes available to a wider user base of major
- downstream distributions.
-
-Q: Do you accept patches as well for iproute2's BPF loader?
-
-A: Patches for the iproute2's BPF loader have to be sent to:
-
- netdev@vger.kernel.org
-
- While those patches are not processed by the BPF kernel maintainers,
- please keep them in Cc as well, so they can be reviewed.
-
- The official git repository for iproute2 is run by Stephen Hemminger
- and can be found at:
-
- https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git/
-
- The patches need to have a subject prefix of '[PATCH iproute2 master]'
- or '[PATCH iproute2 net-next]'. 'master' or 'net-next' describes the
- target branch where the patch should be applied to. Meaning, if kernel
- changes went into the net-next kernel tree, then the related iproute2
- changes need to go into the iproute2 net-next branch, otherwise they
- can be targeted at master branch. The iproute2 net-next branch will get
- merged into the master branch after the current iproute2 version from
- master has been released.
-
- Like BPF, the patches end up in patchwork under the netdev project and
- are delegated to 'shemminger' for further processing:
-
- http://patchwork.ozlabs.org/project/netdev/list/?delegate=389
-
-Q: What is the minimum requirement before I submit my BPF patches?
-
-A: When submitting patches, always take the time and properly test your
- patches *prior* to submission. Never rush them! If maintainers find
- that your patches have not been properly tested, it is a good way to
- get them grumpy. Testing patch submissions is a hard requirement!
-
- Note, fixes that go to bpf tree *must* have a Fixes: tag included. The
- same applies to fixes that target bpf-next, where the affected commit
- is in net-next (or in some cases bpf-next). The Fixes: tag is crucial
- in order to identify follow-up commits and tremendously helps for people
- having to do backporting, so it is a must have!
-
- We also don't accept patches with an empty commit message. Take your
- time and properly write up a high quality commit message, it is
- essential!
-
- Think about it this way: other developers looking at your code a month
- from now need to understand *why* a certain change has been done that
- way, and whether there have been flaws in the analysis or assumptions
- that the original author did. Thus providing a proper rationale and
- describing the use-case for the changes is a must.
-
- Patch submissions with >1 patch must have a cover letter which includes
- a high level description of the series. This high level summary will
- then be placed into the merge commit by the BPF maintainers such that
- it is also accessible from the git log for future reference.
-
-Q: What do I need to consider when adding a new instruction or feature
- that would require BPF JIT and/or LLVM integration as well?
-
-A: We try hard to keep all BPF JITs up to date such that the same user
- experience can be guaranteed when running BPF programs on different
- architectures without having the program punt to the less efficient
- interpreter in case the in-kernel BPF JIT is enabled.
-
- If you are unable to implement or test the required JIT changes for
- certain architectures, please work together with the related BPF JIT
- developers in order to get the feature implemented in a timely manner.
- Please refer to the git log (arch/*/net/) to locate the necessary
- people for helping out.
-
- Also always make sure to add BPF test cases (e.g. test_bpf.c and
- test_verifier.c) for new instructions, so that they can receive
- broad test coverage and help run-time testing the various BPF JITs.
-
- In case of new BPF instructions, once the changes have been accepted
- into the Linux kernel, please implement support into LLVM's BPF back
- end. See LLVM section below for further information.
-
-Stable submission:
-------------------
-
-Q: I need a specific BPF commit in stable kernels. What should I do?
-
-A: In case you need a specific fix in stable kernels, first check whether
- the commit has already been applied in the related linux-*.y branches:
-
- https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/
-
- If not the case, then drop an email to the BPF maintainers with the
- netdev kernel mailing list in Cc and ask for the fix to be queued up:
-
- netdev@vger.kernel.org
-
- The process in general is the same as on netdev itself, see also the
- netdev FAQ document.
-
-Q: Do you also backport to kernels not currently maintained as stable?
-
-A: No. If you need a specific BPF commit in kernels that are currently not
- maintained by the stable maintainers, then you are on your own.
-
- The current stable and longterm stable kernels are all listed here:
-
- https://www.kernel.org/
-
-Q: The BPF patch I am about to submit needs to go to stable as well. What
- should I do?
-
-A: The same rules apply as with netdev patch submissions in general, see
- netdev FAQ under:
-
- Documentation/networking/netdev-FAQ.txt
-
- Never add "Cc: stable@vger.kernel.org" to the patch description, but
- ask the BPF maintainers to queue the patches instead. This can be done
- with a note, for example, under the "---" part of the patch which does
- not go into the git log. Alternatively, this can be done as a simple
- request by mail instead.
-
-Q: Where do I find currently queued BPF patches that will be submitted
- to stable?
-
-A: Once patches that fix critical bugs got applied into the bpf tree, they
- are queued up for stable submission under:
-
- http://patchwork.ozlabs.org/bundle/bpf/stable/?state=*
-
- They will be on hold there at minimum until the related commit made its
- way into the mainline kernel tree.
-
- After having been under broader exposure, the queued patches will be
- submitted by the BPF maintainers to the stable maintainers.
-
-Testing patches:
-----------------
-
-Q: Which BPF kernel selftests version should I run my kernel against?
-
-A: If you run a kernel xyz, then always run the BPF kernel selftests from
- that kernel xyz as well. Do not expect that the BPF selftest from the
- latest mainline tree will pass all the time.
-
- In particular, test_bpf.c and test_verifier.c have a large number of
- test cases and are constantly updated with new BPF test sequences, or
- existing ones are adapted to verifier changes e.g. due to verifier
- becoming smarter and being able to better track certain things.
-
-LLVM:
------
-
-Q: Where do I find LLVM with BPF support?
-
-A: The BPF back end for LLVM is upstream in LLVM since version 3.7.1.
-
- All major distributions these days ship LLVM with BPF back end enabled,
- so for the majority of use-cases it is not required to compile LLVM by
- hand anymore, just install the distribution provided package.
-
- LLVM's static compiler lists the supported targets through 'llc --version',
- make sure BPF targets are listed. Example:
-
- $ llc --version
- LLVM (http://llvm.org/):
- LLVM version 6.0.0svn
- Optimized build.
- Default target: x86_64-unknown-linux-gnu
- Host CPU: skylake
-
- Registered Targets:
- bpf - BPF (host endian)
- bpfeb - BPF (big endian)
- bpfel - BPF (little endian)
- x86 - 32-bit X86: Pentium-Pro and above
- x86-64 - 64-bit X86: EM64T and AMD64
-
- For developers in order to utilize the latest features added to LLVM's
- BPF back end, it is advisable to run the latest LLVM releases. Support
- for new BPF kernel features such as additions to the BPF instruction
- set are often developed together.
-
- All LLVM releases can be found at: http://releases.llvm.org/
-
-Q: Got it, so how do I build LLVM manually anyway?
-
-A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have
- that set up, proceed with building the latest LLVM and clang version
- from the git repositories:
-
- $ git clone http://llvm.org/git/llvm.git
- $ cd llvm/tools
- $ git clone --depth 1 http://llvm.org/git/clang.git
- $ cd ..; mkdir build; cd build
- $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" \
- -DBUILD_SHARED_LIBS=OFF \
- -DCMAKE_BUILD_TYPE=Release \
- -DLLVM_BUILD_RUNTIME=OFF
- $ make -j $(getconf _NPROCESSORS_ONLN)
-
- The built binaries can then be found in the build/bin/ directory, where
- you can point the PATH variable to.
-
-Q: Should I notify BPF kernel maintainers about issues in LLVM's BPF code
- generation back end or about LLVM generated code that the verifier
- refuses to accept?
-
-A: Yes, please do! LLVM's BPF back end is a key piece of the whole BPF
- infrastructure and it ties deeply into verification of programs from the
- kernel side. Therefore, any issues on either side need to be investigated
- and fixed whenever necessary.
-
- Therefore, please make sure to bring them up at netdev kernel mailing
- list and Cc BPF maintainers for LLVM and kernel bits:
-
- Yonghong Song <yhs@fb.com>
- Alexei Starovoitov <ast@kernel.org>
- Daniel Borkmann <daniel@iogearbox.net>
-
- LLVM also has an issue tracker where BPF related bugs can be found:
-
- https://bugs.llvm.org/buglist.cgi?quicksearch=bpf
-
- However, it is better to reach out through mailing lists with having
- maintainers in Cc.
-
-Q: I have added a new BPF instruction to the kernel, how can I integrate
- it into LLVM?
-
-A: LLVM has a -mcpu selector for the BPF back end in order to allow the
- selection of BPF instruction set extensions. By default the 'generic'
- processor target is used, which is the base instruction set (v1) of BPF.
-
- LLVM has an option to select -mcpu=probe where it will probe the host
- kernel for supported BPF instruction set extensions and selects the
- optimal set automatically.
-
- For cross-compilation, a specific version can be select manually as well.
-
- $ llc -march bpf -mcpu=help
- Available CPUs for this target:
-
- generic - Select the generic processor.
- probe - Select the probe processor.
- v1 - Select the v1 processor.
- v2 - Select the v2 processor.
- [...]
-
- Newly added BPF instructions to the Linux kernel need to follow the same
- scheme, bump the instruction set version and implement probing for the
- extensions such that -mcpu=probe users can benefit from the optimization
- transparently when upgrading their kernels.
-
- If you are unable to implement support for the newly added BPF instruction
- please reach out to BPF developers for help.
-
- By the way, the BPF kernel selftests run with -mcpu=probe for better
- test coverage.
-
-Q: In some cases clang flag "-target bpf" is used but in other cases the
- default clang target, which matches the underlying architecture, is used.
- What is the difference and when I should use which?
-
-A: Although LLVM IR generation and optimization try to stay architecture
- independent, "-target <arch>" still has some impact on generated code:
-
- - BPF program may recursively include header file(s) with file scope
- inline assembly codes. The default target can handle this well,
- while bpf target may fail if bpf backend assembler does not
- understand these assembly codes, which is true in most cases.
-
- - When compiled without -g, additional elf sections, e.g.,
- .eh_frame and .rela.eh_frame, may be present in the object file
- with default target, but not with bpf target.
-
- - The default target may turn a C switch statement into a switch table
- lookup and jump operation. Since the switch table is placed
- in the global readonly section, the bpf program will fail to load.
- The bpf target does not support switch table optimization.
- The clang option "-fno-jump-tables" can be used to disable
- switch table generation.
-
- - For clang -target bpf, it is guaranteed that pointer or long /
- unsigned long types will always have a width of 64 bit, no matter
- whether underlying clang binary or default target (or kernel) is
- 32 bit. However, when native clang target is used, then it will
- compile these types based on the underlying architecture's conventions,
- meaning in case of 32 bit architecture, pointer or long / unsigned
- long types e.g. in BPF context structure will have width of 32 bit
- while the BPF LLVM back end still operates in 64 bit. The native
- target is mostly needed in tracing for the case of walking pt_regs
- or other kernel structures where CPU's register width matters.
- Otherwise, clang -target bpf is generally recommended.
-
- You should use default target when:
-
- - Your program includes a header file, e.g., ptrace.h, which eventually
- pulls in some header files containing file scope host assembly codes.
- - You can add "-fno-jump-tables" to work around the switch table issue.
-
- Otherwise, you can use bpf target. Additionally, you _must_ use bpf target
- when:
-
- - Your program uses data structures with pointer or long / unsigned long
- types that interface with BPF helpers or context data structures. Access
- into these structures is verified by the BPF verifier and may result
- in verification failures if the native architecture is not aligned with
- the BPF architecture, e.g. 64-bit. An example of this is
- BPF_PROG_TYPE_SK_MSG require '-target bpf'
-
-Happy BPF hacking!
^ permalink raw reply related
* [bpf-next PATCH 3/5] bpf, doc: convert bpf_design_QA.rst to use RST formatting
From: Jesper Dangaard Brouer @ 2018-05-14 13:42 UTC (permalink / raw)
To: Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer
Cc: netdev, Quentin Monnet, linux-man, linux-doc
In-Reply-To: <152630528901.29210.9018565600000101307.stgit@firesoul>
The RST formatting is done such that that when rendered or converted
to different formats, an automatic index with links are created to the
subsections.
Thus, the questions are created as sections (or subsections), in-order
to get the wanted auto-generated FAQ/QA index.
Special thanks to Quentin Monnet <quentin.monnet@netronome.com> who
have reviewed and corrected both RST formatting and GitHub rendering
issues in this file. Those commits have been squashed.
I've manually tested that this also renders nicely if included as part
of the kernel 'make htmldocs'. As the end-goal is for this to become
more integrated with kernel-doc project/movement.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
Documentation/bpf/bpf_design_QA.rst | 223 +++++++++++++++++++++++------------
1 file changed, 144 insertions(+), 79 deletions(-)
diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst
index f3e458a0bb2f..6780a6d81745 100644
--- a/Documentation/bpf/bpf_design_QA.rst
+++ b/Documentation/bpf/bpf_design_QA.rst
@@ -1,156 +1,221 @@
+==============
+BPF Design Q&A
+==============
+
BPF extensibility and applicability to networking, tracing, security
in the linux kernel and several user space implementations of BPF
virtual machine led to a number of misunderstanding on what BPF actually is.
This short QA is an attempt to address that and outline a direction
of where BPF is heading long term.
+.. contents::
+ :local:
+ :depth: 3
+
+Questions and Answers
+=====================
+
Q: Is BPF a generic instruction set similar to x64 and arm64?
+-------------------------------------------------------------
A: NO.
Q: Is BPF a generic virtual machine ?
+-------------------------------------
A: NO.
-BPF is generic instruction set _with_ C calling convention.
+BPF is generic instruction set *with* C calling convention.
+-----------------------------------------------------------
Q: Why C calling convention was chosen?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
A: Because BPF programs are designed to run in the linux kernel
- which is written in C, hence BPF defines instruction set compatible
- with two most used architectures x64 and arm64 (and takes into
- consideration important quirks of other architectures) and
- defines calling convention that is compatible with C calling
- convention of the linux kernel on those architectures.
+which is written in C, hence BPF defines instruction set compatible
+with two most used architectures x64 and arm64 (and takes into
+consideration important quirks of other architectures) and
+defines calling convention that is compatible with C calling
+convention of the linux kernel on those architectures.
Q: can multiple return values be supported in the future?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A: NO. BPF allows only register R0 to be used as return value.
Q: can more than 5 function arguments be supported in the future?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A: NO. BPF calling convention only allows registers R1-R5 to be used
- as arguments. BPF is not a standalone instruction set.
- (unlike x64 ISA that allows msft, cdecl and other conventions)
+as arguments. BPF is not a standalone instruction set.
+(unlike x64 ISA that allows msft, cdecl and other conventions)
Q: can BPF programs access instruction pointer or return address?
+-----------------------------------------------------------------
A: NO.
Q: can BPF programs access stack pointer ?
-A: NO. Only frame pointer (register R10) is accessible.
- From compiler point of view it's necessary to have stack pointer.
- For example LLVM defines register R11 as stack pointer in its
- BPF backend, but it makes sure that generated code never uses it.
+------------------------------------------
+A: NO.
+
+Only frame pointer (register R10) is accessible.
+From compiler point of view it's necessary to have stack pointer.
+For example LLVM defines register R11 as stack pointer in its
+BPF backend, but it makes sure that generated code never uses it.
Q: Does C-calling convention diminishes possible use cases?
-A: YES. BPF design forces addition of major functionality in the form
- of kernel helper functions and kernel objects like BPF maps with
- seamless interoperability between them. It lets kernel call into
- BPF programs and programs call kernel helpers with zero overhead.
- As all of them were native C code. That is particularly the case
- for JITed BPF programs that are indistinguishable from
- native kernel C code.
+-----------------------------------------------------------
+A: YES.
+
+BPF design forces addition of major functionality in the form
+of kernel helper functions and kernel objects like BPF maps with
+seamless interoperability between them. It lets kernel call into
+BPF programs and programs call kernel helpers with zero overhead.
+As all of them were native C code. That is particularly the case
+for JITed BPF programs that are indistinguishable from
+native kernel C code.
Q: Does it mean that 'innovative' extensions to BPF code are disallowed?
-A: Soft yes. At least for now until BPF core has support for
- bpf-to-bpf calls, indirect calls, loops, global variables,
- jump tables, read only sections and all other normal constructs
- that C code can produce.
+------------------------------------------------------------------------
+A: Soft yes.
+
+At least for now until BPF core has support for
+bpf-to-bpf calls, indirect calls, loops, global variables,
+jump tables, read only sections and all other normal constructs
+that C code can produce.
Q: Can loops be supported in a safe way?
-A: It's not clear yet. BPF developers are trying to find a way to
- support bounded loops where the verifier can guarantee that
- the program terminates in less than 4096 instructions.
+----------------------------------------
+A: It's not clear yet.
+
+BPF developers are trying to find a way to
+support bounded loops where the verifier can guarantee that
+the program terminates in less than 4096 instructions.
+
+Instruction level questions
+---------------------------
+
+Q: LD_ABS and LD_IND instructions vs C code
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Q: How come LD_ABS and LD_IND instruction are present in BPF whereas
- C code cannot express them and has to use builtin intrinsics?
+C code cannot express them and has to use builtin intrinsics?
+
A: This is artifact of compatibility with classic BPF. Modern
- networking code in BPF performs better without them.
- See 'direct packet access'.
+networking code in BPF performs better without them.
+See 'direct packet access'.
+Q: BPF instructions mapping not one-to-one to native CPU
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Q: It seems not all BPF instructions are one-to-one to native CPU.
- For example why BPF_JNE and other compare and jumps are not cpu-like?
+For example why BPF_JNE and other compare and jumps are not cpu-like?
+
A: This was necessary to avoid introducing flags into ISA which are
- impossible to make generic and efficient across CPU architectures.
+impossible to make generic and efficient across CPU architectures.
Q: why BPF_DIV instruction doesn't map to x64 div?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A: Because if we picked one-to-one relationship to x64 it would have made
- it more complicated to support on arm64 and other archs. Also it
- needs div-by-zero runtime check.
+it more complicated to support on arm64 and other archs. Also it
+needs div-by-zero runtime check.
Q: why there is no BPF_SDIV for signed divide operation?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A: Because it would be rarely used. llvm errors in such case and
- prints a suggestion to use unsigned divide instead
+prints a suggestion to use unsigned divide instead
Q: Why BPF has implicit prologue and epilogue?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A: Because architectures like sparc have register windows and in general
- there are enough subtle differences between architectures, so naive
- store return address into stack won't work. Another reason is BPF has
- to be safe from division by zero (and legacy exception path
- of LD_ABS insn). Those instructions need to invoke epilogue and
- return implicitly.
+there are enough subtle differences between architectures, so naive
+store return address into stack won't work. Another reason is BPF has
+to be safe from division by zero (and legacy exception path
+of LD_ABS insn). Those instructions need to invoke epilogue and
+return implicitly.
Q: Why BPF_JLT and BPF_JLE instructions were not introduced in the beginning?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A: Because classic BPF didn't have them and BPF authors felt that compiler
- workaround would be acceptable. Turned out that programs lose performance
- due to lack of these compare instructions and they were added.
- These two instructions is a perfect example what kind of new BPF
- instructions are acceptable and can be added in the future.
- These two already had equivalent instructions in native CPUs.
- New instructions that don't have one-to-one mapping to HW instructions
- will not be accepted.
-
+workaround would be acceptable. Turned out that programs lose performance
+due to lack of these compare instructions and they were added.
+These two instructions is a perfect example what kind of new BPF
+instructions are acceptable and can be added in the future.
+These two already had equivalent instructions in native CPUs.
+New instructions that don't have one-to-one mapping to HW instructions
+will not be accepted.
+
+Q: BPF 32-bit subregister requirements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Q: BPF 32-bit subregisters have a requirement to zero upper 32-bits of BPF
- registers which makes BPF inefficient virtual machine for 32-bit
- CPU architectures and 32-bit HW accelerators. Can true 32-bit registers
- be added to BPF in the future?
+registers which makes BPF inefficient virtual machine for 32-bit
+CPU architectures and 32-bit HW accelerators. Can true 32-bit registers
+be added to BPF in the future?
+
A: NO. The first thing to improve performance on 32-bit archs is to teach
- LLVM to generate code that uses 32-bit subregisters. Then second step
- is to teach verifier to mark operations where zero-ing upper bits
- is unnecessary. Then JITs can take advantage of those markings and
- drastically reduce size of generated code and improve performance.
+LLVM to generate code that uses 32-bit subregisters. Then second step
+is to teach verifier to mark operations where zero-ing upper bits
+is unnecessary. Then JITs can take advantage of those markings and
+drastically reduce size of generated code and improve performance.
Q: Does BPF have a stable ABI?
+------------------------------
A: YES. BPF instructions, arguments to BPF programs, set of helper
- functions and their arguments, recognized return codes are all part
- of ABI. However when tracing programs are using bpf_probe_read() helper
- to walk kernel internal datastructures and compile with kernel
- internal headers these accesses can and will break with newer
- kernels. The union bpf_attr -> kern_version is checked at load time
- to prevent accidentally loading kprobe-based bpf programs written
- for a different kernel. Networking programs don't do kern_version check.
+functions and their arguments, recognized return codes are all part
+of ABI. However when tracing programs are using bpf_probe_read() helper
+to walk kernel internal datastructures and compile with kernel
+internal headers these accesses can and will break with newer
+kernels. The union bpf_attr -> kern_version is checked at load time
+to prevent accidentally loading kprobe-based bpf programs written
+for a different kernel. Networking programs don't do kern_version check.
Q: How much stack space a BPF program uses?
+-------------------------------------------
A: Currently all program types are limited to 512 bytes of stack
- space, but the verifier computes the actual amount of stack used
- and both interpreter and most JITed code consume necessary amount.
+space, but the verifier computes the actual amount of stack used
+and both interpreter and most JITed code consume necessary amount.
Q: Can BPF be offloaded to HW?
+------------------------------
A: YES. BPF HW offload is supported by NFP driver.
Q: Does classic BPF interpreter still exist?
+--------------------------------------------
A: NO. Classic BPF programs are converted into extend BPF instructions.
Q: Can BPF call arbitrary kernel functions?
+-------------------------------------------
A: NO. BPF programs can only call a set of helper functions which
- is defined for every program type.
+is defined for every program type.
Q: Can BPF overwrite arbitrary kernel memory?
-A: NO. Tracing bpf programs can _read_ arbitrary memory with bpf_probe_read()
- and bpf_probe_read_str() helpers. Networking programs cannot read
- arbitrary memory, since they don't have access to these helpers.
- Programs can never read or write arbitrary memory directly.
+---------------------------------------------
+A: NO.
+
+Tracing bpf programs can *read* arbitrary memory with bpf_probe_read()
+and bpf_probe_read_str() helpers. Networking programs cannot read
+arbitrary memory, since they don't have access to these helpers.
+Programs can never read or write arbitrary memory directly.
Q: Can BPF overwrite arbitrary user memory?
-A: Sort-of. Tracing BPF programs can overwrite the user memory
- of the current task with bpf_probe_write_user(). Every time such
- program is loaded the kernel will print warning message, so
- this helper is only useful for experiments and prototypes.
- Tracing BPF programs are root only.
+-------------------------------------------
+A: Sort-of.
+
+Tracing BPF programs can overwrite the user memory
+of the current task with bpf_probe_write_user(). Every time such
+program is loaded the kernel will print warning message, so
+this helper is only useful for experiments and prototypes.
+Tracing BPF programs are root only.
+Q: bpf_trace_printk() helper warning
+------------------------------------
Q: When bpf_trace_printk() helper is used the kernel prints nasty
- warning message. Why is that?
+warning message. Why is that?
+
A: This is done to nudge program authors into better interfaces when
- programs need to pass data to user space. Like bpf_perf_event_output()
- can be used to efficiently stream data via perf ring buffer.
- BPF maps can be used for asynchronous data sharing between kernel
- and user space. bpf_trace_printk() should only be used for debugging.
+programs need to pass data to user space. Like bpf_perf_event_output()
+can be used to efficiently stream data via perf ring buffer.
+BPF maps can be used for asynchronous data sharing between kernel
+and user space. bpf_trace_printk() should only be used for debugging.
+Q: New functionality via kernel modules?
+----------------------------------------
Q: Can BPF functionality such as new program or map types, new
- helpers, etc be added out of kernel module code?
+helpers, etc be added out of kernel module code?
+
A: NO.
^ permalink raw reply related
* [bpf-next PATCH 4/5] bpf, doc: convert bpf_devel_QA.rst to use RST formatting
From: Jesper Dangaard Brouer @ 2018-05-14 13:42 UTC (permalink / raw)
To: Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer
Cc: netdev, Quentin Monnet, linux-man, linux-doc
In-Reply-To: <152630528901.29210.9018565600000101307.stgit@firesoul>
Same story as bpf_design_QA.rst RST format conversion.
Again thanks to Quentin Monnet <quentin.monnet@netronome.com> for
fixes and patches that have been squashed.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
Documentation/bpf/bpf_devel_QA.rst | 799 +++++++++++++++++++-----------------
1 file changed, 420 insertions(+), 379 deletions(-)
diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst
index da57601153a0..2254bdeae990 100644
--- a/Documentation/bpf/bpf_devel_QA.rst
+++ b/Documentation/bpf/bpf_devel_QA.rst
@@ -1,424 +1,446 @@
+=================================
+HOWTO interact with BPF subsystem
+=================================
+
This document provides information for the BPF subsystem about various
workflows related to reporting bugs, submitting patches, and queueing
patches for stable kernels.
For general information about submitting patches, please refer to
-Documentation/process/. This document only describes additional specifics
+`Documentation/process/`_. This document only describes additional specifics
related to BPF.
-Reporting bugs:
----------------
+.. contents::
+ :local:
+ :depth: 2
-Q: How do I report bugs for BPF kernel code?
+Reporting bugs
+==============
+Q: How do I report bugs for BPF kernel code?
+--------------------------------------------
A: Since all BPF kernel development as well as bpftool and iproute2 BPF
- loader development happens through the netdev kernel mailing list,
- please report any found issues around BPF to the following mailing
- list:
+loader development happens through the netdev kernel mailing list,
+please report any found issues around BPF to the following mailing
+list:
- netdev@vger.kernel.org
+ netdev@vger.kernel.org
- This may also include issues related to XDP, BPF tracing, etc.
+This may also include issues related to XDP, BPF tracing, etc.
- Given netdev has a high volume of traffic, please also add the BPF
- maintainers to Cc (from kernel MAINTAINERS file):
+Given netdev has a high volume of traffic, please also add the BPF
+maintainers to Cc (from kernel MAINTAINERS_ file):
- Alexei Starovoitov <ast@kernel.org>
- Daniel Borkmann <daniel@iogearbox.net>
+* Alexei Starovoitov <ast@kernel.org>
+* Daniel Borkmann <daniel@iogearbox.net>
- In case a buggy commit has already been identified, make sure to keep
- the actual commit authors in Cc as well for the report. They can
- typically be identified through the kernel's git tree.
+In case a buggy commit has already been identified, make sure to keep
+the actual commit authors in Cc as well for the report. They can
+typically be identified through the kernel's git tree.
- Please do *not* report BPF issues to bugzilla.kernel.org since it
- is a guarantee that the reported issue will be overlooked.
+**Please do NOT report BPF issues to bugzilla.kernel.org since it
+is a guarantee that the reported issue will be overlooked.**
-Submitting patches:
--------------------
+Submitting patches
+==================
Q: To which mailing list do I need to submit my BPF patches?
-
+------------------------------------------------------------
A: Please submit your BPF patches to the netdev kernel mailing list:
- netdev@vger.kernel.org
+ netdev@vger.kernel.org
- Historically, BPF came out of networking and has always been maintained
- by the kernel networking community. Although these days BPF touches
- many other subsystems as well, the patches are still routed mainly
- through the networking community.
+Historically, BPF came out of networking and has always been maintained
+by the kernel networking community. Although these days BPF touches
+many other subsystems as well, the patches are still routed mainly
+through the networking community.
- In case your patch has changes in various different subsystems (e.g.
- tracing, security, etc), make sure to Cc the related kernel mailing
- lists and maintainers from there as well, so they are able to review
- the changes and provide their Acked-by's to the patches.
+In case your patch has changes in various different subsystems (e.g.
+tracing, security, etc), make sure to Cc the related kernel mailing
+lists and maintainers from there as well, so they are able to review
+the changes and provide their Acked-by's to the patches.
Q: Where can I find patches currently under discussion for BPF subsystem?
-
+-------------------------------------------------------------------------
A: All patches that are Cc'ed to netdev are queued for review under netdev
- patchwork project:
+patchwork project:
- http://patchwork.ozlabs.org/project/netdev/list/
+ http://patchwork.ozlabs.org/project/netdev/list/
- Those patches which target BPF, are assigned to a 'bpf' delegate for
- further processing from BPF maintainers. The current queue with
- patches under review can be found at:
+Those patches which target BPF, are assigned to a 'bpf' delegate for
+further processing from BPF maintainers. The current queue with
+patches under review can be found at:
- https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
+ https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
- Once the patches have been reviewed by the BPF community as a whole
- and approved by the BPF maintainers, their status in patchwork will be
- changed to 'Accepted' and the submitter will be notified by mail. This
- means that the patches look good from a BPF perspective and have been
- applied to one of the two BPF kernel trees.
+Once the patches have been reviewed by the BPF community as a whole
+and approved by the BPF maintainers, their status in patchwork will be
+changed to 'Accepted' and the submitter will be notified by mail. This
+means that the patches look good from a BPF perspective and have been
+applied to one of the two BPF kernel trees.
- In case feedback from the community requires a respin of the patches,
- their status in patchwork will be set to 'Changes Requested', and purged
- from the current review queue. Likewise for cases where patches would
- get rejected or are not applicable to the BPF trees (but assigned to
- the 'bpf' delegate).
+In case feedback from the community requires a respin of the patches,
+their status in patchwork will be set to 'Changes Requested', and purged
+from the current review queue. Likewise for cases where patches would
+get rejected or are not applicable to the BPF trees (but assigned to
+the 'bpf' delegate).
Q: How do the changes make their way into Linux?
-
+------------------------------------------------
A: There are two BPF kernel trees (git repositories). Once patches have
- been accepted by the BPF maintainers, they will be applied to one
- of the two BPF trees:
+been accepted by the BPF maintainers, they will be applied to one
+of the two BPF trees:
- https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/
- https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
+ * https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/
+ * https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
- The bpf tree itself is for fixes only, whereas bpf-next for features,
- cleanups or other kind of improvements ("next-like" content). This is
- analogous to net and net-next trees for networking. Both bpf and
- bpf-next will only have a master branch in order to simplify against
- which branch patches should get rebased to.
+The bpf tree itself is for fixes only, whereas bpf-next for features,
+cleanups or other kind of improvements ("next-like" content). This is
+analogous to net and net-next trees for networking. Both bpf and
+bpf-next will only have a master branch in order to simplify against
+which branch patches should get rebased to.
- Accumulated BPF patches in the bpf tree will regularly get pulled
- into the net kernel tree. Likewise, accumulated BPF patches accepted
- into the bpf-next tree will make their way into net-next tree. net and
- net-next are both run by David S. Miller. From there, they will go
- into the kernel mainline tree run by Linus Torvalds. To read up on the
- process of net and net-next being merged into the mainline tree, see
- the netdev FAQ under:
+Accumulated BPF patches in the bpf tree will regularly get pulled
+into the net kernel tree. Likewise, accumulated BPF patches accepted
+into the bpf-next tree will make their way into net-next tree. net and
+net-next are both run by David S. Miller. From there, they will go
+into the kernel mainline tree run by Linus Torvalds. To read up on the
+process of net and net-next being merged into the mainline tree, see
+the `netdev FAQ`_ under:
- Documentation/networking/netdev-FAQ.txt
+ `Documentation/networking/netdev-FAQ.txt`_
- Occasionally, to prevent merge conflicts, we might send pull requests
- to other trees (e.g. tracing) with a small subset of the patches, but
- net and net-next are always the main trees targeted for integration.
+Occasionally, to prevent merge conflicts, we might send pull requests
+to other trees (e.g. tracing) with a small subset of the patches, but
+net and net-next are always the main trees targeted for integration.
- The pull requests will contain a high-level summary of the accumulated
- patches and can be searched on netdev kernel mailing list through the
- following subject lines (yyyy-mm-dd is the date of the pull request):
+The pull requests will contain a high-level summary of the accumulated
+patches and can be searched on netdev kernel mailing list through the
+following subject lines (``yyyy-mm-dd`` is the date of the pull
+request)::
- pull-request: bpf yyyy-mm-dd
- pull-request: bpf-next yyyy-mm-dd
+ pull-request: bpf yyyy-mm-dd
+ pull-request: bpf-next yyyy-mm-dd
-Q: How do I indicate which tree (bpf vs. bpf-next) my patch should be
- applied to?
+Q: How do I indicate which tree (bpf vs. bpf-next) my patch should be applied to?
+---------------------------------------------------------------------------------
-A: The process is the very same as described in the netdev FAQ, so
- please read up on it. The subject line must indicate whether the
- patch is a fix or rather "next-like" content in order to let the
- maintainers know whether it is targeted at bpf or bpf-next.
+A: The process is the very same as described in the `netdev FAQ`_, so
+please read up on it. The subject line must indicate whether the
+patch is a fix or rather "next-like" content in order to let the
+maintainers know whether it is targeted at bpf or bpf-next.
- For fixes eventually landing in bpf -> net tree, the subject must
- look like:
+For fixes eventually landing in bpf -> net tree, the subject must
+look like::
- git format-patch --subject-prefix='PATCH bpf' start..finish
+ git format-patch --subject-prefix='PATCH bpf' start..finish
- For features/improvements/etc that should eventually land in
- bpf-next -> net-next, the subject must look like:
+For features/improvements/etc that should eventually land in
+bpf-next -> net-next, the subject must look like::
- git format-patch --subject-prefix='PATCH bpf-next' start..finish
+ git format-patch --subject-prefix='PATCH bpf-next' start..finish
- If unsure whether the patch or patch series should go into bpf
- or net directly, or bpf-next or net-next directly, it is not a
- problem either if the subject line says net or net-next as target.
- It is eventually up to the maintainers to do the delegation of
- the patches.
+If unsure whether the patch or patch series should go into bpf
+or net directly, or bpf-next or net-next directly, it is not a
+problem either if the subject line says net or net-next as target.
+It is eventually up to the maintainers to do the delegation of
+the patches.
- If it is clear that patches should go into bpf or bpf-next tree,
- please make sure to rebase the patches against those trees in
- order to reduce potential conflicts.
+If it is clear that patches should go into bpf or bpf-next tree,
+please make sure to rebase the patches against those trees in
+order to reduce potential conflicts.
- In case the patch or patch series has to be reworked and sent out
- again in a second or later revision, it is also required to add a
- version number (v2, v3, ...) into the subject prefix:
+In case the patch or patch series has to be reworked and sent out
+again in a second or later revision, it is also required to add a
+version number (``v2``, ``v3``, ...) into the subject prefix::
- git format-patch --subject-prefix='PATCH net-next v2' start..finish
+ git format-patch --subject-prefix='PATCH net-next v2' start..finish
- When changes have been requested to the patch series, always send the
- whole patch series again with the feedback incorporated (never send
- individual diffs on top of the old series).
+When changes have been requested to the patch series, always send the
+whole patch series again with the feedback incorporated (never send
+individual diffs on top of the old series).
Q: What does it mean when a patch gets applied to bpf or bpf-next tree?
-
+-----------------------------------------------------------------------
A: It means that the patch looks good for mainline inclusion from
- a BPF point of view.
-
- Be aware that this is not a final verdict that the patch will
- automatically get accepted into net or net-next trees eventually:
-
- On the netdev kernel mailing list reviews can come in at any point
- in time. If discussions around a patch conclude that they cannot
- get included as-is, we will either apply a follow-up fix or drop
- them from the trees entirely. Therefore, we also reserve to rebase
- the trees when deemed necessary. After all, the purpose of the tree
- is to i) accumulate and stage BPF patches for integration into trees
- like net and net-next, and ii) run extensive BPF test suite and
- workloads on the patches before they make their way any further.
-
- Once the BPF pull request was accepted by David S. Miller, then
- the patches end up in net or net-next tree, respectively, and
- make their way from there further into mainline. Again, see the
- netdev FAQ for additional information e.g. on how often they are
- merged to mainline.
+a BPF point of view.
-Q: How long do I need to wait for feedback on my BPF patches?
+Be aware that this is not a final verdict that the patch will
+automatically get accepted into net or net-next trees eventually:
+
+On the netdev kernel mailing list reviews can come in at any point
+in time. If discussions around a patch conclude that they cannot
+get included as-is, we will either apply a follow-up fix or drop
+them from the trees entirely. Therefore, we also reserve to rebase
+the trees when deemed necessary. After all, the purpose of the tree
+is to:
+
+i) accumulate and stage BPF patches for integration into trees
+ like net and net-next, and
+ii) run extensive BPF test suite and
+ workloads on the patches before they make their way any further.
+
+Once the BPF pull request was accepted by David S. Miller, then
+the patches end up in net or net-next tree, respectively, and
+make their way from there further into mainline. Again, see the
+`netdev FAQ`_ for additional information e.g. on how often they are
+merged to mainline.
+
+Q: How long do I need to wait for feedback on my BPF patches?
+-------------------------------------------------------------
A: We try to keep the latency low. The usual time to feedback will
- be around 2 or 3 business days. It may vary depending on the
- complexity of changes and current patch load.
+be around 2 or 3 business days. It may vary depending on the
+complexity of changes and current patch load.
-Q: How often do you send pull requests to major kernel trees like
- net or net-next?
+Q: How often do you send pull requests to major kernel trees like net or net-next?
+----------------------------------------------------------------------------------
A: Pull requests will be sent out rather often in order to not
- accumulate too many patches in bpf or bpf-next.
+accumulate too many patches in bpf or bpf-next.
- As a rule of thumb, expect pull requests for each tree regularly
- at the end of the week. In some cases pull requests could additionally
- come also in the middle of the week depending on the current patch
- load or urgency.
+As a rule of thumb, expect pull requests for each tree regularly
+at the end of the week. In some cases pull requests could additionally
+come also in the middle of the week depending on the current patch
+load or urgency.
Q: Are patches applied to bpf-next when the merge window is open?
-
+-----------------------------------------------------------------
A: For the time when the merge window is open, bpf-next will not be
- processed. This is roughly analogous to net-next patch processing,
- so feel free to read up on the netdev FAQ about further details.
+processed. This is roughly analogous to net-next patch processing,
+so feel free to read up on the `netdev FAQ`_ about further details.
- During those two weeks of merge window, we might ask you to resend
- your patch series once bpf-next is open again. Once Linus released
- a v*-rc1 after the merge window, we continue processing of bpf-next.
+During those two weeks of merge window, we might ask you to resend
+your patch series once bpf-next is open again. Once Linus released
+a ``v*-rc1`` after the merge window, we continue processing of bpf-next.
- For non-subscribers to kernel mailing lists, there is also a status
- page run by David S. Miller on net-next that provides guidance:
+For non-subscribers to kernel mailing lists, there is also a status
+page run by David S. Miller on net-next that provides guidance:
- http://vger.kernel.org/~davem/net-next.html
+ http://vger.kernel.org/~davem/net-next.html
+Q: Verifier changes and test cases
+----------------------------------
Q: I made a BPF verifier change, do I need to add test cases for
- BPF kernel selftests?
+BPF kernel selftests_?
A: If the patch has changes to the behavior of the verifier, then yes,
- it is absolutely necessary to add test cases to the BPF kernel
- selftests suite. If they are not present and we think they are
- needed, then we might ask for them before accepting any changes.
-
- In particular, test_verifier.c is tracking a high number of BPF test
- cases, including a lot of corner cases that LLVM BPF back end may
- generate out of the restricted C code. Thus, adding test cases is
- absolutely crucial to make sure future changes do not accidentally
- affect prior use-cases. Thus, treat those test cases as: verifier
- behavior that is not tracked in test_verifier.c could potentially
- be subject to change.
-
-Q: When should I add code to samples/bpf/ and when to BPF kernel
- selftests?
-
-A: In general, we prefer additions to BPF kernel selftests rather than
- samples/bpf/. The rationale is very simple: kernel selftests are
- regularly run by various bots to test for kernel regressions.
-
- The more test cases we add to BPF selftests, the better the coverage
- and the less likely it is that those could accidentally break. It is
- not that BPF kernel selftests cannot demo how a specific feature can
- be used.
-
- That said, samples/bpf/ may be a good place for people to get started,
- so it might be advisable that simple demos of features could go into
- samples/bpf/, but advanced functional and corner-case testing rather
- into kernel selftests.
-
- If your sample looks like a test case, then go for BPF kernel selftests
- instead!
+it is absolutely necessary to add test cases to the BPF kernel
+selftests_ suite. If they are not present and we think they are
+needed, then we might ask for them before accepting any changes.
+
+In particular, test_verifier.c is tracking a high number of BPF test
+cases, including a lot of corner cases that LLVM BPF back end may
+generate out of the restricted C code. Thus, adding test cases is
+absolutely crucial to make sure future changes do not accidentally
+affect prior use-cases. Thus, treat those test cases as: verifier
+behavior that is not tracked in test_verifier.c could potentially
+be subject to change.
+
+Q: samples/bpf preference vs selftests?
+---------------------------------------
+Q: When should I add code to `samples/bpf/`_ and when to BPF kernel
+selftests_ ?
+
+A: In general, we prefer additions to BPF kernel selftests_ rather than
+`samples/bpf/`_. The rationale is very simple: kernel selftests are
+regularly run by various bots to test for kernel regressions.
+
+The more test cases we add to BPF selftests, the better the coverage
+and the less likely it is that those could accidentally break. It is
+not that BPF kernel selftests cannot demo how a specific feature can
+be used.
+
+That said, `samples/bpf/`_ may be a good place for people to get started,
+so it might be advisable that simple demos of features could go into
+`samples/bpf/`_, but advanced functional and corner-case testing rather
+into kernel selftests.
+
+If your sample looks like a test case, then go for BPF kernel selftests
+instead!
Q: When should I add code to the bpftool?
-
+-----------------------------------------
A: The main purpose of bpftool (under tools/bpf/bpftool/) is to provide
- a central user space tool for debugging and introspection of BPF programs
- and maps that are active in the kernel. If UAPI changes related to BPF
- enable for dumping additional information of programs or maps, then
- bpftool should be extended as well to support dumping them.
+a central user space tool for debugging and introspection of BPF programs
+and maps that are active in the kernel. If UAPI changes related to BPF
+enable for dumping additional information of programs or maps, then
+bpftool should be extended as well to support dumping them.
Q: When should I add code to iproute2's BPF loader?
-
-A: For UAPI changes related to the XDP or tc layer (e.g. cls_bpf), the
- convention is that those control-path related changes are added to
- iproute2's BPF loader as well from user space side. This is not only
- useful to have UAPI changes properly designed to be usable, but also
- to make those changes available to a wider user base of major
- downstream distributions.
+---------------------------------------------------
+A: For UAPI changes related to the XDP or tc layer (e.g. ``cls_bpf``),
+the convention is that those control-path related changes are added to
+iproute2's BPF loader as well from user space side. This is not only
+useful to have UAPI changes properly designed to be usable, but also
+to make those changes available to a wider user base of major
+downstream distributions.
Q: Do you accept patches as well for iproute2's BPF loader?
-
+-----------------------------------------------------------
A: Patches for the iproute2's BPF loader have to be sent to:
- netdev@vger.kernel.org
+ netdev@vger.kernel.org
- While those patches are not processed by the BPF kernel maintainers,
- please keep them in Cc as well, so they can be reviewed.
+While those patches are not processed by the BPF kernel maintainers,
+please keep them in Cc as well, so they can be reviewed.
- The official git repository for iproute2 is run by Stephen Hemminger
- and can be found at:
+The official git repository for iproute2 is run by Stephen Hemminger
+and can be found at:
- https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git/
+ https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git/
- The patches need to have a subject prefix of '[PATCH iproute2 master]'
- or '[PATCH iproute2 net-next]'. 'master' or 'net-next' describes the
- target branch where the patch should be applied to. Meaning, if kernel
- changes went into the net-next kernel tree, then the related iproute2
- changes need to go into the iproute2 net-next branch, otherwise they
- can be targeted at master branch. The iproute2 net-next branch will get
- merged into the master branch after the current iproute2 version from
- master has been released.
+The patches need to have a subject prefix of '``[PATCH iproute2
+master]``' or '``[PATCH iproute2 net-next]``'. '``master``' or
+'``net-next``' describes the target branch where the patch should be
+applied to. Meaning, if kernel changes went into the net-next kernel
+tree, then the related iproute2 changes need to go into the iproute2
+net-next branch, otherwise they can be targeted at master branch. The
+iproute2 net-next branch will get merged into the master branch after
+the current iproute2 version from master has been released.
- Like BPF, the patches end up in patchwork under the netdev project and
- are delegated to 'shemminger' for further processing:
+Like BPF, the patches end up in patchwork under the netdev project and
+are delegated to 'shemminger' for further processing:
- http://patchwork.ozlabs.org/project/netdev/list/?delegate=389
+ http://patchwork.ozlabs.org/project/netdev/list/?delegate=389
Q: What is the minimum requirement before I submit my BPF patches?
-
+------------------------------------------------------------------
A: When submitting patches, always take the time and properly test your
- patches *prior* to submission. Never rush them! If maintainers find
- that your patches have not been properly tested, it is a good way to
- get them grumpy. Testing patch submissions is a hard requirement!
-
- Note, fixes that go to bpf tree *must* have a Fixes: tag included. The
- same applies to fixes that target bpf-next, where the affected commit
- is in net-next (or in some cases bpf-next). The Fixes: tag is crucial
- in order to identify follow-up commits and tremendously helps for people
- having to do backporting, so it is a must have!
-
- We also don't accept patches with an empty commit message. Take your
- time and properly write up a high quality commit message, it is
- essential!
-
- Think about it this way: other developers looking at your code a month
- from now need to understand *why* a certain change has been done that
- way, and whether there have been flaws in the analysis or assumptions
- that the original author did. Thus providing a proper rationale and
- describing the use-case for the changes is a must.
-
- Patch submissions with >1 patch must have a cover letter which includes
- a high level description of the series. This high level summary will
- then be placed into the merge commit by the BPF maintainers such that
- it is also accessible from the git log for future reference.
-
+patches *prior* to submission. Never rush them! If maintainers find
+that your patches have not been properly tested, it is a good way to
+get them grumpy. Testing patch submissions is a hard requirement!
+
+Note, fixes that go to bpf tree *must* have a ``Fixes:`` tag included.
+The same applies to fixes that target bpf-next, where the affected
+commit is in net-next (or in some cases bpf-next). The ``Fixes:`` tag is
+crucial in order to identify follow-up commits and tremendously helps
+for people having to do backporting, so it is a must have!
+
+We also don't accept patches with an empty commit message. Take your
+time and properly write up a high quality commit message, it is
+essential!
+
+Think about it this way: other developers looking at your code a month
+from now need to understand *why* a certain change has been done that
+way, and whether there have been flaws in the analysis or assumptions
+that the original author did. Thus providing a proper rationale and
+describing the use-case for the changes is a must.
+
+Patch submissions with >1 patch must have a cover letter which includes
+a high level description of the series. This high level summary will
+then be placed into the merge commit by the BPF maintainers such that
+it is also accessible from the git log for future reference.
+
+Q: Features changing BPF JIT and/or LLVM
+----------------------------------------
Q: What do I need to consider when adding a new instruction or feature
- that would require BPF JIT and/or LLVM integration as well?
+that would require BPF JIT and/or LLVM integration as well?
A: We try hard to keep all BPF JITs up to date such that the same user
- experience can be guaranteed when running BPF programs on different
- architectures without having the program punt to the less efficient
- interpreter in case the in-kernel BPF JIT is enabled.
+experience can be guaranteed when running BPF programs on different
+architectures without having the program punt to the less efficient
+interpreter in case the in-kernel BPF JIT is enabled.
- If you are unable to implement or test the required JIT changes for
- certain architectures, please work together with the related BPF JIT
- developers in order to get the feature implemented in a timely manner.
- Please refer to the git log (arch/*/net/) to locate the necessary
- people for helping out.
+If you are unable to implement or test the required JIT changes for
+certain architectures, please work together with the related BPF JIT
+developers in order to get the feature implemented in a timely manner.
+Please refer to the git log (``arch/*/net/``) to locate the necessary
+people for helping out.
- Also always make sure to add BPF test cases (e.g. test_bpf.c and
- test_verifier.c) for new instructions, so that they can receive
- broad test coverage and help run-time testing the various BPF JITs.
+Also always make sure to add BPF test cases (e.g. test_bpf.c and
+test_verifier.c) for new instructions, so that they can receive
+broad test coverage and help run-time testing the various BPF JITs.
- In case of new BPF instructions, once the changes have been accepted
- into the Linux kernel, please implement support into LLVM's BPF back
- end. See LLVM section below for further information.
+In case of new BPF instructions, once the changes have been accepted
+into the Linux kernel, please implement support into LLVM's BPF back
+end. See LLVM_ section below for further information.
-Stable submission:
-------------------
+Stable submission
+=================
Q: I need a specific BPF commit in stable kernels. What should I do?
-
+--------------------------------------------------------------------
A: In case you need a specific fix in stable kernels, first check whether
- the commit has already been applied in the related linux-*.y branches:
+the commit has already been applied in the related ``linux-*.y`` branches:
- https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/
- If not the case, then drop an email to the BPF maintainers with the
- netdev kernel mailing list in Cc and ask for the fix to be queued up:
+If not the case, then drop an email to the BPF maintainers with the
+netdev kernel mailing list in Cc and ask for the fix to be queued up:
- netdev@vger.kernel.org
+ netdev@vger.kernel.org
- The process in general is the same as on netdev itself, see also the
- netdev FAQ document.
+The process in general is the same as on netdev itself, see also the
+`netdev FAQ`_ document.
Q: Do you also backport to kernels not currently maintained as stable?
-
+----------------------------------------------------------------------
A: No. If you need a specific BPF commit in kernels that are currently not
- maintained by the stable maintainers, then you are on your own.
+maintained by the stable maintainers, then you are on your own.
- The current stable and longterm stable kernels are all listed here:
+The current stable and longterm stable kernels are all listed here:
- https://www.kernel.org/
+ https://www.kernel.org/
-Q: The BPF patch I am about to submit needs to go to stable as well. What
- should I do?
+Q: The BPF patch I am about to submit needs to go to stable as well
+-------------------------------------------------------------------
+What should I do?
A: The same rules apply as with netdev patch submissions in general, see
- netdev FAQ under:
+`netdev FAQ`_ under:
- Documentation/networking/netdev-FAQ.txt
+ `Documentation/networking/netdev-FAQ.txt`_
- Never add "Cc: stable@vger.kernel.org" to the patch description, but
- ask the BPF maintainers to queue the patches instead. This can be done
- with a note, for example, under the "---" part of the patch which does
- not go into the git log. Alternatively, this can be done as a simple
- request by mail instead.
+Never add "``Cc: stable@vger.kernel.org``" to the patch description, but
+ask the BPF maintainers to queue the patches instead. This can be done
+with a note, for example, under the ``---`` part of the patch which does
+not go into the git log. Alternatively, this can be done as a simple
+request by mail instead.
+Q: Queue stable patches
+-----------------------
Q: Where do I find currently queued BPF patches that will be submitted
- to stable?
+to stable?
A: Once patches that fix critical bugs got applied into the bpf tree, they
- are queued up for stable submission under:
+are queued up for stable submission under:
- http://patchwork.ozlabs.org/bundle/bpf/stable/?state=*
+ http://patchwork.ozlabs.org/bundle/bpf/stable/?state=*
- They will be on hold there at minimum until the related commit made its
- way into the mainline kernel tree.
+They will be on hold there at minimum until the related commit made its
+way into the mainline kernel tree.
- After having been under broader exposure, the queued patches will be
- submitted by the BPF maintainers to the stable maintainers.
+After having been under broader exposure, the queued patches will be
+submitted by the BPF maintainers to the stable maintainers.
-Testing patches:
-----------------
+Testing patches
+===============
Q: Which BPF kernel selftests version should I run my kernel against?
+---------------------------------------------------------------------
+A: If you run a kernel ``xyz``, then always run the BPF kernel selftests
+from that kernel ``xyz`` as well. Do not expect that the BPF selftest
+from the latest mainline tree will pass all the time.
-A: If you run a kernel xyz, then always run the BPF kernel selftests from
- that kernel xyz as well. Do not expect that the BPF selftest from the
- latest mainline tree will pass all the time.
-
- In particular, test_bpf.c and test_verifier.c have a large number of
- test cases and are constantly updated with new BPF test sequences, or
- existing ones are adapted to verifier changes e.g. due to verifier
- becoming smarter and being able to better track certain things.
+In particular, test_bpf.c and test_verifier.c have a large number of
+test cases and are constantly updated with new BPF test sequences, or
+existing ones are adapted to verifier changes e.g. due to verifier
+becoming smarter and being able to better track certain things.
-LLVM:
------
+LLVM
+====
Q: Where do I find LLVM with BPF support?
-
+-----------------------------------------
A: The BPF back end for LLVM is upstream in LLVM since version 3.7.1.
- All major distributions these days ship LLVM with BPF back end enabled,
- so for the majority of use-cases it is not required to compile LLVM by
- hand anymore, just install the distribution provided package.
+All major distributions these days ship LLVM with BPF back end enabled,
+so for the majority of use-cases it is not required to compile LLVM by
+hand anymore, just install the distribution provided package.
- LLVM's static compiler lists the supported targets through 'llc --version',
- make sure BPF targets are listed. Example:
+LLVM's static compiler lists the supported targets through
+``llc --version``, make sure BPF targets are listed. Example::
$ llc --version
LLVM (http://llvm.org/):
@@ -434,18 +456,18 @@ A: The BPF back end for LLVM is upstream in LLVM since version 3.7.1.
x86 - 32-bit X86: Pentium-Pro and above
x86-64 - 64-bit X86: EM64T and AMD64
- For developers in order to utilize the latest features added to LLVM's
- BPF back end, it is advisable to run the latest LLVM releases. Support
- for new BPF kernel features such as additions to the BPF instruction
- set are often developed together.
+For developers in order to utilize the latest features added to LLVM's
+BPF back end, it is advisable to run the latest LLVM releases. Support
+for new BPF kernel features such as additions to the BPF instruction
+set are often developed together.
- All LLVM releases can be found at: http://releases.llvm.org/
+All LLVM releases can be found at: http://releases.llvm.org/
Q: Got it, so how do I build LLVM manually anyway?
-
+--------------------------------------------------
A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have
- that set up, proceed with building the latest LLVM and clang version
- from the git repositories:
+that set up, proceed with building the latest LLVM and clang version
+from the git repositories::
$ git clone http://llvm.org/git/llvm.git
$ cd llvm/tools
@@ -457,44 +479,51 @@ A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have
-DLLVM_BUILD_RUNTIME=OFF
$ make -j $(getconf _NPROCESSORS_ONLN)
- The built binaries can then be found in the build/bin/ directory, where
- you can point the PATH variable to.
+The built binaries can then be found in the build/bin/ directory, where
+you can point the PATH variable to.
+Q: Reporting LLVM BPF issues
+----------------------------
Q: Should I notify BPF kernel maintainers about issues in LLVM's BPF code
- generation back end or about LLVM generated code that the verifier
- refuses to accept?
+generation back end or about LLVM generated code that the verifier
+refuses to accept?
+
+A: Yes, please do!
-A: Yes, please do! LLVM's BPF back end is a key piece of the whole BPF
- infrastructure and it ties deeply into verification of programs from the
- kernel side. Therefore, any issues on either side need to be investigated
- and fixed whenever necessary.
+LLVM's BPF back end is a key piece of the whole BPF
+infrastructure and it ties deeply into verification of programs from the
+kernel side. Therefore, any issues on either side need to be investigated
+and fixed whenever necessary.
- Therefore, please make sure to bring them up at netdev kernel mailing
- list and Cc BPF maintainers for LLVM and kernel bits:
+Therefore, please make sure to bring them up at netdev kernel mailing
+list and Cc BPF maintainers for LLVM and kernel bits:
- Yonghong Song <yhs@fb.com>
- Alexei Starovoitov <ast@kernel.org>
- Daniel Borkmann <daniel@iogearbox.net>
+* Yonghong Song <yhs@fb.com>
+* Alexei Starovoitov <ast@kernel.org>
+* Daniel Borkmann <daniel@iogearbox.net>
- LLVM also has an issue tracker where BPF related bugs can be found:
+LLVM also has an issue tracker where BPF related bugs can be found:
- https://bugs.llvm.org/buglist.cgi?quicksearch=bpf
+ https://bugs.llvm.org/buglist.cgi?quicksearch=bpf
- However, it is better to reach out through mailing lists with having
- maintainers in Cc.
+However, it is better to reach out through mailing lists with having
+maintainers in Cc.
+Q: New BPF instruction for kernel and LLVM
+------------------------------------------
Q: I have added a new BPF instruction to the kernel, how can I integrate
- it into LLVM?
+it into LLVM?
-A: LLVM has a -mcpu selector for the BPF back end in order to allow the
- selection of BPF instruction set extensions. By default the 'generic'
- processor target is used, which is the base instruction set (v1) of BPF.
+A: LLVM has a ``-mcpu`` selector for the BPF back end in order to allow
+the selection of BPF instruction set extensions. By default the
+``generic`` processor target is used, which is the base instruction set
+(v1) of BPF.
- LLVM has an option to select -mcpu=probe where it will probe the host
- kernel for supported BPF instruction set extensions and selects the
- optimal set automatically.
+LLVM has an option to select ``-mcpu=probe`` where it will probe the host
+kernel for supported BPF instruction set extensions and selects the
+optimal set automatically.
- For cross-compilation, a specific version can be select manually as well.
+For cross-compilation, a specific version can be select manually as well ::
$ llc -march bpf -mcpu=help
Available CPUs for this target:
@@ -505,66 +534,78 @@ A: LLVM has a -mcpu selector for the BPF back end in order to allow the
v2 - Select the v2 processor.
[...]
- Newly added BPF instructions to the Linux kernel need to follow the same
- scheme, bump the instruction set version and implement probing for the
- extensions such that -mcpu=probe users can benefit from the optimization
- transparently when upgrading their kernels.
+Newly added BPF instructions to the Linux kernel need to follow the same
+scheme, bump the instruction set version and implement probing for the
+extensions such that ``-mcpu=probe`` users can benefit from the
+optimization transparently when upgrading their kernels.
- If you are unable to implement support for the newly added BPF instruction
- please reach out to BPF developers for help.
+If you are unable to implement support for the newly added BPF instruction
+please reach out to BPF developers for help.
- By the way, the BPF kernel selftests run with -mcpu=probe for better
- test coverage.
+By the way, the BPF kernel selftests run with ``-mcpu=probe`` for better
+test coverage.
-Q: In some cases clang flag "-target bpf" is used but in other cases the
- default clang target, which matches the underlying architecture, is used.
- What is the difference and when I should use which?
+Q: clang flag for target bpf?
+-----------------------------
+Q: In some cases clang flag ``-target bpf`` is used but in other cases the
+default clang target, which matches the underlying architecture, is used.
+What is the difference and when I should use which?
A: Although LLVM IR generation and optimization try to stay architecture
- independent, "-target <arch>" still has some impact on generated code:
-
- - BPF program may recursively include header file(s) with file scope
- inline assembly codes. The default target can handle this well,
- while bpf target may fail if bpf backend assembler does not
- understand these assembly codes, which is true in most cases.
-
- - When compiled without -g, additional elf sections, e.g.,
- .eh_frame and .rela.eh_frame, may be present in the object file
- with default target, but not with bpf target.
-
- - The default target may turn a C switch statement into a switch table
- lookup and jump operation. Since the switch table is placed
- in the global readonly section, the bpf program will fail to load.
- The bpf target does not support switch table optimization.
- The clang option "-fno-jump-tables" can be used to disable
- switch table generation.
-
- - For clang -target bpf, it is guaranteed that pointer or long /
- unsigned long types will always have a width of 64 bit, no matter
- whether underlying clang binary or default target (or kernel) is
- 32 bit. However, when native clang target is used, then it will
- compile these types based on the underlying architecture's conventions,
- meaning in case of 32 bit architecture, pointer or long / unsigned
- long types e.g. in BPF context structure will have width of 32 bit
- while the BPF LLVM back end still operates in 64 bit. The native
- target is mostly needed in tracing for the case of walking pt_regs
- or other kernel structures where CPU's register width matters.
- Otherwise, clang -target bpf is generally recommended.
-
- You should use default target when:
-
- - Your program includes a header file, e.g., ptrace.h, which eventually
- pulls in some header files containing file scope host assembly codes.
- - You can add "-fno-jump-tables" to work around the switch table issue.
-
- Otherwise, you can use bpf target. Additionally, you _must_ use bpf target
- when:
-
- - Your program uses data structures with pointer or long / unsigned long
- types that interface with BPF helpers or context data structures. Access
- into these structures is verified by the BPF verifier and may result
- in verification failures if the native architecture is not aligned with
- the BPF architecture, e.g. 64-bit. An example of this is
- BPF_PROG_TYPE_SK_MSG require '-target bpf'
+independent, ``-target <arch>`` still has some impact on generated code:
+
+- BPF program may recursively include header file(s) with file scope
+ inline assembly codes. The default target can handle this well,
+ while ``bpf`` target may fail if bpf backend assembler does not
+ understand these assembly codes, which is true in most cases.
+
+- When compiled without ``-g``, additional elf sections, e.g.,
+ .eh_frame and .rela.eh_frame, may be present in the object file
+ with default target, but not with ``bpf`` target.
+
+- The default target may turn a C switch statement into a switch table
+ lookup and jump operation. Since the switch table is placed
+ in the global readonly section, the bpf program will fail to load.
+ The bpf target does not support switch table optimization.
+ The clang option ``-fno-jump-tables`` can be used to disable
+ switch table generation.
+
+- For clang ``-target bpf``, it is guaranteed that pointer or long /
+ unsigned long types will always have a width of 64 bit, no matter
+ whether underlying clang binary or default target (or kernel) is
+ 32 bit. However, when native clang target is used, then it will
+ compile these types based on the underlying architecture's conventions,
+ meaning in case of 32 bit architecture, pointer or long / unsigned
+ long types e.g. in BPF context structure will have width of 32 bit
+ while the BPF LLVM back end still operates in 64 bit. The native
+ target is mostly needed in tracing for the case of walking ``pt_regs``
+ or other kernel structures where CPU's register width matters.
+ Otherwise, ``clang -target bpf`` is generally recommended.
+
+You should use default target when:
+
+- Your program includes a header file, e.g., ptrace.h, which eventually
+ pulls in some header files containing file scope host assembly codes.
+
+- You can add ``-fno-jump-tables`` to work around the switch table issue.
+
+Otherwise, you can use ``bpf`` target. Additionally, you *must* use bpf target
+when:
+
+- Your program uses data structures with pointer or long / unsigned long
+ types that interface with BPF helpers or context data structures. Access
+ into these structures is verified by the BPF verifier and may result
+ in verification failures if the native architecture is not aligned with
+ the BPF architecture, e.g. 64-bit. An example of this is
+ BPF_PROG_TYPE_SK_MSG require ``-target bpf``
+
+
+.. Links
+.. _Documentation/process/: https://www.kernel.org/doc/html/latest/process/
+.. _MAINTAINERS: ../../MAINTAINERS
+.. _Documentation/networking/netdev-FAQ.txt: ../networking/netdev-FAQ.txt
+.. _netdev FAQ: ../networking/netdev-FAQ.txt
+.. _samples/bpf/: ../../samples/bpf/
+.. _selftests: ../../tools/testing/selftests/bpf/
Happy BPF hacking!
^ permalink raw reply related
* [bpf-next PATCH 5/5] bpf, doc: howto use/run the BPF selftests
From: Jesper Dangaard Brouer @ 2018-05-14 13:42 UTC (permalink / raw)
To: Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer
Cc: netdev, Quentin Monnet, linux-man, linux-doc
In-Reply-To: <152630528901.29210.9018565600000101307.stgit@firesoul>
I always forget howto run the BPF selftests. Thus, lets add that info
to the QA document.
Documentation was based on Cilium's documentation:
http://cilium.readthedocs.io/en/latest/bpf/#verifying-the-setup
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
Documentation/bpf/bpf_devel_QA.rst | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst
index 2254bdeae990..0e7c1d946e83 100644
--- a/Documentation/bpf/bpf_devel_QA.rst
+++ b/Documentation/bpf/bpf_devel_QA.rst
@@ -417,6 +417,33 @@ submitted by the BPF maintainers to the stable maintainers.
Testing patches
===============
+Q: How to run BPF selftests
+---------------------------
+A: After you have booted into the newly compiled kernel, navigate to
+the BPF selftests_ suite in order to test BPF functionality (current
+working directory points to the root of the cloned git tree)::
+
+ $ cd tools/testing/selftests/bpf/
+ $ make
+
+To run the verifier tests::
+
+ $ sudo ./test_verifier
+
+The verifier tests print out all the current checks being
+performed. The summary at the end of running all tests will dump
+information of test successes and failures::
+
+ Summary: 418 PASSED, 0 FAILED
+
+In order to run through all BPF selftests, the following command is
+needed::
+
+ $ sudo make run_tests
+
+See the kernels selftest `Documentation/dev-tools/kselftest.rst`_
+document for further documentation.
+
Q: Which BPF kernel selftests version should I run my kernel against?
---------------------------------------------------------------------
A: If you run a kernel ``xyz``, then always run the BPF kernel selftests
@@ -607,5 +634,7 @@ when:
.. _netdev FAQ: ../networking/netdev-FAQ.txt
.. _samples/bpf/: ../../samples/bpf/
.. _selftests: ../../tools/testing/selftests/bpf/
+.. _Documentation/dev-tools/kselftest.rst:
+ https://www.kernel.org/doc/html/latest/dev-tools/kselftest.html
Happy BPF hacking!
^ permalink raw reply related
* [linux-stable-3.16.y] tun: allow positive return values on dev_get_valid_name() call
From: Sedat Dilek @ 2018-05-14 13:47 UTC (permalink / raw)
To: Ben Hutchings
Cc: debian-kernel, Cong Wang, netdev, David S. Miller,
Michael S. Tsirkin, Signed-off-by: Julien Gomes
Hi Ben,
some Debian/jessie systems were caught by the bug-report in [1].
This issue was recently fixed in an updated Debian kernel for v3.16.y.
Will you include the patch "tun: allow positive return values on
dev_get_valid_name() call" [2] in linux-stable-3.16.y upstream?
This was a fix for "tun: call dev_get_valid_name() before
register_netdevice()" [3].
Unfortunately, there is not a reference (usually "Fixes:" tag) for this in [2].
Not sure if this was documented in the meantime like [4] says.
Thanks in advance.
Regards,
- Sedat -
[1] https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=897427
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5c25f65fd1e42685f7ccd80e0621829c105785d9
[3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0ad646c81b2182f7fa67ec0c8c825e0ee165696d
[4] https://www.spinics.net/lists/netdev/msg462705.html
^ permalink raw reply
* Re: [PATCH 2/5] crypto: chtls: wait for memory sendmsg, sendpage
From: kbuild test robot @ 2018-05-14 14:04 UTC (permalink / raw)
To: Atul Gupta
Cc: kbuild-all, herbert, linux-crypto, gustavo, dan.carpenter, netdev,
davem, atul.gupta
In-Reply-To: <1526295659-29839-3-git-send-email-atul.gupta@chelsio.com>
[-- Attachment #1: Type: text/plain, Size: 3386 bytes --]
Hi Atul,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on cryptodev/master]
[also build test WARNING on v4.17-rc5 next-20180514]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Atul-Gupta/build-warnings-cleanup/20180514-213306
base: https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git master
config: i386-randconfig-x009-201819 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
All warnings (new ones prefixed by >>):
drivers/crypto/chelsio/chtls/chtls_io.c: In function 'csk_wait_memory':
>> drivers/crypto/chelsio/chtls/chtls_io.c:946:4: warning: this 'if' clause does not guard... [-Wmisleading-indentation]
if (noblock)
^~
drivers/crypto/chelsio/chtls/chtls_io.c:948:5: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'
goto do_nonblock;
^~~~
vim +/if +946 drivers/crypto/chelsio/chtls/chtls_io.c
921
922 static int csk_wait_memory(struct chtls_dev *cdev,
923 struct sock *sk, long *timeo_p)
924 {
925 DEFINE_WAIT_FUNC(wait, woken_wake_function);
926 int sndbuf, err = 0;
927 long current_timeo;
928 long vm_wait = 0;
929 bool noblock;
930
931 current_timeo = *timeo_p;
932 noblock = (*timeo_p ? false : true);
933 sndbuf = cdev->max_host_sndbuf;
934 if (sndbuf > sk->sk_wmem_queued) {
935 current_timeo = (prandom_u32() % (HZ / 5)) + 2;
936 vm_wait = (prandom_u32() % (HZ / 5)) + 2;
937 }
938
939 add_wait_queue(sk_sleep(sk), &wait);
940 while (1) {
941 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
942
943 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
944 goto do_error;
945 if (!*timeo_p) {
> 946 if (noblock)
947 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
948 goto do_nonblock;
949 }
950 if (signal_pending(current))
951 goto do_interrupted;
952 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
953 if (sndbuf > sk->sk_wmem_queued && !vm_wait)
954 break;
955
956 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
957 sk->sk_write_pending++;
958 sk_wait_event(sk, ¤t_timeo, sk->sk_err ||
959 (sk->sk_shutdown & SEND_SHUTDOWN) ||
960 (sndbuf > sk->sk_wmem_queued && !vm_wait), &wait);
961 sk->sk_write_pending--;
962
963 if (vm_wait) {
964 vm_wait -= current_timeo;
965 current_timeo = *timeo_p;
966 if (current_timeo != MAX_SCHEDULE_TIMEOUT) {
967 current_timeo -= vm_wait;
968 if (current_timeo < 0)
969 current_timeo = 0;
970 }
971 vm_wait = 0;
972 }
973 *timeo_p = current_timeo;
974 }
975 out:
976 remove_wait_queue(sk_sleep(sk), &wait);
977 return err;
978 do_error:
979 err = -EPIPE;
980 goto out;
981 do_nonblock:
982 err = -EAGAIN;
983 goto out;
984 do_interrupted:
985 err = sock_intr_errno(*timeo_p);
986 goto out;
987 }
988
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 37181 bytes --]
^ permalink raw reply
* Re: [PATCH net-next] net:sched: add gkprio scheduler
From: Michel Machado @ 2018-05-14 14:08 UTC (permalink / raw)
To: Jamal Hadi Salim, Cong Wang
Cc: Nishanth Devarajan, Jiri Pirko, David Miller,
Linux Kernel Network Developers, Cody Doucette
In-Reply-To: <d429f4a6-4141-2cb2-f67e-036553a9a3dd@mojatatu.com>
> On 09/05/18 01:37 PM, Michel Machado wrote:
>> On 05/09/2018 10:43 AM, Jamal Hadi Salim wrote:
>>> On 08/05/18 10:27 PM, Cong Wang wrote:
>>>> On Tue, May 8, 2018 at 6:29 AM, Jamal Hadi Salim <jhs@mojatatu.com>
>>>> wrote:
>
>>
>> I like the suggestion of extending skbmod to mark skbprio based on ds.
>> Given that DSprio would no longer depend on the DS field, would you
>> have a name suggestion for this new queue discipline since the name
>> "prio" is currently in use?
>>
>
> Not sure what to call it.
> My struggle is still with the intended end goal of the qdisc.
> It looks like prio qdisc except for the enqueue part which attempts
> to use a shared global queue size for all prios. I would have
> pointed to other approaches which use global priority queue pool
> which do early congestion detection like RED or variants like GRED but
> those use average values of the queue lengths not instantenous values
> such as you do.
> I am tempted to say - based on my current understanding - that you dont
> need a new qdisc; rather you need to map your dsfields to skbprio
> (via skbmod) and stick with prio qdisc. I also think the skbmod
> mapping is useful regardless of this need.
A simplified description of what DSprio is meant to do is as follows:
when a link is overloaded at a router, DSprio makes this router drop the
packets of lower priority. These priorities are assigned by Gatekeeper
in such a way that well behaving sources are favored (Theorem 4.1 of the
Portcullis paper pointed out in my previous email). Moreover, attackers
cannot do much better than well behaving sources (Theorem 4.2). This
description is simplified because it omits many other components of
Gatekeeper that affects the packets that goes to DSprio.
Like you, I'm all in for less code. If someone can instruct us on how to
accomplish the same thing that our patch is doing, we would be happy to
withdraw it. We have submitted this patch because we want to lower the
bar to deploy Gatekeeper as much as possible, and requiring network
operators willing to deploy Gatekeeper to keep patching the kernel is an
operational burden.
>> What should be the range of priorities that this new queue discipline
>> would accept? skb->prioriry is of type __u32, but supporting 2^32
>> priorities would require too large of an array to index packets by
>> priority; the DS field is only 6 bits long. Do you have a use case in
>> mind to guide us here?
>>
>
> Look at the priomap or prio2band arrangement on prio qdisc
> or pfifo_fast qdisc. You take an skbprio as an index into the array
> and retrieve a queue to enqueue to. The size of the array is 16.
> In the past this was based IIRC on ip precedence + 1 bit. Those map
> similarly to DS fields (calls selectors, assured forwarding etc). So
> no need to even increase the array beyond current 16.
What application is this change supposed to enable or help? I think this
change should be left for when one can explain the need for it.
>>> 2) Dropping already enqueued packets will not work well for
>>> local feedback (__NET_XMIT_BYPASS return code is about the
>>> packet that has been dropped from earlier enqueueing because
>>> it is lower priority - it does not signify anything with
>>> current skb to which actually just got enqueud).
>>> Perhaps (off top of my head) is to always enqueue packets on
>>> high priority when their limit is exceeded as long as lower prio has
>>> some space. Means youd have to increment low prio accounting if their
>>> space is used.
>>
>> I don't understand the point you are making here. Could you develop it
>> further?
>>
>
> Sorry - I was meaning NET_XMIT_CN
> If you drop an already enqueued packet - it makes sense to signify as
> such using NET_XMIT_CN
> this does not make sense for forwarded packets but it does
> for locally sourced packets.
Thank you for bringing this detail to our attention; we've overlooked
the return code NET_XMIT_CN. We'll adopt it when the queue is full and
the lowest priority packet in the queue is being dropped to make room
for the higher-priority, incoming packet.
[ ]'s
Michel Machado
^ permalink raw reply
* Re: [linux-stable-3.16.y] tun: allow positive return values on dev_get_valid_name() call
From: Ben Hutchings @ 2018-05-14 14:24 UTC (permalink / raw)
To: sedat.dilek
Cc: debian-kernel, Cong Wang, netdev, David S. Miller,
Michael S. Tsirkin, Signed-off-by: Julien Gomes
In-Reply-To: <CA+icZUUMuUVabftqNBgARuVj3XvXKbxWyyr9J5xOjbjYEeNYrQ@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 1217 bytes --]
On Mon, 2018-05-14 at 15:47 +0200, Sedat Dilek wrote:
> Hi Ben,
>
> some Debian/jessie systems were caught by the bug-report in [1].
> This issue was recently fixed in an updated Debian kernel for v3.16.y.
>
> Will you include the patch "tun: allow positive return values on
> dev_get_valid_name() call" [2] in linux-stable-3.16.y upstream?
Yes, I will include all the same regression fixes in the next 3.16-
stable update.
Ben.
> This was a fix for "tun: call dev_get_valid_name() before
> register_netdevice()" [3].
> Unfortunately, there is not a reference (usually "Fixes:" tag) for this in [2].
> Not sure if this was documented in the meantime like [4] says.
>
> Thanks in advance.
>
> Regards,
> - Sedat -
>
> [1] https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=897427
> [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5c25f65fd1e42685f7ccd80e0621829c105785d9
> [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0ad646c81b2182f7fa67ec0c8c825e0ee165696d
> [4] https://www.spinics.net/lists/netdev/msg462705.html
--
Ben Hutchings
For every action, there is an equal and opposite criticism. - Harrison
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply
* [PATCH 00/14] Modify action API for implementing lockless actions
From: Vlad Buslov @ 2018-05-14 14:27 UTC (permalink / raw)
To: netdev
Cc: davem, jhs, xiyou.wangcong, jiri, pablo, kadlec, fw, ast, daniel,
edumazet, vladbu, keescook, linux-kernel, netfilter-devel,
coreteam, kliteyn
Currently, all netlink protocol handlers for updating rules, actions and
qdiscs are protected with single global rtnl lock which removes any
possibility for parallelism. This patch set is a first step to remove
rtnl lock dependency from TC rules update path. It updates act API to
use atomic operations, rcu and spinlocks for fine-grained locking. It
also extend API with functions that are needed to update existing
actions for parallel execution.
Outline of changes:
- Change tc action to use atomic reference and bind counters, rcu
mechanism for cookie update.
- Extend action ops API with 'delete' function and 'unlocked' flag.
- Change action API to work with actions in lockless manner based on
primitives implemented in previous patches.
- Extend action API with new functions necessary to implement unlocked
actions.
Vlad Buslov (14):
net: sched: use rcu for action cookie update
net: sched: change type of reference and bind counters
net: sched: add 'delete' function to action ops
net: sched: implement unlocked action init API
net: sched: always take reference to action
net: sched: implement reference counted action release
net: sched: use reference counting action init
net: sched: account for temporary action reference
net: sched: don't release reference on action overwrite
net: sched: extend act API for lockless actions
net: core: add new/replace rate estimator lock parameter
net: sched: retry action check-insert on concurrent modification
net: sched: use unique idr insert function in unlocked actions
net: sched: implement delete for all actions
include/net/act_api.h | 16 ++-
include/net/gen_stats.h | 2 +
include/net/pkt_cls.h | 1 +
net/core/gen_estimator.c | 58 ++++++---
net/netfilter/xt_RATEEST.c | 2 +-
net/sched/act_api.c | 298 ++++++++++++++++++++++++++++++++-------------
net/sched/act_bpf.c | 33 +++--
net/sched/act_connmark.c | 29 +++--
net/sched/act_csum.c | 33 +++--
net/sched/act_gact.c | 30 +++--
net/sched/act_ife.c | 37 ++++--
net/sched/act_ipt.c | 41 +++++--
net/sched/act_mirred.c | 32 +++--
net/sched/act_nat.c | 29 +++--
net/sched/act_pedit.c | 30 +++--
net/sched/act_police.c | 35 ++++--
net/sched/act_sample.c | 33 +++--
net/sched/act_simple.c | 31 +++--
net/sched/act_skbedit.c | 30 +++--
net/sched/act_skbmod.c | 33 +++--
net/sched/act_tunnel_key.c | 34 ++++--
net/sched/act_vlan.c | 34 ++++--
net/sched/cls_api.c | 6 +-
net/sched/sch_api.c | 2 +
net/sched/sch_cbq.c | 4 +-
net/sched/sch_drr.c | 4 +-
net/sched/sch_hfsc.c | 4 +-
net/sched/sch_htb.c | 4 +-
net/sched/sch_qfq.c | 4 +-
29 files changed, 673 insertions(+), 256 deletions(-)
--
2.7.5
^ permalink raw reply
* [PATCH 01/14] net: sched: use rcu for action cookie update
From: Vlad Buslov @ 2018-05-14 14:27 UTC (permalink / raw)
To: netdev
Cc: davem, jhs, xiyou.wangcong, jiri, pablo, kadlec, fw, ast, daniel,
edumazet, vladbu, keescook, linux-kernel, netfilter-devel,
coreteam, kliteyn
In-Reply-To: <1526308035-12484-1-git-send-email-vladbu@mellanox.com>
Implement functions to atomically update and free action cookie
using rcu mechanism.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---
include/net/act_api.h | 2 +-
include/net/pkt_cls.h | 1 +
net/sched/act_api.c | 44 ++++++++++++++++++++++++++++++--------------
3 files changed, 32 insertions(+), 15 deletions(-)
diff --git a/include/net/act_api.h b/include/net/act_api.h
index 9e59ebf..f7b59ef 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -37,7 +37,7 @@ struct tc_action {
spinlock_t tcfa_lock;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
struct gnet_stats_queue __percpu *cpu_qstats;
- struct tc_cookie *act_cookie;
+ struct tc_cookie __rcu *act_cookie;
struct tcf_chain *goto_chain;
};
#define tcf_index common.tcfa_index
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index e828d31..3068cc8 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -769,6 +769,7 @@ struct tc_mqprio_qopt_offload {
struct tc_cookie {
u8 *data;
u32 len;
+ struct rcu_head rcu;
};
struct tc_qopt_offload_stats {
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 72251241..ce829a3 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -55,6 +55,24 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a,
res->goto_tp = rcu_dereference_bh(chain->filter_chain);
}
+static void tcf_free_cookie_rcu(struct rcu_head *p)
+{
+ struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu);
+
+ kfree(cookie->data);
+ kfree(cookie);
+}
+
+static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
+ struct tc_cookie *new_cookie)
+{
+ struct tc_cookie *old;
+
+ old = xchg(old_cookie, new_cookie);
+ if (old)
+ call_rcu(&old->rcu, tcf_free_cookie_rcu);
+}
+
/* XXX: For standalone actions, we don't need a RCU grace period either, because
* actions are always connected to filters and filters are already destroyed in
* RCU callbacks, so after a RCU grace period actions are already disconnected
@@ -65,10 +83,7 @@ static void free_tcf(struct tc_action *p)
free_percpu(p->cpu_bstats);
free_percpu(p->cpu_qstats);
- if (p->act_cookie) {
- kfree(p->act_cookie->data);
- kfree(p->act_cookie);
- }
+ tcf_set_action_cookie(&p->act_cookie, NULL);
if (p->goto_chain)
tcf_action_goto_chain_fini(p);
@@ -567,16 +582,22 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
int err = -EINVAL;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
+ struct tc_cookie *cookie;
if (nla_put_string(skb, TCA_KIND, a->ops->kind))
goto nla_put_failure;
if (tcf_action_copy_stats(skb, a, 0))
goto nla_put_failure;
- if (a->act_cookie) {
- if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len,
- a->act_cookie->data))
+
+ rcu_read_lock();
+ cookie = rcu_dereference(a->act_cookie);
+ if (cookie) {
+ if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) {
+ rcu_read_unlock();
goto nla_put_failure;
+ }
}
+ rcu_read_unlock();
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
@@ -719,13 +740,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
if (err < 0)
goto err_mod;
- if (name == NULL && tb[TCA_ACT_COOKIE]) {
- if (a->act_cookie) {
- kfree(a->act_cookie->data);
- kfree(a->act_cookie);
- }
- a->act_cookie = cookie;
- }
+ if (!name && tb[TCA_ACT_COOKIE])
+ tcf_set_action_cookie(&a->act_cookie, cookie);
/* module count goes up only when brand new policy is created
* if it exists and is only bound to in a_o->init() then
--
2.7.5
^ permalink raw reply related
* [PATCH 02/14] net: sched: change type of reference and bind counters
From: Vlad Buslov @ 2018-05-14 14:27 UTC (permalink / raw)
To: netdev
Cc: davem, jhs, xiyou.wangcong, jiri, pablo, kadlec, fw, ast, daniel,
edumazet, vladbu, keescook, linux-kernel, netfilter-devel,
coreteam, kliteyn
In-Reply-To: <1526308035-12484-1-git-send-email-vladbu@mellanox.com>
Change type of action reference counter to refcount_t.
Change type of action bind counter to atomic_t.
This type is used to allow decrementing bind counter without testing
for 0 result.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---
include/net/act_api.h | 5 +++--
net/sched/act_api.c | 32 ++++++++++++++++++++++----------
net/sched/act_bpf.c | 4 ++--
net/sched/act_connmark.c | 4 ++--
net/sched/act_csum.c | 4 ++--
net/sched/act_gact.c | 4 ++--
net/sched/act_ife.c | 4 ++--
net/sched/act_ipt.c | 4 ++--
net/sched/act_mirred.c | 4 ++--
net/sched/act_nat.c | 4 ++--
net/sched/act_pedit.c | 4 ++--
net/sched/act_police.c | 4 ++--
net/sched/act_sample.c | 4 ++--
net/sched/act_simple.c | 4 ++--
net/sched/act_skbedit.c | 4 ++--
net/sched/act_skbmod.c | 4 ++--
net/sched/act_tunnel_key.c | 4 ++--
net/sched/act_vlan.c | 4 ++--
18 files changed, 57 insertions(+), 44 deletions(-)
diff --git a/include/net/act_api.h b/include/net/act_api.h
index f7b59ef..e634014 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -6,6 +6,7 @@
* Public action API for classifiers/qdiscs
*/
+#include <linux/refcount.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/net_namespace.h>
@@ -26,8 +27,8 @@ struct tc_action {
struct tcf_idrinfo *idrinfo;
u32 tcfa_index;
- int tcfa_refcnt;
- int tcfa_bindcnt;
+ refcount_t tcfa_refcnt;
+ atomic_t tcfa_bindcnt;
u32 tcfa_capab;
int tcfa_action;
struct tcf_t tcfa_tm;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index ce829a3..96b0878 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -105,14 +105,26 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
ASSERT_RTNL();
+ /* Release with strict==1 and bind==0 is only called through act API
+ * interface (classifiers always bind). Only case when action with
+ * positive reference count and zero bind count can exist is when it was
+ * also created with act API (unbinding last classifier will destroy the
+ * action if it was created by classifier). So only case when bind count
+ * can be changed after initial check is when unbound action is
+ * destroyed by act API while classifier binds to action with same id
+ * concurrently. This result either creation of new action(same behavior
+ * as before), or reusing existing action if concurrent process
+ * increments reference count before action is deleted. Both scenarios
+ * are acceptable.
+ */
if (p) {
if (bind)
- p->tcfa_bindcnt--;
- else if (strict && p->tcfa_bindcnt > 0)
+ atomic_dec(&p->tcfa_bindcnt);
+ else if (strict && atomic_read(&p->tcfa_bindcnt) > 0)
return -EPERM;
- p->tcfa_refcnt--;
- if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
+ if (atomic_read(&p->tcfa_bindcnt) <= 0 &&
+ refcount_dec_and_test(&p->tcfa_refcnt)) {
if (p->ops->cleanup)
p->ops->cleanup(p);
tcf_idr_remove(p->idrinfo, p);
@@ -304,8 +316,8 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
if (index && p) {
if (bind)
- p->tcfa_bindcnt++;
- p->tcfa_refcnt++;
+ atomic_inc(&p->tcfa_bindcnt);
+ refcount_inc(&p->tcfa_refcnt);
*a = p;
return true;
}
@@ -324,9 +336,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
if (unlikely(!p))
return -ENOMEM;
- p->tcfa_refcnt = 1;
+ refcount_set(&p->tcfa_refcnt, 1);
if (bind)
- p->tcfa_bindcnt = 1;
+ atomic_set(&p->tcfa_bindcnt, 1);
if (cpustats) {
p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
@@ -782,7 +794,7 @@ static void cleanup_a(struct list_head *actions, int ovr)
return;
list_for_each_entry(a, actions, list)
- a->tcfa_refcnt--;
+ refcount_dec(&a->tcfa_refcnt);
}
int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
@@ -810,7 +822,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
act->order = i;
sz += tcf_action_fill_size(act);
if (ovr)
- act->tcfa_refcnt++;
+ refcount_inc(&act->tcfa_refcnt);
list_add_tail(&act->list, actions);
}
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 18089c0..15a2a53 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -141,8 +141,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act,
struct tcf_bpf *prog = to_bpf(act);
struct tc_act_bpf opt = {
.index = prog->tcf_index,
- .refcnt = prog->tcf_refcnt - ref,
- .bindcnt = prog->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&prog->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind,
.action = prog->tcf_action,
};
struct tcf_t tm;
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index e4b880f..1888650 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -154,8 +154,8 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
struct tc_connmark opt = {
.index = ci->tcf_index,
- .refcnt = ci->tcf_refcnt - ref,
- .bindcnt = ci->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
.action = ci->tcf_action,
.zone = ci->zone,
};
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 7e28b2c..0572682 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -597,8 +597,8 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
struct tcf_csum_params *params;
struct tc_csum opt = {
.index = p->tcf_index,
- .refcnt = p->tcf_refcnt - ref,
- .bindcnt = p->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&p->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
};
struct tcf_t t;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 4dc4f15..ca83deb 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -169,8 +169,8 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_gact *gact = to_gact(a);
struct tc_gact opt = {
.index = gact->tcf_index,
- .refcnt = gact->tcf_refcnt - ref,
- .bindcnt = gact->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&gact->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind,
.action = gact->tcf_action,
};
struct tcf_t t;
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index a5994cf..689f63e 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -598,8 +598,8 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
struct tcf_ife_params *p = rtnl_dereference(ife->params);
struct tc_ife opt = {
.index = ife->tcf_index,
- .refcnt = ife->tcf_refcnt - ref,
- .bindcnt = ife->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&ife->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind,
.action = ife->tcf_action,
.flags = p->flags,
};
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 14c312d..7bce88d 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -280,8 +280,8 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind,
if (unlikely(!t))
goto nla_put_failure;
- c.bindcnt = ipt->tcf_bindcnt - bind;
- c.refcnt = ipt->tcf_refcnt - ref;
+ c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind;
+ c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref;
strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) ||
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index fd34015..82a8bdd 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -250,8 +250,8 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
struct tc_mirred opt = {
.index = m->tcf_index,
.action = m->tcf_action,
- .refcnt = m->tcf_refcnt - ref,
- .bindcnt = m->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&m->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&m->tcf_bindcnt) - bind,
.eaction = m->tcfm_eaction,
.ifindex = dev ? dev->ifindex : 0,
};
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 4b5848b..457c2ae 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -257,8 +257,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
.index = p->tcf_index,
.action = p->tcf_action,
- .refcnt = p->tcf_refcnt - ref,
- .bindcnt = p->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&p->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
};
struct tcf_t t;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 8a925c7..0102b29 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -391,8 +391,8 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
opt->nkeys = p->tcfp_nkeys;
opt->flags = p->tcfp_flags;
opt->action = p->tcf_action;
- opt->refcnt = p->tcf_refcnt - ref;
- opt->bindcnt = p->tcf_bindcnt - bind;
+ opt->refcnt = refcount_read(&p->tcf_refcnt) - ref;
+ opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind;
if (p->tcfp_keys_ex) {
tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys);
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 4e72bc2..a789b80 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -274,8 +274,8 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a,
.action = police->tcf_action,
.mtu = police->tcfp_mtu,
.burst = PSCHED_NS2TICKS(police->tcfp_burst),
- .refcnt = police->tcf_refcnt - ref,
- .bindcnt = police->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&police->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&police->tcf_bindcnt) - bind,
};
struct tcf_t t;
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 5db3584..4a46978 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -173,8 +173,8 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a,
struct tc_sample opt = {
.index = s->tcf_index,
.action = s->tcf_action,
- .refcnt = s->tcf_refcnt - ref,
- .bindcnt = s->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&s->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&s->tcf_bindcnt) - bind,
};
struct tcf_t t;
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 9618b4a8..95d5985 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -148,8 +148,8 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_defact *d = to_defact(a);
struct tc_defact opt = {
.index = d->tcf_index,
- .refcnt = d->tcf_refcnt - ref,
- .bindcnt = d->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&d->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
.action = d->tcf_action,
};
struct tcf_t t;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index ddf69fc..adf0a72 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -172,8 +172,8 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_skbedit *d = to_skbedit(a);
struct tc_skbedit opt = {
.index = d->tcf_index,
- .refcnt = d->tcf_refcnt - ref,
- .bindcnt = d->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&d->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
.action = d->tcf_action,
};
struct tcf_t t;
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index bbcbdce..a0d9abb 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -202,8 +202,8 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p);
struct tc_skbmod opt = {
.index = d->tcf_index,
- .refcnt = d->tcf_refcnt - ref,
- .bindcnt = d->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&d->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&d->tcf_bindcnt) - bind,
.action = d->tcf_action,
};
struct tcf_t t;
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 626dac8..c6e5069 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -252,8 +252,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_tunnel_key_params *params;
struct tc_tunnel_key opt = {
.index = t->tcf_index,
- .refcnt = t->tcf_refcnt - ref,
- .bindcnt = t->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&t->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&t->tcf_bindcnt) - bind,
};
struct tcf_t tm;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 8536046..8dda784 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -237,8 +237,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p);
struct tc_vlan opt = {
.index = v->tcf_index,
- .refcnt = v->tcf_refcnt - ref,
- .bindcnt = v->tcf_bindcnt - bind,
+ .refcnt = refcount_read(&v->tcf_refcnt) - ref,
+ .bindcnt = atomic_read(&v->tcf_bindcnt) - bind,
.action = v->tcf_action,
.v_action = p->tcfv_action,
};
--
2.7.5
^ permalink raw reply related
* [PATCH 04/14] net: sched: implement unlocked action init API
From: Vlad Buslov @ 2018-05-14 14:27 UTC (permalink / raw)
To: netdev
Cc: davem, jhs, xiyou.wangcong, jiri, pablo, kadlec, fw, ast, daniel,
edumazet, vladbu, keescook, linux-kernel, netfilter-devel,
coreteam, kliteyn
In-Reply-To: <1526308035-12484-1-git-send-email-vladbu@mellanox.com>
Add additional 'unlocked' argument to act API init functions.
Argument is true when rtnl lock is not taken and false otherwise.
It is required to implement actions that need to release rtnl lock before
loading kernel module and reacquire if afterwards.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---
include/net/act_api.h | 6 ++++--
net/sched/act_api.c | 17 ++++++++++-------
net/sched/act_bpf.c | 3 ++-
net/sched/act_connmark.c | 2 +-
net/sched/act_csum.c | 3 ++-
net/sched/act_gact.c | 3 ++-
net/sched/act_ife.c | 3 ++-
net/sched/act_ipt.c | 6 ++++--
net/sched/act_mirred.c | 5 +++--
net/sched/act_nat.c | 2 +-
net/sched/act_pedit.c | 3 ++-
net/sched/act_police.c | 2 +-
net/sched/act_sample.c | 3 ++-
net/sched/act_simple.c | 3 ++-
net/sched/act_skbedit.c | 3 ++-
net/sched/act_skbmod.c | 3 ++-
net/sched/act_tunnel_key.c | 3 ++-
net/sched/act_vlan.c | 3 ++-
net/sched/cls_api.c | 5 +++--
19 files changed, 49 insertions(+), 29 deletions(-)
diff --git a/include/net/act_api.h b/include/net/act_api.h
index 73175a3..a8c8570 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -92,7 +92,8 @@ struct tc_action_ops {
struct netlink_ext_ack *extack);
int (*init)(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **act, int ovr,
- int bind, struct netlink_ext_ack *extack);
+ int bind, bool unlocked,
+ struct netlink_ext_ack *extack);
int (*walk)(struct net *, struct sk_buff *,
struct netlink_callback *, int,
const struct tc_action_ops *,
@@ -168,11 +169,12 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions,
int nr_actions, struct tcf_result *res);
int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
- struct list_head *actions, size_t *attr_size,
+ struct list_head *actions, size_t *attr_size, bool unlocked,
struct netlink_ext_ack *extack);
struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind,
+ bool unlocked,
struct netlink_ext_ack *extack);
int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int);
int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 96b0878..1331beb 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -671,6 +671,7 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind,
+ bool unlocked,
struct netlink_ext_ack *extack)
{
struct tc_action *a;
@@ -721,9 +722,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
a_o = tc_lookup_action_n(act_name);
if (a_o == NULL) {
#ifdef CONFIG_MODULES
- rtnl_unlock();
+ if (!unlocked)
+ rtnl_unlock();
request_module("act_%s", act_name);
- rtnl_lock();
+ if (!unlocked)
+ rtnl_lock();
a_o = tc_lookup_action_n(act_name);
@@ -746,9 +749,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
/* backward compatibility for policer */
if (name == NULL)
err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
- extack);
+ unlocked, extack);
else
- err = a_o->init(net, nla, est, &a, ovr, bind, extack);
+ err = a_o->init(net, nla, est, &a, ovr, bind, unlocked, extack);
if (err < 0)
goto err_mod;
@@ -799,7 +802,7 @@ static void cleanup_a(struct list_head *actions, int ovr)
int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
struct nlattr *est, char *name, int ovr, int bind,
- struct list_head *actions, size_t *attr_size,
+ struct list_head *actions, size_t *attr_size, bool unlocked,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
@@ -814,7 +817,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind,
- extack);
+ unlocked, extack);
if (IS_ERR(act)) {
err = PTR_ERR(act);
goto err;
@@ -1173,7 +1176,7 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
LIST_HEAD(actions);
ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions,
- &attr_size, extack);
+ &attr_size, false, extack);
if (ret)
return ret;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 15a2a53..5d95c43 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -276,7 +276,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
static int tcf_bpf_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **act,
- int replace, int bind, struct netlink_ext_ack *extack)
+ int replace, int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, bpf_net_id);
struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 1888650..ff6444e 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -96,7 +96,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
static int tcf_connmark_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind,
+ int ovr, int bind, bool unlocked,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, connmark_net_id);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 0572682..a692ac1 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -46,7 +46,8 @@ static struct tc_action_ops act_csum_ops;
static int tcf_csum_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, csum_net_id);
struct tcf_csum_params *params_old, *params_new;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index ca83deb..0c536cd 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -56,7 +56,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
static int tcf_gact_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, gact_net_id);
struct nlattr *tb[TCA_GACT_MAX + 1];
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 689f63e..cb155cd 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -447,7 +447,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
static int tcf_ife_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ife_net_id);
struct nlattr *tb[TCA_IFE_MAX + 1];
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 7bce88d..0acf784 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -196,7 +196,8 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
static int tcf_ipt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
bind);
@@ -204,7 +205,8 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla,
static int tcf_xt_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
bind);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 82a8bdd..607da4b 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -68,8 +68,9 @@ static unsigned int mirred_net_id;
static struct tc_action_ops act_mirred_ops;
static int tcf_mirred_init(struct net *net, struct nlattr *nla,
- struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ struct nlattr *est, struct tc_action **a,
+ int ovr, int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, mirred_net_id);
struct nlattr *tb[TCA_MIRRED_MAX + 1];
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 457c2ae..2f2f045 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -37,7 +37,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
};
static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
- struct tc_action **a, int ovr, int bind,
+ struct tc_action **a, int ovr, int bind, bool unlocked,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, nat_net_id);
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 0102b29..117e486 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -132,7 +132,8 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
static int tcf_pedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, pedit_net_id);
struct nlattr *tb[TCA_PEDIT_MAX + 1];
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index a789b80..2971ba3 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -75,7 +75,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
static int tcf_act_police_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind,
+ int ovr, int bind, bool unlocked,
struct netlink_ext_ack *extack)
{
int ret = 0, err;
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 4a46978..9bbd8e9 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -37,7 +37,8 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
static int tcf_sample_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a, int ovr,
- int bind, struct netlink_ext_ack *extack)
+ int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, sample_net_id);
struct nlattr *tb[TCA_SAMPLE_MAX + 1];
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 95d5985..162f091 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -79,7 +79,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
static int tcf_simp_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, simp_net_id);
struct nlattr *tb[TCA_DEF_MAX + 1];
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index adf0a72..4b9d616 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -66,7 +66,8 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbedit_net_id);
struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index a0d9abb..c1f9eda 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -84,7 +84,8 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, skbmod_net_id);
struct nlattr *tb[TCA_SKBMOD_MAX + 1];
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index c6e5069..e4f9718 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -70,7 +70,8 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
static int tunnel_key_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 8dda784..6a949f5 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -109,7 +109,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
static int tcf_vlan_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action **a,
- int ovr, int bind, struct netlink_ext_ack *extack)
+ int ovr, int bind, bool unlocked,
+ struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, vlan_net_id);
struct nlattr *tb[TCA_VLAN_MAX + 1];
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index b66754f..bcba94b 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1438,7 +1438,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
if (exts->police && tb[exts->police]) {
act = tcf_action_init_1(net, tp, tb[exts->police],
rate_tlv, "police", ovr,
- TCA_ACT_BIND, extack);
+ TCA_ACT_BIND, false, extack);
if (IS_ERR(act))
return PTR_ERR(act);
@@ -1451,7 +1451,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
err = tcf_action_init(net, tp, tb[exts->action],
rate_tlv, NULL, ovr, TCA_ACT_BIND,
- &actions, &attr_size, extack);
+ &actions, &attr_size, false,
+ extack);
if (err)
return err;
list_for_each_entry(act, &actions, list)
--
2.7.5
^ permalink raw reply related
* [PATCH 08/14] net: sched: account for temporary action reference
From: Vlad Buslov @ 2018-05-14 14:27 UTC (permalink / raw)
To: netdev
Cc: davem, jhs, xiyou.wangcong, jiri, pablo, kadlec, fw, ast, daniel,
edumazet, vladbu, keescook, linux-kernel, netfilter-devel,
coreteam, kliteyn
In-Reply-To: <1526308035-12484-1-git-send-email-vladbu@mellanox.com>
tca_get_fill function has 'bind' and 'ref' arguments that get passed
down to action dump function. These arguments values are subtracted from
actual reference and bind counter values before writing them to skb.
In order to prevent concurrent action delete, RTM_GETACTION handler
acquires a reference to action before 'dumping' it and releases it
afterwards. This reference is temporal and should not be accounted by
userspace clients. (both logically and to preserver current API
behavior)
Use existing infrastructure of tca_get_fill arguments to subtract that
temporary reference and not expose it to userspace.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---
net/sched/act_api.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 3f02cd1..2772276e 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -935,7 +935,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
if (!skb)
return -ENOBUFS;
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
- 0, 0) <= 0) {
+ 0, 1) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
kfree_skb(skb);
return -EINVAL;
@@ -1125,7 +1125,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
return -ENOBUFS;
if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
- 0, 1) <= 0) {
+ 0, 2) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes");
kfree_skb(skb);
return -EINVAL;
--
2.7.5
^ permalink raw reply related
* [PATCH 10/14] net: sched: extend act API for lockless actions
From: Vlad Buslov @ 2018-05-14 14:27 UTC (permalink / raw)
To: netdev
Cc: davem, jhs, xiyou.wangcong, jiri, pablo, kadlec, fw, ast, daniel,
edumazet, vladbu, keescook, linux-kernel, netfilter-devel,
coreteam, kliteyn
In-Reply-To: <1526308035-12484-1-git-send-email-vladbu@mellanox.com>
Implement new action API function to atomically delete action with
specified index and to atomically insert unique action. These functions are
required to implement init and delete functions for specific actions that
do not rely on rtnl lock.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---
include/net/act_api.h | 2 ++
net/sched/act_api.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 47 insertions(+)
diff --git a/include/net/act_api.h b/include/net/act_api.h
index a8c8570..bce0cf1 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -153,7 +153,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
struct tc_action **a, const struct tc_action_ops *ops,
int bind, bool cpustats);
void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a);
+void tcf_idr_insert_unique(struct tc_action_net *tn, struct tc_action *a);
+int tcf_idr_find_delete(struct tc_action_net *tn, u32 index);
int __tcf_idr_release(struct tc_action *a, bool bind, bool strict);
static inline int tcf_idr_release(struct tc_action *a, bool bind)
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 2772276e..a5193dc 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -330,6 +330,41 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
}
EXPORT_SYMBOL(tcf_idr_check);
+int tcf_idr_find_delete(struct tc_action_net *tn, u32 index)
+{
+ struct tcf_idrinfo *idrinfo = tn->idrinfo;
+ struct tc_action *p;
+ int ret = 0;
+
+ spin_lock_bh(&idrinfo->lock);
+ p = idr_find(&idrinfo->action_idr, index);
+ if (!p) {
+ spin_unlock(&idrinfo->lock);
+ return -ENOENT;
+ }
+
+ if (!atomic_read(&p->tcfa_bindcnt)) {
+ if (refcount_dec_and_test(&p->tcfa_refcnt)) {
+ struct module *owner = p->ops->owner;
+
+ WARN_ON(p != idr_remove(&idrinfo->action_idr,
+ p->tcfa_index));
+ spin_unlock_bh(&idrinfo->lock);
+
+ tcf_action_cleanup(p);
+ module_put(owner);
+ return 0;
+ }
+ ret = 0;
+ } else {
+ ret = -EPERM;
+ }
+
+ spin_unlock_bh(&idrinfo->lock);
+ return ret;
+}
+EXPORT_SYMBOL(tcf_idr_find_delete);
+
int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
struct tc_action **a, const struct tc_action_ops *ops,
int bind, bool cpustats)
@@ -407,6 +442,16 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
}
EXPORT_SYMBOL(tcf_idr_insert);
+void tcf_idr_insert_unique(struct tc_action_net *tn, struct tc_action *a)
+{
+ struct tcf_idrinfo *idrinfo = tn->idrinfo;
+
+ spin_lock_bh(&idrinfo->lock);
+ WARN_ON(idr_replace(&idrinfo->action_idr, a, a->tcfa_index));
+ spin_unlock_bh(&idrinfo->lock);
+}
+EXPORT_SYMBOL(tcf_idr_insert_unique);
+
void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
struct tcf_idrinfo *idrinfo)
{
--
2.7.5
^ permalink raw reply related
* [PATCH 11/14] net: core: add new/replace rate estimator lock parameter
From: Vlad Buslov @ 2018-05-14 14:27 UTC (permalink / raw)
To: netdev
Cc: davem, jhs, xiyou.wangcong, jiri, pablo, kadlec, fw, ast, daniel,
edumazet, vladbu, keescook, linux-kernel, netfilter-devel,
coreteam, kliteyn
In-Reply-To: <1526308035-12484-1-git-send-email-vladbu@mellanox.com>
Extend rate estimator new and replace APIs with additional spinlock
parameter used by lockless actions to protect rate_est pointer from
concurrent modification.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---
include/net/gen_stats.h | 2 ++
net/core/gen_estimator.c | 58 +++++++++++++++++++++++++++++++++++-----------
net/netfilter/xt_RATEEST.c | 2 +-
net/sched/act_api.c | 2 +-
net/sched/act_police.c | 2 +-
net/sched/sch_api.c | 2 ++
net/sched/sch_cbq.c | 4 ++--
net/sched/sch_drr.c | 4 ++--
net/sched/sch_hfsc.c | 4 ++--
net/sched/sch_htb.c | 4 ++--
net/sched/sch_qfq.c | 4 ++--
11 files changed, 61 insertions(+), 27 deletions(-)
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 0304ba2..d1ef63d 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -59,12 +59,14 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
+ spinlock_t *rate_est_lock,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt);
void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **ptr,
+ spinlock_t *rate_est_lock,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt);
bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 98fd127..3512720 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -107,11 +107,43 @@ static void est_timer(struct timer_list *t)
mod_timer(&est->timer, est->next_jiffies);
}
+static void __replace_estimator(struct net_rate_estimator __rcu **rate_est,
+ struct net_rate_estimator *new)
+{
+ struct net_rate_estimator *old = rcu_dereference_protected(*rate_est,
+ 1);
+
+ if (old) {
+ del_timer_sync(&old->timer);
+ new->avbps = old->avbps;
+ new->avpps = old->avpps;
+ }
+
+ new->next_jiffies = jiffies + ((HZ/4) << new->intvl_log);
+ timer_setup(&new->timer, est_timer, 0);
+ mod_timer(&new->timer, new->next_jiffies);
+
+ rcu_assign_pointer(*rate_est, new);
+
+ if (old)
+ kfree_rcu(old, rcu);
+}
+
+static void replace_estimator(struct net_rate_estimator __rcu **rate_est,
+ struct net_rate_estimator *new,
+ spinlock_t *rate_est_lock)
+{
+ spin_lock(rate_est_lock);
+ __replace_estimator(rate_est, new);
+ spin_unlock(rate_est_lock);
+}
+
/**
* gen_new_estimator - create a new rate estimator
* @bstats: basic statistics
* @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
+ * @rate_est_lock: rate_est lock (might be NULL)
* @stats_lock: statistics lock
* @running: qdisc running seqcount
* @opt: rate estimator configuration TLV
@@ -128,12 +160,13 @@ static void est_timer(struct timer_list *t)
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
+ spinlock_t *rate_est_lock,
spinlock_t *stats_lock,
seqcount_t *running,
struct nlattr *opt)
{
struct gnet_estimator *parm = nla_data(opt);
- struct net_rate_estimator *old, *est;
+ struct net_rate_estimator *est;
struct gnet_stats_basic_packed b;
int intvl_log;
@@ -167,20 +200,15 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
local_bh_enable();
est->last_bytes = b.bytes;
est->last_packets = b.packets;
- old = rcu_dereference_protected(*rate_est, 1);
- if (old) {
- del_timer_sync(&old->timer);
- est->avbps = old->avbps;
- est->avpps = old->avpps;
- }
- est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
- timer_setup(&est->timer, est_timer, 0);
- mod_timer(&est->timer, est->next_jiffies);
+ if (rate_est_lock)
+ replace_estimator(rate_est, est, rate_est_lock);
+ else
+ /* If no spinlock argument provided,
+ * then assume that caller is already synchronized.
+ */
+ __replace_estimator(rate_est, est);
- rcu_assign_pointer(*rate_est, est);
- if (old)
- kfree_rcu(old, rcu);
return 0;
}
EXPORT_SYMBOL(gen_new_estimator);
@@ -209,6 +237,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
* @bstats: basic statistics
* @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
+ * @rate_est_lock: rate_est lock (might be NULL)
* @stats_lock: statistics lock
* @running: qdisc running seqcount (might be NULL)
* @opt: rate estimator configuration TLV
@@ -221,10 +250,11 @@ EXPORT_SYMBOL(gen_kill_estimator);
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
+ spinlock_t *rate_est_lock,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt)
{
- return gen_new_estimator(bstats, cpu_bstats, rate_est,
+ return gen_new_estimator(bstats, cpu_bstats, rate_est, rate_est_lock,
stats_lock, running, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index dec843c..8e79bd5 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -154,7 +154,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
cfg.est.interval = info->interval;
cfg.est.ewma_log = info->ewma_log;
- ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est,
+ ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est, NULL,
&est->lock, NULL, &cfg.opt);
if (ret < 0)
goto err2;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index a5193dc..1dc092e 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -409,7 +409,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
p->tcfa_tm.firstuse = 0;
if (est) {
err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats,
- &p->tcfa_rate_est,
+ &p->tcfa_rate_est, NULL,
&p->tcfa_lock, NULL, est);
if (err)
goto err4;
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 86d9417..c480d68 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -133,7 +133,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
if (est) {
err = gen_replace_estimator(&police->tcf_bstats, NULL,
- &police->tcf_rate_est,
+ &police->tcf_rate_est, NULL,
&police->tcf_lock,
NULL, est);
if (err)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 106dae7e..de6a297 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1178,6 +1178,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
sch->cpu_bstats,
&sch->rate_est,
NULL,
+ NULL,
running,
tca[TCA_RATE]);
if (err) {
@@ -1253,6 +1254,7 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
sch->cpu_bstats,
&sch->rate_est,
NULL,
+ NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
}
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index f42025d..2a7ff53 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1503,7 +1503,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL,
- &cl->rate_est,
+ &cl->rate_est, NULL,
NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
@@ -1605,7 +1605,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
if (tca[TCA_RATE]) {
err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
- NULL,
+ NULL, NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err) {
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index e0b0cf8..0896e23 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -95,7 +95,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (cl != NULL) {
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL,
- &cl->rate_est,
+ &cl->rate_est, NULL,
NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
@@ -129,7 +129,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est,
- NULL,
+ NULL, NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err) {
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 3ae9877..f341324 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -972,7 +972,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL,
- &cl->rate_est,
+ &cl->rate_est, NULL,
NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
@@ -1042,7 +1042,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (tca[TCA_RATE]) {
err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
- NULL,
+ NULL, NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err) {
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 2a4ab7c..acc0355 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1407,7 +1407,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
if (htb_rate_est || tca[TCA_RATE]) {
err = gen_new_estimator(&cl->bstats, NULL,
&cl->rate_est,
- NULL,
+ NULL, NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE] ? : &est.nla);
if (err) {
@@ -1473,7 +1473,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
} else {
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL,
- &cl->rate_est,
+ &cl->rate_est, NULL,
NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index bb1a9c1..8026c1e 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -461,7 +461,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (cl != NULL) { /* modify existing class */
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL,
- &cl->rate_est,
+ &cl->rate_est, NULL,
NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
@@ -488,7 +488,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (tca[TCA_RATE]) {
err = gen_new_estimator(&cl->bstats, NULL,
&cl->rate_est,
- NULL,
+ NULL, NULL,
qdisc_root_sleeping_running(sch),
tca[TCA_RATE]);
if (err)
--
2.7.5
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox