linux-can.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 1/4] can: dev: Consolidate and unify state change handling.
@ 2014-09-26 18:25 Andri Yngvason
  2014-10-20 19:45 ` Wolfgang Grandegger
  2014-11-22 17:10 ` Marc Kleine-Budde
  0 siblings, 2 replies; 17+ messages in thread
From: Andri Yngvason @ 2014-09-26 18:25 UTC (permalink / raw)
  To: linux-can; +Cc: Wolfgang Grandegger

The handling of can error states is different between platforms.
This is an attempt to correct that problem.

I've moved this handling into a generic function for changing the
error state. This ensures that error state changes are handled
the same way everywhere (where this function is used).

What's changed from the last version of this patch-set is that
can_change_state() now relies on the individual states of the rx/tx
counters rather than their individual count values.

Signed-off-by: Andri Yngvason <andri.yngvason@marel.com>
---
 drivers/net/can/dev.c          | 106 +++++++++++++++++++++++++++++++++++++++++
 include/linux/can/dev.h        |   5 ++
 include/uapi/linux/can/error.h |   1 +
 3 files changed, 112 insertions(+)

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index 02492d2..f2ad15e 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -273,6 +273,112 @@ static int can_get_bittiming(struct net_device *dev,
struct can_bittiming *bt,
     return err;
 }
 
+static void can_update_error_counters(struct net_device *dev,
+                      enum can_state new_state)
+{
+    struct can_priv *priv = netdev_priv(dev);
+
+    if (new_state <= priv->state)
+        return;
+
+    switch (new_state) {
+    case CAN_STATE_ERROR_ACTIVE:
+        netdev_warn(dev, "%s: did we come from a state less than error-active?",
+                __func__);
+        break;
+    case CAN_STATE_ERROR_WARNING:
+        priv->can_stats.error_warning++;
+        break;
+    case CAN_STATE_ERROR_PASSIVE:
+        priv->can_stats.error_passive++;
+        break;
+    case CAN_STATE_BUS_OFF:
+        priv->can_stats.bus_off++;
+        break;
+    default:
+        netdev_warn(dev, "%s: %d is not a state", __func__, new_state);
+        break;
+    };
+}
+
+static int can_txstate_to_frame(struct net_device *dev, enum can_state state)
+{
+    switch (state) {
+    case CAN_STATE_ERROR_ACTIVE:
+        return CAN_ERR_CRTL_ACTIVE;
+    case CAN_STATE_ERROR_WARNING:
+        return CAN_ERR_CRTL_TX_WARNING;
+    case CAN_STATE_ERROR_PASSIVE:
+        return CAN_ERR_CRTL_TX_PASSIVE;
+    case CAN_STATE_BUS_OFF:
+        netdev_warn(dev, "%s: bus-off is not handled here", __func__);
+        return 0;
+    default:
+        netdev_warn(dev, "%s: %d is not a state", __func__, state);
+        return 0;
+    }
+}
+
+static int can_rxstate_to_frame(struct net_device *dev, enum can_state state)
+{
+    switch (state) {
+    case CAN_STATE_ERROR_ACTIVE:
+        return CAN_ERR_CRTL_ACTIVE;
+    case CAN_STATE_ERROR_WARNING:
+        return CAN_ERR_CRTL_RX_WARNING;
+    case CAN_STATE_ERROR_PASSIVE:
+        return CAN_ERR_CRTL_RX_PASSIVE;
+    case CAN_STATE_BUS_OFF:
+        netdev_warn(dev, "%s: bus-off is not handled here", __func__);
+        return 0;
+    default:
+        netdev_warn(dev, "%s: %d is not a state", __func__, state);
+        return 0;
+    }
+}
+
+void can_change_state(struct net_device *dev, struct can_frame *cf,
+              enum can_state new_state, enum can_state tx_state,
+              enum can_state rx_state)
+{
+    struct can_priv *priv = netdev_priv(dev);
+
+    if (unlikely(new_state == priv->state)) {
+        netdev_warn(dev, "%s: oops, state did not change", __func__);
+        return;
+    }
+
+    can_update_error_counters(dev, new_state);
+
+    if (unlikely(new_state == CAN_STATE_BUS_OFF)) {
+        cf->can_id |= CAN_ERR_BUSOFF;
+    } else {
+        cf->can_id |= CAN_ERR_CRTL;
+        if (tx_state > rx_state)
+            cf->data[1] |= can_txstate_to_frame(dev, tx_state);
+        else if (tx_state < rx_state)
+            cf->data[1] |= can_rxstate_to_frame(dev, rx_state);
+        else
+            cf->data[1] |= can_txstate_to_frame(dev, tx_state)
+                    |  can_rxstate_to_frame(dev, rx_state);
+    }
+
+    priv->state = new_state;
+}
+EXPORT_SYMBOL_GPL(can_change_state);
+
+enum can_state can_errcnt_to_state(unsigned int count)
+{
+    if (unlikely(count > 127))
+        return CAN_STATE_ERROR_PASSIVE;
+
+    if (unlikely(count > 96))
+        return CAN_STATE_ERROR_WARNING;
+
+    return CAN_STATE_ERROR_ACTIVE;
+}
+EXPORT_SYMBOL_GPL(can_errcnt_to_state);
+
 /*
  * Local echo of CAN messages
  *
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 6992afc..6505208 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -121,6 +121,11 @@ void unregister_candev(struct net_device *dev);
 int can_restart_now(struct net_device *dev);
 void can_bus_off(struct net_device *dev);
 
+void can_change_state(struct net_device *dev, struct can_frame *cf,
+              enum can_state new_state, enum can_state tx_state,
+              enum can_state rx_state);
+enum can_state can_errcnt_to_state(unsigned int count);
+
 void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev,
               unsigned int idx);
 unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx);
diff --git a/include/uapi/linux/can/error.h b/include/uapi/linux/can/error.h
index c247446..1c508be 100644
--- a/include/uapi/linux/can/error.h
+++ b/include/uapi/linux/can/error.h
@@ -71,6 +71,7 @@
 #define CAN_ERR_CRTL_TX_PASSIVE  0x20 /* reached error passive status TX */
                       /* (at least one error counter exceeds */
                       /* the protocol-defined level of 127)  */
+#define CAN_ERR_CRTL_ACTIVE      0x40 /* recovered to error active state */
 
 /* error in CAN protocol (type) / data[2] */
 #define CAN_ERR_PROT_UNSPEC      0x00 /* unspecified */
-- 
1.9.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change handling.
  2014-09-26 18:25 [PATCH v2 1/4] can: dev: Consolidate and unify state change handling Andri Yngvason
@ 2014-10-20 19:45 ` Wolfgang Grandegger
  2014-10-21 10:42   ` Andri Yngvason
  2014-11-22 17:10 ` Marc Kleine-Budde
  1 sibling, 1 reply; 17+ messages in thread
From: Wolfgang Grandegger @ 2014-10-20 19:45 UTC (permalink / raw)
  To: Andri Yngvason, linux-can

Hi Andri,

sorry for the long delay...

On 09/26/2014 08:25 PM, Andri Yngvason wrote:
> The handling of can error states is different between platforms.
> This is an attempt to correct that problem.
> 
> I've moved this handling into a generic function for changing the
> error state. This ensures that error state changes are handled
> the same way everywhere (where this function is used).
> 
> What's changed from the last version of this patch-set is that
> can_change_state() now relies on the individual states of the rx/tx
> counters rather than their individual count values.

To see if the state changes occur as expected could you please record
error message traces with candump for the following two scenarios:

1. send messages with cangen
   disconnect the cable
   reconnect the cable after a while until the error active state is
   reached.

2. set restart-ms=100
   send messages with cangen
   provoke a bus-off short-circuiting CAN low and high
   remove the short-circuit

Start with the SJA100 first.

Thanks,

Wolfgang.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change handling.
  2014-10-20 19:45 ` Wolfgang Grandegger
@ 2014-10-21 10:42   ` Andri Yngvason
  2014-10-21 10:52     ` Wolfgang Grandegger
  0 siblings, 1 reply; 17+ messages in thread
From: Andri Yngvason @ 2014-10-21 10:42 UTC (permalink / raw)
  To: Wolfgang Grandegger, linux-can


On mán 20.okt 2014 19:45, Wolfgang Grandegger wrote:
> Hi Andri,
>
> sorry for the long delay...
That's all right. I've been quite busy myself.
>
> On 09/26/2014 08:25 PM, Andri Yngvason wrote:
>> The handling of can error states is different between platforms.
>> This is an attempt to correct that problem.
>>
>> I've moved this handling into a generic function for changing the
>> error state. This ensures that error state changes are handled
>> the same way everywhere (where this function is used).
>>
>> What's changed from the last version of this patch-set is that
>> can_change_state() now relies on the individual states of the rx/tx
>> counters rather than their individual count values.
> To see if the state changes occur as expected could you please record
> error message traces with candump for the following two scenarios:
>
> 1. send messages with cangen
>    disconnect the cable
>    reconnect the cable after a while until the error active state is
>    reached.
I tried this the other day with flexcan. If there is nothing else
happening on the bus, the error state will not go back down, regardless
of whether the patches are applied or not. However, this does work if
another device is also sending on the bus.

Sadly, I did not save the logs.
>
> 2. set restart-ms=100
>    send messages with cangen
>    provoke a bus-off short-circuiting CAN low and high
>    remove the short-circuit
That actually worked as expected. It's supposed to restart the bus, right?
>
> Start with the SJA100 first.
>
I'll try to find some time for these experiments.

Thanks,
Andri

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change  handling.
  2014-10-21 10:42   ` Andri Yngvason
@ 2014-10-21 10:52     ` Wolfgang Grandegger
  2014-10-21 14:56       ` Andri Yngvason
  0 siblings, 1 reply; 17+ messages in thread
From: Wolfgang Grandegger @ 2014-10-21 10:52 UTC (permalink / raw)
  To: Andri Yngvason; +Cc: linux-can

On Tue, 21 Oct 2014 10:42:27 +0000, Andri Yngvason
<andri.yngvason@marel.com> wrote:
> On mán 20.okt 2014 19:45, Wolfgang Grandegger wrote:
>> Hi Andri,
>>
>> sorry for the long delay...
> That's all right. I've been quite busy myself.
>>
>> On 09/26/2014 08:25 PM, Andri Yngvason wrote:
>>> The handling of can error states is different between platforms.
>>> This is an attempt to correct that problem.
>>>
>>> I've moved this handling into a generic function for changing the
>>> error state. This ensures that error state changes are handled
>>> the same way everywhere (where this function is used).
>>>
>>> What's changed from the last version of this patch-set is that
>>> can_change_state() now relies on the individual states of the rx/tx
>>> counters rather than their individual count values.
>> To see if the state changes occur as expected could you please record
>> error message traces with candump for the following two scenarios:
>>
>> 1. send messages with cangen
>>    disconnect the cable
>>    reconnect the cable after a while until the error active state is
>>    reached.
> I tried this the other day with flexcan. If there is nothing else
> happening on the bus, the error state will not go back down, regardless
> of whether the patches are applied or not. However, this does work if
> another device is also sending on the bus.

If the device continues to send (because cangen is still running), the
error counter should decrease. Don't know if the Flexcan does it right
but it works like that on the SJA1000.

> 
> Sadly, I did not save the logs.
>>
>> 2. set restart-ms=100
>>    send messages with cangen
>>    provoke a bus-off short-circuiting CAN low and high
>>    remove the short-circuit
> That actually worked as expected. It's supposed to restart the bus,
right?

The bus will automatically be restarted after 100 ms. The interesting part
is
the flow of the state changes (error messages).

>> Start with the SJA100 first.
>>
> I'll try to find some time for these experiments.

Thanks,

Wolfgang.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change handling.
  2014-10-21 10:52     ` Wolfgang Grandegger
@ 2014-10-21 14:56       ` Andri Yngvason
  2014-10-21 15:21         ` Wolfgang Grandegger
  0 siblings, 1 reply; 17+ messages in thread
From: Andri Yngvason @ 2014-10-21 14:56 UTC (permalink / raw)
  To: Wolfgang Grandegger; +Cc: linux-can


On þri 21.okt 2014 10:52, Wolfgang Grandegger wrote:
> On Tue, 21 Oct 2014 10:42:27 +0000, Andri Yngvason
> <andri.yngvason@marel.com> wrote:
>> On mán 20.okt 2014 19:45, Wolfgang Grandegger wrote:
>>> Hi Andri,
>>>
>>> sorry for the long delay...
>> That's all right. I've been quite busy myself.
>>> On 09/26/2014 08:25 PM, Andri Yngvason wrote:
>>>> The handling of can error states is different between platforms.
>>>> This is an attempt to correct that problem.
>>>>
>>>> I've moved this handling into a generic function for changing the
>>>> error state. This ensures that error state changes are handled
>>>> the same way everywhere (where this function is used).
>>>>
>>>> What's changed from the last version of this patch-set is that
>>>> can_change_state() now relies on the individual states of the rx/tx
>>>> counters rather than their individual count values.
>>> To see if the state changes occur as expected could you please record
>>> error message traces with candump for the following two scenarios:
>>>
>>> 1. send messages with cangen
>>>    disconnect the cable
>>>    reconnect the cable after a while until the error active state is
>>>    reached.
>> I tried this the other day with flexcan. If there is nothing else
>> happening on the bus, the error state will not go back down, regardless
>> of whether the patches are applied or not. However, this does work if
>> another device is also sending on the bus.
> If the device continues to send (because cangen is still running), the
> error counter should decrease. Don't know if the Flexcan does it right
> but it works like that on the SJA1000.
It turns out that I can't get this to work on sja1000 either without sending
from another device.

Also, I must not have tested this properly because:
root@x86-20140911-072109:~# candump can1,0~0,#FFFFFFFF | grep -v 20000002

  can1  20000004  [8] 00 40 00 00 00 00 00 60   ERRORFRAME
  can1  20000004  [8] 00 20 00 00 00 00 00 80   ERRORFRAME
  can1  20000004  [8] 00 40 00 00 00 00 00 5F   ERRORFRAME


The problem here is :
+enum can_state can_errcnt_to_state(unsigned int count)
+{
+    if (unlikely(count > 127))
+        return CAN_STATE_ERROR_PASSIVE;
+
+    if (unlikely(count > 96))
+        return CAN_STATE_ERROR_WARNING;
+
+    return CAN_STATE_ERROR_ACTIVE;
+}

96 aught to be 95, or perhaps it would make more sense to make the checks count
>= 128 and count >= 96, respectively.

Maybe generate some error message if neither rx or tx states are greater than
new_state?

I'd written unit tests for this but I guess they don't really protect you
against "false predicate" errors.
>
>> Sadly, I did not save the logs.
>>> 2. set restart-ms=100
>>>    send messages with cangen
>>>    provoke a bus-off short-circuiting CAN low and high
>>>    remove the short-circuit
>> That actually worked as expected. It's supposed to restart the bus,
> right?
>
> The bus will automatically be restarted after 100 ms. The interesting part
> is
> the flow of the state changes (error messages).
root@x86-20140911-072109:~# candump can1,0~0,#FFFFFFFF | grep -v 20000002
  can1  20000004  [8] 00 20 00 00 00 00 00 88   ERRORFRAME
  can1  20000004  [8] 00 20 00 00 00 00 00 88   ERRORFRAME
  can1  20000040  [8] 00 00 00 00 00 00 00 7F   ERRORFRAME
  can1  20000100  [8] 00 00 00 00 00 00 00 00   ERRORFRAME
  can1  20000004  [8] 00 20 00 00 00 00 00 88   ERRORFRAME
  can1  20000004  [8] 00 20 00 00 00 00 00 88   ERRORFRAME
  can1  20000040  [8] 00 00 00 00 00 00 00 7F   ERRORFRAME
  can1  20000100  [8] 00 00 00 00 00 00 00 00   ERRORFRAME
  can1  20000004  [8] 00 20 00 00 00 00 00 88   ERRORFRAME
  can1  20000004  [8] 00 20 00 00 00 00 00 88   ERRORFRAME
  can1  20000040  [8] 00 00 00 00 00 00 00 7F   ERRORFRAME
  can1  20000100  [8] 00 00 00 00 00 00 00 00   ERRORFRAME
etc...

Here we get error-passive twice in a row. The error count is the same in both
cases though....

I just noticed that I swapped tx and rx in data[6..7]. I'll have to fix that too.
>
>>> Start with the SJA100 first.
>>>
>> I'll try to find some time for these experiments.
>>
Best regards,
Andri

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change  handling.
  2014-10-21 14:56       ` Andri Yngvason
@ 2014-10-21 15:21         ` Wolfgang Grandegger
  2014-10-22 11:48           ` Andri Yngvason
  0 siblings, 1 reply; 17+ messages in thread
From: Wolfgang Grandegger @ 2014-10-21 15:21 UTC (permalink / raw)
  To: Andri Yngvason; +Cc: linux-can

On Tue, 21 Oct 2014 14:56:47 +0000, Andri Yngvason
<andri.yngvason@marel.com> wrote:
> On þri 21.okt 2014 10:52, Wolfgang Grandegger wrote:
>> On Tue, 21 Oct 2014 10:42:27 +0000, Andri Yngvason
>> <andri.yngvason@marel.com> wrote:
>>> On mán 20.okt 2014 19:45, Wolfgang Grandegger wrote:
>>>> Hi Andri,
>>>>
>>>> sorry for the long delay...
>>> That's all right. I've been quite busy myself.
>>>> On 09/26/2014 08:25 PM, Andri Yngvason wrote:
>>>>> The handling of can error states is different between platforms.
>>>>> This is an attempt to correct that problem.
>>>>>
>>>>> I've moved this handling into a generic function for changing the
>>>>> error state. This ensures that error state changes are handled
>>>>> the same way everywhere (where this function is used).
>>>>>
>>>>> What's changed from the last version of this patch-set is that
>>>>> can_change_state() now relies on the individual states of the rx/tx
>>>>> counters rather than their individual count values.
>>>> To see if the state changes occur as expected could you please record
>>>> error message traces with candump for the following two scenarios:
>>>>
>>>> 1. send messages with cangen
>>>>    disconnect the cable
>>>>    reconnect the cable after a while until the error active state is
>>>>    reached.
>>> I tried this the other day with flexcan. If there is nothing else
>>> happening on the bus, the error state will not go back down,
regardless
>>> of whether the patches are applied or not. However, this does work if
>>> another device is also sending on the bus.
>> If the device continues to send (because cangen is still running), the
>> error counter should decrease. Don't know if the Flexcan does it right
>> but it works like that on the SJA1000.
> It turns out that I can't get this to work on sja1000 either without
> sending
> from another device.
> 
> Also, I must not have tested this properly because:
> root@x86-20140911-072109:~# candump can1,0~0,#FFFFFFFF | grep -v
20000002
> 
>   can1  20000004  [8] 00 40 00 00 00 00 00 60   ERRORFRAME
>   can1  20000004  [8] 00 20 00 00 00 00 00 80   ERRORFRAME
>   can1  20000004  [8] 00 40 00 00 00 00 00 5F   ERRORFRAME
> 
> 
> The problem here is :
> +enum can_state can_errcnt_to_state(unsigned int count)
> +{
> +    if (unlikely(count > 127))
> +        return CAN_STATE_ERROR_PASSIVE;
> +
> +    if (unlikely(count > 96))
> +        return CAN_STATE_ERROR_WARNING;
> +
> +    return CAN_STATE_ERROR_ACTIVE;
> +}
> 
> 96 aught to be 95, or perhaps it would make more sense to make the
checks
> count
>>= 128 and count >= 96, respectively.
> 
> Maybe generate some error message if neither rx or tx states are greater
> than
> new_state?

This is how it should look like (from my old commits):

Here is an example output of "candump -td -e any,0:0,#FFFFFFFF" for
a recovery from error passive state due to no ack/cable (reconnect
after 5s) for a SJA1000 on an on EMS PCI card:

 (000.201913)  can0   1C  [0]
 (000.212241)  can0  20000204  [8] 00 08 00 00 00 00 60 00   ERRORFRAME
        controller-problem{tx-error-warning}
        state-change{tx-error-warning}
        error-counter-tx-rx{{96}{0}}
 (000.003544)  can0  20000204  [8] 00 20 00 00 00 00 80 00   ERRORFRAME
        controller-problem{tx-error-passive}
        state-change{tx-error-passive}
        error-counter-tx-rx{{128}{0}}
 (004.901842)  can0   1D  [7] 1D F6 33 52 31 4B DE
 (000.000116)  can0  20000200  [8] 00 08 00 00 00 00 7F 00   ERRORFRAME
        state-change{tx-error-warning}
        error-counter-tx-rx{{127}{0}}
 (000.000678)  can0   1E  [6] 42 05 14 82 23 B6
 ...
 (000.201927)  can0   49  [4] 2F 1A 97 25
 (000.000096)  can0  20000200  [8] 00 40 00 00 00 00 5F 00   ERRORFRAME
        state-change{back-to-error-active}
        error-counter-tx-rx{{95}{0}}
 (000.202184)  can0   4A  [8] 7F 87 0E FE 03 BA 78 91

Messages are generated with "cangen", also while the cable is
disconnected.

> I'd written unit tests for this but I guess they don't really protect
you
> against "false predicate" errors.
>>
>>> Sadly, I did not save the logs.
>>>> 2. set restart-ms=100
>>>>    send messages with cangen
>>>>    provoke a bus-off short-circuiting CAN low and high
>>>>    remove the short-circuit
>>> That actually worked as expected. It's supposed to restart the bus,
>> right?
>>
>> The bus will automatically be restarted after 100 ms. The interesting
>> part
>> is
>> the flow of the state changes (error messages).
> root@x86-20140911-072109:~# candump can1,0~0,#FFFFFFFF | grep -v
20000002
>   can1  20000004  [8] 00 20 00 00 00 00 00 88   ERRORFRAME
>   can1  20000004  [8] 00 20 00 00 00 00 00 88   ERRORFRAME
>   can1  20000040  [8] 00 00 00 00 00 00 00 7F   ERRORFRAME
>   can1  20000100  [8] 00 00 00 00 00 00 00 00   ERRORFRAME
>   can1  20000004  [8] 00 20 00 00 00 00 00 88   ERRORFRAME
>   can1  20000004  [8] 00 20 00 00 00 00 00 88   ERRORFRAME
>   can1  20000040  [8] 00 00 00 00 00 00 00 7F   ERRORFRAME
>   can1  20000100  [8] 00 00 00 00 00 00 00 00   ERRORFRAME
>   can1  20000004  [8] 00 20 00 00 00 00 00 88   ERRORFRAME
>   can1  20000004  [8] 00 20 00 00 00 00 00 88   ERRORFRAME
>   can1  20000040  [8] 00 00 00 00 00 00 00 7F   ERRORFRAME
>   can1  20000100  [8] 00 00 00 00 00 00 00 00   ERRORFRAME
> etc...
> 
> Here we get error-passive twice in a row. The error count is the same in
> both
> cases though....

Why twice? The option "-td" would be useful as well.

Wolfgang.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change handling.
  2014-10-21 15:21         ` Wolfgang Grandegger
@ 2014-10-22 11:48           ` Andri Yngvason
  2014-10-22 12:30             ` Wolfgang Grandegger
  0 siblings, 1 reply; 17+ messages in thread
From: Andri Yngvason @ 2014-10-22 11:48 UTC (permalink / raw)
  To: Wolfgang Grandegger; +Cc: linux-can


On þri 21.okt 2014 15:21, Wolfgang Grandegger wrote:
>
>>>>> To see if the state changes occur as expected could you please record
>>>>> error message traces with candump for the following two scenarios:
>>>>>
>>>>> 1. send messages with cangen
>>>>>    disconnect the cable
>>>>>    reconnect the cable after a while until the error active state is
>>>>>    reached.
After cleaning up my mess, this is the output for the disconnected cable test:
 (000.000000)  can1  20000004   [8]  00 08 00 00 00 00 60 00   ERRORFRAME
        controller-problem{tx-error-warning}
        error-counter-tx-rx{{96}{0}}
 (000.003953)  can1  20000004   [8]  00 20 00 00 00 00 80 00   ERRORFRAME
        controller-problem{tx-error-passive}
        error-counter-tx-rx{{128}{0}}
 (005.959170)  can1  20000002   [8]  03 00 00 00 00 00 00 00   ERRORFRAME
        lost-arbitration{at bit 3}
 (000.428328)  can1  20000004   [8]  00 40 00 00 00 00 5F 00   ERRORFRAME
        controller-problem{back-to-error-active}
        error-counter-tx-rx{{95}{0}}

>>>>>>> 2. set restart-ms=100
>>>>>>>    send messages with cangen
>>>>>>>    provoke a bus-off short-circuiting CAN low and high
>>>>>>>    remove the short-circuit
>>>>>>>
Shorting the can bus yields a loop like this:
 (044.014170)  can1  20000004   [8]  00 20 00 00 00 00 88 00   ERRORFRAME
        controller-problem{tx-error-passive}
        error-counter-tx-rx{{136}{0}}
 (000.003175)  can1  20000040   [8]  00 00 00 00 00 00 7F 00   ERRORFRAME
        bus-off
        error-counter-tx-rx{{127}{0}}
 (000.099664)  can1  20000100   [8]  00 00 00 00 00 00 00 00   ERRORFRAME
        restarted-after-bus-off
 (000.097246)  can1  20000004   [8]  00 20 00 00 00 00 88 00   ERRORFRAME
        controller-problem{tx-error-passive}
        error-counter-tx-rx{{136}{0}}
 (000.003160)  can1  20000040   [8]  00 00 00 00 00 00 7F 00   ERRORFRAME
        bus-off
        error-counter-tx-rx{{127}{0}}
 (000.099602)  can1  20000100   [8]  00 00 00 00 00 00 00 00   ERRORFRAME
        restarted-after-bus-off

Cheers,
Andri

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change  handling.
  2014-10-22 11:48           ` Andri Yngvason
@ 2014-10-22 12:30             ` Wolfgang Grandegger
  2014-10-22 12:48               ` Andri Yngvason
  0 siblings, 1 reply; 17+ messages in thread
From: Wolfgang Grandegger @ 2014-10-22 12:30 UTC (permalink / raw)
  To: Andri Yngvason; +Cc: linux-can

On Wed, 22 Oct 2014 11:48:26 +0000, Andri Yngvason
<andri.yngvason@marel.com> wrote:
> On þri 21.okt 2014 15:21, Wolfgang Grandegger wrote:
>>
>>>>>> To see if the state changes occur as expected could you please
record
>>>>>> error message traces with candump for the following two scenarios:
>>>>>>
>>>>>> 1. send messages with cangen
>>>>>>    disconnect the cable
>>>>>>    reconnect the cable after a while until the error active state
is
>>>>>>    reached.
> After cleaning up my mess, this is the output for the disconnected cable
> test:
>  (000.000000)  can1  20000004   [8]  00 08 00 00 00 00 60 00  
ERRORFRAME
>         controller-problem{tx-error-warning}
>         error-counter-tx-rx{{96}{0}}
>  (000.003953)  can1  20000004   [8]  00 20 00 00 00 00 80 00  
ERRORFRAME
>         controller-problem{tx-error-passive}
>         error-counter-tx-rx{{128}{0}}
>  (005.959170)  can1  20000002   [8]  03 00 00 00 00 00 00 00  
ERRORFRAME
>         lost-arbitration{at bit 3}

I'm missing an error warning message here... at least on the SJA1000 it's
triggered.

>  (000.428328)  can1  20000004   [8]  00 40 00 00 00 00 5F 00  
ERRORFRAME
>         controller-problem{back-to-error-active}
>         error-counter-tx-rx{{95}{0}}
> 
>>>>>>>> 2. set restart-ms=100
>>>>>>>>    send messages with cangen
>>>>>>>>    provoke a bus-off short-circuiting CAN low and high
>>>>>>>>    remove the short-circuit
>>>>>>>>
> Shorting the can bus yields a loop like this:
>  (044.014170)  can1  20000004   [8]  00 20 00 00 00 00 88 00  
ERRORFRAME
>         controller-problem{tx-error-passive}
>         error-counter-tx-rx{{136}{0}}
>  (000.003175)  can1  20000040   [8]  00 00 00 00 00 00 7F 00  
ERRORFRAME
>         bus-off
>         error-counter-tx-rx{{127}{0}}
>  (000.099664)  can1  20000100   [8]  00 00 00 00 00 00 00 00  
ERRORFRAME
>         restarted-after-bus-off
>  (000.097246)  can1  20000004   [8]  00 20 00 00 00 00 88 00  
ERRORFRAME
>         controller-problem{tx-error-passive}
>         error-counter-tx-rx{{136}{0}}
>  (000.003160)  can1  20000040   [8]  00 00 00 00 00 00 7F 00  
ERRORFRAME
>         bus-off
>         error-counter-tx-rx{{127}{0}}
>  (000.099602)  can1  20000100   [8]  00 00 00 00 00 00 00 00  
ERRORFRAME
>         restarted-after-bus-off

And the restart does come when the short-circuit is gone?

Wolfgang.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change handling.
  2014-10-22 12:30             ` Wolfgang Grandegger
@ 2014-10-22 12:48               ` Andri Yngvason
  2014-10-22 12:56                 ` Wolfgang Grandegger
  0 siblings, 1 reply; 17+ messages in thread
From: Andri Yngvason @ 2014-10-22 12:48 UTC (permalink / raw)
  To: Wolfgang Grandegger; +Cc: linux-can


On mið 22.okt 2014 12:30, Wolfgang Grandegger wrote:
> On Wed, 22 Oct 2014 11:48:26 +0000, Andri Yngvason
> <andri.yngvason@marel.com> wrote:
>> On þri 21.okt 2014 15:21, Wolfgang Grandegger wrote:
>>>>>>> To see if the state changes occur as expected could you please
> record
>>>>>>> error message traces with candump for the following two scenarios:
>>>>>>>
>>>>>>> 1. send messages with cangen
>>>>>>>    disconnect the cable
>>>>>>>    reconnect the cable after a while until the error active state
> is
>>>>>>>    reached.
>> After cleaning up my mess, this is the output for the disconnected cable
>> test:
>>  (000.000000)  can1  20000004   [8]  00 08 00 00 00 00 60 00  
> ERRORFRAME
>>         controller-problem{tx-error-warning}
>>         error-counter-tx-rx{{96}{0}}
>>  (000.003953)  can1  20000004   [8]  00 20 00 00 00 00 80 00  
> ERRORFRAME
>>         controller-problem{tx-error-passive}
>>         error-counter-tx-rx{{128}{0}}
>>  (005.959170)  can1  20000002   [8]  03 00 00 00 00 00 00 00  
> ERRORFRAME
>>         lost-arbitration{at bit 3}
> I'm missing an error warning message here... at least on the SJA1000 it's
> triggered.
I found this peculiar as well. However, I don't get the warning without the
patch applied either.
Then I just get:
root@x86-20140911-072109:~# candump -td -e can1,0~0,#FFFFFFFF | tee
candump.log                                                          
 (000.000000)  can1  20000004   [8]  00 08 00 00 00 00 60 00   ERRORFRAME
        controller-problem{tx-error-warning}
        error-counter-tx-rx{{96}{0}}
 (000.002425)  can1  20000004   [8]  00 20 00 00 00 00 80 00   ERRORFRAME
        controller-problem{tx-error-passive}
        error-counter-tx-rx{{128}{0}}

>
>>  (000.428328)  can1  20000004   [8]  00 40 00 00 00 00 5F 00  
> ERRORFRAME
>>         controller-problem{back-to-error-active}
>>         error-counter-tx-rx{{95}{0}}
>>
>>>>>>>>> 2. set restart-ms=100
>>>>>>>>>    send messages with cangen
>>>>>>>>>    provoke a bus-off short-circuiting CAN low and high
>>>>>>>>>    remove the short-circuit
>>>>>>>>>
>> Shorting the can bus yields a loop like this:
>>  (044.014170)  can1  20000004   [8]  00 20 00 00 00 00 88 00  
> ERRORFRAME
>>         controller-problem{tx-error-passive}
>>         error-counter-tx-rx{{136}{0}}
>>  (000.003175)  can1  20000040   [8]  00 00 00 00 00 00 7F 00  
> ERRORFRAME
>>         bus-off
>>         error-counter-tx-rx{{127}{0}}
>>  (000.099664)  can1  20000100   [8]  00 00 00 00 00 00 00 00  
> ERRORFRAME
>>         restarted-after-bus-off
>>  (000.097246)  can1  20000004   [8]  00 20 00 00 00 00 88 00  
> ERRORFRAME
>>         controller-problem{tx-error-passive}
>>         error-counter-tx-rx{{136}{0}}
>>  (000.003160)  can1  20000040   [8]  00 00 00 00 00 00 7F 00  
> ERRORFRAME
>>         bus-off
>>         error-counter-tx-rx{{127}{0}}
>>  (000.099602)  can1  20000100   [8]  00 00 00 00 00 00 00 00  
> ERRORFRAME
>>         restarted-after-bus-off
> And the restart does come when the short-circuit is gone?
>
No, in fact the bus keeps restarting until the short-circuit is gone.

Note that I'm using peak_pci. It's sja1000 but maybe there is something different?

- Andri


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change  handling.
  2014-10-22 12:48               ` Andri Yngvason
@ 2014-10-22 12:56                 ` Wolfgang Grandegger
  2014-10-22 16:32                   ` Andri Yngvason
  0 siblings, 1 reply; 17+ messages in thread
From: Wolfgang Grandegger @ 2014-10-22 12:56 UTC (permalink / raw)
  To: Andri Yngvason; +Cc: linux-can

On Wed, 22 Oct 2014 12:48:11 +0000, Andri Yngvason
<andri.yngvason@marel.com> wrote:
> On mið 22.okt 2014 12:30, Wolfgang Grandegger wrote:
>> On Wed, 22 Oct 2014 11:48:26 +0000, Andri Yngvason
>> <andri.yngvason@marel.com> wrote:
>>> On þri 21.okt 2014 15:21, Wolfgang Grandegger wrote:
>>>>>>>> To see if the state changes occur as expected could you please
>> record
>>>>>>>> error message traces with candump for the following two
scenarios:
>>>>>>>>
>>>>>>>> 1. send messages with cangen
>>>>>>>>    disconnect the cable
>>>>>>>>    reconnect the cable after a while until the error active state
>> is
>>>>>>>>    reached.
>>> After cleaning up my mess, this is the output for the disconnected
cable
>>> test:
>>>  (000.000000)  can1  20000004   [8]  00 08 00 00 00 00 60 00  
>> ERRORFRAME
>>>         controller-problem{tx-error-warning}
>>>         error-counter-tx-rx{{96}{0}}
>>>  (000.003953)  can1  20000004   [8]  00 20 00 00 00 00 80 00  
>> ERRORFRAME
>>>         controller-problem{tx-error-passive}
>>>         error-counter-tx-rx{{128}{0}}
>>>  (005.959170)  can1  20000002   [8]  03 00 00 00 00 00 00 00  
>> ERRORFRAME
>>>         lost-arbitration{at bit 3}
>> I'm missing an error warning message here... at least on the SJA1000
it's
>> triggered.
> I found this peculiar as well. However, I don't get the warning without
the
> patch applied either.

Yes, because reporting decreasing state changes are not (yet) supported.

> Then I just get:
> root@x86-20140911-072109:~# candump -td -e can1,0~0,#FFFFFFFF | tee
> candump.log                                                          
>  (000.000000)  can1  20000004   [8]  00 08 00 00 00 00 60 00  
ERRORFRAME
>         controller-problem{tx-error-warning}
>         error-counter-tx-rx{{96}{0}}
>  (000.002425)  can1  20000004   [8]  00 20 00 00 00 00 80 00  
ERRORFRAME
>         controller-problem{tx-error-passive}
>         error-counter-tx-rx{{128}{0}}
> 

Could you please include the messages in the trace as well
(using "any,0:0,#FFFFFFFF").

>>
>>>  (000.428328)  can1  20000004   [8]  00 40 00 00 00 00 5F 00  
>> ERRORFRAME
>>>         controller-problem{back-to-error-active}
>>>         error-counter-tx-rx{{95}{0}}
>>>
>>>>>>>>>> 2. set restart-ms=100
>>>>>>>>>>    send messages with cangen
>>>>>>>>>>    provoke a bus-off short-circuiting CAN low and high
>>>>>>>>>>    remove the short-circuit
>>>>>>>>>>
>>> Shorting the can bus yields a loop like this:
>>>  (044.014170)  can1  20000004   [8]  00 20 00 00 00 00 88 00  
>> ERRORFRAME
>>>         controller-problem{tx-error-passive}
>>>         error-counter-tx-rx{{136}{0}}
>>>  (000.003175)  can1  20000040   [8]  00 00 00 00 00 00 7F 00  
>> ERRORFRAME
>>>         bus-off
>>>         error-counter-tx-rx{{127}{0}}
>>>  (000.099664)  can1  20000100   [8]  00 00 00 00 00 00 00 00  
>> ERRORFRAME
>>>         restarted-after-bus-off
>>>  (000.097246)  can1  20000004   [8]  00 20 00 00 00 00 88 00  
>> ERRORFRAME
>>>         controller-problem{tx-error-passive}
>>>         error-counter-tx-rx{{136}{0}}
>>>  (000.003160)  can1  20000040   [8]  00 00 00 00 00 00 7F 00  
>> ERRORFRAME
>>>         bus-off
>>>         error-counter-tx-rx{{127}{0}}
>>>  (000.099602)  can1  20000100   [8]  00 00 00 00 00 00 00 00  
>> ERRORFRAME
>>>         restarted-after-bus-off
>> And the restart does come when the short-circuit is gone?
>>
> No, in fact the bus keeps restarting until the short-circuit is gone.
> 
> Note that I'm using peak_pci. It's sja1000 but maybe there is something
> different?

No, because it's a memory mapped SJA1000 chip. What do you see with
"dmesg"
assuming that CONFIG_CAN_DEV_DEBUG is enabled.

Wolfgang.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change handling.
  2014-10-22 12:56                 ` Wolfgang Grandegger
@ 2014-10-22 16:32                   ` Andri Yngvason
  2014-10-22 18:42                     ` Wolfgang Grandegger
  0 siblings, 1 reply; 17+ messages in thread
From: Andri Yngvason @ 2014-10-22 16:32 UTC (permalink / raw)
  To: Wolfgang Grandegger; +Cc: linux-can


On mið 22.okt 2014 12:56, Wolfgang Grandegger wrote:
>
>>>>         restarted-after-bus-off
>>> And the restart does come when the short-circuit is gone?
>>>
>> No, in fact the bus keeps restarting until the short-circuit is gone.
>>
>> Note that I'm using peak_pci. It's sja1000 but maybe there is something
>> different?
> No, because it's a memory mapped SJA1000 chip. What do you see with
> "dmesg"
> assuming that CONFIG_CAN_DEV_DEBUG is enabled.
>
dmesg wasn't very informative so I added some debug output of my own. This is
the output after I've fixed everything:
[ 3903.594640] peak_pci 0000:02:00.0: can1 at reg_base=0xf892a400
cfg_base=0xf8928000 irq=18
[ 3903.684937] peak_pci 0000:02:00.0 can1: setting BTR0=0x07 BTR1=0x05
[ 3922.009853] peak_pci 0000:02:00.0 can1: error warning interrupt; bs=false,
es=true
[ 3922.017428] peak_pci 0000:02:00.0 can1: state=1, txerr=96, rxerr=1
[ 3922.023615] peak_pci 0000:02:00.0 can1: error passive interrupt; bs=false,
es=true
[ 3922.031197] peak_pci 0000:02:00.0 can1: state=2, txerr=128, rxerr=1
[ 3928.401643] peak_pci 0000:02:00.0 can1: error passive interrupt; bs=false,
es=true
[ 3928.409243] peak_pci 0000:02:00.0 can1: state=1, txerr=127, rxerr=0
[ 3934.804395] peak_pci 0000:02:00.0 can1: error warning interrupt; bs=false,
es=false
[ 3934.812075] peak_pci 0000:02:00.0 can1: state=0, txerr=95, rxerr=0

Note that, between the two passive interrupts, es does not change. This is
correct according to
http://www.nxp.com/documents/data_sheet/SJA1000.pdf because ES only signifies
warning state.
Thus, "if (status & SR_ES)" is redundant.

@@ -440,11 +446,14 @@ static int sja1000_err(struct net_device *dev, uint8_t
isrc, uint8_t status)
     }
     if (isrc & IRQ_EPI) {
         /* error passive interrupt */
-        netdev_dbg(dev, "error passive interrupt\n");
-        if (status & SR_ES)
-            state = CAN_STATE_ERROR_PASSIVE;
+        netdev_dbg(dev, "error passive interrupt; bs=%s, es=%s\n",
+               status & SR_BS ? "true" : "false",
+               status & SR_ES ? "true" : "false");
+
+        if (state == CAN_STATE_ERROR_PASSIVE)
+            state = CAN_STATE_ERROR_WARNING;
         else
-            state = CAN_STATE_ERROR_ACTIVE;
+            state = CAN_STATE_ERROR_PASSIVE;
     }
     if (isrc & IRQ_ALI) {
         /* arbitration lost interrupt */

The data sheet says this about EPI:
    set; this bit is set whenever the CAN controller has
    reached the error passive status (at least one
    error counter exceeds the protocol-defined level of
    127) or if the CAN controller is in the error passive
    status and enters the error active status again and
    the EPIE bit is set within the interrupt enable
    register

I'm a little bit concerned that it actually says that EPI is set on "error passive"
and "error passive to error active". However, the log says that txerr=127 on
that second interrupt. It makes more sense for it to be "error warning". Might
this be a error in the datasheet?

Another thing that worries me is that when I enabled debug, txerr would advance
while the interrupt was being handled. This yielded mismatches between
the new state and the actual current state of the error counters. I moved the
reading of the RXERR/TXERR registers to the top of the error handler function
and that fixed the problem for me. Might this still be an issue on slower
platforms?

candump looks like this:
...
 (000.053403)  can0  5B1   [5]  D1 D1 BB 7C DF
 (000.146664)  can0  506   [8]  2C 6E 9C 77 5E F1 AE 2D
 (000.053628)  can0  488   [8]  FF C8 AC 26 43 C5 AF 11
 (000.146428)  can0  5AD   [8]  C1 31 BD 1B 6B 8D 34 13
 (000.053112)  can0  658   [0]
 (000.171131)  can0  20000004   [8]  00 08 00 00 00 00 60 00   ERRORFRAME
    controller-problem{tx-error-warning}
    error-counter-tx-rx{{96}{0}}
 (000.013841)  can0  20000004   [8]  00 20 00 00 00 00 80 00   ERRORFRAME
    controller-problem{tx-error-passive}
    error-counter-tx-rx{{128}{0}}
 (004.433642)  can0  20000002   [8]  04 00 00 00 00 00 00 00   ERRORFRAME
    lost-arbitration{at bit 4}
 (000.014785)  can0  20000002   [8]  02 00 00 00 00 00 00 00   ERRORFRAME
    lost-arbitration{at bit 2}
 (000.012565)  can0  20000002   [8]  02 00 00 00 00 00 00 00   ERRORFRAME
    lost-arbitration{at bit 2}
 (000.000006)  can0  152   [8]  48 94 14 45 C3 60 8A 58
 (000.000015)  can0  6D7   [4]  3A B8 84 26
 (000.012537)  can0  20000002   [8]  02 00 00 00 00 00 00 00   ERRORFRAME
    lost-arbitration{at bit 2}
 (000.014083)  can0  2E7   [7]  A2 6F F8 5F DF DD 3B
 (000.000904)  can0  757   [8]  4F 0E AB 57 C3 58 DB 75
 (000.000680)  can0  459   [5]  52 52 58 29 8C
...
 (000.000546)  can0  3C7   [2]  18 AC
 (000.000875)  can0  130   [8]  B3 57 0A 52 CC CA D3 08
 (000.015962)  can0  5DF   [8]  64 14 6C 61 F9 FD E9 55
 (000.000023)  can0  5D2   [8]  C5 B2 39 6B 65 45 4C 05
 (000.014554)  can0  20000002   [8]  03 00 00 00 00 00 00 00   ERRORFRAME
    lost-arbitration{at bit 3}
 (000.000008)  can0  2FC   [8]  19 73 77 6C 86 91 D5 58
...
 (000.053183)  can0  7A1   [6]  F8 EE 7B 12 A3 43
 (000.146734)  can0  089   [5]  36 B8 A0 4F DD
 (000.014032)  can0  20000004   [8]  00 0C 00 00 00 00 7F 74   ERRORFRAME
    controller-problem{rx-error-warning,tx-error-warning}
    error-counter-tx-rx{{127}{116}}
 (000.039316)  can0  684   [6]  ED D1 1E 32 00 1D
 (000.146892)  can0  0D1   [8]  CD 99 DE 18 62 B0 45 27
 (000.053317)  can0  687   [8]  43 37 CF 5E 6B A4 CA 3D
...
 (000.053561)  can0  42A   [4]  65 20 3B 5C
 (000.146834)  can0  3C2   [8]  32 50 A4 56 31 EC DD 55
 (000.013948)  can0  20000004   [8]  00 40 00 00 00 00 5F 54   ERRORFRAME
    controller-problem{back-to-error-active}
    error-counter-tx-rx{{95}{84}}
 (000.039372)  can0  455   [5]  58 38 31 62 B6
...

- Andri.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change handling.
  2014-10-22 16:32                   ` Andri Yngvason
@ 2014-10-22 18:42                     ` Wolfgang Grandegger
  2014-10-23 12:50                       ` Andri Yngvason
  0 siblings, 1 reply; 17+ messages in thread
From: Wolfgang Grandegger @ 2014-10-22 18:42 UTC (permalink / raw)
  To: Andri Yngvason; +Cc: linux-can

On 10/22/2014 06:32 PM, Andri Yngvason wrote:
> 
> On mið 22.okt 2014 12:56, Wolfgang Grandegger wrote:
>>
>>>>>         restarted-after-bus-off
>>>> And the restart does come when the short-circuit is gone?
>>>>
>>> No, in fact the bus keeps restarting until the short-circuit is gone.
>>>
>>> Note that I'm using peak_pci. It's sja1000 but maybe there is something
>>> different?
>> No, because it's a memory mapped SJA1000 chip. What do you see with
>> "dmesg"
>> assuming that CONFIG_CAN_DEV_DEBUG is enabled.
>>
> dmesg wasn't very informative so I added some debug output of my own. This is
> the output after I've fixed everything:
> [ 3903.594640] peak_pci 0000:02:00.0: can1 at reg_base=0xf892a400
> cfg_base=0xf8928000 irq=18
> [ 3903.684937] peak_pci 0000:02:00.0 can1: setting BTR0=0x07 BTR1=0x05
> [ 3922.009853] peak_pci 0000:02:00.0 can1: error warning interrupt; bs=false,
> es=true

According to the manual: error active -> warning

> [ 3922.017428] peak_pci 0000:02:00.0 can1: state=1, txerr=96, rxerr=1
> [ 3922.023615] peak_pci 0000:02:00.0 can1: error passive interrupt; bs=false,
> es=true

warning -> error passive

> [ 3922.031197] peak_pci 0000:02:00.0 can1: state=2, txerr=128, rxerr=1
> [ 3928.401643] peak_pci 0000:02:00.0 can1: error passive interrupt; bs=false,
> es=true

error passive -> warning

> [ 3928.409243] peak_pci 0000:02:00.0 can1: state=1, txerr=127, rxerr=0
> [ 3934.804395] peak_pci 0000:02:00.0 can1: error warning interrupt; bs=false,
> es=false

warning -> error active

> [ 3934.812075] peak_pci 0000:02:00.0 can1: state=0, txerr=95, rxerr=0
> 
> Note that, between the two passive interrupts, es does not change. This is
> correct according to
> http://www.nxp.com/documents/data_sheet/SJA1000.pdf because ES only signifies
> warning state.
> Thus, "if (status & SR_ES)" is redundant.
> 
> @@ -440,11 +446,14 @@ static int sja1000_err(struct net_device *dev, uint8_t
> isrc, uint8_t status)
>      }
>      if (isrc & IRQ_EPI) {
>          /* error passive interrupt */
> -        netdev_dbg(dev, "error passive interrupt\n");
> -        if (status & SR_ES)
> -            state = CAN_STATE_ERROR_PASSIVE;
> +        netdev_dbg(dev, "error passive interrupt; bs=%s, es=%s\n",
> +               status & SR_BS ? "true" : "false",
> +               status & SR_ES ? "true" : "false");
> +
> +        if (state == CAN_STATE_ERROR_PASSIVE)
> +            state = CAN_STATE_ERROR_WARNING;
>          else
> -            state = CAN_STATE_ERROR_ACTIVE;
> +            state = CAN_STATE_ERROR_PASSIVE;
>      }
>      if (isrc & IRQ_ALI) {
>          /* arbitration lost interrupt */
> 
> The data sheet says this about EPI:
>     set; this bit is set whenever the CAN controller has
>     reached the error passive status (at least one
>     error counter exceeds the protocol-defined level of
>     127) or if the CAN controller is in the error passive
>     status and enters the error active status again and
>     the EPIE bit is set within the interrupt enable
>     register
> 
> I'm a little bit concerned that it actually says that EPI is set on "error passive"
> and "error passive to error active". However, the log says that txerr=127 on
> that second interrupt. It makes more sense for it to be "error warning". Might
> this be a error in the datasheet?

IIRC, the CAN standard only knows error active and error passive.
Various vendors added the warning level where the level is sometimes
even programmable.

> Another thing that worries me is that when I enabled debug, txerr would advance
> while the interrupt was being handled. This yielded mismatches between
> the new state and the actual current state of the error counters. I moved the
> reading of the RXERR/TXERR registers to the top of the error handler function
> and that fixed the problem for me. Might this still be an issue on slower
> platforms?

Yes, strictly speaking we cannot be sure that we read the correct error
counter when the state change is triggered via interrupt.

> candump looks like this:
> ...
>  (000.053403)  can0  5B1   [5]  D1 D1 BB 7C DF
>  (000.146664)  can0  506   [8]  2C 6E 9C 77 5E F1 AE 2D
>  (000.053628)  can0  488   [8]  FF C8 AC 26 43 C5 AF 11
>  (000.146428)  can0  5AD   [8]  C1 31 BD 1B 6B 8D 34 13
>  (000.053112)  can0  658   [0]
>  (000.171131)  can0  20000004   [8]  00 08 00 00 00 00 60 00   ERRORFRAME
>     controller-problem{tx-error-warning}
>     error-counter-tx-rx{{96}{0}}
>  (000.013841)  can0  20000004   [8]  00 20 00 00 00 00 80 00   ERRORFRAME
>     controller-problem{tx-error-passive}
>     error-counter-tx-rx{{128}{0}}
>  (004.433642)  can0  20000002   [8]  04 00 00 00 00 00 00 00   ERRORFRAME
>     lost-arbitration{at bit 4}
>  (000.014785)  can0  20000002   [8]  02 00 00 00 00 00 00 00   ERRORFRAME
>     lost-arbitration{at bit 2}
>  (000.012565)  can0  20000002   [8]  02 00 00 00 00 00 00 00   ERRORFRAME
>     lost-arbitration{at bit 2}
>  (000.000006)  can0  152   [8]  48 94 14 45 C3 60 8A 58
>  (000.000015)  can0  6D7   [4]  3A B8 84 26
>  (000.012537)  can0  20000002   [8]  02 00 00 00 00 00 00 00   ERRORFRAME
>     lost-arbitration{at bit 2}

Why do you see these errors? Are there electrical problems on the CAN
bus? And if no cable is connected just txerr should increase (and
decrease if it's reconnected!

>  (000.014083)  can0  2E7   [7]  A2 6F F8 5F DF DD 3B
>  (000.000904)  can0  757   [8]  4F 0E AB 57 C3 58 DB 75
>  (000.000680)  can0  459   [5]  52 52 58 29 8C
> ...
>  (000.000546)  can0  3C7   [2]  18 AC
>  (000.000875)  can0  130   [8]  B3 57 0A 52 CC CA D3 08
>  (000.015962)  can0  5DF   [8]  64 14 6C 61 F9 FD E9 55
>  (000.000023)  can0  5D2   [8]  C5 B2 39 6B 65 45 4C 05
>  (000.014554)  can0  20000002   [8]  03 00 00 00 00 00 00 00   ERRORFRAME
>     lost-arbitration{at bit 3}
>  (000.000008)  can0  2FC   [8]  19 73 77 6C 86 91 D5 58
> ...
>  (000.053183)  can0  7A1   [6]  F8 EE 7B 12 A3 43
>  (000.146734)  can0  089   [5]  36 B8 A0 4F DD
>  (000.014032)  can0  20000004   [8]  00 0C 00 00 00 00 7F 74   ERRORFRAME
>     controller-problem{rx-error-warning,tx-error-warning}
>     error-counter-tx-rx{{127}{116}}
>  (000.039316)  can0  684   [6]  ED D1 1E 32 00 1D
>  (000.146892)  can0  0D1   [8]  CD 99 DE 18 62 B0 45 27
>  (000.053317)  can0  687   [8]  43 37 CF 5E 6B A4 CA 3D
> ...
>  (000.053561)  can0  42A   [4]  65 20 3B 5C
>  (000.146834)  can0  3C2   [8]  32 50 A4 56 31 EC DD 55
>  (000.013948)  can0  20000004   [8]  00 40 00 00 00 00 5F 54   ERRORFRAME
>     controller-problem{back-to-error-active}
>     error-counter-tx-rx{{95}{84}}
>  (000.039372)  can0  455   [5]  58 38 31 62 B6
> ...

Wolfgang.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change handling.
  2014-10-22 18:42                     ` Wolfgang Grandegger
@ 2014-10-23 12:50                       ` Andri Yngvason
  2014-10-23 13:10                         ` Wolfgang Grandegger
  0 siblings, 1 reply; 17+ messages in thread
From: Andri Yngvason @ 2014-10-23 12:50 UTC (permalink / raw)
  To: Wolfgang Grandegger; +Cc: linux-can


On mið 22.okt 2014 18:42, Wolfgang Grandegger wrote:
> On 10/22/2014 06:32 PM, Andri Yngvason wrote:
>> On mið 22.okt 2014 12:56, Wolfgang Grandegger wrote:
>>>>>>         restarted-after-bus-off
>>>>> And the restart does come when the short-circuit is gone?
>>>>>
>>>> No, in fact the bus keeps restarting until the short-circuit is gone.
>>>>
>>>> Note that I'm using peak_pci. It's sja1000 but maybe there is something
>>>> different?
>>> No, because it's a memory mapped SJA1000 chip. What do you see with
>>> "dmesg"
>>> assuming that CONFIG_CAN_DEV_DEBUG is enabled.
>>>
>> dmesg wasn't very informative so I added some debug output of my own. This is
>> the output after I've fixed everything:
>> [ 3903.594640] peak_pci 0000:02:00.0: can1 at reg_base=0xf892a400
>> cfg_base=0xf8928000 irq=18
>> [ 3903.684937] peak_pci 0000:02:00.0 can1: setting BTR0=0x07 BTR1=0x05
>> [ 3922.009853] peak_pci 0000:02:00.0 can1: error warning interrupt; bs=false,
>> es=true
> According to the manual: error active -> warning
>
>> [ 3922.017428] peak_pci 0000:02:00.0 can1: state=1, txerr=96, rxerr=1
>> [ 3922.023615] peak_pci 0000:02:00.0 can1: error passive interrupt; bs=false,
>> es=true
> warning -> error passive
>
>> [ 3922.031197] peak_pci 0000:02:00.0 can1: state=2, txerr=128, rxerr=1
>> [ 3928.401643] peak_pci 0000:02:00.0 can1: error passive interrupt; bs=false,
>> es=true
> error passive -> warning
>
>> [ 3928.409243] peak_pci 0000:02:00.0 can1: state=1, txerr=127, rxerr=0
>> [ 3934.804395] peak_pci 0000:02:00.0 can1: error warning interrupt; bs=false,
>> es=false
> warning -> error active
>
>> [ 3934.812075] peak_pci 0000:02:00.0 can1: state=0, txerr=95, rxerr=0
>>
>> Note that, between the two passive interrupts, es does not change. This is
>> correct according to
>> http://www.nxp.com/documents/data_sheet/SJA1000.pdf because ES only signifies
>> warning state.
>> Thus, "if (status & SR_ES)" is redundant.
>>
>> @@ -440,11 +446,14 @@ static int sja1000_err(struct net_device *dev, uint8_t
>> isrc, uint8_t status)
>>      }
>>      if (isrc & IRQ_EPI) {
>>          /* error passive interrupt */
>> -        netdev_dbg(dev, "error passive interrupt\n");
>> -        if (status & SR_ES)
>> -            state = CAN_STATE_ERROR_PASSIVE;
>> +        netdev_dbg(dev, "error passive interrupt; bs=%s, es=%s\n",
>> +               status & SR_BS ? "true" : "false",
>> +               status & SR_ES ? "true" : "false");
>> +
>> +        if (state == CAN_STATE_ERROR_PASSIVE)
>> +            state = CAN_STATE_ERROR_WARNING;
>>          else
>> -            state = CAN_STATE_ERROR_ACTIVE;
>> +            state = CAN_STATE_ERROR_PASSIVE;
>>      }
>>      if (isrc & IRQ_ALI) {
>>          /* arbitration lost interrupt */
>>
>> The data sheet says this about EPI:
>>     set; this bit is set whenever the CAN controller has
>>     reached the error passive status (at least one
>>     error counter exceeds the protocol-defined level of
>>     127) or if the CAN controller is in the error passive
>>     status and enters the error active status again and
>>     the EPIE bit is set within the interrupt enable
>>     register
>>
>> I'm a little bit concerned that it actually says that EPI is set on "error passive"
>> and "error passive to error active". However, the log says that txerr=127 on
>> that second interrupt. It makes more sense for it to be "error warning". Might
>> this be a error in the datasheet?
> IIRC, the CAN standard only knows error active and error passive.
> Various vendors added the warning level where the level is sometimes
> even programmable.
Yes, this makes sense. The manual doesn't even say that the "warning level" is a
state.
>
>> Another thing that worries me is that when I enabled debug, txerr would advance
>> while the interrupt was being handled. This yielded mismatches between
>> the new state and the actual current state of the error counters. I moved the
>> reading of the RXERR/TXERR registers to the top of the error handler function
>> and that fixed the problem for me. Might this still be an issue on slower
>> platforms?
> Yes, strictly speaking we cannot be sure that we read the correct error
> counter when the state change is triggered via interrupt.
Ok, we'll just have to compare rxerr and txerr like before instead of using
errcnt_to_state.
>> candump looks like this:
>> ...
>>  (000.053403)  can0  5B1   [5]  D1 D1 BB 7C DF
>>  (000.146664)  can0  506   [8]  2C 6E 9C 77 5E F1 AE 2D
>>  (000.053628)  can0  488   [8]  FF C8 AC 26 43 C5 AF 11
>>  (000.146428)  can0  5AD   [8]  C1 31 BD 1B 6B 8D 34 13
>>  (000.053112)  can0  658   [0]
>>  (000.171131)  can0  20000004   [8]  00 08 00 00 00 00 60 00   ERRORFRAME
>>     controller-problem{tx-error-warning}
>>     error-counter-tx-rx{{96}{0}}
>>  (000.013841)  can0  20000004   [8]  00 20 00 00 00 00 80 00   ERRORFRAME
>>     controller-problem{tx-error-passive}
>>     error-counter-tx-rx{{128}{0}}
>>  (004.433642)  can0  20000002   [8]  04 00 00 00 00 00 00 00   ERRORFRAME
>>     lost-arbitration{at bit 4}
>>  (000.014785)  can0  20000002   [8]  02 00 00 00 00 00 00 00   ERRORFRAME
>>     lost-arbitration{at bit 2}
>>  (000.012565)  can0  20000002   [8]  02 00 00 00 00 00 00 00   ERRORFRAME
>>     lost-arbitration{at bit 2}
>>  (000.000006)  can0  152   [8]  48 94 14 45 C3 60 8A 58
>>  (000.000015)  can0  6D7   [4]  3A B8 84 26
>>  (000.012537)  can0  20000002   [8]  02 00 00 00 00 00 00 00   ERRORFRAME
>>     lost-arbitration{at bit 2}
> Why do you see these errors? Are there electrical problems on the CAN
> bus? And if no cable is connected just txerr should increase (and
> decrease if it's reconnected!
These errors are occurring when I connect the cable again. They might be
due to bad contact while plugging in the cable.

Anyway. Like I said before, the controller does not enter error active state unless
there is some other device sending on the bus. I've looked at "dmesg" and there
isn't even an interrupt. The netlink interface also says error-warning.

When you were doing your experiments, did you perhaps have some node on
the bus that might have answered to some of the cangen messages?

In any case, my setup is like this:
 * Two sja1000 from the same peak_pci connected to the same bus.
 * Both send using cangen
 * Resistance: 60 Ohm
 * Cables are quite short.

I'll try to place the resistances differently, make cables shorter and see if that
does anything.
>>  (000.014083)  can0  2E7   [7]  A2 6F F8 5F DF DD 3B
>>  (000.000904)  can0  757   [8]  4F 0E AB 57 C3 58 DB 75
>>  (000.000680)  can0  459   [5]  52 52 58 29 8C
>> ...
>>  (000.000546)  can0  3C7   [2]  18 AC
>>  (000.000875)  can0  130   [8]  B3 57 0A 52 CC CA D3 08
>>  (000.015962)  can0  5DF   [8]  64 14 6C 61 F9 FD E9 55
>>  (000.000023)  can0  5D2   [8]  C5 B2 39 6B 65 45 4C 05
>>  (000.014554)  can0  20000002   [8]  03 00 00 00 00 00 00 00   ERRORFRAME
>>     lost-arbitration{at bit 3}
>>  (000.000008)  can0  2FC   [8]  19 73 77 6C 86 91 D5 58
>> ...
>>  (000.053183)  can0  7A1   [6]  F8 EE 7B 12 A3 43
>>  (000.146734)  can0  089   [5]  36 B8 A0 4F DD
>>  (000.014032)  can0  20000004   [8]  00 0C 00 00 00 00 7F 74   ERRORFRAME
>>     controller-problem{rx-error-warning,tx-error-warning}
>>     error-counter-tx-rx{{127}{116}}
>>  (000.039316)  can0  684   [6]  ED D1 1E 32 00 1D
>>  (000.146892)  can0  0D1   [8]  CD 99 DE 18 62 B0 45 27
>>  (000.053317)  can0  687   [8]  43 37 CF 5E 6B A4 CA 3D
>> ...
>>  (000.053561)  can0  42A   [4]  65 20 3B 5C
>>  (000.146834)  can0  3C2   [8]  32 50 A4 56 31 EC DD 55
>>  (000.013948)  can0  20000004   [8]  00 40 00 00 00 00 5F 54   ERRORFRAME
>>     controller-problem{back-to-error-active}
>>     error-counter-tx-rx{{95}{84}}
>>  (000.039372)  can0  455   [5]  58 38 31 62 B6
>> ...
>>
- Andri

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change  handling.
  2014-10-23 12:50                       ` Andri Yngvason
@ 2014-10-23 13:10                         ` Wolfgang Grandegger
  2014-10-23 15:55                           ` Andri Yngvason
  0 siblings, 1 reply; 17+ messages in thread
From: Wolfgang Grandegger @ 2014-10-23 13:10 UTC (permalink / raw)
  To: Andri Yngvason; +Cc: linux-can

On Thu, 23 Oct 2014 12:50:20 +0000, Andri Yngvason
<andri.yngvason@marel.com> wrote:
> On mið 22.okt 2014 18:42, Wolfgang Grandegger wrote:
>> On 10/22/2014 06:32 PM, Andri Yngvason wrote:
...
>>> The data sheet says this about EPI:
>>>     set; this bit is set whenever the CAN controller has
>>>     reached the error passive status (at least one
>>>     error counter exceeds the protocol-defined level of
>>>     127) or if the CAN controller is in the error passive
>>>     status and enters the error active status again and
>>>     the EPIE bit is set within the interrupt enable
>>>     register
>>>
>>> I'm a little bit concerned that it actually says that EPI is set on
>>> "error passive"
>>> and "error passive to error active". However, the log says that
>>> txerr=127 on
>>> that second interrupt. It makes more sense for it to be "error
>>> warning". Might
>>> this be a error in the datasheet?
>> IIRC, the CAN standard only knows error active and error passive.
>> Various vendors added the warning level where the level is sometimes
>> even programmable.
> Yes, this makes sense. The manual doesn't even say that the "warning
> level" is a
> state.
...
>>>  (000.053403)  can0  5B1   [5]  D1 D1 BB 7C DF
>>>  (000.146664)  can0  506   [8]  2C 6E 9C 77 5E F1 AE 2D
>>>  (000.053628)  can0  488   [8]  FF C8 AC 26 43 C5 AF 11
>>>  (000.146428)  can0  5AD   [8]  C1 31 BD 1B 6B 8D 34 13
>>>  (000.053112)  can0  658   [0]
>>>  (000.171131)  can0  20000004   [8]  00 08 00 00 00 00 60 00  
>>>  ERRORFRAME
>>>     controller-problem{tx-error-warning}
>>>     error-counter-tx-rx{{96}{0}}
>>>  (000.013841)  can0  20000004   [8]  00 20 00 00 00 00 80 00  
>>>  ERRORFRAME
>>>     controller-problem{tx-error-passive}
>>>     error-counter-tx-rx{{128}{0}}
>>>  (004.433642)  can0  20000002   [8]  04 00 00 00 00 00 00 00  
>>>  ERRORFRAME
>>>     lost-arbitration{at bit 4}
>>>  (000.014785)  can0  20000002   [8]  02 00 00 00 00 00 00 00  
>>>  ERRORFRAME
>>>     lost-arbitration{at bit 2}
>>>  (000.012565)  can0  20000002   [8]  02 00 00 00 00 00 00 00  
>>>  ERRORFRAME
>>>     lost-arbitration{at bit 2}
>>>  (000.000006)  can0  152   [8]  48 94 14 45 C3 60 8A 58
>>>  (000.000015)  can0  6D7   [4]  3A B8 84 26
>>>  (000.012537)  can0  20000002   [8]  02 00 00 00 00 00 00 00  
>>>  ERRORFRAME
>>>     lost-arbitration{at bit 2}
>> Why do you see these errors? Are there electrical problems on the CAN
>> bus? And if no cable is connected just txerr should increase (and
>> decrease if it's reconnected!
> These errors are occurring when I connect the cable again. They might be
> due to bad contact while plugging in the cable.

But you already receive *good* messages!

> Anyway. Like I said before, the controller does not enter error active
> state unless
> there is some other device sending on the bus. I've looked at "dmesg"
and
> there
> isn't even an interrupt. The netlink interface also says error-warning.

From http://www.kvaser.com/about-can/the-can-protocol/can-error-handling/

"The rules for increasing and decreasing the error counters are somewhat
complex, but the principle is simple: transmit errors give 8 error points,
and receive errors give 1 error point. Correctly transmitted and/or
received messages causes the counter(s) to decrease."
 
> When you were doing your experiments, did you perhaps have some node on
> the bus that might have answered to some of the cangen messages?
> 
> In any case, my setup is like this:
>  * Two sja1000 from the same peak_pci connected to the same bus.
>  * Both send using cangen
>  * Resistance: 60 Ohm

Hm, the cable should be terminated with 120 Ohm on both ends of the cable.
BTW: what bitrate do you use?

Wolfgang.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change handling.
  2014-10-23 13:10                         ` Wolfgang Grandegger
@ 2014-10-23 15:55                           ` Andri Yngvason
  0 siblings, 0 replies; 17+ messages in thread
From: Andri Yngvason @ 2014-10-23 15:55 UTC (permalink / raw)
  To: Wolfgang Grandegger; +Cc: linux-can


On fim 23.okt 2014 13:10, Wolfgang Grandegger wrote:
> On Thu, 23 Oct 2014 12:50:20 +0000, Andri Yngvason
> <andri.yngvason@marel.com> wrote:
>> On mið 22.okt 2014 18:42, Wolfgang Grandegger wrote:
>>> On 10/22/2014 06:32 PM, Andri Yngvason wrote:
> ...
>>>> The data sheet says this about EPI:
>>>>     set; this bit is set whenever the CAN controller has
>>>>     reached the error passive status (at least one
>>>>     error counter exceeds the protocol-defined level of
>>>>     127) or if the CAN controller is in the error passive
>>>>     status and enters the error active status again and
>>>>     the EPIE bit is set within the interrupt enable
>>>>     register
>>>>
>>>> I'm a little bit concerned that it actually says that EPI is set on
>>>> "error passive"
>>>> and "error passive to error active". However, the log says that
>>>> txerr=127 on
>>>> that second interrupt. It makes more sense for it to be "error
>>>> warning". Might
>>>> this be a error in the datasheet?
>>> IIRC, the CAN standard only knows error active and error passive.
>>> Various vendors added the warning level where the level is sometimes
>>> even programmable.
>> Yes, this makes sense. The manual doesn't even say that the "warning
>> level" is a
>> state.
> ...
>>>>  (000.053403)  can0  5B1   [5]  D1 D1 BB 7C DF
>>>>  (000.146664)  can0  506   [8]  2C 6E 9C 77 5E F1 AE 2D
>>>>  (000.053628)  can0  488   [8]  FF C8 AC 26 43 C5 AF 11
>>>>  (000.146428)  can0  5AD   [8]  C1 31 BD 1B 6B 8D 34 13
>>>>  (000.053112)  can0  658   [0]
>>>>  (000.171131)  can0  20000004   [8]  00 08 00 00 00 00 60 00  
>>>>  ERRORFRAME
>>>>     controller-problem{tx-error-warning}
>>>>     error-counter-tx-rx{{96}{0}}
>>>>  (000.013841)  can0  20000004   [8]  00 20 00 00 00 00 80 00  
>>>>  ERRORFRAME
>>>>     controller-problem{tx-error-passive}
>>>>     error-counter-tx-rx{{128}{0}}
>>>>  (004.433642)  can0  20000002   [8]  04 00 00 00 00 00 00 00  
>>>>  ERRORFRAME
>>>>     lost-arbitration{at bit 4}
>>>>  (000.014785)  can0  20000002   [8]  02 00 00 00 00 00 00 00  
>>>>  ERRORFRAME
>>>>     lost-arbitration{at bit 2}
>>>>  (000.012565)  can0  20000002   [8]  02 00 00 00 00 00 00 00  
>>>>  ERRORFRAME
>>>>     lost-arbitration{at bit 2}
>>>>  (000.000006)  can0  152   [8]  48 94 14 45 C3 60 8A 58
>>>>  (000.000015)  can0  6D7   [4]  3A B8 84 26
>>>>  (000.012537)  can0  20000002   [8]  02 00 00 00 00 00 00 00  
>>>>  ERRORFRAME
>>>>     lost-arbitration{at bit 2}
>>> Why do you see these errors? Are there electrical problems on the CAN
>>> bus? And if no cable is connected just txerr should increase (and
>>> decrease if it's reconnected!
>> These errors are occurring when I connect the cable again. They might be
>> due to bad contact while plugging in the cable.
> But you already receive *good* messages!
>
>> Anyway. Like I said before, the controller does not enter error active
>> state unless
>> there is some other device sending on the bus. I've looked at "dmesg"
> and
>> there
>> isn't even an interrupt. The netlink interface also says error-warning.
> >From http://www.kvaser.com/about-can/the-can-protocol/can-error-handling/
>
> "The rules for increasing and decreasing the error counters are somewhat
> complex, but the principle is simple: transmit errors give 8 error points,
> and receive errors give 1 error point. Correctly transmitted and/or
> received messages causes the counter(s) to decrease."
>  
>> When you were doing your experiments, did you perhaps have some node on
>> the bus that might have answered to some of the cangen messages?
>>
>> In any case, my setup is like this:
>>  * Two sja1000 from the same peak_pci connected to the same bus.
>>  * Both send using cangen
>>  * Resistance: 60 Ohm
> Hm, the cable should be terminated with 120 Ohm on both ends of the cable.
> BTW: what bitrate do you use?
The bitrate is 125k.
root@x86-20140911-072109:~# ip -s -d link show can0
15: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 16 qdisc pfifo_fast state UP qlen 300
    link/can
    can state ERROR-ACTIVE restart-ms 50
    bitrate 125000 sample-point 0.875
    tq 1000 prop-seg 3 phase-seg1 3 phase-seg2 1 sjw 1
    sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
    clock 8000000
    re-started bus-errors arbit-lost error-warn error-pass bus-off
    1          0          28         11         11         2        
    RX: bytes  packets  errors  dropped overrun mcast  
    2216731    385309   0       0       0       0     
    TX: bytes  packets  errors  dropped carrier collsns
    2523021    391003   28      1       0       0     


Anyway, I got this working as you as you said it should work!

I moved the terminating resistors so that can0 and can1 are always terminated,
even when disconnected from the bus. I'm not sure why this works, but it does.

Maybe the problem had something to do with the fact that can0 was just floating
when I disconnected it.

 ...
 (000.200059)  can0  761   [1]  19
 (000.200358)  can0  7C9   [5]  02 D4 FA 22 2F
 (000.200232)  can0  180   [7]  B8 BF 37 5E D0 EB 3C
 (000.224712)  can0  20000004   [8]  00 08 00 00 00 00 60 00   ERRORFRAME
    controller-problem{tx-error-warning}
    error-counter-tx-rx{{96}{0}}
 (000.013858)  can0  20000004   [8]  00 20 00 00 00 00 80 00   ERRORFRAME
    controller-problem{tx-error-passive}
    error-counter-tx-rx{{128}{0}}
 (006.130890)  can0  20F   [8]  4F A3 0A 5E FA F9 E0 58
 (000.000980)  can0  539   [8]  BF 4F 72 7C 82 E6 46 72
 (000.000513)  can0  70A   [1]  87
 ...
 (000.000914)  can0  14F   [7]  43 22 3E 76 B8 8E 2E
 (000.001031)  can0  7B6   [8]  31 EE 34 79 0E FB C0 0B
 (000.000687)  can0  73D   [4]  A0 B3 C1 35
 (000.013928)  can0  20000004   [8]  00 08 00 00 00 00 7F 00   ERRORFRAME
    controller-problem{tx-error-warning}
    error-counter-tx-rx{{127}{0}}
 (000.000618)  can0  0D6   [3]  08 D5 D2
 (000.000689)  can0  14E   [4]  20 A2 A0 3A
 (000.000517)  can0  7C0   [1]  95
...
 (000.199771)  can0  7B5   [3]  24 1D 09
 (000.200256)  can0  632   [6]  D9 29 E9 7A 34 71
 (000.200170)  can0  194   [7]  F8 B4 50 67 2B 9E BB
 (000.013881)  can0  20000004   [8]  00 40 00 00 00 00 5F 00   ERRORFRAME
    controller-problem{back-to-error-active}
    error-counter-tx-rx{{95}{0}}
 (000.185735)  can0  71B   [0]
 (000.200622)  can0  090   [8]  62 29 1E 61 35 E3 FC 09
 (000.200080)  can0  628   [8]  C0 54 25 7B CE 56 02 7D
 (000.200605)  can0  13A   [8]  8F 07 70 3A B2 E5 82 29
 ...


- Andri

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change handling.
  2014-09-26 18:25 [PATCH v2 1/4] can: dev: Consolidate and unify state change handling Andri Yngvason
  2014-10-20 19:45 ` Wolfgang Grandegger
@ 2014-11-22 17:10 ` Marc Kleine-Budde
  2014-11-22 18:15   ` Andri Yngvason
  1 sibling, 1 reply; 17+ messages in thread
From: Marc Kleine-Budde @ 2014-11-22 17:10 UTC (permalink / raw)
  To: Andri Yngvason, linux-can; +Cc: Wolfgang Grandegger

[-- Attachment #1: Type: text/plain, Size: 941 bytes --]

On 09/26/2014 08:25 PM, Andri Yngvason wrote:
> The handling of can error states is different between platforms.
> This is an attempt to correct that problem.
> 
> I've moved this handling into a generic function for changing the
> error state. This ensures that error state changes are handled
> the same way everywhere (where this function is used).
> 
> What's changed from the last version of this patch-set is that
> can_change_state() now relies on the individual states of the rx/tx
> counters rather than their individual count values.
> 
> Signed-off-by: Andri Yngvason <andri.yngvason@marel.com>

What's the status on this series?

Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2 1/4] can: dev: Consolidate and unify state change handling.
  2014-11-22 17:10 ` Marc Kleine-Budde
@ 2014-11-22 18:15   ` Andri Yngvason
  0 siblings, 0 replies; 17+ messages in thread
From: Andri Yngvason @ 2014-11-22 18:15 UTC (permalink / raw)
  To: linux-can

On Sat, Nov 22, 2014 at 06:10:37PM +0100, Marc Kleine-Budde wrote:
> On 09/26/2014 08:25 PM, Andri Yngvason wrote:
> > The handling of can error states is different between platforms.
> > This is an attempt to correct that problem.
> > 
> > I've moved this handling into a generic function for changing the
> > error state. This ensures that error state changes are handled
> > the same way everywhere (where this function is used).
> > 
> > What's changed from the last version of this patch-set is that
> > can_change_state() now relies on the individual states of the rx/tx
> > counters rather than their individual count values.
> > 
> > Signed-off-by: Andri Yngvason <andri.yngvason@marel.com>
> 
> What's the status on this series?
> 

I'm working on this right now.

--
Andri

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2014-11-22 18:47 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-09-26 18:25 [PATCH v2 1/4] can: dev: Consolidate and unify state change handling Andri Yngvason
2014-10-20 19:45 ` Wolfgang Grandegger
2014-10-21 10:42   ` Andri Yngvason
2014-10-21 10:52     ` Wolfgang Grandegger
2014-10-21 14:56       ` Andri Yngvason
2014-10-21 15:21         ` Wolfgang Grandegger
2014-10-22 11:48           ` Andri Yngvason
2014-10-22 12:30             ` Wolfgang Grandegger
2014-10-22 12:48               ` Andri Yngvason
2014-10-22 12:56                 ` Wolfgang Grandegger
2014-10-22 16:32                   ` Andri Yngvason
2014-10-22 18:42                     ` Wolfgang Grandegger
2014-10-23 12:50                       ` Andri Yngvason
2014-10-23 13:10                         ` Wolfgang Grandegger
2014-10-23 15:55                           ` Andri Yngvason
2014-11-22 17:10 ` Marc Kleine-Budde
2014-11-22 18:15   ` Andri Yngvason

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).