[Linuxptp-devel] [PATCH v2] ptp4l: flush old cached packets

Discussion:

Jiri Benc

2013-07-19 08:43:09 UTC

This patch fixes a bug with time mysteriously jumping back and forth:

ptp4l[930.687]: port 1: UNCALIBRATED to SLAVE on MASTER_CLOCK_SELECTED
ptp4l[931.687]: master offset 17 s2 freq +33014 path delay 2728
ptp4l[932.687]: master offset -74 s2 freq +32928 path delay 2734
ptp4l[933.687]: master offset 2 s2 freq +32982 path delay 2734
ptp4l[934.687]: master offset -3 s2 freq +32977 path delay 2728
ptp4l[935.687]: master offset 17 s2 freq +32996 path delay 2729
ptp4l[936.687]: master offset -10 s2 freq +32974 path delay 2729
ptp4l[937.687]: master offset 35 s2 freq +33016 path delay 2727
ptp4l[938.686]: master offset 60001851388 s2 freq +62499999 path delay 2728
ptp4l[939.687]: master offset -62464938 s2 freq -62431946 path delay 2728

The last follow up message arriving out of order is cached. Before the state
machine changes to UNCALIBRATED, all sync and follow up messages are discarded.
If we get into that state between a sync and follow up message, the latter is
cached. When there's no real roerdering happening, it's kept cached forever.

When we restart the master, it starts numbering the messages from zero again.
The initial synchronization doesn't take always the same amount of time, so it
can happen that we get into UNCALIBRATED a little bit faster than before,
managing to get the sync message with the sequenceId that we missed last time.
As it has the same sequenceId as the cached (old) follow up message, it's
incorrectly assumed those two belong together.

Flush the cache when changing to UNCALIBRATED. Also, do similar thing for other
cached packets.

Signed-off-by: Jiri Benc <***@redhat.com>
---
v2: removed unnecesary flush after port_disable
---
port.c | 27 ++++++++++++++++++++++++---
1 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/port.c b/port.c
index d29a70c..d7b29bb 100644
--- a/port.c
+++ b/port.c
@@ -1115,10 +1115,8 @@ static int port_is_enabled(struct port *p)
return 1;
}

-static void port_disable(struct port *p)
+static void flush_last_sync(struct port *p)
{
- int i;
-
if (p->last_follow_up) {
msg_put(p->last_follow_up);
p->last_follow_up = NULL;
@@ -1127,10 +1125,18 @@ static void port_disable(struct port *p)
msg_put(p->last_sync);
p->last_sync = NULL;
}
+}
+
+static void flush_delay_req(struct port *p)
+{
if (p->delay_req) {
msg_put(p->delay_req);
p->delay_req = NULL;
}
+}
+
+static void flush_peer_delay(struct port *p)
+{
if (p->peer_delay_req) {
msg_put(p->peer_delay_req);
p->peer_delay_req = NULL;
@@ -1143,6 +1149,15 @@ static void port_disable(struct port *p)
msg_put(p->peer_delay_fup);
p->peer_delay_fup = NULL;
}
+}
+
+static void port_disable(struct port *p)
+{
+ int i;
+
+ flush_last_sync(p);
+ flush_delay_req(p);
+ flush_peer_delay(p);

p->best = NULL;
free_foreign_masters(p);
@@ -1782,6 +1797,9 @@ static void port_e2e_transition(struct port *p, enum port_state next)
port_set_announce_tmo(p);
break;
case PS_UNCALIBRATED:
+ flush_last_sync(p);
+ flush_delay_req(p);
+ /* fall through */
case PS_SLAVE:
port_set_announce_tmo(p);
port_set_delay_tmo(p);
@@ -1819,6 +1837,9 @@ static void port_p2p_transition(struct port *p, enum port_state next)
port_set_announce_tmo(p);
break;
case PS_UNCALIBRATED:
+ flush_last_sync(p);
+ flush_peer_delay(p);
+ /* fall through */
case PS_SLAVE:
port_set_announce_tmo(p);
break;

--
1.7.6.5

Richard Cochran

2013-07-19 14:22:15 UTC

Permalink

Post by Jiri Benc
Flush the cache when changing to UNCALIBRATED. Also, do similar thing for other
cached packets.
---
v2: removed unnecesary flush after port_disable

Applied.

Thanks,
Richard

Ken ICHIKAWA

2013-07-29 07:04:24 UTC

Permalink

Post by Jiri Benc
ptp4l[930.687]: port 1: UNCALIBRATED to SLAVE on MASTER_CLOCK_SELECTED
ptp4l[931.687]: master offset 17 s2 freq +33014 path delay 2728
ptp4l[932.687]: master offset -74 s2 freq +32928 path delay 2734
ptp4l[933.687]: master offset 2 s2 freq +32982 path delay 2734
ptp4l[934.687]: master offset -3 s2 freq +32977 path delay 2728
ptp4l[935.687]: master offset 17 s2 freq +32996 path delay 2729
ptp4l[936.687]: master offset -10 s2 freq +32974 path delay 2729
ptp4l[937.687]: master offset 35 s2 freq +33016 path delay 2727
ptp4l[938.686]: master offset 60001851388 s2 freq +62499999 path delay 2728
ptp4l[939.687]: master offset -62464938 s2 freq -62431946 path delay 2728
The last follow up message arriving out of order is cached. Before the state
machine changes to UNCALIBRATED, all sync and follow up messages are discarded.
If we get into that state between a sync and follow up message, the latter is
cached. When there's no real roerdering happening, it's kept cached forever.
When we restart the master, it starts numbering the messages from zero again.
The initial synchronization doesn't take always the same amount of time, so it
can happen that we get into UNCALIBRATED a little bit faster than before,
managing to get the sync message with the sequenceId that we missed last time.
As it has the same sequenceId as the cached (old) follow up message, it's
incorrectly assumed those two belong together.
Flush the cache when changing to UNCALIBRATED. Also, do similar thing for other
cached packets.

When sequenceId is overflowed, does same problem occur?
If so, I think flush is needed also at the time.

Thank,
Ken Ichikawa

Richard Cochran

2013-07-29 10:52:32 UTC

Permalink

Post by Ken ICHIKAWA
When sequenceId is overflowed, does same problem occur?

The sync and its matching followup both have the same sequenceId, and
we test for != and == when matching, so think there is no issue WRT
overflow.

Thanks,
Richard

Ken ICHIKAWA

2013-07-30 00:18:27 UTC

Permalink

Post by Richard Cochran

Post by Ken ICHIKAWA
When sequenceId is overflowed, does same problem occur?

The sync and its matching followup both have the same sequenceId, and
we test for != and == when matching, so think there is no issue WRT
overflow.

How about following situation?

Port is LISTENING
Sync (seqId 0) : ignored
Fup (seqId 0) : ignored
Sync (seqId 1) : ignored
Port becomes UNCALIBRATED here
Fup (seqId 1) : cached!
Sync (seqId 2) : cached
Fup (seqId 2) : match
Sync (seqId 3) : cached
Fup (seqId 3) : match
...
...
Sync (seqId 65535) : cached
Fup (seqId 65535) : match
Sync (seqId 0) : cached
Fup (seqId 0) : match
Sync (seqId 1) : match with old Fup!!
Fup (seqId 1) : cached!
Sync (seqId 2) : cached
Fup (seqId 2) : match

Actually, I experienced 65500 secs offset every about 65500 secs.
I'm thinking this is the cause.

Thanks,
Ken Ichikawa

Richard Cochran

2013-07-30 19:30:13 UTC

Permalink

Post by Ken ICHIKAWA
Actually, I experienced 65500 secs offset every about 65500 secs.
I'm thinking this is the cause.

Ouch, this is a nasty bug. I will look into a solution.

Thanks,
Richard

Richard Cochran

2013-08-01 20:26:27 UTC

Permalink

Ken Ichikawa has identified a situation in which a sync message can be
wrongly associated with a follow up after the sequence counter wraps
around.

Port is LISTENING
Sync (seqId 0) : ignored
Fup (seqId 0) : ignored
Sync (seqId 1) : ignored
Port becomes UNCALIBRATED here
Fup (seqId 1) : cached!
Sync (seqId 2) : cached
Fup (seqId 2) : match
Sync (seqId 3) : cached
Fup (seqId 3) : match
...
Sync (seqId 65535) : cached
Fup (seqId 65535) : match
Sync (seqId 0) : cached
Fup (seqId 0) : match
Sync (seqId 1) : match with old Fup!!
Fup (seqId 1) : cached!
Sync (seqId 2) : cached
Fup (seqId 2) : match

Actually, I experienced 65500 secs offset every about 65500 secs.
I'm thinking this is the cause.

This patch fixes the issue by changing the port code to remember one
sync or one follow up, never both. The previous ad hoc logic has been
replaced with a small state machine that handles the messages in the
proper order.

Signed-off-by: Richard Cochran <***@gmail.com>
Reported-by: Ken ICHIKAWA <***@jp.fujitsu.com>
---
port.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++------------------
1 file changed, 120 insertions(+), 47 deletions(-)

diff --git a/port.c b/port.c
index f03aa05..3e7a6c3 100644
--- a/port.c
+++ b/port.c
@@ -39,6 +39,19 @@
#define ALLOWED_LOST_RESPONSES 3
#define PORT_MAVE_LENGTH 10

+enum syfu_state {
+ SF_EMPTY,
+ SF_HAVE_SYNC,
+ SF_HAVE_FUP,
+};
+
+enum syfu_event {
+ SYNC_MISMATCH,
+ SYNC_MATCH,
+ FUP_MISMATCH,
+ FUP_MATCH,
+};
+
struct nrate_estimator {
double ratio;
tmv_t origin1;
@@ -54,8 +67,8 @@ struct port {
enum timestamp_type timestamping;
struct fdarray fda;
struct foreign_clock *best;
- struct ptp_message *last_follow_up;
- struct ptp_message *last_sync;
+ enum syfu_state syfu;
+ struct ptp_message *last_syncfup;
struct ptp_message *delay_req;
struct ptp_message *peer_delay_req;
struct ptp_message *peer_delay_resp;
@@ -842,6 +855,91 @@ static void port_synchronize(struct port *p,
}
}

+/*
+ * Handle out of order packets. The network stack might
+ * provide the follow up _before_ the sync message. After all,
+ * they can arrive on two different ports. In addition, time
+ * stamping in PHY devices might delay the event packets.
+ */
+static void port_syfufsm(struct port *p, enum syfu_event event,
+ struct ptp_message *m)
+{
+ struct ptp_message *syn, *fup;
+
+ switch (p->syfu) {
+ case SF_EMPTY:
+ switch (event) {
+ case SYNC_MISMATCH:
+ msg_get(m);
+ p->last_syncfup = m;
+ p->syfu = SF_HAVE_SYNC;
+ break;
+ case FUP_MISMATCH:
+ msg_get(m);
+ p->last_syncfup = m;
+ p->syfu = SF_HAVE_FUP;
+ break;
+ case SYNC_MATCH:
+ break;
+ case FUP_MATCH:
+ break;
+ }
+ break;
+
+ case SF_HAVE_SYNC:
+ switch (event) {
+ case SYNC_MISMATCH:
+ msg_put(p->last_syncfup);
+ msg_get(m);
+ p->last_syncfup = m;
+ break;
+ case SYNC_MATCH:
+ break;
+ case FUP_MISMATCH:
+ msg_put(p->last_syncfup);
+ msg_get(m);
+ p->last_syncfup = m;
+ p->syfu = SF_HAVE_FUP;
+ break;
+ case FUP_MATCH:
+ syn = p->last_syncfup;
+ port_synchronize(p, syn->hwts.ts, m->ts.pdu,
+ syn->header.correction,
+ m->header.correction);
+ msg_put(p->last_syncfup);
+ p->syfu = SF_EMPTY;
+ break;
+ }
+ break;
+
+ case SF_HAVE_FUP:
+ switch (event) {
+ case SYNC_MISMATCH:
+ msg_put(p->last_syncfup);
+ msg_get(m);
+ p->last_syncfup = m;
+ p->syfu = SF_HAVE_SYNC;
+ break;
+ case SYNC_MATCH:
+ fup = p->last_syncfup;
+ port_synchronize(p, m->hwts.ts, fup->ts.pdu,
+ m->header.correction,
+ fup->header.correction);
+ msg_put(p->last_syncfup);
+ p->syfu = SF_EMPTY;
+ break;
+ case FUP_MISMATCH:
+ msg_put(p->last_syncfup);
+ msg_get(m);
+ p->last_syncfup = m;
+ break;
+ case FUP_MATCH:
+ break;
+ }
+ break;
+ }
+}
+
static int port_pdelay_request(struct port *p)
{
struct ptp_message *msg;
@@ -1117,13 +1215,9 @@ static int port_is_enabled(struct port *p)

static void flush_last_sync(struct port *p)
{
- if (p->last_follow_up) {
- msg_put(p->last_follow_up);
- p->last_follow_up = NULL;
- }
- if (p->last_sync) {
- msg_put(p->last_sync);
- p->last_sync = NULL;
+ if (p->syfu != SF_EMPTY) {
+ msg_put(p->last_syncfup);
+ p->syfu = SF_EMPTY;
}
}

@@ -1387,8 +1481,8 @@ static void process_delay_resp(struct port *p, struct ptp_message *m)

static void process_follow_up(struct port *p, struct ptp_message *m)
{
- struct ptp_message *syn;
- struct PortIdentity master, *pid;
+ enum syfu_event event;
+ struct PortIdentity master;
switch (p->state) {
case PS_INITIALIZING:
case PS_FAULTY:
@@ -1414,27 +1508,13 @@ static void process_follow_up(struct port *p, struct ptp_message *m)
clock_follow_up_info(p->clock, fui);
}

- /*
- * Handle out of order packets. The network stack might
- * provide the follow up _before_ the sync message. After all,
- * they can arrive on two different ports. In addition, time
- * stamping in PHY devices might delay the event packets.
- */
- syn = p->last_sync;
- if (!syn || syn->header.sequenceId != m->header.sequenceId) {
- if (p->last_follow_up)
- msg_put(p->last_follow_up);
- msg_get(m);
- p->last_follow_up = m;
- return;
+ if (p->syfu == SF_HAVE_SYNC &&
+ p->last_syncfup->header.sequenceId == m->header.sequenceId) {
+ event = FUP_MATCH;
+ } else {
+ event = FUP_MISMATCH;
}
-
- pid = &syn->header.sourcePortIdentity;
- if (memcmp(pid, &m->header.sourcePortIdentity, sizeof(*pid)))
- return;
-
- port_synchronize(p, syn->hwts.ts, m->ts.pdu,
- syn->header.correction, m->header.correction);
+ port_syfufsm(p, event, m);
}

static int process_pdelay_req(struct port *p, struct ptp_message *m)
@@ -1675,7 +1755,7 @@ static void process_pdelay_resp_fup(struct port *p, struct ptp_message *m)

static void process_sync(struct port *p, struct ptp_message *m)
{
- struct ptp_message *fup;
+ enum syfu_event event;
struct PortIdentity master;
switch (p->state) {
case PS_INITIALIZING:
@@ -1706,24 +1786,17 @@ static void process_sync(struct port *p, struct ptp_message *m)
if (one_step(m)) {
port_synchronize(p, m->hwts.ts, m->ts.pdu,
m->header.correction, 0);
+ flush_last_sync(p);
return;
}
- /*
- * Check if follow up arrived first.
- */
- fup = p->last_follow_up;
- if (fup && fup->header.sequenceId == m->header.sequenceId) {
- port_synchronize(p, m->hwts.ts, fup->ts.pdu,
- m->header.correction, fup->header.correction);
- return;
+
+ if (p->syfu == SF_HAVE_FUP &&
+ p->last_syncfup->header.sequenceId == m->header.sequenceId) {
+ event = SYNC_MATCH;
+ } else {
+ event = SYNC_MISMATCH;
}
- /*
- * Remember this sync for two step operation.
- */
- if (p->last_sync)
- msg_put(p->last_sync);
- msg_get(m);
- p->last_sync = m;
+ port_syfufsm(p, event, m);
}

/* public methods */

--
1.7.10.4

Keller, Jacob E

2013-08-01 21:01:18 UTC

Permalink

ACK. Looks good. Only comment I had was flush_last_sync seems odd since
we could flush a sync or a fup message. However, we are flushing the
synchronization process, and I can't think of a better name (syncfup
sounds odd...) so I think it is fine as is.

Thanks Richard. This is much cleaner logic and should resolve any issues
regarding the ordering. The state machine is much clearer logic compared
to how it was before.

- Jake

Post by Richard Cochran
Ken Ichikawa has identified a situation in which a sync message can be
wrongly associated with a follow up after the sequence counter wraps
around.
Port is LISTENING
Sync (seqId 0) : ignored
Fup (seqId 0) : ignored
Sync (seqId 1) : ignored
Port becomes UNCALIBRATED here
Fup (seqId 1) : cached!
Sync (seqId 2) : cached
Fup (seqId 2) : match
Sync (seqId 3) : cached
Fup (seqId 3) : match
...
Sync (seqId 65535) : cached
Fup (seqId 65535) : match
Sync (seqId 0) : cached
Fup (seqId 0) : match
Sync (seqId 1) : match with old Fup!!
Fup (seqId 1) : cached!
Sync (seqId 2) : cached
Fup (seqId 2) : match
Actually, I experienced 65500 secs offset every about 65500 secs.
I'm thinking this is the cause.
This patch fixes the issue by changing the port code to remember one
sync or one follow up, never both. The previous ad hoc logic has been
replaced with a small state machine that handles the messages in the
proper order.
---
port.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++------------------
1 file changed, 120 insertions(+), 47 deletions(-)
diff --git a/port.c b/port.c
index f03aa05..3e7a6c3 100644
--- a/port.c
+++ b/port.c
@@ -39,6 +39,19 @@
#define ALLOWED_LOST_RESPONSES 3
#define PORT_MAVE_LENGTH 10
+enum syfu_state {
+ SF_EMPTY,
+ SF_HAVE_SYNC,
+ SF_HAVE_FUP,
+};
+
+enum syfu_event {
+ SYNC_MISMATCH,
+ SYNC_MATCH,
+ FUP_MISMATCH,
+ FUP_MATCH,
+};
+
struct nrate_estimator {
double ratio;
tmv_t origin1;
@@ -54,8 +67,8 @@ struct port {
enum timestamp_type timestamping;
struct fdarray fda;
struct foreign_clock *best;
- struct ptp_message *last_follow_up;
- struct ptp_message *last_sync;
+ enum syfu_state syfu;
+ struct ptp_message *last_syncfup;
struct ptp_message *delay_req;
struct ptp_message *peer_delay_req;
struct ptp_message *peer_delay_resp;
@@ -842,6 +855,91 @@ static void port_synchronize(struct port *p,
}
}
+/*
+ * Handle out of order packets. The network stack might
+ * provide the follow up _before_ the sync message. After all,
+ * they can arrive on two different ports. In addition, time
+ * stamping in PHY devices might delay the event packets.
+ */
+static void port_syfufsm(struct port *p, enum syfu_event event,
+ struct ptp_message *m)
+{
+ struct ptp_message *syn, *fup;
+
+ switch (p->syfu) {
+ switch (event) {
+ msg_get(m);
+ p->last_syncfup = m;
+ p->syfu = SF_HAVE_SYNC;
+ break;
+ msg_get(m);
+ p->last_syncfup = m;
+ p->syfu = SF_HAVE_FUP;
+ break;
+ break;
+ break;
+ }
+ break;
+
+ switch (event) {
+ msg_put(p->last_syncfup);
+ msg_get(m);
+ p->last_syncfup = m;
+ break;
+ break;
+ msg_put(p->last_syncfup);
+ msg_get(m);
+ p->last_syncfup = m;
+ p->syfu = SF_HAVE_FUP;
+ break;
+ syn = p->last_syncfup;
+ port_synchronize(p, syn->hwts.ts, m->ts.pdu,
+ syn->header.correction,
+ m->header.correction);
+ msg_put(p->last_syncfup);
+ p->syfu = SF_EMPTY;
+ break;
+ }
+ break;
+
+ switch (event) {
+ msg_put(p->last_syncfup);
+ msg_get(m);
+ p->last_syncfup = m;
+ p->syfu = SF_HAVE_SYNC;
+ break;
+ fup = p->last_syncfup;
+ port_synchronize(p, m->hwts.ts, fup->ts.pdu,
+ m->header.correction,
+ fup->header.correction);
+ msg_put(p->last_syncfup);
+ p->syfu = SF_EMPTY;
+ break;
+ msg_put(p->last_syncfup);
+ msg_get(m);
+ p->last_syncfup = m;
+ break;
+ break;
+ }
+ break;
+ }
+}
+
static int port_pdelay_request(struct port *p)
{
struct ptp_message *msg;
@@ -1117,13 +1215,9 @@ static int port_is_enabled(struct port *p)
static void flush_last_sync(struct port *p)
{
- if (p->last_follow_up) {
- msg_put(p->last_follow_up);
- p->last_follow_up = NULL;
- }
- if (p->last_sync) {
- msg_put(p->last_sync);
- p->last_sync = NULL;
+ if (p->syfu != SF_EMPTY) {
+ msg_put(p->last_syncfup);
+ p->syfu = SF_EMPTY;
}
}
@@ -1387,8 +1481,8 @@ static void process_delay_resp(struct port *p, struct ptp_message *m)
static void process_follow_up(struct port *p, struct ptp_message *m)
{
- struct ptp_message *syn;
- struct PortIdentity master, *pid;
+ enum syfu_event event;
+ struct PortIdentity master;
switch (p->state) {
@@ -1414,27 +1508,13 @@ static void process_follow_up(struct port *p, struct ptp_message *m)
clock_follow_up_info(p->clock, fui);
}
- /*
- * Handle out of order packets. The network stack might
- * provide the follow up _before_ the sync message. After all,
- * they can arrive on two different ports. In addition, time
- * stamping in PHY devices might delay the event packets.
- */
- syn = p->last_sync;
- if (!syn || syn->header.sequenceId != m->header.sequenceId) {
- if (p->last_follow_up)
- msg_put(p->last_follow_up);
- msg_get(m);
- p->last_follow_up = m;
- return;
+ if (p->syfu == SF_HAVE_SYNC &&
+ p->last_syncfup->header.sequenceId == m->header.sequenceId) {
+ event = FUP_MATCH;
+ } else {
+ event = FUP_MISMATCH;
}
-
- pid = &syn->header.sourcePortIdentity;
- if (memcmp(pid, &m->header.sourcePortIdentity, sizeof(*pid)))
- return;
-
- port_synchronize(p, syn->hwts.ts, m->ts.pdu,
- syn->header.correction, m->header.correction);
+ port_syfufsm(p, event, m);
}
static int process_pdelay_req(struct port *p, struct ptp_message *m)
@@ -1675,7 +1755,7 @@ static void process_pdelay_resp_fup(struct port *p, struct ptp_message *m)
static void process_sync(struct port *p, struct ptp_message *m)
{
- struct ptp_message *fup;
+ enum syfu_event event;
struct PortIdentity master;
switch (p->state) {
@@ -1706,24 +1786,17 @@ static void process_sync(struct port *p, struct ptp_message *m)
if (one_step(m)) {
port_synchronize(p, m->hwts.ts, m->ts.pdu,
m->header.correction, 0);
+ flush_last_sync(p);
return;
}
- /*
- * Check if follow up arrived first.
- */
- fup = p->last_follow_up;
- if (fup && fup->header.sequenceId == m->header.sequenceId) {
- port_synchronize(p, m->hwts.ts, fup->ts.pdu,
- m->header.correction, fup->header.correction);
- return;
+
+ if (p->syfu == SF_HAVE_FUP &&
+ p->last_syncfup->header.sequenceId == m->header.sequenceId) {
+ event = SYNC_MATCH;
+ } else {
+ event = SYNC_MISMATCH;
}
- /*
- * Remember this sync for two step operation.
- */
- if (p->last_sync)
- msg_put(p->last_sync);
- msg_get(m);
- p->last_sync = m;
+ port_syfufsm(p,