I am not sure I understand how __dmb() works, then. For example, take the following code from the Pi Pico library:
/*
 * Compute how many elements are currently held in the queue from its
 * write and read indices.
 *
 * No locking is done here; presumably the caller is expected to hold the
 * queue's lock already (hence "_unsafe") -- confirm against the callers.
 *
 * q: queue to inspect.
 * Returns wptr - rptr, corrected by element_count + 1 when the write index
 * is numerically behind the read index.
 */
static inline uint queue_get_level_unsafe(queue_t *q) {
    int32_t level = (int32_t)q->wptr - (int32_t)q->rptr;
    if (level >= 0) {
        return (uint)level;
    }
    /* Negative difference: the write index has wrapped around; undo the wrap. */
    level += q->element_count + 1;
    return (uint)level;
}
/*
 * Try to append one element to the queue.
 *
 * q:     queue to append to.
 * data:  source buffer; q->element_size bytes are copied from it.
 * block: if true, keep retrying until the element has been stored;
 *        if false, give up immediately when the queue is full.
 *
 * Returns true once the element has been copied in, false when the queue
 * was full and block is false.
 */
static bool queue_add_internal(queue_t *q, const void *data, bool block) {
    for (;;) {
        /* Take the queue's spin lock; the returned value is handed back to
         * whichever unlock call is used below. */
        uint32_t irq_state = spin_lock_blocking(q->core.spin_lock);

        bool has_room = queue_get_level_unsafe(q) != q->element_count;
        if (has_room) {
            /* Store the element first, then advance the write index. */
            memcpy(element_ptr(q, q->wptr), data, q->element_size);
            q->wptr = inc_index(q, q->wptr);
            lock_internal_spin_unlock_with_notify(&q->core, irq_state);
            return true;
        }

        if (!block) {
            spin_unlock(q->core.spin_lock, irq_state);
            return false;
        }

        /* Queue full and caller wants to block: release the lock via the
         * "wait" variant (presumably sleeps until notified -- defined
         * elsewhere), then loop and try again. */
        lock_internal_spin_unlock_with_wait(&q->core, irq_state);
    }
}
/*
 * Try to take one element out of the queue.
 *
 * q:     queue to read from.
 * data:  destination buffer; q->element_size bytes are copied into it.
 * block: if true, keep retrying until an element has been taken;
 *        if false, give up immediately when the queue is empty.
 *
 * Returns true once an element has been copied out, false when the queue
 * was empty and block is false.
 */
static bool queue_remove_internal(queue_t *q, void *data, bool block) {
    for (;;) {
        /* Take the queue's spin lock; the returned value is handed back to
         * whichever unlock call is used below. */
        uint32_t irq_state = spin_lock_blocking(q->core.spin_lock);

        bool has_data = queue_get_level_unsafe(q) != 0;
        if (has_data) {
            /* Copy the element out first, then advance the read index. */
            memcpy(data, element_ptr(q, q->rptr), q->element_size);
            q->rptr = inc_index(q, q->rptr);
            lock_internal_spin_unlock_with_notify(&q->core, irq_state);
            return true;
        }

        if (!block) {
            spin_unlock(q->core.spin_lock, irq_state);
            return false;
        }

        /* Queue empty and caller wants to block: release the lock via the
         * "wait" variant (presumably sleeps until notified -- defined
         * elsewhere), then loop and try again. */
        lock_internal_spin_unlock_with_wait(&q->core, irq_state);
    }
}
A single 32-bit write is atomic; however, if several writes are performed, the order in which they are performed is not guaranteed unless there is a __dmb() separating the writes that should happen first from those that should happen afterwards, correct? So let's say on core0 we call the queue_add_internal() function, and q->wptr is set after the memcpy in queue_add_internal(); without a __dmb(), there doesn't seem to be a guarantee that the memcpy write happens before the write to q->wptr. So let's say that right after q->wptr is written, we call the queue_remove_internal() function on core1. The queue_get_level_unsafe() function reads the number of elements in the queue, but there is no guarantee that the memcpy() from queue_add_internal() has completed by the time it does this. So there's no guarantee that the data to be copied out by memcpy() is actually in the queue when it is going to be removed.
I would think that the writers of these functions would not make a mistake, so why isn't a __dmb() required here, as in the following modified version:
/*
 * Variant of queue_add_internal() with an explicit __dmb() between storing
 * the element and publishing the new write index.
 *
 * q:     queue to append to.
 * data:  source buffer; q->element_size bytes are copied from it.
 * block: if true, keep retrying until the element has been stored;
 *        if false, give up immediately when the queue is full.
 *
 * Returns true once the element has been copied in, false when the queue
 * was full and block is false.
 */
static bool queue_add_internal(queue_t *q, const void *data, bool block) {
    for (;;) {
        /* Take the queue's spin lock; the returned value is handed back to
         * whichever unlock call is used below. */
        uint32_t irq_state = spin_lock_blocking(q->core.spin_lock);

        bool has_room = queue_get_level_unsafe(q) != q->element_count;
        if (has_room) {
            memcpy(element_ptr(q, q->wptr), data, q->element_size);
            /* Barrier between the element copy and the index update. */
            __dmb();
            q->wptr = inc_index(q, q->wptr);
            lock_internal_spin_unlock_with_notify(&q->core, irq_state);
            return true;
        }

        if (!block) {
            spin_unlock(q->core.spin_lock, irq_state);
            return false;
        }

        /* Queue full and caller wants to block: release the lock via the
         * "wait" variant (presumably sleeps until notified -- defined
         * elsewhere), then loop and try again. */
        lock_internal_spin_unlock_with_wait(&q->core, irq_state);
    }
}
/*
 * Variant of queue_remove_internal() with an explicit __dmb() between
 * copying the element out and publishing the new read index.
 *
 * q:     queue to read from.
 * data:  destination buffer; q->element_size bytes are copied into it.
 * block: if true, keep retrying until an element has been taken;
 *        if false, give up immediately when the queue is empty.
 *
 * Returns true once an element has been copied out, false when the queue
 * was empty and block is false.
 */
static bool queue_remove_internal(queue_t *q, void *data, bool block) {
    for (;;) {
        /* Take the queue's spin lock; the returned value is handed back to
         * whichever unlock call is used below. */
        uint32_t irq_state = spin_lock_blocking(q->core.spin_lock);

        bool has_data = queue_get_level_unsafe(q) != 0;
        if (has_data) {
            memcpy(data, element_ptr(q, q->rptr), q->element_size);
            /* Barrier between the element copy and the index update. */
            __dmb();
            q->rptr = inc_index(q, q->rptr);
            lock_internal_spin_unlock_with_notify(&q->core, irq_state);
            return true;
        }

        if (!block) {
            spin_unlock(q->core.spin_lock, irq_state);
            return false;
        }

        /* Queue empty and caller wants to block: release the lock via the
         * "wait" variant (presumably sleeps until notified -- defined
         * elsewhere), then loop and try again. */
        lock_internal_spin_unlock_with_wait(&q->core, irq_state);
    }
}
Wouldn't these __dmb() calls be required to ensure that the data is in the queue before the write pointer is advanced, and similarly that the data has been copied out of the queue before the read pointer is advanced?