Here are the two XMM implementations, one using MFENCE and one using SFENCE. I
don't really believe that SFENCE is what fixes the data issue, but please look
at both of them; perhaps one of you sees something wrong. (I know, I should be
more positive, but bear with me.)
#include <emmintrin.h>  /* SSE2 intrinsics; u_vuaddr_t, SSE2_MOVE_ALIGNMENT and
                         * the *_U unaligned fallbacks come from our own headers. */

__inline static void
MOVE128_SSE2_INSTRINSIC_MFENCE(volatile unsigned *src, volatile unsigned *dst)
{
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
    __m128i *src_ptr = (__m128i *)src;
    __m128i *dst_ptr = (__m128i *)dst;

    if (((u_vuaddr_t)src % SSE2_MOVE_ALIGNMENT) ||
        ((u_vuaddr_t)dst % SSE2_MOVE_ALIGNMENT)) {
        MOVE128_SSE2_INSTRINSIC_MFENCE_U(src, dst);
        return;
    }

    /*
     * Prefetching to L1 cache - loads one cache line of data from the
     * address to a location "closer" to the processor (L1 cache).
     */
    _mm_prefetch((char *)src_ptr, _MM_HINT_T0);
    xmm0 = _mm_load_si128(src_ptr);     /* Move bytes   0 -  15 into XMM reg */
    xmm1 = _mm_load_si128(src_ptr + 1); /* Move bytes  16 -  31 into XMM reg */
    xmm2 = _mm_load_si128(src_ptr + 2); /* Move bytes  32 -  47 into XMM reg */
    xmm3 = _mm_load_si128(src_ptr + 3); /* Move bytes  48 -  63 into XMM reg */
    _mm_prefetch((char *)(src_ptr + 4), _MM_HINT_T0);
    xmm4 = _mm_load_si128(src_ptr + 4); /* Move bytes  64 -  79 into XMM reg */
    xmm5 = _mm_load_si128(src_ptr + 5); /* Move bytes  80 -  95 into XMM reg */
    xmm6 = _mm_load_si128(src_ptr + 6); /* Move bytes  96 - 111 into XMM reg */
    xmm7 = _mm_load_si128(src_ptr + 7); /* Move bytes 112 - 127 into XMM reg */
    _mm_lfence();
    _mm_store_si128(dst_ptr,     xmm0); /* Move bytes   0 -  15 to destination */
    _mm_store_si128(dst_ptr + 1, xmm1); /* Move bytes  16 -  31 to destination */
    _mm_store_si128(dst_ptr + 2, xmm2); /* Move bytes  32 -  47 to destination */
    _mm_store_si128(dst_ptr + 3, xmm3); /* Move bytes  48 -  63 to destination */
    _mm_mfence();
    _mm_store_si128(dst_ptr + 4, xmm4); /* Move bytes  64 -  79 to destination */
    _mm_store_si128(dst_ptr + 5, xmm5); /* Move bytes  80 -  95 to destination */
    _mm_store_si128(dst_ptr + 6, xmm6); /* Move bytes  96 - 111 to destination */
    _mm_store_si128(dst_ptr + 7, xmm7); /* Move bytes 112 - 127 to destination */
    /* Flushing */
    _mm_mfence();
}
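
The unaligned fallbacks (MOVE128_SSE2_INSTRINSIC_MFENCE_U / _SFENCE_U) are not
shown here. As a rough sketch only, assuming the fallback simply mirrors the
aligned path but goes through the unaligned load/store intrinsics, the MFENCE
variant would look roughly like this (the name below is just for the sketch;
the real code in the library may differ):

/*
 * Illustration only - not the library's actual fallback. Same fence placement
 * as the aligned version, but _mm_loadu_si128 / _mm_storeu_si128 tolerate any
 * source/destination alignment.
 */
__inline static void
MOVE128_SSE2_UNALIGNED_MFENCE_SKETCH(volatile unsigned *src,
                                     volatile unsigned *dst)
{
    __m128i xmm[8];
    __m128i *src_ptr = (__m128i *)src;
    __m128i *dst_ptr = (__m128i *)dst;
    int i;

    for (i = 0; i < 8; i++)                     /* load all 128 bytes */
        xmm[i] = _mm_loadu_si128(src_ptr + i);
    _mm_lfence();
    for (i = 0; i < 8; i++)                     /* store all 128 bytes */
        _mm_storeu_si128(dst_ptr + i, xmm[i]);
    /* Flushing */
    _mm_mfence();
}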
__inline static void
MOVE128_SSE2_INSTRINSIC_SFENCE(volatile unsigned *src, volatile unsigned *dst)
{
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
    __m128i *src_ptr = (__m128i *)src;
    __m128i *dst_ptr = (__m128i *)dst;

    if (((u_vuaddr_t)src % SSE2_MOVE_ALIGNMENT) ||
        ((u_vuaddr_t)dst % SSE2_MOVE_ALIGNMENT)) {
        MOVE128_SSE2_INSTRINSIC_SFENCE_U(src, dst);
        return;
    }

    /*
     * Prefetching to L1 cache - loads one cache line of data from the
     * address to a location "closer" to the processor (L1 cache).
     */
    _mm_prefetch((char *)src_ptr, _MM_HINT_T0);
    xmm0 = _mm_load_si128(src_ptr);     /* Move bytes   0 -  15 into XMM reg */
    xmm1 = _mm_load_si128(src_ptr + 1); /* Move bytes  16 -  31 into XMM reg */
    xmm2 = _mm_load_si128(src_ptr + 2); /* Move bytes  32 -  47 into XMM reg */
    xmm3 = _mm_load_si128(src_ptr + 3); /* Move bytes  48 -  63 into XMM reg */
    _mm_prefetch((char *)(src_ptr + 4), _MM_HINT_T0);
    xmm4 = _mm_load_si128(src_ptr + 4); /* Move bytes  64 -  79 into XMM reg */
    xmm5 = _mm_load_si128(src_ptr + 5); /* Move bytes  80 -  95 into XMM reg */
    xmm6 = _mm_load_si128(src_ptr + 6); /* Move bytes  96 - 111 into XMM reg */
    xmm7 = _mm_load_si128(src_ptr + 7); /* Move bytes 112 - 127 into XMM reg */
    _mm_lfence();
    _mm_store_si128(dst_ptr,     xmm0); /* Move bytes   0 -  15 to destination */
    _mm_store_si128(dst_ptr + 1, xmm1); /* Move bytes  16 -  31 to destination */
    _mm_store_si128(dst_ptr + 2, xmm2); /* Move bytes  32 -  47 to destination */
    _mm_store_si128(dst_ptr + 3, xmm3); /* Move bytes  48 -  63 to destination */
    _mm_sfence();
    _mm_store_si128(dst_ptr + 4, xmm4); /* Move bytes  64 -  79 to destination */
    _mm_store_si128(dst_ptr + 5, xmm5); /* Move bytes  80 -  95 to destination */
    _mm_store_si128(dst_ptr + 6, xmm6); /* Move bytes  96 - 111 to destination */
    _mm_store_si128(dst_ptr + 7, xmm7); /* Move bytes 112 - 127 to destination */
    /* Flushing */
    _mm_sfence();
}
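
In case anyone wants to run the two variants side by side, here is a minimal
single-threaded sanity check. This is just a sketch: _mm_malloc/_mm_free are
used for the 16-byte alignment (assuming your toolchain provides them), the
test name is mine, and a single-threaded run obviously cannot reproduce the
cross-CPU ordering problem; it only confirms that both variants copy all
128 bytes.

#include <string.h>   /* memcmp, memset */
#include <stdio.h>    /* printf */

static int
test_move128_variants(void)
{
    /* 16-byte alignment is enough for the aligned _mm_load/_mm_store path */
    unsigned *src = (unsigned *)_mm_malloc(128, 16);
    unsigned *dst = (unsigned *)_mm_malloc(128, 16);
    unsigned i;
    int ok;

    for (i = 0; i < 128 / sizeof(unsigned); i++)
        src[i] = i * 0x01010101u;               /* recognizable pattern */

    memset(dst, 0, 128);
    MOVE128_SSE2_INSTRINSIC_MFENCE(src, dst);
    ok = (memcmp((void *)src, (void *)dst, 128) == 0);

    memset(dst, 0, 128);
    MOVE128_SSE2_INSTRINSIC_SFENCE(src, dst);
    ok = ok && (memcmp((void *)src, (void *)dst, 128) == 0);

    printf("single-threaded copy check: %s\n", ok ? "OK" : "MISMATCH");

    _mm_free(src);
    _mm_free(dst);
    return ok;
}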
— Mark Roddy wrote:
> Yes, a load is a read from memory into a processor register, and a store is
> a write from a processor register to memory.
>
> On Nov 23, 2007 2:05 PM, Calin Iaru wrote:
>
> > I am debugging a data corruption issue in an RMA library, and the problem
> > seems to be fixed by using a different memory barrier. I would never
> > expect this to happen, but here's the thing: when we use mfence to flush
> > the transfers, data corruption occurs; when we use sfence, it does not.
> >
> > Here are some excerpts from the Intel VTune Analyzer documentation:
> > "
> > The SFENCE instruction is ordered with respect to store instructions,
> > other SFENCE instructions, any MFENCE instructions, and any serializing
> > instructions (such as the CPUID instruction). It is not ordered with
> > respect to load instructions or the LFENCE instruction.
> > "
> > I cannot really understand what this means, or what a load instruction
> > is. My guess is that load instructions are those that copy from memory to
> > CPU registers. Please confirm.
> >
> > And the documentation mentions mfence too:
> > "
> > Performs a serializing operation on all load and store instructions that
> > were issued prior to the MFENCE instruction. This serializing operation
> > guarantees that every load and store instruction that precedes the MFENCE
> > instruction in program order is globally visible before any load or store
> > instruction that follows the MFENCE instruction is globally visible.
> > "
> > So I would think that mfence is stronger than sfence, because every load
> > and store operation is guaranteed to be globally visible before the
> > barrier.
> >
> > The library uses a customized copy function that assumes the buffers are
> > properly aligned (also a performance requirement) and then transfers from
> > one location to the other using the largest SSE registers available
> > (XMM; if those are not available, it falls back to MMX, and so on).
> >
> > Let's go into the details and find out what could cause this. By the way,
> > the library works well on a 4-CPU AMD cluster, but shows this data
> > failure on a 4-core Intel cluster.
> --
> Mark Roddy