// Copyright 2008 Google Inc. All Rights Reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//      http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "adler32memcpy.h"

// We are using (a modified form of) adler-32 checksum algorithm instead
// of CRC since adler-32 is faster than CRC.
// (Comparison: http://guru.multimedia.cx/crc32-vs-adler32/)
// This form of adler is slightly modified: instead of treating the data in
// units of bytes, 32-bit data is taken as a unit and two 64-bit
// checksums are done (we could have one checksum but two checksums
// make the code run faster).

// Adler-32 implementation:
//   Data is treated as 1-byte numbers and
//   there are two 16-bit numbers a and b.
//   Initialize a with 1 and b with 0.
//   for each data unit 'd'
//      a += d
//      b += a
//   checksum = b<<16 + a
//   This sum should never overflow.
//
// Adler-64+64 implementation:
//   (applied in this code)
//   Data is treated as 32-bit numbers and the data is separated into two
//   streams, hence two checksums: (a1, b1) and (a2, b2).
//   Initialize a1 and a2 with 1, b1 and b2 with 0
//     add first data unit to a1
//     add a1 to b1
//     add second data unit to a1
//     add a1 to b1
//     add third data unit to a2
//     add a2 to b2
//     add fourth data unit to a2
//     add a2 to b2
//     ...
//     repeat the sequence for the next 4 data units
//
//   In the asm implementation, variable A = XMM6 and variable B = XMM7.
//   (a1 = lower 8 bytes of XMM6 and b1 = lower 8 bytes of XMM7)
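//
// Worked example (illustrative, not from the original comments): applying the
// update sequence above to the four 32-bit units d0=1, d1=2, d2=3, d3=4
// (starting from a1 = a2 = 1 and b1 = b2 = 0) gives
//   a1 = 1 + 1 + 2 = 4,   b1 = 2 + 4 = 6     (stream 1: d0, d1)
//   a2 = 1 + 3 + 4 = 8,   b2 = 4 + 8 = 12    (stream 2: d2, d3)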

// Assumptions
// 1. size_in_bytes is a multiple of 16.
// 2. srcmem and dstmem are 16 byte aligned.
// 3. size_in_bytes is less than 2^19 bytes.

// Assumption 3 ensures that there is no overflow when numbers are being
// added (we can remove this assumption by doing modulus with a prime
// number when it is just about to overflow but that would be a very costly
// exercise)
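//
// Usage sketch (illustrative, not from the original file; srcmem, dstmem and
// size_in_bytes are placeholder names for a buffer meeting the assumptions
// above):
//   AdlerChecksum reference, copied;
//   CalculateAdlerChecksum(srcmem, size_in_bytes, &reference);
//   AdlerMemcpyC(dstmem, srcmem, size_in_bytes, &copied);
//   bool data_ok = reference.Equals(copied);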

// Returns true if the checksums are equal.
bool AdlerChecksum::Equals(const AdlerChecksum &other) const {
  return ( (a1_ == other.a1_) && (a2_ == other.a2_) &&
           (b1_ == other.b1_) && (b2_ == other.b2_) );
}

// Returns string representation of the Adler checksum.
string AdlerChecksum::ToHexString() const {
  char buffer[128];
  snprintf(buffer, sizeof(buffer), "%016llx %016llx %016llx %016llx",
           a1_, a2_, b1_, b2_);
  return string(buffer);
}

// Sets components of the Adler checksum.
void AdlerChecksum::Set(uint64 a1, uint64 a2, uint64 b1, uint64 b2) {
  a1_ = a1;
  a2_ = a2;
  b1_ = b1;
  b2_ = b2;
}

// Calculates Adler checksum for supplied data.
bool CalculateAdlerChecksum(uint64 *data64, unsigned int size_in_bytes,
                            AdlerChecksum *checksum) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 19)) {
    // Size is too large, must be strictly less than 512 KB.
    return false;
  }

  uint64 a1 = 1;
  uint64 a2 = 1;
  uint64 b1 = 0;
  uint64 b2 = 0;

  unsigned int i = 0;
  while (i < count) {
    // Process 64 bits at a time.
    data.l64 = data64[i];
    a1 = a1 + data.l32.l;
    b1 = b1 + a1;
    a1 = a1 + data.l32.h;
    b1 = b1 + a1;
    i++;

    data.l64 = data64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    i++;
  }
  checksum->Set(a1, a2, b1, b2);
  return true;
}

// C implementation of Adler memory copy.
bool AdlerMemcpyC(uint64 *dstmem64, uint64 *srcmem64,
                  unsigned int size_in_bytes, AdlerChecksum *checksum) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 19)) {
    // Size is too large, must be strictly less than 512 KB.
    return false;
  }

  uint64 a1 = 1;
  uint64 a2 = 1;
  uint64 b1 = 0;
  uint64 b2 = 0;

  unsigned int i = 0;
  while (i < count) {
    // Process 64 bits at a time.
    data.l64 = srcmem64[i];
    a1 = a1 + data.l32.l;
    b1 = b1 + a1;
    a1 = a1 + data.l32.h;
    b1 = b1 + a1;
    dstmem64[i] = data.l64;
    i++;

    data.l64 = srcmem64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    dstmem64[i] = data.l64;
    i++;
  }
  checksum->Set(a1, a2, b1, b2);
  return true;
}

// C implementation of Adler memory copy with some floating point ops,
// attempting to warm up the CPU.
bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64,
                      unsigned int size_in_bytes, AdlerChecksum *checksum) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 19)) {
    // Size is too large, must be strictly less than 512 KB.
    return false;
  }

  uint64 a1 = 1;
  uint64 a2 = 1;
  uint64 b1 = 0;
  uint64 b2 = 0;

  double a = 2.0 * static_cast<double>(srcmem64[0]);
  double b = 5.0 * static_cast<double>(srcmem64[0]);
  double c = 7.0 * static_cast<double>(srcmem64[0]);
  double d = 9.0 * static_cast<double>(srcmem64[0]);

  unsigned int i = 0;
  while (i < count) {
    // Process 64 bits at a time.
    data.l64 = srcmem64[i];
    a1 = a1 + data.l32.l;
    b1 = b1 + a1;
    a1 = a1 + data.l32.h;
    b1 = b1 + a1;
    dstmem64[i] = data.l64;
    i++;

    // Warm cpu up.
    a = a * b;
    b = b + c;

    data.l64 = srcmem64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    dstmem64[i] = data.l64;
    i++;

    // Warm cpu up.
    c = c * d;
    d = d + d;
  }

  // Warm cpu up.
  d = a + b + c + d;
  if (d == 1.0) {
    // Reference the result so that it can't be discarded by the compiler.
    printf("Log: This will probably never happen.\n");
  }

  checksum->Set(a1, a2, b1, b2);
  return true;
}

// x86 SSE2 and ARM NEON assembly implementation of fast and stressful
// Adler memory copy.
bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
                    unsigned int size_in_bytes, AdlerChecksum *checksum) {
  // Use assembly implementation where supported.
#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)

// Pull a bit of tricky preprocessing to make the inline asm both
// 32 bit and 64 bit.
#ifdef STRESSAPPTEST_CPU_I686  // Instead of coding both, x86...
#define rAX "%%eax"
#define rCX "%%ecx"
#define rDX "%%edx"
#define rBX "%%ebx"
#define rSP "%%esp"
#define rBP "%%ebp"
#define rSI "%%esi"
#define rDI "%%edi"
#endif

#ifdef STRESSAPPTEST_CPU_X86_64  // ...and x64, we use rXX macros.
#define rAX "%%rax"
#define rCX "%%rcx"
#define rDX "%%rdx"
#define rBX "%%rbx"
#define rSP "%%rsp"
#define rBP "%%rbp"
#define rSI "%%rsi"
#define rDI "%%rdi"
#endif

  // Elements 0 to 3 are used for holding checksum terms a1, a2,
  // b1, b2 respectively. These elements are filled by asm code.
  // Elements 4 and 5 are used by asm code for ANDing XMM data, removing
  // 2 words from each XMM register (an XMM reg has 4 words; by ANDing we are
  // setting word index 1 and word index 3 to zero).
  // Elements 6 and 7 are used for setting a1 and a2 to 1.
  volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
      {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1};
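  // Descriptive note (added for clarity): viewed by byte offset, as the asm
  // below indexes checksum_arr through rAX, offset 0 holds elements 0-1
  // (a1, a2, written back from XMM6), offset 16 holds elements 2-3 (b1, b2,
  // from XMM7), offset 32 holds elements 4-5 (the 0x00000000ffffffff AND
  // mask), and offset 48 holds elements 6-7 (the {1, 1} seed for XMM6).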

  if ((size_in_bytes >> 19) > 0) {
    // Size is too large. Must be less than 2^19 bytes = 512 KB.
    return false;
  }

  // Number of 32-bit words which are not added to a1/a2 in the main loop.
  uint32 remaining_words = (size_in_bytes % 48) / 4;

  // Since we are moving 48 bytes at a time, the number of iterations
  // (the value of the loop counter) is total size / 48.
  uint32 num_of_48_byte_units = size_in_bytes / 48;
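  // Worked example (illustrative): for size_in_bytes = 160 (a multiple of 16),
  // num_of_48_byte_units = 160 / 48 = 3, which covers 144 bytes, and
  // remaining_words = (160 % 48) / 4 = 4, i.e. one 16-byte chunk is left for
  // the remainder loop after the main 48-byte loop below.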

  asm volatile (
      // Source address is in ESI (extended source index),
      // destination is in EDI (extended destination index),
      // and the counter is already in ECX (extended counter
      // index).
      "cmp $0, " rCX ";"  // Compare counter to zero.
      "jz END;"

      // XMM6 is initialized with (1, 1) (elements 6 and 7 of checksum_arr)
      // and XMM7 with 0.
      "prefetchnta 0(" rSI ");"
      "prefetchnta 64(" rSI ");"
      "movdqu 48(" rAX "), %%xmm6;"
      "xorps %%xmm7, %%xmm7;"

      // Start of the loop which copies 48 bytes from source to dst each time.
      "TOP:\n"

      // Make 6 moves each of 16 bytes from srcmem to XMM registers.
      // We are using 2 words out of 4 words in each XMM register,
      // word index 0 and word index 2.
      "movdqa 0(" rSI "), %%xmm0;"
      "movdqu 4(" rSI "), %%xmm1;"  // Be careful to use unaligned move here.
      "movdqa 16(" rSI "), %%xmm2;"
      "movdqu 20(" rSI "), %%xmm3;"
      "movdqa 32(" rSI "), %%xmm4;"
      "movdqu 36(" rSI "), %%xmm5;"

      // Move 3 * 16 bytes from XMM registers to dstmem.
      // Note: this copy must be performed before the AND operations below
      // since they modify the XMM registers.
      "movntdq %%xmm0, 0(" rDI ");"
      "movntdq %%xmm2, 16(" rDI ");"
      "movntdq %%xmm4, 32(" rDI ");"

      // Set word[1] and word[3] of XMM0 through XMM5 to zero.
      "andps 32(" rAX "), %%xmm0;"
      "andps 32(" rAX "), %%xmm1;"
      "andps 32(" rAX "), %%xmm2;"
      "andps 32(" rAX "), %%xmm3;"
      "andps 32(" rAX "), %%xmm4;"
      "andps 32(" rAX "), %%xmm5;"

      // Add XMM0 to XMM6 and then add XMM6 to XMM7.
      // Repeat this for XMM1, ..., XMM5.
      // Overflow (for XMM7) can occur only if there are more
      // than 2^16 additions => more than 2^17 words => more than 2^19 bytes,
      // so if size_in_bytes > 2^19 then overflow occurs.
      "paddq %%xmm0, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"
      "paddq %%xmm1, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"
      "paddq %%xmm2, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"
      "paddq %%xmm3, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"
      "paddq %%xmm4, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"
      "paddq %%xmm5, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"

      // Increment ESI and EDI by 48 bytes and decrement counter by 1.
      "add $48, " rSI ";"
      "add $48, " rDI ";"
      "prefetchnta 0(" rSI ");"
      "prefetchnta 64(" rSI ");"
      "dec " rCX ";"
      "jnz TOP;"

      // Now only remaining_words 32-bit words are left.
      // Make a loop that adds the first two words to a1 and the next two to
      // a2 (just like the loop above; the only extra thing we are doing is
      // rechecking rDX (= remaining_words) every time around the loop).
      "REM_IS_STILL_NOT_ZERO:\n"
      // Unless remaining_words becomes less than 4 words (16 bytes)
      // there is not much of an issue, and remaining_words will always
      // be a multiple of four by assumption.
      "cmp $4, " rDX ";"
      // If for some weird reason remaining_words becomes less than 4
      // but not zero, also break out of the loop and go to END.
      "jl END;"
      // Otherwise just go on and copy data in chunks of 4 words at a time
      // till the whole remainder (< 48 bytes) is copied.
      "movdqa 0(" rSI "), %%xmm0;"    // Copy next 4 words to XMM0 and to XMM1.

      "movdqa 0(" rSI "), %%xmm5;"    // Accomplish movdqu 4(%rSI) without
      "pshufd $0x39, %%xmm5, %%xmm1;" // indexing off memory boundary.

      "movntdq %%xmm0, 0(" rDI ");"   // Copy 4 words to destination.
      "andps 32(" rAX "), %%xmm0;"
      "andps 32(" rAX "), %%xmm1;"
      "paddq %%xmm0, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"
      "paddq %%xmm1, %%xmm6;"
      "paddq %%xmm6, %%xmm7;"
      "add $16, " rSI ";"
      "add $16, " rDI ";"
      "sub $4, " rDX ";"
      // Decrement %rDX by 4 since %rDX is the number of 32-bit
      // words left after considering all 48-byte units.
      "jmp REM_IS_STILL_NOT_ZERO;"

      "END:\n"
      // Report checksum values A and B (each is currently two concatenated
      // 64-bit numbers; they are stored out and split into a1/a2 and b1/b2).
      // The result looks like an Adler-128 (since the size of each part is
      // 4 bytes rather than 1 byte).
      "movdqa %%xmm6, 0(" rAX ");"
      "movdqa %%xmm7, 16(" rAX ");"
      "sfence;"

      // No output registers.
      :
      // Input registers.
      : "S" (srcmem64), "D" (dstmem64), "a" (checksum_arr),
        "c" (num_of_48_byte_units), "d" (remaining_words)
  );  // asm.

  if (checksum != NULL) {
    checksum->Set(checksum_arr[0], checksum_arr[1],
                  checksum_arr[2], checksum_arr[3]);
  }

  // Everything went fine, so return true (this does not mean that there is
  // no problem with memory; it just means that the data was copied from src
  // to dst and the checksum was calculated successfully).
  return true;
#elif defined(STRESSAPPTEST_CPU_ARMV7A) && defined(__ARM_NEON__)
  // Elements 0 to 3 are used for holding checksum terms a1, a2,
  // b1, b2 respectively. These elements are filled by asm code.
  // Checksum is seeded with the null checksum.
  volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
      {1, 1, 0, 0};

  if ((size_in_bytes >> 19) > 0) {
    // Size is too large. Must be less than 2^19 bytes = 512 KB.
    return false;
  }

  // Since we are moving 64 bytes at a time, number of iterations = total size / 64.
  uint32 blocks = size_in_bytes / 64;

  uint64 *dst = dstmem64;
  uint64 *src = srcmem64;

#define src_r "r3"
#define dst_r "r4"
#define blocks_r "r5"
#define crc_r "r6"

  asm volatile (
      "mov " src_r ", %[src]; \n"
      "mov " dst_r ", %[dst]; \n"
      "mov " crc_r ", %[crc]; \n"
      "mov " blocks_r ", %[blocks]; \n"

      // Loop over block count.
      "cmp " blocks_r ", #0; \n"  // Compare counter to zero.
      "ble END; \n"


      // Preload upcoming cacheline.
      "pld [" src_r ", #0x0]; \n"
      "pld [" src_r ", #0x20]; \n"

      // Init checksum
      "vldm " crc_r ", {q0}; \n"
      "vmov.i32 q1, #0; \n"

      // Start of the loop which copies 64 bytes from source to dst each time.
      "TOP: \n"

      // Make one move of 64 bytes (four 16-byte registers) from srcmem to
      // the qX registers. We are using 2 words out of 4 words in each qX
      // register, word index 0 and word index 2. We'll swizzle them in a bit.
      // Copy it.
      "vldm " src_r "!, {q8, q9, q10, q11}; \n"
      "vstm " dst_r "!, {q8, q9, q10, q11}; \n"

      // Arrange it.
      "vmov.i64 q12, #0; \n"
      "vmov.i64 q13, #0; \n"
      "vmov.i64 q14, #0; \n"
      "vmov.i64 q15, #0; \n"
      // This exchanges words 1,3 in the filled registers with
      // words 0,2 in the empty registers.
      "vtrn.32 q8, q12; \n"
      "vtrn.32 q9, q13; \n"
      "vtrn.32 q10, q14; \n"
      "vtrn.32 q11, q15; \n"

      // Sum into q0, then into q1.
      // Repeat this for q8 through q15.
      // Overflow can occur only if there are more
      // than 2^16 additions => more than 2^17 words => more than 2^19 bytes,
      // so if size_in_bytes > 2^19 then overflow occurs.
      "vadd.i64 q0, q0, q8; \n"
      "vadd.i64 q1, q1, q0; \n"
      "vadd.i64 q0, q0, q12; \n"
      "vadd.i64 q1, q1, q0; \n"
      "vadd.i64 q0, q0, q9; \n"
      "vadd.i64 q1, q1, q0; \n"
      "vadd.i64 q0, q0, q13; \n"
      "vadd.i64 q1, q1, q0; \n"

      "vadd.i64 q0, q0, q10; \n"
      "vadd.i64 q1, q1, q0; \n"
      "vadd.i64 q0, q0, q14; \n"
      "vadd.i64 q1, q1, q0; \n"
      "vadd.i64 q0, q0, q11; \n"
      "vadd.i64 q1, q1, q0; \n"
      "vadd.i64 q0, q0, q15; \n"
      "vadd.i64 q1, q1, q0; \n"

      // Decrement counter and loop.
      "sub " blocks_r ", " blocks_r ", #1; \n"
      "cmp " blocks_r ", #0; \n"  // Compare counter to zero.
      "bgt TOP; \n"


      "END:\n"
      // Report checksum values A and B (each is currently two concatenated
      // 64-bit numbers; they are stored out and split into a1/a2 and b1/b2).
      // The result looks like an Adler-128 (since the size of each part is
      // 4 bytes rather than 1 byte).
      "vstm " crc_r ", {q0, q1}; \n"

      // No output registers.
      :
      // Input registers.
      : [src] "r"(src), [dst] "r"(dst), [blocks] "r"(blocks),
        [crc] "r"(checksum_arr)
      : "memory", "cc", "r3", "r4", "r5", "r6",
        "q0", "q1", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );  // asm.

  if (checksum != NULL) {
    checksum->Set(checksum_arr[0], checksum_arr[1],
                  checksum_arr[2], checksum_arr[3]);
  }

  // Everything went fine, so return true (this does not mean that there is
  // no problem with memory; it just means that the data was copied from src
  // to dst and the checksum was calculated successfully).
  return true;
#else
#warning "No vector copy defined for this architecture."
  // Fall back to C implementation for anything else.
  return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
#endif
}