Intel x86: why do aligned and non-aligned memory accesses have the same performance?
Posted: 2021-12-20, 9:03:58
The Intel CPU manual says that "nonaligned data accesses will seriously impact the performance of the processor". I wrote a test to verify this, but the result is that aligned and nonaligned data accesses have the same performance. Why? Could someone help? My code is shown below:
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <chrono>
#include <iostream>
#include <memory>
#include <new>
using namespace std;
/**
 * @brief Returns a monotonic timestamp in nanoseconds.
 *
 * The value is only meaningful as a difference between two calls; it is
 * measured from the steady clock's own (unspecified) epoch. A steady clock is
 * used so that wall-clock adjustments during a measurement cannot distort or
 * even negate the elapsed time.
 *
 * @return Nanoseconds since the steady clock's epoch.
 */
static inline int64_t get_time_ns()
{
    const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
    return std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch).count();
}
/**
 * @brief Times repeated 64-bit read-modify-write passes over a large buffer.
 *
 * Usage: ./test N, where N is the byte offset (0-7) of the first 64-bit word.
 * Offset 0 starts at the beginning of the allocation; offsets 1-7 shift every
 * access by that many bytes.
 *
 * @param argc Number of command-line arguments.
 * @param argv argv[1] holds the offset as a decimal integer in [0, 7].
 * @return 0 on success or when only the usage text is printed; 1 when the
 *         offset is invalid or the buffer cannot be allocated.
 */
int main(int argc, char** argv)
{
    if (argc < 2) {
        std::cout << "Usage:./test [01234567]" << std::endl;
        std::cout << "0 - aligned, 1-7 - nonaligned offset" << std::endl;
        return 0;
    }

    // Parse strictly: reject empty input, trailing garbage and values outside
    // the documented 0-7 range instead of silently benchmarking something else.
    char* parse_end = nullptr;
    const long requested = strtol(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0' || requested < 0 || requested > 7) {
        std::cout << "offset must be an integer between 0 and 7" << std::endl;
        return 1;
    }
    const uint64_t offset = static_cast<uint64_t>(requested);
    std::cout << "offset = " << offset << std::endl;

    const uint64_t BUFFER_SIZE = 800000000;
    const uint64_t WORD_SIZE = sizeof(uint64_t);

    // Plain `new` throws on failure and never yields nullptr; the nothrow form
    // makes the null check below meaningful. unique_ptr releases the buffer on
    // every return path.
    std::unique_ptr<uint8_t[]> buffer(new (std::nothrow) uint8_t[BUFFER_SIZE]);
    if (!buffer) {
        std::cout << "apply for memory failed" << std::endl;
        return 1;
    }
    uint8_t* const data_ptr = buffer.get();
    memset(data_ptr, 0, sizeof(uint8_t) * BUFFER_SIZE);

    const uint64_t LOOP_CNT = 300;
    std::cout << "start" << std::endl;
    // NOTE(review): each pass streams through the whole 800 MB buffer, so the
    // timing is presumably dominated by memory bandwidth rather than by the
    // alignment of individual accesses - confirm with a cache-resident buffer.
    const auto start = get_time_ns();
    for (uint64_t i = 0; i < LOOP_CNT; ++i) {
        // offset = 0 walks the buffer from its first byte; offset = 1-7 shifts
        // every 8-byte access by that many bytes.
        for (uint64_t j = offset; j <= BUFFER_SIZE - WORD_SIZE; j += WORD_SIZE) {
            // memcpy is the well-defined way to load/store a uint64_t at an
            // arbitrary byte address; dereferencing a casted, misaligned
            // pointer is undefined behaviour.
            uint64_t tmp;
            memcpy(&tmp, data_ptr + j, WORD_SIZE);   // read from memory
            ++tmp;
            memcpy(data_ptr + j, &tmp, WORD_SIZE);   // write to memory
        }
    }
    const auto end = get_time_ns();

    // Consume one updated byte so that an optimizing compiler cannot discard
    // the stores above now that the buffer is freed before exit.
    volatile uint8_t sink = data_ptr[offset];
    (void)sink;

    std::cout << "time elapse " << end - start << "ns" << std::endl;
    return 0;
}
RESULT:
offset = 0
start
time elapse 28063563503ns
offset = 1
start
time elapse 28950243291ns
offset = 2
start
time elapse 29374504232ns
offset = 3
start
time elapse 28680777833ns
offset = 4
start
time elapse 28962080206ns
offset = 5
start
time elapse 28364894777ns
offset = 6
start
time elapse 28779795011ns
offset = 7
start
time elapse 29268488768ns
#include <iostream>
#include <stdint.h>
#include <time.h>
#include <chrono>
#include <string.h>
using namespace std;
/**
 * @brief Returns a monotonic timestamp in nanoseconds.
 *
 * The value is only meaningful as a difference between two calls; it is
 * measured from the steady clock's own (unspecified) epoch. A steady clock is
 * used so that wall-clock adjustments during a measurement cannot distort or
 * even negate the elapsed time.
 *
 * @return Nanoseconds since the steady clock's epoch.
 */
static inline int64_t get_time_ns()
{
    const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
    return std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch).count();
}
/**
 * @brief Times repeated 64-bit read-modify-write passes over a large buffer.
 *
 * Usage: ./test N, where N is the byte offset (0-7) of the first 64-bit word.
 * Offset 0 starts at the beginning of the allocation; offsets 1-7 shift every
 * access by that many bytes.
 *
 * @param argc Number of command-line arguments.
 * @param argv argv[1] holds the offset as a decimal integer in [0, 7].
 * @return 0 on success or when only the usage text is printed; 1 when the
 *         offset is invalid or the buffer cannot be allocated.
 */
int main(int argc, char** argv)
{
    if (argc < 2) {
        std::cout << "Usage:./test [01234567]" << std::endl;
        std::cout << "0 - aligned, 1-7 - nonaligned offset" << std::endl;
        return 0;
    }

    // Parse strictly: reject empty input, trailing garbage and values outside
    // the documented 0-7 range instead of silently benchmarking something else.
    char* parse_end = nullptr;
    const long requested = strtol(argv[1], &parse_end, 10);
    if (parse_end == argv[1] || *parse_end != '\0' || requested < 0 || requested > 7) {
        std::cout << "offset must be an integer between 0 and 7" << std::endl;
        return 1;
    }
    const uint64_t offset = static_cast<uint64_t>(requested);
    std::cout << "offset = " << offset << std::endl;

    const uint64_t BUFFER_SIZE = 800000000;
    const uint64_t WORD_SIZE = sizeof(uint64_t);

    // Plain `new` throws on failure and never yields nullptr; the nothrow form
    // makes the null check below meaningful. unique_ptr releases the buffer on
    // every return path.
    std::unique_ptr<uint8_t[]> buffer(new (std::nothrow) uint8_t[BUFFER_SIZE]);
    if (!buffer) {
        std::cout << "apply for memory failed" << std::endl;
        return 1;
    }
    uint8_t* const data_ptr = buffer.get();
    memset(data_ptr, 0, sizeof(uint8_t) * BUFFER_SIZE);

    const uint64_t LOOP_CNT = 300;
    std::cout << "start" << std::endl;
    // NOTE(review): each pass streams through the whole 800 MB buffer, so the
    // timing is presumably dominated by memory bandwidth rather than by the
    // alignment of individual accesses - confirm with a cache-resident buffer.
    const auto start = get_time_ns();
    for (uint64_t i = 0; i < LOOP_CNT; ++i) {
        // offset = 0 walks the buffer from its first byte; offset = 1-7 shifts
        // every 8-byte access by that many bytes.
        for (uint64_t j = offset; j <= BUFFER_SIZE - WORD_SIZE; j += WORD_SIZE) {
            // memcpy is the well-defined way to load/store a uint64_t at an
            // arbitrary byte address; dereferencing a casted, misaligned
            // pointer is undefined behaviour.
            uint64_t tmp;
            memcpy(&tmp, data_ptr + j, WORD_SIZE);   // read from memory
            ++tmp;
            memcpy(data_ptr + j, &tmp, WORD_SIZE);   // write to memory
        }
    }
    const auto end = get_time_ns();

    // Consume one updated byte so that an optimizing compiler cannot discard
    // the stores above now that the buffer is freed before exit.
    volatile uint8_t sink = data_ptr[offset];
    (void)sink;

    std::cout << "time elapse " << end - start << "ns" << std::endl;
    return 0;
}
RESULT:
offset = 0
start
time elapse 28063563503ns
offset = 1
start
time elapse 28950243291ns
offset = 2
start
time elapse 29374504232ns
offset = 3
start
time elapse 28680777833ns
offset = 4
start
time elapse 28962080206ns
offset = 5
start
time elapse 28364894777ns
offset = 6
start
time elapse 28779795011ns
offset = 7
start
time elapse 29268488768ns