Skip to content

Optimization Techniques

Advanced techniques for writing high-performance C++ code. These optimizations should only be applied after profiling identifies bottlenecks.

  1. Profile First: Never optimize without measurement
  2. Measure Again: Verify improvements
  3. Readability Matters: Don’t sacrifice maintainability
  4. Trade-offs: Understand what you’re trading

Loop unrolling reduces loop overhead by performing multiple operations per iteration:

// Naive version: one call per element; the optimizer is free to
// unroll this on its own.
void process_all(const std::vector<int>& data) {
    for (const int& element : data) {
        process(element);
    }
}
// Partial unrolling (4x): fewer loop-control branches per element.
void process_all_unrolled(const std::vector<int>& data) {
    const size_t n = data.size();
    size_t i = 0;
    // Guard with `i + 4 <= n` instead of precomputing `n - 3`:
    // the original's `data.size() - 3` wraps around (size_t is
    // unsigned) when size() < 3, producing a huge limit and reading
    // past the end of the vector.
    for (; i + 4 <= n; i += 4) {
        process(data[i]);
        process(data[i + 1]);
        process(data[i + 2]);
        process(data[i + 3]);
    }
    // Handle remaining elements (n % 4 stragglers)
    for (; i < n; i++) {
        process(data[i]);
    }
}

Loop-invariant code motion: move computations whose value doesn’t change between iterations out of the loop:

// Loop-invariant code motion: hoist work whose value does not depend
// on the loop variable out of the loop.
// NOTE(review): hoisting assumes expensive_function() is pure (no
// side effects that must occur once per iteration) — confirm before
// applying this transformation to real code.
// Before: expensive computation in loop
for (int i = 0; i < n; i++) {
result = expensive_function() * i; // Bad!
}
// After: compute once
auto value = expensive_function();
for (int i = 0; i < n; i++) {
result = value * i; // Good!
}

Combine consecutive loops over the same range:

// Loop fusion: one pass over the data instead of two, so a[i] is
// still hot in cache (or even in a register) when c[i] is computed.
// Fusion is legal here because the second loop's read of a[i] only
// depends on the value the first loop wrote at the same index.
// Before: two separate loops
for (int i = 0; i < n; i++) {
a[i] = b[i] * 2;
}
for (int i = 0; i < n; i++) {
c[i] = a[i] + 1;
}
// After: single loop
for (int i = 0; i < n; i++) {
a[i] = b[i] * 2;
c[i] = a[i] + 1;
}

Change loop nesting for better cache locality:

// Loop interchange: C arrays are stored row-major, so the inner index
// should vary fastest over the last subscript.
// NOTE(review): int matrix[1000][1000] is ~4 MB — too large for the
// stack in real code; shown here only for illustration.
// Bad: column-major access pattern
const int N = 1000;
int matrix[N][N];
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
process(matrix[j][i]); // Bad cache behavior: each step jumps N ints
}
}
// Good: row-major access pattern
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
process(matrix[i][j]); // Good cache behavior: sequential access
}
}
// Pre-allocate to avoid reallocations
std::vector<int> results;
results.reserve(input.size()); // Important! one allocation instead of O(log n) regrows
for (const auto& item : input) {
results.push_back(process(item));
}

Help the compiler optimize branch prediction:

#include <algorithm>
// Branch hints (GCC/Clang): __builtin_expect(expr, expected_value)
// tells the compiler which outcome to lay out as the fall-through
// path. Use only with profile evidence — wrong hints hurt.
if (__builtin_expect(error_occurred, 0)) { // Unlikely
handle_error();
}
if (__builtin_expect(success, 1)) { // Likely
continue_processing();
}
// C++20: [[likely]] and [[unlikely]] attributes — portable,
// standard spelling of the same hint, placed on the statement.
if (condition) [[likely]] {
// Expected path
}
if (error) [[unlikely]] {
// Rare case
}

Avoid branches when possible:

// Straightforward conditional version — compiles to a branch or a
// cmov at the compiler's discretion.
int max(int a, int b) {
    return (a > b) ? a : b;
}
// Branchless (may be faster on some architectures).
// The original computed `a - b` in int (UB on signed overflow, e.g.
// max_branchless(INT_MAX, -1)) and right-shifted a negative int
// (implementation-defined before C++20). Widening to 64-bit and
// deriving the mask from a comparison avoids both problems.
int max_branchless(int a, int b) {
    const std::int64_t diff = static_cast<std::int64_t>(a) - static_cast<std::int64_t>(b);
    // mask is all ones when diff < 0, all zeros otherwise.
    const std::int64_t mask = -static_cast<std::int64_t>(diff < 0);
    // a - (a - b) == b when a < b; a - 0 == a otherwise.
    return static_cast<int>(a - (diff & mask));
}
// Delegate to the standard library — std::max is constexpr, and the
// optimizer typically emits the same code as a hand-written version.
int max_std(int a, int b) {
    return std::max(a, b);
}
// Compiler eliminates copy/move (see the RVO examples below this
// struct in the article).
// Holds a deliberately large buffer so that eliding a copy matters.
struct Data {
    std::vector<int> large_data;
    Data() {
        large_data.assign(1000000, 0); // one million zero-initialized ints
    }
};
// No copy is made! Returning a prvalue is plain RVO — guaranteed
// copy elision since C++17. (NRVO, *named* RVO, is the separate
// optimization for returning a named local; the original comment
// mislabeled this case.)
Data create_data() {
return Data(); // RVO: constructed directly in the caller's storage
}
// NRVO also applies to a named local returned by value — no copy,
// and at worst a move if the compiler declines to elide.
std::vector<int> create_vector() {
    std::vector<int> result = {1, 2, 3};
    return result; // named return value optimization
}
// Append b to a and return the result by value.
// RVO does not apply to function parameters, but since C++11 a
// returned by-value parameter is implicitly treated as an rvalue,
// so the explicit std::move the original wrote was redundant
// (clang flags it with -Wredundant-move).
std::vector<int> combine(std::vector<int> a, const std::vector<int>& b) {
    a.insert(a.end(), b.begin(), b.end());
    return a; // implicit move of the parameter
}

Structure of Arrays (SoA) vs Array of Structures (AoS)

Section titled “Structure of Arrays (SoA) vs Array of Structures (AoS)”
// Array of Structures (AoS) - cache inefficient for processing one field:
// touching only x still drags the other 8 doubles (64 of 72 bytes
// per particle) through the cache.
struct Particle {
double x, y, z; // position
double vx, vy, vz; // velocity
double r, g, b; // color
};
std::vector<Particle> particles(1000000);
// Structure of Arrays (SoA) - cache efficient: each field is its own
// contiguous array, so a pass over one field streams only that data.
// NOTE(review): reusing the name `particles` below would be a
// redefinition in a single scope — the two declarations are
// alternative illustrations, not meant to coexist.
struct Particles {
std::vector<double> x, y, z;
std::vector<double> vx, vy, vz;
std::vector<double> r, g, b;
};
Particles particles;
particles.x.resize(1000000);
// Process just x coordinates - only x is in cache!
// Ensure alignment for SIMD.
// 16-byte alignment matches SSE (128-bit) loads; AVX (256-bit) wants
// alignas(32) and AVX-512 wants alignas(64).
struct alignas(16) AlignedData {
double values[4]; // 32 bytes of payload, start aligned to 16
};
// Using aligned_alloc or posix_memalign.
// posix_memalign: alignment must be a power of two and a multiple of
// sizeof(void*); the int return value signals failure and should be
// checked in real code (it is ignored here for brevity).
void* aligned_ptr;
posix_memalign(&aligned_ptr, 16, size);
free(aligned_ptr);
// C++17 std::aligned_alloc.
// CAUTION: size must be an integral multiple of the alignment, or
// the behavior is undefined (many implementations return nullptr).
// Release with std::free, never delete.
auto* ptr = std::aligned_alloc(16, size);
std::free(ptr);
// False sharing - avoid this! The two counters are adjacent, so they
// almost certainly share one cache line; two threads updating
// counter1 and counter2 independently still ping-pong that line
// between cores.
struct SharedData {
std::atomic<int> counter1;
std::atomic<int> counter2;
};
// Better: give each counter its own cache line.
// The original padded *before* the counter, which fixes the size but
// not the placement: without an alignment requirement the struct may
// still start mid-line and straddle two cache lines. alignas(64)
// guarantees each instance (and each array element, since sizeof is
// rounded up to 64) begins on its own cache-line boundary.
// NOTE(review): 64 is the common x86 cache-line size; C++17 offers
// std::hardware_destructive_interference_size where supported.
struct alignas(64) PaddedCounter {
    std::atomic<int> counter;
};
// Suggest inlining. The keyword primarily affects linkage (allows
// the definition in multiple TUs); the optimizer decides on its own
// for a function this trivial.
inline int small_function(int x) {
    return x + x; // same as x * 2
}
// Always inline (compiler will try hard). GCC/Clang-specific;
// GCC's documentation pairs always_inline with the `inline` keyword.
// NOTE(review): a second definition of critical() appears just below
// — the two spellings are alternatives and would be a redefinition
// error if both were kept in one translation unit.
__attribute__((always_inline)) int critical() {
return 42;
}
// Same hint using standard [[ ]] attribute syntax (available since
// C++11). gnu:: is a vendor namespace understood by GCC/Clang — this
// is not a C++20 standard attribute, despite the original label.
[[gnu::always_inline]] int critical() {
return 42;
}
// Never inline — keeps large, cold code out of callers' I-cache.
// Fixed from the original: `don't_inline` is not a valid identifier
// (apostrophes are not allowed), and an int function must return a
// value.
__attribute__((noinline)) int dont_inline() {
    // Large function body elided in this example.
    return 0;
}
// Put small, frequently called functions in the header so the
// compiler can see — and inline — their bodies at every call site.
class HotPath {
public:
    // Member functions defined inside the class are implicitly
    // inline; the explicit keyword the original used was redundant.
    int getValue() const { return value_; }
    void setValue(int v) { value_ = v; }
private:
    int value_ = 0; // initialized: the original left it indeterminate
};
#include <immintrin.h>
// AVX addition of 8 floats per iteration (_mm256_add_ps requires
// only AVX, not AVX2; compile with -mavx or equivalent).
// result[i] = a[i] + b[i] for i in [0, n). a, b, result must not
// overlap in a way the unaligned loads/stores can't tolerate.
void add_arrays_avx(const float* a, const float* b, float* result, size_t n) {
    size_t i = 0;
    // Vector body: stop while a full 8-lane load still fits. The
    // original iterated `i < n` in steps of 8 and read/wrote past the
    // end of the arrays whenever n was not a multiple of 8.
    for (; i + 8 <= n; i += 8) {
        __m256 va = _mm256_loadu_ps(&a[i]);
        __m256 vb = _mm256_loadu_ps(&b[i]);
        __m256 vr = _mm256_add_ps(va, vb);
        _mm256_storeu_ps(&result[i], vr);
    }
    // Scalar tail for the remaining n % 8 elements.
    for (; i < n; ++i) {
        result[i] = a[i] + b[i];
    }
}
// SSE addition of 4 floats per iteration (SSE is baseline on x86-64).
// result[i] = a[i] + b[i] for i in [0, n).
void add_arrays_sse(const float* a, const float* b, float* result, size_t n) {
    size_t i = 0;
    // Vector body: stop while a full 4-lane load still fits. The
    // original stepped by 4 while checking `i < n`, reading and
    // writing out of bounds whenever n was not a multiple of 4.
    for (; i + 4 <= n; i += 4) {
        __m128 va = _mm_loadu_ps(&a[i]);
        __m128 vb = _mm_loadu_ps(&b[i]);
        __m128 vr = _mm_add_ps(va, vb);
        _mm_storeu_ps(&result[i], vr);
    }
    // Scalar tail for the remaining n % 4 elements.
    for (; i < n; ++i) {
        result[i] = a[i] + b[i];
    }
}
// Compiler can auto-vectorize this: a simple element-wise update
// with no loop-carried dependency.
void vectorize_me(std::vector<float>& data) {
    for (std::size_t idx = 0; idx < data.size(); ++idx) {
        data[idx] = data[idx] * 2.0f + 1.0f;
    }
}
// Make it easier for the compiler: std::transform states explicitly
// that this is an independent element-wise map.
void clearer_intent(std::vector<float>& data) {
    const auto scale_and_shift = [](float x) { return x * 2.0f + 1.0f; };
    std::transform(data.begin(), data.end(), data.begin(), scale_and_shift);
}
// Computed at compile time when invoked in a constant expression
// (C++14 relaxed constexpr permits loops, so no recursion needed).
constexpr int factorial(int n) {
    int product = 1;
    for (int i = 2; i <= n; ++i) {
        product *= i;
    }
    return product;
}
constexpr auto result = factorial(10); // Computed at compile time!
// C++17: constexpr lambda
constexpr auto square = [](int x) { return x * x; };
// C++20: constexpr virtual and dynamic allocation
// NOTE(review): the line below is ill-formed even in C++20 — a
// constexpr std::vector *variable* requires the allocation to
// persist to runtime, which is not allowed. constexpr vectors may
// only live entirely inside a constant evaluation (e.g. within a
// constexpr function whose result is a non-allocating type).
constexpr auto vec = std::vector<int>{1, 2, 3}; // C++20
// Must be evaluated at compile time: consteval (C++20) declares an
// immediate function — every call must be a constant expression.
// NOTE(review): this redeclares the name `square` already bound to
// the constexpr lambda above; the two snippets cannot share a scope.
consteval int square(int x) {
return x * x;
}
// This works
static_assert(square(5) == 25);
// This fails (runtime context)
// int y = 10;
// auto result = square(y); // Error! argument is not a constant expression
// Compile-time computation with arrays: constexpr bubble sort of a
// std::array (no heap, plain swaps — valid in C++17 constant
// evaluation). Returns the sorted copy.
template<typename T, std::size_t N>
constexpr std::array<T, N> sort_array(std::array<T, N> arr) {
    // Guard small N: the original evaluated `N - 1` on std::size_t,
    // which wraps to a huge value for N == 0 and drove both loops out
    // of bounds. if constexpr discards the loops entirely for N < 2.
    if constexpr (N >= 2) {
        for (std::size_t i = 0; i < N - 1; i++) {
            for (std::size_t j = 0; j < N - i - 1; j++) {
                if (arr[j] > arr[j + 1]) {
                    auto temp = arr[j];
                    arr[j] = arr[j + 1];
                    arr[j + 1] = temp;
                }
            }
        }
    }
    return arr;
}
constexpr auto sorted = sort_array(std::array{5, 2, 8, 1, 9});
static_assert(sorted[0] == 1);
// Lazy computation - only compute when needed, then cache the result
// for every later call.
// NOTE(review): not thread-safe — concurrent first calls to get()
// race on cached_; add a std::once_flag or mutex if shared across
// threads.
template<typename T>
class Lazy {
    std::function<T()> compute_; // deferred computation
    mutable std::optional<T> cached_; // mutable: get() is const but fills the cache
public:
    // Take the callable by value and move it into place — the
    // original copied `fn` a second time in the init-list.
    explicit Lazy(std::function<T()> fn) : compute_(std::move(fn)) {}
    // Returns the cached value, computing it on first use.
    const T& get() const {
        if (!cached_) {
            cached_ = compute_(); // first call: run and cache
        }
        return *cached_;
    }
};
// Usage (illustrative top-level snippet; requires <iostream>):
// construction stores the lambda but runs nothing.
Lazy<int> expensive([]() {
std::cout << "Computing...\n";
return 42;
});
std::cout << "Created\n"; // No computation yet
std::cout << expensive.get() << "\n"; // Computes now (prints "Computing...")
std::cout << expensive.get() << "\n"; // Uses cached value, no second print
// std::string often stores short strings without heap allocation
// (the small-string optimization).
std::string s = "Hello"; // No heap allocation on most implementations
// But concatenation can trigger allocation
std::string long_string;
for (int i = 0; i < 1000; i++) {
long_string += "x"; // repeated growth: O(n^2) only if capacity grows linearly — real implementations grow geometrically (amortized O(n)), but still reallocate and copy several times
}
// Better: reserve or use stringstream
std::string better;
better.reserve(1000);
for (int i = 0; i < 1000; i++) {
better += "x"; // O(n): capacity is sufficient, no reallocation at all
}
Profile-guided optimization (PGO) workflow with Clang:
# 1. Instrumented build
clang++ -fprofile-instr-generate -O2 -o program program.cpp
# 2. Run with representative data (writes default.profraw)
./program < typical_input.txt
# 3. Merge the raw profile into an indexed profile
llvm-profdata merge -output=default.profdata default.profraw
# 4. Rebuild using the profile data
clang++ -fprofile-instr-use=default.profdata -O2 -o optimized program.cpp
Flag            | Effect
--------------- | ---------------------------------------
-O0             | No optimization (debug)
-O1             | Basic optimization
-O2             | Standard optimization
-O3             | Aggressive optimization
-Os             | Size optimization
-Ofast          | Fastest (may relax floating-point rules)
-march=native   | Tune for the build machine's CPU
-flto           | Link-time optimization
  1. Profile to identify bottleneck
  2. Measure baseline performance
  3. Apply single optimization
  4. Measure again
  5. Keep change if improvement
  6. Repeat with next optimization
  • Always profile before optimizing
  • Focus on hot spots identified by profiler
  • Prefer algorithmic improvements over micro-optimizations
  • Cache locality often matters more than instruction count
  • Use compiler hints sparingly - trust the compiler
  • Test with realistic data after optimization