Skip to content

Query_optimization


Indexes are the primary tool for query optimization. Understanding when and how to create them is essential.

Index Strategy Overview
========================================================================
┌─────────────────────────────────────────────────────────────────────┐
│ When to Use Indexes │
│ │
│ ✓ YES - WHERE clause columns │
│ ✓ YES - JOIN conditions (foreign keys) │
│ ✓ YES - ORDER BY columns │
│ ✓ YES - Columns in GROUP BY │
│ ✓ NO - Low cardinality (few unique values) │
│ ✓ NO - Frequently updated columns │
│ ✓ NO - Small tables (full scan is faster) │
│ │
└─────────────────────────────────────────────────────────────────────┘
-- Sample table
CREATE TABLE products (
product_id SERIAL PRIMARY KEY,
category VARCHAR(50),
name VARCHAR(200),
price DECIMAL(10,2),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
status VARCHAR(20) DEFAULT 'active'
);
-- Single column index for WHERE
CREATE INDEX idx_products_category ON products(category);
CREATE INDEX idx_products_status ON products(status);
-- Composite index for multi-column queries
-- Order matters! Put high-selectivity columns first
CREATE INDEX idx_products_category_status
ON products(category, status);
-- Partial index - only for active rows
CREATE INDEX idx_products_active
ON products(category, name)
WHERE status = 'active';
-- Covering index - includes all columns needed by query
CREATE INDEX idx_products_covering
ON products(product_id, category, name, price)
INCLUDE (created_at);
-- Expression index
CREATE INDEX idx_products_lower_name
ON products(LOWER(name));
-- Unique index
CREATE UNIQUE INDEX idx_products_sku
ON products(category, sku);

Sometimes the best optimization is rewriting the query itself.

-- Instead of:
SELECT * FROM orders
WHERE status = 'completed'
AND total > 1000;
-- Use covering index and SELECT only needed columns:
CREATE INDEX idx_orders_status_total ON orders(status, total)
INCLUDE (user_id, order_date);
SELECT order_id, user_id, total, order_date
FROM orders
WHERE status = 'completed'
AND total > 1000;
-- Instead of multiple OR conditions:
SELECT * FROM products
WHERE category = 'Electronics'
OR category = 'Clothing'
OR category = 'Books';
-- Use IN:
SELECT * FROM products
WHERE category IN ('Electronics', 'Clothing', 'Books');
-- Instead of NOT IN (can be slow):
SELECT * FROM users
WHERE user_id NOT IN (SELECT user_id FROM orders);
-- Use NOT EXISTS or LEFT JOIN:
SELECT * FROM users u
WHERE NOT EXISTS (
SELECT 1 FROM orders o WHERE o.user_id = u.user_id
);
-- Or use LEFT JOIN with NULL check:
SELECT u.* FROM users u
LEFT JOIN orders o ON u.user_id = o.user_id
WHERE o.order_id IS NULL;

-- Optimizing join order
-- Put smallest/deterministic tables first
EXPLAIN (ANALYZE)
SELECT u.*, o.*
FROM orders o
INNER JOIN users u ON o.user_id = u.user_id
WHERE u.status = 'active';
-- Use appropriate join type
-- For small outer table: Nested Loop
-- For large tables: Hash Join or Merge Join
-- Avoid SELECT * in joins - specify columns
SELECT u.username, o.order_id, o.total
FROM users u
INNER JOIN orders o ON u.user_id = o.user_id;
-- Use STRAIGHT_JOIN hint (if needed) to force join order
SELECT STRAIGHT_JOIN u.username, o.order_id
FROM users u
INNER JOIN orders o ON u.user_id = o.user_id;

Pagination with OFFSET can be slow for large offsets.

-- Slow: Using OFFSET
SELECT * FROM orders
ORDER BY order_id
LIMIT 10 OFFSET 10000;
-- Better: Keyset pagination (seek method)
-- Instead of OFFSET, use WHERE with last seen value
SELECT * FROM orders
WHERE order_id > 10000 -- Last order_id from previous page
ORDER BY order_id
LIMIT 10;
-- For non-sequential pagination, use covering indexes
CREATE INDEX idx_orders_pagination
ON orders(status, order_id DESC);
-- Then paginate:
SELECT * FROM orders
WHERE status = 'completed'
ORDER BY order_id DESC
LIMIT 10 OFFSET 1000;

-- Use materialized views for frequent aggregations
CREATE MATERIALIZED VIEW sales_daily_summary AS
SELECT
DATE_TRUNC('day', created_at) as day,
COUNT(*) as order_count,
SUM(total) as revenue
FROM orders
GROUP BY DATE_TRUNC('day', created_at);
-- Refresh periodically
REFRESH MATERIALIZED VIEW CONCURRENTLY sales_daily_summary;
-- Use approximate aggregates for large datasets
-- Instead of COUNT(DISTINCT):
SELECT approximate_count_distinct(user_id) FROM orders;
-- Use tablesample for approximate results
SELECT * FROM orders TABLESAMPLE BERNOULLI(10);
-- Pre-aggregate in application for real-time dashboards

-- Correlated subquery (executed per row):
SELECT
p.name,
(SELECT SUM(quantity) FROM orders WHERE product_id = p.product_id) as total_sold
FROM products p;
-- Convert to JOIN (usually faster):
SELECT p.name, COALESCE(o.total_sold, 0) as total_sold
FROM products p
LEFT JOIN (
SELECT product_id, SUM(quantity) as total_sold
FROM orders
GROUP BY product_id
) o ON p.product_id = o.product_id;
-- Use CTEs for readability (PostgreSQL optimizes them):
WITH product_sales AS (
SELECT product_id, SUM(quantity) as total_sold
FROM orders
GROUP BY product_id
)
SELECT p.name, COALESCE(ps.total_sold, 0)
FROM products p
LEFT JOIN product_sales ps ON p.product_id = ps.product_id;

-- Important settings for query performance
-- shared_buffers: Memory for caching table data
-- Should be ~25% of RAM
SHOW shared_buffers;
-- SET shared_buffers = '4GB';
-- work_mem: Memory for sorting/hashing per operation
-- Increase for complex queries
SHOW work_mem;
-- SET work_mem = '256MB';
-- maintenance_work_mem: For VACUUM, CREATE INDEX, etc.
SHOW maintenance_work_mem;
-- SET maintenance_work_mem = '1GB';
-- effective_cache_size: Planner's assumption about cache
-- Set to expected cache size
SHOW effective_cache_size;
-- SET effective_cache_size = '12GB';
-- random_page_cost: Cost of non-sequential disk read
-- Lower for SSDs
SHOW random_page_cost;
-- SET random_page_cost = 1.1;
-- Check current settings
SELECT name, setting, unit FROM pg_settings
WHERE name IN ('shared_buffers', 'work_mem', 'maintenance_work_mem',
'effective_cache_size', 'random_page_cost');

Query Optimization Checklist
========================================================================
Before Running:
────────────────
□ Do you SELECT only needed columns?
□ Are WHERE conditions sargable?
□ Is the join order optimal?
□ Can you add appropriate indexes?
After EXPLAIN ANALYZE:
───────────────────────
□ Any sequential scans on large tables?
□ Are row estimates accurate?
□ Is join type appropriate?
□ Are sorts using memory or disk?
□ Any expensive operations that can be avoided?
Advanced Checks:
─────────────────
□ Consider partitioning for large tables
□ Review buffer hit ratio
□ Check for missing foreign key indexes
□ Review statistics freshness

24.9 Practical Example: Step-by-Step Optimization

Section titled “24.9 Practical Example: Step-by-Step Optimization”
-- Step 1: Original slow query
EXPLAIN (ANALYZE, TIMING)
SELECT
u.username,
o.order_id,
o.total,
p.name as product_name
FROM users u
INNER JOIN orders o ON u.user_id = o.user_id
INNER JOIN order_items oi ON o.order_id = oi.order_id
INNER JOIN products p ON oi.product_id = p.product_id
WHERE o.created_at >= '2024-01-01'
AND u.status = 'active'
ORDER BY o.total DESC
LIMIT 100;
-- Issues identified:
-- - Sequential scans
-- - Large join result before LIMIT
-- Step 2: Add indexes
CREATE INDEX idx_orders_user_date ON orders(user_id, created_at);
CREATE INDEX idx_orders_status_date ON orders(status, created_at);
CREATE INDEX idx_order_items_order ON order_items(order_id);
CREATE INDEX idx_products_id_name ON products(product_id, name);
-- Step 3: Rewrite query
EXPLAIN (ANALYZE, TIMING)
SELECT
u.username,
o.order_id,
oi.quantity * oi.unit_price as total,
p.name as product_name
FROM (
SELECT order_id, user_id, quantity * unit_price as total
FROM order_items
) oi
INNER JOIN (
SELECT order_id, user_id, total
FROM orders
WHERE created_at >= '2024-01-01'
AND status = 'completed'
) o ON oi.order_id = o.order_id
INNER JOIN users u ON o.user_id = u.user_id
INNER JOIN products p ON oi.product_id = p.product_id
WHERE u.status = 'active'
ORDER BY oi.quantity * oi.unit_price DESC
LIMIT 100;

TechniqueWhen to Use
Index creationWHERE/JOIN/ORDER BY columns
Query rewritingAvoid subqueries, use IN not OR
PaginationUse keyset, not OFFSET
CTEsReadability, can be optimized
ConfigurationTune memory parameters

Next: Chapter 25: Performance Monitoring