Optimization
Query Optimization
Writing efficient queries - avoiding common pitfalls and optimization techniques
Query Optimization
Writing efficient queries is fundamental to database performance. Poor queries can make even well-indexed databases slow.
SELECT Optimization
Select Only What You Need
-- Bad: Selecting all columns
SELECT * FROM users WHERE id = 1;
-- Good: Select only needed columns
SELECT id, name, email FROM users WHERE id = 1;
-- Benefits:
-- ✓ Less data transferred
-- ✓ Can use covering indexes
-- ✓ Reduces memory usageAvoid Functions on Indexed Columns
-- Bad: Function prevents index usage
SELECT * FROM orders WHERE YEAR(created_at) = 2024;
-- Good: Use range condition
SELECT * FROM orders
WHERE created_at >= '2024-01-01'
AND created_at < '2025-01-01';
-- Bad: Function on column
SELECT * FROM users WHERE LOWER(email) = 'user@example.com';
-- Good: Store lowercase or use expression index
CREATE INDEX idx_users_email_lower ON users (LOWER(email));
-- Or store email in lowercaseUse LIMIT for Large Results
-- Bad: Fetching all rows
SELECT * FROM logs WHERE level = 'ERROR';
-- Good: Use pagination
SELECT * FROM logs
WHERE level = 'ERROR'
ORDER BY created_at DESC
LIMIT 100 OFFSET 0;
-- Better: Keyset pagination for large offsets
SELECT * FROM logs
WHERE level = 'ERROR'
AND created_at < '2024-06-01 10:00:00'
ORDER BY created_at DESC
LIMIT 100;JOIN Optimization
Choose the Right JOIN Type
-- INNER JOIN: Only matching rows
SELECT o.id, u.name
FROM orders o
INNER JOIN users u ON o.user_id = u.id;
-- LEFT JOIN: All from left, matching from right
SELECT u.name, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.id;
-- Use EXISTS instead of IN for large subqueries
-- Bad
SELECT * FROM users
WHERE id IN (SELECT user_id FROM orders WHERE total > 1000);
-- Good
SELECT * FROM users u
WHERE EXISTS (
SELECT 1 FROM orders o
WHERE o.user_id = u.id AND o.total > 1000
);Join Order Matters
-- Database optimizers usually handle this, but hints can help
-- PostgreSQL: Join order hint
SET join_collapse_limit = 1; -- Preserve written order
-- MySQL: STRAIGHT_JOIN forces left-to-right execution
SELECT STRAIGHT_JOIN o.*, p.*
FROM orders o
STRAIGHT_JOIN products p ON o.product_id = p.id;
-- General principle:
-- 1. Start with the most filtered table
-- 2. Join to progressively larger result setsEnsure Join Columns Are Indexed
-- Create indexes on foreign key columns
CREATE INDEX idx_orders_user_id ON orders(user_id);
CREATE INDEX idx_orders_product_id ON orders(product_id);
-- Verify index usage with EXPLAIN
EXPLAIN SELECT o.*, u.name
FROM orders o
JOIN users u ON o.user_id = u.id
WHERE o.created_at > '2024-01-01';WHERE Clause Optimization
Use Sargable Conditions
-- Sargable: Search ARGument ABLE (can use indexes)
-- Non-sargable (cannot use index)
SELECT * FROM users WHERE YEAR(birth_date) = 1990;
SELECT * FROM products WHERE price + 10 > 100;
SELECT * FROM users WHERE name LIKE '%john%';
-- Sargable (can use index)
SELECT * FROM users
WHERE birth_date >= '1990-01-01'
AND birth_date < '1991-01-01';
SELECT * FROM products WHERE price > 90;
SELECT * FROM users WHERE name LIKE 'john%';Avoid OR with Different Columns
-- Bad: OR prevents efficient index usage
SELECT * FROM products
WHERE category_id = 5 OR brand_id = 10;
-- Good: Use UNION
SELECT * FROM products WHERE category_id = 5
UNION
SELECT * FROM products WHERE brand_id = 10;
-- Or use IN when applicable
SELECT * FROM products
WHERE category_id IN (5, 6, 7); -- Can use indexNULL Handling
-- Index typically excludes NULL values
-- Bad: IS NOT NULL can't use regular index
SELECT * FROM users WHERE phone IS NOT NULL;
-- Solution: Partial index (PostgreSQL)
CREATE INDEX idx_users_phone_not_null
ON users(phone) WHERE phone IS NOT NULL;
-- COALESCE can prevent index usage
-- Bad
SELECT * FROM orders WHERE COALESCE(discount, 0) > 0;
-- Good
SELECT * FROM orders WHERE discount IS NOT NULL AND discount > 0;Subquery Optimization
Avoid Correlated Subqueries in SELECT
-- Bad: Executes subquery for each row
SELECT
u.name,
(SELECT COUNT(*) FROM orders WHERE user_id = u.id) as order_count
FROM users u;
-- Good: Use JOIN with aggregation
SELECT u.name, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.id, u.name;Convert Subqueries to JOINs
-- Subquery in WHERE
SELECT * FROM products
WHERE category_id IN (
SELECT id FROM categories WHERE active = true
);
-- Equivalent JOIN (often faster)
SELECT DISTINCT p.*
FROM products p
JOIN categories c ON p.category_id = c.id
WHERE c.active = true;Use CTEs for Readability
-- Common Table Expression (CTE)
WITH active_users AS (
SELECT id, name
FROM users
WHERE last_login > CURRENT_DATE - INTERVAL '30 days'
),
user_orders AS (
SELECT user_id, SUM(total) as total_spent
FROM orders
WHERE created_at > CURRENT_DATE - INTERVAL '30 days'
GROUP BY user_id
)
SELECT au.name, COALESCE(uo.total_spent, 0) as total_spent
FROM active_users au
LEFT JOIN user_orders uo ON au.id = uo.user_id;
-- Note: CTEs may be optimization barriers in some databases
-- PostgreSQL 12+ can inline CTEs with NOT MATERIALIZED hintAggregation Optimization
Index for GROUP BY
-- Create index matching GROUP BY columns
CREATE INDEX idx_orders_user_date ON orders(user_id, created_at);
-- Query benefits from index
SELECT user_id, DATE(created_at), COUNT(*)
FROM orders
GROUP BY user_id, DATE(created_at);Avoid DISTINCT When Possible
-- DISTINCT requires sorting/hashing
SELECT DISTINCT category_id FROM products;
-- If using JOIN, check if it's needed
-- Bad: DISTINCT to fix duplicate rows from JOIN
SELECT DISTINCT u.* FROM users u
JOIN orders o ON u.id = o.user_id;
-- Good: EXISTS avoids duplicates
SELECT u.* FROM users u
WHERE EXISTS (SELECT 1 FROM orders o WHERE o.user_id = u.id);Pre-aggregate When Possible
-- For reporting, consider materialized views or summary tables
-- Create summary table
CREATE TABLE daily_sales_summary (
date DATE PRIMARY KEY,
total_orders INT,
total_revenue DECIMAL(12, 2)
);
-- Populate with scheduled job
INSERT INTO daily_sales_summary
SELECT DATE(created_at), COUNT(*), SUM(total)
FROM orders
WHERE DATE(created_at) = CURRENT_DATE - 1
GROUP BY DATE(created_at)
ON CONFLICT (date) DO UPDATE
SET total_orders = EXCLUDED.total_orders,
total_revenue = EXCLUDED.total_revenue;Batch Operations
Bulk INSERT
-- Bad: Individual inserts
INSERT INTO logs (message) VALUES ('log 1');
INSERT INTO logs (message) VALUES ('log 2');
INSERT INTO logs (message) VALUES ('log 3');
-- Good: Batch insert
INSERT INTO logs (message) VALUES
('log 1'),
('log 2'),
('log 3');
-- PostgreSQL: Use COPY for very large imports
COPY logs (message) FROM '/path/to/file.csv' WITH CSV;Bulk UPDATE
-- Bad: Individual updates
UPDATE products SET price = 100 WHERE id = 1;
UPDATE products SET price = 200 WHERE id = 2;
UPDATE products SET price = 300 WHERE id = 3;
-- Good: Single update with CASE
UPDATE products
SET price = CASE id
WHEN 1 THEN 100
WHEN 2 THEN 200
WHEN 3 THEN 300
END
WHERE id IN (1, 2, 3);
-- PostgreSQL: UPDATE FROM
UPDATE products p
SET price = v.new_price
FROM (VALUES (1, 100), (2, 200), (3, 300)) AS v(id, new_price)
WHERE p.id = v.id;Anti-Patterns to Avoid
-- 1. SELECT * in production code
-- 2. Using OFFSET for deep pagination
-- 3. Implicit type conversions
SELECT * FROM users WHERE id = '123'; -- String vs Integer
-- 4. Over-normalization causing excessive JOINs
-- 5. N+1 query problem
-- 6. Not using prepared statements
-- 7. Unnecessary ORDER BY
-- 8. COUNT(*) when EXISTS would suffice
-- Bad: Need to count all rows
SELECT COUNT(*) FROM users WHERE status = 'active';
-- Then check if > 0
-- Good: Stop at first match
SELECT EXISTS (SELECT 1 FROM users WHERE status = 'active');Best Practices Summary
- Profile first: Don't optimize without measuring
- Use EXPLAIN: Understand query execution plans
- Index strategically: Not too many, not too few
- Write sargable queries: Keep conditions index-friendly
- Batch operations: Reduce round trips
- Limit results: Paginate large result sets
- Choose appropriate types: Match data types in comparisons
- Review regularly: Queries may need optimization as data grows