Query Optimization

Writing efficient queries - avoiding common pitfalls and optimization techniques

Writing efficient queries is fundamental to database performance. Poor queries can make even well-indexed databases slow.

SELECT Optimization

Select Only What You Need

-- Bad: Selecting all columns
SELECT * FROM users WHERE id = 1;

-- Good: Select only needed columns
SELECT id, name, email FROM users WHERE id = 1;

-- Benefits:
-- ✓ Less data transferred
-- ✓ Can use covering indexes
-- ✓ Reduces memory usage
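
As a sketch of the covering-index point above (PostgreSQL 11+ syntax; the index name is illustrative): if an index contains every column the query reads, the query can be answered with an index-only scan and never touches the table.

-- Covering index: email is the search key, id and name are carried along
CREATE INDEX idx_users_email_covering ON users (email) INCLUDE (id, name);

SELECT id, name, email FROM users WHERE email = 'user@example.com';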

Avoid Functions on Indexed Columns

-- Bad: Function prevents index usage
SELECT * FROM orders WHERE YEAR(created_at) = 2024;

-- Good: Use range condition
SELECT * FROM orders
WHERE created_at >= '2024-01-01'
  AND created_at < '2025-01-01';

-- Bad: Function on column
SELECT * FROM users WHERE LOWER(email) = 'user@example.com';

-- Good: Store lowercase or use expression index
CREATE INDEX idx_users_email_lower ON users (LOWER(email));
-- Or store email in lowercase
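
Once the expression index exists, the LOWER() lookup from above can use it (the expression-index syntax shown is PostgreSQL's; MySQL 8.0+ offers functional indexes with slightly different syntax):

-- Now indexable
SELECT id, name, email FROM users WHERE LOWER(email) = 'user@example.com';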

Use LIMIT for Large Results

-- Bad: Fetching all rows
SELECT * FROM logs WHERE level = 'ERROR';

-- Good: Use pagination
SELECT * FROM logs
WHERE level = 'ERROR'
ORDER BY created_at DESC
LIMIT 100 OFFSET 0;

-- Better: Keyset pagination (pass the previous page's last created_at instead of an offset)
SELECT * FROM logs
WHERE level = 'ERROR'
  AND created_at < '2024-06-01 10:00:00'
ORDER BY created_at DESC
LIMIT 100;
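
Either form works best with an index that matches both the filter and the sort; a minimal sketch, with an illustrative index name:

-- Serves WHERE level = ... and ORDER BY created_at DESC in one pass
CREATE INDEX idx_logs_level_created_at ON logs (level, created_at DESC);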

JOIN Optimization

Choose the Right JOIN Type

-- INNER JOIN: Only matching rows
SELECT o.id, u.name
FROM orders o
INNER JOIN users u ON o.user_id = u.id;

-- LEFT JOIN: All from left, matching from right
SELECT u.name, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.id;

-- Use EXISTS instead of IN for large subqueries
-- Bad
SELECT * FROM users
WHERE id IN (SELECT user_id FROM orders WHERE total > 1000);

-- Good
SELECT * FROM users u
WHERE EXISTS (
    SELECT 1 FROM orders o
    WHERE o.user_id = u.id AND o.total > 1000
);

Join Order Matters

-- Database optimizers usually handle this, but hints can help

-- PostgreSQL: Join order hint
SET join_collapse_limit = 1;  -- Preserve written order

-- MySQL: STRAIGHT_JOIN forces tables to be joined in the written order
SELECT o.*, p.*
FROM orders o
STRAIGHT_JOIN products p ON o.product_id = p.id;

-- General principle:
-- 1. Start with the most filtered table
-- 2. Join to progressively larger result sets

Ensure Join Columns Are Indexed

-- Create indexes on foreign key columns
CREATE INDEX idx_orders_user_id ON orders(user_id);
CREATE INDEX idx_orders_product_id ON orders(product_id);

-- Verify index usage with EXPLAIN
EXPLAIN SELECT o.*, u.name
FROM orders o
JOIN users u ON o.user_id = u.id
WHERE o.created_at > '2024-01-01';
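-- In the resulting plan, prefer Index Scan / Index Only Scan (PostgreSQL)
-- or type = ref / range (MySQL) over Seq Scan / type = ALL on large tables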

WHERE Clause Optimization

Use Sargable Conditions

-- Sargable: Search ARGument ABLE (can use indexes)

-- Non-sargable (cannot use index)
SELECT * FROM users WHERE YEAR(birth_date) = 1990;
SELECT * FROM products WHERE price + 10 > 100;
SELECT * FROM users WHERE name LIKE '%john%';

-- Sargable (can use index)
SELECT * FROM users
WHERE birth_date >= '1990-01-01'
  AND birth_date < '1991-01-01';
SELECT * FROM products WHERE price > 90;
SELECT * FROM users WHERE name LIKE 'john%';

Avoid OR with Different Columns

-- Bad: OR prevents efficient index usage
SELECT * FROM products
WHERE category_id = 5 OR brand_id = 10;

-- Good: UNION lets each branch use its own index
SELECT * FROM products WHERE category_id = 5
UNION
SELECT * FROM products WHERE brand_id = 10;

-- Or use IN when applicable
SELECT * FROM products
WHERE category_id IN (5, 6, 7);  -- Can use index

NULL Handling

-- Whether NULLs are indexed varies: some databases exclude them from
-- B-tree indexes, while PostgreSQL and MySQL include them

-- Often scans the table anyway: when most users have a phone number,
-- IS NOT NULL matches too many rows for an index to help
SELECT * FROM users WHERE phone IS NOT NULL;

-- If most values are NULL, a partial index (PostgreSQL) stays small and
-- targets exactly the non-NULL rows
CREATE INDEX idx_users_phone_not_null
ON users(phone) WHERE phone IS NOT NULL;

-- COALESCE can prevent index usage
-- Bad
SELECT * FROM orders WHERE COALESCE(discount, 0) > 0;

-- Good
SELECT * FROM orders WHERE discount IS NOT NULL AND discount > 0;

Subquery Optimization

Avoid Correlated Subqueries in SELECT

-- Bad: Executes subquery for each row
SELECT
    u.name,
    (SELECT COUNT(*) FROM orders WHERE user_id = u.id) as order_count
FROM users u;

-- Good: Use JOIN with aggregation
SELECT u.name, COUNT(o.id) as order_count
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.id, u.name;

Convert Subqueries to JOINs

-- Subquery in WHERE
SELECT * FROM products
WHERE category_id IN (
    SELECT id FROM categories WHERE active = true
);

-- Equivalent JOIN (often faster)
SELECT DISTINCT p.*
FROM products p
JOIN categories c ON p.category_id = c.id
WHERE c.active = true;

Use CTEs for Readability

-- Common Table Expression (CTE)
WITH active_users AS (
    SELECT id, name
    FROM users
    WHERE last_login > CURRENT_DATE - INTERVAL '30 days'
),
user_orders AS (
    SELECT user_id, SUM(total) as total_spent
    FROM orders
    WHERE created_at > CURRENT_DATE - INTERVAL '30 days'
    GROUP BY user_id
)
SELECT au.name, COALESCE(uo.total_spent, 0) as total_spent
FROM active_users au
LEFT JOIN user_orders uo ON au.id = uo.user_id;

-- Note: CTEs act as optimization barriers in some databases (including
-- PostgreSQL before version 12). PostgreSQL 12+ inlines non-recursive,
-- singly-referenced CTEs by default; MATERIALIZED / NOT MATERIALIZED
-- control this explicitly.
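
A minimal sketch of the PostgreSQL 12+ hint syntax, reusing the active_users CTE from above (this CTE would be inlined by default anyway since it is referenced once; the keyword just makes the intent explicit):

WITH active_users AS NOT MATERIALIZED (
    SELECT id, name
    FROM users
    WHERE last_login > CURRENT_DATE - INTERVAL '30 days'
)
SELECT * FROM active_users WHERE name LIKE 'A%';
-- NOT MATERIALIZED lets the planner push the LIKE filter down into users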

Aggregation Optimization

Index for GROUP BY

-- Create index matching GROUP BY columns
CREATE INDEX idx_orders_user_date ON orders(user_id, created_at);

-- Query benefits from index
SELECT user_id, DATE(created_at), COUNT(*)
FROM orders
GROUP BY user_id, DATE(created_at);

Avoid DISTINCT When Possible

-- DISTINCT requires sorting/hashing
SELECT DISTINCT category_id FROM products;

-- If using JOIN, check if it's needed
-- Bad: DISTINCT to fix duplicate rows from JOIN
SELECT DISTINCT u.* FROM users u
JOIN orders o ON u.id = o.user_id;

-- Good: EXISTS avoids duplicates
SELECT u.* FROM users u
WHERE EXISTS (SELECT 1 FROM orders o WHERE o.user_id = u.id);

Pre-aggregate When Possible

-- For reporting, consider materialized views or summary tables

-- Create summary table
CREATE TABLE daily_sales_summary (
    date DATE PRIMARY KEY,
    total_orders INT,
    total_revenue DECIMAL(12, 2)
);

-- Populate with scheduled job
INSERT INTO daily_sales_summary
SELECT DATE(created_at), COUNT(*), SUM(total)
FROM orders
WHERE created_at >= CURRENT_DATE - 1
  AND created_at < CURRENT_DATE  -- range condition keeps the filter sargable
GROUP BY DATE(created_at)
ON CONFLICT (date) DO UPDATE
SET total_orders = EXCLUDED.total_orders,
    total_revenue = EXCLUDED.total_revenue;
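
Where the database supports them, a materialized view can replace the hand-maintained summary table; a PostgreSQL sketch (view and index names are illustrative):

CREATE MATERIALIZED VIEW daily_sales_mv AS
SELECT created_at::date AS date,
       COUNT(*) AS total_orders,
       SUM(total) AS total_revenue
FROM orders
GROUP BY created_at::date;

-- A unique index allows REFRESH ... CONCURRENTLY, which does not block readers
CREATE UNIQUE INDEX idx_daily_sales_mv_date ON daily_sales_mv (date);

-- Run from the same scheduled job
REFRESH MATERIALIZED VIEW CONCURRENTLY daily_sales_mv;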

Batch Operations

Bulk INSERT

-- Bad: Individual inserts
INSERT INTO logs (message) VALUES ('log 1');
INSERT INTO logs (message) VALUES ('log 2');
INSERT INTO logs (message) VALUES ('log 3');

-- Good: Batch insert
INSERT INTO logs (message) VALUES
    ('log 1'),
    ('log 2'),
    ('log 3');

-- PostgreSQL: Use COPY for very large imports
COPY logs (message) FROM '/path/to/file.csv' WITH CSV;
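-- Note: COPY reads from the server's filesystem; from a client, psql's \copy
-- streams the file over the connection instead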

Bulk UPDATE

-- Bad: Individual updates
UPDATE products SET price = 100 WHERE id = 1;
UPDATE products SET price = 200 WHERE id = 2;
UPDATE products SET price = 300 WHERE id = 3;

-- Good: Single update with CASE
UPDATE products
SET price = CASE id
    WHEN 1 THEN 100
    WHEN 2 THEN 200
    WHEN 3 THEN 300
END
WHERE id IN (1, 2, 3);

-- PostgreSQL: UPDATE FROM
UPDATE products p
SET price = v.new_price
FROM (VALUES (1, 100), (2, 200), (3, 300)) AS v(id, new_price)
WHERE p.id = v.id;

Anti-Patterns to Avoid

-- 1. SELECT * in production code
-- 2. Using OFFSET for deep pagination
-- 3. Implicit type conversions
SELECT * FROM users WHERE phone = 5550123;  -- Number vs string column: the column is cast, so the index is typically skipped

-- 4. Over-normalization causing excessive JOINs
-- 5. N+1 query problem (see example below)
-- 6. Not using prepared statements (see example below)
-- 7. Unnecessary ORDER BY
-- 8. COUNT(*) when EXISTS would suffice

-- Bad: Need to count all rows
SELECT COUNT(*) FROM users WHERE status = 'active';
-- Then check if > 0

-- Good: Stop at first match
SELECT EXISTS (SELECT 1 FROM users WHERE status = 'active');
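
Sketches for items 5 and 6 above, reusing the users/orders schema from earlier examples. The prepared statement uses PostgreSQL's SQL-level PREPARE; most applications would use their driver's parameter placeholders instead.

-- N+1: one query for the parent rows, then one more query per row
SELECT id, name FROM users WHERE status = 'active';
SELECT * FROM orders WHERE user_id = 1;   -- repeated for every user returned
SELECT * FROM orders WHERE user_id = 2;
-- ...

-- Better: fetch the related rows in a single round trip
SELECT u.id, u.name, o.id AS order_id, o.total
FROM users u
LEFT JOIN orders o ON o.user_id = u.id
WHERE u.status = 'active';

-- Prepared statement: parsed once, executed with parameters,
-- and immune to SQL injection through the parameter values
PREPARE find_user (INT) AS
    SELECT id, name, email FROM users WHERE id = $1;
EXECUTE find_user(123);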

Best Practices Summary

  1. Profile first: Don't optimize without measuring
  2. Use EXPLAIN: Understand query execution plans
  3. Index strategically: Not too many, not too few
  4. Write sargable queries: Keep conditions index-friendly
  5. Batch operations: Reduce round trips
  6. Limit results: Paginate large result sets
  7. Choose appropriate types: Match data types in comparisons
  8. Review regularly: Queries may need optimization as data grows
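
For points 1 and 2, a short PostgreSQL sketch (MySQL 8.0.18+ also supports EXPLAIN ANALYZE); note that ANALYZE actually executes the statement, so use it carefully on writes:

-- Reports the chosen plan plus actual row counts, timings, and buffer usage
EXPLAIN (ANALYZE, BUFFERS)
SELECT o.id, u.name
FROM orders o
JOIN users u ON o.user_id = u.id
WHERE o.created_at > '2024-01-01';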
