This analysis is based on a SQLite database containing sales data. The main tables are:
Before any analysis, it is important to clean the data to ensure quality and consistency. Here are the main cleaning steps applied to the sales table:
Cleaning queries:
-- 1. Remove duplicates (keep the first occurrence)
WITH DuplicateCheck AS (
SELECT
ORDERNUMBER,
ORDERLINENUMBER,
ROW_NUMBER() OVER (
PARTITION BY ORDERNUMBER, ORDERLINENUMBER
ORDER BY ORDERNUMBER
) as row_num
FROM sales
)
DELETE FROM sales
WHERE (ORDERNUMBER, ORDERLINENUMBER) IN (
SELECT ORDERNUMBER, ORDERLINENUMBER
FROM DuplicateCheck
WHERE row_num > 1
);
-- 2. Standardize dates (convert to date format)
UPDATE sales
SET ORDERDATE = date(ORDERDATE)
WHERE ORDERDATE IS NOT NULL;
-- 3. Clean numeric values (replace negative or null values)
UPDATE sales
SET
QUANTITYORDERED = CASE
WHEN QUANTITYORDERED <= 0 THEN NULL
ELSE QUANTITYORDERED
END,
PRICEEACH = CASE
WHEN PRICEEACH <= 0 THEN NULL
ELSE PRICEEACH
END,
SALES = CASE
WHEN SALES <= 0 THEN NULL
ELSE SALES
END;
-- 4. Standardize text fields (uppercase, trim spaces)
UPDATE sales
SET
PRODUCTLINE = TRIM(UPPER(PRODUCTLINE)),
STATUS = TRIM(UPPER(STATUS)),
DEALSIZE = TRIM(UPPER(DEALSIZE));
-- 5. Clean address fields (consistent formatting)
UPDATE sales
SET
CITY = TRIM(CITY),
STATE = TRIM(UPPER(STATE)),
COUNTRY = TRIM(UPPER(COUNTRY)),
TERRITORY = TRIM(UPPER(TERRITORY));
-- 6. Check consistency of amounts
UPDATE sales
SET SALES = QUANTITYORDERED * PRICEEACH
WHERE ABS(SALES - (QUANTITYORDERED * PRICEEACH)) > 0.01;
-- 7. Create indexes to speed up queries
CREATE INDEX IF NOT EXISTS idx_sales_date ON sales(ORDERDATE);
CREATE INDEX IF NOT EXISTS idx_sales_product ON sales(PRODUCTLINE);
CREATE INDEX IF NOT EXISTS idx_sales_territory ON sales(TERRITORY);
CREATE INDEX IF NOT EXISTS idx_sales_customer ON sales(CUSTOMERNAME);
-- 8. Add derived columns (year, month, quarter)
ALTER TABLE sales ADD COLUMN IF NOT EXISTS order_year INTEGER;
ALTER TABLE sales ADD COLUMN IF NOT EXISTS order_month INTEGER;
ALTER TABLE sales ADD COLUMN IF NOT EXISTS order_quarter INTEGER;
UPDATE sales
SET
order_year = strftime('%Y', ORDERDATE),
order_month = strftime('%m', ORDERDATE),
order_quarter = CASE
WHEN strftime('%m', ORDERDATE) IN ('01','02','03') THEN 1
WHEN strftime('%m', ORDERDATE) IN ('04','05','06') THEN 2
WHEN strftime('%m', ORDERDATE) IN ('07','08','09') THEN 3
ELSE 4
END;
-- 9. Remove incomplete rows
DELETE FROM sales
WHERE ORDERNUMBER IS NULL
OR PRODUCTLINE IS NULL
OR ORDERDATE IS NULL;
-- 10. Create a view for cleaned data
CREATE VIEW IF NOT EXISTS clean_sales AS
SELECT
ORDERNUMBER,
ORDERDATE,
order_year,
order_month,
order_quarter,
STATUS,
PRODUCTLINE,
QUANTITYORDERED,
PRICEEACH,
SALES,
CUSTOMERNAME,
COUNTRY,
TERRITORY,
DEALSIZE
FROM sales
WHERE SALES IS NOT NULL
AND QUANTITYORDERED IS NOT NULL
AND PRICEEACH IS NOT NULL;
-- Sales by year and quarter
SELECT
YEAR_ID,
QTR_ID,
ROUND(SUM(SALES), 2) as total_sales, -- Total sales amount
COUNT(DISTINCT ORDERNUMBER) as number_of_orders -- Number of orders
FROM sales
GROUP BY YEAR_ID, QTR_ID
ORDER BY YEAR_ID, QTR_ID;
-- Top 10 best-selling products
SELECT
PRODUCTLINE,
COUNT(*) as number_of_sales, -- Number of sales
ROUND(SUM(SALES), 2) as total_revenue, -- Total revenue
ROUND(AVG(SALES), 2) as avg_revenue_per_sale -- Average sale per order
FROM sales
GROUP BY PRODUCTLINE
ORDER BY total_revenue DESC
LIMIT 10;
-- Sales by country
SELECT
COUNTRY,
COUNT(DISTINCT CUSTOMERNAME) as number_of_customers, -- Number of customers
ROUND(SUM(SALES), 2) as total_sales, -- Total sales
ROUND(AVG(SALES), 2) as avg_sale_per_order -- Average sale per order
FROM sales
GROUP BY COUNTRY
ORDER BY total_sales DESC;
-- Monthly sales performance
SELECT
YEAR_ID,
MONTH_ID,
COUNT(DISTINCT ORDERNUMBER) as number_of_orders, -- Number of orders
ROUND(SUM(SALES), 2) as total_sales, -- Total sales
ROUND(AVG(SALES), 2) as avg_order_value -- Average order value
FROM sales
GROUP BY YEAR_ID, MONTH_ID
ORDER BY YEAR_ID, MONTH_ID;
-- Order size analysis
SELECT
DEALSIZE,
COUNT(*) as number_of_deals, -- Number of orders
ROUND(SUM(SALES), 2) as total_revenue, -- Total revenue
ROUND(AVG(SALES), 2) as avg_revenue_per_deal -- Average revenue per order
FROM sales
GROUP BY DEALSIZE
ORDER BY total_revenue DESC;
-- Order status analysis
SELECT
STATUS,
COUNT(*) as number_of_orders, -- Number of orders
ROUND(SUM(SALES), 2) as total_revenue -- Total revenue
FROM sales
GROUP BY STATUS
ORDER BY number_of_orders DESC;
-- Top customers
SELECT
CUSTOMERNAME,
COUNT(DISTINCT ORDERNUMBER) as number_of_orders, -- Number of orders
ROUND(SUM(SALES), 2) as total_spent, -- Total spent
ROUND(AVG(SALES), 2) as avg_order_value -- Average order value
FROM sales
GROUP BY CUSTOMERNAME
ORDER BY total_spent DESC
LIMIT 10;
-- Sales by territory
SELECT
TERRITORY,
COUNT(DISTINCT CUSTOMERNAME) as number_of_customers, -- Number of customers
COUNT(DISTINCT ORDERNUMBER) as number_of_orders, -- Number of orders
ROUND(SUM(SALES), 2) as total_revenue -- Total revenue
FROM sales
GROUP BY TERRITORY
ORDER BY total_revenue DESC;
-- Calculate month-over-month sales growth
WITH monthly_sales AS (
SELECT
YEAR_ID,
MONTH_ID,
ROUND(SUM(SALES), 2) as total_sales
FROM sales
GROUP BY YEAR_ID, MONTH_ID
),
sales_variation AS (
SELECT
*,
LAG(total_sales) OVER (ORDER BY YEAR_ID, MONTH_ID) as prev_month_sales,
ROUND(((total_sales - LAG(total_sales) OVER (ORDER BY YEAR_ID, MONTH_ID)) /
LAG(total_sales) OVER (ORDER BY YEAR_ID, MONTH_ID)) * 100, 2) as growth_percentage
FROM monthly_sales
)
SELECT
YEAR_ID,
MONTH_ID,
total_sales,
prev_month_sales,
growth_percentage || '%' as monthly_growth
FROM sales_variation
ORDER BY YEAR_ID, MONTH_ID;
-- Analyze order frequency and total quantity ordered by product
SELECT
PRODUCTLINE,
COUNT(DISTINCT ORDERNUMBER) as number_of_orders, -- Number of distinct orders
ROUND(SUM(QUANTITYORDERED), 2) as total_quantity_ordered, -- Total quantity ordered
ROUND(AVG(QUANTITYORDERED), 2) as avg_quantity_per_order, -- Average quantity per order
ROUND(COUNT(DISTINCT ORDERNUMBER) * 1.0 /
(JULIANDAY(MAX(ORDERDATE)) - JULIANDAY(MIN(ORDERDATE))) * 30, 2) as orders_per_month -- Orders per month
FROM sales
GROUP BY PRODUCTLINE
ORDER BY total_quantity_ordered DESC;
-- Analyze sales seasonality by product category
SELECT
PRODUCTLINE,
MONTH_ID,
ROUND(AVG(SALES), 2) as avg_monthly_sales, -- Average monthly sales
COUNT(DISTINCT ORDERNUMBER) as number_of_orders, -- Number of orders
ROUND(SUM(QUANTITYORDERED), 2) as total_quantity -- Total quantity
FROM sales
GROUP BY PRODUCTLINE, MONTH_ID
ORDER BY PRODUCTLINE, MONTH_ID;
-- Simple forecast based on historical growth
WITH yearly_sales AS (
SELECT
YEAR_ID,
ROUND(SUM(SALES), 2) as total_sales
FROM sales
GROUP BY YEAR_ID
),
growth_calc AS (
SELECT
YEAR_ID,
total_sales,
LAG(total_sales) OVER (ORDER BY YEAR_ID) as prev_year_sales,
ROUND(((total_sales - LAG(total_sales) OVER (ORDER BY YEAR_ID)) /
LAG(total_sales) OVER (ORDER BY YEAR_ID)) * 100, 2) as growth_rate
FROM yearly_sales
)
SELECT
YEAR_ID,
total_sales,
growth_rate || '%' as year_over_year_growth,
ROUND(total_sales * (1 + AVG(growth_rate) OVER () / 100), 2) as projected_next_year
FROM growth_calc;
-- Identify unusually large orders (z-score > 2)
WITH order_stats AS (
SELECT
PRODUCTLINE,
AVG(QUANTITYORDERED) as avg_quantity,
STDDEV(QUANTITYORDERED) as stddev_quantity
FROM sales
GROUP BY PRODUCTLINE
)
SELECT
s.PRODUCTLINE,
s.ORDERNUMBER,
s.QUANTITYORDERED,
os.avg_quantity,
ROUND((s.QUANTITYORDERED - os.avg_quantity) / os.stddev_quantity, 2) as z_score
FROM sales s
JOIN order_stats os ON s.PRODUCTLINE = os.PRODUCTLINE
WHERE (s.QUANTITYORDERED - os.avg_quantity) / os.stddev_quantity > 2
ORDER BY z_score DESC;
-- Calculate the correlation between price and quantity sold
SELECT
PRODUCTLINE,
ROUND(AVG(PRICEEACH), 2) as avg_price, -- Average price
ROUND(AVG(QUANTITYORDERED), 2) as avg_quantity, -- Average quantity
ROUND(CORR(PRICEEACH, QUANTITYORDERED), 4) as price_quantity_correlation -- Price-quantity correlation
FROM sales
GROUP BY PRODUCTLINE
ORDER BY price_quantity_correlation;
-- RFM analysis: segment customers by recency, frequency, and monetary value
WITH rfm_calc AS (
SELECT
CUSTOMERNAME,
ROUND(JULIANDAY(MAX(ORDERDATE)) - JULIANDAY(MIN(ORDERDATE)), 0) as recency, -- Days since first order
COUNT(DISTINCT ORDERNUMBER) as frequency, -- Number of orders
ROUND(SUM(SALES), 2) as monetary -- Total amount
FROM sales
GROUP BY CUSTOMERNAME
)
SELECT
CUSTOMERNAME,
recency,
frequency,
monetary,
NTILE(5) OVER (ORDER BY recency DESC) as R_score, -- Recency score
NTILE(5) OVER (ORDER BY frequency) as F_score, -- Frequency score
NTILE(5) OVER (ORDER BY monetary) as M_score -- Monetary score
FROM rfm_calc
ORDER BY monetary DESC;