This analysis is based on a SQLite database containing sales data. The main tables are:
Before any analysis, it is important to clean the data to ensure quality and consistency. Here are the main cleaning steps applied to the sales table:
Cleaning queries:
-- 1. Remove duplicates (keep the first occurrence) WITH DuplicateCheck AS ( SELECT ORDERNUMBER, ORDERLINENUMBER, ROW_NUMBER() OVER ( PARTITION BY ORDERNUMBER, ORDERLINENUMBER ORDER BY ORDERNUMBER ) as row_num FROM sales ) DELETE FROM sales WHERE (ORDERNUMBER, ORDERLINENUMBER) IN ( SELECT ORDERNUMBER, ORDERLINENUMBER FROM DuplicateCheck WHERE row_num > 1 ); -- 2. Standardize dates (convert to date format) UPDATE sales SET ORDERDATE = date(ORDERDATE) WHERE ORDERDATE IS NOT NULL; -- 3. Clean numeric values (replace negative or null values) UPDATE sales SET QUANTITYORDERED = CASE WHEN QUANTITYORDERED <= 0 THEN NULL ELSE QUANTITYORDERED END, PRICEEACH = CASE WHEN PRICEEACH <= 0 THEN NULL ELSE PRICEEACH END, SALES = CASE WHEN SALES <= 0 THEN NULL ELSE SALES END; -- 4. Standardize text fields (uppercase, trim spaces) UPDATE sales SET PRODUCTLINE = TRIM(UPPER(PRODUCTLINE)), STATUS = TRIM(UPPER(STATUS)), DEALSIZE = TRIM(UPPER(DEALSIZE)); -- 5. Clean address fields (consistent formatting) UPDATE sales SET CITY = TRIM(CITY), STATE = TRIM(UPPER(STATE)), COUNTRY = TRIM(UPPER(COUNTRY)), TERRITORY = TRIM(UPPER(TERRITORY)); -- 6. Check consistency of amounts UPDATE sales SET SALES = QUANTITYORDERED * PRICEEACH WHERE ABS(SALES - (QUANTITYORDERED * PRICEEACH)) > 0.01; -- 7. Create indexes to speed up queries CREATE INDEX IF NOT EXISTS idx_sales_date ON sales(ORDERDATE); CREATE INDEX IF NOT EXISTS idx_sales_product ON sales(PRODUCTLINE); CREATE INDEX IF NOT EXISTS idx_sales_territory ON sales(TERRITORY); CREATE INDEX IF NOT EXISTS idx_sales_customer ON sales(CUSTOMERNAME); -- 8. Add derived columns (year, month, quarter) ALTER TABLE sales ADD COLUMN IF NOT EXISTS order_year INTEGER; ALTER TABLE sales ADD COLUMN IF NOT EXISTS order_month INTEGER; ALTER TABLE sales ADD COLUMN IF NOT EXISTS order_quarter INTEGER; UPDATE sales SET order_year = strftime('%Y', ORDERDATE), order_month = strftime('%m', ORDERDATE), order_quarter = CASE WHEN strftime('%m', ORDERDATE) IN ('01','02','03') THEN 1 WHEN strftime('%m', ORDERDATE) IN ('04','05','06') THEN 2 WHEN strftime('%m', ORDERDATE) IN ('07','08','09') THEN 3 ELSE 4 END; -- 9. Remove incomplete rows DELETE FROM sales WHERE ORDERNUMBER IS NULL OR PRODUCTLINE IS NULL OR ORDERDATE IS NULL; -- 10. Create a view for cleaned data CREATE VIEW IF NOT EXISTS clean_sales AS SELECT ORDERNUMBER, ORDERDATE, order_year, order_month, order_quarter, STATUS, PRODUCTLINE, QUANTITYORDERED, PRICEEACH, SALES, CUSTOMERNAME, COUNTRY, TERRITORY, DEALSIZE FROM sales WHERE SALES IS NOT NULL AND QUANTITYORDERED IS NOT NULL AND PRICEEACH IS NOT NULL;
-- Sales by year and quarter SELECT YEAR_ID, QTR_ID, ROUND(SUM(SALES), 2) as total_sales, -- Total sales amount COUNT(DISTINCT ORDERNUMBER) as number_of_orders -- Number of orders FROM sales GROUP BY YEAR_ID, QTR_ID ORDER BY YEAR_ID, QTR_ID;
-- Top 10 best-selling products SELECT PRODUCTLINE, COUNT(*) as number_of_sales, -- Number of sales ROUND(SUM(SALES), 2) as total_revenue, -- Total revenue ROUND(AVG(SALES), 2) as avg_revenue_per_sale -- Average sale per order FROM sales GROUP BY PRODUCTLINE ORDER BY total_revenue DESC LIMIT 10;
-- Sales by country SELECT COUNTRY, COUNT(DISTINCT CUSTOMERNAME) as number_of_customers, -- Number of customers ROUND(SUM(SALES), 2) as total_sales, -- Total sales ROUND(AVG(SALES), 2) as avg_sale_per_order -- Average sale per order FROM sales GROUP BY COUNTRY ORDER BY total_sales DESC;
-- Monthly sales performance SELECT YEAR_ID, MONTH_ID, COUNT(DISTINCT ORDERNUMBER) as number_of_orders, -- Number of orders ROUND(SUM(SALES), 2) as total_sales, -- Total sales ROUND(AVG(SALES), 2) as avg_order_value -- Average order value FROM sales GROUP BY YEAR_ID, MONTH_ID ORDER BY YEAR_ID, MONTH_ID;
-- Order size analysis SELECT DEALSIZE, COUNT(*) as number_of_deals, -- Number of orders ROUND(SUM(SALES), 2) as total_revenue, -- Total revenue ROUND(AVG(SALES), 2) as avg_revenue_per_deal -- Average revenue per order FROM sales GROUP BY DEALSIZE ORDER BY total_revenue DESC;
-- Order status analysis SELECT STATUS, COUNT(*) as number_of_orders, -- Number of orders ROUND(SUM(SALES), 2) as total_revenue -- Total revenue FROM sales GROUP BY STATUS ORDER BY number_of_orders DESC;
-- Top customers SELECT CUSTOMERNAME, COUNT(DISTINCT ORDERNUMBER) as number_of_orders, -- Number of orders ROUND(SUM(SALES), 2) as total_spent, -- Total spent ROUND(AVG(SALES), 2) as avg_order_value -- Average order value FROM sales GROUP BY CUSTOMERNAME ORDER BY total_spent DESC LIMIT 10;
-- Sales by territory SELECT TERRITORY, COUNT(DISTINCT CUSTOMERNAME) as number_of_customers, -- Number of customers COUNT(DISTINCT ORDERNUMBER) as number_of_orders, -- Number of orders ROUND(SUM(SALES), 2) as total_revenue -- Total revenue FROM sales GROUP BY TERRITORY ORDER BY total_revenue DESC;
-- Calculate month-over-month sales growth WITH monthly_sales AS ( SELECT YEAR_ID, MONTH_ID, ROUND(SUM(SALES), 2) as total_sales FROM sales GROUP BY YEAR_ID, MONTH_ID ), sales_variation AS ( SELECT *, LAG(total_sales) OVER (ORDER BY YEAR_ID, MONTH_ID) as prev_month_sales, ROUND(((total_sales - LAG(total_sales) OVER (ORDER BY YEAR_ID, MONTH_ID)) / LAG(total_sales) OVER (ORDER BY YEAR_ID, MONTH_ID)) * 100, 2) as growth_percentage FROM monthly_sales ) SELECT YEAR_ID, MONTH_ID, total_sales, prev_month_sales, growth_percentage || '%' as monthly_growth FROM sales_variation ORDER BY YEAR_ID, MONTH_ID;
-- Analyze order frequency and total quantity ordered by product SELECT PRODUCTLINE, COUNT(DISTINCT ORDERNUMBER) as number_of_orders, -- Number of distinct orders ROUND(SUM(QUANTITYORDERED), 2) as total_quantity_ordered, -- Total quantity ordered ROUND(AVG(QUANTITYORDERED), 2) as avg_quantity_per_order, -- Average quantity per order ROUND(COUNT(DISTINCT ORDERNUMBER) * 1.0 / (JULIANDAY(MAX(ORDERDATE)) - JULIANDAY(MIN(ORDERDATE))) * 30, 2) as orders_per_month -- Orders per month FROM sales GROUP BY PRODUCTLINE ORDER BY total_quantity_ordered DESC;
-- Analyze sales seasonality by product category SELECT PRODUCTLINE, MONTH_ID, ROUND(AVG(SALES), 2) as avg_monthly_sales, -- Average monthly sales COUNT(DISTINCT ORDERNUMBER) as number_of_orders, -- Number of orders ROUND(SUM(QUANTITYORDERED), 2) as total_quantity -- Total quantity FROM sales GROUP BY PRODUCTLINE, MONTH_ID ORDER BY PRODUCTLINE, MONTH_ID;
-- Simple forecast based on historical growth WITH yearly_sales AS ( SELECT YEAR_ID, ROUND(SUM(SALES), 2) as total_sales FROM sales GROUP BY YEAR_ID ), growth_calc AS ( SELECT YEAR_ID, total_sales, LAG(total_sales) OVER (ORDER BY YEAR_ID) as prev_year_sales, ROUND(((total_sales - LAG(total_sales) OVER (ORDER BY YEAR_ID)) / LAG(total_sales) OVER (ORDER BY YEAR_ID)) * 100, 2) as growth_rate FROM yearly_sales ) SELECT YEAR_ID, total_sales, growth_rate || '%' as year_over_year_growth, ROUND(total_sales * (1 + AVG(growth_rate) OVER () / 100), 2) as projected_next_year FROM growth_calc;
-- Identify unusually large orders (z-score > 2) WITH order_stats AS ( SELECT PRODUCTLINE, AVG(QUANTITYORDERED) as avg_quantity, STDDEV(QUANTITYORDERED) as stddev_quantity FROM sales GROUP BY PRODUCTLINE ) SELECT s.PRODUCTLINE, s.ORDERNUMBER, s.QUANTITYORDERED, os.avg_quantity, ROUND((s.QUANTITYORDERED - os.avg_quantity) / os.stddev_quantity, 2) as z_score FROM sales s JOIN order_stats os ON s.PRODUCTLINE = os.PRODUCTLINE WHERE (s.QUANTITYORDERED - os.avg_quantity) / os.stddev_quantity > 2 ORDER BY z_score DESC;
-- Calculate the correlation between price and quantity sold SELECT PRODUCTLINE, ROUND(AVG(PRICEEACH), 2) as avg_price, -- Average price ROUND(AVG(QUANTITYORDERED), 2) as avg_quantity, -- Average quantity ROUND(CORR(PRICEEACH, QUANTITYORDERED), 4) as price_quantity_correlation -- Price-quantity correlation FROM sales GROUP BY PRODUCTLINE ORDER BY price_quantity_correlation;
-- RFM analysis: segment customers by recency, frequency, and monetary value WITH rfm_calc AS ( SELECT CUSTOMERNAME, ROUND(JULIANDAY(MAX(ORDERDATE)) - JULIANDAY(MIN(ORDERDATE)), 0) as recency, -- Days since first order COUNT(DISTINCT ORDERNUMBER) as frequency, -- Number of orders ROUND(SUM(SALES), 2) as monetary -- Total amount FROM sales GROUP BY CUSTOMERNAME ) SELECT CUSTOMERNAME, recency, frequency, monetary, NTILE(5) OVER (ORDER BY recency DESC) as R_score, -- Recency score NTILE(5) OVER (ORDER BY frequency) as F_score, -- Frequency score NTILE(5) OVER (ORDER BY monetary) as M_score -- Monetary score FROM rfm_calc ORDER BY monetary DESC;