-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstar_schema.sql
More file actions
228 lines (213 loc) · 8.62 KB
/
star_schema.sql
File metadata and controls
228 lines (213 loc) · 8.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
-- Star Schema Implementation for Mini Data Warehouse
-- This script creates dimension and fact tables following star schema design
-- Drop existing star schema objects if they exist
DROP TABLE IF EXISTS fact_sales CASCADE;
DROP TABLE IF EXISTS dim_customer CASCADE;
DROP TABLE IF EXISTS dim_product CASCADE;
DROP TABLE IF EXISTS dim_date CASCADE;
DROP VIEW IF EXISTS sales_summary CASCADE;
-- Date dimension table
CREATE TABLE dim_date (
date_key INTEGER PRIMARY KEY,
full_date DATE NOT NULL,
year INTEGER NOT NULL,
quarter INTEGER NOT NULL,
month INTEGER NOT NULL,
month_name VARCHAR(20) NOT NULL,
day INTEGER NOT NULL,
day_of_week INTEGER NOT NULL,
day_name VARCHAR(20) NOT NULL,
week_of_year INTEGER NOT NULL,
is_weekend BOOLEAN NOT NULL
);
-- Customer dimension table
CREATE TABLE dim_customer (
customer_key SERIAL PRIMARY KEY,
customer_id INTEGER NOT NULL,
first_name TEXT,
last_name TEXT,
full_name TEXT,
email TEXT,
city TEXT,
country TEXT,
region TEXT,
created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Product dimension table
CREATE TABLE dim_product (
product_key SERIAL PRIMARY KEY,
product_id INTEGER NOT NULL,
product_name TEXT,
category TEXT,
price NUMERIC(10,2),
price_tier TEXT,
created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Sales fact table
CREATE TABLE fact_sales (
sales_key SERIAL PRIMARY KEY,
date_key INTEGER REFERENCES dim_date(date_key),
customer_key INTEGER REFERENCES dim_customer(customer_key),
product_key INTEGER REFERENCES dim_product(product_key),
order_id INTEGER,
order_item_id INTEGER,
quantity INTEGER,
unit_price NUMERIC(10,2),
total_amount NUMERIC(12,2),
profit_margin NUMERIC(5,2), -- Simulated profit margin
created_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Populate date dimension (2024 and some future dates)
INSERT INTO dim_date (date_key, full_date, year, quarter, month, month_name, day, day_of_week, day_name, week_of_year, is_weekend)
SELECT
TO_CHAR(date_series, 'YYYYMMDD')::INTEGER as date_key,
date_series as full_date,
EXTRACT(YEAR FROM date_series) as year,
EXTRACT(QUARTER FROM date_series) as quarter,
EXTRACT(MONTH FROM date_series) as month,
TO_CHAR(date_series, 'Month') as month_name,
EXTRACT(DAY FROM date_series) as day,
EXTRACT(DOW FROM date_series) as day_of_week,
TO_CHAR(date_series, 'Day') as day_name,
EXTRACT(WEEK FROM date_series) as week_of_year,
CASE WHEN EXTRACT(DOW FROM date_series) IN (0, 6) THEN TRUE ELSE FALSE END as is_weekend
FROM generate_series('2024-01-01'::DATE, '2025-12-31'::DATE, '1 day'::INTERVAL) as date_series;
-- Populate customer dimension with additional derived fields
INSERT INTO dim_customer (customer_id, first_name, last_name, full_name, email, city, country, region)
SELECT
customer_id,
first_name,
last_name,
first_name || ' ' || last_name as full_name,
email,
city,
country,
CASE
WHEN country IN ('United States', 'Canada', 'Mexico') THEN 'North America'
WHEN country IN ('United Kingdom', 'France', 'Germany', 'Spain', 'Italy', 'Netherlands', 'Poland') THEN 'Europe'
WHEN country IN ('China', 'Japan', 'India', 'South Korea', 'Thailand', 'Singapore') THEN 'Asia'
WHEN country IN ('Brazil', 'Argentina', 'Chile', 'Colombia') THEN 'South America'
WHEN country IN ('Australia', 'New Zealand') THEN 'Oceania'
WHEN country IN ('South Africa', 'Egypt', 'Nigeria', 'Kenya') THEN 'Africa'
ELSE 'Other'
END as region
FROM customers;
-- Populate product dimension with additional derived fields
INSERT INTO dim_product (product_id, product_name, category, price, price_tier)
SELECT
product_id,
product_name,
category,
price,
CASE
WHEN price < 25 THEN 'Budget'
WHEN price BETWEEN 25 AND 100 THEN 'Mid-Range'
WHEN price BETWEEN 100 AND 500 THEN 'Premium'
WHEN price >= 500 THEN 'Luxury'
END as price_tier
FROM products;
-- Populate sales fact table
INSERT INTO fact_sales (date_key, customer_key, product_key, order_id, order_item_id, quantity, unit_price, total_amount, profit_margin)
SELECT
TO_CHAR(o.order_date, 'YYYYMMDD')::INTEGER as date_key,
dc.customer_key,
dp.product_key,
o.order_id,
oi.order_item_id,
oi.quantity,
oi.unit_price,
oi.quantity * oi.unit_price as total_amount,
-- Simulate profit margin based on product category
CASE
WHEN dp.category = 'Electronics' THEN 15.0
WHEN dp.category = 'Furniture' THEN 25.0
WHEN dp.category = 'Clothing' THEN 45.0
WHEN dp.category = 'Books' THEN 35.0
WHEN dp.category = 'Sports' THEN 30.0
ELSE 20.0
END as profit_margin
FROM order_items oi
JOIN orders o ON oi.order_id = o.order_id
JOIN dim_customer dc ON o.customer_id = dc.customer_id
JOIN dim_product dp ON oi.product_id = dp.product_id;
-- Create indexes for better query performance
CREATE INDEX idx_fact_sales_date_key ON fact_sales(date_key);
CREATE INDEX idx_fact_sales_customer_key ON fact_sales(customer_key);
CREATE INDEX idx_fact_sales_product_key ON fact_sales(product_key);
CREATE INDEX idx_dim_date_full_date ON dim_date(full_date);
CREATE INDEX idx_dim_customer_customer_id ON dim_customer(customer_id);
CREATE INDEX idx_dim_product_product_id ON dim_product(product_id);
-- Create summary views for common analytics queries
CREATE VIEW sales_summary AS
SELECT
dd.full_date,
dd.year,
dd.month_name,
dc.region,
dc.country,
dp.category,
dp.price_tier,
COUNT(*) as transaction_count,
SUM(fs.quantity) as total_quantity,
SUM(fs.total_amount) as total_revenue,
AVG(fs.total_amount) as avg_transaction_value,
SUM(fs.total_amount * fs.profit_margin / 100) as total_profit
FROM fact_sales fs
JOIN dim_date dd ON fs.date_key = dd.date_key
JOIN dim_customer dc ON fs.customer_key = dc.customer_key
JOIN dim_product dp ON fs.product_key = dp.product_key
GROUP BY dd.full_date, dd.year, dd.month_name, dc.region, dc.country, dp.category, dp.price_tier;
-- Create customer analytics view
CREATE VIEW customer_analytics AS
SELECT
dc.customer_key,
dc.full_name,
dc.email,
dc.city,
dc.country,
dc.region,
COUNT(fs.sales_key) as total_orders,
SUM(fs.quantity) as total_items_purchased,
SUM(fs.total_amount) as total_spent,
AVG(fs.total_amount) as avg_order_value,
MIN(dd.full_date) as first_order_date,
MAX(dd.full_date) as last_order_date,
MAX(dd.full_date) - MIN(dd.full_date) as customer_lifetime_days
FROM dim_customer dc
LEFT JOIN fact_sales fs ON dc.customer_key = fs.customer_key
LEFT JOIN dim_date dd ON fs.date_key = dd.date_key
GROUP BY dc.customer_key, dc.full_name, dc.email, dc.city, dc.country, dc.region;
-- Create product performance view
CREATE VIEW product_performance AS
SELECT
dp.product_key,
dp.product_name,
dp.category,
dp.price,
dp.price_tier,
COUNT(fs.sales_key) as times_sold,
SUM(fs.quantity) as total_quantity_sold,
SUM(fs.total_amount) as total_revenue,
AVG(fs.total_amount) as avg_sale_value,
SUM(fs.total_amount * fs.profit_margin / 100) as total_profit,
RANK() OVER (ORDER BY SUM(fs.total_amount) DESC) as revenue_rank,
RANK() OVER (PARTITION BY dp.category ORDER BY SUM(fs.total_amount) DESC) as category_rank
FROM dim_product dp
LEFT JOIN fact_sales fs ON dp.product_key = fs.product_key
GROUP BY dp.product_key, dp.product_name, dp.category, dp.price, dp.price_tier;
-- Add some useful comments
COMMENT ON TABLE dim_date IS 'Date dimension with comprehensive date attributes for time-based analysis';
COMMENT ON TABLE dim_customer IS 'Customer dimension with geographic groupings and derived attributes';
COMMENT ON TABLE dim_product IS 'Product dimension with category and price tier classifications';
COMMENT ON TABLE fact_sales IS 'Central fact table containing sales transactions with foreign keys to dimensions';
COMMENT ON VIEW sales_summary IS 'Aggregated sales data by date, region, and product category';
COMMENT ON VIEW customer_analytics IS 'Customer-level analytics including lifetime value and purchase patterns';
COMMENT ON VIEW product_performance IS 'Product performance metrics including revenue ranking';
-- Print success message
SELECT 'Star schema created successfully with ' ||
(SELECT COUNT(*) FROM dim_date) || ' date records, ' ||
(SELECT COUNT(*) FROM dim_customer) || ' customers, ' ||
(SELECT COUNT(*) FROM dim_product) || ' products, and ' ||
(SELECT COUNT(*) FROM fact_sales) || ' sales facts.' as result;