Skip to content

Advanced Problems

Focus: point-in-time correctness, intervals, many-to-many joins, advanced windows, and production-grade reasoning.

1. Build a point-in-time feature table for each prediction point with latest feature values.
WITH ranked_features AS (
	SELECT
		p.entity_id,
		p.prediction_ts,
		f.feature_name,
		f.feature_value,
		ROW_NUMBER() OVER (
			PARTITION BY p.entity_id, p.prediction_ts, f.feature_name
			ORDER BY f.snapshot_ts DESC
		) AS rn
	FROM prediction_points p
	LEFT JOIN feature_snapshots f
		ON f.entity_id = p.entity_id
		AND f.snapshot_ts <= p.prediction_ts
)
SELECT entity_id, prediction_ts, feature_name, feature_value
FROM ranked_features
WHERE rn = 1;

The left join preserves prediction points. Some engines require careful handling because rows with no feature produce one NULL feature row.

2. Generate fraud labels: chargeback within 30 days after payment.
SELECT
	p.payment_id,
	p.order_id,
	p.payment_ts AS prediction_ts,
	CASE
		WHEN EXISTS (
			SELECT 1
			FROM chargebacks c
			WHERE c.payment_id = p.payment_id
				AND c.chargeback_ts > p.payment_ts
				AND c.chargeback_ts <= p.payment_ts + INTERVAL '30' DAY
		)
		THEN 1
		ELSE 0
	END AS fraud_label
FROM payments p
WHERE p.status = 'succeeded';

Make sure chargeback data would not be used as a feature before the label window.

3. Sessionize events with a 30-minute inactivity threshold.
WITH ordered AS (
	SELECT
		user_id,
		event_ts,
		event_id,
		LAG(event_ts) OVER (
			PARTITION BY user_id
			ORDER BY event_ts, event_id
		) AS prev_event_ts
	FROM events
),
marked AS (
	SELECT
		*,
		CASE
			WHEN prev_event_ts IS NULL THEN 1
			WHEN event_ts > prev_event_ts + INTERVAL '30' MINUTE THEN 1
			ELSE 0
		END AS new_session_flag
	FROM ordered
),
numbered AS (
	SELECT
		*,
		SUM(new_session_flag) OVER (
			PARTITION BY user_id
			ORDER BY event_ts, event_id
			ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
		) AS session_number
	FROM marked
)
SELECT
	user_id,
	session_number,
	MIN(event_ts) AS started_at,
	MAX(event_ts) AS ended_at,
	COUNT(*) AS events
FROM numbered
GROUP BY user_id, session_number;
4. Find the longest streak of paid-order days per customer.
WITH paid_days AS (
	SELECT DISTINCT customer_id, order_day_number
	FROM orders
	WHERE status = 'paid'
),
numbered AS (
	SELECT
		customer_id,
		order_day_number,
		order_day_number - ROW_NUMBER() OVER (
			PARTITION BY customer_id
			ORDER BY order_day_number
		) AS island_key
	FROM paid_days
),
streaks AS (
	SELECT
		customer_id,
		MIN(order_day_number) AS start_day,
		MAX(order_day_number) AS end_day,
		COUNT(*) AS streak_days
	FROM numbered
	GROUP BY customer_id, island_key
)
SELECT customer_id, MAX(streak_days) AS longest_streak_days
FROM streaks
GROUP BY customer_id;

order_day_number is a portable stand-in for date arithmetic.

5. Attribute each purchase to the most recent marketing touch before purchase.
WITH candidates AS (
	SELECT
		o.order_id,
		o.customer_id,
		o.order_ts,
		t.touch_id,
		t.channel,
		ROW_NUMBER() OVER (
			PARTITION BY o.order_id
			ORDER BY t.touch_ts DESC, t.touch_id DESC
		) AS rn
	FROM orders o
	JOIN marketing_touches t
		ON t.customer_id = o.customer_id
		AND t.touch_ts <= o.order_ts
	WHERE o.status = 'paid'
)
SELECT order_id, customer_id, order_ts, touch_id, channel
FROM candidates
WHERE rn = 1;

Add an attribution lookback window if the business requires one.

6. Compute retained users by cohort month and age month.
WITH first_seen AS (
	SELECT user_id, MIN(event_month) AS cohort_month
	FROM events
	GROUP BY user_id
),
monthly_activity AS (
	SELECT DISTINCT user_id, event_month
	FROM events
),
cohort_activity AS (
	SELECT
		f.cohort_month,
		a.event_month,
		a.user_id
	FROM first_seen f
	JOIN monthly_activity a
		ON a.user_id = f.user_id
		AND a.event_month >= f.cohort_month
)
SELECT
	cohort_month,
	event_month,
	COUNT(*) AS retained_users
FROM cohort_activity
GROUP BY cohort_month, event_month;

Use engine-specific date diff to derive age_month.

7. Find overlapping subscriptions for the same customer.
SELECT
	a.customer_id,
	a.subscription_id AS subscription_a,
	b.subscription_id AS subscription_b
FROM subscriptions a
JOIN subscriptions b
	ON b.customer_id = a.customer_id
	AND b.subscription_id > a.subscription_id
	AND a.started_at < b.ended_at
	AND b.started_at < a.ended_at;

The overlap rule for half-open intervals is a.start < b.end AND b.start < a.end.

8. Detect many-to-many join amplification between orders and promotions.
WITH joined AS (
	SELECT
		o.order_id,
		COUNT(*) AS joined_rows
	FROM orders o
	JOIN promotions p
		ON p.customer_id = o.customer_id
		AND o.order_ts >= p.valid_from
		AND o.order_ts < p.valid_to
	GROUP BY o.order_id
)
SELECT *
FROM joined
WHERE joined_rows > 1
ORDER BY joined_rows DESC;

Then define tie-breaking or attribution rules before summing revenue.

9. Compute revenue percentile buckets per customer.

Portable exact percentile functions vary. Use ranking buckets as a generic approach.

WITH revenue AS (
	SELECT customer_id, SUM(total_amount) AS paid_revenue
	FROM orders
	WHERE status = 'paid'
	GROUP BY customer_id
),
bucketed AS (
	SELECT
		customer_id,
		paid_revenue,
		NTILE(10) OVER (ORDER BY paid_revenue) AS decile
	FROM revenue
)
SELECT decile, COUNT(*) AS customers, MIN(paid_revenue), MAX(paid_revenue)
FROM bucketed
GROUP BY decile;

NTILE creates roughly equal row-count buckets, not equal revenue ranges.

10. Build a confusion matrix by model version.
SELECT
	p.model_version,
	SUM(CASE WHEN p.score >= 0.5 AND l.label_value = 1 THEN 1 ELSE 0 END) AS tp,
	SUM(CASE WHEN p.score >= 0.5 AND l.label_value = 0 THEN 1 ELSE 0 END) AS fp,
	SUM(CASE WHEN p.score < 0.5 AND l.label_value = 1 THEN 1 ELSE 0 END) AS fn,
	SUM(CASE WHEN p.score < 0.5 AND l.label_value = 0 THEN 1 ELSE 0 END) AS tn
FROM predictions p
JOIN labels l
	ON l.entity_id = p.entity_id
	AND l.label_ts = p.prediction_ts
GROUP BY p.model_version;

Make the threshold a parameter in real evaluation.

11. Find model score drift by comparing current week to previous week buckets.
WITH bucketed AS (
	SELECT
		model_version,
		score_week,
		CASE
			WHEN score < 0.1 THEN '00-10'
			WHEN score < 0.2 THEN '10-20'
			WHEN score < 0.3 THEN '20-30'
			WHEN score < 0.4 THEN '30-40'
			WHEN score < 0.5 THEN '40-50'
			WHEN score < 0.6 THEN '50-60'
			WHEN score < 0.7 THEN '60-70'
			WHEN score < 0.8 THEN '70-80'
			WHEN score < 0.9 THEN '80-90'
			ELSE '90-100'
		END AS score_bucket,
		COUNT(*) AS predictions
	FROM predictions
	GROUP BY model_version, score_week, score_bucket
)
SELECT
	model_version,
	score_week,
	score_bucket,
	predictions,
	LAG(predictions) OVER (
		PARTITION BY model_version, score_bucket
		ORDER BY score_week
	) AS previous_predictions
FROM bucketed;

Normalize to proportions before alerting, because total traffic can change.

12. Compute exact median order value per country without a percentile function.

This uses row numbers and counts. It averages the two middle rows for even counts.

WITH ordered AS (
	SELECT
		c.country,
		o.total_amount,
		ROW_NUMBER() OVER (
			PARTITION BY c.country
			ORDER BY o.total_amount
		) AS rn,
		COUNT(*) OVER (
			PARTITION BY c.country
		) AS cnt
	FROM orders o
	JOIN customers c
		ON c.customer_id = o.customer_id
	WHERE o.status = 'paid'
)
SELECT
	country,
	AVG(total_amount) AS median_order_value
FROM ordered
WHERE rn IN ((cnt + 1) / 2, (cnt + 2) / 2)
GROUP BY country;

Integer division behavior varies by engine. In interviews, explain the intent if syntax needs adjustment.

13. For each entity, join a label to the nearest prediction before label time.
WITH candidates AS (
	SELECT
		l.entity_id,
		l.label_ts,
		l.label_value,
		p.prediction_ts,
		p.score,
		ROW_NUMBER() OVER (
			PARTITION BY l.entity_id, l.label_ts
			ORDER BY p.prediction_ts DESC
		) AS rn
	FROM labels l
	JOIN predictions p
		ON p.entity_id = l.entity_id
		AND p.prediction_ts <= l.label_ts
)
SELECT entity_id, label_ts, label_value, prediction_ts, score
FROM candidates
WHERE rn = 1;

This is an as-of join in the reverse direction.

14. Identify users whose purchase event appears before assignment to an experiment.
SELECT
	x.user_id,
	x.experiment_name,
	x.assigned_ts,
	MIN(e.event_ts) AS first_purchase_ts
FROM experiments x
JOIN events e
	ON e.user_id = x.user_id
	AND e.event_name = 'purchase'
GROUP BY x.user_id, x.experiment_name, x.assigned_ts
HAVING MIN(e.event_ts) < x.assigned_ts;

This can reveal experiment contamination or reused users from previous tests.

15. Create an order lifecycle table from event logs.
SELECT
	order_id,
	MIN(CASE WHEN event_name = 'created' THEN event_ts END) AS created_ts,
	MIN(CASE WHEN event_name = 'paid' THEN event_ts END) AS paid_ts,
	MIN(CASE WHEN event_name = 'shipped' THEN event_ts END) AS shipped_ts,
	MIN(CASE WHEN event_name = 'delivered' THEN event_ts END) AS delivered_ts,
	MIN(CASE WHEN event_name = 'cancelled' THEN event_ts END) AS cancelled_ts
FROM order_events
GROUP BY order_id;

Then add validation: paid after created, shipped after paid, delivered after shipped, cancelled conflicts with delivered unless business rules allow it.