Skip to content

ML Feature and Label Drills

Assume every example starts from prediction_points(entity_id, prediction_ts) unless stated otherwise.

1. Build a 30-day order-count feature before prediction time.
SELECT
	p.entity_id,
	p.prediction_ts,
	COUNT(o.order_id) AS orders_30d
FROM prediction_points p
LEFT JOIN orders o
	ON o.customer_id = p.entity_id
	AND o.status = 'paid'
	AND o.order_ts < p.prediction_ts
	AND o.order_ts >= p.prediction_ts - INTERVAL '30' DAY
GROUP BY p.entity_id, p.prediction_ts;

The strict < prediction_ts boundary helps prevent leakage.

2. Build a 7-day revenue feature before prediction time.
SELECT
	p.entity_id,
	p.prediction_ts,
	COALESCE(SUM(o.total_amount), 0) AS revenue_7d
FROM prediction_points p
LEFT JOIN orders o
	ON o.customer_id = p.entity_id
	AND o.status = 'paid'
	AND o.order_ts < p.prediction_ts
	AND o.order_ts >= p.prediction_ts - INTERVAL '7' DAY
GROUP BY p.entity_id, p.prediction_ts;
3. Join the latest feature snapshot as of prediction time.
WITH ranked AS (
	SELECT
		p.entity_id,
		p.prediction_ts,
		f.feature_name,
		f.feature_value,
		ROW_NUMBER() OVER (
			PARTITION BY p.entity_id, p.prediction_ts, f.feature_name
			ORDER BY f.snapshot_ts DESC
		) AS rn
	FROM prediction_points p
	LEFT JOIN feature_snapshots f
		ON f.entity_id = p.entity_id
		AND f.snapshot_ts <= p.prediction_ts
)
SELECT entity_id, prediction_ts, feature_name, feature_value
FROM ranked
WHERE rn = 1;
4. Detect leaked feature rows where availability is after prediction time.
SELECT feature_name, COUNT(*) AS leaked_rows
FROM training_features
WHERE feature_available_ts > prediction_ts
GROUP BY feature_name
HAVING COUNT(*) > 0;

This should return no rows.

5. Generate a churn label: no event from day 7 through day 37 after prediction.
SELECT
	p.entity_id,
	p.prediction_ts,
	CASE
		WHEN EXISTS (
			SELECT 1
			FROM events e
			WHERE e.user_id = p.entity_id
				AND e.event_ts >= p.prediction_ts + INTERVAL '7' DAY
				AND e.event_ts < p.prediction_ts + INTERVAL '37' DAY
		)
		THEN 0 ELSE 1
	END AS churn_label
FROM prediction_points p;

The 7-day gap can avoid partially observed behavior.

6. Generate a fraud label: chargeback within 60 days after payment.
SELECT
	p.payment_id,
	p.payment_ts AS prediction_ts,
	CASE
		WHEN EXISTS (
			SELECT 1
			FROM chargebacks c
			WHERE c.payment_id = p.payment_id
				AND c.chargeback_ts > p.payment_ts
				AND c.chargeback_ts <= p.payment_ts + INTERVAL '60' DAY
		)
		THEN 1 ELSE 0
	END AS fraud_label
FROM payments p
WHERE p.status = 'succeeded';
7. Create a time-based train, validation, test split.
SELECT
	*,
	CASE
		WHEN prediction_ts < TIMESTAMP '2026-04-01 00:00:00' THEN 'train'
		WHEN prediction_ts < TIMESTAMP '2026-05-01 00:00:00' THEN 'validation'
		ELSE 'test'
	END AS split
FROM training_examples;

Time splits reduce future leakage compared with random splits.

8. Compute missingness rate per feature.
SELECT
	feature_name,
	COUNT(*) AS rows,
	SUM(CASE WHEN feature_value IS NULL THEN 1 ELSE 0 END) AS null_rows,
	SUM(CASE WHEN feature_value IS NULL THEN 1 ELSE 0 END) * 1.0 / NULLIF(COUNT(*), 0) AS null_rate
FROM training_features
GROUP BY feature_name;
9. Compute label prevalence by split.
SELECT
	split,
	COUNT(*) AS examples,
	AVG(label_value) AS positive_rate
FROM training_examples
GROUP BY split;

Big prevalence shifts can indicate split, sampling, or population issues.

10. Build a confusion matrix at threshold 0.7.
SELECT
	SUM(CASE WHEN score >= 0.7 AND label_value = 1 THEN 1 ELSE 0 END) AS tp,
	SUM(CASE WHEN score >= 0.7 AND label_value = 0 THEN 1 ELSE 0 END) AS fp,
	SUM(CASE WHEN score < 0.7 AND label_value = 1 THEN 1 ELSE 0 END) AS fn,
	SUM(CASE WHEN score < 0.7 AND label_value = 0 THEN 1 ELSE 0 END) AS tn
FROM scored_examples;
11. Compute precision and recall from scored examples.
WITH cm AS (
	SELECT
		SUM(CASE WHEN score >= 0.7 AND label_value = 1 THEN 1 ELSE 0 END) AS tp,
		SUM(CASE WHEN score >= 0.7 AND label_value = 0 THEN 1 ELSE 0 END) AS fp,
		SUM(CASE WHEN score < 0.7 AND label_value = 1 THEN 1 ELSE 0 END) AS fn
	FROM scored_examples
)
SELECT
	tp * 1.0 / NULLIF(tp + fp, 0) AS precision,
	tp * 1.0 / NULLIF(tp + fn, 0) AS recall
FROM cm;
12. Create calibration buckets by score decile.
WITH bucketed AS (
	SELECT
		score,
		label_value,
		NTILE(10) OVER (ORDER BY score) AS score_decile
	FROM scored_examples
)
SELECT
	score_decile,
	COUNT(*) AS examples,
	AVG(score) AS avg_score,
	AVG(label_value) AS observed_rate
FROM bucketed
GROUP BY score_decile
ORDER BY score_decile;
13. Detect training-serving skew for one feature.
SELECT
	COALESCE(t.entity_id, s.entity_id) AS entity_id,
	t.feature_value AS training_value,
	s.feature_value AS serving_value
FROM training_feature_values t
FULL OUTER JOIN serving_feature_values s
	ON s.entity_id = t.entity_id
	AND s.feature_name = t.feature_name
	AND s.prediction_ts = t.prediction_ts
WHERE t.feature_name = 'orders_30d'
	AND (
		t.feature_value <> s.feature_value
		OR t.feature_value IS NULL
		OR s.feature_value IS NULL
	);

In production, compare rates and distributions, not only row-level diffs.

14. Build a feature: days since last paid order.
WITH last_order AS (
	SELECT
		p.entity_id,
		p.prediction_ts,
		MAX(o.order_ts) AS last_paid_order_ts
	FROM prediction_points p
	LEFT JOIN orders o
		ON o.customer_id = p.entity_id
		AND o.status = 'paid'
		AND o.order_ts < p.prediction_ts
	GROUP BY p.entity_id, p.prediction_ts
)
SELECT entity_id, prediction_ts, prediction_ts - last_paid_order_ts AS time_since_last_paid_order
FROM last_order;

Use engine-specific timestamp difference syntax.

15. Build a feature: number of distinct active days in the last 14 days.
SELECT
	p.entity_id,
	p.prediction_ts,
	COUNT(DISTINCT e.event_date) AS active_days_14d
FROM prediction_points p
LEFT JOIN events e
	ON e.user_id = p.entity_id
	AND e.event_ts < p.prediction_ts
	AND e.event_ts >= p.prediction_ts - INTERVAL '14' DAY
GROUP BY p.entity_id, p.prediction_ts;
16. Build negatives by sampling entities without a positive label.
SELECT p.entity_id, p.prediction_ts, 0 AS label_value
FROM prediction_points p
WHERE NOT EXISTS (
	SELECT 1
	FROM positive_labels l
	WHERE l.entity_id = p.entity_id
		AND l.label_ts = p.prediction_ts
);

Actual sampling syntax is engine-dependent; first define the eligible negative population.

17. Detect entities appearing in both train and test splits.
SELECT entity_id
FROM training_examples
GROUP BY entity_id
HAVING COUNT(DISTINCT split) > 1;

This is not always wrong for temporal splits, but it is a leakage risk depending on the problem.

18. Compute prediction volume by model version and day.
SELECT
	prediction_date,
	model_version,
	COUNT(*) AS predictions
FROM predictions
GROUP BY prediction_date, model_version;

Useful for rollout monitoring and detecting mixed-version periods.

19. Compare average score by model version for the same entity set.
WITH common_entities AS (
	SELECT entity_id
	FROM predictions
	WHERE model_version IN ('old', 'new')
	GROUP BY entity_id
	HAVING COUNT(DISTINCT model_version) = 2
)
SELECT p.model_version, AVG(p.score) AS avg_score
FROM predictions p
JOIN common_entities c
	ON c.entity_id = p.entity_id
WHERE p.model_version IN ('old', 'new')
GROUP BY p.model_version;

Compare on the same population to avoid mix shifts.

20. Build a label only if the full outcome window has matured.
SELECT *
FROM prediction_points
WHERE prediction_ts + INTERVAL '30' DAY <= TIMESTAMP '2026-07-01 00:00:00';

Replace the fixed timestamp with the dataset cutoff. Do not train on labels whose outcome window is incomplete.

21. Detect features computed from events after prediction time.
SELECT feature_name, COUNT(*) AS bad_rows
FROM feature_lineage
WHERE source_event_ts >= prediction_ts
GROUP BY feature_name
HAVING COUNT(*) > 0;

This assumes feature lineage stores the max source event timestamp used by each feature row.

22. Compute A/B conversion using model eligibility at prediction time.
WITH eligible AS (
	SELECT entity_id, prediction_ts, variant
	FROM experiment_predictions
	WHERE is_eligible = 1
),
converted AS (
	SELECT DISTINCT e.entity_id, e.prediction_ts
	FROM eligible e
	JOIN events v
		ON v.user_id = e.entity_id
		AND v.event_name = 'purchase'
		AND v.event_ts >= e.prediction_ts
		AND v.event_ts < e.prediction_ts + INTERVAL '7' DAY
)
SELECT
	e.variant,
	COUNT(*) AS eligible_rows,
	COUNT(c.entity_id) AS converted_rows,
	COUNT(c.entity_id) * 1.0 / NULLIF(COUNT(*), 0) AS conversion_rate
FROM eligible e
LEFT JOIN converted c
	ON c.entity_id = e.entity_id
	AND c.prediction_ts = e.prediction_ts
GROUP BY e.variant;
23. Find feature rows with duplicate entity, timestamp, feature name.
SELECT entity_id, prediction_ts, feature_name, COUNT(*) AS rows
FROM training_features
GROUP BY entity_id, prediction_ts, feature_name
HAVING COUNT(*) > 1;

The feature table should have a clear uniqueness contract.

24. Explain why random row splits are risky for recommender training data.

The same user or item can appear in train and test, and future interactions can leak into the training representation. Use temporal splits and consider user/item cold-start evaluation slices.

25. State a minimum feature table contract.

At minimum: entity ID, prediction timestamp, feature name/value or wide columns, feature availability timestamp, feature version, source data cutoff, owner, null policy, and tests for uniqueness and leakage.