ML Feature and Label Drills
Assume every example starts from prediction_points(entity_id, prediction_ts) unless stated otherwise.
1. Build a 30-day order-count feature before prediction time.
SELECT
p.entity_id,
p.prediction_ts,
COUNT(o.order_id) AS orders_30d
FROM prediction_points p
LEFT JOIN orders o
ON o.customer_id = p.entity_id
AND o.status = 'paid'
AND o.order_ts < p.prediction_ts
AND o.order_ts >= p.prediction_ts - INTERVAL '30' DAY
GROUP BY p.entity_id, p.prediction_ts;
The strict < prediction_ts boundary helps prevent leakage.
2. Build a 7-day revenue feature before prediction time.
SELECT
p.entity_id,
p.prediction_ts,
COALESCE(SUM(o.total_amount), 0) AS revenue_7d
FROM prediction_points p
LEFT JOIN orders o
ON o.customer_id = p.entity_id
AND o.status = 'paid'
AND o.order_ts < p.prediction_ts
AND o.order_ts >= p.prediction_ts - INTERVAL '7' DAY
GROUP BY p.entity_id, p.prediction_ts;
3. Join the latest feature snapshot as of prediction time.
WITH ranked AS (
SELECT
p.entity_id,
p.prediction_ts,
f.feature_name,
f.feature_value,
ROW_NUMBER() OVER (
PARTITION BY p.entity_id, p.prediction_ts, f.feature_name
ORDER BY f.snapshot_ts DESC
) AS rn
FROM prediction_points p
LEFT JOIN feature_snapshots f
ON f.entity_id = p.entity_id
AND f.snapshot_ts <= p.prediction_ts
)
SELECT entity_id, prediction_ts, feature_name, feature_value
FROM ranked
WHERE rn = 1;
4. Detect leaked feature rows where availability is after prediction time.
SELECT feature_name, COUNT(*) AS leaked_rows
FROM training_features
WHERE feature_available_ts > prediction_ts
GROUP BY feature_name
HAVING COUNT(*) > 0;
This should return no rows.
5. Generate a churn label: no event from day 7 through day 37 after prediction.
SELECT
p.entity_id,
p.prediction_ts,
CASE
WHEN EXISTS (
SELECT 1
FROM events e
WHERE e.user_id = p.entity_id
AND e.event_ts >= p.prediction_ts + INTERVAL '7' DAY
AND e.event_ts < p.prediction_ts + INTERVAL '37' DAY
)
THEN 0 ELSE 1
END AS churn_label
FROM prediction_points p;
The 7-day gap can avoid partially observed behavior.
6. Generate a fraud label: chargeback within 60 days after payment.
SELECT
p.payment_id,
p.payment_ts AS prediction_ts,
CASE
WHEN EXISTS (
SELECT 1
FROM chargebacks c
WHERE c.payment_id = p.payment_id
AND c.chargeback_ts > p.payment_ts
AND c.chargeback_ts <= p.payment_ts + INTERVAL '60' DAY
)
THEN 1 ELSE 0
END AS fraud_label
FROM payments p
WHERE p.status = 'succeeded';
7. Create a time-based train, validation, test split.
SELECT
*,
CASE
WHEN prediction_ts < TIMESTAMP '2026-04-01 00:00:00' THEN 'train'
WHEN prediction_ts < TIMESTAMP '2026-05-01 00:00:00' THEN 'validation'
ELSE 'test'
END AS split
FROM training_examples;
Time splits reduce future leakage compared with random splits.
8. Compute missingness rate per feature.
SELECT
feature_name,
COUNT(*) AS rows,
SUM(CASE WHEN feature_value IS NULL THEN 1 ELSE 0 END) AS null_rows,
SUM(CASE WHEN feature_value IS NULL THEN 1 ELSE 0 END) * 1.0 / NULLIF(COUNT(*), 0) AS null_rate
FROM training_features
GROUP BY feature_name;
9. Compute label prevalence by split.
SELECT
split,
COUNT(*) AS examples,
AVG(label_value) AS positive_rate
FROM training_examples
GROUP BY split;
Big prevalence shifts can indicate split, sampling, or population issues.
10. Build a confusion matrix at threshold 0.7.
SELECT
SUM(CASE WHEN score >= 0.7 AND label_value = 1 THEN 1 ELSE 0 END) AS tp,
SUM(CASE WHEN score >= 0.7 AND label_value = 0 THEN 1 ELSE 0 END) AS fp,
SUM(CASE WHEN score < 0.7 AND label_value = 1 THEN 1 ELSE 0 END) AS fn,
SUM(CASE WHEN score < 0.7 AND label_value = 0 THEN 1 ELSE 0 END) AS tn
FROM scored_examples;
11. Compute precision and recall from scored examples.
WITH cm AS (
SELECT
SUM(CASE WHEN score >= 0.7 AND label_value = 1 THEN 1 ELSE 0 END) AS tp,
SUM(CASE WHEN score >= 0.7 AND label_value = 0 THEN 1 ELSE 0 END) AS fp,
SUM(CASE WHEN score < 0.7 AND label_value = 1 THEN 1 ELSE 0 END) AS fn
FROM scored_examples
)
SELECT
tp * 1.0 / NULLIF(tp + fp, 0) AS precision,
tp * 1.0 / NULLIF(tp + fn, 0) AS recall
FROM cm;
12. Create calibration buckets by score decile.
WITH bucketed AS (
SELECT
score,
label_value,
NTILE(10) OVER (ORDER BY score) AS score_decile
FROM scored_examples
)
SELECT
score_decile,
COUNT(*) AS examples,
AVG(score) AS avg_score,
AVG(label_value) AS observed_rate
FROM bucketed
GROUP BY score_decile
ORDER BY score_decile;
13. Detect training-serving skew for one feature.
SELECT
COALESCE(t.entity_id, s.entity_id) AS entity_id,
t.feature_value AS training_value,
s.feature_value AS serving_value
FROM training_feature_values t
FULL OUTER JOIN serving_feature_values s
ON s.entity_id = t.entity_id
AND s.feature_name = t.feature_name
AND s.prediction_ts = t.prediction_ts
WHERE t.feature_name = 'orders_30d'
AND (
t.feature_value <> s.feature_value
OR t.feature_value IS NULL
OR s.feature_value IS NULL
);
In production, compare rates and distributions, not only row-level diffs.
14. Build a feature: days since last paid order.
WITH last_order AS (
SELECT
p.entity_id,
p.prediction_ts,
MAX(o.order_ts) AS last_paid_order_ts
FROM prediction_points p
LEFT JOIN orders o
ON o.customer_id = p.entity_id
AND o.status = 'paid'
AND o.order_ts < p.prediction_ts
GROUP BY p.entity_id, p.prediction_ts
)
SELECT entity_id, prediction_ts, prediction_ts - last_paid_order_ts AS time_since_last_paid_order
FROM last_order;
Use engine-specific timestamp difference syntax.
15. Build a feature: number of distinct active days in the last 14 days.
SELECT
p.entity_id,
p.prediction_ts,
COUNT(DISTINCT e.event_date) AS active_days_14d
FROM prediction_points p
LEFT JOIN events e
ON e.user_id = p.entity_id
AND e.event_ts < p.prediction_ts
AND e.event_ts >= p.prediction_ts - INTERVAL '14' DAY
GROUP BY p.entity_id, p.prediction_ts;
16. Build negatives by sampling entities without a positive label.
SELECT p.entity_id, p.prediction_ts, 0 AS label_value
FROM prediction_points p
WHERE NOT EXISTS (
SELECT 1
FROM positive_labels l
WHERE l.entity_id = p.entity_id
AND l.label_ts = p.prediction_ts
);
Actual sampling syntax is engine-dependent; first define the eligible negative population.
17. Detect entities appearing in both train and test splits.
SELECT entity_id
FROM training_examples
GROUP BY entity_id
HAVING COUNT(DISTINCT split) > 1;
This is not always wrong for temporal splits, but it is a leakage risk depending on the problem.
18. Compute prediction volume by model version and day.
SELECT
prediction_date,
model_version,
COUNT(*) AS predictions
FROM predictions
GROUP BY prediction_date, model_version;
Useful for rollout monitoring and detecting mixed-version periods.
19. Compare average score by model version for the same entity set.
WITH common_entities AS (
SELECT entity_id
FROM predictions
WHERE model_version IN ('old', 'new')
GROUP BY entity_id
HAVING COUNT(DISTINCT model_version) = 2
)
SELECT p.model_version, AVG(p.score) AS avg_score
FROM predictions p
JOIN common_entities c
ON c.entity_id = p.entity_id
WHERE p.model_version IN ('old', 'new')
GROUP BY p.model_version;
Compare on the same population to avoid mix shifts.
20. Build a label only if the full outcome window has matured.
SELECT *
FROM prediction_points
WHERE prediction_ts + INTERVAL '30' DAY <= TIMESTAMP '2026-07-01 00:00:00';
Replace the fixed timestamp with the dataset cutoff. Do not train on labels whose outcome window is incomplete.
21. Detect features computed from events after prediction time.
SELECT feature_name, COUNT(*) AS bad_rows
FROM feature_lineage
WHERE source_event_ts >= prediction_ts
GROUP BY feature_name
HAVING COUNT(*) > 0;
This assumes feature lineage stores the max source event timestamp used by each feature row.
22. Compute A/B conversion using model eligibility at prediction time.
WITH eligible AS (
SELECT entity_id, prediction_ts, variant
FROM experiment_predictions
WHERE is_eligible = 1
),
converted AS (
SELECT DISTINCT e.entity_id, e.prediction_ts
FROM eligible e
JOIN events v
ON v.user_id = e.entity_id
AND v.event_name = 'purchase'
AND v.event_ts >= e.prediction_ts
AND v.event_ts < e.prediction_ts + INTERVAL '7' DAY
)
SELECT
e.variant,
COUNT(*) AS eligible_rows,
COUNT(c.entity_id) AS converted_rows,
COUNT(c.entity_id) * 1.0 / NULLIF(COUNT(*), 0) AS conversion_rate
FROM eligible e
LEFT JOIN converted c
ON c.entity_id = e.entity_id
AND c.prediction_ts = e.prediction_ts
GROUP BY e.variant;
23. Find feature rows with duplicate entity, timestamp, feature name.
SELECT entity_id, prediction_ts, feature_name, COUNT(*) AS rows
FROM training_features
GROUP BY entity_id, prediction_ts, feature_name
HAVING COUNT(*) > 1;
The feature table should have a clear uniqueness contract.
24. Explain why random row splits are risky for recommender training data.
The same user or item can appear in train and test, and future interactions can leak into the training representation. Use temporal splits and consider user/item cold-start evaluation slices.
25. State a minimum feature table contract.
At minimum: entity ID, prediction timestamp, feature name/value or wide columns, feature availability timestamp, feature version, source data cutoff, owner, null policy, and tests for uniqueness and leakage.