Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/datajoint/autopopulate.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ def _populate_direct(
"""
from tqdm import tqdm

keys = (self._jobs_to_do(restrictions) - self).keys()
keys = (self._jobs_to_do(restrictions) - self.proj()).keys()

logger.debug("Found %d keys to populate" % len(keys))

Expand Down Expand Up @@ -701,7 +701,7 @@ def progress(self, *restrictions: Any, display: bool = False) -> tuple[int, int]
if not common_attrs:
# No common attributes - fall back to two-query method
total = len(todo)
remaining = len(todo - self)
remaining = len(todo - self.proj())
else:
# Build a single query that computes both total and remaining
# Using LEFT JOIN with COUNT(DISTINCT) to handle 1:many relationships
Expand Down
2 changes: 1 addition & 1 deletion src/datajoint/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def refresh(

# Keys that need jobs: in key_source, not in target, not in jobs
# Disable semantic_check for Job table (self) because its attributes may not have matching lineage
new_keys = (key_source - self._target).restrict(Not(self), semantic_check=False).proj()
new_keys = (key_source - self._target.proj()).restrict(Not(self), semantic_check=False).proj()
new_key_list = new_keys.keys()

if new_key_list:
Expand Down
124 changes: 124 additions & 0 deletions tests/integration/test_autopopulate.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,130 @@ def test_allow_insert(clean_autopopulate, subject, experiment):
experiment.insert1(key)


def test_populate_antijoin_with_secondary_attrs(clean_autopopulate, subject, experiment):
    """Check that pending-key computation via antijoin stays correct.

    A partial populate followed by ``key_source - target`` must report
    exactly the keys that have not yet been processed, and ``progress()``
    must agree with that count.  Note that ``Experiment.make()`` inserts
    ``fake_experiments_per_subject`` rows per key, so row counts are
    scaled accordingly.
    """
    # Preconditions: populated root table, empty target table.
    assert subject, "root tables are empty"
    assert not experiment, "table already filled?"

    total_keys = len(experiment.key_source)
    assert total_keys > 0

    # Process only the first two keys from key_source.
    experiment.populate(max_calls=2)
    expected_rows = 2 * experiment.fake_experiments_per_subject
    assert len(experiment) == expected_rows

    # The antijoin must exclude exactly the two populated keys.
    pending = experiment.key_source - experiment
    assert len(pending) == total_keys - 2, f"Antijoin returned {len(pending)} pending keys, expected {total_keys - 2}."

    # progress() must report the same counts as the antijoin.
    remaining, total = experiment.progress()
    assert total == total_keys
    assert remaining == total_keys - 2

    # After a full populate, nothing may remain pending.
    experiment.populate()
    pending_after = experiment.key_source - experiment
    assert len(pending_after) == 0, f"Antijoin returned {len(pending_after)} pending keys after full populate, expected 0."


def test_populate_antijoin_overlapping_attrs(prefix, connection_test):
    """Regression test: antijoin with overlapping secondary attribute names.

    This reproduces the bug where `key_source - self` returns ALL keys instead
    of just unpopulated ones. The condition is:

    1. key_source returns secondary attributes (e.g., num_samples, quality)
    2. The target table has secondary attributes with the SAME NAMES
    3. The VALUES differ between source and target after populate

    Without .proj() on the target, SQL matches on ALL common column names
    (including secondary attrs), so different values mean no match, and all
    keys appear "pending" even after populate.

    Real-world example: LightningPoseOutput (key_source) has num_frames,
    quality, processing_datetime as secondary attrs. InitialContainer (target)
    also has those same-named columns with different values.
    """
    # Dedicated throwaway schema so the test cannot interfere with others;
    # it is dropped in the finally block below.
    test_schema = dj.Schema(f"{prefix}_antijoin_overlap", connection=connection_test)

    @test_schema
    class Sensor(dj.Lookup):
        # Source table: one primary key plus two secondary attributes whose
        # names deliberately collide with the computed table's columns.
        definition = """
        sensor_id : int32
        ---
        num_samples : int32
        quality : decimal(4,2)
        """
        contents = [
            (1, 100, 0.95),
            (2, 200, 0.87),
            (3, 150, 0.92),
            (4, 175, 0.89),
        ]

    @test_schema
    class ProcessedSensor(dj.Computed):
        definition = """
        -> Sensor
        ---
        num_samples : int32 # same name as Sensor's secondary attr
        quality : decimal(4,2) # same name as Sensor's secondary attr
        result : decimal(8,2)
        """

        @property
        def key_source(self):
            # Unprojected key_source: carries the secondary attributes along
            # with sensor_id, which is what sets up the name collision.
            return Sensor()  # returns sensor_id + num_samples + quality

        def make(self, key):
            # Fetch source data (key only contains PK after projection)
            source = (Sensor() & key).fetch1()
            # Values intentionally differ from source — this is what triggers
            # the bug: the antijoin tries to match on num_samples and quality
            # too, and since values differ, no match is found.
            self.insert1(
                dict(
                    sensor_id=key["sensor_id"],
                    num_samples=source["num_samples"] * 2,
                    quality=float(source["quality"]) + 0.05,
                    result=float(source["num_samples"]) * float(source["quality"]),
                )
            )

    try:
        # Partially populate (2 out of 4)
        ProcessedSensor().populate(max_calls=2)
        assert len(ProcessedSensor()) == 2

        total_keys = len(ProcessedSensor().key_source)
        assert total_keys == 4

        # The critical test: populate() must correctly identify remaining keys.
        # Before the fix, populate() used `key_source - self` which matched on
        # num_samples and quality too, returning all 4 keys as "pending".
        ProcessedSensor().populate()
        assert len(ProcessedSensor()) == 4, (
            f"After full populate, expected 4 entries but got {len(ProcessedSensor())}. "
            f"populate() likely re-processed already-completed keys."
        )

        # Verify progress reports 0 remaining
        remaining, total = ProcessedSensor().progress()
        assert remaining == 0, f"Expected 0 remaining, got {remaining}"
        assert total == 4

        # Verify antijoin with .proj() is correct
        pending = ProcessedSensor().key_source - ProcessedSensor().proj()
        assert len(pending) == 0
    finally:
        # Always drop the schema, even when an assertion above fails.
        test_schema.drop(prompt=False)


def test_load_dependencies(prefix, connection_test):
schema = dj.Schema(f"{prefix}_load_dependencies_populate", connection=connection_test)

Expand Down
Loading