-
Notifications
You must be signed in to change notification settings - Fork 14
feat(jobs): make NATS queue activity visible in async ML job logs #1222
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 2 commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
fb74cb5
feat(nats): surface queue lifecycle events on the per-job logger
mihow 7e8ce28
fix(nats): bridge job-logger mirroring through sync_to_async
mihow 439300a
fix(nats): narrow consumer_info exception + stop forwarding module lo…
mihow 9007197
refactor(jobs): simplify cleanup_async_job_resources to match codebas…
mihow f8d09ac
refactor(nats): read consumer config from ConsumerInfo instead of har…
mihow 4c62d34
refactor(nats): skip redundant NATS calls after first ensure, fix enu…
mihow 0d3160c
fix(nats): preserve traceback on publish failure + defensive stream s…
mihow b06ec28
refactor(jobs): resolve job logger internally in cleanup_async_job_re…
mihow 3257c77
refactor(jobs): rename _log to job_logger to avoid name collision wit…
mihow ae51c21
chore(nats): tidy up minor code smells from self-review
mihow ed7bf56
refactor(nats): unify log fan-out behind manager.log_async
mihow ebfdbb7
docs(nats): note future intent to route, not mirror, lifecycle logs
mihow b5e9a87
docs(nats): correct safety claim for _ensure_stream/_ensure_consumer …
mihow 766137c
perf(nats): gate log_async on isEnabledFor before sync_to_async mirror
mihow File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,7 @@ | |
| import logging | ||
|
|
||
| import nats | ||
| from asgiref.sync import sync_to_async | ||
| from django.conf import settings | ||
| from nats.js import JetStreamContext | ||
| from nats.js.api import AckPolicy, ConsumerConfig, DeliverPolicy | ||
|
|
@@ -54,21 +55,75 @@ class TaskQueueManager: | |
| nats_url: NATS server URL. Falls back to settings.NATS_URL, then "nats://nats:4222". | ||
| max_ack_pending: Max unacknowledged messages per consumer. Falls back to | ||
| settings.NATS_MAX_ACK_PENDING, then 1000. | ||
| job_logger: Optional per-job logger. When set, lifecycle events (stream / | ||
| consumer create or reuse, cleanup stats, publish failures) are mirrored | ||
| to this logger in addition to the module logger, so they appear in the | ||
| job's own log stream as seen from the UI. Per-message and per-poll | ||
| events stay on the module logger only to avoid drowning large jobs. | ||
|
|
||
| Use as an async context manager: | ||
| async with TaskQueueManager() as manager: | ||
| async with TaskQueueManager(job_logger=job.logger) as manager: | ||
| await manager.publish_task(123, {'data': 'value'}) | ||
| tasks = await manager.reserve_tasks(123, count=64) | ||
| await manager.acknowledge_task(tasks[0].reply_subject) | ||
| """ | ||
|
|
||
| def __init__(self, nats_url: str | None = None, max_ack_pending: int | None = None): | ||
| def __init__( | ||
| self, | ||
| nats_url: str | None = None, | ||
| max_ack_pending: int | None = None, | ||
| job_logger: logging.Logger | None = None, | ||
| ): | ||
| self.nats_url = nats_url or getattr(settings, "NATS_URL", "nats://nats:4222") | ||
| self.max_ack_pending = ( | ||
| max_ack_pending if max_ack_pending is not None else getattr(settings, "NATS_MAX_ACK_PENDING", 1000) | ||
| ) | ||
| self.job_logger = job_logger | ||
| self.nc: nats.NATS | None = None | ||
| self.js: JetStreamContext | None = None | ||
| # Dedupe lifecycle log lines per manager session so a job that publishes | ||
| # hundreds of tasks doesn't emit hundreds of "reusing stream" messages. | ||
| self._streams_logged: set[int] = set() | ||
| self._consumers_logged: set[int] = set() | ||
|
|
||
| async def _log(self, level: int, msg: str) -> None: | ||
| """Log to both the module logger and the job logger (if set). | ||
|
|
||
| Module logger fires synchronously (ops dashboards / stdout / New Relic | ||
| are unaffected). The job logger call is bridged through | ||
| ``sync_to_async`` because Django's ``JobLogHandler`` does an ORM | ||
| ``refresh_from_db`` + ``save`` on every emit — calling that directly | ||
| from the event loop raises ``SynchronousOnlyOperation`` and the log | ||
| line is silently dropped. The bridge offloads the handler work to a | ||
| thread so the line actually lands in ``job.logs.stdout``. | ||
|
|
||
| Exceptions from the job logger are swallowed so logging a lifecycle | ||
| event never breaks the actual NATS operation. | ||
| """ | ||
| logger.log(level, msg) | ||
| if self.job_logger is not None and self.job_logger is not logger: | ||
| try: | ||
| await sync_to_async(self.job_logger.log)(level, msg) | ||
| except Exception as e: | ||
| logger.warning(f"Failed to mirror log to job logger: {e}") | ||
|
|
||
| @staticmethod | ||
| def _format_consumer_stats(info) -> str: | ||
| """Format ConsumerInfo into a compact stats string. | ||
|
|
||
| All nats-py ConsumerInfo fields are Optional, so defensive access is | ||
| required: this method renders missing values as '?'. Used for both | ||
| reuse-announcements and forensic cleanup lines. | ||
| """ | ||
| delivered = info.delivered.consumer_seq if info.delivered is not None else "?" | ||
| ack_floor = info.ack_floor.consumer_seq if info.ack_floor is not None else "?" | ||
| return ( | ||
| f"delivered={delivered} " | ||
| f"ack_floor={ack_floor} " | ||
| f"num_pending={info.num_pending if info.num_pending is not None else '?'} " | ||
| f"num_ack_pending={info.num_ack_pending if info.num_ack_pending is not None else '?'} " | ||
| f"num_redelivered={info.num_redelivered if info.num_redelivered is not None else '?'}" | ||
| ) | ||
|
|
||
| async def __aenter__(self): | ||
| """Create connection on enter.""" | ||
|
|
@@ -127,27 +182,52 @@ async def _stream_exists(self, stream_name: str) -> bool: | |
| return False | ||
|
|
||
| async def _ensure_stream(self, job_id: int): | ||
| """Ensure stream exists for the given job.""" | ||
| """Ensure stream exists for the given job. | ||
|
|
||
| Logs a lifecycle line to both the module and job logger the first time it | ||
| sees a given job in this manager session (creation or reuse). Subsequent | ||
| calls in the same session are silent, so a job publishing N images doesn't | ||
| emit N log lines. | ||
| """ | ||
| if self.js is None: | ||
| raise RuntimeError("Connection is not open. Use TaskQueueManager as an async context manager.") | ||
|
|
||
| if not await self._job_stream_exists(job_id): | ||
| stream_name = self._get_stream_name(job_id) | ||
| subject = self._get_subject(job_id) | ||
| logger.warning(f"Stream {stream_name} does not exist") | ||
| # Stream doesn't exist, create it | ||
| await asyncio.wait_for( | ||
| self.js.add_stream( | ||
| name=stream_name, | ||
| subjects=[subject], | ||
| max_age=86400, # 24 hours retention | ||
| ), | ||
| timeout=NATS_JETSTREAM_TIMEOUT, | ||
| ) | ||
| logger.info(f"Created stream {stream_name}") | ||
| stream_name = self._get_stream_name(job_id) | ||
| subject = self._get_subject(job_id) | ||
|
|
||
| try: | ||
| info = await asyncio.wait_for(self.js.stream_info(stream_name), timeout=NATS_JETSTREAM_TIMEOUT) | ||
| if job_id not in self._streams_logged: | ||
| await self._log( | ||
| logging.INFO, | ||
| f"Reusing NATS stream {stream_name} " | ||
| f"(messages={info.state.messages}, last_seq={info.state.last_seq})", | ||
| ) | ||
| self._streams_logged.add(job_id) | ||
| return | ||
| except nats.js.errors.NotFoundError: | ||
| pass | ||
|
|
||
| await asyncio.wait_for( | ||
| self.js.add_stream( | ||
| name=stream_name, | ||
| subjects=[subject], | ||
| max_age=86400, # 24 hours retention | ||
| ), | ||
| timeout=NATS_JETSTREAM_TIMEOUT, | ||
| ) | ||
| await self._log(logging.INFO, f"Created NATS stream {stream_name}") | ||
| self._streams_logged.add(job_id) | ||
|
|
||
| async def _ensure_consumer(self, job_id: int): | ||
| """Ensure consumer exists for the given job.""" | ||
| """Ensure consumer exists for the given job. | ||
|
|
||
| On first sight in this manager session (creation or reuse), emits a line | ||
| to both the module and job logger. On creation the line includes the | ||
| config snapshot (max_deliver, ack_wait, max_ack_pending, deliver_policy, | ||
| ack_policy) so forensic readers can see exactly what delivery semantics | ||
| were in effect. | ||
| """ | ||
| if self.js is None: | ||
| raise RuntimeError("Connection is not open. Use TaskQueueManager as an async context manager.") | ||
|
|
||
|
|
@@ -160,27 +240,43 @@ async def _ensure_consumer(self, job_id: int): | |
| self.js.consumer_info(stream_name, consumer_name), | ||
| timeout=NATS_JETSTREAM_TIMEOUT, | ||
| ) | ||
| logger.debug(f"Consumer {consumer_name} already exists: {info}") | ||
| if job_id not in self._consumers_logged: | ||
| await self._log( | ||
| logging.INFO, | ||
| f"Reusing NATS consumer {consumer_name} ({self._format_consumer_stats(info)})", | ||
| ) | ||
| self._consumers_logged.add(job_id) | ||
| return | ||
| except asyncio.TimeoutError: | ||
| raise # NATS unreachable — let caller handle it | ||
| except Exception: | ||
| # Consumer doesn't exist, create it | ||
| await asyncio.wait_for( | ||
| self.js.add_consumer( | ||
| stream=stream_name, | ||
| config=ConsumerConfig( | ||
| durable_name=consumer_name, | ||
| ack_policy=AckPolicy.EXPLICIT, | ||
| ack_wait=TASK_TTR, # Visibility timeout (TTR) | ||
| max_deliver=5, # Max retry attempts | ||
| deliver_policy=DeliverPolicy.ALL, | ||
| max_ack_pending=self.max_ack_pending, | ||
| filter_subject=subject, | ||
| ), | ||
| # Consumer doesn't exist, fall through to create it. | ||
| pass | ||
|
mihow marked this conversation as resolved.
|
||
|
|
||
| await asyncio.wait_for( | ||
| self.js.add_consumer( | ||
| stream=stream_name, | ||
| config=ConsumerConfig( | ||
| durable_name=consumer_name, | ||
| ack_policy=AckPolicy.EXPLICIT, | ||
| ack_wait=TASK_TTR, # Visibility timeout (TTR) | ||
| max_deliver=5, # Max retry attempts | ||
| deliver_policy=DeliverPolicy.ALL, | ||
| max_ack_pending=self.max_ack_pending, | ||
| filter_subject=subject, | ||
| ), | ||
| timeout=NATS_JETSTREAM_TIMEOUT, | ||
| ) | ||
| logger.info(f"Created consumer {consumer_name}") | ||
| ), | ||
| timeout=NATS_JETSTREAM_TIMEOUT, | ||
| ) | ||
| await self._log( | ||
| logging.INFO, | ||
| f"Created NATS consumer {consumer_name} " | ||
| f"(max_deliver=5, ack_wait={TASK_TTR}s, " | ||
|
mihow marked this conversation as resolved.
Outdated
|
||
| f"max_ack_pending={self.max_ack_pending}, " | ||
| f"deliver_policy={DeliverPolicy.ALL.value}, " | ||
| f"ack_policy={AckPolicy.EXPLICIT.value})", | ||
| ) | ||
| self._consumers_logged.add(job_id) | ||
|
|
||
| async def publish_task(self, job_id: int, data: PipelineProcessingTask) -> bool: | ||
| """ | ||
|
|
@@ -212,7 +308,10 @@ async def publish_task(self, job_id: int, data: PipelineProcessingTask) -> bool: | |
| return True | ||
|
|
||
| except Exception as e: | ||
| logger.error(f"Failed to publish task to stream for job '{job_id}': {e}") | ||
| # Per-message success logs stay at module level (noise in 10k-image | ||
| # jobs), but a failure on even a single publish deserves to surface | ||
| # in the job log — otherwise the failure path is invisible to users. | ||
| await self._log(logging.ERROR, f"Failed to publish task to stream for job '{job_id}': {e}") | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Very helpful! |
||
| return False | ||
|
|
||
| async def reserve_tasks(self, job_id: int, count: int, timeout: float = 5) -> list[PipelineProcessingTask]: | ||
|
|
@@ -292,6 +391,31 @@ async def acknowledge_task(self, reply_subject: str) -> bool: | |
| logger.error(f"Failed to acknowledge task: {e}") | ||
| return False | ||
|
|
||
| async def _log_final_consumer_stats(self, job_id: int) -> None: | ||
| """Log one forensic line about the consumer state before deletion. | ||
|
|
||
| This is the single most useful line in a post-mortem: it tells you how | ||
| many messages were delivered, how many were acked, and how many were | ||
| redelivered before the consumer vanished. Failures here must NOT block | ||
| cleanup — if the consumer or stream is already gone, just skip it. | ||
| """ | ||
| if self.js is None: | ||
| return | ||
| stream_name = self._get_stream_name(job_id) | ||
| consumer_name = self._get_consumer_name(job_id) | ||
| try: | ||
| info = await asyncio.wait_for( | ||
| self.js.consumer_info(stream_name, consumer_name), | ||
| timeout=NATS_JETSTREAM_TIMEOUT, | ||
| ) | ||
| except Exception as e: | ||
| logger.debug(f"Could not fetch consumer info for {consumer_name} before deletion: {e}") | ||
| return | ||
| await self._log( | ||
| logging.INFO, | ||
| f"Finalizing NATS consumer {consumer_name} before deletion " f"({self._format_consumer_stats(info)})", | ||
| ) | ||
|
|
||
| async def delete_consumer(self, job_id: int) -> bool: | ||
| """ | ||
| Delete the consumer for a job. | ||
|
|
@@ -313,10 +437,10 @@ async def delete_consumer(self, job_id: int) -> bool: | |
| self.js.delete_consumer(stream_name, consumer_name), | ||
| timeout=NATS_JETSTREAM_TIMEOUT, | ||
| ) | ||
| logger.info(f"Deleted consumer {consumer_name} for job '{job_id}'") | ||
| await self._log(logging.INFO, f"Deleted NATS consumer {consumer_name} for job '{job_id}'") | ||
| return True | ||
| except Exception as e: | ||
| logger.error(f"Failed to delete consumer for job '{job_id}': {e}") | ||
| await self._log(logging.ERROR, f"Failed to delete NATS consumer for job '{job_id}': {e}") | ||
| return False | ||
|
|
||
| async def delete_stream(self, job_id: int) -> bool: | ||
|
|
@@ -339,10 +463,10 @@ async def delete_stream(self, job_id: int) -> bool: | |
| self.js.delete_stream(stream_name), | ||
| timeout=NATS_JETSTREAM_TIMEOUT, | ||
| ) | ||
| logger.info(f"Deleted stream {stream_name} for job '{job_id}'") | ||
| await self._log(logging.INFO, f"Deleted NATS stream {stream_name} for job '{job_id}'") | ||
| return True | ||
| except Exception as e: | ||
| logger.error(f"Failed to delete stream for job '{job_id}': {e}") | ||
| await self._log(logging.ERROR, f"Failed to delete NATS stream for job '{job_id}': {e}") | ||
| return False | ||
|
|
||
| async def _setup_advisory_stream(self): | ||
|
|
@@ -482,6 +606,10 @@ async def cleanup_job_resources(self, job_id: int) -> bool: | |
| Returns: | ||
| bool: True if successful, False otherwise | ||
| """ | ||
| # Log a forensic snapshot of the consumer state BEFORE we destroy it. | ||
| # This is the highest-leverage line for post-mortem investigations. | ||
| await self._log_final_consumer_stats(job_id) | ||
|
|
||
| # Delete consumer first, then stream, then the durable DLQ advisory consumer | ||
| consumer_deleted = await self.delete_consumer(job_id) | ||
| stream_deleted = await self.delete_stream(job_id) | ||
|
|
||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.