From dff4dd067d756c130c07059a8e3449578ce70bc4 Mon Sep 17 00:00:00 2001 From: grabowski Date: Sun, 28 Sep 2025 21:03:03 +0700 Subject: [PATCH] Implement strict freshness detection without grace periods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove tolerance windows and grace periods from data freshness checks - Require data from current hour only - no exceptions or fallbacks - If hourly check runs at 21:xx but only has data up to 20:xx, immediately switch to retry mode - Simplify logic: latest_hour >= current_hour for fresh data - Remove complex age calculations and tolerance conditions This ensures the scheduler immediately detects when new hourly data is not yet available and switches to minute-based retries without delay. Behavior: - 21:02 with data up to 21:xx → Fresh (continue hourly) - 21:02 with data up to 20:xx → Stale (immediate retry mode) - No grace periods, no tolerance windows, strict hour-based detection 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- src/water_scraper_v3.py | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/src/water_scraper_v3.py b/src/water_scraper_v3.py index 6f3f5ee..02c6426 100644 --- a/src/water_scraper_v3.py +++ b/src/water_scraper_v3.py @@ -500,7 +500,7 @@ class EnhancedWaterMonitorScraper: return [] def _check_data_freshness(self, water_data: List[Dict]) -> bool: - """Check if the fetched data contains expected current hour data""" + """Check if the fetched data contains new data for the current hour""" if not water_data: return False @@ -520,37 +520,22 @@ class EnhancedWaterMonitorScraper: latest_hour = latest_timestamp.hour time_diff = current_time - latest_timestamp - hours_old = time_diff.total_seconds() / 3600 + minutes_old = time_diff.total_seconds() / 60 logger.info(f"Current time: {current_time.strftime('%H:%M')}, Latest data: {latest_timestamp.strftime('%H:%M')}") - logger.info(f"Current hour: {current_hour}, Latest data hour: {latest_hour}, Age: {hours_old:.1f} hours") + logger.info(f"Current hour: {current_hour}, Latest data hour: {latest_hour}, Age: {minutes_old:.1f} minutes") - # Check if we have data for the current hour or the previous hour - # If it's 20:00 and we only have data up to 19:xx, that's stale - expected_hour = current_hour - has_current_hour = latest_hour >= expected_hour + # Strict check: we need data from the current hour + # If it's 20:xx and we only have data up to 19:xx, that's stale - go to retry mode + has_current_hour_data = latest_hour >= current_hour - # Allow some tolerance: if it's early in the hour (first 10 minutes), - # accept data from the previous hour - if current_time.minute <= 10 and latest_hour == (current_hour - 1): - has_current_hour = True - logger.info(f"Early in hour {current_hour}, accepting previous hour {latest_hour} data") - - # Also check that data isn't too old (backup check) - not_too_old = hours_old <= 2.0 - - is_fresh = has_current_hour and not_too_old - - if not is_fresh: - if not has_current_hour: - logger.warning(f"Missing current hour data - expected hour {expected_hour}, got {latest_hour}") - if not not_too_old: - logger.warning(f"Data is too old ({hours_old:.1f} hours)") - logger.warning("Data is stale, switching to retry mode") + if not has_current_hour_data: + logger.warning(f"No new data available - expected hour {current_hour}, got {latest_hour}") + logger.warning("Switching to retry mode until new data becomes available") + return False else: - logger.info(f"Data is fresh - has current/recent hour data") - - return is_fresh + logger.info(f"Fresh data available for current hour {current_hour}") + return True def run_scraping_cycle(self) -> bool: """Run a complete scraping cycle with freshness check"""