From 1c023369b345550154920d612fbaf47e79a0c0ab Mon Sep 17 00:00:00 2001
From: grabowski
Date: Sun, 28 Sep 2025 20:35:59 +0700
Subject: [PATCH] Implement intelligent data freshness detection for adaptive
 scheduler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add _check_data_freshness() method to detect stale vs. fresh data
- Consider data fresh only if the latest timestamp is within 2 hours
- Modify run_scraping_cycle() to check data freshness, not just existence
- Return False for stale data to trigger the adaptive scheduler's retry mode
- Add detailed logging for data age and freshness decisions

This solves the issue where the scheduler stayed in hourly mode despite
getting stale data from the API. It now correctly detects when the API
returns old data and switches to retry mode until fresh data becomes
available.

Example behavior:
- Fresh data (0.6 hours old): returns True, stays in hourly mode
- Stale data (68.6 hours old): returns False, switches to retry mode

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
 src/water_scraper_v3.py | 61 +++++++++++++++++++++++++++++++++++------
 1 file changed, 52 insertions(+), 9 deletions(-)

diff --git a/src/water_scraper_v3.py b/src/water_scraper_v3.py
index 7bab6d9..21b5b52 100644
--- a/src/water_scraper_v3.py
+++ b/src/water_scraper_v3.py
@@ -499,28 +499,71 @@ class EnhancedWaterMonitorScraper:
             logger.error(f"Error getting latest data: {e}")
             return []
 
+    def _check_data_freshness(self, water_data: List[Dict]) -> bool:
+        """Check if the fetched data contains recent/fresh timestamps"""
+        if not water_data:
+            return False
+
+        current_time = datetime.datetime.now()
+
+        # Find the most recent timestamp in the data
+        latest_timestamp = None
+        for data_point in water_data:
+            timestamp = data_point.get('timestamp')
+            if timestamp and (latest_timestamp is None or timestamp > latest_timestamp):
+                latest_timestamp = timestamp
+
+        if latest_timestamp is None:
+            logger.warning("No valid timestamps found in data")
+            return False
+
+        # Check if the latest data is within the last 2 hours
+        time_diff = current_time - latest_timestamp
+        hours_old = time_diff.total_seconds() / 3600
+
+        logger.info(f"Latest data timestamp: {latest_timestamp}, age: {hours_old:.1f} hours")
+
+        # Consider data fresh if it's less than 2 hours old
+        is_fresh = hours_old <= 2.0
+
+        if not is_fresh:
+            logger.warning(f"Data is stale ({hours_old:.1f} hours old), switching to retry mode")
+        else:
+            logger.info(f"Data is fresh ({hours_old:.1f} hours old)")
+
+        return is_fresh
+
     def run_scraping_cycle(self) -> bool:
-        """Run a complete scraping cycle"""
+        """Run a complete scraping cycle with freshness check"""
         logger.info("Starting scraping cycle...")
-        
+
         try:
             # Fetch current data
             water_data = self.fetch_water_data()
             if water_data:
-                success = self.save_to_database(water_data)
-                if success:
-                    logger.info("Scraping cycle completed successfully")
-                    increment_counter("scraping_cycles_successful")
-                    return True
+                # Check if data is fresh/recent
+                is_fresh = self._check_data_freshness(water_data)
+
+                if is_fresh:
+                    success = self.save_to_database(water_data)
+                    if success:
+                        logger.info("Scraping cycle completed successfully with fresh data")
+                        increment_counter("scraping_cycles_successful")
+                        return True
+                    else:
+                        logger.error("Failed to save data")
+                        increment_counter("scraping_cycles_failed")
+                        return False
                 else:
-                    logger.error("Failed to save data")
+                    # Data exists but is stale
+                    logger.warning("Data fetched but is stale - treating as no fresh data available")
                     increment_counter("scraping_cycles_failed")
                     return False
             else:
                 logger.warning("No data fetched")
                 increment_counter("scraping_cycles_failed")
                 return False
-            
+
         except Exception as e:
             logger.error(f"Scraping cycle failed: {e}")
             increment_counter("scraping_cycles_failed")