Implement intelligent data freshness detection for adaptive scheduler
- Add _check_data_freshness() method to detect stale vs fresh data
- Consider data fresh only if the latest timestamp is within 2 hours
- Modify run_scraping_cycle() to check data freshness, not just existence
- Return False for stale data to trigger adaptive scheduler retry mode
- Add detailed logging for data age and freshness decisions

This solves the issue where the scheduler stayed in hourly mode despite getting stale data from the API. Now it correctly detects when the API returns old data and switches to retry mode until fresh data becomes available.

Example behavior:
- Fresh data (0.6 hours old): Returns True, stays in hourly mode
- Stale data (68.6 hours old): Returns False, switches to retry mode

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
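For context, a minimal sketch of the scheduler-side loop that consumes this boolean; the adaptive scheduler itself is not part of this diff, so the function name schedule_loop and the interval values are illustrative assumptions:

import time

# Hypothetical consumer of run_scraping_cycle(); the real adaptive scheduler
# lives outside this diff, so these names and intervals are assumptions.
HOURLY_SECONDS = 3600   # normal cadence while data stays fresh
RETRY_SECONDS = 600     # tighter cadence while waiting for fresh data

def schedule_loop(scraper):
    while True:
        got_fresh_data = scraper.run_scraping_cycle()
        # True (fresh data saved) keeps the hourly cadence; False (stale,
        # missing, or unsaved data) switches to retry mode until fresh
        # data becomes available.
        time.sleep(HOURLY_SECONDS if got_fresh_data else RETRY_SECONDS)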
@@ -499,28 +499,71 @@ class EnhancedWaterMonitorScraper:
             logger.error(f"Error getting latest data: {e}")
             return []
 
+    def _check_data_freshness(self, water_data: List[Dict]) -> bool:
+        """Check if the fetched data contains recent/fresh timestamps"""
+        if not water_data:
+            return False
+
+        current_time = datetime.datetime.now()
+
+        # Find the most recent timestamp in the data
+        latest_timestamp = None
+        for data_point in water_data:
+            timestamp = data_point.get('timestamp')
+            if timestamp and (latest_timestamp is None or timestamp > latest_timestamp):
+                latest_timestamp = timestamp
+
+        if latest_timestamp is None:
+            logger.warning("No valid timestamps found in data")
+            return False
+
+        # Check if the latest data is within the last 2 hours
+        time_diff = current_time - latest_timestamp
+        hours_old = time_diff.total_seconds() / 3600
+
+        logger.info(f"Latest data timestamp: {latest_timestamp}, age: {hours_old:.1f} hours")
+
+        # Consider data fresh if it's less than 2 hours old
+        is_fresh = hours_old <= 2.0
+
+        if not is_fresh:
+            logger.warning(f"Data is stale ({hours_old:.1f} hours old), switching to retry mode")
+        else:
+            logger.info(f"Data is fresh ({hours_old:.1f} hours old)")
+
+        return is_fresh
+
     def run_scraping_cycle(self) -> bool:
-        """Run a complete scraping cycle"""
+        """Run a complete scraping cycle with freshness check"""
         logger.info("Starting scraping cycle...")
 
         try:
             # Fetch current data
             water_data = self.fetch_water_data()
             if water_data:
-                success = self.save_to_database(water_data)
-                if success:
-                    logger.info("Scraping cycle completed successfully")
-                    increment_counter("scraping_cycles_successful")
-                    return True
+                # Check if data is fresh/recent
+                is_fresh = self._check_data_freshness(water_data)
+
+                if is_fresh:
+                    success = self.save_to_database(water_data)
+                    if success:
+                        logger.info("Scraping cycle completed successfully with fresh data")
+                        increment_counter("scraping_cycles_successful")
+                        return True
+                    else:
+                        logger.error("Failed to save data")
+                        increment_counter("scraping_cycles_failed")
+                        return False
                 else:
-                    logger.error("Failed to save data")
+                    # Data exists but is stale
+                    logger.warning("Data fetched but is stale - treating as no fresh data available")
                     increment_counter("scraping_cycles_failed")
                     return False
             else:
                 logger.warning("No data fetched")
                 increment_counter("scraping_cycles_failed")
                 return False
 
         except Exception as e:
             logger.error(f"Scraping cycle failed: {e}")
             increment_counter("scraping_cycles_failed")
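To see the 2-hour cutoff in action, here is a self-contained sketch reproducing the example behavior from the commit message; it assumes timestamps are naive datetime objects, as the method's subtraction against datetime.datetime.now() implies:

import datetime

def hours_old(ts: datetime.datetime, now: datetime.datetime) -> float:
    # Same age computation as _check_data_freshness()
    return (now - ts).total_seconds() / 3600

now = datetime.datetime.now()
fresh_ts = now - datetime.timedelta(hours=0.6)   # 0.6 hours old
stale_ts = now - datetime.timedelta(hours=68.6)  # 68.6 hours old

print(hours_old(fresh_ts, now) <= 2.0)  # True  -> stays in hourly mode
print(hours_old(stale_ts, now) <= 2.0)  # False -> switches to retry mode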