Features:
- Real-time water level monitoring for Ping River Basin (16 stations)
- Coverage from Chiang Dao to Nakhon Sawan in Northern Thailand
- FastAPI web interface with interactive dashboard and station management
- Multi-database support (SQLite, MySQL, PostgreSQL, InfluxDB, VictoriaMetrics)
- Comprehensive monitoring with health checks and metrics collection
- Docker deployment with Grafana integration
- Production-ready architecture with enterprise-grade observability

CI/CD & Automation:
- Complete Gitea Actions workflows for CI/CD, security, and releases
- Multi-Python version testing (3.9-3.12)
- Multi-architecture Docker builds (amd64, arm64)
- Daily security scanning and dependency monitoring
- Automated documentation generation
- Performance testing and validation

Production Ready:
- Type safety with Pydantic models and comprehensive type hints
- Data validation layer with range checking and error handling
- Rate limiting and request tracking for API protection
- Enhanced logging with rotation, colors, and performance metrics
- Station management API for dynamic CRUD operations
- Comprehensive documentation and deployment guides

Technical Stack:
- Python 3.9+ with FastAPI and Pydantic
- Multi-database architecture with adapter pattern
- Docker containerization with multi-stage builds
- Grafana dashboards for visualization
- Gitea Actions for CI/CD automation
- Enterprise monitoring and alerting

Ready for deployment to B4L infrastructure!
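The multi-database support is driven by a single configuration dictionary whose 'type' key selects the backend adapter; the remaining keys are forwarded to the adapter factory. Below is a minimal usage sketch against the default SQLite backend, mirroring the standalone `__main__` block at the end of the scraper file (the module name `scraper` is an assumption; other backends would swap the 'type' value and supply their own connection settings):

```python
# Minimal sketch: configure the scraper and run one scraping cycle against SQLite.
# `scraper` as the import path is an assumption; adjust to the actual module name.
from scraper import EnhancedWaterMonitorScraper

db_config = {
    'type': 'sqlite',                                  # popped by init_database() to pick the adapter
    'connection_string': 'sqlite:///water_levels.db',  # remaining keys go to create_database_adapter()
}

monitor = EnhancedWaterMonitorScraper(db_config)
if monitor.run_scraping_cycle():          # fetch today's hourly data, validate, save
    print(monitor.get_latest_data(limit=5))
```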
539 lines
22 KiB
Python
#!/usr/bin/env python3
"""
Enhanced Water Monitor Scraper with multiple database backend support
"""

import requests
import datetime
import time
import schedule
import json
import os
from typing import List, Dict, Optional

try:
    from .database_adapters import create_database_adapter, DatabaseAdapter
    from .models import WaterMeasurement, StationInfo, ScrapingResult, StationStatus
    from .validators import DataValidator
    from .exceptions import APIConnectionError, DataValidationError, DatabaseConnectionError
    from .metrics import increment_counter, set_gauge, record_histogram, Timer
    from .rate_limiter import RateLimiter, RequestTracker
    from .logging_config import get_logger
except ImportError:
    # Handle case when running as standalone script
    from database_adapters import create_database_adapter, DatabaseAdapter
    import logging

    def get_logger(name):
        return logging.getLogger(name)

    def increment_counter(*args, **kwargs):
        pass

    def set_gauge(*args, **kwargs):
        pass

    def record_histogram(*args, **kwargs):
        pass

    class Timer:
        def __init__(self, *args, **kwargs):
            pass

        def __enter__(self):
            return self

        def __exit__(self, *args):
            pass

    class RateLimiter:
        def __init__(self, *args, **kwargs):
            pass

        def wait_if_needed(self):
            pass

    class RequestTracker:
        def __init__(self):
            pass

        def record_request(self, *args, **kwargs):
            pass

    class DataValidator:
        @staticmethod
        def validate_measurements(measurements):
            return measurements

# Get logger instance
logger = get_logger(__name__)


class EnhancedWaterMonitorScraper:
    def __init__(self, db_config: Dict):
        """
        Initialize scraper with database configuration

        Args:
            db_config: Database configuration dictionary
        """
        self.api_url = "https://hyd-app-db.rid.go.th/webservice/getGroupHourlyWaterLevelReportAllHL.ashx"
        self.db_config = db_config.copy()  # Make a copy to avoid modifying original
        self.db_adapter = None

        # Scheduler state tracking
        self.last_successful_update = None
        self.retry_mode = False
        self.next_hourly_check = None

        # Rate limiting and request tracking
        self.rate_limiter = RateLimiter(max_requests=10, time_window_seconds=60)
        self.request_tracker = RequestTracker()

        # HTTP session for API requests
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'X-Requested-With': 'XMLHttpRequest'
        })

        # Station mapping with correct names and geolocation data
        self.station_mapping = {
            '1': {'code': 'P.20', 'thai_name': 'บ้านเชียงดาว', 'english_name': 'Ban Chiang Dao',
                  'latitude': 19.36731448032191, 'longitude': 98.9688487015384, 'geohash': None},
            '2': {'code': 'P.75', 'thai_name': 'บ้านช่อแล', 'english_name': 'Ban Chai Lat',
                  'latitude': 19.145972935976225, 'longitude': 99.00735727149247, 'geohash': None},
            '3': {'code': 'P.92', 'thai_name': 'บ้านเมืองกึ๊ด', 'english_name': 'Ban Muang Aut',
                  'latitude': 19.220518985435646, 'longitude': 98.84733127007874, 'geohash': None},
            '4': {'code': 'P.4A', 'thai_name': 'บ้านแม่แตง', 'english_name': 'Ban Mae Taeng',
                  'latitude': 19.1222679952378, 'longitude': 98.94437462084075, 'geohash': None},
            '5': {'code': 'P.67', 'thai_name': 'บ้านแม่แต', 'english_name': 'Ban Tae',
                  'latitude': 19.009762080002453, 'longitude': 98.95978297135508, 'geohash': None},
            '6': {'code': 'P.21', 'thai_name': 'บ้านริมใต้', 'english_name': 'Ban Rim Tai',
                  'latitude': 18.917459157963293, 'longitude': 98.97018092996231, 'geohash': None},
            '7': {'code': 'P.103', 'thai_name': 'สะพานวงแหวนรอบ 3', 'english_name': 'Ring Bridge 3',
                  'latitude': 18.86664807441675, 'longitude': 98.9781107622432, 'geohash': None},
            '8': {'code': 'P.1', 'thai_name': 'สะพานนวรัฐ', 'english_name': 'Nawarat Bridge',
                  'latitude': 18.7875, 'longitude': 99.0045, 'geohash': 'w5q6uuhvfcfp25'},
            '9': {'code': 'P.82', 'thai_name': 'บ้านสบวิน', 'english_name': 'Ban Sob win',
                  'latitude': 18.6519444, 'longitude': 98.69, 'geohash': None},
            '10': {'code': 'P.84', 'thai_name': 'บ้านพันตน', 'english_name': 'Ban Panton',
                   'latitude': 18.591315274591334, 'longitude': 98.79657058508496, 'geohash': None},
            '11': {'code': 'P.81', 'thai_name': 'บ้านโป่ง', 'english_name': 'Ban Pong',
                   'latitude': 13.805661820610888, 'longitude': 99.87174946122846, 'geohash': None},
            '12': {'code': 'P.5', 'thai_name': 'สะพานท่านาง', 'english_name': 'Tha Nang Bridge',
                   'latitude': 18.580269437546555, 'longitude': 99.01021397084362, 'geohash': None},
            '13': {'code': 'P.77', 'thai_name': 'บ้านสบแม่สะป๊วด', 'english_name': 'Baan Sop Mae Sapuord',
                   'latitude': 18.433347475179602, 'longitude': 99.08510036666527, 'geohash': None},
            '14': {'code': 'P.87', 'thai_name': 'บ้านป่าซาง', 'english_name': 'Ban Pa Sang',
                   'latitude': 18.519121825282486, 'longitude': 98.94224374138238, 'geohash': None},
            '15': {'code': 'P.76', 'thai_name': 'บ้านแม่อีไฮ', 'english_name': 'Ban Mae I Hai',
                   'latitude': 18.141465831254404, 'longitude': 98.89642508267181, 'geohash': None},
            '16': {'code': 'P.85', 'thai_name': 'บ้านหล่ายแก้ว', 'english_name': 'Baan Lai Kaew',
                   'latitude': 18.17856361002219, 'longitude': 98.63023114782287, 'geohash': None}
        }

        self.init_database()

    def init_database(self):
        """Initialize database connection"""
        try:
            # Extract db_type and pass remaining config as kwargs
            db_config_copy = self.db_config.copy()
            db_type = db_config_copy.pop('type')
            self.db_adapter = create_database_adapter(db_type, **db_config_copy)
            success = self.db_adapter.connect()

            if success:
                logger.info(f"Successfully connected to {db_type.upper()} database")
                set_gauge("database_connected", 1)
                increment_counter("database_connections_successful")
            else:
                logger.error(f"Failed to connect to {db_type.upper()} database")
                set_gauge("database_connected", 0)
                increment_counter("database_connections_failed")

        except Exception as e:
            logger.error(f"Error initializing database: {e}")
            set_gauge("database_connected", 0)
            increment_counter("database_connections_failed")
            self.db_adapter = None

    def fetch_water_data_for_date(self, target_date: datetime.datetime) -> Optional[List[Dict]]:
        """Fetch water levels and discharge data from API for a specific date"""
        with Timer("api_request_duration"):
            try:
                logger.info(f"Starting data fetch from API for date: {target_date.strftime('%Y-%m-%d')}")

                # Rate limiting
                self.rate_limiter.wait_if_needed()

                # Create Thai format date (Buddhist calendar)
                thai_year = target_date.year + 543
                thai_date = f"{target_date.day:02d}/{target_date.month:02d}/{thai_year}"

                # API parameters
                payload = {
                    'DW[UtokID]': '1',
                    'DW[BasinID]': '6',
                    'DW[TimeCurrent]': thai_date,
                    '_search': 'false',
                    'nd': str(int(time.time() * 1000)),
                    'rows': '100',
                    'page': '1',
                    'sidx': 'indexhourly',
                    'sord': 'asc'
                }

                logger.debug(f"API parameters: {payload}")

                # POST request to API
                start_time = time.time()
                response = self.session.post(self.api_url, data=payload, timeout=30)
                response_time = time.time() - start_time

                response.raise_for_status()

                # Record successful request
                self.request_tracker.record_request(True, response_time)
                increment_counter("api_requests_successful")
                record_histogram("api_response_time", response_time)

                # Parse JSON response
                try:
                    json_data = response.json()
                    logger.debug(f"API response received: {len(str(json_data))} characters")
                except ValueError as e:
                    logger.error(f"Error parsing JSON response: {e}")
                    self.request_tracker.record_request(False, response_time, "json_parse_error")
                    increment_counter("api_requests_failed")
                    return None

                water_data = []

                # Parse JSON data
                if json_data and isinstance(json_data, dict) and 'rows' in json_data:
                    for row in json_data['rows']:
                        try:
                            # Parse timestamp
                            time_str = row.get('hourlytime', '')
                            if not time_str:
                                continue

                            try:
                                # Format: "1.00", "2.00", ..., "24.00"
                                api_hour = int(float(time_str))
                                if api_hour < 1 or api_hour > 24:
                                    continue

                                if api_hour == 24:
                                    # Hour 24 = midnight (00:00) of the next day
                                    data_time = target_date.replace(hour=0, minute=0, second=0, microsecond=0)
                                    data_time = data_time + datetime.timedelta(days=1)
                                else:
                                    # Hours 1-23 = 01:00-23:00 of the same day
                                    data_time = target_date.replace(hour=api_hour, minute=0, second=0, microsecond=0)

                            except (ValueError, IndexError):
                                logger.warning(f"Could not parse timestamp: {time_str}")
                                continue

                            # Parse all water levels and discharge values
                            station_count = 0
                            for station_num in range(1, 17):  # Stations 1-16
                                wl_key = f'wlvalues{station_num}'
                                q_key = f'qvalues{station_num}'
                                qp_key = f'QPercent{station_num}'

                                # Check if both water level and discharge data exist
                                if wl_key in row and q_key in row:
                                    try:
                                        water_level = row[wl_key]
                                        discharge = row[q_key]
                                        discharge_percent = row.get(qp_key)

                                        # Skip if values are None or invalid
                                        if water_level is None or discharge is None:
                                            continue

                                        # Convert to float
                                        water_level = float(water_level)
                                        discharge = float(discharge)
                                        discharge_percent = float(discharge_percent) if discharge_percent is not None else None

                                        station_info = self.station_mapping.get(str(station_num), {
                                            'code': f'P.{19+station_num}',
                                            'thai_name': f'Station {station_num}',
                                            'english_name': f'Station {station_num}'
                                        })

                                        water_data.append({
                                            'timestamp': data_time,
                                            'station_id': station_num,
                                            'station_code': station_info['code'],
                                            'station_name_en': station_info['english_name'],
                                            'station_name_th': station_info['thai_name'],
                                            'latitude': station_info.get('latitude'),
                                            'longitude': station_info.get('longitude'),
                                            'geohash': station_info.get('geohash'),
                                            'water_level': water_level,
                                            'water_level_unit': 'm',
                                            'discharge': discharge,
                                            'discharge_unit': 'cms',
                                            'discharge_percent': discharge_percent,
                                            'status': 'active'
                                        })

                                        station_count += 1

                                    except (ValueError, TypeError) as e:
                                        logger.warning(f"Could not parse data for station {station_num}: {e}")
                                        continue

                            logger.debug(f"Processed {station_count} stations for time {time_str}")

                        except Exception as e:
                            logger.warning(f"Error processing data row: {e}")
                            continue

                # Validate data
                water_data = DataValidator.validate_measurements(water_data)

                logger.info(f"Successfully fetched {len(water_data)} data points from API for {target_date.strftime('%Y-%m-%d')}")
                return water_data

            except requests.RequestException as e:
                logger.error(f"Network error fetching API data: {e}")
                self.request_tracker.record_request(False, 0, "network_error")
                increment_counter("api_requests_failed")
                return None
            except Exception as e:
                logger.error(f"Unexpected error fetching API data: {e}")
                self.request_tracker.record_request(False, 0, "unexpected_error")
                increment_counter("api_requests_failed")
                return None

    def fetch_water_data(self) -> Optional[List[Dict]]:
        """Fetch water levels and discharge data from API for current date"""
        current_date = datetime.datetime.now()
        return self.fetch_water_data_for_date(current_date)

    def save_to_database(self, water_data: List[Dict], max_retries: int = 3) -> bool:
        """Save water measurements to database with retry logic"""
        if not self.db_adapter:
            logger.error("Database adapter not initialized")
            return False

        if not water_data:
            logger.warning("No data to save")
            return False

        for attempt in range(max_retries):
            try:
                success = self.db_adapter.save_measurements(water_data)
                if success:
                    logger.info(f"Successfully saved {len(water_data)} measurements to database")
                    increment_counter("database_saves_successful")
                    set_gauge("last_save_timestamp", time.time())
                    return True
                else:
                    logger.warning(f"Save attempt {attempt + 1} failed, retrying...")

            except Exception as e:
                if "database is locked" in str(e).lower() and attempt < max_retries - 1:
                    logger.warning(f"Database locked on attempt {attempt + 1}, retrying in {2 ** attempt} seconds...")
                    time.sleep(2 ** attempt)  # Exponential backoff
                    continue
                else:
                    logger.error(f"Error saving to database (attempt {attempt + 1}): {e}")
                    if attempt == max_retries - 1:
                        increment_counter("database_saves_failed")
                        return False

        return False

    def get_latest_data(self, limit: int = 100) -> List[Dict]:
        """Get latest data from database"""
        if not self.db_adapter:
            return []

        try:
            return self.db_adapter.get_latest_measurements(limit=limit)
        except Exception as e:
            logger.error(f"Error getting latest data: {e}")
            return []

    def run_scraping_cycle(self) -> bool:
        """Run a complete scraping cycle"""
        logger.info("Starting scraping cycle...")

        try:
            # Fetch current data
            water_data = self.fetch_water_data()
            if water_data:
                success = self.save_to_database(water_data)
                if success:
                    logger.info("Scraping cycle completed successfully")
                    increment_counter("scraping_cycles_successful")
                    return True
                else:
                    logger.error("Failed to save data")
                    increment_counter("scraping_cycles_failed")
                    return False
            else:
                logger.warning("No data fetched")
                increment_counter("scraping_cycles_failed")
                return False

        except Exception as e:
            logger.error(f"Scraping cycle failed: {e}")
            increment_counter("scraping_cycles_failed")
            return False


# Main execution for standalone usage
if __name__ == "__main__":
    import argparse
    import sys

    # Configure basic logging for standalone usage
    import logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('water_monitor.log'),
            logging.StreamHandler()
        ]
    )

    parser = argparse.ArgumentParser(description="Thailand Water Monitor")
    parser.add_argument("--test", action="store_true", help="Run single test cycle")
    args = parser.parse_args()

    # Default SQLite configuration
    db_config = {
        'type': 'sqlite',
        'connection_string': 'sqlite:///water_levels.db'
    }

    try:
        scraper = EnhancedWaterMonitorScraper(db_config)

        if args.test:
            logger.info("Running test cycle...")
            result = scraper.run_scraping_cycle()
            if result:
                logger.info("✅ Test completed successfully")
                sys.exit(0)
            else:
                logger.error("❌ Test failed")
                sys.exit(1)
        else:
            logger.info("Starting continuous monitoring...")
            schedule.every(1).hours.do(scraper.run_scraping_cycle)

            # Run initial cycle
            scraper.run_scraping_cycle()

            while True:
                schedule.run_pending()
                time.sleep(60)

    except KeyboardInterrupt:
        logger.info("Monitoring stopped by user")
    except Exception as e:
        logger.error(f"Error: {e}")
        sys.exit(1)