Major refactor: Migrate to uv, add PostgreSQL support, and comprehensive tooling

- **Migration to uv package manager**: Replace pip/requirements with modern pyproject.toml
  - Add pyproject.toml with complete dependency management
  - Update all scripts and Makefile to use uv commands
  - Maintain backward compatibility with existing workflows

- **PostgreSQL integration and migration tools**:
  - Enhanced config.py with automatic password URL encoding
  - Complete PostgreSQL setup scripts and documentation
  - High-performance SQLite to PostgreSQL migration tool (91x speed improvement)
  - Support for both connection strings and individual components

- **Executable distribution system**:
  - PyInstaller integration for standalone .exe creation
  - Automated build scripts with batch file generation
  - Complete packaging system for end-user distribution

- **Enhanced data management**:
  - Fix --fill-gaps command with proper method implementation
  - Add gap detection and historical data backfill capabilities
  - Implement data update functionality for existing records
  - Add comprehensive database adapter methods

- **Developer experience improvements**:
  - Password encoding tools for special characters
  - Interactive setup wizards for PostgreSQL configuration
  - Comprehensive documentation and migration guides
  - Automated testing and validation tools

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-09-26 15:10:10 +07:00
parent 730cbac7ae
commit 6c7c128b4d
21 changed files with 6339 additions and 31 deletions

View File

@@ -1,6 +1,14 @@
import os
from typing import Dict, Any, Optional
# Load environment variables from .env file
try:
from dotenv import load_dotenv
load_dotenv()
except ImportError:
# python-dotenv not installed, continue without it
pass
try:
from .exceptions import ConfigurationError
from .models import DatabaseType, DatabaseConfig
@@ -49,6 +57,11 @@ class Config:
# PostgreSQL settings
POSTGRES_CONNECTION_STRING = os.getenv('POSTGRES_CONNECTION_STRING')
POSTGRES_HOST = os.getenv('POSTGRES_HOST', 'localhost')
POSTGRES_PORT = int(os.getenv('POSTGRES_PORT', '5432'))
POSTGRES_DB = os.getenv('POSTGRES_DB', 'water_monitoring')
POSTGRES_USER = os.getenv('POSTGRES_USER', 'postgres')
POSTGRES_PASSWORD = os.getenv('POSTGRES_PASSWORD')
# MySQL settings
MYSQL_CONNECTION_STRING = os.getenv('MYSQL_CONNECTION_STRING')
@@ -93,10 +106,21 @@ class Config:
errors.append("INFLUX_DATABASE is required for InfluxDB")
elif cls.DB_TYPE in ['postgresql', 'mysql']:
connection_string = (cls.POSTGRES_CONNECTION_STRING if cls.DB_TYPE == 'postgresql'
else cls.MYSQL_CONNECTION_STRING)
if not connection_string:
errors.append(f"Connection string is required for {cls.DB_TYPE.upper()}")
if cls.DB_TYPE == 'postgresql':
# Check if either connection string or individual components are provided
if not cls.POSTGRES_CONNECTION_STRING:
# If no connection string, check individual components
if not cls.POSTGRES_HOST:
errors.append("POSTGRES_HOST is required for PostgreSQL")
if not cls.POSTGRES_USER:
errors.append("POSTGRES_USER is required for PostgreSQL")
if not cls.POSTGRES_PASSWORD:
errors.append("POSTGRES_PASSWORD is required for PostgreSQL")
if not cls.POSTGRES_DB:
errors.append("POSTGRES_DB is required for PostgreSQL")
else: # mysql
if not cls.MYSQL_CONNECTION_STRING:
errors.append("MYSQL_CONNECTION_STRING is required for MySQL")
# Validate numeric settings
if cls.SCRAPING_INTERVAL_HOURS <= 0:
@@ -129,11 +153,21 @@ class Config:
'password': cls.INFLUX_PASSWORD
}
elif cls.DB_TYPE == 'postgresql':
return {
'type': 'postgresql',
'connection_string': cls.POSTGRES_CONNECTION_STRING or
'postgresql://postgres:password@localhost:5432/water_monitoring'
}
# Use individual components if POSTGRES_CONNECTION_STRING is not provided
if cls.POSTGRES_CONNECTION_STRING:
return {
'type': 'postgresql',
'connection_string': cls.POSTGRES_CONNECTION_STRING
}
else:
# Build connection string from components (automatically URL-encodes password)
import urllib.parse
password = urllib.parse.quote(cls.POSTGRES_PASSWORD or 'password', safe='')
connection_string = f'postgresql://{cls.POSTGRES_USER}:{password}@{cls.POSTGRES_HOST}:{cls.POSTGRES_PORT}/{cls.POSTGRES_DB}'
return {
'type': 'postgresql',
'connection_string': connection_string
}
elif cls.DB_TYPE == 'mysql':
return {
'type': 'mysql',

View File

@@ -23,11 +23,15 @@ class DatabaseAdapter(ABC):
pass
@abstractmethod
def get_measurements_by_timerange(self, start_time: datetime.datetime,
end_time: datetime.datetime,
def get_measurements_by_timerange(self, start_time: datetime.datetime,
end_time: datetime.datetime,
station_codes: Optional[List[str]] = None) -> List[Dict]:
pass
@abstractmethod
def get_measurements_for_date(self, target_date: datetime.datetime) -> List[Dict]:
pass
# InfluxDB Adapter
class InfluxDBAdapter(DatabaseAdapter):
def __init__(self, host: str = "localhost", port: int = 8086,
@@ -525,6 +529,52 @@ class SQLAdapter(DatabaseAdapter):
logging.error(f"Error querying {self.db_type.upper()}: {e}")
return []
def get_measurements_for_date(self, target_date: datetime.datetime) -> List[Dict]:
"""Get all measurements for a specific date"""
if not self.engine:
return []
try:
from sqlalchemy import text
# Get start and end of the target date
start_of_day = target_date.replace(hour=0, minute=0, second=0, microsecond=0)
end_of_day = target_date.replace(hour=23, minute=59, second=59, microsecond=999999)
query = """
SELECT m.timestamp, m.station_id, s.station_code, s.thai_name,
m.water_level, m.discharge, m.discharge_percent, m.status
FROM water_measurements m
LEFT JOIN stations s ON m.station_id = s.id
WHERE m.timestamp >= :start_time AND m.timestamp <= :end_time
ORDER BY m.timestamp DESC
"""
with self.engine.connect() as conn:
result = conn.execute(text(query), {
'start_time': start_of_day,
'end_time': end_of_day
})
measurements = []
for row in result:
measurements.append({
'timestamp': row[0],
'station_id': row[1],
'station_code': row[2] or f"Station_{row[1]}",
'station_name_th': row[3] or f"Station {row[1]}",
'water_level': float(row[4]) if row[4] else None,
'discharge': float(row[5]) if row[5] else None,
'discharge_percent': float(row[6]) if row[6] else None,
'status': row[7]
})
return measurements
except Exception as e:
logging.error(f"Error querying {self.db_type.upper()} for date {target_date.date()}: {e}")
return []
# VictoriaMetrics Adapter (using Prometheus format)
class VictoriaMetricsAdapter(DatabaseAdapter):
def __init__(self, host: str = "localhost", port: int = 8428):
@@ -638,6 +688,11 @@ class VictoriaMetricsAdapter(DatabaseAdapter):
logging.warning("get_measurements_by_timerange not fully implemented for VictoriaMetrics")
return []
def get_measurements_for_date(self, target_date: datetime.datetime) -> List[Dict]:
"""Get all measurements for a specific date"""
logging.warning("get_measurements_for_date not fully implemented for VictoriaMetrics")
return []
# Factory function to create appropriate adapter
def create_database_adapter(db_type: str, **kwargs) -> DatabaseAdapter:
"""

View File

@@ -483,6 +483,99 @@ class EnhancedWaterMonitorScraper:
increment_counter("scraping_cycles_failed")
return False
def fill_data_gaps(self, days_back: int) -> int:
"""Fill gaps in data for the specified number of days back"""
logger = get_logger(__name__)
filled_count = 0
try:
# Calculate date range
end_date = datetime.datetime.now()
start_date = end_date - datetime.timedelta(days=days_back)
logger.info(f"Checking for gaps from {start_date.date()} to {end_date.date()}")
# Iterate through each date in the range
current_date = start_date
while current_date <= end_date:
# Check if we have data for this date
has_data = self._check_data_exists_for_date(current_date)
if not has_data:
logger.info(f"Filling gap for date: {current_date.date()}")
# Fetch data for this specific date
data = self.fetch_water_data_for_date(current_date)
if data:
# Save the data
if self.save_to_database(data):
filled_count += len(data)
logger.info(f"Filled {len(data)} measurements for {current_date.date()}")
else:
logger.warning(f"Failed to save data for {current_date.date()}")
else:
logger.warning(f"No data available for {current_date.date()}")
current_date += datetime.timedelta(days=1)
except Exception as e:
logger.error(f"Gap filling error: {e}")
return filled_count
def update_existing_data(self, days_back: int) -> int:
"""Update existing data with latest values for the specified number of days back"""
logger = get_logger(__name__)
updated_count = 0
try:
# Calculate date range
end_date = datetime.datetime.now()
start_date = end_date - datetime.timedelta(days=days_back)
logger.info(f"Updating data from {start_date.date()} to {end_date.date()}")
# Iterate through each date in the range
current_date = start_date
while current_date <= end_date:
logger.info(f"Updating data for date: {current_date.date()}")
# Fetch fresh data for this date
data = self.fetch_water_data_for_date(current_date)
if data:
# Save the data (this will update existing records)
if self.save_to_database(data):
updated_count += len(data)
logger.info(f"Updated {len(data)} measurements for {current_date.date()}")
else:
logger.warning(f"Failed to update data for {current_date.date()}")
else:
logger.warning(f"No data available for {current_date.date()}")
current_date += datetime.timedelta(days=1)
except Exception as e:
logger.error(f"Data update error: {e}")
return updated_count
def _check_data_exists_for_date(self, target_date: datetime.datetime) -> bool:
"""Check if data exists for a specific date"""
try:
if not self.db_adapter:
return False
# Get data for the specific date
measurements = self.db_adapter.get_measurements_for_date(target_date)
return len(measurements) > 0
except Exception as e:
logger = get_logger(__name__)
logger.debug(f"Error checking data existence: {e}")
return False
# Main execution for standalone usage
if __name__ == "__main__":
import argparse