import os
import subprocess
import sys
from bs4 import BeautifulSoup
import glob
import logging

from markdown_extensions import (
    DATA_ATTR_SONG_ID,
    DATA_ATTR_START,
    DATA_ATTR_END,
    DEFAULT_AUDIO_FORMAT,
    generate_audio_element_id,
    generate_audio_filename,
)

# Module-level logger named after this module (via __name__); the functions in this file log through it.
logger = logging.getLogger(__name__) # Get a logger specific to this module


def check_yt_dlp() -> bool:
    """Check whether the ``yt-dlp`` command can be launched successfully.

    Runs ``yt-dlp --version`` and treats a clean exit as proof that the tool
    is usable.

    Returns:
        True if the command ran and exited with status 0; False if it could
        not be started (missing, not executable, or any other OS-level launch
        error) or exited with a non-zero status. Install hints are logged at
        error level in the failure case.
    """
    try:
        subprocess.run(['yt-dlp', '--version'], check=True, capture_output=True, text=True)
        return True
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (not on PATH) as well as
        # PermissionError (found on PATH but not executable), so a broken
        # install is reported here instead of escaping as a traceback.
        logger.error("yt-dlp command not found or failed to execute.")
        logger.error("Please ensure yt-dlp is installed and in your system's PATH.")
        logger.error("You can install it via pip: pip install yt-dlp")
        return False

def find_audio_tags(html_content: str) -> list[dict]:
    """Extract downloadable segment descriptions from ``<audio>`` tags.

    Only tags carrying all three data attributes (song id, start, end) are
    considered; other ``<audio>`` elements are silently ignored because they
    may be unrelated audio on the page.

    Args:
        html_content: Raw HTML text to parse.

    Returns:
        A list of dicts with keys ``'song_id'`` (str), ``'start_seconds'``
        (float), ``'end_seconds'`` (float) and ``'tag_id'`` (the tag's ``id``
        attribute, or None). Tags whose times cannot be parsed, or that fail
        validation, are skipped with a warning.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    extracted_tags = []
    for tag in soup.find_all('audio'):
        song_id = tag.get(DATA_ATTR_SONG_ID)
        start_str = tag.get(DATA_ATTR_START)
        end_str = tag.get(DATA_ATTR_END)

        # Not one of our segment tags; nothing to report.
        if not (song_id and start_str and end_str):
            continue

        try:
            start_seconds = float(start_str)
            end_seconds = float(end_str)
        except ValueError:
            logger.warning(f"Could not parse start/end times as floats: start={start_str}, end={end_str} for song={song_id}. Skipping.")
            continue

        # Require 0 <= start < end and a finite end. float() happily parses
        # "inf"/"nan"; NaN fails every comparison, and an infinite end is
        # rejected explicitly because downstream filename generation is noted
        # (in this module's comments) to truncate the times with int(), which
        # cannot represent infinity.
        if not (0 <= start_seconds < end_seconds < float("inf")):
            logger.warning(f"Invalid start/end times found: start={start_str}, end={end_str} for song={song_id}. Skipping.")
            continue

        extracted_tags.append({
            'song_id': song_id,
            'start_seconds': start_seconds,
            'end_seconds': end_seconds,
            'tag_id': tag.get('id', None),  # Optional element id, kept for callers that want it
        })

    return extracted_tags


def download_audio_segment(song_id: str, start_seconds: float, end_seconds: float, audio_dir: str) -> bool:
    """Download one audio segment with yt-dlp unless its file already exists.

    The target filename is produced by the shared helpers
    ``generate_audio_element_id`` / ``generate_audio_filename`` so that it
    matches what the markdown extension references (per this module's notes
    those helpers key on whole seconds — confirm in markdown_extensions). The
    yt-dlp command itself receives the exact float times.

    Args:
        song_id: YouTube video id; inserted into a ``watch?v=`` URL.
        start_seconds: Segment start, in seconds.
        end_seconds: Segment end, in seconds.
        audio_dir: Directory to write into (expected to be an absolute path);
            created if missing.

    Returns:
        True if the expected file already existed or exists after the
        download; False if yt-dlp failed, could not be launched, or finished
        without producing the expected file.
    """
    # 1. Work out the expected filename from the shared naming helpers.
    element_id = generate_audio_element_id(song_id, start_seconds, end_seconds)
    # With an empty format the helper presumably yields "<name>."; the slice
    # drops that trailing dot to get the extension-less base name.
    output_filename_base = generate_audio_filename(element_id, format="")[:-1]
    output_filename_with_ext = generate_audio_filename(element_id)
    output_filepath = os.path.join(audio_dir, output_filename_with_ext)

    # 2. Nothing to do when the segment is already on disk.
    if os.path.exists(output_filepath):
        logger.info(f"Audio segment already exists: {output_filepath}. Skipping download.")
        return True

    # 3. Build the yt-dlp command.
    video_url = f"https://www.youtube.com/watch?v={song_id}"

    # The output template points into audio_dir, so it must exist first.
    os.makedirs(audio_dir, exist_ok=True)

    command = [
        'yt-dlp',
        '-x',  # Extract audio only
        '--audio-format', DEFAULT_AUDIO_FORMAT,  # Same format the filename helper assumes
        '--audio-quality', '0',  # Best quality
        # Exact float times for the cut, independent of the filename.
        '--download-sections', f"*{start_seconds}-{end_seconds}",
        # Base path without extension; yt-dlp fills in %(ext)s itself.
        '-o', os.path.join(audio_dir, output_filename_base + '.%(ext)s'),
        '--force-keyframes-at-cuts',  # May help segment accuracy
        '--quiet',  # Suppress non-error output
        '--no-warnings',  # Suppress warnings (can be noisy)
        video_url
    ]

    logger.info(f"Attempting to download segment: {song_id} [{start_seconds}-{end_seconds}] -> {output_filepath}")

    # 4. Run yt-dlp; only the subprocess call sits inside the try.
    try:
        subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"Failed to download segment for {song_id} [{start_seconds}-{end_seconds}].")
        logger.error(f"Command: {' '.join(e.cmd)}")
        logger.error(f"Stderr: {e.stderr}")
        return False
    except FileNotFoundError:
        # yt-dlp itself is missing; normally caught earlier by check_yt_dlp.
        logger.error("yt-dlp command not found during execution. Please ensure it's installed and in PATH.")
        return False
    except Exception as e:
        logger.error(f"An unexpected error occurred during download for {song_id} [{start_seconds}-{end_seconds}]: {e}")
        return False

    # 5. Verify the file landed under the exact expected name. yt-dlp may pick
    # a different extension, in which case we report what we found but do not
    # rename automatically, since that could hide an underlying issue.
    if not os.path.exists(output_filepath):
        logger.warning(f"yt-dlp finished for {element_id}, but expected file '{output_filepath}' not found. Check {audio_dir} for files starting with '{output_filename_base}'.")
        # Escape the literal parts so characters like '[' or '*' in the
        # directory or base name are not interpreted as glob wildcards.
        pattern = os.path.join(glob.escape(audio_dir), glob.escape(output_filename_base) + '.*')
        potential_files = glob.glob(pattern)
        if potential_files:
            logger.warning(f"Found potential match(es): {potential_files}")
        return False

    # Only claim success once the file is confirmed to exist.
    logger.debug(f"Successfully downloaded: {output_filepath}")
    return True


# --- Main Processing Function ---

def process_html_files(html_dir: str, audio_output_dir: str):
    """
    Scan the HTML files of a directory for audio segment tags and fetch every
    distinct segment into `audio_output_dir`.

    The process exits via `sys.exit` when yt-dlp is unavailable. A summary of
    downloaded / pre-existing / failed segments is logged at the end.

    Args:
        html_dir: Absolute path to the directory containing generated HTML files
            (only `*.html` directly inside it are scanned).
        audio_output_dir: Absolute path to the directory where audio files should be saved.
    """
    if not check_yt_dlp():
        sys.exit("yt-dlp is required but not found. Aborting audio download.")

    # The output directory must exist before any existence checks or downloads.
    os.makedirs(audio_output_dir, exist_ok=True)

    page_paths = glob.glob(os.path.join(html_dir, '*.html'))
    logger.info(f"Found {len(page_paths)} HTML files in '{html_dir}'.")

    # A set collapses segments that are referenced from more than one page.
    unique_segments = set()

    for page_path in page_paths:
        try:
            with open(page_path, 'r', encoding='utf-8') as page:
                markup = page.read()

            found = find_audio_tags(markup)
            logger.debug(f"Found {len(found)} audio segments to potentially download in {os.path.basename(page_path)}.")

            # Each segment is identified by (song id, start, end).
            unique_segments.update(
                (entry['song_id'], entry['start_seconds'], entry['end_seconds'])
                for entry in found
            )
        except Exception as e:
            # One unreadable page must not stop the remaining pages.
            logger.error(f"Error processing file {page_path}: {e}")

    downloaded = 0
    failed = 0
    preexisting = 0

    for song_id, start_seconds, end_seconds in unique_segments:
        # Resolve the target path up front so pre-existing files are counted
        # separately from fresh downloads.
        target_name = generate_audio_filename(
            generate_audio_element_id(song_id, start_seconds, end_seconds)
        )
        target_path = os.path.join(audio_output_dir, target_name)

        if os.path.exists(target_path):
            preexisting += 1
            logger.debug(f"Segment already exists: {target_path}")
            continue

        ok = download_audio_segment(song_id, start_seconds, end_seconds, audio_output_dir)
        # A reported success only counts when the file is really there;
        # anything else is tallied as a failure.
        if ok and os.path.exists(target_path):
            downloaded += 1
        else:
            failed += 1

    logger.info("--- Audio Download Summary ---")
    logger.info(f"Total unique segments found: {len(unique_segments)}")
    logger.info(f"Successfully downloaded:     {downloaded}")
    logger.info(f"Already existed:            {preexisting}")
    logger.info(f"Failed downloads:           {failed}")