#!/usr/bin/env python3
import os
import sys
import json
import time
import requests
from pathlib import Path
import assemblyai as aai
import static_site_generator

# Global configuration variables.
# Folder values may start with "~"; they are expanded with os.path.expanduser
# where they are turned into Path objects.
input_folder = "~/gdrive-mounts/recordings/"
id_folder = "~/annotate-voice/ids"
output_folder = "~/annotate-voice/outputs"
human_readable_folder = "~/annotate-voice/readable"
web_folder = "~/annotate-voice/web"

# SECURITY: prefer supplying the key through the ASSEMBLYAI_API_KEY environment
# variable. The literal below is kept only so existing deployments keep
# working; because it is committed to source it should be treated as leaked,
# rotated, and then removed from this file.
api_key = os.environ.get("ASSEMBLYAI_API_KEY") or "8b25a41d2a61457fa9c1e1d115e11936"

def setup_directories():
    """Resolve the configured folders and make sure the working ones exist.

    The input folder is required: if it is missing, an error is printed and
    the process exits with status 1. The ID, output, readable and web folders
    are created (including parents) when absent.

    Returns:
        A 5-tuple of Path objects:
        (input_path, id_path, output_path, readable_path, web_path).
    """
    def _resolve(folder):
        # Expand a leading "~" before wrapping the string in a Path.
        return Path(os.path.expanduser(folder))

    input_path = _resolve(input_folder)
    if not input_path.exists():
        print(f"Error: Input directory '{input_folder}' does not exist.")
        sys.exit(1)

    managed_paths = [
        _resolve(folder)
        for folder in (id_folder, output_folder, human_readable_folder, web_folder)
    ]
    for directory in managed_paths:
        directory.mkdir(parents=True, exist_ok=True)

    return (input_path, *managed_paths)

def fetch_transcript_data(transcript_id, timeout=30):
    """Fetch the transcript record for *transcript_id* from the AssemblyAI API.

    Args:
        transcript_id: ID used to build the ``/v2/transcript/<id>`` URL.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without it a stalled connection would block the whole
            pipeline indefinitely.

    Returns:
        The decoded JSON body of the response, or None when the request
        fails, times out, returns an HTTP error status, or the body is not
        valid JSON (an error message is printed in those cases).
    """
    url = f'https://api.assemblyai.com/v2/transcript/{transcript_id}'
    headers = {'Authorization': api_key}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        # Covers connection errors, timeouts and non-2xx responses.
        print(f"Error fetching transcript data: {e}")
        return None
    except json.JSONDecodeError:
        print("Error decoding JSON response from API")
        return None

def save_json(data, file_path):
    """Write *data* to *file_path* as indented UTF-8 JSON.

    The JSON is written to a sibling ``<name>.tmp`` file first and then moved
    into place with ``os.replace``. If serialization or writing fails, nothing
    is left at *file_path*; this matters because the pipeline steps decide
    what to skip by checking whether these files exist.

    Args:
        data: JSON-serializable object.
        file_path: Destination path (str or Path).

    Returns:
        True when the file was written, False on any failure (an error
        message is printed).
    """
    tmp_path = None
    try:
        target = Path(file_path)
        # ".tmp" suffix keeps the scratch file out of the '*.json' globs.
        tmp_path = target.with_name(f"{target.name}.tmp")
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, target)
        return True
    except Exception as e:
        print(f"Error: Failed to save to {file_path}: {str(e)}")
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                # Best-effort cleanup: the scratch file may never have been
                # created, and the save has already been reported as failed.
                pass
        return False

def generate_transcript_id(audio_file, id_path):
    """Step 1: Submit an audio file for transcription and record its ID.

    The file is sent through the AssemblyAI SDK with speaker labels enabled.
    On success a metadata JSON file named ``<audio stem>.json`` is written to
    *id_path*, containing the file name, path, size, submission time and the
    transcript ID.

    Args:
        audio_file: Path of the audio file to submit.
        id_path: Directory (Path) that receives the metadata file.

    Returns:
        True when the metadata file was saved; False if the SDK call raised,
        no transcript ID came back, or saving failed.
    """
    aai.settings.api_key = api_key

    print(f"Submitting for transcription: {audio_file}")
    transcriber = aai.Transcriber()

    try:
        config = aai.TranscriptionConfig(speaker_labels=True)
        transcript = transcriber.transcribe(str(audio_file), config)

        # Do not persist metadata without an ID: the ID file would stop
        # Step 1 from ever resubmitting this audio, and Step 2 would fail on
        # it on every run.
        if not transcript.id:
            # NOTE(review): `error` is read defensively; confirm the attribute
            # name against the installed SDK version.
            detail = getattr(transcript, "error", None) or "no transcript ID returned"
            print(f"Error generating transcript ID: {detail}")
            return False

        # Create metadata with transcript ID
        metadata = {
            "file_name": audio_file.name,
            "file_path": str(audio_file),
            "file_size_bytes": audio_file.stat().st_size,
            "submission_time": int(time.time()),
            "id": transcript.id
        }

        # Save metadata to id_folder
        id_file = id_path / f"{audio_file.stem}.json"
        if save_json(metadata, id_file):
            print(f"Saved transcript ID to: {id_file}")
            return True
        return False
    except Exception as e:
        print(f"Error generating transcript ID: {str(e)}")
        return False

def fetch_transcript_result(id_file, output_path):
    """Step 2: Download the transcript referenced by an ID file and store it.

    Reads the metadata JSON at *id_file*, fetches the transcript for its
    ``id`` value, and, when the transcript status is "completed", writes the
    metadata plus ``completion_time`` and ``transcript`` to a file of the same
    name inside *output_path*.

    Args:
        id_file: Path of the metadata file produced in Step 1.
        output_path: Directory (Path) that receives the combined result.

    Returns:
        True when the result file was saved; False when the ID is missing,
        the fetch failed, the transcript is not completed, saving failed, or
        any exception occurred (a message is printed in each case).
    """
    try:
        with open(id_file, 'r', encoding='utf-8') as handle:
            metadata = json.load(handle)

        transcript_id = metadata.get('id')
        if not transcript_id:
            print(f"Error: No transcript ID found in {id_file}")
            return False

        print(f"Fetching transcript with ID: {transcript_id}")
        payload = fetch_transcript_data(transcript_id)
        if not payload:
            print(f"Error: Failed to fetch transcript data")
            return False

        status = payload.get('status')
        if status != "completed":
            print(f"Transcript not ready. Status: {status}")
            return False

        # Original metadata first, then the fields added by this step.
        combined = {
            **metadata,
            "completion_time": int(time.time()),
            "transcript": payload,
        }

        # The result keeps the ID file's name so the steps can be matched up.
        destination = output_path / id_file.name
        if not save_json(combined, destination):
            return False

        print(f"Saved full transcript to: {destination}")
        return True
    except Exception as exc:
        print(f"Error processing transcript result: {str(exc)}")
        return False

def create_human_readable_transcript(output_file, readable_path):
    """Step 3: Convert a JSON transcript file to a human-readable TXT file.

    Reads ``transcript.utterances`` from *output_file* and writes one
    ``[Speaker X]: text`` paragraph per utterance that has text to
    ``<stem>.txt`` inside *readable_path*. An utterance with no ``speaker``
    key is labelled "Unknown".

    The whole text is rendered in memory before the destination is opened, so
    a malformed utterance cannot leave a partially written ``.txt`` behind
    (Step 3 skips any transcript whose ``.txt`` already exists).

    Args:
        output_file: Path of the JSON file produced in Step 2.
        readable_path: Directory (Path) that receives the ``.txt`` file.

    Returns:
        True when the text file was written; False when there are no
        utterances or any error occurred (a message is printed).
    """
    try:
        # Read the output file
        with open(output_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # `or` also covers keys that are present but null.
        transcript_data = data.get('transcript') or {}
        utterances = transcript_data.get('utterances') or []

        if not utterances:
            print(f"Error: No utterances found in {output_file}")
            return False

        # Create the readable output file with same name but .txt extension
        readable_file = readable_path / f"{output_file.stem}.txt"

        # Render first; only touch the filesystem once every utterance has
        # been processed successfully.
        paragraphs = []
        for utterance in utterances:
            text = utterance.get('text', '')
            if text:
                speaker = utterance.get('speaker', 'Unknown')
                paragraphs.append(f"[Speaker {speaker}]: {text}\n\n")

        with open(readable_file, 'w', encoding='utf-8') as f:
            f.write("".join(paragraphs))

        print(f"Created human-readable transcript: {readable_file}")
        return True
    except Exception as e:
        print(f"Error creating human-readable transcript: {str(e)}")
        return False

def step3_create_readable_transcripts(output_path, readable_path):
    """Step 3: Render every not-yet-converted JSON transcript as text.

    A ``*.json`` file in *output_path* is pending when *readable_path* has no
    ``<stem>.txt`` for it. Each pending file is passed to
    ``create_human_readable_transcript``.

    Returns:
        A ``(successful, failed)`` tuple of counts; ``(0, 0)`` when nothing
        is pending.
    """
    print("\n=== STEP 3: CREATING HUMAN-READABLE TRANSCRIPTS ===")

    # Keep only transcripts that have no readable counterpart yet.
    pending = [
        candidate
        for candidate in output_path.glob('*.json')
        if not (readable_path / f"{candidate.stem}.txt").exists()
    ]
    total = len(pending)

    if total == 0:
        print("No output files to process in Step 3.")
        return 0, 0

    print(f"Found {total} output files to convert to human-readable format.")

    # Tally outcomes keyed by whether the conversion reported success.
    tally = {True: 0, False: 0}
    for position, output_file in enumerate(pending, start=1):
        print(f"Processing output file {position}/{total}: {output_file}")
        converted = create_human_readable_transcript(output_file, readable_path)
        tally[bool(converted)] += 1

    successful, failed = tally[True], tally[False]
    print(f"Step 3 complete: Successfully processed {successful} files, Failed: {failed}")
    return successful, failed

def step1_generate_ids(input_path, id_path):
    """Step 1: Submit every new ``.m4a`` file and record its transcript ID.

    An audio file in *input_path* is new when *id_path* holds no
    ``<stem>.json`` for it. Each new file is passed to
    ``generate_transcript_id``.

    Returns:
        A ``(successful, failed)`` tuple of counts; ``(0, 0)`` when there is
        nothing new.
    """
    print("\n=== STEP 1: GENERATING TRANSCRIPT IDs ===")

    def _needs_id(audio_file):
        # True when no ID file has been written for this recording yet.
        return not (id_path / f"{audio_file.stem}.json").exists()

    queue = list(filter(_needs_id, input_path.glob('*.m4a')))
    if not queue:
        print("No new audio files to process in Step 1.")
        return 0, 0

    total = len(queue)
    print(f"Found {total} audio files to generate transcript IDs.")

    successful = failed = 0
    for position, audio_file in enumerate(queue, start=1):
        print(f"Processing file {position}/{total}: {audio_file}")
        if not generate_transcript_id(audio_file, id_path):
            failed += 1
            continue
        successful += 1

    print(f"Step 1 complete: Successfully processed {successful} files, Failed: {failed}")
    return successful, failed

def step2_fetch_transcripts(id_path, output_path):
    """Step 2: Fetch the full transcript for every ID file lacking a result.

    A ``*.json`` file in *id_path* is pending when *output_path* has no file
    of the same name. Each pending file is passed to
    ``fetch_transcript_result``.

    Returns:
        A ``(successful, failed)`` tuple of counts; ``(0, 0)`` when nothing
        is pending.
    """
    print("\n=== STEP 2: FETCHING FULL TRANSCRIPTS ===")

    # Collect ID files whose result has not been downloaded yet.
    outstanding = []
    for candidate in id_path.glob('*.json'):
        if (output_path / candidate.name).exists():
            continue
        outstanding.append(candidate)

    count = len(outstanding)
    if count == 0:
        print("No transcript IDs to process in Step 2.")
        return 0, 0

    print(f"Found {count} transcript IDs to fetch full results.")

    outcomes = []
    for number, id_file in enumerate(outstanding, start=1):
        print(f"Processing ID file {number}/{count}: {id_file}")
        outcomes.append(bool(fetch_transcript_result(id_file, output_path)))

    successful = outcomes.count(True)
    failed = outcomes.count(False)

    print(f"Step 2 complete: Successfully processed {successful} files, Failed: {failed}")
    return successful, failed

def step4_generate_static_site(readable_path, output_path, web_path):
    """Step 4: Build the static website from the transcript folders.

    Prints the step banner and delegates to
    ``static_site_generator.generate_static_site``, passing the three paths
    through unchanged.

    Returns:
        Whatever the generator returns; ``main`` reports it as the number of
        transcript files included.
    """
    print("\n=== STEP 4: GENERATING STATIC WEBSITE ===")

    # All of the work happens in the static site generator module.
    return static_site_generator.generate_static_site(
        readable_path,
        output_path,
        web_path,
    )

def main():
    """Run the four pipeline steps in order and print a summary.

    Directories are prepared first (the process exits there if the input
    folder is missing). The steps then run in sequence: generate IDs, fetch
    transcripts, create readable text, build the website.
    """
    print("Starting four-step transcription process")

    input_path, id_path, output_path, readable_path, web_path = setup_directories()

    # Each step returns its own (successful, failed) counts.
    ids_ok, ids_failed = step1_generate_ids(input_path, id_path)
    fetched_ok, fetched_failed = step2_fetch_transcripts(id_path, output_path)
    readable_ok, readable_failed = step3_create_readable_transcripts(output_path, readable_path)
    site_files = step4_generate_static_site(readable_path, output_path, web_path)

    summary_lines = (
        "\n=== SUMMARY ===",
        f"Step 1 (Generate IDs): {ids_ok} successful, {ids_failed} failed",
        f"Step 2 (Fetch Transcripts): {fetched_ok} successful, {fetched_failed} failed",
        f"Step 3 (Create Readable Transcripts): {readable_ok} successful, {readable_failed} failed",
        f"Step 4 (Generate Static Website): {site_files} transcript files included",
        f"\nWebsite available at: {web_folder}",
    )
    for line in summary_lines:
        print(line)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()