import gradio as gr
import tempfile
from pathlib import Path
import uuid
import subprocess
import shutil
import os

# Constants
WORD_FADE_DURATION = 0.2  # seconds a highlighted word takes to fade in / out
FFMPEG_ESCAPE_CHAR = "\\"  # escape character used inside FFmpeg filter strings

# Accepted upload file extensions
allowed_medias = [
    ".png",
    ".jpg",
    ".jpeg",
    ".bmp",
    ".gif",
    ".tiff",
]
allowed_audios = [
    ".mp3",
    ".wav",
    ".m4a",
    ".ogg",
]

# Display name -> font file path. The paths are locations commonly present in
# Hugging Face Spaces images; a value of None means "pass no fontfile option
# and let FFmpeg choose".
FONT_MAP = {
    "System Default (FFmpeg)": None,
    "Noto Sans Bold": "/usr/share/fonts/truetype/noto/NotoSans-Bold.ttf",
    "DejaVu Sans Bold": "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
    "Liberation Sans Bold": "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf",
    "FreeSans Bold": "/usr/share/fonts/truetype/freefont/FreeSansBold.ttf",
}
# Dropdown choices, in the insertion order of FONT_MAP.
FONT_OPTIONS = list(FONT_MAP)


def get_font_path(font_name):
    """
    Resolve a font display name to a font file path that exists on disk.

    Returns the path mapped to ``font_name`` in FONT_MAP when that file exists.
    Returns None when the name maps to None or is not in FONT_MAP at all.
    If the mapped file is missing, the first FONT_MAP entry whose file exists
    is returned instead (a warning is printed); if none exists, None is
    returned so that no fontfile option is passed to FFmpeg.
    """
    wanted = FONT_MAP.get(font_name)

    # No explicit font file requested (or unknown name): nothing to resolve.
    if wanted is None:
        return None
    if os.path.exists(wanted):
        return wanted

    # The requested file is missing: take the first installed font from the map.
    fallback = next(
        (
            (label, candidate)
            for label, candidate in FONT_MAP.items()
            if candidate and os.path.exists(candidate)
        ),
        None,
    )
    if fallback is None:
        print("Warnung: Keine bevorzugten Schriftarten gefunden. Verwende FFmpeg System Standard.")
        return None

    label, candidate = fallback
    print(f"Warnung: Ausgewählte Schriftart '{font_name}' nicht gefunden. Verwende Fallback: '{label}'")
    return candidate


def save_temp_audio(audio_file_path):
    """
    Copy the uploaded audio file into a freshly created temporary directory.

    Returns ``(temp_dir, copied_file)`` as Path objects, or ``(None, None)``
    when no path was given or the copy failed. The copy is named
    ``input<ext>``; an extension that is not in ``allowed_audios`` is replaced
    by ``.mp3``. On a failed copy the temporary directory is removed again.
    """
    if not audio_file_path:
        return None, None

    source = Path(audio_file_path)
    suffix = source.suffix if source.suffix.lower() in allowed_audios else ".mp3"

    workdir = Path(tempfile.mkdtemp())
    target = workdir / f"input{suffix}"

    try:
        shutil.copyfile(source, target)
    except Exception as e:
        print(f"Fehler beim Kopieren der Audiodatei: {e}")
        # Do not leave an empty temp directory behind.
        if workdir.exists():
            shutil.rmtree(workdir)
        return None, None

    return workdir, target

def _escape_drawtext_text(text):
    """
    Escape ``text`` for use as ``text='<result>'`` inside a ``-vf`` filtergraph.

    The value is parsed three times by FFmpeg, so it is escaped once per
    layer, innermost layer first:

    1. drawtext text expansion: ``\\`` escapes the next character and ``%``
       starts a ``%{...}`` expansion, so both get a backslash.
    2. filter option parsing: ``\\``, ``'`` and ``:`` are special.
    3. filtergraph quoting: the value sits inside single quotes, where a quote
       cannot be escaped, so each quote becomes ``'\\''`` (close, escaped
       quote, reopen).

    Text without ``\\``, ``%``, ``'`` or ``:`` is returned unchanged.
    """
    escaped = text.replace("\\", "\\\\").replace("%", "\\%")
    escaped = escaped.replace("\\", "\\\\").replace("'", "\\'").replace(":", "\\:")
    return escaped.replace("'", "'\\''")


def create_sentence_base_filter(full_text, duration_clip, font_option, font_size, y_pos, style):
    """
    Build the FFmpeg drawtext filter for the base layer (the clip's whole text).

    Args:
        full_text: Text shown on the clip; escaped here for the filtergraph.
        duration_clip: Clip length in seconds; the text is enabled from 0 to it.
        font_option: Ready-made ``fontfile='...'`` option, or an empty value
            to pass no font option.
        font_size: Base font size.
        y_pos: Vertical position factor, used as ``y=(h-text_h)*y_pos``.
        style: "Highlight", "Dynamic" or "Static" (case-insensitive). Any
            other value keeps the defaults: border width 2 plus a box.

    Returns:
        The drawtext filter string.
    """
    style_lower = style.lower()

    # Defaults: white text, black 2px border, semi-transparent black box.
    fontsize = font_size
    borderw = 2
    show_box = True
    boxcolor = "0x000000@0.6"

    if style_lower == "highlight":
        # Whole sentence as the base layer, outlined but without a box.
        show_box = False
    elif style_lower == "static":
        # Whole sentence in a box, no border.
        borderw = 0
    elif style_lower == "dynamic":
        # Larger text on a lighter box; highlight layers are drawn on top later.
        boxcolor = "0x444444@0.4"
        borderw = 0
        fontsize = font_size * 1.2

    options = [
        f"drawtext=text='{_escape_drawtext_text(full_text)}'",
        "fontcolor=white",
        f"fontsize={fontsize}",
        f"borderw={borderw}",
        "bordercolor=black",
    ]
    if show_box:
        # boxborderw=10 adds some padding around the box.
        options += ["box=1", f"boxcolor={boxcolor}", "boxborderw=10"]
    options.append(f"x=(w-text_w)/2:y=(h-text_h)*{y_pos}")

    if font_option:
        options.append(font_option)

    # Every style shows the base text for the entire clip duration.
    options.append(f"enable='between(t, 0, {duration_clip})'")

    return ":".join(options)


def create_highlight_word_filter(word, full_text, start_time, duration, font_option, font_size, y_pos, style):
    """
    Build the FFmpeg drawtext filter for the highlight layer of a single word.

    Args:
        word: The word to draw; escaped here for the filtergraph.
        full_text: Unused; kept so the signature stays unchanged for callers.
        start_time: Time in seconds at which the word starts fading in.
        duration: Seconds from the start of the fade-in to the end of the
            fade-out.
        font_option: Ready-made ``fontfile='...'`` option, or an empty value.
        font_size: Base font size; scaled by 1.5 for "Dynamic", 1.05 otherwise.
        y_pos: Vertical position factor, used as ``y=(h-text_h)*y_pos``.
        style: Subtitle style (case-insensitive).

    Returns:
        The drawtext filter string, or None for the "Static" style, which has
        no highlight layer.
    """
    style_lower = style.lower()
    if style_lower == "static":
        return None

    word_end_time = start_time + duration

    # Alpha ramps 0 -> 1 over WORD_FADE_DURATION after start_time, stays at 1,
    # then ramps back to 0 over the last WORD_FADE_DURATION before the end.
    highlight_alpha_expression = (
        f"if(lt(t,{start_time}), 0, "
        f"if(lt(t,{start_time + WORD_FADE_DURATION}), (t-{start_time})/{WORD_FADE_DURATION}, "
        f"if(lt(t,{word_end_time - WORD_FADE_DURATION}), 1, "
        f"if(lt(t,{word_end_time}), ({word_end_time}-t)/{WORD_FADE_DURATION}, 0))))"
    )

    if style_lower == "dynamic":
        # Dynamic: clearly larger yellow text with a border.
        borderw = 4
        fontsize = font_size * 1.5
    else:
        # Highlight (and any other style): slightly enlarged yellow text, no border.
        borderw = 0
        fontsize = font_size * 1.05

    options = [
        f"drawtext=text='{_escape_drawtext_text(word)}'",
        "fontcolor=yellow",
        f"fontsize={fontsize}",
        f"borderw={borderw}",
        "bordercolor=black",
        f"x=(w-text_w)/2:y=(h-text_h)*{y_pos}",
    ]

    if font_option:
        options.append(font_option)

    # Visibility is controlled purely through the alpha expression.
    options.append(f"alpha='{highlight_alpha_expression}'")
    return ":".join(options)


def _run_ffmpeg(cmd):
    """Run one FFmpeg command; return None on success, else its captured stderr."""
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        return e.stderr
    return None


def generate_slideshow_with_audio(images, input_text, duration_per_word, duration_per_image, fade_duration, font_size, y_pos, audio_file, subtitle_style, selected_font):
    """
    Render the images as a subtitled slideshow video, optionally with audio.

    The words of ``input_text`` are distributed evenly over the images (the
    first ``remainder`` images get one extra word). One clip is rendered per
    image with FFmpeg, the clips are concatenated, and the audio file, if
    any, is muxed in with ``-shortest``.

    Args:
        images: Uploaded images; each item is either a path string or an
            object whose ``name`` attribute is the path.
        input_text: Subtitle text; may be empty or None.
        duration_per_word: Seconds each word is highlighted.
        duration_per_image: Minimum clip length per image in seconds.
        fade_duration: Length of the image fade-in / fade-out in seconds.
        font_size: Base subtitle font size.
        y_pos: Vertical subtitle position factor.
        audio_file: Optional uploaded audio (path string or object with
            ``name``).
        subtitle_style: "Highlight", "Dynamic" or "Static".
        selected_font: Key of FONT_MAP chosen in the UI.

    Returns:
        ``(video_path, status_message)``; ``video_path`` is None on failure.
        On success the working directory holding the video is kept so the
        returned path stays valid. On any failure, including unexpected
        exceptions, all temporary directories are removed.
    """
    if not images:
        return None, "❌ Keine Bilder ausgewählt"

    temp_dir = tempfile.mkdtemp()
    audio_temp_dir = None
    keep_temp_dir = False  # set once a finished video lives in temp_dir

    try:
        # Split the text into words and spread them evenly across the images.
        words = input_text.split() if input_text else []
        base_words_per_clip, remainder = divmod(len(words), len(images))

        # Resolve the font and escape its path for the filter string.
        font_path = get_font_path(selected_font)
        font_option = ""
        if font_path:
            escaped_font_path = str(font_path).replace(FFMPEG_ESCAPE_CHAR, FFMPEG_ESCAPE_CHAR + FFMPEG_ESCAPE_CHAR)
            escaped_font_path = escaped_font_path.replace(':', FFMPEG_ESCAPE_CHAR + ':')
            font_option = f"fontfile='{escaped_font_path}'"

        # Copy the audio (if any) into its own temp directory.
        temp_audio_file = None
        if audio_file:
            audio_temp_dir, temp_audio_file = save_temp_audio(getattr(audio_file, "name", audio_file))

        is_static = subtitle_style.lower() == "static"

        # Scale/pad every image to 1280x720 at 25 fps.
        base_filters = (
            "scale=w=1280:h=720:force_original_aspect_ratio=decrease,"
            "pad=1280:720:(ow-iw)/2:(oh-ih)/2:color=black,"
            "fps=25,format=yuv420p"
        )

        # --- 1. Render one clip per image with its text segment ---
        clips_with_text = []
        current_word_index = 0
        for i, image in enumerate(images):
            # Accept both plain path strings and upload objects exposing .name.
            img_path = Path(getattr(image, "name", image))
            clip_path = Path(temp_dir) / f"clip_with_text_{i}.mp4"

            words_on_this_clip = base_words_per_clip + (1 if i < remainder else 0)
            word_segment = words[current_word_index : current_word_index + words_on_this_clip]
            current_word_index += len(word_segment)
            full_text = " ".join(word_segment)

            # The clip lasts long enough to show every word, but at least the
            # configured minimum per image.
            text_duration = len(word_segment) * duration_per_word
            duration_clip = max(duration_per_image, text_duration)

            drawtext_filters = []
            if full_text:
                # First layer: the whole segment as the stable base text.
                drawtext_filters.append(
                    create_sentence_base_filter(full_text, duration_clip, font_option, font_size, y_pos, subtitle_style)
                )
                # Second layer: one highlight per word, shown one after another.
                if not is_static:
                    word_start_time = 0.0
                    for word in word_segment:
                        highlight_filter = create_highlight_word_filter(
                            word,
                            full_text,
                            word_start_time,
                            duration_per_word,
                            font_option,
                            font_size,
                            y_pos,
                            subtitle_style,
                        )
                        if highlight_filter:
                            drawtext_filters.append(highlight_filter)
                        word_start_time += duration_per_word

            fade_out_start = max(duration_clip - fade_duration, 0)
            fade_img_filter = f"fade=t=in:st=0:d={fade_duration},fade=t=out:st={fade_out_start}:d={fade_duration}"

            # Order matters: base filters, then text (base before highlights),
            # then the fade so the text fades together with the image.
            vf_filters_clip = ",".join([base_filters, *drawtext_filters, fade_img_filter])

            error = _run_ffmpeg([
                "ffmpeg", "-y", "-loop", "1", "-i", str(img_path),
                "-t", str(duration_clip),
                "-vf", vf_filters_clip,
                str(clip_path),
            ])
            if error is not None:
                return None, f"❌ FFmpeg Fehler bei Bild {i+1}:\n{error}"
            clips_with_text.append(clip_path)

        # --- 2. Concatenate the clips ---
        filelist_path = Path(temp_dir) / "filelist.txt"
        with open(filelist_path, "w", encoding="utf-8") as f:
            f.writelines(f"file '{clip}'\n" for clip in clips_with_text)

        output_video = Path(temp_dir) / f"slideshow_{uuid.uuid4().hex}.mp4"
        error = _run_ffmpeg([
            "ffmpeg", "-y", "-f", "concat", "-safe", "0",
            "-i", str(filelist_path),
            "-c:v", "libx264", "-pix_fmt", "yuv420p",
            str(output_video),
        ])
        if error is not None:
            return None, f"❌ FFmpeg Fehler beim Zusammenfügen:\n{error}"

        # --- 3. Add audio (if present) ---
        if not temp_audio_file:
            keep_temp_dir = True
            return str(output_video), "✅ Video erstellt (ohne Audio)"

        final_output = Path(temp_dir) / f"final_{uuid.uuid4().hex}.mp4"
        error = _run_ffmpeg([
            "ffmpeg", "-y", "-i", str(output_video), "-i", str(temp_audio_file),
            "-c:v", "copy", "-c:a", "aac", "-shortest",
            str(final_output),
        ])
        if error is not None:
            return None, f"❌ FFmpeg Fehler beim Hinzufügen von Audio:\n{error}"

        keep_temp_dir = True
        return str(final_output), "✅ Video mit Audio erstellt!"
    finally:
        # The audio copy is never needed after this function returns; the
        # working directory is only kept when it holds the returned video.
        if audio_temp_dir:
            shutil.rmtree(audio_temp_dir, ignore_errors=True)
        if not keep_temp_dir:
            shutil.rmtree(temp_dir, ignore_errors=True)

# Gradio UI: inputs for images, text, timing, font and style, plus a button
# that runs generate_slideshow_with_audio and shows the video and a status.
with gr.Blocks() as demo:
    gr.Markdown("# Slideshow Generator")

    with gr.Row():
        img_input = gr.Files(label="Bilder", file_types=allowed_medias)
        text_input = gr.Textbox(label="Text (Wörter werden gleichmäßig auf alle Bilder verteilt)", lines=5, placeholder="Der Basissatz wird konstant angezeigt. Das aktive Wort wird hervorgehoben.")

    with gr.Row():
        duration_image_input = gr.Number(value=3, label="Mindest-Dauer pro BILD (s)")
        duration_word_input = gr.Number(value=1.0, label="Dauer pro WORT (s) [bestimmt Geschwindigkeit der Hervorhebung]")
        fade_input = gr.Number(value=0.5, label="Bild-Fade Dauer (s)")

    with gr.Row():
        font_select_input = gr.Dropdown(
            FONT_OPTIONS,
            label="Schriftart",
            value="DejaVu Sans Bold" if "DejaVu Sans Bold" in FONT_OPTIONS else FONT_OPTIONS[0],
            interactive=True,
            scale=1
        )
        font_size_input = gr.Number(value=80, label="Schriftgröße (px)", scale=1)
        ypos_input = gr.Slider(0.0, 1.0, value=0.9, label="Y-Position (0=Oben, 1=Unten)", scale=2)

        # Reduced set of subtitle styles
        subtitle_style_input = gr.Dropdown(
            ["Highlight", "Dynamic", "Static"],
            label="Untertitel-Stil",
            value="Highlight",
            interactive=True,
            scale=1
        )

    audio_input = gr.File(label="Audio (optional)", file_types=allowed_audios)
    btn = gr.Button("Erstellen", variant="primary")

    out_video = gr.Video(label="Ergebnis")
    status = gr.Textbox(label="Status")

    # The input order must match the parameter order of
    # generate_slideshow_with_audio (word duration comes before image duration).
    btn.click(
        fn=generate_slideshow_with_audio,
        inputs=[
            img_input,
            text_input,
            duration_word_input,
            duration_image_input,
            fade_input,
            font_size_input,
            ypos_input,
            audio_input,
            subtitle_style_input,
            font_select_input
        ],
        outputs=[out_video, status]
    )

# Only start the server when run as a script, so importing this module (for
# tests or tooling) does not launch the app as a side effect.
if __name__ == "__main__":
    demo.launch()