# app.py
# Gradio: paste YouTube URLs (one per line) -> download ARABIC captions only
# Preference: manual Arabic > auto Arabic. If Arabic isn't available, it reports an error for that URL.
# Output: captions_sentences_ar.zip with one sentence per line in each .txt
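#
# Requires: pip install gradio youtube-transcript-api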

import os
import re
import zipfile
import tempfile
from urllib.parse import urlparse, parse_qs

import gradio as gr
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
    VideoUnavailable,
)
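
# NOTE: this assumes the classic youtube-transcript-api surface
# (YouTubeTranscriptApi.list_transcripts(), transcripts whose .fetch()
# returns dicts with a "text" key). The 1.0 release reworked this
# interface, so pin an earlier version if these calls don't resolve.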


# ----------------- URL + filename helpers -----------------

def extract_video_id(url: str) -> str | None:
    url = (url or "").strip()
    if not url:
        return None

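    # Short links: https://youtu.be/<id>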
    m = re.search(r"(?:youtu\.be/)([A-Za-z0-9_-]{6,})", url)
    if m:
        return m.group(1)

    try:
        u = urlparse(url)

        if "youtube.com" in (u.netloc or ""):
            qs = parse_qs(u.query)
            if "v" in qs and qs["v"]:
                return qs["v"][0]

        for pat in (r"(?:/shorts/)([A-Za-z0-9_-]{6,})", r"(?:/embed/)([A-Za-z0-9_-]{6,})"):
            m = re.search(pat, u.path or "")
            if m:
                return m.group(1)

    except Exception:
        pass

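    # Last resort: any 11-character token shaped like a YouTube video ID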
    m = re.search(r"([A-Za-z0-9_-]{11})", url)
    return m.group(1) if m else None
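
# Example behavior (placeholder-shaped IDs, not real videos):
#   extract_video_id("https://youtu.be/AAAAAAAAAAA")                -> "AAAAAAAAAAA"
#   extract_video_id("https://www.youtube.com/watch?v=AAAAAAAAAAA") -> "AAAAAAAAAAA"
#   extract_video_id("not a url")                                   -> None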


def safe_filename(s: str, max_len: int = 120) -> str:
    s = (s or "").strip()
    s = re.sub(r"https?://", "", s)
    s = re.sub(r"[^A-Za-z0-9._-]+", "_", s)
    s = re.sub(r"_+", "_", s).strip("_")
    return (s[:max_len] or "video").rstrip("_")
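
# Example behavior:
#   safe_filename("https://youtu.be/abc?t=10") -> "youtu.be_abc_t_10"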


# ----------------- Arabic transcript fetching (manual > auto) -----------------

def fetch_arabic_caption_text(video_id: str) -> tuple[str, dict]:
    """
    Returns (raw_caption_text, meta) for Arabic only.
    Preference order:
      1) Manually created Arabic transcript ('ar')
      2) Auto-generated Arabic transcript ('ar')
    If Arabic isn't available, raises NoTranscriptFound.
    """
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    # 1) Manual Arabic
    try:
        t = transcript_list.find_manually_created_transcript(["ar"])
        data = t.fetch()
        return "\n".join(item["text"] for item in data), {
            "lang": t.language_code,
            "is_generated": False,
        }
    except NoTranscriptFound:
        pass  # no manual Arabic track; fall back to auto-generated

    # 2) Auto Arabic. find_generated_transcript() itself raises
    # NoTranscriptFound when no auto-generated Arabic track exists, which is
    # what the caller catches; re-raising NoTranscriptFound(video_id) by hand
    # would fail, since the exception's constructor takes more arguments.
    t = transcript_list.find_generated_transcript(["ar"])
    data = t.fetch()
    return "\n".join(item["text"] for item in data), {
        "lang": t.language_code,
        "is_generated": True,
    }

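# Example (needs network access and a video that actually has Arabic
# captions; the ID below is a hypothetical placeholder):
#   text, meta = fetch_arabic_caption_text("AAAAAAAAAAA")
#   meta -> {"lang": "ar", "is_generated": False} for a manual Arabic track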

# ----------------- Cleaning: one sentence per line (Arabic-friendly heuristic) -----------------

def captions_to_ar_sentences(raw_text: str) -> list[str]:
    """
    YouTube Arabic captions are often fragmentary.
    We:
      - remove timestamps / bracketed noise
      - normalize whitespace
      - split sentences on Arabic/Latin punctuation: . ! ? … ؟ ؛
    """
    text = raw_text or ""

    # Remove timestamps: the "00:01 --> 00:03" form first (otherwise the
    # bare-timestamp pass below eats the times and strands the arrow),
    # then [00:01], (00:01), 00:01
    text = re.sub(r"\d{1,2}:\d{2}(?::\d{2})?\s*-->\s*\d{1,2}:\d{2}(?::\d{2})?", " ", text)
    text = re.sub(r"\[?\(?\d{1,2}:\d{2}(?::\d{2})?\)?\]?", " ", text)

    # Remove common caption artifacts (English + Arabic-ish markers)
    text = re.sub(
        r"\s*\(?(?:applause|music|laughter|cheering|inaudible|تصفيق|موسيقى|ضحك)\)?\s*",
        " ",
        text,
        flags=re.I,
    )

    # Join lines + normalize spaces
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []

    # Add split markers after sentence-ending punctuation (Arabic + Latin):
    # . ! ? … ؟ ؛
    text = re.sub(r"([.!?…؟؛])\s+", r"\1<SPLIT>", text)

    parts = [p.strip() for p in text.split("<SPLIT>") if p.strip()]

    # Final cleanup
    cleaned = []
    for s in parts:
        s = re.sub(r"\s+", " ", s).strip()
        s = re.sub(r"\s+([،,.!?…؟؛:;])", r"\1", s)
        if s:
            cleaned.append(s)

    return cleaned
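
# Example behavior (sentence splitting on Arabic punctuation):
#   captions_to_ar_sentences("مرحبا بالجميع. كيف الحال؟ شكرا")
#   -> ["مرحبا بالجميع.", "كيف الحال؟", "شكرا"]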


# ----------------- Gradio worker -----------------

def build_zip_arabic(urls_text: str, include_header: bool) -> tuple[str | None, str]:
    urls = [u.strip() for u in (urls_text or "").splitlines() if u.strip()]
    if not urls:
        return None, "Paste at least one YouTube URL (one per line)."

    tmpdir = tempfile.mkdtemp(prefix="yt_captions_ar_")
    zip_path = os.path.join(tmpdir, "captions_sentences_ar.zip")

    ok = []
    bad = []

    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for idx, url in enumerate(urls, start=1):
            vid = extract_video_id(url)
            if not vid:
                bad.append(f"{idx}. Could not extract video id: {url}")
                continue

            try:
                raw_text, meta = fetch_arabic_caption_text(vid)
                sentences = captions_to_ar_sentences(raw_text)

                if not sentences:
                    bad.append(f"{idx}. Arabic transcript was empty after cleaning: {url}")
                    continue

                base = safe_filename(url)
                fname = f"{base}__{vid}__ar.txt"

                lines = []
                if include_header:
                    lines += [
                        f"URL: {url}",
                        f"VideoID: {vid}",
                        "Language: ar",
                        f"Generated: {meta['is_generated']}",
                        "",
                    ]
                lines += sentences

                zf.writestr(fname, "\n".join(lines) + "\n")

                ok.append(
                    f"{idx}. ✅ {vid}: {len(sentences)} lines (Arabic, {'auto' if meta['is_generated'] else 'manual'})"
                )

            except (TranscriptsDisabled, NoTranscriptFound):
                bad.append(f"{idx}. No Arabic captions found (manual or auto): {url}")
            except VideoUnavailable:
                bad.append(f"{idx}. Video unavailable: {url}")
            except Exception as e:
                bad.append(f"{idx}. Error for {url}: {type(e).__name__}: {e}")

    log = []
    if ok:
        log.append("Downloaded (Arabic only):")
        log.extend(ok)
    if bad:
        log.append("")
        log.append("Problems:")
        log.extend(bad)

    # If nothing succeeded, return no file rather than an empty zip
    return (zip_path if ok else None), "\n".join(log).strip() or "Done."


# ----------------- UI -----------------

with gr.Blocks(title="YouTube Arabic Captions → Sentences", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # YouTube Arabic Captions → One Sentence Per Line
        Paste YouTube URLs (one per line). The app downloads **Arabic captions only**
        (prefers **manual** Arabic, falls back to **auto** Arabic), cleans them, and returns a zip.
        """
    )

    urls_in = gr.Textbox(
        label="YouTube URLs (one per line)",
        placeholder="https://www.youtube.com/watch?v=...\nhttps://youtu.be/...\n...",
        lines=8,
    )

    include_header_in = gr.Checkbox(
        label="Include metadata header in each file",
        value=False,
    )

    run_btn = gr.Button("Download Arabic Captions", variant="primary")

    out_file = gr.File(label="captions_sentences_ar.zip")
    out_log = gr.Textbox(label="Log", lines=10)

    run_btn.click(
        fn=build_zip_arabic,
        inputs=[urls_in, include_header_in],
        outputs=[out_file, out_log],
    )

if __name__ == "__main__":
    demo.launch()
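
# To try it locally:
#   python app.py
# Gradio prints a local URL (default http://127.0.0.1:7860).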