import os
import re
import tempfile
import zipfile
from urllib.parse import urlparse, parse_qs

import gradio as gr
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
    VideoUnavailable,
)
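
# NOTE: This script assumes the pre-1.0 `youtube_transcript_api` interface
# (static `YouTubeTranscriptApi.list_transcripts()` and `fetch()` returning
# dicts with a "text" key). The 1.x releases moved to instance methods and
# snippet objects, so pin the dependency if needed, e.g.:
#   pip install "youtube-transcript-api<1.0"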


def extract_video_id(url: str) -> str | None:
    """Extract a YouTube video id from a URL (watch, youtu.be, shorts, embed)."""
    url = (url or "").strip()
    if not url:
        return None

    # youtu.be short links
    m = re.search(r"(?:youtu\.be/)([A-Za-z0-9_-]{6,})", url)
    if m:
        return m.group(1)

    try:
        u = urlparse(url)

        if "youtube.com" in (u.netloc or ""):
            # Standard watch URLs: ?v=<id>
            qs = parse_qs(u.query)
            if "v" in qs and qs["v"]:
                return qs["v"][0]

            # Shorts and embed URLs carry the id in the path.
            for pat in (r"(?:/shorts/)([A-Za-z0-9_-]{6,})", r"(?:/embed/)([A-Za-z0-9_-]{6,})"):
                m = re.search(pat, u.path or "")
                if m:
                    return m.group(1)

    except Exception:
        pass

    # Last resort: any 11-character id-like token anywhere in the string.
    m = re.search(r"([A-Za-z0-9_-]{11})", url)
    return m.group(1) if m else None
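
# Illustrative examples (hypothetical URLs, not from the source):
#   extract_video_id("https://youtu.be/dQw4w9WgXcQ")                 -> "dQw4w9WgXcQ"
#   extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  -> "dQw4w9WgXcQ"
#   extract_video_id("https://www.youtube.com/shorts/dQw4w9WgXcQ")   -> "dQw4w9WgXcQ"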


def safe_filename(s: str, max_len: int = 120) -> str:
    """Reduce an arbitrary string (usually a URL) to a filesystem-safe filename."""
    s = (s or "").strip()
    s = re.sub(r"https?://", "", s)
    s = re.sub(r"[^A-Za-z0-9._-]+", "_", s)
    s = re.sub(r"_+", "_", s).strip("_")
    return (s[:max_len] or "video").rstrip("_")
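
# Illustrative example (hypothetical URL):
#   safe_filename("https://www.youtube.com/watch?v=abc123")
#     -> "www.youtube.com_watch_v_abc123"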


def fetch_arabic_caption_text(video_id: str) -> tuple[str, dict]:
    """
    Return (raw_caption_text, meta) for Arabic only.

    Preference order:
      1) Manually created Arabic transcript ('ar')
      2) Auto-generated Arabic transcript ('ar')

    If Arabic isn't available at all, NoTranscriptFound propagates to the caller.
    """
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    # Prefer a human-made Arabic transcript when one exists. Catch only
    # NoTranscriptFound here so real errors (network, parsing) surface.
    try:
        t = transcript_list.find_manually_created_transcript(["ar"])
        data = t.fetch()
        return "\n".join(item["text"] for item in data), {
            "lang": t.language_code,
            "is_generated": False,
        }
    except NoTranscriptFound:
        pass

    # Fall back to the auto-generated Arabic transcript. If this is missing
    # too, find_generated_transcript() raises NoTranscriptFound itself with
    # the correct constructor arguments, so we let it propagate.
    t = transcript_list.find_generated_transcript(["ar"])
    data = t.fetch()
    return "\n".join(item["text"] for item in data), {
        "lang": t.language_code,
        "is_generated": True,
    }
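
# Illustrative call (hypothetical video id):
#   text, meta = fetch_arabic_caption_text("dQw4w9WgXcQ")
#   meta -> {"lang": "ar", "is_generated": True}  # False when manual captions exist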


def captions_to_ar_sentences(raw_text: str) -> list[str]:
    """
    YouTube Arabic captions are often fragmentary. We:
      - remove timestamps / bracketed noise
      - normalize whitespace
      - split sentences on Arabic/Latin punctuation: . ! ? … ؟ ،
    """
    text = raw_text or ""

    # Strip timestamps like [0:12], (1:02:03), and SRT-style "0:12 --> 0:15" ranges.
    text = re.sub(r"\[?\(?\d{1,2}:\d{2}(?::\d{2})?\)?\]?", " ", text)
    text = re.sub(r"\d{1,2}:\d{2}\s*-->\s*\d{1,2}:\d{2}", " ", text)

    # Drop common non-speech cues (English and Arabic).
    text = re.sub(
        r"\s*\(?(?:applause|music|laughter|cheering|inaudible|تصفيق|موسيقى|ضحك)\)?\s*",
        " ",
        text,
        flags=re.I,
    )

    # Normalize whitespace.
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []

    # Split after sentence-ending punctuation (Latin and Arabic).
    text = re.sub(r"([\.!\?…؟،])\s+", r"\1<SPLIT>", text)

    parts = [p.strip() for p in text.split("<SPLIT>") if p.strip()]

    # Tidy each sentence: collapse spaces and pull punctuation back onto the word.
    cleaned = []
    for s in parts:
        s = re.sub(r"\s+", " ", s).strip()
        s = re.sub(r"\s+([،,\.!\?…؟:;])", r"\1", s)
        if s:
            cleaned.append(s)

    return cleaned
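
# Illustrative example:
#   captions_to_ar_sentences("مرحبا بكم. (music) كيف حالكم؟ 0:12")
#     -> ["مرحبا بكم.", "كيف حالكم؟"]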


def build_zip_arabic(urls_text: str, include_header: bool) -> tuple[str | None, str]:
    """Fetch Arabic captions for each URL and pack the cleaned sentences into a zip."""
    urls = [u.strip() for u in (urls_text or "").splitlines() if u.strip()]
    if not urls:
        return None, "Paste at least one YouTube URL (one per line)."

    tmpdir = tempfile.mkdtemp(prefix="yt_captions_ar_")
    zip_path = os.path.join(tmpdir, "captions_sentences_ar.zip")

    ok = []
    bad = []

    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for idx, url in enumerate(urls, start=1):
            vid = extract_video_id(url)
            if not vid:
                bad.append(f"{idx}. Could not extract video id: {url}")
                continue

            try:
                raw_text, meta = fetch_arabic_caption_text(vid)
                sentences = captions_to_ar_sentences(raw_text)

                if not sentences:
                    bad.append(f"{idx}. Arabic transcript was empty after cleaning: {url}")
                    continue

                base = safe_filename(url)
                fname = f"{base}__{vid}__ar.txt"

                lines = []
                if include_header:
                    lines += [
                        f"URL: {url}",
                        f"VideoID: {vid}",
                        "Language: ar",
                        f"Generated: {meta['is_generated']}",
                        "",
                    ]
                lines += sentences

                zf.writestr(fname, "\n".join(lines) + "\n")

                ok.append(
                    f"{idx}. ✅ {vid} → {len(sentences)} lines "
                    f"(Arabic, {'auto' if meta['is_generated'] else 'manual'})"
                )

            except (TranscriptsDisabled, NoTranscriptFound):
                bad.append(f"{idx}. No Arabic captions found (manual or auto): {url}")
            except VideoUnavailable:
                bad.append(f"{idx}. Video unavailable: {url}")
            except Exception as e:
                bad.append(f"{idx}. Error for {url}: {type(e).__name__}: {e}")

    log = []
    if ok:
        log.append("Downloaded (Arabic only):")
        log.extend(ok)
    if bad:
        log.append("")
        log.append("Problems:")
        log.extend(bad)

    return zip_path, "\n".join(log).strip() or "Done."
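
# Note: tempfile.mkdtemp() directories are not cleaned up automatically; the
# zip must outlive this function so Gradio can serve it for download, and the
# OS temp folder is left to reclaim the space later.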


with gr.Blocks(title="YouTube Arabic Captions → Sentences", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # YouTube Arabic Captions → One Sentence Per Line
        Paste YouTube URLs (one per line). The app downloads **Arabic captions only**
        (prefers **manual** Arabic, falls back to **auto** Arabic), cleans them, and returns a zip.
        """
    )

    urls_in = gr.Textbox(
        label="YouTube URLs (one per line)",
        placeholder="https://www.youtube.com/watch?v=...\nhttps://youtu.be/...\n...",
        lines=8,
    )

    include_header_in = gr.Checkbox(
        label="Include metadata header in each file",
        value=False,
    )

    run_btn = gr.Button("Download Arabic Captions", variant="primary")

    out_file = gr.File(label="captions_sentences_ar.zip")
    out_log = gr.Textbox(label="Log", lines=10)

    run_btn.click(
        fn=build_zip_arabic,
        inputs=[urls_in, include_header_in],
        outputs=[out_file, out_log],
    )


if __name__ == "__main__":
    demo.launch()
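
# For a temporary public link (e.g., when running in a notebook), Gradio
# also supports demo.launch(share=True) in place of the plain launch() above.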