import os
import re
import tempfile
import zipfile
from urllib.parse import urlparse, parse_qs

import gradio as gr
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
    VideoUnavailable,
)
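
# NOTE: This script assumes the pre-1.0 `youtube_transcript_api` interface
# (static `YouTubeTranscriptApi.list_transcripts()` and `fetch()` returning
# dicts with a "text" key). The 1.x releases moved to instance methods and
# snippet objects, so pin the dependency if needed, e.g.:
#   pip install "youtube-transcript-api<1.0"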


def extract_video_id(url: str) -> str | None:
    """Extract a YouTube video id from a URL (watch, youtu.be, shorts, embed)."""
    url = (url or "").strip()
    if not url:
        return None

    # youtu.be short links
    m = re.search(r"(?:youtu\.be/)([A-Za-z0-9_-]{6,})", url)
    if m:
        return m.group(1)

    try:
        u = urlparse(url)

        if "youtube.com" in (u.netloc or ""):
            # Standard watch URLs: ?v=<id>
            qs = parse_qs(u.query)
            if "v" in qs and qs["v"]:
                return qs["v"][0]

            # Shorts and embed URLs carry the id in the path.
            for pat in (r"(?:/shorts/)([A-Za-z0-9_-]{6,})", r"(?:/embed/)([A-Za-z0-9_-]{6,})"):
                m = re.search(pat, u.path or "")
                if m:
                    return m.group(1)

    except Exception:
        pass

    # Last resort: any 11-character id-like token anywhere in the string.
    m = re.search(r"([A-Za-z0-9_-]{11})", url)
    return m.group(1) if m else None
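
# Illustrative examples (hypothetical URLs, not from the source):
#   extract_video_id("https://youtu.be/dQw4w9WgXcQ")                 -> "dQw4w9WgXcQ"
#   extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  -> "dQw4w9WgXcQ"
#   extract_video_id("https://www.youtube.com/shorts/dQw4w9WgXcQ")   -> "dQw4w9WgXcQ"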


def safe_filename(s: str, max_len: int = 120) -> str:
    """Reduce an arbitrary string (usually a URL) to a filesystem-safe filename."""
    s = (s or "").strip()
    s = re.sub(r"https?://", "", s)
    s = re.sub(r"[^A-Za-z0-9._-]+", "_", s)
    s = re.sub(r"_+", "_", s).strip("_")
    return (s[:max_len] or "video").rstrip("_")
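
# Illustrative example (hypothetical URL):
#   safe_filename("https://www.youtube.com/watch?v=abc123")
#     -> "www.youtube.com_watch_v_abc123"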


def fetch_arabic_caption_text(video_id: str) -> tuple[str, dict]:
    """
    Return (raw_caption_text, meta) for Arabic only.

    Preference order:
      1) Manually created Arabic transcript ('ar')
      2) Auto-generated Arabic transcript ('ar')

    If Arabic isn't available at all, NoTranscriptFound propagates to the caller.
    """
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    # Prefer a human-made Arabic transcript when one exists. Catch only
    # NoTranscriptFound here so real errors (network, parsing) surface.
    try:
        t = transcript_list.find_manually_created_transcript(["ar"])
        data = t.fetch()
        return "\n".join(item["text"] for item in data), {
            "lang": t.language_code,
            "is_generated": False,
        }
    except NoTranscriptFound:
        pass

    # Fall back to the auto-generated Arabic transcript. If this is missing
    # too, find_generated_transcript() raises NoTranscriptFound itself with
    # the correct constructor arguments, so we let it propagate.
    t = transcript_list.find_generated_transcript(["ar"])
    data = t.fetch()
    return "\n".join(item["text"] for item in data), {
        "lang": t.language_code,
        "is_generated": True,
    }
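
# Illustrative call (hypothetical video id):
#   text, meta = fetch_arabic_caption_text("dQw4w9WgXcQ")
#   meta -> {"lang": "ar", "is_generated": True}  # False when manual captions exist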


def captions_to_ar_sentences(raw_text: str) -> list[str]:
    """
    YouTube Arabic captions are often fragmentary. We:
      - remove timestamps / bracketed noise
      - normalize whitespace
      - split sentences on Arabic/Latin punctuation: . ! ? … ؟ ،
    """
    text = raw_text or ""

    # Strip timestamps like [0:12], (1:02:03), and SRT-style "0:12 --> 0:15" ranges.
    text = re.sub(r"\[?\(?\d{1,2}:\d{2}(?::\d{2})?\)?\]?", " ", text)
    text = re.sub(r"\d{1,2}:\d{2}\s*-->\s*\d{1,2}:\d{2}", " ", text)

    # Drop common non-speech cues (English and Arabic).
    text = re.sub(
        r"\s*\(?(?:applause|music|laughter|cheering|inaudible|تصفيق|موسيقى|ضحك)\)?\s*",
        " ",
        text,
        flags=re.I,
    )

    # Normalize whitespace.
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []

    # Split after sentence-ending punctuation (Latin and Arabic).
    text = re.sub(r"([\.!\?…؟،])\s+", r"\1<SPLIT>", text)

    parts = [p.strip() for p in text.split("<SPLIT>") if p.strip()]

    # Tidy each sentence: collapse spaces and pull punctuation back onto the word.
    cleaned = []
    for s in parts:
        s = re.sub(r"\s+", " ", s).strip()
        s = re.sub(r"\s+([،,\.!\?…؟:;])", r"\1", s)
        if s:
            cleaned.append(s)

    return cleaned
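
# Illustrative example:
#   captions_to_ar_sentences("مرحبا بكم. (music) كيف حالكم؟ 0:12")
#     -> ["مرحبا بكم.", "كيف حالكم؟"]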


def build_zip_arabic(urls_text: str, include_header: bool) -> tuple[str | None, str]:
    """Fetch Arabic captions for each URL and pack the cleaned sentences into a zip."""
    urls = [u.strip() for u in (urls_text or "").splitlines() if u.strip()]
    if not urls:
        return None, "Paste at least one YouTube URL (one per line)."

    tmpdir = tempfile.mkdtemp(prefix="yt_captions_ar_")
    zip_path = os.path.join(tmpdir, "captions_sentences_ar.zip")

    ok = []
    bad = []

    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for idx, url in enumerate(urls, start=1):
            vid = extract_video_id(url)
            if not vid:
                bad.append(f"{idx}. Could not extract video id: {url}")
                continue

            try:
                raw_text, meta = fetch_arabic_caption_text(vid)
                sentences = captions_to_ar_sentences(raw_text)

                if not sentences:
                    bad.append(f"{idx}. Arabic transcript was empty after cleaning: {url}")
                    continue

                base = safe_filename(url)
                fname = f"{base}__{vid}__ar.txt"

                lines = []
                if include_header:
                    lines += [
                        f"URL: {url}",
                        f"VideoID: {vid}",
                        "Language: ar",
                        f"Generated: {meta['is_generated']}",
                        "",
                    ]
                lines += sentences

                zf.writestr(fname, "\n".join(lines) + "\n")

                ok.append(
                    f"{idx}. ✅ {vid} → {len(sentences)} lines "
                    f"(Arabic, {'auto' if meta['is_generated'] else 'manual'})"
                )

            except (TranscriptsDisabled, NoTranscriptFound):
                bad.append(f"{idx}. No Arabic captions found (manual or auto): {url}")
            except VideoUnavailable:
                bad.append(f"{idx}. Video unavailable: {url}")
            except Exception as e:
                bad.append(f"{idx}. Error for {url}: {type(e).__name__}: {e}")

    log = []
    if ok:
        log.append("Downloaded (Arabic only):")
        log.extend(ok)
    if bad:
        log.append("")
        log.append("Problems:")
        log.extend(bad)

    return zip_path, "\n".join(log).strip() or "Done."
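
# Note: tempfile.mkdtemp() directories are not cleaned up automatically; the
# zip must outlive this function so Gradio can serve it for download, and the
# OS temp folder is left to reclaim the space later.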


with gr.Blocks(title="YouTube Arabic Captions → Sentences", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # YouTube Arabic Captions → One Sentence Per Line
        Paste YouTube URLs (one per line). The app downloads **Arabic captions only**
        (prefers **manual** Arabic, falls back to **auto** Arabic), cleans them, and returns a zip.
        """
    )

    urls_in = gr.Textbox(
        label="YouTube URLs (one per line)",
        placeholder="https://www.youtube.com/watch?v=...\nhttps://youtu.be/...\n...",
        lines=8,
    )

    include_header_in = gr.Checkbox(
        label="Include metadata header in each file",
        value=False,
    )

    run_btn = gr.Button("Download Arabic Captions", variant="primary")

    out_file = gr.File(label="captions_sentences_ar.zip")
    out_log = gr.Textbox(label="Log", lines=10)

    run_btn.click(
        fn=build_zip_arabic,
        inputs=[urls_in, include_header_in],
        outputs=[out_file, out_log],
    )


if __name__ == "__main__":
    demo.launch()
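
# For a temporary public link (e.g., when running in a notebook), Gradio
# also supports demo.launch(share=True) in place of the plain launch() above.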